Homework 2

Pulling in the data from my GitHub repository

library(RCurl)
# Fetch the raw CSV text from GitHub over HTTPS.
my_git_url <- RCurl::getURL(
  "https://raw.githubusercontent.com/aelsaeyed/BridgeR/main/datasets/hw2/HealthInsurance.csv"
)
# Local alternative:
# hinsurance_csv <- read.csv("/Users/aelsaeyed/BridgeR/datasets/hw2/HealthInsurance.csv", quote = "" )

# Parse the downloaded text. quote = "" turns off quote handling, so the
# quote characters in the file stay in the data: headers become names such as
# X.age. and values look like '"cauc"'.
hinsurance_csv <- utils::read.csv(
  quote = "",
  text = my_git_url
)

Question 1

Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes of your data.

# Per-column overview of the full data set, stored and then displayed.
my_summary <- summary(hinsurance_csv)
print(my_summary)

##      X..             X.health.             X.age.        X.limit.        
##  Length:8802        Length:8802        Min.   :18.00   Length:8802       
##  Class :character   Class :character   1st Qu.:30.00   Class :character  
##  Mode  :character   Mode  :character   Median :39.00   Mode  :character  
##                                        Mean   :38.94                     
##                                        3rd Qu.:48.00                     
##                                        Max.   :62.00                     
##   X.gender.         X.insurance.        X.married.         X.selfemp.       
##  Length:8802        Length:8802        Length:8802        Length:8802       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    X.family.       X.region.         X.ethnicity.       X.education.      
##  Min.   : 1.000   Length:8802        Length:8802        Length:8802       
##  1st Qu.: 2.000   Class :character   Class :character   Class :character  
##  Median : 3.000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 3.094                                                           
##  3rd Qu.: 4.000                                                           
##  Max.   :14.000

# Mean and median of the age column.
# The column is named "X.age." (with a trailing dot); `[[` matches names
# exactly, so this does not depend on the partial matching that `$X.age` needs.
mean1 <- mean(hinsurance_csv[["X.age."]])
median1 <- median(hinsurance_csv[["X.age."]])
mean1

## [1] 38.93683

median1

## [1] 39

# Mean and median of the family-size column.
# The column is named "X.family." (with a trailing dot); `[[` matches names
# exactly, so this does not depend on the partial matching that `$X.family`
# needs.
mean2 <- mean(hinsurance_csv[["X.family."]])
median2 <- median(hinsurance_csv[["X.family."]])
mean2

## [1] 3.093501

median2

## [1] 3

Question 2

Create a new data frame with a subset of the columns AND rows. There are several ways to do this so feel free to try a couple if you want. Make sure to give the new data set a new name so that it does not simply overwrite the original.

# Rows: keep respondents older than 35 whose family size is at least 2.
# subset() evaluates the condition inside the data frame, so the columns are
# referenced by their full names (X.age., X.family.) instead of through
# partially matched `$` lookups.
insurance_subset <- subset(hinsurance_csv, X.age. > 35 & X.family. >= 2)

# Columns: keep age, family size, ethnicity and region.
insurance_new_cols <- insurance_subset[
  ,
  c("X.age.", "X.family.", "X.ethnicity.", "X.region.")
]
summary(insurance_new_cols)

##      X.age.        X.family.      X.ethnicity.        X.region.        
##  Min.   :36.00   Min.   : 2.000   Length:4586        Length:4586       
##  1st Qu.:40.00   1st Qu.: 2.000   Class :character   Class :character  
##  Median :46.00   Median : 3.000   Mode  :character   Mode  :character  
##  Mean   :46.36   Mean   : 3.348                                        
##  3rd Qu.:52.00   3rd Qu.: 4.000                                        
##  Max.   :62.00   Max.   :14.000

Question 3

Create new column names for each column in the new data frame created in step 2.

# Replace the four column names with short, readable ones (same column order).
insurance_new_cols <- setNames(
  insurance_new_cols,
  c("age", "family", "ethnicity", "region")
)
print(summary(insurance_new_cols))

##       age            family        ethnicity            region         
##  Min.   :36.00   Min.   : 2.000   Length:4586        Length:4586       
##  1st Qu.:40.00   1st Qu.: 2.000   Class :character   Class :character  
##  Median :46.00   Median : 3.000   Mode  :character   Mode  :character  
##  Mean   :46.36   Mean   : 3.348                                        
##  3rd Qu.:52.00   3rd Qu.: 4.000                                        
##  Max.   :62.00   Max.   :14.000

Question 4

Use the summary function to create an overview of your new data frame created in step 2. Then print the mean and median for the same two attributes. Please compare (i.e. tell me how the values changed and why).

# Overview of the filtered, renamed data frame.
print(summary(insurance_new_cols))

##       age            family        ethnicity            region         
##  Min.   :36.00   Min.   : 2.000   Length:4586        Length:4586       
##  1st Qu.:40.00   1st Qu.: 2.000   Class :character   Class :character  
##  Median :46.00   Median : 3.000   Mode  :character   Mode  :character  
##  Mean   :46.36   Mean   : 3.348                                        
##  3rd Qu.:52.00   3rd Qu.: 4.000                                        
##  Max.   :62.00   Max.   :14.000

# Mean and median of age within the subset.
newmean1 <- mean(insurance_new_cols[["age"]])
newmedian1 <- median(insurance_new_cols[["age"]])
newmean1

## [1] 46.35543

newmedian1

## [1] 46

# Mean and median of family size within the subset.
newmean2 <- mean(insurance_new_cols[["family"]])
newmedian2 <- median(insurance_new_cols[["family"]])
newmean2

## [1] 3.348234

newmedian2

## [1] 3

# The mean and median of age rose dramatically because I previously created a subset of the original data by filtering for rows containing ages greater than 35. The mean family size went up a little bit because I also filtered for rows where the size is 2 or greater. The median family size, however, is still the same (3).

Question 5

For at least 3 different/distinct values in a column please rename so that every value in that column is renamed. For example, change the letter “e” to “excellent”, the letter “a” to “average” and the word “bad” to “terrible”.

# Reset the row index so the rows are numbered consecutively from 1.
rownames(insurance_new_cols) <- NULL

# Give selected ethnicity and region codes readable labels. The stored codes
# include literal double quotes, so the comparisons must include them too.
# NOTE(review): values not listed here (for example '"northeast"' and
# '"midwest"') are left unchanged, quotes included.
insurance_new_cols <- within(insurance_new_cols, {
  ethnicity[ethnicity == '"cauc"'] <- "Caucasian"
  ethnicity[ethnicity == '"afam"'] <- "African American"

  region[region == '"west"'] <- "West Coast"
  region[region == '"south"'] <- "Southern Coast"
})

Question 6

Display enough rows to see examples of all of steps 1-5 above. This means use a function to show me enough row values that I can see the changes.

# Show the first 20 rows so the filtering, renaming and relabelling are visible.
print(head(insurance_new_cols, n = 20))

##    age family        ethnicity         region
## 1   54      5        Caucasian     West Coast
## 2   39      5        Caucasian     West Coast
## 3   56      2        Caucasian     West Coast
## 4   60      2        Caucasian Southern Coast
## 5   62      2        Caucasian Southern Coast
## 6   52      2 African American    "northeast"
## 7   50      2 African American    "northeast"
## 8   38      7        Caucasian     West Coast
## 9   48      8        Caucasian Southern Coast
## 10  48      8        Caucasian Southern Coast
## 11  53      2        Caucasian      "midwest"
## 12  50      2        Caucasian      "midwest"
## 13  43      2        Caucasian Southern Coast
## 14  40      4        Caucasian      "midwest"
## 15  53      6 African American    "northeast"
## 16  40      6 African American    "northeast"
## 17  51      3        Caucasian     West Coast
## 18  58      3        Caucasian     West Coast
## 19  45      4        Caucasian     West Coast
## 20  43      4        Caucasian      "midwest"