library(readr)
# Problem 1: extract BMI and years of education from the HRS data into a new
# data frame. Start by reading the raw HRS extract from disk.
hrs <- readr::read_csv(file = "/Users/Nazija/Downloads/HRS_2020.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## hhidpn = col_double(),
## r1agey_m = col_double(),
## r2agey_m = col_double(),
## r2cesd = col_double(),
## r1bmi = col_double(),
## r2bmi = col_double(),
## r2mobila = col_double(),
## h1itot = col_double(),
## h2itot = col_double()
## )
## See spec(...) for full column specifications.
# Preview the first six rows of the imported table.
head(hrs, n = 6L)
## # A tibble: 6 x 21
## hhidpn r1mstat r2mstat r1cenreg r1cendiv ragender raracem raedyrs raeduc
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1.01e3 5.divo… 5.divo… 1.north… 2.mid a… 1.male 1.whit… 16 5.col…
## 2 2.01e3 7.wido… 7.wido… 2.midwe… 3.en ce… 2.female 1.whit… 8 1.lt …
## 3 3.01e3 1.marr… 1.marr… 4.west 9.pacif… 1.male 1.whit… 12 3.hig…
## 4 3.02e3 1.marr… 1.marr… 4.west 9.pacif… 2.female 1.whit… 16 5.col…
## 5 1.00e7 8.neve… 8.neve… 1.north… 2.mid a… 1.male 1.whit… 12 3.hig…
## 6 1.00e7 1.marr… 1.marr… 1.north… 2.mid a… 1.male 1.whit… 16 5.col…
## # … with 12 more variables: rarelig <chr>, ravetrn <chr>, rameduc <chr>,
## # rafeduc <chr>, r1agey_m <dbl>, r2agey_m <dbl>, r2cesd <dbl>, r1bmi <dbl>,
## # r2bmi <dbl>, r2mobila <dbl>, h1itot <dbl>, h2itot <dbl>
# Pair the `r1bmi` column with years of education. `raedyrs` was read in as
# character, so it is coerced to integer here; values that cannot be parsed as
# numbers become NA, which is what the recorded warning below reports.
hrs.sub <- data.frame(hrs$r1bmi, as.integer(hrs$raedyrs))
## Warning in data.frame(hrs$r1bmi, as.integer(hrs$raedyrs)): NAs introduced by
## coercion
# Preview the first six rows of the new frame.
head(hrs.sub, n = 6L)
## hrs.r1bmi as.integer.hrs.raedyrs.
## 1 30.7 16
## 2 18.5 8
## 3 25.8 12
## 4 32.4 16
## 5 23.7 12
## 6 30.4 16
# Remove observations with missing data.
# Report the dimensions before dropping incomplete rows.
dim(hrs.sub)
## [1] 20871 2
# Keep only rows where both BMI and education are present.
hrs.sub <- na.omit(hrs.sub)
dim(hrs.sub)
## [1] 11399 2
# Replace the auto-generated column names with short, readable ones.
names(hrs.sub) <- c("bmi", "edu")
# Five-number summary (plus mean) of both columns after cleaning.
summary(object = hrs.sub)
## bmi edu
## Min. : 12.80 Min. : 1.0
## 1st Qu.: 23.70 1st Qu.:10.0
## Median : 26.50 Median :12.0
## Mean : 27.19 Mean :11.6
## 3rd Qu.: 29.80 3rd Qu.:13.0
## Max. :102.70 Max. :16.0
# Preview the first six rows with the renamed columns.
head(hrs.sub, n = 6L)
## bmi edu
## 1 30.7 16
## 2 18.5 8
## 3 25.8 12
## 4 32.4 16
## 5 23.7 12
## 6 30.4 16
# Write a function to randomly select observations from the new data frame.
# Draw `n` distinct row indices uniformly at random (without replacement).
#
# Args:
#   n:     number of indices to draw; must not exceed `total`.
#   total: size of the population to draw from. Defaults to the current number
#          of rows in `hrs.sub` instead of a hard-coded 11399, so the sampler
#          stays in range if the cleaned data ever changes. The default is
#          evaluated lazily, at call time.
#
# Returns: an integer vector of length `n` with values in 1..total.
rsample <- function(n, total = nrow(hrs.sub)) {
  sample.int(total, size = n, replace = FALSE)
}
# Use the function and a for loop to iteratively select six random subsamples of sizes 100, 200, 400, 600, 800, and 1000.
# Draw a random subsample of `n` rows from `hrs.sub`.
#
# Args:
#   n: number of rows to draw (0 is allowed and yields an empty matrix).
#
# Returns: a numeric matrix with `n` rows and two columns named "bmi" and
#   "edu", holding the first and second columns of `hrs.sub` for the rows
#   chosen by `rsample(n)`. The matrix has no row names.
takesample <- function(n) {
  indices <- rsample(n)
  # Pull both columns for all sampled rows at once rather than copying cell by
  # cell; the old `1:n` loop also broke for n == 0 because 1:0 is c(1, 0).
  matrix(
    c(hrs.sub[indices, 1], hrs.sub[indices, 2]),
    nrow = length(indices),
    ncol = 2,
    dimnames = list(NULL, c("bmi", "edu"))
  )
}
# Smoke test: draw ten rows and print the resulting matrix.
takesample(n = 10)
## bmi edu
## [1,] 23.6 12
## [2,] 29.2 10
## [3,] 21.8 10
## [4,] 26.0 12
## [5,] 25.8 11
## [6,] 31.3 12
## [7,] 24.1 12
## [8,] 29.2 14
## [9,] 27.5 12
## [10,] 28.7 12
# Draw the six subsamples.
# One data frame per requested sample size. Each call draws independently.
# NOTE(review): no set.seed() call is visible in this script, so the rows
# selected will differ from run to run.
hundred      <- as.data.frame(takesample(n = 100))
twohundred   <- as.data.frame(takesample(n = 200))
fourhundred  <- as.data.frame(takesample(n = 400))
sixhundred   <- as.data.frame(takesample(n = 600))
eighthundred <- as.data.frame(takesample(n = 800))
thousand     <- as.data.frame(takesample(n = 1000))
# Keep only the observations with edu <= 12 (12 or fewer years of education).
# Restrict each subsample to rows with at most 12 years of education.
# subset() is used so the condition can refer to the `edu` column by name.
hundred.12  <- subset(hundred,      subset = edu <= 12)
twoh.12     <- subset(twohundred,   subset = edu <= 12)
fourh.12    <- subset(fourhundred,  subset = edu <= 12)
sixh.12     <- subset(sixhundred,   subset = edu <= 12)
eighth.12   <- subset(eighthundred, subset = edu <= 12)
thousand.12 <- subset(thousand,     subset = edu <= 12)
# In each subsample, create a density plot of BMI using only the observations who did NOT go to college or above (edu <= 12).
# Lay the six density plots out on a 3 x 2 grid, filled row by row.
# The n in each title is the size of the subsample that was drawn, not the
# number of rows remaining after the edu <= 12 restriction.
par(mfrow = c(3, 2))
plot(
  density(hundred.12$bmi),
  main = "Density Plot of Body Mass Index, n=100",
  xlab = "BMI"
)
plot(
  density(twoh.12$bmi),
  main = "Density Plot of Body Mass Index, n=200",
  xlab = "BMI"
)
plot(
  density(fourh.12$bmi),
  main = "Density Plot of Body Mass Index, n=400",
  xlab = "BMI"
)
plot(
  density(sixh.12$bmi),
  main = "Density Plot of Body Mass Index, n=600",
  xlab = "BMI"
)
plot(
  density(eighth.12$bmi),
  main = "Density Plot of Body Mass Index, n=800",
  xlab = "BMI"
)
plot(
  density(thousand.12$bmi),
  main = "Density Plot of Body Mass Index, n=1,000",
  xlab = "BMI"
)