DATA 306 HW#3

library(readr)

# Problem 1: Extract a new data frame from the HRS data containing the BMI and years-of-education variables.

# Read the HRS extract into a tibble. The absolute path is machine-specific;
# the file must exist at this location for the script to run.
hrs <- read_csv(file = "/Users/Nazija/Downloads/HRS_2020.csv")

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   hhidpn = col_double(),
##   r1agey_m = col_double(),
##   r2agey_m = col_double(),
##   r2cesd = col_double(),
##   r1bmi = col_double(),
##   r2bmi = col_double(),
##   r2mobila = col_double(),
##   h1itot = col_double(),
##   h2itot = col_double()
## )

## See spec(...) for full column specifications.

# Preview the first six rows to check how each column was parsed.
head(hrs, n = 6L)

## # A tibble: 6 x 21
##   hhidpn r1mstat r2mstat r1cenreg r1cendiv ragender raracem raedyrs raeduc
##    <dbl> <chr>   <chr>   <chr>    <chr>    <chr>    <chr>   <chr>   <chr> 
## 1 1.01e3 5.divo… 5.divo… 1.north… 2.mid a… 1.male   1.whit… 16      5.col…
## 2 2.01e3 7.wido… 7.wido… 2.midwe… 3.en ce… 2.female 1.whit… 8       1.lt …
## 3 3.01e3 1.marr… 1.marr… 4.west   9.pacif… 1.male   1.whit… 12      3.hig…
## 4 3.02e3 1.marr… 1.marr… 4.west   9.pacif… 2.female 1.whit… 16      5.col…
## 5 1.00e7 8.neve… 8.neve… 1.north… 2.mid a… 1.male   1.whit… 12      3.hig…
## 6 1.00e7 1.marr… 1.marr… 1.north… 2.mid a… 1.male   1.whit… 16      5.col…
## # … with 12 more variables: rarelig <chr>, ravetrn <chr>, rameduc <chr>,
## #   rafeduc <chr>, r1agey_m <dbl>, r2agey_m <dbl>, r2cesd <dbl>, r1bmi <dbl>,
## #   r2bmi <dbl>, r2mobila <dbl>, h1itot <dbl>, h2itot <dbl>

# Build the working data frame from wave-1 BMI and years of education.
# raedyrs is read as character (see the column spec above), so it is coerced
# to integer here.
# NOTE(review): entries of raedyrs that are not plain numbers become NA in this
# coercion (hence the warning below) and are later dropped by na.omit() --
# confirm that losing those respondents is intended.
hrs.sub <- data.frame(hrs$r1bmi, as.integer(hrs$raedyrs))

## Warning in data.frame(hrs$r1bmi, as.integer(hrs$raedyrs)): NAs introduced by
## coercion

# Preview the first six rows of the new data frame.
head(hrs.sub, n = 6L)

##   hrs.r1bmi as.integer.hrs.raedyrs.
## 1      30.7                      16
## 2      18.5                       8
## 3      25.8                      12
## 4      32.4                      16
## 5      23.7                      12
## 6      30.4                      16

Remove observations with missing data

# Dimensions before removing incomplete rows.
dim(hrs.sub)

## [1] 20871     2

# Drop every row that has an NA in either column, then re-check the size.
hrs.sub <- na.omit(hrs.sub)
dim(hrs.sub)

## [1] 11399     2

# Replace the auto-generated column names with short ones and summarise.
names(hrs.sub) <- c("bmi", "edu")
summary(hrs.sub)

##       bmi              edu      
##  Min.   : 12.80   Min.   : 1.0  
##  1st Qu.: 23.70   1st Qu.:10.0  
##  Median : 26.50   Median :12.0  
##  Mean   : 27.19   Mean   :11.6  
##  3rd Qu.: 29.80   3rd Qu.:13.0  
##  Max.   :102.70   Max.   :16.0

# Preview the cleaned, renamed data frame.
head(hrs.sub, n = 6L)

##    bmi edu
## 1 30.7  16
## 2 18.5   8
## 3 25.8  12
## 4 32.4  16
## 5 23.7  12
## 6 30.4  16

Write a function to randomly select observations from the new data frame

# Draw `n` distinct row indices uniformly at random, without replacement.
#
# Arguments:
#   n     - number of indices to draw; must not exceed `total`.
#   total - size of the population to draw from. Defaults to the current
#           number of rows of the global `hrs.sub`, so the sampler stays
#           correct if the cleaned data changes (previously hard-coded as
#           11399). The default is only evaluated when `total` is not supplied.
#
# Returns an integer vector of length `n` with values in 1..total.
rsample <- function(n, total = nrow(hrs.sub)) {
  sample.int(total, size = n, replace = FALSE)
}

Use the function and a for loop to iteratively select six random subsamples with sizes 100, 200, 400, 600, 800, and 1000.

# Draw a random subsample of `n` rows from the global `hrs.sub` data frame.
#
# Arguments:
#   n - number of rows to draw (indices come from rsample(n)).
#
# Returns a numeric matrix with one row per drawn index and two columns named
# "bmi" and "edu", taken from the first and second columns of `hrs.sub`.
#
# The rows are extracted in one vectorized step instead of a cell-by-cell
# `for (i in 1:n)` loop; this also makes n = 0 return an empty 0 x 2 matrix
# rather than failing, because `1:0` iterates over c(1, 0).
takesample <- function(n) {
  indices <- rsample(n)
  matrix(
    c(hrs.sub[[1]][indices], hrs.sub[[2]][indices]),
    nrow = length(indices),
    ncol = 2,
    dimnames = list(NULL, c("bmi", "edu"))
  )
}
takesample(n = 10)  # test: print a 10-row subsample to check the shape

##        bmi edu
##  [1,] 23.6  12
##  [2,] 29.2  10
##  [3,] 21.8  10
##  [4,] 26.0  12
##  [5,] 25.8  11
##  [6,] 31.3  12
##  [7,] 24.1  12
##  [8,] 29.2  14
##  [9,] 27.5  12
## [10,] 28.7  12

6 subsamples

# Draw the six subsamples in a single loop over the requested sizes, instead
# of six copy-pasted calls. The draws happen in the same order as before
# (100, 200, 400, 600, 800, 1000), so the random-number stream is consumed
# identically.
subsample_sizes <- c(100, 200, 400, 600, 800, 1000)
subsamples <- vector("list", length(subsample_sizes))
for (i in seq_along(subsample_sizes)) {
  subsamples[[i]] <- as.data.frame(takesample(subsample_sizes[i]))
}

# Bind each subsample to the name used by the rest of the script.
hundred <- subsamples[[1]]
twohundred <- subsamples[[2]]
fourhundred <- subsamples[[3]]
sixhundred <- subsamples[[4]]
eighthundred <- subsamples[[5]]
thousand <- subsamples[[6]]

Keep only the observations with edu <= 12 (i.e., drop those with more than 12 years of education)

# Keep the rows of `df` whose `edu` value is at most 12. Rows where the
# comparison is NA are dropped, matching what subset() does.
keep_no_college <- function(df) {
  at_most_12 <- df$edu <= 12
  df[at_most_12 & !is.na(at_most_12), , drop = FALSE]
}

# One filtered data frame per subsample size.
hundred.12 <- keep_no_college(hundred)
twoh.12 <- keep_no_college(twohundred)
fourh.12 <- keep_no_college(fourhundred)
sixh.12 <- keep_no_college(sixhundred)
eighth.12 <- keep_no_college(eighthundred)
thousand.12 <- keep_no_college(thousand)

In each subsample, create the density plot of BMI using only the respondents who did NOT go to college or above (edu <= 12)

# Lay the six density plots out on a 3 x 2 grid.
par(mfrow = c(3, 2))

# BMI values of each filtered subsample, keyed by the size label shown in the
# plot title. The label is the size of the subsample before filtering.
bmi_by_size <- list(
  "100" = hundred.12$bmi,
  "200" = twoh.12$bmi,
  "400" = fourh.12$bmi,
  "600" = sixh.12$bmi,
  "800" = eighth.12$bmi,
  "1,000" = thousand.12$bmi
)

# Draw one kernel density estimate per subsample, in increasing size order.
for (size_label in names(bmi_by_size)) {
  plot(
    density(bmi_by_size[[size_label]]),
    main = paste0("Density Plot of Body Mass Index, n=", size_label),
    xlab = "BMI"
  )
}