Problem 3:
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Read the survey extract into HRS_w1sub_2_ with readr::read_csv().
# The path is relative, so it is resolved against the current working
# directory; the column specification echoed below lists the parsed columns.
HRS_w1sub_2_ <- read_csv("Downloads/HRS_w1sub (2).csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## hhidpn = col_double(),
## raedyrs = col_double(),
## raeduc = col_character(),
## rameduc = col_double(),
## rafeduc = col_double(),
## ragender = col_character(),
## raracem = col_character(),
## r1agey_b = col_double(),
## r1agey_e = col_double(),
## r1agey_m = col_double(),
## r1bmi = col_double(),
## r1gender = col_double(),
## r1race = col_double()
## )
1. Histogram of the variable r1bmi
# Histogram of the r1bmi column; breaks = 100 requests roughly 100 bins.
# The default title and x-axis label are built from the argument expression.
hist(HRS_w1sub_2_$r1bmi, breaks = 100)

2. Q-Q plot of r1bmi
# Put the data-frame columns on the search path so that r1bmi can be used by
# its bare name here and in the steps that follow.
# NOTE(review): attach() is generally discouraged (attached columns can be
# masked by other objects); it is kept because later code depends on it.
attach(HRS_w1sub_2_)

# Kernel density estimate of BMI, drawn with an explicit title and axis label.
bmi_density <- density(r1bmi)
plot(bmi_density, col = "skyblue", main = "Density Plot: BMI", xlab = "BMI")

# Normal Q-Q plot of BMI together with its reference line.
qqnorm(r1bmi, col = "darkblue", main = "Population Dis. of BMI")
qqline(r1bmi)

3. Mean of r1bmi
# Mean of r1bmi over the whole data set (treated as the population mean).
m1 <- mean(r1bmi)
print(m1)
## [1] 27.09804
4a. Histogram of the sampling distribution
# Simulated sampling distribution of the mean for samples of size 2:
# each of the 10,000 replicates draws 2 values from r1bmi without replacement
# and records their mean. The result is stored as a 10000 x 1 matrix.
avgmean <- matrix(
  vapply(
    seq_len(10000),
    function(draw) mean(sample(r1bmi, 2, replace = FALSE)),
    numeric(1)
  ),
  nrow = 10000
)
hist(avgmean, col = "aquamarine3")

4b. Q-Q plot of the sampling distribution to check its normality
# Normal Q-Q plot of the 10,000 simulated sample means stored in avgmean.
qqnorm(avgmean, col="aquamarine3")

4c. Mean of “sampling distribution” vs “population mean”
# Mean of the simulated sampling distribution (the 10,000 sample means).
print(mean(avgmean))
## [1] 27.08254
# Mean of the full r1bmi vector, i.e. the population mean.
print(mean(r1bmi))
## [1] 27.09804
The sampling distribution and the population have approximately the same mean (27.08 vs. 27.10).
5. Draw 10,000 random samples of size n = 10 from the same population and repeat tasks 4a, 4b, and 4c:
# Simulated sampling distribution of the mean for samples of size 10:
# 10,000 replicates, each the mean of 10 values drawn from r1bmi without
# replacement. The result is stored as a 10000 x 1 matrix.
meansize10 <- matrix(
  vapply(
    seq_len(10000),
    function(draw) mean(sample(r1bmi, 10, replace = FALSE)),
    numeric(1)
  ),
  nrow = 10000
)

# Histogram of the simulated sample means.
hist(meansize10, col = "cadetblue")

# Normal Q-Q plot of the simulated sample means.
qqnorm(meansize10, col = "cadetblue")

# Mean of the simulated sampling distribution.
print(mean(meansize10))
## [1] 27.1093
6. Repeat tasks 4a, 4b, and 4c for a third set of 10,000 random samples (results stored in mean1000):
# Simulated sampling distribution of the mean for samples of size 1000.
# Each of the 10,000 replicates draws 1000 values from r1bmi without
# replacement and records their mean; the result is a 10000 x 1 matrix.
# The sample size was previously 10 (copied from the n = 10 step), which made
# this step a duplicate of that one despite the mean1000 name.
# sample() stops with an error if r1bmi holds fewer than 1000 values, because
# replace = FALSE cannot draw more values than the population contains.
mean1000 <- matrix(nrow = 10000)
for (i in seq_len(10000)) {
  mean1000[i] <- mean(sample(r1bmi, 1000, replace = FALSE))
}

# Histogram of the simulated sample means, requesting roughly 100 bins.
hist(mean1000, col = "darkorchid1", breaks = 100)

# Normal Q-Q plot of the simulated sample means.
qqnorm(mean1000, col = "darkorchid1")

# Mean of the simulated sampling distribution.
mean(mean1000)
## [1] 27.07168
7. Confirm (or reject) the two properties of the CLT by comparing the outputs from these tasks
# Side-by-side comparison for the CLT check: the population values followed by
# the three simulated sampling distributions, in that order.
clt_series <- list(
  r1bmi = r1bmi,
  avgmean = avgmean,
  meansize10 = meansize10,
  mean1000 = mean1000
)

# Histograms, each requesting roughly 100 bins. The title and x-axis label are
# set explicitly to the text hist() would otherwise derive from the variable
# name, so the figures are labelled by series.
for (series_name in names(clt_series)) {
  hist(
    clt_series[[series_name]],
    col = "gold",
    breaks = 100,
    main = paste("Histogram of", series_name),
    xlab = series_name
  )
}

# Normal Q-Q plots of the same four series.
for (series_name in names(clt_series)) {
  qqnorm(clt_series[[series_name]], col = "indianred1")
}

The CLT can be confirmed: as the sample size increases, the sampling distribution of the mean approaches a normal distribution, and in every case its mean stays close to the population mean.