DATA306 Midterm

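Problem 1: I draw 100 random samples of boys' heights (each of size 100, from a normal distribution with mean 51 and standard deviation 3), take the mean of each sample, and then count how many of those means are above 50.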

# Simulate 100 samples of 100 boys' heights drawn from N(mean = 51, sd = 3)
# and keep the mean of each sample. vapply() returns a plain numeric vector
# of length 100 (type-checked by numeric(1)) and draws from the RNG in the
# same order as an explicit loop over 1..100 would.
means <- vapply(
  seq_len(100),
  function(i) mean(rnorm(100, mean = 51, sd = 3)),
  numeric(1)
)
# Average of the 100 sample means; should sit close to the population mean 51.
mean(means)

## [1] 51.02402

# Number of simulated sample means that exceed 50. Comparing the whole vector
# at once yields a logical vector; sum() counts its TRUE entries.
count <- sum(means > 50)
count

## [1] 100

The standard error is 3/√100 = 3/10 = 0.3. To find the probability that the sample mean is greater than 50, first compute the z-score of 50: (50 - 51) / 0.3.

# z-score of a sample mean of 50: (cutoff - population mean) / standard error,
# where the standard error is sd / sqrt(n) = 3 / sqrt(100).
zscore <- (50 - 51) / (3 / sqrt(100))
zscore

## [1] -3.333333

The z-score is -3.33, so we are looking for the probability that a standard normal value is -3.33 or greater. Since 99.7% of values fall within 3 standard deviations of the mean, and -3.33 lies more than 3 standard deviations below it, there is a near 100% probability (about 99.96%) of the sample mean being greater than 50. The simulation agrees: the count shows that all 100 of the 100 sample means (100%) were above 50.

Problem 2

# For each of 100 replications, draw 100 boys' heights from N(51, 3) and
# 100 girls' heights from N(53, 3), and record (girls' mean - boys' mean).
# The boys' sample is drawn before the girls' sample in every replication.
difference <- vapply(
  seq_len(100),
  function(i) {
    boys <- mean(rnorm(100, mean = 51, sd = 3))
    girls <- mean(rnorm(100, mean = 53, sd = 3))
    girls - boys
  },
  numeric(1)
)
# Average difference in sample means; should sit close to 53 - 51 = 2.
mean(difference)

## [1] 1.989222

# Number of replications in which the boys' sample mean exceeded the girls'
# (i.e. girls - boys is negative). sum() counts the TRUE entries.
count <- sum(difference < 0)
count

## [1] 0

Across the 100 simulated samples, the girls’ sample mean height is on average about 2 inches greater than the boys’ sample mean height. None of the 100 differences was negative, so there is a near 0% chance of the boys’ sample mean being higher than the girls’ sample mean.

Problem 3

library(readr)
# Location of the HRS BMI extract on the author's machine.
hrs_path <- "/Users/Nazija/Downloads/HRS_r1bmi.csv"
# Load the extract with readr; column types are guessed from the file.
hrs <- read_csv(hrs_path)

## Parsed with column specification:
## cols(
##   hhidpn = col_double(),
##   raedyrs = col_double(),
##   raeduc = col_character(),
##   rameduc = col_double(),
##   rafeduc = col_double(),
##   ragender = col_character(),
##   raracem = col_character(),
##   r1agey_b = col_double(),
##   r1agey_e = col_double(),
##   r1agey_m = col_double(),
##   r1bmi = col_double()
## )

# Pull the r1bmi column out of hrs as a plain vector; this is the
# "population" that all of the sampling below draws from.
hrs.sub <- hrs[["r1bmi"]]

Histogram of hrs$r1bmi

# Histogram of the r1bmi values held in hrs.sub.
hist(hrs.sub)

QQPlot of hrs$r1bmi

# Normal Q-Q plot of the r1bmi values: departures from a straight line
# indicate departures from normality.
qqnorm(hrs.sub)

# Mean of the r1bmi values, for comparison with the sampling-distribution
# means computed further down.
mean(hrs.sub)

## [1] 27.09804

Draw 10,000 random samples of size n = 2 from the same population

# Sampling distribution of the mean for n = 2: draw 10,000 samples of size 2
# from hrs.sub (without replacement within a sample) and keep each sample's
# mean. The result is a numeric vector of length 10,000.
means2 <- vapply(
  seq_len(10000),
  function(i) mean(sample(hrs.sub, 2, replace = FALSE)),
  numeric(1)
)

Histogram, qqplot, and mean of sampling distribution of 10,000 samples with sample size 2

# Histogram of the 10,000 sample means for n = 2.
hist(means2)

# Normal Q-Q plot of the 10,000 sample means for n = 2.
qqnorm(means2)

# Mean of the 10,000 sample means for n = 2.
mean(means2)

## [1] 27.11753

Draw 10,000 random samples of size n = 10 from the same population

# Sampling distribution of the mean for n = 10: draw 10,000 samples of size 10
# from hrs.sub (without replacement within a sample) and keep each sample's
# mean. The result is a numeric vector of length 10,000.
means10 <- vapply(
  seq_len(10000),
  function(i) mean(sample(hrs.sub, 10, replace = FALSE)),
  numeric(1)
)

Histogram, qqplot, and mean of sampling distribution of 10,000 samples with sample size 10

# Mean of the 10,000 sample means for n = 10.
mean(means10)

## [1] 27.07383

# Histogram of the 10,000 sample means for n = 10.
hist(means10)

# Normal Q-Q plot of the 10,000 sample means for n = 10.
qqnorm(means10)

Draw 10,000 random samples of size n = 1000 from the same population

# Sampling distribution of the mean for n = 1000: draw 10,000 samples of size
# 1000 from hrs.sub (without replacement within a sample) and keep each
# sample's mean. The result is a numeric vector of length 10,000.
means1k <- vapply(
  seq_len(10000),
  function(i) mean(sample(hrs.sub, 1000, replace = FALSE)),
  numeric(1)
)

Histogram, qqplot, and mean of sampling distribution of 10,000 samples with sample size 1000

# Mean of the 10,000 sample means for n = 1000.
mean(means1k)

## [1] 27.098

# Histogram of the 10,000 sample means for n = 1000.
hist(means1k)

# Normal Q-Q plot of the 10,000 sample means for n = 1000.
qqnorm(means1k)

# 2 x 3 panel comparing the three sampling distributions: histograms on the
# top row, normal Q-Q plots on the bottom row, one column per sample size
# (n = 2, 10, 1000). par() returns the previous settings so they can be
# restored once the panel is drawn.
old_par <- par(mfrow = c(2, 3))
hist(means2)
hist(means10)
hist(means1k)
# Each Q-Q plot is titled with the sample size of the data it shows and is
# drawn in the same column as the matching histogram above it.
qqnorm(means2, main = "n = 2")
qqnorm(means10, main = "n = 10")
qqnorm(means1k, main = "n = 1000")
par(old_par)

7. This confirms the two properties of the Central Limit Theorem. As the sample size increases, the spread of the sample means decreases (the standard error shrinks) while their mean stays close to the population mean, and the Q-Q plots show that the sampling distribution is closest to normal when n = 1000.