pawnee <- read.csv("pawnee.csv", header = TRUE)
# 1(a)
# Look at the first 6 rows of the data frame
head(pawnee)
##   ID Latitude Longitude Arsenic Sulfur New_hlth_issue
## 1  1 41.09414 -85.60974       0      0              N
## 2  2 41.09054 -85.70344       0    130              N
## 3  3 41.08601 -85.71996       4    170              N
## 4  4 41.08100 -85.75415       0      0              Y
## 5  5 41.07435 -85.70043       0      0              N
## 6  6 41.07399 -85.71788       0      0              N
# view dimension
dim(pawnee)
## [1] 541   6
# 1(b)
# Set the seed for reproducibility
set.seed(1337)

# sample 30 row indices
sample_index = sample(nrow(pawnee), size = 30)
# Use the indies to extract the sampled rows
pawnee_sample = pawnee[sample_index, ]
# Confirm sample was saved correctly
head(pawnee_sample)
##      ID Latitude Longitude Arsenic Sulfur New_hlth_issue
## 147 147 41.03971 -85.72783       2    100              N
## 49   49 41.06113 -85.65553       0      0              Y
## 210 210 41.03178 -85.64253       0      0              N
## 356 356 41.01178 -85.66516       0      0              N
## 425 425 41.00096 -85.72899       0      0              N
## 239 239 41.02772 -85.72901       0      0              N
# 1(c)
# Sample mean arsenic level
mean_arsenic_sample = mean(pawnee_sample$Arsenic)
mean_arsenic_sample
## [1] 0.85
# Proportion
# A logical vector (TRUE/FALSE) is treated as 1/0 inside mean()
prop_hlth_sample = mean(pawnee_sample$New_hlth_issue == "Y")
prop_hlth_sample
## [1] 0.2
#1(d)
# The sample mean is denoted x_bar
# The sample proportion is denoted p_hat
# 1(e)
# Sample size
n = 30

#Standard error of the sample proportion
SE = sqrt(prop_hlth_sample * (1 - prop_hlth_sample) / n)
SE
## [1] 0.07302967
# Critical z-value for 3 confidence levels
z90 = 1.645
z95 = 1.96
z99 = 2.576

# 3 confidence intervals
CI_90 = prop_hlth_sample + c(-1, 1) * z90 * SE
CI_95 = prop_hlth_sample + c(-1, 1) * z95 * SE
CI_99 = prop_hlth_sample + c(-1, 1) * z99 * SE
CI_90
## [1] 0.07986619 0.32013381
CI_95
## [1] 0.05686184 0.34313816
CI_99
## [1] 0.01187556 0.38812444
# 1(f)
# A 100% confidence interval must contain every possible value of the parameter. Since a proportion is always between 0 and 1, the 100% CI is [0, 1].
# 1(g)
pop_prop = mean(pawnee$New_hlth_issue == "Y")
pop_prop
## [1] 0.2920518
# 1(h)
hist(pawnee$Arsenic,
       breaks = 40,
     col = "lightgreen",
     border = "pink",
     main = "Distribution of Arsenic Levels in Pawnee",
     xlab = "Arsenic (ppm)")

# The distribution is strongly right-skewed with most household near 0 ppm.
#2(a)
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
n = 30 #sample size for each repetition
N = 541 #population size
M = 1000 #number of repetitions

#Vector to store the sample proportion from each repetition

phats = numeric(M)

#Set the seed
set.seed(123)

# for-loop
for(i in seq_len(M)) {
    #Sample 30 row indices from the population
  index = sample(N, size = n)
  #Extract those rows as the i-th sample
  pawnee <- read.csv("pawnee.csv")
  sample_i = pawnee[index, ]
  #Compute the proportion of "Y" health issues in this sample
  phats[i] = mean(sample_i$New_hlth_issue =="Y")
}

# Plot histogram
histogram(~phats,
          col = "yellow",
          border = "orange",
          type = "density", # density = relative frequency
          fit = "normal",
          xlab = "Sample proportion (p-hat)",
          main = "Sampling distribution of p-hat (n = 30)")

#2(b)
mean(phats)
## [1] 0.2928
sd(phats)
## [1] 0.07951963
#2c
#Roughly yes, but it is slightly right-skewed.
#2(d)
#Population proportion
p = mean(pawnee$New_hlth_issue =="Y")

#The CLT-based prediction for the sampling distribution of p-hat

clt_mean = p
clt_sd = sqrt(p*(1-p)/n)
clt_mean
## [1] 0.2920518
clt_sd
## [1] 0.08301757
#The simulation values in 2(b) match the CLT predictions very closely.
#3(a)
library(mosaic)

n = 30 #sample size for each repetition
N = 514 #population size
M = 1000 #number of repetitions

#Vector to store the sample proportion from each repetition

xbars = numeric(M)

#Set the seed
set.seed(123)
for(i in seq_len(M)) {
  index = sample(N, size = n)
  sample_i = pawnee [index, ]
  #Sample mean of arsenic for this repetition
  xbars[i] = mean(sample_i$Arsenic)
}
#3(b)
histogram(~ xbars,
          col = "purple",
          border = "lightblue",
           type = "density",
           fit = "normal", 
           xlab = "Sample mean arsenic (ppm)",
           main = "Sampling Distribution of x-bar (n = 30")

#Sample
mean(xbars)
## [1] 3.220804
sd(xbars)
## [1] 2.229346
#Population
mean(pawnee$Arsenic)
## [1] 3.383272
sd(pawnee$Arsenic) / sqrt(n) #CLT-predicted standard error
## [1] 2.448554
#3(c)
#No, it is clearly right-skewed. It is different with exercise (2c)
#because of the population shape. In exercise 1(h), we saw the
#population of arsenic levels is extremely right-skewed.