pawnee <- read.csv("pawnee.csv", header = TRUE)
# 1(a)
# Look at the first 6 rows of the data frame
head(pawnee)
## ID Latitude Longitude Arsenic Sulfur New_hlth_issue
## 1 1 41.09414 -85.60974 0 0 N
## 2 2 41.09054 -85.70344 0 130 N
## 3 3 41.08601 -85.71996 4 170 N
## 4 4 41.08100 -85.75415 0 0 Y
## 5 5 41.07435 -85.70043 0 0 N
## 6 6 41.07399 -85.71788 0 0 N
# view dimension
dim(pawnee)
## [1] 541 6
# 1(b)
# Set the seed for reproducibility
set.seed(1337)
# sample 30 row indices
sample_index = sample(nrow(pawnee), size = 30)
# Use the indies to extract the sampled rows
pawnee_sample = pawnee[sample_index, ]
# Confirm sample was saved correctly
head(pawnee_sample)
## ID Latitude Longitude Arsenic Sulfur New_hlth_issue
## 147 147 41.03971 -85.72783 2 100 N
## 49 49 41.06113 -85.65553 0 0 Y
## 210 210 41.03178 -85.64253 0 0 N
## 356 356 41.01178 -85.66516 0 0 N
## 425 425 41.00096 -85.72899 0 0 N
## 239 239 41.02772 -85.72901 0 0 N
# 1(c)
# Sample mean arsenic level
mean_arsenic_sample = mean(pawnee_sample$Arsenic)
mean_arsenic_sample
## [1] 0.85
# Proportion
# A logical vector (TRUE/FALSE) is treated as 1/0 inside mean()
prop_hlth_sample = mean(pawnee_sample$New_hlth_issue == "Y")
prop_hlth_sample
## [1] 0.2
#1(d)
# The sample mean is denoted x_bar
# The sample proportion is denoted p_hat
# 1(e)
# Sample size
n = 30
#Standard error of the sample proportion
SE = sqrt(prop_hlth_sample * (1 - prop_hlth_sample) / n)
SE
## [1] 0.07302967
# Critical z-value for 3 confidence levels
z90 = 1.645
z95 = 1.96
z99 = 2.576
# 3 confidence intervals
CI_90 = prop_hlth_sample + c(-1, 1) * z90 * SE
CI_95 = prop_hlth_sample + c(-1, 1) * z95 * SE
CI_99 = prop_hlth_sample + c(-1, 1) * z99 * SE
CI_90
## [1] 0.07986619 0.32013381
CI_95
## [1] 0.05686184 0.34313816
CI_99
## [1] 0.01187556 0.38812444
# 1(f)
# A 100% confidence interval must contain every possible value of the parameter. Since a proportion is always between 0 and 1, the 100% CI is [0, 1].
# 1(g)
pop_prop = mean(pawnee$New_hlth_issue == "Y")
pop_prop
## [1] 0.2920518
# 1(h)
hist(pawnee$Arsenic,
breaks = 40,
col = "lightgreen",
border = "pink",
main = "Distribution of Arsenic Levels in Pawnee",
xlab = "Arsenic (ppm)")

# The distribution is strongly right-skewed with most household near 0 ppm.
#2(a)
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following object is masked from 'package:Matrix':
##
## mean
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
n = 30 #sample size for each repetition
N = 541 #population size
M = 1000 #number of repetitions
#Vector to store the sample proportion from each repetition
phats = numeric(M)
#Set the seed
set.seed(123)
# for-loop
for(i in seq_len(M)) {
#Sample 30 row indices from the population
index = sample(N, size = n)
#Extract those rows as the i-th sample
pawnee <- read.csv("pawnee.csv")
sample_i = pawnee[index, ]
#Compute the proportion of "Y" health issues in this sample
phats[i] = mean(sample_i$New_hlth_issue =="Y")
}
# Plot histogram
histogram(~phats,
col = "yellow",
border = "orange",
type = "density", # density = relative frequency
fit = "normal",
xlab = "Sample proportion (p-hat)",
main = "Sampling distribution of p-hat (n = 30)")

#2(b)
mean(phats)
## [1] 0.2928
sd(phats)
## [1] 0.07951963
#2c
#Roughly yes, but it is slightly right-skewed.
#2(d)
#Population proportion
p = mean(pawnee$New_hlth_issue =="Y")
#The CLT-based prediction for the sampling distribution of p-hat
clt_mean = p
clt_sd = sqrt(p*(1-p)/n)
clt_mean
## [1] 0.2920518
clt_sd
## [1] 0.08301757
#The simulation values in 2(b) match the CLT predictions very closely.
#3(a)
library(mosaic)
n = 30 #sample size for each repetition
N = 514 #population size
M = 1000 #number of repetitions
#Vector to store the sample proportion from each repetition
xbars = numeric(M)
#Set the seed
set.seed(123)
for(i in seq_len(M)) {
index = sample(N, size = n)
sample_i = pawnee [index, ]
#Sample mean of arsenic for this repetition
xbars[i] = mean(sample_i$Arsenic)
}
#3(b)
histogram(~ xbars,
col = "purple",
border = "lightblue",
type = "density",
fit = "normal",
xlab = "Sample mean arsenic (ppm)",
main = "Sampling Distribution of x-bar (n = 30")

#Sample
mean(xbars)
## [1] 3.220804
sd(xbars)
## [1] 2.229346
#Population
mean(pawnee$Arsenic)
## [1] 3.383272
sd(pawnee$Arsenic) / sqrt(n) #CLT-predicted standard error
## [1] 2.448554
#3(c)
#No, it is clearly right-skewed. It is different with exercise (2c)
#because of the population shape. In exercise 1(h), we saw the
#population of arsenic levels is extremely right-skewed.