# Load the protein supply quantity data from the CSV file at the URL below.
# The first row of the file is read as the column names.
protein <- read.csv(
  file = "https://raw.githubusercontent.com/pengdsci/sta321/main/ww02/w02-Protein_Supply_Quantity_Data.csv",
  header = TRUE
)
# Quick inspection of the loaded data (left disabled):
# head(protein)
# dim(protein)
# Draw 170 values of the Pulses column without replacement and keep them as
# original.sample.
original.sample <- sample(
  x = protein$Pulses,
  size = 170,
  replace = FALSE
)
# 2.5% and 97.5% quantiles of the sampled values themselves, stored in CI and
# printed.
CI <- quantile(x = original.sample, probs = c(0.025, 0.975))
CI
# Output recorded from one run:
## 2.5% 97.5%
## 0.0146425 10.6659875
# Draw ONE bootstrap sample and report its mean.
#
# A bootstrap sample is a resample of the original sample taken WITH
# replacement and with the same number of values as the original sample.
# Sampling without replacement at the full sample size would only reorder the
# values, so every such "bootstrap" mean would be identical.
bt.sample <- sample(
  original.sample,          # resample the original sample, not the raw data
  length(original.sample),  # bootstrap sample size = original sample size
  replace = TRUE            # sampling with replacement
)
bt.mean <- mean(bt.sample)
bt.mean
# Example output recorded from a previous run; the value differs between runs.
## [1] 2.656114
# Bootstrap percentile interval for the mean of Pulses.
#
# Steps: draw an original sample of 170 values, resample it with replacement
# n.boot times, store the mean of every resample in bt.sampme.avg, and take
# the 2.5% and 97.5% quantiles of those means as the interval CI.
# NOTE(review): no set.seed() call is visible in this script, so the numbers
# printed below are expected to change from run to run.

n.boot <- 1000  # number of bootstrap resamples

# NOTE(review): this draws original.sample again and overwrites any vector of
# that name created earlier in the script -- confirm that a fresh draw is
# intended here.
original.sample <- sample(protein$Pulses, # Protein intake
                          170,            # sample size = 170 values in the sample
                          replace = FALSE # sample without replacement
)
n.obs <- length(original.sample)  # every resample must have this same size

# Not filled in this section; kept as NULL only so that any code expecting
# this name still finds it.
bt.sample.mean.vec <- NULL

### Bootstrap sampling begins
# Preallocate the result vector instead of growing it inside the loop.
bt.sampme.avg <- numeric(n.boot)
for (i in seq_len(n.boot)) {
  # i-th bootstrap sample: same size as the original sample, WITH replacement
  ith.bt.sample <- sample(original.sample, n.obs, replace = TRUE)
  # mean of the i-th bootstrap sample, saved in bt.sampme.avg
  bt.sampme.avg[i] <- mean(ith.bt.sample)
}
CI <- quantile(bt.sampme.avg, c(0.025, 0.975))
CI
# Output recorded from one run:
## 2.5% 97.5%
## 2.214961 3.113823
## Plot
# Histogram of the bootstrap sample means held in bt.sampme.avg, drawn to
# check visually whether their distribution looks approximately normal.
hist(
  x = bt.sampme.avg,
  main = "Bootstrap Sampling Distribution \n of Sample Means",
  xlab = "Bootstrap sample means",
  breaks = 23
)
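When reviewing the CI from bootstrapping, I found a 95% CI of [2.244399, 3.103593]. When using the original sample, my 95% interval was [0.0146425, 10.6659875]. Comparing these two intervals, it is apparent that the bootstrap CI is much narrower and therefore more useful for estimating the mean of the Pulses variable in the protein data set. Note that the interval from the original sample is formed from the 2.5% and 97.5% quantiles of the individual values, so it describes the spread of the data rather than the uncertainty in the mean, which is why it is so much wider. The histogram and the bootstrap confidence interval both show that the bootstrap sample means are approximately normally distributed.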