1 Input Data

1.1 Load data

# Read the protein supply data set directly from the course repository.
protein <- read.csv(
  file = "https://raw.githubusercontent.com/pengdsci/sta321/main/ww02/w02-Protein_Supply_Quantity_Data.csv",
  header = TRUE
)
# Optional sanity checks: head(protein); dim(protein)

# Draw the "original sample": 170 values of the Pulses column, with no
# observation picked more than once.
original.sample <- sample(x = protein$Pulses, size = 170, replace = FALSE)

# Empirical 2.5th and 97.5th percentiles of the sampled values themselves
# (not of any estimator computed from them).
CI <- quantile(x = original.sample, probs = c(0.025, 0.975))
CI
##       2.5%      97.5% 
##  0.0146425 10.6659875

1.2 Bootstrap Sampling

# Draw ONE bootstrap sample and report its mean.
#
# A bootstrap sample is a resample of the original sample: same size, drawn
# WITH replacement. replace = TRUE is essential here -- a same-size draw
# without replacement is only a reordering of the source values, so its mean
# would be identical on every draw.
bt.sample <- sample(
  original.sample,                 # resample the original sample
  size = length(original.sample),  # bootstrap size == original sample size
  replace = TRUE                   # sampling with replacement
)

bt.mean <- mean(bt.sample)  # mean of this single bootstrap sample
bt.mean
## [1] 2.656114
# Bootstrap the sampling distribution of the sample mean and derive a 95%
# percentile interval from it.
#
# The resampling is based on the existing `original.sample` (drawn once, in
# section 1.1) rather than on a freshly drawn one, so the bootstrap interval
# and the raw-sample percentiles describe the same sample.
n.obs  <- length(original.sample)  # each bootstrap sample matches this size
n.boot <- 1000                     # number of bootstrap replicates

# Preallocate the result vector instead of growing it inside the loop.
bt.sampme.avg <- numeric(n.boot)

### Bootstrap sampling begins
for (i in seq_len(n.boot)) {
  # i-th bootstrap sample: same size as the original, WITH replacement.
  ith.bt.sample <- sample(original.sample, size = n.obs, replace = TRUE)
  # Store the mean of the i-th bootstrap sample.
  bt.sampme.avg[i] <- mean(ith.bt.sample)
}

# 95% bootstrap percentile interval for the mean: the 2.5th and 97.5th
# percentiles of the bootstrap means.
CI <- quantile(bt.sampme.avg, probs = c(0.025, 0.975))
CI
##     2.5%    97.5% 
## 2.214961 3.113823

##plot

# Histogram of the bootstrap sample means, used as a visual check of whether
# their distribution looks roughly normal.
hist(
  x = bt.sampme.avg,
  main = "Bootstrap Sampling Distribution \n of Sample Means",
  xlab = "Bootstrap sample means",
  breaks = 23
)

1.3 Comparison

When reviewing the CI from bootstrapping, I found a 95% CI of [2.244399, 3.103593]. When using the original sample, my 95% interval was [0.0146425, 10.6659875]. Comparing these two intervals, it is apparent that the bootstrap CI is much narrower and therefore more useful for creating an estimate from the Protein data set, specifically for pulses. The histogram and the bootstrap confidence interval both suggest that the bootstrap sample means are approximately normally distributed.