Chapter Five Homework

Consider a population that has a normal distribution with mean $\mu = 36$, standard deviation $\sigma = 8$.
1. The sampling distribution of $\bar{X}$ for samples of size 200 will have what distribution, mean, and standard error?
2. Use R to draw a random sample of size $200$ from this population. Conduct EDA on your sample.
3. Compute the bootstrap distribution for your sample mean, and note the bootstrap mean and standard error.
4. Compare the bootstrap distribution to the theoretical sampling distribution by creating a table like Table 5.2.
5. Repeat parts a-d for sample sizes of $n = 10$ and $n = 50$. Carefully describe your observations about the effects of sample size on the bootstrap distribution.

Your answers:

$\bar{X} \sim N(36, \frac{8}{\sqrt{200}})$

set.seed(13)
Sam <- rnorm(200,36, 8)
hist(Sam)

qqnorm(Sam)

xbar <- mean(Sam)
SD <- sd(Sam)

Sam_s <- c(mean(Sam), sd(Sam))
names(Sam_s) <- list("Mean", "SD")
Sam_s

    Mean       SD 
35.87853  8.17609

B <- 10^4
bsmean <- numeric(B)
for(i in 1:B){
  bsmean[i] <- mean(sample(Sam, 200, replace = TRUE))
}
bsxbar <- mean(bsmean)
bsse <- sd(bsmean)

BS_s <- c(bsxbar, bsse)
names(BS_s) <- list("BS Mean", "BS SE")
BS_s

   BS Mean      BS SE 
35.8795634  0.5815149

bs1 <- matrix(c(36, 36, xbar, bsxbar, 8, 8/sqrt(200), SD, bsse), nrow = 4)
dimnames(bs1) <- list("n=200" = c("Population", "Sampling", "Sample", "Bootstrap"), Table_1 = c("Mean", "SD"))
bs1

            Table_1
n=200            Mean        SD
  Population 36.00000 8.0000000
  Sampling   36.00000 0.5656854
  Sample     35.87853 8.1760899
  Bootstrap  35.87956 0.5815149

Parts a-d for $n = 10$:

$\bar{X} \sim N(36, \frac{8}{\sqrt{10}})$

Sam1 <- rnorm(10, 36, 8/sqrt(10))
hist(Sam1)

qqnorm(Sam1)

xbar1 <- mean(Sam1)
SD1 <- sd(Sam1)

Sam_s1 <- c(mean(Sam1), sd(Sam1))
names(Sam_s1) <- list("Mean", "SD")
Sam_s1

     Mean        SD 
36.959024  3.149675

set.seed(31)
B <- 10^4
bsmean1 <- numeric(B)
for(i in 1:B){
  bsmean1[i] <- mean(sample(Sam1, 10, replace = TRUE))
}
bsxbar1 <- mean(bsmean1)
bsse1 <- sd(bsmean1)

BS_s1 <- c(bsxbar1, bsse1)
names(BS_s1) <- list("BS Mean", "BS SE")
BS_s1

   BS Mean      BS SE 
36.9678585  0.9489149

bs2 <- matrix(c(36, 36, xbar1, bsxbar1, 8, 8/sqrt(10), SD1, bsse1), nrow = 4)
dimnames(bs2) <- list("n=200" = c("Population", "Sampling", "Sample", "Bootstrap"), Table_2 = c("Mean", "SD"))
bs2

            Table_2
n=200            Mean        SD
  Population 36.00000 8.0000000
  Sampling   36.00000 2.5298221
  Sample     36.95902 3.1496754
  Bootstrap  36.96786 0.9489149

Parts a-d for $n = 50$:

${X} N(36, )

set.seed(31)
Sam2 <- rnorm(50, 36, 8/sqrt(50))
hist(Sam2)

qqnorm(Sam2)

xbar2 <- mean(Sam2)
SD2 <- sd(Sam2)

Sam_s2 <- c(mean(Sam2), sd(Sam2))
names(Sam_s2) <- list("Mean", "SD")
Sam_s2

     Mean        SD 
35.780626  1.062875

set.seed(31)
B <- 10^4
bsmean2 <- numeric(B)
for(i in 1:B){
  bsmean2[i] <- mean(sample(Sam2, 50, replace = TRUE))
}
bsxbar2 <- mean(bsmean2)
bsse2 <- sd(bsmean2)

BS_s2 <- c(bsxbar2, bsse2)
names(BS_s2) <- list("BS Mean", "BS SE")
BS_s2

   BS Mean      BS SE 
35.7818745  0.1493988

bs3 <- matrix(c(36, 36, xbar2, bsxbar2, 8, 8/sqrt(50), SD2, bsse2), nrow = 4)
dimnames(bs3) <- list("n=200" = c("Population", "Sampling", "Sample", "Bootstrap"), Table_3 = c("Mean", "SD"))
bs3

            Table_3
n=200            Mean        SD
  Population 36.00000 8.0000000
  Sampling   36.00000 1.1313708
  Sample     35.78063 1.0628750
  Bootstrap  35.78187 0.1493988

As $n$ increases, the distribution becomes more normal and closer to resembling the original distribution.

We investigate the bootstrap distribution of the median. Create random samples of size $n$ for various $n$ and bootstrap the median. Describe the bootstrap distribution. Change the sample sizes to 36 and 37; 200 and 201; 10,000 and 10,001. Note the similarities/dissimilarities, trends and so on. Why does the parity of the sample size matter?

set.seed(31)
ne <- 14 # n even
no <- 15 # n odd

wwe <- rnorm(ne) # draw random sample of size ne
wwo <- rnorm(no) # draw random sample of size no

N <- 10^4
even.boot <- numeric(N) # save space
odd.boot <- numeric(N)
for (i in 1:N)
{
  x.even <- sample(wwe, ne, replace = TRUE)
  x.odd <- sample(wwo, no, replace = TRUE)
  even.boot[i] <- median(x.even)
  odd.boot[i] <- median(x.odd)
}

Median <- c(even.boot, odd.boot)
Parity <- rep(c("n = 14", "n = 15"), each = N)
DF <- data.frame(Median = Median, Parity = Parity)

ggplot(data = DF, aes(x = Median)) + 
  geom_histogram(fill = "lightblue", color = "black") + 
  theme_bw() + 
  facet_grid(Parity ~.)

Figure 1: Histograms of bootstrapped median values

set.seed(31)
ne <- 36 # n even
no <- 37 # n odd

wwe <- rnorm(ne) # draw random sample of size ne
wwo <- rnorm(no) # draw random sample of size no

N <- 10^4
even.boot <- numeric(N) # save space
odd.boot <- numeric(N)
for (i in 1:N)
{
  x.even <- sample(wwe, ne, replace = TRUE)
  x.odd <- sample(wwo, no, replace = TRUE)
  even.boot[i] <- median(x.even)
  odd.boot[i] <- median(x.odd)
}

Median <- c(even.boot, odd.boot)
Parity <- rep(c("n = 36", "n = 37"), each = N)
DF <- data.frame(Median = Median, Parity = Parity)

ggplot(data = DF, aes(x = Median)) + 
  geom_histogram(fill = "lightblue", color = "black") + 
  theme_bw() + 
  facet_grid(Parity ~.)

Figure 2: Histograms of bootstrapped median values

set.seed(31)
ne <- 200 # n even
no <- 201 # n odd

wwe <- rnorm(ne) # draw random sample of size ne
wwo <- rnorm(no) # draw random sample of size no

N <- 10^4
even.boot <- numeric(N) # save space
odd.boot <- numeric(N)
for (i in 1:N)
{
  x.even <- sample(wwe, ne, replace = TRUE)
  x.odd <- sample(wwo, no, replace = TRUE)
  even.boot[i] <- median(x.even)
  odd.boot[i] <- median(x.odd)
}

Median <- c(even.boot, odd.boot)
Parity <- rep(c("n = 200", "n = 201"), each = N)
DF <- data.frame(Median = Median, Parity = Parity)

ggplot(data = DF, aes(x = Median)) + 
  geom_histogram(fill = "lightblue", color = "black") + 
  theme_bw() + 
  facet_grid(Parity ~.)

Figure 3: Histograms of bootstrapped median values

set.seed(31)
ne <- 10000 # n even
no <- 10001 # n odd

wwe <- rnorm(ne) # draw random sample of size ne
wwo <- rnorm(no) # draw random sample of size no

N <- 10^4
even.boot <- numeric(N) # save space
odd.boot <- numeric(N)
for (i in 1:N)
{
  x.even <- sample(wwe, ne, replace = TRUE)
  x.odd <- sample(wwo, no, replace = TRUE)
  even.boot[i] <- median(x.even)
  odd.boot[i] <- median(x.odd)
}

Median <- c(even.boot, odd.boot)
Parity <- rep(c("n = 10,000", "n = 10,001"), each = N)
DF <- data.frame(Median = Median, Parity = Parity)

ggplot(data = DF, aes(x = Median)) + 
  geom_histogram(fill = "lightblue", color = "black") + 
  theme_bw() + 
  facet_grid(Parity ~.)

Figure 4: Histograms of bootstrapped median values

As the sample size $n$ increases, the spread of the distribution decreases. Note that the scale goes from -1 to 1 on the graphs with $n = 14$ and $n = 15$, but decreases to a range of -0.08 to 0.04 on the graphs with $n = 10,000$ and $n = 10,001$. The amount of variation between $n$ and $n+1$ also drastically decreases. Much of $n = 14$ is loosely distributed while $n = 15$ is heavily concentrated around the median. The graphs for $n = 36$ and $n = 37$ are similarly conflicting in their amount of variation. Above $n = 200$, this concern is cleared up, however. All of the graphs demonstrate a roughly symmetric, bell-shaped curve.

Import the data from data set Bangladesh. In addition to arsenic concentrations for 271 wells, the data set contains cobalt and chlorine concentrations.
1. Conduct EDA on the chlorine concentrations and describe the salient features.
2. Bootstrap the mean.
3. Find and interpret the 95% bootstrap percentile confidence interval.
4. What is the bootstrap estimate of the bias? What fraction of the bootstrap standard error does it represent?

Bangladesh <- read.csv("http://www1.appstate.edu/~arnholta/Data/Bangladesh.csv")
head(Bangladesh)

  Arsenic Chlorine Cobalt
1    2400      6.2   0.42
2       6    116.0   0.45
3     904     14.8   0.63
4     321     35.9   0.68
5    1280     18.9   0.58
6     151      7.8   0.35

The Chlorine variable has some missing values. The following code will remove these entries:

chlorine <- subset(Bangladesh, select = Chlorine, subset = !is.na(Chlorine), drop = TRUE)

Your answers:

hist(chlorine)

qqnorm(chlorine)

med <- median(chlorine)
IQR <- IQR(chlorine)

chlorine_eda <- c(med, IQR)
names(chlorine_eda) <- list("Med", "IQR")
chlorine_eda

 Med  IQR 
14.2 50.5

This is an extreme right-skewed function, hence the use of median and interquartile range to describe center and spread.

Bangladesh1 <- Bangladesh %>% 
  filter(!is.na(Chlorine))
library(boot)
bs.mean <- function(data, i){
  d <- data[i]
  M <- mean(d)
  M
  }
boot.obj <- boot(data = Bangladesh1$Chlorine, statistic = bs.mean, R = 10^4-1)
boot.obj


ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = Bangladesh1$Chlorine, statistic = bs.mean, R = 10^4 - 
    1)


Bootstrap Statistics :
    original     bias    std. error
t1* 78.08401 -0.1172602    12.76439

Thus the bootstrapped mean is 78.08401.

boot.ci(boot.obj, type = "perc", conf = 0.95)

BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 9999 bootstrap replicates

CALL : 
boot.ci(boot.out = boot.obj, conf = 0.95, type = "perc")

Intervals : 
Level     Percentile     
95%   ( 54.75, 104.43 )  
Calculations and Intervals on Original Scale

There is a 95% chance that our bootstrapped mean falls between 54.75 and 104.43.

From the R code in part b, we know that the bias is -0.1172602.

Consider Bangladesh chlorine (concentration). Bootstrap the trimmed mean (say, trim the upper and lower 25%) and compare your results with the usual mean (previous exercise).

Your answer:

bs.trim.mean <- function(data, i){
  d <- data[i]
  M <- mean(d, .25)
  M
  }
boot.obj.1 <- boot(data = Bangladesh1$Chlorine, statistic = bs.trim.mean, R = 10^4-1)
boot.obj.1


ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = Bangladesh1$Chlorine, statistic = bs.trim.mean, R = 10^4 - 
    1)


Bootstrap Statistics :
    original    bias    std. error
t1*  17.6363 0.1958157    2.439056

This mean is much lower than the previous mean, with a difference of 60.44771.

The data set FishMercury contains mercury levels (parts per million) for 30 fish caught in lakes in Minnesota.
1. Create a histogram or boxplot of the data. What do you observe?
2. Bootstrap the mean and record the bootstrap standard error and the 95% bootstrap percentile interval.
3. Remove the outlier and bootstrap the mean of the remaining data. Record the bootstrap standard error and the 95% bootstrap percentile interval.
4. What effect did removing the outlier have on the bootstrap distribution, in particular, the standard error?

FishMercury <- read.csv("http://www1.appstate.edu/~arnholta/Data/FishMercury.csv")
head(FishMercury)

Your answers:

ggplot(FishMercury, aes(x = Mercury)) +
  geom_histogram(binwidth = .1)

boxplot(FishMercury)

Note that there is one value (1.87) very far removed from the rest of the values.

bs.mean <- function(data, i){
  d <- data[i]
  M <- mean(d)
  M
  }
boot.obj <- boot(data = FishMercury$Mercury, statistic = bs.mean, R = 10^4-1)
boot.obj


ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = FishMercury$Mercury, statistic = bs.mean, R = 10^4 - 
    1)


Bootstrap Statistics :
     original       bias    std. error
t1* 0.1818667 3.744374e-05  0.05727569

boot.ci(boot.obj, type = "perc", conf = 0.95)

BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 9999 bootstrap replicates

CALL : 
boot.ci(boot.out = boot.obj, conf = 0.95, type = "perc")

Intervals : 
Level     Percentile     
95%   ( 0.1125,  0.3066 )  
Calculations and Intervals on Original Scale

Thus the bootstrapped mean is 0.1818667 and the 95% confidence interval is (0.1125, 0.3066).

FishMercury1 <- FishMercury %>% 
  filter(Mercury < 1)
bs.mean <- function(data, i){
  d <- data[i]
  M <- mean(d)
  M
  }
boot.obj <- boot(data = FishMercury1$Mercury, statistic = bs.mean, R = 10^4-1)
boot.obj


ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = FishMercury1$Mercury, statistic = bs.mean, R = 10^4 - 
    1)


Bootstrap Statistics :
     original        bias    std. error
t1* 0.1236552 -9.037111e-05   0.0078916

boot.ci(boot.obj, type = "perc", conf = 0.95)

BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 9999 bootstrap replicates

CALL : 
boot.ci(boot.out = boot.obj, conf = 0.95, type = "perc")

Intervals : 
Level     Percentile     
95%   ( 0.1077,  0.1390 )  
Calculations and Intervals on Original Scale

After removing the outlier, we now have the bootstrapped standard error of 0.0078916 and the 95% bootstrapped confidence interval of (0.1077, 0.1390).

Removing the outlier significantly decreased the standard error and thus lessened the range of the confidence interval. The original standard error of 0.05727569 allows for a confidence interval range of about 0.2 while the standard error with the outlier removed of 0.0078916 generated a confidence interval range of only approximately 0.02.

In section 3.3, we performed a permutation test to determine if men and women consumed, on average, different amounts of hot wings.
1. Bootstrap the difference in means and describe the bootstrap distribution.
2. Find a 95% bootstrap percentile confidence interval for the difference of means and give a sentence interpreting this interval.
3. How do the bootstrap and permutation distribution differ?

BeerWings <- read.csv("http://www1.appstate.edu/~arnholta/Data/Beerwings.csv")
head(BeerWings)

  ID Hotwings Beer Gender
1  1        4   24      F
2  2        5    0      F
3  3        5   12      F
4  4        6   12      F
5  5        7   12      F
6  6        7   12      F

Your answers:

wings.f <- subset(BeerWings, select = Hotwings, subset = Gender == "F", drop = TRUE)
wings.m <- subset(BeerWings, select = Hotwings, subset = Gender == "M", drop = TRUE)

N <- 10^4
wings.diff.mean <- numeric(N)
for(i in 1:N){
  f.sample <- sample(wings.f, 15, replace = TRUE)
  m.sample <- sample(wings.m, 15, replace = TRUE)
  wings.diff.mean[i] <- mean(f.sample) - mean(m.sample)
}

hist(wings.diff.mean)

qqnorm(wings.diff.mean)

The bootstrap distribution is approximately normal.

quantile(wings.diff.mean, c(0.025, 0.975))

     2.5%     97.5% 
-7.933333 -2.333333

We know with 95% confidence that the mean difference in wings eaten per gender is betweeen -7.93 and -2.33.

Since we sample with replacement for bootstrap distributions, the distribution is centered around the original observed mean difference rather than 0, as the null hypothesis for a permutation distribution would be.

Import the data from Girls2004 (see Section 1.2).
1. Perform some exploratory data analysis and obtain summary statistics on the weight of baby girls born in Wyoming and Arkansas (do seperate analyses for each state).
2. Bootstrap the difference in means, plot the distribution, and give the summary statistics. Obtain a 95% bootstrap percentile confidence interval and interpret this interval.
3. What is the bootstrap estimate of the bias? What fraction of the bootstrap standard error does it represent?
4. Conduct a permutation test to calculate the difference in mean weights and state your conclusion?
5. For what population(s), if any does this calculation hold? Explain?

Girls2004 <- read.csv("http://www1.appstate.edu/~arnholta/Data/Girls2004.csv")
head(Girls2004)

  ID State MothersAge Smoker Weight Gestation
1  1    WY      15-19     No   3085        40
2  2    WY      35-39     No   3515        39
3  3    WY      25-29     No   3775        40
4  4    WY      20-24     No   3265        39
5  5    WY      25-29     No   2970        40
6  6    WY      20-24     No   2850        38

Your answers:

weight.wy <- subset(Girls2004, select = Weight, subset = State == "WY", drop = TRUE)
weight.ak <- subset(Girls2004, select = Weight, subset = State == "AK", drop = TRUE)

hist(weight.wy)

as.data.frame(weight.wy) %>% 
  summarize(n(), mean(weight.wy), sd(weight.wy))

  n() mean(weight.wy) sd(weight.wy)
1  40          3207.9      418.3184

hist(weight.ak)

as.data.frame(weight.ak) %>% 
  summarize(n(), mean(weight.ak), sd(weight.ak))

  n() mean(weight.ak) sd(weight.ak)
1  40         3516.35      578.8336

N <- 10^4
weight.diff.mean <- numeric(N)
for(i in 1:N){
  wy.sample <- sample(weight.wy, 40, replace = TRUE)
  ak.sample <- sample(weight.ak, 40, replace = TRUE)
  weight.diff.mean[i] <- mean(wy.sample) - mean(ak.sample)
}

hist(weight.diff.mean)

qqnorm(weight.diff.mean)

mean(weight.diff.mean)

[1] -307.0004

sd(weight.diff.mean)

[1] 110.5607

quantile(weight.diff.mean, c(0.025, 0.975))

      2.5%      97.5% 
-523.69125  -91.92375

bias <- mean(weight.diff.mean) - (mean(weight.wy) - mean(weight.ak))

bias/sd(weight.diff.mean)

[1] 0.01311126

obs <- mean(weight.wy) - mean(weight.ak)
N <- 10^4
weight.diff.mean.p <- numeric(N)
for(i in 1:N){
  wy.sample <- sample(weight.ak, 40, replace = FALSE)
  ak.sample <- sample(weight.ak, 40, replace = FALSE)
  weight.diff.mean.p[i] <- mean(wy.sample) - mean(ak.sample)
}
p_value <- 2*(sum(weight.diff.mean.p >= obs) + 1) / (N + 1)
p_value

[1] 2

Since the $p-value$ is greater than $\alpha = 0.05$, we fail to reject the null hypothesis that there is no difference between the weight of girls in Wyoming and Arkansas.

This calculation holds for female children born in Wyoming and Arkansas born in 2004.

Do chocolate and vanilla ice creams have the same number of calories? The data set IceCream contains calorie information for a sample of brands of chocolate and vanilla ice cream. Use the bootstrap to determine whether or not there is a difference in the mean number of calories.

IceCream <- read.csv("http://www1.appstate.edu/~arnholta/Data/IceCream.csv")
head(IceCream)

           Brand VanillaCalories VanillaFat VanillaSugar ChocolateCalories
1 Baskin Robbins             260       16.0         26.0               260
2  Ben & Jerry's             240       16.0         19.0               260
3     Blue Bunny             140        7.0         12.0               130
4        Breyers             140        7.0         13.0               140
5      Brigham's             190       12.0         17.0               200
6          Bulla             234       13.5         21.8               266
  ChocolateFat ChocolateSugar
1           14           31.0
2           16           22.0
3            7           14.0
4            8           16.0
5           12           18.0
6           15           22.6

Your answer:

cal.ch <- subset(IceCream, select = ChocolateCalories, drop = TRUE)
cal.van <- subset(IceCream, select = VanillaCalories, drop = TRUE)

N <- 10^4
cal.diff.mean <- numeric(N)
for(i in 1:N){
  ch.sample <- sample(weight.wy, 39, replace = TRUE)
  van.sample <- sample(weight.ak, 39, replace = TRUE)
  cal.diff.mean[i] <- mean(ch.sample) - mean(van.sample)
}

quantile(cal.diff.mean, c(0.025, 0.975))

      2.5%      97.5% 
-530.79808  -86.08205

Since 0 does not fall within the confidence interval, we can say with 95 percent confidence that vanilla and chocolate ice cream do not have the same number of calories.

Import the data from Flight Delays Case Study in Section 1.1 data into R. Although the data are on all UA and AA flights flown in May and June of 2009, we will assume these represent a sample from a larger population of UA and AA flights flown under similar circumstances. We will consider the ratio of the means of the flight delay lengths, $\mu_{\text{UA}} / \mu_{\text{AA}}$.
1. Perform some exploratory data analysis on flight delay lengths for each of UA and AA flights.
2. Bootstrap the mean of flight delay lengths for each airline seperately and describe the distribution.
3. Bootstrap the ratio of means. Provide plots of the bootstrap distribution and describe the distribution.
4. Find the 95% bootstrap percentile interval for the ratio of means. Interpret this interval.
5. What is the bootstrap estimate of the bias? What fraction of the bootstrap standard error does it represent?
6. For inference in this text, we assume that the observations are independent. Is that condition met here? Explain.

FlightDelays <- read.csv("http://www1.appstate.edu/~arnholta/Data/FlightDelays.csv")

Your answers:

FlightDelays %>% 
  group_by(Carrier) %>% 
  summarize(Mean = mean(Delay), SD = sd(Delay), n = n())

# A tibble: 2 x 4
  Carrier     Mean       SD     n
   <fctr>    <dbl>    <dbl> <int>
1      AA 10.09738 40.08063  2906
2      UA 15.98308 45.13895  1123

ggplot(FlightDelays, aes(x = Delay)) +
  geom_histogram() + 
  facet_grid(Carrier ~.)

delay.ua <- subset(FlightDelays, select = Delay, subset = Carrier == "UA", drop = TRUE)

N <- 10^4
delay.ua.mean <- numeric(N)
for(i in 1:N){
  ua.sample <- sample(delay.ua, 1123, replace = TRUE)
  delay.ua.mean[i] <- mean(ua.sample)
}

hist(delay.ua.mean)

mean(delay.ua.mean)

[1] 15.97407

delay.aa <- subset(FlightDelays, select = Delay, subset = Carrier == "AA", drop = TRUE)

N <- 10^4
delay.aa.mean <- numeric(N)
for(i in 1:N){
  aa.sample <- sample(delay.aa, 2906, replace = TRUE)
  delay.aa.mean[i] <- mean(aa.sample)
}

hist(delay.aa.mean)

mean(delay.aa.mean)

[1] 10.08878

Both carriers have flight delays with a strong right skew.

N <- 10^4
delay.mean.ratio <- numeric(N)
for(i in 1:N){
  aa.sample <- sample(delay.aa, 2906, replace = TRUE)
  ua.sample <- sample(delay.ua, 1123, replace = TRUE)
  delay.mean.ratio[i] <- mean(ua.sample)/mean(aa.sample)
}

hist(delay.mean.ratio)

We can say with 95% confidence that the ratio of flight delays is between 1.27 and 1.96.

quantile(delay.mean.ratio, c(0.025, 0.975))

    2.5%    97.5% 
1.268373 1.964924

bias <- mean(delay.mean.ratio) - (mean(delay.ua)/mean(delay.aa))

bias/sd(delay.mean.ratio)

[1] 0.04645315

This condition is met because there is no way for one flight to correspond to both carriers. Thus, the observations must be independent.

Two college students collected data on the price of hardcover textbooks from two disciplinary areas: Mathematics and the Natural Sciences, and the Social Sciences (Hien and Baker (2010)). The data are in the file BookPrices.
1. Perform some exploratory data analysis on book prices for each of the two disciplinary areas.
2. Bootstrap the mean of the book price for each area separately and describe the distributions.
3. Bootstrap the ratio of means. Provide plots of the bootstrap distribution and comment.
4. Find the 95% bootstrap percentile interval for the ratio of means. Interpret this interval.
5. What is the bootstrap estimate of the bias? What fraction of the bootstrap standard error does it represent?

BookPrices <- read.csv("http://www1.appstate.edu/~arnholta/Data/BookPrices.csv")

Your answers:

BookPrices %>% 
  group_by(Area) %>% 
  summarize(Mean = mean(Price), SD = sd(Price), n = n())

# A tibble: 2 x 4
             Area     Mean       SD     n
           <fctr>    <dbl>    <dbl> <int>
1  Math & Science 156.7341 39.14483    27
2 Social Sciences  98.9900 71.91385    17

ggplot(BookPrices, aes(x = Price)) +
  geom_histogram() + 
  facet_grid(Area ~.)

price.mat <- subset(BookPrices, select = Price, subset = Area == "Math & Science", drop = TRUE)

N <- 10^4
price.mat.mean <- numeric(N)
for(i in 1:N){
  mat.sample <- sample(price.mat, 27, replace = TRUE)
  price.mat.mean[i] <- mean(mat.sample)
}

hist(price.mat.mean)

mean(price.mat.mean)

[1] 156.6845

price.ss <- subset(BookPrices, select = Price, subset = Area == "Social Sciences", drop = TRUE)

N <- 10^4
price.ss.mean <- numeric(N)
for(i in 1:N){
  ss.sample <- sample(price.ss, 17, replace = TRUE)
  price.ss.mean[i] <- mean(ss.sample)
}

hist(price.ss.mean)

mean(price.ss.mean)

[1] 99.03632

Both distributions are approximately normal.

N <- 10^4
price.mean.ratio <- numeric(N)
for(i in 1:N){
  mat.sample <- sample(price.mat, 27, replace = TRUE)
  ss.sample <- sample(price.ss, 17, replace = TRUE)
  price.mean.ratio[i] <- mean(mat.sample)/mean(ss.sample)
}

hist(price.mean.ratio)

The price mean ratio has a slight right skew.

quantile(price.mean.ratio, c(0.025, 0.975))

    2.5%    97.5% 
1.172105 2.381640

We can say with 95% confidence that the mean price ratio falls between 1.17 and 2.38.

bias <- mean(price.mean.ratio) - (mean(price.mat)/mean(price.ss))

bias/sd(price.mean.ratio)

[1] 0.1511019

Chapter Five Homework

Lindsey Wise

Tuesday, Nov 21, 2017 - 04:57:08 PM