Chapter Five Homework

Consider a population that has a normal distribution with mean \(\mu = 36\), standard deviation \(\sigma = 8\)
1. The sampling distribution of \(\bar{X}\) for samples of size 200 will have what distribution, mean, and standard error?
2. Use R to draw a random sample of size \(200\) from this population. Conduct EDA on your sample.
3. Compute the bootstrap distribution for your sample mean, and note the bootstrap mean and standard error.
4. Compare the bootstrap distribution to the theoretical sampling distribution by creating a table like Table 5.2.
5. Repeat parts a-d for sample sizes of \(n = 50\) and \(n = 10\). Carefully describe your observations about the effects of sample size on the bootstrap distribution.

Your answers:

\(\bar{X}\)~N(36, 0.566)

set.seed(13)
Samp <- rnorm(200, 36, 8)
hist(Samp)

qqnorm(Samp)
qqline(Samp)

xbar <- mean(Samp)
xbar

[1] 35.87853

SD <- sd(Samp)
SD

[1] 8.17609

set.seed(13)
B <- 10^4
bsm <- numeric(B)
for(i in 1:B){
  bsm[i] <- mean(sample(Samp, 200, replace = TRUE))
}
hist(bsm)

qqnorm(bsm)
qqline(bsm)

BSM <- mean(bsm)
BSM

[1] 35.87957

BSS <- sd(bsm)
BSS

[1] 0.581513

Bias <- BSM - xbar
Bias

[1] 0.001048422

standard error = 0.566

library(htmlTable)
dist <- round(c(36, 36, xbar, BSM, 8, 0.566, SD, BSS), 3)
Dist <- as.matrix(dist, nrow = 4, byrow = TRUE)
Dist

       [,1]
[1,] 36.000
[2,] 36.000
[3,] 35.879
[4,] 35.880
[5,]  8.000
[6,]  0.566
[7,]  8.176
[8,]  0.582

options(table_counter = 3)
htmlTable::htmlTable(dist, align = "lrrr", css.cell = "padding-left: 1em; padding-right: 1em;", header = c("Mean", "Standard Deviation"), rnames = c("Population", "Sampling Distribution of X&#772", "Sample", "Bootstrap Distribution"))

	Mean	Standard Deviation
Population	36	8
Sampling Distribution of X̄	36	0.566
Sample	35.879	8.176
Bootstrap Distribution	35.88	0.582

Parts a-d for \(n = 10\):

\(\bar{X}\)~N(36, 2.5298)

Samp <- rnorm(10, 36, 8)
hist(Samp)

qqnorm(Samp)
qqline(Samp)

xbar <- mean(Samp)
xbar

[1] 35.64098

SD <- sd(Samp)
SD

[1] 8.821555

set.seed(31)
B <- 10^4
bsm <- numeric(B)
for(i in 1:B){
  bsm[i] <- mean(sample(Samp, 10, replace = TRUE))
}
hist(bsm)

qqnorm(bsm)
qqline(bsm)

BSM <- mean(bsm)
BSM

[1] 35.61177

BSS <- sd(bsm)
BSS

[1] 2.654853

Bias <- BSM - xbar
Bias

[1] -0.02921394

standard error = 2.5298

library(htmlTable)
dist <- round(c(36, 36, xbar, BSM, 8, 2.5298, SD, BSS), 3)
Dist <- as.matrix(dist, nrow = 4, byrow = TRUE)
Dist

       [,1]
[1,] 36.000
[2,] 36.000
[3,] 35.641
[4,] 35.612
[5,]  8.000
[6,]  2.530
[7,]  8.822
[8,]  2.655

options(table_counter = 3)
htmlTable::htmlTable(dist, align = "lrrr", css.cell = "padding-left: 1em; padding-right: 1em;", header = c("Mean", "Standard Deviation"), rnames = c("Population", "Sampling Distribution of X&#772", "Sample", "Bootstrap Distribution"))

	Mean	Standard Deviation
Population	36	8
Sampling Distribution of X̄	36	2.53
Sample	35.641	8.822
Bootstrap Distribution	35.612	2.655

Parts a-d for \(n = 50\):

\(\bar{X}\)~N(36, 1.1314)

Samp <- rnorm(50, 36, 8)
hist(Samp)

qqnorm(Samp)
qqline(Samp)

xbar <- mean(Samp)
xbar

[1] 36.76212

SD <- sd(Samp)
SD

[1] 7.24557

set.seed(31)
B <- 10^4
bsm <- numeric(B)
for(i in 1:B){
  bsm[i] <- mean(sample(Samp, 50, replace = TRUE))
}
hist(bsm)

qqnorm(bsm)
qqline(bsm)

BSM <- mean(bsm)
BSM

[1] 36.75975

BSS <- sd(bsm)
BSS

[1] 1.018347

Bias <- BSM - xbar
Bias

[1] -0.002371345

standard error = 1.1314

library(htmlTable)
dist <- round(c(36, 36, xbar, BSM, 8, 1.1314, SD, BSS), 3)
Dist <- as.matrix(dist, nrow = 4, byrow = TRUE)
Dist

       [,1]
[1,] 36.000
[2,] 36.000
[3,] 36.762
[4,] 36.760
[5,]  8.000
[6,]  1.131
[7,]  7.246
[8,]  1.018

options(table_counter = 3)
htmlTable::htmlTable(dist, align = "lrrr", css.cell = "padding-left: 1em; padding-right: 1em;", header = c("Mean", "Standard Deviation"), rnames = c("Population", "Sampling Distribution of X&#772", "Sample", "Bootstrap Distribution"))

	Mean	Standard Deviation
Population	36	8
Sampling Distribution of X̄	36	1.131
Sample	36.762	7.246
Bootstrap Distribution	36.76	1.018

With an increase in sample size, the data is more exact to what the population truly is. The sample size needs to be sufficiently large for an accurate representation.

We investigate the bootstrap distribution of the median. Create random samples of size \(n\) for various \(n\) and bootstrap the median. Describe the bootstrap distribution. Change the sample sizes to 36 and 37; 200 and 201; 10,000 and 10,001. Note the similarities/dissimilarities, trends and so on. Why does the parity of the sample size matter?

set.seed(31)
ne <- 14 # n even
no <- 15 # n odd

wwe <- rnorm(ne) # draw random sample of size ne
wwo <- rnorm(no) # draw random sample of size no

N <- 10^4
even.boot <- numeric(N) # save space
odd.boot <- numeric(N)
for (i in 1:N)
{
  x.even <- sample(wwe, ne, replace = TRUE)
  x.odd <- sample(wwo, no, replace = TRUE)
  even.boot[i] <- median(x.even)
  odd.boot[i] <- median(x.odd)
}

Median <- c(even.boot, odd.boot)
Parity <- rep(c("n = 14", "n = 15"), each = N)
DF <- data.frame(Median = Median, Parity = Parity)

ggplot(data = DF, aes(x = Median)) + 
  geom_histogram(fill = "lightblue", color = "black") + 
  theme_bw() + 
  facet_grid(Parity ~.)

Figure 1: Histograms of bootstrapped median values

set.seed(31)
ne <- 36 # n even
no <- 37 # n odd

wwe <- rnorm(ne) # draw random sample of size ne
wwo <- rnorm(no) # draw random sample of size no

N <- 10^4
even.boot <- numeric(N) # save space
odd.boot <- numeric(N)
for (i in 1:N)
{
  x.even <- sample(wwe, ne, replace = TRUE)
  x.odd <- sample(wwo, no, replace = TRUE)
  even.boot[i] <- median(x.even)
  odd.boot[i] <- median(x.odd)
}

Median <- c(even.boot, odd.boot)
Parity <- rep(c("n = 36", "n = 37"), each = N)
DF <- data.frame(Median = Median, Parity = Parity)

ggplot(data = DF, aes(x = Median)) + 
  geom_histogram(fill = "lightblue", color = "black") + 
  theme_bw() + 
  facet_grid(Parity ~.)

Figure 2: Histograms of bootstrapped median values

set.seed(31)
ne <- 200 # n even
no <- 201 # n odd

wwe <- rnorm(ne) # draw random sample of size ne
wwo <- rnorm(no) # draw random sample of size no

N <- 10^4
even.boot <- numeric(N) # save space
odd.boot <- numeric(N)
for (i in 1:N)
{
  x.even <- sample(wwe, ne, replace = TRUE)
  x.odd <- sample(wwo, no, replace = TRUE)
  even.boot[i] <- median(x.even)
  odd.boot[i] <- median(x.odd)
}

Median <- c(even.boot, odd.boot)
Parity <- rep(c("n = 200", "n = 201"), each = N)
DF <- data.frame(Median = Median, Parity = Parity)

ggplot(data = DF, aes(x = Median)) + 
  geom_histogram(fill = "lightblue", color = "black") + 
  theme_bw() + 
  facet_grid(Parity ~.)

Figure 3: Histograms of bootstrapped median values

set.seed(31)
ne <- 10000 # n even
no <- 10001 # n odd

wwe <- rnorm(ne) # draw random sample of size ne
wwo <- rnorm(no) # draw random sample of size no

N <- 10^4
even.boot <- numeric(N) # save space
odd.boot <- numeric(N)
for (i in 1:N)
{
  x.even <- sample(wwe, ne, replace = TRUE)
  x.odd <- sample(wwo, no, replace = TRUE)
  even.boot[i] <- median(x.even)
  odd.boot[i] <- median(x.odd)
}

Median <- c(even.boot, odd.boot)
Parity <- rep(c("n = 10000", "n = 10001"), each = N)
DF <- data.frame(Median = Median, Parity = Parity)

ggplot(data = DF, aes(x = Median)) + 
  geom_histogram(fill = "lightblue", color = "black") + 
  theme_bw() + 
  facet_grid(Parity ~.)

Figure 4: Histograms of bootstrapped median values

Sample medians make the bootstrap distribution a poor representation of the sampling distribution. Since bootstrap distributions are very sensitive to gaps among the observations near the center of the sample, even values of the sample sizes will help eliminate this problem and will help create a more “whole” distribution.

Import the data from data set Bangladesh. In addition to arsenic concentrations for 271 wells, the data set contains cobalt and chlorine concentrations.
1. Conduct EDA on the chlorine concentrations and describe the salient features.
2. Bootstrap the mean.
3. Find and interpret the 95% bootstrap percentile confidence interval.
4. What is the bootstrap estimate of the bias? What fraction of the bootstrap standard error does it represent?

Bangladesh <- read.csv("http://www1.appstate.edu/~arnholta/Data/Bangladesh.csv")
head(Bangladesh)

  Arsenic Chlorine Cobalt
1    2400      6.2   0.42
2       6    116.0   0.45
3     904     14.8   0.63
4     321     35.9   0.68
5    1280     18.9   0.58
6     151      7.8   0.35

The Chlorine variable has some missing values. The following code will remove these entries:

chlorine <- subset(Bangladesh, select = Chlorine, subset = !is.na(Chlorine), drop = TRUE)
library(dplyr)
Bangladesh %>%
  summarize(mean = mean(chlorine), sd = sd(chlorine), n())

      mean       sd n()
1 78.08401 210.0192 271

Your answers:

\(\bar{X}\)~N(78.084, 210.02)

set.seed(13)
Samp <- rnorm(271, mean(chlorine), sd(chlorine))
hist(Samp)

qqnorm(Samp)
qqline(Samp)

xbar <- mean(Samp)
xbar

[1] 64.33899

SD <- sd(Samp)
SD

[1] 217.931

set.seed(13)
B <- 10^4
bsm <- numeric(B)
for(i in 1:B){
  bsm[i] <- mean(sample(Samp, 271, replace = TRUE))
}
hist(bsm)

qqnorm(bsm)
qqline(bsm)

BSM <- mean(bsm)
BSM

[1] 64.53167

BSS <- sd(bsm)
BSS

[1] 13.38041

Bias <- BSM - mean(chlorine)
Bias

[1] -13.55235

quantile(bsm, prob = c(.025, .975))

    2.5%    97.5% 
38.42227 90.10764

mean(bsm) - qnorm(.975) * sd(bsm)

[1] 38.30655

mean(bsm) + qnorm(.975) * sd(bsm)

[1] 90.75679

We are 95% confident that the true mean of chlorine concentration is within the interval (0.0692,0.2943).

Standard error = 210.02 The bias is about 6% of the standard error.

Consider Bangladesh chlorine (concentration). Bootstrap the trimmed mean (say, trim the upper and lower 25%) and compare your results with the usual mean (previous exercise).

Your answer:

Samp <- rnorm(271, mean(chlorine, trim = 0.25), sd(chlorine))
set.seed(13)
B <- 10^4
bsm <- numeric(B)
for(i in 1:B){
  bsm[i] <- mean(sample(Samp, 271, replace = TRUE))
}
hist(bsm)

qqnorm(bsm)
qqline(bsm)

BSM <- mean(bsm)
BSM

[1] 19.65255

Using the trimmed mean is losing the outliers so 19.65 would be the approximate mean without the outliers pulling the mean. However, this does not do a good job of representing our data because using the trimmed mean is disregarding 25% of the data on each end which is losing 50% of the original data.

The data set FishMercury contains mercury levels (parts per million) for 30 fish caught in lakes in Minnesota.
1. Create a histogram or boxplot of the data. What do you observe?
2. Bootstrap the mean and record the bootstrap standard error and the 95% bootstrap percentile interval.
3. Remove the outlier and bootstrap the mean of the remaining data. Record the bootstrap standard error and the 95% bootstrap percentile interval.
4. What effect did removing the outlier have on the bootstrap distribution, in particular, the standard error?

FishMercury <- read.csv("http://www1.appstate.edu/~arnholta/Data/FishMercury.csv")
head(FishMercury)

Your answers:

hist(FishMercury$Mercury)

boxplot(FishMercury)

Note that there is one value (1.87) very far removed from the rest of the values. Because of this one value, the distribution is skewed. With the removal of this outlier, the distribution could be approximately normal.

set.seed(13)
B <- 10^4
bsm <- numeric(B)
for(i in 1:B){
  bsm[i] <- mean(sample(FishMercury$Mercury, 30, replace = TRUE))
}
hist(bsm)

qqnorm(bsm)
qqline(bsm)

BSM <- mean(bsm)
BSM

[1] 0.1817277

BSS <- sd(bsm)
BSS

[1] 0.05742464

Bias <- BSM - mean(FishMercury$Mercury)
Bias

[1] -0.0001389333

quantile(bsm, prob = c(.025, .975))

     2.5%     97.5% 
0.1121667 0.3064675

mean(bsm) - qnorm(.975) * sd(bsm)

[1] 0.06917751

mean(bsm) + qnorm(.975) * sd(bsm)

[1] 0.294278

no_outliers <- boxplot(FishMercury$Mercury, outline = FALSE)

y <- no_outliers$stats
B <- 10^4
bsm <- numeric(B)
for(i in 1:B){
  bsm[i] <- mean(sample(y, 29, replace = TRUE))
}
hist(bsm)

qqnorm(bsm)
qqline(bsm)

BSM <- mean(bsm)
BSM

[1] 0.1211786

BSS <- sd(bsm)
BSS

[1] 0.00964194

Bias <- BSM - mean(FishMercury$Mercury)
Bias

[1] -0.06068803

quantile(bsm, prob = c(.025, .975))

     2.5%     97.5% 
0.1015172 0.1396905

mean(bsm) - qnorm(.975) * sd(bsm)

[1] 0.1022808

mean(bsm) + qnorm(.975) * sd(bsm)

[1] 0.1400765

Removing the outlier allowed the distibution to be approximately normal with a smaller confidence interval and standard error. Although, it doesn’t allow for the whole dataset to be represented.

In section 3.3, we performed a permutation test to determine if men and women consumed, on average, different amounts of hot wings.
1. Bootstrap the difference in means and describe the bootstrap distribution.
2. Find a 95% bootstrap percentile confidence interval for the difference of means and give a sentence interpreting this interval.
3. How do the bootstrap and permutation distribution differ?

BeerWings <- read.csv("http://www1.appstate.edu/~arnholta/Data/Beerwings.csv")
head(BeerWings)

  ID Hotwings Beer Gender
1  1        4   24      F
2  2        5    0      F
3  3        5   12      F
4  4        6   12      F
5  5        7   12      F
6  6        7   12      F

Your answers:

library(dplyr)
BW <- BeerWings %>%
  group_by(Gender) %>%
  summarize(mean = mean(Hotwings))
BW

# A tibble: 2 x 2
  Gender      mean
  <fctr>     <dbl>
1      F  9.333333
2      M 14.533333

B <- 10^4
bsm <- numeric(B)
for(i in 1:B){
  bsm[i] <- mean(sample(BW$mean, 15, replace = TRUE)) - mean(sample(BW$mean, 15, replace = TRUE))
}
hist(bsm)

qqnorm(bsm)
qqline(bsm)

BSM <- mean(bsm)
BSM

[1] 0.004298667

BSS <- sd(bsm)
BSS

[1] 0.9556567

Bias <- BSM + 5.2
Bias

[1] 5.204299

quantile(bsm, prob = c(.025, .975))

     2.5%     97.5% 
-1.733333  1.733333

mean(bsm) - qnorm(.975) * sd(bsm)

[1] -1.868754

mean(bsm) + qnorm(.975) * sd(bsm)

[1] 1.877351

We are 95% confident that the true mean number of hotwings a male eats subtracted by the mean number of hotwings a female eats falls within the interval (-3.408, 3.3899).

In this situation, the bootstrap distribution describes the entire population of males and females better than the permutation distribution. There is not a large enough n to accurately represent the population through a permutation test.

Import the data from Girls2004 (see Section 1.2).
1. Perform some exploratory data analysis and obtain summary statistics on the weight of baby girls born in Wyoming and Arkansas (do seperate analyses for each state).
2. Bootstrap the difference in means, plot the distribution, and give the summary statistics. Obtain a 95% bootstrap percentile confidence interval and interpret this interval.
3. What is the bootstrap estimate of the bias? What fraction of the bootstrap standard error does it represent?
4. Conduct a permutation test to calculate the difference in mean weights and state your conclusion?
5. For what population(s), if any does this calculation hold? Explain?

Girls2004 <- read.csv("http://www1.appstate.edu/~arnholta/Data/Girls2004.csv")
head(Girls2004)

  ID State MothersAge Smoker Weight Gestation
1  1    WY      15-19     No   3085        40
2  2    WY      35-39     No   3515        39
3  3    WY      25-29     No   3775        40
4  4    WY      20-24     No   3265        39
5  5    WY      25-29     No   2970        40
6  6    WY      20-24     No   2850        38

Your answers:

Girls <- Girls2004 %>%
  group_by(State) %>%
  summarize(Mean = mean(Weight), StDev = sd(Weight), n())
Girls

# A tibble: 2 x 4
   State    Mean    StDev `n()`
  <fctr>   <dbl>    <dbl> <int>
1     AK 3516.35 578.8336    40
2     WY 3207.90 418.3184    40

B <- 10^4
bsm <- numeric(B)
for(i in 1:B){
  bsm[i] <- mean(sample(Girls$Mean, 40, replace = TRUE)) - mean(sample(Girls$Mean, 40, replace = TRUE))
}
hist(bsm)

qqnorm(bsm)
qqline(bsm)

BSM <- mean(bsm)
BSM

[1] -0.06708788

BSS <- sd(bsm)
BSS

[1] 34.76641

quantile(bsm, prob = c(.025, .975))

     2.5%     97.5% 
-69.40125  69.40125

mean(bsm) - qnorm(.975) * sd(bsm)

[1] -68.20801

mean(bsm) + qnorm(.975) * sd(bsm)

[1] 68.07383

We are 95% confident that the true difference of mean weight of baby girls in 2004 in Arkansas and Wyoming is within the interval (-66.74, 67.61).

Bias <- BSM - 308.45
Bias

[1] -308.5171

standard error = 34.27 Where the bias is a tremendous amount more than the standard error.

m_weight <- Girls2004 %>%
  group_by(State) %>%
  summarize(Mean = mean(Weight), n()) %>%
  summarize(obs_diff = diff(Mean))
m_weight

# A tibble: 1 x 1
  obs_diff
     <dbl>
1  -308.45

sims <- 10^4 -1
ts <- numeric(sims)
for(i in 1:sims) {
  index <- sample(80, 40, replace = FALSE)
  ts[i] <- mean(Girls2004$Weight[index]) - mean(Girls2004$Weight[-index])
}
hist(ts)

pvalue <- ((sum(ts <= m_weight$obs_diff) + 1)/ (sims + 1)) * 2
pvalue

[1] 0.0082

There is no sufficient evidence to reject the notion that the mean weight of baby girls in Arkansas and Wyoming in 2004 are the same.

The population for this dataset are all baby girls born in 2004 in Arkansas and Wyoming. This data does not stand for any other year or any other states.

Do chocolate and vanilla ice creams have the same number of calories? The data set IceCream contains calorie information for a sample of brands of chocolate and vanilla ice cream. Use the bootstrap to determine whether or not there is a difference in the mean number of calories.

IceCream <- read.csv("http://www1.appstate.edu/~arnholta/Data/IceCream.csv")
head(IceCream)

           Brand VanillaCalories VanillaFat VanillaSugar ChocolateCalories
1 Baskin Robbins             260       16.0         26.0               260
2  Ben & Jerry's             240       16.0         19.0               260
3     Blue Bunny             140        7.0         12.0               130
4        Breyers             140        7.0         13.0               140
5      Brigham's             190       12.0         17.0               200
6          Bulla             234       13.5         21.8               266
  ChocolateFat ChocolateSugar
1           14           31.0
2           16           22.0
3            7           14.0
4            8           16.0
5           12           18.0
6           15           22.6

Your answer:

IC <- IceCream %>%
  summarize(MeanChocolate = mean(ChocolateCalories), MeanVanilla = mean(VanillaCalories), n())
IC

  MeanChocolate MeanVanilla n()
1      198.7436    191.4103  39

B <- 10^4
bsm <- numeric(B)
for(i in 1:B){
  bsm[i] <- mean(sample(IC$MeanChocolate, 39, replace = TRUE)) - mean(sample(IC$MeanVanilla, 39, replace = TRUE))
}
hist(bsm)

qqnorm(bsm)
qqline(bsm)

BSM <- mean(bsm)
BSM

[1] 3.999177

BSS <- sd(bsm)
BSS

[1] 12.64522

Bias <- BSM - 7.333
Bias

[1] -3.333823

We have sufficient evidence to prove that the mean calories between chocolate and vanilla are not the same.

Import the data from Flight Delays Case Study in Section 1.1 data into R. Although the data are on all UA and AA flights flown in May and June of 2009, we will assume these represent a sample from a larger population of UA and AA flights flown under similar circumstances. We will consider the ratio of the means of the flight delay lengths, \(\mu_{\text{UA}} / \mu_{\text{AA}}\).
1. Perform some exploratory data analysis on flight delay lengths for each of UA and AA flights.
2. Bootstrap the mean of flight delay lengths for each airline seperately and describe the distribution.
3. Bootstrap the ratio of means. Provide plots of the bootstrap distribution and describe the distribution.
4. Find the 95% bootstrap percentile interval for the ratio of means. Interpret this interval.
5. What is the bootstrap estimate of the bias? What fraction of the bootstrap standard error does it represent?
6. For inference in this text, we assume that the observations are independent. Is that condition met here? Explain.

FlightDelays <- read.csv("http://www1.appstate.edu/~arnholta/Data/FlightDelays.csv")

Your answers:

FD <- FlightDelays %>%
  group_by(Carrier) %>%
  summarize(Mean = mean(Delay), StDev = sd(Delay), n())
FD

# A tibble: 2 x 4
  Carrier     Mean    StDev `n()`
   <fctr>    <dbl>    <dbl> <int>
1      AA 10.09738 40.08063  2906
2      UA 15.98308 45.13895  1123

B <- 10^4
bsm <- numeric(B)
for(i in 1:B){
  bsm[i] <- mean(sample(FD$Mean, 2906, replace = TRUE))
}
hist(bsm)

qqnorm(bsm)
qqline(bsm)

BSM <- mean(bsm)
BSM

[1] 13.04061

BSS <- sd(bsm)
BSS

[1] 0.05450534

B <- 10^4
bsm <- numeric(B)
for(i in 1:B){
  bsm[i] <- mean(sample(FD$Mean, 1123, replace = TRUE))
}
hist(bsm)

qqnorm(bsm)
qqline(bsm)

BSM <- mean(bsm)
BSM

[1] 13.04063

BSS <- sd(bsm)
BSS

[1] 0.08772604

Both distributions are approximately normal with almost the same means, but slightly different standard deviations.

B <- 10^4
bsm <- numeric(B)
for(i in 1:B){
  bsm[i] <- mean(sample(FD$Mean, 2906, replace = TRUE))/mean(sample(FD$Mean, 1123, replace = TRUE))
}
hist(bsm)

qqnorm(bsm)
qqline(bsm)

BSM <- mean(bsm)
BSM

[1] 1.000198

BSS <- sd(bsm)
BSS

[1] 0.007947081

This distribution is approximately normal with a mean of about 1 and a standard deviation of 0.008

quantile(bsm, prob = c(.025, .975))

     2.5%     97.5% 
0.9847468 1.0160042

mean(bsm) - qnorm(.975) * sd(bsm)

[1] 0.9846222

mean(bsm) + qnorm(.975) * sd(bsm)

[1] 1.015774

We are 95% confident that the true ratio of delay times among carriers are within the interval (0.9845, 1.0153).

Bias <- BSM - 0.6318
Bias

[1] 0.3683982

standard error = 2.507691 The bias is about 14.6% of the standard error.

Since the flights are flown under similar circumstances we can assume independence. They were delayed because of outside forces not because of the other carrier.

Two college students collected data on the price of hardcover textbooks from two disciplinary areas: Mathematics and the Natural Sciences, and the Social Sciences (Hien and Baker (2010)). The data are in the file BookPrices.
1. Perform some exploratory data analysis on book prices for each of the two disciplinary areas.
2. Bootstrap the mean of the book price for each area separately and describe the distributions.
3. Bootstrap the ratio of means. Provide plots of the bootstrap distribution and comment.
4. Find the 95% bootstrap percentile interval for the ratio of means. Interpret this interval.
5. What is the bootstrap estimate of the bias? What fraction of the bootstrap standard error does it represent?

BookPrices <- read.csv("http://www1.appstate.edu/~arnholta/Data/BookPrices.csv")

Your answers:

Prices <- BookPrices %>%
  group_by(Area) %>%
  summarize(Mean = mean(Price), StDev = sd(Price), n())
Prices

# A tibble: 2 x 4
             Area     Mean    StDev `n()`
           <fctr>    <dbl>    <dbl> <int>
1  Math & Science 156.7341 39.14483    27
2 Social Sciences  98.9900 71.91385    17

B <- 10^4
bsm <- numeric(B)
for(i in 1:B){
  bsm[i] <- mean(sample(Prices$Mean, 27, replace = TRUE))
}
hist(bsm)

qqnorm(bsm)
qqline(bsm)

BSM <- mean(bsm)
BSM

[1] 127.8753

BSS <- sd(bsm)
BSS

[1] 5.535408

B <- 10^4
bsm <- numeric(B)
for(i in 1:B){
  bsm[i] <- mean(sample(Prices$Mean, 17, replace = TRUE))
}
hist(bsm)

qqnorm(bsm)
qqline(bsm)

BSM <- mean(bsm)
BSM

[1] 127.9045

BSS <- sd(bsm)
BSS

[1] 6.944476

Each distribution is approximately normal.

B <- 10^4
bsm2 <- numeric(B)
for(i in 1:B){
  bsm2[i] <- mean(sample(Prices$Mean, 27, replace = TRUE))/mean(sample(Prices$Mean, 17, replace = TRUE))
}
hist(bsm2)

qqnorm(bsm2)
qqline(bsm2)

BSM <- mean(bsm2)
BSM

[1] 1.003948

BSS <- sd(bsm2)
BSS

[1] 0.07052699

Now the distribution is slightly skewed to the left.

quantile(bsm2, prob = c(.025, .975))

     2.5%     97.5% 
0.8732091 1.1517613

mean(bsm2) - qnorm(.975) * sd(bsm2)

[1] 0.8657179

mean(bsm2) + qnorm(.975) * sd(bsm2)

[1] 1.142179

We are 95% confident that the true ratio of prices of books in a certain area is within the interval (0.863, 1.142).

Bias <- BSM - (156.7341/98.99)
Bias

[1] -0.5793844

standard error = 0.5443307 The bias is about 100% of the standard error.

Chapter Five Homework

Sara Hursa

Monday, Nov 20, 2017 - 02:52:57 PM