2) Chapter 5 Conceptual Exercise 4

Use bootstrapping: take n samples of paired \((X_i, Y_i)\) from the original dataset (with replacement), fit the statistical learning model on the new dataset, and predict Y using the value of X given in the problem. Repeat this process a large number of times, which should give a distribution of predictions for Y for the particular value of X. Take the sample standard deviation of the predicted responses for Y, and use this as the estimate for the standard deviation of the prediction.

3)

a)

library(bootstrap)
data(law)
plot(law)

cor(law$LSAT, law$GPA)
## [1] 0.7763745

Yes, there looks to be a positive relationship between GPA and LSAT score, and the sample correlation is .776.

b)

set.seed(8675309)
b <- 1000
rownum <- nrow(law) 
correlations <- rep(0, b)
for(i in 1:b){
        id <- sample(rownum, replace = T)
        newdataset <- law[id,]
        newcor <- cor(newdataset$LSAT, newdataset$GPA)
        correlations[i] <- newcor
}

hist(correlations, breaks = 25)
abline(v = cor(law$LSAT, law$GPA), col = "red", lwd = 2)

c)

lower <- quantile(correlations, .025)
upper <- quantile(correlations, .975)

hist(correlations, breaks = 25)
abline(v = cor(law$LSAT, law$GPA), col = "red", lwd = 2)
abline(v = c(upper,lower), col = "blue", lwd = 2)

c(lower, upper)
##      2.5%     97.5% 
## 0.4693971 0.9593159

So a 95 percent confidence interval for the correlation would be (.469, .959). Based on this, we would fail to reject the null hypothesis that the true correlation is .5.

d)

theta0 <- cor(law$LSAT, law$GPA)
set.seed(8675309)
bias <- mean(correlations) - theta0
bias
## [1] -0.004535547
biaslower <- 2*theta0 - upper[[1]]
biasupper <- 2*theta0 - lower[[1]]
c(biaslower, biasupper)
## [1] 0.5934331 1.0833518
hist(correlations, breaks = 25, xlim = c(0, 1.25))
abline(v = cor(law$LSAT, law$GPA), col = "red", lwd = 2)
abline(v = c(upper,lower), col = "blue", lwd = 2)
abline(v = c(biaslower, biasupper), col = "green", lwd = 2)

The bootstrap estimate of the bias is -.0045, and the bias-corrected confidence interval is (.59, 1.08). If this confidence interval is used, we can reject the null hypothesis and conclude that the true correlation is greater than .5.

e)

set.seed(8675309)
gpa <- law$GPA
lsat <- law$LSAT
r <- cor(gpa, lsat)

b <- 10000
rdist <- rep(0, b)
for(i in 1:b){
        shuffled <- sample(gpa)
        rdist[i] <- cor(lsat, shuffled)
}
hist(rdist, breaks = 25, main = "Histogram of Permuted Correlations")
abline(v = r, col = "blue", lwd = 2)
abline(v = quantile(rdist, .975), col = "red", lwd = 2)

c(quantile(rdist, .975), r)
##     97.5%           
## 0.5089138 0.7763745

The original sample correlation falls above the .975 quantile of the permuted correlations. So we can reject the null and conclude that the true correlation is not equal to zero.

4)

a)

set.seed(8675309)
x1 <- runif(50)
x2 <- runif(50)
eps <- rnorm(50, sd = .25)
y <- x1 + x2 + eps

train <- data.frame(y, x1, x2, eps)

b )

set.seed(8675309)
x1 <- runif(30)
x2 <- runif(30)
eps <- rnorm(30, sd = .25)
y <- x1 + x2 + eps

test <- data.frame(y, x1, x2, eps)

lm <- lm(y ~ x1 + x2, data = train)

mse0 <- mean((test$y - predict(lm, test))^2)
mse0
## [1] 0.0381968

So the MSE\(_0\) is .038.

c)

set.seed(8675309)
b <- 1000
allmse <- rep(0, b)
for(i in 1:b){
        id <- sample(nrow(train))
        shuffle <- train[id, -1]
        newdf <- cbind(train$y, shuffle)
        names(newdf)[1] <- "y"
        newlm <- lm(y ~ x1 + x2, data = newdf)
        newmse <- mean((test$y - predict(newlm, test))^2)
        allmse[i] <- newmse
}

hist(allmse, xlim = c(0, .5), breaks = 25)
abline(v = mse0, col = "red", lwd = 2)

c(quantile(allmse, .025), mse0)
##      2.5%           
## 0.1100638 0.0381968

Since MSE\(_0\) is less than the 2.5% quantile of the permuted MSEs, we can reject the null hypothesis and conclude that at least one of the predictors is significant.

d)

set.seed(8675309)
b <- 1000
allmse <- rep(0, b)
for(i in 1:b){
        id <- sample(nrow(train))
        shuffle <- train[id, "x2"]
        newdf <- train
        newdf$x2 <- shuffle
        newlm <- lm(y ~ x1 + x2, data = newdf)
        newmse <- mean((test$y - predict(newlm, test))^2)
        allmse[i] <- newmse
}

hist(allmse, xlim = c(0, .5), breaks = 25)
abline(v = mse0, col = "red", lwd = 2)

c(quantile(allmse, .025), mse0)
##       2.5%            
## 0.08552865 0.03819680

Since the MSE with both predictors included is less than the 2.5% quantile of the MSEs obtained with X2 permuted, we can reject the null and conclude that \(\beta_2 \neq 0\).

e)

set.seed(8675309)
n = 500
train <- data.frame(replicate(10, runif(n)), eps = rnorm(n, sd = .25))
train$y <- rowSums(train)
head(train)
set.seed(8675309)
n = 50
test <- data.frame(replicate(10, runif(n)), eps = rnorm(n, sd = .25))
test$y <- rowSums(test)
head(test)

f)

set.seed(8675309)
lm1 <- lm(y ~ ., data = train[, -11])
mse0 <- mean((test$y - predict(lm1, test))^2)

b <- 1000
allmse <- rep(0, b)
for(i in 1:b){
        id <- sample(nrow(train))
        shuffle <- train[id, 8:10]
        newdf <- cbind(train[, c(12,1:7)], shuffle)
        newlm <- lm(y ~ ., data = newdf)
        newmse <- mean((test$y - predict(newlm, test))^2)
        allmse[i] <- newmse
}

hist(allmse, xlim = c(0, .4))
abline(v = mse0, col = "red", lwd = 2)

c(quantile(allmse, .025), mse0)
##       2.5%            
## 0.24096985 0.05603269

Since the MSE for the model with all 10 predictors is smaller than the 2.5% quantile of MSEs with X8, X9, and X10 permuted, we can reject the null and conclude that at least one of X8, X9 or X10 is significant.

5)

a)

The parameter we are interested in is \(p_1-p_2\) where \(p_1\) is the population proportion of those who got COVID after getting vaccinated, and \(p_2\) is the population proportion of those of got COVID without the vaccine.
\(H_0: p_1=p_2\) and \(H_A: p_1 < p_2\)
To test this hypothesis, we can use the two-proportion Z test since the sample size is very large.

prop.test(x = c(8, 162), n = c(21500, 21500), alternative = "less")
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(8, 162) out of c(21500, 21500)
## X-squared = 138.25, df = 1, p-value < 2.2e-16
## alternative hypothesis: less
## 95 percent confidence interval:
##  -1.000000000 -0.006122375
## sample estimates:
##      prop 1      prop 2 
## 0.000372093 0.007534884

Since the p-value is less than .05, reject the null and conclude that \(p_1 > p_2\).

b)

If the vaccine was not effective, we should expect to see a similar amount of people contracting COVID from the placebo group and vaccine group.

c)

A good test statistic for a randomization process might be the difference in the number of people who contract COVID between the two groups. If the original difference (154) is greater than at least 95 percent of the randomized differences, we can reject the null that the vaccine is not effective.

d)

We know that 170 enrollees contracted COVID, so in a randomization test, we would assign this variable randomly to the two groups.

set.seed(8675309)
diffs <- rep(0, 1000)
for(i in 1:1000){
        random <- sample(c("placebo", "vaccine"), 170, replace = T)
        difference <- length(random[random == "placebo"]) - length(random[random == "vaccine"])
        diffs[i] <- difference
}
hist(diffs, main = "Randomized Differences", breaks = 25, xlim = c(-50, 160))
abline(v = 154, col = "blue", lwd = 2)

max(diffs)
## [1] 42

Since the original test statistic is greater than the maximum randomized difference, the p-value is 0. So we reject the null and conclude that the vaccine is effective.