Question 3

# (3a)
invisible("You split your data into k equal groups. Then, you train and test your model k times so that every group gets a turn being the test set, and you average the scores at the end.")

# (3b-I)
invisible("k-fold gives a much more accurate and reliable score because it tests on the entire dataset, meaning no data is wasted. It is much slower and uses more computing power because you have to train the model k times instead of just once.")

# (3b-II)
invisible("k-fold is much faster and uses less computing power because it only trains the model k times instead of training it on almost every single data point like LOOCV. k-fold can give slightly less accurate or more biased results because it trains on less data during each loop compared to LOOCV.")

Question 5

# 5(a)
set.seed(1)
model <- glm(default ~ income + balance, data = Default, family = binomial)

# 5(b-I)
set.seed(1)
train <- sample(nrow(Default), nrow(Default) / 2)

# 5(b-II)
model_train <- glm(default ~ income + balance, data = Default, family = binomial, subset = train)

# 5(b-III)
probs <- predict(model_train, Default[-train, ], type = "response")
preds <- ifelse(probs > 0.5, "Yes", "No")

# 5(b-IV)
mean(preds != Default[-train, ]$default)
## [1] 0.0254
# 5(c)
# Split 1
set.seed(2)
train1 <- sample(nrow(Default), nrow(Default) / 2)
model1 <- glm(default ~ income + balance, data = Default, family = binomial, subset = train1)
preds1 <- ifelse(predict(model1, Default[-train1, ], type = "response") > 0.5, "Yes", "No")
mean(preds1 != Default[-train1, ]$default)
## [1] 0.0238
# Split 2
set.seed(3)
train2 <- sample(nrow(Default), nrow(Default) / 2)
model2 <- glm(default ~ income + balance, data = Default, family = binomial, subset = train2)
preds2 <- ifelse(predict(model2, Default[-train2, ], type = "response") > 0.5, "Yes", "No")
mean(preds2 != Default[-train2, ]$default)
## [1] 0.0264
# Split 3
set.seed(4)
train3 <- sample(nrow(Default), nrow(Default) / 2)
model3 <- glm(default ~ income + balance, data = Default, family = binomial, subset = train3)
preds3 <- ifelse(predict(model3, Default[-train3, ], type = "response") > 0.5, "Yes", "No")
mean(preds3 != Default[-train3, ]$default)
## [1] 0.0256
invisible("Results show error rates of 2.38%, 2.64%, and 2.56%. The test error rate changes depending on how the data is split, which is the main drawback of the validation set approach.")

# 5(d)
set.seed(1)
train <- sample(nrow(Default), nrow(Default) / 2)
model_student <- glm(default ~ income + balance + student, data = Default, family = binomial, subset = train)
probs <- predict(model_student, Default[-train, ], type = "response")
preds <- ifelse(probs > 0.5, "Yes", "No")
mean(preds != Default[-train, ]$default)
## [1] 0.026
invisible("New error rate is 2.6%, which is slightly higher than the original 2.54%. Student dummy variable does not lead to a reduction in the test error rate.")

Question 6

# (6a)
set.seed(1)
model <- glm(default ~ income + balance, data = Default, family = binomial)
summary(model)$coefficients
##                  Estimate   Std. Error    z value      Pr(>|z|)
## (Intercept) -1.154047e+01 4.347564e-01 -26.544680 2.958355e-155
## income       2.080898e-05 4.985167e-06   4.174178  2.990638e-05
## balance      5.647103e-03 2.273731e-04  24.836280 3.638120e-136
# (6b)
boot.fn <- function(data, index) {
  model <- glm(default ~ income + balance, data = data, family = binomial, subset = index)
  return(coef(model)[c("income", "balance")])
}

# (6c)
set.seed(1)
boot(Default, boot.fn, R = 1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##         original       bias     std. error
## t1* 2.080898e-05 1.680317e-07 4.866284e-06
## t2* 5.647103e-03 1.855765e-05 2.298949e-04
# (6d)
invisible("The standard errors from glm() and the bootstrap are similar for both income and balance. Shows that the standard formula assumptions hold well for this dataset. Both methods provide reliable estimates of coefficient variability.")

Question 9

# (9a)
estimated_mean <- mean(Boston$medv)
estimated_mean
## [1] 22.53281
# (9b)
se_mean <- sd(Boston$medv) / sqrt(nrow(Boston))
se_mean
## [1] 0.4088611
invisible("Estimated standard error is 0.40886. Sample mean would vary by about 0.41 units from the true population mean.")

# (9c)
boot.mean <- function(data, index) {
  return(mean(data$medv[index]))
}

set.seed(1)
boot(Boston, boot.mean, R = 1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston, statistic = boot.mean, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original      bias    std. error
## t1* 22.53281 0.007650791   0.4106622
invisible("The bootstrap standard error estimate of 0.4107 vs 0.4089 from part (b). Both methods are consistent & reliable.")

# (9d)
estimated_mean - 2 * 0.4106622
## [1] 21.71148
estimated_mean + 2 * 0.4106622
## [1] 23.35413
t.test(Boston$medv)
## 
##  One Sample t-test
## 
## data:  Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  21.72953 23.33608
## sample estimates:
## mean of x 
##  22.53281
invisible("The bootstrap 95% confidence interval [21.711, 23.354] vs t.test() interval [21.730, 23.336]. Bootstrap provides a reliable interval estimate.")

# (9e)
estimated_median <- median(Boston$medv)
estimated_median
## [1] 21.2
# (9f)
boot.median <- function(data, index) {
  return(median(data$medv[index]))
}

set.seed(1)
boot(Boston, boot.median, R = 1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston, statistic = boot.median, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*     21.2 0.02295   0.3778075
invisible("The bootstrap standard error estimate for the median is 0.3778. Sample median is a stable/reliable estimate for the population median.")

# (9g)
estimated_pct10 <- quantile(Boston$medv, 0.1)
estimated_pct10
##   10% 
## 12.75
# (9h)
boot.pct10 <- function(data, index) {
  return(quantile(data$medv[index], 0.1))
}

set.seed(1)
boot(Boston, boot.pct10, R = 1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston, statistic = boot.pct10, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*    12.75  0.0339   0.4767526
invisible("The bootstrap standard error estimate for the tenth percentile is 0.4768. Relative to the estimate of 12.75 suggests percentile is stable and precise, even though tail percentiles show higher variability than mean or median.")