—————————–

Q5: Logistic Regression on Default data

—————————–

data(Default)

(a) Fit model

glm1 <- glm(default ~ income + balance, data=Default, family=binomial)

(b–c) Validation set approach, repeat 3 times

validation_errors <- numeric(3)
for (i in 1:3) {
  train_idx <- sample(nrow(Default), size = nrow(Default)/2)
  train <- Default[train_idx, ]
  valid <- Default[-train_idx, ]
  
  fit <- glm(default ~ income + balance, data=train, family=binomial)
  preds <- predict(fit, newdata=valid, type="response")
  class_pred <- ifelse(preds > 0.5, "Yes", "No")
  validation_errors[i] <- mean(class_pred != valid$default)
}
validation_errors
## [1] 0.0276 0.0246 0.0262

(d) Add student variable

validation_err_student <- numeric(3)
for (i in 1:3) {
  train_idx <- sample(nrow(Default), size = nrow(Default)/2)
  train <- Default[train_idx, ]
  valid <- Default[-train_idx, ]
  
  fit2 <- glm(default ~ income + balance + student, data=train, family=binomial)
  preds2 <- predict(fit2, newdata=valid, type="response")
  class_pred2 <- ifelse(preds2 > 0.5, "Yes", "No")
  validation_err_student[i] <- mean(class_pred2 != valid$default)
}
validation_errors
## [1] 0.0276 0.0246 0.0262
validation_err_student
## [1] 0.0288 0.0268 0.0264

—————————–

Q6: Standard Errors via glm() vs bootstrap

—————————–

(a) glm() standard errors

summary(glm1)$coefficients[c("income", "balance"), "Std. Error"]
##       income      balance 
## 4.985167e-06 2.273731e-04

(b) bootstrap function

boot.fn <- function(data, idx) {
  d <- data[idx, ]
  fit <- glm(default ~ income + balance, data=d, family=binomial)
  coef(fit)[c("income", "balance")]
}

(c) bootstrap with 1000 resamples

boot_results <- boot(Default, statistic = boot.fn, R = 1000)
boot_results
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##         original       bias     std. error
## t1* 2.080898e-05 1.703483e-07 4.729343e-06
## t2* 5.647103e-03 1.294969e-05 2.219378e-04

(d) Compare std errors

boot_results$t0
##       income      balance 
## 2.080898e-05 5.647103e-03
apply(boot_results$t, 2, sd)
## [1] 4.729343e-06 2.219378e-04
summary(glm1)$coefficients[c("income", "balance"), "Std. Error"]
##       income      balance 
## 4.985167e-06 2.273731e-04

—————————–

Q9: Boston housing data

—————————–

data(Boston)

(a) Estimate population mean of medv

mu_hat <- mean(Boston$medv)

(b) Standard error (theoretical)

se_theoretical <- sd(Boston$medv) / sqrt(nrow(Boston))

(c) Bootstrap estimate of SE

boot_fn2 <- function(data, idx) mean(data[idx])
boot_boston <- boot(Boston$medv, statistic = boot_fn2, R = 1000)
se_bootstrap <- sd(boot_boston$t)

(d) 95% CI from bootstrap and t.test

ci_bootstrap <- boot.ci(boot_boston, type = "perc")$percent[4:5]
ci_theoretical <- mu_hat + c(-2, 2) * se_bootstrap
ci_ttest <- t.test(Boston$medv)$conf.int

(e) Estimate median

median_hat <- median(Boston$medv)

(f) Bootstrap SE for median

boot_fn_med <- function(data, idx) median(data[idx])
boot_boston_med <- boot(Boston$medv, statistic = boot_fn_med, R = 1000)
se_median <- sd(boot_boston_med$t)

(g) Estimate 10th percentile

p10_hat <- quantile(Boston$medv, probs = 0.1)

(h) Bootstrap SE for 10th percentile

boot_fn_p10 <- function(data, idx) quantile(data[idx], probs = 0.1)
boot_boston_p10 <- boot(Boston$medv, statistic = boot_fn_p10, R = 1000)
se_p10 <- sd(boot_boston_p10$t)