—————————–
Q5: Logistic Regression on Default data
(a) Fit model
glm1 <- glm(default ~ income + balance, data=Default, family=binomial)
(b–c) Validation set approach, repeat 3 times
validation_errors <- numeric(3)
for (i in 1:3) {
train_idx <- sample(nrow(Default), size = nrow(Default)/2)
train <- Default[train_idx, ]
valid <- Default[-train_idx, ]
fit <- glm(default ~ income + balance, data=train, family=binomial)
preds <- predict(fit, newdata=valid, type="response")
class_pred <- ifelse(preds > 0.5, "Yes", "No")
validation_errors[i] <- mean(class_pred != valid$default)
}
validation_errors
## [1] 0.0276 0.0246 0.0262
(d) Add student variable
validation_err_student <- numeric(3)
for (i in 1:3) {
train_idx <- sample(nrow(Default), size = nrow(Default)/2)
train <- Default[train_idx, ]
valid <- Default[-train_idx, ]
fit2 <- glm(default ~ income + balance + student, data=train, family=binomial)
preds2 <- predict(fit2, newdata=valid, type="response")
class_pred2 <- ifelse(preds2 > 0.5, "Yes", "No")
validation_err_student[i] <- mean(class_pred2 != valid$default)
}
validation_errors
## [1] 0.0276 0.0246 0.0262
validation_err_student
## [1] 0.0288 0.0268 0.0264
—————————–
Q6: Standard Errors via glm() vs bootstrap
—————————–
(a) glm() standard errors
summary(glm1)$coefficients[c("income", "balance"), "Std. Error"]
## income balance
## 4.985167e-06 2.273731e-04
(b) bootstrap function
boot.fn <- function(data, idx) {
d <- data[idx, ]
fit <- glm(default ~ income + balance, data=d, family=binomial)
coef(fit)[c("income", "balance")]
}
(c) bootstrap with 1000 resamples
boot_results <- boot(Default, statistic = boot.fn, R = 1000)
boot_results
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 2.080898e-05 1.703483e-07 4.729343e-06
## t2* 5.647103e-03 1.294969e-05 2.219378e-04
(d) Compare std errors
boot_results$t0
## income balance
## 2.080898e-05 5.647103e-03
apply(boot_results$t, 2, sd)
## [1] 4.729343e-06 2.219378e-04
summary(glm1)$coefficients[c("income", "balance"), "Std. Error"]
## income balance
## 4.985167e-06 2.273731e-04
—————————–
Q9: Boston housing data
(a) Estimate population mean of medv
mu_hat <- mean(Boston$medv)
(b) Standard error (theoretical)
se_theoretical <- sd(Boston$medv) / sqrt(nrow(Boston))
(c) Bootstrap estimate of SE
boot_fn2 <- function(data, idx) mean(data[idx])
boot_boston <- boot(Boston$medv, statistic = boot_fn2, R = 1000)
se_bootstrap <- sd(boot_boston$t)
(d) 95% CI from bootstrap and t.test
ci_bootstrap <- boot.ci(boot_boston, type = "perc")$percent[4:5]
ci_theoretical <- mu_hat + c(-2, 2) * se_bootstrap
ci_ttest <- t.test(Boston$medv)$conf.int
(g) Estimate 10th percentile
p10_hat <- quantile(Boston$medv, probs = 0.1)
(h) Bootstrap SE for 10th percentile
boot_fn_p10 <- function(data, idx) quantile(data[idx], probs = 0.1)
boot_boston_p10 <- boot(Boston$medv, statistic = boot_fn_p10, R = 1000)
se_p10 <- sd(boot_boston_p10$t)
Print results
cat("Q5 validation errors (income+balance):", validation_errors, "\n")
## Q5 validation errors (income+balance): 0.0276 0.0246 0.0262
cat("Q5 validation errors (with student):", validation_err_student, "\n")
## Q5 validation errors (with student): 0.0288 0.0268 0.0264
cat("Q6 glm SEs:\n"); print(summary(glm1)$coefficients[c("income", "balance"), "Std. Error"])
## Q6 glm SEs:
## income balance
## 4.985167e-06 2.273731e-04
cat("Q6 bootstrap SEs:\n", apply(boot_results$t, 2, sd), "\n")
## Q6 bootstrap SEs:
## 4.729343e-06 0.0002219378
cat("Q9 mean:", mu_hat, "; theoretical SE:", se_theoretical, "; bootstrap SE:", se_bootstrap, "\n")
## Q9 mean: 22.53281 ; theoretical SE: 0.4088611 ; bootstrap SE: 0.4164609
cat("95% CI bootstrap:", ci_bootstrap, "; approximate CI:", ci_theoretical, "; t.test CI:", ci_ttest, "\n")
## 95% CI bootstrap: 21.70357 23.35719 ; approximate CI: 21.69988 23.36573 ; t.test CI: 21.72953 23.33608
cat("Median:", median_hat, "; SE of median (boot):", se_median, "\n")
## Median: 21.2 ; SE of median (boot): 0.389254
cat("10th percentile:", p10_hat, "; SE (boot):", se_p10, "\n")
## 10th percentile: 12.75 ; SE (boot): 0.506845