3
- K-fold cross-validation is a model evaluation method that involves
dividing the dataset into k approximately equal-sized folds. The model
is trained k times, each time using a different fold as the validation
set and the remaining k–1 folds as the training set. After training, the
model’s performance is evaluated on the validation fold.
- Compared to the validation set approach, k-fold cross-validation has
the advantage of producing a more accurate estimate of the test error
since it uses multiple training-validation splits. The disadvantage is
k-fold is more computationally intensive. In comparison to LOOCV, k-fold
is less computationally expensive. The disadvantage is k-fold CV
introduce more bias. ## 5
library(ISLR)
set.seed(1)
# (a)
model_full <- glm(default ~ income + balance, data = Default, family = "binomial")
summary(model_full)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
# (b)
train_index <- sample(1:nrow(Default), nrow(Default) / 2)
train_data <- Default[train_index, ]
valid_data <- Default[-train_index, ]
model_train <- glm(default ~ income + balance, data = train_data, family = "binomial")
probs <- predict(model_train, newdata = valid_data, type = "response")
preds <- ifelse(probs > 0.5, "Yes", "No")
actual <- valid_data$default
mean(preds != actual)
## [1] 0.0254
# (c)
set.seed(1)
train1 <- sample(1:nrow(Default), nrow(Default) / 2)
model1 <- glm(default ~ income + balance, data = Default, subset = train1, family = "binomial")
probs1 <- predict(model1, newdata = Default[-train1, ], type = "response")
preds1 <- ifelse(probs1 > 0.5, "Yes", "No")
error1 <- mean(preds1 != Default$default[-train1])
set.seed(2)
train2 <- sample(1:nrow(Default), nrow(Default) / 2)
model2 <- glm(default ~ income + balance, data = Default, subset = train2, family = "binomial")
probs2 <- predict(model2, newdata = Default[-train2, ], type = "response")
preds2 <- ifelse(probs2 > 0.5, "Yes", "No")
error2 <- mean(preds2 != Default$default[-train2])
set.seed(3)
train3 <- sample(1:nrow(Default), nrow(Default) / 2)
model3 <- glm(default ~ income + balance, data = Default, subset = train3, family = "binomial")
probs3 <- predict(model3, newdata = Default[-train3, ], type = "response")
preds3 <- ifelse(probs3 > 0.5, "Yes", "No")
error3 <- mean(preds3 != Default$default[-train3])
c(error1, error2, error3)
## [1] 0.0254 0.0238 0.0264
# (d)
set.seed(1)
train <- sample(1:nrow(Default), nrow(Default) / 2)
model_with_student <- glm(default ~ income + balance + student, data = Default, subset = train, family = "binomial")
probs_student <- predict(model_with_student, newdata = Default[-train, ], type = "response")
preds_student <- ifelse(probs_student > 0.5, "Yes", "No")
error_student <- mean(preds_student != Default$default[-train])
error_student
## [1] 0.026
6
# (a)
set.seed(1)
glm_fit <- glm(default ~ income + balance, data = Default, family = "binomial")
summary(glm_fit)$coefficients
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154047e+01 4.347564e-01 -26.544680 2.958355e-155
## income 2.080898e-05 4.985167e-06 4.174178 2.990638e-05
## balance 5.647103e-03 2.273731e-04 24.836280 3.638120e-136
# (b)
boot_fn <- function(data, index) {
fit <- glm(default ~ income + balance, data = data, subset = index, family = "binomial")
return(coef(fit)[2:3]) # Return only income and balance coefficients
}
# (c)
library(boot)
set.seed(1)
boot_results <- boot(Default, boot_fn, R = 1000)
boot_results
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot_fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 2.080898e-05 1.680317e-07 4.866284e-06
## t2* 5.647103e-03 1.855765e-05 2.298949e-04
boot_results$t0
## income balance
## 2.080898e-05 5.647103e-03
apply(boot_results$t, 2, sd)
## [1] 4.866284e-06 2.298949e-04
# (d) The standard error of 4.866284e-06 using the bootstrap was similar to the standard error from glm() of 4.985167e-06, suggesting that the model assumptions are met.
9
library(MASS)
data("Boston")
medv <- Boston$medv
n <- length(medv)
# (a)
mu_hat <- mean(medv)
mu_hat
## [1] 22.53281
# (b)
se_formula <- sd(medv) / sqrt(n)
se_formula
## [1] 0.4088611
# (c)
boot_mean <- function(data, index) {
mean(data[index])
}
set.seed(1)
boot_result <- boot(medv, boot_mean, R = 1000)
se_bootstrap <- sd(boot_result$t)
se_bootstrap
## [1] 0.4106622
# (d)
ci_bootstrap <- c(mu_hat - 2 * se_bootstrap, mu_hat + 2 * se_bootstrap)
ci_formula <- c(mu_hat - 2 * se_formula, mu_hat + 2 * se_formula)
ci_bootstrap
## [1] 21.71148 23.35413
ci_formula
## [1] 21.71508 23.35053
# (e)
mu_median <- median(medv)
mu_median
## [1] 21.2
# (f)
boot_median <- function(data, index) {
median(data[index])
}
set.seed(1)
boot_median_result <- boot(medv, boot_median, R = 1000)
se_median <- sd(boot_median_result$t)
se_median
## [1] 0.3778075
# (g)
mu_10 <- quantile(medv, 0.1)
mu_10
## 10%
## 12.75
# (h)
boot_p10 <- function(data, index) {
quantile(data[index], 0.1)
}
set.seed(1)
boot_p10_result <- boot(medv, boot_p10, R = 1000)
se_p10 <- sd(boot_p10_result$t)
se_p10
## [1] 0.4767526