We now review k-fold cross-validation. (a) Explain how k-fold cross-validation is implemented.
k-fold cross-validation is implemented by splitting a dataset into x folds to split and test on different x portions of the data at a time. Data will essentially take turns being a training data set vs a testing data set.
The validation set approach? Adv. to validation set is less overfitting on the training dataset. Disadv. is that it is more computationally expensive since you have to fit on x folds instead of once.
LOOCV? Adv of k-fold here is that it’s less computational, since LOOCV is strictly testing on a single data point at a time. Disadv of k-fold is that results can vary depending on the random fold at the time of training.
In Chapter 4, we used logistic regression to predict the probability of default using income and balance on the Default data set. We will now estimate the test error of this logistic regression model using the validation set approach. Do not forget to set a random seed before beginning your analysis. (a) Fit a logistic regression model that uses income and balance to predict default.
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.6.1
set.seed(1)
glm.fit = glm(default ~ income + balance, data = Default, family = binomial)
summary(glm.fit)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
set.seed(1)
train <- sample(nrow(Default), nrow(Default) / 2)
Default.train <- Default[train,]
Default.val <- Default[-train,]
glm.fit2 <- glm(default ~ income + balance, data = Default.train, family = binomial)
glm.probs <- predict(glm.fit2, Default.val, type = "response")
glm.pred <- rep("No", nrow(Default.val))
glm.pred[glm.probs > 0.5] <- "Yes"
mean(glm.pred != Default.val$default)
## [1] 0.0254
set.seed(2)
train <- sample(nrow(Default), nrow(Default) / 2)
glm.fit3 <- glm(default ~ income + balance, data = Default[train, ], family = binomial)
glm.probs <- predict(glm.fit3, Default[-train,], type = "response")
glm.pred <- rep("No", nrow(Default[-train,]))
glm.pred[glm.probs > 0.5] <- "Yes"
mean(glm.pred != Default[-train, ]$default)
## [1] 0.0238
set.seed(3)
train <- sample(nrow(Default), nrow(Default) / 2)
glm.fit4 <- glm(default ~ income + balance, data = Default[train, ], family = binomial)
glm.probs <- predict(glm.fit4, Default[-train,], type = "response")
glm.pred <- rep("No", nrow(Default[-train,]))
glm.pred[glm.probs > 0.5] <- "Yes"
mean(glm.pred != Default[-train, ]$default)
## [1] 0.0264
set.seed(4)
train <- sample(nrow(Default), nrow(Default) / 2)
glm.fit5 <- glm(default ~ income + balance, data = Default[train, ], family = binomial)
glm.probs <- predict(glm.fit5, Default[-train,], type = "response")
glm.pred <- rep("No", nrow(Default[-train,]))
glm.pred[glm.probs > 0.5] <- "Yes"
mean(glm.pred != Default[-train, ]$default)
## [1] 0.0256
set.seed(1)
train <- sample(nrow(Default), nrow(Default) / 2)
glm.fit6 <- glm(default ~ income + balance + student, data = Default[train,], family = binomial)
glm.probs <- predict(glm.fit6, Default[-train, ], type = "response")
glm.pred <- rep("No", nrow(Default[-train, ]))
glm.pred[glm.probs > 0.5] <- "Yes"
mean(glm.pred != Default[-train, ]$default)
## [1] 0.026
We continue to consider the use of a logistic regression model to predict the probability of default using income and balance on the Default data set. In particular, we will now compute estimates for the standard errors of the income and balance logistic regression coefficients in two different ways: (1) using the bootstrap, and (2) using the standard formula for computing the standard errors in the glm() function. Do not forget to set a random seed before beginning your analysis.
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.6.1
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
library(boot)
set.seed(1)
glm.fit <- glm(default ~ income + balance, data = Default, family = binomial)
summary(glm.fit)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
boot.fn <- function(data, index) {
glm.fit <- glm(default ~ income + balance, data = data, subset = index, family = binomial)
return(coef(glm.fit))
}
set.seed(1)
boot(Default, boot.fn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -3.945460e-02 4.344722e-01
## t2* 2.080898e-05 1.680317e-07 4.866284e-06
## t3* 5.647103e-03 1.855765e-05 2.298949e-04
We will now consider the Boston housing data set, from the ISLR2 library.
library(ISLR2)
library(boot)
μ <- mean(Boston$medv)
summary(μ)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22.53 22.53 22.53 22.53 22.53 22.53
se.mu <- sd(Boston$medv) / sqrt(nrow(Boston))
se.mu
## [1] 0.4088611
boot.fn.mean <- function(data, index) {
return(mean(data$medv[index]))
}
boot(Boston, boot.fn.mean, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston, statistic = boot.fn.mean, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.0109415 0.414028
μ - 2 * 0.414028
## [1] 21.70475
μ + 2 * 0.414028
## [1] 23.36086
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
u.med <- median(Boston$medv)
summary(u.med)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.2 21.2 21.2 21.2 21.2 21.2
set.seed(1)
boot.fn.median <- function(data, index) {
return(median(data$medv[index]))
}
boot(Boston, boot.fn.median, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston, statistic = boot.fn.median, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 0.02295 0.3778075
u.0.1 <- quantile(Boston$medv, 0.1)
u.0.1
## 10%
## 12.75
set.seed(1)
boot.fn.tenth <- function(data, index) {
return(quantile(data$medv[index], 0.1))
}
boot(Boston, boot.fn.tenth, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston, statistic = boot.fn.tenth, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.0339 0.4767526