library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.2
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.3.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(boot)
## 
## Attaching package: 'boot'
## 
## The following object is masked from 'package:lattice':
## 
##     melanoma

3

We now review k-fold cross-validation.

  1. Explain how k-fold cross-validation is implemented.

The data is split into k folds of equal size. The model is trained on k-1 folds. Then, the test error is computed on the excluded fold. This process is repeated until each fold has been used as the validation set. The error for each k fold are then averaged.

  1. What are the advantages and disadvantages of k-fold cross- validation relative to:
  1. The validation set approach?

CV has lower variability than the validation set approach.

Validation set tends to overestimate test error.

  1. LOOCV?

CV is less computationally expensive compared to LOOCV.

CV usually results in more bias than LOOCV, but less variance.

5

log1 <- glm(default ~ income + balance, data = Default, family = "binomial")
set.seed(1)
train <- sample(nrow(Default),nrow(Default)/2)
log2 <- glm(default ~ income + balance, data = Default[train,], family = "binomial")
log2_predict <- predict(log2, data = Default[-train,], type = "response")
log2_response <- ifelse(log2_predict>.5,"Yes","No")
log2M <- caret::confusionMatrix(reference = as.factor(Default[-train,]$default), data = as.factor(log2_response),positive= "Yes")
log2M$overall[1]
## Accuracy 
##   0.9524
set.seed(2)
train <- sample(nrow(Default),nrow(Default)/2)
log2 <- glm(default ~ income + balance, data = Default[train,], family = "binomial")
log2_predict <- predict(log2, data = Default[-train,], type = "response")
log2_response <- ifelse(log2_predict>.5,"Yes","No")
log2M <- caret::confusionMatrix(reference = as.factor(Default[-train,]$default), data = as.factor(log2_response),positive= "Yes")
log2M$overall[1]
## Accuracy 
##   0.9556
set.seed(3)
train <- sample(nrow(Default),nrow(Default)/2)
log2 <- glm(default ~ income + balance, data = Default[train,], family = "binomial")
log2_predict <- predict(log2, data = Default[-train,], type = "response")
log2_response <- ifelse(log2_predict>.5,"Yes","No")
log2M <- caret::confusionMatrix(reference = as.factor(Default[-train,]$default), data = as.factor(log2_response),positive= "Yes")
log2M$overall[1]
## Accuracy 
##   0.9536
set.seed(4)
train <- sample(nrow(Default),nrow(Default)/2)
log2 <- glm(default ~ income + balance, data = Default[train,], family = "binomial")
log2_predict <- predict(log2, data = Default[-train,], type = "response")
log2_response <- ifelse(log2_predict>.5,"Yes","No")
log2M <- caret::confusionMatrix(reference = as.factor(Default[-train,]$default), data = as.factor(log2_response),positive= "Yes")
log2M$overall[1]
## Accuracy 
##   0.9526

For all iterations of this method, the error rate remains roughly equal.

set.seed(1)
train <- sample(nrow(Default),nrow(Default)/2)
log2 <- glm(default ~ income + balance + student, data = Default[train,], family = "binomial")
log2_predict <- predict(log2, data = Default[-train,], type = "response")
log2_response <- ifelse(log2_predict>.5,"Yes","No")
log2M <- caret::confusionMatrix(reference = as.factor(Default[-train,]$default), data = as.factor(log2_response),positive= "Yes")
log2M$overall[1]
## Accuracy 
##   0.9516

The student variable resulted in a slight increase in the accuracy. .9516 compared to .9524

6

set.seed(1)
log3 <- glm(default ~ income + balance, data = Default, family = "binomial")
summary(log3)$coefficients
##                  Estimate   Std. Error    z value      Pr(>|z|)
## (Intercept) -1.154047e+01 4.347564e-01 -26.544680 2.958355e-155
## income       2.080898e-05 4.985167e-06   4.174178  2.990638e-05
## balance      5.647103e-03 2.273731e-04  24.836280 3.638120e-136
boot.fn <- function(data, index){
  log3 <- glm(default ~ income + balance, family = "binomial", data = data[index,])
  return(summary(log3)$coefficients[2:3,2])
}


boot.fn(Default,1:nrow(Default))
##       income      balance 
## 4.985167e-06 2.273731e-04
boot(Default,boot.fn,100)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Default, statistic = boot.fn, R = 100)
## 
## 
## Bootstrap Statistics :
##         original       bias     std. error
## t1* 4.985167e-06 1.625905e-08 1.391092e-07
## t2* 2.273731e-04 5.382713e-07 1.043266e-05
  1. The standard errors are reduced with the boot function.

9

mu <- mean(Boston$medv)
mu
## [1] 22.53281
se <- sd(Boston$medv)/sqrt(nrow(Boston))
se
## [1] 0.4088611
boot.fn <- function(data, index){
  return(mean(data[index,]$medv))
}

boot.fn(Boston,1:nrow(Boston))
## [1] 22.53281
boot(Boston,boot.fn,1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original       bias    std. error
## t1* 22.53281 -0.001814032   0.3916794

The SE is remarkably close.

mu-2*se
## [1] 21.71508
mu+2*se
## [1] 23.35053
t.test(Boston$medv)
## 
##  One Sample t-test
## 
## data:  Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  21.72953 23.33608
## sample estimates:
## mean of x 
##  22.53281

The confidence interval and t-test are very similar

med <- median(Boston$medv)
med
## [1] 21.2
boot.fn <- function(data, index){
  return(median(data[index,]$medv))
}

boot(Boston,boot.fn,1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*     21.2 -0.0199   0.3751996

The estimated value from bootstrap is identical with a low standard error.

ten <- quantile(Boston$medv, 0.1)
ten
##   10% 
## 12.75
boot.fn <- function(data, index){
  return(quantile(data[index,]$medv,.1))
}

boot(Boston,boot.fn,1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*    12.75  -0.011   0.5287898

The estimated value from bootstrap is identical with a low standard error.