library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.2
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.3.2
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(boot)
##
## Attaching package: 'boot'
##
## The following object is masked from 'package:lattice':
##
## melanoma
We now review k-fold cross-validation.
The data is split into k folds of equal size. The model is trained on k-1 folds. Then, the test error is computed on the excluded fold. This process is repeated until each fold has been used as the validation set. The error for each k fold are then averaged.
CV has lower variability than the validation set approach.
Validation set tends to overestimate test error.
CV is less computationally expensive compared to LOOCV.
CV usually results in more bias than LOOCV, but less variance.
log1 <- glm(default ~ income + balance, data = Default, family = "binomial")
set.seed(1)
train <- sample(nrow(Default),nrow(Default)/2)
log2 <- glm(default ~ income + balance, data = Default[train,], family = "binomial")
log2_predict <- predict(log2, data = Default[-train,], type = "response")
log2_response <- ifelse(log2_predict>.5,"Yes","No")
log2M <- caret::confusionMatrix(reference = as.factor(Default[-train,]$default), data = as.factor(log2_response),positive= "Yes")
log2M$overall[1]
## Accuracy
## 0.9524
set.seed(2)
train <- sample(nrow(Default),nrow(Default)/2)
log2 <- glm(default ~ income + balance, data = Default[train,], family = "binomial")
log2_predict <- predict(log2, data = Default[-train,], type = "response")
log2_response <- ifelse(log2_predict>.5,"Yes","No")
log2M <- caret::confusionMatrix(reference = as.factor(Default[-train,]$default), data = as.factor(log2_response),positive= "Yes")
log2M$overall[1]
## Accuracy
## 0.9556
set.seed(3)
train <- sample(nrow(Default),nrow(Default)/2)
log2 <- glm(default ~ income + balance, data = Default[train,], family = "binomial")
log2_predict <- predict(log2, data = Default[-train,], type = "response")
log2_response <- ifelse(log2_predict>.5,"Yes","No")
log2M <- caret::confusionMatrix(reference = as.factor(Default[-train,]$default), data = as.factor(log2_response),positive= "Yes")
log2M$overall[1]
## Accuracy
## 0.9536
set.seed(4)
train <- sample(nrow(Default),nrow(Default)/2)
log2 <- glm(default ~ income + balance, data = Default[train,], family = "binomial")
log2_predict <- predict(log2, data = Default[-train,], type = "response")
log2_response <- ifelse(log2_predict>.5,"Yes","No")
log2M <- caret::confusionMatrix(reference = as.factor(Default[-train,]$default), data = as.factor(log2_response),positive= "Yes")
log2M$overall[1]
## Accuracy
## 0.9526
For all iterations of this method, the error rate remains roughly equal.
set.seed(1)
train <- sample(nrow(Default),nrow(Default)/2)
log2 <- glm(default ~ income + balance + student, data = Default[train,], family = "binomial")
log2_predict <- predict(log2, data = Default[-train,], type = "response")
log2_response <- ifelse(log2_predict>.5,"Yes","No")
log2M <- caret::confusionMatrix(reference = as.factor(Default[-train,]$default), data = as.factor(log2_response),positive= "Yes")
log2M$overall[1]
## Accuracy
## 0.9516
The student variable resulted in a slight increase in the accuracy. .9516 compared to .9524
set.seed(1)
log3 <- glm(default ~ income + balance, data = Default, family = "binomial")
summary(log3)$coefficients
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154047e+01 4.347564e-01 -26.544680 2.958355e-155
## income 2.080898e-05 4.985167e-06 4.174178 2.990638e-05
## balance 5.647103e-03 2.273731e-04 24.836280 3.638120e-136
boot.fn <- function(data, index){
log3 <- glm(default ~ income + balance, family = "binomial", data = data[index,])
return(summary(log3)$coefficients[2:3,2])
}
boot.fn(Default,1:nrow(Default))
## income balance
## 4.985167e-06 2.273731e-04
boot(Default,boot.fn,100)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 4.985167e-06 1.625905e-08 1.391092e-07
## t2* 2.273731e-04 5.382713e-07 1.043266e-05
mu <- mean(Boston$medv)
mu
## [1] 22.53281
se <- sd(Boston$medv)/sqrt(nrow(Boston))
se
## [1] 0.4088611
boot.fn <- function(data, index){
return(mean(data[index,]$medv))
}
boot.fn(Boston,1:nrow(Boston))
## [1] 22.53281
boot(Boston,boot.fn,1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 -0.001814032 0.3916794
The SE is remarkably close.
mu-2*se
## [1] 21.71508
mu+2*se
## [1] 23.35053
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
The confidence interval and t-test are very similar
med <- median(Boston$medv)
med
## [1] 21.2
boot.fn <- function(data, index){
return(median(data[index,]$medv))
}
boot(Boston,boot.fn,1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.0199 0.3751996
The estimated value from bootstrap is identical with a low standard error.
ten <- quantile(Boston$medv, 0.1)
ten
## 10%
## 12.75
boot.fn <- function(data, index){
return(quantile(data[index,]$medv,.1))
}
boot(Boston,boot.fn,1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 -0.011 0.5287898
The estimated value from bootstrap is identical with a low standard error.