Selected exercises from ISLR, Chapter 5
knitr::opts_chunk$set(echo = TRUE)
library(ISLR2)
library(olsrr)
library(tidyverse)
library(e1071)
library(boot)
We now review k-fold cross-validation.

The data is split into k groups; a model is trained on all but one of the groups and tested on the group that is left out. This process is repeated so that each group takes a turn as the test set, and the k error estimates are averaged. (A minimal code sketch follows the comparisons below.)

What are the advantages and disadvantages of k-fold cross-validation relative to:

##### i. the validation set approach?

Advantages of k-fold: much lower variability in the error estimate; all of the data is used for both training and validation; the validation set approach tends to over-estimate the test error. Disadvantages: the validation set approach is simpler and computationally cheaper.

##### ii. LOOCV?

Advantages of k-fold: it is less computationally demanding and typically gives a more accurate (lower-variance) estimate of the test error. Disadvantages: k-fold introduces an element of randomness through the fold assignment, which LOOCV avoids.
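As a concrete sketch (not part of the original exercise), k-fold CV for a logistic model can be run with cv.glm() from the boot package loaded above. Here cost and fit_all are names introduced for illustration; the cost function converts predicted probabilities into a misclassification rate.

# 10-fold CV for the Default logistic model (illustrative sketch)
# cost() turns predicted probabilities into a misclassification rate
cost <- function(y, prob) mean(abs(y - prob) > 0.5)
fit_all <- glm(default ~ income + balance, data = Default, family = "binomial")
set.seed(23)
cv.glm(Default, fit_all, cost = cost, K = 10)$delta[1]

The first component of delta is the raw K-fold estimate of the test error.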
In Chapter 4, we used logistic regression to predict the probability of default using income and balance on the Default data set. We will now estimate the test error of this logistic regression model using the validation set approach. Do not forget to set a random seed before beginning your analysis.
set.seed(23)
d1 = Default
glm1 = glm(default ~ income + balance, data = d1, family = "binomial")
summary(glm1)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = d1)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
set.seed(23)
index <- sample(nrow(d1),0.7*nrow(d1),replace = F)
train_d1 <- d1[index, ]
val_d1 <- d1[-index, ]
set.seed(23)
glm2 = glm(default ~ income + balance, data = train_d1, family = "binomial")
summary(glm2)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = train_d1)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.189e+01 5.410e-01 -21.975 < 2e-16 ***
## income 2.798e-05 6.086e-06 4.597 4.28e-06 ***
## balance 5.686e-03 2.794e-04 20.351 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1976.0 on 6999 degrees of freedom
## Residual deviance: 1067.3 on 6997 degrees of freedom
## AIC: 1073.3
##
## Number of Fisher Scoring iterations: 8
d1_prob = predict(glm2, val_d1, type = "response")
d1_pred = rep("No", nrow(val_d1))
d1_pred[d1_prob > .5] = "Yes"
mean(d1_pred!=val_d1$default)
## [1] 0.02733333
set.seed(23)
index <- sample(nrow(d1),0.5*nrow(d1),replace = F)
train_d1 <- d1[index, ]
val_d1 <- d1[-index, ]
glm2 = glm(default ~ income + balance, data = train_d1, family = "binomial")
d1_prob = predict(glm2, val_d1, type = "response")
d1_pred = rep("No", nrow(val_d1))
d1_pred[d1_prob > .5] = "Yes"
mean(d1_pred!=val_d1$default)
## [1] 0.029
set.seed(23)
index <- sample(nrow(d1),0.8*nrow(d1),replace = F)
train_d1 <- d1[index, ]
val_d1 <- d1[-index, ]
glm2 = glm(default ~ income + balance, data = train_d1, family = "binomial")
d1_prob = predict(glm2, val_d1, type = "response")
d1_pred = rep("No", nrow(val_d1))
d1_pred[d1_prob > .5] = "Yes"
mean(d1_pred!=val_d1$default)
## [1] 0.0245
set.seed(23)
index <- sample(nrow(d1),0.9*nrow(d1),replace = F)
train_d1 <- d1[index, ]
val_d1 <- d1[-index, ]
glm2 = glm(default ~ income + balance, data = train_d1, family = "binomial")
d1_prob = predict(glm2, val_d1, type = "response")
d1_pred = rep("No", nrow(val_d1))
d1_pred[d1_prob > .5] = "Yes"
mean(d1_pred!=val_d1$default)
## [1] 0.027
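The four splits above repeat the same steps; as an illustrative sketch, they can be wrapped in a small helper (val_err is a name introduced here, not part of the original code):

# Illustrative helper: fit on a random training fraction and return the
# validation-set misclassification rate
val_err <- function(frac, seed = 23) {
  set.seed(seed)
  idx <- sample(nrow(d1), frac * nrow(d1))
  fit <- glm(default ~ income + balance, data = d1[idx, ], family = "binomial")
  prob <- predict(fit, d1[-idx, ], type = "response")
  mean(ifelse(prob > 0.5, "Yes", "No") != d1$default[-idx])
}
sapply(c(0.5, 0.7, 0.8, 0.9), val_err)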
set.seed(23)
index <- sample(nrow(d1),0.9*nrow(d1),replace = F)
train_d1 <- d1[index, ]
val_d1 <- d1[-index, ]
glm2 = glm(default ~ income + balance + as.factor(student), data = train_d1, family = "binomial")
d1_prob = predict(glm2, val_d1, type = "response")
d1_pred = rep("No", nrow(val_d1))
d1_pred[d1_prob > .5] = "Yes"
mean(d1_pred!=val_d1$default)
## [1] 0.027
Adding the student dummy variable did not change the test error; it remained 0.027.
We continue to consider the use of a logistic regression model to predict the probability of default using income and balance on the Default data set. In particular, we will now compute estimates for the standard errors of the income and balance logistic regression coefficients in two different ways: (1) using the bootstrap, and (2) using the standard formula for computing the standard errors in the glm() function. Do not forget to set a random seed before beginning your analysis.
glm3 = glm(default ~ income + balance, data = d1, family = "binomial")
summary(glm3)$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154047e+01 4.347564e-01 -26.544680 2.958355e-155
## income 2.080898e-05 4.985167e-06 4.174178 2.990638e-05
## balance 5.647103e-03 2.273731e-04 24.836280 3.638120e-136
The estimated standard error (ESE) for income is 4.985e-06 and for balance is 2.274e-04.
boot.fn = function(data, index) {
  coef(glm(default ~ income + balance, data = data, subset = index, family = "binomial"))
}
set.seed(23)
boot.fn(d1, 1:10000)[-1]
## income balance
## 2.080898e-05 5.647103e-03
The coefficient estimate for income is 2.08e-05 and for balance is 5.65e-03.
set.seed(23)
boot(data = d1, statistic = boot.fn, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = d1, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -6.425518e-03 4.252997e-01
## t2* 2.080898e-05 -4.567628e-08 4.571768e-06
## t3* 5.647103e-03 -3.014214e-07 2.228104e-04
The ESE for income is 4.572e-06 and for balance is 2.228e-04.
Using the glm() function, the ESE for income is 4.985e-06 and for balance is 2.274e-04. Using the bootstrap, the ESE for income is 4.572e-06 and for balance is 2.228e-04.
The bootstrap standard errors are slightly lower than the formula-based glm() estimates.
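To see the two sets of estimates side by side, they can be tabulated; a minimal sketch (se_glm, se_boot, and boot_out are names introduced here, and boot() is re-run because the earlier result was not stored):

# Compare formula-based and bootstrap standard errors (illustrative)
se_glm <- summary(glm3)$coef[-1, "Std. Error"]
set.seed(23)
boot_out <- boot(d1, boot.fn, R = 1000)
se_boot <- apply(boot_out$t[, -1], 2, sd)  # drop the intercept column
rbind(glm = se_glm, bootstrap = se_boot)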
We will now consider the Boston housing data set, from the ISLR2 library.
bos = Boston
mean(bos$medv)
## [1] 22.53281
sd(bos$medv)/sqrt(length(bos$medv))
## [1] 0.4088611
boot.fn = function(vector, index) {mean(vector[index])}
set.seed(23)
boot_res = boot(bos$medv, boot.fn, 1000)
boot_res
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = bos$medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 -0.01904763 0.4000822
The bootstrap standard error is very similar to the formula-based estimate, just slightly lower (0.400 vs. 0.409).
boot_res_sd = sd(boot_res$t)
c(mean(bos$medv) - 2*boot_res_sd, mean(bos$medv) + 2*boot_res_sd)
## [1] 21.73264 23.33297
t.test(bos$medv)
##
## One Sample t-test
##
## data: bos$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
The bootstrap-based interval is very similar to the t-test confidence interval.
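The boot package can also construct the interval directly from the bootstrap output via boot.ci(); as a sketch, a percentile interval on the boot_res object above:

# Percentile bootstrap CI for the mean of medv (illustrative)
boot.ci(boot_res, type = "perc")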
median(bos$medv)
## [1] 21.2
boot.fn = function(vector, index) {median(vector[index])}
set.seed(23)
boot_res = boot(bos$medv, boot.fn, 1000)
boot_res
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = bos$medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.02375 0.3647828
The standard error of the median (0.365) is small, slightly smaller than that of the mean (0.400).
quantile(bos$medv, 0.1)
## 10%
## 12.75
boot.fn = function(vector, index) {quantile(vector[index], 0.1)}
set.seed(23)
boot_res = boot(bos$medv, boot.fn, 1000)
boot_res
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = bos$medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.028 0.4964549
The standard error (0.496) is larger than in our other bootstraps, likely because the data are sparser in the lower tail, making the tenth-percentile estimate less stable.