It is used to evaluate the performance of a machine learning model on unseen data. It shuffles the entire dataset and divides it into k sized folds then runs the model for said k folds.
The validation set approach? The validation set approach has a lower variance since it averages the results across multiple splits. The disadvantage of this approach is that it is computationally expensive since it can require running the model k times.
LOOCV? Advantages of this approach is that it has a better balance between the bias-variance tradeoff. Another advantage is that this is computationally less expensive. The disadvantage is that this does have a higher bias and can slightly overestimate the true test error.
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.3
library(boot)
data(Default)
set.seed(42)
log_model <- glm(default ~ income + balance, data = Default, family = "binomial")
summary(log_model)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
train_indices <- sample(nrow(Default), 0.8 * nrow(Default))
train_set <- Default[train_indices, ]
validation_set <- Default[-train_indices, ]
model_train <- glm(default ~ income + balance, data = train_set, family = binomial)
pred_probs <- predict(model_train, newdata = validation_set, type = "response")
pred_default <- ifelse(pred_probs > 0.5, "Yes", "No")
validation_error <- mean(pred_default != validation_set$default)
print(validation_error)
## [1] 0.0255
get_validation_error <- function(seed_val) {
set.seed(seed_val)
model_train <- glm(default ~ income + balance, data = train_set, family = binomial)
pred_probs <- predict(model_train, newdata = validation_set, type = "response")
pred_default <- ifelse(pred_probs > 0.5, "Yes", "No")
return(mean(pred_default != validation_set$default))
}
seeds <- c(123, 456, 789)
error_rates <- sapply(seeds, get_validation_error)
results <- data.frame(Seed = seeds, ValidationError = error_rates)
print(results)
## Seed ValidationError
## 1 123 0.0255
## 2 456 0.0255
## 3 789 0.0255
set.seed(42)
log_model_student <- glm(default ~ income + balance + student, data = Default, family = "binomial")
summary(log_model_student)
##
## Call:
## glm(formula = default ~ income + balance + student, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.087e+01 4.923e-01 -22.080 < 2e-16 ***
## income 3.033e-06 8.203e-06 0.370 0.71152
## balance 5.737e-03 2.319e-04 24.738 < 2e-16 ***
## studentYes -6.468e-01 2.363e-01 -2.738 0.00619 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1571.5 on 9996 degrees of freedom
## AIC: 1579.5
##
## Number of Fisher Scoring iterations: 8
pred_probs1 <- predict(log_model_student, newdata = validation_set, type = "response")
pred_default1 <- ifelse(pred_probs > 0.5, "Yes", "No")
validation_error1 <- mean(pred_default1 != validation_set$default)
print(validation_error1)
## [1] 0.0255
The test error rate is the same as all other test error rates even with the inclusion of the student dummyvariable.
set.seed(42)
log_model <- glm(default ~ income + balance, data = Default, family = "binomial")
summary(log_model)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
set.seed(42)
boot.fn <- function(data, index) {
coef(glm(default ~ income + balance, data = data, subset = index, family = "binomial"))
}
boot.fn(Default, 1:8000)
## (Intercept) income balance
## -1.179560e+01 2.226489e-05 5.786903e-03
set.seed(42)
boot(Default, boot.fn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -2.292405e-02 4.435269e-01
## t2* 2.080898e-05 2.737444e-08 5.073444e-06
## t3* 5.647103e-03 1.176249e-05 2.299133e-04
The standard errors using the boot function are slightly higher than the ones obtained using the glm function.
data(Boston)
mu_hat <- mean(Boston$medv)
mu_hat
## [1] 22.53281
sd <- sd(Boston$medv)
n <- length(Boston$medv)
se_mu_hat <- sd / sqrt(n)
se_mu_hat
## [1] 0.4088611
mean.fn <- function(data, index)
mean(data[index])
boot(Boston$medv, mean.fn, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = mean.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 -0.002375494 0.4185115
The standard error using the boot function is higher than the standard error that was manually calculated.
mu_hat - 2*.4185115
## [1] 21.69578
mu_hat + 2*.4185115
## [1] 23.36983
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
The confidence interval calculated manually is slightly lower than that of the one provided using the t.test function.
mu_hat_med <- median(Boston$medv)
mu_hat_med
## [1] 21.2
med.fn <- function(data, index)
median(data[index])
boot(Boston$medv, med.fn, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = med.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.0137 0.3906596
mu_hat_01 <- quantile(Boston$medv, 0.10)
mu_hat_01
## 10%
## 12.75
percentile <- function(data, index)
quantile(data[index], 0.10)
boot(data = Boston$medv, percentile, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = percentile, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.0201 0.4950819