We split our dataset into k groups. We train our model on k-1 folds and then test on the last fold. We repeat this until every k has been tested once.
It is both a lower variabilty and it uses all of the data when compared to validation. This being said, validation is faster to compute, which becomes a larger factor the bigger your data set is.
It is faster to compute than LOOCV is. This being said, LOOCV doesn’t have the same variability that k-fold cross-validation can have.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ISLR2)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(boot)
##
## Attaching package: 'boot'
##
## The following object is masked from 'package:lattice':
##
## melanoma
log.default <- glm(default ~ income + balance, data = Default, family = 'binomial')
summary(log.default)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
set.seed(1)
index <- createDataPartition(y = Default$default, p = .75, list = FALSE)
train <- Default[index,]
test <- Default[-index,]
trainlog.default <- glm(default ~ income + balance, data = train, family = 'binomial')
summary(trainlog.default)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4142 -0.1450 -0.0581 -0.0219 3.7082
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.128e+01 4.961e-01 -22.730 <2e-16 ***
## income 1.679e-05 5.802e-06 2.895 0.0038 **
## balance 5.558e-03 2.580e-04 21.539 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2192.2 on 7500 degrees of freedom
## Residual deviance: 1184.4 on 7498 degrees of freedom
## AIC: 1190.4
##
## Number of Fisher Scoring iterations: 8
glm.probs <- predict(trainlog.default, newdata = test, type = 'response')
glm.predict <- rep("No", length(glm.probs))
glm.predict[glm.probs >.5] <- "Yes"
mean(glm.predict != test)
## [1] 0.5833333
index1 <- createDataPartition(y = Default$default, p = .9, list = FALSE)
train1 <- Default[index1,]
test1 <- Default[-index1,]
trainlog.default1 <- glm(default ~ income + balance, data = train1, family = 'binomial')
glm.probs1 <- predict(trainlog.default1, newdata = test1, type = 'response')
glm.predict1 <- rep("No", length(glm.probs1))
glm.predict1[glm.probs1 >.5] <- "Yes"
mean(glm.predict1 != test1)
## [1] 0.5780781
index2 <- createDataPartition(y = Default$default, p = .6, list = FALSE)
train2 <- Default[index2,]
test2 <- Default[-index2,]
trainlog.default2 <- glm(default ~ income + balance, data = train2, family = 'binomial')
glm.probs2 <- predict(trainlog.default2, newdata = test2, type = 'response')
glm.predict2 <- rep("No", length(glm.probs2))
glm.predict2[glm.probs2 >.5] <- "Yes"
mean(glm.predict2 != test2)
## [1] 0.5787072
index3 <- createDataPartition(y = Default$default, p = .5, list = FALSE)
train3 <- Default[index3,]
test3 <- Default[-index3,]
trainlog.default3 <- glm(default ~ income + balance, data = train3, family = 'binomial')
glm.probs3 <- predict(trainlog.default3, newdata = test3, type = 'response')
glm.predict3 <- rep("No", length(glm.probs3))
glm.predict3[glm.probs >.5] <- "Yes"
mean(glm.predict3 != test3)
## [1] 0.5847169
It looks like the prediction error still remains around the 57-59% error range.
#back to .75
index4 <- createDataPartition(y = Default$default, p = .75, list = FALSE)
train4 <- Default[index4,]
test4 <- Default[-index4,]
trainlog.default4 <- glm(default ~ income + balance + student, data = train4, family = 'binomial')
glm.probs4 <- predict(trainlog.default4, newdata = test4, type = 'response')
glm.predict4 <- rep("No", length(glm.probs4))
glm.predict4[glm.probs >.5] <- "Yes"
mean(glm.predict4 != test4)
## [1] 0.5853341
This is roughly the same error rate that we got without the student variable. It seems like the dummy variable doesn’t significantly effect our error rate.
set.seed(1)
sixglm.fit <- glm(default ~ income + balance, data = Default, family = 'binomial')
summary(sixglm.fit)$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154047e+01 4.347564e-01 -26.544680 2.958355e-155
## income 2.080898e-05 4.985167e-06 4.174178 2.990638e-05
## balance 5.647103e-03 2.273731e-04 24.836280 3.638120e-136
boot.fn <- function(data, index) {
glm.fit <- glm(default ~ income + balance, data = data, family = 'binomial', subset = index)
return(coef(glm.fit))
}
boot(Default, boot.fn, 500)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 500)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -2.298134e-02 4.212334e-01
## t2* 2.080898e-05 -2.053140e-07 5.184890e-06
## t3* 5.647103e-03 1.678038e-05 2.178846e-04
The bootstrap has a lower std. error for t1 and t3, but a higher std. error for t2. This being said, the std errors are pretty similar to one another.
muhat <- mean(Boston$medv)
muhat
## [1] 22.53281
sehat <- sd(Boston$medv) / sqrt(dim(Boston)[1])
sehat
## [1] 0.4088611
On average, our sample mean will be .4088 off from our population mean.
bootfn <- function(data, index) {
muhat <- mean(data[index])
return (muhat)
}
bootstrap <- boot(Boston$medv, bootfn, 500)
bootstrap
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = bootfn, R = 500)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 -0.008388933 0.4106569
It’s slightly lower, but the standard errors are essentially equivalent.
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
ci <- c(bootstrap$t0 - 2 * .408, bootstrap$t0 + 2 * .408)
ci
## [1] 21.71681 23.34881
medianhat <- median(Boston$medv)
medianhat
## [1] 21.2
bootfn <- function(data, index) {
medianhat <- median(data[index])
return (medianhat)
}
boot(Boston$medv, bootfn, 500)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = bootfn, R = 500)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.0403 0.393396
The medians are essentially the same.
tenthmedv <- quantile(Boston$medv, c(.1))
tenthmedv
## 10%
## 12.75
bootfn <- function(data, index) {
tenthmedv <- quantile(data[index], c(0.1))
return (tenthmedv)
}
boot(Boston$medv, bootfn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = bootfn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.0144 0.5069168
The standard error is .5, leaving a confidence interval for one standard deviation of (12.25,13.25).