The k-fold cross-validation approach involves randomly dividing the set of observations into k groups, or folds, of approximately equal size. The first fold is treated as a validation set, and the method is fit on the remaining k-1 folds. The mean squared error is then computed on the observations in the held-out fold. This procedure is repeated k times, each time treating a different fold as the validation set, and the k resulting error estimates are averaged to give the k-fold CV estimate.
In terms of bias and variance, the trade-offs of k-fold CV are as follows. Compared to the validation set approach: less variance and less bias, since a single validation split yields a highly variable error estimate and tends to overestimate the test error (the model is fit on only a fraction of the observations). Compared to LOOCV: less variance but more bias, since LOOCV is approximately unbiased but averages n highly correlated fits. A minimal sketch of the procedure is shown below.
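As a minimal sketch of k-fold CV on the Default data used below (assumptions: 10 folds, misclassification rate as the error metric since the response is binary, and illustrative fold-assignment code not taken from the original):

require(ISLR)
data(Default)
set.seed(1)
k <- 10
folds <- sample(rep(1:k, length.out = nrow(Default))) # random fold labels
cv.err <- sapply(1:k, function(i) {
  # fit on all folds except i, evaluate on the held-out fold i
  fit <- glm(default ~ income + balance, data = Default[folds != i, ], family = binomial)
  prob <- predict(fit, Default[folds == i, ], type = "response")
  pred <- ifelse(prob > 0.5, "Yes", "No")
  mean(pred != Default[folds == i, ]$default)
})
mean(cv.err) # k-fold CV estimate of the test error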
require(ISLR)
## Loading required package: ISLR
data(Default)
set.seed(1)
fit1 <- glm(default ~ income + balance, data=Default, family=binomial)
summary(fit1)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -2.4725  -0.1444  -0.0574  -0.0211   3.7245
##
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
set.seed(1)
train <- sample(nrow(Default), nrow(Default)*0.5) # 50/50 split into training and validation sets
fit2 <- glm(default ~ income + balance, data=Default, family=binomial, subset=train)
prob2 <- predict(fit2, Default[-train,], type="response")
pred2 <- ifelse(prob2 > 0.5, "Yes", "No")
table(pred2, Default[-train,]$default)
##
## pred2    No  Yes
##   No   4805  115
##   Yes    28   52
mean(Default[-train,]$default != pred2) # test error
## [1] 0.0286
Test Error = 0.0286
Repeat 1
set.seed(2)
train <- sample(nrow(Default), nrow(Default)*0.5)
fit2 <- glm(default ~ income + balance, data=Default, family=binomial, subset=train)
prob2 <- predict(fit2, Default[-train,], type="response")
pred2 <- ifelse(prob2 > 0.5, "Yes", "No")
mean(Default[-train,]$default != pred2)
## [1] 0.0276
Test Error = 0.0276
Repeat 2
set.seed(3) # Repeat 2
train <- sample(nrow(Default), nrow(Default)*0.5)
fit2 <- glm(default ~ income + balance, data=Default, family=binomial, subset=train)
prob2 <- predict(fit2, Default[-train,], type="response")
pred2 <- ifelse(prob2 > 0.5, "Yes", "No")
mean(Default[-train,]$default != pred2)
## [1] 0.0248
Test Error = 0.0248
Repeat 3
set.seed(4) # Repeat 3
train <- sample(nrow(Default), nrow(Default)*0.5)
fit2 <- glm(default ~ income + balance, data=Default, family=binomial, subset=train)
prob2 <- predict(fit2, Default[-train,], type="response")
pred2 <- ifelse(prob2 > 0.5, "Yes", "No")
mean(Default[-train,]$default != pred2)
## [1] 0.0262
Test Error = 0.0262
The test error is consistent at around 2.5-2.9% across the four splits, so the validation estimate does not have very high variance. The short sketch below repeats the split over several seeds to show the spread.
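As a quick check (a sketch, not part of the original analysis; the choice of ten seeds is arbitrary):

errs <- sapply(1:10, function(s) {
  set.seed(s)
  tr <- sample(nrow(Default), nrow(Default) * 0.5)
  f <- glm(default ~ income + balance, data = Default, family = binomial, subset = tr)
  p <- ifelse(predict(f, Default[-tr, ], type = "response") > 0.5, "Yes", "No")
  mean(p != Default[-tr, ]$default) # validation error for this split
})
round(range(errs), 4) # spread of the error estimates across seeds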
set.seed(1)
train <- sample(nrow(Default), nrow(Default)*0.5)
fit3 <- glm(default ~ income + balance + student, data=Default, family=binomial, subset=train)
prob3 <- predict(fit3, Default[-train,], type="response")
pred3 <- ifelse(prob3 > 0.5, "Yes", "No")
mean(Default[-train,]$default != pred3)
## [1] 0.0288
Test Error = 0.0288. Including the student dummy variable does not meaningfully reduce the test error.
require(ISLR)
data(Default)
set.seed(1)
fit1 <- glm(default ~ income + balance, data=Default, family=binomial)
summary(fit1)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -2.4725  -0.1444  -0.0574  -0.0211   3.7245
##
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
Estimated standard errors from glm(): income = 4.985e-06, balance = 2.274e-04.
set.seed(1)
boot.fn <- function(df, trainid) {
  # statistic for boot(): coefficients of a logistic fit on the indexed sample
  return(coef(glm(default ~ income + balance, data=df, family=binomial, subset=trainid)))
}
boot.fn(Default, 1:nrow(Default))
##   (Intercept)        income       balance
## -1.154047e+01  2.080898e-05  5.647103e-03
require(boot)
## Loading required package: boot
boot(Default, boot.fn, R=100)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 100)
##
##
## Bootstrap Statistics :
##          original          bias    std. error
## t1* -1.154047e+01  9.699111e-02  4.101121e-01
## t2*  2.080898e-05  6.715005e-08  4.127740e-06
## t3*  5.647103e-03 -5.733883e-05  2.105660e-04
Standard errors using the bootstrap (R = 100): income = 4.127740e-06, balance = 2.105660e-04.
With R = 100, the bootstrap standard errors are quite close to the glm() estimates: income = 4.985e-06 (glm) vs 4.127740e-06 (bootstrap); balance = 2.274e-04 (glm) vs 2.105660e-04 (bootstrap).
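For a more stable comparison one would typically increase the number of bootstrap replicates; a sketch (R = 1000 is an arbitrary choice, not from the original):

set.seed(1)
boot(Default, boot.fn, R = 1000) # same statistic, more replicates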
require(MASS)
## Loading required package: MASS
require(boot)
data(Boston)
(medv.mu <- mean(Boston$medv))
## [1] 22.53281
Estimate of the population mean of medv: mu hat = 22.53281.
(medv.sd <- sd(Boston$medv)/sqrt(nrow(Boston)))
## [1] 0.4088611
Estimate of the standard error of the mean, computed as sd(medv)/sqrt(n): 0.4088611.
set.seed(1)
mean.fn <- function(var, id) {
  # statistic for boot(): mean of the indexed bootstrap sample
  return(mean(var[id]))
}
(boot.res <- boot(Boston$medv, mean.fn, R=100))
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = mean.fn, R = 100)
##
##
## Bootstrap Statistics :
##     original        bias    std. error
## t1* 22.53281 -0.04836957     0.3815554
Bootstrap estimate of the standard error = 0.3815554, which is very close to the previous formula-based estimate of 0.4088611.
boot.res$t0 - 2*sd(boot.res$t) # lower bound
## [1] 21.7697
boot.res$t0 + 2*sd(boot.res$t) # upper bound
## [1] 23.29592
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
95% confidence interval from the bootstrap (estimate plus or minus two standard errors): lower bound 21.7697, upper bound 23.29592. This is very close to the t.test() interval of (21.72953, 23.33608) around the same mean of 22.53281.
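The boot package can also compute intervals directly from the bootstrap object; as a sketch, a percentile interval (boot.ci() supports several interval types):

boot.ci(boot.res, type = "perc") # percentile interval from the 100 replicates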
(medv.median <- median(Boston$medv))
## [1] 21.2
medv.median = 21.2
set.seed(1)
median.fn <- function(var, id) {
  # statistic for boot(): median of the indexed bootstrap sample
  return(median(var[id]))
}
(boot.res <- boot(Boston$medv, median.fn, R=100))
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = median.fn, R = 100)
##
##
## Bootstrap Statistics :
##     original   bias    std. error
## t1*     21.2 -0.026     0.3683488
Bootstrap estimate of the standard error of the median = 0.3683488, which is small relative to the median of 21.2.
(medv.mu10 <- quantile(Boston$medv, 0.1))
## 10%
## 12.75
Estimated 10th percentile of medv in the Boston suburbs = 12.75.
set.seed(1)
quantile10.fn <- function(var, id) {
  # statistic for boot(): 10th percentile of the indexed bootstrap sample
  return(quantile(var[id], 0.1))
}
(boot.res <- boot(Boston$medv, quantile10.fn, R=100))
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = quantile10.fn, R = 100)
##
##
## Bootstrap Statistics :
##     original    bias    std. error
## t1*    12.75 -0.0985     0.4988094
Estimated standard error = 0.4988094, which is fairly small relative to the percentile estimate of 12.75.