This technique is implemented by splitting the data set into k groups (folds), where k is smaller than the total number of observations. In each iteration, one of the folds is held out as the testing set, while the remaining k − 1 folds form the training set used to estimate the model coefficients. In the end, the testing errors of the k held-out folds are averaged to obtain the final cross-validation error.

The main advantage of the validation set approach is that it is a less complex process that does not demand much computational power. However, since the single-split approach randomly divides the data set into two subsets, its error estimate is subject to higher random variability. This variability is reduced in k-fold cross-validation as k increases: the higher the value of k, the smaller the random element, because more observations are used in both training and testing. That also reduces the bias of the estimate.

Since LOOCV is the special case of k-fold cross-validation in which k equals the number of observations n, it accentuates these same characteristics. It is a computationally very demanding process, especially with large data sets, because it generates n models, and averaging these highly correlated fits tends to increase the variance of the error estimate. The advantage of LOOCV is that it provides a more stable estimate, in the sense that it is not subject to the random assignment of observations to folds that happens in k-fold: every observation is used exactly once for testing. In the end, this reduces the bias of LOOCV compared with k-fold cross-validation.
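As a minimal sketch of k-fold CV and LOOCV, using cv.glm() from the boot package (the misclassification cost function and the choice K = 10 below are illustrative, not part of the exercise):

library(ISLR)
library(boot)
# Misclassification cost at a 0.5 threshold (actual is 0/1, prob is the fitted probability)
cost <- function(actual, prob) mean(abs(actual - prob) > 0.5)
fit <- glm(default ~ income + balance, data = Default, family = binomial)
set.seed(1)
cv.glm(Default, fit, cost, K = 10)$delta[1]  # 10-fold cross-validated error rate
# LOOCV is the special case K = n (slow with 10,000 observations):
# cv.glm(Default, fit, cost)$delta[1]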
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ISLR)
attach(Default)
as_tibble(Default)
## # A tibble: 10,000 x 4
## default student balance income
## <fct> <fct> <dbl> <dbl>
## 1 No No 730. 44362.
## 2 No Yes 817. 12106.
## 3 No No 1074. 31767.
## 4 No No 529. 35704.
## 5 No No 786. 38463.
## 6 No Yes 920. 7492.
## 7 No No 826. 24905.
## 8 No Yes 809. 17600.
## 9 No No 1161. 37469.
## 10 No No 0 29275.
## # ... with 9,990 more rows
summary(Default)
## default student balance income
## No :9667 No :7056 Min. : 0.0 Min. : 772
## Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
logreg_default = glm(default ~ income + balance, data = Default, family = binomial)
summary(logreg_default)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
set.seed(1)
train_idx <- sample(nrow(Default), nrow(Default) / 2)
default_train <- Default[train_idx, ]
default_test <- Default[-train_idx, ]  # reuse the same indices so the test set is the exact complement of the training set
logreg_default = glm(default ~ income + balance, data = default_train, family = binomial)
test_predi <- rep("No", dim(default_test)[1])
fit_prob <- predict(logreg_default, default_test, type = "response")
test_predi[fit_prob > 0.5] = "Yes"
table(Predictions=test_predi, Actual=default_test$default)
## Actual
## Predictions No Yes
## No 4815 113
## Yes 23 49
#Accuracy
mean(test_predi == default_test$default)
## [1] 0.9728
#Validation error
split1 <- mean(test_predi != default_test$default)
split1
## [1] 0.0272
set.seed(2)
train_idx <- sample(nrow(Default), nrow(Default) / 2)
default_train <- Default[train_idx, ]
default_test <- Default[-train_idx, ]
logreg_default = glm(default ~ income + balance, data = default_train, family = binomial)
test_predi <- rep("No", dim(default_test)[1])
fit_prob <- predict(logreg_default, default_test, type = "response")
test_predi[fit_prob > 0.5] = "Yes"
table(Predictions=test_predi, Actual=default_test$default)
## Actual
## Predictions No Yes
## No 4797 122
## Yes 21 60
#Accuracy
mean(test_predi == default_test$default)
## [1] 0.9714
#Validation error
split2 <- mean(test_predi != default_test$default)
split2
## [1] 0.0286
set.seed(3)
train_idx <- sample(nrow(Default), nrow(Default) / 2)
default_train <- Default[train_idx, ]
default_test <- Default[-train_idx, ]
logreg_default = glm(default ~ income + balance, data = default_train, family = binomial)
test_predi <- rep("No", dim(default_test)[1])
fit_prob <- predict(logreg_default, default_test, type = "response")
test_predi[fit_prob > 0.5] = "Yes"
table(Predictions=test_predi, Actual=default_test$default)
## Actual
## Predictions No Yes
## No 4805 116
## Yes 24 55
#Accuracy
mean(test_predi == default_test$default)
## [1] 0.972
#Validation error
split3 <- mean(test_predi != default_test$default)
split3
## [1] 0.028
set.seed(4)
train_idx <- sample(nrow(Default), nrow(Default) / 2)
default_train <- Default[train_idx, ]
default_test <- Default[-train_idx, ]
logreg_default = glm(default ~ income + balance, data = default_train, family = binomial)
test_predi <- rep("No", dim(default_test)[1])
fit_prob <- predict(logreg_default, default_test, type = "response")
test_predi[fit_prob > 0.5] = "Yes"
table(Predictions=test_predi, Actual=default_test$default)
## Actual
## Predictions No Yes
## No 4801 117
## Yes 20 62
#Accuracy
mean(test_predi == default_test$default)
## [1] 0.9726
#Validation error
split4 <- mean(test_predi != default_test$default)
split4
## [1] 0.0274
(split1+split2+split3+split4)/4
## [1] 0.0278
The average validation error over four logistic regressions fit on random splits of the data was 2.78%.
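The four splits above repeat the same steps; as a sketch, they can be wrapped in a helper function (validation_error is a hypothetical name, not from the original code) so that each run is a single call:

validation_error <- function(seed, formula = default ~ income + balance) {
  set.seed(seed)
  train_idx <- sample(nrow(Default), nrow(Default) / 2)
  fit <- glm(formula, data = Default[train_idx, ], family = binomial)
  prob <- predict(fit, Default[-train_idx, ], type = "response")
  pred <- ifelse(prob > 0.5, "Yes", "No")
  mean(pred != Default$default[-train_idx])  # validation error rate
}
mean(sapply(1:4, validation_error))  # average error over the four seeds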
set.seed(5)
train_idx <- sample(nrow(Default), nrow(Default) / 2)
default_train <- Default[train_idx, ]
default_test <- Default[-train_idx, ]
logreg_default = glm(default ~ income + balance + student, data = default_train, family = binomial)
test_predi <- rep("No", dim(default_test)[1])
fit_prob <- predict(logreg_default, default_test, type = "response")
test_predi[fit_prob > 0.5] = "Yes"
table(Predictions=test_predi, Actual=default_test$default)
## Actual
## Predictions No Yes
## No 4814 111
## Yes 21 54
#Accuracy
mean(test_predi == default_test$default)
## [1] 0.9736
#Validation error
split5 <- mean(test_predi != default_test$default)
split5
## [1] 0.0264
The addition of student did not appear to meaningfully improve the model: the validation error of 2.64% is within the range observed for the model without it.
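With the hypothetical validation_error() helper sketched above, this comparison is a single call:

validation_error(5, default ~ income + balance + student)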
logreg_default = glm(default ~ income + balance, data = Default, family = binomial)
summary(logreg_default)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
# Return the coefficients of the logistic regression refit on the rows in 'index'
boot.fn = function(data, index)
  coef(glm(default ~ income + balance, data = data, family = binomial, subset = index))
set.seed(1)
boot.fn(Default, sample(1000, 500, replace = TRUE))  # quick check: 500 indices drawn with replacement from the first 1,000 rows
## (Intercept) income balance
## -1.029272e+01 1.558656e-05 5.236697e-03
library(boot)
boot(Default, boot.fn, 100)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 8.496943e-03 4.123046e-01
## t2* 2.080898e-05 -4.142289e-07 4.178409e-06
## t3* 5.647103e-03 -3.808035e-06 2.226444e-04
The bootstrap standard errors (0.412, 4.18e-06, and 2.23e-04) are very close to the formula-based standard errors reported by summary() above (0.435, 4.99e-06, and 2.27e-04), so the two approaches agree.
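To make the comparison concrete, the formula-based standard errors can be pulled straight from the fitted model and set next to the bootstrap column above (a sketch):

summary(logreg_default)$coefficients[, "Std. Error"]  # formula-based standard errors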
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
as_tibble(Boston)
## # A tibble: 506 x 14
## crim zn indus chas nox rm age dis rad tax ptratio black
## <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.00632 18 2.31 0 0.538 6.58 65.2 4.09 1 296 15.3 397.
## 2 0.0273 0 7.07 0 0.469 6.42 78.9 4.97 2 242 17.8 397.
## 3 0.0273 0 7.07 0 0.469 7.18 61.1 4.97 2 242 17.8 393.
## 4 0.0324 0 2.18 0 0.458 7.00 45.8 6.06 3 222 18.7 395.
## 5 0.0690 0 2.18 0 0.458 7.15 54.2 6.06 3 222 18.7 397.
## 6 0.0298 0 2.18 0 0.458 6.43 58.7 6.06 3 222 18.7 394.
## 7 0.0883 12.5 7.87 0 0.524 6.01 66.6 5.56 5 311 15.2 396.
## 8 0.145 12.5 7.87 0 0.524 6.17 96.1 5.95 5 311 15.2 397.
## 9 0.211 12.5 7.87 0 0.524 5.63 100 6.08 5 311 15.2 387.
## 10 0.170 12.5 7.87 0 0.524 6.00 85.9 6.59 5 311 15.2 387.
## # ... with 496 more rows, and 2 more variables: lstat <dbl>, medv <dbl>
set.seed(1)
attach(Boston)
estimate_μ <- mean(medv)
estimate_μ
## [1] 22.53281
The estimated population mean of medv is 22.53281.
sderror_μ <- sd(medv) / sqrt(length(medv))
sderror_μ
## [1] 0.4088611
The estimated standard error is 0.4088611, computed with the usual formula SE(μ̂) = s/√n, where s is the sample standard deviation and n = 506 is the number of observations.
boot.fn = function(data, index) return(mean(data[index]))
library(boot)
bstrap = boot(medv, boot.fn, 1000)
bstrap
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.007650791 0.4106622
The results are similar: 0.4088611 from the analytical formula and 0.4106622 from the bootstrap.
c(estimate_μ - 2 * sderror_μ, estimate_μ + 2 * sderror_μ)  # approximate 95% CI: estimate ± 2 SE
## [1] 21.71508 23.35053
t.test(medv)
##
## One Sample t-test
##
## data: medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
c(bstrap$t0 - 2 * 0.4119, bstrap$t0 + 2 * 0.4119)  # 0.4119: bootstrap standard error plugged in by hand
## [1] 21.70901 23.35661
The bootstrap interval (21.71, 23.36) is very close to the t.test() interval (21.73, 23.34).
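The boot package can also construct the interval directly from the replicates via boot.ci(); a sketch using the normal approximation:

boot.ci(bstrap, type = "norm")  # normal-approximation 95% CI from the bootstrap output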
estimate_μmed <- median(medv)
estimate_μmed
## [1] 21.2
The sample median of medv is 21.2.
boot.fn = function(data, index) return(median(data[index]))
boot(medv, boot.fn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.0386 0.3770241
The bootstrap yields a reasonable estimate of the uncertainty in the median: the bias is small and the standard error is 0.3770241. This is especially useful because, unlike the mean, the median has no simple closed-form standard error.
estimate_μ0.1 <- quantile(medv, c(0.1))
estimate_μ0.1
## 10%
## 12.75
The tenth percentile of medv is 12.75.
boot.fn = function(data, index) return(quantile(data[index], c(0.1)))
boot(medv, boot.fn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.0186 0.4925766
Using the bootstrap, we obtain the same tenth-percentile estimate of 12.75, with a somewhat higher standard error (0.4925766) than the one obtained for the median.
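Following the same two-standard-error rule used for the mean, an approximate 95% interval for the tenth percentile would be (a sketch plugging in the values above):

c(12.75 - 2 * 0.4925766, 12.75 + 2 * 0.4925766)  # roughly (11.76, 13.74)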