Chapter 05 (page 197): Exercises 3, 5, 6, 9
This approach involves randomly dividing the set of observations into k groups, or folds, of approximately equal size. The first fold is treated as a validation set, and the method is fit on the remaining k − 1 folds. The mean squared error, MSE1, is then computed on the observations in the held-out fold. This procedure is repeated k times; each time, a different group of observations is treated as the validation set. This process results in k estimates of the test error, MSE1, MSE2, ..., MSEk. The k-fold CV estimate is computed by averaging these values: CV(k) = (1/k)(MSE1 + MSE2 + ... + MSEk).
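As a minimal hand-rolled sketch of this procedure (not part of the exercises; the data frame df and response y are hypothetical placeholders, and lm() stands in for whatever method is being evaluated):
set.seed(1)
k = 10
folds = sample(rep(1:k, length.out = nrow(df)))     # random fold assignment
cv.errors = rep(NA, k)
for (i in 1:k) {
  fit = lm(y ~ ., data = df[folds != i, ])          # fit on the other k - 1 folds
  pred = predict(fit, df[folds == i, ])             # predict the held-out fold
  cv.errors[i] = mean((df$y[folds == i] - pred)^2)  # MSE_i on the held-out fold
}
mean(cv.errors)                                     # the k-fold CV estimate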
Advantages: relative to k-fold CV, the validation set approach is conceptually simple and easy to implement.
Disadvantages: the validation MSE can be highly variable, and only a subset of the observations (those in the training set) is used to fit the model.
Advantages: LOOCV has less bias than k-fold CV, since each training set contains n − 1 observations. The validation set approach also produces a different MSE each time it is applied, due to the randomness in the splitting process, whereas repeating LOOCV always yields the same results, because each split holds out exactly one observation.
Disadvantage: LOOCV is computationally intensive, since the model must be fit n times (see the cv.glm() sketch below).
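In practice LOOCV does not need an explicit loop: cv.glm() from the boot package performs it when K is left at its default of n. A quick sketch, using the Auto data from ISLR as a stand-in (illustrative only, not part of the exercises below):
library(ISLR)
library(boot)
glm.fit = glm(mpg ~ horsepower, data = Auto)  # glm() with the default family fits by least squares
cv.err = cv.glm(Auto, glm.fit)                # K defaults to n, i.e. LOOCV
cv.err$delta[1]                               # LOOCV estimate of the test MSE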
library(ISLR)
data("Default")
summary(Default)
##  default    student       balance           income     
##  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
##  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##                        Median : 823.6   Median :34553  
##                        Mean   : 835.4   Mean   :33517  
##                        3rd Qu.:1166.3   3rd Qu.:43808  
##                        Max.   :2654.3   Max.   :73554  
logmodel1 = glm(default ~ balance + income, data = Default, family = binomial)
summary(logmodel1)
##
## Call:
## glm(formula = default ~ balance + income, family = binomial,
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
The validation set approach gives a 2.72% test error rate, computed below.
#i Split the observations into equal-sized training and validation sets
#  (no set.seed() call, so each run produces a different split)
trainDefault = sample(dim(Default)[1], dim(Default)[1] * 0.50)
testDefault = Default[-trainDefault, ]
#ii Fit the logistic regression using only the training observations
LOGmodel2 = glm(default ~ balance + income, data = Default, family = binomial, subset = trainDefault)
#iii Predict default for the validation set, classifying "Yes" when the posterior probability exceeds 0.5
log.prob_def = predict(LOGmodel2, testDefault, type = "response")
log.pred_def = rep("No", dim(Default)[1] * 0.50)
log.pred_def[log.prob_def > 0.5] = "Yes"
table(log.pred_def, testDefault$default)
##
## log.pred_def No Yes
## No 4811 109
## Yes 27 53
#iv Validation set error rate (fraction of misclassified observations)
mean(log.pred_def !=testDefault$default)
## [1] 0.0272
Repeating the split three times below gives three different test errors, which reflects that the observations assigned to the training and validation sets differ from split to split.
#i
trainDefault = sample(dim(Default)[1], dim(Default)[1]*0.50)
testDefault = Default[-trainDefault, ]
#ii
LOGmodel2 = glm(default ~ balance + income, data = Default, family = binomial, subset = trainDefault)
#iii
log.prob_def = predict(LOGmodel2, testDefault, type = "response")
log.pred_def = rep("No", dim(Default)[1]*0.50)
log.pred_def[log.prob_def > 0.5] = "Yes"
table(log.pred_def, testDefault$default)
##
## log.pred_def No Yes
## No 4815 107
## Yes 19 59
#iv
mean(log.pred_def !=testDefault$default)
## [1] 0.0252
#i
trainDefault = sample(dim(Default)[1], dim(Default)[1]*0.50)
testDefault = Default[-trainDefault, ]
#ii
LOGmodel2 = glm(default ~ balance + income, data = Default, family = binomial, subset = trainDefault)
#iii
log.prob_def = predict(LOGmodel2, testDefault, type = "response")
log.pred_def = rep("No", dim(Default)[1]*0.50)
log.pred_def[log.prob_def > 0.5] = "Yes"
table(log.pred_def, testDefault$default)
##
## log.pred_def No Yes
## No 4811 118
## Yes 16 55
#iv
mean(log.pred_def !=testDefault$default)
## [1] 0.0268
#i
trainDefault = sample(dim(Default)[1], dim(Default)[1]*0.50)
testDefault = Default[-trainDefault, ]
#ii
LOGmodel2 = glm(default ~ balance + income, data = Default, family = binomial, subset = trainDefault)
#iii
log.prob_def = predict(LOGmodel2, testDefault, type = "response")
log.pred_def = rep("No", dim(Default)[1]*0.50)
log.pred_def[log.prob_def > 0.5] = "Yes"
table(log.pred_def, testDefault$default)
##
## log.pred_def No Yes
## No 4806 120
## Yes 19 55
#iv
mean(log.pred_def !=testDefault$default)
## [1] 0.0278
A 2.54% error rate, not much different from the other splits. Adding the student dummy variable does not appear to reduce the test error rate.
#i
trainDefault = sample(dim(Default)[1], dim(Default)[1]*0.50)
testDefault = Default[-trainDefault, ]
#ii
LOGmodel2 = glm(default ~ balance + income + student, data = Default, family = binomial, subset = trainDefault)
#iii
log.prob_def = predict(LOGmodel2, testDefault, type = "response")
log.pred_def = rep("No", dim(Default)[1]*0.50)
log.pred_def[log.prob_def > 0.5] = "Yes"
table(log.pred_def, testDefault$default)
##
## log.pred_def No Yes
## No 4819 109
## Yes 18 54
#iv
mean(log.pred_def !=testDefault$default)
## [1] 0.0254
The estimated standard errors from glm() for the balance and income coefficients are 2.274e-04 and 4.985e-06, respectively.
LOGmodel3 = glm(default ~ balance + income, data = Default, family = binomial)
summary(LOGmodel3)
##
## Call:
## glm(formula = default ~ balance + income, family = binomial,
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
# returns the logistic regression coefficient estimates for a bootstrap sample
boot.fn = function(data, index) {
  return(coef(glm(default ~ balance + income, data = data, family = binomial, subset = index)))
}
library(boot)
boot(Default, boot.fn, 100)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -8.436098e-03 4.415036e-01
## t2* 5.647103e-03 6.469440e-06 2.215877e-04
## t3* 2.080898e-05 -4.922651e-07 4.825706e-06
The bootstrap estimates of the standard errors are 2.215877e-04 for the balance coefficient and 4.825706e-06 for the income coefficient, closely matching the formula-based standard errors reported by glm() in (a).
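A side note: since no seed is set, the bootstrap standard errors above change from run to run, and R = 100 replicates is fairly small. A sketch of a more stable, reproducible call (the seed value here is arbitrary):
set.seed(42)                      # arbitrary seed, purely for reproducibility
boot(Default, boot.fn, R = 1000)  # more replicates give a steadier SE estimate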
library(MASS)
data("Boston")
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
attach(Boston)
# estimate the population mean of medv with the sample mean
mean.medv = mean(medv)
mean.medv
## [1] 22.53281
# standard error of the sample mean: s / sqrt(n)
stderr.mean = sd(medv)/sqrt(length(medv))
stderr.mean
## [1] 0.4088611
The formula-based standard error estimate in (b), 0.4089, is similar to the bootstrap standard error below, 0.3829; with only R = 100 replicates some discrepancy is expected.
# bootstrap the sample mean of medv
boot.fn2 = function(data, index) return(mean(data[index]))
boot2 = boot(medv, boot.fn2, 100)
boot2
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn2, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 -0.03062253 0.3828675
The t.test() confidence interval is similar to the bootstrap confidence interval computed below as the mean plus or minus two standard errors.
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
# approximate 95% CI: mean +/- 2 * bootstrap SE
# (the SE of 0.4174872 differs from the boot2 output above, apparently from a
#  separate bootstrap run; without set.seed() each run gives a slightly different SE)
CI.bos = c(22.53 - 2 * 0.4174872, 22.53 + 2 * 0.4174872)
CI.bos
## [1] 21.69503 23.36497
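Alternatively, boot.ci() from the boot package computes confidence intervals directly from a boot object; its "norm" type corresponds to the normal-approximation interval built by hand above (a sketch using the boot2 object from before):
boot.ci(boot2, type = "norm")  # normal-approximation 95% bootstrap CI for the mean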
median.medv = median(medv)
median.medv
## [1] 21.2
The bootstrap gives an estimated median of 21.2, the same as the value in (e), with a standard error of about 0.372. The bootstrap is useful here because there is no simple formula for the standard error of a median.
# bootstrap the sample median of medv
boot.fn3 = function(data, index) return(median(data[index]))
boot3 = boot(medv, boot.fn3, 1000)
boot3
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn3, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.01315 0.371891
tenth.medv = quantile(medv, c(0.1))
tenth.medv
## 10%
## 12.75
The bootstrap gives an estimated tenth percentile of 12.75, the same as the value in (g), with a standard error of about 0.507.
# bootstrap the tenth percentile of medv
boot.fn4 = function(data, index) return(quantile(data[index], c(0.1)))
boot4 = boot(medv, boot.fn4, 1000)
boot4
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn4, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.0143 0.5065097