Explain how k-fold cross-validation is implemented.
K-fold cross-validation is a resampling method for estimating a model's test error. The observations are randomly divided into k roughly equal-sized folds. For each fold in turn, the model is trained on the other k - 1 folds and its prediction error is measured on the held-out fold, which acts as the validation set. The k error estimates are then averaged to produce the cross-validation estimate of the test error. Because every fold serves exactly once as the validation set, the model is tested and trained on all of the observations.
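As a minimal sketch of the idea in R (the data frame dat and the binary "No"/"Yes" response y are hypothetical placeholders; the misclassification-rate metric and the .5 threshold are assumptions chosen to match the examples below):
k <- 10
folds <- sample(rep(1:k, length.out = nrow(dat)))  # randomly assign each row to one of k folds
cv.errors <- sapply(1:k, function(i) {
  fit <- glm(y ~ ., family = binomial, data = dat[folds != i, ])  # train on the other k - 1 folds
  probs <- predict(fit, dat[folds == i, ], type = "response")     # predict the held-out fold
  mean(ifelse(probs > .5, "Yes", "No") != dat$y[folds == i])      # fold misclassification rate
})
mean(cv.errors)  # the k-fold CV estimate is the average of the k fold error rates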
What are the advantages and disadvantages of k-fold cross-validation relative to the validation set approach and to LOOCV?
Relative to the validation set approach, k-fold CV uses every observation for both training and validation, so its error estimate is less variable (it does not hinge on one particular split) and less biased (each model is trained on a larger share of the data); the disadvantage is that the model must be fit k times instead of once. Relative to LOOCV, k-fold CV is computationally cheaper (k fits rather than n) and its estimate tends to have lower variance, at the cost of somewhat more bias, because each training set is smaller than the n - 1 observations LOOCV uses.
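For reference, k-fold CV does not have to be coded by hand: boot::cv.glm() computes the estimate directly. A sketch for the logistic model fit below (the threshold-based cost function is an assumption, since cv.glm's default cost is mean squared error):
library(ISLR)
library(boot)
fit <- glm(default ~ income + balance, family = binomial, data = Default)
cost <- function(y, prob) mean(abs(y - prob) > 0.5)  # misclassification rate at a .5 cutoff
cv.glm(Default, fit, cost = cost, K = 10)$delta[1]   # 10-fold CV error estimate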
library(ISLR)  # Default data set
library(boot)  # boot() for the bootstrap below
set.seed(123)
df <- Default
log.fit <- glm(default ~ income + balance, family = binomial, data = df)
summary(log.fit)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
log.probs <- predict(log.fit, type = "response")
log.pred <- ifelse(log.probs > .5, "Yes", "No")
table(log.pred, df$default)
##
## log.pred No Yes
## No 9629 225
## Yes 38 108
# Total (misclassification) error rate from a 2x2 confusion matrix:
# one minus the proportion of observations on the diagonal
terror <- function(table){
  1 - ((table[1,1] + table[2,2]) / sum(table))
}
terror(table(log.pred, df$default))
## [1] 0.0263
Using a .50 split
split <- .50
index <- sample(1:nrow(df), round(nrow(df)*split))
train <- df[index,]
test <- df[-index,]
log.fit <- glm(default ~ income + balance, family = binomial, data = train)
summary(log.fit)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2810 -0.1348 -0.0529 -0.0185 3.7767
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.194e+01 6.451e-01 -18.504 < 2e-16 ***
## income 2.210e-05 7.381e-06 2.995 0.00275 **
## balance 5.874e-03 3.362e-04 17.474 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1429.88 on 4999 degrees of freedom
## Residual deviance: 752.69 on 4997 degrees of freedom
## AIC: 758.69
##
## Number of Fisher Scoring iterations: 8
log.probs <- predict(log.fit, test, type = "response")
log.pred <- ifelse(log.probs > .5, "Yes", "No")
table(log.pred, test$default)
##
## log.pred No Yes
## No 4808 117
## Yes 21 54
terror(table(log.pred, test$default))
## [1] 0.0276
Using a .75 split
split <- .75
index <- sample(1:nrow(df), round(nrow(df)*split))
train <- df[index,]
test <- df[-index,]
log.fit <- glm(default ~ income + balance, family = binomial, data = train)
summary(log.fit)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4523 -0.1480 -0.0600 -0.0224 3.4593
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.139e+01 4.890e-01 -23.298 < 2e-16 ***
## income 2.072e-05 5.664e-06 3.658 0.000254 ***
## balance 5.566e-03 2.564e-04 21.703 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2219 on 7499 degrees of freedom
## Residual deviance: 1218 on 7497 degrees of freedom
## AIC: 1224
##
## Number of Fisher Scoring iterations: 8
log.probs <- predict(log.fit, test, type = "response")
log.pred <- ifelse(log.probs > .5, "Yes", "No")
table(log.pred, test$default)
##
## log.pred No Yes
## No 2415 52
## Yes 6 27
terror(table(log.pred, test$default))
## [1] 0.0232
Using a .25 split
split <- .25
index <- sample(1:nrow(df), round(nrow(df)*split))
train <- df[index,]
test <- df[-index,]
log.fit <- glm(default ~ income + balance, family = binomial, data = train)
summary(log.fit)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3753 -0.1354 -0.0529 -0.0202 3.7947
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.112e+01 8.734e-01 -12.730 <2e-16 ***
## income 4.087e-06 1.045e-05 0.391 0.696
## balance 5.718e-03 4.655e-04 12.285 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 708.14 on 2499 degrees of freedom
## Residual deviance: 368.05 on 2497 degrees of freedom
## AIC: 374.05
##
## Number of Fisher Scoring iterations: 8
log.probs <- predict(log.fit, test, type = "response")
log.pred <- ifelse(log.probs > .5, "Yes", "No")
table(log.pred, test$default)
##
## log.pred No Yes
## No 7215 173
## Yes 32 80
terror(table(log.pred, test$default))
## [1] 0.02733333
Now add student to the model, using a fresh .75 split, to see whether it improves the test error:
split <- .75
index <- sample(1:nrow(df), round(nrow(df)*split))
train <- df[index,]
test <- df[-index,]
log.fit <- glm(default ~ income + balance + student, family = binomial, data = train)
summary(log.fit)
##
## Call:
## glm(formula = default ~ income + balance + student, family = binomial,
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4543 -0.1513 -0.0607 -0.0226 3.6464
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.080e+01 5.576e-01 -19.370 <2e-16 ***
## income 7.777e-06 9.411e-06 0.826 0.4086
## balance 5.592e-03 2.624e-04 21.308 <2e-16 ***
## studentYes -5.489e-01 2.732e-01 -2.009 0.0445 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2178.7 on 7499 degrees of freedom
## Residual deviance: 1214.3 on 7496 degrees of freedom
## AIC: 1222.3
##
## Number of Fisher Scoring iterations: 8
log.probs <- predict(log.fit, test, type = "response")
log.pred <- ifelse(log.probs > .5, "Yes", "No")
table(log.pred, test$default)
##
## log.pred No Yes
## No 2404 53
## Yes 11 32
terror(table(log.pred, test$default))
## [1] 0.0256
Adding student to the model does not appear to significantly decrease the total error rate.
Next, use the bootstrap to estimate the standard errors of the coefficients and compare them with the estimates from glm:
log.fit <- glm(default ~ income + balance, family = binomial, data = df)
summary(log.fit)$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154047e+01 4.347564e-01 -26.544680 2.958355e-155
## income 2.080898e-05 4.985167e-06 4.174178 2.990638e-05
## balance 5.647103e-03 2.273731e-04 24.836280 3.638120e-136
# Returns the logistic regression coefficients fit on the observations in index
boot.fn <- function(data, index){
  coef(glm(default ~ income + balance, family = binomial, data = data, subset = index))
}
split <- .75
index <- sample(1:nrow(df), round(nrow(df)*split))
boot.fn(df, index)
## (Intercept) income balance
## -1.157436e+01 2.142962e-05 5.681451e-03
boot(df, boot.fn, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = df, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -2.793522e-02 4.209967e-01
## t2* 2.080898e-05 1.693713e-07 4.729635e-06
## t3* 5.647103e-03 1.289558e-05 2.219446e-04
We can see that the bootstrap standard errors are very close to those reported by summary() for the glm fit.
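To make the comparison explicit, one can line the two sets of standard errors up side by side; a sketch (bootstrap values vary slightly from run to run):
boot.out <- boot(df, boot.fn, R = 1000)
cbind(glm.se  = summary(log.fit)$coef[, "Std. Error"],  # model-based standard errors
      boot.se = apply(boot.out$t, 2, sd))               # bootstrap standard errors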
library(MASS)
## Warning: package 'MASS' was built under R version 4.0.4
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
Now use the Boston data set to estimate the population mean of medv and its standard error:
df2 <- Boston
summary(glm(medv ~ 1 , data = df2))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.53281 0.4088611 55.11115 9.370624e-216
# The intercept of an intercept-only model is the sample mean of medv
boot.fn <- function(data, index){
  coef(glm(medv ~ 1, data = data, subset = index))
}
boot(df2, boot.fn, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = df2, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.01299229 0.4170413
We can see that the bootstrap standard error of the mean (0.417) is again very close to the model-based standard error from glm (0.409).
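As a check, the same model-based standard error can be computed directly from the usual formula s / sqrt(n):
sd(df2$medv) / sqrt(nrow(df2))  # formula-based standard error of the sample mean, ~0.409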
# Bootstrap the median of medv, then form a rough 95% CI as estimate +/- 2 standard errors
median.fn <- function(data, index){
  median(data[index])
}
medboot <- boot(df2$medv, median.fn, R = 1000)
medboot.stderr <- sqrt(var(medboot$t))
list(medboot$t0 - 2 * medboot.stderr, medboot$t0 + 2 * medboot.stderr)
## [[1]]
## [,1]
## [1,] 20.42276
##
## [[2]]
## [,1]
## [1,] 21.97724
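The boot package can also compute confidence intervals directly from the replicates via boot.ci(); a sketch using the percentile interval (other interval types are available):
boot.ci(medboot, conf = 0.95, type = "perc")  # percentile bootstrap CI for the median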
Finally, estimate the tenth percentile of medv and use the bootstrap to obtain its standard error:
quantile(df2$medv, .1)
## 10%
## 12.75
# Tenth percentile of medv on the bootstrap sample
# (uses the data and index arguments rather than the global df2)
tenth.fn <- function(data, index){
  quantile(data$medv[index], .1)
}
boot(df2, tenth.fn, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = df2, statistic = tenth.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 -0.01735 0.5071575
The bootstrap yet again seems great at recovering the parameter estimate, and it showcases the ability to obtain a standard error for a statistic whose sampling distribution we do not know in closed form.