#3. We now review k-fold cross-validation. (a) Explain how k-fold cross-validation is implemented. K-fold cross-validation splits the full set of observations into k groups ("folds") of roughly equal size. One fold is held out as the test/validation set, the model is fit on the remaining k - 1 folds combined (the training set), and the MSE is computed on the held-out fold. A common classroom example: n = 100 observations and k = 5, so each fold has 20 observations. You set aside the 20 observations in fold 1 as your validation set, build the model from the other 80 observations in folds 2-5 combined, and test that model against fold 1; then you repeat the same process holding out each of folds 2, 3, 4, and 5 in turn, for a total of 5 fits. The 5 fold-level MSEs are then averaged for the overall k-fold CV estimate of the test MSE. A minimal sketch of the procedure follows.
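A minimal sketch of manual k-fold CV in R, assuming a hypothetical data frame dat with a numeric response y and predictor x (these names are illustrative, not from the data used below):

# manual 5-fold CV sketch (hypothetical data frame `dat` with columns y, x)
set.seed(1)
k <- 5
folds <- sample(rep(1:k, length.out = nrow(dat)))  # random fold labels
cv.errors <- rep(NA, k)
for (i in 1:k) {
  fit <- lm(y ~ x, data = dat[folds != i, ])         # train on the other k-1 folds
  pred <- predict(fit, newdata = dat[folds == i, ])  # predict the held-out fold
  cv.errors[i] <- mean((dat$y[folds == i] - pred)^2) # fold-level test MSE
}
mean(cv.errors)  # k-fold CV estimate: average of the fold MSEs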
The validation set approach? K-fold CV has the advantage that, instead of simply dividing the data in half (half training, half test/validation), it divides the observations into k computationally manageable groups, so every observation is used for both training and validation. The resulting test-error estimate is less variable, and because each model is fit on (k - 1)/k of the data rather than half of it, the estimate is less biased upward. With the validation set approach, the variability of the error estimate is higher than with k-fold CV, but it is cheap and can give you a good starting point before performing the k-fold CV. If your data set is extremely large, dividing it in half may be all you can do in the interim while you wait on your k-fold CV or LOOCV to compute.
LOOCV? The advantage is an approximately unbiased estimate of the test MSE, since each of the n models is fit on all but one observation. The disadvantages are (1) high variance in the estimate, because the n fitted models are nearly identical and their errors are highly correlated, and (2) the computational cost of fitting the model n times, which can burn out your server on a large data set. As much as people are complaining about crypto-mining data farms straining resources, I can only imagine what would happen if all organizations started using LOOCV on their data sets. A sketch appears below.
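For completeness, a minimal LOOCV sketch using cv.glm() from the boot package, again assuming the hypothetical data frame dat from the sketch above; omitting the K argument gives K = n, i.e. leave-one-out:

# LOOCV via boot::cv.glm (no K argument means K = n, leave-one-out)
library(boot)
glm.fit <- glm(y ~ x, data = dat)      # a linear fit via glm() so cv.glm() applies
cv.glm(dat, glm.fit)$delta[1]          # LOOCV estimate of the test MSE
cv.glm(dat, glm.fit, K = 5)$delta[1]   # 5-fold CV estimate, far cheaper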
#load ISLR (which contains the Default data) and MASS
library(ISLR)
library(MASS)
attach(Default)
head(Default)
## default student balance income
## 1 No No 729.5265 44361.625
## 2 No Yes 817.1804 12106.135
## 3 No No 1073.5492 31767.139
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
#data structure/characteristics
str(Default)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
# set seed for reproducibility; ensure default is stored as a factor
set.seed(997)
Default$default <- as.factor(Default$default)
#fit a logistic regression using income and balance as predictors of default
log.reg = glm(default ~ income + balance, family = binomial, data = Default)
log.reg
##
## Call: glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## (Intercept) income balance
## -1.154e+01 2.081e-05 5.647e-03
##
## Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
## Null Deviance: 2921
## Residual Deviance: 1579 AIC: 1585
# validation split: sample(1000, 500) draws 500 indices from rows 1-1000 only;
# sample(10000, 5000) would split the full 10,000-row data set in half
set.seed(997)
train = sample(1000,500)
test <- Default[-train,]
# note: log.reg was fit on the full data set above (no subset argument),
# so this is not a strict hold-out evaluation
pred.logreg <- predict(log.reg, test, type = "response")
class.logreg <- ifelse(pred.logreg > 0.5, "Yes", "No")
table(test$default, class.logreg, dnn = c("Actual", "Predicted"))
## Predicted
## Actual No Yes
## No 9153 38
## Yes 207 102
#test error rate for this split (the classification analogue of the test MSE)
round(mean(class.logreg != test$default), 4)
## [1] 0.0258
# second split: training set of 333 drawn from rows 1-1000
set.seed(997)
train2 = sample(1000,333)
test2 <- Default[-train2,]
pred.logreg <- predict(log.reg, test2, type = "response")
class.logreg <- ifelse(pred.logreg > 0.5, "Yes", "No")
table(test2$default, class.logreg, dnn = c("Actual", "Predicted"))
## Predicted
## Actual No Yes
## No 9309 38
## Yes 216 104
#test error rate with the one-third training split
round(mean(class.logreg != test2$default), 4)
## [1] 0.0263
# third split: training set of 800 drawn from rows 1-1000
set.seed(997)
train3 = sample(1000,800)
test3 <- Default[-train3,]
pred.logreg <- predict(log.reg, test3, type = "response")
class.logreg <- ifelse(pred.logreg > 0.5, "Yes", "No")
table(test3$default, class.logreg, dnn = c("Actual", "Predicted"))
## Predicted
## Actual No Yes
## No 8858 36
## Yes 205 101
#test error rate with the 800-observation training split
round(mean(class.logreg != test3$default), 4)
## [1] 0.0262
Default$student <- as.factor(Default$student)
set.seed(997)
log.reg2 = glm(default ~ income + balance + student, family = binomial, data = Default, subset = train)
log.reg2
##
## Call: glm(formula = default ~ income + balance + student, family = binomial,
## data = Default, subset = train)
##
## Coefficients:
## (Intercept) income balance studentYes
## -1.099e+01 6.314e-06 6.187e-03 -4.810e-01
##
## Degrees of Freedom: 499 Total (i.e. Null); 496 Residual
## Null Deviance: 192.6
## Residual Deviance: 97.69 AIC: 105.7
set.seed(997)
train4 = sample(1000,500)
test4 <- Default[-train4,]
# note: this predicts from log.reg (income + balance only); to assess the
# student term, predict from log.reg2 instead
pred.logreg <- predict(log.reg, test4, type = "response")
class.logreg <- ifelse(pred.logreg > 0.5, "Yes", "No")
table(test4$default, class.logreg, dnn = c("Actual", "Predicted"))
## Predicted
## Actual No Yes
## No 9153 38
## Yes 207 102
#test error rate; identical to the first split because log.reg (without student) was used above
round(mean(class.logreg!=test4$default),4)
## [1] 0.0258
set.seed(997)
# this split does halve the full data set: 5,000 of the 10,000 rows
train5 <- sample(dim(Default)[1], dim(Default)[1]/2)
fit.glm <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train5)
summary(fit.glm)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default, subset = train5)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.185e+01 6.323e-01 -18.744 < 2e-16 ***
## income 2.382e-05 7.063e-06 3.373 0.000743 ***
## balance 5.807e-03 3.317e-04 17.507 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1483.83 on 4999 degrees of freedom
## Residual deviance: 798.72 on 4997 degrees of freedom
## AIC: 804.72
##
## Number of Fisher Scoring iterations: 8
# statistic for boot(): refit the logistic regression on the bootstrap sample
# given by `index` and return the income and balance coefficients
# (the intercept, coefficient 1, is deliberately dropped)
boot.fn <- function(data, index) {
  fit.glm <- glm(default ~ income + balance, data = data[index, ], family = "binomial")
  return(coef(fit.glm)[2:3])
}
fit.glm
##
## Call: glm(formula = default ~ income + balance, family = "binomial",
## data = Default, subset = train5)
##
## Coefficients:
## (Intercept) income balance
## -1.185e+01 2.382e-05 5.807e-03
##
## Degrees of Freedom: 4999 Total (i.e. Null); 4997 Residual
## Null Deviance: 1484
## Residual Deviance: 798.7 AIC: 804.7
library(boot)
set.seed(998)
# 1,000 bootstrap replications of the income and balance coefficients
results <- boot(Default, boot.fn, R = 1000)
results
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 2.080898e-05 6.044926e-08 4.884814e-06
## t2* 5.647103e-03 1.079285e-05 2.273975e-04
# glm_se has three entries (intercept, income, balance); boot.fn returned
# only income and balance, so bootstrap_se has two
glm_se <- summary(fit.glm)$coefficients[, "Std. Error"]
bootstrap_se <- apply(results$t, 2, sd)
cat("GLM Standard Errors:\n", glm_se, "\n")
## GLM Standard Errors:
## 0.6322513 7.062876e-06 0.0003317074
cat("Bootstrap Standard Errors:\n", bootstrap_se, "\n")
## Bootstrap Standard Errors:
## 4.884814e-06 0.0002273975
The bootstrap standard errors for income and balance are slightly smaller than the formula-based glm() standard errors; note, however, that fit.glm was fit on only half the data (train5), while the bootstrap resampled all 10,000 observations, which accounts for much of the gap.
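Since log.reg above was fit on the full data set, its standard errors are the like-for-like benchmark for the bootstrap; a quick check:

# compare like with like: log.reg was fit on the full data set,
# matching the scope of the bootstrap resampling
summary(log.reg)$coefficients[2:3, "Std. Error"]  # income and balance only
bootstrap_se                                      # bootstrap SEs from above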
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following object is masked from 'package:MASS':
##
## Boston
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
data(Boston)
# point estimate of the population mean of medv
mean_medv <- mean(Boston$medv)
mean_medv
## [1] 22.53281
# formula-based standard error of the sample mean: s / sqrt(n)
n <- length(Boston$medv)
s <- sd(Boston$medv)
se_sample <- s/sqrt(n)
se_sample
## [1] 0.4088611
set.seed(997)
B <- 1000
# bootstrap the sampling distribution of the mean: B resamples of size n,
# drawn with replacement from medv
boot_means <- replicate(B, {
sample_data <- sample(Boston$medv, n, replace = TRUE)
mean(sample_data)
})
boot_se1 <- sd(boot_means)  # bootstrap SE of the mean
boot_se1
## [1] 0.413037
# approximate 95% CI: estimate +/- 2 bootstrap standard errors
CI_lower <- mean_medv - 2 * boot_se1
CI_upper <- mean_medv + 2 * boot_se1
c(CI_lower, CI_upper)
## [1] 21.70673 23.35888
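As a cross-check before the t-test below, the same interval can be produced with boot() and boot.ci(); a minimal sketch, where mean.fn is an illustrative helper and boot is already loaded above:

mean.fn <- function(data, index) mean(data[index])  # statistic for boot()
boot.mean <- boot(Boston$medv, mean.fn, R = 1000)
boot.ci(boot.mean, type = "perc")  # percentile 95% CI for the mean of medv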
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
# point estimate of the population median of medv
med_medv <- median(Boston$medv)
med_medv
## [1] 21.2
# bootstrap SE of the median (no simple formula exists, so the bootstrap is useful)
boot_medians <- replicate(B, {
sample_data <- sample(Boston$medv, n, replace = TRUE)
median(sample_data)
})
boot_se_median <- sd(boot_medians)
boot_se_median
## [1] 0.3849359
# point estimate of the tenth percentile of medv
mu_0_1 <- quantile(Boston$medv, 0.1)
mu_0_1
## 10%
## 12.75
# bootstrap SE of the tenth percentile of medv
boot_tenth <- replicate(B, {
sample_data <- sample(Boston$medv, n, replace = TRUE)
quantile(sample_data, 0.1)
})
boot_se_tenth <- sd(boot_tenth)
boot_se_tenth
## [1] 0.4820805