K-fold cross-validation divides the dataset into k folds. For example, if k = 3 the dataset is split into 3 parts. One fold is held out for testing while the remaining folds are used to train the model; the model is then refit with the next fold held out for testing and the rest used for training. This is repeated until every fold has been used once as the test set, and the accuracy of the model is the average over all k test evaluations.
The validation set approach uses a single held-out sample to validate the model. This is less computationally expensive than k-fold cross-validation, but because the test sample is selected at random, the estimated MSE can vary considerably between different seeds.
LOOCV (leave-one-out cross-validation) trains on n - 1 observations and leaves one observation out for testing. This process is repeated n times, where n is the number of observations, so it is more computationally expensive than the validation set and k-fold approaches. Its advantages are lower bias and no run-to-run variability in the estimated MSE, since there is no random split.
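As a minimal sketch (assuming the ISLR Default data used below and a misclassification cost function, neither of which is part of the original code), both k-fold cross-validation and LOOCV can be run with boot::cv.glm():
library(ISLR)
library(boot)
fit = glm(default ~ income + balance, data = Default, family = binomial)
# misclassification rate as the cost: compare the observed 0/1 response to the predicted probability
cost = function(obs, prob) mean(abs(obs - prob) > 0.5)
set.seed(1)
cv.glm(Default, fit, cost, K = 3)$delta[1]  # 3-fold CV estimate of the test error
# cv.glm(Default, fit, cost)$delta[1]       # LOOCV (K = n); much slower on 10,000 rows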
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.3
default = Default
default.log = glm(default ~ income + balance, data = default, family = binomial)
set.seed(42)
train = sample(10000, 1000)
default.train = default[-train, ]
default.test = default[train, ]
default.log2 = glm(default ~ income + balance, data = default.train, family = binomial)
default.PredProb = predict.glm(default.log2, newdata = default.test, type = "response")
default.Predsur = ifelse(default.PredProb >= .5, "Yes","No" )
caret::confusionMatrix(as.factor(default.test$default), as.factor(default.Predsur), positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 963 3
## Yes 22 12
##
## Accuracy : 0.975
## 95% CI : (0.9633, 0.9838)
## No Information Rate : 0.985
## P-Value [Acc > NIR] : 0.9941839
##
## Kappa : 0.4789
##
## Mcnemar's Test P-Value : 0.0003182
##
## Sensitivity : 0.8000
## Specificity : 0.9777
## Pos Pred Value : 0.3529
## Neg Pred Value : 0.9969
## Prevalence : 0.0150
## Detection Rate : 0.0120
## Detection Prevalence : 0.0340
## Balanced Accuracy : 0.8888
##
## 'Positive' Class : Yes
##
set.seed(2)
train = sample(10000, 1000)
default.train = default[-train, ]
default.test = default[train, ]
default.log2 = glm(default ~ income + balance, data = default.train, family = binomial)
default.PredProb = predict.glm(default.log2, newdata = default.test, type = "response")
default.Predsur = ifelse(default.PredProb >= .5, "Yes","No" )
caret::confusionMatrix(as.factor(default.test$default), as.factor(default.Predsur), positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 959 4
## Yes 28 9
##
## Accuracy : 0.968
## 95% CI : (0.9551, 0.978)
## No Information Rate : 0.987
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3474
##
## Mcnemar's Test P-Value : 4.785e-05
##
## Sensitivity : 0.6923
## Specificity : 0.9716
## Pos Pred Value : 0.2432
## Neg Pred Value : 0.9958
## Prevalence : 0.0130
## Detection Rate : 0.0090
## Detection Prevalence : 0.0370
## Balanced Accuracy : 0.8320
##
## 'Positive' Class : Yes
##
set.seed(3)
train = sample(10000, 1000)
default.train = default[-train, ]
default.test = default[train, ]
default.log2 = glm(default ~ income + balance, data = default.train, family = binomial)
default.PredProb = predict.glm(default.log2, newdata = default.test, type = "response")
default.Predsur = ifelse(default.PredProb >= .5, "Yes","No" )
caret::confusionMatrix(as.factor(default.test$default), as.factor(default.Predsur), positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 956 5
## Yes 23 16
##
## Accuracy : 0.972
## 95% CI : (0.9598, 0.9813)
## No Information Rate : 0.979
## P-Value [Acc > NIR] : 0.945563
##
## Kappa : 0.5202
##
## Mcnemar's Test P-Value : 0.001315
##
## Sensitivity : 0.7619
## Specificity : 0.9765
## Pos Pred Value : 0.4103
## Neg Pred Value : 0.9948
## Prevalence : 0.0210
## Detection Rate : 0.0160
## Detection Prevalence : 0.0390
## Balanced Accuracy : 0.8692
##
## 'Positive' Class : Yes
##
All of the models performed somewhat differently. Although the overall accuracy is about the same across models, the positive predictive values differ considerably: the second model returned a positive predictive value of .24, while the third model's was .41.
default$dummystudent = as.character(default$student)
default$dummystudent[default$student == "No"] = 0
default$dummystudent[default$student == "Yes"] = 1
default$dummystudent = as.factor(default$dummystudent)
set.seed(42)
train = sample(10000, 1000)
default.train = default[-train, ]
default.test = default[train, ]
default.log3 = glm(default ~ income + balance + dummystudent, data = default.train, family = binomial)
default.PredProb = predict.glm(default.log3, newdata = default.test, type = "response")
default.Predsur = ifelse(default.PredProb >= .5, "Yes","No" )
caret::confusionMatrix(as.factor(default.test$default), as.factor(default.Predsur), positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 964 2
## Yes 22 12
##
## Accuracy : 0.976
## 95% CI : (0.9645, 0.9846)
## No Information Rate : 0.986
## P-Value [Acc > NIR] : 0.9952776
##
## Kappa : 0.4899
##
## Mcnemar's Test P-Value : 0.0001052
##
## Sensitivity : 0.8571
## Specificity : 0.9777
## Pos Pred Value : 0.3529
## Neg Pred Value : 0.9979
## Prevalence : 0.0140
## Detection Rate : 0.0120
## Detection Prevalence : 0.0340
## Balanced Accuracy : 0.9174
##
## 'Positive' Class : Yes
##
Adding the dummy variable only slightly improves the model. Using the same seed as in the first model, the results are almost identical.
summary(default.log)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
library(boot)
# Returns the logistic regression coefficient estimates fit on the rows given by 'index'
boot.fn = function(data, index){
  return(coef(glm(default ~ income + balance, family = binomial, data = data, subset = index)))
}
boot.fn(default,1:10000)
## (Intercept) income balance
## -1.154047e+01 2.080898e-05 5.647103e-03
boot(default,boot.fn,1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = default, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -2.282144e-02 4.434988e-01
## t2* 2.080898e-05 2.553308e-08 5.075467e-06
## t3* 5.647103e-03 1.173133e-05 2.298843e-04
The standard errors from the summary() output and the bootstrap are almost identical. The coefficient estimates are the same, although the bootstrap output prints more decimal places than summary() does. The standard error of the intercept is very close in both, and the standard errors for income and balance differ only slightly.
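As a quick check (a sketch; saving the boot() result in an object is an assumption, since the original code prints it directly), the two sets of standard errors could be placed side by side:
# formula-based standard errors from the fitted model
summary(default.log)$coefficients[, "Std. Error"]
# bootstrap standard errors, assuming the boot() call above was stored as 'boot.out'
# boot.out = boot(default, boot.fn, 1000)
# apply(boot.out$t, 2, sd)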
library(MASS)
boston = Boston
mean.medv = mean(boston$medv)
mean.medv
## [1] 22.53281
sd.medv = sd(boston$medv)
stderror.medv = (sd.medv/sqrt(length(boston$medv)))
stderror.medv
## [1] 0.4088611
This is the standard error of the mean: it measures how much the sample mean of medv would be expected to vary from one sample to another.
# Standard error of the mean, using the population (divide-by-n) standard deviation
sd.fn = function(x, index){
  return((sd(x[index]) * sqrt((length(x[index]) - 1) / length(x[index]))) / sqrt(length(x[index])))
}
boot.sd = sd.fn(boston$medv, 1:506)
boot.sd
## [1] 0.4084569
The two standard errors are very similar. The one computed directly from sd()/sqrt(n) is slightly higher than the value returned by sd.fn(), which rescales sd() to the divide-by-n standard deviation.
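For comparison, a minimal sketch (an assumption, not part of the original code) of estimating the same standard error with an actual bootstrap via boot::boot():
library(boot)
mean.fn = function(x, index) mean(x[index])   # statistic: mean of the resampled observations
set.seed(1)
boot(boston$medv, mean.fn, R = 1000)          # bootstrap std. error should be close to 0.41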
t.test(boston$medv)
##
## One Sample t-test
##
## data: boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
margin = qt(.975, df=506-1)*boot.sd
low = mean.medv - margin
high = mean.medv + margin
low
## [1] 21.73032
high
## [1] 23.33529
Once again, the intervals are very similar. The interval from t.test() is slightly wider because it uses the slightly larger standard error.
median.medv = median(boston$medv)
median.medv
## [1] 21.2
boot.sd.med = 1.253*(sd.fn(boston$medv, 1:506))
boot.sd.med
## [1] 0.5117965
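The 1.253 factor is the normal-theory approximation (roughly sqrt(pi/2)) relating the standard error of the median to the standard error of the mean. As a cross-check, a minimal sketch (an assumption, not part of the original code) of estimating the median's standard error directly with the bootstrap:
median.fn = function(x, index) median(x[index])
set.seed(1)
boot(boston$medv, median.fn, R = 1000)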
quantile(boston$medv, probs = .1)
## 10%
## 12.75
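The same idea extends to the tenth percentile shown above; a sketch (an assumption, not part of the original code) of its bootstrap standard error:
p10.fn = function(x, index) quantile(x[index], probs = .1)
set.seed(1)
boot(boston$medv, p10.fn, R = 1000)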