K-fold cross-validation divides the dataset into k folds of roughly equal size. For example, with k=3 the data are split into 3 folds. One fold is held out for testing while the remaining folds are used to train the model; the next fold then becomes the test set, and so on until every fold has been used for testing exactly once. The model's estimated accuracy (or test error) is the average over the k validation runs.
The validation set approach uses a single random split into a training set and one validation (test) set. It is less computationally expensive than k-fold cross-validation, but because the split is random, the test MSE can vary substantially from one seed to another.
LOOCV trains on n-1 observations and tests on the single observation that was left out, repeating the fit n times, where n is the number of observations. This is more computationally expensive than either the validation set or the k-fold approach. Its advantages are lower bias in the test error estimate and no run-to-run variability, since the n splits are fully determined by the data.
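As a quick sketch of these trade-offs, the cv.glm() function from the boot package computes both k-fold and LOOCV error estimates for the same logistic model fit below; the misclassification cost function is an assumption added here for illustration and is not part of the original analysis.
library(ISLR)
library(boot)
cv.fit = glm(default ~ income + balance, data = Default, family = binomial)
# Cost: fraction of observations whose predicted probability falls on the
# wrong side of the 0.5 threshold
cost = function(y, prob) mean(abs(y - prob) > 0.5)
set.seed(1)
cv.glm(Default, cv.fit, cost, K = 10)$delta[1]  # 10-fold CV error estimate
# cv.glm(Default, cv.fit, cost)$delta[1]        # LOOCV (K = n); slow with 10,000 rows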
library(ISLR)
default = Default
# Logistic regression of default on income and balance, fit to the full data
default.log = glm(default ~ income + balance, data = default, family = binomial)
# Validation set approach: hold out 1,000 random observations for testing
# and train on the remaining 9,000
set.seed(42)
train = sample(10000, 1000)
default.train = default[-train, ]
default.test = default[train, ]
default.log2 = glm(default ~ income + balance, data = default.train, family = binomial)
default.PredProb = predict(default.log2, newdata = default.test, type = "response")
default.Predsur = ifelse(default.PredProb >= .5, "Yes", "No")
# NB: caret expects confusionMatrix(data = predictions, reference = truth);
# here (and in the repeats below) the arguments are passed in (truth,
# predictions) order, so the Sensitivity/Pos Pred Value and
# Specificity/Neg Pred Value rows are effectively interchanged
caret::confusionMatrix(as.factor(default.test$default), as.factor(default.Predsur), positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 963 3
## Yes 22 12
##
## Accuracy : 0.975
## 95% CI : (0.9633, 0.9838)
## No Information Rate : 0.985
## P-Value [Acc > NIR] : 0.9941839
##
## Kappa : 0.4789
##
## Mcnemar's Test P-Value : 0.0003182
##
## Sensitivity : 0.8000
## Specificity : 0.9777
## Pos Pred Value : 0.3529
## Neg Pred Value : 0.9969
## Prevalence : 0.0150
## Detection Rate : 0.0120
## Detection Prevalence : 0.0340
## Balanced Accuracy : 0.8888
##
## 'Positive' Class : Yes
##
# Repeat the validation set approach with a different seed
set.seed(2)
train = sample(10000, 1000)
default.train = default[-train, ]
default.test = default[train, ]
default.log2 = glm(default ~ income + balance, data = default.train, family = binomial)
default.PredProb = predict.glm(default.log2, newdata = default.test, type = "response")
default.Predsur = ifelse(default.PredProb >= .5, "Yes","No" )
caret::confusionMatrix(as.factor(default.test$default), as.factor(default.Predsur), positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 959 4
## Yes 28 9
##
## Accuracy : 0.968
## 95% CI : (0.9551, 0.978)
## No Information Rate : 0.987
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3474
##
## Mcnemar's Test P-Value : 4.785e-05
##
## Sensitivity : 0.6923
## Specificity : 0.9716
## Pos Pred Value : 0.2432
## Neg Pred Value : 0.9958
## Prevalence : 0.0130
## Detection Rate : 0.0090
## Detection Prevalence : 0.0370
## Balanced Accuracy : 0.8320
##
## 'Positive' Class : Yes
##
# And once more with a third seed
set.seed(3)
train = sample(10000, 1000)
default.train = default[-train, ]
default.test = default[train, ]
default.log2 = glm(default ~ income + balance, data = default.train, family = binomial)
default.PredProb = predict.glm(default.log2, newdata = default.test, type = "response")
default.Predsur = ifelse(default.PredProb >= .5, "Yes","No" )
caret::confusionMatrix(as.factor(default.test$default), as.factor(default.Predsur), positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 956 5
## Yes 23 16
##
## Accuracy : 0.972
## 95% CI : (0.9598, 0.9813)
## No Information Rate : 0.979
## P-Value [Acc > NIR] : 0.945563
##
## Kappa : 0.5202
##
## Mcnemar's Test P-Value : 0.001315
##
## Sensitivity : 0.7619
## Specificity : 0.9765
## Pos Pred Value : 0.4103
## Neg Pred Value : 0.9948
## Prevalence : 0.0210
## Detection Rate : 0.0160
## Detection Prevalence : 0.0390
## Balanced Accuracy : 0.8692
##
## 'Positive' Class : Yes
##
The three splits give noticeably different results: accuracy is 97.5% for seed 42, 96.8% for seed 2, and 97.2% for seed 3, while the reported positive predictive value moves from 0.35 to 0.24 to 0.41. This seed-to-seed variability is the main drawback of the validation set approach.
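A compact way to see the variability is to loop over several seeds and record the validation error rate; the sketch below reuses the split procedure above, and the two extra seeds are arbitrary choices added for illustration.
val.err = sapply(c(42, 2, 3, 7, 11), function(s) {
  set.seed(s)
  idx = sample(10000, 1000)
  fit = glm(default ~ income + balance, data = default[-idx, ], family = binomial)
  prob = predict(fit, newdata = default[idx, ], type = "response")
  mean(ifelse(prob >= .5, "Yes", "No") != default$default[idx])  # test error rate
})
val.err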
# Recode student as an explicit 0/1 dummy factor (glm() would encode the
# original factor automatically, but the dummy is built by hand here)
default$dummystudent = as.character(default$student)
default$dummystudent[default$student == "No"] = 0
default$dummystudent[default$student == "Yes"] = 1
default$dummystudent = as.factor(default$dummystudent)
# Same seed-42 split as before, now including the student dummy
set.seed(42)
train = sample(10000, 1000)
default.train = default[-train, ]
default.test = default[train, ]
default.log3 = glm(default ~ income + balance + dummystudent, data = default.train, family = binomial)
default.PredProb = predict.glm(default.log3, newdata = default.test, type = "response")
default.Predsur = ifelse(default.PredProb >= .5, "Yes","No" )
caret::confusionMatrix(as.factor(default.test$default), as.factor(default.Predsur), positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 964 2
## Yes 22 12
##
## Accuracy : 0.976
## 95% CI : (0.9645, 0.9846)
## No Information Rate : 0.986
## P-Value [Acc > NIR] : 0.9952776
##
## Kappa : 0.4899
##
## Mcnemar's Test P-Value : 0.0001052
##
## Sensitivity : 0.8571
## Specificity : 0.9777
## Pos Pred Value : 0.3529
## Neg Pred Value : 0.9979
## Prevalence : 0.0140
## Detection Rate : 0.0120
## Detection Prevalence : 0.0340
## Balanced Accuracy : 0.9174
##
## 'Positive' Class : Yes
##
Adding the student dummy nudges accuracy up slightly, to 97.6%, but the confusion matrix is almost unchanged; including student does not meaningfully reduce the validation error.
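As a side check (a sketch added here, not in the original analysis): because glm() encodes factors automatically, fitting with student directly gives the same coefficient as the hand-built dummy. The names studentYes and dummystudent1 are the ones R's default treatment contrasts produce.
fit.student = glm(default ~ income + balance + student,
                  data = default.train, family = binomial)
all.equal(coef(fit.student)[["studentYes"]], coef(default.log3)[["dummystudent1"]])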
summary(default.log)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
library(boot)
# Statistic for boot(): refit the logistic model on the bootstrap sample
# given by index and return its coefficient estimates
boot.fn = function(data, index) {
  return(coef(glm(default ~ income + balance, family = binomial, data = data, subset = index)))
}
boot.fn(default,1:10000)
## (Intercept) income balance
## -1.154047e+01 2.080898e-05 5.647103e-03
boot(default, boot.fn, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = default, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -2.282144e-02 4.434988e-01
## t2* 2.080898e-05 2.553308e-08 5.075467e-06
## t3* 5.647103e-03 1.173133e-05 2.298843e-04
The bootstrap standard errors (0.4435 for the intercept, 5.08e-06 for income, 2.30e-04 for balance) are very close to the formula-based ones reported by summary() (0.4348, 4.99e-06, 2.27e-04). The coefficient estimates agree exactly, since the bootstrap's "original" column is simply the fit on the full data.
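To place the two sets of standard errors side by side (a one-line sketch):
summary(default.log)$coefficients[, "Std. Error"]  # formula-based standard errors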
### A)
library(MASS)
boston = Boston
# Point estimate of the population mean of medv
mean.medv = mean(boston$medv)
mean.medv
## [1] 22.53281
# Formula-based standard error of the mean: s / sqrt(n)
sd.medv = sd(boston$medv)
stderror.medv = sd.medv/sqrt(length(boston$medv))
stderror.medv
## [1] 0.4088611
The standard error of the mean describes how much the sample mean of medv would vary across repeated samples of this size; here it is about 0.41.
# Plug-in standard error of the mean: the maximum likelihood standard
# deviation, sd * sqrt((n - 1)/n), divided by sqrt(n)
sd.fn = function(x, index) {
  n = length(x[index])
  return((sd(x[index]) * sqrt((n - 1)/n))/sqrt(n))
}
boot.sd = sd.fn(boston$medv, 1:506)
boot.sd
## [1] 0.4084569
The two estimates are nearly identical; sd.fn gives a slightly smaller value (0.40846 versus 0.40886) because it uses the maximum likelihood standard deviation rather than the sample one.
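For comparison, the same standard error can also be estimated by resampling with boot() (a sketch; mean.fn is a helper named here for illustration, and boot was already loaded above):
mean.fn = function(data, index) mean(data[index])
set.seed(1)
boot(boston$medv, mean.fn, R = 1000)  # the std. error column should land near 0.41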
t.test(boston$medv)
##
## One Sample t-test
##
## data: boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
# 95% confidence interval by hand: mean +/- t(0.975, df = n - 1) * SE
margin = qt(.975, df = 506 - 1) * boot.sd
low = mean.medv - margin
high = mean.medv + margin
low
## [1] 21.73032
high
## [1] 23.33529
The two intervals are nearly identical; the t-test interval is marginally wider because it is based on the slightly larger standard error of 0.4089.
# Point estimate of the population median of medv
median.medv = median(boston$medv)
median.medv
## [1] 21.2
# Approximate SE of the median: for normal data the median's standard error
# is about sqrt(pi/2), roughly 1.253, times the standard error of the mean
boot.sd.med = 1.253 * sd.fn(boston$medv, 1:506)
boot.sd.med
## [1] 0.5117965
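A resampling alternative (a sketch; med.fn is a helper added for illustration) estimates the median's standard error directly, without the normal-theory factor:
med.fn = function(data, index) median(data[index])
set.seed(1)
boot(boston$medv, med.fn, R = 1000)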
# Point estimate of the tenth percentile of medv
quantile(boston$medv, probs = .1)
## 10%
## 12.75
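The same boot() pattern extends to the tenth percentile (a sketch added for completeness; q10.fn is a hypothetical helper name):
q10.fn = function(data, index) quantile(data[index], probs = .1)
set.seed(1)
boot(boston$medv, q10.fn, R = 1000)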