#Answer The implementation of the k-fold cross validation is as follows: 1. Divide the dataset randomly into k folds of equal size (k=5 or 10 preferred). 2. The first fold will be the validation dataset, and the statistical learning method will be trained using the remaining folds. 3. The MSE is then calculated using the validation dataset.The process is repeated K times each with a different validation dataset and a corresponding MSE. 4.An estimate of the test error which is the average of the MSEs is then calculated.
set.seed(1)
default.glm <- glm(default~income+balance, data=Default, family = "binomial")
summary(default.glm)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
set.seed(2)
split <- sample.split(Default, SplitRatio = 0.8)
Defaulttrain <- subset(Default, split==TRUE)
Defaulttest<- subset(Default, split==FALSE)
train.glm <- glm(default~income+balance, data=Defaulttrain, family="binomial")
summary(train.glm)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Defaulttrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2167 -0.1416 -0.0550 -0.0198 3.7482
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.171e+01 5.201e-01 -22.505 < 2e-16 ***
## income 2.172e-05 5.949e-06 3.651 0.000261 ***
## balance 5.714e-03 2.733e-04 20.907 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2062.7 on 7499 degrees of freedom
## Residual deviance: 1121.0 on 7497 degrees of freedom
## AIC: 1127
##
## Number of Fisher Scoring iterations: 8
Default.probs <- predict(train.glm, Defaulttest, type="response")
Default.pred <- rep("No", 2500)
Default.pred[Default.probs>0.5] <- "Yes"
table(Default.pred, Defaulttest$default)
##
## Default.pred No Yes
## No 2384 70
## Yes 14 32
mean(Default.pred!=Defaulttest$default)
## [1] 0.0336
The validation test error rate is 0.0336.
#1. 50-50 split
set.seed(3)
split1 <- sample.split(Default, SplitRatio = 0.5)
Defaulttrain1 <- subset(Default, split1==TRUE)
Defaulttest1<- subset(Default, split1==FALSE)
## Logistic model
train1.glm <- glm(default~income+balance, data=Defaulttrain1, family="binomial")
summary(train1.glm)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Defaulttrain1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1769 -0.1387 -0.0524 -0.0184 3.7739
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.190e+01 6.448e-01 -18.456 < 2e-16 ***
## income 2.153e-05 7.357e-06 2.926 0.00343 **
## balance 5.863e-03 3.405e-04 17.220 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1402.61 on 4999 degrees of freedom
## Residual deviance: 741.39 on 4997 degrees of freedom
## AIC: 747.39
##
## Number of Fisher Scoring iterations: 8
Default.probs1 <- predict(train1.glm, Defaulttest1, type="response")
Default.pred1 <- rep("No", 5000)
Default.pred1[Default.probs1>0.5] <- "Yes"
table(Default.pred1, Defaulttest1$default)
##
## Default.pred1 No Yes
## No 4798 122
## Yes 27 53
#
mean(Default.pred1!=Defaulttest1$default)
## [1] 0.0298
The validation test error is 0.0298.
#2. 70-30 split
set.seed(4)
split2 <- sample.split(Default, SplitRatio = 0.7)
Defaulttrain2 <- subset(Default, split2==TRUE)
Defaulttest2<- subset(Default, split2==FALSE)
## Logistic model
train2.glm <- glm(default~income+balance, data=Defaulttrain2, family="binomial")
summary(train2.glm)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Defaulttrain2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4353 -0.1470 -0.0604 -0.0229 3.6905
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.131e+01 5.898e-01 -19.185 < 2e-16 ***
## income 2.045e-05 6.806e-06 3.005 0.00265 **
## balance 5.520e-03 3.065e-04 18.011 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1543.58 on 4999 degrees of freedom
## Residual deviance: 834.82 on 4997 degrees of freedom
## AIC: 840.82
##
## Number of Fisher Scoring iterations: 8
Default.probs2 <- predict(train2.glm, Defaulttest2, type="response")
Default.pred2 <- rep("No", 5000)
Default.pred2[Default.probs2>0.5] <- "Yes"
table(Default.pred2, Defaulttest2$default)
##
## Default.pred2 No Yes
## No 4832 106
## Yes 14 48
#
mean(Default.pred2!=Defaulttest2$default)
## [1] 0.024
#3. 60-40 split
set.seed(9)
split3 <- sample.split(Default, SplitRatio = 0.6)
Defaulttrain3 <- subset(Default, split3==TRUE)
Defaulttest3<- subset(Default, split3==FALSE)
## Logistic model
train3.glm <- glm(default~income+balance, data=Defaulttrain3, family="binomial")
summary(train3.glm)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Defaulttrain3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4085 -0.1487 -0.0623 -0.0236 3.6705
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.123e+01 5.891e-01 -19.055 < 2e-16 ***
## income 2.009e-05 6.778e-06 2.964 0.00304 **
## balance 5.462e-03 3.057e-04 17.863 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1517.14 on 4999 degrees of freedom
## Residual deviance: 836.79 on 4997 degrees of freedom
## AIC: 842.79
##
## Number of Fisher Scoring iterations: 8
Default.probs3 <- predict(train3.glm, Defaulttest3, type="response")
Default.pred3 <- rep("No", 5000)
Default.pred3[Default.probs3>0.5] <- "Yes"
table(Default.pred3, Defaulttest3$default)
##
## Default.pred3 No Yes
## No 4829 104
## Yes 13 54
#
mean(Default.pred3!=Defaulttest3$default)
## [1] 0.0234
There isn’t a big difference in the test error rate among the different splits.
set.seed(3)
splits <- sample.split(Default, SplitRatio = 0.8)
train <- subset(Default, splits==TRUE)
test<- subset(Default, splits==FALSE)
## Logistic model
glm.fit<- glm(default~income+balance+student, data=train, family="binomial")
summary(glm.fit)
##
## Call:
## glm(formula = default ~ income + balance + student, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1417 -0.1394 -0.0533 -0.0192 3.7559
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.110e+01 5.865e-01 -18.932 <2e-16 ***
## income 5.910e-06 9.598e-06 0.616 0.5381
## balance 5.791e-03 2.779e-04 20.840 <2e-16 ***
## studentYes -5.852e-01 2.777e-01 -2.107 0.0351 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2062.7 on 7499 degrees of freedom
## Residual deviance: 1116.6 on 7496 degrees of freedom
## AIC: 1124.6
##
## Number of Fisher Scoring iterations: 8
probs <- predict(glm.fit,test, type="response")
pred <- rep("No", 2500)
pred[probs>0.5] <- "Yes"
table(pred, test$default)
##
## pred No Yes
## No 2382 70
## Yes 16 32
#
mean(pred!=test$default)
## [1] 0.0344
There isn’t a difference in the test error rate when the dummy variable was included.
We continue to consider the use of a logistic regression model to predict the probability of default using income and balance on the Default data set. In particular, we will now compute estimates for the standard errors of the income and balance logistic regression coefficients in two different ways: (1) using the bootstrap, and (2) usingthe standard formula for computing the standard errors in the glm() function. Do not forget to set a random seed before beginning your analysis.
summary(glm(default~income+balance, data=Default, family="binomial"))$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154047e+01 4.347564e-01 -26.544680 2.958355e-155
## income 2.080898e-05 4.985167e-06 4.174178 2.990638e-05
## balance 5.647103e-03 2.273731e-04 24.836280 3.638120e-136
The standard error for the coefficient associated with income is 4.985e^-6 The standard error for the coefficient associated with balance is 2.274e^-4
boot.fn <- function(data, index=1:nrow(data)){
coef(glm(default~income+balance, data=data, subset=index,family="binomial"))[-1]
}
set.seed(1)
# The boot function
boot(Default, boot.fn, R=1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 2.080898e-05 1.680317e-07 4.866284e-06
## t2* 5.647103e-03 1.855765e-05 2.298949e-04
These estimates are pretty similar but we see the estimates for the bootstrap standard error is a little bit bigger than the one for the glm function() indicating the glm() function had the best fit.
#Exercise 9 9. We will now consider the Boston housing data set, from the ISLR2 library. (a) Based on this data set, provide an estimate for the population mean of medv. Call this estimate ˆµ.
u <-mean(medv)
u
## [1] 22.53281
s.e <- sd(medv)/sqrt(nrow(Boston))
s.e
## [1] 0.4088611
se.fn <- function(data, index){
mean(data[index])
}
set.seed(1)
boot(Boston$medv, se.fn, R=1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = se.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.007650791 0.4106622
cfI <- c(u-2*0.4106622, u+2*0.4106622)
cfI
## [1] 21.71148 23.35413
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
The confidence interval using the t-test function is wider indicating instability.
U.med <- median(medv)
U.med
## [1] 21.2
st.med <- function(data, index){
median(data[index])
}
set.seed(1)
boot(Boston$medv, st.med, R=1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = st.med, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 0.02295 0.3778075
u0.1 <- quantile(medv, 0.1)
u0.1
## 10%
## 12.75
quantile.fn <- function(data, index){
quantile(data[index], 0.1)
}
set.seed(1)
boot(Boston$medv, quantile.fn, R=1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = quantile.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.0339 0.4767526
From the output, we see that there is little bias in the estimate for the 10th percentile.