In chapter 4, we used logistic regression to predict the probability of default using income and balance on the default data set. We will now estimate the test error of this logistic regression model using the validation set approach. Do not forget to set a random seed before beginning your analysis.
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.3
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
attach(Default)
set.seed(1)
lr <- glm(default ~ income + balance, family = binomial, data = Default)
summary(lr)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
pred <- predict(lr, Default$default, type = "response")
pred.class <- ifelse(pred > 0.5, "Yes", "No")
round(mean(Default$default != pred.class), 4)
## [1] 0.0263
Split the sample set into a training set and a validation set.
Fit a multiple logistic regression model using only the training observations.
Obtain a prediction of default status for each individual in the validation set by computing the posterior probability of default for that individual, and classifying the individual to the default category if the posterior probability is greater than 0.5.
Compute the validation set error, which is the fraction of the observations in the validation set that are misclassified.
subset <- sample(nrow(Default), nrow(Default) * 0.9)
default.train <- Default[subset, ]
default.test <- Default[-subset, ]
lr2 <- glm(default ~ income + balance, family = binomial, data = default.train)
pred2 <- predict(lr2, default.test, type = "response")
class2 <- ifelse(pred2 > 0.5, "Yes", "No")
table(default.test$default, class2, dnn = c("Actual", "Predicted"))
## Predicted
## Actual No Yes
## No 962 1
## Yes 28 9
mr <- round(mean(class2 != default.test$default), 4)
print(mr)
## [1] 0.029
subset<-sample(nrow(Default),nrow(Default)*0.8)
default.train<-Default[subset,]
default.test<-Default[-subset,]
lr3<-glm(default~income+balance,family = binomial,data=default.train)
pred3<-predict(lr3,default.test,type="response")
class3<-ifelse(pred3>0.5,"Yes","No")
mr3<-round(mean(class3!=default.test$default),4)
subset<-sample(nrow(Default),nrow(Default)*0.7)
default.train<-Default[subset,]
default.test<-Default[-subset,]
lr4<-glm(default~income+balance,family = binomial,data=default.train)
pred4<-predict(lr4,default.test,type="response")
class4<-ifelse(pred4>0.5,"Yes","No")
mr4<-round(mean(class4!=default.test$default),4)
subset<-sample(nrow(Default),nrow(Default)*0.5)
default.train<-Default[subset,]
default.test<-Default[-subset,]
lr5<-glm(default~income+balance,family = binomial,data=default.train)
pred5<-predict(lr5,default.test,type="response")
class5<-ifelse(pred5>0.5,"Yes","No")
mr5<-round(mean(class5!=default.test$default),4)
print(mr3)
## [1] 0.0235
print(mr4)
## [1] 0.0283
print(mr5)
## [1] 0.0292
set.seed(1)
subset <- sample(nrow(Default), nrow(Default) * 0.7)
default.train <- Default[subset, ]
default.test <- Default[-subset, ]
lr6 <- glm(default ~ income + balance + student, family = binomial, data = default.train)
summary(lr6)
##
## Call:
## glm(formula = default ~ income + balance + student, family = binomial,
## data = default.train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.095e+01 5.880e-01 -18.616 <2e-16 ***
## income 6.273e-06 9.845e-06 0.637 0.524
## balance 5.678e-03 2.741e-04 20.715 <2e-16 ***
## studentYes -7.167e-01 2.886e-01 -2.483 0.013 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2030.3 on 6999 degrees of freedom
## Residual deviance: 1073.5 on 6996 degrees of freedom
## AIC: 1081.5
##
## Number of Fisher Scoring iterations: 8
pred6 <- predict(lr6, default.test, type = "response")
class6 <- ifelse(pred6 > 0.7, "Yes", "No")
table(default.test$default, class6, dnn = c("Actual", "Predicted"))
## Predicted
## Actual No Yes
## No 2898 0
## Yes 94 8
mr6 <- round(mean(class6 != default.test$default), 4)
print(mr6)
## [1] 0.0313
We continue to consider the use of a logistic regression model to predict the probability of default using income and balance on the default data set. In particular, we will now compute estimates for the standard errors of the income and balance logistic regression coefficients in two different ways: (1) using the bootstrap, and (2) using the standard formula for computing the standard errors in the glm() function. Do not forget to set a random seed before beginning your analysis.
set.seed(1)
lr <- glm(default ~ income + balance, family = binomial, data = Default)
summary(lr)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
boot.fn <- function(data, index) {
fit <- glm(default ~ income + balance, data = data, family = binomial, subset = index)
return(coef(fit))
}
## boot.fn takes data and gives the coefficient estimate for the given index
library(boot)
boot(Default, boot.fn, 100)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 8.556378e-03 4.122015e-01
## t2* 2.080898e-05 -3.993598e-07 4.186088e-06
## t3* 5.647103e-03 -4.116657e-06 2.226242e-04
In sections 5.3.2 and 5.3.3, we saw that the cv.glm() function can be used in order to compute the LOOCV test error estimate. Alternatively, one could compute those quantities using just the glm() and predict.glm() functions, and a for loop. You will now take this approach in order to compute the LOOCV error for a simple logistic regression model on the weekly data set. Recall that in the context of classification problems, the LOOCV error is given in (5.4).
attach(Weekly)
set.seed(1)
weekly <- glm(Direction ~ Lag1 + Lag2, family = binomial, data = Weekly)
summary(weekly)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22122 0.06147 3.599 0.000319 ***
## Lag1 -0.03872 0.02622 -1.477 0.139672
## Lag2 0.06025 0.02655 2.270 0.023232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1488.2 on 1086 degrees of freedom
## AIC: 1494.2
##
## Number of Fisher Scoring iterations: 4
weekly1 <- glm(Direction ~ Lag1 + Lag2, family = binomial, data = Weekly[-1, ])
predweekly <- predict(weekly, Weekly[1, ], type = "response")
predweekly.class <- ifelse(predweekly > 0.5, "Up", "Down")
predweekly.class
## 1
## "Up"
Fit a logistic regression model using all but the ith observation to predict direction using lag1 and lag2.
Compute the posterior probability of the market moving up for the ith observation.
Use the posterior probability for the ith observation in order to predict whether of not the market moves up.
Determine whether or not an error was made in predicting the direction for the ith observation. If an error was made, then idicate this as a 1, and otherwise indicate it as a 0.
error<-rep(0,dim(Weekly)[1])
for (i in 1:dim(Weekly)[1]){
fit.glm<-fit.glm <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-i, ], family = "binomial")
pred.up <- predict.glm(fit.glm, Weekly[i, ], type = "response") > 0.5
true.up <- Weekly[i, ]$Direction == "Up"
if (pred.up != true.up)
error[i] <- 1
}
mean(error)
## [1] 0.4499541
## The LOOCV estimate is about 45%.
We will now consider the Boston housing data set, from the islr2 library.
library(MASS)
attach(Boston)
sample_mu <- mean(medv)
sample_mu
## [1] 22.53281
standarderror <- sd(medv) / sqrt(nrow(Boston))
standarderror
## [1] 0.4088611
## The estimated standard error is .4088611. This gives us the accuracy of the estimate or how much the mean will vary based on the sample chosen.
library(boot)
set.seed(1)
boot.fn <- function(data, index) {
mu <- mean(data[index])
return(mu)
}
set.seed(1)
boot(medv, boot.fn, 100)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.009027668 0.3482331
## The standard error obtained here is very similar to that obtained in part (b).
t.test(medv)
##
## One Sample t-test
##
## data: medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
confint <- c(sample_mu - 2 * standarderror, sample_mu + 2 * standarderror)
confint
## [1] 21.71508 23.35053
## The confidence intervals obtained from the two methods are very very similar.
mu_median <- median(medv)
mu_median
## [1] 21.2
library(boot)
set.seed(1)
boot.fn <- function(data, index) {
median_mu <- median(data[index])
return(median_mu)
}
set.seed(1)
boot(medv, boot.fn, 100)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.029 0.3461316
## The standard error of the median is found here using bootstrapping.
quantile_mu <- quantile(medv, c(0.1))
quantile_mu
## 10%
## 12.75
boot.fn <- function(data, index) {
mu_quantile <- quantile(data[index], c(0.1))
return(mu_quantile)
}
boot(medv, boot.fn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.0285 0.4861472
## This standard error is small compared to the tenth percentile so we know that we have a good estimate of the population.