Load Systematic Investor Toolbox (SIT)

con = gzcon(url('https://github.com/systematicinvestor/SIT/raw/master/sit.gz', 'rb'))
source(con)
close(con)

a. Fit a logistic regression model that uses income and balance to predict default.

library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.2.3
data <- Default
head(data,6)
##   default student   balance    income
## 1      No      No  729.5265 44361.625
## 2      No     Yes  817.1804 12106.135
## 3      No      No 1073.5492 31767.139
## 4      No      No  529.2506 35704.494
## 5      No      No  785.6559 38463.496
## 6      No     Yes  919.5885  7491.559
model1<- glm(default ~ income + balance, data=Default, family="binomial")
summary(model1)
## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4725  -0.1444  -0.0574  -0.0211   3.7245  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

B. Using the validation set approach, estimate the test error of this model.

Step 1: Split the sample set into a training set and a validation set.

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked _by_ '.GlobalEnv':
## 
##     count, lst
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rsample)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.2     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(ggplot2)
data_split <- initial_split(Default, prop = .7, strata ="default")
data_train <- training(data_split)
head(data_train)
##   default student  balance    income
## 1      No      No 729.5265 44361.625
## 2      No     Yes 817.1804 12106.135
## 4      No      No 529.2506 35704.494
## 5      No      No 785.6559 38463.496
## 6      No     Yes 919.5885  7491.559
## 7      No      No 825.5133 24905.227
# Step 2: Fit a multiple logistic regression model using only the training observations.
model2 <- glm(default ~ income + balance, data = data_train, family="binomial")
summary(model2)
## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = data_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4444  -0.1445  -0.0582  -0.0215   3.7194  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.152e+01  5.201e-01 -22.147  < 2e-16 ***
## income       2.227e-05  6.133e-06   3.632 0.000282 ***
## balance      5.577e-03  2.703e-04  20.636  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1989.6  on 6999  degrees of freedom
## Residual deviance: 1077.2  on 6997  degrees of freedom
## AIC: 1083.2
## 
## Number of Fisher Scoring iterations: 8

Step 3: Obtain a prediction of default status for each individual in the validation set by computing the posterior probability of default for that individual, and classifying the individual to the default category if the posterior probability is greater than 0.5.

pred <- predict(model2, newdata=data_train, type = "response")
model.pred <- rep("No", length(pred))
model.pred[pred > 0.5] <- "Yes"
# Step 4: Compute the validation set error, which is the fraction of the observations in the validation set that are misclassified.
mean(model.pred != data_train$default)
## [1] 0.025

#c. Repeat the process in (b) three times, using three different splits of the observations into a training set and a validation set. Comment on the results obtained. # The first observation

data_split1 <- initial_split(Default, prop = .8, strata ="default")
data_train1 <- training(data_split1)
head(data_train1)
##   default student   balance    income
## 1      No      No  729.5265 44361.625
## 2      No     Yes  817.1804 12106.135
## 3      No      No 1073.5492 31767.139
## 4      No      No  529.2506 35704.494
## 5      No      No  785.6559 38463.496
## 6      No     Yes  919.5885  7491.559

Step 2: Fit a multiple logistic regression model using only the training observations.

model3 <- glm(default ~ income + balance, data = data_train1, family="binomial")
summary(model3)
## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = data_train1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4676  -0.1466  -0.0597  -0.0221   3.7168  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.157e+01  4.852e-01 -23.849  < 2e-16 ***
## income       2.404e-05  5.542e-06   4.338 1.44e-05 ***
## balance      5.587e-03  2.504e-04  22.316  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2360.7  on 7999  degrees of freedom
## Residual deviance: 1288.9  on 7997  degrees of freedom
## AIC: 1294.9
## 
## Number of Fisher Scoring iterations: 8

Step 3: Obtain a prediction of default status for each individual in the validation set by computing the posterior probability of default for that individual, and classifying the individual to the default category if the posterior probability is greater than 0.5.

pred1 <- predict(model3, newdata=data_train1, type = "response")
model.pred1 <- rep("No", length(pred1))
model.pred1[pred1 > 0.5] <- "Yes"

Step 4: Compute the validation set error, which is the fraction of the observations in the validation set that are misclassified.

obs1 <- mean(model.pred1 != data_train1$default)
obs1
## [1] 0.0265

The second observation

data_split2 <- initial_split(Default, prop = .9, strata ="default")
data_train2 <- training(data_split2)
head(data_train1)
##   default student   balance    income
## 1      No      No  729.5265 44361.625
## 2      No     Yes  817.1804 12106.135
## 3      No      No 1073.5492 31767.139
## 4      No      No  529.2506 35704.494
## 5      No      No  785.6559 38463.496
## 6      No     Yes  919.5885  7491.559

Step 2: Fit a multiple logistic regression model using only the training observations.

model4 <- glm(default ~ income + balance, data = data_train, family="binomial")
summary(model4)
## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = data_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4444  -0.1445  -0.0582  -0.0215   3.7194  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.152e+01  5.201e-01 -22.147  < 2e-16 ***
## income       2.227e-05  6.133e-06   3.632 0.000282 ***
## balance      5.577e-03  2.703e-04  20.636  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1989.6  on 6999  degrees of freedom
## Residual deviance: 1077.2  on 6997  degrees of freedom
## AIC: 1083.2
## 
## Number of Fisher Scoring iterations: 8

Step 3: Obtain a prediction of default status for each individual in the validation set by computing the posterior probability of default for that individual, and classifying the individual to the default category if the posterior probability is greater than 0.5.

pred2 <- predict(model4, newdata=data_train2, type = "response")
model.pred2 <- rep("No", length(pred2))
model.pred2[pred2 > 0.5] <- "Yes"

Step 4: Compute the validation set error, which is the fraction of the observations in the validation set that are misclassified.

obs2 <- mean(model.pred2 != data_train2$default)
obs2
## [1] 0.02633333

The third observation

data_split3 <- initial_split(Default, prop = 0.5, strata ="default")
data_train3 <- training(data_split3)
head(data_train3)
##    default student  balance    income
## 2       No     Yes 817.1804 12106.135
## 5       No      No 785.6559 38463.496
## 6       No     Yes 919.5885  7491.559
## 7       No      No 825.5133 24905.227
## 8       No     Yes 808.6675 17600.451
## 13      No      No 237.0451 28251.695

Step 2: Fit a multiple logistic regression model using only the training observations.

model5 <- glm(default ~ income + balance, data = data_train, family="binomial")
summary(model5)
## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = data_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4444  -0.1445  -0.0582  -0.0215   3.7194  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.152e+01  5.201e-01 -22.147  < 2e-16 ***
## income       2.227e-05  6.133e-06   3.632 0.000282 ***
## balance      5.577e-03  2.703e-04  20.636  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1989.6  on 6999  degrees of freedom
## Residual deviance: 1077.2  on 6997  degrees of freedom
## AIC: 1083.2
## 
## Number of Fisher Scoring iterations: 8

Step 3: Obtain a prediction of default status for each individual in the validation set by computing the posterior probability of default for that individual, and classifying the individual to the default category if the posterior probability is greater than 0.5.

pred3 <- predict(model5, newdata=data_train3, type = "response")
model.pred3 <- rep("No", length(pred3))
model.pred3[pred3 > 0.5] <- "Yes"

Step 4: Compute the validation set error, which is the fraction of the observations in the validation set that are misclassified.

obs3 <- mean(model.pred3 != data_train3$default)
obs3
## [1] 0.0262
prop <- c(0.8, 0.9, 0.5)
VSE <- c(obs1, obs2, obs3)
a <- data.frame(prop, VSE)
a
##   prop        VSE
## 1  0.8 0.02650000
## 2  0.9 0.02633333
## 3  0.5 0.02620000

d. Now consider a logistic regression model that predicts the probability of default using income, balance, and a dummy variable for student. Estimate the test error for this model using the validation set approach. Comment on whether or not including a dummy variable for student leads to a reduction in the test error rate.

data_split4 <- initial_split(Default, prop = 0.5, strata ="default")
data_train4 <- training(data_split4)
head(data_train4)
##    default student   balance    income
## 1       No      No  729.5265 44361.625
## 3       No      No 1073.5492 31767.139
## 4       No      No  529.2506 35704.494
## 5       No      No  785.6559 38463.496
## 6       No     Yes  919.5885  7491.559
## 11      No     Yes    0.0000 21871.073

Step 2: Fit a multiple logistic regression model using only the training observations.

model6 <- glm(default ~ income + balance + student, data = data_train4, family="binomial")
summary(model6)
## 
## Call:
## glm(formula = default ~ income + balance + student, family = "binomial", 
##     data = data_train4)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8810  -0.1313  -0.0503  -0.0183   3.7949  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.145e+01  7.067e-01 -16.205   <2e-16 ***
## income       7.300e-06  1.148e-05   0.636    0.525    
## balance      6.001e-03  3.412e-04  17.585   <2e-16 ***
## studentYes  -4.466e-01  3.335e-01  -1.339    0.181    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1503.85  on 4999  degrees of freedom
## Residual deviance:  769.72  on 4996  degrees of freedom
## AIC: 777.72
## 
## Number of Fisher Scoring iterations: 8

Step 3: Obtain a prediction of default status for each individual in the validation set by computing the posterior probability of default for that individual, and classifying the individual to the default category if the posterior probability is greater than 0.5.

pred4 <- predict(model5, newdata=data_train4, type = "response")
model.pred4 <- rep("No", length(pred4))
model.pred4[pred4 > 0.5] <- "Yes"

Step 4: Compute the validation set error, which is the fraction of the observations in the validation set that are misclassified.

mean(model.pred4 != data_train4$default)
## [1] 0.0268