con = gzcon(url('https://github.com/systematicinvestor/SIT/raw/master/sit.gz', 'rb'))
source(con)
close(con)
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.2.3
data <- Default
head(data,6)
## default student balance income
## 1 No No 729.5265 44361.625
## 2 No Yes 817.1804 12106.135
## 3 No No 1073.5492 31767.139
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
model1<- glm(default ~ income + balance, data=Default, family="binomial")
summary(model1)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked _by_ '.GlobalEnv':
##
## count, lst
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rsample)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.2 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(ggplot2)
data_split <- initial_split(Default, prop = .7, strata ="default")
data_train <- training(data_split)
head(data_train)
## default student balance income
## 1 No No 729.5265 44361.625
## 2 No Yes 817.1804 12106.135
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
## 7 No No 825.5133 24905.227
# Step 2: Fit a multiple logistic regression model using only the training observations.
model2 <- glm(default ~ income + balance, data = data_train, family="binomial")
summary(model2)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4444 -0.1445 -0.0582 -0.0215 3.7194
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.152e+01 5.201e-01 -22.147 < 2e-16 ***
## income 2.227e-05 6.133e-06 3.632 0.000282 ***
## balance 5.577e-03 2.703e-04 20.636 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1989.6 on 6999 degrees of freedom
## Residual deviance: 1077.2 on 6997 degrees of freedom
## AIC: 1083.2
##
## Number of Fisher Scoring iterations: 8
pred <- predict(model2, newdata=data_train, type = "response")
model.pred <- rep("No", length(pred))
model.pred[pred > 0.5] <- "Yes"
# Step 4: Compute the validation set error, which is the fraction of the observations in the validation set that are misclassified.
mean(model.pred != data_train$default)
## [1] 0.025
#c. Repeat the process in (b) three times, using three different splits of the observations into a training set and a validation set. Comment on the results obtained. # The first observation
data_split1 <- initial_split(Default, prop = .8, strata ="default")
data_train1 <- training(data_split1)
head(data_train1)
## default student balance income
## 1 No No 729.5265 44361.625
## 2 No Yes 817.1804 12106.135
## 3 No No 1073.5492 31767.139
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
model3 <- glm(default ~ income + balance, data = data_train1, family="binomial")
summary(model3)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = data_train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4676 -0.1466 -0.0597 -0.0221 3.7168
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.157e+01 4.852e-01 -23.849 < 2e-16 ***
## income 2.404e-05 5.542e-06 4.338 1.44e-05 ***
## balance 5.587e-03 2.504e-04 22.316 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2360.7 on 7999 degrees of freedom
## Residual deviance: 1288.9 on 7997 degrees of freedom
## AIC: 1294.9
##
## Number of Fisher Scoring iterations: 8
pred1 <- predict(model3, newdata=data_train1, type = "response")
model.pred1 <- rep("No", length(pred1))
model.pred1[pred1 > 0.5] <- "Yes"
obs1 <- mean(model.pred1 != data_train1$default)
obs1
## [1] 0.0265
data_split2 <- initial_split(Default, prop = .9, strata ="default")
data_train2 <- training(data_split2)
head(data_train1)
## default student balance income
## 1 No No 729.5265 44361.625
## 2 No Yes 817.1804 12106.135
## 3 No No 1073.5492 31767.139
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
model4 <- glm(default ~ income + balance, data = data_train, family="binomial")
summary(model4)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4444 -0.1445 -0.0582 -0.0215 3.7194
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.152e+01 5.201e-01 -22.147 < 2e-16 ***
## income 2.227e-05 6.133e-06 3.632 0.000282 ***
## balance 5.577e-03 2.703e-04 20.636 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1989.6 on 6999 degrees of freedom
## Residual deviance: 1077.2 on 6997 degrees of freedom
## AIC: 1083.2
##
## Number of Fisher Scoring iterations: 8
pred2 <- predict(model4, newdata=data_train2, type = "response")
model.pred2 <- rep("No", length(pred2))
model.pred2[pred2 > 0.5] <- "Yes"
obs2 <- mean(model.pred2 != data_train2$default)
obs2
## [1] 0.02633333
data_split3 <- initial_split(Default, prop = 0.5, strata ="default")
data_train3 <- training(data_split3)
head(data_train3)
## default student balance income
## 2 No Yes 817.1804 12106.135
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
## 7 No No 825.5133 24905.227
## 8 No Yes 808.6675 17600.451
## 13 No No 237.0451 28251.695
model5 <- glm(default ~ income + balance, data = data_train, family="binomial")
summary(model5)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4444 -0.1445 -0.0582 -0.0215 3.7194
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.152e+01 5.201e-01 -22.147 < 2e-16 ***
## income 2.227e-05 6.133e-06 3.632 0.000282 ***
## balance 5.577e-03 2.703e-04 20.636 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1989.6 on 6999 degrees of freedom
## Residual deviance: 1077.2 on 6997 degrees of freedom
## AIC: 1083.2
##
## Number of Fisher Scoring iterations: 8
pred3 <- predict(model5, newdata=data_train3, type = "response")
model.pred3 <- rep("No", length(pred3))
model.pred3[pred3 > 0.5] <- "Yes"
obs3 <- mean(model.pred3 != data_train3$default)
obs3
## [1] 0.0262
prop <- c(0.8, 0.9, 0.5)
VSE <- c(obs1, obs2, obs3)
a <- data.frame(prop, VSE)
a
## prop VSE
## 1 0.8 0.02650000
## 2 0.9 0.02633333
## 3 0.5 0.02620000
data_split4 <- initial_split(Default, prop = 0.5, strata ="default")
data_train4 <- training(data_split4)
head(data_train4)
## default student balance income
## 1 No No 729.5265 44361.625
## 3 No No 1073.5492 31767.139
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
## 11 No Yes 0.0000 21871.073
model6 <- glm(default ~ income + balance + student, data = data_train4, family="binomial")
summary(model6)
##
## Call:
## glm(formula = default ~ income + balance + student, family = "binomial",
## data = data_train4)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8810 -0.1313 -0.0503 -0.0183 3.7949
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.145e+01 7.067e-01 -16.205 <2e-16 ***
## income 7.300e-06 1.148e-05 0.636 0.525
## balance 6.001e-03 3.412e-04 17.585 <2e-16 ***
## studentYes -4.466e-01 3.335e-01 -1.339 0.181
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1503.85 on 4999 degrees of freedom
## Residual deviance: 769.72 on 4996 degrees of freedom
## AIC: 777.72
##
## Number of Fisher Scoring iterations: 8
pred4 <- predict(model5, newdata=data_train4, type = "response")
model.pred4 <- rep("No", length(pred4))
model.pred4[pred4 > 0.5] <- "Yes"
mean(model.pred4 != data_train4$default)
## [1] 0.0268