library(ISLR2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
logistic_model <- glm(default ~ balance + income, data = Default, family = "binomial")
summary(logistic_model)
##
## Call:
## glm(formula = default ~ balance + income, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
Default |>
count()
## n
## 1 10000
# 10,000 entries
set.seed(1234)
train_indices <- sample(1:nrow(Default), 5000)
default_train <- Default[train_indices,]
default_validation <- Default[-train_indices,]
logistic_model1 <- glm(default ~ balance + income, data = default_train, family = "binomial")
summary(logistic_model1)
##
## Call:
## glm(formula = default ~ balance + income, family = "binomial",
## data = default_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.121e+01 6.002e-01 -18.682 < 2e-16 ***
## balance 5.401e-03 3.133e-04 17.243 < 2e-16 ***
## income 2.171e-05 7.102e-06 3.056 0.00224 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1423.08 on 4999 degrees of freedom
## Residual deviance: 822.85 on 4997 degrees of freedom
## AIC: 828.85
##
## Number of Fisher Scoring iterations: 8
glm.probs <- predict(logistic_model1,default_validation,type = "response")
glm.probs[1:10]
## 1 2 4 6 7 8
## 1.816298e-03 1.448456e-03 5.108988e-04 2.276545e-03 1.999246e-03 1.558407e-03
## 10 11 12 13
## 2.549243e-05 2.170780e-05 1.297578e-02 8.970163e-05
glm.pred <- rep("No-Default", 5000)
glm.pred[glm.probs > .5] <- "Default"
# computing confusion matrix for that
table(Predicted = glm.pred, Actual = default_validation$default)
## Actual
## Predicted No Yes
## Default 15 61
## No-Default 4813 111
From the confusion matrix, the misclassified observations are 15 (predicted default, actually no) and 111 (predicted no default, actually yes), so the validation set error is (15 + 111)/5000 = 0.0252.
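The same fraction can be computed directly from the predicted probabilities instead of reading it off the table (a minimal sketch; pred_labels is a hypothetical name, and the predictions are relabelled with the "No"/"Yes" levels used by default_validation$default):
pred_labels <- ifelse(glm.probs > .5, "Yes", "No")
mean(pred_labels != default_validation$default)  # should reproduce (15 + 111)/5000 = 0.0252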
# 70/30 split
set.seed(1)
train_indices1 <- sample(1:nrow(Default), size = 0.7 * nrow(Default))
default_train1 <- Default[train_indices1,]
default_validation1 <- Default[-train_indices1,]
logistic_model2 <- glm(default ~ balance + income, data = default_train1, family = 'binomial')
glm.probs1 <- predict(logistic_model2,default_validation1, type = 'response')
glm.probs1[1:10]
## 1 10 11 12 13 17
## 1.551069e-03 1.810004e-05 1.497496e-05 1.070570e-02 6.608037e-05 3.097617e-05
## 20 23 25 28
## 7.480338e-03 1.132023e-02 1.531505e-03 6.087753e-02
glm.pred1 <- rep("No-default", length(glm.probs1))
glm.pred1[glm.probs1 > .5] <- "Default"
table(Predicted = glm.pred1, Actual = default_validation1$default)
## Actual
## Predicted No Yes
## Default 2 24
## No-default 2896 78
For the first iteration with a 70/30 split, the validation set error is (2 + 78)/3000 ≈ 0.0267, obtained by adding the misclassified observations and dividing by the size of the validation set. Let's see how other splits compare.
# 80/20 split: second iteration
set.seed(2)
train_indices2 <- sample(1:nrow(Default), size = 0.8 * nrow(Default))
default_train2 <- Default[train_indices2,]
default_validation2 <- Default[-train_indices2,]
logistic_model3 <- glm(default ~ balance + income, data = default_train2, family = 'binomial')
glm.probs2 <- predict(logistic_model3,default_validation2, type = 'response')
glm.probs2[1:10]
## 6 18 19 26 28 35
## 0.0025154553 0.0003313104 0.0005270453 0.0023231950 0.0691018486 0.0509363519
## 46 54 57 59
## 0.0003048062 0.0026761887 0.0206144559 0.0401659297
glm.pred2 <- rep("No-default", length(glm.probs2))
glm.pred2[glm.probs2 > .5] <- "Default"
table(Predicted = glm.pred2, Actual = default_validation2$default)
## Actual
## Predicted No Yes
## Default 9 29
## No-default 1929 33
length(glm.probs2)
## [1] 2000
With the 80/20 split, the validation set error is (9 + 33)/2000 = 0.021, slightly lower than the 70/30 split. The validation set is smaller, though, so this single estimate may be less reliable.
# 90/10
set.seed(3)
train_indices3 <- sample(1:nrow(Default), size = 0.9 * nrow(Default))
default_train3 <- Default[train_indices3,]
default_validation3 <- Default[-train_indices3,]
logistic_model4 <- glm(default ~ balance + income, data = default_train3, family = 'binomial')
glm.probs3 <- predict(logistic_model4,default_validation3, type = 'response')
glm.pred3 <- rep("No-default", length(glm.probs3))
glm.pred3[glm.probs3 > .5] <- "Default"
table(Predicted = glm.pred3, Actual = default_validation3$default)
## Actual
## Predicted No Yes
## Default 4 8
## No-default 970 18
With the 90/10 split, the validation set error is (4 + 18)/1000 = 0.022. Overall, the validation set errors are quite similar across the three splits.
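To make these comparisons easier to repeat, the split/fit/evaluate steps can be wrapped in a small helper function (a sketch; val_error() is a hypothetical name and not part of the original code):
val_error <- function(train_frac, seed) {
  # Draw a random training set of the requested fraction of rows
  set.seed(seed)
  idx <- sample(1:nrow(Default), size = train_frac * nrow(Default))
  fit <- glm(default ~ balance + income, data = Default[idx, ], family = "binomial")
  # Predict on the held-out rows and return the misclassification rate
  probs <- predict(fit, Default[-idx, ], type = "response")
  preds <- ifelse(probs > .5, "Yes", "No")
  mean(preds != Default$default[-idx])
}
val_error(0.7, seed = 1)  # one validation-set estimate for a 70/30 split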
# let's create a dummy variable for student
Default$dummy_student <- ifelse(Default$student == 'Yes',1,0)
# We'll do a 70/30 validation set approach
set.seed(4)
train_indices4 <- sample(1:nrow(Default), size = 0.7 * nrow(Default))
default_train4 <- Default[train_indices4,]
default_validation4 <- Default[-train_indices4,]
logistic_model5 <- glm(default ~ balance + income + dummy_student, data = default_train4, family = 'binomial')
glm.probs4 <- predict(logistic_model5,default_validation4, type = 'response')
glm.pred4 <- rep("No-default", length(glm.probs4))
glm.pred4[glm.probs4 > .5] <- "Default"
table(Predicted = glm.pred4, Actual = default_validation4$default)
## Actual
## Predicted No Yes
## Default 17 32
## No-default 2897 54
The validation set error is (17 + 54)/3000 ≈ 0.0237. For the 70/30 split without the student dummy the error was about 0.0267, so including the student variable lowers it slightly here.
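As an aside, the hand-made dummy is not strictly required: glm() encodes factor predictors automatically, so a model using student directly should be equivalent (a sketch with a hypothetical object name, not part of the original run):
# Equivalent fit: R builds the studentYes indicator internally from the factor
logistic_model5_alt <- glm(default ~ balance + income + student,
                           data = default_train4, family = "binomial")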
lg_model <- glm(default ~ balance + income, data = Default, family = 'binomial')
summary(lg_model)
##
## Call:
## glm(formula = default ~ balance + income, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
lg_model$coefficients
## (Intercept) balance income
## -1.154047e+01 5.647103e-03 2.080898e-05
boot.fn <- function(data,index){
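# Return the logistic regression coefficients fit on the bootstrap sample selected by index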
coef(glm(default ~ balance + income, data = data, family = 'binomial', subset = index))
}
boot.fn(Default,1:10000)
## (Intercept) balance income
## -1.154047e+01 5.647103e-03 2.080898e-05
library(boot)
set.seed(100)
boot(Default, boot.fn, R=1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -2.927307e-02 4.446538e-01
## t2* 5.647103e-03 1.665079e-05 2.318937e-04
## t3* 2.080898e-05 -3.481513e-08 4.916633e-06
We now have standard errors for the coefficient estimates from both the bootstrap and the standard formula reported by summary(). The summary() output gives 2.274e-04 for balance and 4.985e-06 for income, while the bootstrap gives 2.319e-04 for balance and 4.917e-06 for income, so the two approaches agree closely.
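To line the two sets of standard errors up programmatically, one option is to keep the boot() result and compare it with the summary() table (a sketch; boot_out is a hypothetical name, and re-running with the same seed should reproduce the replicates shown above):
set.seed(100)
boot_out <- boot(Default, boot.fn, R = 1000)
# Column 1: formula-based SEs from summary(); column 2: SD of the bootstrap replicates
cbind(formula_se = summary(lg_model)$coefficients[, "Std. Error"],
      bootstrap_se = apply(boot_out$t, 2, sd))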
lg_model_a <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = 'binomial')
lg_model_b <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-1,] ,family = 'binomial')
glm.prob_b <- predict(lg_model_b, Weekly[1,] , type = 'response')
glm.prob_b
## 1
## 0.5713923
contrasts(Weekly$Direction)
## Up
## Down 0
## Up 1
The model gives a predicted probability of 0.57 for the first observation, which corresponds to 'Up' since it exceeds 0.5 (contrasts() confirms that 'Up' is coded as 1). The observation is misclassified, because its actual direction is 'Down'.
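# Manual LOOCV: for each observation i, refit the model without it, predict it, and record whether the prediction was wrong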
glm.pred_c <- rep('Down', nrow(Weekly))
error_vector <- rep(0,nrow(Weekly))
for (i in 1:nrow(Weekly)) {
m <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-i,] ,family = 'binomial')
glm.prob_c <- predict(m, Weekly[i, ,drop = FALSE] , type = 'response')
if (glm.prob_c > 0.5) {
glm.pred_c[i] <- "Up"
}
error_vector[i] <- ifelse(glm.pred_c[i] != Weekly$Direction[i], 1,0)
}
mean(error_vector)
## [1] 0.4499541
The mean of the error vector is approximately 0.45, meaning the model misclassified about 45% of the observations across the leave-one-out loop. In other words, the LOOCV estimate of the test error rate is about 0.45.
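As a cross-check, the boot package's cv.glm() can produce a leave-one-out estimate with a misclassification cost function, which should come out essentially the same as the manual loop (a sketch mirroring the 0.5 cutoff used above; not part of the original solution):
# Cost = 1 when the fitted probability falls on the wrong side of 0.5
cost <- function(actual, fitted) mean(abs(actual - fitted) > 0.5)
cv.glm(Weekly, lg_model_a, cost, K = nrow(Weekly))$delta[1]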
mu_hat <- mean(Boston$medv)
std_err <- sd(Boston$medv)/sqrt(nrow(Boston))
std_err
## [1] 0.4088611
boot.mean <- function(data,index){
mean(data$medv[index])
}
set.seed(99)
boot(Boston, boot.mean, R= 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston, statistic = boot.mean, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 -0.01791186 0.4305781
The standard error from the formula is 0.409 and the bootstrap gives 0.431, so the two are quite close. The bootstrap estimate arguably relies on fewer assumptions than the analytical formula.
# 95% Confidence Interval based on bootstrap estimate
left <- mu_hat - 2*0.4305781
right <- mu_hat + 2*0.4305781
paste("[",left,",",right,"]")
## [1] "[ 21.6716501241107 , 23.3939625241107 ]"
# CI using t-test
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
The bootstrap-based confidence interval is (21.67, 23.39) and the t-test interval is (21.73, 23.34), so the two are very close.
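boot.ci() can also build an interval directly from the bootstrap replicates (a sketch; boot_mean_out is a hypothetical name for a stored copy of the boot() result above, re-run with the same seed):
set.seed(99)
boot_mean_out <- boot(Boston, boot.mean, R = 1000)
# Percentile-based 95% CI from the 1000 bootstrap means
boot.ci(boot_mean_out, type = "perc")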
mu_med <- median(Boston$medv)
mu_med
## [1] 21.2
boot.median <- function(data,index){
median(data$medv[index])
}
set.seed(987)
boot(Boston, boot.median, R= 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston, statistic = boot.median, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.0193 0.3728559
Using the bootstrap, the estimated standard error of the median is about 0.37. There is no simple analytical formula for this standard error, which is exactly where the bootstrap is useful.
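The bootstrap SE can be turned into a rough 95% interval in the same estimate ± 2·SE style used for the mean (a sketch using the numbers from the bootstrap output above):
# Rough 95% CI for the median: 21.2 +/- 2 * 0.3728559
c(mu_med - 2 * 0.3728559, mu_med + 2 * 0.3728559)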
mu_0.1 <- quantile(Boston$medv, probs = 0.10)
mu_0.1
## 10%
## 12.75
boot.tenth_perc <- function(data,index){
quantile(data$medv[index],prob = 0.10)
}
set.seed(912)
boot(Boston, boot.tenth_perc, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston, statistic = boot.tenth_perc, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 -0.0099 0.4977899
The bootstrap gives a standard error of approximately 0.50 for the estimated tenth percentile, which is small relative to the estimate of 12.75.