Lab 6: Cross-Validation and the Bootstrap

5)

a) Fitting logistic regression model

library(ISLR2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
logistic_model <- glm(default ~ balance + income, data = Default, family = "binomial")
summary(logistic_model)
## 
## Call:
## glm(formula = default ~ balance + income, family = "binomial", 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

b) Estimating test error using validation set approach

Default |>
  count()
##       n
## 1 10000
# 10,000 entries
i) training and validation split
set.seed(1234)
train_indices <- sample(1:nrow(Default), 5000) 

default_train <- Default[train_indices,]
default_validation <- Default[-train_indices,]
ii) fitting logistic on training data
logistic_model1 <- glm(default ~ balance + income, data = default_train, family = "binomial")
summary(logistic_model1)
## 
## Call:
## glm(formula = default ~ balance + income, family = "binomial", 
##     data = default_train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.121e+01  6.002e-01 -18.682  < 2e-16 ***
## balance      5.401e-03  3.133e-04  17.243  < 2e-16 ***
## income       2.171e-05  7.102e-06   3.056  0.00224 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1423.08  on 4999  degrees of freedom
## Residual deviance:  822.85  on 4997  degrees of freedom
## AIC: 828.85
## 
## Number of Fisher Scoring iterations: 8
iii) predicting for individuals in validation set
glm.probs <- predict(logistic_model1,default_validation,type = "response")
glm.probs[1:10]
##            1            2            4            6            7            8 
## 1.816298e-03 1.448456e-03 5.108988e-04 2.276545e-03 1.999246e-03 1.558407e-03 
##           10           11           12           13 
## 2.549243e-05 2.170780e-05 1.297578e-02 8.970163e-05
glm.pred <- rep("No-Default", 5000)
glm.pred[glm.probs > .5] <- "Default"
iv) computing the validation set error (fraction of misclassified observations)
# computing confusion matrix for that
table(Predicted = glm.pred, Actual = default_validation$default)
##             Actual
## Predicted      No  Yes
##   Default      15   61
##   No-Default 4813  111

So the misclassified observations are the 15 false positives (predicted Default, actually No) and the 111 false negatives (predicted No-Default, actually Yes), giving a validation set error of (15 + 111)/5000 = 126/5000 ≈ 0.025.
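The same error rate can be computed directly instead of adding up cells by hand. A minimal sketch, assuming glm.probs and default_validation from the chunks above are still in the workspace (val_error is just an illustrative name):

# misclassification rate: thresholded probability vs. the true label
val_error <- mean((glm.probs > 0.5) != (default_validation$default == "Yes"))
val_error # roughly 0.025 for this split (126/5000)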

c) Three different splits

# 70/30 split
set.seed(1)
train_indices1 <- sample(1:nrow(Default), size = 0.7 * nrow(Default))

default_train1 <- Default[train_indices1,]
default_validation1 <- Default[-train_indices1,]
logistic_model2 <- glm(default ~ balance + income, data = default_train1, family = 'binomial')
glm.probs1 <- predict(logistic_model2,default_validation1, type = 'response')
glm.probs1[1:10]
##            1           10           11           12           13           17 
## 1.551069e-03 1.810004e-05 1.497496e-05 1.070570e-02 6.608037e-05 3.097617e-05 
##           20           23           25           28 
## 7.480338e-03 1.132023e-02 1.531505e-03 6.087753e-02
glm.pred1 <- rep("No-default", length(glm.probs1))
glm.pred1[glm.probs1 > .5] <- "Default"

table(Predicted = glm.pred1, Actual = default_validation1$default)
##             Actual
## Predicted      No  Yes
##   Default       2   24
##   No-default 2896   78

For this first split (70/30), the validation set error is (2 + 78)/3000 ≈ 0.027, obtained by summing the misclassifications and dividing by the size of the validation set. Let's see how the other splits compare.

# 80/20 split: second iteration
set.seed(2)
train_indices2 <- sample(1:nrow(Default), size = 0.8 * nrow(Default))

default_train2 <- Default[train_indices2,]
default_validation2 <- Default[-train_indices2,]
logistic_model3 <- glm(default ~ balance + income, data = default_train2, family = 'binomial')
glm.probs2 <- predict(logistic_model3,default_validation2, type = 'response')
glm.probs2[1:10]
##            6           18           19           26           28           35 
## 0.0025154553 0.0003313104 0.0005270453 0.0023231950 0.0691018486 0.0509363519 
##           46           54           57           59 
## 0.0003048062 0.0026761887 0.0206144559 0.0401659297
glm.pred2 <- rep("No-default", length(glm.probs2))
glm.pred2[glm.probs2 > .5] <- "Default"

table(Predicted = glm.pred2, Actual = default_validation2$default)
##             Actual
## Predicted      No  Yes
##   Default       9   29
##   No-default 1929   33
length(glm.probs2)
## [1] 2000

With the 80/20 split, the validation set error is (9 + 33)/2000 = 0.021, a bit lower than with the 70/30 split. However, the validation set is now smaller, so this estimate is based on fewer held-out observations and is more variable.

# 90/10
set.seed(3)
train_indices3 <- sample(1:nrow(Default), size = 0.9 * nrow(Default))

default_train3 <- Default[train_indices3,]
default_validation3 <- Default[-train_indices3,]
logistic_model4 <- glm(default ~ balance + income, data = default_train3, family = 'binomial')
glm.probs3 <- predict(logistic_model4,default_validation3, type = 'response')


glm.pred3 <- rep("No-default", length(glm.probs3))
glm.pred3[glm.probs3 > .5] <- "Default"

table(Predicted = glm.pred3, Actual = default_validation3$default)
##             Actual
## Predicted     No Yes
##   Default      4   8
##   No-default 970  18

With the 90/10 split, the validation set error is (4 + 18)/1000 = 0.022.

So the validation set error estimates are very similar, roughly 0.02 to 0.03, across all three splits.
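Since the three splits repeat the same split/fit/predict/score steps, they could be wrapped in a small helper and called once per split. A sketch of that idea; the function name validation_error and its arguments are only for illustration:

# split Default, fit the logistic model on the training part,
# and return the misclassification rate on the held-out part
validation_error <- function(train_frac, seed) {
  set.seed(seed)
  idx <- sample(1:nrow(Default), size = train_frac * nrow(Default))
  fit <- glm(default ~ balance + income, data = Default[idx, ], family = "binomial")
  validation <- Default[-idx, ]
  probs <- predict(fit, validation, type = "response")
  mean((probs > 0.5) != (validation$default == "Yes"))
}
# e.g. validation_error(0.7, 1), validation_error(0.8, 2), validation_error(0.9, 3)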

d)

# let's create a dummy variable for student
Default$dummy_student <- ifelse(Default$student == 'Yes',1,0)
# We'll do a 70/30 validation set approach
set.seed(4)
train_indices4 <- sample(1:nrow(Default), size = 0.7 * nrow(Default))

default_train4 <- Default[train_indices4,]
default_validation4 <- Default[-train_indices4,]
logistic_model5 <- glm(default ~ balance + income + dummy_student, data = default_train4, family = 'binomial')
glm.probs4 <- predict(logistic_model5,default_validation4, type = 'response')


glm.pred4 <- rep("No-default", length(glm.probs4))
glm.pred4[glm.probs4 > .5] <- "Default"

table(Predicted = glm.pred4, Actual = default_validation4$default)
##             Actual
## Predicted      No  Yes
##   Default      17   32
##   No-default 2897   54

The validation set error is (17 + 54)/3000 ≈ 0.0237. The error for the 70/30 split without the student variable was ≈ 0.027, so the new number is slightly lower; however, the two fits also use different random splits, so the small difference should not be over-interpreted.
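As an aside, the explicit dummy variable is not strictly required: student is already a factor, so glm() would create the same 0/1 coding on its own. A sketch of the equivalent call (logistic_model5_alt is just an illustrative name); the coefficient on studentYes should match the one on dummy_student because the design matrices are identical:

# equivalent fit letting glm() dummy-code the student factor internally
logistic_model5_alt <- glm(default ~ balance + income + student,
                           data = default_train4, family = "binomial")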

6)

Estimating standard errors of logistic regression coefficients.

a)
lg_model <- glm(default ~ balance + income, data = Default, family = 'binomial')
summary(lg_model)
## 
## Call:
## glm(formula = default ~ balance + income, family = "binomial", 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8
lg_model$coefficients
##   (Intercept)       balance        income 
## -1.154047e+01  5.647103e-03  2.080898e-05
b)
boot.fn <- function(data,index){
  coef(glm(default ~ balance + income, data = data, family = 'binomial', subset = index))
}

boot.fn(Default,1:10000)
##   (Intercept)       balance        income 
## -1.154047e+01  5.647103e-03  2.080898e-05
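To see what boot() will do with this statistic, boot.fn() can also be fed a single index vector resampled with replacement, which is one bootstrap replicate. A sketch; the coefficients will differ slightly from the full-data fit and will vary with the seed:

# one manual bootstrap replicate: resample the 10,000 row indices with replacement
set.seed(1)
boot.fn(Default, sample(nrow(Default), replace = TRUE))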
c)
library(boot)
set.seed(100)

boot(Default, boot.fn, R=1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##          original        bias     std. error
## t1* -1.154047e+01 -2.927307e-02 4.446538e-01
## t2*  5.647103e-03  1.665079e-05 2.318937e-04
## t3*  2.080898e-05 -3.481513e-08 4.916633e-06
d)

We now have standard errors for the coefficient estimates from both the bootstrap and the standard formula used by the summary() function.

For balance, summary() gives 2.274e-04 and the bootstrap gives 2.319e-04; for income, summary() gives 4.985e-06 and the bootstrap gives 4.917e-06. The two sets of estimates are very close, which suggests the formula-based standard errors are reliable for this model.
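For reference, the formula-based standard errors used in this comparison can be pulled out of the fitted model programmatically rather than read off the printed summary. A minimal sketch:

# standard-formula SEs, same values as reported by summary(lg_model)
summary(lg_model)$coefficients[, "Std. Error"]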

7)

a)
lg_model_a <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = 'binomial')
b)
lg_model_b <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-1,] ,family = 'binomial')
c)
glm.prob_b <- predict(lg_model_b, Weekly[1,] , type = 'response')
glm.prob_b
##         1 
## 0.5713923
contrasts(Weekly$Direction)
##      Up
## Down  0
## Up    1

The model gives a predicted probability of about 0.57 for the first week; since this is greater than 0.5, it predicts 'Up'. The observation is misclassified, because the actual direction for that week was 'Down'.
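The same check can be written in code by comparing the thresholded prediction for the first week against the observed direction (pred_1 is just an illustrative name):

# predicted class for observation 1 vs. its actual Direction
pred_1 <- ifelse(glm.prob_b > 0.5, "Up", "Down")
pred_1 == as.character(Weekly$Direction[1]) # FALSE: the first week is misclassified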

d)
glm.pred_c <- rep('Down', nrow(Weekly))
error_vector <- rep(0,nrow(Weekly))
for (i in 1:nrow(Weekly)) {
  m <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-i,] ,family = 'binomial')
  glm.prob_c <- predict(m, Weekly[i, ,drop = FALSE] , type = 'response')
  if (glm.prob_c > 0.5) { 
    glm.pred_c[i] <- "Up"
  }
  error_vector[i] <- ifelse(glm.pred_c[i] != Weekly$Direction[i], 1,0)
  
}
e)
mean(error_vector)
## [1] 0.4499541

The mean of the error vector is approximately 0.45, so the LOOCV estimate of the test error rate is about 45%: the model misclassifies the held-out observation in roughly 45% of the n leave-one-out fits.
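As a cross-check, essentially the same LOOCV error rate can be obtained from boot::cv.glm() with a 0/1 misclassification cost function. A sketch, assuming lg_model_a (the fit on the full Weekly data from part a) is available; leaving K at its default gives leave-one-out, so this refits the model n times and takes a while to run:

library(boot)
# cost is 1 when the 0.5-thresholded fitted probability disagrees with the 0/1 response
cost <- function(r, pi = 0) mean(abs(r - pi) > 0.5)
cv.glm(Weekly, lg_model_a, cost)$delta[1] # should agree with the ~0.45 found above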

9)

a)
mu_hat <- mean(Boston$medv)
b)
std_err <- sd(Boston$medv)/sqrt(nrow(Boston))
std_err
## [1] 0.4088611
c)
boot.mean <- function(data,index){
  mean(data$medv[index])
}
set.seed(99)
boot(Boston, boot.mean, R= 1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston, statistic = boot.mean, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original      bias    std. error
## t1* 22.53281 -0.01791186   0.4305781

The standard error from the formula was 0.409, and the bootstrap gave 0.431. The two are similar; the bootstrap relies on fewer assumptions than the s/sqrt(n) formula (which assumes independent observations), so it is arguably the more trustworthy estimate here.

d)
# 95% Confidence Interval based on bootstrap estimate
left <- mu_hat - 2*0.4305781
right <- mu_hat + 2*0.4305781
paste("[",left,",",right,"]")
## [1] "[ 21.6716501241107 , 23.3939625241107 ]"
# CI using t-test
t.test(Boston$medv)
## 
##  One Sample t-test
## 
## data:  Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  21.72953 23.33608
## sample estimates:
## mean of x 
##  22.53281

The bootstrap-based confidence interval is (21.67, 23.39) and the t-test interval is (21.73, 23.34); the two are very close.
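The interval above uses the rough estimate ± 2 SE rule; boot.ci() can also compute a percentile interval directly from the bootstrap replicates, provided the boot() call from part (c) is saved to an object (boot_mean_out is an illustrative name; with the same seed it reproduces the run above):

set.seed(99)
boot_mean_out <- boot(Boston, boot.mean, R = 1000)
# percentile bootstrap 95% CI for the mean of medv
boot.ci(boot_mean_out, type = "perc")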

e)
mu_med <- median(Boston$medv)
mu_med
## [1] 21.2
f)
boot.median <- function(data,index){
  median(data$medv[index])
}
set.seed(987)
boot(Boston, boot.median, R= 1000) 
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston, statistic = boot.median, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*     21.2 -0.0193   0.3728559

Using the bootstrap, the estimated standard error of the median is about 0.37, somewhat smaller than the standard error of the mean. This is useful because, unlike for the mean, there is no simple formula for the standard error of the median.
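The same ± 2 SE rule from part (d) gives a rough 95% interval for the median using this bootstrap standard error (a sketch):

# rough 95% interval for the median: estimate +/- 2 bootstrap SEs
c(mu_med - 2 * 0.3728559, mu_med + 2 * 0.3728559)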

g)
mu_0.1 <- quantile(Boston$medv, probs = 0.10)
mu_0.1
##   10% 
## 12.75
h)
boot.tenth_perc <- function(data,index){
  quantile(data$medv[index],prob = 0.10)
  }
set.seed(912)
boot(Boston, boot.tenth_perc, R = 1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston, statistic = boot.tenth_perc, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*    12.75 -0.0099   0.4977899

The bootstrap gives a standard error of approximately 0.50 for the estimated tenth percentile of medv (12.75).