library(ISLR2)

summary(Default)

##  default    student       balance           income     
##  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
##  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##                        Median : 823.6   Median :34553  
##                        Mean   : 835.4   Mean   :33517  
##                        3rd Qu.:1166.3   3rd Qu.:43808  
##                        Max.   :2654.3   Max.   :73554

5.)

a.)

glm_fitted <- glm(default ~ income + balance,
                  data = Default,
                  family = binomial)

summary(glm_fitted)

## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

b.)

set.seed(1)

# Split into training and validation set
training_set <- sample(nrow(Default), nrow(Default)/2)

# Fit Model to Training Set
glm_fitted1 <- glm(
  default ~ income + balance,
  data = Default,
  family = binomial,
  subset = training_set
) 

# Predict on Validation Set
validated <- setdiff(seq_len(nrow(Default)), training_set)
glm_prob1 <- predict(glm_fitted1, Default[validated, ], type = "response")

#Classify using threshold = 0.5:
glm_pred1 <- ifelse(glm_prob1 > 0.5, "Yes", "No")

#Compute validation error
default.valid <- Default$default[validated]
val_error1 <- mean(glm_pred1 != default.valid)
val_error1

## [1] 0.0254

c.) We will likely see very similar error rates each time. This suggests that for this data set, the validation set approach is fairely stable because it is large enough and the relationship with balance and default is strong.

# Store the results in a vector
val_errors <- numeric(3)  

for (i in 2:4) {
  set.seed(i)
  
#New split
training_set_i <- sample(nrow(Default), nrow(Default) / 2)
validated_i <- setdiff(seq_len(nrow(Default)), training_set_i)
  
#Fit model to training set:
glm_fitted_i <- glm(default ~ income + balance,
                      data = Default,
                      family = binomial,
                      subset = training_set_i)
  
#Predict on validation set:
glm_prob_i <- predict(glm_fitted_i, Default[validated_i, ], type = "response")
glm_pred_i <- ifelse(glm_prob_i > 0.5, "Yes", "No")
  
#Compute validation error:
default_valid_i <- Default$default[validated_i]
val_error_i <- mean(glm_pred_i != default_valid_i)
  
val_errors[i - 1] <- val_error_i
cat("Seed:", i, "- Validation Error:", val_error_i, "\n")
}

## Seed: 2 - Validation Error: 0.0238 
## Seed: 3 - Validation Error: 0.0264 
## Seed: 4 - Validation Error: 0.0256

#Print Results
val_errors

## [1] 0.0238 0.0264 0.0256

d.) It is slightly higher but still falls within the 2-3% range. There seems to be very little cahnge when adding a student. It appears that the variable balance dominates in predicting default so student doesn’t make things significantly better.

set.seed(10)  

#Split again
training_set2 <- sample(nrow(Default), nrow(Default) / 2)
validated2 <- setdiff(seq_len(nrow(Default)), training_set2)

#Fit logistic model with student included
glm_fitted2 <- glm(default ~ income + balance + student,
                   data = Default,
                   family = binomial,
                   subset = training_set2)

#Predict on validation set
glm_prob2 <- predict(glm_fitted2, Default[validated2, ], type = "response")
glm_pred2 <- ifelse(glm_prob2 > 0.5, "Yes", "No")

#Compute validation error
default_valid2 <- Default$default[validated2]
val_error2 <- mean(glm_pred2 != default_valid2)
val_error2

## [1] 0.0286

6.)

a.)

data("Default")

set.seed(1)
glm.fit <- glm(default ~ income + balance, data = Default, family = binomial)

summary(glm.fit)

## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

b.)

boot.fn <- function(data, index) {
  
  fit <- glm(default ~ income + balance, data = data[index, ], family = binomial)
  return(coef(fit))
}

boot.fn(Default, 1:nrow(Default))

##   (Intercept)        income       balance 
## -1.154047e+01  2.080898e-05  5.647103e-03

c.)

library(boot)

# We will do 1000
boot.out <- boot(data = Default, statistic = boot.fn, R = 1000)

boot.out

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##          original        bias     std. error
## t1* -1.154047e+01 -3.945460e-02 4.344722e-01
## t2*  2.080898e-05  1.680317e-07 4.866284e-06
## t3*  5.647103e-03  1.855765e-05 2.298949e-04

d.) The observed values from both the bootstrap and glm are very similar. However, it can be obsrved that the bootstrap coefficients are slightly lower across the board.

7.)

a.)

data("Weekly")
head(Weekly)

##   Year   Lag1   Lag2   Lag3   Lag4   Lag5    Volume  Today Direction
## 1 1990  0.816  1.572 -3.936 -0.229 -3.484 0.1549760 -0.270      Down
## 2 1990 -0.270  0.816  1.572 -3.936 -0.229 0.1485740 -2.576      Down
## 3 1990 -2.576 -0.270  0.816  1.572 -3.936 0.1598375  3.514        Up
## 4 1990  3.514 -2.576 -0.270  0.816  1.572 0.1616300  0.712        Up
## 5 1990  0.712  3.514 -2.576 -0.270  0.816 0.1537280  1.178        Up
## 6 1990  1.178  0.712  3.514 -2.576 -0.270 0.1544440 -1.372      Down

glm.fit <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = binomial)
summary(glm.fit)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.22122    0.06147   3.599 0.000319 ***
## Lag1        -0.03872    0.02622  -1.477 0.139672    
## Lag2         0.06025    0.02655   2.270 0.023232 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1488.2  on 1086  degrees of freedom
## AIC: 1494.2
## 
## Number of Fisher Scoring iterations: 4

b.)

#Logistic regression model excluding the first observation
glm.fit.exclude1 <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-1, ], family = binomial)
summary(glm.fit.exclude1)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = binomial, data = Weekly[-1, 
##     ])
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.22324    0.06150   3.630 0.000283 ***
## Lag1        -0.03843    0.02622  -1.466 0.142683    
## Lag2         0.06085    0.02656   2.291 0.021971 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1494.6  on 1087  degrees of freedom
## Residual deviance: 1486.5  on 1085  degrees of freedom
## AIC: 1492.5
## 
## Number of Fisher Scoring iterations: 4

c.)

# Predict the probability of direction up
prob.first <- predict(glm.fit, Weekly[1, ], type = "response")

# If probability > 0.5, predict "Up"; otherwise, "Down".
pred.first <- ifelse(prob.first > 0.5, "Up", "Down")
cat("First observation -> Predicted:", pred.first, 
    " | Actual:", Weekly$Direction[1], "\n")

## First observation -> Predicted: Up  | Actual: 1

# Number of observations
n <- nrow(Weekly)

# Create a vector to store predictions from LOOCV
loo.pred <- rep(NA, n)

# Loop over each observation, leaving one out each time
for (i in 1:n) {
  # Fit the model excluding the i-th observation
  glm.fit.loo <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-i, ], family = binomial)
  
  # Predict the probability for the left-out observation
  prob.i <- predict(glm.fit.loo, Weekly[i, ], type = "response")
  
  # Convert probability to class prediction
  loo.pred[i] <- ifelse(prob.i > 0.5, "Up", "Down")
}

Lab6

2025-03-27

5.)

a.)

b.)

c.) We will likely see very similar error rates each time. This suggests that for this data set, the validation set approach is fairely stable because it is large enough and the relationship with balance and default is strong.

d.) It is slightly higher but still falls within the 2-3% range. There seems to be very little cahnge when adding a student. It appears that the variable balance dominates in predicting default so student doesn’t make things significantly better.

6.)

7.)