library(ISLR2)
summary(Default)
## default student balance income
## No :9667 No :7056 Min. : 0.0 Min. : 772
## Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
glm_fitted <- glm(default ~ income + balance,
data = Default,
family = binomial)
summary(glm_fitted)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
set.seed(1)
# Split into training and validation set
training_set <- sample(nrow(Default), nrow(Default)/2)
# Fit Model to Training Set
glm_fitted1 <- glm(
default ~ income + balance,
data = Default,
family = binomial,
subset = training_set
)
# Predict on Validation Set
validated <- setdiff(seq_len(nrow(Default)), training_set)
glm_prob1 <- predict(glm_fitted1, Default[validated, ], type = "response")
#Classify using threshold = 0.5:
glm_pred1 <- ifelse(glm_prob1 > 0.5, "Yes", "No")
#Compute validation error
default.valid <- Default$default[validated]
val_error1 <- mean(glm_pred1 != default.valid)
val_error1
## [1] 0.0254
# Store the results in a vector
val_errors <- numeric(3)
for (i in 2:4) {
set.seed(i)
#New split
training_set_i <- sample(nrow(Default), nrow(Default) / 2)
validated_i <- setdiff(seq_len(nrow(Default)), training_set_i)
#Fit model to training set:
glm_fitted_i <- glm(default ~ income + balance,
data = Default,
family = binomial,
subset = training_set_i)
#Predict on validation set:
glm_prob_i <- predict(glm_fitted_i, Default[validated_i, ], type = "response")
glm_pred_i <- ifelse(glm_prob_i > 0.5, "Yes", "No")
#Compute validation error:
default_valid_i <- Default$default[validated_i]
val_error_i <- mean(glm_pred_i != default_valid_i)
val_errors[i - 1] <- val_error_i
cat("Seed:", i, "- Validation Error:", val_error_i, "\n")
}
## Seed: 2 - Validation Error: 0.0238
## Seed: 3 - Validation Error: 0.0264
## Seed: 4 - Validation Error: 0.0256
#Print Results
val_errors
## [1] 0.0238 0.0264 0.0256
set.seed(10)
#Split again
training_set2 <- sample(nrow(Default), nrow(Default) / 2)
validated2 <- setdiff(seq_len(nrow(Default)), training_set2)
#Fit logistic model with student included
glm_fitted2 <- glm(default ~ income + balance + student,
data = Default,
family = binomial,
subset = training_set2)
#Predict on validation set
glm_prob2 <- predict(glm_fitted2, Default[validated2, ], type = "response")
glm_pred2 <- ifelse(glm_prob2 > 0.5, "Yes", "No")
#Compute validation error
default_valid2 <- Default$default[validated2]
val_error2 <- mean(glm_pred2 != default_valid2)
val_error2
## [1] 0.0286
a.)
data("Default")
set.seed(1)
glm.fit <- glm(default ~ income + balance, data = Default, family = binomial)
summary(glm.fit)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
b.)
boot.fn <- function(data, index) {
fit <- glm(default ~ income + balance, data = data[index, ], family = binomial)
return(coef(fit))
}
boot.fn(Default, 1:nrow(Default))
## (Intercept) income balance
## -1.154047e+01 2.080898e-05 5.647103e-03
c.)
library(boot)
# We will do 1000
boot.out <- boot(data = Default, statistic = boot.fn, R = 1000)
boot.out
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -3.945460e-02 4.344722e-01
## t2* 2.080898e-05 1.680317e-07 4.866284e-06
## t3* 5.647103e-03 1.855765e-05 2.298949e-04
d.) The observed values from both the bootstrap and glm are very similar. However, it can be obsrved that the bootstrap coefficients are slightly lower across the board.
a.)
data("Weekly")
head(Weekly)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
glm.fit <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = binomial)
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22122 0.06147 3.599 0.000319 ***
## Lag1 -0.03872 0.02622 -1.477 0.139672
## Lag2 0.06025 0.02655 2.270 0.023232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1488.2 on 1086 degrees of freedom
## AIC: 1494.2
##
## Number of Fisher Scoring iterations: 4
b.)
#Logistic regression model excluding the first observation
glm.fit.exclude1 <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-1, ], family = binomial)
summary(glm.fit.exclude1)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = binomial, data = Weekly[-1,
## ])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22324 0.06150 3.630 0.000283 ***
## Lag1 -0.03843 0.02622 -1.466 0.142683
## Lag2 0.06085 0.02656 2.291 0.021971 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1494.6 on 1087 degrees of freedom
## Residual deviance: 1486.5 on 1085 degrees of freedom
## AIC: 1492.5
##
## Number of Fisher Scoring iterations: 4
c.)
# Predict the probability of direction up
prob.first <- predict(glm.fit, Weekly[1, ], type = "response")
# If probability > 0.5, predict "Up"; otherwise, "Down".
pred.first <- ifelse(prob.first > 0.5, "Up", "Down")
cat("First observation -> Predicted:", pred.first,
" | Actual:", Weekly$Direction[1], "\n")
## First observation -> Predicted: Up | Actual: 1
# Number of observations
n <- nrow(Weekly)
# Create a vector to store predictions from LOOCV
loo.pred <- rep(NA, n)
# Loop over each observation, leaving one out each time
for (i in 1:n) {
# Fit the model excluding the i-th observation
glm.fit.loo <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-i, ], family = binomial)
# Predict the probability for the left-out observation
prob.i <- predict(glm.fit.loo, Weekly[i, ], type = "response")
# Convert probability to class prediction
loo.pred[i] <- ifelse(prob.i > 0.5, "Up", "Down")
}