library(ISLR2)
summary(Default)
## default student balance income
## No :9667 No :7056 Min. : 0.0 Min. : 772
## Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
glm_fitted <- glm(default ~ income + balance,
data = Default,
family = binomial)
summary(glm_fitted)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
set.seed(1)
# Split into training and validation set
training_set <- sample(nrow(Default), nrow(Default)/2)
# Fit Model to Training Set
glm_fitted1 <- glm(
default ~ income + balance,
data = Default,
family = binomial,
subset = training_set
)
# Predict on Validation Set
validated <- setdiff(seq_len(nrow(Default)), training_set)
glm_prob1 <- predict(glm_fitted1, Default[validated, ], type = "response")
#Classify using threshold = 0.5:
glm_pred1 <- ifelse(glm_prob1 > 0.5, "Yes", "No")
#Compute validation error
default.valid <- Default$default[validated]
val_error1 <- mean(glm_pred1 != default.valid)
val_error1
## [1] 0.0254
We will likely see very similar error rates each time. This suggests that for this data set, the validation set approach is fairely stable because it is large enough and the relationship with balance and default is strong.
# Store the results in a vector
val_errors <- numeric(3)
for (i in 2:4) {
set.seed(i)
#New split
training_set_i <- sample(nrow(Default), nrow(Default) / 2)
validated_i <- setdiff(seq_len(nrow(Default)), training_set_i)
#Fit model to training set:
glm_fitted_i <- glm(default ~ income + balance,
data = Default,
family = binomial,
subset = training_set_i)
#Predict on validation set:
glm_prob_i <- predict(glm_fitted_i, Default[validated_i, ], type = "response")
glm_pred_i <- ifelse(glm_prob_i > 0.5, "Yes", "No")
#Compute validation error:
default_valid_i <- Default$default[validated_i]
val_error_i <- mean(glm_pred_i != default_valid_i)
val_errors[i - 1] <- val_error_i
cat("Seed:", i, "- Validation Error:", val_error_i, "\n")
}
## Seed: 2 - Validation Error: 0.0238
## Seed: 3 - Validation Error: 0.0264
## Seed: 4 - Validation Error: 0.0256
#Print Results
val_errors
## [1] 0.0238 0.0264 0.0256
It is slightly higher but still falls within the 2-3% range. There seems to be very little cahnge when adding a student. It appears that the variable balance dominates in predicting default so student doesn’t make things significantly better.
set.seed(10)
#Split again
training_set2 <- sample(nrow(Default), nrow(Default) / 2)
validated2 <- setdiff(seq_len(nrow(Default)), training_set2)
#Fit logistic model with student included
glm_fitted2 <- glm(default ~ income + balance + student,
data = Default,
family = binomial,
subset = training_set2)
#Predict on validation set
glm_prob2 <- predict(glm_fitted2, Default[validated2, ], type = "response")
glm_pred2 <- ifelse(glm_prob2 > 0.5, "Yes", "No")
#Compute validation error
default_valid2 <- Default$default[validated2]
val_error2 <- mean(glm_pred2 != default_valid2)
val_error2
## [1] 0.0286
data("Default")
set.seed(1)
glm.fit <- glm(default ~ income + balance, data = Default, family = binomial)
summary(glm.fit)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
boot.fn <- function(data, index) {
fit <- glm(default ~ income + balance, data = data[index, ], family = binomial)
return(coef(fit))
}
boot.fn(Default, 1:nrow(Default))
## (Intercept) income balance
## -1.154047e+01 2.080898e-05 5.647103e-03
library(boot)
# We will do 1000
boot.out <- boot(data = Default, statistic = boot.fn, R = 1000)
boot.out
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -3.945460e-02 4.344722e-01
## t2* 2.080898e-05 1.680317e-07 4.866284e-06
## t3* 5.647103e-03 1.855765e-05 2.298949e-04
The observed values from both the bootstrap and glm are very similar. However, it can be obsrved that the bootstrap coefficients are slightly lower across the board.
Lag_GLM <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = "binomial")
summary(Lag_GLM)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22122 0.06147 3.599 0.000319 ***
## Lag1 -0.03872 0.02622 -1.477 0.139672
## Lag2 0.06025 0.02655 2.270 0.023232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1488.2 on 1086 degrees of freedom
## AIC: 1494.2
##
## Number of Fisher Scoring iterations: 4
Weekly_1 <- Weekly[-1,]
Lag_GLM_minus_one <- glm(Direction ~ Lag1 + Lag2, data = Weekly_1, family = "binomial")
summary(Lag_GLM_minus_one)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = Weekly_1)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22324 0.06150 3.630 0.000283 ***
## Lag1 -0.03843 0.02622 -1.466 0.142683
## Lag2 0.06085 0.02656 2.291 0.021971 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1494.6 on 1087 degrees of freedom
## Residual deviance: 1486.5 on 1085 degrees of freedom
## AIC: 1492.5
##
## Number of Fisher Scoring iterations: 4
prediction1 <- ifelse(predict(Lag_GLM_minus_one, newdata = Weekly[1,], type = "response") > 0.5, "Up", "Down")
prediction1 == Weekly[1,]$Direction
## [1] FALSE
error <- numeric(dim(Weekly)[1])
for (i in 1:dim(Weekly)[1])
{
trainDat <- Weekly[-i,]
modFit <- glm(Direction ~ Lag1 + Lag2, data = trainDat, family = "binomial")
pred <- ifelse(predict(modFit, newdata = Weekly[i,], type = "response") > 0.5,
"Up", "Down")
if (pred != Weekly[i,]$Direction)
{
error[i] <- 1
}}
mean(error)
## [1] 0.4499541
medvMean <- mean(Boston$medv)
medvMean
## [1] 22.53281
sdMedv <- sd(Boston$medv)
SEmedvMean <- sdMedv / sqrt(nrow(Boston))
SEmedvMean
## [1] 0.4088611
library(TeachingDemos)
set.seed(char2seed("Spencer"))
meanBootMedv <- function(data, index) {
mean(data[index, ]$medv)
}
bootmedv <- boot(Boston, meanBootMedv, R = 1000)
SEmedvMeanBS <- sd(bootmedv$t)
SEmedvMeanBS
## [1] 0.4027572
CI <- c(mean(Boston$medv) - 2 * SEmedvMeanBS, mean(Boston$medv) + 2 * SEmedvMeanBS)
CI
## [1] 21.72729 23.33832
t.test(Boston$medv)$conf.int
## [1] 21.72953 23.33608
## attr(,"conf.level")
## [1] 0.95
medianMedvEst <- median(Boston$medv)
medianMedvEst
## [1] 21.2
set.seed(char2seed("Spencer"))
bootMed <- function(data, index) {
median(data[index, ]$medv)
}
bootMedianMedv <- boot(Boston, bootMed, R = 1000)
SEMedianMedvBS <- sd(bootMedianMedv$t)
SEMedianMedvBS
## [1] 0.3671808
TenPctMedvEst <- quantile(Boston$medv, probs = 0.1)
TenPctMedvEst
## 10%
## 12.75
set.seed(char2seed("Spencer"))
bootTen <- function(data, index) {
quantile(data[index, ]$medv, probs = 0.1)
}
bootTenMedvBS <- boot(Boston, bootTen, R = 1000)
SETenPctMedvBS <- sd(bootTenMedvBS$t)
SETenPctMedvBS
## [1] 0.4944738