library(ISLR2)
library(boot)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load data
default <- read.csv("/Users/saransh/Downloads/Statistical_Learning_Resources/Default.csv", na.strings = "?")
default <- na.omit(default)
default$default <- as.factor(default$default)
# Set seed
set.seed(1)
model_default <- glm(default ~ income + balance, data = default, family = "binomial")
summary(model_default)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
Observation:
- Both income and balance are statistically significant predictors of
default.
- The positive coefficient for balance suggests that higher balances are
associated with higher default probabilities, while the smaller, but
still significant, coefficient for income implies that income has a less
impactful relationship with default status.
set.seed(1)
train <- sample(nrow(default), nrow(default)/2)
train_data <- default[train, ]
test_data <- default[-train, ]
model_val <- glm(default ~ income + balance, data = train_data, family = "binomial")
probs <- predict(model_val, test_data, type = "response")
preds <- ifelse(probs > 0.5, "Yes", "No")
mean(preds != test_data$default)
## [1] 0.0254
Observation:
- The validation error is approximately 2.54%, indicating good model
accuracy.
- This low error rate suggests the model generalizes well to unseen data
when using these predictors.
set.seed(2)
train2 <- sample(nrow(default), nrow(default)/2)
model2 <- glm(default ~ income + balance, data = default[train2, ], family = "binomial")
probs2 <- predict(model2, default[-train2, ], type = "response")
preds2 <- ifelse(probs2 > 0.5, "Yes", "No")
error2 <- mean(preds2 != default[-train2, ]$default)
set.seed(3)
train3 <- sample(nrow(default), nrow(default)/2)
model3 <- glm(default ~ income + balance, data = default[train3, ], family = "binomial")
probs3 <- predict(model3, default[-train3, ], type = "response")
preds3 <- ifelse(probs3 > 0.5, "Yes", "No")
error3 <- mean(preds3 != default[-train3, ]$default)
error2
## [1] 0.0238
error3
## [1] 0.0264
Comment:
- Validation errors from different splits: 0.0238 and 0.0264, close
to the original error of 0.0254.
- Confirms the model’s robustness and consistency across different
random training/validation splits.
set.seed(4)
train4 <- sample(nrow(default), nrow(default)/2)
model4 <- glm(default ~ income + balance + student, data = default[train4, ], family = "binomial")
probs4 <- predict(model4, default[-train4, ], type = "response")
preds4 <- ifelse(probs4 > 0.5, "Yes", "No")
error4 <- mean(preds4 != default[-train4, ]$default)
error4
## [1] 0.0262
Observation:
- Including student does not meaningfully reduce test error
(0.0262).
- This suggests student status may not add much additional predictive
power beyond income and balance.
model_se <- glm(default ~ income + balance, data = default, family = "binomial")
summary(model_se)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
Observation:
- Standard errors for income and balance are approximately 4.985e-06 and 2.274e-04 respectively.
boot.fn <- function(data, index) {
coef(glm(default ~ income + balance, data = data, family = binomial, subset = index))[2:3]
}
set.seed(5)
boot_results <- boot(default, boot.fn, R = 1000)
boot_results
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = default, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 2.080898e-05 2.733337e-07 4.909003e-06
## t2* 5.647103e-03 1.512143e-05 2.332406e-04
boot_results$t0
## income balance
## 2.080898e-05 5.647103e-03
apply(boot_results$t, 2, sd)
## [1] 4.909003e-06 2.332406e-04
Comment:
- Bootstrap SE for income: ~4.909e-06, for balance: ~2.332e-04.
- These closely match the standard GLM estimates.
- Indicates that both methods provide reliable SEs, though bootstrap may
better capture data-specific variability.
weekly <- read.csv("/Users/saransh/Downloads/Statistical_Learning_Resources/Weekly.csv", na.strings = "?")
weekly <- na.omit(weekly)
weekly$Direction <- as.factor(weekly$Direction)
logit_weekly <- glm(Direction ~ Lag1 + Lag2, data = weekly, family = binomial)
summary(logit_weekly)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = binomial, data = weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22122 0.06147 3.599 0.000319 ***
## Lag1 -0.03872 0.02622 -1.477 0.139672
## Lag2 0.06025 0.02655 2.270 0.023232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1488.2 on 1086 degrees of freedom
## AIC: 1494.2
##
## Number of Fisher Scoring iterations: 4
Observation:
- Lag2 is significant (p < 0.05), but Lag1 is not.
- Indicates Lag2 may be more predictive of weekly market movement.
logit_loocv <- glm(Direction ~ Lag1 + Lag2, data = weekly[-1, ], family = binomial)
prob1 <- predict(logit_loocv, newdata = weekly[1, ], type = "response")
pred1 <- ifelse(prob1 > 0.5, "Up", "Down")
actual1 <- weekly$Direction[1]
pred1 == actual1
## [1] FALSE
Observation:
- The model misclassifies the first observation (FALSE).
- LOOCV examines how well each observation is predicted by all
others.
n <- nrow(weekly)
errors <- rep(0, n)
for (i in 1:n) {
fit_loo <- glm(Direction ~ Lag1 + Lag2, data = weekly[-i, ], family = binomial)
prob <- predict(fit_loo, newdata = weekly[i, ], type = "response")
pred <- ifelse(prob > 0.5, "Up", "Down")
errors[i] <- ifelse(pred != weekly$Direction[i], 1, 0)
}
mean(errors) # LOOCV error estimate
## [1] 0.4499541
Comment:
- The LOOCV error is approximately 0.45 (45%), indicating moderate
predictive ability.
- Despite being computationally expensive, LOOCV gives a reliable,
low-bias error estimate for classification models.
boston <- read.csv("/Users/saransh/Downloads/Statistical_Learning_Resources/Boston.csv", na.strings = "?")
boston <- na.omit(boston)
mu_hat <- mean(boston$medv)
mu_hat
## [1] 22.53281
se_formula <- sd(boston$medv) / sqrt(nrow(boston))
se_formula
## [1] 0.4088611
Interpretation:
- This standard error of approximately 0.4089 represents the average
variability of the sample mean if we repeatedly sampled from the Boston
population.
- A smaller value indicates that the sample mean is a stable and
reliable estimate of the true population mean.
boot.fn.mean <- function(data, index) {
mean(data$medv[index])
}
set.seed(6)
boot_mu <- boot(boston, statistic = boot.fn.mean, R = 1000)
boot_mu
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = boston, statistic = boot.fn.mean, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.01886976 0.4176042
Comparison:
- The bootstrap estimate of standard error (≈ 0.4176) is very close to the analytical standard error from (b) (≈ 0.4089), reinforcing that the bootstrap method provides a reliable alternative when the formula is unknown or infeasible.
mu_se_boot <- sd(boot_mu$t)
ci_boot <- c(mu_hat - 2 * mu_se_boot, mu_hat + 2 * mu_se_boot)
ci_boot
## [1] 21.69760 23.36801
# Compare with t.test()
t.test(boston$medv)
##
## One Sample t-test
##
## data: boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
Observation:
- The bootstrap CI is [21.698, 23.368], and the t-test CI is [21.73,
23.34]. Both intervals are nearly identical, indicating consistent
inference regardless of the method.
- This demonstrates that bootstrap confidence intervals can serve as
valid alternatives to classical methods.
mu_med <- median(boston$medv)
mu_med
## [1] 21.2
Observation:
- The estimated median house value is 21.2, which is slightly lower
than the mean.
- This suggests that the distribution of medv is right-skewed with a few
higher values pulling the mean upward.
boot.fn.median <- function(data, index) {
median(data$medv[index])
}
boot_median <- boot(boston, statistic = boot.fn.median, R = 1000)
sd(boot_median$t)
## [1] 0.3889205
Comment:
- The bootstrap estimate of the standard error of the median is approximately 0.389. Since there’s no closed-form expression for the standard error of the median, this shows how useful bootstrapping can be for estimating uncertainty in robust statistics.
mu_10 <- quantile(boston$medv, 0.1)
mu_10
## 10%
## 12.75
Observation:
- The 10th percentile of medv is 12.75, meaning 10% of Boston census
tracts have a median home value below this amount.
- This is useful for identifying the lower end of the housing market
distribution.
boot.fn.q10 <- function(data, index) {
quantile(data$medv[index], 0.1)
}
boot_q10 <- boot(boston, statistic = boot.fn.q10, R = 1000)
sd(boot_q10$t)
## [1] 0.4992374
Comment:
- The bootstrap standard error of the 10th percentile is
approximately 0.499. This reflects greater variability in estimating the
tails of a distribution, emphasizing the uncertainty in the lower end of
home values.
- This insight is important when evaluating housing affordability and
extremes in real estate data.