A:
library(ISLR)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
set.seed(1)
str(Default)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
# Let's fit a logistic regression model using income and balance to predict default
glm_fit <- glm(default ~ income + balance, data = Default, family = "binomial")
summary(glm_fit)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
# i. Split the sample into training and validation sets
set.seed(1)
train_idx <- sample(1:nrow(Default), nrow(Default)/2)
train_data <- Default[train_idx, ]
valid_data <- Default[-train_idx, ]
# ii. Fit a logistic regression model using the training data
glm_train <- glm(default ~ income + balance, data = train_data, family = "binomial")
# iii. Predict and classify
probs <- predict(glm_train, newdata = valid_data, type = "response")
pred_default <- ifelse(probs > 0.5, "Yes", "No")
# iv. Compute validation error
actual_default <- valid_data$default
mean(pred_default != actual_default)
## [1] 0.0254
set.seed(2)
train1 <- sample(1:nrow(Default), nrow(Default)/2)
glm1 <- glm(default ~ income + balance, data = Default[train1, ], family = "binomial")
probs1 <- predict(glm1, newdata = Default[-train1, ], type = "response")
pred1 <- ifelse(probs1 > 0.5, "Yes", "No")
err1 <- mean(pred1 != Default[-train1, "default"])
set.seed(3)
train2 <- sample(1:nrow(Default), nrow(Default)/2)
glm2 <- glm(default ~ income + balance, data = Default[train2, ], family = "binomial")
probs2 <- predict(glm2, newdata = Default[-train2, ], type = "response")
pred2 <- ifelse(probs2 > 0.5, "Yes", "No")
err2 <- mean(pred2 != Default[-train2, "default"])
set.seed(4)
train3 <- sample(1:nrow(Default), nrow(Default)/2)
glm3 <- glm(default ~ income + balance, data = Default[train3, ], family = "binomial")
probs3 <- predict(glm3, newdata = Default[-train3, ], type = "response")
pred3 <- ifelse(probs3 > 0.5, "Yes", "No")
err3 <- mean(pred3 != Default[-train3, "default"])
err1; err2; err3
## [1] 0.0238
## [1] 0.0264
## [1] 0.0256
The test errors from the three splits are in a similar range (around 2.5%) but not identical, due to sampling variability. This illustrates the value of cross-validation or multiple runs for obtaining a more stable estimate of the test error. The model is fairly stable across different train/validation splits: income and balance provide a reasonably reliable basis for classification, and the low variance of the test error across splits indicates good generalization in this case.
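As a quick sketch of the "multiple runs" idea (not part of the required answer; the number of repeats and the seed below are arbitrary choices of mine), we can repeat the 50/50 validation split several times and average the resulting errors:
# Sketch: repeat the validation-set split several times and average the error
set.seed(10)  # arbitrary seed, only for reproducibility of this sketch
val_errs <- replicate(20, {
  idx <- sample(1:nrow(Default), nrow(Default) / 2)
  fit <- glm(default ~ income + balance, data = Default[idx, ], family = "binomial")
  p <- predict(fit, newdata = Default[-idx, ], type = "response")
  mean(ifelse(p > 0.5, "Yes", "No") != Default[-idx, "default"])
})
mean(val_errs)  # averaged test-error estimate
sd(val_errs)    # spread of the error across splits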
set.seed(1)
train4 <- sample(1:nrow(Default), nrow(Default)/2)
glm4 <- glm(default ~ income + balance + student, data = Default[train4, ], family = "binomial")
probs4 <- predict(glm4, newdata = Default[-train4, ], type = "response")
pred4 <- ifelse(probs4 > 0.5, "Yes", "No")
err4 <- mean(pred4 != Default[-train4, "default"])
err4
## [1] 0.026
Including the student variable did not reduce the test error; it slightly increased it (from about 2.54% to 2.6%), a difference small enough to be attributable to sampling variability. Adding the student dummy variable therefore does not improve the predictive power of the model here.
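As a side note (a sketch of my own, not required by the question), student is a factor, so glm() already encodes it as a 0/1 dummy column (studentYes) in the design matrix; the line below simply makes that encoding visible.
# Sketch: inspect the dummy coding R uses for the student factor
head(model.matrix(default ~ income + balance + student, data = Default))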
A: a.
library(ISLR)
set.seed(1)
glm_fit <- glm(default ~ income + balance, data = Default, family = "binomial")
summary(glm_fit)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
boot.fn <- function(data, index) {
  fit <- glm(default ~ income + balance, data = data, subset = index, family = "binomial")
  return(coef(fit)[c("income", "balance")])
}
library(boot)
set.seed(1)
boot_results <- boot(Default, boot.fn, R = 1000)
boot_results
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 2.080898e-05 1.680317e-07 4.866284e-06
## t2* 5.647103e-03 1.855765e-05 2.298949e-04
We can see the bootstrap estimates and standard errors for the income and balance coefficients.
summary(glm_fit)$coefficients[c("income", "balance"), "Std. Error"]
## income balance
## 4.985167e-06 2.273731e-04
boot_results$t0
## income balance
## 2.080898e-05 5.647103e-03
apply(boot_results$t, 2, sd)
## [1] 4.866284e-06 2.298949e-04
Standard errors from glm(): income 4.99e-06, balance 2.27e-04. Standard errors from the bootstrap: income 4.87e-06, balance 2.30e-04.
The standard errors from the two methods are nearly identical, so the assumptions behind the glm() standard errors (e.g., the large-sample approximation) appear reasonable in this case. The bootstrap becomes especially valuable when such model assumptions are questionable or the sample size is small.
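For intuition, here is a minimal sketch (my addition) of what boot() is doing for these standard errors: resample the rows with replacement, refit the model, and take the standard deviation of the resampled coefficients. B = 200 is kept small purely for illustration.
# Sketch: a manual bootstrap for the income and balance coefficients
set.seed(1)
B <- 200  # fewer replicates than boot() used above, for speed
coef_boot <- replicate(B, {
  idx <- sample(nrow(Default), replace = TRUE)
  fit <- glm(default ~ income + balance, data = Default[idx, ], family = "binomial")
  coef(fit)[c("income", "balance")]
})
apply(coef_boot, 1, sd)  # manual bootstrap standard errors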
glm_full <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = "binomial")
summary(glm_full)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22122 0.06147 3.599 0.000319 ***
## Lag1 -0.03872 0.02622 -1.477 0.139672
## Lag2 0.06025 0.02655 2.270 0.023232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1488.2 on 1086 degrees of freedom
## AIC: 1494.2
##
## Number of Fisher Scoring iterations: 4
Lag2 is statistically significant (p < 0.05), meaning it contributes meaningfully to predicting Direction. Lag1 is not significant, suggesting it may not improve the model's predictions much on its own.
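One way to make the Lag2 estimate more interpretable (my addition, using the glm_full object above) is to convert the log-odds coefficient into an odds ratio:
# Sketch: odds ratio implied by the Lag2 coefficient
exp(coef(glm_full)["Lag2"])  # ~1.06: the odds of an "Up" week rise by roughly 6% per unit of Lag2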
glm_excl1 <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-1, ], family = "binomial")
prob1 <- predict(glm_excl1, newdata = Weekly[1, ], type = "response")
# Classify
pred1 <- ifelse(prob1 > 0.5, "Up", "Down")
actual1 <- Weekly$Direction[1]
pred1 == actual1
## [1] FALSE
The result is FALSE, i.e., the model misclassified the first observation.
n <- nrow(Weekly)
errors <- rep(NA, n)
for (i in 1:n) {
  # i. Fit the logistic model on all but the i-th observation
  glm_loo <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-i, ], family = "binomial")
  # ii. Predict the posterior probability for the i-th observation
  prob_i <- predict(glm_loo, newdata = Weekly[i, ], type = "response")
  # iii. Classify using a 0.5 threshold
  pred_i <- ifelse(prob_i > 0.5, "Up", "Down")
  # iv. Record whether the prediction was incorrect (1) or correct (0)
  actual_i <- Weekly$Direction[i]
  errors[i] <- ifelse(pred_i != actual_i, 1, 0)
}
loocv_error <- mean(errors)
loocv_error
## [1] 0.4499541
The model using Lag1 and Lag2 correctly predicts market direction about 55% of the time. A test error rate of roughly 45% means the model is only slightly better than random guessing. This suggests that the relationship between Lag1, Lag2, and Direction is weak, or that the market is simply not very predictable from these two lagged returns.
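As a cross-check (my addition), the same LOOCV error can be obtained with boot::cv.glm() and a misclassification cost function, which avoids writing the loop by hand; cost_fn is a name I introduce for this sketch, and the result should essentially reproduce loocv_error above.
# Sketch: LOOCV via cv.glm() with a 0.5-threshold misclassification cost
library(boot)
cost_fn <- function(actual, prob) mean((prob > 0.5) != actual)  # actual is 0/1 inside cv.glm
loocv_alt <- cv.glm(Weekly, glm_full, cost = cost_fn)
loocv_alt$delta[1]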
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
library(boot)
data("Boston")
str(Boston)
## 'data.frame': 506 obs. of 13 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
mu_hat <- mean(Boston$medv)
mu_hat
## [1] 22.53281
The estimate of the population mean of medv is 22.53.
The formula for this estimate is SE(mu_hat) = s / sqrt(n), where s is the sample standard deviation of medv and n is the sample size.
se_mu_hat <- sd(Boston$medv) / sqrt(nrow(Boston))
se_mu_hat
## [1] 0.4088611
The estimated standard error of mu_hat is 0.4089. It tells us how much the sample mean (22.53) would vary across repeated random samples of the same size from the population.
boot.fn.mean <- function(data, index) {
  return(mean(data[index]))
}
set.seed(1)
boot_mean <- boot(Boston$medv, statistic = boot.fn.mean, R = 1000)
boot_mean
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn.mean, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.007650791 0.4106622
boot_se_mu_hat <- sd(boot_mean$t)
boot_se_mu_hat
## [1] 0.4106622
The theoretical SE (0.4089) and the bootstrap SE (0.4107) are very close, indicating that the bootstrap agrees well with the analytical formula s / sqrt(n) for the standard error of the sample mean.
ci_boot <- c(mu_hat - 2 * boot_se_mu_hat, mu_hat + 2 * boot_se_mu_hat)
ci_boot
## [1] 21.71148 23.35413
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
Both intervals are very similar and centered on the same mean, 22.53. The bootstrap interval is slightly wider, because it uses the rough mu_hat +/- 2*SE approximation rather than the exact t quantiles, but it is still nearly identical to the t-test result. This shows that the bootstrap provides a reliable and flexible alternative to analytical methods, especially when distributional assumptions are questionable.
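For completeness (my addition, reusing the boot_mean object from above), boot.ci() can also compute a percentile-based bootstrap interval, which drops the +/- 2*SE approximation entirely:
# Sketch: percentile bootstrap CI for the mean of medv
boot.ci(boot_mean, type = "perc")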
mu_hat_med <- median(Boston$medv)
mu_hat_med
## [1] 21.2
The median value of medv is 21.2.
boot.fn.med <- function(data, index) {
  return(median(data[index]))
}
set.seed(1)
boot_med <- boot(Boston$medv, statistic = boot.fn.med, R = 1000)
boot_se_med <- sd(boot_med$t)
boot_se_med
## [1] 0.3778075
The bootstrap standard error of the median is about 0.378, slightly smaller than the SE of the mean (0.41). It tells us how much the sample median of medv would fluctuate across repeated samples from the population. Unlike the mean, the median has no simple analytical SE formula, so the bootstrap is particularly useful here.
mu_hat_0.1 <- quantile(Boston$medv, 0.1)
mu_hat_0.1
## 10%
## 12.75
The 10th percentile describes the lower tail of the distribution: 10% of Boston neighborhoods have a median home value below 12.75 (i.e., $12,750, since medv is measured in $1000s).
boot.fn.q10 <- function(data, index) {
  return(quantile(data[index], 0.1))
}
set.seed(1)
boot_q10 <- boot(Boston$medv, statistic = boot.fn.q10, R = 1000)
boot_se_q10 <- sd(boot_q10$t)
boot_se_q10
## [1] 0.4767526
The standard error (0.48) is larger than those of the mean (0.41) and the median (0.38). This makes sense because quantiles near the tails, such as the 10th percentile, are more sensitive to sampling variability and outliers. It tells us that across repeated samples from the population, the estimated 10th percentile of medv would typically vary by roughly 0.48.
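Following the same logic as the interval for the mean, a rough 95% interval for the 10th percentile can be formed from this bootstrap SE (my addition, using the objects defined above):
# Sketch: approximate 95% interval for the 10th percentile via mu_hat_0.1 +/- 2*SE
c(mu_hat_0.1 - 2 * boot_se_q10, mu_hat_0.1 + 2 * boot_se_q10)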