Statistical Learning - Lab 6
library(ISLR)
set.seed(1) # Set a random seed for reproducibility
# Part (a): Fit a logistic regression model using income and balance to predict default
model1 <- glm(default ~ income + balance, data = Default, family = binomial)
summary(model1)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
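Because the coefficients are on the log-odds scale, exponentiating them gives odds ratios, which are often easier to interpret. A quick illustrative check (not part of the original lab output):
# Convert the log-odds coefficients to odds ratios
exp(coef(model1))
# e.g., each extra dollar of balance multiplies the odds of default
# by roughly exp(5.647e-03), about 1.0057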
# Part (b): Estimate the test error of this model using the validation set approach
# i. Split the data into training (50%) and validation (50%) sets
train_indices <- sample(1:nrow(Default), nrow(Default) / 2)
train_data <- Default[train_indices, ]
validation_data <- Default[-train_indices, ]
# ii. Fit the logistic regression model on training data
model_train <- glm(default ~ income + balance, data = train_data, family = binomial)
# iii. Predict on validation set
pred_probs <- predict(model_train, newdata = validation_data, type = "response")
pred_class <- ifelse(pred_probs > 0.5, "Yes", "No")
# iv. Compute validation set error
validation_error <- mean(pred_class != validation_data$default)
print(paste("Validation Set Error:", round(validation_error, 4)))
## [1] "Validation Set Error: 0.0254"
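Beyond the overall error rate, a confusion matrix shows how the misclassifications split between false positives and false negatives; a quick check (output not shown):
# Cross-tabulate predicted vs. actual default status on the validation set
table(Predicted = pred_class, Actual = validation_data$default)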
# Part (c): Repeat the validation estimate with two different splits (seeds 2 and 3)
set.seed(2)
train_indices2 <- sample(1:nrow(Default), nrow(Default) / 2)
train_data2 <- Default[train_indices2, ]
validation_data2 <- Default[-train_indices2, ]
model_train2 <- glm(default ~ income + balance, data = train_data2, family = binomial)
pred_probs2 <- predict(model_train2, newdata = validation_data2, type = "response")
pred_class2 <- ifelse(pred_probs2 > 0.5, "Yes", "No")
validation_error2 <- mean(pred_class2 != validation_data2$default)
print(paste("Validation Set Error (Split 2):", round(validation_error2, 4)))
## [1] "Validation Set Error (Split 2): 0.0238"
set.seed(3)
train_indices3 <- sample(1:nrow(Default), nrow(Default) / 2)
train_data3 <- Default[train_indices3, ]
validation_data3 <- Default[-train_indices3, ]
model_train3 <- glm(default ~ income + balance, data = train_data3, family = binomial)
pred_probs3 <- predict(model_train3, newdata = validation_data3, type = "response")
pred_class3 <- ifelse(pred_probs3 > 0.5, "Yes", "No")
validation_error3 <- mean(pred_class3 != validation_data3$default)
print(paste("Validation Set Error (Split 3):", round(validation_error3, 4)))
## [1] "Validation Set Error (Split 3): 0.0264"
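The three splits yield validation errors of 2.54%, 2.38%, and 2.64%, so the estimate varies modestly with the particular observations that end up in the training set. Since each split repeats the same split-fit-predict-evaluate steps, they could be wrapped in a small helper; a minimal sketch (the function name val_error is our own, not part of the lab):
val_error <- function(seed, formula = default ~ income + balance) {
  # Split, fit, predict, and compute the validation error for one seed
  set.seed(seed)
  train <- sample(1:nrow(Default), nrow(Default) / 2)
  fit <- glm(formula, data = Default[train, ], family = binomial)
  probs <- predict(fit, newdata = Default[-train, ], type = "response")
  preds <- ifelse(probs > 0.5, "Yes", "No")
  mean(preds != Default$default[-train])
}
# sapply(1:3, val_error) should reproduce the three errors reported above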
# Part (d): Repeat the validation approach, now including the student dummy variable
set.seed(1)
train_indices4 <- sample(1:nrow(Default), nrow(Default) / 2)
train_data4 <- Default[train_indices4, ]
validation_data4 <- Default[-train_indices4, ]
# Fit model with student as an additional predictor
model_train4 <- glm(default ~ income + balance + student, data = train_data4, family = binomial)
pred_probs4 <- predict(model_train4, newdata = validation_data4, type = "response")
pred_class4 <- ifelse(pred_probs4 > 0.5, "Yes", "No")
validation_error4 <- mean(pred_class4 != validation_data4$default)
print(paste("Validation Set Error (With Student Variable):", round(validation_error4, 4)))
## [1] "Validation Set Error (With Student Variable): 0.026"
With the same seed-1 split as in part (b), including the student dummy gives a validation error of 2.6%, essentially the same as (in fact slightly worse than) the 2.54% obtained without it, so adding student does not appear to reduce the test error.
# Estimating coefficient standard errors with the bootstrap
# Set random seed for reproducibility
set.seed(123)
# Load necessary package
library(ISLR)
# Fit logistic regression model
model <- glm(default ~ income + balance, family = binomial, data = Default)
# Display summary of the model
summary(model)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
# Define the boot.fn function
boot.fn <- function(data, indices) {
  # Subset the data based on the indices provided by boot()
  d <- data[indices, ]
  # Fit the logistic regression model on the bootstrap sample
  model_boot <- glm(default ~ income + balance, family = binomial, data = d)
  # Return the coefficient estimates
  return(coef(model_boot))
}
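As a quick sanity check (output not shown), applying boot.fn to the full sample in its original order should reproduce the glm() coefficients reported above:
# Evaluate the bootstrap statistic on the original (unresampled) data
boot.fn(Default, 1:nrow(Default))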
# Load the boot package
library(boot)
# Set random seed for reproducibility
set.seed(123)
# Run bootstrap with 1000 replications
boot_results <- boot(data = Default, statistic = boot.fn, R = 1000)
# Display bootstrap results
boot_results
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -2.754771e-02 4.204817e-01
## t2* 2.080898e-05 1.582518e-07 4.729534e-06
## t3* 5.647103e-03 1.296980e-05 2.217214e-04
Standard errors from glm():
- The standard errors reported by glm() are based on asymptotic theory: they are derived from the inverse of the Fisher information matrix, the standard approach in maximum-likelihood logistic regression.
- These standard errors are small relative to the coefficient estimates for both income and balance, so the coefficients are estimated with good precision.

Standard errors from the bootstrap:
- The bootstrap takes a different route: it resamples the observations with replacement many times, recomputes the coefficient estimates on each resample, and uses the spread of those estimates as the standard error.
- The bootstrap standard errors (0.420 for the intercept, 4.73e-06 for income, and 2.22e-04 for balance) are very close to the glm() values (0.435, 4.99e-06, and 2.27e-04), confirming that the two methods give consistent estimates here.
- The bias estimates from the bootstrap are tiny, indicating that the method introduces no appreciable systematic error.
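For a direct numerical comparison, the two sets of standard errors can be placed side by side; a minimal sketch (output not shown):
# Column-wise SDs of the bootstrap replicates next to the glm() standard errors
cbind(glm_se  = summary(model)$coefficients[, "Std. Error"],
      boot_se = apply(boot_results$t, 2, sd))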
# LOOCV for logistic regression on the Weekly data
# Load the data
data(Weekly)
# Fit the logistic regression model
model_a <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = "binomial")
summary(model_a)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22122 0.06147 3.599 0.000319 ***
## Lag1 -0.03872 0.02622 -1.477 0.139672
## Lag2 0.06025 0.02655 2.270 0.023232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1488.2 on 1086 degrees of freedom
## AIC: 1494.2
##
## Number of Fisher Scoring iterations: 4
# Fit the logistic regression model using all but the first observation
model_b <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-1, ], family = "binomial")
summary(model_b)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = Weekly[-1,
## ])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22324 0.06150 3.630 0.000283 ***
## Lag1 -0.03843 0.02622 -1.466 0.142683
## Lag2 0.06085 0.02656 2.291 0.021971 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1494.6 on 1087 degrees of freedom
## Residual deviance: 1486.5 on 1085 degrees of freedom
## AIC: 1492.5
##
## Number of Fisher Scoring iterations: 4
# Predict the direction for the first observation using the model from (b)
pred_b <- predict(model_b, newdata = Weekly[1, ], type = "response")
# Predict if the market goes up (1 if probability > 0.5, else 0)
pred_class <- ifelse(pred_b > 0.5, "Up", "Down")
# Check if the first observation was correctly classified
actual_class <- Weekly$Direction[1]
correct_classification <- (pred_class == actual_class)
pred_class
## 1
## "Up"
correct_classification
## [1] FALSE
The model predicted "Up" for the first observation, but its true direction was "Down", so the observation was misclassified.
n <- nrow(Weekly)
errors <- numeric(n)
# Loop through each observation
for (i in 1:n) {
  # Fit the logistic regression model using all but the ith observation
  model_d <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-i, ], family = "binomial")
  # Compute the posterior probability of the market moving up for the ith observation
  pred_d <- predict(model_d, newdata = Weekly[i, ], type = "response")
  # Predict "Up" if the probability exceeds 0.5
  pred_class_d <- ifelse(pred_d > 0.5, "Up", "Down")
  # Record whether an error was made
  actual_class_d <- Weekly$Direction[i]
  errors[i] <- ifelse(pred_class_d != actual_class_d, 1, 0)
}
errors
## [1] 1 1 0 1 0 1 0 0 0 1 1 0 1 0 1 0 1 0 1 0 0 0 1 1 1 1 1 1 0 1 1 1 1 0 1 0 0
## [38] 0 1 0 1 0 0 1 0 1 1 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 1 0 1 1 0 0 1 0 1 1 0 0
## [75] 0 1 0 1 1 0 0 1 1 0 1 1 0 0 1 0 0 1 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 1 0 0 1
## [112] 1 0 0 1 0 0 1 0 0 1 1 1 1 0 0 0 1 0 1 0 1 1 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0
## [149] 0 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 0 0 1 0 0 1 0 0 1 1 1 0 1 0 1 0 0 0 0 0 0
## [186] 0 0 1 1 0 1 0 1 0 1 0 1 0 0 1 0 0 1 0 0 1 0 1 0 1 1 1 0 0 1 1 0 1 0 0 1 1
## [223] 0 0 0 1 1 1 0 1 0 1 0 1 0 0 0 1 1 0 1 0 1 0 1 0 1 0 1 1 0 1 0 0 1 0 0 1 0
## [260] 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 0
## [297] 0 1 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0 1 1 0 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0
## [334] 1 1 1 1 0 1 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 0 1 1 0 1
## [371] 1 1 1 1 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 1
## [408] 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 1 1 1
## [445] 0 1 1 0 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 0 0 1 0 0
## [482] 0 1 1 1 0 1 0 0 0 1 0 1 1 1 0 0 0 0 1 1 1 0 1 1 0 1 0 0 0 1 0 1 0 0 0 1 0
## [519] 1 1 0 0 1 1 0 0 0 1 1 0 1 0 1 1 1 1 1 0 0 0 1 0 0 0 1 1 0 1 0 0 0 1 1 1 1
## [556] 1 1 0 1 0 1 0 0 1 0 0 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 0 1 0 1
## [593] 0 0 1 0 0 1 0 1 1 0 1 1 1 0 1 0 1 0 1 0 0 0 1 0 1 0 1 0 1 1 0 1 1 0 1 0 1
## [630] 0 1 1 1 1 0 1 1 0 0 0 1 1 1 1 0 1 1 1 0 1 0 0 0 1 1 1 1 1 1 0 1 0 0 1 0 0
## [667] 0 1 1 0 1 0 1 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 1
## [704] 0 0 0 0 1 0 1 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1 0 1 1 0
## [741] 1 1 1 1 0 0 0 1 1 1 1 1 1 0 1 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 0 0 1 0 0 0 1
## [778] 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 1 1
## [815] 0 1 1 1 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 0 0 1 1 0 1 1 0 1 0 1 0 1 1 0 0 1
## [852] 1 1 0 1 1 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 1 0 0 1 1 0 0 1 0 1 0 1 1 0 1 0 1
## [889] 1 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 1 1 1 1 0 1 1 0 0 0 0 0 1 0 0 1
## [926] 0 0 0 0 1 0 1 1 1 0 0 1 1 0 1 1 1 1 0 1 0 1 0 1 0 1 0 1 0 0 1 1 1 1 1 0 1
## [963] 0 0 0 1 1 1 0 1 1 1 1 0 0 0 0 1 1 0 0 0 0 1 0 0 1 1 1 0 0 1 1 1 0 1 0 0 0
## [1000] 0 1 0 0 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 0 1 1 0 0 0 1 0
## [1037] 1 0 1 1 1 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 1 1 0 0 0 1 0 1 1 1 0 0
## [1074] 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0
# Compute the LOOCV error estimate
loocv_error <- mean(errors)
loocv_error
## [1] 0.4499541
The LOOCV estimate of the test error is about 45%: the model misclassifies the weekly market direction nearly half the time, only marginally better than random guessing. This is consistent with the weak significance of Lag1 and Lag2 in the fitted model.
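The same estimate can be obtained with cv.glm() from the boot package, using a cost function that counts misclassifications at the 0.5 threshold; a sketch, assuming boot is still loaded (note that this refits the model n times, so it takes a moment):
# Misclassification cost: r is the observed 0/1 response, pi the fitted probability
cost <- function(r, pi) mean(abs(r - pi) > 0.5)
# K defaults to n, i.e. leave-one-out; this should match the manual estimate above
cv.glm(Weekly, model_a, cost)$delta[1]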
# Bootstrap estimates for the Boston housing data
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
data(Boston)
mu_hat <- mean(Boston$medv)
mu_hat
## [1] 22.53281
se_mu_hat <- sd(Boston$medv) / sqrt(length(Boston$medv))
se_mu_hat
## [1] 0.4088611
set.seed(123) # For reproducibility
B <- 1000 # Number of bootstrap samples
boot_means <- replicate(B, mean(sample(Boston$medv, replace = TRUE)))
se_bootstrap <- sd(boot_means)
se_bootstrap
## [1] 0.4185474
The bootstrap standard error (0.4185) is very close to the formula-based estimate of 0.4089 obtained above.
ci_boot <- c(mu_hat - 2 * se_bootstrap, mu_hat + 2 * se_bootstrap)
ci_boot
## [1] 21.69571 23.36990
t.test(Boston$medv)$conf.int
## [1] 21.72953 23.33608
## attr(,"conf.level")
## [1] 0.95
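The two intervals are very close: the bootstrap interval (21.70, 23.37) nearly matches the t-test interval (21.73, 23.34). An alternative way to get a bootstrap interval is to pair boot() with boot.ci(); a minimal sketch (the helper name mean.fn is our own):
# Bootstrap the sample mean with the boot package
mean.fn <- function(data, idx) mean(data[idx])
boot_mu <- boot(Boston$medv, mean.fn, R = 1000)
boot.ci(boot_mu, type = "perc")  # percentile interval; should be close to the one above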
mu_med <- median(Boston$medv)
mu_med
## [1] 21.2
# There is no simple closed-form formula for the standard error of the median, so we estimate it with the bootstrap
boot_medians <- replicate(B, median(sample(Boston$medv, replace = TRUE)))
se_med_bootstrap <- sd(boot_medians)
se_med_bootstrap
## [1] 0.3779944
The bootstrap standard error of the median (0.378) is somewhat smaller than that of the mean, so the sample median of medv is estimated quite precisely.
mu_0.1 <- quantile(Boston$medv, 0.1)
mu_0.1
## 10%
## 12.75
boot_percentiles <- replicate(B, quantile(sample(Boston$medv, replace = TRUE), 0.1))
se_p10_bootstrap <- sd(boot_percentiles)
se_p10_bootstrap
## [1] 0.5069193
The estimated tenth percentile of medv is 12.75, with a bootstrap standard error of about 0.51. This is larger than the standard errors of the mean and median, which is unsurprising for a quantile out in the tail of the distribution, but it is still small relative to the estimate itself, so the tenth percentile is also estimated with reasonable precision.