Assignment4

3

K-fold cross-validation is a model evaluation method that involves dividing the dataset into k approximately equal-sized folds. The model is trained k times, each time using a different fold as the validation set and the remaining k–1 folds as the training set. After training, the model’s performance is evaluated on the validation fold.
Compared to the validation set approach, k-fold cross-validation has the advantage of producing a more accurate estimate of the test error since it uses multiple training-validation splits. The disadvantage is k-fold is more computationally intensive. In comparison to LOOCV, k-fold is less computationally expensive. The disadvantage is k-fold CV introduce more bias. ## 5

library(ISLR)
set.seed(1)  
# (a)
model_full <- glm(default ~ income + balance, data = Default, family = "binomial")
summary(model_full)

## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

# (b)
train_index <- sample(1:nrow(Default), nrow(Default) / 2)
train_data <- Default[train_index, ]
valid_data <- Default[-train_index, ]

model_train <- glm(default ~ income + balance, data = train_data, family = "binomial")

probs <- predict(model_train, newdata = valid_data, type = "response")
preds <- ifelse(probs > 0.5, "Yes", "No")

actual <- valid_data$default
mean(preds != actual)

## [1] 0.0254

# (c)
set.seed(1)  
train1 <- sample(1:nrow(Default), nrow(Default) / 2)
model1 <- glm(default ~ income + balance, data = Default, subset = train1, family = "binomial")
probs1 <- predict(model1, newdata = Default[-train1, ], type = "response")
preds1 <- ifelse(probs1 > 0.5, "Yes", "No")
error1 <- mean(preds1 != Default$default[-train1])

set.seed(2)  
train2 <- sample(1:nrow(Default), nrow(Default) / 2)
model2 <- glm(default ~ income + balance, data = Default, subset = train2, family = "binomial")
probs2 <- predict(model2, newdata = Default[-train2, ], type = "response")
preds2 <- ifelse(probs2 > 0.5, "Yes", "No")
error2 <- mean(preds2 != Default$default[-train2])

set.seed(3)  
train3 <- sample(1:nrow(Default), nrow(Default) / 2)
model3 <- glm(default ~ income + balance, data = Default, subset = train3, family = "binomial")
probs3 <- predict(model3, newdata = Default[-train3, ], type = "response")
preds3 <- ifelse(probs3 > 0.5, "Yes", "No")
error3 <- mean(preds3 != Default$default[-train3])

c(error1, error2, error3)

## [1] 0.0254 0.0238 0.0264

# (d)
set.seed(1)  
train <- sample(1:nrow(Default), nrow(Default) / 2)

model_with_student <- glm(default ~ income + balance + student, data = Default, subset = train, family = "binomial")
probs_student <- predict(model_with_student, newdata = Default[-train, ], type = "response")
preds_student <- ifelse(probs_student > 0.5, "Yes", "No")
error_student <- mean(preds_student != Default$default[-train])
error_student

## [1] 0.026

6

# (a)
set.seed(1)
glm_fit <- glm(default ~ income + balance, data = Default, family = "binomial")

summary(glm_fit)$coefficients

##                  Estimate   Std. Error    z value      Pr(>|z|)
## (Intercept) -1.154047e+01 4.347564e-01 -26.544680 2.958355e-155
## income       2.080898e-05 4.985167e-06   4.174178  2.990638e-05
## balance      5.647103e-03 2.273731e-04  24.836280 3.638120e-136

# (b)
boot_fn <- function(data, index) {
  fit <- glm(default ~ income + balance, data = data, subset = index, family = "binomial")
  return(coef(fit)[2:3])  # Return only income and balance coefficients
}
# (c)
library(boot)

set.seed(1)
boot_results <- boot(Default, boot_fn, R = 1000)

boot_results

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Default, statistic = boot_fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##         original       bias     std. error
## t1* 2.080898e-05 1.680317e-07 4.866284e-06
## t2* 5.647103e-03 1.855765e-05 2.298949e-04

boot_results$t0

##       income      balance 
## 2.080898e-05 5.647103e-03

apply(boot_results$t, 2, sd)

## [1] 4.866284e-06 2.298949e-04

# (d) The standard error of 4.866284e-06 using the bootstrap was similar to the standard error from glm() of 4.985167e-06, suggesting that the model assumptions are met.

9

library(MASS)
data("Boston")
medv <- Boston$medv
n <- length(medv)

# (a)
mu_hat <- mean(medv)
mu_hat

## [1] 22.53281

# (b)
se_formula <- sd(medv) / sqrt(n)
se_formula

## [1] 0.4088611

# (c)
boot_mean <- function(data, index) {
  mean(data[index])
}

set.seed(1)
boot_result <- boot(medv, boot_mean, R = 1000)

se_bootstrap <- sd(boot_result$t)
se_bootstrap

## [1] 0.4106622

# (d)
ci_bootstrap <- c(mu_hat - 2 * se_bootstrap, mu_hat + 2 * se_bootstrap)

ci_formula <- c(mu_hat - 2 * se_formula, mu_hat + 2 * se_formula)

ci_bootstrap

## [1] 21.71148 23.35413

ci_formula

## [1] 21.71508 23.35053

# (e)
mu_median <- median(medv)
mu_median

## [1] 21.2

# (f)
boot_median <- function(data, index) {
  median(data[index])
}

set.seed(1)
boot_median_result <- boot(medv, boot_median, R = 1000)

se_median <- sd(boot_median_result$t)
se_median

## [1] 0.3778075

# (g)
mu_10 <- quantile(medv, 0.1)
mu_10

##   10% 
## 12.75

# (h)
boot_p10 <- function(data, index) {
  quantile(data[index], 0.1)
}

set.seed(1)
boot_p10_result <- boot(medv, boot_p10, R = 1000)

se_p10 <- sd(boot_p10_result$t)
se_p10

## [1] 0.4767526

Assignment4

2025-07-03

3

6

9