3. (a) Explain how k-fold cross-validation is implemented.

In k-fold cross-validation, the dataset is randomly divided into k (approximately) equal-sized subsets, or folds. The model is trained on k-1 of these folds and evaluated on the remaining fold. This process is repeated k times, with a different fold serving as the validation set in each iteration. The final performance estimate is obtained by averaging the k resulting error estimates.
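
As a concrete illustration, here is a minimal sketch of a manual 10-fold CV loop for a logistic regression on the Default data (the 0.5 threshold, the misclassification-rate metric, and the fold assignment are illustrative choices, not part of the question; in practice cv.glm() from the boot package automates this):

library(ISLR2)
data(Default)
set.seed(1)
k <- 10
folds <- sample(rep(1:k, length.out = nrow(Default)))   # random fold labels 1..k
cv_errors <- numeric(k)
for (i in 1:k) {
  train <- Default[folds != i, ]   # train on the other k-1 folds
  test  <- Default[folds == i, ]   # hold out fold i for validation
  fit   <- glm(default ~ balance + income, data = train, family = binomial)
  prob  <- predict(fit, newdata = test, type = "response")
  pred  <- ifelse(prob > 0.5, "Yes", "No")
  cv_errors[i] <- mean(pred != test$default)   # misclassification rate on fold i
}
mean(cv_errors)   # k-fold CV estimate: average error over the k folds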

(b) What are the advantages and disadvantages of k-fold cross-validation relative to:

  1. The validation set approach?

Advantages: gives a more reliable estimate of the model's performance, since every observation is used for both training and validation; reduces the variance of the error estimate compared to a single train/test split.

Disadvantage: more computationally expensive, since the model is trained k times instead of once.

  2. LOOCV?

Advantages: lower computational cost than LOOCV, since the model is fit k times instead of n times (n = dataset size); a rough cost comparison is sketched after this list.

Disadvantage: LOOCV provides an approximately unbiased estimate of the true test error (though at the cost of higher variance), so k-fold CV tends to give a somewhat more biased estimate.
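
To make the computational-cost contrast concrete, here is a small sketch using cv.glm() from the boot package, which performs LOOCV by default (K = n) and k-fold CV when K is specified. The Auto data and the mpg ~ horsepower model are assumptions chosen purely for illustration:

library(boot)
library(ISLR2)
fit <- glm(mpg ~ horsepower, data = Auto)   # glm() with the default family fits by least squares
cv.glm(Auto, fit, K = 10)$delta[1]          # 10-fold CV: only 10 model fits
cv.glm(Auto, fit)$delta[1]                  # LOOCV: n = 392 model fits, noticeably slower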

library(ISLR)
library(caTools)
library(boot)
library(ISLR2)
## 
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
## 
##     Auto, Credit
data(Default)
summary(Default)
##  default    student       balance           income     
##  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
##  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##                        Median : 823.6   Median :34553  
##                        Mean   : 835.4   Mean   :33517  
##                        3rd Qu.:1166.3   3rd Qu.:43808  
##                        Max.   :2654.3   Max.   :73554
Default$default <- ifelse(Default$default == "Yes", 1, 0)
set.seed(1)
# Logistic regression of default on balance and income, fit to the full dataset
logit_fit <- glm(default ~ balance + income, data = Default, family = binomial)
summary(logit_fit)
## 
## Call:
## glm(formula = default ~ balance + income, family = binomial, 
##     data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4725  -0.1444  -0.0574  -0.0211   3.7245  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8
split <- sample.split(Default$default, SplitRatio = 0.5)
train_set <- subset(Default, split == TRUE)
valid_set <- subset(Default, split == FALSE)

lm2_train <- glm(default ~ balance + income, data = train_set, family = binomial)
y_pred_prob <- predict(lm2_train, newdata = valid_set, type = "response")
y_pred <- ifelse(y_pred_prob > 0.5, 1, 0)
validation_error <- mean(y_pred != valid_set$default)

print(paste("Validation Set Error:", round(validation_error * 100, 2), "%"))
## [1] "Validation Set Error: 2.48 %"
lr_func<- function(seed_value) {
  set.seed(seed_value)
  split <- sample.split(Default$default, SplitRatio = 0.5)
  train_set <- subset(Default, split == TRUE)
  valid_set <- subset(Default, split == FALSE)

  logit_model <- glm(default ~ balance + income, data = train_set, family = binomial)

  y_pred_prob <- predict(logit_model, newdata = valid_set, type = "response")

  y_pred <- ifelse(y_pred_prob > 0.5, 1, 0)

  validation_error <- mean(y_pred != valid_set$default)
  print(paste("Seed: ", seed_value, "- Validation Set Error: ", round(validation_error * 100, 2), "%"))

}

lr_func(123)
## [1] "Seed:  123 - Validation Set Error:  2.42 %"
lr_func(456)
## [1] "Seed:  456 - Validation Set Error:  2.58 %"
lr_func(789)
## [1] "Seed:  789 - Validation Set Error:  2.92 %"

Three different seeds are used to produce three different random splits of the data. The validation error rates vary only slightly across the splits.

set.seed(123)

split <- sample.split(Default$default, SplitRatio = 0.5)
train_set <- subset(Default, split == TRUE)
valid_set <- subset(Default, split == FALSE)

lm_student <- glm(default ~ balance + income + student, data = train_set, family = binomial)
y_pred_prob_student <- predict(lm_student, newdata = valid_set, type = "response")
y_pred_student <- ifelse(y_pred_prob_student > 0.5, 1, 0)
validation_error_student <- mean(y_pred_student != valid_set$default)

print(paste("Validation Set Error with Student Variable:", round(validation_error_student * 100, 2), "%"))
## [1] "Validation Set Error with Student Variable: 2.5 %"

Adding a dummy variable for student does not lead to a reduction in the test error rate.

6. (a)

set.seed(123)
logit_model2 <- glm(default ~ balance + income, data = Default, family = binomial)
summary(logit_model2)$coefficients
##                  Estimate   Std. Error    z value      Pr(>|z|)
## (Intercept) -1.154047e+01 4.347564e-01 -26.544680 2.958355e-155
## balance      5.647103e-03 2.273731e-04  24.836280 3.638120e-136
## income       2.080898e-05 4.985167e-06   4.174178  2.990638e-05

(b)

boot.fn <- function(data, index) {
  # Refit the logistic regression on the bootstrap sample indexed by 'index'
  # and return the income and balance coefficient estimates.
  model <- glm(default ~ balance + income, data = data[index, ], family = binomial)
  return(coef(model)[c("income", "balance")])
}

(c)

set.seed(123)

boot_results <- boot(Default, boot.fn, R = 1000)

boot_df <- data.frame(
  Original = boot_results$t0,  
  Bias = apply(boot_results$t, 2, mean) - boot_results$t0,
  Std_Error = apply(boot_results$t, 2, sd)
)

print(boot_df)
##             Original         Bias    Std_Error
## income  2.080898e-05 1.582518e-07 4.729534e-06
## balance 5.647103e-03 1.296980e-05 2.217214e-04

The bootstrap standard errors (4.73e-06 for income and 2.22e-04 for balance) are very close to, and slightly lower than, the formula-based standard errors reported by glm() in (a), so the logistic regression estimates appear stable.
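
As a quick, optional check (assuming logit_model2 from (a) and boot_results from (c) are still in memory), the two sets of standard errors can be placed side by side:

# glm() formula-based SEs next to the bootstrap SEs, in the order (income, balance)
# that boot.fn returns the coefficients.
data.frame(
  glm_SE  = summary(logit_model2)$coefficients[c("income", "balance"), "Std. Error"],
  boot_SE = apply(boot_results$t, 2, sd)
)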

9. (a)

data("Boston")
mu_hat <- mean(Boston$medv)
print(paste("Estimated Mean (mu_hat):", round(mu_hat, 2)))
## [1] "Estimated Mean (mu_hat): 22.53"

(b)

n <- length(Boston$medv)
sd_medv <- sd(Boston$medv)

se_hat <- sd_medv / sqrt(n)
print(paste("Standard Error of Mean (SE):", round(se_hat, 4)))
## [1] "Standard Error of Mean (SE): 0.4089"

The estimated SE of 0.4089 is relatively small, which suggests that the estimate of the mean (22.53) is stable and reliable.

boot.fn <- function(data, index) {
  # Mean of medv over the observations in the bootstrap sample
  return(mean(data$medv[index]))
}


set.seed(123)
boot_results <- boot(Boston, boot.fn, R = 1000)

print(boot_results)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original      bias    std. error
## t1* 22.53281 -0.01607372   0.4045557
boot_se <- sd(boot_results$t)
print(paste("Bootstrap Standard Error:", round(boot_se, 4)))
## [1] "Bootstrap Standard Error: 0.4046"
se_comparison <- data.frame(Method = c("Analytical SE", "Bootstrap SE"),
                            SE = c(se_hat, boot_se))

print(se_comparison)
##          Method        SE
## 1 Analytical SE 0.4088611
## 2  Bootstrap SE 0.4045557

The bootstrap and analytical standard errors are very close, which suggests that the assumptions behind the analytical formula hold well for this dataset.

(d)

boot_ci_lower <- mu_hat - 2 * boot_se
boot_ci_upper <- mu_hat + 2 * boot_se

print(paste("Bootstrap 95% CI: [", round(boot_ci_lower, 2), ",", round(boot_ci_upper, 2), "]"))
## [1] "Bootstrap 95% CI: [ 21.72 , 23.34 ]"
t_test_results <- t.test(Boston$medv)
t_test_ci <- t_test_results$conf.int

print(paste("t.test 95% CI: [", round(t_test_ci[1], 2), ",", round(t_test_ci[2], 2), "]"))
## [1] "t.test 95% CI: [ 21.73 , 23.34 ]"

Since the two confidence intervals are almost identical, the t.test() interval is a valid approach here.
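
As an optional cross-check on the mu_hat +/- 2*SE interval above, the boot package can also compute a bootstrap percentile interval directly (this assumes the boot_results object for the mean of medv, created above, is still in memory):

# Percentile-based 95% bootstrap confidence interval for the mean of medv
boot.ci(boot_results, type = "perc")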

(e)

mu_med_hat <- median(Boston$medv)
print(paste("Estimated Median (mu_med_hat):", round(mu_med_hat, 2)))
## [1] "Estimated Median (mu_med_hat): 21.2"

(f)

# Bootstrap the median of medv; this boot.fn takes a numeric vector and an
# index vector, matching the boot(Boston$medv, ...) call below.
boot.fn <- function(data, index) {
  return(median(data[index]))
}

boot_med_results <- boot(Boston$medv, boot.fn, R = 1000)
print(boot_med_results)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*     21.2 -0.0264   0.3783996

The bootstrap standard error of the median is 0.3784, which is small and indicates that the median estimate is relatively stable.

(g)

mu_0.1_hat <- quantile(Boston$medv, probs = 0.1)
print(paste("Estimated 10th Percentile (mu_0.1_hat):", round(mu_0.1_hat, 2)))
## [1] "Estimated 10th Percentile (mu_0.1_hat): 12.75"
boot_p10_fn <- function(data, index) {
  return(quantile(data[index], c(0.1)))
}



boot10results <- boot(Boston$medv, boot_p10_fn, R = 1000)
print(boot10results)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston$medv, statistic = boot_p10_fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*    12.75 0.02485   0.5082798

The bootstrap standard error of 0.5083 suggests that the 10th percentile estimate is reasonably stable, though it is more variable than the estimates of the mean and median.