Divide the dataset into k equal folds. For each of the k iterations, train the model on k-1 folds and hold out the remaining fold as the validation set, then evaluate on that fold and record a performance metric such as accuracy or MSE. The overall performance estimate is the average of the k scores.
Relative to a single train/validation split, k-fold cross-validation gives more stable results with lower variance in the performance estimate, is less prone to overfitting to one particular split, and uses the entire dataset for both training and validation, which reduces bias in the model evaluation. Its disadvantages are that it is more computationally intensive, more complex to set up, and can lead to data leakage if preprocessing is done on the full dataset rather than within each fold.
Relative to LOOCV, k-fold is computationally faster, has lower variance, and is more robust. Its disadvantages are that it can be less accurate, is more biased, and requires a choice of k.
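As a concrete illustration, here is a minimal manual sketch of the procedure in R on the Default data used below; the choice k = 5 and the helper objects folds and errs are assumptions for illustration, not part of the analysis that follows.
library(ISLR2)
set.seed(1)
k <- 5
# Randomly assign each observation to one of k folds
folds <- sample(rep(1:k, length.out = nrow(Default)))
errs <- numeric(k)
for (i in 1:k) {
  # Train on the k-1 folds that are not fold i
  fit <- glm(default ~ income + balance, data = Default[folds != i, ], family = "binomial")
  # Evaluate on the held-out fold i
  probs <- predict(fit, newdata = Default[folds == i, ], type = "response")
  preds <- ifelse(probs > 0.5, "Yes", "No")
  errs[i] <- mean(preds != Default$default[folds == i])
}
mean(errs)   # overall CV estimate = average of the k fold-level error rates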
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.3.3
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.3
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
set.seed(1)
glm.fit <- glm(default ~ income + balance, data = Default, family = "binomial")
summary(glm.fit)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
n <- nrow(Default)
train <- sample(1:n, size = 0.8 * n)
test <- setdiff(1:n, train)
glm.fit <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
probs <- predict(glm.fit, newdata = Default[test, ], type = "response")
preds <- ifelse(probs > 0.5, "Yes", "No")
actual <- Default$default[test]
mean(preds != actual)
## [1] 0.026
set.seed(42)
splits <- c(0.5, 0.7, 0.6)
error_no_student <- c()
error_with_student <- c()
for (i in 1:3) {
  train_size <- floor(splits[i] * n)
  train <- sample(1:n, size = train_size)
  test <- setdiff(1:n, train)
  # ---- Model 1: without student ----
  glm1 <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
  probs1 <- predict(glm1, newdata = Default[test, ], type = "response")
  preds1 <- ifelse(probs1 > 0.5, "Yes", "No")
  error_no_student[i] <- mean(preds1 != Default$default[test])
  # ---- Model 2: with student ----
  glm2 <- glm(default ~ income + balance + student, data = Default, family = "binomial", subset = train)
  probs2 <- predict(glm2, newdata = Default[test, ], type = "response")
  preds2 <- ifelse(probs2 > 0.5, "Yes", "No")
  error_with_student[i] <- mean(preds2 != Default$default[test])
}
results <- data.frame(
  Split = c("50/50", "70/30", "60/40"),
  Error_Without_Student = round(error_no_student, 4),
  Error_With_Student = round(error_with_student, 4)
)
print(results)
## Split Error_Without_Student Error_With_Student
## 1 50/50 0.0260 0.0258
## 2 70/30 0.0237 0.0233
## 3 60/40 0.0278 0.0278
Test error is low across all the splits, between 2.3% and 2.8%, which indicates a well-performing model for classifying defaults. The lowest error rates for both models came from the 70/30 split, and including or excluding the student predictor had minimal impact.
library(ISLR)
# -----------------------------
# Step 1: Fit logistic regression model
# -----------------------------
set.seed(1)
glm.fit <- glm(default ~ income + balance, data = Default, family = "binomial")
summary(glm.fit)$coefficients[, "Std. Error"]
## (Intercept) income balance
## 4.347564e-01 4.985167e-06 2.273731e-04
# -----------------------------
# Step 2: Bootstrap to estimate SEs
# -----------------------------
boot.fn <- function(data, index) {
  fit <- glm(default ~ income + balance, data = data[index, ], family = "binomial")
  return(coef(fit)[2:3])
}
# Perform 1000 bootstrap replications
set.seed(1)
boot.reps <- 1000
boot.coefs <- replicate(boot.reps, boot.fn(Default, sample(1:nrow(Default), replace = TRUE)))
# Transpose result: rows = bootstrap samples, cols = coefficients
boot.coefs <- t(boot.coefs)
# Standard error = standard deviation of bootstrap estimates
boot_se <- apply(boot.coefs, 2, sd)
print("Bootstrap Standard Errors:")
## [1] "Bootstrap Standard Errors:"
print(round(boot_se, 5))
## income balance
## 0.00000 0.00023
# Standard SEs from glm()
glm_se <- summary(glm.fit)$coefficients[c("income", "balance"), "Std. Error"]
# Create comparison table
comparison <- data.frame(
  Coefficient = c("income", "balance"),
  SE_from_glm = round(glm_se, 5),
  SE_from_bootstrap = round(boot_se, 5)
)
print(comparison)
## Coefficient SE_from_glm SE_from_bootstrap
## income income 0.00000 0.00000
## balance balance 0.00023 0.00023
The bootstrap standard errors and the standard errors reported by glm() agree to the precision shown, which indicates that the model’s assumptions are holding well and that the dataset is large enough to yield consistent estimates.
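As a cross-check, the same standard errors could also be obtained by passing boot.fn() to boot() from the boot package; a minimal sketch under the assumption of R = 1000 replications (output omitted):
library(boot)
set.seed(1)
# Reuses boot.fn() defined above; the printed "std. error" column gives the bootstrap SEs
boot.out <- boot(Default, boot.fn, R = 1000)
boot.out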
library(boot)
# Load the Boston dataset
data("Boston")
mu_hat <- mean(Boston$medv)
mu_hat
## [1] 22.53281
se_mu_hat <- sd(Boston$medv) / sqrt(nrow(Boston))
se_mu_hat
## [1] 0.4088611
# Define bootstrap function for the mean
boot_mean <- function(data, indices) {
  return(mean(data[indices]))
}
set.seed(1)
boot_mean_result <- boot(Boston$medv, boot_mean, R = 1000)
boot_mean_result
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot_mean, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.007650791 0.4106622
# Bootstrap SE = standard deviation of the bootstrap estimates
boot_se_mu <- sd(boot_mean_result$t)
boot_se_mu
## [1] 0.4106622
Using the bootstrap, the estimated standard error (0.4107) is slightly higher than the formula-based estimate (0.4089).
ci_lower <- mu_hat - 2 * sd(boot_mean_result$t)
ci_upper <- mu_hat + 2 * sd(boot_mean_result$t)
c(ci_lower, ci_upper)
## [1] 21.71148 23.35413
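For comparison, a standard one-sample t interval for the mean could be computed directly; a minimal sketch (the resulting interval should be similar to the bootstrap-based interval above):
# 95% t-based confidence interval for the mean of medv
t.test(Boston$medv)$conf.int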
mu_med_hat <- median(Boston$medv)
mu_med_hat
## [1] 21.2
# Define bootstrap function for the median
boot_median <- function(data, indices) {
  return(median(data[indices]))
}
set.seed(2)
boot_median_result <- boot(Boston$medv, boot_median, R = 1000)
sd(boot_median_result$t)
## [1] 0.3879954
mu_0.1_hat <- quantile(Boston$medv, 0.1)
mu_0.1_hat
## 10%
## 12.75
# Define bootstrap function for the 10th percentile
boot_p10 <- function(data, indices) {
  return(quantile(data[indices], 0.1))
}
set.seed(3)
boot_p10_result <- boot(Boston$medv, boot_p10, R = 1000)
sd(boot_p10_result$t)
## [1] 0.4873263
The bootstrap standard error of the 10th percentile is about 0.49: resampling the Boston data with replacement and recomputing the 10th percentile each time gives estimates that vary by roughly that amount.