In k-fold cross-validation, the dataset is randomly divided into k equal-sized subsets or folds. The model is trained on k-1 of these folds and tested on the remaining fold. This process is repeated k times, with a different fold used as the test set in each iteration. After all iterations, the final performance of the model is determined by averaging the results from each fold.
Advantages (relative to a single validation set split): more reliable estimates of the model's performance, since every data point is used for both training and validation; reduces the variance of the estimate compared to a single train-test split.
Disadvantage: computationally more expensive, since the model is trained k times instead of once.
Advantages (relative to LOOCV): lower computational cost, since the model is trained k times instead of n times (n = dataset size).
Disadvantage: compared with LOOCV, which gives a nearly unbiased estimate of the true error (though with higher variance), k-fold CV introduces somewhat more bias. A short code sketch of k-fold cross-validation in R follows below.
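As a rough illustration of k-fold cross-validation in R, here is a minimal sketch using cv.glm() from the boot package on the Default data that is analyzed later in this document; the 10-fold setup, the seed, and the misclassification cost function are assumptions of mine rather than part of the original analysis. Setting K equal to the number of observations would give LOOCV instead.
library(boot)
library(ISLR2)
data(Default)
Default$default <- ifelse(Default$default == "Yes", 1, 0)
cv_fit <- glm(default ~ balance + income, data = Default, family = binomial)
# Misclassification cost: observed 0/1 labels vs. predicted probabilities
# thresholded at 0.5 (assumed threshold)
cv_cost <- function(obs, prob) mean(obs != (prob > 0.5))
set.seed(1)
cv_err <- cv.glm(Default, cv_fit, cost = cv_cost, K = 10)
cv_err$delta[1]  # estimated 10-fold cross-validated misclassification rate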
library(ISLR)
library(caTools)
library(boot)
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
data(Default)
summary(Default)
## default student balance income
## No :9667 No :7056 Min. : 0.0 Min. : 772
## Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
Default$default <- ifelse(Default$default == "Yes", 1, 0)
set.seed(1)
logit_model1 <- glm(default ~ balance + income, data = Default, family = binomial)
summary(logit_model1)
##
## Call:
## glm(formula = default ~ balance + income, family = binomial,
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
split <- sample.split(Default$default, SplitRatio = 0.5)
train_set <- subset(Default, split == TRUE)
valid_set <- subset(Default, split == FALSE)
lm2_train <- glm(default ~ balance + income, data = train_set, family = binomial)
y_pred_prob <- predict(lm2_train, newdata = valid_set, type = "response")
y_pred <- ifelse(y_pred_prob > 0.5, 1, 0)
validation_error <- mean(y_pred != valid_set$default)
print(paste("Validation Set Error:", round(validation_error * 100, 2), "%"))
## [1] "Validation Set Error: 2.48 %"
lr_func <- function(seed_value) {
  set.seed(seed_value)
  split <- sample.split(Default$default, SplitRatio = 0.5)
  train_set <- subset(Default, split == TRUE)
  valid_set <- subset(Default, split == FALSE)
  logit_model <- glm(default ~ balance + income, data = train_set, family = binomial)
  y_pred_prob <- predict(logit_model, newdata = valid_set, type = "response")
  y_pred <- ifelse(y_pred_prob > 0.5, 1, 0)
  validation_error <- mean(y_pred != valid_set$default)
  print(paste("Seed: ", seed_value, "- Validation Set Error: ", round(validation_error * 100, 2), "%"))
}
lr_func(123)
## [1] "Seed: 123 - Validation Set Error: 2.42 %"
lr_func(456)
## [1] "Seed: 456 - Validation Set Error: 2.58 %"
lr_func(789)
## [1] "Seed: 789 - Validation Set Error: 2.92 %"
Three different seeds are used to produce three different random splits of the data. The validation error rates vary only slightly across these splits; one way to quantify this spread is sketched below.
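A sketch of that check (the 20-seed loop and the use of sd() are my own additions; it assumes the Default data and the caTools package are loaded as above):
errors <- sapply(1:20, function(s) {
  set.seed(s)
  split <- sample.split(Default$default, SplitRatio = 0.5)
  train_set <- subset(Default, split == TRUE)
  valid_set <- subset(Default, split == FALSE)
  fit <- glm(default ~ balance + income, data = train_set, family = binomial)
  y_pred <- ifelse(predict(fit, newdata = valid_set, type = "response") > 0.5, 1, 0)
  mean(y_pred != valid_set$default)  # validation error for this split
})
sd(errors)  # spread of the validation error across 20 random splits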
set.seed(123)
split <- sample.split(Default$default, SplitRatio = 0.5)
train_set <- subset(Default, split == TRUE)
valid_set <- subset(Default, split == FALSE)
lm_student <- glm(default ~ balance + income + student, data = train_set, family = binomial)
y_pred_prob_student <- predict(lm_student, newdata = valid_set, type = "response")
y_pred_student <- ifelse(y_pred_prob_student > 0.5, 1, 0)
validation_error_student <- mean(y_pred_student != valid_set$default)
print(paste("Validation Set Error with Student Variable:", round(validation_error_student * 100, 2), "%"))
## [1] "Validation Set Error with Student Variable: 2.5 %"
Adding a dummy variable for student does not lead to a reduction in the test error rate.
set.seed(123)
logit_model2 <- glm(default ~ balance + income, data = Default, family = binomial)
summary(logit_model2)$coefficients
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154047e+01 4.347564e-01 -26.544680 2.958355e-155
## balance 5.647103e-03 2.273731e-04 24.836280 3.638120e-136
## income 2.080898e-05 4.985167e-06 4.174178 2.990638e-05
boot.fn <- function(data, index) {
  model <- glm(default ~ balance + income, data = data[index, ], family = binomial)
  return(coef(model)[c("income", "balance")])
}
set.seed(123)
boot_results <- boot(Default, boot.fn, R = 1000)
boot_df <- data.frame(
  Original = boot_results$t0,
  Bias = apply(boot_results$t, 2, mean) - boot_results$t0,
  Std_Error = apply(boot_results$t, 2, sd)
)
print(boot_df)
## Original Bias Std_Error
## income 2.080898e-05 1.582518e-07 4.729534e-06
## balance 5.647103e-03 1.296980e-05 2.217214e-04
The bootstrap standard errors for balance and income are very close to (in fact slightly lower than) the formula-based standard errors reported by glm(), so the standard error estimates of the logistic regression model appear stable.
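To make the comparison explicit, a side-by-side table could be built along these lines (a sketch; it assumes logit_model2 and boot_results from the code above are still in the workspace, and that the bootstrap columns follow the income, balance order returned by boot.fn):
glm_se <- summary(logit_model2)$coefficients[c("income", "balance"), "Std. Error"]
boot_se_coefs <- apply(boot_results$t, 2, sd)  # columns: income, balance
print(data.frame(GLM_SE = glm_se, Bootstrap_SE = boot_se_coefs))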
data("Boston")
mu_hat <- mean(Boston$medv)
print(paste("Estimated Mean (mu_hat):", round(mu_hat, 2)))
## [1] "Estimated Mean (mu_hat): 22.53"
n <- length(Boston$medv)
sd_medv <- sd(Boston$medv)
se_hat <- sd_medv / sqrt(n)
print(paste("Standard Error of Mean (SE):", round(se_hat, 4)))
## [1] "Standard Error of Mean (SE): 0.4089"
The estimated SE of 0.4089 is relatively small, which suggests that the estimate of the mean (22.53) is stable and reliable.
boot.fn <- function(data, index) {
  return(mean(data$medv[index]))
}
set.seed(123)
boot_results <- boot(Boston, boot.fn, R = 1000)
print(boot_results)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 -0.01607372 0.4045557
boot_se <- sd(boot_results$t)
print(paste("Bootstrap Standard Error:", round(boot_se, 4)))
## [1] "Bootstrap Standard Error: 0.4046"
se_comparison <- data.frame(Method = c("Analytical SE", "Bootstrap SE"),
                            SE = c(se_hat, boot_se))
print(se_comparison)
## Method SE
## 1 Analytical SE 0.4088611
## 2 Bootstrap SE 0.4045557
The bootstrap and analytical standard errors are very close, which suggests that the normality assumption in the analytical method is reasonable for this dataset.
boot_ci_lower <- mu_hat - 2 * boot_se
boot_ci_upper <- mu_hat + 2 * boot_se
print(paste("Bootstrap 95% CI: [", round(boot_ci_lower, 2), ",", round(boot_ci_upper, 2), "]"))
## [1] "Bootstrap 95% CI: [ 21.72 , 23.34 ]"
t_test_results <- t.test(Boston$medv)
t_test_ci <- t_test_results$conf.int
print(paste("t.test 95% CI: [", round(t_test_ci[1], 2), ",", round(t_test_ci[2], 2), "]"))
## [1] "t.test 95% CI: [ 21.73 , 23.34 ]"
Since the two confidence intervals are almost identical, the t-test is a valid approach.
mu_med_hat <- median(Boston$medv)
print(paste("Estimated Median (mu_med_hat):", round(mu_med_hat, 2)))
## [1] "Estimated Median (mu_med_hat): 21.2"
boot.fn <- function(data, index) {
  # data is the medv vector itself here, so index into it directly
  return(median(data[index]))
}
boot_med_results <- boot(Boston$medv, boot.fn, 1000)
print(boot_med_results)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.0264 0.3783996
The standard error of the median using the bootstrap is approximately 0.378, which is small and indicates that the median estimate is relatively stable.
mu_0.1_hat <- quantile(Boston$medv, probs = 0.1)
print(paste("Estimated 10th Percentile (mu_0.1_hat):", round(mu_0.1_hat, 2)))
## [1] "Estimated 10th Percentile (mu_0.1_hat): 12.75"
boot_p10_fn <- function(data, index) {
  return(quantile(data[index], c(0.1)))
}
boot10results <- boot(Boston$medv, boot_p10_fn, 1000)
print(boot10results)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot_p10_fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.02485 0.5082798
The bootstrap standard error of approximately 0.508 suggests that the 10th percentile estimate is reasonably stable, though it is somewhat more variable than the estimates of the mean and median.