# Summary of advantages and disadvantages
validation_set_comparison <- data.frame(
Method = c("Validation Set", "k-Fold Cross-Validation"),
Advantages = c("Simple and fast", "More reliable performance estimate"),
Disadvantages = c("High variance, depends on a single split", "More computationally expensive")
)
knitr::kable(validation_set_comparison)
| Method | Advantages | Disadvantages |
|:---|:---|:---|
| Validation Set | Simple and fast | High variance, depends on a single split |
| k-Fold Cross-Validation | More reliable performance estimate | More computationally expensive |
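To make the comparison concrete, below is a minimal sketch of 10-fold cross-validation with boot::cv.glm, assuming the Default data used in Question 5; the cost function (an illustration, not part of the original answer) converts fitted probabilities into a misclassification rate.
# Minimal sketch of 10-fold CV with boot::cv.glm on the Default data
# (assumed setup for illustration; not part of the original answer)
library(ISLR2)
library(boot)
set.seed(1)
cv_fit <- glm(default ~ income + balance, data = Default, family = binomial)
mis_cost <- function(y, prob) mean(abs(y - prob) > 0.5)  # misclassification at 0.5
cv.glm(Default, cv_fit, cost = mis_cost, K = 10)$delta[1]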
# Summary of k-Fold vs LOOCV
loocv_comparison <- data.frame(
Method = c("LOOCV", "k-Fold Cross-Validation"),
Advantages = c("Uses all data for training, lower bias", "Less computationally expensive, lower variance"),
Disadvantages = c("Computationally expensive", "Choice of k affects bias-variance tradeoff")
)
knitr::kable(loocv_comparison)
| Method | Advantages | Disadvantages |
|:---|:---|:---|
| LOOCV | Uses all data for training, lower bias | Computationally expensive |
| k-Fold Cross-Validation | Less computationally expensive, lower variance | Choice of k affects bias-variance tradeoff |
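LOOCV is the K = n special case, which cv.glm uses by default when K is omitted. A small sketch on the Auto data (chosen here only for illustration, since LOOCV on all 10,000 Default rows would be slow):
# LOOCV sketch: cv.glm defaults to K = n when K is not supplied
library(ISLR2)
library(boot)
loocv_fit <- glm(mpg ~ horsepower, data = Auto)  # linear regression via glm()
cv.glm(Auto, loocv_fit)$delta[1]                 # LOOCV estimate of test MSE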
Question 5
# Load required libraries
library(ISLR2) # Dataset
library(MASS) # Statistical functions (note: glm() itself is in base stats)
library(caret) # Data splitting
## Loading required package: ggplot2
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
library(ggplot2) # Visualization
set.seed(123) # Set seed for reproducibility
# Load Default dataset
data(Default)
# Fit logistic regression model
logit_model <- glm(default ~ income + balance, data = Default, family = binomial)
# Display summary of model
summary(logit_model)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
# Split the dataset into training (50%) and validation (50%) sets
train_index <- createDataPartition(Default$default, p = 0.5, list = FALSE)
train_data <- Default[train_index, ]
valid_data <- Default[-train_index, ]
# Fit logistic regression model on training data
train_logit <- glm(default ~ income + balance, data = train_data, family = binomial)
# Summary of trained model
summary(train_logit)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.158e+01 6.209e-01 -18.656 < 2e-16 ***
## income 2.296e-05 7.206e-06 3.187 0.00144 **
## balance 5.617e-03 3.200e-04 17.555 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1463.76 on 5000 degrees of freedom
## Residual deviance: 791.05 on 4998 degrees of freedom
## AIC: 797.05
##
## Number of Fisher Scoring iterations: 8
# Predict probability of default on validation data
valid_probs <- predict(train_logit, valid_data, type = "response")
# Classify based on probability threshold of 0.5
valid_preds <- ifelse(valid_probs > 0.5, "Yes", "No")
# Convert to factor for comparison
valid_preds <- factor(valid_preds, levels = levels(Default$default))
# Compute misclassification rate
error_rate <- mean(valid_preds != valid_data$default)
error_rate
## [1] 0.02740548
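Since caret is already loaded, its confusionMatrix() can break this error rate down by class; an optional sketch, not required for the answer above:
# Optional: per-class breakdown of the validation predictions
confusionMatrix(valid_preds, valid_data$default, positive = "Yes")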
set.seed(456) # New seed for different split
train_index2 <- createDataPartition(Default$default, p = 0.5, list = FALSE)
train_data2 <- Default[train_index2, ]
valid_data2 <- Default[-train_index2, ]
train_logit2 <- glm(default ~ income + balance, data = train_data2, family = binomial)
valid_probs2 <- predict(train_logit2, valid_data2, type = "response")
valid_preds2 <- factor(ifelse(valid_probs2 > 0.5, "Yes", "No"), levels = levels(Default$default))
error_rate2 <- mean(valid_preds2 != valid_data2$default)
set.seed(789) # Another different split
train_index3 <- createDataPartition(Default$default, p = 0.5, list = FALSE)
train_data3 <- Default[train_index3, ]
valid_data3 <- Default[-train_index3, ]
train_logit3 <- glm(default ~ income + balance, data = train_data3, family = binomial)
valid_probs3 <- predict(train_logit3, valid_data3, type = "response")
valid_preds3 <- factor(ifelse(valid_probs3 > 0.5, "Yes", "No"), levels = levels(Default$default))
error_rate3 <- mean(valid_preds3 != valid_data3$default)
# Store and display results
error_rates <- data.frame(Split = c("First", "Second", "Third"), Error_Rate = c(error_rate, error_rate2, error_rate3))
knitr::kable(error_rates)
| Split | Error_Rate |
|:---|---:|
| First | 0.0274055 |
| Second | 0.0242048 |
| Third | 0.0266053 |
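The three splits repeat the same split/fit/predict steps; a hypothetical helper such as validation_error() below (a sketch, not part of the original code) would compute all three error rates in one call:
# Hypothetical helper wrapping the repeated validation-set steps
validation_error <- function(seed) {
  set.seed(seed)
  idx <- createDataPartition(Default$default, p = 0.5, list = FALSE)
  fit <- glm(default ~ income + balance, data = Default[idx, ], family = binomial)
  probs <- predict(fit, Default[-idx, ], type = "response")
  preds <- factor(ifelse(probs > 0.5, "Yes", "No"), levels = levels(Default$default))
  mean(preds != Default[-idx, "default"])
}
sapply(c(123, 456, 789), validation_error)  # one error rate per split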
# Fit model including student dummy variable
train_logit_student <- glm(default ~ income + balance + student, data = train_data, family = binomial)
# Predict and compute error rate
valid_probs_student <- predict(train_logit_student, valid_data, type = "response")
valid_preds_student <- factor(ifelse(valid_probs_student > 0.5, "Yes", "No"), levels = levels(Default$default))
error_rate_student <- mean(valid_preds_student != valid_data$default)
# Compare test error rates
error_comparison <- data.frame(Model = c("Without Student", "With Student"),
Error_Rate = c(error_rate, error_rate_student))
knitr::kable(error_comparison)
| Model | Error_Rate |
|:---|---:|
| Without Student | 0.0274055 |
| With Student | 0.0278056 |
Question 6
# Load Default dataset
data(Default)
# Fit logistic regression model
logit_model <- glm(default ~ income + balance, data = Default, family = binomial)
# Display standard errors from summary
summary(logit_model)$coefficients[, "Std. Error"]
## (Intercept) income balance
## 4.347564e-01 4.985167e-06 2.273731e-04
# Function to refit the model on one bootstrap resample (rows drawn with replacement)
manual_boot_fn <- function(data, sample_size) {
sampled_index <- sample(1:nrow(data), size = sample_size, replace = TRUE)
model <- glm(default ~ income + balance, data = data[sampled_index, ], family = binomial)
return(coef(model)[c("income", "balance")])
}
# Perform resampling manually (1000 iterations)
num_samples <- 1000
resampled_coefs <- replicate(num_samples, manual_boot_fn(Default, nrow(Default)))
# Compute standard errors manually
manual_se <- apply(resampled_coefs, 1, sd)
manual_se
## income balance
## 4.601931e-06 2.270060e-04
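For comparison, the boot package's usual idiom is an index-based statistic passed to boot(); this sketch should reproduce essentially the same standard errors as the manual loop above:
# Equivalent bootstrap via boot::boot (index-based statistic)
library(boot)
boot_fn <- function(data, index) {
  coef(glm(default ~ income + balance, data = data[index, ],
           family = binomial))[c("income", "balance")]
}
set.seed(123)
boot(Default, boot_fn, R = 1000)  # prints bootstrap SEs for both coefficients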
# Extract standard errors from glm() summary
glm_se <- summary(logit_model)$coefficients[, "Std. Error"][c("income", "balance")]
# Combine results into a table
se_comparison <- data.frame(
Method = c("GLM", "Manual Resampling"),
Income_SE = c(glm_se[1], manual_se[1]),
Balance_SE = c(glm_se[2], manual_se[2])
)
# Display table
knitr::kable(se_comparison)
| Method | Income_SE | Balance_SE |
|:---|---:|---:|
| GLM | 5.0e-06 | 0.0002274 |
| Manual Resampling | 4.6e-06 | 0.0002270 |
Question 9
# Load required libraries
library(ISLR2) # Boston dataset
library(MASS) # Additional statistical functions
library(boot) # Bootstrap functions
set.seed(123) # Set seed for reproducibility
# Load the Boston dataset
data(Boston)
# Compute the mean of medv
mu_hat <- mean(Boston$medv)
mu_hat
## [1] 22.53281
# Compute standard error manually (sample standard deviation / sqrt(n))
se_mu_hat <- sd(Boston$medv) / sqrt(nrow(Boston))
se_mu_hat
## [1] 0.4088611
# Define bootstrap function for mean
boot_mean_fn <- function(data, index) {
return(mean(data[index]))
}
# Perform bootstrap with 1000 resamples
boot_mean_results <- boot(Boston$medv, boot_mean_fn, R = 1000)
# Bootstrap standard error
boot_se_mu_hat <- sd(boot_mean_results$t)
boot_se_mu_hat
## [1] 0.4045557
# Approximate 95% CI using bootstrap standard error
ci_boot <- c(mu_hat - 2 * boot_se_mu_hat, mu_hat + 2 * boot_se_mu_hat)
# Compute confidence interval using t-test
ci_ttest <- t.test(Boston$medv)$conf.int
# Store results in a table
ci_comparison <- data.frame(
Method = c("Bootstrap", "t-test"),
Lower_Bound = c(ci_boot[1], ci_ttest[1]),
Upper_Bound = c(ci_boot[2], ci_ttest[2])
)
# Display table
knitr::kable(ci_comparison)
| Method | Lower_Bound | Upper_Bound |
|:---|---:|---:|
| Bootstrap | 21.72369 | 23.34192 |
| t-test | 21.72953 | 23.33608 |
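Rather than the rough mu_hat ± 2 * SE interval, boot.ci() can compute a percentile interval directly from the bootstrap replicates; a one-line sketch using the boot object above:
# Percentile bootstrap CI from the existing boot object
boot.ci(boot_mean_results, type = "perc")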
# Compute the median of medv
mu_med_hat <- median(Boston$medv)
mu_med_hat
## [1] 21.2
# Define bootstrap function for median
boot_median_fn <- function(data, index) {
return(median(data[index]))
}
# Perform bootstrap with 1000 resamples
boot_median_results <- boot(Boston$medv, boot_median_fn, R = 1000)
# Bootstrap standard error for median
boot_se_med_hat <- sd(boot_median_results$t)
boot_se_med_hat
## [1] 0.3783996
# Compute the 10th percentile
mu_10_hat <- quantile(Boston$medv, 0.1)
mu_10_hat
## 10%
## 12.75
# Define bootstrap function for 10th percentile
boot_percentile_fn <- function(data, index) {
return(quantile(data[index], 0.1))
}
# Perform bootstrap with 1000 resamples
boot_percentile_results <- boot(Boston$medv, boot_percentile_fn, R = 1000)
# Bootstrap standard error for 10th percentile
boot_se_10_hat <- sd(boot_percentile_results$t)
boot_se_10_hat
## [1] 0.5082798
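The same percentile-interval idea extends to the 10th-percentile estimate; a sketch using the boot object already computed:
# Percentile bootstrap CI for the 10th percentile of medv
boot.ci(boot_percentile_results, type = "perc")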