# Summary of advantages and disadvantages
validation_set_comparison <- data.frame(
Method = c("Validation Set", "k-Fold Cross-Validation"),
Advantages = c("Simple and fast", "More reliable performance estimate"),
Disadvantages = c("High variance, depends on a single split", "More computationally expensive")
)
knitr::kable(validation_set_comparison)
| Method | Advantages | Disadvantages |
|:---|:---|:---|
| Validation Set | Simple and fast | High variance, depends on a single split |
| k-Fold Cross-Validation | More reliable performance estimate | More computationally expensive |
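To make the comparison concrete, below is a minimal sketch of 10-fold cross-validation with boot::cv.glm, assuming the Default data used in Question 5; the cost function (an illustration, not part of the original answer) converts fitted probabilities into a misclassification rate.
# Minimal sketch of 10-fold CV with boot::cv.glm on the Default data
# (assumed setup for illustration; not part of the original answer)
library(ISLR2)
library(boot)
set.seed(1)
cv_fit <- glm(default ~ income + balance, data = Default, family = binomial)
mis_cost <- function(y, prob) mean(abs(y - prob) > 0.5)  # misclassification at 0.5
cv.glm(Default, cv_fit, cost = mis_cost, K = 10)$delta[1]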
# Summary of k-Fold vs LOOCV
loocv_comparison <- data.frame(
Method = c("LOOCV", "k-Fold Cross-Validation"),
Advantages = c("Uses all data for training, lower bias", "Less computationally expensive, lower variance"),
Disadvantages = c("Computationally expensive", "Choice of k affects bias-variance tradeoff")
)
knitr::kable(loocv_comparison)
| Method | Advantages | Disadvantages |
|:---|:---|:---|
| LOOCV | Uses all data for training, lower bias | Computationally expensive |
| k-Fold Cross-Validation | Less computationally expensive, lower variance | Choice of k affects bias-variance tradeoff |
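LOOCV is the K = n special case, which cv.glm uses by default when K is omitted. A small sketch on the Auto data (chosen here only for illustration, since LOOCV on all 10,000 Default rows would be slow):
# LOOCV sketch: cv.glm defaults to K = n when K is not supplied
library(ISLR2)
library(boot)
loocv_fit <- glm(mpg ~ horsepower, data = Auto)  # linear regression via glm()
cv.glm(Auto, loocv_fit)$delta[1]                 # LOOCV estimate of test MSE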
Question 5
# Load required libraries
library(ISLR2) # Dataset
library(MASS) # Statistical functions (note: glm() itself is in base stats)
library(caret) # Data splitting
## Loading required package: ggplot2
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
library(ggplot2) # Visualization
set.seed(123) # Set seed for reproducibility
# Load Default dataset
data(Default)
# Fit logistic regression model
logit_model <- glm(default ~ income + balance, data = Default, family = binomial)
# Display summary of model
summary(logit_model)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
# Split the dataset into training (50%) and validation (50%) sets
train_index <- createDataPartition(Default$default, p = 0.5, list = FALSE)
train_data <- Default[train_index, ]
valid_data <- Default[-train_index, ]
# Fit logistic regression model on training data
train_logit <- glm(default ~ income + balance, data = train_data, family = binomial)
# Summary of trained model
summary(train_logit)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.158e+01 6.209e-01 -18.656 < 2e-16 ***
## income 2.296e-05 7.206e-06 3.187 0.00144 **
## balance 5.617e-03 3.200e-04 17.555 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1463.76 on 5000 degrees of freedom
## Residual deviance: 791.05 on 4998 degrees of freedom
## AIC: 797.05
##
## Number of Fisher Scoring iterations: 8
# Predict probability of default on validation data
valid_probs <- predict(train_logit, valid_data, type = "response")
# Classify based on probability threshold of 0.5
valid_preds <- ifelse(valid_probs > 0.5, "Yes", "No")
# Convert to factor for comparison
valid_preds <- factor(valid_preds, levels = levels(Default$default))
# Compute misclassification rate
error_rate <- mean(valid_preds != valid_data$default)
error_rate
## [1] 0.02740548
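Since caret is already loaded, its confusionMatrix() can break this error rate down by class; an optional sketch, not required for the answer above:
# Optional: per-class breakdown of the validation predictions
confusionMatrix(valid_preds, valid_data$default, positive = "Yes")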
set.seed(456) # New seed for different split
train_index2 <- createDataPartition(Default$default, p = 0.5, list = FALSE)
train_data2 <- Default[train_index2, ]
valid_data2 <- Default[-train_index2, ]
train_logit2 <- glm(default ~ income + balance, data = train_data2, family = binomial)
valid_probs2 <- predict(train_logit2, valid_data2, type = "response")
valid_preds2 <- factor(ifelse(valid_probs2 > 0.5, "Yes", "No"), levels = levels(Default$default))
error_rate2 <- mean(valid_preds2 != valid_data2$default)
set.seed(789) # Another different split
train_index3 <- createDataPartition(Default$default, p = 0.5, list = FALSE)
train_data3 <- Default[train_index3, ]
valid_data3 <- Default[-train_index3, ]
train_logit3 <- glm(default ~ income + balance, data = train_data3, family = binomial)
valid_probs3 <- predict(train_logit3, valid_data3, type = "response")
valid_preds3 <- factor(ifelse(valid_probs3 > 0.5, "Yes", "No"), levels = levels(Default$default))
error_rate3 <- mean(valid_preds3 != valid_data3$default)
# Store and display results
error_rates <- data.frame(Split = c("First", "Second", "Third"), Error_Rate = c(error_rate, error_rate2, error_rate3))
knitr::kable(error_rates)
| Split | Error_Rate |
|:---|---:|
| First | 0.0274055 |
| Second | 0.0242048 |
| Third | 0.0266053 |
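The three splits repeat the same split/fit/predict steps; a hypothetical helper such as validation_error() below (a sketch, not part of the original code) would compute all three error rates in one call:
# Hypothetical helper wrapping the repeated validation-set steps
validation_error <- function(seed) {
  set.seed(seed)
  idx <- createDataPartition(Default$default, p = 0.5, list = FALSE)
  fit <- glm(default ~ income + balance, data = Default[idx, ], family = binomial)
  probs <- predict(fit, Default[-idx, ], type = "response")
  preds <- factor(ifelse(probs > 0.5, "Yes", "No"), levels = levels(Default$default))
  mean(preds != Default[-idx, "default"])
}
sapply(c(123, 456, 789), validation_error)  # one error rate per split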
# Fit model including student dummy variable
train_logit_student <- glm(default ~ income + balance + student, data = train_data, family = binomial)
# Predict and compute error rate
valid_probs_student <- predict(train_logit_student, valid_data, type = "response")
valid_preds_student <- factor(ifelse(valid_probs_student > 0.5, "Yes", "No"), levels = levels(Default$default))
error_rate_student <- mean(valid_preds_student != valid_data$default)
# Compare test error rates
error_comparison <- data.frame(Model = c("Without Student", "With Student"),
Error_Rate = c(error_rate, error_rate_student))
knitr::kable(error_comparison)
| Model | Error_Rate |
|:---|---:|
| Without Student | 0.0274055 |
| With Student | 0.0278056 |
Question 6
# Load Default dataset
data(Default)
# Fit logistic regression model
logit_model <- glm(default ~ income + balance, data = Default, family = binomial)
# Display standard errors from summary
summary(logit_model)$coefficients[, "Std. Error"]
## (Intercept) income balance
## 4.347564e-01 4.985167e-06 2.273731e-04
# Function to refit the model on one bootstrap resample (rows drawn with replacement)
manual_boot_fn <- function(data, sample_size) {
sampled_index <- sample(1:nrow(data), size = sample_size, replace = TRUE)
model <- glm(default ~ income + balance, data = data[sampled_index, ], family = binomial)
return(coef(model)[c("income", "balance")])
}
# Perform resampling manually (1000 iterations)
num_samples <- 1000
resampled_coefs <- replicate(num_samples, manual_boot_fn(Default, nrow(Default)))
# Compute standard errors manually
manual_se <- apply(resampled_coefs, 1, sd)
manual_se
## income balance
## 4.601931e-06 2.270060e-04
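For comparison, the boot package's usual idiom is an index-based statistic passed to boot(); this sketch should reproduce essentially the same standard errors as the manual loop above:
# Equivalent bootstrap via boot::boot (index-based statistic)
library(boot)
boot_fn <- function(data, index) {
  coef(glm(default ~ income + balance, data = data[index, ],
           family = binomial))[c("income", "balance")]
}
set.seed(123)
boot(Default, boot_fn, R = 1000)  # prints bootstrap SEs for both coefficients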
# Extract standard errors from glm() summary
glm_se <- summary(logit_model)$coefficients[, "Std. Error"][c("income", "balance")]
# Combine results into a table
se_comparison <- data.frame(
Method = c("GLM", "Manual Resampling"),
Income_SE = c(glm_se[1], manual_se[1]),
Balance_SE = c(glm_se[2], manual_se[2])
)
# Display table
knitr::kable(se_comparison)
| Method | Income_SE | Balance_SE |
|:---|---:|---:|
| GLM | 5.0e-06 | 0.0002274 |
| Manual Resampling | 4.6e-06 | 0.0002270 |
Question 9
# Load required libraries
library(ISLR2) # Boston dataset
library(MASS) # Additional statistical functions
library(boot) # Bootstrap functions
set.seed(123) # Set seed for reproducibility
# Load the Boston dataset
data(Boston)
# Compute the mean of medv
mu_hat <- mean(Boston$medv)
mu_hat
## [1] 22.53281
# Compute standard error manually (sample standard deviation / sqrt(n))
se_mu_hat <- sd(Boston$medv) / sqrt(nrow(Boston))
se_mu_hat
## [1] 0.4088611
# Define bootstrap function for mean
boot_mean_fn <- function(data, index) {
return(mean(data[index]))
}
# Perform bootstrap with 1000 resamples
boot_mean_results <- boot(Boston$medv, boot_mean_fn, R = 1000)
# Bootstrap standard error
boot_se_mu_hat <- sd(boot_mean_results$t)
boot_se_mu_hat
## [1] 0.4045557
# Approximate 95% CI using bootstrap standard error
ci_boot <- c(mu_hat - 2 * boot_se_mu_hat, mu_hat + 2 * boot_se_mu_hat)
# Compute confidence interval using t-test
ci_ttest <- t.test(Boston$medv)$conf.int
# Store results in a table
ci_comparison <- data.frame(
Method = c("Bootstrap", "t-test"),
Lower_Bound = c(ci_boot[1], ci_ttest[1]),
Upper_Bound = c(ci_boot[2], ci_ttest[2])
)
# Display table
knitr::kable(ci_comparison)
| Method | Lower_Bound | Upper_Bound |
|:---|---:|---:|
| Bootstrap | 21.72369 | 23.34192 |
| t-test | 21.72953 | 23.33608 |
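Rather than the rough mu_hat ± 2 * SE interval, boot.ci() can compute a percentile interval directly from the bootstrap replicates; a one-line sketch using the boot object above:
# Percentile bootstrap CI from the existing boot object
boot.ci(boot_mean_results, type = "perc")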
# Compute the median of medv
mu_med_hat <- median(Boston$medv)
mu_med_hat
## [1] 21.2
# Define bootstrap function for median
boot_median_fn <- function(data, index) {
return(median(data[index]))
}
# Perform bootstrap with 1000 resamples
boot_median_results <- boot(Boston$medv, boot_median_fn, R = 1000)
# Bootstrap standard error for median
boot_se_med_hat <- sd(boot_median_results$t)
boot_se_med_hat
## [1] 0.3783996
# Compute the 10th percentile
mu_10_hat <- quantile(Boston$medv, 0.1)
mu_10_hat
## 10%
## 12.75
# Define bootstrap function for 10th percentile
boot_percentile_fn <- function(data, index) {
return(quantile(data[index], 0.1))
}
# Perform bootstrap with 1000 resamples
boot_percentile_results <- boot(Boston$medv, boot_percentile_fn, R = 1000)
# Bootstrap standard error for 10th percentile
boot_se_10_hat <- sd(boot_percentile_results$t)
boot_se_10_hat
## [1] 0.5082798
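The same percentile-interval idea extends to the 10th-percentile estimate; a sketch using the boot object already computed:
# Percentile bootstrap CI for the 10th percentile of medv
boot.ci(boot_percentile_results, type = "perc")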