Homework5.1

#Q5

# Part A fit the model
lmfit <- glm(default ~ income + balance, data = Default, family = binomial)
summary(lmfit)

## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4725  -0.1444  -0.0574  -0.0211   3.7245  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

# Part B validation
set.seed(2)  
train_index <- sample(1:nrow(Default), nrow(Default) / 2)
train_data <- Default[train_index, ]
validation_data <- Default[-train_index, ]

# linear model on train data
lm_train <- glm(default ~ income + balance, data = train_data, family = binomial)



# predictions with the validation set
predict_probs <- predict(lm_train, newdata = validation_data, type = "response")

#binary yes or no classification
predict_class <- ifelse(predict_probs > 0.5, "Yes", "No")


#misclassification error
misclassification_error <- mean(predict_class != validation_data$default)
print(paste("Validation Set Error:", round(misclassification_error, 4)))

## [1] "Validation Set Error: 0.0238"

# Part C
# Repeat process 3 times but change seed

set.seed(3)
train_index2 <- sample(1:nrow(Default), nrow(Default) / 2)
train_data2 <- Default[train_index2, ]
validation_data2 <- Default[-train_index2, ]

lm_train2 <- glm(default ~ income + balance, data = train_data2, family = binomial)
predict_probs2 <- predict(lm_train2, newdata = validation_data2, type = "response")
predict_class2 <- ifelse(predict_probs2 > 0.5, "Yes", "No")
misclassification_error2 <- mean(predict_class2 != validation_data2$default)
print(paste("Validation Set Error (Run 2):", round(misclassification_error2, 4)))

## [1] "Validation Set Error (Run 2): 0.0264"

set.seed(4)
train_index3 <- sample(1:nrow(Default), nrow(Default) / 2)
train_data3 <- Default[train_index3, ]
validation_data3 <- Default[-train_index3, ]

lm_train3 <- glm(default ~ income + balance, data = train_data3, family = binomial)
predict_probs3 <- predict(lm_train3, newdata = validation_data3, type = "response")
predict_class3 <- ifelse(predict_probs3 > 0.5, "Yes", "No")
misclassification_error3 <- mean(predict_class3 != validation_data3$default)
print(paste("Validation Set Error (Run 3):", round(misclassification_error3, 4)))

## [1] "Validation Set Error (Run 3): 0.0256"

#Part D
# Regression w dummy variable "student"

set.seed(5)
train_index4 <- sample(1:nrow(Default), nrow(Default) / 2)
train_data4 <- Default[train_index4, ]
validation_data4 <- Default[-train_index4, ]

# including student
lm_train4 <- glm(default ~ income + balance + student, data = train_data4, family = binomial)


predict_probs4 <- predict(lm_train4, newdata = validation_data4, type = "response")
predict_class4 <- ifelse(predict_probs4 > 0.5, "Yes", "No")
misclassification_error4 <- mean(predict_class4 != validation_data4$default)

print(paste("Validation Set Error w/Student:", round(misclassification_error4, 4)))

## [1] "Validation Set Error w/Student: 0.029"

#Q8

# Part A
set.seed(1)
x <- rnorm(100)
y <- x - 2 * x^2 + rnorm(100)
data <- data.frame(x, y)

# x - 2 * x^2 + rnorm(100) model outline

# Part B
library(ggplot2)

data %>% 
  ggplot(aes(x = x, y = y)) +
  geom_point() +
  labs(title = "Scatterplot of X vs Y", x = "X", y = "Y") +
  theme_minimal()

# Part C ()
library(boot) 

loocv_error <- function(formula, data) {
  lm_fit <- glm(formula, data = data)
  cv_result <- cv.glm(data, lm_fit, K = nrow(data))
  return(cv_result$delta[1])
}


set.seed(2)  


loocv_1 <- loocv_error(y ~ x, data)
loocv_2 <- loocv_error(y ~ poly(x, 2), data)
loocv_3 <- loocv_error(y ~ poly(x, 3), data)
loocv_4 <- loocv_error(y ~ poly(x, 4), data)

# Print LOOCV Errors
loocv_results <- data.frame(
  Model = c("Linear", "Quadratic", "Cubic", "Quartic"),
  LOOCV_Error = c(loocv_1, loocv_2, loocv_3, loocv_4)
)

print(loocv_results)

##       Model LOOCV_Error
## 1    Linear   7.2881616
## 2 Quadratic   0.9374236
## 3     Cubic   0.9566218
## 4   Quartic   0.9539049

# Part C 2nd part and D/E
# Repeat w/ different seed

set.seed(3)  # Change seed

loocv_1_new <- loocv_error(y ~ x, data)
loocv_2_new <- loocv_error(y ~ poly(x, 2), data)
loocv_3_new <- loocv_error(y ~ poly(x, 3), data)
loocv_4_new <- loocv_error(y ~ poly(x, 4), data)

loocv_results_new <- data.frame(
  Model = c("Linear", "Quadratic", "Cubic", "Quartic"),
  LOOCV_Error = c(loocv_1_new, loocv_2_new, loocv_3_new, loocv_4_new)
)

print(loocv_results_new)

##       Model LOOCV_Error
## 1    Linear   7.2881616
## 2 Quadratic   0.9374236
## 3     Cubic   0.9566218
## 4   Quartic   0.9539049

# results are the same as before. Changing the seed doesn't matter.
# Quadratic has the lowest error of 0.9374236

#Q9

library(ISLR2)

## 
## Attaching package: 'ISLR2'

## The following object is masked from 'package:MASS':
## 
##     Boston

## The following objects are masked from 'package:ISLR':
## 
##     Auto, Credit

library(boot)

data("Boston")

medv <- Boston$medv

mu_hat <- mean(medv)
print(paste("Estimated Mean mu:", round(mu_hat, 2)))

## [1] "Estimated Mean mu: 22.53"

n <- length(medv)  # alternative to nrow()

stan_error_mu_hat <- sd(medv) / sqrt(n)  # standard error
print(paste("Standard Error of Mean:", round(stan_error_mu_hat, 4)))

## [1] "Standard Error of Mean: 0.4089"

boot_function <- function(data, index) {
  return(mean(data[index]))
}

# Perform bootstrap with 1,000 replications
set.seed(42)  # Ensure reproducibility
boot_results <- boot(medv, boot_function, R = 1000)

# Bootstrap estimate of standard error
boot_stan_error_mu_hat <- sd(boot_results$t)
print(paste("Bootstrap Standard Error of Mean:", round(boot_stan_error_mu_hat, 4)))

## [1] "Bootstrap Standard Error of Mean: 0.4009"

# confidence interval

ci_approx <- c(mu_hat - 2 * boot_stan_error_mu_hat, mu_hat + 2 * boot_stan_error_mu_hat)
print(paste("95% Confidence Interval:", round(ci_approx[1], 2), "to", round(ci_approx[2], 2)))

## [1] "95% Confidence Interval: 21.73 to 23.33"

# Compare with t-test confidence interval
t_test_ci <- t.test(medv)$conf.int
print(paste("95% Confidence Interval T-Test:", round(t_test_ci[1], 2), "to", round(t_test_ci[2], 2)))

## [1] "95% Confidence Interval T-Test: 21.73 to 23.34"

mu_hat_med <- median(medv)
print(paste("Estimated Median:", round(mu_hat_med, 2)))

## [1] "Estimated Median: 21.2"

# Bootstrap function - median
boot_median <- function(data, index) {
  return(median(data[index]))
}

# Perform bootstrap
set.seed(7)
boot_results_med <- boot(medv, boot_median, R = 1000)

# Standard error of the median
boot_stan_error_med <- sd(boot_results_med$t)
print(paste("Bootstrap Standard Error of Median:", round(boot_stan_error_med, 4)))

## [1] "Bootstrap Standard Error of Median: 0.3878"

mu_hat_0.1 <- quantile(medv, 0.1)
print(paste("Estimated 10th Percentile:", round(mu_hat_0.1, 2)))

## [1] "Estimated 10th Percentile: 12.75"

# Bootstrap function for the 10th percentile
boot_percentile <- function(data, index) {
  return(quantile(data[index], 0.1))
}

# Perform bootstrap
set.seed(46)
boot_results_p10 <- boot(medv, boot_percentile, R = 1000)

# Standard error of the 10th percentile
boot_stan_error_p10 <- sd(boot_results_p10$t)
print(paste("Bootstrap Standard Error 10th Percentile:", round(boot_stan_error_p10, 4)))

## [1] "Bootstrap Standard Error 10th Percentile: 0.5104"

Homework5.1

Dillon M

2025-03-28