#Q5
# Part A fit the model
lmfit <- glm(default ~ income + balance, data = Default, family = binomial)
summary(lmfit)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
# Part B validation
set.seed(2)
train_index <- sample(1:nrow(Default), nrow(Default) / 2)
train_data <- Default[train_index, ]
validation_data <- Default[-train_index, ]
# linear model on train data
lm_train <- glm(default ~ income + balance, data = train_data, family = binomial)
# predictions with the validation set
predict_probs <- predict(lm_train, newdata = validation_data, type = "response")
#binary yes or no classification
predict_class <- ifelse(predict_probs > 0.5, "Yes", "No")
#misclassification error
misclassification_error <- mean(predict_class != validation_data$default)
print(paste("Validation Set Error:", round(misclassification_error, 4)))
## [1] "Validation Set Error: 0.0238"
# Part C
# Repeat process 3 times but change seed
set.seed(3)
train_index2 <- sample(1:nrow(Default), nrow(Default) / 2)
train_data2 <- Default[train_index2, ]
validation_data2 <- Default[-train_index2, ]
lm_train2 <- glm(default ~ income + balance, data = train_data2, family = binomial)
predict_probs2 <- predict(lm_train2, newdata = validation_data2, type = "response")
predict_class2 <- ifelse(predict_probs2 > 0.5, "Yes", "No")
misclassification_error2 <- mean(predict_class2 != validation_data2$default)
print(paste("Validation Set Error (Run 2):", round(misclassification_error2, 4)))
## [1] "Validation Set Error (Run 2): 0.0264"
set.seed(4)
train_index3 <- sample(1:nrow(Default), nrow(Default) / 2)
train_data3 <- Default[train_index3, ]
validation_data3 <- Default[-train_index3, ]
lm_train3 <- glm(default ~ income + balance, data = train_data3, family = binomial)
predict_probs3 <- predict(lm_train3, newdata = validation_data3, type = "response")
predict_class3 <- ifelse(predict_probs3 > 0.5, "Yes", "No")
misclassification_error3 <- mean(predict_class3 != validation_data3$default)
print(paste("Validation Set Error (Run 3):", round(misclassification_error3, 4)))
## [1] "Validation Set Error (Run 3): 0.0256"
#Part D
# Regression w dummy variable "student"
set.seed(5)
train_index4 <- sample(1:nrow(Default), nrow(Default) / 2)
train_data4 <- Default[train_index4, ]
validation_data4 <- Default[-train_index4, ]
# including student
lm_train4 <- glm(default ~ income + balance + student, data = train_data4, family = binomial)
predict_probs4 <- predict(lm_train4, newdata = validation_data4, type = "response")
predict_class4 <- ifelse(predict_probs4 > 0.5, "Yes", "No")
misclassification_error4 <- mean(predict_class4 != validation_data4$default)
print(paste("Validation Set Error w/Student:", round(misclassification_error4, 4)))
## [1] "Validation Set Error w/Student: 0.029"
#Q8
# Part A
set.seed(1)
x <- rnorm(100)
y <- x - 2 * x^2 + rnorm(100)
data <- data.frame(x, y)
# x - 2 * x^2 + rnorm(100) model outline
# Part B
library(ggplot2)
data %>%
ggplot(aes(x = x, y = y)) +
geom_point() +
labs(title = "Scatterplot of X vs Y", x = "X", y = "Y") +
theme_minimal()
# Part C ()
library(boot)
loocv_error <- function(formula, data) {
lm_fit <- glm(formula, data = data)
cv_result <- cv.glm(data, lm_fit, K = nrow(data))
return(cv_result$delta[1])
}
set.seed(2)
loocv_1 <- loocv_error(y ~ x, data)
loocv_2 <- loocv_error(y ~ poly(x, 2), data)
loocv_3 <- loocv_error(y ~ poly(x, 3), data)
loocv_4 <- loocv_error(y ~ poly(x, 4), data)
# Print LOOCV Errors
loocv_results <- data.frame(
Model = c("Linear", "Quadratic", "Cubic", "Quartic"),
LOOCV_Error = c(loocv_1, loocv_2, loocv_3, loocv_4)
)
print(loocv_results)
## Model LOOCV_Error
## 1 Linear 7.2881616
## 2 Quadratic 0.9374236
## 3 Cubic 0.9566218
## 4 Quartic 0.9539049
# Part C 2nd part and D/E
# Repeat w/ different seed
set.seed(3) # Change seed
loocv_1_new <- loocv_error(y ~ x, data)
loocv_2_new <- loocv_error(y ~ poly(x, 2), data)
loocv_3_new <- loocv_error(y ~ poly(x, 3), data)
loocv_4_new <- loocv_error(y ~ poly(x, 4), data)
loocv_results_new <- data.frame(
Model = c("Linear", "Quadratic", "Cubic", "Quartic"),
LOOCV_Error = c(loocv_1_new, loocv_2_new, loocv_3_new, loocv_4_new)
)
print(loocv_results_new)
## Model LOOCV_Error
## 1 Linear 7.2881616
## 2 Quadratic 0.9374236
## 3 Cubic 0.9566218
## 4 Quartic 0.9539049
# results are the same as before. Changing the seed doesn't matter.
# Quadratic has the lowest error of 0.9374236
#Q9
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following object is masked from 'package:MASS':
##
## Boston
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
library(boot)
data("Boston")
medv <- Boston$medv
mu_hat <- mean(medv)
print(paste("Estimated Mean mu:", round(mu_hat, 2)))
## [1] "Estimated Mean mu: 22.53"
n <- length(medv) # alternative to nrow()
stan_error_mu_hat <- sd(medv) / sqrt(n) # standard error
print(paste("Standard Error of Mean:", round(stan_error_mu_hat, 4)))
## [1] "Standard Error of Mean: 0.4089"
boot_function <- function(data, index) {
return(mean(data[index]))
}
# Perform bootstrap with 1,000 replications
set.seed(42) # Ensure reproducibility
boot_results <- boot(medv, boot_function, R = 1000)
# Bootstrap estimate of standard error
boot_stan_error_mu_hat <- sd(boot_results$t)
print(paste("Bootstrap Standard Error of Mean:", round(boot_stan_error_mu_hat, 4)))
## [1] "Bootstrap Standard Error of Mean: 0.4009"
# confidence interval
ci_approx <- c(mu_hat - 2 * boot_stan_error_mu_hat, mu_hat + 2 * boot_stan_error_mu_hat)
print(paste("95% Confidence Interval:", round(ci_approx[1], 2), "to", round(ci_approx[2], 2)))
## [1] "95% Confidence Interval: 21.73 to 23.33"
# Compare with t-test confidence interval
t_test_ci <- t.test(medv)$conf.int
print(paste("95% Confidence Interval T-Test:", round(t_test_ci[1], 2), "to", round(t_test_ci[2], 2)))
## [1] "95% Confidence Interval T-Test: 21.73 to 23.34"
mu_hat_med <- median(medv)
print(paste("Estimated Median:", round(mu_hat_med, 2)))
## [1] "Estimated Median: 21.2"
# Bootstrap function - median
boot_median <- function(data, index) {
return(median(data[index]))
}
# Perform bootstrap
set.seed(7)
boot_results_med <- boot(medv, boot_median, R = 1000)
# Standard error of the median
boot_stan_error_med <- sd(boot_results_med$t)
print(paste("Bootstrap Standard Error of Median:", round(boot_stan_error_med, 4)))
## [1] "Bootstrap Standard Error of Median: 0.3878"
mu_hat_0.1 <- quantile(medv, 0.1)
print(paste("Estimated 10th Percentile:", round(mu_hat_0.1, 2)))
## [1] "Estimated 10th Percentile: 12.75"
# Bootstrap function for the 10th percentile
boot_percentile <- function(data, index) {
return(quantile(data[index], 0.1))
}
# Perform bootstrap
set.seed(46)
boot_results_p10 <- boot(medv, boot_percentile, R = 1000)
# Standard error of the 10th percentile
boot_stan_error_p10 <- sd(boot_results_p10$t)
print(paste("Bootstrap Standard Error 10th Percentile:", round(boot_stan_error_p10, 4)))
## [1] "Bootstrap Standard Error 10th Percentile: 0.5104"