#Chapter 4 Question 16 – Boston Data: Classification Models
library(MASS)
library(class)
library(e1071)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
set.seed(123)
library(dplyr)
data("Boston")
# Create binary response variable
Boston$crim01 <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
Boston <- dplyr::select(Boston, -crim)
# View data
head(Boston)
## zn indus chas nox rm age dis rad tax ptratio black lstat medv
## 1 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
## 2 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
## 3 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
## 4 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
## 5 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2
## 6 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21 28.7
## crim01
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
train_index <- sample(1:nrow(Boston), size = 0.7 * nrow(Boston))
train <- Boston[train_index, ]
test <- Boston[-train_index, ]
train_x <- dplyr::select(train, -crim01)
test_x <- dplyr::select(test, -crim01)
train_y <- train$crim01
test_y <- test$crim01
log_model <- glm(crim01 ~ ., data = train, family = binomial)
log_probs <- predict(log_model, newdata = test, type = "response")
log_pred <- ifelse(log_probs > 0.5, 1, 0)
confusionMatrix(as.factor(log_pred), as.factor(test_y))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 64 6
## 1 11 71
##
## Accuracy : 0.8882
## 95% CI : (0.827, 0.9335)
## No Information Rate : 0.5066
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7761
##
## Mcnemar's Test P-Value : 0.332
##
## Sensitivity : 0.8533
## Specificity : 0.9221
## Pos Pred Value : 0.9143
## Neg Pred Value : 0.8659
## Prevalence : 0.4934
## Detection Rate : 0.4211
## Detection Prevalence : 0.4605
## Balanced Accuracy : 0.8877
##
## 'Positive' Class : 0
##
lda_model <- lda(crim01 ~ ., data = train)
lda_pred <- predict(lda_model, newdata = test)$class
confusionMatrix(lda_pred, as.factor(test_y))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 65 15
## 1 10 62
##
## Accuracy : 0.8355
## 95% CI : (0.7669, 0.8906)
## No Information Rate : 0.5066
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6713
##
## Mcnemar's Test P-Value : 0.4237
##
## Sensitivity : 0.8667
## Specificity : 0.8052
## Pos Pred Value : 0.8125
## Neg Pred Value : 0.8611
## Prevalence : 0.4934
## Detection Rate : 0.4276
## Detection Prevalence : 0.5263
## Balanced Accuracy : 0.8359
##
## 'Positive' Class : 0
##
nb_model <- naiveBayes(as.factor(crim01) ~ ., data = train)
nb_pred <- predict(nb_model, newdata = test)
confusionMatrix(nb_pred, as.factor(test_y))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 65 18
## 1 10 59
##
## Accuracy : 0.8158
## 95% CI : (0.7449, 0.874)
## No Information Rate : 0.5066
## P-Value [Acc > NIR] : 2.259e-15
##
## Kappa : 0.632
##
## Mcnemar's Test P-Value : 0.1859
##
## Sensitivity : 0.8667
## Specificity : 0.7662
## Pos Pred Value : 0.7831
## Neg Pred Value : 0.8551
## Prevalence : 0.4934
## Detection Rate : 0.4276
## Detection Prevalence : 0.5461
## Balanced Accuracy : 0.8165
##
## 'Positive' Class : 0
##
# Normalize predictors
train_norm <- scale(train_x)
test_norm <- scale(test_x, center = attr(train_norm, "scaled:center"),
scale = attr(train_norm, "scaled:scale"))
knn_pred <- knn(train = train_norm, test = test_norm, cl = train_y, k = 5)
confusionMatrix(knn_pred, as.factor(test_y))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 65 8
## 1 10 69
##
## Accuracy : 0.8816
## 95% CI : (0.8193, 0.9283)
## No Information Rate : 0.5066
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.763
##
## Mcnemar's Test P-Value : 0.8137
##
## Sensitivity : 0.8667
## Specificity : 0.8961
## Pos Pred Value : 0.8904
## Neg Pred Value : 0.8734
## Prevalence : 0.4934
## Detection Rate : 0.4276
## Detection Prevalence : 0.4803
## Balanced Accuracy : 0.8814
##
## 'Positive' Class : 0
##
#“Chapter 5 Question 5- Logistic Regression with Validation Set”
library(ISLR)
library(dplyr)
library(caret)
set.seed(42)
data("Default")
str(Default)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
summary(Default)
## default student balance income
## No :9667 No :7056 Min. : 0.0 Min. : 772
## Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
model1 <- glm(default ~ income + balance, data = Default, family = binomial)
summary(model1)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
# Step i: Randomly split 50% training, 50% validation
set.seed(100)
train_index <- sample(1:nrow(Default), nrow(Default) / 2)
train <- Default[train_index, ]
valid <- Default[-train_index, ]
# Step ii: Fit model
model2 <- glm(default ~ income + balance, data = train, family = binomial)
# Step iii: Predict on validation set
probs <- predict(model2, newdata = valid, type = "response")
preds <- ifelse(probs > 0.5, "Yes", "No")
# Step iv: Compute validation error
confusionMatrix(as.factor(preds), as.factor(valid$default))
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 4822 112
## Yes 20 46
##
## Accuracy : 0.9736
## 95% CI : (0.9688, 0.9779)
## No Information Rate : 0.9684
## P-Value [Acc > NIR] : 0.0176
##
## Kappa : 0.3995
##
## Mcnemar's Test P-Value : 2.365e-15
##
## Sensitivity : 0.9959
## Specificity : 0.2911
## Pos Pred Value : 0.9773
## Neg Pred Value : 0.6970
## Prevalence : 0.9684
## Detection Rate : 0.9644
## Detection Prevalence : 0.9868
## Balanced Accuracy : 0.6435
##
## 'Positive' Class : No
##
mean(preds != valid$default)
## [1] 0.0264
set.seed(123)
validation_errors <- c()
for (i in 1:3) {
split_idx <- sample(1:nrow(Default), nrow(Default)/2)
train_split <- Default[split_idx, ]
valid_split <- Default[-split_idx, ]
model_split <- glm(default ~ income + balance, data = train_split, family = binomial)
pred_split <- predict(model_split, newdata = valid_split, type = "response")
class_split <- ifelse(pred_split > 0.5, "Yes", "No")
error_rate <- mean(class_split != valid_split$default)
validation_errors <- c(validation_errors, error_rate)
}
validation_errors
## [1] 0.0276 0.0246 0.0262
mean(validation_errors)
## [1] 0.02613333
set.seed(456)
split_idx <- sample(1:nrow(Default), nrow(Default)/2)
train_stud <- Default[split_idx, ]
valid_stud <- Default[-split_idx, ]
model_stud <- glm(default ~ income + balance + student, data = train_stud, family = binomial)
pred_stud <- predict(model_stud, newdata = valid_stud, type = "response")
class_stud <- ifelse(pred_stud > 0.5, "Yes", "No")
confusionMatrix(as.factor(class_stud), as.factor(valid_stud$default))
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 4797 115
## Yes 33 55
##
## Accuracy : 0.9704
## 95% CI : (0.9653, 0.9749)
## No Information Rate : 0.966
## P-Value [Acc > NIR] : 0.04443
##
## Kappa : 0.4127
##
## Mcnemar's Test P-Value : 2.773e-11
##
## Sensitivity : 0.9932
## Specificity : 0.3235
## Pos Pred Value : 0.9766
## Neg Pred Value : 0.6250
## Prevalence : 0.9660
## Detection Rate : 0.9594
## Detection Prevalence : 0.9824
## Balanced Accuracy : 0.6583
##
## 'Positive' Class : No
##
mean(class_stud != valid_stud$default)
## [1] 0.0296
#Chapter 6 Question 9 - Regression Modeling on College Dataset
# Load all tidymodels packages
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.8 ✔ rsample 1.3.0
## ✔ dials 1.4.0 ✔ tibble 3.3.0
## ✔ infer 1.0.8 ✔ tidyr 1.3.1
## ✔ modeldata 1.4.0 ✔ tune 1.3.0
## ✔ parsnip 1.3.2 ✔ workflows 1.2.0
## ✔ purrr 1.0.4 ✔ workflowsets 1.1.1
## ✔ recipes 1.3.1 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ rsample::permutations() masks e1071::permutations()
## ✖ yardstick::precision() masks caret::precision()
## ✖ yardstick::recall() masks caret::recall()
## ✖ dplyr::select() masks MASS::select()
## ✖ yardstick::sensitivity() masks caret::sensitivity()
## ✖ yardstick::specificity() masks caret::specificity()
## ✖ recipes::step() masks stats::step()
## ✖ tune::tune() masks parsnip::tune(), e1071::tune()
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following object is masked _by_ '.GlobalEnv':
##
## Boston
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
## The following object is masked from 'package:MASS':
##
## Boston
# Set seed for reproducibility
set.seed(123)
# Load data
data("College")
college <- College
# Split into training and test sets (70% train)
split <- initial_split(college, prop = 0.7)
train_data <- training(split)
test_data <- testing(split)
lm_model <- linear_reg() %>%
set_engine("lm")
lm_recipe <- recipe(Apps ~ ., data = train_data)
lm_workflow <- workflow() %>%
add_model(lm_model) %>%
add_recipe(lm_recipe)
lm_fit <- lm_workflow %>% fit(data = train_data)
lm_preds <- predict(lm_fit, test_data) %>%
bind_cols(test_data)
lm_mse <- mean((lm_preds$.pred - lm_preds$Apps)^2)
cat("(b) Linear Regression Test MSE:", lm_mse, "\n")
## (b) Linear Regression Test MSE: 1734841
install.packages("glmnet")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
## also installing the dependency 'RcppEigen'
## Warning in install.packages("glmnet"): installation of package 'RcppEigen' had
## non-zero exit status
## Warning in install.packages("glmnet"): installation of package 'glmnet' had
## non-zero exit status
# Manual Ridge using closed-form solution
library(MASS)
# Create model matrix (remove intercept)
x <- model.matrix(Apps ~ ., data = train_data)[, -1]
y <- train_data$Apps
x_test <- model.matrix(Apps ~ ., data = test_data)[, -1]
y_test <- test_data$Apps
# Try a range of lambda values
lambdas <- 10^seq(10, -2, length = 100)
mse_values <- numeric(length(lambdas))
for (i in seq_along(lambdas)) {
lambda <- lambdas[i]
ridge_fit <- lm.ridge(Apps ~ ., data = train_data, lambda = lambda)
ridge_coefs <- coef(ridge_fit)
preds <- cbind(1, x_test) %*% ridge_coefs
mse_values[i] <- mean((y_test - preds)^2)
}
best_lambda <- lambdas[which.min(mse_values)]
best_mse <- min(mse_values)
cat("(c) Ridge Regression Test MSE (manual):", best_mse, "\n")
## (c) Ridge Regression Test MSE (manual): 1735254
cat(" Best lambda:", best_lambda, "\n")
## Best lambda: 0.01
library(ISLR2)
set.seed(123)
index <- sample(1:nrow(College), nrow(College) * 0.7)
train_data <- College[index, ]
test_data <- College[-index, ]
# Full model
full_model <- lm(Apps ~ ., data = train_data)
# Use base R's step function explicitly
step_model <- stats::step(full_model, direction = "both", trace = FALSE)
# Predict
preds <- predict(step_model, newdata = test_data)
# MSE
mse <- mean((preds - test_data$Apps)^2)
# Count predictors
num_nonzero <- sum(coef(step_model)[-1] != 0)
cat("(d) Stepwise Regression (Lasso-like) Test MSE:", mse, "\n")
## (d) Stepwise Regression (Lasso-like) Test MSE: 1713938
cat(" Number of predictors selected:", num_nonzero, "\n")
## Number of predictors selected: 11
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
# Fit PCR model with cross-validation
set.seed(123)
pcr_model <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
# Plot CV error vs number of components
print(validationplot(pcr_model, val.type = "MSEP"))
## NULL
# Find best number of components
msep_values <- MSEP(pcr_model)$val[1,1,]
best_M <- which.min(msep_values)
# Predict on test data
pcr_preds <- predict(pcr_model, newdata = test_data, ncomp = best_M)
# Test MSE
pcr_mse <- mean((pcr_preds - test_data$Apps)^2)
# Output
cat("(e) PCR Test MSE:", pcr_mse, "\n")
## (e) PCR Test MSE: 1734841
cat(" Optimal number of components (M):", best_M, "\n")
## Optimal number of components (M): 17
# Load pls package (already loaded if you've done PCR)
library(pls)
# Fit PLS model with cross-validation
set.seed(123)
pls_model <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
# Plot CV error vs number of components (optional)
validationplot(pls_model, val.type = "MSEP")
# Find best number of components (min MSEP)
pls_msep_values <- MSEP(pls_model)$val[1,1,]
best_pls_M <- which.min(pls_msep_values)
# Predict on test data
pls_preds <- predict(pls_model, newdata = test_data, ncomp = best_pls_M)
# Calculate test MSE
pls_mse <- mean((pls_preds - test_data$Apps)^2)
# Output
cat("(f) PLS Test MSE:", pls_mse, "\n")
## (f) PLS Test MSE: 1742582
cat(" Optimal number of components (M):", best_pls_M, "\n")
## Optimal number of components (M): 12
# (d) Stepwise Regression (Lasso-like) – RE-RUN THIS
full_model <- lm(Apps ~ ., data = train_data)
step_model <- stats::step(full_model, direction = "both", trace = FALSE)
step_preds <- predict(step_model, newdata = test_data)
step_mse <- mean((step_preds - test_data$Apps)^2)
# Automatically summarize model results
library(tibble)
library(dplyr)
library(glue)
model_results <- tibble(
Model = c("Linear Regression",
"Ridge Regression",
"Stepwise Regression",
"PCR",
"PLS"),
Test_MSE = c(
lm_mse, # from (b)
best_mse, # from Ridge
step_mse, # from Stepwise
pcr_mse, # from PCR
pls_mse # from PLS
)
)
model_results <- model_results %>% arrange(Test_MSE)
best_model <- model_results$Model[1]
best_mse <- model_results$Test_MSE[1]
worst_model <- model_results$Model[5]
worst_mse <- model_results$Test_MSE[5]
range_mse <- round(worst_mse - best_mse)
summary_text <- glue("
Among the five models compared, {best_model} achieved the lowest test MSE at {round(best_mse)}.
In contrast, {worst_model} had the highest MSE at {round(worst_mse)}, resulting in a difference of about {range_mse} in prediction error.
This suggests that while all models perform reasonably well in predicting the number of college applications,
the use of dimensionality reduction or variable selection can offer modest improvements.
The close performance across models also indicates that the predictors are informative,
and the problem does not suffer heavily from overfitting or multicollinearity.
Overall, {best_model} stands out as the most accurate model for this dataset.
")
cat(summary_text)
## Among the five models compared, Stepwise Regression achieved the lowest test MSE at 1713938.
## In contrast, PLS had the highest MSE at 1742582, resulting in a difference of about 28644 in prediction error.
## This suggests that while all models perform reasonably well in predicting the number of college applications,
## the use of dimensionality reduction or variable selection can offer modest improvements.
## The close performance across models also indicates that the predictors are informative,
## and the problem does not suffer heavily from overfitting or multicollinearity.
## Overall, Stepwise Regression stands out as the most accurate model for this dataset.