library(ISLR2)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(class)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)

Chapter 4, Number 9

# Load the Boston dataset
data("Boston")

# Create a binary response variable: 1 if crime rate > median, else 0
Boston <- Boston %>%
  mutate(crim_binary = ifelse(crim > median(crim), 1, 0)) %>%
  select(-crim)  # Remove original crim to avoid leakage

Split the Data into Training and Test Sets

set.seed(1)
train_index <- createDataPartition(Boston$crim_binary, p = 0.7, list = FALSE)
train <- Boston[train_index, ]
test <- Boston[-train_index, ]

Logistic Regression

log_model <- glm(crim_binary ~ ., data = train, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
log_probs <- predict(log_model, test, type = "response")
log_preds <- ifelse(log_probs > 0.5, 1, 0)

# Accuracy
log_acc <- mean(log_preds == test$crim_binary)
log_acc
## [1] 0.88
# Logistic Regression using a subset of predictors (rm, lstat, dis)
log_model_subset <- glm(crim_binary ~ rm + lstat + dis, data = train, family = binomial)
log_probs_subset <- predict(log_model_subset, test, type = "response")
log_preds_subset <- ifelse(log_probs_subset > 0.5, 1, 0)

# Accuracy for subset model
log_subset_acc <- mean(log_preds_subset == test$crim_binary)
log_subset_acc
## [1] 0.7666667

Linear Discriminant Analysis (LDA)

lda_model <- lda(crim_binary ~ ., data = train)
lda_preds <- predict(lda_model, test)$class

# Accuracy
lda_acc <- mean(lda_preds == test$crim_binary)
lda_acc
## [1] 0.84
lda_model_subset <- lda(crim_binary ~ rm + lstat + dis, data = train)
lda_preds_subset <- predict(lda_model_subset, test)$class
lda_subset_acc <- mean(lda_preds_subset == test$crim_binary)
lda_subset_acc
## [1] 0.76

Naive Bayes

nb_model <- naiveBayes(as.factor(crim_binary) ~ ., data = train)
nb_preds <- predict(nb_model, test)

# Accuracy
nb_acc <- mean(nb_preds == test$crim_binary)
nb_acc
## [1] 0.8333333
nb_model_subset <- naiveBayes(as.factor(crim_binary) ~ rm + lstat + dis, data = train)
nb_preds_subset <- predict(nb_model_subset, test)
nb_subset_acc <- mean(nb_preds_subset == test$crim_binary)
nb_subset_acc
## [1] 0.78

K-Nearest Neighbors (KNN)

# Standardize features
train_scaled <- scale(train[ , -which(names(train) == "crim_binary")])
test_scaled <- scale(test[ , -which(names(test) == "crim_binary")])

train_labels <- train$crim_binary
test_labels <- test$crim_binary

# Try k = 1, 5, 10
knn1 <- knn(train_scaled, test_scaled, train_labels, k = 1)
knn5 <- knn(train_scaled, test_scaled, train_labels, k = 5)
knn10 <- knn(train_scaled, test_scaled, train_labels, k = 10)

knn1_acc <- mean(knn1 == test_labels)
knn5_acc <- mean(knn5 == test_labels)
knn10_acc <- mean(knn10 == test_labels)

knn1_acc
## [1] 0.9
knn5_acc
## [1] 0.8666667
knn10_acc
## [1] 0.8466667
train_knn_subset <- scale(train[, c("rm", "lstat", "dis")])
test_knn_subset <- scale(test[, c("rm", "lstat", "dis")])

knn_subset1 <- knn(train_knn_subset, test_knn_subset, train_labels, k = 1)
knn_subset5 <- knn(train_knn_subset, test_knn_subset, train_labels, k = 5)
knn_subset10 <- knn(train_knn_subset, test_knn_subset, train_labels, k = 10)

knn_subset1_acc <- mean(knn_subset1 == test_labels)
knn_subset5_acc <- mean(knn_subset5 == test_labels)
knn_subset10_acc <- mean(knn_subset10 == test_labels)

knn_subset1_acc
## [1] 0.7266667
knn_subset5_acc
## [1] 0.7866667
knn_subset10_acc
## [1] 0.8066667

Summary of Findings

cat("Accuracy Results:\n")
## Accuracy Results:
cat("Logistic Regression: ", round(log_acc, 3), "\n")
## Logistic Regression:  0.88
cat("LDA: ", round(lda_acc, 3), "\n")
## LDA:  0.84
cat("Naive Bayes: ", round(nb_acc, 3), "\n")
## Naive Bayes:  0.833
cat("KNN (k=1): ", round(knn1_acc, 3), "\n")
## KNN (k=1):  0.9
cat("KNN (k=5): ", round(knn5_acc, 3), "\n")
## KNN (k=5):  0.867
cat("KNN (k=10): ", round(knn10_acc, 3), "\n")
## KNN (k=10):  0.847
cat("Subset Accuracy Results:\n")
## Subset Accuracy Results:
cat("Logistic Regression (subset): ", round(log_subset_acc, 3), "\n")
## Logistic Regression (subset):  0.767
cat("LDA (subset): ", round(lda_subset_acc, 3), "\n")
## LDA (subset):  0.76
cat("Naive Bayes (subset): ", round(nb_subset_acc, 3), "\n")
## Naive Bayes (subset):  0.78
cat("KNN (subset, k=1): ", round(knn_subset1_acc, 3), "\n")
## KNN (subset, k=1):  0.727
cat("KNN (subset, k=5): ", round(knn_subset5_acc, 3), "\n")
## KNN (subset, k=5):  0.787
cat("KNN (subset, k=10): ", round(knn_subset10_acc, 3), "\n")
## KNN (subset, k=10):  0.807
# Confusion matrix for logistic regression (full model)
confusionMatrix(as.factor(log_preds), as.factor(test$crim_binary))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 68 11
##          1  7 64
##                                          
##                Accuracy : 0.88           
##                  95% CI : (0.817, 0.9273)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.76           
##                                          
##  Mcnemar's Test P-Value : 0.4795         
##                                          
##             Sensitivity : 0.9067         
##             Specificity : 0.8533         
##          Pos Pred Value : 0.8608         
##          Neg Pred Value : 0.9014         
##              Prevalence : 0.5000         
##          Detection Rate : 0.4533         
##    Detection Prevalence : 0.5267         
##       Balanced Accuracy : 0.8800         
##                                          
##        'Positive' Class : 0              
## 
# Confusion matrix for LDA
confusionMatrix(as.factor(lda_preds), as.factor(test$crim_binary))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 68 17
##          1  7 58
##                                           
##                Accuracy : 0.84            
##                  95% CI : (0.7714, 0.8947)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.68            
##                                           
##  Mcnemar's Test P-Value : 0.06619         
##                                           
##             Sensitivity : 0.9067          
##             Specificity : 0.7733          
##          Pos Pred Value : 0.8000          
##          Neg Pred Value : 0.8923          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4533          
##    Detection Prevalence : 0.5667          
##       Balanced Accuracy : 0.8400          
##                                           
##        'Positive' Class : 0               
## 
# Confusion matrix for Naive Bayes
confusionMatrix(as.factor(nb_preds), as.factor(test$crim_binary))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 66 16
##          1  9 59
##                                           
##                Accuracy : 0.8333          
##                  95% CI : (0.7639, 0.8891)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6667          
##                                           
##  Mcnemar's Test P-Value : 0.2301          
##                                           
##             Sensitivity : 0.8800          
##             Specificity : 0.7867          
##          Pos Pred Value : 0.8049          
##          Neg Pred Value : 0.8676          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4400          
##    Detection Prevalence : 0.5467          
##       Balanced Accuracy : 0.8333          
##                                           
##        'Positive' Class : 0               
## 
# Confusion matrix for KNN (k=5)
confusionMatrix(as.factor(knn5), as.factor(test$crim_binary))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 70 15
##          1  5 60
##                                           
##                Accuracy : 0.8667          
##                  95% CI : (0.8016, 0.9166)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.7333          
##                                           
##  Mcnemar's Test P-Value : 0.04417         
##                                           
##             Sensitivity : 0.9333          
##             Specificity : 0.8000          
##          Pos Pred Value : 0.8235          
##          Neg Pred Value : 0.9231          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4667          
##    Detection Prevalence : 0.5667          
##       Balanced Accuracy : 0.8667          
##                                           
##        'Positive' Class : 0               
## 

Confusion Matrix for Logistic Regression (Full Model)

# Store the confusionMatrix object
log_cm <- confusionMatrix(as.factor(log_preds), as.factor(test$crim_binary))

# Extract the table
log_cm_table <- as.data.frame(log_cm$table)

# Create the heatmap
ggplot(log_cm_table, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "white", high = "steelblue") +
  geom_text(aes(label = Freq), vjust = 1, size = 6, color = "black") + # Adjust size and color as needed
  labs(title = "Confusion Matrix: Logistic Regression (Full Model)",
       x = "Actual Class",
       y = "Predicted Class") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"), # Center and bold title
        axis.title = element_text(size = 12),
        axis.text = element_text(size = 10))

LDA (Full Model)

lda_cm <- confusionMatrix(as.factor(lda_preds), as.factor(test$crim_binary))
lda_cm_table <- as.data.frame(lda_cm$table)

ggplot(lda_cm_table, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "white", high = "darkgreen") + # Different color for variety
  geom_text(aes(label = Freq), vjust = 1, size = 6, color = "black") +
  labs(title = "Confusion Matrix: LDA (Full Model)",
       x = "Actual Class",
       y = "Predicted Class") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        axis.title = element_text(size = 12),
        axis.text = element_text(size = 10))

Naive Bayes (Full Model):

nb_cm <- confusionMatrix(as.factor(nb_preds), as.factor(test$crim_binary))
nb_cm_table <- as.data.frame(nb_cm$table)

ggplot(nb_cm_table, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "white", high = "purple") +
  geom_text(aes(label = Freq), vjust = 1, size = 6, color = "black") +
  labs(title = "Confusion Matrix: Naive Bayes (Full Model)",
       x = "Actual Class",
       y = "Predicted Class") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        axis.title = element_text(size = 12),
        axis.text = element_text(size = 10))

KNN (k=5) (Full Model):

knn5_cm <- confusionMatrix(as.factor(knn5), as.factor(test$crim_binary))
knn5_cm_table <- as.data.frame(knn5_cm$table)

ggplot(knn5_cm_table, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "white", high = "orange") +
  geom_text(aes(label = Freq), vjust = 1, size = 6, color = "black") +
  labs(title = "Confusion Matrix: KNN (k=5, Full Model)",
       x = "Actual Class",
       y = "Predicted Class") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        axis.title = element_text(size = 12),
        axis.text = element_text(size = 10))

confusionMatrix(as.factor(lda_preds_subset), as.factor(test$crim_binary))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 53 14
##          1 22 61
##                                           
##                Accuracy : 0.76            
##                  95% CI : (0.6835, 0.8259)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 6.115e-11       
##                                           
##                   Kappa : 0.52            
##                                           
##  Mcnemar's Test P-Value : 0.2433          
##                                           
##             Sensitivity : 0.7067          
##             Specificity : 0.8133          
##          Pos Pred Value : 0.7910          
##          Neg Pred Value : 0.7349          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3533          
##    Detection Prevalence : 0.4467          
##       Balanced Accuracy : 0.7600          
##                                           
##        'Positive' Class : 0               
## 
confusionMatrix(as.factor(nb_preds_subset), as.factor(test$crim_binary))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 53 11
##          1 22 64
##                                           
##                Accuracy : 0.78            
##                  95% CI : (0.7051, 0.8435)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 1.603e-12       
##                                           
##                   Kappa : 0.56            
##                                           
##  Mcnemar's Test P-Value : 0.08172         
##                                           
##             Sensitivity : 0.7067          
##             Specificity : 0.8533          
##          Pos Pred Value : 0.8281          
##          Neg Pred Value : 0.7442          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3533          
##    Detection Prevalence : 0.4267          
##       Balanced Accuracy : 0.7800          
##                                           
##        'Positive' Class : 0               
## 
confusionMatrix(as.factor(knn_subset5), as.factor(test$crim_binary))  # example for KNN k=5
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 63 20
##          1 12 55
##                                           
##                Accuracy : 0.7867          
##                  95% CI : (0.7124, 0.8493)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 4.42e-13        
##                                           
##                   Kappa : 0.5733          
##                                           
##  Mcnemar's Test P-Value : 0.2159          
##                                           
##             Sensitivity : 0.8400          
##             Specificity : 0.7333          
##          Pos Pred Value : 0.7590          
##          Neg Pred Value : 0.8209          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4200          
##    Detection Prevalence : 0.5533          
##       Balanced Accuracy : 0.7867          
##                                           
##        'Positive' Class : 0               
## 

Conclusion

In this analysis, we evaluated multiple classification methods to predict whether a neighborhood in Boston has a high or low crime rate, using both full and reduced feature sets. Among all models tested, K-Nearest Neighbors (K = 1) achieved the highest accuracy (90%) on the full feature set, highlighting its ability to capture local patterns in the data. Logistic Regression, LDA, and Naive Bayes also performed well but showed a moderate decline in accuracy when using only selected predictors. This suggests that including more features can improve predictive performance, although simpler models still offer reasonably strong results. Overall, KNN showed promising performance but may benefit from tuning and validation to avoid overfitting.

Chapter 5, number 5

# Set a seed for reproducibility
set.seed(1)

# Check the structure
str(Default)
## 'data.frame':    10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...

(a) Fit a logistic regression model using income and balance to predict default

# Fit logistic regression model
logit_model <- glm(default ~ income + balance, data = Default, family = "binomial")

# Summary of model
summary(logit_model)
## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

(b) Validation Set Approach

set.seed(1)  # for reproducibility

# Create train and validation set
train_indices <- sample(1:nrow(Default), nrow(Default) / 2)
train_data <- Default[train_indices, ]
valid_data <- Default[-train_indices, ]

# Fit model on training set
logit_model_val <- glm(default ~ income + balance, data = train_data, family = "binomial")

# Predict probabilities on validation set
pred_probs <- predict(logit_model_val, newdata = valid_data, type = "response")

# Convert to binary prediction using 0.5 cutoff
pred_class <- ifelse(pred_probs > 0.5, "Yes", "No")

# Validation error (fraction misclassified)
val_error <- mean(pred_class != valid_data$default)
val_error
## [1] 0.0254

(c) Repeat (b) three times with different splits

set.seed(1)
split_error <- function(seed_val) {
  set.seed(seed_val)
  train_indices <- sample(1:nrow(Default), nrow(Default) / 2)
  train_data <- Default[train_indices, ]
  valid_data <- Default[-train_indices, ]

  model <- glm(default ~ income + balance, data = train_data, family = "binomial")
  pred_probs <- predict(model, newdata = valid_data, type = "response")
  pred_class <- ifelse(pred_probs > 0.5, "Yes", "No")
  error_rate <- mean(pred_class != valid_data$default)
  return(error_rate)
}

#three different seeds/splits
errors <- c(
  split_error(1),
  split_error(2),
  split_error(3)
)

errors
## [1] 0.0254 0.0238 0.0264
mean(errors)  # average test error across the 3 splits
## [1] 0.0252

Add student as a dummy variable to the model and check error

split_error_with_student <- function(seed_val) {
  set.seed(seed_val)
  train_indices <- sample(1:nrow(Default), nrow(Default) / 2)
  train_data <- Default[train_indices, ]
  valid_data <- Default[-train_indices, ]

  model <- glm(default ~ income + balance + student, data = train_data, family = "binomial")
  pred_probs <- predict(model, newdata = valid_data, type = "response")
  pred_class <- ifelse(pred_probs > 0.5, "Yes", "No")
  error_rate <- mean(pred_class != valid_data$default)
  return(error_rate)
}

# Test error with student variable
errors_with_student <- c(
  split_error_with_student(1),
  split_error_with_student(2),
  split_error_with_student(3)
)

errors_with_student
## [1] 0.0260 0.0246 0.0272
mean(errors_with_student)
## [1] 0.02593333

comment

Including the student variable in the logistic regression model does not reduce the test error. In fact, the average validation set error increased slightly, suggesting that student is not a significant predictor of default beyond what is already explained by balance and income

Chapter 6, number 9

library(ISLR2)
library(tidyverse)
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-9
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings
# Set seed for reproducibility
set.seed(1)

# View structure
str(College)
## 'data.frame':    777 obs. of  18 variables:
##  $ Private    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Apps       : num  1660 2186 1428 417 193 ...
##  $ Accept     : num  1232 1924 1097 349 146 ...
##  $ Enroll     : num  721 512 336 137 55 158 103 489 227 172 ...
##  $ Top10perc  : num  23 16 22 60 16 38 17 37 30 21 ...
##  $ Top25perc  : num  52 29 50 89 44 62 45 68 63 44 ...
##  $ F.Undergrad: num  2885 2683 1036 510 249 ...
##  $ P.Undergrad: num  537 1227 99 63 869 ...
##  $ Outstate   : num  7440 12280 11250 12960 7560 ...
##  $ Room.Board : num  3300 6450 3750 5450 4120 ...
##  $ Books      : num  450 750 400 450 800 500 500 450 300 660 ...
##  $ Personal   : num  2200 1500 1165 875 1500 ...
##  $ PhD        : num  70 29 53 92 76 67 90 89 79 40 ...
##  $ Terminal   : num  78 30 66 97 72 73 93 100 84 41 ...
##  $ S.F.Ratio  : num  18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
##  $ perc.alumni: num  12 16 30 37 2 11 26 37 23 15 ...
##  $ Expend     : num  7041 10527 8735 19016 10922 ...
##  $ Grad.Rate  : num  60 56 54 59 15 55 63 73 80 52 ...

(a) Split Data into Training and Test Sets

# Split into train/test sets
train_indices <- sample(1:nrow(College), nrow(College) / 2)
train_data <- College[train_indices, ]
test_data <- College[-train_indices, ]

(b) Linear Model Using Least Squares

lm_fit <- lm(Apps ~ ., data = train_data)
lm_preds <- predict(lm_fit, newdata = test_data)
lm_mse <- mean((lm_preds - test_data$Apps)^2)
lm_mse
## [1] 1135758

(c) Ridge Regression (α = 0) with Cross-Validation

x_train <- model.matrix(Apps ~ ., data = train_data)[, -1]
x_test <- model.matrix(Apps ~ ., data = test_data)[, -1]
y_train <- train_data$Apps
y_test <- test_data$Apps

cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0)
best_lambda_ridge <- cv_ridge$lambda.min

ridge_preds <- predict(cv_ridge, s = best_lambda_ridge, newx = x_test)
ridge_mse <- mean((ridge_preds - y_test)^2)

ridge_mse
## [1] 976261.5

(d) Lasso Regression (α = 1) with Cross-Validation

cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1)
best_lambda_lasso <- cv_lasso$lambda.min

lasso_preds <- predict(cv_lasso, s = best_lambda_lasso, newx = x_test)
lasso_mse <- mean((lasso_preds - y_test)^2)

# Number of non-zero coefficients
lasso_coef <- predict(cv_lasso, s = best_lambda_lasso, type = "coefficients")
non_zero_lasso <- sum(lasso_coef != 0) - 1  # exclude intercept

lasso_mse
## [1] 1115901
non_zero_lasso
## [1] 17

(e) PCR (Principal Components Regression)

set.seed(1)
pcr_fit <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
validationplot(pcr_fit, val.type = "MSEP")

# Choose best M
pcr_pred <- predict(pcr_fit, newdata = test_data, ncomp = which.min(pcr_fit$validation$PRESS))
pcr_mse <- mean((pcr_pred - test_data$Apps)^2)

best_M_pcr <- which.min(pcr_fit$validation$PRESS)
pcr_mse
## [1] 1135758
best_M_pcr
## [1] 17

(f) PLS (Partial Least Squares)

set.seed(1)
pls_fit <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
validationplot(pls_fit, val.type = "MSEP")

pls_pred <- predict(pls_fit, newdata = test_data, ncomp = which.min(pls_fit$validation$PRESS))
pls_mse <- mean((pls_pred - test_data$Apps)^2)

best_M_pls <- which.min(pls_fit$validation$PRESS)
pls_mse
## [1] 1135758
best_M_pls
## [1] 17

Predicting Credit Card Default using Machine Learning Models in Finance (What I learned in class)

library(ISLR2)
library(tidyverse)
library(caret)
library(MASS)
library(class)
library(glmnet)
library(pls)
library(e1071)

Load Packages and Data

data(Default)
str(Default)
## 'data.frame':    10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...
summary(Default)
##  default    student       balance           income     
##  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
##  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##                        Median : 823.6   Median :34553  
##                        Mean   : 835.4   Mean   :33517  
##                        3rd Qu.:1166.3   3rd Qu.:43808  
##                        Max.   :2654.3   Max.   :73554
data(Default)
str(Default)
## 'data.frame':    10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...
summary(Default)
##  default    student       balance           income     
##  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
##  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##                        Median : 823.6   Median :34553  
##                        Mean   : 835.4   Mean   :33517  
##                        3rd Qu.:1166.3   3rd Qu.:43808  
##                        Max.   :2654.3   Max.   :73554

Data Preprocessing

# Convert student to numeric
Default$student <- as.factor(Default$student)
Default$default <- as.factor(Default$default)

# Train-test split
set.seed(123)
train_index <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_data <- Default[train_index, ]
test_data <- Default[-train_index, ]

Models and Implementation

Logistic Regression

log_model <- glm(default ~ balance + income + student, data = train_data, family = "binomial")
summary(log_model)
## 
## Call:
## glm(formula = default ~ balance + income + student, family = "binomial", 
##     data = train_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.048e+01  5.782e-01 -18.122  < 2e-16 ***
## balance      5.638e-03  2.733e-04  20.628  < 2e-16 ***
## income      -5.519e-07  9.660e-06  -0.057  0.95444    
## studentYes  -8.937e-01  2.774e-01  -3.221  0.00128 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2050.6  on 7000  degrees of freedom
## Residual deviance: 1128.8  on 6997  degrees of freedom
## AIC: 1136.8
## 
## Number of Fisher Scoring iterations: 8
log_probs <- predict(log_model, test_data, type = "response")
log_pred <- ifelse(log_probs > 0.5, "Yes", "No")

log_pred <- factor(log_pred, levels = c("No", "Yes"))
actual <- factor(test_data$default, levels = c("No", "Yes"))

confusionMatrix(log_pred, actual)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  2884   63
##        Yes   16   36
##                                           
##                Accuracy : 0.9737          
##                  95% CI : (0.9673, 0.9791)
##     No Information Rate : 0.967           
##     P-Value [Acc > NIR] : 0.0204          
##                                           
##                   Kappa : 0.4646          
##                                           
##  Mcnemar's Test P-Value : 2.274e-07       
##                                           
##             Sensitivity : 0.9945          
##             Specificity : 0.3636          
##          Pos Pred Value : 0.9786          
##          Neg Pred Value : 0.6923          
##              Prevalence : 0.9670          
##          Detection Rate : 0.9617          
##    Detection Prevalence : 0.9827          
##       Balanced Accuracy : 0.6791          
##                                           
##        'Positive' Class : No              
## 

Linear Discriminant Analysis (LDA)

lda_model <- lda(default ~ balance + income + student, data = train_data)
lda_pred <- predict(lda_model, test_data)$class

lda_pred <- factor(lda_pred, levels = c("No", "Yes"))
actual <- factor(test_data$default, levels = c("No", "Yes"))

confusionMatrix(lda_pred, actual)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  2891   71
##        Yes    9   28
##                                           
##                Accuracy : 0.9733          
##                  95% CI : (0.9669, 0.9788)
##     No Information Rate : 0.967           
##     P-Value [Acc > NIR] : 0.02642         
##                                           
##                   Kappa : 0.401           
##                                           
##  Mcnemar's Test P-Value : 9.104e-12       
##                                           
##             Sensitivity : 0.9969          
##             Specificity : 0.2828          
##          Pos Pred Value : 0.9760          
##          Neg Pred Value : 0.7568          
##              Prevalence : 0.9670          
##          Detection Rate : 0.9640          
##    Detection Prevalence : 0.9877          
##       Balanced Accuracy : 0.6399          
##                                           
##        'Positive' Class : No              
## 

K-Nearest Neighbors (KNN)

# Normalize predictors
train_X <- scale(train_data[, c("balance", "income")])
test_X <- scale(test_data[, c("balance", "income")], center = attr(train_X, "scaled:center"), scale = attr(train_X, "scaled:scale"))

train_Y <- train_data$default
test_Y <- test_data$default

knn_pred <- knn(train_X, test_X, train_Y, k = 5)

knn_pred <- factor(knn_pred, levels = c("No", "Yes"))
actual <- factor(test_Y, levels = c("No", "Yes"))

confusionMatrix(knn_pred, actual)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  2870   63
##        Yes   30   36
##                                           
##                Accuracy : 0.969           
##                  95% CI : (0.9621, 0.9749)
##     No Information Rate : 0.967           
##     P-Value [Acc > NIR] : 0.2907542       
##                                           
##                   Kappa : 0.4211          
##                                           
##  Mcnemar's Test P-Value : 0.0009058       
##                                           
##             Sensitivity : 0.9897          
##             Specificity : 0.3636          
##          Pos Pred Value : 0.9785          
##          Neg Pred Value : 0.5455          
##              Prevalence : 0.9670          
##          Detection Rate : 0.9570          
##    Detection Prevalence : 0.9780          
##       Balanced Accuracy : 0.6766          
##                                           
##        'Positive' Class : No              
## 

Lasso and Ridge Regression (Classification with glmnet)

# Prepare design matrix
x <- model.matrix(default ~ balance + income + student, data = Default)[, -1]
y <- Default$default

x_train <- x[train_index, ]
y_train <- y[train_index]
x_test <- x[-train_index, ]
y_test <- y[-train_index]

# Ridge Regression (alpha = 0)
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0, family = "binomial")
ridge_pred <- predict(cv_ridge, s = "lambda.min", newx = x_test, type = "class")
ridge_pred <- factor(ridge_pred, levels = c("No", "Yes"))
y_test <- factor(y_test, levels = c("No", "Yes"))
confusionMatrix(ridge_pred, y_test)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  2898   91
##        Yes    2    8
##                                           
##                Accuracy : 0.969           
##                  95% CI : (0.9621, 0.9749)
##     No Information Rate : 0.967           
##     P-Value [Acc > NIR] : 0.2908          
##                                           
##                   Kappa : 0.1416          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.99931         
##             Specificity : 0.08081         
##          Pos Pred Value : 0.96956         
##          Neg Pred Value : 0.80000         
##              Prevalence : 0.96699         
##          Detection Rate : 0.96632         
##    Detection Prevalence : 0.99667         
##       Balanced Accuracy : 0.54006         
##                                           
##        'Positive' Class : No              
## 
# Lasso Regression (alpha = 1)
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1, family = "binomial")
lasso_pred <- predict(cv_lasso, s = "lambda.min", newx = x_test, type = "class")
lasso_pred <- factor(lasso_pred, levels = c("No", "Yes"))
confusionMatrix(lasso_pred, y_test)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  2887   63
##        Yes   13   36
##                                         
##                Accuracy : 0.9747        
##                  95% CI : (0.9684, 0.98)
##     No Information Rate : 0.967         
##     P-Value [Acc > NIR] : 0.008743      
##                                         
##                   Kappa : 0.475         
##                                         
##  Mcnemar's Test P-Value : 1.902e-08     
##                                         
##             Sensitivity : 0.9955        
##             Specificity : 0.3636        
##          Pos Pred Value : 0.9786        
##          Neg Pred Value : 0.7347        
##              Prevalence : 0.9670        
##          Detection Rate : 0.9627        
##    Detection Prevalence : 0.9837        
##       Balanced Accuracy : 0.6796        
##                                         
##        'Positive' Class : No            
## 

Principal Component Analysis (PCA)

pca_data <- Default[, c("balance", "income")]
pca_result <- prcomp(pca_data, scale. = TRUE)
summary(pca_result)
## Importance of components:
##                           PC1    PC2
## Standard deviation     1.0734 0.9207
## Proportion of Variance 0.5761 0.4239
## Cumulative Proportion  0.5761 1.0000
biplot(pca_result)

Apply SMOTE

library(smotefamily)

train_data$default_num <- ifelse(train_data$default == "Yes", 1, 0)

smote_output <- SMOTE(
  X = train_data[, c("balance", "income")], 
  target = train_data$default_num,
  K = 5
)

# Combine result into a new data frame
smote_data <- smote_output$data
smote_data$default <- ifelse(smote_data$class == 1, "Yes", "No")
smote_data$default <- as.factor(smote_data$default)

Logistic Regression on SMOTE data

smote_data$student <- sample(train_data$student, size = nrow(smote_data), replace = TRUE)

# Fit logistic regression
log_model_smote <- glm(default ~ balance + income + student, data = smote_data, family = "binomial")

# Predict on original test data
log_probs_smote <- predict(log_model_smote, test_data, type = "response")
log_pred_smote <- ifelse(log_probs_smote > 0.5, "Yes", "No")
log_pred_smote <- factor(log_pred_smote, levels = c("No", "Yes"))

# Evaluate
conf_matrix_smote <- confusionMatrix(log_pred_smote, actual)
conf_matrix_smote
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  2533   10
##        Yes  367   89
##                                          
##                Accuracy : 0.8743         
##                  95% CI : (0.8619, 0.886)
##     No Information Rate : 0.967          
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.2818         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.8734         
##             Specificity : 0.8990         
##          Pos Pred Value : 0.9961         
##          Neg Pred Value : 0.1952         
##              Prevalence : 0.9670         
##          Detection Rate : 0.8446         
##    Detection Prevalence : 0.8479         
##       Balanced Accuracy : 0.8862         
##                                          
##        'Positive' Class : No             
## 
# Save all confusion matrices
cm_logistic <- confusionMatrix(log_pred, actual)
cm_lda      <- confusionMatrix(lda_pred, actual)
cm_knn      <- confusionMatrix(knn_pred, actual)
cm_ridge    <- confusionMatrix(ridge_pred, y_test)
cm_lasso    <- confusionMatrix(lasso_pred, y_test)
cm_smote    <- conf_matrix_smote

# Convert to tidy data frame
cm_to_df <- function(cm, model_name) {
  df <- as.data.frame(as.table(cm$table))
  colnames(df) <- c("Predicted", "Actual", "Freq")
  df$Model <- model_name
  return(df)
}


cm_all <- bind_rows(
  cm_to_df(cm_logistic, "Logistic"),
  cm_to_df(cm_lda, "LDA"),
  cm_to_df(cm_knn, "KNN"),
  cm_to_df(cm_ridge, "Ridge"),
  cm_to_df(cm_lasso, "Lasso"),
  cm_to_df(cm_smote, "Logistic (SMOTE)")
)

# Plot confusion matrices
ggplot(cm_all, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), size = 4) +
  scale_fill_gradient(low = "white", high = "steelblue") +
  theme_minimal() +
  labs(title = "Confusion Matrices for All Models") +
  facet_wrap(~ Model, ncol = 3)

Comparison Table

results <- tibble::tibble(
  Model = c("Logistic", "LDA", "KNN", "Ridge", "Lasso", "Logistic (SMOTE)"),
  Accuracy = c(0.9737, 0.9733, 0.969, 0.969, 0.9747, conf_matrix_smote$overall["Accuracy"]),
  Specificity = c(0.3636, 0.2828, 0.3636, 0.0808, 0.3636, conf_matrix_smote$byClass["Specificity"]),
  Sensitivity = c(0.9945, 0.9969, 0.9897, 0.9993, 0.9955, conf_matrix_smote$byClass["Sensitivity"])
)

knitr::kable(results, digits = 4)
Model Accuracy Specificity Sensitivity
Logistic 0.9737 0.3636 0.9945
LDA 0.9733 0.2828 0.9969
KNN 0.9690 0.3636 0.9897
Ridge 0.9690 0.0808 0.9993
Lasso 0.9747 0.3636 0.9955
Logistic (SMOTE) 0.8743 0.8990 0.8734
library(ggplot2)

# Plot SMOTE result
ggplot(smote_data, aes(x = balance, y = income, color = default)) +
  geom_point(alpha = 0.6, size = 1.5) +
  scale_color_manual(values = c("No" = "steelblue", "Yes" = "firebrick")) +
  theme_minimal() +
  labs(
    title = "SMOTE Oversampling: Class Distribution After Resampling",
    x = "Balance",
    y = "Income",
    color = "Default"
  )