Chapter 4

library(MASS)       
library(class)      
library(e1071)      
library(caret)      
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)   
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
set.seed(123)
library(dplyr)
data("Boston")

Boston$crim01 <- ifelse(Boston$crim > median(Boston$crim), 1, 0)

Boston <- dplyr::select(Boston, -crim)

head(Boston)
##   zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat medv
## 1 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98 24.0
## 2  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14 21.6
## 3  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03 34.7
## 4  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94 33.4
## 5  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33 36.2
## 6  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21 28.7
##   crim01
## 1      0
## 2      0
## 3      0
## 4      0
## 5      0
## 6      0
train_index <- sample(1:nrow(Boston), size = 0.7 * nrow(Boston))
train <- Boston[train_index, ]
test <- Boston[-train_index, ]

train_x <- dplyr::select(train, -crim01)
test_x <- dplyr::select(test, -crim01)
train_y <- train$crim01
test_y <- test$crim01
log_model <- glm(crim01 ~ ., data = train, family = binomial)
log_probs <- predict(log_model, newdata = test, type = "response")
log_pred <- ifelse(log_probs > 0.5, 1, 0)

confusionMatrix(as.factor(log_pred), as.factor(test_y))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 64  6
##          1 11 71
##                                          
##                Accuracy : 0.8882         
##                  95% CI : (0.827, 0.9335)
##     No Information Rate : 0.5066         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.7761         
##                                          
##  Mcnemar's Test P-Value : 0.332          
##                                          
##             Sensitivity : 0.8533         
##             Specificity : 0.9221         
##          Pos Pred Value : 0.9143         
##          Neg Pred Value : 0.8659         
##              Prevalence : 0.4934         
##          Detection Rate : 0.4211         
##    Detection Prevalence : 0.4605         
##       Balanced Accuracy : 0.8877         
##                                          
##        'Positive' Class : 0              
## 
lda_model <- lda(crim01 ~ ., data = train)
lda_pred <- predict(lda_model, newdata = test)$class

confusionMatrix(lda_pred, as.factor(test_y))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 65 15
##          1 10 62
##                                           
##                Accuracy : 0.8355          
##                  95% CI : (0.7669, 0.8906)
##     No Information Rate : 0.5066          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6713          
##                                           
##  Mcnemar's Test P-Value : 0.4237          
##                                           
##             Sensitivity : 0.8667          
##             Specificity : 0.8052          
##          Pos Pred Value : 0.8125          
##          Neg Pred Value : 0.8611          
##              Prevalence : 0.4934          
##          Detection Rate : 0.4276          
##    Detection Prevalence : 0.5263          
##       Balanced Accuracy : 0.8359          
##                                           
##        'Positive' Class : 0               
## 
nb_model <- naiveBayes(as.factor(crim01) ~ ., data = train)
nb_pred <- predict(nb_model, newdata = test)

confusionMatrix(nb_pred, as.factor(test_y))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 65 18
##          1 10 59
##                                          
##                Accuracy : 0.8158         
##                  95% CI : (0.7449, 0.874)
##     No Information Rate : 0.5066         
##     P-Value [Acc > NIR] : 2.259e-15      
##                                          
##                   Kappa : 0.632          
##                                          
##  Mcnemar's Test P-Value : 0.1859         
##                                          
##             Sensitivity : 0.8667         
##             Specificity : 0.7662         
##          Pos Pred Value : 0.7831         
##          Neg Pred Value : 0.8551         
##              Prevalence : 0.4934         
##          Detection Rate : 0.4276         
##    Detection Prevalence : 0.5461         
##       Balanced Accuracy : 0.8165         
##                                          
##        'Positive' Class : 0              
## 
train_norm <- scale(train_x)
test_norm <- scale(test_x, center = attr(train_norm, "scaled:center"), 
                              scale = attr(train_norm, "scaled:scale"))

knn_pred <- knn(train = train_norm, test = test_norm, cl = train_y, k = 5)

confusionMatrix(knn_pred, as.factor(test_y))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 65  8
##          1 10 69
##                                           
##                Accuracy : 0.8816          
##                  95% CI : (0.8193, 0.9283)
##     No Information Rate : 0.5066          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.763           
##                                           
##  Mcnemar's Test P-Value : 0.8137          
##                                           
##             Sensitivity : 0.8667          
##             Specificity : 0.8961          
##          Pos Pred Value : 0.8904          
##          Neg Pred Value : 0.8734          
##              Prevalence : 0.4934          
##          Detection Rate : 0.4276          
##    Detection Prevalence : 0.4803          
##       Balanced Accuracy : 0.8814          
##                                           
##        'Positive' Class : 0               
## 

Chapter 5

library(ISLR)
library(dplyr)
library(caret)
set.seed(42)
data("Default")
str(Default)
## 'data.frame':    10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...
summary(Default)
##  default    student       balance           income     
##  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
##  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##                        Median : 823.6   Median :34553  
##                        Mean   : 835.4   Mean   :33517  
##                        3rd Qu.:1166.3   3rd Qu.:43808  
##                        Max.   :2654.3   Max.   :73554
model1 <- glm(default ~ income + balance, data = Default, family = binomial)
summary(model1)
## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8
set.seed(100)
train_index <- sample(1:nrow(Default), nrow(Default) / 2)
train <- Default[train_index, ]
valid <- Default[-train_index, ]

model2 <- glm(default ~ income + balance, data = train, family = binomial)

probs <- predict(model2, newdata = valid, type = "response")
preds <- ifelse(probs > 0.5, "Yes", "No")

confusionMatrix(as.factor(preds), as.factor(valid$default))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  4822  112
##        Yes   20   46
##                                           
##                Accuracy : 0.9736          
##                  95% CI : (0.9688, 0.9779)
##     No Information Rate : 0.9684          
##     P-Value [Acc > NIR] : 0.0176          
##                                           
##                   Kappa : 0.3995          
##                                           
##  Mcnemar's Test P-Value : 2.365e-15       
##                                           
##             Sensitivity : 0.9959          
##             Specificity : 0.2911          
##          Pos Pred Value : 0.9773          
##          Neg Pred Value : 0.6970          
##              Prevalence : 0.9684          
##          Detection Rate : 0.9644          
##    Detection Prevalence : 0.9868          
##       Balanced Accuracy : 0.6435          
##                                           
##        'Positive' Class : No              
## 
mean(preds != valid$default)
## [1] 0.0264
set.seed(123)
validation_errors <- c()

for (i in 1:3) {
  split_idx <- sample(1:nrow(Default), nrow(Default)/2)
  train_split <- Default[split_idx, ]
  valid_split <- Default[-split_idx, ]
  
  model_split <- glm(default ~ income + balance, data = train_split, family = binomial)
  pred_split <- predict(model_split, newdata = valid_split, type = "response")
  class_split <- ifelse(pred_split > 0.5, "Yes", "No")
  
  error_rate <- mean(class_split != valid_split$default)
  validation_errors <- c(validation_errors, error_rate)
}

validation_errors
## [1] 0.0276 0.0246 0.0262
mean(validation_errors)
## [1] 0.02613333
set.seed(456)
split_idx <- sample(1:nrow(Default), nrow(Default)/2)
train_stud <- Default[split_idx, ]
valid_stud <- Default[-split_idx, ]

model_stud <- glm(default ~ income + balance + student, data = train_stud, family = binomial)
pred_stud <- predict(model_stud, newdata = valid_stud, type = "response")
class_stud <- ifelse(pred_stud > 0.5, "Yes", "No")

confusionMatrix(as.factor(class_stud), as.factor(valid_stud$default))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  4797  115
##        Yes   33   55
##                                           
##                Accuracy : 0.9704          
##                  95% CI : (0.9653, 0.9749)
##     No Information Rate : 0.966           
##     P-Value [Acc > NIR] : 0.04443         
##                                           
##                   Kappa : 0.4127          
##                                           
##  Mcnemar's Test P-Value : 2.773e-11       
##                                           
##             Sensitivity : 0.9932          
##             Specificity : 0.3235          
##          Pos Pred Value : 0.9766          
##          Neg Pred Value : 0.6250          
##              Prevalence : 0.9660          
##          Detection Rate : 0.9594          
##    Detection Prevalence : 0.9824          
##       Balanced Accuracy : 0.6583          
##                                           
##        'Positive' Class : No              
## 
mean(class_stud != valid_stud$default)
## [1] 0.0296
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom        1.0.8     ✔ rsample      1.3.0
## ✔ dials        1.4.0     ✔ tibble       3.3.0
## ✔ infer        1.0.8     ✔ tidyr        1.3.1
## ✔ modeldata    1.4.0     ✔ tune         1.3.0
## ✔ parsnip      1.3.2     ✔ workflows    1.2.0
## ✔ purrr        1.0.4     ✔ workflowsets 1.1.1
## ✔ recipes      1.3.1     ✔ yardstick    1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard()         masks scales::discard()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ purrr::lift()            masks caret::lift()
## ✖ rsample::permutations()  masks e1071::permutations()
## ✖ yardstick::precision()   masks caret::precision()
## ✖ yardstick::recall()      masks caret::recall()
## ✖ dplyr::select()          masks MASS::select()
## ✖ yardstick::sensitivity() masks caret::sensitivity()
## ✖ yardstick::specificity() masks caret::specificity()
## ✖ recipes::step()          masks stats::step()
## ✖ tune::tune()             masks parsnip::tune(), e1071::tune()
library(ISLR2)
## 
## Attaching package: 'ISLR2'
## The following object is masked _by_ '.GlobalEnv':
## 
##     Boston
## The following objects are masked from 'package:ISLR':
## 
##     Auto, Credit
## The following object is masked from 'package:MASS':
## 
##     Boston
set.seed(123)

data("College")
college <- College

split <- initial_split(college, prop = 0.7)
train_data <- training(split)
test_data <- testing(split)
lm_model <- linear_reg() %>% 
  set_engine("lm")

lm_recipe <- recipe(Apps ~ ., data = train_data)

lm_workflow <- workflow() %>% 
  add_model(lm_model) %>% 
  add_recipe(lm_recipe)

lm_fit <- lm_workflow %>% fit(data = train_data)

lm_preds <- predict(lm_fit, test_data) %>% 
  bind_cols(test_data)

lm_mse <- mean((lm_preds$.pred - lm_preds$Apps)^2)
cat("(b) Linear Regression Test MSE:", lm_mse, "\n")
## (b) Linear Regression Test MSE: 1734841
install.packages("glmnet")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
## also installing the dependency 'RcppEigen'
## Warning in install.packages("glmnet"): installation of package 'RcppEigen' had
## non-zero exit status
library(MASS)

x <- model.matrix(Apps ~ ., data = train_data)[, -1]
y <- train_data$Apps
x_test <- model.matrix(Apps ~ ., data = test_data)[, -1]
y_test <- test_data$Apps

lambdas <- 10^seq(10, -2, length = 100)
mse_values <- numeric(length(lambdas))

for (i in seq_along(lambdas)) {
  lambda <- lambdas[i]
  ridge_fit <- lm.ridge(Apps ~ ., data = train_data, lambda = lambda)
  ridge_coefs <- coef(ridge_fit)
  preds <- cbind(1, x_test) %*% ridge_coefs
  mse_values[i] <- mean((y_test - preds)^2)
}

best_lambda <- lambdas[which.min(mse_values)]
best_mse <- min(mse_values)

cat("(c) Ridge Regression Test MSE (manual):", best_mse, "\n")
## (c) Ridge Regression Test MSE (manual): 1735254
cat("    Best lambda:", best_lambda, "\n")
##     Best lambda: 0.01
library(ISLR2)

set.seed(123)
index <- sample(1:nrow(College), nrow(College) * 0.7)
train_data <- College[index, ]
test_data <- College[-index, ]

full_model <- lm(Apps ~ ., data = train_data)

step_model <- stats::step(full_model, direction = "both", trace = FALSE)

preds <- predict(step_model, newdata = test_data)

mse <- mean((preds - test_data$Apps)^2)

num_nonzero <- sum(coef(step_model)[-1] != 0)

cat("(d) Stepwise Regression (Lasso-like) Test MSE:", mse, "\n")
## (d) Stepwise Regression (Lasso-like) Test MSE: 1713938
cat("    Number of predictors selected:", num_nonzero, "\n")
##     Number of predictors selected: 11
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings
set.seed(123)
pcr_model <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")

print(validationplot(pcr_model, val.type = "MSEP"))

## NULL
msep_values <- MSEP(pcr_model)$val[1,1,]
best_M <- which.min(msep_values)

pcr_preds <- predict(pcr_model, newdata = test_data, ncomp = best_M)

pcr_mse <- mean((pcr_preds - test_data$Apps)^2)

cat("(e) PCR Test MSE:", pcr_mse, "\n")
## (e) PCR Test MSE: 1734841
cat("    Optimal number of components (M):", best_M, "\n")
##     Optimal number of components (M): 17
library(pls)

set.seed(123)
pls_model <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")

validationplot(pls_model, val.type = "MSEP")

pls_msep_values <- MSEP(pls_model)$val[1,1,]
best_pls_M <- which.min(pls_msep_values)

pls_preds <- predict(pls_model, newdata = test_data, ncomp = best_pls_M)

pls_mse <- mean((pls_preds - test_data$Apps)^2)

cat("(f) PLS Test MSE:", pls_mse, "\n")
## (f) PLS Test MSE: 1742582
cat("    Optimal number of components (M):", best_pls_M, "\n")
##     Optimal number of components (M): 12
full_model <- lm(Apps ~ ., data = train_data)
step_model <- stats::step(full_model, direction = "both", trace = FALSE)

step_preds <- predict(step_model, newdata = test_data)
step_mse <- mean((step_preds - test_data$Apps)^2)
library(tibble)
library(dplyr)
library(glue)

model_results <- tibble(
  Model = c("Linear Regression", 
            "Ridge Regression", 
            "Stepwise Regression", 
            "PCR", 
            "PLS"),
  Test_MSE = c(
    lm_mse,     # from (b)
    best_mse,   # from Ridge
    step_mse,   # from Stepwise
    pcr_mse,    # from PCR
    pls_mse     # from PLS
  )
)

model_results <- model_results %>% arrange(Test_MSE)
best_model <- model_results$Model[1]
best_mse <- model_results$Test_MSE[1]
worst_model <- model_results$Model[5]
worst_mse <- model_results$Test_MSE[5]
range_mse <- round(worst_mse - best_mse)

summary_text <- glue("
Among the five models compared, {best_model} achieved the lowest test MSE at {round(best_mse)}. 
In contrast, {worst_model} had the highest MSE at {round(worst_mse)}, resulting in a difference of about {range_mse} in prediction error. 
This suggests that while all models perform reasonably well in predicting the number of college applications, 
the use of dimensionality reduction or variable selection can offer modest improvements. 
The close performance across models also indicates that the predictors are informative, 
and the problem does not suffer heavily from overfitting or multicollinearity. 
Overall, {best_model} stands out as the most accurate model for this dataset.
")

cat(summary_text)
## Among the five models compared, Stepwise Regression achieved the lowest test MSE at 1713938. 
## In contrast, PLS had the highest MSE at 1742582, resulting in a difference of about 28644 in prediction error. 
## This suggests that while all models perform reasonably well in predicting the number of college applications, 
## the use of dimensionality reduction or variable selection can offer modest improvements. 
## The close performance across models also indicates that the predictors are informative, 
## and the problem does not suffer heavily from overfitting or multicollinearity. 
## Overall, Stepwise Regression stands out as the most accurate model for this dataset.