Chapter 4

Load Libraries and Data

# Load necessary libraries
library(MASS)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071)
library(class)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the Boston dataset
data("Boston")

Create Response Variable

# Create a binary response variable based on median crime rate
median_crime <- median(Boston$crim)
Boston$crim_binary <- ifelse(Boston$crim > median_crime, 1, 0)
Boston$crim_binary <- as.factor(Boston$crim_binary)

# Check the response variable
table(Boston$crim_binary)
## 
##   0   1 
## 253 253
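By construction, thresholding at the median splits the 506 Boston tracts into two balanced classes of 253 observations each.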

Split Data into Training and Test Sets

# Set seed for reproducibility
set.seed(123)

# Split the data
train_index <- createDataPartition(Boston$crim_binary, p = 0.7, list = FALSE)
train_data <- Boston[train_index, ]
test_data <- Boston[-train_index, ]

# Check the dimensions of the training and test sets
dim(train_data)
## [1] 356  15
dim(test_data)
## [1] 150  15

Logistic Regression

# Fit logistic regression model
logit_model <- glm(crim_binary ~ ., data = train_data, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
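These warnings signal (quasi-)complete separation: crim_binary is defined by thresholding crim, and the ~ . formula keeps crim itself among the predictors, so the two classes are almost perfectly separable. A minimal sketch of the remedy, not re-run here, is to drop crim from the formula:

# Exclude the variable that defines the response (illustrative; not run)
logit_model_nocrim <- glm(crim_binary ~ . - crim, data = train_data, family = binomial)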
# Predict on test data
logit_pred <- predict(logit_model, test_data, type = "response")
logit_class <- ifelse(logit_pred > 0.5, 1, 0)

# Confusion matrix for logistic regression
confusionMatrix(as.factor(logit_class), test_data$crim_binary)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 74  1
##          1  1 74
##                                           
##                Accuracy : 0.9867          
##                  95% CI : (0.9527, 0.9984)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9733          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9867          
##             Specificity : 0.9867          
##          Pos Pred Value : 0.9867          
##          Neg Pred Value : 0.9867          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4933          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.9867          
##                                           
##        'Positive' Class : 0               
## 

Linear Discriminant Analysis (LDA)

# Fit LDA model
lda_model <- lda(crim_binary ~ ., data = train_data)

# Predict on test data
lda_pred <- predict(lda_model, test_data)
lda_class <- lda_pred$class

# Confusion matrix for LDA
confusionMatrix(lda_class, test_data$crim_binary)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 70 15
##          1  5 60
##                                           
##                Accuracy : 0.8667          
##                  95% CI : (0.8016, 0.9166)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.7333          
##                                           
##  Mcnemar's Test P-Value : 0.04417         
##                                           
##             Sensitivity : 0.9333          
##             Specificity : 0.8000          
##          Pos Pred Value : 0.8235          
##          Neg Pred Value : 0.9231          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4667          
##    Detection Prevalence : 0.5667          
##       Balanced Accuracy : 0.8667          
##                                           
##        'Positive' Class : 0               
## 

Naive Bayes

# Fit Naive Bayes model
nb_model <- naiveBayes(crim_binary ~ ., data = train_data)

# Predict on test data
nb_class <- predict(nb_model, test_data)

# Confusion matrix for Naive Bayes
confusionMatrix(nb_class, test_data$crim_binary)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 71  1
##          1  4 74
##                                           
##                Accuracy : 0.9667          
##                  95% CI : (0.9239, 0.9891)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9333          
##                                           
##  Mcnemar's Test P-Value : 0.3711          
##                                           
##             Sensitivity : 0.9467          
##             Specificity : 0.9867          
##          Pos Pred Value : 0.9861          
##          Neg Pred Value : 0.9487          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4733          
##    Detection Prevalence : 0.4800          
##       Balanced Accuracy : 0.9667          
##                                           
##        'Positive' Class : 0               
## 

K-Nearest Neighbors (KNN)

# Normalize the predictors; scale the test set with the training
# means/SDs so that no information leaks from the test set
train_X <- train_data[, -ncol(train_data)]
test_X <- test_data[, -ncol(test_data)]
train_data_scaled <- as.data.frame(scale(train_X))
test_data_scaled <- as.data.frame(scale(test_X, center = colMeans(train_X), scale = apply(train_X, 2, sd)))
train_labels <- train_data$crim_binary
test_labels <- test_data$crim_binary

# Fit KNN model
knn_model <- knn(train = train_data_scaled, test = test_data_scaled, cl = train_labels, k = 5)

# Confusion matrix for KNN
confusionMatrix(knn_model, test_labels)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 69 10
##          1  6 65
##                                           
##                Accuracy : 0.8933          
##                  95% CI : (0.8326, 0.9378)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7867          
##                                           
##  Mcnemar's Test P-Value : 0.4533          
##                                           
##             Sensitivity : 0.9200          
##             Specificity : 0.8667          
##          Pos Pred Value : 0.8734          
##          Neg Pred Value : 0.9155          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4600          
##    Detection Prevalence : 0.5267          
##       Balanced Accuracy : 0.8933          
##                                           
##        'Positive' Class : 0               
## 
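The choice k = 5 above is arbitrary. One way to tune k, sketched here under the assumption that caret's cross-validation defaults are acceptable (not run, so no output is shown):

# Tune k over an odd-valued grid with 10-fold cross-validation
knn_tuned <- train(crim_binary ~ ., data = train_data,
                   method = "knn",
                   preProcess = c("center", "scale"),
                   tuneGrid = data.frame(k = seq(1, 21, by = 2)),
                   trControl = trainControl(method = "cv", number = 10))
knn_tuned$bestTune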

Findings

# Summarize and compare the models' performance
logit_perf <- confusionMatrix(as.factor(logit_class), test_data$crim_binary)
lda_perf <- confusionMatrix(lda_class, test_data$crim_binary)
nb_perf <- confusionMatrix(nb_class, test_data$crim_binary)
knn_perf <- confusionMatrix(knn_model, test_labels)

performance_summary <- data.frame(
  Model = c("Logistic Regression", "LDA", "Naive Bayes", "KNN"),
  Accuracy = c(logit_perf$overall['Accuracy'], lda_perf$overall['Accuracy'], nb_perf$overall['Accuracy'], knn_perf$overall['Accuracy']),
  Kappa = c(logit_perf$overall['Kappa'], lda_perf$overall['Kappa'], nb_perf$overall['Kappa'], knn_perf$overall['Kappa'])
)

print(performance_summary)
##                 Model  Accuracy     Kappa
## 1 Logistic Regression 0.9866667 0.9733333
## 2                 LDA 0.8666667 0.7333333
## 3         Naive Bayes 0.9666667 0.9333333
## 4                 KNN 0.8933333 0.7866667
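Logistic regression attains the highest test accuracy (98.7%), followed by naive Bayes (96.7%), KNN (89.3%), and LDA (86.7%). These figures come with the caveat noted earlier: crim itself is among the predictors, so the task is nearly deterministic and the accuracies are optimistic relative to a model built without it.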

Chapter 5

Load Libraries and Data

library(ISLR)
# Load the Default dataset
data("Default")

Fit Logistic Regression Model

# Set a random seed for reproducibility
set.seed(123)

# Fit logistic regression model using income and balance
logit_model <- glm(default ~ income + balance, data = Default, family = binomial)

# Summary of the model
summary(logit_model)
## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8
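Both predictors are statistically significant, but balance dominates: each additional dollar of balance multiplies the odds of default by exp(0.005647) ≈ 1.0057, about a 0.57% increase in the odds, while the income coefficient, though significant, is tiny in magnitude.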

Validation Set Approach

First Split

# Split the data into training and validation sets
train_index <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_data <- Default[train_index, ]
validation_data <- Default[-train_index, ]

# Fit logistic regression model on training data
logit_model_train <- glm(default ~ income + balance, data = train_data, family = binomial)

# Predict on validation set
pred_probs <- predict(logit_model_train, validation_data, type = "response")
pred_class <- ifelse(pred_probs > 0.5, "Yes", "No")

# Confusion matrix and error rate
conf_matrix <- confusionMatrix(as.factor(pred_class), validation_data$default)
validation_error <- 1 - unname(conf_matrix$overall['Accuracy'])
validation_error
## [1] 0.02534178

Second Split

# Set a new random seed for a different split
set.seed(456)

# Split the data again
train_index2 <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_data2 <- Default[train_index2, ]
validation_data2 <- Default[-train_index2, ]

# Fit logistic regression model on training data
logit_model_train2 <- glm(default ~ income + balance, data = train_data2, family = binomial)

# Predict on validation set
pred_probs2 <- predict(logit_model_train2, validation_data2, type = "response")
pred_class2 <- ifelse(pred_probs2 > 0.5, "Yes", "No")

# Confusion matrix and error rate
conf_matrix2 <- confusionMatrix(as.factor(pred_class2), validation_data2$default)
validation_error2 <- 1 - unname(conf_matrix2$overall['Accuracy'])
validation_error2
## [1] 0.027009

Third Split

# Set another random seed for a different split
set.seed(789)

# Split the data again
train_index3 <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_data3 <- Default[train_index3, ]
validation_data3 <- Default[-train_index3, ]

# Fit logistic regression model on training data
logit_model_train3 <- glm(default ~ income + balance, data = train_data3, family = binomial)

# Predict on validation set
pred_probs3 <- predict(logit_model_train3, validation_data3, type = "response")
pred_class3 <- ifelse(pred_probs3 > 0.5, "Yes", "No")

# Confusion matrix and error rate
conf_matrix3 <- confusionMatrix(as.factor(pred_class3), validation_data3$default)
validation_error3 <- 1 - unname(conf_matrix3$overall['Accuracy'])
validation_error3
## [1] 0.02500834

Summary of Validation Errors

# Summarize validation errors
validation_errors <- data.frame(
  Split = c("First Split", "Second Split", "Third Split"),
  Validation_Error = c(validation_error, validation_error2, validation_error3)
)

print(validation_errors)
##          Split Validation_Error
## 1  First Split       0.02534178
## 2 Second Split       0.02700900
## 3  Third Split       0.02500834
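The three estimates vary only slightly, from about 2.50% to 2.70%, illustrating that the validation-set error depends on which observations happen to fall in the validation set. A sketch of how to quantify that variability by repeating the split many times (illustrative; not run here, so no summary values are shown):

# Repeat the validation-set split to gauge the spread of the error estimate
errors <- replicate(20, {
  idx <- createDataPartition(Default$default, p = 0.7, list = FALSE)
  fit <- glm(default ~ income + balance, data = Default[idx, ], family = binomial)
  probs <- predict(fit, Default[-idx, ], type = "response")
  mean(ifelse(probs > 0.5, "Yes", "No") != Default$default[-idx])
})
c(mean = mean(errors), sd = sd(errors))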

Adding Student Variable

# Set seed for reproducibility
set.seed(123)

# Split the data
train_index4 <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_data4 <- Default[train_index4, ]
validation_data4 <- Default[-train_index4, ]

# Fit logistic regression model with income, balance, and student
logit_model_student <- glm(default ~ income + balance + student, data = train_data4, family = binomial)

# Predict on validation set
pred_probs_student <- predict(logit_model_student, validation_data4, type = "response")
pred_class_student <- ifelse(pred_probs_student > 0.5, "Yes", "No")

# Confusion matrix and error rate
conf_matrix_student <- confusionMatrix(as.factor(pred_class_student), validation_data4$default)
validation_error_student <- 1 - unname(conf_matrix_student$overall['Accuracy'])
validation_error_student
## [1] 0.02634211

Summary and Comparison

# Summarize and compare the validation errors
comparison <- data.frame(
  Model = c("Without Student", "With Student"),
  Validation_Error = c(validation_error, validation_error_student)
)

print(comparison)
##             Model Validation_Error
## 1 Without Student       0.02534178
## 2    With Student       0.02634211
# Observation: adding student slightly increased the validation error
# (0.0263 vs 0.0253 without it), so including the student variable does not
# appear to reduce the test error rate for this split.

Chapter 6

library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-8
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings
# Load the College dataset
data("College")

Split the Data into Training and Test Sets

# Set a random seed for reproducibility
set.seed(123)

# Split the data into training and test sets
train_index <- createDataPartition(College$Apps, p = 0.7, list = FALSE)
train_data <- College[train_index, ]
test_data <- College[-train_index, ]

# Check dimensions
dim(train_data)
## [1] 545  18
dim(test_data)
## [1] 232  18

Linear Model using Least Squares

# Fit linear model using least squares
lm_model <- lm(Apps ~ ., data = train_data)

# Predict on test set
lm_predictions <- predict(lm_model, test_data)

# Calculate test error (MSE)
lm_mse <- mean((lm_predictions - test_data$Apps)^2)
lm_mse
## [1] 1882074

Ridge Regression

# Prepare data for glmnet: model.matrix expands factors (e.g. Private) into
# dummy variables, and [, -1] drops the intercept column
x_train <- model.matrix(Apps ~ ., train_data)[,-1]
y_train <- train_data$Apps
x_test <- model.matrix(Apps ~ ., test_data)[,-1]
y_test <- test_data$Apps

# Fit ridge regression model with cross-validation
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0)

# Best lambda
best_lambda_ridge <- cv_ridge$lambda.min

# Predict on test set
ridge_predictions <- predict(cv_ridge, s = best_lambda_ridge, newx = x_test)

# Calculate test error (MSE)
ridge_mse <- mean((ridge_predictions - y_test)^2)
ridge_mse
## [1] 3265646
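Ridge regression performs markedly worse than least squares here. A quick diagnostic, sketched but not run, is to inspect the cross-validation curve and where the selected penalty falls on it:

# Inspect the CV curve and the chosen penalty (illustrative; not run)
plot(cv_ridge)
log(best_lambda_ridge)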

Lasso Regression

# Fit lasso model with cross-validation
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1)

# Best lambda
best_lambda_lasso <- cv_lasso$lambda.min

# Predict on test set
lasso_predictions <- predict(cv_lasso, s = best_lambda_lasso, newx = x_test)

# Calculate test error (MSE)
lasso_mse <- mean((lasso_predictions - y_test)^2)
lasso_mse
## [1] 1942428
# Number of non-zero coefficients (the count includes the intercept)
lasso_coef <- predict(cv_lasso, s = best_lambda_lasso, type = "coefficients")
sum(lasso_coef != 0)
## [1] 17
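At lambda.min the lasso keeps essentially every coefficient, so it offers little simplification over least squares. The one-standard-error rule typically yields a sparser model at a small cost in CV error; a sketch, with the resulting count not shown because it is not run here:

# Sparser fit via the one-standard-error rule (illustrative; not run)
lasso_coef_1se <- predict(cv_lasso, s = cv_lasso$lambda.1se, type = "coefficients")
sum(lasso_coef_1se != 0)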

Principal Components Regression (PCR)

# Fit PCR model with cross-validation
pcr_model <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")

# Choose number of components with lowest CV error
best_ncomp_pcr <- which.min(pcr_model$validation$PRESS)

# Predict on test set
pcr_predictions <- predict(pcr_model, test_data, ncomp = best_ncomp_pcr)

# Calculate test error (MSE)
pcr_mse <- mean((pcr_predictions - test_data$Apps)^2)
pcr_mse
## [1] 1882074
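The PCR test MSE matches least squares exactly, which indicates that cross-validation selected all 17 components; with the full set of components, PCR reproduces the least-squares fit.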

Partial Least Squares (PLS)

# Fit PLS model with cross-validation
pls_model <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")

# Choose number of components with lowest CV error
best_ncomp_pls <- which.min(pls_model$validation$PRESS)

# Predict on test set
pls_predictions <- predict(pls_model, test_data, ncomp = best_ncomp_pls)

# Calculate test error (MSE)
pls_mse <- mean((pls_predictions - test_data$Apps)^2)
pls_mse
## [1] 1888865

Summary and Comparison

# Summarize and compare the test errors
test_errors <- data.frame(
  Model = c("Linear Model", "Ridge Regression", "Lasso Regression", "PCR", "PLS"),
  Test_Error = c(lm_mse, ridge_mse, lasso_mse, pcr_mse, pls_mse)
)

print(test_errors)
##              Model Test_Error
## 1     Linear Model    1882074
## 2 Ridge Regression    3265646
## 3 Lasso Regression    1942428
## 4              PCR    1882074
## 5              PLS    1888865
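Least squares, PCR, and PLS are essentially tied, the lasso is close behind, and ridge is clearly worse at its CV-selected penalty. With 777 observations and only 17 predictors, there is little overfitting for shrinkage or dimension reduction to correct.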

Financial Application Example

Example: Predicting Stock Prices

library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Get stock data (e.g., Apple Inc.)
getSymbols("AAPL", src = "yahoo", from = "2020-01-01", to = "2023-12-31")
## [1] "AAPL"
stock_data <- na.omit(AAPL)

# Check column names
colnames(stock_data)
## [1] "AAPL.Open"     "AAPL.High"     "AAPL.Low"      "AAPL.Close"   
## [5] "AAPL.Volume"   "AAPL.Adjusted"
# Create predictors and response variable
stock_data <- data.frame(Date = index(stock_data), coredata(stock_data))
colnames(stock_data) <- make.names(colnames(stock_data))
stock_data <- stock_data %>%
  mutate(Return = (AAPL.Close - lag(AAPL.Close)) / lag(AAPL.Close)) %>%
  na.omit()
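Note that Return is computed from the same day's close, while the models below use same-day prices and volume as predictors, so this is an in-sample illustration rather than a genuine forecast: the close is not known until the return is realized. A hedged sketch of a forecasting-style design, not used in the fits below, lags every predictor by one day:

# Predict today's return from yesterday's prices/volume to avoid
# look-ahead bias (hypothetical setup; not used below)
stock_data_lagged <- stock_data %>%
  mutate(across(c(AAPL.Open, AAPL.High, AAPL.Low, AAPL.Close, AAPL.Volume),
                dplyr::lag)) %>%
  na.omit()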

Split Data into Training and Test Sets

# Set a random seed for reproducibility
set.seed(123)

# Split the data into training and test sets
train_index <- createDataPartition(stock_data$Return, p = 0.7, list = FALSE)
train_data <- stock_data[train_index, ]
test_data <- stock_data[-train_index, ]

# Check dimensions
dim(train_data)
## [1] 705   8
dim(test_data)
## [1] 300   8

Linear Model using Least Squares

# Fit linear model using least squares
lm_model <- lm(Return ~ AAPL.Open + AAPL.High + AAPL.Low + AAPL.Close + AAPL.Volume, data = train_data)

# Predict on test set
lm_predictions <- predict(lm_model, test_data)

# Calculate test error (MSE)
lm_mse <- mean((lm_predictions - test_data$Return)^2)
lm_mse
## [1] 0.0002262872

Ridge Regression

# Prepare data for ridge regression
x_train <- model.matrix(Return ~ AAPL.Open + AAPL.High + AAPL.Low + AAPL.Close + AAPL.Volume, train_data)[,-1]
y_train <- train_data$Return
x_test <- model.matrix(Return ~ AAPL.Open + AAPL.High + AAPL.Low + AAPL.Close + AAPL.Volume, test_data)[,-1]
y_test <- test_data$Return

# Fit ridge regression model with cross-validation
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0)

# Best lambda
best_lambda_ridge <- cv_ridge$lambda.min

# Predict on test set
ridge_predictions <- predict(cv_ridge, s = best_lambda_ridge, newx = x_test)

# Calculate test error (MSE)
ridge_mse <- mean((ridge_predictions - y_test)^2)
ridge_mse
## [1] 0.0003197523

Lasso Regression

# Fit lasso model with cross-validation
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1)

# Best lambda
best_lambda_lasso <- cv_lasso$lambda.min

# Predict on test set
lasso_predictions <- predict(cv_lasso, s = best_lambda_lasso, newx = x_test)

# Calculate test error (MSE)
lasso_mse <- mean((lasso_predictions - y_test)^2)
lasso_mse
## [1] 0.000228227
# Number of non-zero coefficients (the count includes the intercept)
lasso_coef <- predict(cv_lasso, s = best_lambda_lasso, type = "coefficients")
sum(lasso_coef != 0)
## [1] 6

Principal Components Regression (PCR)

# Fit PCR model with cross-validation
pcr_model <- pcr(Return ~ AAPL.Open + AAPL.High + AAPL.Low + AAPL.Close + AAPL.Volume, data = train_data, scale = TRUE, validation = "CV")

# Choose number of components with lowest CV error
best_ncomp_pcr <- which.min(pcr_model$validation$PRESS)

# Predict on test set
pcr_predictions <- predict(pcr_model, test_data, ncomp = best_ncomp_pcr)

# Calculate test error (MSE)
pcr_mse <- mean((pcr_predictions - test_data$Return)^2)
pcr_mse
## [1] 0.0002262872

Partial Least Squares (PLS)

# Fit PLS model with cross-validation
pls_model <- plsr(Return ~ AAPL.Open + AAPL.High + AAPL.Low + AAPL.Close + AAPL.Volume, data = train_data, scale = TRUE, validation = "CV")

# Choose number of components with lowest CV error
best_ncomp_pls <- which.min(pls_model$validation$PRESS)

# Predict on test set
pls_predictions <- predict(pls_model, test_data, ncomp = best_ncomp_pls)

# Calculate test error (MSE)
pls_mse <- mean((pls_predictions - test_data$Return)^2)
pls_mse
## [1] 0.0002262872

Summary and Comparison

# Summarize and compare the test errors
test_errors <- data.frame(
  Model = c("Linear Model", "Ridge Regression", "Lasso Regression", "PCR", "PLS"),
  Test_Error = c(lm_mse, ridge_mse, lasso_mse, pcr_mse, pls_mse)
)

print(test_errors)
##              Model   Test_Error
## 1     Linear Model 0.0002262872
## 2 Ridge Regression 0.0003197523
## 3 Lasso Regression 0.0002282270
## 4              PCR 0.0002262872
## 5              PLS 0.0002262872
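All five methods land within a few percent of one another, with ridge again the outlier. PCR and PLS match the least-squares MSE exactly, indicating that cross-validation selected all five components, in which case both reduce to the least-squares fit. With only five highly collinear predictors, there is little room for shrinkage or dimension reduction to improve on ordinary least squares.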