Chapter 4

Load Libraries and Data

# Load necessary libraries
library(MASS)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071)
library(class)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the Boston dataset
data("Boston")

Create Response Variable

# Create a binary response variable based on median crime rate
median_crime <- median(Boston$crim)
Boston$crim_binary <- ifelse(Boston$crim > median_crime, 1, 0)
Boston$crim_binary <- as.factor(Boston$crim_binary)

# Check the response variable
table(Boston$crim_binary)
## 
##   0   1 
## 253 253
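By construction, thresholding at the median splits the 506 Boston tracts into two balanced classes of 253 observations each.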

Split Data into Training and Test Sets

# Set seed for reproducibility
set.seed(123)

# Split the data
train_index <- createDataPartition(Boston$crim_binary, p = 0.7, list = FALSE)
train_data <- Boston[train_index, ]
test_data <- Boston[-train_index, ]

# Check the dimensions of the training and test sets
dim(train_data)
## [1] 356  15
dim(test_data)
## [1] 150  15

Logistic Regression

# Fit logistic regression model
logit_model <- glm(crim_binary ~ ., data = train_data, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
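These warnings signal (quasi-)complete separation: crim_binary is defined by thresholding crim, and the ~ . formula keeps crim itself among the predictors, so the two classes are almost perfectly separable. A minimal sketch of the remedy, not re-run here, is to drop crim from the formula:

# Exclude the variable that defines the response (illustrative; not run)
logit_model_nocrim <- glm(crim_binary ~ . - crim, data = train_data, family = binomial)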
# Predict on test data
logit_pred <- predict(logit_model, test_data, type = "response")
logit_class <- ifelse(logit_pred > 0.5, 1, 0)

# Confusion matrix for logistic regression
confusionMatrix(as.factor(logit_class), test_data$crim_binary)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 74  1
##          1  1 74
##                                           
##                Accuracy : 0.9867          
##                  95% CI : (0.9527, 0.9984)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9733          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9867          
##             Specificity : 0.9867          
##          Pos Pred Value : 0.9867          
##          Neg Pred Value : 0.9867          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4933          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.9867          
##                                           
##        'Positive' Class : 0               
## 

Linear Discriminant Analysis (LDA)

# Fit LDA model
lda_model <- lda(crim_binary ~ ., data = train_data)

# Predict on test data
lda_pred <- predict(lda_model, test_data)
lda_class <- lda_pred$class

# Confusion matrix for LDA
confusionMatrix(lda_class, test_data$crim_binary)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 70 15
##          1  5 60
##                                           
##                Accuracy : 0.8667          
##                  95% CI : (0.8016, 0.9166)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.7333          
##                                           
##  Mcnemar's Test P-Value : 0.04417         
##                                           
##             Sensitivity : 0.9333          
##             Specificity : 0.8000          
##          Pos Pred Value : 0.8235          
##          Neg Pred Value : 0.9231          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4667          
##    Detection Prevalence : 0.5667          
##       Balanced Accuracy : 0.8667          
##                                           
##        'Positive' Class : 0               
## 

Naive Bayes

# Fit Naive Bayes model
nb_model <- naiveBayes(crim_binary ~ ., data = train_data)

# Predict on test data
nb_class <- predict(nb_model, test_data)

# Confusion matrix for Naive Bayes
confusionMatrix(nb_class, test_data$crim_binary)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 71  1
##          1  4 74
##                                           
##                Accuracy : 0.9667          
##                  95% CI : (0.9239, 0.9891)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9333          
##                                           
##  Mcnemar's Test P-Value : 0.3711          
##                                           
##             Sensitivity : 0.9467          
##             Specificity : 0.9867          
##          Pos Pred Value : 0.9861          
##          Neg Pred Value : 0.9487          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4733          
##    Detection Prevalence : 0.4800          
##       Balanced Accuracy : 0.9667          
##                                           
##        'Positive' Class : 0               
## 

K-Nearest Neighbors (KNN)

# Normalize the predictors; scale the test set with the training
# means/SDs so that no information leaks from the test set
train_X <- train_data[, -ncol(train_data)]
test_X <- test_data[, -ncol(test_data)]
train_data_scaled <- as.data.frame(scale(train_X))
test_data_scaled <- as.data.frame(scale(test_X, center = colMeans(train_X), scale = apply(train_X, 2, sd)))
train_labels <- train_data$crim_binary
test_labels <- test_data$crim_binary

# Fit KNN model
knn_model <- knn(train = train_data_scaled, test = test_data_scaled, cl = train_labels, k = 5)

# Confusion matrix for KNN
confusionMatrix(knn_model, test_labels)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 69 10
##          1  6 65
##                                           
##                Accuracy : 0.8933          
##                  95% CI : (0.8326, 0.9378)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7867          
##                                           
##  Mcnemar's Test P-Value : 0.4533          
##                                           
##             Sensitivity : 0.9200          
##             Specificity : 0.8667          
##          Pos Pred Value : 0.8734          
##          Neg Pred Value : 0.9155          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4600          
##    Detection Prevalence : 0.5267          
##       Balanced Accuracy : 0.8933          
##                                           
##        'Positive' Class : 0               
## 
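The choice k = 5 above is arbitrary. One way to tune k, sketched here under the assumption that caret's cross-validation defaults are acceptable (not run, so no output is shown):

# Tune k over an odd-valued grid with 10-fold cross-validation
knn_tuned <- train(crim_binary ~ ., data = train_data,
                   method = "knn",
                   preProcess = c("center", "scale"),
                   tuneGrid = data.frame(k = seq(1, 21, by = 2)),
                   trControl = trainControl(method = "cv", number = 10))
knn_tuned$bestTune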

Findings

# Summarize and compare the models' performance
logit_perf <- confusionMatrix(as.factor(logit_class), test_data$crim_binary)
lda_perf <- confusionMatrix(lda_class, test_data$crim_binary)
nb_perf <- confusionMatrix(nb_class, test_data$crim_binary)
knn_perf <- confusionMatrix(knn_model, test_labels)

performance_summary <- data.frame(
  Model = c("Logistic Regression", "LDA", "Naive Bayes", "KNN"),
  Accuracy = c(logit_perf$overall['Accuracy'], lda_perf$overall['Accuracy'], nb_perf$overall['Accuracy'], knn_perf$overall['Accuracy']),
  Kappa = c(logit_perf$overall['Kappa'], lda_perf$overall['Kappa'], nb_perf$overall['Kappa'], knn_perf$overall['Kappa'])
)

print(performance_summary)
##                 Model  Accuracy     Kappa
## 1 Logistic Regression 0.9866667 0.9733333
## 2                 LDA 0.8666667 0.7333333
## 3         Naive Bayes 0.9666667 0.9333333
## 4                 KNN 0.8933333 0.7866667
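Logistic regression attains the highest test accuracy (98.7%), followed by naive Bayes (96.7%), KNN (89.3%), and LDA (86.7%). These figures come with the caveat noted earlier: crim itself is among the predictors, so the task is nearly deterministic and the accuracies are optimistic relative to a model built without it.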

Chapter 5

Load Libraries and Data

library(ISLR)
# Load the Default dataset
data("Default")

Fit Logistic Regression Model

# Set a random seed for reproducibility
set.seed(123)

# Fit logistic regression model using income and balance
logit_model <- glm(default ~ income + balance, data = Default, family = binomial)

# Summary of the model
summary(logit_model)
## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8
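Both predictors are statistically significant, but balance dominates: each additional dollar of balance multiplies the odds of default by exp(0.005647) ≈ 1.0057, about a 0.57% increase in the odds, while the income coefficient, though significant, is tiny in magnitude.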

Validation Set Approach

First Split

# Split the data into training and validation sets
train_index <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_data <- Default[train_index, ]
validation_data <- Default[-train_index, ]

# Fit logistic regression model on training data
logit_model_train <- glm(default ~ income + balance, data = train_data, family = binomial)

# Predict on validation set
pred_probs <- predict(logit_model_train, validation_data, type = "response")
pred_class <- ifelse(pred_probs > 0.5, "Yes", "No")

# Confusion matrix and error rate
conf_matrix <- confusionMatrix(as.factor(pred_class), validation_data$default)
validation_error <- 1 - unname(conf_matrix$overall['Accuracy'])
validation_error
## [1] 0.02534178

Second Split

# Set a new random seed for a different split
set.seed(456)

# Split the data again
train_index2 <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_data2 <- Default[train_index2, ]
validation_data2 <- Default[-train_index2, ]

# Fit logistic regression model on training data
logit_model_train2 <- glm(default ~ income + balance, data = train_data2, family = binomial)

# Predict on validation set
pred_probs2 <- predict(logit_model_train2, validation_data2, type = "response")
pred_class2 <- ifelse(pred_probs2 > 0.5, "Yes", "No")

# Confusion matrix and error rate
conf_matrix2 <- confusionMatrix(as.factor(pred_class2), validation_data2$default)
validation_error2 <- 1 - unname(conf_matrix2$overall['Accuracy'])
validation_error2
## [1] 0.027009

Third Split

# Set another random seed for a different split
set.seed(789)

# Split the data again
train_index3 <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_data3 <- Default[train_index3, ]
validation_data3 <- Default[-train_index3, ]

# Fit logistic regression model on training data
logit_model_train3 <- glm(default ~ income + balance, data = train_data3, family = binomial)

# Predict on validation set
pred_probs3 <- predict(logit_model_train3, validation_data3, type = "response")
pred_class3 <- ifelse(pred_probs3 > 0.5, "Yes", "No")

# Confusion matrix and error rate
conf_matrix3 <- confusionMatrix(as.factor(pred_class3), validation_data3$default)
validation_error3 <- 1 - unname(conf_matrix3$overall['Accuracy'])
validation_error3
## [1] 0.02500834

Summary of Validation Errors

# Summarize validation errors
validation_errors <- data.frame(
  Split = c("First Split", "Second Split", "Third Split"),
  Validation_Error = c(validation_error, validation_error2, validation_error3)
)

print(validation_errors)
##          Split Validation_Error
## 1  First Split       0.02534178
## 2 Second Split       0.02700900
## 3  Third Split       0.02500834
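The three estimates vary only slightly, from about 2.50% to 2.70%, illustrating that the validation-set error depends on which observations happen to fall in the validation set. A sketch of how to quantify that variability by repeating the split many times (illustrative; not run here, so no summary values are shown):

# Repeat the validation-set split to gauge the spread of the error estimate
errors <- replicate(20, {
  idx <- createDataPartition(Default$default, p = 0.7, list = FALSE)
  fit <- glm(default ~ income + balance, data = Default[idx, ], family = binomial)
  probs <- predict(fit, Default[-idx, ], type = "response")
  mean(ifelse(probs > 0.5, "Yes", "No") != Default$default[-idx])
})
c(mean = mean(errors), sd = sd(errors))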

Adding Student Variable

# Set seed for reproducibility
set.seed(123)

# Split the data
train_index4 <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_data4 <- Default[train_index4, ]
validation_data4 <- Default[-train_index4, ]

# Fit logistic regression model with income, balance, and student
logit_model_student <- glm(default ~ income + balance + student, data = train_data4, family = binomial)

# Predict on validation set
pred_probs_student <- predict(logit_model_student, validation_data4, type = "response")
pred_class_student <- ifelse(pred_probs_student > 0.5, "Yes", "No")

# Confusion matrix and error rate
conf_matrix_student <- confusionMatrix(as.factor(pred_class_student), validation_data4$default)
validation_error_student <- 1 - unname(conf_matrix_student$overall['Accuracy'])
validation_error_student
## [1] 0.02634211

Summary and Comparison

# Summarize and compare the validation errors
comparison <- data.frame(
  Model = c("Without Student", "With Student"),
  Validation_Error = c(validation_error, validation_error_student)
)

print(comparison)
##             Model Validation_Error
## 1 Without Student       0.02534178
## 2    With Student       0.02634211
# Observation: adding student slightly increased the validation error
# (0.0263 vs 0.0253 without it), so including the student variable does not
# appear to reduce the test error rate for this split.

Chapter 6

library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-8
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings
# Load the College dataset
data("College")

Split the Data into Training and Test Sets

# Set a random seed for reproducibility
set.seed(123)

# Split the data into training and test sets
train_index <- createDataPartition(College$Apps, p = 0.7, list = FALSE)
train_data <- College[train_index, ]
test_data <- College[-train_index, ]

# Check dimensions
dim(train_data)
## [1] 545  18
dim(test_data)
## [1] 232  18

Linear Model using Least Squares

# Fit linear model using least squares
lm_model <- lm(Apps ~ ., data = train_data)

# Predict on test set
lm_predictions <- predict(lm_model, test_data)

# Calculate test error (MSE)
lm_mse <- mean((lm_predictions - test_data$Apps)^2)
lm_mse
## [1] 1882074

Ridge Regression

# Prepare data for glmnet: model.matrix expands factors (e.g. Private) into
# dummy variables, and [, -1] drops the intercept column
x_train <- model.matrix(Apps ~ ., train_data)[,-1]
y_train <- train_data$Apps
x_test <- model.matrix(Apps ~ ., test_data)[,-1]
y_test <- test_data$Apps

# Fit ridge regression model with cross-validation
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0)

# Best lambda
best_lambda_ridge <- cv_ridge$lambda.min

# Predict on test set
ridge_predictions <- predict(cv_ridge, s = best_lambda_ridge, newx = x_test)

# Calculate test error (MSE)
ridge_mse <- mean((ridge_predictions - y_test)^2)
ridge_mse
## [1] 3265646
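Ridge regression performs markedly worse than least squares here. A quick diagnostic, sketched but not run, is to inspect the cross-validation curve and where the selected penalty falls on it:

# Inspect the CV curve and the chosen penalty (illustrative; not run)
plot(cv_ridge)
log(best_lambda_ridge)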

Lasso Regression

# Fit lasso model with cross-validation
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1)

# Best lambda
best_lambda_lasso <- cv_lasso$lambda.min

# Predict on test set
lasso_predictions <- predict(cv_lasso, s = best_lambda_lasso, newx = x_test)

# Calculate test error (MSE)
lasso_mse <- mean((lasso_predictions - y_test)^2)
lasso_mse
## [1] 1942428
# Number of non-zero coefficients (the count includes the intercept)
lasso_coef <- predict(cv_lasso, s = best_lambda_lasso, type = "coefficients")
sum(lasso_coef != 0)
## [1] 17
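At lambda.min the lasso keeps essentially every coefficient, so it offers little simplification over least squares. The one-standard-error rule typically yields a sparser model at a small cost in CV error; a sketch, with the resulting count not shown because it is not run here:

# Sparser fit via the one-standard-error rule (illustrative; not run)
lasso_coef_1se <- predict(cv_lasso, s = cv_lasso$lambda.1se, type = "coefficients")
sum(lasso_coef_1se != 0)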

Principal Components Regression (PCR)

# Fit PCR model with cross-validation
pcr_model <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")

# Choose number of components with lowest CV error
best_ncomp_pcr <- which.min(pcr_model$validation$PRESS)

# Predict on test set
pcr_predictions <- predict(pcr_model, test_data, ncomp = best_ncomp_pcr)

# Calculate test error (MSE)
pcr_mse <- mean((pcr_predictions - test_data$Apps)^2)
pcr_mse
## [1] 1882074
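The PCR test MSE matches least squares exactly, which indicates that cross-validation selected all 17 components; with the full set of components, PCR reproduces the least-squares fit.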

Partial Least Squares (PLS)

# Fit PLS model with cross-validation
pls_model <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")

# Choose number of components with lowest CV error
best_ncomp_pls <- which.min(pls_model$validation$PRESS)

# Predict on test set
pls_predictions <- predict(pls_model, test_data, ncomp = best_ncomp_pls)

# Calculate test error (MSE)
pls_mse <- mean((pls_predictions - test_data$Apps)^2)
pls_mse
## [1] 1888865

Summary and Comparison

# Summarize and compare the test errors
test_errors <- data.frame(
  Model = c("Linear Model", "Ridge Regression", "Lasso Regression", "PCR", "PLS"),
  Test_Error = c(lm_mse, ridge_mse, lasso_mse, pcr_mse, pls_mse)
)

print(test_errors)
##              Model Test_Error
## 1     Linear Model    1882074
## 2 Ridge Regression    3265646
## 3 Lasso Regression    1942428
## 4              PCR    1882074
## 5              PLS    1888865
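Least squares, PCR, and PLS are essentially tied, the lasso is close behind, and ridge is clearly worse at its CV-selected penalty. With 777 observations and only 17 predictors, there is little overfitting for shrinkage or dimension reduction to correct.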

Financial Application Example

Example: Predicting Stock Prices

library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Get stock data (e.g., Apple Inc.)
getSymbols("AAPL", src = "yahoo", from = "2020-01-01", to = "2023-12-31")
## [1] "AAPL"
stock_data <- na.omit(AAPL)

# Check column names
colnames(stock_data)
## [1] "AAPL.Open"     "AAPL.High"     "AAPL.Low"      "AAPL.Close"   
## [5] "AAPL.Volume"   "AAPL.Adjusted"
# Create predictors and response variable
stock_data <- data.frame(Date = index(stock_data), coredata(stock_data))
colnames(stock_data) <- make.names(colnames(stock_data))
stock_data <- stock_data %>%
  mutate(Return = (AAPL.Close - lag(AAPL.Close)) / lag(AAPL.Close)) %>%
  na.omit()
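Note that Return is computed from the same day's close, while the models below use same-day prices and volume as predictors, so this is an in-sample illustration rather than a genuine forecast: the close is not known until the return is realized. A hedged sketch of a forecasting-style design, not used in the fits below, lags every predictor by one day:

# Predict today's return from yesterday's prices/volume to avoid
# look-ahead bias (hypothetical setup; not used below)
stock_data_lagged <- stock_data %>%
  mutate(across(c(AAPL.Open, AAPL.High, AAPL.Low, AAPL.Close, AAPL.Volume),
                dplyr::lag)) %>%
  na.omit()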

Split Data into Training and Test Sets

# Set a random seed for reproducibility
set.seed(123)

# Split the data into training and test sets
train_index <- createDataPartition(stock_data$Return, p = 0.7, list = FALSE)
train_data <- stock_data[train_index, ]
test_data <- stock_data[-train_index, ]

# Check dimensions
dim(train_data)
## [1] 705   8
dim(test_data)
## [1] 300   8

Linear Model using Least Squares

# Fit linear model using least squares
lm_model <- lm(Return ~ AAPL.Open + AAPL.High + AAPL.Low + AAPL.Close + AAPL.Volume, data = train_data)

# Predict on test set
lm_predictions <- predict(lm_model, test_data)

# Calculate test error (MSE)
lm_mse <- mean((lm_predictions - test_data$Return)^2)
lm_mse
## [1] 0.0002262872

Ridge Regression

# Prepare data for ridge regression
x_train <- model.matrix(Return ~ AAPL.Open + AAPL.High + AAPL.Low + AAPL.Close + AAPL.Volume, train_data)[,-1]
y_train <- train_data$Return
x_test <- model.matrix(Return ~ AAPL.Open + AAPL.High + AAPL.Low + AAPL.Close + AAPL.Volume, test_data)[,-1]
y_test <- test_data$Return

# Fit ridge regression model with cross-validation
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0)

# Best lambda
best_lambda_ridge <- cv_ridge$lambda.min

# Predict on test set
ridge_predictions <- predict(cv_ridge, s = best_lambda_ridge, newx = x_test)

# Calculate test error (MSE)
ridge_mse <- mean((ridge_predictions - y_test)^2)
ridge_mse
## [1] 0.0003197523

Lasso Regression

# Fit lasso model with cross-validation
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1)

# Best lambda
best_lambda_lasso <- cv_lasso$lambda.min

# Predict on test set
lasso_predictions <- predict(cv_lasso, s = best_lambda_lasso, newx = x_test)

# Calculate test error (MSE)
lasso_mse <- mean((lasso_predictions - y_test)^2)
lasso_mse
## [1] 0.000228227
# Number of non-zero coefficients (the count includes the intercept)
lasso_coef <- predict(cv_lasso, s = best_lambda_lasso, type = "coefficients")
sum(lasso_coef != 0)
## [1] 6

Principal Components Regression (PCR)

# Fit PCR model with cross-validation
pcr_model <- pcr(Return ~ AAPL.Open + AAPL.High + AAPL.Low + AAPL.Close + AAPL.Volume, data = train_data, scale = TRUE, validation = "CV")

# Choose number of components with lowest CV error
best_ncomp_pcr <- which.min(pcr_model$validation$PRESS)

# Predict on test set
pcr_predictions <- predict(pcr_model, test_data, ncomp = best_ncomp_pcr)

# Calculate test error (MSE)
pcr_mse <- mean((pcr_predictions - test_data$Return)^2)
pcr_mse
## [1] 0.0002262872

Partial Least Squares (PLS)

# Fit PLS model with cross-validation
pls_model <- plsr(Return ~ AAPL.Open + AAPL.High + AAPL.Low + AAPL.Close + AAPL.Volume, data = train_data, scale = TRUE, validation = "CV")

# Choose number of components with lowest CV error
best_ncomp_pls <- which.min(pls_model$validation$PRESS)

# Predict on test set
pls_predictions <- predict(pls_model, test_data, ncomp = best_ncomp_pls)

# Calculate test error (MSE)
pls_mse <- mean((pls_predictions - test_data$Return)^2)
pls_mse
## [1] 0.0002262872

Summary and Comparison

# Summarize and compare the test errors
test_errors <- data.frame(
  Model = c("Linear Model", "Ridge Regression", "Lasso Regression", "PCR", "PLS"),
  Test_Error = c(lm_mse, ridge_mse, lasso_mse, pcr_mse, pls_mse)
)

print(test_errors)
##              Model   Test_Error
## 1     Linear Model 0.0002262872
## 2 Ridge Regression 0.0003197523
## 3 Lasso Regression 0.0002282270
## 4              PCR 0.0002262872
## 5              PLS 0.0002262872
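All five methods land within a few percent of one another, with ridge again the outlier. PCR and PLS match the least-squares MSE exactly, indicating that cross-validation selected all five components, in which case both reduce to the least-squares fit. With only five highly collinear predictors, there is little room for shrinkage or dimension reduction to improve on ordinary least squares.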