Chapter 4
Load Libraries and Data
# Load necessary libraries
library(MASS)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071)
library(class)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the Boston dataset
data("Boston")
Create Response Variable
# Create a binary response variable based on median crime rate
median_crime <- median(Boston$crim)
Boston$crim_binary <- ifelse(Boston$crim > median_crime, 1, 0)
Boston$crim_binary <- as.factor(Boston$crim_binary)
# Check the response variable
table(Boston$crim_binary)
##
## 0 1
## 253 253
Split Data into Training and Test Sets
# Set seed for reproducibility
set.seed(123)
# Split the data
train_index <- createDataPartition(Boston$crim_binary, p = 0.7, list = FALSE)
train_data <- Boston[train_index, ]
test_data <- Boston[-train_index, ]
# Check the dimensions of the training and test sets
dim(train_data)
## [1] 356 15
dim(test_data)
## [1] 150 15
Logistic Regression
# Fit logistic regression model
logit_model <- glm(crim_binary ~ ., data = train_data, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Predict on test data
logit_pred <- predict(logit_model, test_data, type = "response")
logit_class <- ifelse(logit_pred > 0.5, 1, 0)
# Confusion matrix for logistic regression
confusionMatrix(as.factor(logit_class), test_data$crim_binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 74 1
## 1 1 74
##
## Accuracy : 0.9867
## 95% CI : (0.9527, 0.9984)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9733
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9867
## Specificity : 0.9867
## Pos Pred Value : 0.9867
## Neg Pred Value : 0.9867
## Prevalence : 0.5000
## Detection Rate : 0.4933
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9867
##
## 'Positive' Class : 0
##
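The warnings above and the near-perfect accuracy deserve a comment. The response crim_binary was defined directly from crim (above versus below its median), and the formula crim_binary ~ . keeps the raw crim column as a predictor, so the two classes are perfectly separable by construction and the fitted probabilities get pushed to 0 or 1. A minimal sketch of a refit that drops crim from the predictors is shown below (not run here; the remaining models in this chapter keep the original specification):
# Sketch: refit without the raw crime rate, since crim_binary is derived from crim
logit_model_nocrim <- glm(crim_binary ~ . - crim, data = train_data, family = binomial)
logit_pred_nocrim <- predict(logit_model_nocrim, test_data, type = "response")
logit_class_nocrim <- ifelse(logit_pred_nocrim > 0.5, 1, 0)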
Linear Discriminant Analysis (LDA)
# Fit LDA model
lda_model <- lda(crim_binary ~ ., data = train_data)
# Predict on test data
lda_pred <- predict(lda_model, test_data)
lda_class <- lda_pred$class
# Confusion matrix for LDA
confusionMatrix(lda_class, test_data$crim_binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 70 15
## 1 5 60
##
## Accuracy : 0.8667
## 95% CI : (0.8016, 0.9166)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7333
##
## Mcnemar's Test P-Value : 0.04417
##
## Sensitivity : 0.9333
## Specificity : 0.8000
## Pos Pred Value : 0.8235
## Neg Pred Value : 0.9231
## Prevalence : 0.5000
## Detection Rate : 0.4667
## Detection Prevalence : 0.5667
## Balanced Accuracy : 0.8667
##
## 'Positive' Class : 0
##
Naive Bayes
# Fit Naive Bayes model
nb_model <- naiveBayes(crim_binary ~ ., data = train_data)
# Predict on test data; for naiveBayes objects, predict() returns class labels by default
nb_pred <- predict(nb_model, test_data)
nb_class <- nb_pred
# Confusion matrix for Naive Bayes
confusionMatrix(nb_class, test_data$crim_binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 71 1
## 1 4 74
##
## Accuracy : 0.9667
## 95% CI : (0.9239, 0.9891)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9333
##
## Mcnemar's Test P-Value : 0.3711
##
## Sensitivity : 0.9467
## Specificity : 0.9867
## Pos Pred Value : 0.9861
## Neg Pred Value : 0.9487
## Prevalence : 0.5000
## Detection Rate : 0.4733
## Detection Prevalence : 0.4800
## Balanced Accuracy : 0.9667
##
## 'Positive' Class : 0
##
K-Nearest Neighbors (KNN)
# Normalize the predictors (dropping the response column). Note: for simplicity the
# test set is scaled with its own means and SDs here; strictly, it should be scaled
# using the training-set statistics.
train_data_scaled <- as.data.frame(scale(train_data[, -ncol(train_data)]))
test_data_scaled <- as.data.frame(scale(test_data[, -ncol(test_data)]))
train_labels <- train_data$crim_binary
test_labels <- test_data$crim_binary
# Fit KNN model
knn_model <- knn(train = train_data_scaled, test = test_data_scaled, cl = train_labels, k = 5)
# Confusion matrix for KNN
confusionMatrix(knn_model, test_labels)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 69 10
## 1 6 65
##
## Accuracy : 0.8933
## 95% CI : (0.8326, 0.9378)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7867
##
## Mcnemar's Test P-Value : 0.4533
##
## Sensitivity : 0.9200
## Specificity : 0.8667
## Pos Pred Value : 0.8734
## Neg Pred Value : 0.9155
## Prevalence : 0.5000
## Detection Rate : 0.4600
## Detection Prevalence : 0.5267
## Balanced Accuracy : 0.8933
##
## 'Positive' Class : 0
##
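Here k = 5 was chosen by hand. A minimal sketch of tuning k by 10-fold cross-validation with caret's train() is given below; the grid of odd k values is an illustrative choice, and preProcess centres and scales the predictors within the resampling procedure:
# Sketch: tune k by 10-fold cross-validation (illustrative grid of odd values)
knn_cv <- train(crim_binary ~ ., data = train_data,
                method = "knn",
                preProcess = c("center", "scale"),
                trControl = trainControl(method = "cv", number = 10),
                tuneGrid = data.frame(k = seq(1, 21, by = 2)))
knn_cv$bestTune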
Findings
# Summarize and compare the models' performance
logit_perf <- confusionMatrix(as.factor(logit_class), test_data$crim_binary)
lda_perf <- confusionMatrix(lda_class, test_data$crim_binary)
nb_perf <- confusionMatrix(nb_class, test_data$crim_binary)
knn_perf <- confusionMatrix(knn_model, test_labels)
performance_summary <- data.frame(
Model = c("Logistic Regression", "LDA", "Naive Bayes", "KNN"),
Accuracy = c(logit_perf$overall['Accuracy'], lda_perf$overall['Accuracy'], nb_perf$overall['Accuracy'], knn_perf$overall['Accuracy']),
Kappa = c(logit_perf$overall['Kappa'], lda_perf$overall['Kappa'], nb_perf$overall['Kappa'], knn_perf$overall['Kappa'])
)
print(performance_summary)
## Model Accuracy Kappa
## 1 Logistic Regression 0.9866667 0.9733333
## 2 LDA 0.8666667 0.7333333
## 3 Naive Bayes 0.9666667 0.9333333
## 4 KNN 0.8933333 0.7866667
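On this test set, logistic regression has the highest accuracy (about 0.987), followed by naive Bayes (0.967), KNN (0.893) and LDA (0.867). The near-perfect results should be read with the earlier caveat in mind: the raw crim variable, from which crim_binary is derived, remains among the predictors, so much of this apparent performance reflects separation rather than predictive structure in the other variables.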
Chapter 5
Load Libraries and Data
library(ISLR)
# Load the Default dataset
data("Default")
Fit Logistic Regression Model
# Set a random seed for reproducibility
set.seed(123)
# Fit logistic regression model using income and balance
logit_model <- glm(default ~ income + balance, data = Default, family = binomial)
# Summary of the model
summary(logit_model)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
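Both income and balance have positive, statistically significant coefficients (p < 0.001 for each); judging by the z-values, balance is by far the stronger of the two predictors.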
Validation Set Approach
First Split
# Split the data into training and validation sets
train_index <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_data <- Default[train_index, ]
validation_data <- Default[-train_index, ]
# Fit logistic regression model on training data
logit_model_train <- glm(default ~ income + balance, data = train_data, family = binomial)
# Predict on validation set
pred_probs <- predict(logit_model_train, validation_data, type = "response")
pred_class <- ifelse(pred_probs > 0.5, "Yes", "No")
# Confusion matrix and error rate
conf_matrix <- confusionMatrix(as.factor(pred_class), validation_data$default)
validation_error <- 1 - conf_matrix$overall['Accuracy']
validation_error
## Accuracy
## 0.02534178
Second Split
# Set a new random seed for a different split
set.seed(456)
# Split the data again
train_index2 <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_data2 <- Default[train_index2, ]
validation_data2 <- Default[-train_index2, ]
# Fit logistic regression model on training data
logit_model_train2 <- glm(default ~ income + balance, data = train_data2, family = binomial)
# Predict on validation set
pred_probs2 <- predict(logit_model_train2, validation_data2, type = "response")
pred_class2 <- ifelse(pred_probs2 > 0.5, "Yes", "No")
# Confusion matrix and error rate
conf_matrix2 <- confusionMatrix(as.factor(pred_class2), validation_data2$default)
validation_error2 <- 1 - conf_matrix2$overall['Accuracy']
validation_error2
## Accuracy
## 0.027009
Third Split
# Set another random seed for a different split
set.seed(789)
# Split the data again
train_index3 <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_data3 <- Default[train_index3, ]
validation_data3 <- Default[-train_index3, ]
# Fit logistic regression model on training data
logit_model_train3 <- glm(default ~ income + balance, data = train_data3, family = binomial)
# Predict on validation set
pred_probs3 <- predict(logit_model_train3, validation_data3, type = "response")
pred_class3 <- ifelse(pred_probs3 > 0.5, "Yes", "No")
# Confusion matrix and error rate
conf_matrix3 <- confusionMatrix(as.factor(pred_class3), validation_data3$default)
validation_error3 <- 1 - conf_matrix3$overall['Accuracy']
validation_error3
## Accuracy
## 0.02500834
Summary of Validation Errors
# Summarize validation errors
validation_errors <- data.frame(
Split = c("First Split", "Second Split", "Third Split"),
Validation_Error = c(validation_error, validation_error2, validation_error3)
)
print(validation_errors)
## Split Validation_Error
## 1 First Split 0.02534178
## 2 Second Split 0.02700900
## 3 Third Split 0.02500834
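The three splits give similar estimates of roughly 2.5-2.7%, but each depends on the particular random split. As a cross-check, here is a minimal sketch of a 10-fold cross-validation estimate for the same model, assuming the boot package is installed (misclass_cost is the usual misclassification cost for a 0.5 threshold):
# Sketch: 10-fold CV error for the logistic model fitted on the full Default data
library(boot)
misclass_cost <- function(r, pi = 0) mean(abs(r - pi) > 0.5)
cv.glm(Default, logit_model, cost = misclass_cost, K = 10)$delta[1]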
Adding Student Variable
# Set seed for reproducibility
set.seed(123)
# Split the data
train_index4 <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_data4 <- Default[train_index4, ]
validation_data4 <- Default[-train_index4, ]
# Fit logistic regression model with income, balance, and student
logit_model_student <- glm(default ~ income + balance + student, data = train_data4, family = binomial)
# Predict on validation set
pred_probs_student <- predict(logit_model_student, validation_data4, type = "response")
pred_class_student <- ifelse(pred_probs_student > 0.5, "Yes", "No")
# Confusion matrix and error rate
conf_matrix_student <- confusionMatrix(as.factor(pred_class_student), validation_data4$default)
validation_error_student <- 1 - conf_matrix_student$overall['Accuracy']
validation_error_student
## Accuracy
## 0.02634211
Summary and Comparison
# Summarize and compare the validation errors
comparison <- data.frame(
Model = c("Without Student", "With Student"),
Validation_Error = c(validation_error, validation_error_student)
)
print(comparison)
## Model Validation_Error
## 1 Without Student 0.02534178
## 2 With Student 0.02634211
Including the student variable did not reduce the validation error here: the estimated error is about 2.63%, slightly higher than the 2.53% obtained without student. On this evidence, adding student to income and balance does not improve out-of-sample performance for this model.
Chapter 6
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
# Load the College dataset
data("College")
Split the Data into Training and Test Sets
# Set a random seed for reproducibility
set.seed(123)
# Split the data into training and test sets
train_index <- createDataPartition(College$Apps, p = 0.7, list = FALSE)
train_data <- College[train_index, ]
test_data <- College[-train_index, ]
# Check dimensions
dim(train_data)
## [1] 545 18
dim(test_data)
## [1] 232 18
Linear Model using Least Squares
# Fit linear model using least squares
lm_model <- lm(Apps ~ ., data = train_data)
# Predict on test set
lm_predictions <- predict(lm_model, test_data)
# Calculate test error (MSE)
lm_mse <- mean((lm_predictions - test_data$Apps)^2)
lm_mse
## [1] 1882074
Ridge Regression
# Prepare data for ridge regression
x_train <- model.matrix(Apps ~ ., train_data)[,-1]
y_train <- train_data$Apps
x_test <- model.matrix(Apps ~ ., test_data)[,-1]
y_test <- test_data$Apps
# Fit ridge regression model with cross-validation
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0)
# Best lambda
best_lambda_ridge <- cv_ridge$lambda.min
# Predict on test set
ridge_predictions <- predict(cv_ridge, s = best_lambda_ridge, newx = x_test)
# Calculate test error (MSE)
ridge_mse <- mean((ridge_predictions - y_test)^2)
ridge_mse
## [1] 3265646
Lasso Regression
# Fit lasso model with cross-validation
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1)
# Best lambda
best_lambda_lasso <- cv_lasso$lambda.min
# Predict on test set
lasso_predictions <- predict(cv_lasso, s = best_lambda_lasso, newx = x_test)
# Calculate test error (MSE)
lasso_mse <- mean((lasso_predictions - y_test)^2)
lasso_mse
## [1] 1942428
# Number of non-zero coefficients (including the intercept)
lasso_coef <- predict(cv_lasso, s = best_lambda_lasso, type = "coefficients")
sum(lasso_coef != 0)
## [1] 17
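A quick sketch to list which terms are non-zero at the selected lambda:
# Sketch: names of the coefficients retained by the lasso at lambda.min
rownames(lasso_coef)[which(as.matrix(lasso_coef) != 0)]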
Principal Components Regression (PCR)
# Fit PCR model with cross-validation
pcr_model <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
# Choose number of components with lowest CV error
best_ncomp_pcr <- which.min(pcr_model$validation$PRESS)
# Predict on test set
pcr_predictions <- predict(pcr_model, test_data, ncomp = best_ncomp_pcr)
# Calculate test error (MSE)
pcr_mse <- mean((pcr_predictions - test_data$Apps)^2)
pcr_mse
## [1] 1882074
Partial Least Squares (PLS)
# Fit PLS model with cross-validation
pls_model <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
# Choose number of components with lowest CV error
best_ncomp_pls <- which.min(pls_model$validation$PRESS)
# Predict on test set
pls_predictions <- predict(pls_model, test_data, ncomp = best_ncomp_pls)
# Calculate test error (MSE)
pls_mse <- mean((pls_predictions - test_data$Apps)^2)
pls_mse
## [1] 1888865
Summary and Comparison
# Summarize and compare the test errors
test_errors <- data.frame(
Model = c("Linear Model", "Ridge Regression", "Lasso Regression", "PCR", "PLS"),
Test_Error = c(lm_mse, ridge_mse, lasso_mse, pcr_mse, pls_mse)
)
print(test_errors)
## Model Test_Error
## 1 Linear Model 1882074
## 2 Ridge Regression 3265646
## 3 Lasso Regression 1942428
## 4 PCR 1882074
## 5 PLS 1888865
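Least squares, the lasso, PCR and PLS give very similar test errors here, while ridge regression is noticeably worse at its cross-validated lambda. The PCR error matching the least-squares error exactly suggests that cross-validation selected the full set of components, in which case PCR reduces to ordinary least squares.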
Financial Application Example
Example: Predicting Stock Prices
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Get stock data (e.g., Apple Inc.)
getSymbols("AAPL", src = "yahoo", from = "2020-01-01", to = "2023-12-31")
## [1] "AAPL"
stock_data <- na.omit(AAPL)
# Check column names
colnames(stock_data)
## [1] "AAPL.Open" "AAPL.High" "AAPL.Low" "AAPL.Close"
## [5] "AAPL.Volume" "AAPL.Adjusted"
# Create predictors and response variable
stock_data <- data.frame(Date = index(stock_data), coredata(stock_data))
colnames(stock_data) <- make.names(colnames(stock_data))
stock_data <- stock_data %>%
mutate(Return = (AAPL.Close - lag(AAPL.Close)) / lag(AAPL.Close)) %>%
na.omit()
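One caveat about this setup: Return on day t is computed from that day's close, while the predictors used below are the same day's open, high, low, close and volume, so the models partly explain returns with contemporaneous price information. A sketch of a lagged specification that uses only prior-day information is shown here for illustration; it is not used in the fits that follow:
# Sketch: lag the price/volume predictors by one day (illustrative; not used below)
stock_data_lagged <- stock_data %>%
  mutate(across(c(AAPL.Open, AAPL.High, AAPL.Low, AAPL.Close, AAPL.Volume), lag)) %>%
  na.omit()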
Split Data into Training and Test Sets
# Set a random seed for reproducibility
set.seed(123)
# Split the data into training and test sets
train_index <- createDataPartition(stock_data$Return, p = 0.7, list = FALSE)
train_data <- stock_data[train_index, ]
test_data <- stock_data[-train_index, ]
# Check dimensions
dim(train_data)
## [1] 705 8
dim(test_data)
## [1] 300 8
Linear Model using Least Squares
# Fit linear model using least squares
lm_model <- lm(Return ~ AAPL.Open + AAPL.High + AAPL.Low + AAPL.Close + AAPL.Volume, data = train_data)
# Predict on test set
lm_predictions <- predict(lm_model, test_data)
# Calculate test error (MSE)
lm_mse <- mean((lm_predictions - test_data$Return)^2)
lm_mse
## [1] 0.0002262872
Ridge Regression
# Prepare data for ridge regression
x_train <- model.matrix(Return ~ AAPL.Open + AAPL.High + AAPL.Low + AAPL.Close + AAPL.Volume, train_data)[,-1]
y_train <- train_data$Return
x_test <- model.matrix(Return ~ AAPL.Open + AAPL.High + AAPL.Low + AAPL.Close + AAPL.Volume, test_data)[,-1]
y_test <- test_data$Return
# Fit ridge regression model with cross-validation
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0)
# Best lambda
best_lambda_ridge <- cv_ridge$lambda.min
# Predict on test set
ridge_predictions <- predict(cv_ridge, s = best_lambda_ridge, newx = x_test)
# Calculate test error (MSE)
ridge_mse <- mean((ridge_predictions - y_test)^2)
ridge_mse
## [1] 0.0003197523
Lasso Regression
# Fit lasso model with cross-validation
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1)
# Best lambda
best_lambda_lasso <- cv_lasso$lambda.min
# Predict on test set
lasso_predictions <- predict(cv_lasso, s = best_lambda_lasso, newx = x_test)
# Calculate test error (MSE)
lasso_mse <- mean((lasso_predictions - y_test)^2)
lasso_mse
## [1] 0.000228227
# Number of non-zero coefficients (including the intercept)
lasso_coef <- predict(cv_lasso, s = best_lambda_lasso, type = "coefficients")
sum(lasso_coef != 0)
## [1] 6
Principal Components Regression (PCR)
# Fit PCR model with cross-validation
pcr_model <- pcr(Return ~ AAPL.Open + AAPL.High + AAPL.Low + AAPL.Close + AAPL.Volume, data = train_data, scale = TRUE, validation = "CV")
# Choose number of components with lowest CV error
best_ncomp_pcr <- which.min(pcr_model$validation$PRESS)
# Predict on test set
pcr_predictions <- predict(pcr_model, test_data, ncomp = best_ncomp_pcr)
# Calculate test error (MSE)
pcr_mse <- mean((pcr_predictions - test_data$Return)^2)
pcr_mse
## [1] 0.0002262872
Partial Least Squares (PLS)
# Fit PLS model with cross-validation
pls_model <- plsr(Return ~ AAPL.Open + AAPL.High + AAPL.Low + AAPL.Close + AAPL.Volume, data = train_data, scale = TRUE, validation = "CV")
# Choose number of components with lowest CV error
best_ncomp_pls <- which.min(pls_model$validation$PRESS)
# Predict on test set
pls_predictions <- predict(pls_model, test_data, ncomp = best_ncomp_pls)
# Calculate test error (MSE)
pls_mse <- mean((pls_predictions - test_data$Return)^2)
pls_mse
## [1] 0.0002262872
Summary and Comparison
# Summarize and compare the test errors
test_errors <- data.frame(
Model = c("Linear Model", "Ridge Regression", "Lasso Regression", "PCR", "PLS"),
Test_Error = c(lm_mse, ridge_mse, lasso_mse, pcr_mse, pls_mse)
)
print(test_errors)
## Model Test_Error
## 1 Linear Model 0.0002262872
## 2 Ridge Regression 0.0003197523
## 3 Lasso Regression 0.0002282270
## 4 PCR 0.0002262872
## 5 PLS 0.0002262872
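As in the College example, the penalised and dimension-reduction methods offer little improvement over least squares: the lasso, PCR and PLS errors are essentially identical to the linear model's, and ridge is slightly worse. Given the caveat above about contemporaneous predictors, these small errors should not be read as evidence of a usable return-forecasting model.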