library(ISLR2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-9
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
library(ggplot2)
data("Smarket")
Smarket$Direction <- ifelse(Smarket$Direction == "Up", 1, 0) # Binary encoding
set.seed(1)
train_idx <- Smarket$Year < 2005
x <- model.matrix(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Smarket)[, -1]
y <- Smarket$Direction
x_train <- x[train_idx, ]
x_test <- x[!train_idx, ]
y_train <- y[train_idx]
y_test <- y[!train_idx]
# Linear Model
lm_fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Smarket, family = binomial, subset = train_idx)
probs <- predict(lm_fit, newdata = Smarket[!train_idx, ], type = "response")
preds_lm <- ifelse(probs > 0.5, 1, 0)
test_error_linear <- mean(preds_lm != y_test)
# Ridge Regression
grid <- 10^seq(10, -2, length = 100)
ridge_mod <- glmnet(x_train, y_train, alpha = 0, lambda = grid, family = "binomial")
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0, family = "binomial")
best_lambda_ridge <- cv_ridge$lambda.min
ridge_preds <- predict(ridge_mod, s = best_lambda_ridge, newx = x_test, type = "class")
test_error_ridge <- mean(ridge_preds != y_test)
# Lasso
lasso_mod <- glmnet(x_train, y_train, alpha = 1, lambda = grid, family = "binomial")
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1, family = "binomial")
best_lambda_lasso <- cv_lasso$lambda.min
lasso_preds <- predict(lasso_mod, s = best_lambda_lasso, newx = x_test, type = "class")
test_error_lasso <- mean(lasso_preds != y_test)
# PCR
pcr_fit <- pcr(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Smarket,
subset = train_idx, scale = TRUE, validation = "CV")
pcr_pred <- predict(pcr_fit, Smarket[!train_idx, ], ncomp = 3)
pcr_class <- ifelse(pcr_pred > 0.5, 1, 0)
test_error_pcr <- mean(pcr_class != y_test)
# PLS
pls_fit <- plsr(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Smarket,
subset = train_idx, scale = TRUE, validation = "CV")
pls_pred <- predict(pls_fit, Smarket[!train_idx, ], ncomp = 3)
pls_class <- ifelse(pls_pred > 0.5, 1, 0)
test_error_pls <- mean(pls_class != y_test)
results <- data.frame(
Method = c("Linear Model", "Ridge Regression", "Lasso", "PCR", "PLS"),
TestError = c(test_error_linear, test_error_ridge, test_error_lasso, test_error_pcr, test_error_pls)
)
print(results)
## Method TestError
## 1 Linear Model 0.5198413
## 2 Ridge Regression 0.4404762
## 3 Lasso 0.4404762
## 4 PCR 0.5476190
## 5 PLS 0.5198413
# Optional: Visualize
ggplot(results, aes(x = Method, y = TestError, fill = Method)) +
geom_col() +
theme_minimal() +
labs(title = "Test Error Comparison of 5 Models", y = "Test Error Rate", x = "")
