Introduction

In this project, I apply machine learning models to a finance problem: predicting whether the S&P 500 index will go Up or Down on a given day, based on its past performance (lagged returns).

I use the Smarket dataset from the ISLR package and apply the following models:

Logistic Regression, Linear Discriminant Analysis (LDA), K-Nearest Neighbors (KNN), Ridge Regression, and Lasso Regression (the last two fitted as penalized logistic regressions).

Load Packages and Data

library(ISLR)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(class)
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-9
# Load the Smarket data set from the ISLR package into the workspace, then
# print its column types followed by per-column summary statistics.
data("Smarket", package = "ISLR")
str(Smarket)
## 'data.frame':    1250 obs. of  9 variables:
##  $ Year     : num  2001 2001 2001 2001 2001 ...
##  $ Lag1     : num  0.381 0.959 1.032 -0.623 0.614 ...
##  $ Lag2     : num  -0.192 0.381 0.959 1.032 -0.623 ...
##  $ Lag3     : num  -2.624 -0.192 0.381 0.959 1.032 ...
##  $ Lag4     : num  -1.055 -2.624 -0.192 0.381 0.959 ...
##  $ Lag5     : num  5.01 -1.055 -2.624 -0.192 0.381 ...
##  $ Volume   : num  1.19 1.3 1.41 1.28 1.21 ...
##  $ Today    : num  0.959 1.032 -0.623 0.614 0.213 ...
##  $ Direction: Factor w/ 2 levels "Down","Up": 2 2 1 2 2 2 1 2 2 2 ...
summary(Smarket)
##       Year           Lag1                Lag2                Lag3          
##  Min.   :2001   Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.922000  
##  1st Qu.:2002   1st Qu.:-0.639500   1st Qu.:-0.639500   1st Qu.:-0.640000  
##  Median :2003   Median : 0.039000   Median : 0.039000   Median : 0.038500  
##  Mean   :2003   Mean   : 0.003834   Mean   : 0.003919   Mean   : 0.001716  
##  3rd Qu.:2004   3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.596750  
##  Max.   :2005   Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.733000  
##       Lag4                Lag5              Volume           Today          
##  Min.   :-4.922000   Min.   :-4.92200   Min.   :0.3561   Min.   :-4.922000  
##  1st Qu.:-0.640000   1st Qu.:-0.64000   1st Qu.:1.2574   1st Qu.:-0.639500  
##  Median : 0.038500   Median : 0.03850   Median :1.4229   Median : 0.038500  
##  Mean   : 0.001636   Mean   : 0.00561   Mean   :1.4783   Mean   : 0.003138  
##  3rd Qu.: 0.596750   3rd Qu.: 0.59700   3rd Qu.:1.6417   3rd Qu.: 0.596750  
##  Max.   : 5.733000   Max.   : 5.73300   Max.   :3.1525   Max.   : 5.733000  
##  Direction 
##  Down:602  
##  Up  :648  
##            
##            
##            
## 

# Exploratory Analysis

# Scatterplot matrix and correlation matrix of every column except the
# Direction factor (column 9); cor() only accepts numeric columns.
pairs(Smarket[, setdiff(names(Smarket), "Direction")])

cor(Smarket[, setdiff(names(Smarket), "Direction")])
##              Year         Lag1         Lag2         Lag3         Lag4
## Year   1.00000000  0.029699649  0.030596422  0.033194581  0.035688718
## Lag1   0.02969965  1.000000000 -0.026294328 -0.010803402 -0.002985911
## Lag2   0.03059642 -0.026294328  1.000000000 -0.025896670 -0.010853533
## Lag3   0.03319458 -0.010803402 -0.025896670  1.000000000 -0.024051036
## Lag4   0.03568872 -0.002985911 -0.010853533 -0.024051036  1.000000000
## Lag5   0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641
## Volume 0.53900647  0.040909908 -0.043383215 -0.041823686 -0.048414246
## Today  0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527
##                Lag5      Volume        Today
## Year    0.029787995  0.53900647  0.030095229
## Lag1   -0.005674606  0.04090991 -0.026155045
## Lag2   -0.003557949 -0.04338321 -0.010250033
## Lag3   -0.018808338 -0.04182369 -0.002447647
## Lag4   -0.027083641 -0.04841425 -0.006899527
## Lag5    1.000000000 -0.02200231 -0.034860083
## Volume -0.022002315  1.00000000  0.014591823
## Today  -0.034860083  0.01459182  1.000000000

# Logistic Regression

# Logistic regression of Direction on the five lagged returns, fitted to
# every row of Smarket (no hold-out at this stage).
logit_model <- glm(
  formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5,
  family = binomial,
  data = Smarket
)
summary(logit_model)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5, family = binomial, 
##     data = Smarket)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.074163   0.056674   1.309    0.191
## Lag1        -0.071325   0.050104  -1.424    0.155
## Lag2        -0.044136   0.050025  -0.882    0.378
## Lag3         0.009229   0.049879   0.185    0.853
## Lag4         0.007211   0.049898   0.145    0.885
## Lag5         0.009311   0.049490   0.188    0.851
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1728.3  on 1244  degrees of freedom
## AIC: 1740.3
## 
## Number of Fisher Scoring iterations: 3

Predict and Evaluate

# Fitted probabilities for the rows the model was trained on; a row is
# labelled "Up" when its probability exceeds 0.5, otherwise "Down".
pred_probs <- predict(logit_model, type = "response")
pred_classes <- ifelse(pred_probs > 0.5, yes = "Up", no = "Down")

# Confusion matrix: predicted labels in rows, observed direction in columns.
table(pred_classes, Smarket[["Direction"]], dnn = c("Predicted", "Actual"))
##          Actual
## Predicted Down  Up
##      Down  116  98
##      Up    486 550
# Share of rows classified correctly (in-sample, so an optimistic figure).
mean(pred_classes == Smarket[["Direction"]])
## [1] 0.5328

# Linear Discriminant Analysis (LDA)

# Linear discriminant analysis on the two most recent lags, fitted to all
# rows; predict() without new data scores those same rows.
lda_model <- lda(Direction ~ Lag1 + Lag2, data = Smarket)
lda_pred <- predict(lda_model)

# Confusion matrix: predicted class in rows, observed direction in columns.
table(lda_pred[["class"]], Smarket[["Direction"]],
      dnn = c("Predicted", "Actual"))
##          Actual
## Predicted Down  Up
##      Down  114 102
##      Up    488 546
# Share of rows classified correctly (in-sample).
mean(lda_pred[["class"]] == Smarket[["Direction"]])
## [1] 0.528

# K-Nearest Neighbors (KNN)

# K-nearest-neighbours on Lag1 and Lag2 with a positional hold-out:
# rows 1-1000 are used for training, the remaining rows for testing.
Y <- Smarket$Direction

train_index <- seq_len(1000)
test_index <- 1001:nrow(Smarket)

# Standardise with the mean and standard deviation of the TRAINING rows only
# and apply that same transformation to every row. Scaling on the full data
# set (as before) lets test-row statistics leak into the training features.
lag_cols <- c("Lag1", "Lag2")
train_center <- colMeans(Smarket[train_index, lag_cols])
train_sd <- vapply(Smarket[train_index, lag_cols], sd, numeric(1))
X <- scale(Smarket[, lag_cols], center = train_center, scale = train_sd)

# NOTE: the confusion matrix and accuracy printed below were produced with
# whole-sample scaling; re-run the script to refresh them.

# knn() breaks distance ties at random, so fix the seed for reproducibility.
set.seed(123)
knn_pred <- knn(train = X[train_index, ],
                test = X[test_index, ],
                cl = Y[train_index],
                k = 5)

# Confusion matrix for the held-out rows: KNN predictions in rows, observed
# direction in columns.
table(knn_pred, Y[test_index], dnn = c("Predicted", "Actual"))
##          Actual
## Predicted Down Up
##      Down   39 60
##      Up     70 81
# Share of held-out rows classified correctly.
mean(Y[test_index] == knn_pred)
## [1] 0.48

# Ridge and Lasso Regression: Prepare Data

# glmnet needs a numeric predictor matrix and a numeric response rather than
# a formula. Build the design matrix from the five lags and drop its first
# column (the intercept), then code Direction as 1 for "Up" and 0 otherwise.
x <- model.matrix(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5,
  data = Smarket
)[, -1]
y <- as.numeric(Smarket$Direction == "Up")

# Reuse the train/test row positions defined for the KNN model.
train_x <- x[train_index, ]
train_y <- y[train_index]
test_x <- x[test_index, ]
test_y <- y[test_index]

Ridge Regression (alpha = 0)

# Ridge-penalised logistic regression (alpha = 0). The seed fixes the random
# fold assignment made by cv.glmnet().
set.seed(123)
cv_ridge <- cv.glmnet(
  train_x, train_y,
  alpha = 0,
  family = "binomial"
)
# Penalty with the lowest cross-validated error.
best_lambda_ridge <- cv_ridge[["lambda.min"]]

# Refit at the chosen penalty and score the held-out rows.
ridge_model <- glmnet(
  train_x, train_y,
  alpha = 0,
  lambda = best_lambda_ridge,
  family = "binomial"
)
ridge_probs <- predict(
  ridge_model,
  s = best_lambda_ridge,
  newx = test_x,
  type = "response"
)
# Predict 1 ("Up") when the estimated probability exceeds 0.5.
ridge_pred <- ifelse(ridge_probs > 0.5, yes = 1, no = 0)
mean(ridge_pred == test_y)
## [1] 0.564

Lasso Regression (alpha = 1)

# Lasso-penalised logistic regression (alpha = 1). The seed fixes the random
# fold assignment made by cv.glmnet().
set.seed(123)
cv_lasso <- cv.glmnet(
  train_x, train_y,
  alpha = 1,
  family = "binomial"
)
# Penalty with the lowest cross-validated error.
best_lambda_lasso <- cv_lasso[["lambda.min"]]

# Refit at the chosen penalty and score the held-out rows.
lasso_model <- glmnet(
  train_x, train_y,
  alpha = 1,
  lambda = best_lambda_lasso,
  family = "binomial"
)
lasso_probs <- predict(
  lasso_model,
  s = best_lambda_lasso,
  newx = test_x,
  type = "response"
)
# Predict 1 ("Up") when the estimated probability exceeds 0.5.
lasso_pred <- ifelse(lasso_probs > 0.5, yes = 1, no = 0)
mean(lasso_pred == test_y)
## [1] 0.564

Conclusion

I applied several machine learning models to predict market direction:

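Logistic Regression and LDA reached about 53% accuracy (53.3% and 52.8%), but both figures are measured on the same data the models were fitted to, so they are optimistic. KNN with k = 5 reached 48% on the held-out observations (rows 1001–1250); the result depends on the choice of k. Ridge and Lasso each reached 56.4% on the same held-out observations, which is exactly the share of Up days in that set (141 of 250), so regularization did not beat the trivial strategy of always predicting Up; all models remain limited by the weak signal in the lagged-return features. This type of analysis can be extended to ETF datasets or intraday trading signals for more realistic financial forecasting.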