Introduction
In this project, I apply machine learning models to a finance problem: predicting whether the S&P 500 index will go Up or Down on a given day, based on its past performance (the lagged returns of the previous five trading days).
I use the Smarket dataset from the ISLR package and apply the following models:
Logistic Regression, Linear Discriminant Analysis (LDA), K-Nearest Neighbors (KNN), Ridge Regression, and Lasso Regression.
library(ISLR)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(class)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-9
# Load the Smarket data frame shipped with the ISLR package and show the
# type and first few values of each column.
data("Smarket", package = "ISLR")
str(Smarket)
## 'data.frame': 1250 obs. of 9 variables:
## $ Year : num 2001 2001 2001 2001 2001 ...
## $ Lag1 : num 0.381 0.959 1.032 -0.623 0.614 ...
## $ Lag2 : num -0.192 0.381 0.959 1.032 -0.623 ...
## $ Lag3 : num -2.624 -0.192 0.381 0.959 1.032 ...
## $ Lag4 : num -1.055 -2.624 -0.192 0.381 0.959 ...
## $ Lag5 : num 5.01 -1.055 -2.624 -0.192 0.381 ...
## $ Volume : num 1.19 1.3 1.41 1.28 1.21 ...
## $ Today : num 0.959 1.032 -0.623 0.614 0.213 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 2 2 1 2 2 2 1 2 2 2 ...
# Per-column summaries: quartiles and mean for the numeric columns, level
# counts for the Direction factor.
summary(Smarket)
## Year Lag1 Lag2 Lag3
## Min. :2001 Min. :-4.922000 Min. :-4.922000 Min. :-4.922000
## 1st Qu.:2002 1st Qu.:-0.639500 1st Qu.:-0.639500 1st Qu.:-0.640000
## Median :2003 Median : 0.039000 Median : 0.039000 Median : 0.038500
## Mean :2003 Mean : 0.003834 Mean : 0.003919 Mean : 0.001716
## 3rd Qu.:2004 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.596750
## Max. :2005 Max. : 5.733000 Max. : 5.733000 Max. : 5.733000
## Lag4 Lag5 Volume Today
## Min. :-4.922000 Min. :-4.92200 Min. :0.3561 Min. :-4.922000
## 1st Qu.:-0.640000 1st Qu.:-0.64000 1st Qu.:1.2574 1st Qu.:-0.639500
## Median : 0.038500 Median : 0.03850 Median :1.4229 Median : 0.038500
## Mean : 0.001636 Mean : 0.00561 Mean :1.4783 Mean : 0.003138
## 3rd Qu.: 0.596750 3rd Qu.: 0.59700 3rd Qu.:1.6417 3rd Qu.: 0.596750
## Max. : 5.733000 Max. : 5.73300 Max. :3.1525 Max. : 5.733000
## Direction
## Down:602
## Up :648
##
##
##
##
# Exploratory Analysis
# Direction is a factor, so it is left out of both the scatterplot matrix and
# the correlation matrix. Selecting by name avoids relying on it being the
# ninth column.
numeric_vars <- Smarket[, names(Smarket) != "Direction"]
pairs(numeric_vars)
cor(numeric_vars)
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 0.029699649 0.030596422 0.033194581 0.035688718
## Lag1 0.02969965 1.000000000 -0.026294328 -0.010803402 -0.002985911
## Lag2 0.03059642 -0.026294328 1.000000000 -0.025896670 -0.010853533
## Lag3 0.03319458 -0.010803402 -0.025896670 1.000000000 -0.024051036
## Lag4 0.03568872 -0.002985911 -0.010853533 -0.024051036 1.000000000
## Lag5 0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641
## Volume 0.53900647 0.040909908 -0.043383215 -0.041823686 -0.048414246
## Today 0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527
## Lag5 Volume Today
## Year 0.029787995 0.53900647 0.030095229
## Lag1 -0.005674606 0.04090991 -0.026155045
## Lag2 -0.003557949 -0.04338321 -0.010250033
## Lag3 -0.018808338 -0.04182369 -0.002447647
## Lag4 -0.027083641 -0.04841425 -0.006899527
## Lag5 1.000000000 -0.02200231 -0.034860083
## Volume -0.022002315 1.00000000 0.014591823
## Today -0.034860083 0.01459182 1.000000000
# Logistic Regression
# Logistic regression of Direction on the five lagged returns. The model is
# fit on all 1250 observations; no train/test split is applied at this stage.
logit_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5,
data = Smarket,
family = binomial)
# Coefficient estimates with z-tests, plus null and residual deviance.
summary(logit_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5, family = binomial,
## data = Smarket)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.074163 0.056674 1.309 0.191
## Lag1 -0.071325 0.050104 -1.424 0.155
## Lag2 -0.044136 0.050025 -0.882 0.378
## Lag3 0.009229 0.049879 0.185 0.853
## Lag4 0.007211 0.049898 0.145 0.885
## Lag5 0.009311 0.049490 0.188 0.851
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1731.2 on 1249 degrees of freedom
## Residual deviance: 1728.3 on 1244 degrees of freedom
## AIC: 1740.3
##
## Number of Fisher Scoring iterations: 3
Predict and Evaluate
# In-sample fitted probabilities of "Up" (the second level of Direction).
# With no new data supplied, these are simply the model's fitted values.
pred_probs <- fitted(logit_model)
# Label an observation "Up" when its fitted probability exceeds 0.5.
pred_classes <- ifelse(pred_probs > 0.5, "Up", "Down")
# Confusion matrix against the observed directions. These are the same
# observations the model was fit on, so this is not an out-of-sample check.
with(Smarket, table(Predicted = pred_classes, Actual = Direction))
## Actual
## Predicted Down Up
## Down 116 98
## Up 486 550
# Share of in-sample predictions that match the observed direction.
mean(pred_classes == Smarket$Direction)
## [1] 0.5328
# Linear Discriminant Analysis (LDA)
# LDA using only the two most recent lags, fit on the full dataset.
lda_model <- lda(Direction ~ Lag1 + Lag2, data = Smarket)
# Called without new data, predict() classifies the observations the model
# was fit on, so the figures below are in-sample.
lda_pred <- predict(lda_model)
lda_class <- lda_pred$class
table(Predicted = lda_class, Actual = Smarket$Direction)
## Actual
## Predicted Down Up
## Down 114 102
## Up 488 546
# Share of in-sample predictions that match the observed direction.
mean(lda_class == Smarket$Direction)
## [1] 0.528
# K-Nearest Neighbors (KNN)
# Standardise the two predictors so that neither dominates the distance
# computation.
# NOTE(review): the centring and scaling use every row, including the
# held-out ones. Consider computing the mean/sd on the training rows only and
# applying them to the test rows.
X <- scale(Smarket[, c("Lag1", "Lag2")])
Y <- Smarket$Direction

# First 1000 rows are used for training; all remaining rows are held out.
train_index <- seq_len(1000)
test_index <- setdiff(seq_len(nrow(Smarket)), train_index)

knn_train <- X[train_index, ]
knn_test <- X[test_index, ]
train_labels <- Y[train_index]
test_labels <- Y[test_index]

# knn() breaks ties at random, so the seed is fixed immediately before the
# call to keep the result reproducible.
set.seed(123)
knn_pred <- knn(
  train = knn_train,
  test = knn_test,
  cl = train_labels,
  k = 5
)
table(Predicted = knn_pred, Actual = test_labels)
## Actual
## Predicted Down Up
## Down 39 60
## Up 70 81
# Held-out accuracy of the 5-nearest-neighbour classifier.
mean(knn_pred == test_labels)
## [1] 0.48
# Ridge and Lasso Regression: Prepare Data
# glmnet needs a numeric predictor matrix and a numeric response rather than
# a formula. model.matrix() puts the intercept in the first column; it is
# dropped here.
design <- model.matrix(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5,
  data = Smarket
)
x <- design[, -1]
# Encode the response as 1 for "Up" and 0 for "Down".
y <- as.numeric(Smarket$Direction == "Up")

# Split predictors and response with the train/test indices defined earlier.
train_x <- x[train_index, ]
train_y <- y[train_index]
test_x <- x[test_index, ]
test_y <- y[test_index]
Ridge Regression (alpha = 0)
# Cross-validation folds are assigned at random, so fix the seed first.
set.seed(123)
# alpha = 0 selects the ridge (L2) penalty; lambda is chosen by
# cross-validation on the training rows only.
cv_ridge <- cv.glmnet(
  x = train_x,
  y = train_y,
  family = "binomial",
  alpha = 0
)
best_lambda_ridge <- cv_ridge$lambda.min
# Refit at the selected lambda.
# NOTE(review): the glmnet documentation advises against supplying a single
# lambda value; predict(cv_ridge, newx = test_x, s = "lambda.min",
# type = "response") would avoid this refit.
ridge_model <- glmnet(
  x = train_x,
  y = train_y,
  family = "binomial",
  alpha = 0,
  lambda = best_lambda_ridge
)
# Predicted probabilities of "Up" (coded 1) for the held-out rows.
ridge_probs <- predict(
  ridge_model,
  newx = test_x,
  s = best_lambda_ridge,
  type = "response"
)
ridge_pred <- ifelse(ridge_probs > 0.5, 1, 0)
# Held-out accuracy.
mean(ridge_pred == test_y)
## [1] 0.564
Lasso Regression (alpha = 1)
# Cross-validation folds are assigned at random, so fix the seed first.
set.seed(123)
# alpha = 1 selects the lasso (L1) penalty; lambda is chosen by
# cross-validation on the training rows only.
cv_lasso <- cv.glmnet(
  x = train_x,
  y = train_y,
  family = "binomial",
  alpha = 1
)
best_lambda_lasso <- cv_lasso$lambda.min
# Refit at the selected lambda.
# NOTE(review): the glmnet documentation advises against supplying a single
# lambda value; predict(cv_lasso, newx = test_x, s = "lambda.min",
# type = "response") would avoid this refit.
lasso_model <- glmnet(
  x = train_x,
  y = train_y,
  family = "binomial",
  alpha = 1,
  lambda = best_lambda_lasso
)
# Predicted probabilities of "Up" (coded 1) for the held-out rows.
lasso_probs <- predict(
  lasso_model,
  newx = test_x,
  s = best_lambda_lasso,
  type = "response"
)
lasso_pred <- ifelse(lasso_probs > 0.5, 1, 0)
# Held-out accuracy.
mean(lasso_pred == test_y)
## [1] 0.564
Conclusion
I applied several machine learning models to predict market direction:
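Logistic Regression and LDA reached about 53% accuracy (53.3% and 52.8%), but these figures are in-sample: both models were fit and evaluated on the full dataset, where 51.8% of days are Up. KNN with k = 5 scored 48% on the held-out observations (rows 1001–1250), which is no better than chance; other values of k were not tried here and may give different accuracy. Ridge and Lasso both reached 56.4% on the same held-out set. That is exactly the share of Up days in that set (141 of 250), so it matches what always predicting Up would achieve; regularization keeps the models stable, but they remain limited by the weak signal in the lagged-return features. Because the in-sample and held-out figures are not directly comparable, all models should be re-evaluated on a common test set before they are ranked. This type of model can be extended to ETF datasets or intraday trading signals for more realistic financial forecasting.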