library(tidyquant)
library(dplyr)
library(tidyr)
library(lubridate)
library(glmnet)
library(ggplot2)
library(ranger) # Random Forest
library(xgboost) # Gradient boosting
library(torch) # Neural network
library(luz) # Training wrapper for torch
library(corrplot) # Correlation heatmaps
library(timetk) # Time series feature engineering
My goal is to predict next month’s S&P 500 return using a mix of macroeconomic, yield-curve, credit, risk-sentiment, and momentum variables that have been widely studied in empirical asset pricing and forecasting.
I group my predictors into five blocks: macroeconomic activity (GDP growth, unemployment, industrial production, inflation), yield curve (term spread, 10-year Treasury yield), credit (BAA − AAA spread), risk sentiment (VIX), and momentum (lagged S&P 500 returns).
Because I am predicting a rate, I use other rates (growth rates, differences, spreads) instead of levels whenever possible. Rates have a cleaner economic interpretation and tend to produce more stable relationships in forecasting contexts.
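As a minimal illustration of this convention (a sketch only; each series below gets its own explicit transformation, and `yoy` is a hypothetical helper, not part of the pipeline), a monthly level series can be converted to a year-over-year growth rate as:
# Year-over-year growth (%) of a monthly level series x:
# compare each value to its value 12 months earlier
yoy <- function(x) (x / dplyr::lag(x, 12) - 1) * 100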
# Data: for prices, I extract data starting slightly before 1985
# (the extra history is needed for the lags constructed below)
# Get S&P500 daily data
SP500_simple_returns = tq_get("^GSPC",
from = "1984-01-01",
to = "2024-12-31",
get = "stock.prices") |>
tq_transmute(
select = adjusted, # adjusted price (accounts for splits/dividends)
mutate_fun = periodReturn, # calculate period returns
period = "monthly", # monthly returns
type = "arithmetic", # simple return (percentage change)
col_rename = "monthly_return"
) |>
mutate(monthly_return = monthly_return*100) # Convert to percentage
head(SP500_simple_returns)
## # A tibble: 6 × 2
## date monthly_return
## <date> <dbl>
## 1 1984-01-31 -0.384
## 2 1984-02-29 -3.89
## 3 1984-03-30 1.35
## 4 1984-04-30 0.547
## 5 1984-05-31 -5.94
## 6 1984-06-29 1.75
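For reference, `periodReturn` with `type = "arithmetic"` computes the simple return from month-end adjusted closes; after the ×100 scaling above, \(r_t = 100\,(P_t/P_{t-1} - 1)\), where \(P_t\) is the adjusted closing price at the end of month \(t\).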
## 1. GDP growth rate (quarterly)
gdp_data <- tq_get("A191RL1Q225SBEA",
get = "economic.data",
from = "1984-01-01") |>
rename(GDP_g = price) |>
mutate(date = date %m+% months(2)) # shift the quarterly stamp to the last month of the quarter (e.g., the Q1 value dated 1984-01-01 moves to 1984-03-01)
## 2. Unemployment rate (monthly)
unemp_data <- tq_get("UNRATE",
get = "economic.data",
from = "1984-01-01") |>
rename(unemp = price)
## 3. Industrial production index -> convert to year-over-year growth (%)
ip_data <- tq_get("INDPRO",
get = "economic.data",
from = "1984-01-01") |>
rename(INDPRO = price) |>
arrange(date) |>
mutate(ip_yoy = (INDPRO / lag(INDPRO, 12) - 1) * 100) |>
select(date, ip_yoy)
## 4. CPI index -> convert to year-over-year inflation (%)
cpi_data <- tq_get("CPIAUCSL",
get = "economic.data",
from = "1984-01-01") |>
rename(CPI = price) |>
arrange(date) |>
mutate(infl_yoy = (CPI / lag(CPI, 12) - 1) * 100) |>
select(date, infl_yoy)
## 5. Term spread (10-year minus 3-month Treasury)
term_data <- tq_get("T10Y3M",
get = "economic.data",
from = "1984-01-01") |>
rename(term_spread = price)
## 6. Credit spread: BAA - AAA corporate yields
aaa_data <- tq_get("AAA",
get = "economic.data",
from = "1984-01-01") |>
rename(AAA_yield = price)
baa_data <- tq_get("BAA",
get = "economic.data",
from = "1984-01-01") |>
rename(BAA_yield = price)
credit_data <- aaa_data |>
inner_join(baa_data, by = "date") |>
mutate(credit_spread = BAA_yield - AAA_yield) |>
select(date, credit_spread)
## 7. VIX index (daily) -> monthly average
vix_daily <- tq_get("VIXCLS",
get = "economic.data",
from = "1984-01-01")
vix_monthly <- vix_daily |>
mutate(month = floor_date(date, "month")) |>
group_by(month) |>
summarise(VIX = mean(price, na.rm = TRUE), .groups = "drop") |>
rename(date = month)
## 8. 10-year Treasury yield level
gs10_data <- tq_get("GS10",
get = "economic.data",
from = "1984-01-01") |>
rename(GS10 = price)
Total_data <- SP500_simple_returns |>
mutate(date = floor_date(date, unit = "month")) |>
left_join(gdp_data, by = "date") |>
left_join(unemp_data, by = "date") |>
left_join(ip_data, by = "date") |>
left_join(cpi_data, by = "date") |>
left_join(term_data, by = "date") |>
left_join(credit_data, by = "date") |>
left_join(vix_monthly, by = "date") |>
left_join(gs10_data, by = "date") |>
arrange(date) |>
mutate(
lag_return_1m = lag(monthly_return, 1),
lag_return_3m = lag(monthly_return, 3),
lag_return_6m = lag(monthly_return, 6)
)
head(Total_data)
## # A tibble: 6 × 17
## date monthly_return symbol.x GDP_g symbol.y unemp ip_yoy infl_yoy
## <date> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1984-01-01 -0.384 <NA> NA UNRATE 8 NA NA
## 2 1984-02-01 -3.89 <NA> NA UNRATE 7.8 NA NA
## 3 1984-03-01 1.35 A191RL1Q225SBEA 8.1 UNRATE 7.8 NA NA
## 4 1984-04-01 0.547 <NA> NA UNRATE 7.7 NA NA
## 5 1984-05-01 -5.94 <NA> NA UNRATE 7.4 NA NA
## 6 1984-06-01 1.75 A191RL1Q225SBEA 7.1 UNRATE 7.2 NA NA
## # ℹ 9 more variables: symbol.x.x <chr>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, symbol.y.y <chr>, GS10 <dbl>, lag_return_1m <dbl>,
## # lag_return_3m <dbl>, lag_return_6m <dbl>
Total_data <- Total_data |>
arrange(date) |>
# Fill down macro variables
tidyr::fill(GDP_g, unemp, ip_yoy, infl_yoy,
term_spread, credit_spread, VIX, GS10,
.direction = "down") |>
# Publication lag adjustments: quarterly GDP is released with a long delay
# (hence 4 months); the monthly series arrive about a month after their reference month
mutate(
GDP_g = lag(GDP_g, 4),
unemp = lag(unemp, 1),
ip_yoy = lag(ip_yoy, 1),
infl_yoy = lag(infl_yoy, 1)
)
head(Total_data)
## # A tibble: 6 × 17
## date monthly_return symbol.x GDP_g symbol.y unemp ip_yoy infl_yoy
## <date> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1984-01-01 -0.384 <NA> NA UNRATE NA NA NA
## 2 1984-02-01 -3.89 <NA> NA UNRATE 8 NA NA
## 3 1984-03-01 1.35 A191RL1Q225SBEA NA UNRATE 7.8 NA NA
## 4 1984-04-01 0.547 <NA> NA UNRATE 7.8 NA NA
## 5 1984-05-01 -5.94 <NA> NA UNRATE 7.7 NA NA
## 6 1984-06-01 1.75 A191RL1Q225SBEA NA UNRATE 7.4 NA NA
## # ℹ 9 more variables: symbol.x.x <chr>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, symbol.y.y <chr>, GS10 <dbl>, lag_return_1m <dbl>,
## # lag_return_3m <dbl>, lag_return_6m <dbl>
### Create predicting_date and predicting_return
Total_data <- Total_data |>
mutate(
predicting_date = lead(date),
predicting_return = lead(monthly_return)
) |>
# columns we actually want as predictors
select(
predicting_date,
predicting_return,
monthly_return,
GDP_g,
unemp,
ip_yoy,
infl_yoy,
term_spread,
credit_spread,
VIX,
GS10,
lag_return_1m,
lag_return_3m,
lag_return_6m
) |>
filter(
predicting_date >= as.Date("1985-01-01"),
predicting_date <= as.Date("2024-12-01")
)
Total_data <- Total_data |> tidyr::drop_na()
# Checks
head(Total_data, 3)
## # A tibble: 3 × 14
## predicting_date predicting_return monthly_return GDP_g unemp ip_yoy infl_yoy
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1990-02-01 0.854 -6.88 3 5.4 -0.0507 4.64
## 2 1990-03-01 2.43 0.854 3 5.4 -0.849 5.20
## 3 1990-04-01 -2.69 2.43 3 5.3 0.496 5.26
## # ℹ 7 more variables: term_spread <dbl>, credit_spread <dbl>, VIX <dbl>,
## # GS10 <dbl>, lag_return_1m <dbl>, lag_return_3m <dbl>, lag_return_6m <dbl>
tail(Total_data, 3)
## # A tibble: 3 × 14
## predicting_date predicting_return monthly_return GDP_g unemp ip_yoy infl_yoy
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2024-10-01 -0.990 2.02 0.8 4.2 -0.405 2.61
## 2 2024-11-01 5.73 -0.990 3.6 4.1 -1.20 2.43
## 3 2024-12-01 -2.08 5.73 3.6 4.1 -0.995 2.57
## # ℹ 7 more variables: term_spread <dbl>, credit_spread <dbl>, VIX <dbl>,
## # GS10 <dbl>, lag_return_1m <dbl>, lag_return_3m <dbl>, lag_return_6m <dbl>
anyNA(Total_data)
## [1] FALSE
nrow(Total_data)
## [1] 419
Total_data <- SP500_simple_returns |>
mutate(
# (SP500 was month-end originally)
date = floor_date(date, unit = "month")
) |>
left_join(gdp_data, by = "date") |> # join quarterly real activity (GDP_g)
left_join(unemp_data, by = "date") |> # join monthly unemployment
left_join(ip_data, by = "date") |> # join industrial production (YoY)
left_join(cpi_data, by = "date") |> # join CPI inflation (YoY)
left_join(term_data, by = "date") |> # join yield-curve term spread
left_join(credit_data, by = "date") |> # join credit spread (BAA - AAA)
left_join(vix_monthly, by = "date") |> # join VIX monthly averages
left_join(gs10_data, by = "date") |> # join 10-year Treasury yield
arrange(date) # keep everything time-ordered
head(Total_data)
## # A tibble: 6 × 14
## date monthly_return symbol.x GDP_g symbol.y unemp ip_yoy infl_yoy
## <date> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1984-01-01 -0.384 <NA> NA UNRATE 8 NA NA
## 2 1984-02-01 -3.89 <NA> NA UNRATE 7.8 NA NA
## 3 1984-03-01 1.35 A191RL1Q225SBEA 8.1 UNRATE 7.8 NA NA
## 4 1984-04-01 0.547 <NA> NA UNRATE 7.7 NA NA
## 5 1984-05-01 -5.94 <NA> NA UNRATE 7.4 NA NA
## 6 1984-06-01 1.75 A191RL1Q225SBEA 7.1 UNRATE 7.2 NA NA
## # ℹ 6 more variables: symbol.x.x <chr>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, symbol.y.y <chr>, GS10 <dbl>
We use `floor_date()` to force all dates to the start of the month so the macro data aligns correctly with the SP500 returns.
**monthly_return** is the current SP500 return. I also want lagged returns as predictors of the future return, so I use the `lag()` function to construct lagged returns at 1, 3, and 6 months.
Total_data <- Total_data |>
mutate(
lag_return_1m = lag(monthly_return, 1), # last month’s return
lag_return_3m = lag(monthly_return, 3), # 3-month lag (short-term momentum)
lag_return_6m = lag(monthly_return, 6) # 6-month lag (medium-term momentum)
)
head(Total_data)
## # A tibble: 6 × 17
## date monthly_return symbol.x GDP_g symbol.y unemp ip_yoy infl_yoy
## <date> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1984-01-01 -0.384 <NA> NA UNRATE 8 NA NA
## 2 1984-02-01 -3.89 <NA> NA UNRATE 7.8 NA NA
## 3 1984-03-01 1.35 A191RL1Q225SBEA 8.1 UNRATE 7.8 NA NA
## 4 1984-04-01 0.547 <NA> NA UNRATE 7.7 NA NA
## 5 1984-05-01 -5.94 <NA> NA UNRATE 7.4 NA NA
## 6 1984-06-01 1.75 A191RL1Q225SBEA 7.1 UNRATE 7.2 NA NA
## # ℹ 9 more variables: symbol.x.x <chr>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, symbol.y.y <chr>, GS10 <dbl>, lag_return_1m <dbl>,
## # lag_return_3m <dbl>, lag_return_6m <dbl>
### Create predicting_date and predicting_return
# predicting_date = the month we want to predict
# predicting_return = the SP500 return of that next month (our target variable)
Total_data <- Total_data |>
mutate(
predicting_date = lead(date), # shift dates forward by 1 month
predicting_return = lead(monthly_return) # shift returns forward by 1 month
) |>
select(predicting_date, predicting_return, everything(), -date) |>
filter(
predicting_date >= as.Date("1985-01-01"),
predicting_date <= as.Date("2024-12-01")
)
# Quick checks
head(Total_data, 3)
## # A tibble: 3 × 18
## predicting_date predicting_return monthly_return symbol.x GDP_g symbol.y unemp
## <date> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
## 1 1985-01-01 7.41 2.24 A191RL1… 3.3 UNRATE 7.3
## 2 1985-02-01 0.863 7.41 <NA> NA UNRATE 7.3
## 3 1985-03-01 -0.287 0.863 <NA> NA UNRATE 7.2
## # ℹ 11 more variables: ip_yoy <dbl>, infl_yoy <dbl>, symbol.x.x <chr>,
## # term_spread <dbl>, credit_spread <dbl>, VIX <dbl>, symbol.y.y <chr>,
## # GS10 <dbl>, lag_return_1m <dbl>, lag_return_3m <dbl>, lag_return_6m <dbl>
tail(Total_data, 3)
## # A tibble: 3 × 18
## predicting_date predicting_return monthly_return symbol.x GDP_g symbol.y unemp
## <date> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
## 1 2024-10-01 -0.990 2.02 A191RL1… 3.3 UNRATE 4.1
## 2 2024-11-01 5.73 -0.990 <NA> NA UNRATE 4.1
## 3 2024-12-01 -2.08 5.73 <NA> NA UNRATE 4.2
## # ℹ 11 more variables: ip_yoy <dbl>, infl_yoy <dbl>, symbol.x.x <chr>,
## # term_spread <dbl>, credit_spread <dbl>, VIX <dbl>, symbol.y.y <chr>,
## # GS10 <dbl>, lag_return_1m <dbl>, lag_return_3m <dbl>, lag_return_6m <dbl>
anyNA(Total_data)
## [1] TRUE
nrow(Total_data)
## [1] 480
Total_data <- Total_data |>
select(
predicting_date,
predicting_return, # future return (target)
monthly_return, # current SP500 return
lag_return_1m, lag_return_3m, lag_return_6m,
GDP_g, unemp, ip_yoy, infl_yoy,
term_spread, credit_spread, VIX, GS10
)
# Drop rows with missing values (like we did in HW regressions)
Total_data <- Total_data |>
tidyr::drop_na()
# Check again
head(Total_data, 3)
## # A tibble: 3 × 14
## predicting_date predicting_return monthly_return lag_return_1m lag_return_3m
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 1990-04-01 -2.69 2.43 0.854 2.14
## 2 1990-07-01 -0.522 -0.889 9.20 2.43
## 3 1991-04-01 0.0320 2.22 6.73 2.48
## # ℹ 9 more variables: lag_return_6m <dbl>, GDP_g <dbl>, unemp <dbl>,
## # ip_yoy <dbl>, infl_yoy <dbl>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, GS10 <dbl>
tail(Total_data, 3)
## # A tibble: 3 × 14
## predicting_date predicting_return monthly_return lag_return_1m lag_return_3m
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 2023-10-01 -2.20 -4.87 -1.77 6.47
## 2 2024-01-01 1.59 4.42 8.92 -4.87
## 3 2024-04-01 -4.16 3.10 5.17 4.42
## # ℹ 9 more variables: lag_return_6m <dbl>, GDP_g <dbl>, unemp <dbl>,
## # ip_yoy <dbl>, infl_yoy <dbl>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, GS10 <dbl>
anyNA(Total_data)
## [1] FALSE
nrow(Total_data)
## [1] 95
I will use a simple approach to make the quarterly data monthly: carry the most recent quarterly value forward to fill the in-between months. We can achieve this with `fill()` from tidyr.
# Fill the GDP_g downward, quarterly to monthly
Total_data = Total_data |>
tidyr::fill(GDP_g, .direction = "down")
head(Total_data)
## # A tibble: 6 × 14
## predicting_date predicting_return monthly_return lag_return_1m lag_return_3m
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 1990-04-01 -2.69 2.43 0.854 2.14
## 2 1990-07-01 -0.522 -0.889 9.20 2.43
## 3 1991-04-01 0.0320 2.22 6.73 2.48
## 4 1992-07-01 3.94 -1.74 0.0964 -2.18
## 5 1992-10-01 0.211 0.911 -2.40 -1.74
## 6 1993-01-01 0.705 1.01 3.03 0.911
## # ℹ 9 more variables: lag_return_6m <dbl>, GDP_g <dbl>, unemp <dbl>,
## # ip_yoy <dbl>, infl_yoy <dbl>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, GS10 <dbl>
Now all the dates are lined up and the look-ahead bias is taken care of! Since GDP growth is quarterly government data, it is released with a delay, so we lag GDP_g by 4 additional months. (This is why we needed to extract data from before 1985.)
# Lag GDP_g further by 4 months
Total_data = Total_data |>
mutate(GDP_g = dplyr::lag(GDP_g, 4))
head(Total_data)
## # A tibble: 6 × 14
## predicting_date predicting_return monthly_return lag_return_1m lag_return_3m
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 1990-04-01 -2.69 2.43 0.854 2.14
## 2 1990-07-01 -0.522 -0.889 9.20 2.43
## 3 1991-04-01 0.0320 2.22 6.73 2.48
## 4 1992-07-01 3.94 -1.74 0.0964 -2.18
## 5 1992-10-01 0.211 0.911 -2.40 -1.74
## 6 1993-01-01 0.705 1.01 3.03 0.911
## # ℹ 9 more variables: lag_return_6m <dbl>, GDP_g <dbl>, unemp <dbl>,
## # ip_yoy <dbl>, infl_yoy <dbl>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, GS10 <dbl>
We now define our prediction target. For each month, we use the information available at time t to predict the SP500 return at time t+1. The variables predicting_date and predicting_return store this one-month-ahead prediction setup.
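In symbols, the one-month-ahead setup is \(r_{t+1} = f(x_t) + \varepsilon_{t+1}\), where \(x_t\) collects the predictors observed by the end of month \(t\) and \(r_{t+1}\) is the return we try to predict.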
# predicting_date and predicting_return were already created above,
# so here we just display the first few rows.
head(Total_data)
## # A tibble: 6 × 14
## predicting_date predicting_return monthly_return lag_return_1m lag_return_3m
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 1990-04-01 -2.69 2.43 0.854 2.14
## 2 1990-07-01 -0.522 -0.889 9.20 2.43
## 3 1991-04-01 0.0320 2.22 6.73 2.48
## 4 1992-07-01 3.94 -1.74 0.0964 -2.18
## 5 1992-10-01 0.211 0.911 -2.40 -1.74
## 6 1993-01-01 0.705 1.01 3.03 0.911
## # ℹ 9 more variables: lag_return_6m <dbl>, GDP_g <dbl>, unemp <dbl>,
## # ip_yoy <dbl>, infl_yoy <dbl>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, GS10 <dbl>
The data should now contain “predicting_date”, “predicting_return”, and then all of the predictors (I have 12 predictors here).
Now you can see everything is lined up! We are using information available on 1984-01-01 (which is actually the end of January 1984) to predict the return on 1984-02-01 (the February 1984 monthly return). We want to build the predictive model from 1985 to 2024, so ideally we would keep “predicting_date” from 1985-01-01 to 2024-12-01; after dropping rows with missing values, however, the first usable “predicting_date” in my data is 1990-04-01. Let's subset that:
# Subset predicting_date from the first usable month (1990-04-01) to 2024-12-01
Total_data <- Total_data |>
filter(
predicting_date >= as.Date("1990-04-01"), # first usable month in your data
predicting_date <= as.Date("2024-12-01") # last month we want to predict
)
# See the first and last 3 observations to confirm the range
head(Total_data, n = 3)
## # A tibble: 3 × 14
## predicting_date predicting_return monthly_return lag_return_1m lag_return_3m
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 1990-04-01 -2.69 2.43 0.854 2.14
## 2 1990-07-01 -0.522 -0.889 9.20 2.43
## 3 1991-04-01 0.0320 2.22 6.73 2.48
## # ℹ 9 more variables: lag_return_6m <dbl>, GDP_g <dbl>, unemp <dbl>,
## # ip_yoy <dbl>, infl_yoy <dbl>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, GS10 <dbl>
tail(Total_data, n = 3)
## # A tibble: 3 × 14
## predicting_date predicting_return monthly_return lag_return_1m lag_return_3m
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 2023-10-01 -2.20 -4.87 -1.77 6.47
## 2 2024-01-01 1.59 4.42 8.92 -4.87
## 3 2024-04-01 -4.16 3.10 5.17 4.42
## # ℹ 9 more variables: lag_return_6m <dbl>, GDP_g <dbl>, unemp <dbl>,
## # ip_yoy <dbl>, infl_yoy <dbl>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, GS10 <dbl>
# Safety check: replace any remaining NA predictors with 0 (none should remain after drop_na())
Total_data <- Total_data |>
tidyr::replace_na(list(
GDP_g = 0,
unemp = 0,
ip_yoy = 0,
infl_yoy = 0,
term_spread = 0,
credit_spread = 0,
VIX = 0,
GS10 = 0
))
anyNA(Total_data) #rechecking
## [1] FALSE
# Check the number of observations (95, not 480, after dropping missing rows)
nrow(Total_data) == 95
## [1] TRUE
Note: my dataset begins in 1990, so the total number of observations (95) is correct for the available data.
Note: I already selected the useful variables earlier when constructing Total_data (predicting_date, predicting_return, monthly_return, lag_return_1m/3m/6m, GDP_g, unemp, ip_yoy, infl_yoy, term_spread, credit_spread, VIX, GS10).
Here is some exploration of the relationship between each of our predictors and predicting_return. I will examine two things:
Does the variable I chose have a relationship with predicting_return?
Is this relationship changing over time?
For both, I will use cor to calculate the pairwise correlation between variables. To examine whether the relationship is changing, I will use a rolling correlation and plot how the relationship evolves over time.
# cor() accepts a vector and a matrix, so we can compute all pairwise correlations at once
cor(Total_data$predicting_return,
(Total_data |> select(-c(predicting_date, predicting_return))))
## monthly_return lag_return_1m lag_return_3m lag_return_6m GDP_g
## [1,] -0.3275876 -0.165491 -0.04368923 0.147507 0.110405
## unemp ip_yoy infl_yoy term_spread credit_spread VIX
## [1,] 0.05227001 0.1075796 0.003301825 -0.08756985 -0.06855826 0.1036679
## GS10
## [1,] -0.0963451
Overall, all correlations are small in magnitude, suggesting no strong linear relationships between these predictors and future stock returns.
I will compute 5-year rolling-window correlations between predicting_return and the other variables, dating each correlation by the predicting_date at the end of its 5-year window. We can use a for loop to do the calculation.
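For a single predictor, the same 5-year rolling correlation can also be computed without an explicit loop. Here is a sketch using `zoo::rollapplyr` (it assumes the zoo package is installed, which is not loaded above; `roll_cor_vix` is just an illustrative name, using VIX as the example column):
# Rolling 60-month correlation between predicting_return and VIX:
# slide over row indices; the first 59 entries are filled with NA
roll_cor_vix <- zoo::rollapplyr(
  seq_len(nrow(Total_data)), width = 60,
  FUN = function(ix) cor(Total_data$predicting_return[ix], Total_data$VIX[ix]),
  fill = NA
)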
# Preallocate the result: same shape as Total_data but without predicting_return
Rolling_corr = Total_data |>
select(-predicting_return)
# Variable_names (all names except "predicting_date", use logical indexing)
Variable_names = names(Rolling_corr)[names(Rolling_corr) != "predicting_date"]
# Set every variable column to NA as part of the preallocation
Rolling_corr[,Variable_names] = NA
# A for loop to calculate the rolling correlations, vectorized inside the loop
# 5-year window = 60 months
# The first window covers rows 1 to 60 (1+59); the last covers rows (nrow(Rolling_corr)-59) to nrow(Rolling_corr)
for (i in 1:(nrow(Rolling_corr)-59)) {
Rolling_corr[i+59, Variable_names] = cor(Total_data[i:(i+59), "predicting_return"], Total_data[i:(i+59), Variable_names])
}
# Note there are 59 rows of NAs due to the rolling window
# Reshape to a long table and then use ggplot to plot the rolling correlations
Rolling_corr |>
# Remove the missing rows
na.omit() |>
# Except predicting_date, make this a long table,
pivot_longer(-predicting_date,
names_to = "Variable",
values_to = "rolling_corr") |>
ggplot(aes(x = predicting_date, y = rolling_corr, color = Variable)) +
geom_line(linewidth = 1) +
theme_minimal() +
labs(
title = "Rolling Correlation with predicting return (5-year Window)",
x = "Date",
y = "Rolling Correlation",
color = "Variable"
) +
theme(
legend.position = "top",
plot.title = element_text(hjust = 0.5) # <-- Center the title
)
Across all variables, the correlations stay within roughly –0.3 to +0.2, which is very small in magnitude, and the patterns vary over time, so the relationships are neither stable nor reliable. No variable exhibits a strong or stable correlation with predicting_return, implying limited forecasting power and supporting the idea that stock returns are hard to forecast with typical macro and financial variables.
In this section, we finally estimate the linear model. Remember that we should use an expanding window and split the dataset into training, validation, and testing sets. However, since the linear model has no tuning parameters, we simply merge the validation set into the training set:
1985-2008 as training, and 2009 to evaluate the model
1985-2009 as training, and 2010 to evaluate the model
…
1985-2023 as training, and 2024 to evaluate the model
For Lasso (or any other method that requires cross-validation), the division is:
1985-2000 as training, 2001-2008 as validation, and 2009 to evaluate the model
1985-2001 as training, 2002-2009 as validation, and 2010 to evaluate the model
…
1985-2015 as training, 2016-2023 as validation, and 2024 to evaluate the model
We are forecasting stock returns over time, so we mimic the real world: at each year t, we only use information available before that year to train the model. This prevents look-ahead bias.
Since linear regression has no tuning parameter, we do not need a separate validation set, so all available past data is used for training.
In Lasso, we must choose a λ penalty using cross-validation, so we need training + validation sets before testing each year.
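As a reference point, glmnet also ships `cv.glmnet()`, which selects λ by k-fold cross-validation. A sketch of how it would slot in is below, shown commented out: its random folds ignore the time ordering that the manual train/validation split respects, and `x_train`/`y_train` are only built inside the loop further down.
# cv_fit <- glmnet::cv.glmnet(x_train, y_train, alpha = 1, lambda = lambda_grid)
# best_lambda <- cv_fit$lambda.min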
# Define the lambda grid: wide enough that cross-validation can search both
# very small and very large penalties, so the optimal lambda falls inside the grid
lambda_grid = 10^seq(-3, 20, length.out = 50) # 0.001 to 10^20
# Initialize storage for CV errors
cv_errors = rep(0, length(lambda_grid))
# Preallocate the fitted values from 2009 to 2024
# OLS OOS fit column: fitted_return_OLS
# Lasso (complex model) OOS fit column: fitted_return_Complex
Testing_result = Total_data |>
filter(predicting_date >= "2009-01-01") |>
# Only keep the predicting_date, predicting_return, and create fitted_return
select(predicting_date, predicting_return) |>
rename(actual_return= predicting_return) |>
mutate(fitted_return_OLS = NA_real_,
fitted_return_Complex = NA_real_)
# I preallocate the testing dataframe so that each year's OLS and Lasso predictions
# (fitted values) can be stored efficiently inside the loop. This avoids growing
# objects inside the loop and ensures the fitted_return columns are ready to be filled.
# Use a for loop to estimate each model, with glmnet for the Lasso regression
# Loop over the testing years 2009–2024 (the testing periods)
for (i in 2009:2024) {
# ------------------ 1. TRAINING DATA ------------------
# Years strictly before (i - 8): earlier history
Training_data <- Total_data |>
dplyr::filter(lubridate::year(predicting_date) < (i - 8)) |>
# keep all predictors + predicting_return, drop date
dplyr::select(-predicting_date)
# ------------------ 2. VALIDATION DATA ----------------
# Years from (i - 8) to (i - 1)
Validation_data <- Total_data |>
dplyr::filter(
lubridate::year(predicting_date) >= (i - 8),
lubridate::year(predicting_date) < i
) |>
dplyr::select(-predicting_date)
# ------------------ 3. TEST DATA (YEAR i) -------------
# Year i: what we are trying to predict
Testing_data <- Total_data |>
dplyr::filter(lubridate::year(predicting_date) == i) |>
# for test, we only need the predictors (no predicting_return)
dplyr::select(-predicting_date, -predicting_return)
# ------------------ 4. BUILD MATRICES FOR glmnet ------
# All columns except predicting_return are X (predictors)
x_train <- as.matrix(dplyr::select(Training_data, -predicting_return))
y_train <- Training_data$predicting_return
x_val <- as.matrix(dplyr::select(Validation_data, -predicting_return))
y_val <- Validation_data$predicting_return
x_test <- as.matrix(Testing_data)
# ------------------ 5. CROSS-VALIDATION FOR LASSO -----
# Reset CV error storage for this testing year
cv_errors <- rep(0, length(lambda_grid))
for (j in seq_along(lambda_grid)) {
# Lasso: alpha = 1 (Ridge would be alpha = 0)
model <- glmnet::glmnet(
x_train, y_train,
alpha = 1,
lambda = lambda_grid[j]
)
# Predict on the validation fold
y_pred <- predict(model, newx = x_val)
# Mean squared error on validation set
cv_errors[j] <- mean((y_val - as.numeric(y_pred))^2)
}
# Best lambda is the one with smallest validation MSE
best_lambda <- lambda_grid[which.min(cv_errors)]
# ------------------ 6. REFIT WITH BEST LAMBDA ---------
# Train + validation combined
x_tv <- rbind(x_train, x_val)
y_tv <- c(y_train, y_val)
model_best <- glmnet::glmnet(
x_tv, y_tv,
alpha = 1,
lambda = best_lambda
)
# Save the Lasso / complex model prediction for year i
Testing_result[
lubridate::year(Testing_result$predicting_date) == i,
"fitted_return_Complex"
] <- as.numeric(predict(model_best, newx = x_test))
# ------------------ 7. OLS BENCHMARK -------------------
# OLS uses all data before year i and all predictors
OLS <- lm(
predicting_return ~ . - predicting_date,
data = Total_data |>
dplyr::filter(lubridate::year(predicting_date) < i)
)
Testing_result[
lubridate::year(Testing_result$predicting_date) == i,
"fitted_return_OLS"
] <- predict(
OLS,
newdata = Total_data |>
dplyr::filter(lubridate::year(predicting_date) == i)
)
}
## Warning in cbind2(1, newx): number of rows of result is not a multiple of
## vector length (arg 1)
# Print a message when the loop has finished
print("Done!")
## [1] "Done!"
saveRDS(Testing_result, "Testing_result.rds")
### Plot Residual vs Actual Return (OLS vs Lasso)
ggplot(Testing_result) +
geom_point(aes(x = actual_return,
y = actual_return - fitted_return_OLS,
color = "OLS")) +
geom_point(aes(x = actual_return,
y = actual_return - fitted_return_Complex,
color = "Lasso")) +
geom_smooth(aes(x = actual_return,
y = actual_return - fitted_return_OLS,
color = "OLS"),
method = "lm", se = FALSE) +
geom_smooth(aes(x = actual_return,
y = actual_return - fitted_return_Complex,
color = "Lasso"),
method = "lm", se = FALSE) +
labs(x = "Actual Return",
y = "Residual (Actual – Fitted)",
color = "Model") +
theme_minimal()
We can see that Lasso gives a positive OOS \(R^2\) whereas OLS gives a negative OOS \(R^2\).
We can further explore which years do the worst and in which years the model is fine. Let's calculate \(R^2_{OOS}\) over every 4 years. (Using 4-year windows gives enough data to estimate \(R^2_{OOS}\) while still leaving several periods for comparison.)
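In formula form (matching the function below), \[ R^2_{OOS} = 1 - \frac{\tfrac{1}{T}\sum_{t}(r_t - \hat{r}_t)^2}{\tfrac{1}{T}\sum_{t}(r_t - \bar{r})^2}, \] where \(r_t\) is the actual return, \(\hat{r}_t\) the fitted return, and \(\bar{r}\) the mean actual return over the evaluation window.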
R_Sq_oos_func <- function(actual, fitted){
1 - mean((actual - fitted)^2) / mean((actual - mean(actual))^2)
}
# Estimate every 4-year R_sq_oos
R_sq_oos_table = Testing_result |>
# first break the dates into 4 year periods
mutate(
year = year(predicting_date),
period_4yr = cut(year, breaks = seq(min(year), max(year) + 3, by = 4), right = FALSE)
) |>
group_by(period_4yr) |>
summarise(R_Sq_oos_OLS = R_Sq_oos_func(actual_return, fitted_return_OLS),
R_Sq_oos_Lasso = R_Sq_oos_func(actual_return, fitted_return_Complex))
R_sq_oos_table
## # A tibble: 4 × 3
## period_4yr R_Sq_oos_OLS R_Sq_oos_Lasso
## <fct> <dbl> <dbl>
## 1 [2009,2013) -4.06 -0.0432
## 2 [2013,2017) 0.106 0.0929
## 3 [2017,2021) -2.45 -0.864
## 4 [2021,2025) -0.0749 -0.0638
2009–2012: Both OLS (−4.06) and Lasso (−0.04) perform very poorly. This makes sense because this window includes the recovery after the 2008 financial crisis, when markets were unstable and unpredictable.
2013–2016: This is the only period showing a positive out-of-sample R² for both models (OLS: 0.106, Lasso: 0.093). This period was economically calm and stable, so returns were easier to predict.
2017–2020: Both models collapse again (OLS = −2.45, Lasso = −0.86). This window includes the COVID crash (2020), a major shock that causes models to fail out-of-sample.
2021–2024: Slightly negative for both models. Markets were volatile during the post-COVID recovery, so prediction continues to be difficult.
# Calculate R^2_OOS for 2017–2019 (before COVID)
test_subset <- Testing_result |>
filter(year(predicting_date) >= 2017,
year(predicting_date) <= 2019)
R_sq_2017_2019_OLS <- R_Sq_oos_func(test_subset$actual_return,
test_subset$fitted_return_OLS)
R_sq_2017_2019_Lasso <- R_Sq_oos_func(test_subset$actual_return,
test_subset$fitted_return_Complex)
R_sq_2017_2019_OLS
## [1] -3.716979
R_sq_2017_2019_Lasso
## [1] -2.322766
The out-of-sample predictive power during 2017–2019 is very poor for both models. OLS gives an R²_OOS of about –3.72 and Lasso gives –2.32, which means:
- Both models performed much worse than simply predicting the mean return.
- Lasso performs less terribly than OLS, but still fails to capture meaningful predictability in this period.
- This aligns with the fact that 2017–2019 leads directly into the 2020 COVID crash, a period associated with structural breaks and reduced return predictability.
The goodness of fit is good during 2013–2016 and poor right after crashes (2008 and 2020). The 2017–2020 window contains the COVID crash (year 2020), which might be part of the cause of the low predictability.
Here, however, our conjecture that each model does better during normal times is only partly verified: even excluding 2020, the 2017–2019 fit remains far worse than in the calm 2013–2016 period.
The practical value of such a predictive model is that we can trade on it! (You can think of this as another way to evaluate the predictive model: the more money we can make, the better the model is.)
For example, consider a strategy where:
- We buy/hold the SP500 index if we predict a positive return next month.
- We sell/do not buy the SP500 index if we predict a negative/zero return next month.
In reality, you could choose a more complicated threshold for buying/selling, or even allow short selling. For demonstration purposes, we will stick to the simplest one.
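For instance, a hypothetical long-short variant (not used below) would hold the index when the predicted return is positive and short it otherwise:
# Hypothetical long-short variant: position is sign(prediction),
# i.e., +1 (long) when positive, -1 (short) when negative, 0 if exactly zero
# long_short_ret = sign(fitted_return_Complex) * actual_return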
To see whether our combination of predictive model and trading strategy is effective, we will compare:
the linear model + simple strategy's cumulative return (buyLMpositive)
the Lasso model + simple strategy's cumulative return (buypositive)
the buy-and-hold cumulative return (buyANDhold)
# Create a dataset for trading strategy comparison: buyLMpositive and buypositive vs. buyANDhold
Trading_Strategy = Testing_result |>
# Create a logical index of whether we realize the monthly return based on our trading strategy
mutate(Trade_OLS = (fitted_return_OLS > 0),
Trade_Complex = (fitted_return_Complex > 0)) |>
# Create a buyLMpositive strategy
mutate(buyLMpositive_ret = Trade_OLS*actual_return,
buypositive_ret = Trade_Complex*actual_return)
# Now use a for loop to calculate the cumulative gross returns
# Preallocate the three strategies
Trading_Strategy$buyLMpositive = NA
Trading_Strategy$buyLMpositive[1] = 1
Trading_Strategy$buypositive = NA
Trading_Strategy$buypositive[1] = 1
Trading_Strategy$buyANDhold = Trading_Strategy$buyLMpositive
# Loop
for(i in 2:nrow(Trading_Strategy)) {
Trading_Strategy$buyLMpositive[i] = Trading_Strategy$buyLMpositive[i-1] * (1+Trading_Strategy$buyLMpositive_ret[i]/100)
Trading_Strategy$buypositive[i] = Trading_Strategy$buypositive[i-1] * (1+Trading_Strategy$buypositive_ret[i]/100)
Trading_Strategy$buyANDhold[i] = Trading_Strategy$buyANDhold[i-1] * (1+Trading_Strategy$actual_return[i]/100)
}
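As an aside, the loop above can be replaced by a vectorized computation with `cumprod()`. This sketch reproduces the same three paths (it assumes the `*_ret` columns created earlier; the first month is pinned at 1, matching the loop's initialization):
# Equivalent cumulative gross returns via cumprod()
Trading_Strategy <- Trading_Strategy |>
  mutate(
    buyLMpositive = cumprod(c(1, 1 + buyLMpositive_ret[-1] / 100)),
    buypositive   = cumprod(c(1, 1 + buypositive_ret[-1] / 100)),
    buyANDhold    = cumprod(c(1, 1 + actual_return[-1] / 100))
  )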
Trading_Strategy |>
pivot_longer(c(buyLMpositive, buypositive, buyANDhold), names_to = "Strategy", values_to = "CR") |>
ggplot(aes(predicting_date, CR, color = Strategy)) +
geom_line(linewidth = 1) +
theme_minimal() +
labs(
title = "Cumulative Returns with Different Trading Strategies",
x = "Date",
y = "Cumulative Returns",
color = "Trading Stategy"
) +
theme(
legend.position = "top",
plot.title = element_text(hjust = 0.5) # <-- Center the title
)
saveRDS(Trading_Strategy, "Trading_Strategy.rds")
The Buy & Hold strategy performs the best overall and reaches the highest cumulative return by 2022.
The Lasso strategy produces almost exactly the same path as Buy & Hold because Lasso almost always predicts a positive return. Therefore, Lasso does not improve trading performance.
The linear model positive-return strategy (blue line) performs significantly worse than Buy & Hold. This means the linear model's predictions are not strong enough to beat the market or to justify switching in and out of positions.
Conclusion: the simple trading strategy confirms that neither Lasso nor the linear model provides useful trading signals. The simple strategy + LM predictive model performs much worse than simply buying and holding the SP500 index, while the Lasso line (buypositive) sits exactly on top of buyANDhold, hiding the red curve. Neither model is strong enough to beat a passive buy-and-hold strategy.