library(tidyquant)
library(dplyr)
library(tidyr)
library(lubridate)
library(glmnet)
library(ggplot2)
library(ranger) # Random Forest
library(xgboost) # Gradient boosting
library(torch) # Neural network
library(luz) # Training wrapper for torch
library(corrplot) # Correlation heatmaps
library(timetk) # Time series feature engineering
My goal is to predict next month’s S&P 500 return using a mix of macroeconomic, yield-curve, credit, risk-sentiment, and momentum variables that have been widely studied in empirical asset pricing and forecasting.
I group my predictors into five blocks: macroeconomic activity (GDP growth, unemployment, industrial production, inflation), yield curve (term spread, 10-year Treasury yield), credit (BAA − AAA spread), risk sentiment (VIX), and momentum (lagged S&P 500 returns).
Because I am predicting a rate, I use other rates (growth rates, differences, spreads) instead of levels whenever possible. Rates have a cleaner economic interpretation and tend to produce more stable relationships in forecasting contexts.
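As a minimal illustration of this convention (a sketch only; each series below gets its own explicit transformation, and `yoy` is a hypothetical helper, not part of the pipeline), a monthly level series can be converted to a year-over-year growth rate as:
# Year-over-year growth (%) of a monthly level series x:
# compare each value to its value 12 months earlier
yoy <- function(x) (x / dplyr::lag(x, 12) - 1) * 100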
# Data: for prices, I extract data starting slightly before 1985
# (the extra history is needed for the lags constructed below)
# Get S&P500 daily data
SP500_simple_returns = tq_get("^GSPC",
from = "1984-01-01",
to = "2024-12-31",
get = "stock.prices") |>
tq_transmute(
select = adjusted, # adjusted price (accounts for splits/dividends)
mutate_fun = periodReturn, # calculate period returns
period = "monthly", # monthly returns
type = "arithmetic", # simple return (percentage change)
col_rename = "monthly_return"
) |>
mutate(monthly_return = monthly_return*100) # Convert to percentage
head(SP500_simple_returns)
## # A tibble: 6 × 2
## date monthly_return
## <date> <dbl>
## 1 1984-01-31 -0.384
## 2 1984-02-29 -3.89
## 3 1984-03-30 1.35
## 4 1984-04-30 0.547
## 5 1984-05-31 -5.94
## 6 1984-06-29 1.75
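For reference, `periodReturn` with `type = "arithmetic"` computes the simple return from month-end adjusted closes; after the ×100 scaling above, \(r_t = 100\,(P_t/P_{t-1} - 1)\), where \(P_t\) is the adjusted closing price at the end of month \(t\).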
## 1. GDP growth rate (quarterly)
gdp_data <- tq_get("A191RL1Q225SBEA",
get = "economic.data",
from = "1984-01-01") |>
rename(GDP_g = price) |>
mutate(date = date %m+% months(2)) # shift the quarterly stamp to the last month of the quarter (e.g., the Q1 value dated 1984-01-01 moves to 1984-03-01)
## 2. Unemployment rate (monthly)
unemp_data <- tq_get("UNRATE",
get = "economic.data",
from = "1984-01-01") |>
rename(unemp = price)
## 3. Industrial production index -> convert to year-over-year growth (%)
ip_data <- tq_get("INDPRO",
get = "economic.data",
from = "1984-01-01") |>
rename(INDPRO = price) |>
arrange(date) |>
mutate(ip_yoy = (INDPRO / lag(INDPRO, 12) - 1) * 100) |>
select(date, ip_yoy)
## 4. CPI index -> convert to year-over-year inflation (%)
cpi_data <- tq_get("CPIAUCSL",
get = "economic.data",
from = "1984-01-01") |>
rename(CPI = price) |>
arrange(date) |>
mutate(infl_yoy = (CPI / lag(CPI, 12) - 1) * 100) |>
select(date, infl_yoy)
## 5. Term spread (10-year minus 3-month Treasury)
term_data <- tq_get("T10Y3M",
get = "economic.data",
from = "1984-01-01") |>
rename(term_spread = price)
## 6. Credit spread: BAA - AAA corporate yields
aaa_data <- tq_get("AAA",
get = "economic.data",
from = "1984-01-01") |>
rename(AAA_yield = price)
baa_data <- tq_get("BAA",
get = "economic.data",
from = "1984-01-01") |>
rename(BAA_yield = price)
credit_data <- aaa_data |>
inner_join(baa_data, by = "date") |>
mutate(credit_spread = BAA_yield - AAA_yield) |>
select(date, credit_spread)
## 7. VIX index (daily) -> monthly average
vix_daily <- tq_get("VIXCLS",
get = "economic.data",
from = "1984-01-01")
vix_monthly <- vix_daily |>
mutate(month = floor_date(date, "month")) |>
group_by(month) |>
summarise(VIX = mean(price, na.rm = TRUE), .groups = "drop") |>
rename(date = month)
## 8. 10-year Treasury yield level
gs10_data <- tq_get("GS10",
get = "economic.data",
from = "1984-01-01") |>
rename(GS10 = price)
Total_data <- SP500_simple_returns |>
mutate(date = floor_date(date, unit = "month")) |>
left_join(gdp_data, by = "date") |>
left_join(unemp_data, by = "date") |>
left_join(ip_data, by = "date") |>
left_join(cpi_data, by = "date") |>
left_join(term_data, by = "date") |>
left_join(credit_data, by = "date") |>
left_join(vix_monthly, by = "date") |>
left_join(gs10_data, by = "date") |>
arrange(date) |>
mutate(
lag_return_1m = lag(monthly_return, 1),
lag_return_3m = lag(monthly_return, 3),
lag_return_6m = lag(monthly_return, 6)
)
head(Total_data)
## # A tibble: 6 × 17
## date monthly_return symbol.x GDP_g symbol.y unemp ip_yoy infl_yoy
## <date> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1984-01-01 -0.384 <NA> NA UNRATE 8 NA NA
## 2 1984-02-01 -3.89 <NA> NA UNRATE 7.8 NA NA
## 3 1984-03-01 1.35 A191RL1Q225SBEA 8.1 UNRATE 7.8 NA NA
## 4 1984-04-01 0.547 <NA> NA UNRATE 7.7 NA NA
## 5 1984-05-01 -5.94 <NA> NA UNRATE 7.4 NA NA
## 6 1984-06-01 1.75 A191RL1Q225SBEA 7.1 UNRATE 7.2 NA NA
## # ℹ 9 more variables: symbol.x.x <chr>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, symbol.y.y <chr>, GS10 <dbl>, lag_return_1m <dbl>,
## # lag_return_3m <dbl>, lag_return_6m <dbl>
Total_data <- Total_data |>
arrange(date) |>
# Fill down macro variables
tidyr::fill(GDP_g, unemp, ip_yoy, infl_yoy,
term_spread, credit_spread, VIX, GS10,
.direction = "down") |>
# Publication lag adjustments: quarterly GDP is released with a long delay
# (hence 4 months); the monthly series arrive about a month after their reference month
mutate(
GDP_g = lag(GDP_g, 4),
unemp = lag(unemp, 1),
ip_yoy = lag(ip_yoy, 1),
infl_yoy = lag(infl_yoy, 1)
)
head(Total_data)
## # A tibble: 6 × 17
## date monthly_return symbol.x GDP_g symbol.y unemp ip_yoy infl_yoy
## <date> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1984-01-01 -0.384 <NA> NA UNRATE NA NA NA
## 2 1984-02-01 -3.89 <NA> NA UNRATE 8 NA NA
## 3 1984-03-01 1.35 A191RL1Q225SBEA NA UNRATE 7.8 NA NA
## 4 1984-04-01 0.547 <NA> NA UNRATE 7.8 NA NA
## 5 1984-05-01 -5.94 <NA> NA UNRATE 7.7 NA NA
## 6 1984-06-01 1.75 A191RL1Q225SBEA NA UNRATE 7.4 NA NA
## # ℹ 9 more variables: symbol.x.x <chr>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, symbol.y.y <chr>, GS10 <dbl>, lag_return_1m <dbl>,
## # lag_return_3m <dbl>, lag_return_6m <dbl>
### Create predicting_date and predicting_return
Total_data <- Total_data |>
mutate(
predicting_date = lead(date),
predicting_return = lead(monthly_return)
) |>
# columns we actually want as predictors
select(
predicting_date,
predicting_return,
monthly_return,
GDP_g,
unemp,
ip_yoy,
infl_yoy,
term_spread,
credit_spread,
VIX,
GS10,
lag_return_1m,
lag_return_3m,
lag_return_6m
) |>
filter(
predicting_date >= as.Date("1985-01-01"),
predicting_date <= as.Date("2024-12-01")
)
Total_data <- Total_data |> tidyr::drop_na()
# Checks
head(Total_data, 3)
## # A tibble: 3 × 14
## predicting_date predicting_return monthly_return GDP_g unemp ip_yoy infl_yoy
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1990-02-01 0.854 -6.88 3 5.4 -0.0507 4.64
## 2 1990-03-01 2.43 0.854 3 5.4 -0.849 5.20
## 3 1990-04-01 -2.69 2.43 3 5.3 0.496 5.26
## # ℹ 7 more variables: term_spread <dbl>, credit_spread <dbl>, VIX <dbl>,
## # GS10 <dbl>, lag_return_1m <dbl>, lag_return_3m <dbl>, lag_return_6m <dbl>
tail(Total_data, 3)
## # A tibble: 3 × 14
## predicting_date predicting_return monthly_return GDP_g unemp ip_yoy infl_yoy
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2024-10-01 -0.990 2.02 0.8 4.2 -0.405 2.61
## 2 2024-11-01 5.73 -0.990 3.6 4.1 -1.20 2.43
## 3 2024-12-01 -2.08 5.73 3.6 4.1 -0.995 2.57
## # ℹ 7 more variables: term_spread <dbl>, credit_spread <dbl>, VIX <dbl>,
## # GS10 <dbl>, lag_return_1m <dbl>, lag_return_3m <dbl>, lag_return_6m <dbl>
anyNA(Total_data)
## [1] FALSE
nrow(Total_data)
## [1] 419
Total_data <- SP500_simple_returns |>
mutate(
# (SP500 was month-end originally)
date = floor_date(date, unit = "month")
) |>
left_join(gdp_data, by = "date") |> # join quarterly real activity (GDP_g)
left_join(unemp_data, by = "date") |> # join monthly unemployment
left_join(ip_data, by = "date") |> # join industrial production (YoY)
left_join(cpi_data, by = "date") |> # join CPI inflation (YoY)
left_join(term_data, by = "date") |> # join yield-curve term spread
left_join(credit_data, by = "date") |> # join credit spread (BAA - AAA)
left_join(vix_monthly, by = "date") |> # join VIX monthly averages
left_join(gs10_data, by = "date") |> # join 10-year Treasury yield
arrange(date) # keep everything time-ordered
head(Total_data)
## # A tibble: 6 × 14
## date monthly_return symbol.x GDP_g symbol.y unemp ip_yoy infl_yoy
## <date> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1984-01-01 -0.384 <NA> NA UNRATE 8 NA NA
## 2 1984-02-01 -3.89 <NA> NA UNRATE 7.8 NA NA
## 3 1984-03-01 1.35 A191RL1Q225SBEA 8.1 UNRATE 7.8 NA NA
## 4 1984-04-01 0.547 <NA> NA UNRATE 7.7 NA NA
## 5 1984-05-01 -5.94 <NA> NA UNRATE 7.4 NA NA
## 6 1984-06-01 1.75 A191RL1Q225SBEA 7.1 UNRATE 7.2 NA NA
## # ℹ 6 more variables: symbol.x.x <chr>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, symbol.y.y <chr>, GS10 <dbl>
We use `floor_date()` to force all dates to the start of the month so the macro data aligns correctly with the SP500 returns.
**monthly_return** is the current SP500 return. I also want lagged returns as predictors of the future return, so I use the `lag()` function to construct lagged returns at 1, 3, and 6 months.
Total_data <- Total_data |>
mutate(
lag_return_1m = lag(monthly_return, 1), # last month’s return
lag_return_3m = lag(monthly_return, 3), # 3-month lag (short-term momentum)
lag_return_6m = lag(monthly_return, 6) # 6-month lag (medium-term momentum)
)
head(Total_data)
## # A tibble: 6 × 17
## date monthly_return symbol.x GDP_g symbol.y unemp ip_yoy infl_yoy
## <date> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1984-01-01 -0.384 <NA> NA UNRATE 8 NA NA
## 2 1984-02-01 -3.89 <NA> NA UNRATE 7.8 NA NA
## 3 1984-03-01 1.35 A191RL1Q225SBEA 8.1 UNRATE 7.8 NA NA
## 4 1984-04-01 0.547 <NA> NA UNRATE 7.7 NA NA
## 5 1984-05-01 -5.94 <NA> NA UNRATE 7.4 NA NA
## 6 1984-06-01 1.75 A191RL1Q225SBEA 7.1 UNRATE 7.2 NA NA
## # ℹ 9 more variables: symbol.x.x <chr>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, symbol.y.y <chr>, GS10 <dbl>, lag_return_1m <dbl>,
## # lag_return_3m <dbl>, lag_return_6m <dbl>
### Create predicting_date and predicting_return
# predicting_date = the month we want to predict
# predicting_return = the SP500 return of that next month (our target variable)
Total_data <- Total_data |>
mutate(
predicting_date = lead(date), # shift dates forward by 1 month
predicting_return = lead(monthly_return) # shift returns forward by 1 month
) |>
select(predicting_date, predicting_return, everything(), -date) |>
filter(
predicting_date >= as.Date("1985-01-01"),
predicting_date <= as.Date("2024-12-01")
)
# Quick checks
head(Total_data, 3)
## # A tibble: 3 × 18
## predicting_date predicting_return monthly_return symbol.x GDP_g symbol.y unemp
## <date> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
## 1 1985-01-01 7.41 2.24 A191RL1… 3.3 UNRATE 7.3
## 2 1985-02-01 0.863 7.41 <NA> NA UNRATE 7.3
## 3 1985-03-01 -0.287 0.863 <NA> NA UNRATE 7.2
## # ℹ 11 more variables: ip_yoy <dbl>, infl_yoy <dbl>, symbol.x.x <chr>,
## # term_spread <dbl>, credit_spread <dbl>, VIX <dbl>, symbol.y.y <chr>,
## # GS10 <dbl>, lag_return_1m <dbl>, lag_return_3m <dbl>, lag_return_6m <dbl>
tail(Total_data, 3)
## # A tibble: 3 × 18
## predicting_date predicting_return monthly_return symbol.x GDP_g symbol.y unemp
## <date> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
## 1 2024-10-01 -0.990 2.02 A191RL1… 3.3 UNRATE 4.1
## 2 2024-11-01 5.73 -0.990 <NA> NA UNRATE 4.1
## 3 2024-12-01 -2.08 5.73 <NA> NA UNRATE 4.2
## # ℹ 11 more variables: ip_yoy <dbl>, infl_yoy <dbl>, symbol.x.x <chr>,
## # term_spread <dbl>, credit_spread <dbl>, VIX <dbl>, symbol.y.y <chr>,
## # GS10 <dbl>, lag_return_1m <dbl>, lag_return_3m <dbl>, lag_return_6m <dbl>
anyNA(Total_data)
## [1] TRUE
nrow(Total_data)
## [1] 480
Total_data <- Total_data |>
select(
predicting_date,
predicting_return, # future return (target)
monthly_return, # current SP500 return
lag_return_1m, lag_return_3m, lag_return_6m,
GDP_g, unemp, ip_yoy, infl_yoy,
term_spread, credit_spread, VIX, GS10
)
# Drop rows with missing values (like we did in HW regressions)
Total_data <- Total_data |>
tidyr::drop_na()
# Check again
head(Total_data, 3)
## # A tibble: 3 × 14
## predicting_date predicting_return monthly_return lag_return_1m lag_return_3m
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 1990-04-01 -2.69 2.43 0.854 2.14
## 2 1990-07-01 -0.522 -0.889 9.20 2.43
## 3 1991-04-01 0.0320 2.22 6.73 2.48
## # ℹ 9 more variables: lag_return_6m <dbl>, GDP_g <dbl>, unemp <dbl>,
## # ip_yoy <dbl>, infl_yoy <dbl>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, GS10 <dbl>
tail(Total_data, 3)
## # A tibble: 3 × 14
## predicting_date predicting_return monthly_return lag_return_1m lag_return_3m
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 2023-10-01 -2.20 -4.87 -1.77 6.47
## 2 2024-01-01 1.59 4.42 8.92 -4.87
## 3 2024-04-01 -4.16 3.10 5.17 4.42
## # ℹ 9 more variables: lag_return_6m <dbl>, GDP_g <dbl>, unemp <dbl>,
## # ip_yoy <dbl>, infl_yoy <dbl>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, GS10 <dbl>
anyNA(Total_data)
## [1] FALSE
nrow(Total_data)
## [1] 95
I will use a simple approach to make the quarterly data monthly: carry the most recent quarterly value forward to fill the in-between months. We can achieve this with `fill()` from tidyr.
# Fill the GDP_g downward, quarterly to monthly
Total_data = Total_data |>
tidyr::fill(GDP_g, .direction = "down")
head(Total_data)
## # A tibble: 6 × 14
## predicting_date predicting_return monthly_return lag_return_1m lag_return_3m
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 1990-04-01 -2.69 2.43 0.854 2.14
## 2 1990-07-01 -0.522 -0.889 9.20 2.43
## 3 1991-04-01 0.0320 2.22 6.73 2.48
## 4 1992-07-01 3.94 -1.74 0.0964 -2.18
## 5 1992-10-01 0.211 0.911 -2.40 -1.74
## 6 1993-01-01 0.705 1.01 3.03 0.911
## # ℹ 9 more variables: lag_return_6m <dbl>, GDP_g <dbl>, unemp <dbl>,
## # ip_yoy <dbl>, infl_yoy <dbl>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, GS10 <dbl>
Now all the dates are lined up and the look-ahead bias is taken care of! Since GDP growth is quarterly government data, it is released with a delay, so we lag GDP_g by 4 additional months. (This is why we needed to extract data from before 1985.)
# Lag GDP_g further by 4 months
Total_data = Total_data |>
mutate(GDP_g = dplyr::lag(GDP_g, 4))
head(Total_data)
## # A tibble: 6 × 14
## predicting_date predicting_return monthly_return lag_return_1m lag_return_3m
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 1990-04-01 -2.69 2.43 0.854 2.14
## 2 1990-07-01 -0.522 -0.889 9.20 2.43
## 3 1991-04-01 0.0320 2.22 6.73 2.48
## 4 1992-07-01 3.94 -1.74 0.0964 -2.18
## 5 1992-10-01 0.211 0.911 -2.40 -1.74
## 6 1993-01-01 0.705 1.01 3.03 0.911
## # ℹ 9 more variables: lag_return_6m <dbl>, GDP_g <dbl>, unemp <dbl>,
## # ip_yoy <dbl>, infl_yoy <dbl>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, GS10 <dbl>
We now define our prediction target. For each month, we use the information available at time t to predict the SP500 return at time t+1. The variables predicting_date and predicting_return store this one-month-ahead prediction setup.
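In symbols, the one-month-ahead setup is \(r_{t+1} = f(x_t) + \varepsilon_{t+1}\), where \(x_t\) collects the predictors observed by the end of month \(t\) and \(r_{t+1}\) is the return we try to predict.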
# predicting_date and predicting_return were already created above,
# so here we just display the first few rows.
head(Total_data)
## # A tibble: 6 × 14
## predicting_date predicting_return monthly_return lag_return_1m lag_return_3m
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 1990-04-01 -2.69 2.43 0.854 2.14
## 2 1990-07-01 -0.522 -0.889 9.20 2.43
## 3 1991-04-01 0.0320 2.22 6.73 2.48
## 4 1992-07-01 3.94 -1.74 0.0964 -2.18
## 5 1992-10-01 0.211 0.911 -2.40 -1.74
## 6 1993-01-01 0.705 1.01 3.03 0.911
## # ℹ 9 more variables: lag_return_6m <dbl>, GDP_g <dbl>, unemp <dbl>,
## # ip_yoy <dbl>, infl_yoy <dbl>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, GS10 <dbl>
The data should now contain “predicting_date”, “predicting_return”, and then all of the predictors (I have 12 predictors here).
Now you can see everything is lined up! We are using information available on 1984-01-01 (which is actually the end of January 1984) to predict the return on 1984-02-01 (the February 1984 monthly return). We want to build the predictive model from 1985 to 2024, so ideally we would keep “predicting_date” from 1985-01-01 to 2024-12-01; after dropping rows with missing values, however, the first usable “predicting_date” in my data is 1990-04-01. Let's subset that:
# Subset predicting_date from the first usable month (1990-04-01) to 2024-12-01
Total_data <- Total_data |>
filter(
predicting_date >= as.Date("1990-04-01"), # first usable month in your data
predicting_date <= as.Date("2024-12-01") # last month we want to predict
)
# See the first and last 3 observations to confirm the range
head(Total_data, n = 3)
## # A tibble: 3 × 14
## predicting_date predicting_return monthly_return lag_return_1m lag_return_3m
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 1990-04-01 -2.69 2.43 0.854 2.14
## 2 1990-07-01 -0.522 -0.889 9.20 2.43
## 3 1991-04-01 0.0320 2.22 6.73 2.48
## # ℹ 9 more variables: lag_return_6m <dbl>, GDP_g <dbl>, unemp <dbl>,
## # ip_yoy <dbl>, infl_yoy <dbl>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, GS10 <dbl>
tail(Total_data, n = 3)
## # A tibble: 3 × 14
## predicting_date predicting_return monthly_return lag_return_1m lag_return_3m
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 2023-10-01 -2.20 -4.87 -1.77 6.47
## 2 2024-01-01 1.59 4.42 8.92 -4.87
## 3 2024-04-01 -4.16 3.10 5.17 4.42
## # ℹ 9 more variables: lag_return_6m <dbl>, GDP_g <dbl>, unemp <dbl>,
## # ip_yoy <dbl>, infl_yoy <dbl>, term_spread <dbl>, credit_spread <dbl>,
## # VIX <dbl>, GS10 <dbl>
# Safety check: replace any remaining NA predictors with 0 (none should remain after drop_na())
Total_data <- Total_data |>
tidyr::replace_na(list(
GDP_g = 0,
unemp = 0,
ip_yoy = 0,
infl_yoy = 0,
term_spread = 0,
credit_spread = 0,
VIX = 0,
GS10 = 0
))
anyNA(Total_data) #rechecking
## [1] FALSE
# Check the number of observations (95, not 480, after dropping missing rows)
nrow(Total_data) == 95
## [1] TRUE
Note: my dataset begins in 1990, so the total number of observations (95) is correct for the available data.
Note: I already selected the useful variables earlier when constructing Total_data (predicting_date, predicting_return, monthly_return, lag_return_1m/3m/6m, GDP_g, unemp, ip_yoy, infl_yoy, term_spread, credit_spread, VIX, GS10).
Here is some exploration of the relationship between each of our predictors and predicting_return. I will examine two things:
Does the variable I chose have a relationship with predicting_return?
Is this relationship changing over time?
For both, I will use cor to calculate the pairwise correlation between variables. To examine whether the relationship is changing, I will use a rolling correlation and plot how the relationship evolves over time.
# cor() accepts a vector and a matrix, so we can compute all pairwise correlations at once
cor(Total_data$predicting_return,
(Total_data |> select(-c(predicting_date, predicting_return))))
## monthly_return lag_return_1m lag_return_3m lag_return_6m GDP_g
## [1,] -0.3275876 -0.165491 -0.04368923 0.147507 0.110405
## unemp ip_yoy infl_yoy term_spread credit_spread VIX
## [1,] 0.05227001 0.1075796 0.003301825 -0.08756985 -0.06855826 0.1036679
## GS10
## [1,] -0.0963451
Overall, all correlations are small in magnitude, suggesting no strong linear relationships between these predictors and future stock returns.
I will compute 5-year rolling-window correlations between predicting_return and the other variables, dating each correlation by the predicting_date at the end of its 5-year window. We can use a for loop to do the calculation.
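For a single predictor, the same 5-year rolling correlation can also be computed without an explicit loop. Here is a sketch using `zoo::rollapplyr` (it assumes the zoo package is installed, which is not loaded above; `roll_cor_vix` is just an illustrative name, using VIX as the example column):
# Rolling 60-month correlation between predicting_return and VIX:
# slide over row indices; the first 59 entries are filled with NA
roll_cor_vix <- zoo::rollapplyr(
  seq_len(nrow(Total_data)), width = 60,
  FUN = function(ix) cor(Total_data$predicting_return[ix], Total_data$VIX[ix]),
  fill = NA
)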
# Preallocate the result: same shape as Total_data but without predicting_return
Rolling_corr = Total_data |>
select(-predicting_return)
# Variable_names (all names except "predicting_date", use logical indexing)
Variable_names = names(Rolling_corr)[names(Rolling_corr) != "predicting_date"]
# Set every variable column to NA as part of the preallocation
Rolling_corr[,Variable_names] = NA
# A for loop to calculate the rolling correlations, vectorized inside the loop
# 5-year window = 60 months
# The first window covers rows 1 to 60 (1+59); the last covers rows (nrow(Rolling_corr)-59) to nrow(Rolling_corr)
for (i in 1:(nrow(Rolling_corr)-59)) {
Rolling_corr[i+59, Variable_names] = cor(Total_data[i:(i+59), "predicting_return"], Total_data[i:(i+59), Variable_names])
}
# Note there are 59 rows of NAs due to the rolling window
# Reshape to a long table and then use ggplot to plot the rolling correlations
Rolling_corr |>
# Remove the missing rows
na.omit() |>
# Except predicting_date, make this a long table,
pivot_longer(-predicting_date,
names_to = "Variable",
values_to = "rolling_corr") |>
ggplot(aes(x = predicting_date, y = rolling_corr, color = Variable)) +
geom_line(linewidth = 1) +
theme_minimal() +
labs(
title = "Rolling Correlation with predicting return (5-year Window)",
x = "Date",
y = "Rolling Correlation",
color = "Variable"
) +
theme(
legend.position = "top",
plot.title = element_text(hjust = 0.5) # <-- Center the title
)
Across all variables, the correlations stay within roughly –0.3 to +0.2, which is very small in magnitude, and the patterns vary over time, so the relationships are neither stable nor reliable. No variable exhibits a strong or stable correlation with predicting_return, implying limited forecasting power and supporting the idea that stock returns are hard to forecast with typical macro and financial variables.
In this section, we finally estimate the linear model. Remember that we should use an expanding window and split the dataset into training, validation, and testing sets. However, since the linear model has no tuning parameters, we simply merge the validation set into the training set:
1985-2008 as training, and 2009 to evaluate the model
1985-2009 as training, and 2010 to evaluate the model
…
1985-2023 as training, and 2024 to evaluate the model
For Lasso (or any other method that requires cross-validation), the division is:
1985-2000 as training, 2001-2008 as validation, and 2009 to evaluate the model
1985-2001 as training, 2002-2009 as validation, and 2010 to evaluate the model
…
1985-2015 as training, 2016-2023 as validation, and 2024 to evaluate the model
We are forecasting stock returns over time, so we mimic the real world: at each year t, we only use information available before that year to train the model. This prevents look-ahead bias.
Since linear regression has no tuning parameter, we do not need a separate validation set, so all available past data is used for training.
In Lasso, we must choose a λ penalty using cross-validation, so we need training + validation sets before testing each year.
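As a reference point, glmnet also ships `cv.glmnet()`, which selects λ by k-fold cross-validation. A sketch of how it would slot in is below, shown commented out: its random folds ignore the time ordering that the manual train/validation split respects, and `x_train`/`y_train` are only built inside the loop further down.
# cv_fit <- glmnet::cv.glmnet(x_train, y_train, alpha = 1, lambda = lambda_grid)
# best_lambda <- cv_fit$lambda.min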
# Define the lambda grid: wide enough that cross-validation can search both
# very small and very large penalties, so the optimal lambda falls inside the grid
lambda_grid = 10^seq(-3, 20, length.out = 50) # 0.001 to 10^20
# Initialize storage for CV errors
cv_errors = rep(0, length(lambda_grid))
# Preallocate the fitted values from 2009 to 2024
# OLS OOS fit column: fitted_return_OLS
# Lasso (complex model) OOS fit column: fitted_return_Complex
Testing_result = Total_data |>
filter(predicting_date >= "2009-01-01") |>
# Only keep the predicting_date, predicting_return, and create fitted_return
select(predicting_date, predicting_return) |>
rename(actual_return= predicting_return) |>
mutate(fitted_return_OLS = NA_real_,
fitted_return_Complex = NA_real_)
# I preallocate the testing dataframe so that each year's OLS and Lasso predictions
# (fitted values) can be stored efficiently inside the loop. This avoids growing
# objects inside the loop and ensures the fitted_return columns are ready to be filled.
# Use a for loop to estimate each model, with glmnet for the Lasso regression
# Loop over the testing years 2009–2024 (the testing periods)
for (i in 2009:2024) {
# ------------------ 1. TRAINING DATA ------------------
# Years strictly before (i - 8): earlier history
Training_data <- Total_data |>
dplyr::filter(lubridate::year(predicting_date) < (i - 8)) |>
# keep all predictors + predicting_return, drop date
dplyr::select(-predicting_date)
# ------------------ 2. VALIDATION DATA ----------------
# Years from (i - 8) to (i - 1)
Validation_data <- Total_data |>
dplyr::filter(
lubridate::year(predicting_date) >= (i - 8),
lubridate::year(predicting_date) < i
) |>
dplyr::select(-predicting_date)
# ------------------ 3. TEST DATA (YEAR i) -------------
# Year i: what we are trying to predict
Testing_data <- Total_data |>
dplyr::filter(lubridate::year(predicting_date) == i) |>
# for test, we only need the predictors (no predicting_return)
dplyr::select(-predicting_date, -predicting_return)
# ------------------ 4. BUILD MATRICES FOR glmnet ------
# All columns except predicting_return are X (predictors)
x_train <- as.matrix(dplyr::select(Training_data, -predicting_return))
y_train <- Training_data$predicting_return
x_val <- as.matrix(dplyr::select(Validation_data, -predicting_return))
y_val <- Validation_data$predicting_return
x_test <- as.matrix(Testing_data)
# ------------------ 5. CROSS-VALIDATION FOR LASSO -----
# Reset CV error storage for this testing year
cv_errors <- rep(0, length(lambda_grid))
for (j in seq_along(lambda_grid)) {
# Lasso: alpha = 1 (Ridge would be alpha = 0)
model <- glmnet::glmnet(
x_train, y_train,
alpha = 1,
lambda = lambda_grid[j]
)
# Predict on the validation fold
y_pred <- predict(model, newx = x_val)
# Mean squared error on validation set
cv_errors[j] <- mean((y_val - as.numeric(y_pred))^2)
}
# Best lambda is the one with smallest validation MSE
best_lambda <- lambda_grid[which.min(cv_errors)]
# ------------------ 6. REFIT WITH BEST LAMBDA ---------
# Train + validation combined
x_tv <- rbind(x_train, x_val)
y_tv <- c(y_train, y_val)
model_best <- glmnet::glmnet(
x_tv, y_tv,
alpha = 1,
lambda = best_lambda
)
# Save the Lasso / complex model prediction for year i
Testing_result[
lubridate::year(Testing_result$predicting_date) == i,
"fitted_return_Complex"
] <- as.numeric(predict(model_best, newx = x_test))
# ------------------ 7. OLS BENCHMARK -------------------
# OLS uses all data before year i and all predictors
OLS <- lm(
predicting_return ~ . - predicting_date,
data = Total_data |>
dplyr::filter(lubridate::year(predicting_date) < i)
)
Testing_result[
lubridate::year(Testing_result$predicting_date) == i,
"fitted_return_OLS"
] <- predict(
OLS,
newdata = Total_data |>
dplyr::filter(lubridate::year(predicting_date) == i)
)
}
## Warning in cbind2(1, newx): number of rows of result is not a multiple of
## vector length (arg 1)
# Print a message when the loop has finished
print("Done!")
## [1] "Done!"
saveRDS(Testing_result, "Testing_result.rds")
### Plot Residual vs Actual Return (OLS vs Lasso)
ggplot(Testing_result) +
geom_point(aes(x = actual_return,
y = actual_return - fitted_return_OLS,
color = "OLS")) +
geom_point(aes(x = actual_return,
y = actual_return - fitted_return_Complex,
color = "Lasso")) +
geom_smooth(aes(x = actual_return,
y = actual_return - fitted_return_OLS,
color = "OLS"),
method = "lm", se = FALSE) +
geom_smooth(aes(x = actual_return,
y = actual_return - fitted_return_Complex,
color = "Lasso"),
method = "lm", se = FALSE) +
labs(x = "Actual Return",
y = "Residual (Actual – Fitted)",
color = "Model") +
theme_minimal()
We can see that Lasso gives a positive OOS \(R^2\) whereas OLS gives a negative OOS \(R^2\).
We can further explore which years do the worst and in which years the model is fine. Let's calculate \(R^2_{OOS}\) over every 4 years. (Using 4-year windows gives enough data to estimate \(R^2_{OOS}\) while still leaving several periods for comparison.)
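In formula form (matching the function below), \[ R^2_{OOS} = 1 - \frac{\tfrac{1}{T}\sum_{t}(r_t - \hat{r}_t)^2}{\tfrac{1}{T}\sum_{t}(r_t - \bar{r})^2}, \] where \(r_t\) is the actual return, \(\hat{r}_t\) the fitted return, and \(\bar{r}\) the mean actual return over the evaluation window.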
R_Sq_oos_func <- function(actual, fitted){
1 - mean((actual - fitted)^2) / mean((actual - mean(actual))^2)
}
# Estimate every 4-year R_sq_oos
R_sq_oos_table = Testing_result |>
# first break the dates into 4 year periods
mutate(
year = year(predicting_date),
period_4yr = cut(year, breaks = seq(min(year), max(year) + 3, by = 4), right = FALSE)
) |>
group_by(period_4yr) |>
summarise(R_Sq_oos_OLS = R_Sq_oos_func(actual_return, fitted_return_OLS),
R_Sq_oos_Lasso = R_Sq_oos_func(actual_return, fitted_return_Complex))
R_sq_oos_table
## # A tibble: 4 × 3
## period_4yr R_Sq_oos_OLS R_Sq_oos_Lasso
## <fct> <dbl> <dbl>
## 1 [2009,2013) -4.06 -0.0432
## 2 [2013,2017) 0.106 0.0929
## 3 [2017,2021) -2.45 -0.864
## 4 [2021,2025) -0.0749 -0.0638
2009–2012: Both OLS (−4.06) and Lasso (−0.04) perform very poorly. This makes sense because this window includes the recovery after the 2008 financial crisis, when markets were unstable and unpredictable.
2013–2016: This is the only period showing a positive out-of-sample R² for both models (OLS: 0.106, Lasso: 0.093). This period was economically calm and stable, so returns were easier to predict.
2017–2020: Both models collapse again (OLS = −2.45, Lasso = −0.86). This window includes the COVID crash (2020), a major shock that causes models to fail out-of-sample.
2021–2024: Slightly negative for both models. Markets were volatile during the post-COVID recovery, so prediction continues to be difficult.
# Calculate R^2_OOS for 2017–2019 (before COVID)
test_subset <- Testing_result |>
filter(year(predicting_date) >= 2017,
year(predicting_date) <= 2019)
R_sq_2017_2019_OLS <- R_Sq_oos_func(test_subset$actual_return,
test_subset$fitted_return_OLS)
R_sq_2017_2019_Lasso <- R_Sq_oos_func(test_subset$actual_return,
test_subset$fitted_return_Complex)
R_sq_2017_2019_OLS
## [1] -3.716979
R_sq_2017_2019_Lasso
## [1] -2.322766
The out-of-sample predictive power during 2017–2019 is very poor for both models. OLS gives an R²_OOS of about –3.72 and Lasso gives –2.32, which means:
- Both models performed much worse than simply predicting the mean return.
- Lasso performs less terribly than OLS, but still fails to capture meaningful predictability in this period.
- This aligns with the fact that 2017–2019 leads directly into the 2020 COVID crash, a period associated with structural breaks and reduced return predictability.
The goodness of fit is good during 2013–2016 and poor right after crashes (2008 and 2020). The 2017–2020 window contains the COVID crash (year 2020), which might be part of the cause of the low predictability.
Here, however, our conjecture that each model does better during normal times is only partly verified: even excluding 2020, the 2017–2019 fit remains far worse than in the calm 2013–2016 period.
The practical value of such a predictive model is that we can trade on it! (You can think of this as another way to evaluate the predictive model: the more money we can make, the better the model is.)
For example, consider a strategy where:
- We buy/hold the SP500 index if we predict a positive return next month.
- We sell/do not buy the SP500 index if we predict a negative/zero return next month.
In reality, you could choose a more complicated threshold for buying/selling, or even allow short selling. For demonstration purposes, we will stick to the simplest one.
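For instance, a hypothetical long-short variant (not used below) would hold the index when the predicted return is positive and short it otherwise:
# Hypothetical long-short variant: position is sign(prediction),
# i.e., +1 (long) when positive, -1 (short) when negative, 0 if exactly zero
# long_short_ret = sign(fitted_return_Complex) * actual_return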
To see whether our combination of predictive model and trading strategy is effective, we will compare:
the linear model + simple strategy's cumulative return (buyLMpositive)
the Lasso model + simple strategy's cumulative return (buypositive)
the buy-and-hold cumulative return (buyANDhold)
# Create a dataset for trading strategy comparison: buyLMpositive and buypositive vs. buyANDhold
Trading_Strategy = Testing_result |>
# Create a logical index of whether we realize the monthly return based on our trading strategy
mutate(Trade_OLS = (fitted_return_OLS > 0),
Trade_Complex = (fitted_return_Complex > 0)) |>
# Create a buyLMpositive strategy
mutate(buyLMpositive_ret = Trade_OLS*actual_return,
buypositive_ret = Trade_Complex*actual_return)
# Now use a for loop to calculate the cumulative gross returns
# Preallocate the three strategies
Trading_Strategy$buyLMpositive = NA
Trading_Strategy$buyLMpositive[1] = 1
Trading_Strategy$buypositive = NA
Trading_Strategy$buypositive[1] = 1
Trading_Strategy$buyANDhold = Trading_Strategy$buyLMpositive
# Loop
for(i in 2:nrow(Trading_Strategy)) {
Trading_Strategy$buyLMpositive[i] = Trading_Strategy$buyLMpositive[i-1] * (1+Trading_Strategy$buyLMpositive_ret[i]/100)
Trading_Strategy$buypositive[i] = Trading_Strategy$buypositive[i-1] * (1+Trading_Strategy$buypositive_ret[i]/100)
Trading_Strategy$buyANDhold[i] = Trading_Strategy$buyANDhold[i-1] * (1+Trading_Strategy$actual_return[i]/100)
}
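As an aside, the loop above can be replaced by a vectorized computation with `cumprod()`. This sketch reproduces the same three paths (it assumes the `*_ret` columns created earlier; the first month is pinned at 1, matching the loop's initialization):
# Equivalent cumulative gross returns via cumprod()
Trading_Strategy <- Trading_Strategy |>
  mutate(
    buyLMpositive = cumprod(c(1, 1 + buyLMpositive_ret[-1] / 100)),
    buypositive   = cumprod(c(1, 1 + buypositive_ret[-1] / 100)),
    buyANDhold    = cumprod(c(1, 1 + actual_return[-1] / 100))
  )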
Trading_Strategy |>
pivot_longer(c(buyLMpositive, buypositive, buyANDhold), names_to = "Strategy", values_to = "CR") |>
ggplot(aes(predicting_date, CR, color = Strategy)) +
geom_line(linewidth = 1) +
theme_minimal() +
labs(
title = "Cumulative Returns with Different Trading Strategies",
x = "Date",
y = "Cumulative Returns",
color = "Trading Stategy"
) +
theme(
legend.position = "top",
plot.title = element_text(hjust = 0.5) # <-- Center the title
)
saveRDS(Trading_Strategy, "Trading_Strategy.rds")
The Buy & Hold strategy performs the best overall and reaches the highest cumulative return by 2022.
The Lasso strategy produces almost exactly the same path as Buy & Hold because Lasso almost always predicts a positive return. Therefore, Lasso does not improve trading performance.
The linear model positive-return strategy (blue line) performs significantly worse than Buy & Hold. This means the linear model's predictions are not strong enough to beat the market or to justify switching in and out of positions.
Conclusion: the simple trading strategy confirms that neither Lasso nor the linear model provides useful trading signals. The simple strategy + LM predictive model performs much worse than simply buying and holding the SP500 index, while the Lasso line (buypositive) sits exactly on top of buyANDhold, hiding the red curve. Neither model is strong enough to beat a passive buy-and-hold strategy.