library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
df <- read_csv("data.csv")
## Rows: 7542 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): symbol, date
## dbl (4): ri, MKT, SMB, HML
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(df)
## Rows: 7,542
## Columns: 6
## $ symbol <chr> "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL",…
## $ date   <chr> "4-Jan-11", "5-Jan-11", "6-Jan-11", "7-Jan-11", "10-Jan-11", "1…
## $ ri     <dbl> 0.0052062641, 0.0081462879, -0.0008082435, 0.0071360567, 0.0186…
## $ MKT    <dbl> -0.0013138901, 0.0049946699, -0.0021252276, -0.0018465050, -0.0…
## $ SMB    <dbl> -0.0065, 0.0018, 0.0001, 0.0022, 0.0041, 0.0016, 0.0031, -0.002…
## $ HML    <dbl> 0.0008, 0.0013, -0.0025, -0.0006, 0.0039, 0.0036, 0.0000, -0.00…
# Step 0: Estimate factor loadings (betas) for each stock
step0 <- df %>%
  group_by(symbol) %>%
  nest() %>%
  mutate(estimates = map(data, ~ tidy(lm(ri ~ MKT + SMB + HML, data = .)))) %>%
  unnest(estimates) %>%
  select(symbol, term, estimate) %>%
  pivot_wider(names_from = term, values_from = estimate) %>%
  rename(beta_mkt = MKT, beta_smb = SMB, beta_hml = HML) %>%
  select(symbol, beta_mkt, beta_smb, beta_hml)

# View Step 0 result
print(step0)
## # A tibble: 6 × 4
## # Groups:   symbol [6]
##   symbol beta_mkt beta_smb beta_hml
##   <chr>     <dbl>    <dbl>    <dbl>
## 1 AAPL      0.900  0.0685   -0.0578
## 2 FORD      0.513 -0.264     0.138 
## 3 GE        1.08   0.0994    0.0902
## 4 GM        1.29   0.00390  -0.0222
## 5 IBM       0.817  0.0336   -0.0121
## 6 MSFT      0.966  0.0582   -0.0641
# Join betas back into original data by stock symbol
df1 <- left_join(df, step0, by = "symbol")

# Step 1: For each date, regress returns on the factor loadings (betas)
step1 <- df1 %>%
  group_by(date) %>%
  nest() %>%
  mutate(regression = map(data, ~ tidy(lm(ri ~ beta_mkt + beta_smb + beta_hml, data = .)))) %>%
  unnest(regression) %>%
  select(date, term, estimate) %>%
  pivot_wider(names_from = term, values_from = estimate)

# View Step 1 result
print(step1)
## # A tibble: 1,257 × 5
## # Groups:   date [1,257]
##    date      `(Intercept)` beta_mkt beta_smb beta_hml
##    <chr>             <dbl>    <dbl>    <dbl>    <dbl>
##  1 4-Jan-11       -0.0298   0.0416  -0.0255    0.0574
##  2 5-Jan-11        0.0223  -0.0113  -0.158     0.0628
##  3 6-Jan-11       -0.0292   0.0373   0.00703  -0.173 
##  4 7-Jan-11       -0.0175   0.0127   0.0323   -0.0642
##  5 10-Jan-11       0.0364  -0.0366   0.0171    0.0586
##  6 11-Jan-11       0.00274  0.00409 -0.0954    0.0899
##  7 12-Jan-11       0.0723  -0.0554  -0.164     0.0430
##  8 13-Jan-11       0.0150  -0.0194   0.00181   0.0256
##  9 14-Jan-11       0.0198  -0.0165   0.0633    0.0392
## 10 18-Jan-11      -0.0189   0.0101   0.0525   -0.0900
## # ℹ 1,247 more rows
# Step One: Cross-sectional Regression (per time period)
df1 <- left_join(df, step0, by = "symbol")

step1 <- df1 %>%
  group_by(date) %>%
  nest() %>%
  mutate(regression = map(data, ~ tidy(lm(ri ~ beta_mkt + beta_smb + beta_hml, data = .)))) %>%
  unnest(regression)

# Ensure Step 1's output is correct and exists
step1_wide <- step1 %>%
  pivot_wider(names_from = term, values_from = estimate)

# Check if step1_wide exists and look at its structure
print(head(step1_wide))
## # A tibble: 6 × 9
## # Groups:   date [2]
##   date     data     std.error statistic p.value `(Intercept)` beta_mkt beta_smb
##   <chr>    <list>       <dbl>     <dbl>   <dbl>         <dbl>    <dbl>    <dbl>
## 1 4-Jan-11 <tibble>   0.00542    -5.49   0.0316       -0.0298  NA       NA     
## 2 4-Jan-11 <tibble>   0.00574     7.25   0.0185       NA        0.0416  NA     
## 3 4-Jan-11 <tibble>   0.0129     -1.98   0.186        NA       NA       -0.0255
## 4 4-Jan-11 <tibble>   0.0166      3.45   0.0746       NA       NA       NA     
## 5 5-Jan-11 <tibble>   0.0252      0.887  0.469         0.0223  NA       NA     
## 6 5-Jan-11 <tibble>   0.0267     -0.425  0.712        NA       -0.0113  NA     
## # ℹ 1 more variable: beta_hml <dbl>
step2 <- step1_wide %>%
  summarise(
    mean_beta_mkt = mean(beta_mkt, na.rm = TRUE),
    mean_beta_smb = mean(beta_smb, na.rm = TRUE),
    mean_beta_hml = mean(beta_hml, na.rm = TRUE)
  )

# View the time series average of the coefficients
print(step2)
## # A tibble: 1,257 × 4
##    date     mean_beta_mkt mean_beta_smb mean_beta_hml
##    <chr>            <dbl>         <dbl>         <dbl>
##  1 1-Apr-11       0.0935       -0.101          0.0335
##  2 1-Apr-13       0.0257       -0.0385         0.0621
##  3 1-Apr-14      -0.00167       0.202         -0.132 
##  4 1-Apr-15      -0.0412        0.00570        0.0649
##  5 1-Aug-11      -0.0111       -0.307          0.104 
##  6 1-Aug-12       0.0174        0.0990        -0.0267
##  7 1-Aug-13       0.0271       -0.00876        0.0284
##  8 1-Aug-14       0.00726       0.201          0.0110
##  9 1-Dec-11      -0.0511        0.0512         0.0284
## 10 1-Dec-14      -0.0226       -0.142         -0.0117
## # ℹ 1,247 more rows
# Optional: Run t-tests for statistical significance
t_test_mkt <- t.test(step1_wide$beta_mkt, mu = 0)
t_test_smb <- t.test(step1_wide$beta_smb, mu = 0)
t_test_hml <- t.test(step1_wide$beta_hml, mu = 0)

# Print the results of the t-tests
print(t_test_mkt)
## 
##  One Sample t-test
## 
## data:  step1_wide$beta_mkt
## t = -0.37879, df = 1256, p-value = 0.7049
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.002546371  0.001722208
## sample estimates:
##     mean of x 
## -0.0004120813
print(t_test_smb)
## 
##  One Sample t-test
## 
## data:  step1_wide$beta_smb
## t = 0.97712, df = 1256, p-value = 0.3287
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.003711466  0.011076953
## sample estimates:
##   mean of x 
## 0.003682744
print(t_test_hml)
## 
##  One Sample t-test
## 
## data:  step1_wide$beta_hml
## t = -0.18044, df = 1256, p-value = 0.8568
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.005541205  0.004607776
## sample estimates:
##     mean of x 
## -0.0004667146