The Fama-MacBeth (1973) procedure estimates factor risk premiums in three steps:
library(readr)
library(dplyr)
library(tidyr)
library(purrr)
library(broom)
library(sandwich)
library(lmtest)
library(ggplot2)
library(knitr)
library(scales)The dataset contains daily stock returns for AAPL, GE, GM, IBM, MSFT along with the Fama-French 3 factors (MKT, SMB, HML), covering the period January 2021 – December 2025.
# ── Update the filename below if yours differs ──────────────────────────────
data <- read_csv("updated_real_data_2021_2025 (2).csv")
# Auto-detect date format
data <- data %>%
mutate(date = as.Date(date, format = "%d-%b-%y"))
glimpse(data)## Rows: 6,265
## Columns: 6
## $ symbol <chr> "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL",…
## $ date <date> 2021-01-05, 2021-01-06, 2021-01-07, 2021-01-08, 2021-01-11, 20…
## $ ri <dbl> 0.012363978, -0.033661685, 0.034123303, 0.008631227, -0.0232488…
## $ MKT <dbl> 0.0086, 0.0079, 0.0176, 0.0051, -0.0052, 0.0037, 0.0006, -0.001…
## $ SMB <dbl> 0.0122, 0.0214, 0.0032, -0.0075, 0.0026, 0.0128, -0.0094, 0.020…
## $ HML <dbl> 0.0050, 0.0394, -0.0081, -0.0138, 0.0126, 0.0124, -0.0045, 0.01…
summary_tbl <- data %>%
group_by(symbol) %>%
summarise(
Start = min(date),
End = max(date),
`# Obs` = n(),
`Mean Return` = round(mean(ri, na.rm = TRUE), 6),
`SD Return` = round(sd(ri, na.rm = TRUE), 6),
`Min Return` = round(min(ri, na.rm = TRUE), 6),
`Max Return` = round(max(ri, na.rm = TRUE), 6)
)
kable(summary_tbl,
caption = "Table 1: Dataset Summary by Stock (2021–2025)")| symbol | Start | End | # Obs | Mean Return | SD Return | Min Return | Max Return |
|---|---|---|---|---|---|---|---|
| AAPL | 2021-01-05 | 2025-12-30 | 1253 | 0.000771 | 0.017558 | -0.092456 | 0.153289 |
| GE | 2021-01-05 | 2025-12-30 | 1253 | 0.001633 | 0.019354 | -0.110963 | 0.105686 |
| GM | 2021-01-05 | 2025-12-30 | 1253 | 0.000869 | 0.023468 | -0.089867 | 0.148621 |
| IBM | 2021-01-05 | 2025-12-30 | 1253 | 0.001023 | 0.014877 | -0.099051 | 0.129642 |
| MSFT | 2021-01-05 | 2025-12-30 | 1253 | 0.000807 | 0.016203 | -0.077156 | 0.101337 |
ggplot(data, aes(x = date, y = ri, colour = symbol)) +
geom_line(linewidth = 0.35, alpha = 0.75) +
facet_wrap(~symbol, ncol = 2, scales = "free_y") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
labs(title = "Daily Excess Return Series by Stock (2021–2025)",
subtitle = "Based on AAPL, GE, GM, IBM, MSFT",
x = NULL, y = "Daily Return") +
theme_minimal(base_size = 11) +
theme(legend.position = "none",
strip.text = element_text(face = "bold"))data %>%
select(date, MKT, SMB, HML) %>%
distinct() %>%
pivot_longer(-date, names_to = "Factor", values_to = "Value") %>%
ggplot(aes(x = date, y = Value, colour = Factor)) +
geom_line(linewidth = 0.4, alpha = 0.8) +
facet_wrap(~Factor, ncol = 1, scales = "free_y") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
labs(title = "Fama-French Factor Series (2021–2025)",
x = NULL, y = "Factor Return") +
theme_minimal(base_size = 11) +
theme(legend.position = "none",
strip.text = element_text(face = "bold"))For each stock, regress daily excess returns on MKT, SMB, and HML over the full sample to obtain each stock’s factor exposures (betas).
\[r_{i,t} = \alpha_i + \beta^{MKT}_i \cdot MKT_t + \beta^{SMB}_i \cdot SMB_t + \beta^{HML}_i \cdot HML_t + \varepsilon_{i,t}\]
## Stocks: AAPL, GE, GM, IBM, MSFT
## Number of stocks (N): 5
# Run one OLS per stock
ts_models <- map(stocks, function(stk) {
d <- data %>% filter(symbol == stk)
lm(ri ~ MKT + SMB + HML, data = d)
})
names(ts_models) <- stocks
# Extract coefficients + R²
betas <- map_dfr(ts_models, function(m) {
s <- summary(m)
tibble(
alpha = coef(m)[1],
beta_mkt = coef(m)[2],
beta_smb = coef(m)[3],
beta_hml = coef(m)[4],
R2 = round(s$r.squared, 4),
Adj_R2 = round(s$adj.r.squared, 4)
)
}, .id = "Stock")
kable(betas,
col.names = c("Stock", "α", "β MKT", "β SMB", "β HML", "R²", "Adj. R²"),
digits = 5,
caption = "Table 2: Step 1 — Estimated Factor Loadings per Stock")| Stock | α | β MKT | β SMB | β HML | R² | Adj. R² |
|---|---|---|---|---|---|---|
| AAPL | 0.00028 | 1.15521 | -0.27167 | -0.27667 | 0.5891 | 0.5881 |
| GE | 0.00102 | 1.12269 | 0.05245 | 0.41780 | 0.3789 | 0.3774 |
| GM | 0.00024 | 1.24023 | 0.51248 | 0.65256 | 0.3938 | 0.3924 |
| IBM | 0.00059 | 0.68992 | -0.08599 | 0.37901 | 0.2292 | 0.2273 |
| MSFT | 0.00039 | 1.04372 | -0.42857 | -0.47316 | 0.6611 | 0.6603 |
ggplot(betas, aes(x = reorder(Stock, Adj_R2), y = Adj_R2, fill = Adj_R2)) +
geom_col(width = 0.6, show.legend = FALSE) +
geom_text(aes(label = scales::percent(Adj_R2, accuracy = 0.1)),
hjust = -0.1, size = 3.5) +
coord_flip() +
scale_fill_gradient(low = "#aec3d4", high = "#2c7bb6") +
scale_y_continuous(labels = scales::percent, limits = c(0, max(betas$Adj_R2)*1.2)) +
labs(title = "Step 1: Adjusted R² by Stock",
x = NULL, y = "Adjusted R²") +
theme_minimal(base_size = 11)betas %>%
pivot_longer(c(beta_mkt, beta_smb, beta_hml),
names_to = "Factor", values_to = "Loading") %>%
mutate(Factor = recode(Factor,
beta_mkt = "β MKT",
beta_smb = "β SMB",
beta_hml = "β HML")) %>%
ggplot(aes(x = Stock, y = Loading, fill = Loading > 0)) +
geom_col(show.legend = FALSE, width = 0.6) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_text(aes(label = round(Loading, 3),
vjust = ifelse(Loading >= 0, -0.4, 1.3)),
size = 3) +
facet_wrap(~Factor, scales = "free_y") +
scale_fill_manual(values = c("TRUE" = "#4575b4", "FALSE" = "#d73027")) +
labs(title = "Step 1: Factor Loadings per Stock",
x = NULL, y = "Beta") +
theme_minimal(base_size = 11) +
theme(strip.text = element_text(face = "bold"))At each trading day t, run a cross-sectional OLS of all stocks’ returns on their betas from Step 1, producing a daily \(\lambda_t\) time series.
# Merge Step 1 betas into panel
panel <- data %>%
left_join(
betas %>%
rename(symbol = Stock) %>%
select(symbol, beta_mkt, beta_smb, beta_hml),
by = "symbol"
)
# Cross-sectional regression at each date
lambdas <- panel %>%
group_by(date) %>%
do(tidy(lm(ri ~ beta_mkt + beta_smb + beta_hml, data = .))) %>%
ungroup() %>%
select(date, term, estimate) %>%
pivot_wider(names_from = term, values_from = estimate) %>%
rename(
lambda_0 = `(Intercept)`,
lambda_MKT = beta_mkt,
lambda_SMB = beta_smb,
lambda_HML = beta_hml
)
cat("Cross-sectional regressions run (T):", nrow(lambdas), "\n")## Cross-sectional regressions run (T): 1253
## Date range: 2021-01-05 to 2025-12-30
kable(head(lambdas, 10),
col.names = c("Date", "λ₀", "λ MKT", "λ SMB", "λ HML"),
digits = 6,
caption = "Table 3: First 10 Daily λ Estimates (Second Pass)")| Date | λ₀ | λ MKT | λ SMB | λ HML |
|---|---|---|---|---|
| 2021-01-05 | -0.026284 | 0.034436 | -0.033315 | 0.044774 |
| 2021-01-06 | -0.109539 | 0.084477 | -0.153290 | 0.174354 |
| 2021-01-07 | 0.051834 | -0.022960 | 0.080949 | -0.086380 |
| 2021-01-08 | -0.041671 | 0.036881 | -0.047689 | 0.022126 |
| 2021-01-11 | 0.036821 | -0.023277 | 0.101704 | -0.024195 |
| 2021-01-12 | -0.015690 | 0.031721 | 0.060325 | 0.011709 |
| 2021-01-13 | 0.050074 | -0.027179 | 0.135947 | -0.101894 |
| 2021-01-14 | 0.094822 | -0.067831 | 0.151514 | -0.047785 |
| 2021-01-15 | 0.058213 | -0.063382 | 0.033803 | -0.041823 |
| 2021-01-19 | 0.151895 | -0.085152 | 0.300746 | -0.159075 |
lam_summary <- lambdas %>%
summarise(across(
c(lambda_0, lambda_MKT, lambda_SMB, lambda_HML),
list(
Mean = ~mean(., na.rm = TRUE),
SD = ~sd(., na.rm = TRUE),
Min = ~min(., na.rm = TRUE),
Max = ~max(., na.rm = TRUE)
),
.names = "{.col}_{.fn}"
)) %>%
pivot_longer(everything(),
names_to = c("Factor", "Stat"),
names_sep = "_(?=[^_]+$)") %>%
pivot_wider(names_from = Stat, values_from = value)
kable(lam_summary, digits = 6,
caption = "Table 4: Summary Statistics of Daily λ Estimates")| Factor | Mean | SD | Min | Max |
|---|---|---|---|---|
| lambda_0 | -0.001750 | 0.074526 | -0.324714 | 0.547373 |
| lambda_MKT | 0.002127 | 0.061343 | -0.415720 | 0.250607 |
| lambda_SMB | -0.003524 | 0.094060 | -0.475067 | 0.774270 |
| lambda_HML | 0.002720 | 0.063882 | -0.506382 | 0.296707 |
The final estimates are time-series means of the \(\lambda_t\) series, with standard errors from their time-series variation:
\[t(\hat\lambda) = \frac{\bar\lambda}{SD(\lambda_t)/\sqrt{T}}\]
fm_test <- function(x, label) {
x <- na.omit(x)
n <- length(x)
mu <- mean(x)
se <- sd(x) / sqrt(n)
ts <- mu / se
pv <- 2 * pt(-abs(ts), df = n - 1)
tibble(
Factor = label,
`Mean λ` = mu,
`Std. Error` = se,
`t-Statistic` = ts,
`p-Value` = pv,
Sig. = ifelse(pv < 0.01, "***",
ifelse(pv < 0.05, "**",
ifelse(pv < 0.10, "*", "")))
)
}
fm_table <- bind_rows(
fm_test(lambdas$lambda_0, "Intercept (λ₀)"),
fm_test(lambdas$lambda_MKT, "Market Premium (λ_MKT)"),
fm_test(lambdas$lambda_SMB, "Size Premium (λ_SMB)"),
fm_test(lambdas$lambda_HML, "Value Premium (λ_HML)")
)
kable(fm_table, digits = 5,
caption = "Table 5: Fama-MacBeth Risk Premia Estimates (Standard SE)")| Factor | Mean λ | Std. Error | t-Statistic | p-Value | Sig. |
|---|---|---|---|---|---|
| Intercept (λ₀) | -0.00175 | 0.00211 | -0.83131 | 0.40595 | |
| Market Premium (λ_MKT) | 0.00213 | 0.00173 | 1.22756 | 0.21984 | |
| Size Premium (λ_SMB) | -0.00352 | 0.00266 | -1.32602 | 0.18507 | |
| Value Premium (λ_HML) | 0.00272 | 0.00180 | 1.50712 | 0.13203 |
## *** p<0.01 ** p<0.05 * p<0.10
lambdas %>%
pivot_longer(-date, names_to = "Factor", values_to = "lambda") %>%
mutate(Factor = recode(Factor,
lambda_0 = "Intercept (λ₀)",
lambda_MKT = "Market λ",
lambda_SMB = "SMB λ",
lambda_HML = "HML λ")) %>%
ggplot(aes(x = date, y = lambda)) +
geom_line(colour = "#2c7bb6", linewidth = 0.35, alpha = 0.7) +
geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
geom_smooth(method = "loess", se = FALSE,
colour = "#d73027", linewidth = 0.8) +
facet_wrap(~Factor, ncol = 1, scales = "free_y") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
labs(title = "Time Series of Cross-Sectional λ Estimates (2021–2025)",
subtitle = "Red line = LOESS trend",
x = NULL, y = "Lambda") +
theme_minimal(base_size = 11) +
theme(strip.text = element_text(face = "bold"))fm_table %>%
filter(Factor != "Intercept (λ₀)") %>%
mutate(
ci_lo = `Mean λ` - 1.96 * `Std. Error`,
ci_hi = `Mean λ` + 1.96 * `Std. Error`,
sig = ifelse(Sig. != "", "Significant (p<0.10)",
"Not Significant")
) %>%
ggplot(aes(x = Factor, y = `Mean λ`, fill = sig)) +
geom_col(width = 0.5) +
geom_errorbar(aes(ymin = ci_lo, ymax = ci_hi),
width = 0.15, linewidth = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_text(aes(label = round(`Mean λ`, 5),
vjust = ifelse(`Mean λ` >= 0, -0.5, 1.5)),
size = 3) +
scale_fill_manual(values = c("Significant (p<0.10)" = "#4575b4",
"Not Significant" = "#aec3d4")) +
labs(title = "Risk Premia with 95% Confidence Intervals",
x = NULL, y = "Mean λ", fill = "") +
theme_minimal(base_size = 11) +
theme(legend.position = "bottom")lambdas %>%
pivot_longer(-date, names_to = "Factor", values_to = "lambda") %>%
filter(Factor != "lambda_0") %>%
mutate(Factor = recode(Factor,
lambda_MKT = "Market λ",
lambda_SMB = "SMB λ",
lambda_HML = "HML λ")) %>%
ggplot(aes(x = lambda, fill = Factor)) +
geom_histogram(bins = 60, colour = "white", alpha = 0.85,
show.legend = FALSE) +
geom_vline(xintercept = 0, linetype = "dashed", colour = "grey30") +
facet_wrap(~Factor, scales = "free") +
labs(title = "Distribution of Cross-Sectional λ Estimates",
x = "Lambda", y = "Count") +
theme_minimal(base_size = 11) +
theme(strip.text = element_text(face = "bold"))Standard FM SEs correct only for cross-sectional correlation. Newey-West (1987) SEs additionally correct for time-series autocorrelation in the \(\lambda_t\) series (6 lags).
nw_test <- function(x, label) {
x <- na.omit(x)
n <- length(x)
m <- lm(x ~ 1)
se <- sqrt(NeweyWest(m, lag = 6, prewhite = FALSE)[1, 1])
mu <- mean(x)
ts <- mu / se
pv <- 2 * pt(-abs(ts), df = n - 1)
tibble(
Factor = label,
`Mean λ` = mu,
`NW Std. Error` = se,
`NW t-Stat` = ts,
`p-Value` = pv,
Sig. = ifelse(pv < 0.01, "***",
ifelse(pv < 0.05, "**",
ifelse(pv < 0.10, "*", "")))
)
}
nw_table <- bind_rows(
nw_test(lambdas$lambda_0, "Intercept (λ₀)"),
nw_test(lambdas$lambda_MKT, "Market Premium (λ_MKT)"),
nw_test(lambdas$lambda_SMB, "Size Premium (λ_SMB)"),
nw_test(lambdas$lambda_HML, "Value Premium (λ_HML)")
)
kable(nw_table, digits = 5,
caption = "Table 6: Fama-MacBeth with Newey-West Standard Errors (6 lags)")| Factor | Mean λ | NW Std. Error | NW t-Stat | p-Value | Sig. |
|---|---|---|---|---|---|
| Intercept (λ₀) | -0.00175 | 0.00208 | -0.84245 | 0.39970 | |
| Market Premium (λ_MKT) | 0.00213 | 0.00170 | 1.25433 | 0.20996 | |
| Size Premium (λ_SMB) | -0.00352 | 0.00258 | -1.36754 | 0.17170 | |
| Value Premium (λ_HML) | 0.00272 | 0.00176 | 1.54807 | 0.12186 |
## Newey-West SE with 6 lags. *** p<0.01 ** p<0.05 * p<0.10
fm_comp <- fm_table %>%
filter(Factor != "Intercept (λ₀)") %>%
select(Factor, `Mean λ`, SE = `Std. Error`) %>%
mutate(Method = "Standard FM")
nw_comp <- nw_table %>%
filter(Factor != "Intercept (λ₀)") %>%
select(Factor, `Mean λ`, SE = `NW Std. Error`) %>%
mutate(Method = "Newey-West")
bind_rows(fm_comp, nw_comp) %>%
mutate(
ci_lo = `Mean λ` - 1.96 * SE,
ci_hi = `Mean λ` + 1.96 * SE
) %>%
ggplot(aes(x = Factor, y = `Mean λ`, colour = Method, group = Method)) +
geom_point(position = position_dodge(0.4), size = 3) +
geom_errorbar(aes(ymin = ci_lo, ymax = ci_hi),
position = position_dodge(0.4),
width = 0.2, linewidth = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
scale_colour_manual(values = c("Standard FM" = "#4575b4",
"Newey-West" = "#d73027")) +
labs(title = "Comparison: Standard FM vs Newey-West Standard Errors",
subtitle = "Error bars = 95% confidence interval",
x = NULL, y = "Mean λ", colour = "Method") +
theme_minimal(base_size = 11) +
theme(legend.position = "bottom")| Factor | Theory | Expected Sign |
|---|---|---|
| Market (MKT) | Higher market β → higher expected return (CAPM/FF3) | Positive (+) |
| Size (SMB) | Smaller firms (high SMB β) earn a size premium | Positive (+) |
| Value (HML) | Value firms (high HML β) earn a value premium | Positive (+) |
| Factor | Mean λ | NW t-Stat | p-Value | Sig. | Result |
|---|---|---|---|---|---|
| Market Premium (λ_MKT) | 0.00213 | 1.25433 | 0.20996 | Not statistically significant | |
| Size Premium (λ_SMB) | -0.00352 | -1.36754 | 0.17170 | Not statistically significant | |
| Value Premium (λ_HML) | 0.00272 | 1.54807 | 0.12186 | Not statistically significant |
Key Takeaways: