The Fama-MacBeth (1973) procedure is a cornerstone of empirical asset pricing. It tests whether risk factors — such as the three Fama-French factors (Market, SMB, HML) — are priced in the cross-section of stock returns. It produces standard errors that are robust to cross-sectional correlation in residuals, a critical property when working with large panels of assets.
| Method | Handles Cross-Sectional Correlation? | Preferred When |
|---|---|---|
| Pooled OLS | ❌ No | Large T, few N |
| Fixed Effects | ❌ No | Unobserved heterogeneity |
| Fama-MacBeth | ✅ Yes | Many N, moderate T |
\[r_{i,t} - r_{f,t} = \alpha_i + \beta_i^{MKT} \cdot MKT_t + \beta_i^{SMB} \cdot SMB_t + \beta_i^{HML} \cdot HML_t + \varepsilon_{i,t}\]
Where:
# Install packages if not already installed
required_pkgs <- c("broom", "tidyverse", "knitr", "kableExtra", "scales", "zoo")
new_pkgs <- required_pkgs[!(required_pkgs %in% installed.packages()[,"Package"])]
if (length(new_pkgs)) install.packages(new_pkgs, repos = "https://cloud.r-project.org")
library(broom)
library(tidyverse)
library(knitr)
library(kableExtra)
library(scales)
library(zoo)data <- read.csv("data.csv", stringsAsFactors = FALSE)
# Parse dates
data$date <- as.Date(data$date, format = "%d-%b-%y")
glimpse(data)## Rows: 7,542
## Columns: 6
## $ symbol <chr> "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL",…
## $ date <date> 2011-01-04, 2011-01-05, 2011-01-06, 2011-01-07, 2011-01-10, 20…
## $ ri <dbl> 0.0052062641, 0.0081462879, -0.0008082435, 0.0071360567, 0.0186…
## $ MKT <dbl> -0.0013138901, 0.0049946699, -0.0021252276, -0.0018465050, -0.0…
## $ SMB <dbl> -0.0065, 0.0018, 0.0001, 0.0022, 0.0041, 0.0016, 0.0031, -0.002…
## $ HML <dbl> 0.0008, 0.0013, -0.0025, -0.0006, 0.0039, 0.0036, 0.0000, -0.00…
# Summary statistics per stock
summary_tbl <- data %>%
group_by(symbol) %>%
summarise(
N_obs = n(),
Mean_ri = mean(ri, na.rm = TRUE),
SD_ri = sd(ri, na.rm = TRUE),
Min_ri = min(ri, na.rm = TRUE),
Max_ri = max(ri, na.rm = TRUE),
Mean_MKT = mean(MKT, na.rm = TRUE),
Mean_SMB = mean(SMB, na.rm = TRUE),
Mean_HML = mean(HML, na.rm = TRUE)
) %>%
mutate(across(where(is.numeric), ~round(.x, 5)))
kable(summary_tbl,
caption = "Table 1: Descriptive Statistics by Stock",
col.names = c("Symbol","N","Mean Ret","SD Ret","Min Ret","Max Ret",
"Mean MKT","Mean SMB","Mean HML")) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE) %>%
column_spec(1, bold = TRUE)| Symbol | N | Mean Ret | SD Ret | Min Ret | Max Ret | Mean MKT | Mean SMB | Mean HML |
|---|---|---|---|---|---|---|---|---|
| AAPL | 1257 | 0.00070 | 0.01680 | -0.13188 | 0.08502 | 0.00038 | 0 | 0.00013 |
| FORD | 1257 | -0.00058 | 0.05549 | -0.39087 | 0.96141 | 0.00038 | 0 | 0.00013 |
| GE | 1257 | 0.00056 | 0.01345 | -0.06765 | 0.10260 | 0.00038 | 0 | 0.00013 |
| GM | 1257 | -0.00001 | 0.01895 | -0.11544 | 0.09108 | 0.00038 | 0 | 0.00013 |
| IBM | 1257 | -0.00006 | 0.01221 | -0.08642 | 0.05511 | 0.00038 | 0 | 0.00013 |
| MSFT | 1257 | 0.00065 | 0.01479 | -0.12103 | 0.09941 | 0.00038 | 0 | 0.00013 |
date_range <- data %>%
group_by(symbol) %>%
summarise(Start = min(date), End = max(date), Trading_Days = n())
kable(date_range, caption = "Table 2: Data Coverage per Stock") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE)| symbol | Start | End | Trading_Days |
|---|---|---|---|
| AAPL | 2011-01-04 | 2015-12-31 | 1257 |
| FORD | 2011-01-04 | 2015-12-31 | 1257 |
| GE | 2011-01-04 | 2015-12-31 | 1257 |
| GM | 2011-01-04 | 2015-12-31 | 1257 |
| IBM | 2011-01-04 | 2015-12-31 | 1257 |
| MSFT | 2011-01-04 | 2015-12-31 | 1257 |
ggplot(data, aes(x = ri, fill = symbol)) +
geom_histogram(bins = 60, alpha = 0.8, color = "white", linewidth = 0.2) +
facet_wrap(~symbol, scales = "free_y") +
scale_x_continuous(labels = percent_format(accuracy = 0.1)) +
scale_fill_brewer(palette = "Set2") +
labs(title = "Distribution of Daily Returns by Stock",
subtitle = "Fama-French Three-Factor Dataset (2011–2015)",
x = "Daily Return",
y = "Count",
fill = "Stock") +
theme_minimal(base_size = 12) +
theme(legend.position = "none",
strip.text = element_text(face = "bold"),
plot.title = element_text(face = "bold", size = 14))cum_ret <- data %>%
arrange(symbol, date) %>%
group_by(symbol) %>%
mutate(cum_ret = cumprod(1 + ri) - 1)
ggplot(cum_ret, aes(x = date, y = cum_ret, color = symbol)) +
geom_line(linewidth = 0.9) +
scale_y_continuous(labels = percent_format(accuracy = 1)) +
scale_color_brewer(palette = "Set1") +
labs(title = "Cumulative Returns: 2011–2015",
subtitle = "Buy-and-Hold Returns from January 2011",
x = "Date",
y = "Cumulative Return",
color = "Stock") +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom",
plot.title = element_text(face = "bold", size = 14))factor_data <- data %>%
select(MKT, SMB, HML, ri) %>%
rename(Market = MKT, SMB_Size = SMB, HML_Value = HML, Stock_Return = ri)
corr_mat <- cor(factor_data, use = "pairwise.complete.obs")
corr_df <- as.data.frame(as.table(round(corr_mat, 3))) %>%
rename(Var1 = Var1, Var2 = Var2, Correlation = Freq) %>%
mutate(
label = sprintf("%.3f", Correlation),
Var1 = factor(Var1, levels = colnames(corr_mat)),
Var2 = factor(Var2, levels = rev(colnames(corr_mat)))
)
ggplot(corr_df, aes(x = Var1, y = Var2, fill = Correlation)) +
geom_tile(color = "white", linewidth = 0.8) +
geom_text(aes(label = label), size = 4.5, fontface = "bold") +
scale_fill_gradient2(low = "#d73027", mid = "white", high = "#1a9850",
midpoint = 0, limits = c(-1, 1)) +
labs(title = "Factor Correlation Matrix",
x = NULL, y = NULL, fill = "Correlation") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, face = "bold"),
axis.text.y = element_text(face = "bold"),
plot.title = element_text(face = "bold", size = 13),
legend.position = "right")┌──────────────────────────────────────────────────────────────────────┐
│ STEP 0 (Pass 1): N Time-Series Regressions │
│ For each stock i = 1, ..., N: │
│ rᵢ,ₜ = αᵢ + βᵢᴹᴷᵀ·MKTₜ + βᵢˢᴹᴮ·SMBₜ + βᵢᴴᴹᴸ·HMLₜ + εᵢ,ₜ │
│ → Obtain factor loadings: β̂ᵢᴹᴷᵀ, β̂ᵢˢᴹᴮ, β̂ᵢᴴᴹᴸ │
├──────────────────────────────────────────────────────────────────────┤
│ STEP 1 (Pass 2): T Cross-Sectional Regressions │
│ For each time period t = 1, ..., T: │
│ rᵢ,ₜ = λₜ₀ + λₜᴹᴷᵀ·β̂ᵢᴹᴷᵀ + λₜˢᴹᴮ·β̂ᵢˢᴹᴮ + λₜᴴᴹᴸ·β̂ᵢᴴᴹᴸ + ηᵢ,ₜ │
│ → Obtain time-series of risk premia: λ̂ₜᴹᴷᵀ, λ̂ₜˢᴹᴮ, λ̂ₜᴴᴹᴸ │
├──────────────────────────────────────────────────────────────────────┤
│ STEP 2: Average the Risk Premia │
│ λ̄ = (1/T) Σₜ λ̂ₜ │
│ Test H₀: λ̄ = 0 using t-test on the time-series of λ̂ₜ │
└──────────────────────────────────────────────────────────────────────┘
# --- Pass 1: Estimate betas via time-series regression for each stock ---
step0 <- data %>%
nest(data = c(date, ri, MKT, SMB, HML)) %>%
mutate(estimates = map(
data,
~tidy(lm(ri ~ MKT + SMB + HML, data = .x))
)) %>%
unnest(estimates) %>%
select(symbol, estimate, term) %>%
pivot_wider(names_from = term,
values_from = estimate) %>%
select(symbol,
b_MKT = MKT,
b_HML = HML,
b_SMB = SMB)
kable(step0 %>% mutate(across(where(is.numeric), ~round(.x, 4))),
caption = "Table 3: Estimated Factor Betas (Pass 1)",
col.names = c("Symbol","β̂ MKT","β̂ HML","β̂ SMB")) %>%
kable_styling(bootstrap_options = c("striped","hover"),
full_width = FALSE) %>%
column_spec(1, bold = TRUE)| Symbol | β̂ MK |
β̂ H
| |
|---|---|---|---|
| AAPL | 0.9000 | -0.0578 | 0.0685 |
| FORD | 0.5129 | 0.1380 | -0.2644 |
| GE | 1.0779 | 0.0902 | 0.0994 |
| GM | 1.2854 | -0.0222 | 0.0039 |
| IBM | 0.8169 | -0.0121 | 0.0336 |
| MSFT | 0.9656 | -0.0641 | 0.0582 |
step0_long <- step0 %>%
pivot_longer(cols = -symbol, names_to = "factor", values_to = "beta") %>%
mutate(factor = recode(factor,
b_MKT = "Market (MKT)",
b_SMB = "Size (SMB)",
b_HML = "Value (HML)"))
ggplot(step0_long, aes(x = symbol, y = beta, fill = factor)) +
geom_col(position = "dodge", color = "white", linewidth = 0.3) +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey40") +
scale_fill_manual(values = c("#2166ac","#d6604d","#4dac26")) +
labs(title = "Estimated Factor Betas by Stock (Pass 1)",
subtitle = "From time-series regression of daily returns on Fama-French factors",
x = "Stock",
y = "Beta Coefficient",
fill = "Factor") +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom",
plot.title = element_text(face = "bold", size = 14))reg_details <- data %>%
nest(data = c(date, ri, MKT, SMB, HML)) %>%
mutate(
model = map(data, ~lm(ri ~ MKT + SMB + HML, data = .x)),
glanced = map(model, glance)
) %>%
unnest(glanced) %>%
select(symbol, r.squared, adj.r.squared, statistic, p.value, nobs) %>%
mutate(across(where(is.numeric), ~round(.x, 4)))
kable(reg_details,
caption = "Table 4: Time-Series Regression Fit Statistics (Pass 1)",
col.names = c("Symbol","R²","Adj. R²","F-stat","p-value","N")) %>%
kable_styling(bootstrap_options = c("striped","hover"),
full_width = FALSE) %>%
column_spec(1, bold = TRUE)| Symbol | R² | Adj. R² | F-stat | p-value | N |
|---|---|---|---|---|---|
| AAPL | 0.2729 | 0.2712 | 156.7577 | 0.0000 | 1257 |
| FORD | 0.0090 | 0.0067 | 3.8122 | 0.0098 | 1257 |
| GE | 0.6125 | 0.6116 | 660.1958 | 0.0000 | 1257 |
| GM | 0.4379 | 0.4366 | 325.4098 | 0.0000 | 1257 |
| IBM | 0.4255 | 0.4241 | 309.2936 | 0.0000 | 1257 |
| MSFT | 0.4054 | 0.4040 | 284.7811 | 0.0000 | 1257 |
# Merge betas back onto panel
step0_panel <- data %>%
left_join(step0, by = "symbol")
# --- Pass 2: Cross-sectional regression for each date ---
step1 <- step0_panel %>%
nest(data = c(symbol, ri, b_MKT, b_SMB, b_HML)) %>%
mutate(estimates = map(
data,
~tidy(lm(ri ~ b_MKT + b_SMB + b_HML, data = .x))
)) %>%
unnest(estimates) %>%
select(date, estimate, term) %>%
pivot_wider(names_from = term,
values_from = estimate) %>%
select(date, b_MKT, b_HML, b_SMB)
# Preview first few rows
kable(head(step1, 10) %>% mutate(across(where(is.numeric), ~round(.x, 6))),
caption = "Table 5: Cross-Sectional Risk Premia (first 10 dates)",
col.names = c("Date","λ MKT","λ HML","λ SMB")) %>%
kable_styling(bootstrap_options = c("striped","hover"),
full_width = FALSE)| Date | λ MKT | λ HML | λ SMB |
|---|---|---|---|
| 2011-01-04 | 0.041629 | 0.057372 | -0.025520 |
| 2011-01-05 | -0.011347 | 0.062847 | -0.158046 |
| 2011-01-06 | 0.037301 | -0.173234 | 0.007029 |
| 2011-01-07 | 0.012722 | -0.064226 | 0.032269 |
| 2011-01-10 | -0.036631 | 0.058646 | 0.017123 |
| 2011-01-11 | 0.004089 | 0.089858 | -0.095361 |
| 2011-01-12 | -0.055365 | 0.043036 | -0.164496 |
| 2011-01-13 | -0.019357 | 0.025630 | 0.001815 |
| 2011-01-14 | -0.016486 | 0.039214 | 0.063259 |
| 2011-01-18 | 0.010146 | -0.090027 | 0.052508 |
step1_long <- step1 %>%
pivot_longer(cols = -date, names_to = "factor", values_to = "lambda") %>%
mutate(factor = recode(factor,
b_MKT = "Market (MKT)",
b_SMB = "Size (SMB)",
b_HML = "Value (HML)"))
ggplot(step1_long, aes(x = date, y = lambda, color = factor)) +
geom_line(alpha = 0.6, linewidth = 0.4) +
geom_hline(yintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5) +
geom_smooth(method = "loess", span = 0.15, se = FALSE, linewidth = 1.1) +
facet_wrap(~factor, ncol = 1, scales = "free_y") +
scale_color_manual(values = c("#2166ac","#4dac26","#d6604d")) +
scale_y_continuous(labels = percent_format(accuracy = 0.01)) +
labs(title = "Time Series of Cross-Sectional Risk Premia (λ̂ₜ)",
subtitle = "Daily estimates from Pass 2 cross-sectional regressions. Smoothed trend in bold.",
x = "Date",
y = "Risk Premium (λ̂ₜ)",
color = "Factor") +
theme_minimal(base_size = 12) +
theme(legend.position = "none",
strip.text = element_text(face = "bold"),
plot.title = element_text(face = "bold", size = 14))ggplot(step1_long, aes(x = lambda, fill = factor)) +
geom_histogram(bins = 60, alpha = 0.85, color = "white", linewidth = 0.2) +
geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
facet_wrap(~factor, scales = "free") +
scale_fill_manual(values = c("#2166ac","#4dac26","#d6604d")) +
scale_x_continuous(labels = percent_format(accuracy = 0.1)) +
labs(title = "Distribution of Daily Risk Premia Estimates",
subtitle = "Each observation is a cross-sectional regression coefficient for one trading day",
x = "Risk Premium λ̂ₜ",
y = "Count") +
theme_minimal(base_size = 12) +
theme(legend.position = "none",
strip.text = element_text(face = "bold"),
plot.title = element_text(face = "bold", size = 14))# t-tests: H₀: mean λ = 0
t_mkt <- t.test(step1$b_MKT, mu = 0)
t_smb <- t.test(step1$b_SMB, mu = 0)
t_hml <- t.test(step1$b_HML, mu = 0)results <- tibble(
Factor = c("Market (MKT)", "Size (SMB)", "Value (HML)"),
Mean_Lambda = c(mean(step1$b_MKT), mean(step1$b_SMB), mean(step1$b_HML)),
SD_Lambda = c(sd(step1$b_MKT), sd(step1$b_SMB), sd(step1$b_HML)),
t_statistic = c(t_mkt$statistic, t_smb$statistic, t_hml$statistic),
p_value = c(t_mkt$p.value, t_smb$p.value, t_hml$p.value),
CI_lower = c(t_mkt$conf.int[1], t_smb$conf.int[1], t_hml$conf.int[1]),
CI_upper = c(t_mkt$conf.int[2], t_smb$conf.int[2], t_hml$conf.int[2]),
Significant = c(t_mkt$p.value < 0.05, t_smb$p.value < 0.05, t_hml$p.value < 0.05)
) %>%
mutate(across(where(is.numeric), ~round(.x, 5)))
kable(results,
caption = "Table 6: Fama-MacBeth Risk Premium Estimates",
col.names = c("Factor","Mean λ̄","Std Dev","t-stat","p-value",
"95% CI Lower","95% CI Upper","Significant (5%)")) %>%
kable_styling(bootstrap_options = c("striped","hover","bordered"),
full_width = FALSE) %>%
column_spec(1, bold = TRUE) %>%
column_spec(8, bold = TRUE,
color = ifelse(results$Significant, "darkgreen", "darkred")) %>%
row_spec(which(results$Significant), background = "#eaffea")| Factor | Mean λ |
Std De
| |||||
|---|---|---|---|---|---|---|---|
| Market (MKT) | -0.00041 | 0.03857 | -0.37879 | 0.70491 | -0.00255 | 0.00172 | FALSE |
| Size (SMB) | 0.00368 | 0.13363 | 0.97712 | 0.32870 | -0.00371 | 0.01108 | FALSE |
| Value (HML) | -0.00047 | 0.09171 | -0.18044 | 0.85684 | -0.00554 | 0.00461 | FALSE |
results_plot <- results %>%
mutate(Factor = factor(Factor, levels = c("Market (MKT)","Size (SMB)","Value (HML)")))
ggplot(results_plot, aes(x = Factor, y = Mean_Lambda, color = Significant)) +
geom_point(size = 5) +
geom_errorbar(aes(ymin = CI_lower, ymax = CI_upper), width = 0.15, linewidth = 1.1) +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey50", linewidth = 0.8) +
scale_color_manual(values = c("TRUE" = "#1a9850", "FALSE" = "#d73027"),
labels = c("TRUE" = "Significant (p<0.05)",
"FALSE" = "Not Significant")) +
scale_y_continuous(labels = percent_format(accuracy = 0.01)) +
labs(title = "Fama-MacBeth Risk Premia: Estimates and 95% Confidence Intervals",
subtitle = "H₀: λ̄ = 0 | Green = reject H₀, Red = fail to reject H₀",
x = "Risk Factor",
y = "Average Risk Premium (λ̄)",
color = NULL) +
theme_minimal(base_size = 13) +
theme(legend.position = "bottom",
plot.title = element_text(face = "bold", size = 14),
axis.text.x = element_text(face = "bold"))To see how risk premia evolve over time, we compute 6-month rolling averages of the daily cross-sectional estimates.
roll_window <- 125 # approx 6 months of trading days
step1_roll <- step1 %>%
arrange(date) %>%
mutate(
roll_MKT = zoo::rollmean(b_MKT, k = roll_window, fill = NA, align = "right"),
roll_SMB = zoo::rollmean(b_SMB, k = roll_window, fill = NA, align = "right"),
roll_HML = zoo::rollmean(b_HML, k = roll_window, fill = NA, align = "right")
) %>%
pivot_longer(cols = starts_with("roll_"),
names_to = "factor",
values_to = "rolling_mean") %>%
mutate(factor = recode(factor,
roll_MKT = "Market (MKT)",
roll_SMB = "Size (SMB)",
roll_HML = "Value (HML)"))
ggplot(step1_roll, aes(x = date, y = rolling_mean, color = factor)) +
geom_line(linewidth = 1.0, na.rm = TRUE) +
geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
facet_wrap(~factor, ncol = 1, scales = "free_y") +
scale_color_manual(values = c("#2166ac","#4dac26","#d6604d")) +
scale_y_continuous(labels = percent_format(accuracy = 0.01)) +
labs(title = "6-Month Rolling Average of Daily Risk Premia",
subtitle = paste0("Rolling window = ", roll_window, " trading days"),
x = "Date",
y = "Rolling Mean λ̄",
color = "Factor") +
theme_minimal(base_size = 12) +
theme(legend.position = "none",
strip.text = element_text(face = "bold"),
plot.title = element_text(face = "bold", size = 14))conclusion <- tibble(
Factor = c("Market (MKT)", "Size (SMB)", "Value (HML)"),
Finding = c(
paste0("Mean daily premium = ", round(mean(step1$b_MKT)*100, 4),
"%. p = ", round(t_mkt$p.value, 4),
". ", ifelse(t_mkt$p.value < 0.05, "✅ Priced", "❌ Not priced"), "."),
paste0("Mean daily premium = ", round(mean(step1$b_SMB)*100, 4),
"%. p = ", round(t_smb$p.value, 4),
". ", ifelse(t_smb$p.value < 0.05, "✅ Priced", "❌ Not priced"), "."),
paste0("Mean daily premium = ", round(mean(step1$b_HML)*100, 4),
"%. p = ", round(t_hml$p.value, 4),
". ", ifelse(t_hml$p.value < 0.05, "✅ Priced", "❌ Not priced"), ".")
)
)
kable(conclusion, caption = "Table 7: Summary of Fama-MacBeth Findings") %>%
kable_styling(bootstrap_options = c("striped","hover","bordered"),
full_width = TRUE) %>%
column_spec(1, bold = TRUE, width = "15em")| Factor | Finding |
|---|---|
| Market (MKT) | Mean daily premium = -0.0412%. p = 0.7049. ❌ Not priced. | |
| Size (SMB) | Mean daily premium = 0.3683%. p = 0.3287. ❌ Not priced. | |
| Value (HML) | Mean daily premium = -0.0467%. p = 0.8568. ❌ Not priced. | |
The Fama-MacBeth two-pass regression was applied to daily returns of 6 stocks over 1257 trading days (2011–2015), using the Fama-French three-factor model.
Key Findings:
Market Factor (MKT): The market risk premium is not statistically significant at the 5% level (p = 0.7049).
Size Factor (SMB): The SMB premium is not statistically significant (p = 0.3287).
Value Factor (HML): The HML premium is not statistically significant (p = 0.8568).
Methodological Note: The small cross-section (N = 6 stocks) limits the power of the cross-sectional regressions. Fama-MacBeth performs best with large N and moderate T. The results here are illustrative of the procedure rather than definitive empirical claims about factor pricing.
Generated with R Markdown | Packages: tidyverse,
broom, kableExtra, ggcorrplot,
scales, zoo