The Fama-MacBeth (1973) procedure is a two-pass regression method used in empirical asset pricing to estimate risk premia. It involves:
First Pass — Time-Series Regression (N
regressions):
For each stock i, regress its excess returns on factor returns
over the full sample period to estimate factor loadings
(betas).
Second Pass — Cross-Sectional Regression (T
regressions):
For each time period t, regress all stocks’ returns on the
betas estimated in the first pass to obtain period-specific risk premia
(λ).
Final Step — Time-Series Average:
Average the T cross-sectional coefficients. Compute Fama-MacBeth
standard errors as sd(λ) / sqrt(T).
# Install if needed:
# install.packages(c("tidyverse", "broom", "knitr", "kableExtra"))
library(tidyverse) # data manipulation & ggplot2
library(broom) # tidy regression output
library(knitr) # tables
library(kableExtra) # styled tables
# ── Try to load the real CSV; fall back to synthetic data if not found ──────
csv_path <- "data.csv" # ← change to your actual filename if needed
if (file.exists(csv_path)) {
df <- read.csv(csv_path)
# Rename columns to match the standard names used below (edit as needed):
# df <- df %>% rename(stock_id = ..., date = ..., ret = ..., mkt = ...)
cat("✔ Loaded real data:", nrow(df), "rows\n")
} else {
message("'data.csv' not found — using synthetic data for demonstration.")
set.seed(42)
n_stocks <- 50 # number of stocks
n_periods <- 60 # monthly periods (5 years)
# True betas drawn from N(1, 0.4²)
true_betas <- rnorm(n_stocks, mean = 1, sd = 0.4)
dates <- format(seq(as.Date("2018-01-01"),
by = "month", length.out = n_periods), "%Y-%m")
stock_ids <- paste0("S", sprintf("%03d", seq_len(n_stocks)))
# Simulate market excess return ~ N(0.006, 0.04²)
mkt_series <- rnorm(n_periods, mean = 0.006, sd = 0.04)
# Build panel: each row = one stock × one period
df <- expand.grid(stock_id = stock_ids,
date = dates,
stringsAsFactors = FALSE)
df <- df %>%
left_join(tibble(stock_id = stock_ids, true_beta = true_betas),
by = "stock_id") %>%
left_join(tibble(date = dates, mkt = mkt_series),
by = "date") %>%
mutate(
smb = rnorm(n(), mean = 0.002, sd = 0.03),
hml = rnorm(n(), mean = 0.001, sd = 0.025),
# Stock return = alpha + beta*mkt + noise
ret = 0.001 + true_beta * mkt + rnorm(n(), mean = 0, sd = 0.03)
) %>%
select(stock_id, date, ret, mkt, smb, hml)
}
# Preview
head(df)
dim(df)
## [1] 3000 6
str(df)
## 'data.frame': 3000 obs. of 6 variables:
## $ stock_id: chr "S001" "S002" "S003" "S004" ...
## $ date : chr "2018-01" "2018-01" "2018-01" "2018-01" ...
## $ ret : num 0.0334 0.0278 0.0317 0.0508 0.08 ...
## $ mkt : num 0.0189 0.0189 0.0189 0.0189 0.0189 ...
## $ smb : num 0.00125 0.00524 -0.01256 -0.01313 -0.04783 ...
## $ hml : num 0.01026 -0.00873 -0.0105 -0.00565 0.01354 ...
## - attr(*, "out.attrs")=List of 2
## ..$ dim : Named int [1:2] 50 60
## .. ..- attr(*, "names")= chr [1:2] "stock_id" "date"
## ..$ dimnames:List of 2
## .. ..$ stock_id: chr [1:50] "stock_id=S001" "stock_id=S002" "stock_id=S003" "stock_id=S004" ...
## .. ..$ date : chr [1:60] "date=2018-01" "date=2018-02" "date=2018-03" "date=2018-04" ...
# Ensure data is sorted by stock then date
df <- df %>% arrange(stock_id, date)
# Summary statistics
summary(df)
## stock_id date ret mkt
## Length:3000 Length:3000 Min. :-0.21844 Min. :-0.11372
## Class :character Class :character 1st Qu.:-0.01902 1st Qu.:-0.01371
## Mode :character Mode :character Median : 0.01085 Median : 0.01347
## Mean : 0.01133 Mean : 0.01089
## 3rd Qu.: 0.04362 3rd Qu.: 0.03359
## Max. : 0.18733 Max. : 0.07994
## smb hml
## Min. :-0.099152 Min. :-0.0822684
## 1st Qu.:-0.017765 1st Qu.:-0.0167800
## Median : 0.001788 Median : 0.0008203
## Mean : 0.001648 Mean : 0.0008473
## 3rd Qu.: 0.021825 3rd Qu.: 0.0179550
## Max. : 0.109540 Max. : 0.0834722
For each stock, run a time-series regression of returns on the market factor (and additional factors if available) to estimate β.
# ── Single-factor model (CAPM): ret_i = α_i + β_i * mkt + ε ──────────────
first_pass <- df %>%
group_by(stock_id) %>%
do(tidy(lm(ret ~ mkt, data = .))) %>%
ungroup()
# Keep only the beta (slope on mkt), rename for clarity
betas <- first_pass %>%
filter(term == "mkt") %>%
select(stock_id, beta = estimate)
# Preview
kable(head(betas, 10),
caption = "First-Pass Beta Estimates (first 10 stocks)",
digits = 4) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)
| stock_id | beta |
|---|---|
| S001 | 1.5237 |
| S002 | 0.6087 |
| S003 | 1.1588 |
| S004 | 1.3219 |
| S005 | 1.1183 |
| S006 | 0.8465 |
| S007 | 1.6088 |
| S008 | 0.9032 |
| S009 | 1.7767 |
| S010 | 1.0492 |
ggplot(betas, aes(x = beta)) +
geom_histogram(bins = 30, fill = "#2C7BB6", colour = "white", alpha = 0.85) +
geom_vline(xintercept = 1, linetype = "dashed", colour = "red", linewidth = 0.8) +
labs(title = "Distribution of First-Pass Beta Estimates",
x = "Beta",
y = "Count") +
theme_minimal(base_size = 13)
Merge the betas back into the panel, then for each time period run a cross-sectional regression of returns on betas.
# Merge betas into panel
df2 <- df %>% left_join(betas, by = "stock_id")
# Run cross-sectional regression for every period t
second_pass <- df2 %>%
group_by(date) %>%
do(tidy(lm(ret ~ beta, data = .))) %>%
ungroup()
# Extract the risk premium (λ) on beta each period
lambda <- second_pass %>%
filter(term == "beta") %>%
select(date, lambda = estimate)
# Preview
kable(head(lambda, 10),
caption = "Second-Pass Risk Premia λ (first 10 periods)",
digits = 4) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)
| date | lambda |
|---|---|
| 2018-01 | 0.0131 |
| 2018-02 | -0.0369 |
| 2018-03 | 0.0907 |
| 2018-04 | 0.0224 |
| 2018-05 | 0.0132 |
| 2018-06 | 0.0148 |
| 2018-07 | 0.0177 |
| 2018-08 | -0.0002 |
| 2018-09 | -0.0950 |
| 2018-10 | 0.0233 |
ggplot(lambda, aes(x = date, y = lambda, group = 1)) +
geom_line(colour = "#2C7BB6", linewidth = 0.7) +
geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
labs(title = "Cross-Sectional Risk Premia (λ) Over Time",
x = "Period",
y = "Lambda (λ)") +
theme_minimal(base_size = 13) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Take the time-series average of λ values and compute the Fama-MacBeth t-statistic.
T <- nrow(lambda) # number of time periods
fm_results <- lambda %>%
summarise(
Mean_Lambda = mean(lambda),
SD_Lambda = sd(lambda),
SE_FM = sd(lambda) / sqrt(T), # Fama-MacBeth SE
t_stat = mean(lambda) / (sd(lambda) / sqrt(T)),
p_value = 2 * pt(-abs(mean(lambda) / (sd(lambda) / sqrt(T))),
df = T - 1)
)
kable(fm_results,
caption = "Fama-MacBeth Regression Results",
digits = 4,
col.names = c("Mean λ", "SD λ", "FM Std. Error",
"t-Statistic", "p-Value")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(0, bold = TRUE, background = "#2C7BB6", color = "white")
| Mean λ | SD λ | FM Std. Error | t-Statistic | p-Value |
|---|---|---|---|---|
| 0.0105 | 0.038 | 0.0049 | 2.133 | 0.0371 |
cat("──────────────────────────────────────────────\n")
## ──────────────────────────────────────────────
cat(sprintf(" Average Risk Premium (λ) : %.4f\n", fm_results$Mean_Lambda))
## Average Risk Premium (λ) : 0.0105
cat(sprintf(" Fama-MacBeth Std. Error : %.4f\n", fm_results$SE_FM))
## Fama-MacBeth Std. Error : 0.0049
cat(sprintf(" t-Statistic : %.4f\n", fm_results$t_stat))
## t-Statistic : 2.1330
cat(sprintf(" p-Value (two-tailed) : %.4f\n", fm_results$p_value))
## p-Value (two-tailed) : 0.0371
cat("──────────────────────────────────────────────\n")
## ──────────────────────────────────────────────
if (fm_results$p_value < 0.05) {
cat(" ✔ Market beta is a significant pricing factor (p < 0.05)\n")
} else {
cat(" ✘ Market beta is NOT a significant pricing factor (p ≥ 0.05)\n")
}
## ✔ Market beta is a significant pricing factor (p < 0.05)
If the data includes SMB and HML, extend both passes as follows.
# ── First pass: Fama-French 3-Factor betas ────────────────────────────────
first_pass_ff3 <- df %>%
group_by(stock_id) %>%
do(tidy(lm(ret ~ mkt + smb + hml, data = .))) %>%
ungroup()
betas_ff3 <- first_pass_ff3 %>%
filter(term %in% c("mkt", "smb", "hml")) %>%
select(stock_id, term, estimate) %>%
pivot_wider(names_from = term, values_from = estimate,
names_prefix = "beta_")
# ── Merge and second pass ─────────────────────────────────────────────────
df_ff3 <- df %>% left_join(betas_ff3, by = "stock_id")
second_pass_ff3 <- df_ff3 %>%
group_by(date) %>%
do(tidy(lm(ret ~ beta_mkt + beta_smb + beta_hml, data = .))) %>%
ungroup()
# ── FM averages for each factor ───────────────────────────────────────────
T <- n_distinct(df_ff3$date)
fm_ff3 <- second_pass_ff3 %>%
filter(term != "(Intercept)") %>%
group_by(term) %>%
summarise(
Mean_Lambda = mean(estimate),
SE_FM = sd(estimate) / sqrt(T),
t_stat = mean(estimate) / (sd(estimate) / sqrt(T)),
p_value = 2 * pt(-abs(mean(estimate) / (sd(estimate) / sqrt(T))),
df = T - 1)
)
kable(fm_ff3, caption = "FM Results — Fama-French 3-Factor Model",
digits = 4) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Step | What We Did | Output |
|---|---|---|
| 1 – Time-Series | Regressed each stock’s returns on market factor | β estimates per stock |
| 2 – Cross-Section | For each period, regressed returns on β | λ_t per period |
| 3 – FM Average | Averaged λ_t; FM SE = sd(λ)/√T | Mean λ, t-stat, p-value |
The Fama-MacBeth procedure correctly accounts for cross-sectional correlation of residuals, which ordinary pooled OLS ignores. The resulting t-statistic tests whether the risk premium on a factor is statistically different from zero.
Reference: Fama, E. F., & MacBeth, J. D. (1973). Risk, Return, and Equilibrium: Empirical Tests. Journal of Political Economy, 81(3), 607–636.