1 Introduction

The Fama-MacBeth (1973) procedure is a two-pass regression method used in empirical asset pricing to estimate risk premia. It involves:

  1. First Pass — Time-Series Regression (N regressions):
    For each stock i, regress its excess returns on factor returns over the full sample period to estimate factor loadings (betas).

  2. Second Pass — Cross-Sectional Regression (T regressions):
    For each time period t, regress all stocks’ returns on the betas estimated in the first pass to obtain period-specific risk premia (λ).

  3. Final Step — Time-Series Average:
    Average the T cross-sectional coefficients. Compute Fama-MacBeth standard errors as sd(λ) / sqrt(T).


2 Load Packages

# Install if needed:
# install.packages(c("tidyverse", "broom", "knitr", "kableExtra"))

library(tidyverse)   # data manipulation & ggplot2
library(broom)       # tidy regression output
library(knitr)       # tables
library(kableExtra)  # styled tables

3 Load Data

# ── Try to load the real CSV; fall back to synthetic data if not found ──────
csv_path <- "data.csv"   # ← change to your actual filename if needed

if (file.exists(csv_path)) {

  df <- read.csv(csv_path)
  # Rename columns to match the standard names used below (edit as needed):
  # df <- df %>% rename(stock_id = ..., date = ..., ret = ..., mkt = ...)
  cat("✔ Loaded real data:", nrow(df), "rows\n")

} else {

  message("'data.csv' not found — using synthetic data for demonstration.")

  set.seed(42)

  n_stocks  <- 50    # number of stocks
  n_periods <- 60    # monthly periods (5 years)

  # True betas drawn from N(1, 0.4²)
  true_betas <- rnorm(n_stocks, mean = 1, sd = 0.4)

  dates     <- format(seq(as.Date("2018-01-01"),
                          by = "month", length.out = n_periods), "%Y-%m")
  stock_ids <- paste0("S", sprintf("%03d", seq_len(n_stocks)))

  # Simulate market excess return ~ N(0.006, 0.04²)
  mkt_series <- rnorm(n_periods, mean = 0.006, sd = 0.04)

  # Build panel: each row = one stock × one period
  df <- expand.grid(stock_id = stock_ids,
                    date     = dates,
                    stringsAsFactors = FALSE)

  df <- df %>%
    left_join(tibble(stock_id = stock_ids, true_beta = true_betas),
              by = "stock_id") %>%
    left_join(tibble(date = dates, mkt = mkt_series),
              by = "date") %>%
    mutate(
      smb = rnorm(n(), mean = 0.002, sd = 0.03),
      hml = rnorm(n(), mean = 0.001, sd = 0.025),
      # Stock return = alpha + beta*mkt + noise
      ret = 0.001 + true_beta * mkt + rnorm(n(), mean = 0, sd = 0.03)
    ) %>%
    select(stock_id, date, ret, mkt, smb, hml)
}

# Preview
head(df)
dim(df)
## [1] 3000    6
str(df)
## 'data.frame':    3000 obs. of  6 variables:
##  $ stock_id: chr  "S001" "S002" "S003" "S004" ...
##  $ date    : chr  "2018-01" "2018-01" "2018-01" "2018-01" ...
##  $ ret     : num  0.0334 0.0278 0.0317 0.0508 0.08 ...
##  $ mkt     : num  0.0189 0.0189 0.0189 0.0189 0.0189 ...
##  $ smb     : num  0.00125 0.00524 -0.01256 -0.01313 -0.04783 ...
##  $ hml     : num  0.01026 -0.00873 -0.0105 -0.00565 0.01354 ...
##  - attr(*, "out.attrs")=List of 2
##   ..$ dim     : Named int [1:2] 50 60
##   .. ..- attr(*, "names")= chr [1:2] "stock_id" "date"
##   ..$ dimnames:List of 2
##   .. ..$ stock_id: chr [1:50] "stock_id=S001" "stock_id=S002" "stock_id=S003" "stock_id=S004" ...
##   .. ..$ date    : chr [1:60] "date=2018-01" "date=2018-02" "date=2018-03" "date=2018-04" ...

4 Data Preparation

# Ensure data is sorted by stock then date
df <- df %>% arrange(stock_id, date)

# Summary statistics
summary(df)
##    stock_id             date                ret                mkt          
##  Length:3000        Length:3000        Min.   :-0.21844   Min.   :-0.11372  
##  Class :character   Class :character   1st Qu.:-0.01902   1st Qu.:-0.01371  
##  Mode  :character   Mode  :character   Median : 0.01085   Median : 0.01347  
##                                        Mean   : 0.01133   Mean   : 0.01089  
##                                        3rd Qu.: 0.04362   3rd Qu.: 0.03359  
##                                        Max.   : 0.18733   Max.   : 0.07994  
##       smb                 hml            
##  Min.   :-0.099152   Min.   :-0.0822684  
##  1st Qu.:-0.017765   1st Qu.:-0.0167800  
##  Median : 0.001788   Median : 0.0008203  
##  Mean   : 0.001648   Mean   : 0.0008473  
##  3rd Qu.: 0.021825   3rd Qu.: 0.0179550  
##  Max.   : 0.109540   Max.   : 0.0834722

5 Step 1 — First Pass: Time-Series Regressions

For each stock, run a time-series regression of returns on the market factor (and additional factors if available) to estimate β.

# ── Single-factor model (CAPM): ret_i = α_i + β_i * mkt + ε ──────────────
first_pass <- df %>%
  group_by(stock_id) %>%
  do(tidy(lm(ret ~ mkt, data = .))) %>%
  ungroup()

# Keep only the beta (slope on mkt), rename for clarity
betas <- first_pass %>%
  filter(term == "mkt") %>%
  select(stock_id, beta = estimate)

# Preview
kable(head(betas, 10),
      caption = "First-Pass Beta Estimates (first 10 stocks)",
      digits  = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
First-Pass Beta Estimates (first 10 stocks)
stock_id beta
S001 1.5237
S002 0.6087
S003 1.1588
S004 1.3219
S005 1.1183
S006 0.8465
S007 1.6088
S008 0.9032
S009 1.7767
S010 1.0492
ggplot(betas, aes(x = beta)) +
  geom_histogram(bins = 30, fill = "#2C7BB6", colour = "white", alpha = 0.85) +
  geom_vline(xintercept = 1, linetype = "dashed", colour = "red", linewidth = 0.8) +
  labs(title = "Distribution of First-Pass Beta Estimates",
       x     = "Beta",
       y     = "Count") +
  theme_minimal(base_size = 13)


6 Step 2 — Second Pass: Cross-Sectional Regressions

Merge the betas back into the panel, then for each time period run a cross-sectional regression of returns on betas.

# Merge betas into panel
df2 <- df %>% left_join(betas, by = "stock_id")

# Run cross-sectional regression for every period t
second_pass <- df2 %>%
  group_by(date) %>%
  do(tidy(lm(ret ~ beta, data = .))) %>%
  ungroup()

# Extract the risk premium (λ) on beta each period
lambda <- second_pass %>%
  filter(term == "beta") %>%
  select(date, lambda = estimate)

# Preview
kable(head(lambda, 10),
      caption  = "Second-Pass Risk Premia λ (first 10 periods)",
      digits   = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Second-Pass Risk Premia λ (first 10 periods)
date lambda
2018-01 0.0131
2018-02 -0.0369
2018-03 0.0907
2018-04 0.0224
2018-05 0.0132
2018-06 0.0148
2018-07 0.0177
2018-08 -0.0002
2018-09 -0.0950
2018-10 0.0233
ggplot(lambda, aes(x = date, y = lambda, group = 1)) +
  geom_line(colour = "#2C7BB6", linewidth = 0.7) +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
  labs(title = "Cross-Sectional Risk Premia (λ) Over Time",
       x     = "Period",
       y     = "Lambda (λ)") +
  theme_minimal(base_size = 13) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


7 Step 3 — Fama-MacBeth Summary Statistics

Take the time-series average of λ values and compute the Fama-MacBeth t-statistic.

T <- nrow(lambda)   # number of time periods

fm_results <- lambda %>%
  summarise(
    Mean_Lambda = mean(lambda),
    SD_Lambda   = sd(lambda),
    SE_FM       = sd(lambda) / sqrt(T),     # Fama-MacBeth SE
    t_stat      = mean(lambda) / (sd(lambda) / sqrt(T)),
    p_value     = 2 * pt(-abs(mean(lambda) / (sd(lambda) / sqrt(T))),
                          df = T - 1)
  )

kable(fm_results,
      caption = "Fama-MacBeth Regression Results",
      digits  = 4,
      col.names = c("Mean λ", "SD λ", "FM Std. Error",
                    "t-Statistic", "p-Value")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#2C7BB6", color = "white")
Fama-MacBeth Regression Results
Mean λ SD λ FM Std. Error t-Statistic p-Value
0.0105 0.038 0.0049 2.133 0.0371
cat("──────────────────────────────────────────────\n")
## ──────────────────────────────────────────────
cat(sprintf("  Average Risk Premium (λ) : %.4f\n", fm_results$Mean_Lambda))
##   Average Risk Premium (λ) : 0.0105
cat(sprintf("  Fama-MacBeth Std. Error  : %.4f\n", fm_results$SE_FM))
##   Fama-MacBeth Std. Error  : 0.0049
cat(sprintf("  t-Statistic              : %.4f\n", fm_results$t_stat))
##   t-Statistic              : 2.1330
cat(sprintf("  p-Value (two-tailed)     : %.4f\n", fm_results$p_value))
##   p-Value (two-tailed)     : 0.0371
cat("──────────────────────────────────────────────\n")
## ──────────────────────────────────────────────
if (fm_results$p_value < 0.05) {
  cat("  ✔ Market beta is a significant pricing factor (p < 0.05)\n")
} else {
  cat("  ✘ Market beta is NOT a significant pricing factor (p ≥ 0.05)\n")
}
##   ✔ Market beta is a significant pricing factor (p < 0.05)

8 (Optional) Multi-Factor Extension

If the data includes SMB and HML, extend both passes as follows.

# ── First pass: Fama-French 3-Factor betas ────────────────────────────────
first_pass_ff3 <- df %>%
  group_by(stock_id) %>%
  do(tidy(lm(ret ~ mkt + smb + hml, data = .))) %>%
  ungroup()

betas_ff3 <- first_pass_ff3 %>%
  filter(term %in% c("mkt", "smb", "hml")) %>%
  select(stock_id, term, estimate) %>%
  pivot_wider(names_from = term, values_from = estimate,
              names_prefix = "beta_")

# ── Merge and second pass ─────────────────────────────────────────────────
df_ff3 <- df %>% left_join(betas_ff3, by = "stock_id")

second_pass_ff3 <- df_ff3 %>%
  group_by(date) %>%
  do(tidy(lm(ret ~ beta_mkt + beta_smb + beta_hml, data = .))) %>%
  ungroup()

# ── FM averages for each factor ───────────────────────────────────────────
T <- n_distinct(df_ff3$date)

fm_ff3 <- second_pass_ff3 %>%
  filter(term != "(Intercept)") %>%
  group_by(term) %>%
  summarise(
    Mean_Lambda = mean(estimate),
    SE_FM       = sd(estimate) / sqrt(T),
    t_stat      = mean(estimate) / (sd(estimate) / sqrt(T)),
    p_value     = 2 * pt(-abs(mean(estimate) / (sd(estimate) / sqrt(T))),
                          df = T - 1)
  )

kable(fm_ff3, caption = "FM Results — Fama-French 3-Factor Model",
      digits = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

9 Summary

Step What We Did Output
1 – Time-Series Regressed each stock’s returns on market factor β estimates per stock
2 – Cross-Section For each period, regressed returns on β λ_t per period
3 – FM Average Averaged λ_t; FM SE = sd(λ)/√T Mean λ, t-stat, p-value

The Fama-MacBeth procedure correctly accounts for cross-sectional correlation of residuals, which ordinary pooled OLS ignores. The resulting t-statistic tests whether the risk premium on a factor is statistically different from zero.


Reference: Fama, E. F., & MacBeth, J. D. (1973). Risk, Return, and Equilibrium: Empirical Tests. Journal of Political Economy, 81(3), 607–636.