1 Introduction

The Fama-MacBeth (1973) regression is a two-step procedure widely used in empirical asset pricing to estimate risk premia and test whether risk factors are priced in the cross-section of stock returns.

1.1 Two-Step Procedure

Step 1 — First Pass (Time-Series Regression):
For each asset \(i\), regress its excess returns on the risk factors over the full time series to obtain factor loadings (betas):

\[r_{i,t} - r_{f,t} = \alpha_i + \beta_i^{MKT} \cdot MKT_t + \beta_i^{SMB} \cdot SMB_t + \beta_i^{HML} \cdot HML_t + \varepsilon_{i,t}\]

Step 2 — Second Pass (Cross-Sectional Regression):
At each time period \(t\), regress the cross-sectional asset returns on the betas estimated in Step 1:

\[r_{i,t} - r_{f,t} = \lambda_{0,t} + \lambda_{1,t} \hat{\beta}_i^{MKT} + \lambda_{2,t} \hat{\beta}_i^{SMB} + \lambda_{3,t} \hat{\beta}_i^{HML} + \alpha_{i,t}\]

The final risk premium estimates are the time-series averages of the \(\lambda_t\) coefficients, and t-statistics are computed from their time-series standard errors.


2 Load Required Packages

# Install packages if not already installed
# install.packages(c("tidyverse","tidyfinance","frenchdata","broom","lmtest","sandwich","kableExtra"))

library(tidyverse)    # data manipulation & plotting
library(frenchdata)   # Kenneth French data library
library(broom)        # tidy regression output
library(lmtest)       # hypothesis testing
library(sandwich)     # Newey-West standard errors
library(kableExtra)   # nice tables
library(lubridate)    # date handling
library(ggplot2)      # visualization

3 Data Collection

3.1 Download Fama-French 3-Factor Data

We use monthly Fama-French 3-factor data from Kenneth French’s data library via the frenchdata package.

# Download Fama-French 3-Factor monthly data
ff3 <- download_french_data("Fama/French 3 Factors")

# Extract monthly subset
raw_ff3 <- ff3$subsets$data[[1]]
cat("Raw column names:", paste(colnames(raw_ff3), collapse = ", "), "\n")
## Raw column names: date, Mkt-RF, SMB, HML, RF
# Step 1: Standardise column names
# frenchdata may return "Mkt-RF", "Mkt.RF", or "Mkt_RF" depending on version
colnames(raw_ff3) <- colnames(raw_ff3) |>
  str_trim() |>
  str_replace_all("[^A-Za-z0-9]", "_")   # replace ALL non-alphanumeric with _

cat("Cleaned column names:", paste(colnames(raw_ff3), collapse = ", "), "\n")
## Cleaned column names: date, Mkt_RF, SMB, HML, RF
# Step 2: Find the market excess return column (whichever it is named)
mkt_col <- colnames(raw_ff3)[str_detect(colnames(raw_ff3), regex("mkt", ignore_case = TRUE))][1]
cat("Market column detected as:", mkt_col, "\n")
## Market column detected as: Mkt_RF
# Step 3: Rename & process
factors_ff3 <- raw_ff3 |>
  rename(MKT_RF = all_of(mkt_col)) |>
  mutate(
    date = floor_date(ymd(paste0(date, "01")), "month"),
    across(c(MKT_RF, SMB, HML, RF), ~ as.numeric(.) / 100)   # % → decimal
  ) |>
  select(date, MKT_RF, SMB, HML, RF) |>
  filter(date >= "1990-01-01" & date <= "2023-12-01")

glimpse(factors_ff3)
## Rows: 408
## Columns: 5
## $ date   <date> 1990-01-01, 1990-02-01, 1990-03-01, 1990-04-01, 1990-05-01, 19…
## $ MKT_RF <dbl> -0.0780, 0.0112, 0.0183, -0.0336, 0.0843, -0.0109, -0.0190, -0.…
## $ SMB    <dbl> -0.0114, 0.0097, 0.0147, -0.0047, -0.0256, 0.0135, -0.0308, -0.…
## $ HML    <dbl> 0.0083, 0.0065, -0.0290, -0.0257, -0.0389, -0.0200, -0.0006, 0.…
## $ RF     <dbl> 0.0057, 0.0057, 0.0064, 0.0069, 0.0068, 0.0063, 0.0068, 0.0066,…

3.2 Download Industry Portfolio Returns (Test Assets)

We use 10 Industry Portfolios from French’s library as our test assets.

# Download 10 Industry Portfolio monthly value-weighted returns
ind10 <- download_french_data("10 Industry Portfolios")

raw_ind <- ind10$subsets$data[[1]]
cat("Raw industry column names:", paste(colnames(raw_ind), collapse = ", "), "\n")
## Raw industry column names: date, NoDur, Durbl, Manuf, Enrgy, HiTec, Telcm, Shops, Hlth, Utils, Other
# Convert date and numeric columns
portfolios_raw <- raw_ind |>
  mutate(
    date = floor_date(ymd(paste0(date, "01")), "month"),
    across(-date, ~ suppressWarnings(as.numeric(.)))
  ) |>
  filter(date >= "1990-01-01" & date <= "2023-12-01") |>
  mutate(across(-date, ~ . / 100))  # convert % to decimal

# Pivot to long format
portfolios_long <- portfolios_raw |>
  pivot_longer(-date, names_to = "portfolio", values_to = "ret") |>
  drop_na()

cat("Number of portfolios:", length(unique(portfolios_long$portfolio)), "\n")
## Number of portfolios: 10
cat("Date range:", as.character(min(portfolios_long$date)), "to",
    as.character(max(portfolios_long$date)), "\n")
## Date range: 1990-01-01 to 2023-12-01
cat("Total observations:", nrow(portfolios_long), "\n")
## Total observations: 4080

3.3 Merge Portfolios with Factors

# Compute excess returns
data_merged <- portfolios_long |>
  left_join(factors_ff3, by = "date") |>
  mutate(ret_excess = ret - RF) |>
  drop_na()

head(data_merged, 10) |>
  kable(caption = "Merged Data: Industry Portfolio Excess Returns & FF3 Factors",
        digits = 4) |>
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE)
Merged Data: Industry Portfolio Excess Returns & FF3 Factors
date portfolio ret MKT_RF SMB HML RF ret_excess
1990-01-01 NoDur -0.0947 -0.078 -0.0114 0.0083 0.0057 -0.1004
1990-01-01 Durbl -0.0352 -0.078 -0.0114 0.0083 0.0057 -0.0409
1990-01-01 Manuf -0.0631 -0.078 -0.0114 0.0083 0.0057 -0.0688
1990-01-01 Enrgy -0.0428 -0.078 -0.0114 0.0083 0.0057 -0.0485
1990-01-01 HiTec -0.0147 -0.078 -0.0114 0.0083 0.0057 -0.0204
1990-01-01 Telcm -0.1304 -0.078 -0.0114 0.0083 0.0057 -0.1361
1990-01-01 Shops -0.0641 -0.078 -0.0114 0.0083 0.0057 -0.0698
1990-01-01 Hlth -0.0730 -0.078 -0.0114 0.0083 0.0057 -0.0787
1990-01-01 Utils -0.0535 -0.078 -0.0114 0.0083 0.0057 -0.0592
1990-01-01 Other -0.0892 -0.078 -0.0114 0.0083 0.0057 -0.0949

4 Step 1: First Pass — Time-Series Regressions

For each portfolio, we run a time-series regression of excess returns on the three Fama-French factors to obtain factor loadings (betas).

\[r_{i,t}^e = \alpha_i + \beta_i^{MKT} \cdot MKT_t + \beta_i^{SMB} \cdot SMB_t + \beta_i^{HML} \cdot HML_t + \varepsilon_{i,t}\]

# Run time-series regression for each portfolio
first_pass <- data_merged |>
  group_by(portfolio) |>
  group_modify(~ {
    model <- lm(ret_excess ~ MKT_RF + SMB + HML, data = .x)
    tidy(model) |>
      select(term, estimate) |>
      pivot_wider(names_from = term, values_from = estimate) |>
      rename(
        alpha    = `(Intercept)`,
        beta_MKT = MKT_RF,
        beta_SMB = SMB,
        beta_HML = HML
      )
  }) |>
  ungroup()

# Display estimated betas
first_pass |>
  kable(caption = "Step 1: Estimated Factor Loadings (Betas) for Each Portfolio",
        digits  = 4,
        col.names = c("Portfolio","Alpha","β MKT","β SMB","β HML")) |>
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE) |>
  column_spec(1, bold = TRUE)
Step 1: Estimated Factor Loadings (Betas) for Each Portfolio
Portfolio Alpha β MKT β SMB β HML
Durbl -0.0018 1.3718 0.2712 0.3391
Enrgy -0.0001 0.8976 0.0802 0.6998
HiTec 0.0024 1.2408 0.1747 -0.5922
Hlth 0.0030 0.7299 -0.1738 -0.1491
Manuf 0.0002 1.0275 0.0052 0.2695
NoDur 0.0017 0.6918 -0.2303 0.1728
Other -0.0018 1.1223 -0.0733 0.4833
Shops 0.0018 0.9086 -0.0320 -0.0233
Telcm -0.0024 0.9500 -0.1747 0.0266
Utils 0.0019 0.4954 -0.2296 0.2653

4.1 Visualise Factor Loadings

first_pass_long <- first_pass |>
  pivot_longer(cols = c(beta_MKT, beta_SMB, beta_HML),
               names_to  = "factor",
               values_to = "beta") |>
  mutate(factor = recode(factor,
                         "beta_MKT" = "Market (MKT)",
                         "beta_SMB" = "Size (SMB)",
                         "beta_HML" = "Value (HML)"))

ggplot(first_pass_long, aes(x = reorder(portfolio, beta), y = beta, fill = factor)) +
  geom_col(show.legend = FALSE, alpha = 0.85) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey40") +
  facet_wrap(~ factor, scales = "free_y", ncol = 1) +
  coord_flip() +
  labs(
    title    = "Step 1: Factor Loadings (Betas) by Industry Portfolio",
    subtitle = "Fama-French 3-Factor Model | 1990–2023",
    x = NULL, y = "Beta"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face = "bold", size = 13),
    plot.subtitle = element_text(color = "grey50"),
    strip.text    = element_text(face = "bold")
  ) +
  scale_fill_manual(values = c("#2196F3","#4CAF50","#FF5722"))


5 Step 2: Second Pass — Cross-Sectional Regressions

At each time period \(t\), we regress the cross-sectional excess returns of all portfolios on their estimated betas from Step 1. This yields a time series of \(\lambda_t\) estimates.

# Merge betas back to the panel data
data_with_betas <- data_merged |>
  left_join(first_pass |> select(portfolio, beta_MKT, beta_SMB, beta_HML),
            by = "portfolio")

# Run cross-sectional regression at each time period t
second_pass <- data_with_betas |>
  group_by(date) |>
  group_modify(~ {
    model <- lm(ret_excess ~ beta_MKT + beta_SMB + beta_HML, data = .x)
    tidy(model) |>
      select(term, estimate) |>
      pivot_wider(names_from = term, values_from = estimate) |>
      rename(
        lambda_0   = `(Intercept)`,
        lambda_MKT = beta_MKT,
        lambda_SMB = beta_SMB,
        lambda_HML = beta_HML
      )
  }) |>
  ungroup()

glimpse(second_pass)
## Rows: 408
## Columns: 5
## $ date       <date> 1990-01-01, 1990-02-01, 1990-03-01, 1990-04-01, 1990-05-01…
## $ lambda_0   <dbl> 0.074833747, 0.001204074, -0.004587882, -0.038933030, 0.047…
## $ lambda_MKT <dbl> -0.1391599865, 0.0125276978, 0.0326966018, 0.0081702798, 0.…
## $ lambda_SMB <dbl> 0.3178884561, 0.1035774493, -0.0374856633, -0.0347042354, -…
## $ lambda_HML <dbl> -0.021073356, 0.011832840, -0.044352527, -0.023548189, -0.0…

6 Fama-MacBeth Risk Premia Estimates

The final risk premia are the time-series means of the \(\lambda_t\) estimates. T-statistics are computed as:

\[t(\hat{\lambda}) = \frac{\bar{\lambda}}{se(\hat{\lambda})} = \frac{\bar{\lambda}}{sd(\lambda_t) / \sqrt{T}}\]

T <- nrow(second_pass)   # number of time periods

fm_results <- second_pass |>
  summarise(
    across(
      c(lambda_0, lambda_MKT, lambda_SMB, lambda_HML),
      list(
        mean   = ~ mean(., na.rm = TRUE),
        sd     = ~ sd(., na.rm = TRUE),
        se     = ~ sd(., na.rm = TRUE) / sqrt(T),
        t_stat = ~ mean(., na.rm = TRUE) / (sd(., na.rm = TRUE) / sqrt(T))
      ),
      .names = "{.col}__{.fn}"
    )
  ) |>
  pivot_longer(everything(),
               names_to  = c("lambda", "stat"),
               names_sep = "__") |>
  pivot_wider(names_from = stat, values_from = value) |>
  mutate(
    p_value   = 2 * pt(-abs(t_stat), df = T - 1),
    sig       = case_when(
      p_value < 0.01 ~ "***",
      p_value < 0.05 ~ "**",
      p_value < 0.10 ~ "*",
      TRUE           ~ ""
    ),
    lambda = recode(lambda,
                    "lambda_0"   = "Intercept (λ₀)",
                    "lambda_MKT" = "Market Premium (λ_MKT)",
                    "lambda_SMB" = "Size Premium (λ_SMB)",
                    "lambda_HML" = "Value Premium (λ_HML)")
  )

fm_results |>
  select(lambda, mean, se, t_stat, p_value, sig) |>
  kable(
    caption   = "Fama-MacBeth Risk Premia Estimates (1990–2023)",
    col.names = c("Factor", "Mean λ", "Std. Error", "t-Statistic", "p-Value", "Sig."),
    digits    = 4
  ) |>
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE) |>
  column_spec(1, bold = TRUE) |>
  column_spec(6, bold = TRUE, color = "darkgreen") |>
  footnote(general = "Significance: *** p<0.01, ** p<0.05, * p<0.10",
           general_title = "")
Fama-MacBeth Risk Premia Estimates (1990–2023)
Factor Mean λ Std. Error t-Statistic p-Value Sig.
Intercept (λ₀) 0.0105 0.0042 2.4901 0.0132 **
Market Premium (λ_MKT) -0.0026 0.0048 -0.5510 0.5820
Size Premium (λ_SMB) 0.0106 0.0067 1.5923 0.1121
Value Premium (λ_HML) -0.0020 0.0022 -0.9017 0.3678
Significance: *** p<0.01, ** p<0.05, * p<0.10

7 Visualisation of Risk Premia

7.1 Time Series of Lambda Estimates

second_pass_long <- second_pass |>
  pivot_longer(-date,
               names_to  = "lambda",
               values_to = "estimate") |>
  mutate(lambda = recode(lambda,
                         "lambda_0"   = "Intercept",
                         "lambda_MKT" = "Market (MKT)",
                         "lambda_SMB" = "Size (SMB)",
                         "lambda_HML" = "Value (HML)"))

ggplot(second_pass_long, aes(x = date, y = estimate)) +
  geom_line(color = "#2196F3", linewidth = 0.6, alpha = 0.8) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red", linewidth = 0.6) +
  geom_smooth(method = "loess", se = TRUE, color = "#FF5722",
              fill = "#FF5722", alpha = 0.15, linewidth = 0.8) +
  facet_wrap(~ lambda, scales = "free_y", ncol = 2) +
  labs(
    title    = "Time Series of Cross-Sectional Lambda (λ) Estimates",
    subtitle = "Step 2: Fama-MacBeth Second Pass | 1990–2023",
    x = "Date", y = "λ Estimate"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face = "bold", size = 13),
    plot.subtitle = element_text(color = "grey50"),
    strip.text    = element_text(face = "bold"),
    axis.text.x   = element_text(angle = 30, hjust = 1)
  )

7.2 Risk Premia Bar Chart with Confidence Intervals

fm_plot <- fm_results |>
  filter(lambda != "Intercept (λ₀)") |>
  mutate(
    ci_low  = mean - 1.96 * se,
    ci_high = mean + 1.96 * se,
    color   = ifelse(mean > 0, "Positive", "Negative")
  )

ggplot(fm_plot, aes(x = lambda, y = mean, fill = color)) +
  geom_col(alpha = 0.85, width = 0.5) +
  geom_errorbar(aes(ymin = ci_low, ymax = ci_high),
                width = 0.15, linewidth = 0.8) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey40") +
  scale_fill_manual(values = c("Positive" = "#4CAF50", "Negative" = "#F44336"),
                    guide = "none") +
  labs(
    title    = "Fama-MacBeth Risk Premia Estimates",
    subtitle = "Error bars represent 95% confidence intervals",
    x = NULL, y = "Risk Premium (λ)"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title    = element_text(face = "bold", size = 13),
    plot.subtitle = element_text(color = "grey50"),
    axis.text.x   = element_text(face = "bold")
  )


8 Newey-West Adjusted Standard Errors

Standard Fama-MacBeth standard errors correct only for cross-sectional correlation. We also compute Newey-West standard errors to correct for time-series autocorrelation in the lambda estimates.

nw_results <- second_pass |>
  pivot_longer(-date, names_to = "lambda", values_to = "estimate") |>
  group_by(lambda) |>
  group_modify(~ {
    # Fit intercept-only model to get NW SE for mean
    mod  <- lm(estimate ~ 1, data = .x)
    nw   <- coeftest(mod, vcov = NeweyWest(mod, lag = 6, prewhite = FALSE))
    tibble(
      mean_lambda = coef(mod)[1],
      nw_se       = nw[1, 2],
      nw_tstat    = nw[1, 3],
      nw_pvalue   = nw[1, 4]
    )
  }) |>
  ungroup() |>
  mutate(
    sig = case_when(
      nw_pvalue < 0.01 ~ "***",
      nw_pvalue < 0.05 ~ "**",
      nw_pvalue < 0.10 ~ "*",
      TRUE             ~ ""
    ),
    lambda = recode(lambda,
                    "lambda_0"   = "Intercept (λ₀)",
                    "lambda_MKT" = "Market Premium (λ_MKT)",
                    "lambda_SMB" = "Size Premium (λ_SMB)",
                    "lambda_HML" = "Value Premium (λ_HML)")
  )

nw_results |>
  select(lambda, mean_lambda, nw_se, nw_tstat, nw_pvalue, sig) |>
  kable(
    caption   = "Fama-MacBeth Results with Newey-West Standard Errors (6 lags)",
    col.names = c("Factor", "Mean λ", "NW Std. Error", "NW t-Stat", "p-Value", "Sig."),
    digits    = 4
  ) |>
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE) |>
  column_spec(1, bold = TRUE) |>
  column_spec(6, bold = TRUE, color = "darkgreen") |>
  footnote(general = "Newey-West SE with 6 lags. Significance: *** p<0.01, ** p<0.05, * p<0.10",
           general_title = "")
Fama-MacBeth Results with Newey-West Standard Errors (6 lags)
Factor Mean λ NW Std. Error NW t-Stat p-Value Sig.
Intercept (λ₀) 0.0105 0.0041 2.5904 0.0099 ***
Value Premium (λ_HML) -0.0020 0.0025 -0.8178 0.4140
Market Premium (λ_MKT) -0.0026 0.0050 -0.5252 0.5997
Size Premium (λ_SMB) 0.0106 0.0063 1.6794 0.0938
Newey-West SE with 6 lags. Significance: *** p<0.01, ** p<0.05, * p<0.10

9 Distribution of Lambda Estimates

ggplot(second_pass_long |> filter(lambda != "Intercept"),
       aes(x = estimate, fill = lambda)) +
  geom_histogram(bins = 40, alpha = 0.75, color = "white") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 0.8) +
  facet_wrap(~ lambda, scales = "free", ncol = 3) +
  scale_fill_manual(values = c("#2196F3","#4CAF50","#FF5722"), guide = "none") +
  labs(
    title    = "Distribution of Monthly Cross-Sectional Lambda (λ) Estimates",
    subtitle = "Fama-MacBeth Second Pass | 1990–2023",
    x = "λ Estimate", y = "Frequency"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face = "bold", size = 13),
    plot.subtitle = element_text(color = "grey50"),
    strip.text    = element_text(face = "bold")
  )


10 Summary & Interpretation

summary_df <- tibble(
  Factor = c("Market (MKT)", "Size (SMB)", "Value (HML)"),
  Interpretation = c(
    "Market risk premium — compensation for systematic market risk exposure",
    "Size premium — smaller firms tend to earn higher returns",
    "Value premium — high book-to-market firms tend to earn higher returns"
  ),
  Expected_Sign = c("Positive", "Positive", "Positive")
)

summary_df |>
  kable(caption = "Interpretation of Fama-MacBeth Risk Premia") |>
  kable_styling(bootstrap_options = c("striped","hover"), full_width = TRUE) |>
  column_spec(1, bold = TRUE, width = "10em") |>
  column_spec(2, width = "30em") |>
  column_spec(3, width = "8em", color = "darkgreen", bold = TRUE)
Interpretation of Fama-MacBeth Risk Premia
Factor Interpretation Expected_Sign
Market (MKT) Market risk premium — compensation for systematic market risk exposure Positive
Size (SMB) Size premium — smaller firms tend to earn higher returns Positive
Value (HML) Value premium — high book-to-market firms tend to earn higher returns Positive