1 Introduction

The Fama-MacBeth (1973) regression is a two-step procedure widely used in empirical asset pricing to estimate risk premia and test whether systematic risk factors are priced in the cross-section of stock returns.

1.1 Two-Step Procedure

Step 1 — First Pass (Time-Series Regression):
For each stock \(i\), regress its daily returns on the three Fama-French factors over the full sample to obtain factor loadings (betas):

\[r_{i,t} = \alpha_i + \beta_i^{MKT} \cdot MKT_t + \beta_i^{SMB} \cdot SMB_t + \beta_i^{HML} \cdot HML_t + \varepsilon_{i,t}\]

Step 2 — Second Pass (Cross-Sectional Regression):
At each time period \(t\), regress the cross-sectional stock returns on the betas from Step 1:

\[r_{i,t} = \lambda_{0,t} + \lambda_{1,t}\hat{\beta}_i^{MKT} + \lambda_{2,t}\hat{\beta}_i^{SMB} + \lambda_{3,t}\hat{\beta}_i^{HML} + \alpha_{i,t}\]

The final risk premium estimates are the time-series averages of \(\lambda_t\), with t-statistics computed from their time-series standard errors.


2 Load Required Packages

# install.packages(c("tidyverse","broom","lmtest","sandwich","kableExtra"))

library(tidyverse)   # data wrangling & plotting
library(lubridate)   # date parsing
library(broom)       # tidy regression output
library(lmtest)      # coeftest()
library(sandwich)    # NeweyWest()
library(kableExtra)  # formatted tables

3 Load & Inspect Data

The dataset contains daily stock returns for 6 stocks (AAPL, FORD, GE, GM, IBM, MSFT) along with the Fama-French 3 factors (MKT, SMB, HML), covering the period 2011–2015.

df <- read_csv("data.csv", show_col_types = FALSE) |>
  mutate(
    date = dmy(date),
    across(c(ri, MKT, SMB, HML), as.numeric)
  ) |>
  arrange(symbol, date)

glimpse(df)
## Rows: 7,542
## Columns: 6
## $ symbol <chr> "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL",…
## $ date   <date> 2011-01-04, 2011-01-05, 2011-01-06, 2011-01-07, 2011-01-10, 20…
## $ ri     <dbl> 0.0052062641, 0.0081462879, -0.0008082435, 0.0071360567, 0.0186…
## $ MKT    <dbl> -0.0013138901, 0.0049946699, -0.0021252276, -0.0018465050, -0.0…
## $ SMB    <dbl> -0.0065, 0.0018, 0.0001, 0.0022, 0.0041, 0.0016, 0.0031, -0.002…
## $ HML    <dbl> 0.0008, 0.0013, -0.0025, -0.0006, 0.0039, 0.0036, 0.0000, -0.00…
df |>
  group_by(symbol) |>
  summarise(
    Start       = min(date),
    End         = max(date),
    N           = n(),
    Mean_Ret    = round(mean(ri, na.rm = TRUE), 5),
    SD_Ret      = round(sd(ri,   na.rm = TRUE), 5)
  ) |>
  kable(caption = "Dataset Summary by Stock",
        col.names = c("Stock","Start","End","# Days",
                      "Mean Return","SD Return")) |>
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE) |>
  column_spec(1, bold = TRUE)
Dataset Summary by Stock
Stock Start End # Days Mean Return SD Return
AAPL 2011-01-04 2015-12-31 1257 0.00070 0.01680
FORD 2011-01-04 2015-12-31 1257 -0.00058 0.05549
GE 2011-01-04 2015-12-31 1257 0.00056 0.01345
GM 2011-01-04 2015-12-31 1257 -0.00001 0.01895
IBM 2011-01-04 2015-12-31 1257 -0.00006 0.01221
MSFT 2011-01-04 2015-12-31 1257 0.00065 0.01479

4 Exploratory: Daily Return Series

ggplot(df, aes(x = date, y = ri, colour = symbol)) +
  geom_line(linewidth = 0.4, alpha = 0.8) +
  facet_wrap(~ symbol, scales = "free_y", ncol = 2) +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
  scale_colour_brewer(palette = "Set2", guide = "none") +
  labs(
    title    = "Daily Stock Returns  |  2011–2015",
    subtitle = "AAPL · FORD · GE · GM · IBM · MSFT",
    x = NULL, y = "Daily Return"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face = "bold", size = 13),
    plot.subtitle = element_text(colour = "grey50"),
    strip.text    = element_text(face = "bold")
  )


5 Step 1: First Pass — Time-Series Regressions

For each stock, regress daily returns on MKT, SMB, and HML over the full sample to obtain each stock’s factor exposures (betas).

first_pass <- df |>
  group_by(symbol) |>
  group_modify(~ {
    mod <- lm(ri ~ MKT + SMB + HML, data = .x)
    tidy(mod) |>
      select(term, estimate) |>
      pivot_wider(names_from = term, values_from = estimate) |>
      rename(
        alpha    = `(Intercept)`,
        beta_MKT = MKT,
        beta_SMB = SMB,
        beta_HML = HML
      )
  }) |>
  ungroup()

first_pass |>
  kable(
    caption   = "Step 1: Estimated Factor Loadings (Betas) per Stock",
    digits    = 4,
    col.names = c("Stock", "α (Alpha)", "β MKT", "β SMB", "β HML")
  ) |>
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE) |>
  column_spec(1, bold = TRUE)
Step 1: Estimated Factor Loadings (Betas) per Stock
Stock α (Alpha) β MKT β SMB β HML
AAPL 4e-04 0.9000 0.0685 -0.0578
FORD -8e-04 0.5129 -0.2644 0.1380
GE 1e-04 1.0779 0.0994 0.0902
GM -5e-04 1.2854 0.0039 -0.0222
IBM -4e-04 0.8169 0.0336 -0.0121
MSFT 3e-04 0.9656 0.0582 -0.0641

5.1 Visualise Factor Loadings

first_pass |>
  pivot_longer(c(beta_MKT, beta_SMB, beta_HML),
               names_to  = "factor",
               values_to = "beta") |>
  mutate(factor = recode(factor,
    "beta_MKT" = "Market (MKT)",
    "beta_SMB" = "Size (SMB)",
    "beta_HML" = "Value (HML)"
  )) |>
  ggplot(aes(x = reorder(symbol, beta), y = beta,
             fill = beta > 0)) +
  geom_col(alpha = 0.85, show.legend = FALSE) +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey30") +
  geom_text(aes(label = round(beta, 3),
                vjust = ifelse(beta >= 0, -0.4, 1.3)),
            size = 3.2, fontface = "bold") +
  facet_wrap(~ factor, ncol = 3) +
  scale_fill_manual(values = c("TRUE" = "#4CAF50", "FALSE" = "#F44336")) +
  labs(
    title    = "Step 1: Factor Loadings (β) by Stock",
    subtitle = "Fama-French 3-Factor Model — Full Sample",
    x = NULL, y = "Beta"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face = "bold", size = 13),
    plot.subtitle = element_text(colour = "grey50"),
    strip.text    = element_text(face = "bold")
  )


6 Step 2: Second Pass — Cross-Sectional Regressions

At each trading day \(t\), run a cross-sectional OLS of stock returns on the betas from Step 1, yielding a daily time series of \(\lambda_t\) estimates.

# Merge betas into daily panel
data_with_betas <- df |>
  left_join(
    first_pass |> select(symbol, beta_MKT, beta_SMB, beta_HML),
    by = "symbol"
  )

# Cross-sectional regression at each date t
second_pass <- data_with_betas |>
  group_by(date) |>
  group_modify(~ {
    if (nrow(.x) < 4) return(tibble())
    mod <- lm(ri ~ beta_MKT + beta_SMB + beta_HML, data = .x)
    tidy(mod) |>
      select(term, estimate) |>
      pivot_wider(names_from = term, values_from = estimate) |>
      rename(
        lambda_0   = `(Intercept)`,
        lambda_MKT = beta_MKT,
        lambda_SMB = beta_SMB,
        lambda_HML = beta_HML
      )
  }) |>
  ungroup() |>
  drop_na()

cat("Cross-sectional regressions run:", nrow(second_pass), "\n")
## Cross-sectional regressions run: 1257
head(second_pass, 8) |>
  kable(caption = "Sample of Daily λ Estimates (Second Pass)",
        digits = 5) |>
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE)
Sample of Daily λ Estimates (Second Pass)
date lambda_0 lambda_MKT lambda_SMB lambda_HML
2011-01-04 -0.02976 0.04163 -0.02552 0.05737
2011-01-05 0.02233 -0.01135 -0.15805 0.06285
2011-01-06 -0.02924 0.03730 0.00703 -0.17323
2011-01-07 -0.01750 0.01272 0.03227 -0.06423
2011-01-10 0.03641 -0.03663 0.01712 0.05865
2011-01-11 0.00274 0.00409 -0.09536 0.08986
2011-01-12 0.07233 -0.05536 -0.16450 0.04304
2011-01-13 0.01503 -0.01936 0.00181 0.02563

7 Fama-MacBeth Risk Premia Estimates

The final estimates are time-series means of the \(\lambda_t\) series, with standard errors computed from their time-series variation:

\[t(\hat{\lambda}) = \frac{\bar{\lambda}}{SD(\lambda_t)\,/\,\sqrt{T}}\]

T_obs <- nrow(second_pass)

fm_results <- second_pass |>
  summarise(across(
    c(lambda_0, lambda_MKT, lambda_SMB, lambda_HML),
    list(
      Mean   = ~ mean(., na.rm = TRUE),
      SD     = ~ sd(.,   na.rm = TRUE),
      SE     = ~ sd(.,   na.rm = TRUE) / sqrt(T_obs),
      t_stat = ~ mean(., na.rm = TRUE) /
                 (sd(., na.rm = TRUE) / sqrt(T_obs))
    ),
    .names = "{.col}__{.fn}"
  )) |>
  pivot_longer(everything(),
               names_to  = c("lambda","stat"),
               names_sep = "__") |>
  pivot_wider(names_from = stat, values_from = value) |>
  mutate(
    p_value = 2 * pt(-abs(t_stat), df = T_obs - 1),
    sig     = case_when(
      p_value < 0.01 ~ "***",
      p_value < 0.05 ~ "**",
      p_value < 0.10 ~ "*",
      TRUE           ~ ""
    ),
    lambda  = recode(lambda,
      "lambda_0"   = "Intercept (λ₀)",
      "lambda_MKT" = "Market Premium (λ_MKT)",
      "lambda_SMB" = "Size Premium (λ_SMB)",
      "lambda_HML" = "Value Premium (λ_HML)"
    )
  )

fm_results |>
  select(lambda, Mean, SE, t_stat, p_value, sig) |>
  kable(
    caption   = "Fama-MacBeth Risk Premia Estimates",
    col.names = c("Factor","Mean λ","Std. Error",
                  "t-Statistic","p-Value","Sig."),
    digits    = 5
  ) |>
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE) |>
  column_spec(1, bold = TRUE) |>
  column_spec(6, bold = TRUE, color = "darkgreen") |>
  footnote(general = "*** p<0.01  ** p<0.05  * p<0.10",
           general_title = "")
Fama-MacBeth Risk Premia Estimates
Factor Mean λ Std. Error t-Statistic p-Value Sig.
Intercept (λ₀) 0.00060 0.00109 0.54901 0.58309
Market Premium (λ_MKT) -0.00041 0.00109 -0.37879 0.70491
Size Premium (λ_SMB) 0.00368 0.00377 0.97712 0.32870
Value Premium (λ_HML) -0.00047 0.00259 -0.18044 0.85684
*** p<0.01 ** p<0.05 * p<0.10

8 Visualisation of Results

8.1 Time Series of λ Estimates

second_pass |>
  pivot_longer(-date, names_to = "lambda", values_to = "estimate") |>
  mutate(lambda = recode(lambda,
    "lambda_0"   = "Intercept (λ₀)",
    "lambda_MKT" = "Market (λ_MKT)",
    "lambda_SMB" = "Size (λ_SMB)",
    "lambda_HML" = "Value (λ_HML)"
  )) |>
  ggplot(aes(x = date, y = estimate)) +
  geom_line(colour = "#2196F3", linewidth = 0.4, alpha = 0.75) +
  geom_hline(yintercept = 0, linetype = "dashed",
             colour = "red", linewidth = 0.6) +
  geom_smooth(method = "loess", se = TRUE,
              colour = "#FF5722", fill = "#FF5722",
              alpha = 0.12, linewidth = 0.8) +
  facet_wrap(~ lambda, scales = "free_y", ncol = 2) +
  labs(
    title    = "Time Series of Daily Cross-Sectional λ Estimates",
    subtitle = "Blue = daily λ  |  Orange = LOESS trend  |  Dashed = zero",
    x = NULL, y = "λ Estimate"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face = "bold", size = 13),
    plot.subtitle = element_text(colour = "grey50"),
    strip.text    = element_text(face = "bold"),
    axis.text.x   = element_text(angle = 30, hjust = 1)
  )

8.2 Risk Premia with 95% Confidence Intervals

fm_results |>
  filter(lambda != "Intercept (λ₀)") |>
  mutate(
    ci_lo = Mean - 1.96 * SE,
    ci_hi = Mean + 1.96 * SE,
    col   = ifelse(Mean > 0, "Positive", "Negative")
  ) |>
  ggplot(aes(x = lambda, y = Mean, fill = col)) +
  geom_col(alpha = 0.85, width = 0.5) +
  geom_errorbar(aes(ymin = ci_lo, ymax = ci_hi),
                width = 0.12, linewidth = 0.9) +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
  geom_text(aes(label = paste0(round(Mean * 100, 3), "%"),
                vjust = ifelse(Mean >= 0, -0.7, 1.6)),
            size = 3.5, fontface = "bold") +
  scale_fill_manual(
    values = c("Positive" = "#4CAF50", "Negative" = "#F44336"),
    guide  = "none"
  ) +
  labs(
    title    = "Fama-MacBeth Risk Premia",
    subtitle = "Error bars = 95% confidence intervals",
    x = NULL, y = "Risk Premium (λ)"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title    = element_text(face = "bold", size = 13),
    plot.subtitle = element_text(colour = "grey50"),
    axis.text.x   = element_text(face = "bold")
  )

8.3 Distribution of λ Estimates

second_pass |>
  pivot_longer(-date, names_to = "lambda", values_to = "estimate") |>
  filter(lambda != "lambda_0") |>
  mutate(lambda = recode(lambda,
    "lambda_MKT" = "Market (λ_MKT)",
    "lambda_SMB" = "Size (λ_SMB)",
    "lambda_HML" = "Value (λ_HML)"
  )) |>
  ggplot(aes(x = estimate, fill = lambda)) +
  geom_histogram(bins = 50, colour = "white", alpha = 0.8) +
  geom_vline(xintercept = 0, linetype = "dashed",
             colour = "black", linewidth = 0.8) +
  facet_wrap(~ lambda, scales = "free", ncol = 3) +
  scale_fill_manual(values = c("#2196F3","#4CAF50","#FF5722"),
                    guide = "none") +
  labs(
    title    = "Distribution of Daily λ Estimates",
    subtitle = "Dashed line = zero",
    x = "λ Estimate", y = "Frequency"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title  = element_text(face = "bold", size = 13),
    strip.text  = element_text(face = "bold")
  )


9 Newey-West Adjusted Standard Errors

Standard FM SEs correct only for cross-sectional correlation. Newey-West (1987) SEs additionally correct for time-series autocorrelation in the \(\lambda_t\) series (6 lags).

nw_results <- second_pass |>
  pivot_longer(-date, names_to = "lambda", values_to = "estimate") |>
  group_by(lambda) |>
  group_modify(~ {
    mod <- lm(estimate ~ 1, data = .x)
    nw  <- coeftest(mod, vcov = NeweyWest(mod, lag = 6, prewhite = FALSE))
    tibble(
      Mean_lambda = coef(mod)[1],
      NW_SE       = nw[1, 2],
      NW_tstat    = nw[1, 3],
      NW_pvalue   = nw[1, 4]
    )
  }) |>
  ungroup() |>
  mutate(
    sig    = case_when(
      NW_pvalue < 0.01 ~ "***",
      NW_pvalue < 0.05 ~ "**",
      NW_pvalue < 0.10 ~ "*",
      TRUE             ~ ""
    ),
    lambda = recode(lambda,
      "lambda_0"   = "Intercept (λ₀)",
      "lambda_MKT" = "Market Premium (λ_MKT)",
      "lambda_SMB" = "Size Premium (λ_SMB)",
      "lambda_HML" = "Value Premium (λ_HML)"
    )
  )

nw_results |>
  select(lambda, Mean_lambda, NW_SE, NW_tstat, NW_pvalue, sig) |>
  kable(
    caption   = "Fama-MacBeth with Newey-West Standard Errors (6 lags)",
    col.names = c("Factor","Mean λ","NW Std. Error",
                  "NW t-Stat","p-Value","Sig."),
    digits    = 5
  ) |>
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE) |>
  column_spec(1, bold = TRUE) |>
  column_spec(6, bold = TRUE, color = "darkgreen") |>
  footnote(general = "Newey-West SE with 6 lags. *** p<0.01  ** p<0.05  * p<0.10",
           general_title = "")
Fama-MacBeth with Newey-West Standard Errors (6 lags)
Factor Mean λ NW Std. Error NW t-Stat p-Value Sig.
Intercept (λ₀) 0.00060 0.00095 0.63171 0.52769
Value Premium (λ_HML) -0.00047 0.00220 -0.21198 0.83216
Market Premium (λ_MKT) -0.00041 0.00101 -0.40985 0.68198
Size Premium (λ_SMB) 0.00368 0.00295 1.24658 0.21278
Newey-West SE with 6 lags. *** p<0.01 ** p<0.05 * p<0.10

10 Comparison: Standard FM vs Newey-West SE

fm_std <- fm_results |>
  filter(lambda != "Intercept (λ₀)") |>
  select(lambda, Mean, SE) |>
  mutate(Method = "Standard FM SE")

fm_nw <- nw_results |>
  filter(lambda != "Intercept (λ₀)") |>
  select(lambda, Mean = Mean_lambda, SE = NW_SE) |>
  mutate(Method = "Newey-West SE")

bind_rows(fm_std, fm_nw) |>
  mutate(ci_lo = Mean - 1.96 * SE,
         ci_hi = Mean + 1.96 * SE) |>
  ggplot(aes(x = lambda, y = Mean, colour = Method, group = Method)) +
  geom_point(position = position_dodge(0.4), size = 3) +
  geom_errorbar(aes(ymin = ci_lo, ymax = ci_hi),
                width = 0.2, linewidth = 0.8,
                position = position_dodge(0.4)) +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
  scale_colour_manual(
    values = c("Standard FM SE" = "#2196F3", "Newey-West SE" = "#FF5722")
  ) +
  labs(
    title    = "Risk Premia: Standard FM vs Newey-West Standard Errors",
    subtitle = "Points = mean λ  |  Bars = 95% CI",
    x = NULL, y = "Risk Premium (λ)", colour = NULL
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title      = element_text(face = "bold", size = 13),
    plot.subtitle   = element_text(colour = "grey50"),
    legend.position = "top",
    axis.text.x     = element_text(face = "bold")
  )


11 Summary & Interpretation

tibble(
  Factor = c("Market (MKT)", "Size (SMB)", "Value (HML)"),
  Theory = c(
    "Higher market β → higher expected return (CAPM)",
    "Smaller firms (high SMB β) earn a size premium",
    "Value firms (high HML β) earn a value premium"
  ),
  `Expected Sign` = c("Positive (+)", "Positive (+)", "Positive (+)")
) |>
  kable(caption = "Theoretical Predictions for Fama-MacBeth Risk Premia") |>
  kable_styling(bootstrap_options = c("striped","hover"), full_width = TRUE) |>
  column_spec(1, bold = TRUE, width = "10em") |>
  column_spec(3, bold = TRUE, color = "steelblue", width = "10em")
Theoretical Predictions for Fama-MacBeth Risk Premia
Factor Theory Expected Sign
Market (MKT) Higher market β → higher expected return (CAPM) Positive (+)
Size (SMB) Smaller firms (high SMB β) earn a size premium Positive (+)
Value (HML) Value firms (high HML β) earn a value premium Positive (+)