# -------------------------------------------------------
# Load required libraries
# broom     : tidy() converts model output into data frames
# tidyverse : data wrangling and piping with dplyr + purrr
# knitr     : kable() for formatted tables in the report
# -------------------------------------------------------
library(broom)
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)
# -------------------------------------------------------
# Load the dataset
# The panel contains daily returns for 6 U.S. stocks
# along with daily Fama-French three-factor values
# -------------------------------------------------------
data <- read.csv("/Users/faizhaikal/Downloads/data.csv")

# Quick preview of the data structure
glimpse(data)
## Rows: 7,542
## Columns: 6
## $ symbol <chr> "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL",…
## $ date   <chr> "4-Jan-11", "5-Jan-11", "6-Jan-11", "7-Jan-11", "10-Jan-11", "1…
## $ ri     <dbl> 0.0052062641, 0.0081462879, -0.0008082435, 0.0071360567, 0.0186…
## $ MKT    <dbl> -0.0013138901, 0.0049946699, -0.0021252276, -0.0018465050, -0.0…
## $ SMB    <dbl> -0.0065, 0.0018, 0.0001, 0.0022, 0.0041, 0.0016, 0.0031, -0.002…
## $ HML    <dbl> 0.0008, 0.0013, -0.0025, -0.0006, 0.0039, 0.0036, 0.0000, -0.00…
# -------------------------------------------------------
# Summary statistics for each variable
# -------------------------------------------------------
data %>%
  select(ri, MKT, SMB, HML) %>%
  summary()
##        ri                  MKT                  SMB            
##  Min.   :-0.3908663   Min.   :-0.0689583   Min.   :-1.660e-02  
##  1st Qu.:-0.0087263   1st Qu.:-0.0040125   1st Qu.:-3.100e-03  
##  Median : 0.0000000   Median : 0.0005438   Median : 1.000e-04  
##  Mean   : 0.0002109   Mean   : 0.0003774   Mean   : 2.227e-06  
##  3rd Qu.: 0.0093507   3rd Qu.: 0.0052641   3rd Qu.: 3.100e-03  
##  Max.   : 0.9614112   Max.   : 0.0463174   Max.   : 2.490e-02  
##       HML          
##  Min.   :-0.01490  
##  1st Qu.:-0.00260  
##  Median : 0.00000  
##  Mean   : 0.00013  
##  3rd Qu.: 0.00260  
##  Max.   : 0.02250
# -------------------------------------------------------
# Display the first few rows in a formatted table
# -------------------------------------------------------
head(data, 10) %>%
  kable(
    caption = "Table 1: First 10 Observations of the Dataset",
    digits  = 6,
    align   = "c"
  )
Table 1: First 10 Observations of the Dataset
symbol date ri MKT SMB HML
AAPL 4-Jan-11 0.005206 -0.001314 -0.0065 0.0008
AAPL 5-Jan-11 0.008146 0.004995 0.0018 0.0013
AAPL 6-Jan-11 -0.000808 -0.002125 0.0001 -0.0025
AAPL 7-Jan-11 0.007136 -0.001847 0.0022 -0.0006
AAPL 10-Jan-11 0.018657 -0.001377 0.0041 0.0039
AAPL 11-Jan-11 -0.002368 0.003718 0.0016 0.0036
AAPL 12-Jan-11 0.008104 0.008967 0.0031 0.0000
AAPL 13-Jan-11 0.003652 -0.001712 -0.0026 -0.0044
AAPL 14-Jan-11 0.008067 0.007357 -0.0010 -0.0073
AAPL 18-Jan-11 -0.022725 0.001375 0.0056 0.0015
# -------------------------------------------------------
# STEP 0: Time-Series Regressions
#
# For each stock (symbol), regress ri on MKT, SMB, HML
# across all time periods to obtain factor betas.
# -------------------------------------------------------

step0_betas <- data %>%
  
  # Group data by stock and nest the time-series data for each stock
  nest(data = c(date, ri, MKT, SMB, HML)) %>%
  
  # For each stock, run OLS: ri = alpha + b_MKT*MKT + b_SMB*SMB + b_HML*HML
  mutate(estimates = map(
    data,
    ~tidy(lm(ri ~ MKT + SMB + HML, data = .x))
  )) %>%
  
  # Unnest to bring model results into the main data frame
  unnest(estimates) %>%
  
  # Keep only the symbol, coefficient name, and coefficient value
  select(symbol, estimate, term) %>%
  
  # Pivot from long to wide: each factor gets its own column
  pivot_wider(
    names_from  = term,
    values_from = estimate
  ) %>%
  
  # Rename for clarity; drop the intercept (not needed in Step 1)
  select(
    symbol,
    b_MKT = MKT,
    b_SMB = SMB,
    b_HML = HML
  )

# Display the estimated betas for each stock
step0_betas %>%
  kable(
    caption = "Table 2: Estimated Factor Betas from Time-Series Regressions (Step 0)",
    digits  = 4,
    align   = "c"
  )
Table 2: Estimated Factor Betas from Time-Series Regressions (Step 0)
symbol b_MKT b_SMB b_HML
AAPL 0.9000 0.0685 -0.0578
FORD 0.5129 -0.2644 0.1380
GE 1.0779 0.0994 0.0902
GM 1.2854 0.0039 -0.0222
IBM 0.8169 0.0336 -0.0121
MSFT 0.9656 0.0582 -0.0641
# -------------------------------------------------------
# Merge the estimated betas back into the original dataset
# so that each daily observation now carries its stock's
# factor loadings — ready for the cross-sectional step.
# -------------------------------------------------------
step0 <- data %>%
  left_join(step0_betas, by = "symbol")

# Preview the merged dataset
head(step0, 6) %>%
  kable(
    caption = "Table 3: Dataset After Merging Factor Betas (Step 0 Output)",
    digits  = 6,
    align   = "c"
  )
Table 3: Dataset After Merging Factor Betas (Step 0 Output)
symbol date ri MKT SMB HML b_MKT b_SMB b_HML
AAPL 4-Jan-11 0.005206 -0.001314 -0.0065 0.0008 0.900006 0.068535 -0.057821
AAPL 5-Jan-11 0.008146 0.004995 0.0018 0.0013 0.900006 0.068535 -0.057821
AAPL 6-Jan-11 -0.000808 -0.002125 0.0001 -0.0025 0.900006 0.068535 -0.057821
AAPL 7-Jan-11 0.007136 -0.001847 0.0022 -0.0006 0.900006 0.068535 -0.057821
AAPL 10-Jan-11 0.018657 -0.001377 0.0041 0.0039 0.900006 0.068535 -0.057821
AAPL 11-Jan-11 -0.002368 0.003718 0.0016 0.0036 0.900006 0.068535 -0.057821
# -------------------------------------------------------
# STEP 1: Cross-Sectional Regressions
#
# For each date (t), regress ri on b_MKT, b_SMB, b_HML
# across all stocks to obtain daily risk premia (lambdas).
# -------------------------------------------------------

step1_lambdas <- step0 %>%
  
  # Group data by date and nest the cross-sectional data for each date
  nest(data = c(symbol, ri, b_MKT, b_SMB, b_HML)) %>%
  
  # For each date, run OLS: ri = lambda0 + lam_MKT*b_MKT + ...
  mutate(estimates = map(
    data,
    ~tidy(lm(ri ~ b_MKT + b_SMB + b_HML, data = .x))
  )) %>%
  
  # Unnest the results
  unnest(estimates) %>%
  
  # Keep only the date, coefficient name, and coefficient value
  select(date, estimate, term) %>%
  
  # Pivot to wide format: one column per lambda
  pivot_wider(
    names_from  = term,
    values_from = estimate
  ) %>%
  
  # Select and rename the risk premia columns
  select(
    date,
    lam_MKT = b_MKT,
    lam_SMB = b_SMB,
    lam_HML = b_HML
  )

# Preview the first few rows of daily risk premia
head(step1_lambdas, 10) %>%
  kable(
    caption = "Table 4: Daily Cross-Sectional Risk Premia (Step 1 Output) — First 10 Days",
    digits  = 6,
    align   = "c"
  )
Table 4: Daily Cross-Sectional Risk Premia (Step 1 Output) — First 10 Days
date lam_MKT lam_SMB lam_HML
4-Jan-11 0.041629 -0.025520 0.057372
5-Jan-11 -0.011347 -0.158046 0.062847
6-Jan-11 0.037301 0.007029 -0.173234
7-Jan-11 0.012722 0.032269 -0.064226
10-Jan-11 -0.036631 0.017123 0.058646
11-Jan-11 0.004089 -0.095361 0.089858
12-Jan-11 -0.055365 -0.164496 0.043036
13-Jan-11 -0.019357 0.001815 0.025630
14-Jan-11 -0.016486 0.063259 0.039214
18-Jan-11 0.010146 0.052508 -0.090027
# -------------------------------------------------------
# STEP 2a: Compute time-series averages of risk premia
# -------------------------------------------------------

lambda_summary <- step1_lambdas %>%
  summarise(
    Mean_MKT = mean(lam_MKT, na.rm = TRUE),
    Mean_SMB = mean(lam_SMB, na.rm = TRUE),
    Mean_HML = mean(lam_HML, na.rm = TRUE),
    SD_MKT   = sd(lam_MKT,   na.rm = TRUE),
    SD_SMB   = sd(lam_SMB,   na.rm = TRUE),
    SD_HML   = sd(lam_HML,   na.rm = TRUE),
    N        = n()
  )

lambda_summary %>%
  kable(
    caption = "Table 5: Summary Statistics of Daily Risk Premia",
    digits  = 6,
    align   = "c"
  )
Table 5: Summary Statistics of Daily Risk Premia
Mean_MKT Mean_SMB Mean_HML SD_MKT SD_SMB SD_HML N
-0.000412 0.003683 -0.000467 0.03857 0.133626 0.091705 1257
# -------------------------------------------------------
# STEP 2b: One-sample t-tests for each risk premium
#
# H0: lambda = 0 (the factor earns no risk premium)
# H1: lambda ≠ 0 (the factor earns a nonzero risk premium)
#
# We test at the conventional 5% significance level.
# -------------------------------------------------------

cat("============================================================\n")
## ============================================================
cat("  FAMA-MACBETH HYPOTHESIS TESTS: H0: lambda = 0\n")
##   FAMA-MACBETH HYPOTHESIS TESTS: H0: lambda = 0
cat("============================================================\n\n")
## ============================================================
cat("--- Factor 1: Market Risk Premium (MKT) ---\n")
## --- Factor 1: Market Risk Premium (MKT) ---
ttest_MKT <- t.test(step1_lambdas$lam_MKT, mu = 0)
print(ttest_MKT)
## 
##  One Sample t-test
## 
## data:  step1_lambdas$lam_MKT
## t = -0.37879, df = 1256, p-value = 0.7049
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.002546371  0.001722208
## sample estimates:
##     mean of x 
## -0.0004120813
cat("\n--- Factor 2: Size Premium (SMB) ---\n")
## 
## --- Factor 2: Size Premium (SMB) ---
ttest_SMB <- t.test(step1_lambdas$lam_SMB, mu = 0)
print(ttest_SMB)
## 
##  One Sample t-test
## 
## data:  step1_lambdas$lam_SMB
## t = 0.97712, df = 1256, p-value = 0.3287
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.003711466  0.011076953
## sample estimates:
##   mean of x 
## 0.003682744
cat("\n--- Factor 3: Value Premium (HML) ---\n")
## 
## --- Factor 3: Value Premium (HML) ---
ttest_HML <- t.test(step1_lambdas$lam_HML, mu = 0)
print(ttest_HML)
## 
##  One Sample t-test
## 
## data:  step1_lambdas$lam_HML
## t = -0.18044, df = 1256, p-value = 0.8568
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.005541205  0.004607776
## sample estimates:
##     mean of x 
## -0.0004667146
# -------------------------------------------------------
# Compile a clean results table for easy reading
# -------------------------------------------------------

results_table <- tibble(
  Factor        = c("MKT (Market)", "SMB (Size)", "HML (Value)"),
  Mean_Lambda   = c(ttest_MKT$estimate, ttest_SMB$estimate, ttest_HML$estimate),
  t_Statistic   = c(ttest_MKT$statistic, ttest_SMB$statistic, ttest_HML$statistic),
  p_Value       = c(ttest_MKT$p.value, ttest_SMB$p.value, ttest_HML$p.value),
  CI_Lower      = c(ttest_MKT$conf.int[1], ttest_SMB$conf.int[1], ttest_HML$conf.int[1]),
  CI_Upper      = c(ttest_MKT$conf.int[2], ttest_SMB$conf.int[2], ttest_HML$conf.int[2]),
  Significant   = c(
    ifelse(ttest_MKT$p.value < 0.05, "Yes ***", "No"),
    ifelse(ttest_SMB$p.value < 0.05, "Yes ***", "No"),
    ifelse(ttest_HML$p.value < 0.05, "Yes ***", "No")
  )
)

results_table %>%
  kable(
    caption = "Table 6: Fama-MacBeth Regression Results Summary",
    digits  = c(0, 6, 4, 4, 6, 6, 0),
    align   = "c"
  )
Table 6: Fama-MacBeth Regression Results Summary
Factor Mean_Lambda t_Statistic p_Value CI_Lower CI_Upper Significant
MKT (Market) -0.000412 -0.3788 0.7049 -0.002546 0.001722 No
SMB (Size) 0.003683 0.9771 0.3287 -0.003711 0.011077 No
HML (Value) -0.000467 -0.1804 0.8568 -0.005541 0.004608 No
# -------------------------------------------------------
# Plot the distribution of daily lambdas for each factor
# to visually inspect whether the average differs from zero
# -------------------------------------------------------

step1_lambdas %>%
  pivot_longer(
    cols      = c(lam_MKT, lam_SMB, lam_HML),
    names_to  = "Factor",
    values_to = "Lambda"
  ) %>%
  mutate(Factor = recode(Factor,
    lam_MKT = "MKT (Market)",
    lam_SMB = "SMB (Size)",
    lam_HML = "HML (Value)"
  )) %>%
  ggplot(aes(x = Lambda, fill = Factor)) +
  geom_histogram(bins = 60, alpha = 0.75, colour = "white") +
  geom_vline(xintercept = 0, colour = "black", linetype = "dashed", linewidth = 0.8) +
  facet_wrap(~Factor, scales = "free") +
  scale_fill_manual(values = c("#2980b9", "#27ae60", "#e74c3c")) +
  labs(
    title    = "Distribution of Daily Cross-Sectional Risk Premia",
    subtitle = "Dashed line marks zero; a distribution centred away from zero indicates a nonzero risk premium",
    x        = "Lambda (Risk Premium)",
    y        = "Frequency",
    caption  = "Source: Fama-French Three-Factor Model | Daily data, Jan 2011 – Dec 2015"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    legend.position  = "none",
    strip.text       = element_text(face = "bold"),
    plot.title       = element_text(face = "bold", size = 14),
    plot.subtitle    = element_text(colour = "grey40", size = 11)
  )

# -------------------------------------------------------
# Plot the time-series of each daily lambda
# -------------------------------------------------------

step1_lambdas %>%
  mutate(date = as.Date(date, format = "%d-%b-%y")) %>%
  pivot_longer(
    cols      = c(lam_MKT, lam_SMB, lam_HML),
    names_to  = "Factor",
    values_to = "Lambda"
  ) %>%
  mutate(Factor = recode(Factor,
    lam_MKT = "MKT (Market)",
    lam_SMB = "SMB (Size)",
    lam_HML = "HML (Value)"
  )) %>%
  ggplot(aes(x = date, y = Lambda, colour = Factor)) +
  geom_line(alpha = 0.6, linewidth = 0.4) +
  geom_hline(yintercept = 0, colour = "black", linetype = "dashed") +
  facet_wrap(~Factor, ncol = 1, scales = "free_y") +
  scale_colour_manual(values = c("#2980b9", "#27ae60", "#e74c3c")) +
  labs(
    title    = "Time-Series of Daily Cross-Sectional Risk Premia",
    x        = "Date",
    y        = "Lambda (Risk Premium)",
    caption  = "Source: Fama-French Three-Factor Model | Daily data, Jan 2011 – Dec 2015"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    legend.position = "none",
    strip.text      = element_text(face = "bold"),
    plot.title      = element_text(face = "bold", size = 14)
  )

Conclusion This study successfully reproduced the Eugene Fama–James MacBeth (1973) two-pass regression methodology using daily return data from six U.S. stocks over the 2011–2015 period, within the framework of the Fama–French three-factor model. The analysis was conducted in three main stages. Step 0 (Time-Series Regressions): Separate time-series regressions were estimated for each of the six stocks to measure their sensitivities (betas) to the market (MKT), size (SMB), and value (HML) factors. Step 1 (Cross-Sectional Regressions): Daily cross-sectional regressions were then performed, where stock returns for each trading day were regressed on the previously estimated betas. This generated a time series of daily factor risk premia (lambda estimates). Step 2 (Averaging and Statistical Testing): The estimated daily lambdas were averaged across the sample period, and one-sample t-tests were conducted to evaluate whether the average risk premia were significantly different from zero. One of the principal strengths of the Fama–MacBeth methodology compared with pooled OLS estimation is its ability to account for cross-sectional dependence in asset returns. Since stock returns are often correlated across firms, ignoring this feature can lead to underestimated standard errors and overly optimistic statistical inferences. Overall, the findings add to the extensive empirical literature examining the validity and performance of the Fama–French three-factor framework under varying market conditions. Further research could expand the sample to include more firms, longer time horizons, or additional explanatory factors such as momentum and profitability from the five-factor model. Another potential extension would be the use of rolling-window estimation techniques to investigate how factor risk premia change over time.