# Data Analysis

library(broom)
library(tidyverse)

# 1. Load the raw data from "data.csv", a semicolon-separated text file
data <- read_delim(
  file  = "data.csv",
  delim = ";"
)

# 2. Clean up data types and handle text-to-number forced conversions
#    - ri, MKT, SMB and HML are coerced with as.numeric(); values that cannot
#      be read as numbers become NA (R warns about the coercion).
#    - date is parsed with the "%d-%b-%y" pattern (day, abbreviated month name,
#      two-digit year); values that do not match become NA. Note that %b is
#      matched against month abbreviations of the current locale.
#    - Rows with an NA in date or in any regression variable are dropped.
#      date is included on purpose: otherwise every row with an unparseable
#      date would later be pooled into a single NA "date" group and produce
#      one spurious cross-sectional regression.
data_clean <- data %>%
  mutate(
    across(c(ri, MKT, SMB, HML), as.numeric),
    date = as.Date(date, format = "%d-%b-%y")
  ) %>%
  drop_na(date, ri, MKT, SMB, HML)

# 3. Fama-MacBeth Step 0: Time-Series Regressions (Estimate Betas per Symbol)
#    For every symbol, regress ri on MKT, SMB and HML over all of that symbol's
#    rows and keep the three slope estimates.
#    Result: one row per symbol with columns symbol, b_MKT, b_HML, b_SMB
#    (the intercept is estimated but not kept).
step0 <- data_clean %>%
  # Keep only the regression inputs and nest strictly by symbol. Nesting an
  # explicit column list would turn every *other* column of the data into an
  # additional key, which could split one symbol into several regressions.
  select(symbol, date, ri, MKT, SMB, HML) %>%
  group_by(symbol) %>%
  nest() %>%
  ungroup() %>%
  # One lm() fit per symbol; tidy() returns one row per coefficient
  mutate(estimates = map(
    data,
    function(symbol_rows) tidy(lm(ri ~ MKT + SMB + HML, data = symbol_rows))
  )) %>%
  unnest(estimates) %>%
  select(symbol, estimate, term) %>%
  # Long (one row per term) -> wide (one column per term)
  pivot_wider(names_from  = term,
              values_from = estimate) %>%
  # Prefix the slopes with b_ so they cannot be confused with the factor
  # columns of the same name after the merge below
  select(symbol,
         b_MKT = MKT,
         b_HML = HML,
         b_SMB = SMB)

# Attach each symbol's estimated betas (b_MKT, b_HML, b_SMB) to every row of
# the cleaned data; rows are matched on symbol and all rows of data_clean
# are kept
step0_merged <- left_join(data_clean, step0, by = "symbol")

# 4. Fama-MacBeth Step 1: Cross-Sectional Regressions (Estimate Risk Premia per Date)
#    For every date, regress ri on the symbols' betas (b_MKT, b_SMB, b_HML)
#    across all symbols observed on that date and keep the three slopes.
#    Result: one row per date with columns date, b_MKT, b_HML, b_SMB.
#    Note: these output columns hold the estimated *slopes* of the
#    cross-sectional fit; they merely inherit the regressor names via
#    pivot_wider(names_from = term).
step1 <- step0_merged %>%
  # Drop the factor columns (MKT, SMB, HML) before nesting and nest strictly
  # by date. If they were left outside the nested data they would act as
  # additional keys, so any within-date difference in their values would
  # split one date into several cross-sections.
  select(date, symbol, ri, b_MKT, b_SMB, b_HML) %>%
  group_by(date) %>%
  nest() %>%
  ungroup() %>%
  # One lm() fit per date; tidy() returns one row per coefficient
  mutate(estimates = map(
    data,
    function(cross_section) {
      tidy(lm(ri ~ b_MKT + b_SMB + b_HML, data = cross_section))
    }
  )) %>%
  unnest(estimates) %>%
  select(date, estimate, term) %>%
  # Long (one row per term) -> wide (one column per term)
  pivot_wider(names_from  = term,
              values_from = estimate) %>%
  select(date, b_MKT, b_HML, b_SMB)

# 5. Fama-MacBeth Step 2: Estimate time series averages & Significance
# Each call below runs a two-sided one-sample t-test of H0: mean = 0 on one
# column of per-date slope estimates stored in step1.
# The "##" lines under each call appear to be console output captured from an
# earlier run; they are comments and are not regenerated by this script.
#
# Slope on the market beta (b_MKT).
# NOTE(review): the mean and confidence interval recorded for b_MKT are
# several orders of magnitude larger than those recorded for b_SMB and b_HML
# below - worth inspecting the per-date fits for extreme estimates.
t.test(step1$b_MKT, mu = 0)
## 
##  One Sample t-test
## 
## data:  step1$b_MKT
## t = -0.71163, df = 1256, p-value = 0.4768
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -256.0301  119.7291
## sample estimates:
## mean of x 
## -68.15049
# Slope on the size beta (b_SMB).
t.test(step1$b_SMB, mu = 0)
## 
##  One Sample t-test
## 
## data:  step1$b_SMB
## t = 1.0107, df = 1256, p-value = 0.3124
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.002876405  0.008989319
## sample estimates:
##   mean of x 
## 0.003056457
# Slope on the value beta (b_HML).
t.test(step1$b_HML, mu = 0)
## 
##  One Sample t-test
## 
## data:  step1$b_HML
## t = 0.21531, df = 1256, p-value = 0.8296
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.01572488  0.01960192
## sample estimates:
##   mean of x 
## 0.001938522