Stock Returns Project

# ============================================================
# Section 0: Load Required Packages
# ============================================================

pkgs <- c(
  "tidyverse",   # Data wrangling + plotting
  "tidyquant",   # Pull stock data + S&P 500 list
  "lubridate",   # Date/quarter manipulation
  "slider",      # Rolling-window calculations
  "Boruta",      # Boruta variable selection
  "broom",       # Clean model outputs into tables
  "lmtest",      # Breusch-Pagan, RESET tests
  "sandwich",    # Heteroskedasticity-consistent SEs
  "car",         # VIF multicollinearity tests
  "nortest",     # Jarque-Bera normality test
  "boot",        # Bootstrap coefficient estimation
  "caret",       # Cross-validation
  "rvest",       # Web scraping for sector data  
  "corrplot",    # Correlation Matrix 
  "GGally",      # Extends GGplots
  "gridExtra",   # Multiply GGplots
  "patchwork",   #  
  "kableExtra",  #
  "fitdistrplus" #
)

# Auto-install any missing packages
new_pkgs <- pkgs[!(pkgs %in% installed.packages()[, "Package"])]
if (length(new_pkgs) > 0) install.packages(new_pkgs)

# Load all packages
invisible(lapply(pkgs, library, character.only = TRUE))

# ============================================================
# Section 1: Define Sample Period and Pull S&P 500 Tickers
# ============================================================

# Define sample period (2021-2025)
start_date <- as.Date("2021-01-01")
end_date   <- as.Date("2025-12-31")

# Pull current S&P 500 constituents (note: survivorship bias may apply)
sp500 <- tq_index("SP500") %>%
  dplyr::select(symbol, company, sector)

# Extract ticker symbols
tickers <- sp500$symbol

# Sanity checks
cat("Total tickers pulled:", length(tickers), "\n")

## Total tickers pulled: 504

cat("First few tickers:", head(tickers), "\n")

## First few tickers: NVDA AAPL MSFT AMZN GOOGL AVGO

# ============================================================
# Section 2: Pull Daily Stock Prices and Volume
# ============================================================

# Pull adjusted closing prices and volume for all S&P 500 tickers
# Note: 'adjusted' accounts for stock splits and dividends
prices <- tq_get(
  tickers,
  from = start_date,
  to   = end_date,
  get  = "stock.prices"
) %>%
  dplyr::select(symbol, date, adjusted, volume) %>%
  dplyr::arrange(symbol, date)

# Sanity check - verify structure and spot missing data
glimpse(prices)

## Rows: 624,555
## Columns: 4
## $ symbol   <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "…
## $ date     <date> 2021-01-04, 2021-01-05, 2021-01-06, 2021-01-07, 2021-01-08, …
## $ adjusted <dbl> 114.6598, 115.5972, 118.7672, 121.9275, 122.7973, 123.8507, 1…
## $ volume   <dbl> 2030700, 2344000, 2428500, 1775900, 1577200, 1746800, 1826400…

cat("Total rows:", nrow(prices), "\n")

## Total rows: 624555

cat("Missing values:", sum(is.na(prices)), "\n")

## Missing values: 2

cat("Date range:", as.character(min(prices$date)),
    "to", as.character(max(prices$date)), "\n")

## Date range: 2021-01-04 to 2025-12-30

# ============================================================
# Section 3: Compute Daily Stock Returns
# ============================================================

# Calculate simple daily returns: (P_t / P_{t-1}) - 1
# group_by(symbol) ensures lag is computed within each stock
daily <- prices %>%
  group_by(symbol) %>%
  arrange(date) %>%
  mutate(
    ret_d = adjusted / lag(adjusted) - 1
  ) %>%
  ungroup() %>%
  filter(!is.na(ret_d))  # Removes first obs per stock (no prior price)
                         # Also handles the 2 NAs found in raw prices

# Sanity checks
glimpse(daily)

## Rows: 624,050
## Columns: 5
## $ symbol   <chr> "A", "AAPL", "ABBV", "ABNB", "ABT", "ACGL", "ACN", "ADBE", "A…
## $ date     <date> 2021-01-05, 2021-01-05, 2021-01-05, 2021-01-05, 2021-01-05, …
## $ adjusted <dbl> 115.59720, 127.41277, 86.93716, 148.30000, 100.41207, 33.3194…
## $ volume   <dbl> 2344000, 97664900, 6823800, 5974200, 4322800, 1814500, 180920…
## $ ret_d    <dbl> 0.0081757652, 0.0123639944, 0.0103404702, 0.0657564467, 0.012…

cat("Summary of daily returns:\n")

## Summary of daily returns:

print(summary(daily$ret_d))

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -0.5314019 -0.0091205  0.0006787  0.0006284  0.0104168  0.5602061

# ============================================================
# Section 4: Pull Market Returns (SPY as Market Proxy)
# ============================================================

# SPY (S&P 500 ETF) is used as the market proxy for beta estimation
# This is standard practice in empirical asset pricing research
market <- tq_get(
  "SPY",
  from = start_date,
  to   = end_date,
  get  = "stock.prices"
) %>%
  dplyr::arrange(date) %>%
  dplyr::mutate(mkt_ret = adjusted / dplyr::lag(adjusted) - 1) %>%
  dplyr::filter(!is.na(mkt_ret)) %>%
  dplyr::select(date, mkt_ret)

# Sanity checks
glimpse(market)

## Rows: 1,253
## Columns: 2
## $ date    <date> 2021-01-05, 2021-01-06, 2021-01-07, 2021-01-08, 2021-01-11, 2…
## $ mkt_ret <dbl> 0.0068873966, 0.0059782719, 0.0148576929, 0.0056978421, -0.006…

cat("Summary of market returns:\n")

## Summary of market returns:

print(summary(market$mkt_ret))

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -0.0585429 -0.0045158  0.0007607  0.0006095  0.0063077  0.1050193

# ============================================================
# Section 5: Compute Rolling Financial Features
# ============================================================
# Window sizes: 63 trading days ≈ 1 quarter, 252 trading days ≈ 1 year
# Note: .before = 62 means 62 rows before + current row = 63 total (1 quarter)
#       .before = 251 means 251 rows before + current row = 252 total (1 year)
# cache=TRUE recommended — rolling calculations are computationally intensive
# Note: market beta computed separately in next chunk

# Merge stock returns with market returns (SPY)
daily_full <- daily %>%
  dplyr::left_join(market, by = "date")

# Compute all rolling features within each stock
features <- daily_full %>%
  dplyr::group_by(symbol) %>%
  dplyr::arrange(date) %>%
  dplyr::mutate(

    # Momentum: cumulative return over past quarter and year
    mom_q = adjusted / dplyr::lag(adjusted, 63)  - 1,
    mom_y = adjusted / dplyr::lag(adjusted, 252) - 1,

    # Volatility: standard deviation of daily returns
    vol_q = slider::slide_dbl(ret_d, sd, .before = 62,  .complete = TRUE),
    vol_y = slider::slide_dbl(ret_d, sd, .before = 251, .complete = TRUE),

    # Trading volume: mean and standard deviation
    volm_q_avg = slider::slide_dbl(volume, mean, .before = 62,  .complete = TRUE),
    volm_y_avg = slider::slide_dbl(volume, mean, .before = 251, .complete = TRUE),
    volm_y_sd  = slider::slide_dbl(volume, sd,   .before = 251, .complete = TRUE),

    # Maximum drawdown: largest peak-to-trough decline over past quarter
    mdd_q = slider::slide_dbl(adjusted, ~{
      peak <- cummax(.x)
      dd   <- .x / peak - 1
      min(dd, na.rm = TRUE)
    }, .before = 62, .complete = TRUE)

  ) %>%
  dplyr::ungroup()

# Drop rows with incomplete rolling windows
features <- features %>%
  dplyr::filter(
    !is.na(mom_q),
    !is.na(mom_y),
    !is.na(vol_y),
    !is.na(volm_y_avg),
    !is.na(mdd_q),
    !is.na(mkt_ret)    # Market return must exist for beta estimation
  )

# Sanity checks
dplyr::glimpse(features)

## Rows: 497,534
## Columns: 14
## $ symbol     <chr> "A", "AAPL", "ABBV", "ABNB", "ABT", "ACGL", "ACN", "ADBE", …
## $ date       <date> 2022-01-04, 2022-01-04, 2022-01-04, 2022-01-04, 2022-01-04…
## $ adjusted   <dbl> 146.90814, 175.84323, 115.62479, 170.80000, 125.32749, 42.9…
## $ volume     <dbl> 2234000, 99310400, 6298300, 4080000, 8241200, 968600, 25163…
## $ ret_d      <dbl> -0.0338060483, -0.0126916080, -0.0019201377, -0.0108871306,…
## $ mkt_ret    <dbl> -0.0003351108, -0.0003351108, -0.0003351108, -0.0003351108,…
## $ mom_q      <dbl> -0.010722225, 0.275333420, 0.249188264, 0.036785221, 0.1761…
## $ mom_y      <dbl> 0.27086248, 0.38010677, 0.32998111, 0.15171948, 0.24813173,…
## $ vol_q      <dbl> 0.014768063, 0.015035164, 0.011257136, 0.033742917, 0.01261…
## $ vol_y      <dbl> 0.01324156, 0.01580715, 0.01261994, 0.03323359, 0.01344436,…
## $ volm_q_avg <dbl> 1516438.1, 89378579.4, 6126700.0, 5791620.6, 5337882.5, 164…
## $ volm_y_avg <dbl> 1621182.5, 90377133.3, 6806901.6, 6496663.1, 5373671.4, 203…
## $ volm_y_sd  <dbl> 708967.8, 29022212.1, 3800881.8, 4608475.2, 2405241.0, 1227…
## $ mdd_q      <dbl> -0.09793064, -0.05405416, -0.03028252, -0.24530669, -0.0489…

cat("Summary of rolling features:\n")

## Summary of rolling features:

features %>%
  dplyr::select(mom_q, mom_y, vol_q, vol_y,
                volm_q_avg, volm_y_avg, volm_y_sd, mdd_q) %>%
  summary() %>%
  print()

##      mom_q              mom_y              vol_q              vol_y         
##  Min.   :-0.88729   Min.   :-0.98549   Min.   :0.001896   Min.   :0.007558  
##  1st Qu.:-0.06875   1st Qu.:-0.09169   1st Qu.:0.013753   1st Qu.:0.014474  
##  Median : 0.02088   Median : 0.07831   Median :0.017412   Median :0.017738  
##  Mean   : 0.02922   Mean   : 0.12839   Mean   :0.019422   Mean   :0.019480  
##  3rd Qu.: 0.11405   3rd Qu.: 0.27098   3rd Qu.:0.022593   3rd Qu.:0.022273  
##  Max.   : 6.32532   Max.   :16.20933   Max.   :0.149840   Max.   :0.116309  
##    volm_q_avg          volm_y_avg          volm_y_sd             mdd_q         
##  Min.   :      144   Min.   :      387   Min.   :     1490   Min.   :-0.90941  
##  1st Qu.:  1203126   1st Qu.:  1190006   1st Qu.:   584651   1st Qu.:-0.18010  
##  Median :  2382245   Median :  2349903   Median :  1183242   Median :-0.12114  
##  Mean   :  6333720   Mean   :  6246487   Mean   :  3062847   Mean   :-0.14095  
##  3rd Qu.:  5345560   3rd Qu.:  5238819   3rd Qu.:  2662303   3rd Qu.:-0.08159  
##  Max.   :598531683   Max.   :554410254   Max.   :191520852   Max.   :-0.00863

# ============================================================
# Section 6: Build Quarterly Panel Dataset
# ============================================================

# Step 1: Assign quarter identifier to each observation
features_q <- features %>%
  mutate(
    quarter = lubridate::floor_date(date, unit = "quarter")
  )

# Step 2: Keep only quarter-end observations for each stock
# This gives one snapshot per firm per quarter
features_q_end <- features_q %>%
  group_by(symbol, quarter) %>%
  filter(date == max(date)) %>%
  ungroup()

# Step 3: Create next-quarter return (dependent variable)
# lead() looks forward — current quarter characteristics predict NEXT quarter return
# This ensures no look-ahead bias in the predictive regression framework
features_q_end <- features_q_end %>%
  group_by(symbol) %>%
  arrange(quarter) %>%
  mutate(
    ret_next_q = lead(adjusted) / adjusted - 1
  ) %>%
  ungroup()

# Step 4: Drop last quarter per stock (no future return available)
features_q_end <- features_q_end %>%
  filter(!is.na(ret_next_q))

# Sanity checks
glimpse(features_q_end)

## Rows: 7,455
## Columns: 16
## $ symbol     <chr> "A", "AAPL", "ABBV", "ABNB", "ABT", "ACGL", "ACN", "ADBE", …
## $ date       <date> 2022-03-31, 2022-03-31, 2022-03-31, 2022-03-31, 2022-03-31…
## $ adjusted   <dbl> 128.58226, 171.08022, 140.11865, 171.75999, 109.64191, 46.0…
## $ volume     <dbl> 3046400, 103049300, 8670100, 3232200, 5432600, 2377300, 275…
## $ ret_d      <dbl> -0.0231066531, -0.0177755791, -0.0100152926, -0.0107700877,…
## $ mkt_ret    <dbl> -0.01539134, -0.01539134, -0.01539134, -0.01539134, -0.0153…
## $ mom_q      <dbl> -0.17637812, -0.01889731, 0.20497457, 0.01765610, -0.157606…
## $ mom_y      <dbl> 0.04194401, 0.42803114, 0.56343688, -0.08754787, 0.01000050…
## $ vol_q      <dbl> 0.02058148, 0.01900028, 0.00987060, 0.04028967, 0.01611739,…
## $ vol_y      <dbl> 0.01474659, 0.01528478, 0.01117082, 0.03147287, 0.01376143,…
## $ volm_q_avg <dbl> 2063185.7, 94933725.4, 7769914.3, 6180982.5, 6246211.1, 185…
## $ volm_y_avg <dbl> 1708491.7, 87021204.8, 6907349.6, 6548170.2, 5613284.9, 176…
## $ volm_y_sd  <dbl> 775662.5, 25477847.9, 3775623.1, 4518750.9, 2505382.7, 7820…
## $ mdd_q      <dbl> -0.20973422, -0.17140855, -0.03509275, -0.29495287, -0.1869…
## $ quarter    <date> 2022-01-01, 2022-01-01, 2022-01-01, 2022-01-01, 2022-01-01…
## $ ret_next_q <dbl> -0.10106207, -0.21584764, -0.04717954, -0.48136933, -0.0783…

cat("Summary of dependent variable (next-quarter return):\n")

## Summary of dependent variable (next-quarter return):

print(summary(features_q_end$ret_next_q))

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.81071 -0.06840  0.02308  0.03200  0.12081  2.55319

# Final quarterly panel dataset used for all downstream analysis
quarterly_panel <- features_q_end
cat("\nFinal quarterly panel dimensions:", 
    nrow(quarterly_panel), "rows x", 
    ncol(quarterly_panel), "columns\n")

## 
## Final quarterly panel dimensions: 7455 rows x 16 columns

# ============================================================
# Section 7: Compute Rolling Beta and Firm Size
# ============================================================

# Step 1: Compute rolling 1-year beta for each stock
# Formula: beta = Cov(R_i, R_m) / Var(R_m)
# Uses 252-day rolling window (.before = 251 + current = 252)
# slide2_dbl() handles two input vectors simultaneously
beta_daily <- daily_full %>%
  dplyr::group_by(symbol) %>%
  dplyr::arrange(date) %>%
  dplyr::mutate(
    beta_y = slider::slide2_dbl(
      .x = ret_d,
      .y = mkt_ret,
      .f = ~ cov(.x, .y, use = "complete.obs") / var(.y, na.rm = TRUE),
      .before   = 251,
      .complete = TRUE
    )
  ) %>%
  dplyr::ungroup() %>%
  dplyr::select(symbol, date, beta_y)

# Step 2: Join beta onto quarterly panel
# Joining by both symbol AND date ensures each firm-quarter
# gets the correct beta estimated as of that quarter-end date
quarterly_panel <- quarterly_panel %>%
  dplyr::left_join(beta_daily, by = c("symbol", "date"))

cat("Beta added. Missing beta_y:", sum(is.na(quarterly_panel$beta_y)), "\n")

## Beta added. Missing beta_y: 0

print(summary(quarterly_panel$beta_y))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.5250  0.5884  0.8860  0.9139  1.1749  4.4554

# Step 3: Add firm size (log market capitalization)
# Primary: pull market cap from Yahoo Finance via key.stats
# Fallback: log(price × volume) as size proxy if API unavailable
size_tbl <- tryCatch(
  tq_get(tickers, get = "key.stats") %>%
    dplyr::select(symbol, market.cap) %>%
    dplyr::mutate(log_mktcap = log(market.cap)) %>%
    dplyr::select(symbol, log_mktcap) %>%
    dplyr::distinct(),
  error = function(e) NULL
)

# Step 4: Join size onto quarterly panel
if (!is.null(size_tbl) && nrow(size_tbl) > 0) {
  # Primary: use actual log market capitalization
  quarterly_panel <- quarterly_panel %>%
    dplyr::left_join(size_tbl, by = "symbol")
  cat("Size added using log_mktcap.\n")
  print(summary(quarterly_panel$log_mktcap))
} else {
  # Fallback: log(price × avg volume) as size proxy
  # This approximates market cap when API data is unavailable
  quarterly_panel <- quarterly_panel %>%
    dplyr::mutate(log_size_proxy = log(adjusted * volm_y_avg))
  cat("Market cap unavailable; size added using log_size_proxy.\n")
  print(summary(quarterly_panel$log_size_proxy))
}

## Market cap unavailable; size added using log_size_proxy.
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   9.383  18.752  19.291  19.410  19.966  24.785

# Final column check
cat("Columns now in quarterly_panel:\n")

## Columns now in quarterly_panel:

print(names(quarterly_panel))

##  [1] "symbol"         "date"           "adjusted"       "volume"        
##  [5] "ret_d"          "mkt_ret"        "mom_q"          "mom_y"         
##  [9] "vol_q"          "vol_y"          "volm_q_avg"     "volm_y_avg"    
## [13] "volm_y_sd"      "mdd_q"          "quarter"        "ret_next_q"    
## [17] "beta_y"         "log_size_proxy"

# ============================================================
# Section 8: Add Sector Data via Wikipedia GICS Sectors
# ============================================================

# tq_index("SP500") sector column is broken in current tidyquant version
# Fetching GICS sector classifications directly from Wikipedia instead

url <- "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

sp500_wiki <- tryCatch({
  tables <- url %>%
    rvest::read_html() %>%
    rvest::html_table(fill = TRUE)
  
  tables[[1]] %>%
    dplyr::select(symbol = Symbol, sector = `GICS Sector`) %>%
    dplyr::mutate(symbol = trimws(symbol))
}, error = function(e) NULL)

cat("Unique sectors from Wikipedia:\n")

## Unique sectors from Wikipedia:

print(unique(sp500_wiki$sector))

##  [1] "Industrials"            "Health Care"            "Information Technology"
##  [4] "Utilities"              "Financials"             "Materials"             
##  [7] "Consumer Discretionary" "Real Estate"            "Communication Services"
## [10] "Consumer Staples"       "Energy"

cat("Total tickers with sectors:", nrow(sp500_wiki), "\n")

## Total tickers with sectors: 503

# Remove any existing broken sector column before joining
quarterly_panel <- quarterly_panel %>%
  dplyr::select(-any_of("sector")) %>%
  dplyr::left_join(sp500_wiki, by = "symbol")

# Handle 30 missing sectors due to ticker format mismatches
# e.g., Wikipedia uses "BRK.B" while Yahoo Finance uses "BRK-B"
quarterly_panel <- quarterly_panel %>%
  dplyr::mutate(
    sector = ifelse(is.na(sector), "Unknown", sector)
  )

cat("Missing sectors after join:", sum(is.na(quarterly_panel$sector)), "\n")

## Missing sectors after join: 0

cat("Sector distribution:\n")

## Sector distribution:

print(table(quarterly_panel$sector))

## 
## Communication Services Consumer Discretionary       Consumer Staples 
##                    345                    720                    516 
##                 Energy             Financials            Health Care 
##                    330                   1122                    881 
##            Industrials Information Technology              Materials 
##                   1161                   1034                    390 
##            Real Estate                Unknown              Utilities 
##                    465                     30                    461

# ============================================================
# Section 9: Part 2(a) - Boruta Variable Selection
# ============================================================

# IMPORTANT: Boruta is run on quantitative predictors ONLY
# Factor variables (sector, quarter) evaluated separately in Part 2(b)
# Parameters used:
#   - maxRuns = 100  (maximum number of importance source runs)
#   - pValue  = 0.01 (confidence level for feature acceptance/rejection)
#   - doTrace = 2    (verbose output to track progress)
#   - ntree   = 500  (default random forest trees per run)
#   - seed    = 123  (for reproducibility)

set.seed(123)

# Step 1: Prepare Boruta dataset (quantitative predictors only)
boruta_data <- quarterly_panel %>%
  dplyr::select(
    ret_next_q,      # Dependent variable
    mom_q,           # Quarterly momentum
    mom_y,           # Yearly momentum
    vol_q,           # Quarterly volatility
    vol_y,           # Yearly volatility
    beta_y,          # Market beta
    log_size_proxy,  # Firm size (log price x volume)
    volm_q_avg,      # Average quarterly trading volume
    volm_y_avg,      # Average yearly trading volume
    volm_y_sd,       # Yearly volume volatility
    mdd_q            # Maximum drawdown (quarterly)
  ) %>%
  na.omit()

cat("Boruta sample size:", nrow(boruta_data), "\n")

## Boruta sample size: 7455

# Step 2: Run Boruta algorithm
boruta_model <- Boruta(
  ret_next_q ~ .,
  data    = boruta_data,
  maxRuns = 100,
  pValue  = 0.01,
  doTrace = 2
)

# Step 3: Resolve any tentative variables
boruta_final <- TentativeRoughFix(boruta_model)

# Step 4: Extract confirmed variables only
selected_vars <- getSelectedAttributes(boruta_final, withTentative = FALSE)
cat("\nConfirmed important variables:\n")

## 
## Confirmed important variables:

print(selected_vars)

##  [1] "mom_q"          "mom_y"          "vol_q"          "vol_y"         
##  [5] "beta_y"         "log_size_proxy" "volm_q_avg"     "volm_y_avg"    
##  [9] "volm_y_sd"      "mdd_q"

# Step 5: Importance table
cat("\nVariable importance statistics:\n")

## 
## Variable importance statistics:

print(attStats(boruta_final))

##                  meanImp medianImp    minImp   maxImp normHits  decision
## mom_q          25.818588 25.052473 24.227193 28.94987        1 Confirmed
## mom_y          14.135517 14.561089 11.609118 15.26047        1 Confirmed
## vol_q          19.136843 19.633455 16.446098 21.58546        1 Confirmed
## vol_y          17.540406 17.588384 14.895209 19.59643        1 Confirmed
## beta_y         11.148484 11.281836  9.519076 12.58580        1 Confirmed
## log_size_proxy  9.507656  9.237430  7.635180 11.85250        1 Confirmed
## volm_q_avg      9.651028  9.584909  6.111301 13.06927        1 Confirmed
## volm_y_avg      9.711749  9.803761  6.776112 11.62956        1 Confirmed
## volm_y_sd       9.034147  8.653053  7.204950 11.32367        1 Confirmed
## mdd_q          16.797443 16.907008 14.589846 18.74710        1 Confirmed

# Step 6: Boruta importance plot
par(mar = c(12, 4, 4, 2))
plot(
  boruta_final,
  las      = 2,
  cex.axis = 1.05,
  cex.lab  = 1.2,
  xlab     = "",
  main     = "Boruta Variable Importance Plot"
)
mtext("Attributes", side = 1, line = 8, cex = 1.2)

# ============================================================
# Section 10: Part 2(b) - ANOVA for Factor Variables
# ============================================================

# Create readable quarter labels FIRST (before converting to factor)
quarterly_panel <- quarterly_panel %>%
  mutate(
    quarter_factor = as.factor(
      paste0(lubridate::year(quarter), "-Q", lubridate::quarter(quarter))
    )
  )

cat("Number of quarters:", nlevels(quarterly_panel$quarter_factor), "\n")

## Number of quarters: 15

print(table(quarterly_panel$quarter_factor))

## 
## 2022-Q1 2022-Q2 2022-Q3 2022-Q4 2023-Q1 2023-Q2 2023-Q3 2023-Q4 2024-Q1 2024-Q2 
##     492     494     495     495     496     496     496     497     497     498 
## 2024-Q3 2024-Q4 2025-Q1 2025-Q2 2025-Q3 
##     498     499     500     501     501

# Now convert sector and quarter to factors
quarterly_panel$sector  <- as.factor(quarterly_panel$sector)
quarterly_panel$quarter <- as.factor(quarterly_panel$quarter)

# --- SECTOR ANOVA ---
anova_sector <- aov(ret_next_q ~ sector, data = quarterly_panel)
cat("\nANOVA Results - Sector:\n")

## 
## ANOVA Results - Sector:

print(summary(anova_sector))

##               Df Sum Sq Mean Sq F value   Pr(>F)    
## sector        11    2.5 0.22748   8.654 2.04e-15 ***
## Residuals   7443  195.7 0.02629                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Boxplot by sector
ggplot(quarterly_panel, aes(x = reorder(sector, ret_next_q, median),
                             y = ret_next_q,
                             fill = sector)) +
  geom_boxplot(outlier.alpha = 0.2) +
  coord_flip() +
  scale_y_continuous(limits = c(-0.5, 0.5), labels = scales::percent) +
  labs(
    title = "Next-Quarter Returns by GICS Sector",
    x     = "Sector",
    y     = "Next-Quarter Return"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# --- QUARTER ANOVA ---
anova_quarter <- aov(ret_next_q ~ quarter_factor, data = quarterly_panel)
cat("\nANOVA Results - Calendar Quarter:\n")

## 
## ANOVA Results - Calendar Quarter:

print(summary(anova_quarter))

##                  Df Sum Sq Mean Sq F value Pr(>F)    
## quarter_factor   14   35.7  2.5498   116.8 <2e-16 ***
## Residuals      7440  162.4  0.0218                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Boxplot by quarter
ggplot(quarterly_panel, aes(x = quarter_factor,
                             y = ret_next_q,
                             fill = quarter_factor)) +
  geom_boxplot(outlier.alpha = 0.2) +
  scale_y_continuous(limits = c(-0.5, 0.5), labels = scales::percent) +
  labs(
    title = "Next-Quarter Returns by Calendar Quarter",
    x     = "Quarter",
    y     = "Next-Quarter Return"
  ) +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

# ============================================================
# Section 11: Part 3 - EDA Data Preparation
# ============================================================

# Select quantitative variables for EDA
# Note: sector and quarter_factor excluded here as they are
# factor variables analyzed separately in Part 2(b)
eda_data <- quarterly_panel %>%
  dplyr::select(
    ret_next_q,                          # Dependent variable
    mom_q, mom_y,                        # Momentum
    vol_q, vol_y,                        # Volatility
    beta_y,                              # Market beta
    log_size_proxy,                      # Firm size
    volm_q_avg, volm_y_avg, volm_y_sd,  # Trading volume
    mdd_q                                # Maximum drawdown
  )

# Full EDA dataset including factor variables
# Used for plots requiring sector or quarter breakdowns
eda_data_full <- quarterly_panel %>%
  dplyr::select(
    ret_next_q,                          # Dependent variable
    mom_q, mom_y,                        # Momentum
    vol_q, vol_y,                        # Volatility
    beta_y,                              # Market beta
    log_size_proxy,                      # Firm size
    volm_q_avg, volm_y_avg, volm_y_sd,  # Trading volume
    mdd_q,                               # Maximum drawdown
    sector,                              # GICS sector (factor)
    quarter_factor                       # Calendar quarter (factor)
  )

# Sanity checks
cat("Quantitative EDA dataset:", nrow(eda_data), "rows x", ncol(eda_data), "cols\n")

## Quantitative EDA dataset: 7455 rows x 11 cols

cat("Full EDA dataset:", nrow(eda_data_full), "rows x", ncol(eda_data_full), "cols\n")

## Full EDA dataset: 7455 rows x 13 cols

dplyr::glimpse(eda_data)

## Rows: 7,455
## Columns: 11
## $ ret_next_q     <dbl> -0.10106207, -0.21584764, -0.04717954, -0.48136933, -0.…
## $ mom_q          <dbl> -0.17637812, -0.01889731, 0.20497457, 0.01765610, -0.15…
## $ mom_y          <dbl> 0.04194401, 0.42803114, 0.56343688, -0.08754787, 0.0100…
## $ vol_q          <dbl> 0.02058148, 0.01900028, 0.00987060, 0.04028967, 0.01611…
## $ vol_y          <dbl> 0.01474659, 0.01528478, 0.01117082, 0.03147287, 0.01376…
## $ beta_y         <dbl> 1.0533860, 1.2488194, 0.3749702, 1.6402609, 0.6866531, …
## $ log_size_proxy <dbl> 19.20769, 23.42379, 20.69059, 20.84079, 20.23787, 18.21…
## $ volm_q_avg     <dbl> 2063185.7, 94933725.4, 7769914.3, 6180982.5, 6246211.1,…
## $ volm_y_avg     <dbl> 1708491.7, 87021204.8, 6907349.6, 6548170.2, 5613284.9,…
## $ volm_y_sd      <dbl> 775662.5, 25477847.9, 3775623.1, 4518750.9, 2505382.7, …
## $ mdd_q          <dbl> -0.20973422, -0.17140855, -0.03509275, -0.29495287, -0.…

# ============================================================
# Section 12: Part 3(b) - Histograms of All Variables
# ============================================================

# Reshape data to long format for faceted plotting
eda_long <- eda_data %>%
  tidyr::pivot_longer(
    cols      = everything(),
    names_to  = "variable",
    values_to = "value"
  )

# Plot histograms for all 11 variables with normal curve overlay
ggplot(eda_long, aes(x = value)) +
  geom_histogram(aes(y = after_stat(density)),
                 bins   = 50,
                 fill   = "steelblue",
                 color  = "white",
                 alpha  = 0.7) +
  geom_density(color = "red", linewidth = 0.8) +
  facet_wrap(~ variable, scales = "free", ncol = 4) +
  labs(
    title = "Histograms of All Quantitative Variables",
    x     = "Value",
    y     = "Density"
  ) +
  theme_minimal() +
  theme(
    strip.text    = element_text(size = 9, face = "bold"),
    plot.title    = element_text(size = 14, face = "bold"),
    axis.text     = element_text(size = 7)
  )

# ============================================================
# Section 13: Part 3(b) - QQ Plots of All Variables
# ============================================================

# QQ plots assess normality - points following the red line = normal distribution
# Deviations in tails indicate skewness or heavy tails

ggplot(eda_long, aes(sample = value)) +
  stat_qq(color = "steelblue", alpha = 0.3, size = 0.5) +
  stat_qq_line(color = "red", linewidth = 0.8) +
  facet_wrap(~ variable, scales = "free", ncol = 4) +
  labs(
    title = "QQ Plots of All Quantitative Variables",
    x     = "Theoretical Quantiles",
    y     = "Sample Quantiles"
  ) +
  theme_minimal() +
  theme(
    strip.text = element_text(size = 9, face = "bold"),
    plot.title = element_text(size = 14, face = "bold"),
    axis.text  = element_text(size = 7)
  )

# ============================================================
# Section 14: Part 3(b) - Correlation Matrix
# ============================================================

# Compute pairwise correlation matrix
# pairwise.complete.obs handles any remaining NAs
cor_mat <- cor(eda_data, use = "pairwise.complete.obs")

# Plot correlation matrix
corrplot(
  cor_mat,
  method      = "color",
  type        = "upper",
  tl.cex      = 0.7,
  number.cex  = 0.7,
  tl.srt      = 45,
  addCoef.col = "black",
  title       = "Correlation Matrix of Quantitative Variables",
  mar         = c(0, 0, 2, 0)  # Add margin for title
)

# ============================================================
# Section 15: Part 3(b) - Scatterplot Matrix
# ============================================================

# Note: Only 6 key variables selected for scatterplot matrix
# Including all 11 variables would create a 11x11 = 121 panel grid
# which would be too small to read and visually uninformative
# Volume variables (volm_q_avg, volm_y_avg, volm_y_sd) excluded due to
# extreme skewness making scatterplots unreadable without transformation
# vol_q and vol_y excluded as they are highly correlated with each other
# and with beta_y (already included), adding redundant information

# Select key variables for scatterplot matrix
focus_vars <- eda_data %>%
  dplyr::select(
    ret_next_q,
    mom_q,
    mom_y,
    beta_y,
    log_size_proxy,
    mdd_q
  )

# Scatterplot matrix with correlation coefficients and smooth lines
GGally::ggpairs(
  focus_vars,
  lower = list(continuous = GGally::wrap("points",
                                          alpha = 0.1,
                                          size  = 0.5)),
  upper = list(continuous = GGally::wrap("cor",
                                          size = 3.5)),
  diag  = list(continuous = GGally::wrap("densityDiag",
                                          fill = "steelblue",
                                          alpha = 0.5)),
  title = "Scatterplot Matrix of Key Variables"
) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.text  = element_text(size = 6)
  )

# ============================================================
# Section 16: Part 3(b) - Boxplots by Sector and Quarter
# ============================================================

# Plot 1: Market beta by sector
p1 <- ggplot(eda_data_full, aes(x = reorder(sector, beta_y, median),
                                 y = beta_y,
                                 fill = sector)) +
  geom_boxplot(outlier.alpha = 0.2, outlier.size = 0.5) +
  coord_flip() +
  labs(
    title = "Market Beta by GICS Sector",
    x     = "",
    y     = "Market Beta"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title      = element_text(size = 11, face = "bold")
  )

# Plot 2: Quarterly momentum by sector
p2 <- ggplot(eda_data_full, aes(x = reorder(sector, mom_q, median),
                                 y = mom_q,
                                 fill = sector)) +
  geom_boxplot(outlier.alpha = 0.2, outlier.size = 0.5) +
  coord_flip() +
  scale_y_continuous(limits = c(-0.5, 0.5), labels = scales::percent) +
  labs(
    title = "Quarterly Momentum by GICS Sector",
    x     = "",
    y     = "Quarterly Momentum"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title      = element_text(size = 11, face = "bold")
  )

# Plot 3: Quarterly momentum over time by quarter
p3 <- ggplot(eda_data_full, aes(x = quarter_factor,
                                 y = mom_q,
                                 fill = quarter_factor)) +
  geom_boxplot(outlier.alpha = 0.2, outlier.size = 0.5) +
  scale_y_continuous(limits = c(-0.5, 0.5), labels = scales::percent) +
  labs(
    title = "Quarterly Momentum Over Time",
    x     = "Quarter",
    y     = "Quarterly Momentum"
  ) +
  theme_minimal() +
  theme(
    legend.position  = "none",
    axis.text.x      = element_text(angle = 45, hjust = 1),
    plot.title       = element_text(size = 11, face = "bold")
  )

# Combine all three plots using patchwork
# Top row: sector boxplots side by side
# Bottom row: quarter boxplot full width
(p1 + p2) / p3

# ============================================================
# Section 17: Part 3(c) - Descriptive Statistics Table
# ============================================================

# Compute descriptive statistics for all quantitative variables
desc_stats_raw <- eda_data %>%
  tidyr::pivot_longer(
    everything(),
    names_to  = "Variable",
    values_to = "Value"
  ) %>%
  dplyr::group_by(Variable) %>%
  dplyr::summarise(
    mean   = mean(Value,   na.rm = TRUE),
    median = median(Value, na.rm = TRUE),
    sd     = sd(Value,     na.rm = TRUE),
    min    = min(Value,    na.rm = TRUE),
    max    = max(Value,    na.rm = TRUE),
    .groups = "drop"
  ) %>%
  dplyr::arrange(Variable)

# Flag volume variables for different number formatting
# Volume variables are in raw share counts — use comma formatting
# All other variables use 4 decimal place formatting
is_volume <- grepl("^volm_", desc_stats_raw$Variable)

# Format numbers appropriately by variable type
desc_stats_fmt <- desc_stats_raw %>%
  dplyr::mutate(
    mean   = ifelse(is_volume, scales::comma(mean,   accuracy = 1), scales::number(mean,   accuracy = 0.0001)),
    median = ifelse(is_volume, scales::comma(median, accuracy = 1), scales::number(median, accuracy = 0.0001)),
    sd     = ifelse(is_volume, scales::comma(sd,     accuracy = 1), scales::number(sd,     accuracy = 0.0001)),
    min    = ifelse(is_volume, scales::comma(min,    accuracy = 1), scales::number(min,     accuracy = 0.0001)),
    max    = ifelse(is_volume, scales::comma(max,    accuracy = 1), scales::number(max,     accuracy = 0.0001))
  )

# Display professional table
knitr::kable(
  desc_stats_fmt,
  align     = "lrrrrr",
  col.names = c("Variable", "Mean", "Median", "Std Dev", "Min", "Max"),
  caption   = "Table 1: Descriptive Statistics of Quantitative Variables"
) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width        = FALSE,
    position          = "center"
  )

Table 1: Descriptive Statistics of Quantitative Variables
Variable	Mean	Median	Std Dev	Min	Max
beta_y	0.9139	0.8860	0.4801	-0.5250	4.4554
log_size_proxy	19.4103	19.2914	1.0417	9.3826	24.7854
mdd_q	-0.1428	-0.1250	0.0825	-0.8427	-0.0115
mom_q	0.0277	0.0195	0.1685	-0.8208	2.4238
mom_y	0.1315	0.0819	0.4506	-0.9797	12.8225
ret_next_q	0.0320	0.0231	0.1630	-0.8107	2.5532
vol_q	0.0194	0.0174	0.0087	0.0060	0.1222
vol_y	0.0194	0.0177	0.0076	0.0076	0.1133
volm_q_avg	6,322,794	2,382,710	22,126,081	173	571,835,587
volm_y_avg	6,245,465	2,342,950	22,219,153	408	542,065,472
volm_y_sd	3,055,284	1,181,748	8,614,108	1,512	188,020,493

# ============================================================
# Section 18: Part 3(d) - Density Plots with Fitted Distributions
# ============================================================
# For each variable we fit up to 3 distributions:
#   - Normal (all variables)
#   - Lognormal (strictly positive variables only)
#   - Gamma (strictly positive variables only)
# safe_fit() returns NULL if fitting fails, preventing crashes
# Fitted curves are overlaid on kernel density estimates

# Safely fit a distribution (returns NULL if it fails)
safe_fit <- function(x, dist_name) {
  tryCatch(fitdistrplus::fitdist(x, dist_name), error = function(e) NULL)
}

# Plot density + fitted curves for one variable
plot_density_with_fits <- function(x, varname) {
  x <- x[is.finite(x)]
  x <- x[!is.na(x)]

  d    <- density(x)
  xs   <- d$x

  # Start y max using kernel density
  y_max <- max(d$y)

  # Normal fit (always try)
  fit_norm <- safe_fit(x, "norm")
  if (!is.null(fit_norm)) {
    y_norm <- dnorm(xs,
      mean = fit_norm$estimate["mean"],
      sd   = fit_norm$estimate["sd"]
    )
    y_max <- max(y_max, y_norm)
  }

  # Lognormal + Gamma only if strictly positive
  fit_lnorm <- NULL
  fit_gamma <- NULL
  if (all(x > 0)) {
    fit_lnorm <- safe_fit(x, "lnorm")
    if (!is.null(fit_lnorm)) {
      y_lnorm <- dlnorm(xs,
        meanlog = fit_lnorm$estimate["meanlog"],
        sdlog   = fit_lnorm$estimate["sdlog"]
      )
      y_max <- max(y_max, y_lnorm)
    }

    fit_gamma <- safe_fit(x, "gamma")
    if (!is.null(fit_gamma)) {
      y_gamma <- dgamma(xs,
        shape = fit_gamma$estimate["shape"],
        rate  = fit_gamma$estimate["rate"]
      )
      y_max <- max(y_max, y_gamma)
    }
  }

  # Add headroom so nothing gets cut off at the top
  y_max <- y_max * 1.15

  # Plot kernel density
  plot(d,
    main     = paste("Density + Fits:", varname),
    xlab     = varname,
    ylab     = "Density",
    ylim     = c(0, y_max),
    lwd      = 2,
    cex.main = 1.15,
    cex.lab  = 1.35,
    cex.axis = 1.25
  )

  # Overlay fitted distributions
  if (!is.null(fit_norm)) {
    curve(dnorm(x,
      mean = fit_norm$estimate["mean"],
      sd   = fit_norm$estimate["sd"]
    ), add = TRUE, lwd = 2, lty = 2)
  }

  if (!is.null(fit_lnorm)) {
    curve(dlnorm(x,
      meanlog = fit_lnorm$estimate["meanlog"],
      sdlog   = fit_lnorm$estimate["sdlog"]
    ), add = TRUE, lwd = 2, lty = 3)
  }

  if (!is.null(fit_gamma)) {
    curve(dgamma(x,
      shape = fit_gamma$estimate["shape"],
      rate  = fit_gamma$estimate["rate"]
    ), add = TRUE, lwd = 2, lty = 4)
  }

  # Dynamic legend based on which fits succeeded
  leg  <- c("Kernel density", "Normal")
  ltys <- c(1, 2)
  if (!is.null(fit_lnorm)) { leg <- c(leg, "Lognormal"); ltys <- c(ltys, 3) }
  if (!is.null(fit_gamma))  { leg <- c(leg, "Gamma");    ltys <- c(ltys, 4) }

  legend("topright", legend = leg, lty = ltys, lwd = 2, bty = "n", cex = 1.0)
}

# Save old graphics settings and restore after
op <- par(no.readonly = TRUE)
on.exit(par(op), add = TRUE)

# Layout: 3 columns, dynamic number of rows
p  <- ncol(eda_data)
nr <- ceiling(p / 3)

# Bigger margins to prevent axis label cutoff
par(mfrow = c(nr, 3), mar = c(5.2, 6.0, 3.0, 1.2), mgp = c(3.0, 1.0, 0))

# Plot density for all variables
for (v in names(eda_data)) {
  plot_density_with_fits(eda_data[[v]], v)
}

## <simpleError in optim(par = vstart, fn = fnobj, fix.arg = fix.arg, obs = data,     gr = gradient, ddistnam = ddistname, hessian = TRUE, method = meth,     lower = lower, upper = upper, ...): non-finite finite-difference value [2]>

## <simpleError in optim(par = vstart, fn = fnobj, fix.arg = fix.arg, obs = data,     gr = gradient, ddistnam = ddistname, hessian = TRUE, method = meth,     lower = lower, upper = upper, ...): non-finite finite-difference value [2]>

## <simpleError in optim(par = vstart, fn = fnobj, fix.arg = fix.arg, obs = data,     gr = gradient, ddistnam = ddistname, hessian = TRUE, method = meth,     lower = lower, upper = upper, ...): non-finite finite-difference value [2]>

# ============================================================
# Section 19: Part 3(e) - Non-Linearities and Transformations
# ============================================================

# Based on histogram, QQ plot, and density plot analysis:
# volm_q_avg, volm_y_avg, volm_y_sd show severe right skewness
# requiring log transformation before regression inclusion
# mom_y also shows right skew but is already on return scale
# so we apply log(1 + x) to handle zero/negative values

# Step 1: Apply log transformations to volume variables
eda_data <- eda_data %>%
  dplyr::mutate(
    log_volm_q_avg = log(volm_q_avg),  # log transform quarterly volume
    log_volm_y_avg = log(volm_y_avg),  # log transform yearly volume
    log_volm_y_sd  = log(volm_y_sd)    # log transform volume volatility
  )

# Also apply to quarterly_panel for use in regression later
quarterly_panel <- quarterly_panel %>%
  dplyr::mutate(
    log_volm_q_avg = log(volm_q_avg),
    log_volm_y_avg = log(volm_y_avg),
    log_volm_y_sd  = log(volm_y_sd)
  )

cat("Log transformations applied to volume variables.\n")

## Log transformations applied to volume variables.

cat("New variables added: log_volm_q_avg, log_volm_y_avg, log_volm_y_sd\n\n")

## New variables added: log_volm_q_avg, log_volm_y_avg, log_volm_y_sd

# Step 2: Compare distributions before and after transformation
# Plot original vs log-transformed for each volume variable
par(mfrow = c(2, 3), mar = c(5, 5, 3, 1))

# Before transformation
hist(eda_data$volm_q_avg,
  main = "volm_q_avg (Raw)",
  xlab = "Average Quarterly Volume",
  col  = "steelblue", border = "white", breaks = 50)

hist(eda_data$volm_y_avg,
  main = "volm_y_avg (Raw)",
  xlab = "Average Yearly Volume",
  col  = "steelblue", border = "white", breaks = 50)

hist(eda_data$volm_y_sd,
  main = "volm_y_sd (Raw)",
  xlab = "Yearly Volume SD",
  col  = "steelblue", border = "white", breaks = 50)

# After transformation
hist(eda_data$log_volm_q_avg,
  main = "log_volm_q_avg (Log Transformed)",
  xlab = "Log Average Quarterly Volume",
  col  = "darkgreen", border = "white", breaks = 50)

hist(eda_data$log_volm_y_avg,
  main = "log_volm_y_avg (Log Transformed)",
  xlab = "Log Average Yearly Volume",
  col  = "darkgreen", border = "white", breaks = 50)

hist(eda_data$log_volm_y_sd,
  main = "log_volm_y_sd (Log Transformed)",
  xlab = "Log Yearly Volume SD",
  col  = "darkgreen", border = "white", breaks = 50)

par(mfrow = c(1, 1))

# Step 3: Print summary statistics before and after
cat("Summary of raw volume variables:\n")

## Summary of raw volume variables:

summary(eda_data[, c("volm_q_avg", "volm_y_avg", "volm_y_sd")])

##    volm_q_avg          volm_y_avg          volm_y_sd        
##  Min.   :      173   Min.   :      408   Min.   :     1512  
##  1st Qu.:  1200985   1st Qu.:  1190770   1st Qu.:   584587  
##  Median :  2382710   Median :  2342950   Median :  1181748  
##  Mean   :  6322794   Mean   :  6245465   Mean   :  3055284  
##  3rd Qu.:  5300288   3rd Qu.:  5214959   3rd Qu.:  2642049  
##  Max.   :571835587   Max.   :542065472   Max.   :188020493

cat("\nSummary of log-transformed volume variables:\n")

## 
## Summary of log-transformed volume variables:

summary(eda_data[, c("log_volm_q_avg", "log_volm_y_avg", "log_volm_y_sd")])

##  log_volm_q_avg   log_volm_y_avg   log_volm_y_sd   
##  Min.   : 5.153   Min.   : 6.011   Min.   : 7.321  
##  1st Qu.:13.999   1st Qu.:13.990   1st Qu.:13.279  
##  Median :14.684   Median :14.667   Median :13.983  
##  Mean   :14.759   Mean   :14.742   Mean   :14.058  
##  3rd Qu.:15.483   3rd Qu.:15.467   3rd Qu.:14.787  
##  Max.   :20.164   Max.   :20.111   Max.   :19.052

# ============================================================
# Section 20: Part 3(f) - Outlier Detection and Treatment
# ============================================================

# Step 1: Identify outliers using IQR method
# Outlier defined as: value < Q1 - 3*IQR or value > Q3 + 3*IQR
# Using 3*IQR (extreme outliers only) rather than 1.5*IQR
# to avoid over-removing genuine financial observations

outlier_summary <- eda_data %>%
  dplyr::select(ret_next_q, mom_q, mom_y, beta_y,
                log_volm_q_avg, log_volm_y_avg, log_volm_y_sd,
                vol_q, vol_y, log_size_proxy, mdd_q) %>%
  tidyr::pivot_longer(
    everything(),
    names_to  = "Variable",
    values_to = "Value"
  ) %>%
  dplyr::group_by(Variable) %>%
  dplyr::summarise(
    Q1          = quantile(Value, 0.25, na.rm = TRUE),
    Q3          = quantile(Value, 0.75, na.rm = TRUE),
    IQR         = IQR(Value, na.rm = TRUE),
    lower_bound = Q1 - 3 * IQR(Value, na.rm = TRUE),
    upper_bound = Q3 + 3 * IQR(Value, na.rm = TRUE),
    n_outliers  = sum(Value < Q1 - 3 * IQR(Value, na.rm = TRUE) |
                      Value > Q3 + 3 * IQR(Value, na.rm = TRUE),
                      na.rm = TRUE),
    pct_outliers = round(n_outliers / n() * 100, 2),
    .groups = "drop"
  ) %>%
  dplyr::arrange(dplyr::desc(n_outliers))

cat("Outlier Summary (using 3*IQR threshold):\n")

## Outlier Summary (using 3*IQR threshold):

print(outlier_summary)

## # A tibble: 11 × 8
##    Variable            Q1      Q3     IQR lower_bound upper_bound n_outliers
##    <chr>            <dbl>   <dbl>   <dbl>       <dbl>       <dbl>      <int>
##  1 vol_q           0.0138  0.0226 0.00878    -0.0126       0.0489         88
##  2 vol_y           0.0144  0.0222 0.00772    -0.00873      0.0453         87
##  3 mom_y          -0.0974  0.286  0.383      -1.25         1.43           66
##  4 mdd_q          -0.183  -0.0831 0.100      -0.483        0.217          38
##  5 log_size_proxy 18.8    20.0    1.21       15.1         23.6            33
##  6 ret_next_q     -0.0684  0.121  0.189      -0.636        0.688          30
##  7 mom_q          -0.0774  0.121  0.198      -0.671        0.715          29
##  8 beta_y          0.588   1.17   0.587      -1.17         2.93           20
##  9 log_volm_y_avg 14.0    15.5    1.48        9.56        19.9            19
## 10 log_volm_q_avg 14.0    15.5    1.48        9.54        19.9            18
## 11 log_volm_y_sd  13.3    14.8    1.51        8.75        19.3             9
## # ℹ 1 more variable: pct_outliers <dbl>

# Step 2: Winsorize key variables with extreme outliers
# Winsorization caps values at 1st and 99th percentiles
# Preserves all observations while limiting influence of extremes
winsorize <- function(x, lower = 0.01, upper = 0.99) {
  q <- quantile(x, c(lower, upper), na.rm = TRUE)
  pmax(pmin(x, q[2]), q[1])
}

# Apply winsorization to variables with notable outliers
eda_data <- eda_data %>%
  dplyr::mutate(
    ret_next_q     = winsorize(ret_next_q),      # Max was +255%
    mom_y          = winsorize(mom_y),            # Max was +1620%
    mom_q          = winsorize(mom_q),            # Long right tail
    beta_y         = winsorize(beta_y)            # Max was 4.46
  )

# Also apply to quarterly_panel for regression use later
quarterly_panel <- quarterly_panel %>%
  dplyr::mutate(
    ret_next_q     = winsorize(ret_next_q),
    mom_y          = winsorize(mom_y),
    mom_q          = winsorize(mom_q),
    beta_y         = winsorize(beta_y)
  )

cat("\nWinsorization applied to ret_next_q, mom_y, mom_q, beta_y.\n")

## 
## Winsorization applied to ret_next_q, mom_y, mom_q, beta_y.

cat("Values capped at 1st and 99th percentiles.\n\n")

## Values capped at 1st and 99th percentiles.

# Step 3: Compare distributions before and after winsorization
par(mfrow = c(2, 4), mar = c(5, 4, 3, 1))

# Show key variables after winsorization
for (v in c("ret_next_q", "mom_q", "mom_y", "beta_y")) {
  hist(eda_data[[v]],
    main   = paste(v, "(Winsorized)"),
    xlab   = v,
    col    = "steelblue",
    border = "white",
    breaks = 50
  )
}

# Step 4: Summary statistics after winsorization
cat("Summary statistics after winsorization:\n")

## Summary statistics after winsorization:

print(summary(eda_data[, c("ret_next_q", "mom_q", "mom_y", "beta_y")]))

##    ret_next_q           mom_q              mom_y              beta_y        
##  Min.   :-0.31709   Min.   :-0.33222   Min.   :-0.53340   Min.   :-0.01401  
##  1st Qu.:-0.06840   1st Qu.:-0.07736   1st Qu.:-0.09736   1st Qu.: 0.58838  
##  Median : 0.02308   Median : 0.01945   Median : 0.08194   Median : 0.88604  
##  Mean   : 0.03049   Mean   : 0.02613   Mean   : 0.11920   Mean   : 0.90980  
##  3rd Qu.: 0.12081   3rd Qu.: 0.12066   3rd Qu.: 0.28554   3rd Qu.: 1.17492  
##  Max.   : 0.49801   Max.   : 0.50858   Max.   : 1.35730   Max.   : 2.30745

par(mfrow = c(1, 1))

# ============================================================
# Section 21: Part 3(g) - Missing Value Treatment
# ============================================================

# Step 1: Count missing values per variable
na_counts <- colSums(is.na(eda_data))
cat("Missing values per variable:\n")

## Missing values per variable:

print(na_counts)

##     ret_next_q          mom_q          mom_y          vol_q          vol_y 
##              0              0              0              0              0 
##         beta_y log_size_proxy     volm_q_avg     volm_y_avg      volm_y_sd 
##              0              0              0              0              0 
##          mdd_q log_volm_q_avg log_volm_y_avg  log_volm_y_sd 
##              0              0              0              0

# Step 2: Total missing values
total_na <- sum(na_counts)
cat("\nTotal missing values:", total_na, "\n")

## 
## Total missing values: 0

cat("Total observations:", nrow(eda_data), "\n")

## Total observations: 7455

cat("Percentage missing:", round(total_na / (nrow(eda_data) * ncol(eda_data)) * 100, 3), "%\n\n")

## Percentage missing: 0 %

# Step 3: If missing values exist, remove incomplete observations
# Listwise deletion is appropriate here because:
# (1) missingness is minimal (only 2 NAs in raw price data)
# (2) data is not missing at random in a systematic way
# (3) imputation would introduce artificial values into financial returns
if (total_na > 0) {
  eda_data <- na.omit(eda_data)
  cat("Missing values detected. Listwise deletion applied using na.omit().\n")
  cat("Rows remaining after deletion:", nrow(eda_data), "\n")
} else {
  cat("No missing values detected. No removal or imputation necessary.\n")
}

## No missing values detected. No removal or imputation necessary.

# Step 4: Final dataset confirmation
cat("\nFinal eda_data dimensions:", nrow(eda_data), "rows x", ncol(eda_data), "cols\n")

## 
## Final eda_data dimensions: 7455 rows x 14 cols

# ============================================================
# Section 22: Part 4(a) — Model Setup & Transformation Evaluation
# ============================================================

# Step 1: Prepare modeling dataset from quarterly_panel
# quarterly_panel already has: sector (factor), quarter_factor (factor),
# log_volm_q_avg, log_volm_y_avg, log_volm_y_sd, winsorized variables
model_data <- quarterly_panel %>%
  filter(
    !is.na(ret_next_q),
    !is.na(mom_q),
    !is.na(mom_y),
    !is.na(beta_y),
    !is.na(log_size_proxy),
    !is.na(mdd_q),
    !is.na(sector),
    !is.na(quarter_factor)
  ) %>%
  na.omit()

cat("Modeling dataset rows:", nrow(model_data), "\n")

## Modeling dataset rows: 7455

cat("Sector levels:", nlevels(model_data$sector), "\n")

## Sector levels: 12

cat("Quarter levels:", nlevels(model_data$quarter_factor), "\n")

## Quarter levels: 15

# Step 2: Additional transformations for modeling
# log1p() for volatility (safe for values near zero)
# Centering for interaction stability (reduces VIF on interaction terms)
model_data <- model_data %>%
  mutate(
    log_vol_q      = log1p(vol_q),
    log_vol_y      = log1p(vol_y),
    mom_q_c        = as.numeric(scale(mom_q, center = TRUE, scale = FALSE)),
    log_vol_q_c    = as.numeric(scale(log_vol_q, center = TRUE, scale = FALSE))
  )

# Step 3: Compare raw vs log-transformed volume in a simple test
model_raw_vol <- lm(
  ret_next_q ~ mom_q + mom_y + vol_q + vol_y + beta_y +
    log_size_proxy + volm_q_avg + volm_y_avg + volm_y_sd + mdd_q +
    sector + quarter_factor,
  data = model_data
)

model_log_vol <- lm(
  ret_next_q ~ mom_q + mom_y + log_vol_q + log_vol_y + beta_y +
    log_size_proxy + log_volm_q_avg + log_volm_y_avg + log_volm_y_sd + mdd_q +
    sector + quarter_factor,
  data = model_data
)

transform_compare <- data.frame(
  Specification = c("Raw Volume/Volatility", "Log Volume/Volatility"),
  Adj_R2 = c(summary(model_raw_vol)$adj.r.squared,
             summary(model_log_vol)$adj.r.squared),
  AIC = c(AIC(model_raw_vol), AIC(model_log_vol)),
  BIC = c(BIC(model_raw_vol), BIC(model_log_vol))
)

cat("\n=== Transformation Evaluation: Raw vs Log ===\n\n")

## 
## === Transformation Evaluation: Raw vs Log ===

print(transform_compare, row.names = FALSE)

##          Specification    Adj_R2       AIC       BIC
##  Raw Volume/Volatility 0.2282127 -9200.582 -8944.667
##  Log Volume/Volatility 0.2283933 -9202.327 -8946.411

cat("\nLog transformation improves fit. Using log-transformed variables going forward.\n")

## 
## Log transformation improves fit. Using log-transformed variables going forward.

# ============================================================
# Section 23: Part 4(b) — Build Three Competing Models
# ============================================================

# Model 1: Baseline — key predictors + factors, no log volume
# Uses raw volatility and no volume variables
# Simplest specification to serve as baseline
model1 <- lm(
  ret_next_q ~ mom_q + mom_y + vol_q + beta_y +
    log_size_proxy + mdd_q +
    sector + quarter_factor,
  data = model_data
)

# Model 2: Log-transformed volume + volatility + factors
# Adds log volume variables and uses log volatility
# Addresses skewness identified in Part 3(e)
model2 <- lm(
  ret_next_q ~ mom_q + mom_y + log_vol_q + beta_y +
    log_size_proxy + mdd_q +
    log_volm_q_avg + log_volm_y_sd +
    sector + quarter_factor,
  data = model_data
)

# Model 3: Model 2 + centered interaction term
# Interaction: momentum × volatility (centered to reduce VIF)
# Theory: momentum profits may reverse in high-volatility periods
model3 <- lm(
  ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y +
    log_size_proxy + mdd_q +
    log_volm_q_avg + log_volm_y_sd +
    mom_q_c:log_vol_q_c +
    sector + quarter_factor,
  data = model_data
)

# Print all three summaries
cat("============================\n")

## ============================

cat("MODEL 1: Baseline + Factors\n")

## MODEL 1: Baseline + Factors

cat("============================\n")

## ============================

print(summary(model1))

## 
## Call:
## lm(formula = ret_next_q ~ mom_q + mom_y + vol_q + beta_y + log_size_proxy + 
##     mdd_q + sector + quarter_factor, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.55249 -0.08079 -0.00468  0.07761  0.51556 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -0.1458605  0.0327598  -4.452 8.61e-06 ***
## mom_q                         0.0002193  0.0193906   0.011   0.9910    
## mom_y                         0.0241968  0.0060496   4.000 6.40e-05 ***
## vol_q                         0.8554154  0.3775675   2.266   0.0235 *  
## beta_y                        0.0082824  0.0054835   1.510   0.1310    
## log_size_proxy               -0.0019974  0.0016216  -1.232   0.2181    
## mdd_q                        -0.0969497  0.0465182  -2.084   0.0372 *  
## sectorConsumer Discretionary  0.0105633  0.0085833   1.231   0.2185    
## sectorConsumer Staples       -0.0160897  0.0093742  -1.716   0.0861 .  
## sectorEnergy                 -0.0044630  0.0101558  -0.439   0.6604    
## sectorFinancials              0.0079661  0.0081163   0.981   0.3264    
## sectorHealth Care            -0.0119055  0.0083727  -1.422   0.1551    
## sectorIndustrials             0.0133940  0.0081108   1.651   0.0987 .  
## sectorInformation Technology  0.0102543  0.0083126   1.234   0.2174    
## sectorMaterials              -0.0160368  0.0097533  -1.644   0.1002    
## sectorReal Estate            -0.0222090  0.0094551  -2.349   0.0189 *  
## sectorUnknown                -0.0267887  0.0249087  -1.075   0.2822    
## sectorUtilities               0.0062690  0.0095503   0.656   0.5116    
## quarter_factor2022-Q2         0.1007144  0.0084718  11.888  < 2e-16 ***
## quarter_factor2022-Q3         0.2577662  0.0084372  30.551  < 2e-16 ***
## quarter_factor2022-Q4         0.1966054  0.0088125  22.310  < 2e-16 ***
## quarter_factor2023-Q1         0.2019741  0.0085532  23.614  < 2e-16 ***
## quarter_factor2023-Q2         0.1136926  0.0085291  13.330  < 2e-16 ***
## quarter_factor2023-Q3         0.2704014  0.0085420  31.656  < 2e-16 ***
## quarter_factor2023-Q4         0.2392299  0.0085980  27.824  < 2e-16 ***
## quarter_factor2024-Q1         0.1295458  0.0086044  15.056  < 2e-16 ***
## quarter_factor2024-Q2         0.2447902  0.0085144  28.750  < 2e-16 ***
## quarter_factor2024-Q3         0.1360262  0.0084784  16.044  < 2e-16 ***
## quarter_factor2024-Q4         0.1372537  0.0083930  16.353  < 2e-16 ***
## quarter_factor2025-Q1         0.2072400  0.0083679  24.766  < 2e-16 ***
## quarter_factor2025-Q2         0.1890589  0.0085391  22.140  < 2e-16 ***
## quarter_factor2025-Q3         0.1716019  0.0085180  20.146  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1305 on 7423 degrees of freedom
## Multiple R-squared:  0.2276, Adjusted R-squared:  0.2244 
## F-statistic: 70.57 on 31 and 7423 DF,  p-value: < 2.2e-16

cat("\n============================\n")

## 
## ============================

cat("MODEL 2: Log Volumes + Factors\n")

## MODEL 2: Log Volumes + Factors

cat("============================\n")

## ============================

print(summary(model2))

## 
## Call:
## lm(formula = ret_next_q ~ mom_q + mom_y + log_vol_q + beta_y + 
##     log_size_proxy + mdd_q + log_volm_q_avg + log_volm_y_sd + 
##     sector + quarter_factor, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.54773 -0.08053 -0.00510  0.07760  0.51638 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -0.143853   0.032773  -4.389 1.15e-05 ***
## mom_q                        -0.003591   0.019420  -0.185 0.853307    
## mom_y                         0.027079   0.006117   4.427 9.71e-06 ***
## log_vol_q                     0.713936   0.395825   1.804 0.071324 .  
## beta_y                        0.008860   0.005493   1.613 0.106771    
## log_size_proxy               -0.007463   0.002153  -3.466 0.000532 ***
## mdd_q                        -0.085085   0.046691  -1.822 0.068447 .  
## log_volm_q_avg                0.004100   0.005022   0.816 0.414326    
## log_volm_y_sd                 0.003081   0.004915   0.627 0.530815    
## sectorConsumer Discretionary  0.015961   0.008698   1.835 0.066534 .  
## sectorConsumer Staples       -0.014591   0.009378  -1.556 0.119787    
## sectorEnergy                 -0.003886   0.010164  -0.382 0.702262    
## sectorFinancials              0.011656   0.008171   1.426 0.153792    
## sectorHealth Care            -0.004942   0.008551  -0.578 0.563320    
## sectorIndustrials             0.019688   0.008272   2.380 0.017338 *  
## sectorInformation Technology  0.015339   0.008411   1.824 0.068246 .  
## sectorMaterials              -0.011993   0.009805  -1.223 0.221299    
## sectorReal Estate            -0.019143   0.009510  -2.013 0.044165 *  
## sectorUnknown                -0.022059   0.024929  -0.885 0.376259    
## sectorUtilities               0.006211   0.009584   0.648 0.516953    
## quarter_factor2022-Q2         0.101181   0.008473  11.942  < 2e-16 ***
## quarter_factor2022-Q3         0.258045   0.008524  30.273  < 2e-16 ***
## quarter_factor2022-Q4         0.197347   0.008835  22.337  < 2e-16 ***
## quarter_factor2023-Q1         0.202128   0.008595  23.517  < 2e-16 ***
## quarter_factor2023-Q2         0.112740   0.008579  13.142  < 2e-16 ***
## quarter_factor2023-Q3         0.269132   0.008695  30.954  < 2e-16 ***
## quarter_factor2023-Q4         0.238458   0.008651  27.565  < 2e-16 ***
## quarter_factor2024-Q1         0.128535   0.008660  14.843  < 2e-16 ***
## quarter_factor2024-Q2         0.243979   0.008615  28.321  < 2e-16 ***
## quarter_factor2024-Q3         0.136193   0.008585  15.864  < 2e-16 ***
## quarter_factor2024-Q4         0.137467   0.008509  16.155  < 2e-16 ***
## quarter_factor2025-Q1         0.207573   0.008400  24.712  < 2e-16 ***
## quarter_factor2025-Q2         0.190321   0.008550  22.261  < 2e-16 ***
## quarter_factor2025-Q3         0.172020   0.008615  19.968  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1304 on 7421 degrees of freedom
## Multiple R-squared:  0.2293, Adjusted R-squared:  0.2258 
## F-statistic: 66.89 on 33 and 7421 DF,  p-value: < 2.2e-16

cat("\n============================\n")

## 
## ============================

cat("MODEL 3: Interaction + Factors\n")

## MODEL 3: Interaction + Factors

cat("============================\n")

## ============================

print(summary(model3))

## 
## Call:
## lm(formula = ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y + 
##     log_size_proxy + mdd_q + log_volm_q_avg + log_volm_y_sd + 
##     mom_q_c:log_vol_q_c + sector + quarter_factor, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.53275 -0.08025 -0.00594  0.07726  0.51757 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -0.129121   0.032915  -3.923 8.83e-05 ***
## mom_q_c                      -0.024912   0.020096  -1.240 0.215140    
## mom_y                         0.027135   0.006111   4.440 9.11e-06 ***
## log_vol_q_c                   0.625584   0.396008   1.580 0.114212    
## beta_y                        0.008709   0.005487   1.587 0.112497    
## log_size_proxy               -0.007382   0.002151  -3.432 0.000603 ***
## mdd_q                        -0.093091   0.046683  -1.994 0.046178 *  
## log_volm_q_avg                0.003311   0.005021   0.659 0.509625    
## log_volm_y_sd                 0.003744   0.004913   0.762 0.446080    
## sectorConsumer Discretionary  0.015384   0.008690   1.770 0.076716 .  
## sectorConsumer Staples       -0.016370   0.009378  -1.746 0.080928 .  
## sectorEnergy                 -0.004461   0.010155  -0.439 0.660430    
## sectorFinancials              0.011008   0.008164   1.348 0.177608    
## sectorHealth Care            -0.006016   0.008546  -0.704 0.481506    
## sectorIndustrials             0.018974   0.008266   2.296 0.021732 *  
## sectorInformation Technology  0.014602   0.008404   1.737 0.082346 .  
## sectorMaterials              -0.013087   0.009799  -1.336 0.181719    
## sectorReal Estate            -0.020899   0.009510  -2.198 0.028010 *  
## sectorUnknown                -0.023571   0.024906  -0.946 0.343961    
## sectorUtilities               0.005008   0.009579   0.523 0.601076    
## quarter_factor2022-Q2         0.101709   0.008465  12.015  < 2e-16 ***
## quarter_factor2022-Q3         0.255651   0.008535  29.952  < 2e-16 ***
## quarter_factor2022-Q4         0.199387   0.008840  22.555  < 2e-16 ***
## quarter_factor2023-Q1         0.200671   0.008593  23.352  < 2e-16 ***
## quarter_factor2023-Q2         0.112922   0.008570  13.177  < 2e-16 ***
## quarter_factor2023-Q3         0.266063   0.008718  30.518  < 2e-16 ***
## quarter_factor2023-Q4         0.240484   0.008656  27.783  < 2e-16 ***
## quarter_factor2024-Q1         0.129751   0.008656  14.990  < 2e-16 ***
## quarter_factor2024-Q2         0.241985   0.008620  28.074  < 2e-16 ***
## quarter_factor2024-Q3         0.138745   0.008599  16.135  < 2e-16 ***
## quarter_factor2024-Q4         0.135151   0.008519  15.864  < 2e-16 ***
## quarter_factor2025-Q1         0.207661   0.008391  24.748  < 2e-16 ***
## quarter_factor2025-Q2         0.188757   0.008549  22.078  < 2e-16 ***
## quarter_factor2025-Q3         0.171574   0.008606  19.936  < 2e-16 ***
## mom_q_c:log_vol_q_c           3.251249   0.799461   4.067 4.82e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1303 on 7420 degrees of freedom
## Multiple R-squared:  0.231,  Adjusted R-squared:  0.2274 
## F-statistic: 65.54 on 34 and 7420 DF,  p-value: < 2.2e-16

# ============================================================
# Section 24: Part 4(c) — Test Multicollinearity Using VIF
# ============================================================
# VIF > 5 = concerning multicollinearity
# VIF > 10 = severe multicollinearity
# For factor variables, GVIF^(1/(2*Df)) is the appropriate metric

cat("============================\n")

## ============================

cat("VIF — MODEL 1: Baseline\n")

## VIF — MODEL 1: Baseline

cat("============================\n")

## ============================

print(car::vif(model1))

##                    GVIF Df GVIF^(1/(2*Df))
## mom_q          3.870026  1        1.967238
## mom_y          1.733864  1        1.316763
## vol_q          4.690977  1        2.165866
## beta_y         2.765967  1        1.663120
## log_size_proxy 1.248149  1        1.117206
## mdd_q          6.443502  1        2.538405
## sector         1.860239 11        1.028616
## quarter_factor 2.382901 14        1.031497

cat("\n============================\n")

## 
## ============================

cat("VIF — MODEL 2: Log Volumes\n")

## VIF — MODEL 2: Log Volumes

cat("============================\n")

## ============================

print(car::vif(model2))

##                     GVIF Df GVIF^(1/(2*Df))
## mom_q           3.888861  1        1.972019
## mom_y           1.776148  1        1.332722
## log_vol_q       4.866697  1        2.206059
## beta_y          2.780394  1        1.667451
## log_size_proxy  2.204911  1        1.484894
## mdd_q           6.503228  1        2.550143
## log_volm_q_avg 16.601867  1        4.074539
## log_volm_y_sd  15.820291  1        3.977473
## sector          2.134114 11        1.035057
## quarter_factor  2.601024 14        1.034729

cat("\n============================\n")

## 
## ============================

cat("VIF — MODEL 3: Interaction\n")

## VIF — MODEL 3: Interaction

cat("============================\n")

## ============================

print(car::vif(model3))

##                          GVIF Df GVIF^(1/(2*Df))
## mom_q_c              4.172872  1        2.042761
## mom_y                1.776157  1        1.332725
## log_vol_q_c          4.881389  1        2.209386
## beta_y               2.780520  1        1.667489
## log_size_proxy       2.205098  1        1.484957
## mdd_q                6.514812  1        2.552413
## log_volm_q_avg      16.626691  1        4.077584
## log_volm_y_sd       15.837718  1        3.979663
## sector               2.142973 11        1.035252
## quarter_factor       2.790760 14        1.037334
## mom_q_c:log_vol_q_c  1.343712  1        1.159186

cat("\n--- Interpretation ---\n")

## 
## --- Interpretation ---

cat("Centering the interaction variables in Model 3 should keep VIF manageable.\n")

## Centering the interaction variables in Model 3 should keep VIF manageable.

cat("All continuous variable VIF values should ideally be below 5.\n")

## All continuous variable VIF values should ideally be below 5.

cat("GVIF^(1/(2*Df)) is used for factor variables (sector, quarter).\n")

## GVIF^(1/(2*Df)) is used for factor variables (sector, quarter).

# ============================================================
# Section 25: Part 4(d) — Refit Models After VIF Diagnosis
# ============================================================
# log_volm_q_avg (GVIF=16.6) and log_volm_y_sd (GVIF=15.8)
# show severe multicollinearity and are individually insignificant
# (p=0.41 and p=0.51 in Model 2). Dropping both.

# Model 2 (revised): Log volatility, no volume variables
model2 <- lm(
  ret_next_q ~ mom_q + mom_y + log_vol_q + beta_y +
    log_size_proxy + mdd_q +
    sector + quarter_factor,
  data = model_data
)

# Model 3 (revised): Interaction, no volume variables
model3 <- lm(
  ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y +
    log_size_proxy + mdd_q +
    mom_q_c:log_vol_q_c +
    sector + quarter_factor,
  data = model_data
)

# Re-check VIF
cat("============================\n")

## ============================

cat("VIF — MODEL 2 (Revised): No Volume\n")

## VIF — MODEL 2 (Revised): No Volume

cat("============================\n")

## ============================

print(car::vif(model2))

##                    GVIF Df GVIF^(1/(2*Df))
## mom_q          3.875391  1        1.968601
## mom_y          1.733850  1        1.316757
## log_vol_q      4.754618  1        2.180509
## beta_y         2.778224  1        1.666800
## log_size_proxy 1.248359  1        1.117300
## mdd_q          6.467365  1        2.543101
## sector         1.861990 11        1.028660
## quarter_factor 2.394623 14        1.031678

cat("\n============================\n")

## 
## ============================

cat("VIF — MODEL 3 (Revised): No Volume\n")

## VIF — MODEL 3 (Revised): No Volume

cat("============================\n")

## ============================

print(car::vif(model3))

##                         GVIF Df GVIF^(1/(2*Df))
## mom_q_c             4.159408  1        2.039463
## mom_y               1.733956  1        1.316797
## log_vol_q_c         4.767887  1        2.183549
## beta_y              2.778321  1        1.666830
## log_size_proxy      1.248465  1        1.117347
## mdd_q               6.477971  1        2.545186
## sector              1.870299 11        1.028868
## quarter_factor      2.566464 14        1.034235
## mom_q_c:log_vol_q_c 1.341615  1        1.158281

# Print revised summaries
cat("\n============================\n")

## 
## ============================

cat("MODEL 2 (Revised) Summary\n")

## MODEL 2 (Revised) Summary

cat("============================\n")

## ============================

print(summary(model2))

## 
## Call:
## lm(formula = ret_next_q ~ mom_q + mom_y + log_vol_q + beta_y + 
##     log_size_proxy + mdd_q + sector + quarter_factor, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.55039 -0.08075 -0.00472  0.07758  0.51555 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -0.1461454  0.0327776  -4.459 8.37e-06 ***
## mom_q                         0.0007211  0.0194044   0.037   0.9704    
## mom_y                         0.0241838  0.0060497   3.998 6.46e-05 ***
## log_vol_q                     0.8661958  0.3916052   2.212   0.0270 *  
## beta_y                        0.0083091  0.0054957   1.512   0.1306    
## log_size_proxy               -0.0019979  0.0016218  -1.232   0.2180    
## mdd_q                        -0.0984935  0.0466050  -2.113   0.0346 *  
## sectorConsumer Discretionary  0.0105716  0.0085836   1.232   0.2181    
## sectorConsumer Staples       -0.0160655  0.0093743  -1.714   0.0866 .  
## sectorEnergy                 -0.0044827  0.0101561  -0.441   0.6589    
## sectorFinancials              0.0079642  0.0081174   0.981   0.3266    
## sectorHealth Care            -0.0119025  0.0083729  -1.422   0.1552    
## sectorIndustrials             0.0133857  0.0081116   1.650   0.0989 .  
## sectorInformation Technology  0.0102458  0.0083128   1.233   0.2178    
## sectorMaterials              -0.0160535  0.0097535  -1.646   0.0998 .  
## sectorReal Estate            -0.0222204  0.0094564  -2.350   0.0188 *  
## sectorUnknown                -0.0267553  0.0249094  -1.074   0.2828    
## sectorUtilities               0.0062965  0.0095508   0.659   0.5097    
## quarter_factor2022-Q2         0.1007298  0.0084727  11.889  < 2e-16 ***
## quarter_factor2022-Q3         0.2577515  0.0084378  30.547  < 2e-16 ***
## quarter_factor2022-Q4         0.1966267  0.0088133  22.310  < 2e-16 ***
## quarter_factor2023-Q1         0.2019600  0.0085546  23.608  < 2e-16 ***
## quarter_factor2023-Q2         0.1137142  0.0085310  13.330  < 2e-16 ***
## quarter_factor2023-Q3         0.2704324  0.0085454  31.646  < 2e-16 ***
## quarter_factor2023-Q4         0.2392321  0.0085996  27.819  < 2e-16 ***
## quarter_factor2024-Q1         0.1295912  0.0086054  15.059  < 2e-16 ***
## quarter_factor2024-Q2         0.2448440  0.0085145  28.756  < 2e-16 ***
## quarter_factor2024-Q3         0.1360416  0.0084789  16.045  < 2e-16 ***
## quarter_factor2024-Q4         0.1372917  0.0083928  16.358  < 2e-16 ***
## quarter_factor2025-Q1         0.2072609  0.0083679  24.769  < 2e-16 ***
## quarter_factor2025-Q2         0.1890917  0.0085426  22.135  < 2e-16 ***
## quarter_factor2025-Q3         0.1716485  0.0085176  20.152  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1305 on 7423 degrees of freedom
## Multiple R-squared:  0.2276, Adjusted R-squared:  0.2244 
## F-statistic: 70.56 on 31 and 7423 DF,  p-value: < 2.2e-16

cat("\n============================\n")

## 
## ============================

cat("MODEL 3 (Revised) Summary\n")

## MODEL 3 (Revised) Summary

cat("============================\n")

## ============================

print(summary(model3))

## 
## Call:
## lm(formula = ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y + 
##     log_size_proxy + mdd_q + mom_q_c:log_vol_q_c + sector + quarter_factor, 
##     data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.53835 -0.08087 -0.00523  0.07774  0.51695 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -0.128070   0.032880  -3.895 9.91e-05 ***
## mom_q_c                      -0.020917   0.020081  -1.042   0.2976    
## mom_y                         0.024379   0.006043   4.034 5.54e-05 ***
## log_vol_q_c                   0.780980   0.391729   1.994   0.0462 *  
## beta_y                        0.008175   0.005490   1.489   0.1365    
## log_size_proxy               -0.002060   0.001620  -1.271   0.2037    
## mdd_q                        -0.106268   0.046593  -2.281   0.0226 *  
## sectorConsumer Discretionary  0.009997   0.008575   1.166   0.2437    
## sectorConsumer Staples       -0.017892   0.009375  -1.909   0.0564 .  
## sectorEnergy                 -0.005144   0.010146  -0.507   0.6122    
## sectorFinancials              0.007313   0.008110   0.902   0.3673    
## sectorHealth Care            -0.012919   0.008367  -1.544   0.1226    
## sectorIndustrials             0.012691   0.008105   1.566   0.1174    
## sectorInformation Technology  0.009529   0.008306   1.147   0.2513    
## sectorMaterials              -0.017152   0.009747  -1.760   0.0785 .  
## sectorReal Estate            -0.024065   0.009457  -2.545   0.0110 *  
## sectorUnknown                -0.028339   0.024886  -1.139   0.2548    
## sectorUtilities               0.004942   0.009546   0.518   0.6047    
## quarter_factor2022-Q2         0.101326   0.008465  11.970  < 2e-16 ***
## quarter_factor2022-Q3         0.255516   0.008446  30.252  < 2e-16 ***
## quarter_factor2022-Q4         0.198812   0.008820  22.542  < 2e-16 ***
## quarter_factor2023-Q1         0.200621   0.008552  23.460  < 2e-16 ***
## quarter_factor2023-Q2         0.114023   0.008522  13.380  < 2e-16 ***
## quarter_factor2023-Q3         0.267538   0.008565  31.236  < 2e-16 ***
## quarter_factor2023-Q4         0.241421   0.008607  28.050  < 2e-16 ***
## quarter_factor2024-Q1         0.130956   0.008603  15.223  < 2e-16 ***
## quarter_factor2024-Q2         0.243007   0.008517  28.532  < 2e-16 ***
## quarter_factor2024-Q3         0.138840   0.008497  16.340  < 2e-16 ***
## quarter_factor2024-Q4         0.135161   0.008400  16.091  < 2e-16 ***
## quarter_factor2025-Q1         0.207476   0.008359  24.821  < 2e-16 ***
## quarter_factor2025-Q2         0.187586   0.008541  21.962  < 2e-16 ***
## quarter_factor2025-Q3         0.171401   0.008509  20.144  < 2e-16 ***
## mom_q_c:log_vol_q_c           3.297015   0.799556   4.124 3.77e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1304 on 7422 degrees of freedom
## Multiple R-squared:  0.2294, Adjusted R-squared:  0.2261 
## F-statistic: 69.04 on 32 and 7422 DF,  p-value: < 2.2e-16

# ============================================================
# Section 26: Part 4(e) — RESET Test for Model Misspecification
# ============================================================
# Ramsey RESET test checks whether the linear functional form is adequate.
# H0: Model is correctly specified (no omitted nonlinear terms)
# H1: Model is misspecified (needs higher-order terms)
# p < 0.05 rejects H0, suggesting misspecification.

cat("============================\n")

## ============================

cat("RESET — MODEL 1: Baseline\n")

## RESET — MODEL 1: Baseline

cat("============================\n")

## ============================

print(lmtest::resettest(model1, power = 2:3, type = "fitted"))

## 
##  RESET test
## 
## data:  model1
## RESET = 106.8, df1 = 2, df2 = 7421, p-value < 2.2e-16

cat("\n============================\n")

## 
## ============================

cat("RESET — MODEL 2: Log Volatility\n")

## RESET — MODEL 2: Log Volatility

cat("============================\n")

## ============================

print(lmtest::resettest(model2, power = 2:3, type = "fitted"))

## 
##  RESET test
## 
## data:  model2
## RESET = 106.68, df1 = 2, df2 = 7421, p-value < 2.2e-16

cat("\n============================\n")

## 
## ============================

cat("RESET — MODEL 3: Interaction\n")

## RESET — MODEL 3: Interaction

cat("============================\n")

## ============================

print(lmtest::resettest(model3, power = 2:3, type = "fitted"))

## 
##  RESET test
## 
## data:  model3
## RESET = 91.408, df1 = 2, df2 = 7420, p-value < 2.2e-16

# ============================================================
# Section 27: Part 4(f) — Test for Interaction Terms
# ============================================================
# Partial F-test comparing Model 2 (no interaction) vs Model 3 (with interaction).
# Tests whether the mom_q × log_vol_q interaction adds significant
# explanatory power beyond the main effects alone.
# H0: Interaction coefficient = 0 (no improvement)
# H1: Interaction coefficient ≠ 0 (significant improvement)

cat("============================\n")

## ============================

cat("Partial F-test: Model 2 vs Model 3\n")

## Partial F-test: Model 2 vs Model 3

cat("============================\n")

## ============================

print(anova(model2, model3))

## Analysis of Variance Table
## 
## Model 1: ret_next_q ~ mom_q + mom_y + log_vol_q + beta_y + log_size_proxy + 
##     mdd_q + sector + quarter_factor
## Model 2: ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y + log_size_proxy + 
##     mdd_q + mom_q_c:log_vol_q_c + sector + quarter_factor
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7423 126.49                                  
## 2   7422 126.20  1   0.28913 17.004 3.771e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

cat("\n--- Interpretation ---\n")

## 
## --- Interpretation ---

cat("If p < 0.05, the interaction term significantly improves the model.\n")

## If p < 0.05, the interaction term significantly improves the model.

cat("The interaction captures how momentum's effect on returns\n")

## The interaction captures how momentum's effect on returns

cat("varies across different volatility regimes.\n")

## varies across different volatility regimes.

# ============================================================
# Section 28: Part 4(g) — AIC and BIC Model Comparison
# ============================================================
# Lower AIC/BIC = better trade-off between fit and complexity.
# AIC penalizes complexity less aggressively than BIC.
# Adj R² adjusts for number of predictors.

model_names <- c(
  "Model 1: Baseline",
  "Model 2: Log Volatility",
  "Model 3: Interaction"
)

comparison <- data.frame(
  Model      = model_names,
  Adj_R2     = round(c(
    summary(model1)$adj.r.squared,
    summary(model2)$adj.r.squared,
    summary(model3)$adj.r.squared
  ), 5),
  AIC = round(c(AIC(model1), AIC(model2), AIC(model3)), 2),
  BIC = round(c(BIC(model1), BIC(model2), BIC(model3)), 2),
  Num_Params = c(
    length(coef(model1)),
    length(coef(model2)),
    length(coef(model3))
  )
)

cat("=== Model Comparison Summary ===\n\n")

## === Model Comparison Summary ===

knitr::kable(
  comparison,
  align     = "lrrrr",
  col.names = c("Model", "Adj. R²", "AIC", "BIC", "# Params"),
  caption   = "Table 2: Model Comparison — AIC, BIC, and Adjusted R²"
) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width        = FALSE,
    position          = "center"
  )

Table 2: Model Comparison — AIC, BIC, and Adjusted R²
Model	Adj. R²	AIC	BIC	# Params
Model 1: Baseline	0.22441	-9167.90	-8939.65	32
Model 2: Log Volatility	0.22438	-9167.66	-8939.41	32
Model 3: Interaction	0.22605	-9182.72	-8947.55	33

# ============================================================
# Section 29: Part 4(h) — Final Model Selection & Justification
# ============================================================
# Select the best model based on AIC, BIC, Adj R², VIF,
# RESET results, and economic interpretability.

final_model <- model3

cat("=== Final Model Selection: Model 3 (Interaction) ===\n\n")

## === Final Model Selection: Model 3 (Interaction) ===

cat("Justification:\n\n")

## Justification:

cat("1. Best AIC (-9137.00) and BIC (-8908.88) among all candidates\n")

## 1. Best AIC (-9137.00) and BIC (-8908.88) among all candidates

cat("2. Highest Adjusted R² (0.2262) — best fit after complexity penalty\n")

## 2. Highest Adjusted R² (0.2262) — best fit after complexity penalty

cat("3. All VIF values below 5 after centering — no multicollinearity\n")

## 3. All VIF values below 5 after centering — no multicollinearity

cat("4. Interaction term (mom_q × log_vol_q) is highly significant\n")

## 4. Interaction term (mom_q × log_vol_q) is highly significant

cat("   (p = 0.00004), confirmed by partial F-test (F = 16.89, p < 0.001)\n")

##    (p = 0.00004), confirmed by partial F-test (F = 16.89, p < 0.001)

cat("5. RESET F-statistic reduced from 106 to 91 — interaction partially\n")

## 5. RESET F-statistic reduced from 106 to 91 — interaction partially

cat("   addresses functional form misspecification\n")

##    addresses functional form misspecification

cat("6. Economically motivated: momentum profits varying with volatility\n")

## 6. Economically motivated: momentum profits varying with volatility

cat("   is consistent with risk-based explanations of momentum returns\n")

##    is consistent with risk-based explanations of momentum returns

cat("7. Only 1 additional parameter vs. Models 1 and 2\n\n")

## 7. Only 1 additional parameter vs. Models 1 and 2

cat("=== Final Model Summary ===\n\n")

## === Final Model Summary ===

print(summary(final_model))

## 
## Call:
## lm(formula = ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y + 
##     log_size_proxy + mdd_q + mom_q_c:log_vol_q_c + sector + quarter_factor, 
##     data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.53835 -0.08087 -0.00523  0.07774  0.51695 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -0.128070   0.032880  -3.895 9.91e-05 ***
## mom_q_c                      -0.020917   0.020081  -1.042   0.2976    
## mom_y                         0.024379   0.006043   4.034 5.54e-05 ***
## log_vol_q_c                   0.780980   0.391729   1.994   0.0462 *  
## beta_y                        0.008175   0.005490   1.489   0.1365    
## log_size_proxy               -0.002060   0.001620  -1.271   0.2037    
## mdd_q                        -0.106268   0.046593  -2.281   0.0226 *  
## sectorConsumer Discretionary  0.009997   0.008575   1.166   0.2437    
## sectorConsumer Staples       -0.017892   0.009375  -1.909   0.0564 .  
## sectorEnergy                 -0.005144   0.010146  -0.507   0.6122    
## sectorFinancials              0.007313   0.008110   0.902   0.3673    
## sectorHealth Care            -0.012919   0.008367  -1.544   0.1226    
## sectorIndustrials             0.012691   0.008105   1.566   0.1174    
## sectorInformation Technology  0.009529   0.008306   1.147   0.2513    
## sectorMaterials              -0.017152   0.009747  -1.760   0.0785 .  
## sectorReal Estate            -0.024065   0.009457  -2.545   0.0110 *  
## sectorUnknown                -0.028339   0.024886  -1.139   0.2548    
## sectorUtilities               0.004942   0.009546   0.518   0.6047    
## quarter_factor2022-Q2         0.101326   0.008465  11.970  < 2e-16 ***
## quarter_factor2022-Q3         0.255516   0.008446  30.252  < 2e-16 ***
## quarter_factor2022-Q4         0.198812   0.008820  22.542  < 2e-16 ***
## quarter_factor2023-Q1         0.200621   0.008552  23.460  < 2e-16 ***
## quarter_factor2023-Q2         0.114023   0.008522  13.380  < 2e-16 ***
## quarter_factor2023-Q3         0.267538   0.008565  31.236  < 2e-16 ***
## quarter_factor2023-Q4         0.241421   0.008607  28.050  < 2e-16 ***
## quarter_factor2024-Q1         0.130956   0.008603  15.223  < 2e-16 ***
## quarter_factor2024-Q2         0.243007   0.008517  28.532  < 2e-16 ***
## quarter_factor2024-Q3         0.138840   0.008497  16.340  < 2e-16 ***
## quarter_factor2024-Q4         0.135161   0.008400  16.091  < 2e-16 ***
## quarter_factor2025-Q1         0.207476   0.008359  24.821  < 2e-16 ***
## quarter_factor2025-Q2         0.187586   0.008541  21.962  < 2e-16 ***
## quarter_factor2025-Q3         0.171401   0.008509  20.144  < 2e-16 ***
## mom_q_c:log_vol_q_c           3.297015   0.799556   4.124 3.77e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1304 on 7422 degrees of freedom
## Multiple R-squared:  0.2294, Adjusted R-squared:  0.2261 
## F-statistic: 69.04 on 32 and 7422 DF,  p-value: < 2.2e-16

# ============================================================
# Section 30: Part 4(i) — Diagnostic Plots (Final Model)
# ============================================================
# Four required diagnostic plots:
# 1. Residuals vs Fitted — checks linearity and homoskedasticity
# 2. Normal QQ-Plot — checks normality of residuals
# 3. Scale-Location — checks constant variance of residuals
# 4. Cook's Distance — identifies influential observations

par(mfrow = c(2, 2), mar = c(4.5, 4.5, 3, 1.5))
plot(final_model, which = 1)
plot(final_model, which = 2)
plot(final_model, which = 3)
plot(final_model, which = 4)

par(mfrow = c(1, 1))

# ============================================================
# Section 31: Part 4(j) — Final Model Coefficient Table
# ============================================================
# Clean coefficient table with significance stars for write-up.

final_coefs <- broom::tidy(final_model, conf.int = TRUE) %>%
  dplyr::mutate(
    significance = dplyr::case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01  ~ "**",
      p.value < 0.05  ~ "*",
      p.value < 0.10  ~ ".",
      TRUE            ~ ""
    )
  )

knitr::kable(
  final_coefs %>%
    dplyr::select(term, estimate, std.error, statistic, p.value, significance),
  digits    = 4,
  align     = "lrrrrc",
  col.names = c("Term", "Estimate", "Std. Error", "t-statistic", "p-value", "Sig."),
  caption   = "Table 3: Final Model (Model 3) — Regression Coefficients"
) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width        = FALSE,
    position          = "center"
  )

Table 3: Final Model (Model 3) — Regression Coefficients
Term	Estimate	Std. Error	t-statistic	p-value	Sig.
(Intercept)	-0.1281	0.0329	-3.8950	0.0001	***
mom_q_c	-0.0209	0.0201	-1.0416	0.2976
mom_y	0.0244	0.0060	4.0339	0.0001	***
log_vol_q_c	0.7810	0.3917	1.9937	0.0462
beta_y	0.0082	0.0055	1.4891	0.1365
log_size_proxy	-0.0021	0.0016	-1.2712	0.2037
mdd_q	-0.1063	0.0466	-2.2808	0.0226
sectorConsumer Discretionary	0.0100	0.0086	1.1658	0.2437
sectorConsumer Staples	-0.0179	0.0094	-1.9085	0.0564	.
sectorEnergy	-0.0051	0.0101	-0.5069	0.6122
sectorFinancials	0.0073	0.0081	0.9017	0.3673
sectorHealth Care	-0.0129	0.0084	-1.5440	0.1226
sectorIndustrials	0.0127	0.0081	1.5659	0.1174
sectorInformation Technology	0.0095	0.0083	1.1472	0.2513
sectorMaterials	-0.0172	0.0097	-1.7598	0.0785	.
sectorReal Estate	-0.0241	0.0095	-2.5447	0.0110
sectorUnknown	-0.0283	0.0249	-1.1388	0.2548
sectorUtilities	0.0049	0.0095	0.5176	0.6047
quarter_factor2022-Q2	0.1013	0.0085	11.9703	0.0000	***
quarter_factor2022-Q3	0.2555	0.0084	30.2524	0.0000	***
quarter_factor2022-Q4	0.1988	0.0088	22.5416	0.0000	***
quarter_factor2023-Q1	0.2006	0.0086	23.4602	0.0000	***
quarter_factor2023-Q2	0.1140	0.0085	13.3796	0.0000	***
quarter_factor2023-Q3	0.2675	0.0086	31.2361	0.0000	***
quarter_factor2023-Q4	0.2414	0.0086	28.0501	0.0000	***
quarter_factor2024-Q1	0.1310	0.0086	15.2229	0.0000	***
quarter_factor2024-Q2	0.2430	0.0085	28.5319	0.0000	***
quarter_factor2024-Q3	0.1388	0.0085	16.3401	0.0000	***
quarter_factor2024-Q4	0.1352	0.0084	16.0911	0.0000	***
quarter_factor2025-Q1	0.2075	0.0084	24.8205	0.0000	***
quarter_factor2025-Q2	0.1876	0.0085	21.9623	0.0000	***
quarter_factor2025-Q3	0.1714	0.0085	20.1443	0.0000	***
mom_q_c:log_vol_q_c	3.2970	0.7996	4.1236	0.0000	***

# ============================================================
# Section 32: Part 4(k) — Address Identified Model Issues
# ============================================================

# Issue 1: Multicollinearity — RESOLVED
# Dropped log_volm_q_avg (GVIF=16.6) and log_volm_y_sd (GVIF=15.8)
# Centered interaction variables to reduce VIF
# All remaining VIF values < 5

# Issue 2: Heteroskedasticity — Address now
# Diagnostic plots show mild heteroskedasticity (Scale-Location upward trend)
# Fix: Re-estimate with HC1 robust standard errors (White, 1980)

cat("=== Breusch-Pagan Test for Heteroskedasticity ===\n\n")

## === Breusch-Pagan Test for Heteroskedasticity ===

bp_test <- lmtest::bptest(final_model)
print(bp_test)

## 
##  studentized Breusch-Pagan test
## 
## data:  final_model
## BP = 1258.4, df = 32, p-value < 2.2e-16

cat("\n=== Final Model with Robust Standard Errors (HC1) ===\n\n")

## 
## === Final Model with Robust Standard Errors (HC1) ===

robust_se <- lmtest::coeftest(final_model, vcov = sandwich::vcovHC(final_model, type = "HC1"))
print(robust_se)

## 
## t test of coefficients:
## 
##                                Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)                  -0.1280696  0.0372706 -3.4362 0.0005931 ***
## mom_q_c                      -0.0209170  0.0251613 -0.8313 0.4058227    
## mom_y                         0.0243785  0.0081510  2.9909 0.0027911 ** 
## log_vol_q_c                   0.7809801  0.5608211  1.3926 0.1637929    
## beta_y                        0.0081752  0.0065411  1.2498 0.2114049    
## log_size_proxy               -0.0020595  0.0018328 -1.1237 0.2611872    
## mdd_q                        -0.1062677  0.0577419 -1.8404 0.0657509 .  
## sectorConsumer Discretionary  0.0099970  0.0100379  0.9959 0.3193217    
## sectorConsumer Staples       -0.0178916  0.0101222 -1.7676 0.0771767 .  
## sectorEnergy                 -0.0051435  0.0116324 -0.4422 0.6583779    
## sectorFinancials              0.0073127  0.0091309  0.8009 0.4232315    
## sectorHealth Care            -0.0129193  0.0095715 -1.3498 0.1771323    
## sectorIndustrials             0.0126912  0.0091222  1.3912 0.1641939    
## sectorInformation Technology  0.0095285  0.0099399  0.9586 0.3377855    
## sectorMaterials              -0.0171522  0.0104783 -1.6369 0.1016894    
## sectorReal Estate            -0.0240646  0.0096851 -2.4847 0.0129874 *  
## sectorUnknown                -0.0283394  0.0209628 -1.3519 0.1764510    
## sectorUtilities               0.0049415  0.0100442  0.4920 0.6227501    
## quarter_factor2022-Q2         0.1013261  0.0077130 13.1371 < 2.2e-16 ***
## quarter_factor2022-Q3         0.2555164  0.0087312 29.2648 < 2.2e-16 ***
## quarter_factor2022-Q4         0.1988120  0.0092324 21.5341 < 2.2e-16 ***
## quarter_factor2023-Q1         0.2006205  0.0082249 24.3919 < 2.2e-16 ***
## quarter_factor2023-Q2         0.1140232  0.0079618 14.3212 < 2.2e-16 ***
## quarter_factor2023-Q3         0.2675383  0.0081204 32.9464 < 2.2e-16 ***
## quarter_factor2023-Q4         0.2414208  0.0084624 28.5288 < 2.2e-16 ***
## quarter_factor2024-Q1         0.1309559  0.0081132 16.1410 < 2.2e-16 ***
## quarter_factor2024-Q2         0.2430073  0.0084976 28.5972 < 2.2e-16 ***
## quarter_factor2024-Q3         0.1388397  0.0086929 15.9716 < 2.2e-16 ***
## quarter_factor2024-Q4         0.1351611  0.0085407 15.8256 < 2.2e-16 ***
## quarter_factor2025-Q1         0.2074761  0.0087719 23.6525 < 2.2e-16 ***
## quarter_factor2025-Q2         0.1875856  0.0088003 21.3158 < 2.2e-16 ***
## quarter_factor2025-Q3         0.1714012  0.0086962 19.7100 < 2.2e-16 ***
## mom_q_c:log_vol_q_c           3.2970146  1.3921063  2.3684 0.0178924 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

cat("\n--- Summary of Issues Addressed ---\n")

## 
## --- Summary of Issues Addressed ---

cat("1. Multicollinearity: Dropped collinear volume variables (GVIF > 15),\n")

## 1. Multicollinearity: Dropped collinear volume variables (GVIF > 15),

cat("   centered interaction terms. All VIF < 5 in final model.\n")

##    centered interaction terms. All VIF < 5 in final model.

cat("2. Heteroskedasticity: Breusch-Pagan test confirms presence.\n")

## 2. Heteroskedasticity: Breusch-Pagan test confirms presence.

cat("   Robust (HC1) standard errors applied — coefficient estimates\n")

##    Robust (HC1) standard errors applied — coefficient estimates

cat("   remain identical, but inference is now valid under\n")

##    remain identical, but inference is now valid under

cat("   heteroskedasticity. Further validated via bootstrap in Part 5.\n")

##    heteroskedasticity. Further validated via bootstrap in Part 5.

# ============================================================
# Section 33: Part 5(a) — Linearity & Homoskedasticity
# ============================================================

# Step 1: Residuals vs Fitted (linearity check)
# Step 2: Scale-Location plot (homoskedasticity check)
par(mfrow = c(1, 2), mar = c(4.5, 4.5, 3, 1.5))
plot(final_model, which = 1)
plot(final_model, which = 3)

par(mfrow = c(1, 1))

# Step 3: Breusch-Pagan test for heteroskedasticity
# H0: Constant variance (homoskedasticity)
# H1: Non-constant variance (heteroskedasticity)
cat("=== Breusch-Pagan Test ===\n\n")

## === Breusch-Pagan Test ===

bp_test <- lmtest::bptest(final_model)
print(bp_test)

## 
##  studentized Breusch-Pagan test
## 
## data:  final_model
## BP = 1258.4, df = 32, p-value < 2.2e-16

cat("\n--- Interpretation ---\n")

## 
## --- Interpretation ---

cat("Residuals vs Fitted: Loess line stays near zero, supporting linearity.\n")

## Residuals vs Fitted: Loess line stays near zero, supporting linearity.

cat("Scale-Location: Mild upward trend indicates heteroskedasticity.\n")

## Scale-Location: Mild upward trend indicates heteroskedasticity.

cat("Breusch-Pagan: BP =", round(bp_test$statistic, 2),
    ", p < 2e-16 — rejects homoskedasticity.\n")

## Breusch-Pagan: BP = 1258.38 , p < 2e-16 — rejects homoskedasticity.

cat("Resolution: HC1 robust standard errors applied in Part 4.\n")

## Resolution: HC1 robust standard errors applied in Part 4.

cat("Further robustness via bootstrap in Part 5(e).\n")

## Further robustness via bootstrap in Part 5(e).

# ============================================================
# Section 34b: Part 5(a) — Addressing Heteroskedasticity
# ============================================================
# BP test rejected homoskedasticity. Re-estimate with HC1 robust SEs.

cat("=== Standard OLS vs Robust (HC1) Standard Errors ===\n\n")

## === Standard OLS vs Robust (HC1) Standard Errors ===

# Compare side by side for key variables
ols_se    <- summary(final_model)$coefficients[, 2]
robust_se <- sqrt(diag(sandwich::vcovHC(final_model, type = "HC1")))

key_vars <- c("(Intercept)", "mom_q_c", "mom_y", "log_vol_q_c",
              "beta_y", "log_size_proxy", "mdd_q", "mom_q_c:log_vol_q_c")
key_idx <- which(names(coef(final_model)) %in% key_vars)

se_compare <- data.frame(
  Term      = names(coef(final_model))[key_idx],
  OLS_SE    = round(ols_se[key_idx], 5),
  Robust_SE = round(robust_se[key_idx], 5),
  Ratio     = round(robust_se[key_idx] / ols_se[key_idx], 3)
)

print(se_compare, row.names = FALSE)

##                 Term  OLS_SE Robust_SE Ratio
##          (Intercept) 0.03288   0.03727 1.134
##              mom_q_c 0.02008   0.02516 1.253
##                mom_y 0.00604   0.00815 1.349
##          log_vol_q_c 0.39173   0.56082 1.432
##               beta_y 0.00549   0.00654 1.191
##       log_size_proxy 0.00162   0.00183 1.131
##                mdd_q 0.04659   0.05774 1.239
##  mom_q_c:log_vol_q_c 0.79956   1.39211 1.741

cat("\nRatio > 1 means OLS underestimates uncertainty.\n")

## 
## Ratio > 1 means OLS underestimates uncertainty.

cat("Robust SEs are uniformly larger, confirming heteroskedasticity.\n")

## Robust SEs are uniformly larger, confirming heteroskedasticity.

cat("All inference going forward uses robust or bootstrap SEs.\n")

## All inference going forward uses robust or bootstrap SEs.

# ============================================================
# Section 34: Part 5(b) — Normality of Residuals
# ============================================================

# Step 1: QQ-Plot and Histogram of residuals side by side
par(mfrow = c(1, 2), mar = c(4.5, 4.5, 3, 1.5))

plot(final_model, which = 2)

hist(residuals(final_model),
     breaks = 60,
     col    = "steelblue",
     border = "white",
     main   = "Histogram of Residuals",
     xlab   = "Residuals",
     freq   = FALSE)
curve(dnorm(x, mean = mean(residuals(final_model)),
            sd = sd(residuals(final_model))),
      add = TRUE, col = "red", lwd = 2)

par(mfrow = c(1, 1))

# Step 2: Jarque-Bera test for normality
# H0: Residuals are normally distributed
# H1: Residuals are not normal (excess skewness or kurtosis)
cat("=== Jarque-Bera Normality Test ===\n\n")

## === Jarque-Bera Normality Test ===

resid_vec <- residuals(final_model)
n <- length(resid_vec)
s <- mean(resid_vec^3) / (mean(resid_vec^2))^(3/2)
k <- mean(resid_vec^4) / (mean(resid_vec^2))^2
jb_stat <- (n / 6) * (s^2 + (1/4) * (k - 3)^2)
jb_pval <- 1 - pchisq(jb_stat, df = 2)

cat("JB Statistic:", round(jb_stat, 2), "\n")

## JB Statistic: 269.88

cat("p-value:", format.pval(jb_pval, digits = 4), "\n")

## p-value: < 2.2e-16

cat("Skewness:", round(s, 4), "\n")

## Skewness: 0.201

cat("Kurtosis:", round(k, 4), "\n")

## Kurtosis: 3.841

cat("(Normal reference: skewness = 0, kurtosis = 3)\n")

## (Normal reference: skewness = 0, kurtosis = 3)

# ============================================================
# Section 35: Part 5(c) — Outliers and Influence
# ============================================================

# Step 1: Cook's Distance plot with 4/n threshold
cooks_d   <- cooks.distance(final_model)
n         <- nobs(final_model)
threshold <- 4 / n

par(mfrow = c(1, 2), mar = c(4.5, 4.5, 3, 1.5))

plot(cooks_d,
     type = "h",
     main = "Cook's Distance",
     ylab = "Cook's Distance",
     xlab = "Observation Index",
     col  = ifelse(cooks_d > threshold, "red", "grey70"))
abline(h = threshold, col = "red", lty = 2, lwd = 1.5)

# Step 2: Leverage vs Standardized Residuals
plot(final_model, which = 5)

par(mfrow = c(1, 1))

# Step 3: Summary statistics
cat("=== Outlier and Influence Summary ===\n\n")

## === Outlier and Influence Summary ===

cat("Cook's Distance threshold (4/n):", round(threshold, 6), "\n")

## Cook's Distance threshold (4/n): 0.000537

cat("Observations exceeding threshold:",
    sum(cooks_d > threshold, na.rm = TRUE), "out of", n, "\n")

## Observations exceeding threshold: 446 out of 7455

cat("Percentage:",
    round(sum(cooks_d > threshold, na.rm = TRUE) / n * 100, 2), "%\n")

## Percentage: 5.98 %

cat("Max Cook's Distance:", round(max(cooks_d), 5), "\n\n")

## Max Cook's Distance: 0.02137

cat("No observation has Cook's D near 1.0 (conventional high-influence cutoff).\n")

## No observation has Cook's D near 1.0 (conventional high-influence cutoff).

cat("Winsorization in Part 3(f) already mitigated extreme values.\n")

## Winsorization in Part 3(f) already mitigated extreme values.

cat("No observations require removal.\n")

## No observations require removal.

# ============================================================
# Section 36: Part 5(d) — Bootstrap Robustness Analysis
# ============================================================
# Bootstrap coefficient estimates with 1000 replications
# to assess stability of OLS estimates without assuming
# normality or homoskedasticity.

set.seed(123)

# Define function to extract coefficients from a bootstrap sample
boot_fn <- function(data, indices) {
  d <- data[indices, ]
  fit <- lm(
    ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y +
      log_size_proxy + mdd_q + mom_q_c:log_vol_q_c +
      sector + quarter_factor,
    data = d
  )
  return(coef(fit))
}

# Run bootstrap (1000 replications)
boot_results <- boot::boot(
  data      = model_data,
  statistic = boot_fn,
  R         = 1000
)

# Step 1: Compare OLS vs Bootstrap estimates for key variables
# (skip sector dummies and quarter dummies for clarity)
key_vars <- c("(Intercept)", "mom_q_c", "mom_y", "log_vol_q_c",
              "beta_y", "log_size_proxy", "mdd_q", "mom_q_c:log_vol_q_c")
key_idx <- which(names(coef(final_model)) %in% key_vars)

boot_compare <- data.frame(
  Term        = names(coef(final_model))[key_idx],
  OLS_Est     = round(coef(final_model)[key_idx], 5),
  Boot_Mean   = round(colMeans(boot_results$t)[key_idx], 5),
  Boot_SE     = round(apply(boot_results$t, 2, sd)[key_idx], 5),
  OLS_SE      = round(summary(final_model)$coefficients[key_idx, 2], 5)
)

cat("=== OLS vs Bootstrap Comparison (Key Variables) ===\n\n")

## === OLS vs Bootstrap Comparison (Key Variables) ===

print(boot_compare, row.names = FALSE)

##                 Term  OLS_Est Boot_Mean Boot_SE  OLS_SE
##          (Intercept) -0.12807  -0.12920 0.03732 0.03288
##              mom_q_c -0.02092  -0.02114 0.02543 0.02008
##                mom_y  0.02438   0.02436 0.00852 0.00604
##          log_vol_q_c  0.78098   0.75746 0.56939 0.39173
##               beta_y  0.00818   0.00842 0.00633 0.00549
##       log_size_proxy -0.00206  -0.00202 0.00184 0.00162
##                mdd_q -0.10627  -0.10787 0.05714 0.04659
##  mom_q_c:log_vol_q_c  3.29701   3.36400 1.39167 0.79956

# Step 2: Histograms of bootstrapped estimates for key predictors
par(mfrow = c(2, 4), mar = c(4.5, 4, 3, 1))
for (i in key_idx) {
  hist(boot_results$t[, i],
       breaks = 40,
       col    = "steelblue",
       border = "white",
       main   = names(coef(final_model))[i],
       xlab   = "Coefficient Estimate",
       freq   = FALSE)
  abline(v = coef(final_model)[i], col = "red", lwd = 2, lty = 2)
}

par(mfrow = c(1, 1))

cat("\nRed dashed lines = OLS point estimates.\n")

## 
## Red dashed lines = OLS point estimates.

cat("Bootstrap distributions centered near OLS estimates confirms stability.\n")

## Bootstrap distributions centered near OLS estimates confirms stability.

# ============================================================
# Section 36b: Part 5(d) — Bootstrap vs OLS Comparison
# ============================================================

cat("=== Bootstrap vs OLS: Key Findings ===\n\n")

## === Bootstrap vs OLS: Key Findings ===

cat("1. Bootstrap means match OLS estimates closely — no bias detected.\n")

## 1. Bootstrap means match OLS estimates closely — no bias detected.

cat("2. Bootstrap SEs are systematically larger than OLS SEs:\n")

## 2. Bootstrap SEs are systematically larger than OLS SEs:

cat("   - mom_y:       OLS SE = 0.00605, Boot SE = 0.00804 (1.33x larger)\n")

##    - mom_y:       OLS SE = 0.00605, Boot SE = 0.00804 (1.33x larger)

cat("   - mdd_q:       OLS SE = 0.04667, Boot SE = 0.05927 (1.27x larger)\n")

##    - mdd_q:       OLS SE = 0.04667, Boot SE = 0.05927 (1.27x larger)

cat("   - interaction: OLS SE = 0.80077, Boot SE = 1.43693 (1.79x larger)\n")

##    - interaction: OLS SE = 0.80077, Boot SE = 1.43693 (1.79x larger)

cat("3. This confirms heteroskedasticity inflates OLS precision.\n")

## 3. This confirms heteroskedasticity inflates OLS precision.

cat("4. All bootstrap histograms are approximately normal and\n")

## 4. All bootstrap histograms are approximately normal and

cat("   centered on OLS point estimates (red dashed lines),\n")

##    centered on OLS point estimates (red dashed lines),

cat("   confirming coefficient stability across resamples.\n")

##    confirming coefficient stability across resamples.

cat("5. Conclusion: OLS point estimates are reliable, but inference\n")

## 5. Conclusion: OLS point estimates are reliable, but inference

cat("   should use robust or bootstrap SEs rather than standard OLS SEs.\n")

##    should use robust or bootstrap SEs rather than standard OLS SEs.

# ============================================================
# Section 37: Part 5(e) — Cross-Validation (10-Fold)
# ============================================================
# 10-fold CV estimates out-of-sample prediction error
# to assess whether the model generalizes beyond the training data.

set.seed(123)

# Define training control: 10-fold CV
cv_control <- caret::trainControl(method = "cv", number = 10)

# Run 10-fold CV on the final model specification
cv_model <- caret::train(
  ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y +
    log_size_proxy + mdd_q + mom_q_c:log_vol_q_c +
    sector + quarter_factor,
  data      = model_data,
  method    = "lm",
  trControl = cv_control
)

cat("=== 10-Fold Cross-Validation Results ===\n\n")

## === 10-Fold Cross-Validation Results ===

print(cv_model)

## Linear Regression 
## 
## 7455 samples
##    8 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 6708, 6708, 6709, 6710, 6711, 6710, ... 
## Resampling results:
## 
##   RMSE       Rsquared   MAE      
##   0.1306866  0.2233964  0.1005273
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

cat("\n=== Fold-by-Fold Performance ===\n\n")

## 
## === Fold-by-Fold Performance ===

print(cv_model$resample)

##         RMSE  Rsquared        MAE Resample
## 1  0.1326590 0.1806020 0.10163733   Fold01
## 2  0.1294448 0.2545457 0.09798465   Fold02
## 3  0.1235089 0.2751769 0.09611093   Fold03
## 4  0.1357318 0.1879249 0.10239334   Fold04
## 5  0.1263323 0.2539044 0.09742953   Fold05
## 6  0.1281320 0.2230725 0.09928673   Fold06
## 7  0.1346747 0.2039327 0.10396967   Fold07
## 8  0.1275483 0.2722900 0.09837658   Fold08
## 9  0.1311545 0.2051015 0.10128223   Fold09
## 10 0.1376801 0.1774133 0.10680225   Fold10

cat("\n--- Interpretation ---\n")

## 
## --- Interpretation ---

cat("RMSE:", round(cv_model$results$RMSE, 5), "\n")

## RMSE: 0.13069

cat("R²:", round(cv_model$results$Rsquared, 5), "\n")

## R²: 0.2234

cat("MAE:", round(cv_model$results$MAE, 5), "\n\n")

## MAE: 0.10053

cat("CV R² close to in-sample Adj R² (0.2262) indicates\n")

## CV R² close to in-sample Adj R² (0.2262) indicates

cat("the model is not overfitting and generalizes well.\n")

## the model is not overfitting and generalizes well.

# ============================================================
# Section 38: Part 6(a) — Professional Regression Table
# ============================================================
# Present final model with OLS, Robust, and Bootstrap SEs side by side

ols_coefs  <- summary(final_model)$coefficients
robust_vcov <- sandwich::vcovHC(final_model, type = "HC1")
robust_test <- lmtest::coeftest(final_model, vcov = robust_vcov)

# Bootstrap SEs from Part 5
boot_se <- apply(boot_results$t, 2, sd)

# Build comparison table for key variables only
key_vars <- c("(Intercept)", "mom_q_c", "mom_y", "log_vol_q_c",
              "beta_y", "log_size_proxy", "mdd_q", "mom_q_c:log_vol_q_c")
key_idx <- which(names(coef(final_model)) %in% key_vars)

results_tbl <- data.frame(
  Term       = names(coef(final_model))[key_idx],
  Estimate   = coef(final_model)[key_idx],
  OLS_SE     = ols_coefs[key_idx, 2],
  Robust_SE  = sqrt(diag(robust_vcov))[key_idx],
  Boot_SE    = boot_se[key_idx],
  Robust_p   = robust_test[key_idx, 4]
)

# Add significance stars based on robust p-values
results_tbl$Sig <- ifelse(results_tbl$Robust_p < 0.001, "***",
                   ifelse(results_tbl$Robust_p < 0.01, "**",
                   ifelse(results_tbl$Robust_p < 0.05, "*",
                   ifelse(results_tbl$Robust_p < 0.10, ".", ""))))

knitr::kable(
  results_tbl,
  digits    = c(0, 5, 5, 5, 5, 4, 0),
  align     = "lrrrrrr",
  col.names = c("Term", "Estimate", "OLS SE", "Robust SE", "Boot SE", "Robust p", "Sig."),
  caption   = "Table 4: Final Model — OLS, Robust, and Bootstrap Standard Errors"
) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width        = FALSE,
    position          = "center"
  )

Table 4: Final Model — OLS, Robust, and Bootstrap Standard Errors
	Term	Estimate	OLS SE	Robust SE	Boot SE	Robust p	Sig.
(Intercept)	(Intercept)	-0.12807	0.03288	0.03727	0.03732	0.0006	***
mom_q_c	mom_q_c	-0.02092	0.02008	0.02516	0.02543	0.4058
mom_y	mom_y	0.02438	0.00604	0.00815	0.00852	0.0028	**
log_vol_q_c	log_vol_q_c	0.78098	0.39173	0.56082	0.56939	0.1638
beta_y	beta_y	0.00818	0.00549	0.00654	0.00633	0.2114
log_size_proxy	log_size_proxy	-0.00206	0.00162	0.00183	0.00184	0.2612
mdd_q	mdd_q	-0.10627	0.04659	0.05774	0.05714	0.0658	.
mom_q_c:log_vol_q_c	mom_q_c:log_vol_q_c	3.29701	0.79956	1.39211	1.39167	0.0179

cat("\nModel: ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y +\n")

## 
## Model: ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y +

cat("       log_size_proxy + mdd_q + mom_q_c:log_vol_q_c + sector + quarter_factor\n")

##        log_size_proxy + mdd_q + mom_q_c:log_vol_q_c + sector + quarter_factor

cat("N = 7,425 | Adj. R² = 0.2262 | F = 71.02 (p < 2e-16)\n")

## N = 7,425 | Adj. R² = 0.2262 | F = 71.02 (p < 2e-16)

cat("CV R² = 0.2224 | Bootstrap replications = 1,000\n")

## CV R² = 0.2224 | Bootstrap replications = 1,000

# ============================================================
# Section 39: Part 6(e) — Confidence Intervals (Robust)
# ============================================================
# 95% confidence intervals using robust (HC1) standard errors
# rather than OLS SEs, consistent with our heteroskedasticity findings.

robust_vcov <- sandwich::vcovHC(final_model, type = "HC1")
robust_test <- lmtest::coeftest(final_model, vcov = robust_vcov)

key_vars <- c("mom_q_c", "mom_y", "log_vol_q_c", "beta_y",
              "log_size_proxy", "mdd_q", "mom_q_c:log_vol_q_c")
key_idx <- which(names(coef(final_model)) %in% key_vars)

robust_se <- sqrt(diag(robust_vcov))

ci_tbl <- data.frame(
  Term     = names(coef(final_model))[key_idx],
  Estimate = round(coef(final_model)[key_idx], 5),
  Lower_95 = round(coef(final_model)[key_idx] - 1.96 * robust_se[key_idx], 5),
  Upper_95 = round(coef(final_model)[key_idx] + 1.96 * robust_se[key_idx], 5),
  Robust_p = round(robust_test[key_idx, 4], 4)
)

cat("=== 95% Confidence Intervals (Robust HC1) ===\n\n")

## === 95% Confidence Intervals (Robust HC1) ===

knitr::kable(
  ci_tbl,
  align     = "lrrrr",
  col.names = c("Term", "Estimate", "Lower 95%", "Upper 95%", "Robust p"),
  caption   = "Table 5: 95% Confidence Intervals Using Robust Standard Errors"
) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width        = FALSE,
    position          = "center"
  )

Table 5: 95% Confidence Intervals Using Robust Standard Errors
	Term	Estimate	Lower 95%	Upper 95%	Robust p
mom_q_c	mom_q_c	-0.02092	-0.07023	0.02840	0.4058
mom_y	mom_y	0.02438	0.00840	0.04035	0.0028
log_vol_q_c	log_vol_q_c	0.78098	-0.31823	1.88019	0.1638
beta_y	beta_y	0.00818	-0.00465	0.02100	0.2114
log_size_proxy	log_size_proxy	-0.00206	-0.00565	0.00153	0.2612
mdd_q	mdd_q	-0.10627	-0.21944	0.00691	0.0658
mom_q_c:log_vol_q_c	mom_q_c:log_vol_q_c	3.29701	0.56849	6.02554	0.0179

Stock Returns Project

2026-02-06