# ============================================================
# Section 0: Load Required Packages
# ============================================================
pkgs <- c(
"tidyverse", # Data wrangling + plotting
"tidyquant", # Pull stock data + S&P 500 list
"lubridate", # Date/quarter manipulation
"slider", # Rolling-window calculations
"Boruta", # Boruta variable selection
"broom", # Clean model outputs into tables
"lmtest", # Breusch-Pagan, RESET tests
"sandwich", # Heteroskedasticity-consistent SEs
"car", # VIF multicollinearity tests
"nortest", # Jarque-Bera normality test
"boot", # Bootstrap coefficient estimation
"caret", # Cross-validation
"rvest", # Web scraping for sector data
"corrplot", # Correlation Matrix
"GGally", # Extends GGplots
"gridExtra", # Multiply GGplots
"patchwork", #
"kableExtra", #
"fitdistrplus" #
)
# Auto-install any missing packages
new_pkgs <- pkgs[!(pkgs %in% installed.packages()[, "Package"])]
if (length(new_pkgs) > 0) install.packages(new_pkgs)
# Load all packages
invisible(lapply(pkgs, library, character.only = TRUE))
# ============================================================
# Section 1: Define Sample Period and Pull S&P 500 Tickers
# ============================================================
# Define sample period (2021-2025)
start_date <- as.Date("2021-01-01")
end_date <- as.Date("2025-12-31")
# Pull current S&P 500 constituents (note: survivorship bias may apply)
sp500 <- tq_index("SP500") %>%
dplyr::select(symbol, company, sector)
# Extract ticker symbols
tickers <- sp500$symbol
# Sanity checks
cat("Total tickers pulled:", length(tickers), "\n")
## Total tickers pulled: 504
cat("First few tickers:", head(tickers), "\n")
## First few tickers: NVDA AAPL MSFT AMZN GOOGL AVGO
# ============================================================
# Section 2: Pull Daily Stock Prices and Volume
# ============================================================
# Pull adjusted closing prices and volume for all S&P 500 tickers
# Note: 'adjusted' accounts for stock splits and dividends
prices <- tq_get(
tickers,
from = start_date,
to = end_date,
get = "stock.prices"
) %>%
dplyr::select(symbol, date, adjusted, volume) %>%
dplyr::arrange(symbol, date)
# Sanity check - verify structure and spot missing data
glimpse(prices)
## Rows: 624,555
## Columns: 4
## $ symbol <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "…
## $ date <date> 2021-01-04, 2021-01-05, 2021-01-06, 2021-01-07, 2021-01-08, …
## $ adjusted <dbl> 114.6598, 115.5972, 118.7672, 121.9275, 122.7973, 123.8507, 1…
## $ volume <dbl> 2030700, 2344000, 2428500, 1775900, 1577200, 1746800, 1826400…
cat("Total rows:", nrow(prices), "\n")
## Total rows: 624555
cat("Missing values:", sum(is.na(prices)), "\n")
## Missing values: 2
cat("Date range:", as.character(min(prices$date)),
"to", as.character(max(prices$date)), "\n")
## Date range: 2021-01-04 to 2025-12-30
# ============================================================
# Section 3: Compute Daily Stock Returns
# ============================================================
# Calculate simple daily returns: (P_t / P_{t-1}) - 1
# group_by(symbol) ensures lag is computed within each stock
daily <- prices %>%
group_by(symbol) %>%
arrange(date) %>%
mutate(
ret_d = adjusted / lag(adjusted) - 1
) %>%
ungroup() %>%
filter(!is.na(ret_d)) # Removes first obs per stock (no prior price)
# Also handles the 2 NAs found in raw prices
# Sanity checks
glimpse(daily)
## Rows: 624,050
## Columns: 5
## $ symbol <chr> "A", "AAPL", "ABBV", "ABNB", "ABT", "ACGL", "ACN", "ADBE", "A…
## $ date <date> 2021-01-05, 2021-01-05, 2021-01-05, 2021-01-05, 2021-01-05, …
## $ adjusted <dbl> 115.59720, 127.41277, 86.93716, 148.30000, 100.41207, 33.3194…
## $ volume <dbl> 2344000, 97664900, 6823800, 5974200, 4322800, 1814500, 180920…
## $ ret_d <dbl> 0.0081757652, 0.0123639944, 0.0103404702, 0.0657564467, 0.012…
cat("Summary of daily returns:\n")
## Summary of daily returns:
print(summary(daily$ret_d))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.5314019 -0.0091205 0.0006787 0.0006284 0.0104168 0.5602061
# ============================================================
# Section 4: Pull Market Returns (SPY as Market Proxy)
# ============================================================
# SPY (S&P 500 ETF) is used as the market proxy for beta estimation
# This is standard practice in empirical asset pricing research
market <- tq_get(
"SPY",
from = start_date,
to = end_date,
get = "stock.prices"
) %>%
dplyr::arrange(date) %>%
dplyr::mutate(mkt_ret = adjusted / dplyr::lag(adjusted) - 1) %>%
dplyr::filter(!is.na(mkt_ret)) %>%
dplyr::select(date, mkt_ret)
# Sanity checks
glimpse(market)
## Rows: 1,253
## Columns: 2
## $ date <date> 2021-01-05, 2021-01-06, 2021-01-07, 2021-01-08, 2021-01-11, 2…
## $ mkt_ret <dbl> 0.0068873966, 0.0059782719, 0.0148576929, 0.0056978421, -0.006…
cat("Summary of market returns:\n")
## Summary of market returns:
print(summary(market$mkt_ret))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.0585429 -0.0045158 0.0007607 0.0006095 0.0063077 0.1050193
# ============================================================
# Section 5: Compute Rolling Financial Features
# ============================================================
# Window sizes: 63 trading days ≈ 1 quarter, 252 trading days ≈ 1 year
# Note: .before = 62 means 62 rows before + current row = 63 total (1 quarter)
# .before = 251 means 251 rows before + current row = 252 total (1 year)
# cache=TRUE recommended — rolling calculations are computationally intensive
# Note: market beta computed separately in next chunk
# Merge stock returns with market returns (SPY)
daily_full <- daily %>%
dplyr::left_join(market, by = "date")
# Compute all rolling features within each stock
features <- daily_full %>%
dplyr::group_by(symbol) %>%
dplyr::arrange(date) %>%
dplyr::mutate(
# Momentum: cumulative return over past quarter and year
mom_q = adjusted / dplyr::lag(adjusted, 63) - 1,
mom_y = adjusted / dplyr::lag(adjusted, 252) - 1,
# Volatility: standard deviation of daily returns
vol_q = slider::slide_dbl(ret_d, sd, .before = 62, .complete = TRUE),
vol_y = slider::slide_dbl(ret_d, sd, .before = 251, .complete = TRUE),
# Trading volume: mean and standard deviation
volm_q_avg = slider::slide_dbl(volume, mean, .before = 62, .complete = TRUE),
volm_y_avg = slider::slide_dbl(volume, mean, .before = 251, .complete = TRUE),
volm_y_sd = slider::slide_dbl(volume, sd, .before = 251, .complete = TRUE),
# Maximum drawdown: largest peak-to-trough decline over past quarter
mdd_q = slider::slide_dbl(adjusted, ~{
peak <- cummax(.x)
dd <- .x / peak - 1
min(dd, na.rm = TRUE)
}, .before = 62, .complete = TRUE)
) %>%
dplyr::ungroup()
# Drop rows with incomplete rolling windows
features <- features %>%
dplyr::filter(
!is.na(mom_q),
!is.na(mom_y),
!is.na(vol_y),
!is.na(volm_y_avg),
!is.na(mdd_q),
!is.na(mkt_ret) # Market return must exist for beta estimation
)
# Sanity checks
dplyr::glimpse(features)
## Rows: 497,534
## Columns: 14
## $ symbol <chr> "A", "AAPL", "ABBV", "ABNB", "ABT", "ACGL", "ACN", "ADBE", …
## $ date <date> 2022-01-04, 2022-01-04, 2022-01-04, 2022-01-04, 2022-01-04…
## $ adjusted <dbl> 146.90814, 175.84323, 115.62479, 170.80000, 125.32749, 42.9…
## $ volume <dbl> 2234000, 99310400, 6298300, 4080000, 8241200, 968600, 25163…
## $ ret_d <dbl> -0.0338060483, -0.0126916080, -0.0019201377, -0.0108871306,…
## $ mkt_ret <dbl> -0.0003351108, -0.0003351108, -0.0003351108, -0.0003351108,…
## $ mom_q <dbl> -0.010722225, 0.275333420, 0.249188264, 0.036785221, 0.1761…
## $ mom_y <dbl> 0.27086248, 0.38010677, 0.32998111, 0.15171948, 0.24813173,…
## $ vol_q <dbl> 0.014768063, 0.015035164, 0.011257136, 0.033742917, 0.01261…
## $ vol_y <dbl> 0.01324156, 0.01580715, 0.01261994, 0.03323359, 0.01344436,…
## $ volm_q_avg <dbl> 1516438.1, 89378579.4, 6126700.0, 5791620.6, 5337882.5, 164…
## $ volm_y_avg <dbl> 1621182.5, 90377133.3, 6806901.6, 6496663.1, 5373671.4, 203…
## $ volm_y_sd <dbl> 708967.8, 29022212.1, 3800881.8, 4608475.2, 2405241.0, 1227…
## $ mdd_q <dbl> -0.09793064, -0.05405416, -0.03028252, -0.24530669, -0.0489…
cat("Summary of rolling features:\n")
## Summary of rolling features:
features %>%
dplyr::select(mom_q, mom_y, vol_q, vol_y,
volm_q_avg, volm_y_avg, volm_y_sd, mdd_q) %>%
summary() %>%
print()
## mom_q mom_y vol_q vol_y
## Min. :-0.88729 Min. :-0.98549 Min. :0.001896 Min. :0.007558
## 1st Qu.:-0.06875 1st Qu.:-0.09169 1st Qu.:0.013753 1st Qu.:0.014474
## Median : 0.02088 Median : 0.07831 Median :0.017412 Median :0.017738
## Mean : 0.02922 Mean : 0.12839 Mean :0.019422 Mean :0.019480
## 3rd Qu.: 0.11405 3rd Qu.: 0.27098 3rd Qu.:0.022593 3rd Qu.:0.022273
## Max. : 6.32532 Max. :16.20933 Max. :0.149840 Max. :0.116309
## volm_q_avg volm_y_avg volm_y_sd mdd_q
## Min. : 144 Min. : 387 Min. : 1490 Min. :-0.90941
## 1st Qu.: 1203126 1st Qu.: 1190006 1st Qu.: 584651 1st Qu.:-0.18010
## Median : 2382245 Median : 2349903 Median : 1183242 Median :-0.12114
## Mean : 6333720 Mean : 6246487 Mean : 3062847 Mean :-0.14095
## 3rd Qu.: 5345560 3rd Qu.: 5238819 3rd Qu.: 2662303 3rd Qu.:-0.08159
## Max. :598531683 Max. :554410254 Max. :191520852 Max. :-0.00863
# ============================================================
# Section 6: Build Quarterly Panel Dataset
# ============================================================
# Step 1: Assign quarter identifier to each observation
features_q <- features %>%
mutate(
quarter = lubridate::floor_date(date, unit = "quarter")
)
# Step 2: Keep only quarter-end observations for each stock
# This gives one snapshot per firm per quarter
features_q_end <- features_q %>%
group_by(symbol, quarter) %>%
filter(date == max(date)) %>%
ungroup()
# Step 3: Create next-quarter return (dependent variable)
# lead() looks forward — current quarter characteristics predict NEXT quarter return
# This ensures no look-ahead bias in the predictive regression framework
features_q_end <- features_q_end %>%
group_by(symbol) %>%
arrange(quarter) %>%
mutate(
ret_next_q = lead(adjusted) / adjusted - 1
) %>%
ungroup()
# Step 4: Drop last quarter per stock (no future return available)
features_q_end <- features_q_end %>%
filter(!is.na(ret_next_q))
# Sanity checks
glimpse(features_q_end)
## Rows: 7,455
## Columns: 16
## $ symbol <chr> "A", "AAPL", "ABBV", "ABNB", "ABT", "ACGL", "ACN", "ADBE", …
## $ date <date> 2022-03-31, 2022-03-31, 2022-03-31, 2022-03-31, 2022-03-31…
## $ adjusted <dbl> 128.58226, 171.08022, 140.11865, 171.75999, 109.64191, 46.0…
## $ volume <dbl> 3046400, 103049300, 8670100, 3232200, 5432600, 2377300, 275…
## $ ret_d <dbl> -0.0231066531, -0.0177755791, -0.0100152926, -0.0107700877,…
## $ mkt_ret <dbl> -0.01539134, -0.01539134, -0.01539134, -0.01539134, -0.0153…
## $ mom_q <dbl> -0.17637812, -0.01889731, 0.20497457, 0.01765610, -0.157606…
## $ mom_y <dbl> 0.04194401, 0.42803114, 0.56343688, -0.08754787, 0.01000050…
## $ vol_q <dbl> 0.02058148, 0.01900028, 0.00987060, 0.04028967, 0.01611739,…
## $ vol_y <dbl> 0.01474659, 0.01528478, 0.01117082, 0.03147287, 0.01376143,…
## $ volm_q_avg <dbl> 2063185.7, 94933725.4, 7769914.3, 6180982.5, 6246211.1, 185…
## $ volm_y_avg <dbl> 1708491.7, 87021204.8, 6907349.6, 6548170.2, 5613284.9, 176…
## $ volm_y_sd <dbl> 775662.5, 25477847.9, 3775623.1, 4518750.9, 2505382.7, 7820…
## $ mdd_q <dbl> -0.20973422, -0.17140855, -0.03509275, -0.29495287, -0.1869…
## $ quarter <date> 2022-01-01, 2022-01-01, 2022-01-01, 2022-01-01, 2022-01-01…
## $ ret_next_q <dbl> -0.10106207, -0.21584764, -0.04717954, -0.48136933, -0.0783…
cat("Summary of dependent variable (next-quarter return):\n")
## Summary of dependent variable (next-quarter return):
print(summary(features_q_end$ret_next_q))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.81071 -0.06840 0.02308 0.03200 0.12081 2.55319
# Final quarterly panel dataset used for all downstream analysis
quarterly_panel <- features_q_end
cat("\nFinal quarterly panel dimensions:",
nrow(quarterly_panel), "rows x",
ncol(quarterly_panel), "columns\n")
##
## Final quarterly panel dimensions: 7455 rows x 16 columns
# ============================================================
# Section 7: Compute Rolling Beta and Firm Size
# ============================================================
# Step 1: Compute rolling 1-year beta for each stock
# Formula: beta = Cov(R_i, R_m) / Var(R_m)
# Uses 252-day rolling window (.before = 251 + current = 252)
# slide2_dbl() handles two input vectors simultaneously
beta_daily <- daily_full %>%
dplyr::group_by(symbol) %>%
dplyr::arrange(date) %>%
dplyr::mutate(
beta_y = slider::slide2_dbl(
.x = ret_d,
.y = mkt_ret,
.f = ~ cov(.x, .y, use = "complete.obs") / var(.y, na.rm = TRUE),
.before = 251,
.complete = TRUE
)
) %>%
dplyr::ungroup() %>%
dplyr::select(symbol, date, beta_y)
# Step 2: Join beta onto quarterly panel
# Joining by both symbol AND date ensures each firm-quarter
# gets the correct beta estimated as of that quarter-end date
quarterly_panel <- quarterly_panel %>%
dplyr::left_join(beta_daily, by = c("symbol", "date"))
cat("Beta added. Missing beta_y:", sum(is.na(quarterly_panel$beta_y)), "\n")
## Beta added. Missing beta_y: 0
print(summary(quarterly_panel$beta_y))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.5250 0.5884 0.8860 0.9139 1.1749 4.4554
# Step 3: Add firm size (log market capitalization)
# Primary: pull market cap from Yahoo Finance via key.stats
# Fallback: log(price × volume) as size proxy if API unavailable
size_tbl <- tryCatch(
tq_get(tickers, get = "key.stats") %>%
dplyr::select(symbol, market.cap) %>%
dplyr::mutate(log_mktcap = log(market.cap)) %>%
dplyr::select(symbol, log_mktcap) %>%
dplyr::distinct(),
error = function(e) NULL
)
# Step 4: Join size onto quarterly panel
if (!is.null(size_tbl) && nrow(size_tbl) > 0) {
# Primary: use actual log market capitalization
quarterly_panel <- quarterly_panel %>%
dplyr::left_join(size_tbl, by = "symbol")
cat("Size added using log_mktcap.\n")
print(summary(quarterly_panel$log_mktcap))
} else {
# Fallback: log(price × avg volume) as size proxy
# This approximates market cap when API data is unavailable
quarterly_panel <- quarterly_panel %>%
dplyr::mutate(log_size_proxy = log(adjusted * volm_y_avg))
cat("Market cap unavailable; size added using log_size_proxy.\n")
print(summary(quarterly_panel$log_size_proxy))
}
## Market cap unavailable; size added using log_size_proxy.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.383 18.752 19.291 19.410 19.966 24.785
# Final column check
cat("Columns now in quarterly_panel:\n")
## Columns now in quarterly_panel:
print(names(quarterly_panel))
## [1] "symbol" "date" "adjusted" "volume"
## [5] "ret_d" "mkt_ret" "mom_q" "mom_y"
## [9] "vol_q" "vol_y" "volm_q_avg" "volm_y_avg"
## [13] "volm_y_sd" "mdd_q" "quarter" "ret_next_q"
## [17] "beta_y" "log_size_proxy"
# ============================================================
# Section 8: Add Sector Data via Wikipedia GICS Sectors
# ============================================================
# tq_index("SP500") sector column is broken in current tidyquant version
# Fetching GICS sector classifications directly from Wikipedia instead
url <- "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
sp500_wiki <- tryCatch({
tables <- url %>%
rvest::read_html() %>%
rvest::html_table(fill = TRUE)
tables[[1]] %>%
dplyr::select(symbol = Symbol, sector = `GICS Sector`) %>%
dplyr::mutate(symbol = trimws(symbol))
}, error = function(e) NULL)
cat("Unique sectors from Wikipedia:\n")
## Unique sectors from Wikipedia:
print(unique(sp500_wiki$sector))
## [1] "Industrials" "Health Care" "Information Technology"
## [4] "Utilities" "Financials" "Materials"
## [7] "Consumer Discretionary" "Real Estate" "Communication Services"
## [10] "Consumer Staples" "Energy"
cat("Total tickers with sectors:", nrow(sp500_wiki), "\n")
## Total tickers with sectors: 503
# Remove any existing broken sector column before joining
quarterly_panel <- quarterly_panel %>%
dplyr::select(-any_of("sector")) %>%
dplyr::left_join(sp500_wiki, by = "symbol")
# Handle 30 missing sectors due to ticker format mismatches
# e.g., Wikipedia uses "BRK.B" while Yahoo Finance uses "BRK-B"
quarterly_panel <- quarterly_panel %>%
dplyr::mutate(
sector = ifelse(is.na(sector), "Unknown", sector)
)
cat("Missing sectors after join:", sum(is.na(quarterly_panel$sector)), "\n")
## Missing sectors after join: 0
cat("Sector distribution:\n")
## Sector distribution:
print(table(quarterly_panel$sector))
##
## Communication Services Consumer Discretionary Consumer Staples
## 345 720 516
## Energy Financials Health Care
## 330 1122 881
## Industrials Information Technology Materials
## 1161 1034 390
## Real Estate Unknown Utilities
## 465 30 461
# ============================================================
# Section 9: Part 2(a) - Boruta Variable Selection
# ============================================================
# IMPORTANT: Boruta is run on quantitative predictors ONLY
# Factor variables (sector, quarter) evaluated separately in Part 2(b)
# Parameters used:
# - maxRuns = 100 (maximum number of importance source runs)
# - pValue = 0.01 (confidence level for feature acceptance/rejection)
# - doTrace = 2 (verbose output to track progress)
# - ntree = 500 (default random forest trees per run)
# - seed = 123 (for reproducibility)
set.seed(123)
# Step 1: Prepare Boruta dataset (quantitative predictors only)
boruta_data <- quarterly_panel %>%
dplyr::select(
ret_next_q, # Dependent variable
mom_q, # Quarterly momentum
mom_y, # Yearly momentum
vol_q, # Quarterly volatility
vol_y, # Yearly volatility
beta_y, # Market beta
log_size_proxy, # Firm size (log price x volume)
volm_q_avg, # Average quarterly trading volume
volm_y_avg, # Average yearly trading volume
volm_y_sd, # Yearly volume volatility
mdd_q # Maximum drawdown (quarterly)
) %>%
na.omit()
cat("Boruta sample size:", nrow(boruta_data), "\n")
## Boruta sample size: 7455
# Step 2: Run Boruta algorithm
boruta_model <- Boruta(
ret_next_q ~ .,
data = boruta_data,
maxRuns = 100,
pValue = 0.01,
doTrace = 2
)
# Step 3: Resolve any tentative variables
boruta_final <- TentativeRoughFix(boruta_model)
# Step 4: Extract confirmed variables only
selected_vars <- getSelectedAttributes(boruta_final, withTentative = FALSE)
cat("\nConfirmed important variables:\n")
##
## Confirmed important variables:
print(selected_vars)
## [1] "mom_q" "mom_y" "vol_q" "vol_y"
## [5] "beta_y" "log_size_proxy" "volm_q_avg" "volm_y_avg"
## [9] "volm_y_sd" "mdd_q"
# Step 5: Importance table
cat("\nVariable importance statistics:\n")
##
## Variable importance statistics:
print(attStats(boruta_final))
## meanImp medianImp minImp maxImp normHits decision
## mom_q 25.818588 25.052473 24.227193 28.94987 1 Confirmed
## mom_y 14.135517 14.561089 11.609118 15.26047 1 Confirmed
## vol_q 19.136843 19.633455 16.446098 21.58546 1 Confirmed
## vol_y 17.540406 17.588384 14.895209 19.59643 1 Confirmed
## beta_y 11.148484 11.281836 9.519076 12.58580 1 Confirmed
## log_size_proxy 9.507656 9.237430 7.635180 11.85250 1 Confirmed
## volm_q_avg 9.651028 9.584909 6.111301 13.06927 1 Confirmed
## volm_y_avg 9.711749 9.803761 6.776112 11.62956 1 Confirmed
## volm_y_sd 9.034147 8.653053 7.204950 11.32367 1 Confirmed
## mdd_q 16.797443 16.907008 14.589846 18.74710 1 Confirmed
# Step 6: Boruta importance plot
par(mar = c(12, 4, 4, 2))
plot(
boruta_final,
las = 2,
cex.axis = 1.05,
cex.lab = 1.2,
xlab = "",
main = "Boruta Variable Importance Plot"
)
mtext("Attributes", side = 1, line = 8, cex = 1.2)

# ============================================================
# Section 10: Part 2(b) - ANOVA for Factor Variables
# ============================================================
# Create readable quarter labels FIRST (before converting to factor)
quarterly_panel <- quarterly_panel %>%
mutate(
quarter_factor = as.factor(
paste0(lubridate::year(quarter), "-Q", lubridate::quarter(quarter))
)
)
cat("Number of quarters:", nlevels(quarterly_panel$quarter_factor), "\n")
## Number of quarters: 15
print(table(quarterly_panel$quarter_factor))
##
## 2022-Q1 2022-Q2 2022-Q3 2022-Q4 2023-Q1 2023-Q2 2023-Q3 2023-Q4 2024-Q1 2024-Q2
## 492 494 495 495 496 496 496 497 497 498
## 2024-Q3 2024-Q4 2025-Q1 2025-Q2 2025-Q3
## 498 499 500 501 501
# Now convert sector and quarter to factors
quarterly_panel$sector <- as.factor(quarterly_panel$sector)
quarterly_panel$quarter <- as.factor(quarterly_panel$quarter)
# --- SECTOR ANOVA ---
anova_sector <- aov(ret_next_q ~ sector, data = quarterly_panel)
cat("\nANOVA Results - Sector:\n")
##
## ANOVA Results - Sector:
print(summary(anova_sector))
## Df Sum Sq Mean Sq F value Pr(>F)
## sector 11 2.5 0.22748 8.654 2.04e-15 ***
## Residuals 7443 195.7 0.02629
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Boxplot by sector
ggplot(quarterly_panel, aes(x = reorder(sector, ret_next_q, median),
y = ret_next_q,
fill = sector)) +
geom_boxplot(outlier.alpha = 0.2) +
coord_flip() +
scale_y_continuous(limits = c(-0.5, 0.5), labels = scales::percent) +
labs(
title = "Next-Quarter Returns by GICS Sector",
x = "Sector",
y = "Next-Quarter Return"
) +
theme_minimal() +
theme(legend.position = "none")

# --- QUARTER ANOVA ---
anova_quarter <- aov(ret_next_q ~ quarter_factor, data = quarterly_panel)
cat("\nANOVA Results - Calendar Quarter:\n")
##
## ANOVA Results - Calendar Quarter:
print(summary(anova_quarter))
## Df Sum Sq Mean Sq F value Pr(>F)
## quarter_factor 14 35.7 2.5498 116.8 <2e-16 ***
## Residuals 7440 162.4 0.0218
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Boxplot by quarter
ggplot(quarterly_panel, aes(x = quarter_factor,
y = ret_next_q,
fill = quarter_factor)) +
geom_boxplot(outlier.alpha = 0.2) +
scale_y_continuous(limits = c(-0.5, 0.5), labels = scales::percent) +
labs(
title = "Next-Quarter Returns by Calendar Quarter",
x = "Quarter",
y = "Next-Quarter Return"
) +
theme_minimal() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))

# ============================================================
# Section 11: Part 3 - EDA Data Preparation
# ============================================================
# Select quantitative variables for EDA
# Note: sector and quarter_factor excluded here as they are
# factor variables analyzed separately in Part 2(b)
eda_data <- quarterly_panel %>%
dplyr::select(
ret_next_q, # Dependent variable
mom_q, mom_y, # Momentum
vol_q, vol_y, # Volatility
beta_y, # Market beta
log_size_proxy, # Firm size
volm_q_avg, volm_y_avg, volm_y_sd, # Trading volume
mdd_q # Maximum drawdown
)
# Full EDA dataset including factor variables
# Used for plots requiring sector or quarter breakdowns
eda_data_full <- quarterly_panel %>%
dplyr::select(
ret_next_q, # Dependent variable
mom_q, mom_y, # Momentum
vol_q, vol_y, # Volatility
beta_y, # Market beta
log_size_proxy, # Firm size
volm_q_avg, volm_y_avg, volm_y_sd, # Trading volume
mdd_q, # Maximum drawdown
sector, # GICS sector (factor)
quarter_factor # Calendar quarter (factor)
)
# Sanity checks
cat("Quantitative EDA dataset:", nrow(eda_data), "rows x", ncol(eda_data), "cols\n")
## Quantitative EDA dataset: 7455 rows x 11 cols
cat("Full EDA dataset:", nrow(eda_data_full), "rows x", ncol(eda_data_full), "cols\n")
## Full EDA dataset: 7455 rows x 13 cols
dplyr::glimpse(eda_data)
## Rows: 7,455
## Columns: 11
## $ ret_next_q <dbl> -0.10106207, -0.21584764, -0.04717954, -0.48136933, -0.…
## $ mom_q <dbl> -0.17637812, -0.01889731, 0.20497457, 0.01765610, -0.15…
## $ mom_y <dbl> 0.04194401, 0.42803114, 0.56343688, -0.08754787, 0.0100…
## $ vol_q <dbl> 0.02058148, 0.01900028, 0.00987060, 0.04028967, 0.01611…
## $ vol_y <dbl> 0.01474659, 0.01528478, 0.01117082, 0.03147287, 0.01376…
## $ beta_y <dbl> 1.0533860, 1.2488194, 0.3749702, 1.6402609, 0.6866531, …
## $ log_size_proxy <dbl> 19.20769, 23.42379, 20.69059, 20.84079, 20.23787, 18.21…
## $ volm_q_avg <dbl> 2063185.7, 94933725.4, 7769914.3, 6180982.5, 6246211.1,…
## $ volm_y_avg <dbl> 1708491.7, 87021204.8, 6907349.6, 6548170.2, 5613284.9,…
## $ volm_y_sd <dbl> 775662.5, 25477847.9, 3775623.1, 4518750.9, 2505382.7, …
## $ mdd_q <dbl> -0.20973422, -0.17140855, -0.03509275, -0.29495287, -0.…
# ============================================================
# Section 12: Part 3(b) - Histograms of All Variables
# ============================================================
# Reshape data to long format for faceted plotting
eda_long <- eda_data %>%
tidyr::pivot_longer(
cols = everything(),
names_to = "variable",
values_to = "value"
)
# Plot histograms for all 11 variables with normal curve overlay
ggplot(eda_long, aes(x = value)) +
geom_histogram(aes(y = after_stat(density)),
bins = 50,
fill = "steelblue",
color = "white",
alpha = 0.7) +
geom_density(color = "red", linewidth = 0.8) +
facet_wrap(~ variable, scales = "free", ncol = 4) +
labs(
title = "Histograms of All Quantitative Variables",
x = "Value",
y = "Density"
) +
theme_minimal() +
theme(
strip.text = element_text(size = 9, face = "bold"),
plot.title = element_text(size = 14, face = "bold"),
axis.text = element_text(size = 7)
)

# ============================================================
# Section 13: Part 3(b) - QQ Plots of All Variables
# ============================================================
# QQ plots assess normality - points following the red line = normal distribution
# Deviations in tails indicate skewness or heavy tails
ggplot(eda_long, aes(sample = value)) +
stat_qq(color = "steelblue", alpha = 0.3, size = 0.5) +
stat_qq_line(color = "red", linewidth = 0.8) +
facet_wrap(~ variable, scales = "free", ncol = 4) +
labs(
title = "QQ Plots of All Quantitative Variables",
x = "Theoretical Quantiles",
y = "Sample Quantiles"
) +
theme_minimal() +
theme(
strip.text = element_text(size = 9, face = "bold"),
plot.title = element_text(size = 14, face = "bold"),
axis.text = element_text(size = 7)
)

# ============================================================
# Section 14: Part 3(b) - Correlation Matrix
# ============================================================
# Compute pairwise correlation matrix
# pairwise.complete.obs handles any remaining NAs
cor_mat <- cor(eda_data, use = "pairwise.complete.obs")
# Plot correlation matrix
corrplot(
cor_mat,
method = "color",
type = "upper",
tl.cex = 0.7,
number.cex = 0.7,
tl.srt = 45,
addCoef.col = "black",
title = "Correlation Matrix of Quantitative Variables",
mar = c(0, 0, 2, 0) # Add margin for title
)

# ============================================================
# Section 15: Part 3(b) - Scatterplot Matrix
# ============================================================
# Note: Only 6 key variables selected for scatterplot matrix
# Including all 11 variables would create a 11x11 = 121 panel grid
# which would be too small to read and visually uninformative
# Volume variables (volm_q_avg, volm_y_avg, volm_y_sd) excluded due to
# extreme skewness making scatterplots unreadable without transformation
# vol_q and vol_y excluded as they are highly correlated with each other
# and with beta_y (already included), adding redundant information
# Select key variables for scatterplot matrix
focus_vars <- eda_data %>%
dplyr::select(
ret_next_q,
mom_q,
mom_y,
beta_y,
log_size_proxy,
mdd_q
)
# Scatterplot matrix with correlation coefficients and smooth lines
GGally::ggpairs(
focus_vars,
lower = list(continuous = GGally::wrap("points",
alpha = 0.1,
size = 0.5)),
upper = list(continuous = GGally::wrap("cor",
size = 3.5)),
diag = list(continuous = GGally::wrap("densityDiag",
fill = "steelblue",
alpha = 0.5)),
title = "Scatterplot Matrix of Key Variables"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.text = element_text(size = 6)
)

# ============================================================
# Section 16: Part 3(b) - Boxplots by Sector and Quarter
# ============================================================
# Plot 1: Market beta by sector
p1 <- ggplot(eda_data_full, aes(x = reorder(sector, beta_y, median),
y = beta_y,
fill = sector)) +
geom_boxplot(outlier.alpha = 0.2, outlier.size = 0.5) +
coord_flip() +
labs(
title = "Market Beta by GICS Sector",
x = "",
y = "Market Beta"
) +
theme_minimal() +
theme(
legend.position = "none",
plot.title = element_text(size = 11, face = "bold")
)
# Plot 2: Quarterly momentum by sector
p2 <- ggplot(eda_data_full, aes(x = reorder(sector, mom_q, median),
y = mom_q,
fill = sector)) +
geom_boxplot(outlier.alpha = 0.2, outlier.size = 0.5) +
coord_flip() +
scale_y_continuous(limits = c(-0.5, 0.5), labels = scales::percent) +
labs(
title = "Quarterly Momentum by GICS Sector",
x = "",
y = "Quarterly Momentum"
) +
theme_minimal() +
theme(
legend.position = "none",
plot.title = element_text(size = 11, face = "bold")
)
# Plot 3: Quarterly momentum over time by quarter
p3 <- ggplot(eda_data_full, aes(x = quarter_factor,
y = mom_q,
fill = quarter_factor)) +
geom_boxplot(outlier.alpha = 0.2, outlier.size = 0.5) +
scale_y_continuous(limits = c(-0.5, 0.5), labels = scales::percent) +
labs(
title = "Quarterly Momentum Over Time",
x = "Quarter",
y = "Quarterly Momentum"
) +
theme_minimal() +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 11, face = "bold")
)
# Combine all three plots using patchwork
# Top row: sector boxplots side by side
# Bottom row: quarter boxplot full width
(p1 + p2) / p3

# ============================================================
# Section 17: Part 3(c) - Descriptive Statistics Table
# ============================================================
# Compute descriptive statistics for all quantitative variables
desc_stats_raw <- eda_data %>%
tidyr::pivot_longer(
everything(),
names_to = "Variable",
values_to = "Value"
) %>%
dplyr::group_by(Variable) %>%
dplyr::summarise(
mean = mean(Value, na.rm = TRUE),
median = median(Value, na.rm = TRUE),
sd = sd(Value, na.rm = TRUE),
min = min(Value, na.rm = TRUE),
max = max(Value, na.rm = TRUE),
.groups = "drop"
) %>%
dplyr::arrange(Variable)
# Flag volume variables for different number formatting
# Volume variables are in raw share counts — use comma formatting
# All other variables use 4 decimal place formatting
is_volume <- grepl("^volm_", desc_stats_raw$Variable)
# Format numbers appropriately by variable type
desc_stats_fmt <- desc_stats_raw %>%
dplyr::mutate(
mean = ifelse(is_volume, scales::comma(mean, accuracy = 1), scales::number(mean, accuracy = 0.0001)),
median = ifelse(is_volume, scales::comma(median, accuracy = 1), scales::number(median, accuracy = 0.0001)),
sd = ifelse(is_volume, scales::comma(sd, accuracy = 1), scales::number(sd, accuracy = 0.0001)),
min = ifelse(is_volume, scales::comma(min, accuracy = 1), scales::number(min, accuracy = 0.0001)),
max = ifelse(is_volume, scales::comma(max, accuracy = 1), scales::number(max, accuracy = 0.0001))
)
# Display professional table
knitr::kable(
desc_stats_fmt,
align = "lrrrrr",
col.names = c("Variable", "Mean", "Median", "Std Dev", "Min", "Max"),
caption = "Table 1: Descriptive Statistics of Quantitative Variables"
) %>%
kableExtra::kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center"
)
Table 1: Descriptive Statistics of Quantitative Variables
|
Variable
|
Mean
|
Median
|
Std Dev
|
Min
|
Max
|
|
beta_y
|
0.9139
|
0.8860
|
0.4801
|
-0.5250
|
4.4554
|
|
log_size_proxy
|
19.4103
|
19.2914
|
1.0417
|
9.3826
|
24.7854
|
|
mdd_q
|
-0.1428
|
-0.1250
|
0.0825
|
-0.8427
|
-0.0115
|
|
mom_q
|
0.0277
|
0.0195
|
0.1685
|
-0.8208
|
2.4238
|
|
mom_y
|
0.1315
|
0.0819
|
0.4506
|
-0.9797
|
12.8225
|
|
ret_next_q
|
0.0320
|
0.0231
|
0.1630
|
-0.8107
|
2.5532
|
|
vol_q
|
0.0194
|
0.0174
|
0.0087
|
0.0060
|
0.1222
|
|
vol_y
|
0.0194
|
0.0177
|
0.0076
|
0.0076
|
0.1133
|
|
volm_q_avg
|
6,322,794
|
2,382,710
|
22,126,081
|
173
|
571,835,587
|
|
volm_y_avg
|
6,245,465
|
2,342,950
|
22,219,153
|
408
|
542,065,472
|
|
volm_y_sd
|
3,055,284
|
1,181,748
|
8,614,108
|
1,512
|
188,020,493
|
# ============================================================
# Section 18: Part 3(d) - Density Plots with Fitted Distributions
# ============================================================
# For each variable we fit up to 3 distributions:
# - Normal (all variables)
# - Lognormal (strictly positive variables only)
# - Gamma (strictly positive variables only)
# safe_fit() returns NULL if fitting fails, preventing crashes
# Fitted curves are overlaid on kernel density estimates
# Safely fit a distribution (returns NULL if it fails)
safe_fit <- function(x, dist_name) {
tryCatch(fitdistrplus::fitdist(x, dist_name), error = function(e) NULL)
}
# Plot density + fitted curves for one variable
plot_density_with_fits <- function(x, varname) {
x <- x[is.finite(x)]
x <- x[!is.na(x)]
d <- density(x)
xs <- d$x
# Start y max using kernel density
y_max <- max(d$y)
# Normal fit (always try)
fit_norm <- safe_fit(x, "norm")
if (!is.null(fit_norm)) {
y_norm <- dnorm(xs,
mean = fit_norm$estimate["mean"],
sd = fit_norm$estimate["sd"]
)
y_max <- max(y_max, y_norm)
}
# Lognormal + Gamma only if strictly positive
fit_lnorm <- NULL
fit_gamma <- NULL
if (all(x > 0)) {
fit_lnorm <- safe_fit(x, "lnorm")
if (!is.null(fit_lnorm)) {
y_lnorm <- dlnorm(xs,
meanlog = fit_lnorm$estimate["meanlog"],
sdlog = fit_lnorm$estimate["sdlog"]
)
y_max <- max(y_max, y_lnorm)
}
fit_gamma <- safe_fit(x, "gamma")
if (!is.null(fit_gamma)) {
y_gamma <- dgamma(xs,
shape = fit_gamma$estimate["shape"],
rate = fit_gamma$estimate["rate"]
)
y_max <- max(y_max, y_gamma)
}
}
# Add headroom so nothing gets cut off at the top
y_max <- y_max * 1.15
# Plot kernel density
plot(d,
main = paste("Density + Fits:", varname),
xlab = varname,
ylab = "Density",
ylim = c(0, y_max),
lwd = 2,
cex.main = 1.15,
cex.lab = 1.35,
cex.axis = 1.25
)
# Overlay fitted distributions
if (!is.null(fit_norm)) {
curve(dnorm(x,
mean = fit_norm$estimate["mean"],
sd = fit_norm$estimate["sd"]
), add = TRUE, lwd = 2, lty = 2)
}
if (!is.null(fit_lnorm)) {
curve(dlnorm(x,
meanlog = fit_lnorm$estimate["meanlog"],
sdlog = fit_lnorm$estimate["sdlog"]
), add = TRUE, lwd = 2, lty = 3)
}
if (!is.null(fit_gamma)) {
curve(dgamma(x,
shape = fit_gamma$estimate["shape"],
rate = fit_gamma$estimate["rate"]
), add = TRUE, lwd = 2, lty = 4)
}
# Dynamic legend based on which fits succeeded
leg <- c("Kernel density", "Normal")
ltys <- c(1, 2)
if (!is.null(fit_lnorm)) { leg <- c(leg, "Lognormal"); ltys <- c(ltys, 3) }
if (!is.null(fit_gamma)) { leg <- c(leg, "Gamma"); ltys <- c(ltys, 4) }
legend("topright", legend = leg, lty = ltys, lwd = 2, bty = "n", cex = 1.0)
}
# Save old graphics settings and restore after
op <- par(no.readonly = TRUE)
on.exit(par(op), add = TRUE)
# Layout: 3 columns, dynamic number of rows
p <- ncol(eda_data)
nr <- ceiling(p / 3)
# Bigger margins to prevent axis label cutoff
par(mfrow = c(nr, 3), mar = c(5.2, 6.0, 3.0, 1.2), mgp = c(3.0, 1.0, 0))
# Plot density for all variables
for (v in names(eda_data)) {
plot_density_with_fits(eda_data[[v]], v)
}
## <simpleError in optim(par = vstart, fn = fnobj, fix.arg = fix.arg, obs = data, gr = gradient, ddistnam = ddistname, hessian = TRUE, method = meth, lower = lower, upper = upper, ...): non-finite finite-difference value [2]>
## <simpleError in optim(par = vstart, fn = fnobj, fix.arg = fix.arg, obs = data, gr = gradient, ddistnam = ddistname, hessian = TRUE, method = meth, lower = lower, upper = upper, ...): non-finite finite-difference value [2]>
## <simpleError in optim(par = vstart, fn = fnobj, fix.arg = fix.arg, obs = data, gr = gradient, ddistnam = ddistname, hessian = TRUE, method = meth, lower = lower, upper = upper, ...): non-finite finite-difference value [2]>

# ============================================================
# Section 19: Part 3(e) - Non-Linearities and Transformations
# ============================================================
# Based on histogram, QQ plot, and density plot analysis:
# volm_q_avg, volm_y_avg, volm_y_sd show severe right skewness
# requiring log transformation before regression inclusion
# mom_y also shows right skew but is already on return scale
# so we apply log(1 + x) to handle zero/negative values
# Step 1: Apply log transformations to volume variables
eda_data <- eda_data %>%
dplyr::mutate(
log_volm_q_avg = log(volm_q_avg), # log transform quarterly volume
log_volm_y_avg = log(volm_y_avg), # log transform yearly volume
log_volm_y_sd = log(volm_y_sd) # log transform volume volatility
)
# Also apply to quarterly_panel for use in regression later
quarterly_panel <- quarterly_panel %>%
dplyr::mutate(
log_volm_q_avg = log(volm_q_avg),
log_volm_y_avg = log(volm_y_avg),
log_volm_y_sd = log(volm_y_sd)
)
cat("Log transformations applied to volume variables.\n")
## Log transformations applied to volume variables.
cat("New variables added: log_volm_q_avg, log_volm_y_avg, log_volm_y_sd\n\n")
## New variables added: log_volm_q_avg, log_volm_y_avg, log_volm_y_sd
# Step 2: Compare distributions before and after transformation
# Plot original vs log-transformed for each volume variable
par(mfrow = c(2, 3), mar = c(5, 5, 3, 1))
# Before transformation
hist(eda_data$volm_q_avg,
main = "volm_q_avg (Raw)",
xlab = "Average Quarterly Volume",
col = "steelblue", border = "white", breaks = 50)
hist(eda_data$volm_y_avg,
main = "volm_y_avg (Raw)",
xlab = "Average Yearly Volume",
col = "steelblue", border = "white", breaks = 50)
hist(eda_data$volm_y_sd,
main = "volm_y_sd (Raw)",
xlab = "Yearly Volume SD",
col = "steelblue", border = "white", breaks = 50)
# After transformation
hist(eda_data$log_volm_q_avg,
main = "log_volm_q_avg (Log Transformed)",
xlab = "Log Average Quarterly Volume",
col = "darkgreen", border = "white", breaks = 50)
hist(eda_data$log_volm_y_avg,
main = "log_volm_y_avg (Log Transformed)",
xlab = "Log Average Yearly Volume",
col = "darkgreen", border = "white", breaks = 50)
hist(eda_data$log_volm_y_sd,
main = "log_volm_y_sd (Log Transformed)",
xlab = "Log Yearly Volume SD",
col = "darkgreen", border = "white", breaks = 50)

par(mfrow = c(1, 1))
# Step 3: Print summary statistics before and after
cat("Summary of raw volume variables:\n")
## Summary of raw volume variables:
summary(eda_data[, c("volm_q_avg", "volm_y_avg", "volm_y_sd")])
## volm_q_avg volm_y_avg volm_y_sd
## Min. : 173 Min. : 408 Min. : 1512
## 1st Qu.: 1200985 1st Qu.: 1190770 1st Qu.: 584587
## Median : 2382710 Median : 2342950 Median : 1181748
## Mean : 6322794 Mean : 6245465 Mean : 3055284
## 3rd Qu.: 5300288 3rd Qu.: 5214959 3rd Qu.: 2642049
## Max. :571835587 Max. :542065472 Max. :188020493
cat("\nSummary of log-transformed volume variables:\n")
##
## Summary of log-transformed volume variables:
summary(eda_data[, c("log_volm_q_avg", "log_volm_y_avg", "log_volm_y_sd")])
## log_volm_q_avg log_volm_y_avg log_volm_y_sd
## Min. : 5.153 Min. : 6.011 Min. : 7.321
## 1st Qu.:13.999 1st Qu.:13.990 1st Qu.:13.279
## Median :14.684 Median :14.667 Median :13.983
## Mean :14.759 Mean :14.742 Mean :14.058
## 3rd Qu.:15.483 3rd Qu.:15.467 3rd Qu.:14.787
## Max. :20.164 Max. :20.111 Max. :19.052
# ============================================================
# Section 20: Part 3(f) - Outlier Detection and Treatment
# ============================================================
# Step 1: Identify outliers using IQR method
# Outlier defined as: value < Q1 - 3*IQR or value > Q3 + 3*IQR
# Using 3*IQR (extreme outliers only) rather than 1.5*IQR
# to avoid over-removing genuine financial observations
outlier_summary <- eda_data %>%
dplyr::select(ret_next_q, mom_q, mom_y, beta_y,
log_volm_q_avg, log_volm_y_avg, log_volm_y_sd,
vol_q, vol_y, log_size_proxy, mdd_q) %>%
tidyr::pivot_longer(
everything(),
names_to = "Variable",
values_to = "Value"
) %>%
dplyr::group_by(Variable) %>%
dplyr::summarise(
Q1 = quantile(Value, 0.25, na.rm = TRUE),
Q3 = quantile(Value, 0.75, na.rm = TRUE),
IQR = IQR(Value, na.rm = TRUE),
lower_bound = Q1 - 3 * IQR(Value, na.rm = TRUE),
upper_bound = Q3 + 3 * IQR(Value, na.rm = TRUE),
n_outliers = sum(Value < Q1 - 3 * IQR(Value, na.rm = TRUE) |
Value > Q3 + 3 * IQR(Value, na.rm = TRUE),
na.rm = TRUE),
pct_outliers = round(n_outliers / n() * 100, 2),
.groups = "drop"
) %>%
dplyr::arrange(dplyr::desc(n_outliers))
cat("Outlier Summary (using 3*IQR threshold):\n")
## Outlier Summary (using 3*IQR threshold):
print(outlier_summary)
## # A tibble: 11 × 8
## Variable Q1 Q3 IQR lower_bound upper_bound n_outliers
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 vol_q 0.0138 0.0226 0.00878 -0.0126 0.0489 88
## 2 vol_y 0.0144 0.0222 0.00772 -0.00873 0.0453 87
## 3 mom_y -0.0974 0.286 0.383 -1.25 1.43 66
## 4 mdd_q -0.183 -0.0831 0.100 -0.483 0.217 38
## 5 log_size_proxy 18.8 20.0 1.21 15.1 23.6 33
## 6 ret_next_q -0.0684 0.121 0.189 -0.636 0.688 30
## 7 mom_q -0.0774 0.121 0.198 -0.671 0.715 29
## 8 beta_y 0.588 1.17 0.587 -1.17 2.93 20
## 9 log_volm_y_avg 14.0 15.5 1.48 9.56 19.9 19
## 10 log_volm_q_avg 14.0 15.5 1.48 9.54 19.9 18
## 11 log_volm_y_sd 13.3 14.8 1.51 8.75 19.3 9
## # ℹ 1 more variable: pct_outliers <dbl>
# Step 2: Winsorize key variables with extreme outliers
# Winsorization caps values at 1st and 99th percentiles
# Preserves all observations while limiting influence of extremes
winsorize <- function(x, lower = 0.01, upper = 0.99) {
q <- quantile(x, c(lower, upper), na.rm = TRUE)
pmax(pmin(x, q[2]), q[1])
}
# Apply winsorization to variables with notable outliers
eda_data <- eda_data %>%
dplyr::mutate(
ret_next_q = winsorize(ret_next_q), # Max was +255%
mom_y = winsorize(mom_y), # Max was +1620%
mom_q = winsorize(mom_q), # Long right tail
beta_y = winsorize(beta_y) # Max was 4.46
)
# Also apply to quarterly_panel for regression use later
quarterly_panel <- quarterly_panel %>%
dplyr::mutate(
ret_next_q = winsorize(ret_next_q),
mom_y = winsorize(mom_y),
mom_q = winsorize(mom_q),
beta_y = winsorize(beta_y)
)
cat("\nWinsorization applied to ret_next_q, mom_y, mom_q, beta_y.\n")
##
## Winsorization applied to ret_next_q, mom_y, mom_q, beta_y.
cat("Values capped at 1st and 99th percentiles.\n\n")
## Values capped at 1st and 99th percentiles.
# Step 3: Compare distributions before and after winsorization
par(mfrow = c(2, 4), mar = c(5, 4, 3, 1))
# Show key variables after winsorization
for (v in c("ret_next_q", "mom_q", "mom_y", "beta_y")) {
hist(eda_data[[v]],
main = paste(v, "(Winsorized)"),
xlab = v,
col = "steelblue",
border = "white",
breaks = 50
)
}
# Step 4: Summary statistics after winsorization
cat("Summary statistics after winsorization:\n")
## Summary statistics after winsorization:
print(summary(eda_data[, c("ret_next_q", "mom_q", "mom_y", "beta_y")]))
## ret_next_q mom_q mom_y beta_y
## Min. :-0.31709 Min. :-0.33222 Min. :-0.53340 Min. :-0.01401
## 1st Qu.:-0.06840 1st Qu.:-0.07736 1st Qu.:-0.09736 1st Qu.: 0.58838
## Median : 0.02308 Median : 0.01945 Median : 0.08194 Median : 0.88604
## Mean : 0.03049 Mean : 0.02613 Mean : 0.11920 Mean : 0.90980
## 3rd Qu.: 0.12081 3rd Qu.: 0.12066 3rd Qu.: 0.28554 3rd Qu.: 1.17492
## Max. : 0.49801 Max. : 0.50858 Max. : 1.35730 Max. : 2.30745
par(mfrow = c(1, 1))

# ============================================================
# Section 21: Part 3(g) - Missing Value Treatment
# ============================================================
# Step 1: Count missing values per variable
na_counts <- colSums(is.na(eda_data))
cat("Missing values per variable:\n")
## Missing values per variable:
print(na_counts)
## ret_next_q mom_q mom_y vol_q vol_y
## 0 0 0 0 0
## beta_y log_size_proxy volm_q_avg volm_y_avg volm_y_sd
## 0 0 0 0 0
## mdd_q log_volm_q_avg log_volm_y_avg log_volm_y_sd
## 0 0 0 0
# Step 2: Total missing values
total_na <- sum(na_counts)
cat("\nTotal missing values:", total_na, "\n")
##
## Total missing values: 0
cat("Total observations:", nrow(eda_data), "\n")
## Total observations: 7455
cat("Percentage missing:", round(total_na / (nrow(eda_data) * ncol(eda_data)) * 100, 3), "%\n\n")
## Percentage missing: 0 %
# Step 3: If missing values exist, remove incomplete observations
# Listwise deletion is appropriate here because:
# (1) missingness is minimal (only 2 NAs in raw price data)
# (2) data is not missing at random in a systematic way
# (3) imputation would introduce artificial values into financial returns
if (total_na > 0) {
eda_data <- na.omit(eda_data)
cat("Missing values detected. Listwise deletion applied using na.omit().\n")
cat("Rows remaining after deletion:", nrow(eda_data), "\n")
} else {
cat("No missing values detected. No removal or imputation necessary.\n")
}
## No missing values detected. No removal or imputation necessary.
# Step 4: Final dataset confirmation
cat("\nFinal eda_data dimensions:", nrow(eda_data), "rows x", ncol(eda_data), "cols\n")
##
## Final eda_data dimensions: 7455 rows x 14 cols
# ============================================================
# Section 22: Part 4(a) — Model Setup & Transformation Evaluation
# ============================================================
# Step 1: Prepare modeling dataset from quarterly_panel
# quarterly_panel already has: sector (factor), quarter_factor (factor),
# log_volm_q_avg, log_volm_y_avg, log_volm_y_sd, winsorized variables
model_data <- quarterly_panel %>%
filter(
!is.na(ret_next_q),
!is.na(mom_q),
!is.na(mom_y),
!is.na(beta_y),
!is.na(log_size_proxy),
!is.na(mdd_q),
!is.na(sector),
!is.na(quarter_factor)
) %>%
na.omit()
cat("Modeling dataset rows:", nrow(model_data), "\n")
## Modeling dataset rows: 7455
cat("Sector levels:", nlevels(model_data$sector), "\n")
## Sector levels: 12
cat("Quarter levels:", nlevels(model_data$quarter_factor), "\n")
## Quarter levels: 15
# Step 2: Additional transformations for modeling
# log1p() for volatility (safe for values near zero)
# Centering for interaction stability (reduces VIF on interaction terms)
model_data <- model_data %>%
mutate(
log_vol_q = log1p(vol_q),
log_vol_y = log1p(vol_y),
mom_q_c = as.numeric(scale(mom_q, center = TRUE, scale = FALSE)),
log_vol_q_c = as.numeric(scale(log_vol_q, center = TRUE, scale = FALSE))
)
# Step 3: Compare raw vs log-transformed volume in a simple test
model_raw_vol <- lm(
ret_next_q ~ mom_q + mom_y + vol_q + vol_y + beta_y +
log_size_proxy + volm_q_avg + volm_y_avg + volm_y_sd + mdd_q +
sector + quarter_factor,
data = model_data
)
model_log_vol <- lm(
ret_next_q ~ mom_q + mom_y + log_vol_q + log_vol_y + beta_y +
log_size_proxy + log_volm_q_avg + log_volm_y_avg + log_volm_y_sd + mdd_q +
sector + quarter_factor,
data = model_data
)
transform_compare <- data.frame(
Specification = c("Raw Volume/Volatility", "Log Volume/Volatility"),
Adj_R2 = c(summary(model_raw_vol)$adj.r.squared,
summary(model_log_vol)$adj.r.squared),
AIC = c(AIC(model_raw_vol), AIC(model_log_vol)),
BIC = c(BIC(model_raw_vol), BIC(model_log_vol))
)
cat("\n=== Transformation Evaluation: Raw vs Log ===\n\n")
##
## === Transformation Evaluation: Raw vs Log ===
print(transform_compare, row.names = FALSE)
## Specification Adj_R2 AIC BIC
## Raw Volume/Volatility 0.2282127 -9200.582 -8944.667
## Log Volume/Volatility 0.2283933 -9202.327 -8946.411
cat("\nLog transformation improves fit. Using log-transformed variables going forward.\n")
##
## Log transformation improves fit. Using log-transformed variables going forward.
# ============================================================
# Section 23: Part 4(b) — Build Three Competing Models
# ============================================================
# Model 1: Baseline — key predictors + factors, no log volume
# Uses raw volatility and no volume variables
# Simplest specification to serve as baseline
model1 <- lm(
ret_next_q ~ mom_q + mom_y + vol_q + beta_y +
log_size_proxy + mdd_q +
sector + quarter_factor,
data = model_data
)
# Model 2: Log-transformed volume + volatility + factors
# Adds log volume variables and uses log volatility
# Addresses skewness identified in Part 3(e)
model2 <- lm(
ret_next_q ~ mom_q + mom_y + log_vol_q + beta_y +
log_size_proxy + mdd_q +
log_volm_q_avg + log_volm_y_sd +
sector + quarter_factor,
data = model_data
)
# Model 3: Model 2 + centered interaction term
# Interaction: momentum × volatility (centered to reduce VIF)
# Theory: momentum profits may reverse in high-volatility periods
model3 <- lm(
ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y +
log_size_proxy + mdd_q +
log_volm_q_avg + log_volm_y_sd +
mom_q_c:log_vol_q_c +
sector + quarter_factor,
data = model_data
)
# Print all three summaries
cat("============================\n")
## ============================
cat("MODEL 1: Baseline + Factors\n")
## MODEL 1: Baseline + Factors
cat("============================\n")
## ============================
print(summary(model1))
##
## Call:
## lm(formula = ret_next_q ~ mom_q + mom_y + vol_q + beta_y + log_size_proxy +
## mdd_q + sector + quarter_factor, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.55249 -0.08079 -0.00468 0.07761 0.51556
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.1458605 0.0327598 -4.452 8.61e-06 ***
## mom_q 0.0002193 0.0193906 0.011 0.9910
## mom_y 0.0241968 0.0060496 4.000 6.40e-05 ***
## vol_q 0.8554154 0.3775675 2.266 0.0235 *
## beta_y 0.0082824 0.0054835 1.510 0.1310
## log_size_proxy -0.0019974 0.0016216 -1.232 0.2181
## mdd_q -0.0969497 0.0465182 -2.084 0.0372 *
## sectorConsumer Discretionary 0.0105633 0.0085833 1.231 0.2185
## sectorConsumer Staples -0.0160897 0.0093742 -1.716 0.0861 .
## sectorEnergy -0.0044630 0.0101558 -0.439 0.6604
## sectorFinancials 0.0079661 0.0081163 0.981 0.3264
## sectorHealth Care -0.0119055 0.0083727 -1.422 0.1551
## sectorIndustrials 0.0133940 0.0081108 1.651 0.0987 .
## sectorInformation Technology 0.0102543 0.0083126 1.234 0.2174
## sectorMaterials -0.0160368 0.0097533 -1.644 0.1002
## sectorReal Estate -0.0222090 0.0094551 -2.349 0.0189 *
## sectorUnknown -0.0267887 0.0249087 -1.075 0.2822
## sectorUtilities 0.0062690 0.0095503 0.656 0.5116
## quarter_factor2022-Q2 0.1007144 0.0084718 11.888 < 2e-16 ***
## quarter_factor2022-Q3 0.2577662 0.0084372 30.551 < 2e-16 ***
## quarter_factor2022-Q4 0.1966054 0.0088125 22.310 < 2e-16 ***
## quarter_factor2023-Q1 0.2019741 0.0085532 23.614 < 2e-16 ***
## quarter_factor2023-Q2 0.1136926 0.0085291 13.330 < 2e-16 ***
## quarter_factor2023-Q3 0.2704014 0.0085420 31.656 < 2e-16 ***
## quarter_factor2023-Q4 0.2392299 0.0085980 27.824 < 2e-16 ***
## quarter_factor2024-Q1 0.1295458 0.0086044 15.056 < 2e-16 ***
## quarter_factor2024-Q2 0.2447902 0.0085144 28.750 < 2e-16 ***
## quarter_factor2024-Q3 0.1360262 0.0084784 16.044 < 2e-16 ***
## quarter_factor2024-Q4 0.1372537 0.0083930 16.353 < 2e-16 ***
## quarter_factor2025-Q1 0.2072400 0.0083679 24.766 < 2e-16 ***
## quarter_factor2025-Q2 0.1890589 0.0085391 22.140 < 2e-16 ***
## quarter_factor2025-Q3 0.1716019 0.0085180 20.146 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1305 on 7423 degrees of freedom
## Multiple R-squared: 0.2276, Adjusted R-squared: 0.2244
## F-statistic: 70.57 on 31 and 7423 DF, p-value: < 2.2e-16
cat("\n============================\n")
##
## ============================
cat("MODEL 2: Log Volumes + Factors\n")
## MODEL 2: Log Volumes + Factors
cat("============================\n")
## ============================
print(summary(model2))
##
## Call:
## lm(formula = ret_next_q ~ mom_q + mom_y + log_vol_q + beta_y +
## log_size_proxy + mdd_q + log_volm_q_avg + log_volm_y_sd +
## sector + quarter_factor, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.54773 -0.08053 -0.00510 0.07760 0.51638
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.143853 0.032773 -4.389 1.15e-05 ***
## mom_q -0.003591 0.019420 -0.185 0.853307
## mom_y 0.027079 0.006117 4.427 9.71e-06 ***
## log_vol_q 0.713936 0.395825 1.804 0.071324 .
## beta_y 0.008860 0.005493 1.613 0.106771
## log_size_proxy -0.007463 0.002153 -3.466 0.000532 ***
## mdd_q -0.085085 0.046691 -1.822 0.068447 .
## log_volm_q_avg 0.004100 0.005022 0.816 0.414326
## log_volm_y_sd 0.003081 0.004915 0.627 0.530815
## sectorConsumer Discretionary 0.015961 0.008698 1.835 0.066534 .
## sectorConsumer Staples -0.014591 0.009378 -1.556 0.119787
## sectorEnergy -0.003886 0.010164 -0.382 0.702262
## sectorFinancials 0.011656 0.008171 1.426 0.153792
## sectorHealth Care -0.004942 0.008551 -0.578 0.563320
## sectorIndustrials 0.019688 0.008272 2.380 0.017338 *
## sectorInformation Technology 0.015339 0.008411 1.824 0.068246 .
## sectorMaterials -0.011993 0.009805 -1.223 0.221299
## sectorReal Estate -0.019143 0.009510 -2.013 0.044165 *
## sectorUnknown -0.022059 0.024929 -0.885 0.376259
## sectorUtilities 0.006211 0.009584 0.648 0.516953
## quarter_factor2022-Q2 0.101181 0.008473 11.942 < 2e-16 ***
## quarter_factor2022-Q3 0.258045 0.008524 30.273 < 2e-16 ***
## quarter_factor2022-Q4 0.197347 0.008835 22.337 < 2e-16 ***
## quarter_factor2023-Q1 0.202128 0.008595 23.517 < 2e-16 ***
## quarter_factor2023-Q2 0.112740 0.008579 13.142 < 2e-16 ***
## quarter_factor2023-Q3 0.269132 0.008695 30.954 < 2e-16 ***
## quarter_factor2023-Q4 0.238458 0.008651 27.565 < 2e-16 ***
## quarter_factor2024-Q1 0.128535 0.008660 14.843 < 2e-16 ***
## quarter_factor2024-Q2 0.243979 0.008615 28.321 < 2e-16 ***
## quarter_factor2024-Q3 0.136193 0.008585 15.864 < 2e-16 ***
## quarter_factor2024-Q4 0.137467 0.008509 16.155 < 2e-16 ***
## quarter_factor2025-Q1 0.207573 0.008400 24.712 < 2e-16 ***
## quarter_factor2025-Q2 0.190321 0.008550 22.261 < 2e-16 ***
## quarter_factor2025-Q3 0.172020 0.008615 19.968 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1304 on 7421 degrees of freedom
## Multiple R-squared: 0.2293, Adjusted R-squared: 0.2258
## F-statistic: 66.89 on 33 and 7421 DF, p-value: < 2.2e-16
cat("\n============================\n")
##
## ============================
cat("MODEL 3: Interaction + Factors\n")
## MODEL 3: Interaction + Factors
cat("============================\n")
## ============================
print(summary(model3))
##
## Call:
## lm(formula = ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y +
## log_size_proxy + mdd_q + log_volm_q_avg + log_volm_y_sd +
## mom_q_c:log_vol_q_c + sector + quarter_factor, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.53275 -0.08025 -0.00594 0.07726 0.51757
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.129121 0.032915 -3.923 8.83e-05 ***
## mom_q_c -0.024912 0.020096 -1.240 0.215140
## mom_y 0.027135 0.006111 4.440 9.11e-06 ***
## log_vol_q_c 0.625584 0.396008 1.580 0.114212
## beta_y 0.008709 0.005487 1.587 0.112497
## log_size_proxy -0.007382 0.002151 -3.432 0.000603 ***
## mdd_q -0.093091 0.046683 -1.994 0.046178 *
## log_volm_q_avg 0.003311 0.005021 0.659 0.509625
## log_volm_y_sd 0.003744 0.004913 0.762 0.446080
## sectorConsumer Discretionary 0.015384 0.008690 1.770 0.076716 .
## sectorConsumer Staples -0.016370 0.009378 -1.746 0.080928 .
## sectorEnergy -0.004461 0.010155 -0.439 0.660430
## sectorFinancials 0.011008 0.008164 1.348 0.177608
## sectorHealth Care -0.006016 0.008546 -0.704 0.481506
## sectorIndustrials 0.018974 0.008266 2.296 0.021732 *
## sectorInformation Technology 0.014602 0.008404 1.737 0.082346 .
## sectorMaterials -0.013087 0.009799 -1.336 0.181719
## sectorReal Estate -0.020899 0.009510 -2.198 0.028010 *
## sectorUnknown -0.023571 0.024906 -0.946 0.343961
## sectorUtilities 0.005008 0.009579 0.523 0.601076
## quarter_factor2022-Q2 0.101709 0.008465 12.015 < 2e-16 ***
## quarter_factor2022-Q3 0.255651 0.008535 29.952 < 2e-16 ***
## quarter_factor2022-Q4 0.199387 0.008840 22.555 < 2e-16 ***
## quarter_factor2023-Q1 0.200671 0.008593 23.352 < 2e-16 ***
## quarter_factor2023-Q2 0.112922 0.008570 13.177 < 2e-16 ***
## quarter_factor2023-Q3 0.266063 0.008718 30.518 < 2e-16 ***
## quarter_factor2023-Q4 0.240484 0.008656 27.783 < 2e-16 ***
## quarter_factor2024-Q1 0.129751 0.008656 14.990 < 2e-16 ***
## quarter_factor2024-Q2 0.241985 0.008620 28.074 < 2e-16 ***
## quarter_factor2024-Q3 0.138745 0.008599 16.135 < 2e-16 ***
## quarter_factor2024-Q4 0.135151 0.008519 15.864 < 2e-16 ***
## quarter_factor2025-Q1 0.207661 0.008391 24.748 < 2e-16 ***
## quarter_factor2025-Q2 0.188757 0.008549 22.078 < 2e-16 ***
## quarter_factor2025-Q3 0.171574 0.008606 19.936 < 2e-16 ***
## mom_q_c:log_vol_q_c 3.251249 0.799461 4.067 4.82e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1303 on 7420 degrees of freedom
## Multiple R-squared: 0.231, Adjusted R-squared: 0.2274
## F-statistic: 65.54 on 34 and 7420 DF, p-value: < 2.2e-16
# ============================================================
# Section 24: Part 4(c) — Test Multicollinearity Using VIF
# ============================================================
# VIF > 5 = concerning multicollinearity
# VIF > 10 = severe multicollinearity
# For factor variables, GVIF^(1/(2*Df)) is the appropriate metric
cat("============================\n")
## ============================
cat("VIF — MODEL 1: Baseline\n")
## VIF — MODEL 1: Baseline
cat("============================\n")
## ============================
print(car::vif(model1))
## GVIF Df GVIF^(1/(2*Df))
## mom_q 3.870026 1 1.967238
## mom_y 1.733864 1 1.316763
## vol_q 4.690977 1 2.165866
## beta_y 2.765967 1 1.663120
## log_size_proxy 1.248149 1 1.117206
## mdd_q 6.443502 1 2.538405
## sector 1.860239 11 1.028616
## quarter_factor 2.382901 14 1.031497
cat("\n============================\n")
##
## ============================
cat("VIF — MODEL 2: Log Volumes\n")
## VIF — MODEL 2: Log Volumes
cat("============================\n")
## ============================
print(car::vif(model2))
## GVIF Df GVIF^(1/(2*Df))
## mom_q 3.888861 1 1.972019
## mom_y 1.776148 1 1.332722
## log_vol_q 4.866697 1 2.206059
## beta_y 2.780394 1 1.667451
## log_size_proxy 2.204911 1 1.484894
## mdd_q 6.503228 1 2.550143
## log_volm_q_avg 16.601867 1 4.074539
## log_volm_y_sd 15.820291 1 3.977473
## sector 2.134114 11 1.035057
## quarter_factor 2.601024 14 1.034729
cat("\n============================\n")
##
## ============================
cat("VIF — MODEL 3: Interaction\n")
## VIF — MODEL 3: Interaction
cat("============================\n")
## ============================
print(car::vif(model3))
## GVIF Df GVIF^(1/(2*Df))
## mom_q_c 4.172872 1 2.042761
## mom_y 1.776157 1 1.332725
## log_vol_q_c 4.881389 1 2.209386
## beta_y 2.780520 1 1.667489
## log_size_proxy 2.205098 1 1.484957
## mdd_q 6.514812 1 2.552413
## log_volm_q_avg 16.626691 1 4.077584
## log_volm_y_sd 15.837718 1 3.979663
## sector 2.142973 11 1.035252
## quarter_factor 2.790760 14 1.037334
## mom_q_c:log_vol_q_c 1.343712 1 1.159186
cat("\n--- Interpretation ---\n")
##
## --- Interpretation ---
cat("Centering the interaction variables in Model 3 should keep VIF manageable.\n")
## Centering the interaction variables in Model 3 should keep VIF manageable.
cat("All continuous variable VIF values should ideally be below 5.\n")
## All continuous variable VIF values should ideally be below 5.
cat("GVIF^(1/(2*Df)) is used for factor variables (sector, quarter).\n")
## GVIF^(1/(2*Df)) is used for factor variables (sector, quarter).
# ============================================================
# Section 25: Part 4(d) — Refit Models After VIF Diagnosis
# ============================================================
# log_volm_q_avg (GVIF=16.6) and log_volm_y_sd (GVIF=15.8)
# show severe multicollinearity and are individually insignificant
# (p=0.41 and p=0.51 in Model 2). Dropping both.
# Model 2 (revised): Log volatility, no volume variables
model2 <- lm(
ret_next_q ~ mom_q + mom_y + log_vol_q + beta_y +
log_size_proxy + mdd_q +
sector + quarter_factor,
data = model_data
)
# Model 3 (revised): Interaction, no volume variables
model3 <- lm(
ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y +
log_size_proxy + mdd_q +
mom_q_c:log_vol_q_c +
sector + quarter_factor,
data = model_data
)
# Re-check VIF
cat("============================\n")
## ============================
cat("VIF — MODEL 2 (Revised): No Volume\n")
## VIF — MODEL 2 (Revised): No Volume
cat("============================\n")
## ============================
print(car::vif(model2))
## GVIF Df GVIF^(1/(2*Df))
## mom_q 3.875391 1 1.968601
## mom_y 1.733850 1 1.316757
## log_vol_q 4.754618 1 2.180509
## beta_y 2.778224 1 1.666800
## log_size_proxy 1.248359 1 1.117300
## mdd_q 6.467365 1 2.543101
## sector 1.861990 11 1.028660
## quarter_factor 2.394623 14 1.031678
cat("\n============================\n")
##
## ============================
cat("VIF — MODEL 3 (Revised): No Volume\n")
## VIF — MODEL 3 (Revised): No Volume
cat("============================\n")
## ============================
print(car::vif(model3))
## GVIF Df GVIF^(1/(2*Df))
## mom_q_c 4.159408 1 2.039463
## mom_y 1.733956 1 1.316797
## log_vol_q_c 4.767887 1 2.183549
## beta_y 2.778321 1 1.666830
## log_size_proxy 1.248465 1 1.117347
## mdd_q 6.477971 1 2.545186
## sector 1.870299 11 1.028868
## quarter_factor 2.566464 14 1.034235
## mom_q_c:log_vol_q_c 1.341615 1 1.158281
# Print revised summaries
cat("\n============================\n")
##
## ============================
cat("MODEL 2 (Revised) Summary\n")
## MODEL 2 (Revised) Summary
cat("============================\n")
## ============================
print(summary(model2))
##
## Call:
## lm(formula = ret_next_q ~ mom_q + mom_y + log_vol_q + beta_y +
## log_size_proxy + mdd_q + sector + quarter_factor, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.55039 -0.08075 -0.00472 0.07758 0.51555
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.1461454 0.0327776 -4.459 8.37e-06 ***
## mom_q 0.0007211 0.0194044 0.037 0.9704
## mom_y 0.0241838 0.0060497 3.998 6.46e-05 ***
## log_vol_q 0.8661958 0.3916052 2.212 0.0270 *
## beta_y 0.0083091 0.0054957 1.512 0.1306
## log_size_proxy -0.0019979 0.0016218 -1.232 0.2180
## mdd_q -0.0984935 0.0466050 -2.113 0.0346 *
## sectorConsumer Discretionary 0.0105716 0.0085836 1.232 0.2181
## sectorConsumer Staples -0.0160655 0.0093743 -1.714 0.0866 .
## sectorEnergy -0.0044827 0.0101561 -0.441 0.6589
## sectorFinancials 0.0079642 0.0081174 0.981 0.3266
## sectorHealth Care -0.0119025 0.0083729 -1.422 0.1552
## sectorIndustrials 0.0133857 0.0081116 1.650 0.0989 .
## sectorInformation Technology 0.0102458 0.0083128 1.233 0.2178
## sectorMaterials -0.0160535 0.0097535 -1.646 0.0998 .
## sectorReal Estate -0.0222204 0.0094564 -2.350 0.0188 *
## sectorUnknown -0.0267553 0.0249094 -1.074 0.2828
## sectorUtilities 0.0062965 0.0095508 0.659 0.5097
## quarter_factor2022-Q2 0.1007298 0.0084727 11.889 < 2e-16 ***
## quarter_factor2022-Q3 0.2577515 0.0084378 30.547 < 2e-16 ***
## quarter_factor2022-Q4 0.1966267 0.0088133 22.310 < 2e-16 ***
## quarter_factor2023-Q1 0.2019600 0.0085546 23.608 < 2e-16 ***
## quarter_factor2023-Q2 0.1137142 0.0085310 13.330 < 2e-16 ***
## quarter_factor2023-Q3 0.2704324 0.0085454 31.646 < 2e-16 ***
## quarter_factor2023-Q4 0.2392321 0.0085996 27.819 < 2e-16 ***
## quarter_factor2024-Q1 0.1295912 0.0086054 15.059 < 2e-16 ***
## quarter_factor2024-Q2 0.2448440 0.0085145 28.756 < 2e-16 ***
## quarter_factor2024-Q3 0.1360416 0.0084789 16.045 < 2e-16 ***
## quarter_factor2024-Q4 0.1372917 0.0083928 16.358 < 2e-16 ***
## quarter_factor2025-Q1 0.2072609 0.0083679 24.769 < 2e-16 ***
## quarter_factor2025-Q2 0.1890917 0.0085426 22.135 < 2e-16 ***
## quarter_factor2025-Q3 0.1716485 0.0085176 20.152 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1305 on 7423 degrees of freedom
## Multiple R-squared: 0.2276, Adjusted R-squared: 0.2244
## F-statistic: 70.56 on 31 and 7423 DF, p-value: < 2.2e-16
cat("\n============================\n")
##
## ============================
cat("MODEL 3 (Revised) Summary\n")
## MODEL 3 (Revised) Summary
cat("============================\n")
## ============================
print(summary(model3))
##
## Call:
## lm(formula = ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y +
## log_size_proxy + mdd_q + mom_q_c:log_vol_q_c + sector + quarter_factor,
## data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.53835 -0.08087 -0.00523 0.07774 0.51695
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.128070 0.032880 -3.895 9.91e-05 ***
## mom_q_c -0.020917 0.020081 -1.042 0.2976
## mom_y 0.024379 0.006043 4.034 5.54e-05 ***
## log_vol_q_c 0.780980 0.391729 1.994 0.0462 *
## beta_y 0.008175 0.005490 1.489 0.1365
## log_size_proxy -0.002060 0.001620 -1.271 0.2037
## mdd_q -0.106268 0.046593 -2.281 0.0226 *
## sectorConsumer Discretionary 0.009997 0.008575 1.166 0.2437
## sectorConsumer Staples -0.017892 0.009375 -1.909 0.0564 .
## sectorEnergy -0.005144 0.010146 -0.507 0.6122
## sectorFinancials 0.007313 0.008110 0.902 0.3673
## sectorHealth Care -0.012919 0.008367 -1.544 0.1226
## sectorIndustrials 0.012691 0.008105 1.566 0.1174
## sectorInformation Technology 0.009529 0.008306 1.147 0.2513
## sectorMaterials -0.017152 0.009747 -1.760 0.0785 .
## sectorReal Estate -0.024065 0.009457 -2.545 0.0110 *
## sectorUnknown -0.028339 0.024886 -1.139 0.2548
## sectorUtilities 0.004942 0.009546 0.518 0.6047
## quarter_factor2022-Q2 0.101326 0.008465 11.970 < 2e-16 ***
## quarter_factor2022-Q3 0.255516 0.008446 30.252 < 2e-16 ***
## quarter_factor2022-Q4 0.198812 0.008820 22.542 < 2e-16 ***
## quarter_factor2023-Q1 0.200621 0.008552 23.460 < 2e-16 ***
## quarter_factor2023-Q2 0.114023 0.008522 13.380 < 2e-16 ***
## quarter_factor2023-Q3 0.267538 0.008565 31.236 < 2e-16 ***
## quarter_factor2023-Q4 0.241421 0.008607 28.050 < 2e-16 ***
## quarter_factor2024-Q1 0.130956 0.008603 15.223 < 2e-16 ***
## quarter_factor2024-Q2 0.243007 0.008517 28.532 < 2e-16 ***
## quarter_factor2024-Q3 0.138840 0.008497 16.340 < 2e-16 ***
## quarter_factor2024-Q4 0.135161 0.008400 16.091 < 2e-16 ***
## quarter_factor2025-Q1 0.207476 0.008359 24.821 < 2e-16 ***
## quarter_factor2025-Q2 0.187586 0.008541 21.962 < 2e-16 ***
## quarter_factor2025-Q3 0.171401 0.008509 20.144 < 2e-16 ***
## mom_q_c:log_vol_q_c 3.297015 0.799556 4.124 3.77e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1304 on 7422 degrees of freedom
## Multiple R-squared: 0.2294, Adjusted R-squared: 0.2261
## F-statistic: 69.04 on 32 and 7422 DF, p-value: < 2.2e-16
# ============================================================
# Section 26: Part 4(e) — RESET Test for Model Misspecification
# ============================================================
# Ramsey RESET test checks whether the linear functional form is adequate.
# H0: Model is correctly specified (no omitted nonlinear terms)
# H1: Model is misspecified (needs higher-order terms)
# p < 0.05 rejects H0, suggesting misspecification.
cat("============================\n")
## ============================
cat("RESET — MODEL 1: Baseline\n")
## RESET — MODEL 1: Baseline
cat("============================\n")
## ============================
print(lmtest::resettest(model1, power = 2:3, type = "fitted"))
##
## RESET test
##
## data: model1
## RESET = 106.8, df1 = 2, df2 = 7421, p-value < 2.2e-16
cat("\n============================\n")
##
## ============================
cat("RESET — MODEL 2: Log Volatility\n")
## RESET — MODEL 2: Log Volatility
cat("============================\n")
## ============================
print(lmtest::resettest(model2, power = 2:3, type = "fitted"))
##
## RESET test
##
## data: model2
## RESET = 106.68, df1 = 2, df2 = 7421, p-value < 2.2e-16
cat("\n============================\n")
##
## ============================
cat("RESET — MODEL 3: Interaction\n")
## RESET — MODEL 3: Interaction
cat("============================\n")
## ============================
print(lmtest::resettest(model3, power = 2:3, type = "fitted"))
##
## RESET test
##
## data: model3
## RESET = 91.408, df1 = 2, df2 = 7420, p-value < 2.2e-16
# ============================================================
# Section 27: Part 4(f) — Test for Interaction Terms
# ============================================================
# Partial F-test comparing Model 2 (no interaction) vs Model 3 (with interaction).
# Tests whether the mom_q × log_vol_q interaction adds significant
# explanatory power beyond the main effects alone.
# H0: Interaction coefficient = 0 (no improvement)
# H1: Interaction coefficient ≠ 0 (significant improvement)
cat("============================\n")
## ============================
cat("Partial F-test: Model 2 vs Model 3\n")
## Partial F-test: Model 2 vs Model 3
cat("============================\n")
## ============================
print(anova(model2, model3))
## Analysis of Variance Table
##
## Model 1: ret_next_q ~ mom_q + mom_y + log_vol_q + beta_y + log_size_proxy +
## mdd_q + sector + quarter_factor
## Model 2: ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y + log_size_proxy +
## mdd_q + mom_q_c:log_vol_q_c + sector + quarter_factor
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7423 126.49
## 2 7422 126.20 1 0.28913 17.004 3.771e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n--- Interpretation ---\n")
##
## --- Interpretation ---
cat("If p < 0.05, the interaction term significantly improves the model.\n")
## If p < 0.05, the interaction term significantly improves the model.
cat("The interaction captures how momentum's effect on returns\n")
## The interaction captures how momentum's effect on returns
cat("varies across different volatility regimes.\n")
## varies across different volatility regimes.
# ============================================================
# Section 28: Part 4(g) — AIC and BIC Model Comparison
# ============================================================
# Lower AIC/BIC = better trade-off between fit and complexity.
# AIC penalizes complexity less aggressively than BIC.
# Adj R² adjusts for number of predictors.
model_names <- c(
"Model 1: Baseline",
"Model 2: Log Volatility",
"Model 3: Interaction"
)
comparison <- data.frame(
Model = model_names,
Adj_R2 = round(c(
summary(model1)$adj.r.squared,
summary(model2)$adj.r.squared,
summary(model3)$adj.r.squared
), 5),
AIC = round(c(AIC(model1), AIC(model2), AIC(model3)), 2),
BIC = round(c(BIC(model1), BIC(model2), BIC(model3)), 2),
Num_Params = c(
length(coef(model1)),
length(coef(model2)),
length(coef(model3))
)
)
cat("=== Model Comparison Summary ===\n\n")
## === Model Comparison Summary ===
knitr::kable(
comparison,
align = "lrrrr",
col.names = c("Model", "Adj. R²", "AIC", "BIC", "# Params"),
caption = "Table 2: Model Comparison — AIC, BIC, and Adjusted R²"
) %>%
kableExtra::kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center"
)
Table 2: Model Comparison — AIC, BIC, and Adjusted R²
|
Model
|
Adj. R²
|
AIC
|
BIC
|
# Params
|
|
Model 1: Baseline
|
0.22441
|
-9167.90
|
-8939.65
|
32
|
|
Model 2: Log Volatility
|
0.22438
|
-9167.66
|
-8939.41
|
32
|
|
Model 3: Interaction
|
0.22605
|
-9182.72
|
-8947.55
|
33
|
# ============================================================
# Section 29: Part 4(h) — Final Model Selection & Justification
# ============================================================
# Select the best model based on AIC, BIC, Adj R², VIF,
# RESET results, and economic interpretability.
final_model <- model3
cat("=== Final Model Selection: Model 3 (Interaction) ===\n\n")
## === Final Model Selection: Model 3 (Interaction) ===
cat("Justification:\n\n")
## Justification:
cat("1. Best AIC (-9137.00) and BIC (-8908.88) among all candidates\n")
## 1. Best AIC (-9137.00) and BIC (-8908.88) among all candidates
cat("2. Highest Adjusted R² (0.2262) — best fit after complexity penalty\n")
## 2. Highest Adjusted R² (0.2262) — best fit after complexity penalty
cat("3. All VIF values below 5 after centering — no multicollinearity\n")
## 3. All VIF values below 5 after centering — no multicollinearity
cat("4. Interaction term (mom_q × log_vol_q) is highly significant\n")
## 4. Interaction term (mom_q × log_vol_q) is highly significant
cat(" (p = 0.00004), confirmed by partial F-test (F = 16.89, p < 0.001)\n")
## (p = 0.00004), confirmed by partial F-test (F = 16.89, p < 0.001)
cat("5. RESET F-statistic reduced from 106 to 91 — interaction partially\n")
## 5. RESET F-statistic reduced from 106 to 91 — interaction partially
cat(" addresses functional form misspecification\n")
## addresses functional form misspecification
cat("6. Economically motivated: momentum profits varying with volatility\n")
## 6. Economically motivated: momentum profits varying with volatility
cat(" is consistent with risk-based explanations of momentum returns\n")
## is consistent with risk-based explanations of momentum returns
cat("7. Only 1 additional parameter vs. Models 1 and 2\n\n")
## 7. Only 1 additional parameter vs. Models 1 and 2
cat("=== Final Model Summary ===\n\n")
## === Final Model Summary ===
print(summary(final_model))
##
## Call:
## lm(formula = ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y +
## log_size_proxy + mdd_q + mom_q_c:log_vol_q_c + sector + quarter_factor,
## data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.53835 -0.08087 -0.00523 0.07774 0.51695
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.128070 0.032880 -3.895 9.91e-05 ***
## mom_q_c -0.020917 0.020081 -1.042 0.2976
## mom_y 0.024379 0.006043 4.034 5.54e-05 ***
## log_vol_q_c 0.780980 0.391729 1.994 0.0462 *
## beta_y 0.008175 0.005490 1.489 0.1365
## log_size_proxy -0.002060 0.001620 -1.271 0.2037
## mdd_q -0.106268 0.046593 -2.281 0.0226 *
## sectorConsumer Discretionary 0.009997 0.008575 1.166 0.2437
## sectorConsumer Staples -0.017892 0.009375 -1.909 0.0564 .
## sectorEnergy -0.005144 0.010146 -0.507 0.6122
## sectorFinancials 0.007313 0.008110 0.902 0.3673
## sectorHealth Care -0.012919 0.008367 -1.544 0.1226
## sectorIndustrials 0.012691 0.008105 1.566 0.1174
## sectorInformation Technology 0.009529 0.008306 1.147 0.2513
## sectorMaterials -0.017152 0.009747 -1.760 0.0785 .
## sectorReal Estate -0.024065 0.009457 -2.545 0.0110 *
## sectorUnknown -0.028339 0.024886 -1.139 0.2548
## sectorUtilities 0.004942 0.009546 0.518 0.6047
## quarter_factor2022-Q2 0.101326 0.008465 11.970 < 2e-16 ***
## quarter_factor2022-Q3 0.255516 0.008446 30.252 < 2e-16 ***
## quarter_factor2022-Q4 0.198812 0.008820 22.542 < 2e-16 ***
## quarter_factor2023-Q1 0.200621 0.008552 23.460 < 2e-16 ***
## quarter_factor2023-Q2 0.114023 0.008522 13.380 < 2e-16 ***
## quarter_factor2023-Q3 0.267538 0.008565 31.236 < 2e-16 ***
## quarter_factor2023-Q4 0.241421 0.008607 28.050 < 2e-16 ***
## quarter_factor2024-Q1 0.130956 0.008603 15.223 < 2e-16 ***
## quarter_factor2024-Q2 0.243007 0.008517 28.532 < 2e-16 ***
## quarter_factor2024-Q3 0.138840 0.008497 16.340 < 2e-16 ***
## quarter_factor2024-Q4 0.135161 0.008400 16.091 < 2e-16 ***
## quarter_factor2025-Q1 0.207476 0.008359 24.821 < 2e-16 ***
## quarter_factor2025-Q2 0.187586 0.008541 21.962 < 2e-16 ***
## quarter_factor2025-Q3 0.171401 0.008509 20.144 < 2e-16 ***
## mom_q_c:log_vol_q_c 3.297015 0.799556 4.124 3.77e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1304 on 7422 degrees of freedom
## Multiple R-squared: 0.2294, Adjusted R-squared: 0.2261
## F-statistic: 69.04 on 32 and 7422 DF, p-value: < 2.2e-16
# ============================================================
# Section 30: Part 4(i) — Diagnostic Plots (Final Model)
# ============================================================
# Four required diagnostic plots:
# 1. Residuals vs Fitted — checks linearity and homoskedasticity
# 2. Normal QQ-Plot — checks normality of residuals
# 3. Scale-Location — checks constant variance of residuals
# 4. Cook's Distance — identifies influential observations
par(mfrow = c(2, 2), mar = c(4.5, 4.5, 3, 1.5))
plot(final_model, which = 1)
plot(final_model, which = 2)
plot(final_model, which = 3)
plot(final_model, which = 4)

par(mfrow = c(1, 1))
# ============================================================
# Section 31: Part 4(j) — Final Model Coefficient Table
# ============================================================
# Clean coefficient table with significance stars for write-up.
final_coefs <- broom::tidy(final_model, conf.int = TRUE) %>%
dplyr::mutate(
significance = dplyr::case_when(
p.value < 0.001 ~ "***",
p.value < 0.01 ~ "**",
p.value < 0.05 ~ "*",
p.value < 0.10 ~ ".",
TRUE ~ ""
)
)
knitr::kable(
final_coefs %>%
dplyr::select(term, estimate, std.error, statistic, p.value, significance),
digits = 4,
align = "lrrrrc",
col.names = c("Term", "Estimate", "Std. Error", "t-statistic", "p-value", "Sig."),
caption = "Table 3: Final Model (Model 3) — Regression Coefficients"
) %>%
kableExtra::kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center"
)
Table 3: Final Model (Model 3) — Regression Coefficients
|
Term
|
Estimate
|
Std. Error
|
t-statistic
|
p-value
|
Sig.
|
|
(Intercept)
|
-0.1281
|
0.0329
|
-3.8950
|
0.0001
|
***
|
|
mom_q_c
|
-0.0209
|
0.0201
|
-1.0416
|
0.2976
|
|
|
mom_y
|
0.0244
|
0.0060
|
4.0339
|
0.0001
|
***
|
|
log_vol_q_c
|
0.7810
|
0.3917
|
1.9937
|
0.0462
|
|
|
beta_y
|
0.0082
|
0.0055
|
1.4891
|
0.1365
|
|
|
log_size_proxy
|
-0.0021
|
0.0016
|
-1.2712
|
0.2037
|
|
|
mdd_q
|
-0.1063
|
0.0466
|
-2.2808
|
0.0226
|
|
|
sectorConsumer Discretionary
|
0.0100
|
0.0086
|
1.1658
|
0.2437
|
|
|
sectorConsumer Staples
|
-0.0179
|
0.0094
|
-1.9085
|
0.0564
|
.
|
|
sectorEnergy
|
-0.0051
|
0.0101
|
-0.5069
|
0.6122
|
|
|
sectorFinancials
|
0.0073
|
0.0081
|
0.9017
|
0.3673
|
|
|
sectorHealth Care
|
-0.0129
|
0.0084
|
-1.5440
|
0.1226
|
|
|
sectorIndustrials
|
0.0127
|
0.0081
|
1.5659
|
0.1174
|
|
|
sectorInformation Technology
|
0.0095
|
0.0083
|
1.1472
|
0.2513
|
|
|
sectorMaterials
|
-0.0172
|
0.0097
|
-1.7598
|
0.0785
|
.
|
|
sectorReal Estate
|
-0.0241
|
0.0095
|
-2.5447
|
0.0110
|
|
|
sectorUnknown
|
-0.0283
|
0.0249
|
-1.1388
|
0.2548
|
|
|
sectorUtilities
|
0.0049
|
0.0095
|
0.5176
|
0.6047
|
|
|
quarter_factor2022-Q2
|
0.1013
|
0.0085
|
11.9703
|
0.0000
|
***
|
|
quarter_factor2022-Q3
|
0.2555
|
0.0084
|
30.2524
|
0.0000
|
***
|
|
quarter_factor2022-Q4
|
0.1988
|
0.0088
|
22.5416
|
0.0000
|
***
|
|
quarter_factor2023-Q1
|
0.2006
|
0.0086
|
23.4602
|
0.0000
|
***
|
|
quarter_factor2023-Q2
|
0.1140
|
0.0085
|
13.3796
|
0.0000
|
***
|
|
quarter_factor2023-Q3
|
0.2675
|
0.0086
|
31.2361
|
0.0000
|
***
|
|
quarter_factor2023-Q4
|
0.2414
|
0.0086
|
28.0501
|
0.0000
|
***
|
|
quarter_factor2024-Q1
|
0.1310
|
0.0086
|
15.2229
|
0.0000
|
***
|
|
quarter_factor2024-Q2
|
0.2430
|
0.0085
|
28.5319
|
0.0000
|
***
|
|
quarter_factor2024-Q3
|
0.1388
|
0.0085
|
16.3401
|
0.0000
|
***
|
|
quarter_factor2024-Q4
|
0.1352
|
0.0084
|
16.0911
|
0.0000
|
***
|
|
quarter_factor2025-Q1
|
0.2075
|
0.0084
|
24.8205
|
0.0000
|
***
|
|
quarter_factor2025-Q2
|
0.1876
|
0.0085
|
21.9623
|
0.0000
|
***
|
|
quarter_factor2025-Q3
|
0.1714
|
0.0085
|
20.1443
|
0.0000
|
***
|
|
mom_q_c:log_vol_q_c
|
3.2970
|
0.7996
|
4.1236
|
0.0000
|
***
|
# ============================================================
# Section 32: Part 4(k) — Address Identified Model Issues
# ============================================================
# Issue 1: Multicollinearity — RESOLVED
# Dropped log_volm_q_avg (GVIF=16.6) and log_volm_y_sd (GVIF=15.8)
# Centered interaction variables to reduce VIF
# All remaining VIF values < 5
# Issue 2: Heteroskedasticity — Address now
# Diagnostic plots show mild heteroskedasticity (Scale-Location upward trend)
# Fix: Re-estimate with HC1 robust standard errors (White, 1980)
cat("=== Breusch-Pagan Test for Heteroskedasticity ===\n\n")
## === Breusch-Pagan Test for Heteroskedasticity ===
bp_test <- lmtest::bptest(final_model)
print(bp_test)
##
## studentized Breusch-Pagan test
##
## data: final_model
## BP = 1258.4, df = 32, p-value < 2.2e-16
cat("\n=== Final Model with Robust Standard Errors (HC1) ===\n\n")
##
## === Final Model with Robust Standard Errors (HC1) ===
robust_se <- lmtest::coeftest(final_model, vcov = sandwich::vcovHC(final_model, type = "HC1"))
print(robust_se)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.1280696 0.0372706 -3.4362 0.0005931 ***
## mom_q_c -0.0209170 0.0251613 -0.8313 0.4058227
## mom_y 0.0243785 0.0081510 2.9909 0.0027911 **
## log_vol_q_c 0.7809801 0.5608211 1.3926 0.1637929
## beta_y 0.0081752 0.0065411 1.2498 0.2114049
## log_size_proxy -0.0020595 0.0018328 -1.1237 0.2611872
## mdd_q -0.1062677 0.0577419 -1.8404 0.0657509 .
## sectorConsumer Discretionary 0.0099970 0.0100379 0.9959 0.3193217
## sectorConsumer Staples -0.0178916 0.0101222 -1.7676 0.0771767 .
## sectorEnergy -0.0051435 0.0116324 -0.4422 0.6583779
## sectorFinancials 0.0073127 0.0091309 0.8009 0.4232315
## sectorHealth Care -0.0129193 0.0095715 -1.3498 0.1771323
## sectorIndustrials 0.0126912 0.0091222 1.3912 0.1641939
## sectorInformation Technology 0.0095285 0.0099399 0.9586 0.3377855
## sectorMaterials -0.0171522 0.0104783 -1.6369 0.1016894
## sectorReal Estate -0.0240646 0.0096851 -2.4847 0.0129874 *
## sectorUnknown -0.0283394 0.0209628 -1.3519 0.1764510
## sectorUtilities 0.0049415 0.0100442 0.4920 0.6227501
## quarter_factor2022-Q2 0.1013261 0.0077130 13.1371 < 2.2e-16 ***
## quarter_factor2022-Q3 0.2555164 0.0087312 29.2648 < 2.2e-16 ***
## quarter_factor2022-Q4 0.1988120 0.0092324 21.5341 < 2.2e-16 ***
## quarter_factor2023-Q1 0.2006205 0.0082249 24.3919 < 2.2e-16 ***
## quarter_factor2023-Q2 0.1140232 0.0079618 14.3212 < 2.2e-16 ***
## quarter_factor2023-Q3 0.2675383 0.0081204 32.9464 < 2.2e-16 ***
## quarter_factor2023-Q4 0.2414208 0.0084624 28.5288 < 2.2e-16 ***
## quarter_factor2024-Q1 0.1309559 0.0081132 16.1410 < 2.2e-16 ***
## quarter_factor2024-Q2 0.2430073 0.0084976 28.5972 < 2.2e-16 ***
## quarter_factor2024-Q3 0.1388397 0.0086929 15.9716 < 2.2e-16 ***
## quarter_factor2024-Q4 0.1351611 0.0085407 15.8256 < 2.2e-16 ***
## quarter_factor2025-Q1 0.2074761 0.0087719 23.6525 < 2.2e-16 ***
## quarter_factor2025-Q2 0.1875856 0.0088003 21.3158 < 2.2e-16 ***
## quarter_factor2025-Q3 0.1714012 0.0086962 19.7100 < 2.2e-16 ***
## mom_q_c:log_vol_q_c 3.2970146 1.3921063 2.3684 0.0178924 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n--- Summary of Issues Addressed ---\n")
##
## --- Summary of Issues Addressed ---
cat("1. Multicollinearity: Dropped collinear volume variables (GVIF > 15),\n")
## 1. Multicollinearity: Dropped collinear volume variables (GVIF > 15),
cat(" centered interaction terms. All VIF < 5 in final model.\n")
## centered interaction terms. All VIF < 5 in final model.
cat("2. Heteroskedasticity: Breusch-Pagan test confirms presence.\n")
## 2. Heteroskedasticity: Breusch-Pagan test confirms presence.
cat(" Robust (HC1) standard errors applied — coefficient estimates\n")
## Robust (HC1) standard errors applied — coefficient estimates
cat(" remain identical, but inference is now valid under\n")
## remain identical, but inference is now valid under
cat(" heteroskedasticity. Further validated via bootstrap in Part 5.\n")
## heteroskedasticity. Further validated via bootstrap in Part 5.
# ============================================================
# Section 33: Part 5(a) — Linearity & Homoskedasticity
# ============================================================
# Step 1: Residuals vs Fitted (linearity check)
# Step 2: Scale-Location plot (homoskedasticity check)
par(mfrow = c(1, 2), mar = c(4.5, 4.5, 3, 1.5))
plot(final_model, which = 1)
plot(final_model, which = 3)

par(mfrow = c(1, 1))
# Step 3: Breusch-Pagan test for heteroskedasticity
# H0: Constant variance (homoskedasticity)
# H1: Non-constant variance (heteroskedasticity)
cat("=== Breusch-Pagan Test ===\n\n")
## === Breusch-Pagan Test ===
bp_test <- lmtest::bptest(final_model)
print(bp_test)
##
## studentized Breusch-Pagan test
##
## data: final_model
## BP = 1258.4, df = 32, p-value < 2.2e-16
cat("\n--- Interpretation ---\n")
##
## --- Interpretation ---
cat("Residuals vs Fitted: Loess line stays near zero, supporting linearity.\n")
## Residuals vs Fitted: Loess line stays near zero, supporting linearity.
cat("Scale-Location: Mild upward trend indicates heteroskedasticity.\n")
## Scale-Location: Mild upward trend indicates heteroskedasticity.
cat("Breusch-Pagan: BP =", round(bp_test$statistic, 2),
", p < 2e-16 — rejects homoskedasticity.\n")
## Breusch-Pagan: BP = 1258.38 , p < 2e-16 — rejects homoskedasticity.
cat("Resolution: HC1 robust standard errors applied in Part 4.\n")
## Resolution: HC1 robust standard errors applied in Part 4.
cat("Further robustness via bootstrap in Part 5(e).\n")
## Further robustness via bootstrap in Part 5(e).
# ============================================================
# Section 34b: Part 5(a) — Addressing Heteroskedasticity
# ============================================================
# BP test rejected homoskedasticity. Re-estimate with HC1 robust SEs.
cat("=== Standard OLS vs Robust (HC1) Standard Errors ===\n\n")
## === Standard OLS vs Robust (HC1) Standard Errors ===
# Compare side by side for key variables
ols_se <- summary(final_model)$coefficients[, 2]
robust_se <- sqrt(diag(sandwich::vcovHC(final_model, type = "HC1")))
key_vars <- c("(Intercept)", "mom_q_c", "mom_y", "log_vol_q_c",
"beta_y", "log_size_proxy", "mdd_q", "mom_q_c:log_vol_q_c")
key_idx <- which(names(coef(final_model)) %in% key_vars)
se_compare <- data.frame(
Term = names(coef(final_model))[key_idx],
OLS_SE = round(ols_se[key_idx], 5),
Robust_SE = round(robust_se[key_idx], 5),
Ratio = round(robust_se[key_idx] / ols_se[key_idx], 3)
)
print(se_compare, row.names = FALSE)
## Term OLS_SE Robust_SE Ratio
## (Intercept) 0.03288 0.03727 1.134
## mom_q_c 0.02008 0.02516 1.253
## mom_y 0.00604 0.00815 1.349
## log_vol_q_c 0.39173 0.56082 1.432
## beta_y 0.00549 0.00654 1.191
## log_size_proxy 0.00162 0.00183 1.131
## mdd_q 0.04659 0.05774 1.239
## mom_q_c:log_vol_q_c 0.79956 1.39211 1.741
cat("\nRatio > 1 means OLS underestimates uncertainty.\n")
##
## Ratio > 1 means OLS underestimates uncertainty.
cat("Robust SEs are uniformly larger, confirming heteroskedasticity.\n")
## Robust SEs are uniformly larger, confirming heteroskedasticity.
cat("All inference going forward uses robust or bootstrap SEs.\n")
## All inference going forward uses robust or bootstrap SEs.
# ============================================================
# Section 34: Part 5(b) — Normality of Residuals
# ============================================================
# Step 1: QQ-Plot and Histogram of residuals side by side
par(mfrow = c(1, 2), mar = c(4.5, 4.5, 3, 1.5))
plot(final_model, which = 2)
hist(residuals(final_model),
breaks = 60,
col = "steelblue",
border = "white",
main = "Histogram of Residuals",
xlab = "Residuals",
freq = FALSE)
curve(dnorm(x, mean = mean(residuals(final_model)),
sd = sd(residuals(final_model))),
add = TRUE, col = "red", lwd = 2)

par(mfrow = c(1, 1))
# Step 2: Jarque-Bera test for normality
# H0: Residuals are normally distributed
# H1: Residuals are not normal (excess skewness or kurtosis)
cat("=== Jarque-Bera Normality Test ===\n\n")
## === Jarque-Bera Normality Test ===
resid_vec <- residuals(final_model)
n <- length(resid_vec)
s <- mean(resid_vec^3) / (mean(resid_vec^2))^(3/2)
k <- mean(resid_vec^4) / (mean(resid_vec^2))^2
jb_stat <- (n / 6) * (s^2 + (1/4) * (k - 3)^2)
jb_pval <- 1 - pchisq(jb_stat, df = 2)
cat("JB Statistic:", round(jb_stat, 2), "\n")
## JB Statistic: 269.88
cat("p-value:", format.pval(jb_pval, digits = 4), "\n")
## p-value: < 2.2e-16
cat("Skewness:", round(s, 4), "\n")
## Skewness: 0.201
cat("Kurtosis:", round(k, 4), "\n")
## Kurtosis: 3.841
cat("(Normal reference: skewness = 0, kurtosis = 3)\n")
## (Normal reference: skewness = 0, kurtosis = 3)
# ============================================================
# Section 35: Part 5(c) — Outliers and Influence
# ============================================================
# Step 1: Cook's Distance plot with 4/n threshold
cooks_d <- cooks.distance(final_model)
n <- nobs(final_model)
threshold <- 4 / n
par(mfrow = c(1, 2), mar = c(4.5, 4.5, 3, 1.5))
plot(cooks_d,
type = "h",
main = "Cook's Distance",
ylab = "Cook's Distance",
xlab = "Observation Index",
col = ifelse(cooks_d > threshold, "red", "grey70"))
abline(h = threshold, col = "red", lty = 2, lwd = 1.5)
# Step 2: Leverage vs Standardized Residuals
plot(final_model, which = 5)

par(mfrow = c(1, 1))
# Step 3: Summary statistics
cat("=== Outlier and Influence Summary ===\n\n")
## === Outlier and Influence Summary ===
cat("Cook's Distance threshold (4/n):", round(threshold, 6), "\n")
## Cook's Distance threshold (4/n): 0.000537
cat("Observations exceeding threshold:",
sum(cooks_d > threshold, na.rm = TRUE), "out of", n, "\n")
## Observations exceeding threshold: 446 out of 7455
cat("Percentage:",
round(sum(cooks_d > threshold, na.rm = TRUE) / n * 100, 2), "%\n")
## Percentage: 5.98 %
cat("Max Cook's Distance:", round(max(cooks_d), 5), "\n\n")
## Max Cook's Distance: 0.02137
cat("No observation has Cook's D near 1.0 (conventional high-influence cutoff).\n")
## No observation has Cook's D near 1.0 (conventional high-influence cutoff).
cat("Winsorization in Part 3(f) already mitigated extreme values.\n")
## Winsorization in Part 3(f) already mitigated extreme values.
cat("No observations require removal.\n")
## No observations require removal.
# ============================================================
# Section 36: Part 5(d) — Bootstrap Robustness Analysis
# ============================================================
# Bootstrap coefficient estimates with 1000 replications
# to assess stability of OLS estimates without assuming
# normality or homoskedasticity.
set.seed(123)
# Define function to extract coefficients from a bootstrap sample
boot_fn <- function(data, indices) {
d <- data[indices, ]
fit <- lm(
ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y +
log_size_proxy + mdd_q + mom_q_c:log_vol_q_c +
sector + quarter_factor,
data = d
)
return(coef(fit))
}
# Run bootstrap (1000 replications)
boot_results <- boot::boot(
data = model_data,
statistic = boot_fn,
R = 1000
)
# Step 1: Compare OLS vs Bootstrap estimates for key variables
# (skip sector dummies and quarter dummies for clarity)
key_vars <- c("(Intercept)", "mom_q_c", "mom_y", "log_vol_q_c",
"beta_y", "log_size_proxy", "mdd_q", "mom_q_c:log_vol_q_c")
key_idx <- which(names(coef(final_model)) %in% key_vars)
boot_compare <- data.frame(
Term = names(coef(final_model))[key_idx],
OLS_Est = round(coef(final_model)[key_idx], 5),
Boot_Mean = round(colMeans(boot_results$t)[key_idx], 5),
Boot_SE = round(apply(boot_results$t, 2, sd)[key_idx], 5),
OLS_SE = round(summary(final_model)$coefficients[key_idx, 2], 5)
)
cat("=== OLS vs Bootstrap Comparison (Key Variables) ===\n\n")
## === OLS vs Bootstrap Comparison (Key Variables) ===
print(boot_compare, row.names = FALSE)
## Term OLS_Est Boot_Mean Boot_SE OLS_SE
## (Intercept) -0.12807 -0.12920 0.03732 0.03288
## mom_q_c -0.02092 -0.02114 0.02543 0.02008
## mom_y 0.02438 0.02436 0.00852 0.00604
## log_vol_q_c 0.78098 0.75746 0.56939 0.39173
## beta_y 0.00818 0.00842 0.00633 0.00549
## log_size_proxy -0.00206 -0.00202 0.00184 0.00162
## mdd_q -0.10627 -0.10787 0.05714 0.04659
## mom_q_c:log_vol_q_c 3.29701 3.36400 1.39167 0.79956
# Step 2: Histograms of bootstrapped estimates for key predictors
par(mfrow = c(2, 4), mar = c(4.5, 4, 3, 1))
for (i in key_idx) {
hist(boot_results$t[, i],
breaks = 40,
col = "steelblue",
border = "white",
main = names(coef(final_model))[i],
xlab = "Coefficient Estimate",
freq = FALSE)
abline(v = coef(final_model)[i], col = "red", lwd = 2, lty = 2)
}

par(mfrow = c(1, 1))
cat("\nRed dashed lines = OLS point estimates.\n")
##
## Red dashed lines = OLS point estimates.
cat("Bootstrap distributions centered near OLS estimates confirms stability.\n")
## Bootstrap distributions centered near OLS estimates confirms stability.
# ============================================================
# Section 36b: Part 5(d) — Bootstrap vs OLS Comparison
# ============================================================
cat("=== Bootstrap vs OLS: Key Findings ===\n\n")
## === Bootstrap vs OLS: Key Findings ===
cat("1. Bootstrap means match OLS estimates closely — no bias detected.\n")
## 1. Bootstrap means match OLS estimates closely — no bias detected.
cat("2. Bootstrap SEs are systematically larger than OLS SEs:\n")
## 2. Bootstrap SEs are systematically larger than OLS SEs:
cat(" - mom_y: OLS SE = 0.00605, Boot SE = 0.00804 (1.33x larger)\n")
## - mom_y: OLS SE = 0.00605, Boot SE = 0.00804 (1.33x larger)
cat(" - mdd_q: OLS SE = 0.04667, Boot SE = 0.05927 (1.27x larger)\n")
## - mdd_q: OLS SE = 0.04667, Boot SE = 0.05927 (1.27x larger)
cat(" - interaction: OLS SE = 0.80077, Boot SE = 1.43693 (1.79x larger)\n")
## - interaction: OLS SE = 0.80077, Boot SE = 1.43693 (1.79x larger)
cat("3. This confirms heteroskedasticity inflates OLS precision.\n")
## 3. This confirms heteroskedasticity inflates OLS precision.
cat("4. All bootstrap histograms are approximately normal and\n")
## 4. All bootstrap histograms are approximately normal and
cat(" centered on OLS point estimates (red dashed lines),\n")
## centered on OLS point estimates (red dashed lines),
cat(" confirming coefficient stability across resamples.\n")
## confirming coefficient stability across resamples.
cat("5. Conclusion: OLS point estimates are reliable, but inference\n")
## 5. Conclusion: OLS point estimates are reliable, but inference
cat(" should use robust or bootstrap SEs rather than standard OLS SEs.\n")
## should use robust or bootstrap SEs rather than standard OLS SEs.
# ============================================================
# Section 37: Part 5(e) — Cross-Validation (10-Fold)
# ============================================================
# 10-fold CV estimates out-of-sample prediction error
# to assess whether the model generalizes beyond the training data.
set.seed(123)
# Define training control: 10-fold CV
cv_control <- caret::trainControl(method = "cv", number = 10)
# Run 10-fold CV on the final model specification
cv_model <- caret::train(
ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y +
log_size_proxy + mdd_q + mom_q_c:log_vol_q_c +
sector + quarter_factor,
data = model_data,
method = "lm",
trControl = cv_control
)
cat("=== 10-Fold Cross-Validation Results ===\n\n")
## === 10-Fold Cross-Validation Results ===
print(cv_model)
## Linear Regression
##
## 7455 samples
## 8 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 6708, 6708, 6709, 6710, 6711, 6710, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.1306866 0.2233964 0.1005273
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
cat("\n=== Fold-by-Fold Performance ===\n\n")
##
## === Fold-by-Fold Performance ===
print(cv_model$resample)
## RMSE Rsquared MAE Resample
## 1 0.1326590 0.1806020 0.10163733 Fold01
## 2 0.1294448 0.2545457 0.09798465 Fold02
## 3 0.1235089 0.2751769 0.09611093 Fold03
## 4 0.1357318 0.1879249 0.10239334 Fold04
## 5 0.1263323 0.2539044 0.09742953 Fold05
## 6 0.1281320 0.2230725 0.09928673 Fold06
## 7 0.1346747 0.2039327 0.10396967 Fold07
## 8 0.1275483 0.2722900 0.09837658 Fold08
## 9 0.1311545 0.2051015 0.10128223 Fold09
## 10 0.1376801 0.1774133 0.10680225 Fold10
cat("\n--- Interpretation ---\n")
##
## --- Interpretation ---
cat("RMSE:", round(cv_model$results$RMSE, 5), "\n")
## RMSE: 0.13069
cat("R²:", round(cv_model$results$Rsquared, 5), "\n")
## R²: 0.2234
cat("MAE:", round(cv_model$results$MAE, 5), "\n\n")
## MAE: 0.10053
cat("CV R² close to in-sample Adj R² (0.2262) indicates\n")
## CV R² close to in-sample Adj R² (0.2262) indicates
cat("the model is not overfitting and generalizes well.\n")
## the model is not overfitting and generalizes well.
# ============================================================
# Section 38: Part 6(a) — Professional Regression Table
# ============================================================
# Present final model with OLS, Robust, and Bootstrap SEs side by side
ols_coefs <- summary(final_model)$coefficients
robust_vcov <- sandwich::vcovHC(final_model, type = "HC1")
robust_test <- lmtest::coeftest(final_model, vcov = robust_vcov)
# Bootstrap SEs from Part 5
boot_se <- apply(boot_results$t, 2, sd)
# Build comparison table for key variables only
key_vars <- c("(Intercept)", "mom_q_c", "mom_y", "log_vol_q_c",
"beta_y", "log_size_proxy", "mdd_q", "mom_q_c:log_vol_q_c")
key_idx <- which(names(coef(final_model)) %in% key_vars)
results_tbl <- data.frame(
Term = names(coef(final_model))[key_idx],
Estimate = coef(final_model)[key_idx],
OLS_SE = ols_coefs[key_idx, 2],
Robust_SE = sqrt(diag(robust_vcov))[key_idx],
Boot_SE = boot_se[key_idx],
Robust_p = robust_test[key_idx, 4]
)
# Add significance stars based on robust p-values
results_tbl$Sig <- ifelse(results_tbl$Robust_p < 0.001, "***",
ifelse(results_tbl$Robust_p < 0.01, "**",
ifelse(results_tbl$Robust_p < 0.05, "*",
ifelse(results_tbl$Robust_p < 0.10, ".", ""))))
knitr::kable(
results_tbl,
digits = c(0, 5, 5, 5, 5, 4, 0),
align = "lrrrrrr",
col.names = c("Term", "Estimate", "OLS SE", "Robust SE", "Boot SE", "Robust p", "Sig."),
caption = "Table 4: Final Model — OLS, Robust, and Bootstrap Standard Errors"
) %>%
kableExtra::kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center"
)
Table 4: Final Model — OLS, Robust, and Bootstrap Standard Errors
|
|
Term
|
Estimate
|
OLS SE
|
Robust SE
|
Boot SE
|
Robust p
|
Sig.
|
|
(Intercept)
|
(Intercept)
|
-0.12807
|
0.03288
|
0.03727
|
0.03732
|
0.0006
|
***
|
|
mom_q_c
|
mom_q_c
|
-0.02092
|
0.02008
|
0.02516
|
0.02543
|
0.4058
|
|
|
mom_y
|
mom_y
|
0.02438
|
0.00604
|
0.00815
|
0.00852
|
0.0028
|
**
|
|
log_vol_q_c
|
log_vol_q_c
|
0.78098
|
0.39173
|
0.56082
|
0.56939
|
0.1638
|
|
|
beta_y
|
beta_y
|
0.00818
|
0.00549
|
0.00654
|
0.00633
|
0.2114
|
|
|
log_size_proxy
|
log_size_proxy
|
-0.00206
|
0.00162
|
0.00183
|
0.00184
|
0.2612
|
|
|
mdd_q
|
mdd_q
|
-0.10627
|
0.04659
|
0.05774
|
0.05714
|
0.0658
|
.
|
|
mom_q_c:log_vol_q_c
|
mom_q_c:log_vol_q_c
|
3.29701
|
0.79956
|
1.39211
|
1.39167
|
0.0179
|
|
cat("\nModel: ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y +\n")
##
## Model: ret_next_q ~ mom_q_c + mom_y + log_vol_q_c + beta_y +
cat(" log_size_proxy + mdd_q + mom_q_c:log_vol_q_c + sector + quarter_factor\n")
## log_size_proxy + mdd_q + mom_q_c:log_vol_q_c + sector + quarter_factor
cat("N = 7,425 | Adj. R² = 0.2262 | F = 71.02 (p < 2e-16)\n")
## N = 7,425 | Adj. R² = 0.2262 | F = 71.02 (p < 2e-16)
cat("CV R² = 0.2224 | Bootstrap replications = 1,000\n")
## CV R² = 0.2224 | Bootstrap replications = 1,000
# ============================================================
# Section 39: Part 6(e) — Confidence Intervals (Robust)
# ============================================================
# 95% confidence intervals using robust (HC1) standard errors
# rather than OLS SEs, consistent with our heteroskedasticity findings.
robust_vcov <- sandwich::vcovHC(final_model, type = "HC1")
robust_test <- lmtest::coeftest(final_model, vcov = robust_vcov)
key_vars <- c("mom_q_c", "mom_y", "log_vol_q_c", "beta_y",
"log_size_proxy", "mdd_q", "mom_q_c:log_vol_q_c")
key_idx <- which(names(coef(final_model)) %in% key_vars)
robust_se <- sqrt(diag(robust_vcov))
ci_tbl <- data.frame(
Term = names(coef(final_model))[key_idx],
Estimate = round(coef(final_model)[key_idx], 5),
Lower_95 = round(coef(final_model)[key_idx] - 1.96 * robust_se[key_idx], 5),
Upper_95 = round(coef(final_model)[key_idx] + 1.96 * robust_se[key_idx], 5),
Robust_p = round(robust_test[key_idx, 4], 4)
)
cat("=== 95% Confidence Intervals (Robust HC1) ===\n\n")
## === 95% Confidence Intervals (Robust HC1) ===
knitr::kable(
ci_tbl,
align = "lrrrr",
col.names = c("Term", "Estimate", "Lower 95%", "Upper 95%", "Robust p"),
caption = "Table 5: 95% Confidence Intervals Using Robust Standard Errors"
) %>%
kableExtra::kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center"
)
Table 5: 95% Confidence Intervals Using Robust Standard Errors
|
|
Term
|
Estimate
|
Lower 95%
|
Upper 95%
|
Robust p
|
|
mom_q_c
|
mom_q_c
|
-0.02092
|
-0.07023
|
0.02840
|
0.4058
|
|
mom_y
|
mom_y
|
0.02438
|
0.00840
|
0.04035
|
0.0028
|
|
log_vol_q_c
|
log_vol_q_c
|
0.78098
|
-0.31823
|
1.88019
|
0.1638
|
|
beta_y
|
beta_y
|
0.00818
|
-0.00465
|
0.02100
|
0.2114
|
|
log_size_proxy
|
log_size_proxy
|
-0.00206
|
-0.00565
|
0.00153
|
0.2612
|
|
mdd_q
|
mdd_q
|
-0.10627
|
-0.21944
|
0.00691
|
0.0658
|
|
mom_q_c:log_vol_q_c
|
mom_q_c:log_vol_q_c
|
3.29701
|
0.56849
|
6.02554
|
0.0179
|