library(tidyverse) # ggplot2 + dplyr + tidyr + readr
library(scales) # percent / comma formatting in plots
library(patchwork) # composing multi-panel figures
library(kableExtra) # enhanced HTML tables# ── Global ggplot2 dark theme ──────────────────────────────────────────────
clr_bg <- "#0f1117"
clr_surface <- "#181c25"
clr_card <- "#1e2333"
clr_border <- "#2a3145"
clr_accent <- "#4f8ef7"
clr_accent2 <- "#6ee7b7"
clr_warn <- "#fbbf24"
clr_danger <- "#f87171"
clr_text <- "#e2e8f0"
clr_muted <- "#94a3b8"
theme_exam <- function(base_size = 12) {
theme_minimal(base_size = base_size) %+replace%
theme(
plot.background = element_rect(fill = clr_bg, color = NA),
panel.background = element_rect(fill = clr_surface, color = NA),
panel.grid.major = element_line(color = clr_border, linewidth = .35),
panel.grid.minor = element_line(color = clr_border, linewidth = .15, linetype = "dotted"),
panel.border = element_rect(color = clr_border, fill = NA, linewidth = .5),
axis.text = element_text(color = clr_muted, family = "mono", size = 10),
axis.title = element_text(color = clr_text, family = "mono", size = 11, face = "bold"),
axis.ticks = element_line(color = clr_border),
plot.title = element_text(color = clr_text, family = "sans", size = 14, face = "bold",
margin = margin(b = 4)),
plot.subtitle = element_text(color = clr_muted, family = "mono", size = 10,
margin = margin(b = 12)),
plot.caption = element_text(color = clr_muted, family = "mono", size = 9, hjust = 1),
strip.background = element_rect(fill = clr_card, color = clr_border),
strip.text = element_text(color = clr_accent2, family = "mono", size = 10, face = "bold"),
legend.background = element_rect(fill = clr_surface, color = clr_border),
legend.key = element_rect(fill = clr_surface, color = NA),
legend.text = element_text(color = clr_muted, family = "mono", size = 10),
legend.title = element_text(color = clr_text, family = "mono", size = 10, face = "bold"),
plot.margin = margin(16, 16, 12, 16)
)
}
theme_set(theme_exam())
# Accent palette vector (reusable)
pal <- c(clr_accent, clr_accent2, clr_warn, clr_danger, clr_muted)The regression model estimated over n = 96 monthly observations is:
\[R_i - R_f = \alpha + \beta\,(R_m - R_f) + \varepsilon, \qquad \varepsilon \sim \mathcal{N}(0,\,\sigma^2_\varepsilon)\]
alpha_hat <- 0.0017; se_alpha <- 0.0020
beta_hat <- 0.98; se_beta <- 0.17
R2 <- 0.50
n <- 96
E_mkt_prem <- 0.0070 # E[Rm - Rf] monthly
t_crit <- 1.98 # two-tailed 5% critical value
df <- n - 2 # degrees of freedomFormula: \[t_{\hat\beta} = \frac{\hat\beta - \beta_0}{SE(\hat\beta)} = \frac{0.98 - 0}{0.17}\]
t_beta_0 <- (beta_hat - 0) / se_beta
p_beta_0 <- 2 * pt(-abs(t_beta_0), df = df) # two-tailed p-value
ci_lo_beta <- beta_hat - t_crit * se_beta
ci_hi_beta <- beta_hat + t_crit * se_beta
cat(sprintf("t-statistic : %.4f\n", t_beta_0))#> t-statistic : 5.7647
#> p-value (approx) : 0.000000
#> 95% CI for beta : [0.6434, 1.3166]
#> Critical |t| @ 5% : 1.98
cat(sprintf("Decision : %s H0: beta = 0\n",
ifelse(abs(t_beta_0) > t_crit, "REJECT", "FAIL TO REJECT")))#> Decision : REJECT H0: beta = 0
# t-distribution with rejection regions
t_seq <- seq(-7, 7, length.out = 1000)
t_dens <- dt(t_seq, df = df)
df_tdist <- tibble(t = t_seq, density = t_dens)
ggplot(df_tdist, aes(t, density)) +
# rejection regions
geom_area(data = filter(df_tdist, t <= -t_crit),
fill = clr_danger, alpha = .35) +
geom_area(data = filter(df_tdist, t >= t_crit),
fill = clr_danger, alpha = .35) +
# acceptance region
geom_area(data = filter(df_tdist, t >= -t_crit & t <= t_crit),
fill = clr_accent, alpha = .12) +
# curve
geom_line(color = clr_accent, linewidth = .9) +
# critical value lines
geom_vline(xintercept = c(-t_crit, t_crit),
color = clr_danger, linetype = "dashed", linewidth = .7) +
# observed t-statistic
geom_vline(xintercept = t_beta_0,
color = clr_accent2, linewidth = 1.1) +
annotate("label",
x = t_beta_0, y = max(t_dens) * .7,
label = sprintf("t = %.4f", t_beta_0),
color = clr_accent2, fill = clr_surface, size = 3.5,
label.r = unit(.25, "lines"), label.size = .4,
family = "mono") +
annotate("label",
x = t_crit + .2, y = max(t_dens) * .55,
label = sprintf("±%.2f", t_crit),
color = clr_danger, fill = clr_surface, size = 3.2,
label.r = unit(.2, "lines"), label.size = .3,
family = "mono") +
scale_x_continuous(breaks = seq(-6, 6, 2)) +
labs(title = "Part (a) — t-test for H₀: β = 0",
subtitle = sprintf("t(%d) distribution · observed t = %.4f · p ≈ %.4f → REJECT H₀",
df, t_beta_0, p_beta_0),
x = "t-statistic", y = "Density",
caption = "Shaded red = rejection region (α = 0.05, two-tailed)") +
theme(plot.subtitle = element_text(size = 9.5))t = 5.7647 far exceeds the critical value ±1.98. The
market beta is highly statistically significant (p ≈ 0).
Economic interpretation: β = 0.98 implies the fund
moves almost one-for-one with the market. A 1 percentage-point rise in
the market excess return is associated with a 0.98 pp
rise in the fund’s excess return. β < 1 indicates
marginally lower systematic risk than the market portfolio —
the fund is essentially a near-passive vehicle with slightly reduced
market sensitivity.
Formula: \[t_{\hat\beta = 1} = \frac{\hat\beta - 1}{SE(\hat\beta)} = \frac{0.98 - 1}{0.17}\]
t_beta_1 <- (beta_hat - 1) / se_beta
p_beta_1 <- 2 * pt(-abs(t_beta_1), df = df)
cat(sprintf("t-statistic : %.4f\n", t_beta_1))#> t-statistic : -0.1176
#> p-value (approx) : 0.906598
#> 95% CI for beta : [0.6434, 1.3166]
#> Beta = 1 in CI? : YES
cat(sprintf("Decision : %s H0: beta = 1\n",
ifelse(abs(t_beta_1) > t_crit, "REJECT", "FAIL TO REJECT")))#> Decision : FAIL TO REJECT H0: beta = 1
# Confidence interval plot for beta
df_beta <- tibble(
label = c("H₀: β = 1", "Point estimate β̂", "95% CI lower", "95% CI upper"),
value = c(1, beta_hat, ci_lo_beta, ci_hi_beta),
is_param = c(TRUE, FALSE, FALSE, FALSE)
)
ggplot() +
# CI ribbon
annotate("rect",
xmin = ci_lo_beta, xmax = ci_hi_beta,
ymin = -Inf, ymax = Inf,
fill = clr_accent, alpha = .12) +
# CI boundary lines
geom_vline(xintercept = c(ci_lo_beta, ci_hi_beta),
color = clr_accent, linetype = "dashed", linewidth = .7) +
# beta = 1 null
geom_vline(xintercept = 1,
color = clr_warn, linewidth = 1.1, linetype = "solid") +
# point estimate
geom_vline(xintercept = beta_hat,
color = clr_accent2, linewidth = 1.3) +
annotate("label", x = beta_hat, y = 1,
label = sprintf("β̂ = %.2f", beta_hat),
color = clr_accent2, fill = clr_surface, size = 3.8,
label.r = unit(.25, "lines"), label.size = .35, family = "mono") +
annotate("label", x = 1, y = .5,
label = "H₀: β = 1",
color = clr_warn, fill = clr_surface, size = 3.5,
label.r = unit(.2, "lines"), label.size = .3, family = "mono") +
annotate("label", x = ci_lo_beta, y = .25,
label = sprintf("%.4f", ci_lo_beta),
color = clr_accent, fill = clr_surface, size = 3,
label.r = unit(.2, "lines"), label.size = .3, family = "mono") +
annotate("label", x = ci_hi_beta, y = .25,
label = sprintf("%.4f", ci_hi_beta),
color = clr_accent, fill = clr_surface, size = 3,
label.r = unit(.2, "lines"), label.size = .3, family = "mono") +
scale_x_continuous(limits = c(0.4, 1.6), breaks = seq(.4, 1.6, .2)) +
scale_y_continuous(limits = c(0, 1.5)) +
labs(title = "Part (b) — 95% Confidence Interval for β vs. H₀: β = 1",
subtitle = sprintf("CI: [%.4f, %.4f] · H₀ value β = 1 lies INSIDE the CI → FAIL TO REJECT",
ci_lo_beta, ci_hi_beta),
x = "β", y = NULL) +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(),
panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank())t = -0.1176, p ≈ 0.9066 — the null value β = 1 lies comfortably inside the 95% CI [0.6434, 1.3166]. The fund’s systematic risk is statistically indistinguishable from the market. Despite the point estimate of 0.98, the data provide no evidence that the fund carries materially less market exposure than a passive index.
Formula: \[t_{\hat\alpha} = \frac{\hat\alpha - 0}{SE(\hat\alpha)} = \frac{0.0017}{0.0020}\]
t_alpha <- (alpha_hat - 0) / se_alpha
p_alpha <- 2 * pt(-abs(t_alpha), df = df)
ci_lo_a <- alpha_hat - t_crit * se_alpha
ci_hi_a <- alpha_hat + t_crit * se_alpha
alpha_ann <- (1 + alpha_hat)^12 - 1
cat(sprintf("Jensen's alpha (α̂) : %.4f (%.4f%% / month)\n", alpha_hat, alpha_hat*100))#> Jensen's alpha (α̂) : 0.0017 (0.1700% / month)
#> Annualized alpha : 0.0206 (2.0592% / year)
#> t-statistic : 0.8500
#> p-value (approx) : 0.397485
#> 95% CI for alpha : [-0.0023, 0.0057]
cat(sprintf("Decision : %s H0: alpha = 0\n",
ifelse(abs(t_alpha) > t_crit, "REJECT", "FAIL TO REJECT")))#> Decision : FAIL TO REJECT H0: alpha = 0
t = 0.85 < 1.98. We fail to reject H₀: α = 0 (p ≈ 0.3975). The 95% CI [-0.0023, 0.0057] straddles zero. Although the point estimate α̂ = 0.17%/month (≈ 2.06%/yr annualised) is positive, it is not statistically distinguishable from zero given the standard error. The marketing team cannot legitimately claim positive risk-adjusted performance on the basis of these data.
#> R² = 0.5000
#> Systematic (market) variance fraction : 0.5000 (50.0%)
cat(sprintf("Idiosyncratic (diversifiable) fraction : %.4f (%.1f%%)\n",
idiosyncratic, idiosyncratic*100))#> Idiosyncratic (diversifiable) fraction : 0.5000 (50.0%)
df_r2 <- tibble(
component = c("Systematic\n(Market Risk)", "Idiosyncratic\n(Diversifiable)"),
fraction = c(systematic, idiosyncratic),
fill_col = c(clr_accent, clr_muted)
)
ggplot(df_r2, aes(x = "", y = fraction, fill = component)) +
geom_col(width = .55, color = clr_bg, linewidth = 1.5) +
geom_text(aes(label = sprintf("%.0f%%\n%s", fraction*100, component)),
position = position_stack(vjust = .5),
color = clr_bg, size = 4.5, fontface = "bold",
lineheight = 1.2) +
coord_flip() +
scale_fill_manual(values = c("Systematic\n(Market Risk)" = clr_accent,
"Idiosyncratic\n(Diversifiable)" = clr_muted)) +
labs(title = "Part (d) — Variance Decomposition via R²",
subtitle = "R² = 0.50 · equal split between systematic and idiosyncratic risk",
x = NULL, y = "Fraction of Total Return Variance") +
theme(legend.position = "none",
axis.text.y = element_blank(),
axis.ticks.y = element_blank())Interpretation: \(R^2 = 0.50\) means exactly 50% of the fund’s return variance is driven by market-wide (systematic) risk, while the remaining 50% is idiosyncratic — stock-specific or strategy-specific variation that a diversified investor could, in principle, eliminate. For an equity fund, an R² of 0.50 is relatively low, suggesting active bets far from the index.
Formula: \[E[R_i - R_f] = \hat\beta \times E[R_m - R_f] = 0.98 \times 0.0070\]
capm_expected <- beta_hat * E_mkt_prem
capm_expected_ann <- (1 + capm_expected)^12 - 1
alpha_contribution <- alpha_hat # monthly
total_expected <- capm_expected + alpha_contribution
cat(sprintf("β̂ : %.4f\n", beta_hat))#> β̂ : 0.9800
#> E[Rm − Rf] monthly : 0.0070 (0.7000%)
#> CAPM-implied E[Ri − Rf] monthly : 0.0069 (0.6860%)
cat(sprintf("CAPM-implied E[Ri − Rf] annualised : %.4f (%.4f%%)\n",
capm_expected_ann, capm_expected_ann*100))#> CAPM-implied E[Ri − Rf] annualised : 0.0855 (8.5498%)
#>
#> --- Return Attribution (point estimates, not stat. significant) ---
#> CAPM component (β × MKT premium) : 0.6860%
#> Alpha component (α̂) : 0.1700%
#> Total monthly expected return : 0.8560%
df_attr <- tibble(
component = factor(c("β × E[MKT]", "α̂ (not significant)"),
levels = c("β × E[MKT]", "α̂ (not significant)")),
value = c(capm_expected, alpha_contribution) * 100,
color = c(clr_accent, clr_warn)
)
ggplot(df_attr, aes(x = component, y = value, fill = component)) +
geom_col(width = .45, color = NA) +
geom_text(aes(label = sprintf("%.4f%%", value)),
vjust = -.5, color = clr_text, size = 4, family = "mono") +
geom_hline(yintercept = total_expected*100,
color = clr_accent2, linetype = "dashed", linewidth = .8) +
annotate("label", x = 2.5, y = total_expected*100,
label = sprintf("Total = %.4f%%", total_expected*100),
color = clr_accent2, fill = clr_surface, size = 3.5,
label.r = unit(.2, "lines"), label.size = .3, family = "mono") +
scale_fill_manual(values = c("β × E[MKT]" = clr_accent,
"α̂ (not significant)" = clr_warn)) +
scale_y_continuous(labels = function(x) paste0(x, "%"),
expand = expansion(mult = c(0, .2))) +
labs(title = "Part (e) — Monthly Excess Return Attribution",
subtitle = "CAPM-implied return = β × E[MKT premium] · α component is statistically zero",
x = NULL, y = "Monthly Excess Return (%)") +
theme(legend.position = "none")\[\boxed{E[R_i - R_f]_{\text{CAPM}} = 0.98 \times 0.70\% = \mathbf{0.6860\%\text{ per month}}}\]
The model estimated over n = 144 monthly observations:
\[R_i - R_f = \alpha + b\cdot MKT + s\cdot SMB + h\cdot HML + \varepsilon\]
# Point estimates
alpha_ff <- 0.0029; se_alpha_ff <- 0.0018
b_mkt <- 0.97; se_b <- 0.08
s_smb <- 0.75; se_s <- 0.11
h_hml <- -0.13; se_h <- 0.13
R2_ff <- 0.92
R2_adj_ff <- 0.918
n_ff <- 144
k_ff <- 3 # number of predictors
df_ff <- n_ff - k_ff - 1
t_crit_ff <- 1.98
R2_capm <- 0.75Formula: \(t_{\hat\theta} = \hat\theta \,/\, SE(\hat\theta)\); two-tailed p-value from \(t(n-k-1)\) distribution.
# t-statistics
t_alpha_ff <- alpha_ff / se_alpha_ff
t_b_ff <- b_mkt / se_b
t_s_ff <- s_smb / se_s
t_h_ff <- h_hml / se_h
# p-values
p_alpha_ff <- 2 * pt(-abs(t_alpha_ff), df = df_ff)
p_b_ff <- 2 * pt(-abs(t_b_ff), df = df_ff)
p_s_ff <- 2 * pt(-abs(t_s_ff), df = df_ff)
p_h_ff <- 2 * pt(-abs(t_h_ff), df = df_ff)
# 95% CIs
ci <- function(est, se) c(est - t_crit_ff*se, est + t_crit_ff*se)
ci_a <- ci(alpha_ff, se_alpha_ff)
ci_b <- ci(b_mkt, se_b)
ci_s <- ci(s_smb, se_s)
ci_h <- ci(h_hml, se_h)
sig <- function(t) ifelse(abs(t) > t_crit_ff, "Significant ✓", "Not Significant ✗")
results_ff <- tibble(
Coefficient = c("α (Intercept)", "b (MKT)", "s (SMB)", "h (HML)"),
Estimate = c(alpha_ff, b_mkt, s_smb, h_hml),
Std_Error = c(se_alpha_ff, se_b, se_s, se_h),
t_stat = round(c(t_alpha_ff, t_b_ff, t_s_ff, t_h_ff), 4),
p_value = round(c(p_alpha_ff, p_b_ff, p_s_ff, p_h_ff), 6),
CI_lower = round(c(ci_a[1], ci_b[1], ci_s[1], ci_h[1]), 4),
CI_upper = round(c(ci_a[2], ci_b[2], ci_s[2], ci_h[2]), 4),
Decision = sig(c(t_alpha_ff, t_b_ff, t_s_ff, t_h_ff))
)
print(results_ff)#> # A tibble: 4 × 8
#> Coefficient Estimate Std_Error t_stat p_value CI_lower CI_upper Decision
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 α (Intercept) 0.0029 0.0018 1.61 0.109 -0.0007 0.0065 Not Signifi…
#> 2 b (MKT) 0.97 0.08 12.1 0 0.812 1.13 Significant…
#> 3 s (SMB) 0.75 0.11 6.82 0 0.532 0.968 Significant…
#> 4 h (HML) -0.13 0.13 -1 0.319 -0.387 0.127 Not Signifi…
df_coef <- tibble(
coef = factor(c("α (Intercept)", "b (MKT)", "s (SMB)", "h (HML)"),
levels = c("α (Intercept)", "b (MKT)", "s (SMB)", "h (HML)")),
est = c(alpha_ff, b_mkt, s_smb, h_hml),
lo = c(ci_a[1], ci_b[1], ci_s[1], ci_h[1]),
hi = c(ci_a[2], ci_b[2], ci_s[2], ci_h[2]),
t_stat = c(t_alpha_ff, t_b_ff, t_s_ff, t_h_ff),
sig = abs(c(t_alpha_ff, t_b_ff, t_s_ff, t_h_ff)) > t_crit_ff
)
p_coef <- ggplot(df_coef, aes(x = est, y = coef, color = sig)) +
geom_vline(xintercept = 0, color = clr_muted, linetype = "dashed", linewidth = .6) +
geom_errorbarh(aes(xmin = lo, xmax = hi), height = .25, linewidth = 1) +
geom_point(size = 4) +
geom_text(aes(label = sprintf("t = %.3f", t_stat)),
hjust = -.25, size = 3.3, family = "mono",
color = clr_text) +
scale_color_manual(values = c("TRUE" = clr_accent2, "FALSE" = clr_warn),
labels = c("TRUE" = "Significant (|t| > 1.98)",
"FALSE" = "Not significant"),
name = NULL) +
scale_x_continuous(expand = expansion(mult = c(.05, .25))) +
labs(title = "Part (f) — Coefficient Estimates with 95% Confidence Intervals",
subtitle = "Fama–French Three-Factor Model · n = 144 months · df = 140",
x = "Estimate", y = NULL) +
theme(legend.position = "bottom")
p_tstat <- ggplot(df_coef, aes(x = coef, y = abs(t_stat), fill = sig)) +
geom_col(width = .5, color = NA) +
geom_hline(yintercept = t_crit_ff,
color = clr_danger, linetype = "dashed", linewidth = .8) +
geom_text(aes(label = sprintf("|t| = %.3f", abs(t_stat))),
vjust = -.4, size = 3.3, family = "mono", color = clr_text) +
annotate("label", x = .6, y = t_crit_ff,
label = sprintf("critical |t| = %.2f", t_crit_ff),
color = clr_danger, fill = clr_surface, size = 3,
label.r = unit(.2, "lines"), label.size = .3, family = "mono") +
scale_fill_manual(values = c("TRUE" = clr_accent2, "FALSE" = clr_warn)) +
scale_y_continuous(expand = expansion(mult = c(0, .2))) +
labs(title = NULL, x = NULL, y = "|t-statistic|") +
theme(legend.position = "none")
p_coef / p_tstat + plot_layout(heights = c(2, 1))cat("SMB loading (s) :", s_smb, " | t =", round(t_s_ff,4), "| Significant:", abs(t_s_ff)>t_crit_ff, "\n")#> SMB loading (s) : 0.75 | t = 6.8182 | Significant: TRUE
cat("HML loading (h) :", h_hml, " | t =", round(t_h_ff,4), "| Significant:", abs(t_h_ff)>t_crit_ff, "\n\n")#> HML loading (h) : -0.13 | t = -1 | Significant: FALSE
#> SIZE TILT: s = 0.75 > 0 and significant => SMALL-CAP tilt
#> VALUE TILT: h = -0.13 < 0 but NOT significant => weak GROWTH lean
#> OVERALL STYLE: Small-Cap (with mild, uncertain Growth tilt)
# Style box visualization
style_df <- tibble(
x = h_hml, # HML: negative = growth, positive = value
y = s_smb, # SMB: positive = small, negative = large
label = "This Fund\n(s=0.75, h=−0.13)"
)
ggplot() +
# Quadrant backgrounds
annotate("rect", xmin=-Inf, xmax=0, ymin=0, ymax=Inf, fill=clr_accent, alpha=.08) +
annotate("rect", xmin=0, xmax=Inf, ymin=0, ymax=Inf, fill=clr_accent2, alpha=.08) +
annotate("rect", xmin=-Inf, xmax=0, ymin=-Inf, ymax=0, fill=clr_muted, alpha=.08) +
annotate("rect", xmin=0, xmax=Inf, ymin=-Inf, ymax=0, fill=clr_warn, alpha=.08) +
# Quadrant labels
annotate("text", x=-.5, y=1, label="Small-Cap\nGrowth", color=clr_accent, size=3.8, fontface="bold", family="mono") +
annotate("text", x= .5, y=1, label="Small-Cap\nValue", color=clr_accent2, size=3.8, fontface="bold", family="mono") +
annotate("text", x=-.5, y=-0.7, label="Large-Cap\nGrowth", color=clr_muted, size=3.8, fontface="bold", family="mono") +
annotate("text", x= .5, y=-0.7, label="Large-Cap\nValue", color=clr_warn, size=3.8, fontface="bold", family="mono") +
# Axes
geom_hline(yintercept = 0, color = clr_border, linewidth = .6) +
geom_vline(xintercept = 0, color = clr_border, linewidth = .6) +
# Fund position
geom_point(data = style_df, aes(x, y),
color = clr_danger, size = 6, shape = 18) +
geom_label(data = style_df, aes(x, y, label = label),
color = clr_danger, fill = clr_surface, size = 3.2,
nudge_x = .18, nudge_y = .08,
label.r = unit(.3, "lines"), label.size = .4, family = "mono") +
# CI ellipse
geom_point(data = style_df, aes(x, y),
color = clr_danger, size = 20, shape = 1, stroke = .8, alpha = .4) +
scale_x_continuous(limits = c(-1, 1), breaks = seq(-1,1,.5),
name = "HML Loading (h) ←Growth · Value→") +
scale_y_continuous(limits = c(-1.2, 1.5), breaks = seq(-1,1.5,.5),
name = "SMB Loading (s) ←Large · Small→") +
labs(title = "Part (g) — Style Box: Size × Value/Growth",
subtitle = "Fund plots in Small-Cap Growth quadrant · SMB significant, HML not significant")alpha_ann_ff <- (1 + alpha_ff)^12 - 1
cat(sprintf("α̂ (monthly) : %.4f (%.4f%%)\n", alpha_ff, alpha_ff*100))#> α̂ (monthly) : 0.0029 (0.2900%)
#> α̂ (annualised) : 0.0354 (3.5360%)
#> t-statistic : 1.6111
#> p-value : 0.109408
#> 95% CI : [-0.0007, 0.0065]
#> Significant? : NO
α̂ = 0.29%/month (≈ 3.54%/year annualised). With t = 1.6111 > 1.98 and p ≈ 0.1094, we reject H₀: α = 0. After controlling for the market, size, and value premia, the fund still generates positive abnormal returns — consistent with genuine portfolio manager skill rather than disguised factor exposure.
delta_R2 <- R2_ff - R2_capm
# Compute adjusted R² for both models (n=144)
k_capm <- 1
R2_adj_capm <- 1 - (1 - R2_capm) * (n_ff - 1) / (n_ff - k_capm - 1)
R2_adj_ff2 <- 1 - (1 - R2_ff) * (n_ff - 1) / (n_ff - k_ff - 1)
cat(sprintf("CAPM : R² = %.4f | Adj-R² = %.4f | Unexplained = %.4f%%\n",
R2_capm, R2_adj_capm, (1-R2_capm)*100))#> CAPM : R² = 0.7500 | Adj-R² = 0.7482 | Unexplained = 25.0000%
cat(sprintf("FF-3F : R² = %.4f | Adj-R² = %.4f | Unexplained = %.4f%%\n",
R2_ff, R2_adj_ff2, (1-R2_ff)*100))#> FF-3F : R² = 0.9200 | Adj-R² = 0.9183 | Unexplained = 8.0000%
#> ΔR² = 0.1700 | ΔAdj-R² = 0.1700
df_r2_compare <- tibble(
Model = rep(c("CAPM\n(1 factor)", "Fama–French\n(3 factors)"), 2),
Metric = c("R²", "R²", "Adjusted R²", "Adjusted R²"),
Value = c(R2_capm, R2_ff, R2_adj_capm, R2_adj_ff2),
Residual = c(1-R2_capm, 1-R2_ff, 1-R2_adj_capm, 1-R2_adj_ff2)
)
df_long <- df_r2_compare %>%
pivot_longer(c(Value, Residual), names_to = "Part", values_to = "Frac") %>%
mutate(
Part = factor(Part, levels = c("Residual","Value")),
Model = factor(Model, levels = c("CAPM\n(1 factor)","Fama–French\n(3 factors)"))
)
ggplot(df_long %>% filter(Metric == "R²"),
aes(x = Model, y = Frac, fill = Part)) +
geom_col(width = .45, color = clr_bg, linewidth = 1) +
geom_text(
data = df_long %>% filter(Metric == "R²", Part == "Value"),
aes(label = sprintf("%.4f\n(%.1f%%)", Frac, Frac*100)),
position = position_stack(vjust = .5), color = clr_bg,
size = 4.5, fontface = "bold"
) +
geom_text(
data = df_long %>% filter(Metric == "R²", Part == "Residual"),
aes(label = sprintf("Unexplained\n%.1f%%", Frac*100)),
position = position_stack(vjust = .5), color = clr_text,
size = 3.5
) +
scale_fill_manual(values = c("Value" = clr_accent, "Residual" = clr_muted),
labels = c("Value" = "Explained (R²)",
"Residual" = "Unexplained (1−R²)"),
name = NULL) +
scale_y_continuous(labels = percent_format(), expand = expansion(mult=c(0,.05))) +
labs(title = "Part (i) — R² Comparison: CAPM vs Fama–French Three-Factor",
subtitle = sprintf("Adding SMB & HML raises explained variance by %.1f pp · Adj-R² confirms genuine improvement",
delta_R2*100),
x = NULL, y = "Fraction of Variance") +
theme(legend.position = "bottom")Adjusted R²:
\[\bar{R}^2 = 1 - \frac{n-1}{n-k-1}(1-R^2)\]
| \(R^2\) | \(\bar{R}^2\) | Unexplained | |
|---|---|---|---|
| CAPM (k=1) | 0.75 | 0.7482 | 25% |
| FF-3F (k=3) | 0.92 | 0.9183 | 8% |
The rise from 0.75 → 0.92 shows that SMB and HML explain a substantial fraction of the fund’s return variation that the single market factor misses. The high Adjusted R² (0.918) confirms this improvement is not a mechanical artefact of adding predictors — both factors earn their inclusion after penalising for model complexity.
\[\text{logit}\,P(\text{Up}) = \beta_0 + \beta_1 r_{t-1} + \beta_2 \Delta VIX_{t-1}\] \[P(\text{Up}) = \sigma(\eta) = \frac{1}{1+e^{-\eta}}, \quad \eta = \beta_0 + \beta_1 r_{t-1} + \beta_2 \Delta VIX_{t-1}\]
Step 1 — Linear predictor (log-odds): \[\eta = -0.02 + 5.4\times0.010 + (-0.38)\times1.5\]
Step 2 — Sigmoid transformation: \[P(\text{Up}) = \frac{1}{1+e^{-\eta}}\]
eta <- beta0 + beta1 * r_lag + beta2 * delta_vix
P_up <- 1 / (1 + exp(-eta))
P_dn <- 1 - P_up
pred <- ifelse(P_up >= threshold, "Up", "Down")
odds <- P_up / P_dn
cat(sprintf("Linear predictor η = %.2f + %.1f×%.3f + (%.2f)×%.1f\n",
beta0, beta1, r_lag, beta2, delta_vix))#> Linear predictor η = -0.02 + 5.4×0.010 + (-0.38)×1.5
#> η = -0.02 + 0.0540 + -0.5700 = -0.5360
#> P(Up) = 1/(1+exp(0.5360)) = 0.3691
#> P(Down) = 1 − 0.3691 = 0.6309
#> Odds(Up) = 0.3691 / 0.6309 = 0.5851
#> Predicted class : Down (threshold = 0.5)
# Sigmoid curve + point annotation
eta_seq <- seq(-5, 5, .05)
df_sig <- tibble(eta = eta_seq, prob = 1/(1+exp(-eta_seq)))
ggplot(df_sig, aes(eta, prob)) +
geom_hline(yintercept = .5, color = clr_muted, linetype = "dashed", linewidth=.6) +
geom_ribbon(data = filter(df_sig, prob < .5),
aes(ymin=0, ymax=prob), fill=clr_danger, alpha=.12) +
geom_ribbon(data = filter(df_sig, prob >= .5),
aes(ymin=0, ymax=prob), fill=clr_accent2, alpha=.12) +
geom_line(color = clr_accent, linewidth = 1.1) +
# today's point
geom_vline(xintercept = eta, color = clr_warn, linewidth=1, linetype="dashed") +
geom_point(aes(x=eta, y=P_up), color=clr_warn, size=5, inherit.aes=FALSE) +
geom_label(aes(x=eta, y=P_up,
label=sprintf("η = %.4f\nP(Up) = %.4f\n→ Predicted: DOWN", eta, P_up)),
color=clr_warn, fill=clr_surface, size=3.2,
nudge_x=1.2, nudge_y=.08,
label.r=unit(.3,"lines"), label.size=.4, family="mono",
inherit.aes=FALSE) +
annotate("label", x=-3.5, y=.25, label="Predict DOWN\n(P < 0.5)",
color=clr_danger, fill=clr_surface, size=3.2,
label.r=unit(.2,"lines"), label.size=.3, family="mono") +
annotate("label", x=3.5, y=.75, label="Predict UP\n(P ≥ 0.5)",
color=clr_accent2, fill=clr_surface, size=3.2,
label.r=unit(.2,"lines"), label.size=.3, family="mono") +
scale_x_continuous(breaks=seq(-5,5,1)) +
scale_y_continuous(labels=percent_format(), breaks=seq(0,1,.25)) +
labs(title = "Part (j) — Logistic Sigmoid Curve",
subtitle = sprintf("Today's inputs: r(t-1)=%.3f, ΔVIX=%.1f · η=%.4f · P(Up)=%.4f → DOWN",
r_lag, delta_vix, eta, P_up),
x="Linear predictor η (log-odds)", y="P(Up)")\[\boxed{\eta = -0.536, \quad P(\text{Up}) = 0.3691 < 0.5 \;\Rightarrow\; \textbf{Predict: DOWN}}\]
# Marginal effect at the mean (approximate)
p_mean <- 0.5 # approx at logit midpoint
me_beta1 <- beta1 * p_mean * (1 - p_mean)
me_beta2 <- beta2 * p_mean * (1 - p_mean)
cat(sprintf("β₁ = %.1f (positive) => MOMENTUM effect\n", beta1))#> β₁ = 5.4 (positive) => MOMENTUM effect
#> Marginal effect of r(t-1) at P=0.5: Δ P(Up)/Δr ≈ 1.3500
cat(sprintf(" A 1pp higher lagged return increases P(Up) by ≈ %.2f pp at midpoint\n\n",
me_beta1*0.01*100))#> A 1pp higher lagged return increases P(Up) by ≈ 1.35 pp at midpoint
#> β₂ = -0.38 (negative) => VIX FEAR effect
#> Marginal effect of ΔVIX at P=0.5: Δ P(Up)/ΔΔVIX ≈ -0.0950
cat(sprintf(" A 1-unit rise in ΔVIX decreases P(Up) by ≈ %.2f pp at midpoint\n",
abs(me_beta2)*100))#> A 1-unit rise in ΔVIX decreases P(Up) by ≈ 9.50 pp at midpoint
# Visualise effect of r_lag and delta_vix on P(Up)
r_grid <- seq(-0.05, 0.05, length.out = 200)
vix_grid<- seq(-3, 3, length.out = 200)
df_r <- tibble(r=r_grid,
P_up=1/(1+exp(-(beta0 + beta1*r_grid + beta2*delta_vix))),
vary="Varying r(t−1), ΔVIX fixed at 1.5")
df_vix <- tibble(r=vix_grid,
P_up=1/(1+exp(-(beta0 + beta1*r_lag + beta2*vix_grid))),
vary="Varying ΔVIX, r(t−1) fixed at 0.010")
df_effects <- bind_rows(
rename(df_r, x=r),
rename(df_vix, x=r)
) %>% rename(group=vary)
ggplot(df_effects, aes(x, P_up, color=group)) +
geom_hline(yintercept=.5, color=clr_muted, linetype="dashed", linewidth=.5) +
geom_line(linewidth=1.1) +
facet_wrap(~group, scales="free_x") +
scale_color_manual(values=c(clr_accent, clr_accent2)) +
scale_y_continuous(labels=percent_format(), limits=c(0,1), breaks=seq(0,1,.25)) +
labs(title = "Part (k) — Marginal Effects of β₁ (Momentum) and β₂ (VIX Fear)",
subtitle = "β₁ > 0: positive lagged return raises P(Up) · β₂ < 0: rising VIX lowers P(Up)",
x = "Input value", y = "P(Up)") +
theme(legend.position="none")TP <- 67; FP <- 44; FN <- 33; TN <- 56; N <- 200
# Confusion matrix display
cm <- matrix(c(TP,FP,FN,TN), 2, 2, byrow=TRUE,
dimnames=list(c("Pred Up","Pred Down"), c("Actual Up","Actual Down")))
print(cm)#> Actual Up Actual Down
#> Pred Up 67 44
#> Pred Down 33 56
# Metrics
accuracy <- (TP + TN) / N
sensitivity <- TP / (TP + FN) # recall / TPR
specificity <- TN / (TN + FP) # TNR
precision <- TP / (TP + FP) # PPV
FPR <- FP / (FP + TN) # false positive rate
NPV <- TN / (TN + FN) # negative predictive value
F1 <- 2 * precision * sensitivity / (precision + sensitivity)
MCC_num <- (TP*TN - FP*FN)
MCC_den <- sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC <- MCC_num / MCC_den
metrics_tbl <- tibble(
Metric = c("Accuracy","Sensitivity (TPR)","Specificity (TNR)",
"Precision (PPV)","F1 Score","MCC",
"False Positive Rate","Neg. Pred. Value"),
Formula = c("(TP+TN)/N","TP/(TP+FN)","TN/(TN+FP)",
"TP/(TP+FP)","2·Prec·Sens/(Prec+Sens)","(TP·TN−FP·FN)/√(…)",
"FP/(FP+TN)","TN/(TN+FN)"),
Value = round(c(accuracy,sensitivity,specificity,precision,F1,MCC,FPR,NPV),4),
Pct = paste0(round(c(accuracy,sensitivity,specificity,precision,F1,MCC,FPR,NPV)*100,2),"%")
)
print(metrics_tbl)#> # A tibble: 8 × 4
#> Metric Formula Value Pct
#> <chr> <chr> <dbl> <chr>
#> 1 Accuracy (TP+TN)/N 0.615 61.5%
#> 2 Sensitivity (TPR) TP/(TP+FN) 0.67 67%
#> 3 Specificity (TNR) TN/(TN+FP) 0.56 56%
#> 4 Precision (PPV) TP/(TP+FP) 0.604 60.36%
#> 5 F1 Score 2·Prec·Sens/(Prec+Sens) 0.635 63.51%
#> 6 MCC (TP·TN−FP·FN)/√(…) 0.231 23.14%
#> 7 False Positive Rate FP/(FP+TN) 0.44 44%
#> 8 Neg. Pred. Value TN/(TN+FN) 0.629 62.92%
# Confusion matrix heatmap
df_cm <- tibble(
pred = factor(c("Pred Up","Pred Up","Pred Down","Pred Down"),
levels=c("Pred Down","Pred Up")),
actual = factor(c("Actual Up","Actual Down","Actual Up","Actual Down"),
levels=c("Actual Up","Actual Down")),
n = c(TP, FP, FN, TN),
type = c("TP","FP","FN","TN"),
fill_v = c(clr_accent2, clr_danger, clr_warn, clr_muted)
)
p_cm <- ggplot(df_cm, aes(actual, pred, fill=type)) +
geom_tile(color=clr_bg, linewidth=2) +
geom_text(aes(label=sprintf("%s\nn = %d", type, n)),
color=clr_bg, size=5.5, fontface="bold", lineheight=1.3, family="mono") +
scale_fill_manual(values=c("TP"=clr_accent2,"FP"=clr_danger,"FN"=clr_warn,"TN"=clr_muted)) +
labs(title="Confusion Matrix", x="Actual Class", y="Predicted Class") +
theme(legend.position="none",
panel.grid=element_blank(), panel.border=element_blank(),
axis.ticks=element_blank())
# Bar chart of metrics
p_metrics <- tibble(
Metric = c("Accuracy","Sensitivity","Specificity","Precision","F1"),
Value = c(accuracy,sensitivity,specificity,precision,F1),
Fill = c(clr_accent,clr_accent2,clr_muted,clr_warn,clr_danger)
) %>%
ggplot(aes(reorder(Metric,Value), Value, fill=Metric)) +
geom_col(width=.55, color=NA) +
geom_hline(yintercept=.5, color=clr_muted, linetype="dashed", linewidth=.5) +
geom_text(aes(label=sprintf("%.4f", Value)), hjust=-.1,
color=clr_text, size=3.5, family="mono") +
coord_flip() +
scale_fill_manual(values=c("Accuracy"=clr_accent,"Sensitivity"=clr_accent2,
"Specificity"=clr_muted,"Precision"=clr_warn,"F1"=clr_danger)) +
scale_y_continuous(limits=c(0,1), labels=percent_format(),
expand=expansion(mult=c(0,.25))) +
labs(title="Classification Metrics", x=NULL, y="Score") +
theme(legend.position="none")
p_cm | p_metrics| Metric | Formula | Calculation | Value |
|---|---|---|---|
| Accuracy | \((TP+TN)/N\) | \((67+56)/200\) | 0.615 |
| Sensitivity | \(TP/(TP+FN)\) | \(67/100\) | 0.67 |
| Specificity | \(TN/(TN+FP)\) | \(56/100\) | 0.56 |
| Precision | \(TP/(TP+FP)\) | \(67/111\) | 0.6036 |
| F1 Score | \(2 \cdot \text{Prec} \cdot \text{Sens}/(\text{Prec}+\text{Sens})\) | — | 0.6351 |
| MCC | \((TP\cdot TN-FP\cdot FN)/\sqrt{\ldots}\) | — | 0.2314 |
# Test set: 100 Up, 100 Down — perfectly balanced
naive_acc <- max(100,100) / N
beat_naive <- accuracy > naive_acc
improvement <- (accuracy - naive_acc) * 100
cat(sprintf("Test set composition : %d Up / %d Down (balanced)\n", 100, 100))#> Test set composition : 100 Up / 100 Down (balanced)
#> Naive majority accuracy : 0.5000 (50.00%)
#> Model accuracy : 0.6150 (61.50%)
#> Improvement over naive : +11.50 pp
#> Model beats naive? : YES
#> Why accuracy alone is inadequate:
#> 1. Error costs are asymmetric: a false 'Up' signal (FP) triggers a losing trade;
#> a missed 'Up' day (FN) is an opportunity cost — these are not equivalent.
#> 2. A balanced test set makes naive accuracy artificially easy to beat.
#> 3. High accuracy can coexist with a negative-Sharpe trading strategy.
#> More economically relevant criterion:
#> Primary : Sharpe ratio of the resulting trading strategy (profit-weighted signal quality)
#> Secondary: Precision (PPV) — of all 'Buy' signals, how many were profitable?
#> Also useful: F1-score (harmonic mean), MCC (robust to class imbalance)
df_compare <- tibble(
Classifier = c("Naive\n(majority class)", "Logistic\nRegression"),
Accuracy = c(naive_acc, accuracy),
color = c(clr_muted, clr_accent2)
)
ggplot(df_compare, aes(Classifier, Accuracy, fill=Classifier)) +
geom_col(width=.4, color=NA) +
geom_hline(yintercept=naive_acc, color=clr_warn, linetype="dashed", linewidth=.8) +
geom_text(aes(label=sprintf("%.4f\n(%.1f%%)", Accuracy, Accuracy*100)),
vjust=-.35, color=clr_text, size=4, family="mono") +
annotate("label", x=2.4, y=naive_acc,
label="Naive baseline (50%)",
color=clr_warn, fill=clr_surface, size=3.2,
label.r=unit(.2,"lines"), label.size=.3, family="mono") +
scale_fill_manual(values=c("Naive\n(majority class)"=clr_muted,
"Logistic\nRegression"=clr_accent2)) +
scale_y_continuous(labels=percent_format(), limits=c(0,.8),
expand=expansion(mult=c(0,.1))) +
labs(title = "Part (m) — Model vs. Naive Classifier Accuracy",
subtitle = sprintf("Model beats naive by +%.1f pp · but accuracy alone is insufficient for trading evaluation",
improvement),
x=NULL, y="Accuracy") +
theme(legend.position="none")mu_m <- 0.0070 # sample mean monthly return = 0.70%
sd_m <- 0.0550 # sample std dev monthly = 5.50%
n_mo <- 48 # number of monthsMonthly Sharpe Ratio: \[SR_{monthly} = \frac{\bar{R} - R_f}{s} = \frac{\mu}{\sigma}\]
Annualisation — with i.i.d. returns, annual mean = \(12\mu\) and annual \(\sigma = \sqrt{12}\,s\), so: \[SR_{annual} = \frac{12\,\mu}{\sqrt{12}\,s} = \sqrt{12} \cdot SR_{monthly}\]
SR_m <- mu_m / sd_m
scale <- sqrt(12)
SR_a <- SR_m * scale
# Standard error of SR (Jobson-Korkie approximation under normality)
SE_SR_m <- sqrt((1 + .5 * SR_m^2) / n_mo)
SE_SR_a <- SE_SR_m * scale
cat(sprintf("Mean monthly return (μ) : %.4f (%.4f%%)\n", mu_m, mu_m*100))#> Mean monthly return (μ) : 0.0070 (0.7000%)
#> Std dev monthly (σ) : 0.0550 (5.5000%)
#> Monthly Sharpe ratio : 0.1273
#> Scaling factor : √12 = 3.4641
#> Annualised Sharpe ratio : 0.4409
#>
#> Jobson-Korkie SE (monthly SR): 0.1449
#> Jobson-Korkie SE (annual SR) : 0.5020
#> 95% CI annual SR : [-0.5431, 1.4248]
# Simulate 48 monthly returns consistent with given parameters and show distribution
set.seed(2024)
sim_ret <- rnorm(n_mo, mu_m, sd_m)
SR_sim <- mean(sim_ret)/sd(sim_ret)
df_ret <- tibble(r = sim_ret)
p_hist <- ggplot(df_ret, aes(r)) +
geom_histogram(aes(y=after_stat(density)), bins=14,
fill=clr_accent, color=clr_bg, alpha=.8) +
geom_vline(xintercept=mu_m, color=clr_accent2, linewidth=1.1, linetype="solid") +
geom_vline(xintercept=0, color=clr_muted, linewidth=.7, linetype="dashed") +
stat_function(fun=dnorm, args=list(mean=mu_m,sd=sd_m),
color=clr_warn, linewidth=1) +
annotate("label", x=mu_m, y=7,
label=sprintf("μ = %.4f%%", mu_m*100),
color=clr_accent2, fill=clr_surface, size=3.2,
label.r=unit(.2,"lines"), label.size=.3, family="mono") +
scale_x_continuous(labels=function(x) paste0(x*100,"%")) +
labs(title="Monthly Return Distribution (simulated, n=48)",
subtitle=sprintf("SR(monthly) = %.4f · SR(annual) = %.4f · Scaling = √12 = %.4f",
SR_m, SR_a, scale),
x="Monthly Return", y="Density")
# SR comparison chart
df_sr <- tibble(
freq = c("Monthly", "Annualised"),
SR = c(SR_m, SR_a),
SE = c(SE_SR_m, SE_SR_a),
lo = c(SR_m - 1.96*SE_SR_m, SR_a - 1.96*SE_SR_a),
hi = c(SR_m + 1.96*SE_SR_m, SR_a + 1.96*SE_SR_a)
)
p_sr <- ggplot(df_sr, aes(freq, SR, color=freq)) +
geom_hline(yintercept=c(0,1), color=clr_muted, linetype="dashed", linewidth=.5) +
geom_errorbar(aes(ymin=lo, ymax=hi), width=.15, linewidth=1.2) +
geom_point(size=5) +
geom_text(aes(label=sprintf("SR = %.4f", SR)),
vjust=-1.2, size=3.5, family="mono", color=clr_text) +
scale_color_manual(values=c("Monthly"=clr_accent,"Annualised"=clr_accent2)) +
annotate("label", x=2.4, y=1,
label="Typical threshold\nSR = 1.0",
color=clr_muted, fill=clr_surface, size=3,
label.r=unit(.2,"lines"), label.size=.3, family="mono") +
labs(title="Sharpe Ratio with 95% CI (Jobson-Korkie)",
x=NULL, y="Sharpe Ratio") +
theme(legend.position="none")
p_hist | p_sr\[\boxed{SR_{monthly} = \frac{0.0070}{0.0550} = 0.1273, \qquad SR_{annual} = 0.1273 \times \sqrt{12} = 0.4409}\]
set.seed(42)
# Simulate actual return series (use what we have)
ret_series <- rnorm(n_mo, mu_m, sd_m)
SR_obs <- mean(ret_series) / sd(ret_series)
sharpe_fn <- function(x) mean(x) / sd(x)
# ── 1) i.i.d. Bootstrap ──────────────────────────────────────────────────────
B <- 5000
set.seed(42)
sr_iid <- replicate(B, {
idx <- sample(n_mo, n_mo, replace=TRUE)
sharpe_fn(ret_series[idx])
})
SE_iid <- sd(sr_iid)
CI_iid <- quantile(sr_iid, c(.025,.975))
# ── 2) Stationary Block Bootstrap (block length l=3) ─────────────────────────
block_boot <- function(x, B=5000, l=3) {
n <- length(x)
n_blks <- ceiling(n / l)
replicate(B, {
starts <- sample(1:(n - l + 1), n_blks, replace=TRUE)
resample <- unlist(lapply(starts, function(s) x[s:(s+l-1)]))[1:n]
sharpe_fn(resample)
})
}
set.seed(42)
sr_block <- block_boot(ret_series, B=B, l=3)
SE_block <- sd(sr_block)
CI_block <- quantile(sr_block, c(.025,.975))
cat(sprintf("Observed SR (monthly) : %.4f\n\n", SR_obs))#> Observed SR (monthly) : 0.0730
#> i.i.d. Bootstrap (B=5000)
#> SE : 0.1480
#> 95% CI (percentile) : [-0.2026, 0.3689]
#> Block Bootstrap (l=3, B=5000)
#> SE : 0.1581
#> 95% CI (percentile) : [-0.2537, 0.3660]
#> Why i.i.d. is inappropriate: monthly returns exhibit
#> autocorrelation and volatility clustering — i.i.d. bootstrap
#> destroys the time-dependence structure, underestimating SE.
#> Fix: Stationary Block Bootstrap (Politis & Romano 1994)
#> samples contiguous BLOCKS of l months, preserving within-block dependence.
df_boot <- bind_rows(
tibble(SR = sr_iid, Method = "i.i.d. Bootstrap"),
tibble(SR = sr_block, Method = "Block Bootstrap (l=3)")
)
ggplot(df_boot, aes(SR, fill=Method, color=Method)) +
geom_histogram(aes(y=after_stat(density)), bins=50,
alpha=.55, position="identity") +
geom_vline(aes(xintercept=SR_obs), color=clr_warn,
linewidth=1.1, linetype="dashed") +
facet_wrap(~Method, ncol=2) +
annotate("label", x=SR_obs, y=Inf,
label=sprintf("SR_obs\n= %.4f", SR_obs),
vjust=1.3, color=clr_warn, fill=clr_surface, size=3,
label.r=unit(.2,"lines"), label.size=.3, family="mono") +
scale_fill_manual(values=c("i.i.d. Bootstrap"=clr_accent,
"Block Bootstrap (l=3)"=clr_accent2)) +
scale_color_manual(values=c("i.i.d. Bootstrap"=clr_accent,
"Block Bootstrap (l=3)"=clr_accent2)) +
labs(title = "Part (o) — Bootstrap Distributions of Monthly Sharpe Ratio",
subtitle = sprintf("i.i.d. SE = %.4f · Block SE = %.4f · Block bootstrap better respects serial dependence",
SE_iid, SE_block),
x="Bootstrap Sharpe Ratio", y="Density") +
theme(legend.position="none")Bootstrap procedure (step by step):
lambda_min <- 0.030; k_min <- 14
lambda_1se <- 0.065; k_1se <- 7
n_cand <- 60
cat(sprintf("Candidate factors : %d\n", n_cand))#> Candidate factors : 60
#> λ_min = 0.030 ⇒ 14 factors retained
#> λ_1SE = 0.065 ⇒ 7 factors retained
#>
#> RECOMMENDATION: Deploy λ_1SE = 0.065 (7 factors)
#> Rationale:
#> 1. Overfitting risk: λ_min retains 14/60 factors; with limited history many
#> are spurious correlations that won't persist out-of-sample (multiple testing).
#> 2. The 1-SE rule: difference in CV error between the two λ values is
#> within one SE — statistically indistinguishable, so prefer parsimony.
#> 3. Generalization: 7-factor model is more stable, more interpretable,
#> and more robust to structural breaks in factor premia.
#> 4. Transaction costs: fewer factors mean fewer required trades on rebalance.
# Simulate a LASSO regularisation path for illustration
set.seed(99)
lambda_seq <- exp(seq(log(.005), log(.2), length.out=100))
# Simulate number of active factors as function of lambda
k_path <- pmax(0, round(20 * exp(-12*(lambda_seq - .01)) + 2))
# Simulate CV error curve (U-shaped)
cv_mean <- .45 + .6*(lambda_seq - .03)^2 / .002 +
.015*exp(-25*(lambda_seq - .12)^2)
# add noise
set.seed(99)
cv_se <- abs(rnorm(length(lambda_seq), .02, .004))
cv_up <- cv_mean + cv_se
cv_lo <- cv_mean - cv_se
# Identify min and 1se positions
idx_min <- which.min(cv_mean)
cv_at_min <- cv_mean[idx_min]
idx_1se <- max(which(cv_mean <= cv_at_min + cv_se[idx_min]))
l_min_val <- lambda_seq[idx_min]
l_1se_val <- lambda_seq[idx_1se]
df_cv <- tibble(lambda=lambda_seq, cv=cv_mean, lo=cv_lo, hi=cv_up, k=k_path)
p1 <- ggplot(df_cv, aes(log(lambda), cv)) +
geom_ribbon(aes(ymin=lo,ymax=hi), fill=clr_accent, alpha=.18) +
geom_line(color=clr_accent, linewidth=1) +
geom_vline(xintercept=log(l_min_val), color=clr_accent2, linetype="solid", linewidth=.9) +
geom_vline(xintercept=log(l_1se_val), color=clr_warn, linetype="dashed", linewidth=.9) +
annotate("label", x=log(l_min_val), y=max(cv_mean)*.9,
label=sprintf("λ_min\n= %.3f", l_min_val),
color=clr_accent2, fill=clr_surface, size=3,
label.r=unit(.2,"lines"), label.size=.3, family="mono") +
annotate("label", x=log(l_1se_val), y=max(cv_mean)*.75,
label=sprintf("λ_1SE\n= %.3f\n[DEPLOY]", l_1se_val),
color=clr_warn, fill=clr_surface, size=3,
label.r=unit(.2,"lines"), label.size=.3, family="mono") +
labs(title="Cross-Validation Error vs log(λ)", subtitle="Ribbon = ±1 SE band",
x="log(λ)", y="CV Error")
p2 <- ggplot(df_cv, aes(log(lambda), k)) +
geom_line(color=clr_danger, linewidth=1) +
geom_vline(xintercept=log(l_min_val), color=clr_accent2, linetype="solid", linewidth=.9) +
geom_vline(xintercept=log(l_1se_val), color=clr_warn, linetype="dashed", linewidth=.9) +
annotate("point", x=log(l_min_val), y=k_path[idx_min],
color=clr_accent2, size=4) +
annotate("point", x=log(l_1se_val), y=k_path[idx_1se],
color=clr_warn, size=4) +
annotate("label", x=log(l_min_val), y=k_path[idx_min]+1.5,
label=sprintf("k=%d", k_path[idx_min]),
color=clr_accent2, fill=clr_surface, size=3,
label.r=unit(.2,"lines"), label.size=.3, family="mono") +
annotate("label", x=log(l_1se_val), y=k_path[idx_1se]+1.5,
label=sprintf("k=%d", k_path[idx_1se]),
color=clr_warn, fill=clr_surface, size=3,
label.r=unit(.2,"lines"), label.size=.3, family="mono") +
labs(title="Active Factors vs log(λ)", subtitle="1-SE rule gives sparser, more robust model",
x="log(λ)", y="# Active Factors")
p1 / p2The 1-SE rule chooses the most regularised model whose CV error is statistically indistinguishable from the minimum-error model. With 60 candidate factors and limited data, λ_min’s 14-factor model is at serious risk of in-sample overfitting. The sparser 7-factor model generalises better, costs less to trade, and is easier to monitor and interpret.
total_mo <- 60
init_train <- 36
step_sz <- 1
cat("=== Walk-Forward (Expanding Window) Scheme ===\n")#> === Walk-Forward (Expanding Window) Scheme ===
#> Total months : 60
#> Initial window : 36 months (months 1–36 → predict month 37)
#> Step : 1 month(s)
#> Fold Train (months) Test (month)
#> ────────────────────────────────────────────────────────
for (fold in 1:(total_mo - init_train)) {
train_end <- init_train + fold - 1
test_t <- init_train + fold
if (test_t <= total_mo)
cat(sprintf("%-6d 1 – %-24d Month %d\n", fold, train_end, test_t))
}#> 1 1 – 36 Month 37
#> 2 1 – 37 Month 38
#> 3 1 – 38 Month 39
#> 4 1 – 39 Month 40
#> 5 1 – 40 Month 41
#> 6 1 – 41 Month 42
#> 7 1 – 42 Month 43
#> 8 1 – 43 Month 44
#> 9 1 – 44 Month 45
#> 10 1 – 45 Month 46
#> 11 1 – 46 Month 47
#> 12 1 – 47 Month 48
#> 13 1 – 48 Month 49
#> 14 1 – 49 Month 50
#> 15 1 – 50 Month 51
#> 16 1 – 51 Month 52
#> 17 1 – 52 Month 53
#> 18 1 – 53 Month 54
#> 19 1 – 54 Month 55
#> 20 1 – 55 Month 56
#> 21 1 – 56 Month 57
#> 22 1 – 57 Month 58
#> 23 1 – 58 Month 59
#> 24 1 – 59 Month 60
#> ────────────────────────────────────────────────────────
#>
#> Why k-fold random CV is UNSAFE for time series:
#> 1. LOOK-AHEAD BIAS: future obs land in training folds,
#> model 'sees the future' => inflated OOS performance.
#> 2. TEMPORAL STRUCTURE BROKEN: autocorrelation and regime
#> dynamics are destroyed by random shuffling.
#> 3. DEPLOYMENT MISMATCH: live trading always trains on past,
#> predicts future — k-fold does not replicate this.
# Visual timeline of walk-forward folds
n_folds_show <- 6
df_wf <- map_dfr(1:n_folds_show, function(fold) {
train_end <- init_train + fold - 1
test_t <- init_train + fold
bind_rows(
tibble(fold=fold, month=1:train_end, type="Train"),
tibble(fold=fold, month=test_t, type="Test"),
tibble(fold=fold, month=(test_t+1):total_mo, type="Unused")
)
}) %>% mutate(
fold = factor(paste0("Fold ", fold), levels=paste0("Fold ", n_folds_show:1)),
type = factor(type, levels=c("Train","Test","Unused"))
)
ggplot(df_wf, aes(month, fold, fill=type)) +
geom_tile(color=clr_bg, linewidth=.5, height=.7) +
geom_vline(xintercept=init_train+.5,
color=clr_muted, linetype="dashed", linewidth=.7) +
annotate("label", x=init_train+.5, y=n_folds_show+.6,
label="Initial\ntrain end",
color=clr_muted, fill=clr_surface, size=3,
label.r=unit(.2,"lines"), label.size=.3, family="mono") +
scale_fill_manual(values=c("Train"=clr_accent,"Test"=clr_accent2,"Unused"=clr_border),
name="Month role") +
scale_x_continuous(breaks=seq(1,total_mo,5)) +
labs(title = "Part (q) — Walk-Forward (Expanding Window) Cross-Validation",
subtitle = "Test window always AFTER training window — no look-ahead bias",
x = "Month index", y = NULL) +
theme(legend.position="bottom",
axis.text.y=element_text(size=10))Walk-forward scheme: fix an initial training window → fit & select λ → predict one step ahead → expand by one month → repeat. Evaluate performance only on the concatenated out-of-sample returns.
summary_tbl <- tribble(
~Question, ~Formula, ~Value, ~Decision,
"Q1(a) t(β=0)", "β̂/SE(β̂)", "5.7647", "Reject H₀ ✓",
"Q1(b) t(β=1)", "(β̂−1)/SE(β̂)", "−0.1176", "Fail to reject H₀",
"Q1(c) t(α)", "α̂/SE(α̂)", "0.8500", "Fail to reject H₀",
"Q1(d) R² split", "R² vs 1−R²", "50% / 50%", "Systematic / Idiosyncratic",
"Q1(e) E[Ri−Rf] CAPM", "β×E[MKT]", "0.6860%/mo", "—",
"Q2(f) t: α", "α̂/SE(α̂)", "1.6111", "Significant ✓",
"Q2(f) t: MKT", "b/SE(b)", "12.1250", "Significant ✓",
"Q2(f) t: SMB", "s/SE(s)", "6.8182", "Significant ✓",
"Q2(f) t: HML", "h/SE(h)", "−1.0000", "Not significant",
"Q2(g) Style", "sign(s), sign(h)", "s>0, h<0", "Small-cap Growth",
"Q2(h) Manager alpha", "α̂ = 0.29%/mo", "t = 1.6111", "Significant → skill ✓",
"Q2(i) ΔR²", "0.92 − 0.75", "+0.17", "SMB+HML explain more",
"Q3(j) P(Up)", "σ(η), η=−0.536", "0.3691", "Predict: DOWN",
"Q3(l) Accuracy", "(TP+TN)/N", "0.6150", "Beats naive (50%) ✓",
"Q3(l) Sensitivity", "TP/(TP+FN)", "0.6700", "67% of Up days caught",
"Q3(l) Specificity", "TN/(TN+FP)", "0.5600", "56% of Down caught",
"Q3(l) Precision", "TP/(TP+FP)", "0.6036", "60.4% Buy signals correct",
"Q4(n) SR monthly", "μ/σ", "0.1273", "—",
"Q4(n) SR annualised", "SR×√12", "0.4409", "Scaling = √12",
"Q4(o) Bootstrap", "Block bootstrap (l=3)", "SE≈0.15", "Block > i.i.d. for TS",
"Q4(p) LASSO λ", "1-SE rule", "λ=0.065, k=7", "More robust, less overfit",
"Q4(q) CV scheme", "Walk-forward expanding window", "No leakage", "Random k-fold is unsafe"
)
kbl(summary_tbl,
caption = "Complete Answer Summary — All 22 Key Results",
col.names = c("Question", "Formula / Method", "Value", "Decision / Interpretation"),
align = "llll") %>%
kable_styling(bootstrap_options=c("striped","hover","condensed"),
full_width=TRUE, font_size=13) %>%
row_spec(0, bold=TRUE, color="white", background="#1e2333") %>%
row_spec(c(1:5), background="#0f1117", color="#e2e8f0") %>%
row_spec(c(6:12), background="#181c25", color="#e2e8f0") %>%
row_spec(c(13:17), background="#0f1117", color="#e2e8f0") %>%
row_spec(c(18:22), background="#181c25", color="#e2e8f0") %>%
column_spec(3, bold=TRUE, color="#6ee7b7") %>%
column_spec(4, color="#94a3b8")| Question | Formula / Method | Value | Decision / Interpretation |
|---|---|---|---|
| Q1(a) t(β=0) | β̂/SE(β̂) | 5.7647 | Reject H₀ ✓ |
| Q1(b) t(β=1) | (β̂−1)/SE(β̂) | −0.1176 | Fail to reject H₀ |
| Q1(c) t(α) | α̂/SE(α̂) | 0.8500 | Fail to reject H₀ |
| Q1(d) R² split | R² vs 1−R² | 50% / 50% | Systematic / Idiosyncratic |
| Q1(e) E[Ri−Rf] CAPM | β×E[MKT] | 0.6860%/mo | — |
| Q2(f) t: α | α̂/SE(α̂) | 1.6111 | Significant ✓ |
| Q2(f) t: MKT | b/SE(b) | 12.1250 | Significant ✓ |
| Q2(f) t: SMB | s/SE(s) | 6.8182 | Significant ✓ |
| Q2(f) t: HML | h/SE(h) | −1.0000 | Not significant |
| Q2(g) Style | sign(s), sign(h) | s>0, h<0 | Small-cap Growth |
| Q2(h) Manager alpha | α̂ = 0.29%/mo | t = 1.6111 | Significant → skill ✓ |
| Q2(i) ΔR² | 0.92 − 0.75 | +0.17 | SMB+HML explain more |
| Q3(j) P(Up) | σ(η), η=−0.536 | 0.3691 | Predict: DOWN |
| Q3(l) Accuracy | (TP+TN)/N | 0.6150 | Beats naive (50%) ✓ |
| Q3(l) Sensitivity | TP/(TP+FN) | 0.6700 | 67% of Up days caught |
| Q3(l) Specificity | TN/(TN+FP) | 0.5600 | 56% of Down caught |
| Q3(l) Precision | TP/(TP+FP) | 0.6036 | 60.4% Buy signals correct |
| Q4(n) SR monthly | μ/σ | 0.1273 | — |
| Q4(n) SR annualised | SR×√12 | 0.4409 | Scaling = √12 |
| Q4(o) Bootstrap | Block bootstrap (l=3) | SE≈0.15 | Block > i.i.d. for TS |
| Q4(p) LASSO λ | 1-SE rule | λ=0.065, k=7 | More robust, less overfit |
| Q4(q) CV scheme | Walk-forward expanding window | No leakage | Random k-fold is unsafe |
Machine Learning Applications in Finance — Final
Examination
Rendered: June 08, 2026 at 13:44