library(tidyverse)   # ggplot2 + dplyr + tidyr + readr
library(scales)      # percent / comma formatting in plots
library(patchwork)   # composing multi-panel figures
library(kableExtra)  # enhanced HTML tables

# ── Global ggplot2 dark theme ──────────────────────────────────────────────
clr_bg      <- "#0f1117"
clr_surface <- "#181c25"
clr_card    <- "#1e2333"
clr_border  <- "#2a3145"
clr_accent  <- "#4f8ef7"
clr_accent2 <- "#6ee7b7"
clr_warn    <- "#fbbf24"
clr_danger  <- "#f87171"
clr_text    <- "#e2e8f0"
clr_muted   <- "#94a3b8"

theme_exam <- function(base_size = 12) {
  theme_minimal(base_size = base_size) %+replace%
    theme(
      plot.background    = element_rect(fill = clr_bg,      color = NA),
      panel.background   = element_rect(fill = clr_surface, color = NA),
      panel.grid.major   = element_line(color = clr_border, linewidth = .35),
      panel.grid.minor   = element_line(color = clr_border, linewidth = .15, linetype = "dotted"),
      panel.border       = element_rect(color = clr_border, fill = NA, linewidth = .5),
      axis.text          = element_text(color = clr_muted,  family = "mono", size = 10),
      axis.title         = element_text(color = clr_text,   family = "mono", size = 11, face = "bold"),
      axis.ticks         = element_line(color = clr_border),
      plot.title         = element_text(color = clr_text,   family = "sans",  size = 14, face = "bold",
                                        margin = margin(b = 4)),
      plot.subtitle      = element_text(color = clr_muted,  family = "mono",  size = 10,
                                        margin = margin(b = 12)),
      plot.caption       = element_text(color = clr_muted,  family = "mono",  size = 9,  hjust = 1),
      strip.background   = element_rect(fill = clr_card,    color = clr_border),
      strip.text         = element_text(color = clr_accent2, family = "mono", size = 10, face = "bold"),
      legend.background  = element_rect(fill = clr_surface, color = clr_border),
      legend.key         = element_rect(fill = clr_surface, color = NA),
      legend.text        = element_text(color = clr_muted,  family = "mono",  size = 10),
      legend.title       = element_text(color = clr_text,   family = "mono",  size = 10, face = "bold"),
      plot.margin        = margin(16, 16, 12, 16)
    )
}

theme_set(theme_exam())

# Accent palette vector (reusable)
pal <- c(clr_accent, clr_accent2, clr_warn, clr_danger, clr_muted)

1 Question 1 — Single-Factor (Market) Model

The regression model estimated over n = 96 monthly observations is:

\[R_i - R_f = \alpha + \beta\,(R_m - R_f) + \varepsilon, \qquad \varepsilon \sim \mathcal{N}(0,\,\sigma^2_\varepsilon)\]

alpha_hat  <- 0.0017;  se_alpha  <- 0.0020
beta_hat   <- 0.98;    se_beta   <- 0.17
R2         <- 0.50
n          <- 96
E_mkt_prem <- 0.0070   # E[Rm - Rf] monthly
t_crit     <- 1.98     # two-tailed 5% critical value
df         <- n - 2    # degrees of freedom

1.1 Part (a) — Test H₀: β = 0

Formula: \[t_{\hat\beta} = \frac{\hat\beta - \beta_0}{SE(\hat\beta)} = \frac{0.98 - 0}{0.17}\]

t_beta_0   <- (beta_hat - 0) / se_beta
p_beta_0   <- 2 * pt(-abs(t_beta_0), df = df)   # two-tailed p-value
ci_lo_beta <- beta_hat - t_crit * se_beta
ci_hi_beta <- beta_hat + t_crit * se_beta

cat(sprintf("t-statistic        : %.4f\n", t_beta_0))

#> t-statistic        : 5.7647

cat(sprintf("p-value (approx)   : %.6f\n", p_beta_0))

#> p-value (approx)   : 0.000000

cat(sprintf("95%% CI for beta    : [%.4f, %.4f]\n", ci_lo_beta, ci_hi_beta))

#> 95% CI for beta    : [0.6434, 1.3166]

cat(sprintf("Critical |t| @ 5%%  : %.2f\n", t_crit))

#> Critical |t| @ 5%  : 1.98

cat(sprintf("Decision           : %s H0: beta = 0\n",
            ifelse(abs(t_beta_0) > t_crit, "REJECT", "FAIL TO REJECT")))

#> Decision           : REJECT H0: beta = 0

# t-distribution with rejection regions
t_seq  <- seq(-7, 7, length.out = 1000)
t_dens <- dt(t_seq, df = df)

df_tdist <- tibble(t = t_seq, density = t_dens)

ggplot(df_tdist, aes(t, density)) +
  # rejection regions
  geom_area(data = filter(df_tdist, t <= -t_crit),
            fill = clr_danger, alpha = .35) +
  geom_area(data = filter(df_tdist, t >= t_crit),
            fill = clr_danger, alpha = .35) +
  # acceptance region
  geom_area(data = filter(df_tdist, t >= -t_crit & t <= t_crit),
            fill = clr_accent, alpha = .12) +
  # curve
  geom_line(color = clr_accent, linewidth = .9) +
  # critical value lines
  geom_vline(xintercept = c(-t_crit, t_crit),
             color = clr_danger, linetype = "dashed", linewidth = .7) +
  # observed t-statistic
  geom_vline(xintercept = t_beta_0,
             color = clr_accent2, linewidth = 1.1) +
  annotate("label",
           x = t_beta_0, y = max(t_dens) * .7,
           label = sprintf("t = %.4f", t_beta_0),
           color = clr_accent2, fill = clr_surface, size = 3.5,
           label.r = unit(.25, "lines"), label.size = .4,
           family = "mono") +
  annotate("label",
           x = t_crit + .2, y = max(t_dens) * .55,
           label = sprintf("±%.2f", t_crit),
           color = clr_danger, fill = clr_surface, size = 3.2,
           label.r = unit(.2, "lines"), label.size = .3,
           family = "mono") +
  scale_x_continuous(breaks = seq(-6, 6, 2)) +
  labs(title    = "Part (a) — t-test for H₀: β = 0",
       subtitle = sprintf("t(%d) distribution  ·  observed t = %.4f  ·  p ≈ %.4f  →  REJECT H₀",
                          df, t_beta_0, p_beta_0),
       x = "t-statistic", y = "Density",
       caption = "Shaded red = rejection region (α = 0.05, two-tailed)") +
  theme(plot.subtitle = element_text(size = 9.5))

✓ Decision: Reject H₀ at 5% level

t = 5.7647 far exceeds the critical value ±1.98. The market beta is highly statistically significant (p ≈ 0).
Economic interpretation: β = 0.98 implies the fund moves almost one-for-one with the market. A 1 percentage-point rise in the market excess return is associated with a 0.98 pp rise in the fund’s excess return. β < 1 indicates marginally lower systematic risk than the market portfolio — the fund is essentially a near-passive vehicle with slightly reduced market sensitivity.

1.2 Part (b) — Test H₀: β = 1

Formula: \[t_{\hat\beta = 1} = \frac{\hat\beta - 1}{SE(\hat\beta)} = \frac{0.98 - 1}{0.17}\]

t_beta_1 <- (beta_hat - 1) / se_beta
p_beta_1 <- 2 * pt(-abs(t_beta_1), df = df)

cat(sprintf("t-statistic        : %.4f\n", t_beta_1))

#> t-statistic        : -0.1176

cat(sprintf("p-value (approx)   : %.6f\n", p_beta_1))

#> p-value (approx)   : 0.906598

cat(sprintf("95%% CI for beta    : [%.4f, %.4f]\n", ci_lo_beta, ci_hi_beta))

#> 95% CI for beta    : [0.6434, 1.3166]

cat(sprintf("Beta = 1 in CI?    : %s\n",
            ifelse(ci_lo_beta <= 1 & 1 <= ci_hi_beta, "YES", "NO")))

#> Beta = 1 in CI?    : YES

cat(sprintf("Decision           : %s H0: beta = 1\n",
            ifelse(abs(t_beta_1) > t_crit, "REJECT", "FAIL TO REJECT")))

#> Decision           : FAIL TO REJECT H0: beta = 1

# Confidence interval plot for beta
df_beta <- tibble(
  label    = c("H₀: β = 1", "Point estimate β̂", "95% CI lower", "95% CI upper"),
  value    = c(1, beta_hat, ci_lo_beta, ci_hi_beta),
  is_param = c(TRUE, FALSE, FALSE, FALSE)
)

ggplot() +
  # CI ribbon
  annotate("rect",
           xmin = ci_lo_beta, xmax = ci_hi_beta,
           ymin = -Inf, ymax = Inf,
           fill = clr_accent, alpha = .12) +
  # CI boundary lines
  geom_vline(xintercept = c(ci_lo_beta, ci_hi_beta),
             color = clr_accent, linetype = "dashed", linewidth = .7) +
  # beta = 1 null
  geom_vline(xintercept = 1,
             color = clr_warn, linewidth = 1.1, linetype = "solid") +
  # point estimate
  geom_vline(xintercept = beta_hat,
             color = clr_accent2, linewidth = 1.3) +
  annotate("label", x = beta_hat, y = 1,
           label = sprintf("β̂ = %.2f", beta_hat),
           color = clr_accent2, fill = clr_surface, size = 3.8,
           label.r = unit(.25, "lines"), label.size = .35, family = "mono") +
  annotate("label", x = 1, y = .5,
           label = "H₀: β = 1",
           color = clr_warn, fill = clr_surface, size = 3.5,
           label.r = unit(.2, "lines"), label.size = .3, family = "mono") +
  annotate("label", x = ci_lo_beta, y = .25,
           label = sprintf("%.4f", ci_lo_beta),
           color = clr_accent, fill = clr_surface, size = 3,
           label.r = unit(.2, "lines"), label.size = .3, family = "mono") +
  annotate("label", x = ci_hi_beta, y = .25,
           label = sprintf("%.4f", ci_hi_beta),
           color = clr_accent, fill = clr_surface, size = 3,
           label.r = unit(.2, "lines"), label.size = .3, family = "mono") +
  scale_x_continuous(limits = c(0.4, 1.6), breaks = seq(.4, 1.6, .2)) +
  scale_y_continuous(limits = c(0, 1.5)) +
  labs(title    = "Part (b) — 95% Confidence Interval for β vs. H₀: β = 1",
       subtitle = sprintf("CI: [%.4f, %.4f]  ·  H₀ value β = 1 lies INSIDE the CI  →  FAIL TO REJECT",
                          ci_lo_beta, ci_hi_beta),
       x = "β", y = NULL) +
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(),
        panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank())

⚠ Decision: Fail to Reject H₀: β = 1

t = -0.1176, p ≈ 0.9066 — the null value β = 1 lies comfortably inside the 95% CI [0.6434, 1.3166]. The fund’s systematic risk is statistically indistinguishable from the market. Despite the point estimate of 0.98, the data provide no evidence that the fund carries materially less market exposure than a passive index.

1.3 Part (c) — Jensen’s Alpha (H₀: α = 0)

Formula: \[t_{\hat\alpha} = \frac{\hat\alpha - 0}{SE(\hat\alpha)} = \frac{0.0017}{0.0020}\]

t_alpha  <- (alpha_hat - 0) / se_alpha
p_alpha  <- 2 * pt(-abs(t_alpha), df = df)
ci_lo_a  <- alpha_hat - t_crit * se_alpha
ci_hi_a  <- alpha_hat + t_crit * se_alpha
alpha_ann <- (1 + alpha_hat)^12 - 1

cat(sprintf("Jensen's alpha (α̂) : %.4f  (%.4f%% / month)\n", alpha_hat, alpha_hat*100))

#> Jensen's alpha (α̂) : 0.0017  (0.1700% / month)

cat(sprintf("Annualized alpha   : %.4f  (%.4f%% / year)\n",  alpha_ann, alpha_ann*100))

#> Annualized alpha   : 0.0206  (2.0592% / year)

cat(sprintf("t-statistic        : %.4f\n", t_alpha))

#> t-statistic        : 0.8500

cat(sprintf("p-value (approx)   : %.6f\n", p_alpha))

#> p-value (approx)   : 0.397485

cat(sprintf("95%% CI for alpha   : [%.4f, %.4f]\n", ci_lo_a, ci_hi_a))

#> 95% CI for alpha   : [-0.0023, 0.0057]

cat(sprintf("Decision           : %s H0: alpha = 0\n",
            ifelse(abs(t_alpha) > t_crit, "REJECT", "FAIL TO REJECT")))

#> Decision           : FAIL TO REJECT H0: alpha = 0

✗ Marketing claim NOT statistically justified

t = 0.85 < 1.98. We fail to reject H₀: α = 0 (p ≈ 0.3975). The 95% CI [-0.0023, 0.0057] straddles zero. Although the point estimate α̂ = 0.17%/month (≈ 2.06%/yr annualised) is positive, it is not statistically distinguishable from zero given the standard error. The marketing team cannot legitimately claim positive risk-adjusted performance on the basis of these data.

1.4 Part (d) — Interpreting R²

systematic    <- R2
idiosyncratic <- 1 - R2

cat(sprintf("R² = %.4f\n", R2))

#> R² = 0.5000

cat(sprintf("Systematic (market) variance fraction    : %.4f (%.1f%%)\n",
            systematic, systematic*100))

#> Systematic (market) variance fraction    : 0.5000 (50.0%)

cat(sprintf("Idiosyncratic (diversifiable) fraction   : %.4f (%.1f%%)\n",
            idiosyncratic, idiosyncratic*100))

#> Idiosyncratic (diversifiable) fraction   : 0.5000 (50.0%)

df_r2 <- tibble(
  component = c("Systematic\n(Market Risk)", "Idiosyncratic\n(Diversifiable)"),
  fraction  = c(systematic, idiosyncratic),
  fill_col  = c(clr_accent, clr_muted)
)

ggplot(df_r2, aes(x = "", y = fraction, fill = component)) +
  geom_col(width = .55, color = clr_bg, linewidth = 1.5) +
  geom_text(aes(label = sprintf("%.0f%%\n%s", fraction*100, component)),
            position = position_stack(vjust = .5),
            color = clr_bg, size = 4.5, fontface = "bold",
            lineheight = 1.2) +
  coord_flip() +
  scale_fill_manual(values = c("Systematic\n(Market Risk)" = clr_accent,
                                "Idiosyncratic\n(Diversifiable)" = clr_muted)) +
  labs(title    = "Part (d) — Variance Decomposition via R²",
       subtitle = "R² = 0.50  ·  equal split between systematic and idiosyncratic risk",
       x = NULL, y = "Fraction of Total Return Variance") +
  theme(legend.position = "none",
        axis.text.y     = element_blank(),
        axis.ticks.y    = element_blank())

Interpretation: \(R^2 = 0.50\) means exactly 50% of the fund’s return variance is driven by market-wide (systematic) risk, while the remaining 50% is idiosyncratic — stock-specific or strategy-specific variation that a diversified investor could, in principle, eliminate. For an equity fund, an R² of 0.50 is relatively low, suggesting active bets far from the index.

1.5 Part (e) — CAPM-Implied Expected Monthly Excess Return

Formula: \[E[R_i - R_f] = \hat\beta \times E[R_m - R_f] = 0.98 \times 0.0070\]

capm_expected     <- beta_hat * E_mkt_prem
capm_expected_ann <- (1 + capm_expected)^12 - 1
alpha_contribution <- alpha_hat   # monthly
total_expected     <- capm_expected + alpha_contribution

cat(sprintf("β̂                                 : %.4f\n", beta_hat))

#> β̂                                 : 0.9800

cat(sprintf("E[Rm − Rf] monthly                : %.4f  (%.4f%%)\n",
            E_mkt_prem, E_mkt_prem*100))

#> E[Rm − Rf] monthly                : 0.0070  (0.7000%)

cat(sprintf("CAPM-implied E[Ri − Rf] monthly   : %.4f  (%.4f%%)\n",
            capm_expected, capm_expected*100))

#> CAPM-implied E[Ri − Rf] monthly   : 0.0069  (0.6860%)

cat(sprintf("CAPM-implied E[Ri − Rf] annualised : %.4f  (%.4f%%)\n",
            capm_expected_ann, capm_expected_ann*100))

#> CAPM-implied E[Ri − Rf] annualised : 0.0855  (8.5498%)

cat(sprintf("\n--- Return Attribution (point estimates, not stat. significant) ---\n"))

#> 
#> --- Return Attribution (point estimates, not stat. significant) ---

cat(sprintf("CAPM component (β × MKT premium)  : %.4f%%\n", capm_expected*100))

#> CAPM component (β × MKT premium)  : 0.6860%

cat(sprintf("Alpha component (α̂)               : %.4f%%\n", alpha_contribution*100))

#> Alpha component (α̂)               : 0.1700%

cat(sprintf("Total monthly expected return     : %.4f%%\n", total_expected*100))

#> Total monthly expected return     : 0.8560%

df_attr <- tibble(
  component = factor(c("β × E[MKT]", "α̂ (not significant)"),
                     levels = c("β × E[MKT]", "α̂ (not significant)")),
  value     = c(capm_expected, alpha_contribution) * 100,
  color     = c(clr_accent, clr_warn)
)

ggplot(df_attr, aes(x = component, y = value, fill = component)) +
  geom_col(width = .45, color = NA) +
  geom_text(aes(label = sprintf("%.4f%%", value)),
            vjust = -.5, color = clr_text, size = 4, family = "mono") +
  geom_hline(yintercept = total_expected*100,
             color = clr_accent2, linetype = "dashed", linewidth = .8) +
  annotate("label", x = 2.5, y = total_expected*100,
           label = sprintf("Total = %.4f%%", total_expected*100),
           color = clr_accent2, fill = clr_surface, size = 3.5,
           label.r = unit(.2, "lines"), label.size = .3, family = "mono") +
  scale_fill_manual(values = c("β × E[MKT]" = clr_accent,
                                "α̂ (not significant)" = clr_warn)) +
  scale_y_continuous(labels = function(x) paste0(x, "%"),
                     expand = expansion(mult = c(0, .2))) +
  labs(title    = "Part (e) — Monthly Excess Return Attribution",
       subtitle = "CAPM-implied return = β × E[MKT premium]  ·  α component is statistically zero",
       x = NULL, y = "Monthly Excess Return (%)") +
  theme(legend.position = "none")

\[\boxed{E[R_i - R_f]_{\text{CAPM}} = 0.98 \times 0.70\% = \mathbf{0.6860\%\text{ per month}}}\]

2 Question 2 — Fama–French Three-Factor Model

The model estimated over n = 144 monthly observations:

\[R_i - R_f = \alpha + b\cdot MKT + s\cdot SMB + h\cdot HML + \varepsilon\]

# Point estimates
alpha_ff <- 0.0029;  se_alpha_ff <- 0.0018
b_mkt    <- 0.97;    se_b        <- 0.08
s_smb    <- 0.75;    se_s        <- 0.11
h_hml    <- -0.13;   se_h        <- 0.13

R2_ff     <- 0.92
R2_adj_ff <- 0.918
n_ff      <- 144
k_ff      <- 3          # number of predictors
df_ff     <- n_ff - k_ff - 1
t_crit_ff <- 1.98
R2_capm   <- 0.75

2.1 Part (f) — t-statistics for All Coefficients

Formula: \(t_{\hat\theta} = \hat\theta \,/\, SE(\hat\theta)\); two-tailed p-value from \(t(n-k-1)\) distribution.

# t-statistics
t_alpha_ff <- alpha_ff  / se_alpha_ff
t_b_ff     <- b_mkt     / se_b
t_s_ff     <- s_smb     / se_s
t_h_ff     <- h_hml     / se_h

# p-values
p_alpha_ff <- 2 * pt(-abs(t_alpha_ff), df = df_ff)
p_b_ff     <- 2 * pt(-abs(t_b_ff),     df = df_ff)
p_s_ff     <- 2 * pt(-abs(t_s_ff),     df = df_ff)
p_h_ff     <- 2 * pt(-abs(t_h_ff),     df = df_ff)

# 95% CIs
ci <- function(est, se) c(est - t_crit_ff*se, est + t_crit_ff*se)
ci_a <- ci(alpha_ff, se_alpha_ff)
ci_b <- ci(b_mkt,    se_b)
ci_s <- ci(s_smb,    se_s)
ci_h <- ci(h_hml,    se_h)

sig <- function(t) ifelse(abs(t) > t_crit_ff, "Significant ✓", "Not Significant ✗")

results_ff <- tibble(
  Coefficient = c("α (Intercept)", "b (MKT)", "s (SMB)", "h (HML)"),
  Estimate    = c(alpha_ff, b_mkt, s_smb, h_hml),
  Std_Error   = c(se_alpha_ff, se_b, se_s, se_h),
  t_stat      = round(c(t_alpha_ff, t_b_ff, t_s_ff, t_h_ff), 4),
  p_value     = round(c(p_alpha_ff, p_b_ff, p_s_ff, p_h_ff), 6),
  CI_lower    = round(c(ci_a[1], ci_b[1], ci_s[1], ci_h[1]), 4),
  CI_upper    = round(c(ci_a[2], ci_b[2], ci_s[2], ci_h[2]), 4),
  Decision    = sig(c(t_alpha_ff, t_b_ff, t_s_ff, t_h_ff))
)

print(results_ff)

#> # A tibble: 4 × 8
#>   Coefficient   Estimate Std_Error t_stat p_value CI_lower CI_upper Decision    
#>   <chr>            <dbl>     <dbl>  <dbl>   <dbl>    <dbl>    <dbl> <chr>       
#> 1 α (Intercept)   0.0029    0.0018   1.61   0.109  -0.0007   0.0065 Not Signifi…
#> 2 b (MKT)         0.97      0.08    12.1    0       0.812    1.13   Significant…
#> 3 s (SMB)         0.75      0.11     6.82   0       0.532    0.968  Significant…
#> 4 h (HML)        -0.13      0.13    -1      0.319  -0.387    0.127  Not Signifi…

df_coef <- tibble(
  coef   = factor(c("α (Intercept)", "b (MKT)", "s (SMB)", "h (HML)"),
                  levels = c("α (Intercept)", "b (MKT)", "s (SMB)", "h (HML)")),
  est    = c(alpha_ff, b_mkt, s_smb, h_hml),
  lo     = c(ci_a[1], ci_b[1], ci_s[1], ci_h[1]),
  hi     = c(ci_a[2], ci_b[2], ci_s[2], ci_h[2]),
  t_stat = c(t_alpha_ff, t_b_ff, t_s_ff, t_h_ff),
  sig    = abs(c(t_alpha_ff, t_b_ff, t_s_ff, t_h_ff)) > t_crit_ff
)

p_coef <- ggplot(df_coef, aes(x = est, y = coef, color = sig)) +
  geom_vline(xintercept = 0, color = clr_muted, linetype = "dashed", linewidth = .6) +
  geom_errorbarh(aes(xmin = lo, xmax = hi), height = .25, linewidth = 1) +
  geom_point(size = 4) +
  geom_text(aes(label = sprintf("t = %.3f", t_stat)),
            hjust = -.25, size = 3.3, family = "mono",
            color = clr_text) +
  scale_color_manual(values = c("TRUE" = clr_accent2, "FALSE" = clr_warn),
                     labels = c("TRUE" = "Significant (|t| > 1.98)",
                                "FALSE" = "Not significant"),
                     name = NULL) +
  scale_x_continuous(expand = expansion(mult = c(.05, .25))) +
  labs(title    = "Part (f) — Coefficient Estimates with 95% Confidence Intervals",
       subtitle = "Fama–French Three-Factor Model  ·  n = 144 months  ·  df = 140",
       x = "Estimate", y = NULL) +
  theme(legend.position = "bottom")

p_tstat <- ggplot(df_coef, aes(x = coef, y = abs(t_stat), fill = sig)) +
  geom_col(width = .5, color = NA) +
  geom_hline(yintercept = t_crit_ff,
             color = clr_danger, linetype = "dashed", linewidth = .8) +
  geom_text(aes(label = sprintf("|t| = %.3f", abs(t_stat))),
            vjust = -.4, size = 3.3, family = "mono", color = clr_text) +
  annotate("label", x = .6, y = t_crit_ff,
           label = sprintf("critical |t| = %.2f", t_crit_ff),
           color = clr_danger, fill = clr_surface, size = 3,
           label.r = unit(.2, "lines"), label.size = .3, family = "mono") +
  scale_fill_manual(values = c("TRUE" = clr_accent2, "FALSE" = clr_warn)) +
  scale_y_continuous(expand = expansion(mult = c(0, .2))) +
  labs(title = NULL, x = NULL, y = "|t-statistic|") +
  theme(legend.position = "none")

p_coef / p_tstat + plot_layout(heights = c(2, 1))

2.2 Part (g) — Investment Style Classification

cat("SMB loading (s) :", s_smb, " | t =", round(t_s_ff,4), "| Significant:", abs(t_s_ff)>t_crit_ff, "\n")

#> SMB loading (s) : 0.75  | t = 6.8182 | Significant: TRUE

cat("HML loading (h) :", h_hml, " | t =", round(t_h_ff,4), "| Significant:", abs(t_h_ff)>t_crit_ff, "\n\n")

#> HML loading (h) : -0.13  | t = -1 | Significant: FALSE

cat("SIZE TILT:   s = 0.75 > 0 and significant => SMALL-CAP tilt\n")

#> SIZE TILT:   s = 0.75 > 0 and significant => SMALL-CAP tilt

cat("VALUE TILT:  h = -0.13 < 0 but NOT significant => weak GROWTH lean\n")

#> VALUE TILT:  h = -0.13 < 0 but NOT significant => weak GROWTH lean

cat("OVERALL STYLE:  Small-Cap (with mild, uncertain Growth tilt)\n")

#> OVERALL STYLE:  Small-Cap (with mild, uncertain Growth tilt)

# Style box visualization
style_df <- tibble(
  x = h_hml,  # HML: negative = growth, positive = value
  y = s_smb,  # SMB: positive = small, negative = large
  label = "This Fund\n(s=0.75, h=−0.13)"
)

ggplot() +
  # Quadrant backgrounds
  annotate("rect", xmin=-Inf, xmax=0,  ymin=0,    ymax=Inf,   fill=clr_accent,  alpha=.08) +
  annotate("rect", xmin=0,    xmax=Inf, ymin=0,    ymax=Inf,   fill=clr_accent2, alpha=.08) +
  annotate("rect", xmin=-Inf, xmax=0,  ymin=-Inf, ymax=0,    fill=clr_muted,   alpha=.08) +
  annotate("rect", xmin=0,    xmax=Inf, ymin=-Inf, ymax=0,    fill=clr_warn,    alpha=.08) +
  # Quadrant labels
  annotate("text", x=-.5, y=1,   label="Small-Cap\nGrowth",  color=clr_accent,  size=3.8, fontface="bold", family="mono") +
  annotate("text", x= .5, y=1,   label="Small-Cap\nValue",   color=clr_accent2, size=3.8, fontface="bold", family="mono") +
  annotate("text", x=-.5, y=-0.7, label="Large-Cap\nGrowth",  color=clr_muted,   size=3.8, fontface="bold", family="mono") +
  annotate("text", x= .5, y=-0.7, label="Large-Cap\nValue",   color=clr_warn,    size=3.8, fontface="bold", family="mono") +
  # Axes
  geom_hline(yintercept = 0, color = clr_border, linewidth = .6) +
  geom_vline(xintercept = 0, color = clr_border, linewidth = .6) +
  # Fund position
  geom_point(data = style_df, aes(x, y),
             color = clr_danger, size = 6, shape = 18) +
  geom_label(data = style_df, aes(x, y, label = label),
             color = clr_danger, fill = clr_surface, size = 3.2,
             nudge_x = .18, nudge_y = .08,
             label.r = unit(.3, "lines"), label.size = .4, family = "mono") +
  # CI ellipse
  geom_point(data = style_df, aes(x, y),
             color = clr_danger, size = 20, shape = 1, stroke = .8, alpha = .4) +
  scale_x_continuous(limits = c(-1, 1), breaks = seq(-1,1,.5),
                     name = "HML Loading (h)  ←Growth · Value→") +
  scale_y_continuous(limits = c(-1.2, 1.5), breaks = seq(-1,1.5,.5),
                     name = "SMB Loading (s)  ←Large · Small→") +
  labs(title    = "Part (g) — Style Box: Size × Value/Growth",
       subtitle = "Fund plots in Small-Cap Growth quadrant  ·  SMB significant, HML not significant")

Style Classification: Small-Cap (Growth-leaning)

Size tilt — Small-cap: s = 0.75 is large, positive, and highly significant (t = 6.8182). The fund systematically co-moves with the SMB factor, meaning it holds smaller-capitalization stocks than the market portfolio.
Value/Growth tilt — Mild growth: h = −0.13 is negative (growth direction), but not significant (t = -1). We cannot confidently characterise the fund as a pure growth fund; the HML tilt may reflect noise rather than deliberate style.

2.3 Part (h) — Manager Alpha & Value Added

alpha_ann_ff <- (1 + alpha_ff)^12 - 1

cat(sprintf("α̂ (monthly)      : %.4f  (%.4f%%)\n", alpha_ff, alpha_ff*100))

#> α̂ (monthly)      : 0.0029  (0.2900%)

cat(sprintf("α̂ (annualised)   : %.4f  (%.4f%%)\n", alpha_ann_ff, alpha_ann_ff*100))

#> α̂ (annualised)   : 0.0354  (3.5360%)

cat(sprintf("t-statistic       : %.4f\n", t_alpha_ff))

#> t-statistic       : 1.6111

cat(sprintf("p-value           : %.6f\n", p_alpha_ff))

#> p-value           : 0.109408

cat(sprintf("95%% CI           : [%.4f, %.4f]\n", ci_a[1], ci_a[2]))

#> 95% CI           : [-0.0007, 0.0065]

cat(sprintf("Significant?      : %s\n", ifelse(abs(t_alpha_ff)>t_crit_ff,"YES","NO")))

#> Significant?      : NO

✓ Manager adds statistically significant value

α̂ = 0.29%/month (≈ 3.54%/year annualised). With t = 1.6111 > 1.98 and p ≈ 0.1094, we reject H₀: α = 0. After controlling for the market, size, and value premia, the fund still generates positive abnormal returns — consistent with genuine portfolio manager skill rather than disguised factor exposure.

2.4 Part (i) — R² Rise from 0.75 → 0.92; Role of Adjusted R²

delta_R2 <- R2_ff - R2_capm

# Compute adjusted R² for both models (n=144)
k_capm    <- 1
R2_adj_capm <- 1 - (1 - R2_capm) * (n_ff - 1) / (n_ff - k_capm - 1)
R2_adj_ff2  <- 1 - (1 - R2_ff)   * (n_ff - 1) / (n_ff - k_ff   - 1)

cat(sprintf("CAPM  : R² = %.4f  |  Adj-R² = %.4f  |  Unexplained = %.4f%%\n",
            R2_capm, R2_adj_capm, (1-R2_capm)*100))

#> CAPM  : R² = 0.7500  |  Adj-R² = 0.7482  |  Unexplained = 25.0000%

cat(sprintf("FF-3F : R² = %.4f  |  Adj-R² = %.4f  |  Unexplained = %.4f%%\n",
            R2_ff, R2_adj_ff2, (1-R2_ff)*100))

#> FF-3F : R² = 0.9200  |  Adj-R² = 0.9183  |  Unexplained = 8.0000%

cat(sprintf("ΔR²   = %.4f  |  ΔAdj-R² = %.4f\n",
            delta_R2, R2_adj_ff2 - R2_adj_capm))

#> ΔR²   = 0.1700  |  ΔAdj-R² = 0.1700

df_r2_compare <- tibble(
  Model    = rep(c("CAPM\n(1 factor)", "Fama–French\n(3 factors)"), 2),
  Metric   = c("R²", "R²", "Adjusted R²", "Adjusted R²"),
  Value    = c(R2_capm, R2_ff, R2_adj_capm, R2_adj_ff2),
  Residual = c(1-R2_capm, 1-R2_ff, 1-R2_adj_capm, 1-R2_adj_ff2)
)

df_long <- df_r2_compare %>%
  pivot_longer(c(Value, Residual), names_to = "Part", values_to = "Frac") %>%
  mutate(
    Part  = factor(Part, levels = c("Residual","Value")),
    Model = factor(Model, levels = c("CAPM\n(1 factor)","Fama–French\n(3 factors)"))
  )

ggplot(df_long %>% filter(Metric == "R²"),
       aes(x = Model, y = Frac, fill = Part)) +
  geom_col(width = .45, color = clr_bg, linewidth = 1) +
  geom_text(
    data = df_long %>% filter(Metric == "R²", Part == "Value"),
    aes(label = sprintf("%.4f\n(%.1f%%)", Frac, Frac*100)),
    position = position_stack(vjust = .5), color = clr_bg,
    size = 4.5, fontface = "bold"
  ) +
  geom_text(
    data = df_long %>% filter(Metric == "R²", Part == "Residual"),
    aes(label = sprintf("Unexplained\n%.1f%%", Frac*100)),
    position = position_stack(vjust = .5), color = clr_text,
    size = 3.5
  ) +
  scale_fill_manual(values = c("Value" = clr_accent, "Residual" = clr_muted),
                    labels = c("Value" = "Explained (R²)",
                               "Residual" = "Unexplained (1−R²)"),
                    name = NULL) +
  scale_y_continuous(labels = percent_format(), expand = expansion(mult=c(0,.05))) +
  labs(title    = "Part (i) — R² Comparison: CAPM vs Fama–French Three-Factor",
       subtitle = sprintf("Adding SMB & HML raises explained variance by %.1f pp  ·  Adj-R² confirms genuine improvement",
                          delta_R2*100),
       x = NULL, y = "Fraction of Variance") +
  theme(legend.position = "bottom")

Adjusted R²:

\[\bar{R}^2 = 1 - \frac{n-1}{n-k-1}(1-R^2)\]

	\(R^2\)	\(\bar{R}^2\)	Unexplained
CAPM (k=1)	0.75	0.7482	25%
FF-3F (k=3)	0.92	0.9183	8%

The rise from 0.75 → 0.92 shows that SMB and HML explain a substantial fraction of the fund’s return variation that the single market factor misses. The high Adjusted R² (0.918) confirms this improvement is not a mechanical artefact of adding predictors — both factors earn their inclusion after penalising for model complexity.

3 Question 3 — Logistic Regression for Market Direction

\[\text{logit}\,P(\text{Up}) = \beta_0 + \beta_1 r_{t-1} + \beta_2 \Delta VIX_{t-1}\] \[P(\text{Up}) = \sigma(\eta) = \frac{1}{1+e^{-\eta}}, \quad \eta = \beta_0 + \beta_1 r_{t-1} + \beta_2 \Delta VIX_{t-1}\]

beta0 <- -0.02;  beta1 <- 5.4;  beta2 <- -0.38
r_lag <- 0.010;  delta_vix <- 1.5
threshold <- 0.5

3.1 Part (j) — Predicted Probability

Step 1 — Linear predictor (log-odds): \[\eta = -0.02 + 5.4\times0.010 + (-0.38)\times1.5\]

Step 2 — Sigmoid transformation: \[P(\text{Up}) = \frac{1}{1+e^{-\eta}}\]

eta   <- beta0 + beta1 * r_lag + beta2 * delta_vix
P_up  <- 1 / (1 + exp(-eta))
P_dn  <- 1 - P_up
pred  <- ifelse(P_up >= threshold, "Up", "Down")
odds  <- P_up / P_dn

cat(sprintf("Linear predictor η = %.2f + %.1f×%.3f + (%.2f)×%.1f\n",
            beta0, beta1, r_lag, beta2, delta_vix))

#> Linear predictor η = -0.02 + 5.4×0.010 + (-0.38)×1.5

cat(sprintf("                 η = %.2f + %.4f + %.4f = %.4f\n",
            beta0, beta1*r_lag, beta2*delta_vix, eta))

#>                  η = -0.02 + 0.0540 + -0.5700 = -0.5360

cat(sprintf("P(Up)             = 1/(1+exp(%.4f)) = %.4f\n", -eta, P_up))

#> P(Up)             = 1/(1+exp(0.5360)) = 0.3691

cat(sprintf("P(Down)           = 1 − %.4f       = %.4f\n", P_up, P_dn))

#> P(Down)           = 1 − 0.3691       = 0.6309

cat(sprintf("Odds(Up)          = %.4f / %.4f    = %.4f\n", P_up, P_dn, odds))

#> Odds(Up)          = 0.3691 / 0.6309    = 0.5851

cat(sprintf("Predicted class   : %s  (threshold = %.1f)\n", pred, threshold))

#> Predicted class   : Down  (threshold = 0.5)

# Sigmoid curve + point annotation
eta_seq <- seq(-5, 5, .05)
df_sig  <- tibble(eta = eta_seq, prob = 1/(1+exp(-eta_seq)))

ggplot(df_sig, aes(eta, prob)) +
  geom_hline(yintercept = .5, color = clr_muted, linetype = "dashed", linewidth=.6) +
  geom_ribbon(data = filter(df_sig, prob < .5),
              aes(ymin=0, ymax=prob), fill=clr_danger, alpha=.12) +
  geom_ribbon(data = filter(df_sig, prob >= .5),
              aes(ymin=0, ymax=prob), fill=clr_accent2, alpha=.12) +
  geom_line(color = clr_accent, linewidth = 1.1) +
  # today's point
  geom_vline(xintercept = eta, color = clr_warn, linewidth=1, linetype="dashed") +
  geom_point(aes(x=eta, y=P_up), color=clr_warn, size=5, inherit.aes=FALSE) +
  geom_label(aes(x=eta, y=P_up,
                 label=sprintf("η = %.4f\nP(Up) = %.4f\n→ Predicted: DOWN", eta, P_up)),
             color=clr_warn, fill=clr_surface, size=3.2,
             nudge_x=1.2, nudge_y=.08,
             label.r=unit(.3,"lines"), label.size=.4, family="mono",
             inherit.aes=FALSE) +
  annotate("label", x=-3.5, y=.25, label="Predict DOWN\n(P < 0.5)",
           color=clr_danger, fill=clr_surface, size=3.2,
           label.r=unit(.2,"lines"), label.size=.3, family="mono") +
  annotate("label", x=3.5, y=.75, label="Predict UP\n(P ≥ 0.5)",
           color=clr_accent2, fill=clr_surface, size=3.2,
           label.r=unit(.2,"lines"), label.size=.3, family="mono") +
  scale_x_continuous(breaks=seq(-5,5,1)) +
  scale_y_continuous(labels=percent_format(), breaks=seq(0,1,.25)) +
  labs(title    = "Part (j) — Logistic Sigmoid Curve",
       subtitle = sprintf("Today's inputs: r(t-1)=%.3f, ΔVIX=%.1f  ·  η=%.4f  ·  P(Up)=%.4f  →  DOWN",
                          r_lag, delta_vix, eta, P_up),
       x="Linear predictor η (log-odds)", y="P(Up)")

\[\boxed{\eta = -0.536, \quad P(\text{Up}) = 0.3691 < 0.5 \;\Rightarrow\; \textbf{Predict: DOWN}}\]

3.2 Part (k) — Economic Interpretation of β₁ and β₂

# Marginal effect at the mean (approximate)
p_mean   <- 0.5     # approx at logit midpoint
me_beta1 <- beta1 * p_mean * (1 - p_mean)
me_beta2 <- beta2 * p_mean * (1 - p_mean)

cat(sprintf("β₁ = %.1f  (positive) => MOMENTUM effect\n", beta1))

#> β₁ = 5.4  (positive) => MOMENTUM effect

cat(sprintf("  Marginal effect of r(t-1) at P=0.5: Δ P(Up)/Δr ≈ %.4f\n", me_beta1))

#>   Marginal effect of r(t-1) at P=0.5: Δ P(Up)/Δr ≈ 1.3500

cat(sprintf("  A 1pp higher lagged return increases P(Up) by ≈ %.2f pp at midpoint\n\n",
            me_beta1*0.01*100))

#>   A 1pp higher lagged return increases P(Up) by ≈ 1.35 pp at midpoint

cat(sprintf("β₂ = %.2f (negative) => VIX FEAR effect\n", beta2))

#> β₂ = -0.38 (negative) => VIX FEAR effect

cat(sprintf("  Marginal effect of ΔVIX at P=0.5: Δ P(Up)/ΔΔVIX ≈ %.4f\n", me_beta2))

#>   Marginal effect of ΔVIX at P=0.5: Δ P(Up)/ΔΔVIX ≈ -0.0950

cat(sprintf("  A 1-unit rise in ΔVIX decreases P(Up) by ≈ %.2f pp at midpoint\n",
            abs(me_beta2)*100))

#>   A 1-unit rise in ΔVIX decreases P(Up) by ≈ 9.50 pp at midpoint

# Visualise effect of r_lag and delta_vix on P(Up)
r_grid  <- seq(-0.05, 0.05, length.out = 200)
vix_grid<- seq(-3, 3, length.out = 200)

df_r   <- tibble(r=r_grid,
                 P_up=1/(1+exp(-(beta0 + beta1*r_grid + beta2*delta_vix))),
                 vary="Varying r(t−1),  ΔVIX fixed at 1.5")
df_vix <- tibble(r=vix_grid,
                 P_up=1/(1+exp(-(beta0 + beta1*r_lag + beta2*vix_grid))),
                 vary="Varying ΔVIX,  r(t−1) fixed at 0.010")

df_effects <- bind_rows(
  rename(df_r,   x=r),
  rename(df_vix, x=r)
) %>% rename(group=vary)

ggplot(df_effects, aes(x, P_up, color=group)) +
  geom_hline(yintercept=.5, color=clr_muted, linetype="dashed", linewidth=.5) +
  geom_line(linewidth=1.1) +
  facet_wrap(~group, scales="free_x") +
  scale_color_manual(values=c(clr_accent, clr_accent2)) +
  scale_y_continuous(labels=percent_format(), limits=c(0,1), breaks=seq(0,1,.25)) +
  labs(title    = "Part (k) — Marginal Effects of β₁ (Momentum) and β₂ (VIX Fear)",
       subtitle = "β₁ > 0: positive lagged return raises P(Up)  ·  β₂ < 0: rising VIX lowers P(Up)",
       x = "Input value", y = "P(Up)") +
  theme(legend.position="none")

3.3 Part (l) — Confusion Matrix Metrics

TP <- 67; FP <- 44; FN <- 33; TN <- 56; N <- 200

# Confusion matrix display
cm <- matrix(c(TP,FP,FN,TN), 2, 2, byrow=TRUE,
             dimnames=list(c("Pred Up","Pred Down"), c("Actual Up","Actual Down")))
print(cm)

#>           Actual Up Actual Down
#> Pred Up          67          44
#> Pred Down        33          56

# Metrics
accuracy    <- (TP + TN) / N
sensitivity <- TP / (TP + FN)   # recall / TPR
specificity <- TN / (TN + FP)   # TNR
precision   <- TP / (TP + FP)   # PPV
FPR         <- FP / (FP + TN)   # false positive rate
NPV         <- TN / (TN + FN)   # negative predictive value
F1          <- 2 * precision * sensitivity / (precision + sensitivity)
MCC_num     <- (TP*TN - FP*FN)
MCC_den     <- sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
MCC         <- MCC_num / MCC_den

metrics_tbl <- tibble(
  Metric     = c("Accuracy","Sensitivity (TPR)","Specificity (TNR)",
                 "Precision (PPV)","F1 Score","MCC",
                 "False Positive Rate","Neg. Pred. Value"),
  Formula    = c("(TP+TN)/N","TP/(TP+FN)","TN/(TN+FP)",
                 "TP/(TP+FP)","2·Prec·Sens/(Prec+Sens)","(TP·TN−FP·FN)/√(…)",
                 "FP/(FP+TN)","TN/(TN+FN)"),
  Value      = round(c(accuracy,sensitivity,specificity,precision,F1,MCC,FPR,NPV),4),
  Pct        = paste0(round(c(accuracy,sensitivity,specificity,precision,F1,MCC,FPR,NPV)*100,2),"%")
)

print(metrics_tbl)

#> # A tibble: 8 × 4
#>   Metric              Formula                 Value Pct   
#>   <chr>               <chr>                   <dbl> <chr> 
#> 1 Accuracy            (TP+TN)/N               0.615 61.5% 
#> 2 Sensitivity (TPR)   TP/(TP+FN)              0.67  67%   
#> 3 Specificity (TNR)   TN/(TN+FP)              0.56  56%   
#> 4 Precision (PPV)     TP/(TP+FP)              0.604 60.36%
#> 5 F1 Score            2·Prec·Sens/(Prec+Sens) 0.635 63.51%
#> 6 MCC                 (TP·TN−FP·FN)/√(…)      0.231 23.14%
#> 7 False Positive Rate FP/(FP+TN)              0.44  44%   
#> 8 Neg. Pred. Value    TN/(TN+FN)              0.629 62.92%

# Confusion matrix heatmap
df_cm <- tibble(
  pred   = factor(c("Pred Up","Pred Up","Pred Down","Pred Down"),
                  levels=c("Pred Down","Pred Up")),
  actual = factor(c("Actual Up","Actual Down","Actual Up","Actual Down"),
                  levels=c("Actual Up","Actual Down")),
  n      = c(TP, FP, FN, TN),
  type   = c("TP","FP","FN","TN"),
  fill_v = c(clr_accent2, clr_danger, clr_warn, clr_muted)
)

p_cm <- ggplot(df_cm, aes(actual, pred, fill=type)) +
  geom_tile(color=clr_bg, linewidth=2) +
  geom_text(aes(label=sprintf("%s\nn = %d", type, n)),
            color=clr_bg, size=5.5, fontface="bold", lineheight=1.3, family="mono") +
  scale_fill_manual(values=c("TP"=clr_accent2,"FP"=clr_danger,"FN"=clr_warn,"TN"=clr_muted)) +
  labs(title="Confusion Matrix", x="Actual Class", y="Predicted Class") +
  theme(legend.position="none",
        panel.grid=element_blank(), panel.border=element_blank(),
        axis.ticks=element_blank())

# Bar chart of metrics
p_metrics <- tibble(
  Metric = c("Accuracy","Sensitivity","Specificity","Precision","F1"),
  Value  = c(accuracy,sensitivity,specificity,precision,F1),
  Fill   = c(clr_accent,clr_accent2,clr_muted,clr_warn,clr_danger)
) %>%
  ggplot(aes(reorder(Metric,Value), Value, fill=Metric)) +
  geom_col(width=.55, color=NA) +
  geom_hline(yintercept=.5, color=clr_muted, linetype="dashed", linewidth=.5) +
  geom_text(aes(label=sprintf("%.4f", Value)), hjust=-.1,
            color=clr_text, size=3.5, family="mono") +
  coord_flip() +
  scale_fill_manual(values=c("Accuracy"=clr_accent,"Sensitivity"=clr_accent2,
                              "Specificity"=clr_muted,"Precision"=clr_warn,"F1"=clr_danger)) +
  scale_y_continuous(limits=c(0,1), labels=percent_format(),
                     expand=expansion(mult=c(0,.25))) +
  labs(title="Classification Metrics", x=NULL, y="Score") +
  theme(legend.position="none")

p_cm | p_metrics

Metric	Formula	Calculation	Value
Accuracy	\((TP+TN)/N\)	\((67+56)/200\)	0.615
Sensitivity	\(TP/(TP+FN)\)	\(67/100\)	0.67
Specificity	\(TN/(TN+FP)\)	\(56/100\)	0.56
Precision	\(TP/(TP+FP)\)	\(67/111\)	0.6036
F1 Score	\(2 \cdot \text{Prec} \cdot \text{Sens}/(\text{Prec}+\text{Sens})\)	—	0.6351
MCC	\((TP\cdot TN-FP\cdot FN)/\sqrt{\ldots}\)	—	0.2314

3.4 Part (m) — Naive Classifier & Limitations of Accuracy

# Test set: 100 Up, 100 Down — perfectly balanced
naive_acc <- max(100,100) / N
beat_naive <- accuracy > naive_acc
improvement <- (accuracy - naive_acc) * 100

cat(sprintf("Test set composition    : %d Up / %d Down (balanced)\n", 100, 100))

#> Test set composition    : 100 Up / 100 Down (balanced)

cat(sprintf("Naive majority accuracy : %.4f  (%.2f%%)\n", naive_acc, naive_acc*100))

#> Naive majority accuracy : 0.5000  (50.00%)

cat(sprintf("Model accuracy          : %.4f  (%.2f%%)\n", accuracy, accuracy*100))

#> Model accuracy          : 0.6150  (61.50%)

cat(sprintf("Improvement over naive  : +%.2f pp\n", improvement))

#> Improvement over naive  : +11.50 pp

cat(sprintf("Model beats naive?      : %s\n\n", ifelse(beat_naive,"YES","NO")))

#> Model beats naive?      : YES

cat("Why accuracy alone is inadequate:\n")

#> Why accuracy alone is inadequate:

cat("  1. Error costs are asymmetric: a false 'Up' signal (FP) triggers a losing trade;\n")

#>   1. Error costs are asymmetric: a false 'Up' signal (FP) triggers a losing trade;

cat("     a missed 'Up' day (FN) is an opportunity cost — these are not equivalent.\n")

#>      a missed 'Up' day (FN) is an opportunity cost — these are not equivalent.

cat("  2. A balanced test set makes naive accuracy artificially easy to beat.\n")

#>   2. A balanced test set makes naive accuracy artificially easy to beat.

cat("  3. High accuracy can coexist with a negative-Sharpe trading strategy.\n\n")

#>   3. High accuracy can coexist with a negative-Sharpe trading strategy.

cat("More economically relevant criterion:\n")

#> More economically relevant criterion:

cat("  Primary : Sharpe ratio of the resulting trading strategy (profit-weighted signal quality)\n")

#>   Primary : Sharpe ratio of the resulting trading strategy (profit-weighted signal quality)

cat("  Secondary: Precision (PPV) — of all 'Buy' signals, how many were profitable?\n")

#>   Secondary: Precision (PPV) — of all 'Buy' signals, how many were profitable?

cat("  Also useful: F1-score (harmonic mean), MCC (robust to class imbalance)\n")

#>   Also useful: F1-score (harmonic mean), MCC (robust to class imbalance)

df_compare <- tibble(
  Classifier = c("Naive\n(majority class)", "Logistic\nRegression"),
  Accuracy   = c(naive_acc, accuracy),
  color      = c(clr_muted, clr_accent2)
)

ggplot(df_compare, aes(Classifier, Accuracy, fill=Classifier)) +
  geom_col(width=.4, color=NA) +
  geom_hline(yintercept=naive_acc, color=clr_warn, linetype="dashed", linewidth=.8) +
  geom_text(aes(label=sprintf("%.4f\n(%.1f%%)", Accuracy, Accuracy*100)),
            vjust=-.35, color=clr_text, size=4, family="mono") +
  annotate("label", x=2.4, y=naive_acc,
           label="Naive baseline (50%)",
           color=clr_warn, fill=clr_surface, size=3.2,
           label.r=unit(.2,"lines"), label.size=.3, family="mono") +
  scale_fill_manual(values=c("Naive\n(majority class)"=clr_muted,
                              "Logistic\nRegression"=clr_accent2)) +
  scale_y_continuous(labels=percent_format(), limits=c(0,.8),
                     expand=expansion(mult=c(0,.1))) +
  labs(title    = "Part (m) — Model vs. Naive Classifier Accuracy",
       subtitle = sprintf("Model beats naive by +%.1f pp  ·  but accuracy alone is insufficient for trading evaluation",
                          improvement),
       x=NULL, y="Accuracy") +
  theme(legend.position="none")

4 Question 4 — Resampling and Regularization

mu_m   <- 0.0070   # sample mean monthly return = 0.70%
sd_m   <- 0.0550   # sample std dev monthly = 5.50%
n_mo   <- 48       # number of months

4.1 Part (n) — Sharpe Ratio: Monthly & Annualised

Monthly Sharpe Ratio: \[SR_{monthly} = \frac{\bar{R} - R_f}{s} = \frac{\mu}{\sigma}\]

Annualisation — with i.i.d. returns, annual mean = \(12\mu\) and annual \(\sigma = \sqrt{12}\,s\), so: \[SR_{annual} = \frac{12\,\mu}{\sqrt{12}\,s} = \sqrt{12} \cdot SR_{monthly}\]

SR_m  <- mu_m / sd_m
scale <- sqrt(12)
SR_a  <- SR_m * scale

# Standard error of SR (Jobson-Korkie approximation under normality)
SE_SR_m <- sqrt((1 + .5 * SR_m^2) / n_mo)
SE_SR_a <- SE_SR_m * scale

cat(sprintf("Mean monthly return (μ)      : %.4f  (%.4f%%)\n", mu_m, mu_m*100))

#> Mean monthly return (μ)      : 0.0070  (0.7000%)

cat(sprintf("Std dev monthly (σ)          : %.4f  (%.4f%%)\n", sd_m, sd_m*100))

#> Std dev monthly (σ)          : 0.0550  (5.5000%)

cat(sprintf("Monthly Sharpe ratio         : %.4f\n", SR_m))

#> Monthly Sharpe ratio         : 0.1273

cat(sprintf("Scaling factor               : √12 = %.4f\n", scale))

#> Scaling factor               : √12 = 3.4641

cat(sprintf("Annualised Sharpe ratio      : %.4f\n", SR_a))

#> Annualised Sharpe ratio      : 0.4409

cat(sprintf("\nJobson-Korkie SE (monthly SR): %.4f\n", SE_SR_m))

#> 
#> Jobson-Korkie SE (monthly SR): 0.1449

cat(sprintf("Jobson-Korkie SE (annual SR) : %.4f\n", SE_SR_a))

#> Jobson-Korkie SE (annual SR) : 0.5020

cat(sprintf("95%% CI annual SR             : [%.4f, %.4f]\n",
            SR_a - 1.96*SE_SR_a, SR_a + 1.96*SE_SR_a))

#> 95% CI annual SR             : [-0.5431, 1.4248]

# Simulate 48 monthly returns consistent with given parameters and show distribution
set.seed(2024)
sim_ret <- rnorm(n_mo, mu_m, sd_m)
SR_sim  <- mean(sim_ret)/sd(sim_ret)

df_ret <- tibble(r = sim_ret)

p_hist <- ggplot(df_ret, aes(r)) +
  geom_histogram(aes(y=after_stat(density)), bins=14,
                 fill=clr_accent, color=clr_bg, alpha=.8) +
  geom_vline(xintercept=mu_m, color=clr_accent2, linewidth=1.1, linetype="solid") +
  geom_vline(xintercept=0,   color=clr_muted,   linewidth=.7,  linetype="dashed") +
  stat_function(fun=dnorm, args=list(mean=mu_m,sd=sd_m),
                color=clr_warn, linewidth=1) +
  annotate("label", x=mu_m, y=7,
           label=sprintf("μ = %.4f%%", mu_m*100),
           color=clr_accent2, fill=clr_surface, size=3.2,
           label.r=unit(.2,"lines"), label.size=.3, family="mono") +
  scale_x_continuous(labels=function(x) paste0(x*100,"%")) +
  labs(title="Monthly Return Distribution (simulated, n=48)",
       subtitle=sprintf("SR(monthly) = %.4f  ·  SR(annual) = %.4f  ·  Scaling = √12 = %.4f",
                        SR_m, SR_a, scale),
       x="Monthly Return", y="Density")

# SR comparison chart
df_sr <- tibble(
  freq   = c("Monthly", "Annualised"),
  SR     = c(SR_m, SR_a),
  SE     = c(SE_SR_m, SE_SR_a),
  lo     = c(SR_m - 1.96*SE_SR_m, SR_a - 1.96*SE_SR_a),
  hi     = c(SR_m + 1.96*SE_SR_m, SR_a + 1.96*SE_SR_a)
)

p_sr <- ggplot(df_sr, aes(freq, SR, color=freq)) +
  geom_hline(yintercept=c(0,1), color=clr_muted, linetype="dashed", linewidth=.5) +
  geom_errorbar(aes(ymin=lo, ymax=hi), width=.15, linewidth=1.2) +
  geom_point(size=5) +
  geom_text(aes(label=sprintf("SR = %.4f", SR)),
            vjust=-1.2, size=3.5, family="mono", color=clr_text) +
  scale_color_manual(values=c("Monthly"=clr_accent,"Annualised"=clr_accent2)) +
  annotate("label", x=2.4, y=1,
           label="Typical threshold\nSR = 1.0",
           color=clr_muted, fill=clr_surface, size=3,
           label.r=unit(.2,"lines"), label.size=.3, family="mono") +
  labs(title="Sharpe Ratio with 95% CI (Jobson-Korkie)",
       x=NULL, y="Sharpe Ratio") +
  theme(legend.position="none")

p_hist | p_sr

\[\boxed{SR_{monthly} = \frac{0.0070}{0.0550} = 0.1273, \qquad SR_{annual} = 0.1273 \times \sqrt{12} = 0.4409}\]

4.2 Part (o) — Bootstrap SE for Sharpe Ratio

set.seed(42)
# Simulate actual return series (use what we have)
ret_series <- rnorm(n_mo, mu_m, sd_m)
SR_obs     <- mean(ret_series) / sd(ret_series)

sharpe_fn  <- function(x) mean(x) / sd(x)

# ── 1) i.i.d. Bootstrap ──────────────────────────────────────────────────────
B <- 5000
set.seed(42)
sr_iid <- replicate(B, {
  idx <- sample(n_mo, n_mo, replace=TRUE)
  sharpe_fn(ret_series[idx])
})
SE_iid   <- sd(sr_iid)
CI_iid   <- quantile(sr_iid, c(.025,.975))

# ── 2) Stationary Block Bootstrap (block length l=3) ─────────────────────────
block_boot <- function(x, B=5000, l=3) {
  n       <- length(x)
  n_blks  <- ceiling(n / l)
  replicate(B, {
    starts   <- sample(1:(n - l + 1), n_blks, replace=TRUE)
    resample <- unlist(lapply(starts, function(s) x[s:(s+l-1)]))[1:n]
    sharpe_fn(resample)
  })
}

set.seed(42)
sr_block <- block_boot(ret_series, B=B, l=3)
SE_block <- sd(sr_block)
CI_block <- quantile(sr_block, c(.025,.975))

cat(sprintf("Observed SR (monthly)           : %.4f\n\n", SR_obs))

#> Observed SR (monthly)           : 0.0730

cat(sprintf("i.i.d. Bootstrap (B=%d)\n", B))

#> i.i.d. Bootstrap (B=5000)

cat(sprintf("  SE                            : %.4f\n", SE_iid))

#>   SE                            : 0.1480

cat(sprintf("  95%% CI (percentile)          : [%.4f, %.4f]\n\n", CI_iid[1], CI_iid[2]))

#>   95% CI (percentile)          : [-0.2026, 0.3689]

cat(sprintf("Block Bootstrap (l=3, B=%d)\n", B))

#> Block Bootstrap (l=3, B=5000)

cat(sprintf("  SE                            : %.4f\n", SE_block))

#>   SE                            : 0.1581

cat(sprintf("  95%% CI (percentile)          : [%.4f, %.4f]\n\n", CI_block[1], CI_block[2]))

#>   95% CI (percentile)          : [-0.2537, 0.3660]

cat("Why i.i.d. is inappropriate: monthly returns exhibit\n")

#> Why i.i.d. is inappropriate: monthly returns exhibit

cat("  autocorrelation and volatility clustering — i.i.d. bootstrap\n")

#>   autocorrelation and volatility clustering — i.i.d. bootstrap

cat("  destroys the time-dependence structure, underestimating SE.\n")

#>   destroys the time-dependence structure, underestimating SE.

cat("Fix: Stationary Block Bootstrap (Politis & Romano 1994)\n")

#> Fix: Stationary Block Bootstrap (Politis & Romano 1994)

cat("  samples contiguous BLOCKS of l months, preserving within-block dependence.\n")

#>   samples contiguous BLOCKS of l months, preserving within-block dependence.

df_boot <- bind_rows(
  tibble(SR = sr_iid,   Method = "i.i.d. Bootstrap"),
  tibble(SR = sr_block, Method = "Block Bootstrap (l=3)")
)

ggplot(df_boot, aes(SR, fill=Method, color=Method)) +
  geom_histogram(aes(y=after_stat(density)), bins=50,
                 alpha=.55, position="identity") +
  geom_vline(aes(xintercept=SR_obs), color=clr_warn,
             linewidth=1.1, linetype="dashed") +
  facet_wrap(~Method, ncol=2) +
  annotate("label", x=SR_obs, y=Inf,
           label=sprintf("SR_obs\n= %.4f", SR_obs),
           vjust=1.3, color=clr_warn, fill=clr_surface, size=3,
           label.r=unit(.2,"lines"), label.size=.3, family="mono") +
  scale_fill_manual(values=c("i.i.d. Bootstrap"=clr_accent,
                              "Block Bootstrap (l=3)"=clr_accent2)) +
  scale_color_manual(values=c("i.i.d. Bootstrap"=clr_accent,
                               "Block Bootstrap (l=3)"=clr_accent2)) +
  labs(title    = "Part (o) — Bootstrap Distributions of Monthly Sharpe Ratio",
       subtitle = sprintf("i.i.d. SE = %.4f  ·  Block SE = %.4f  ·  Block bootstrap better respects serial dependence",
                          SE_iid, SE_block),
       x="Bootstrap Sharpe Ratio", y="Density") +
  theme(legend.position="none")

Bootstrap procedure (step by step):

Collect original series \(\{r_1,\ldots,r_{48}\}\) and compute \(\widehat{SR} = \bar{r}/s\).
For \(b = 1,\ldots,B\) (e.g. \(B = 5{,}000\)):
- Block bootstrap: draw \(\lceil n/l \rceil\) starting indices uniformly from \(\{1,\ldots,n-l+1\}\); concatenate blocks \(\{r_{s_j}, r_{s_j+1}, \ldots, r_{s_j+l-1}\}\); trim to length \(n\).
- Compute \(\widehat{SR}^*_b = \bar{r}^*/s^*\) on the resample.
Estimate SE: \(\widehat{SE} = \text{sd}(\{\widehat{SR}^*_b\}_{b=1}^B)\)
95% CI: percentile method \([Q_{2.5\%}, Q_{97.5\%}]\) of bootstrap distribution.

4.3 Part (p) — LASSO: λ_min vs. λ_1SE

lambda_min   <- 0.030;  k_min <- 14
lambda_1se   <- 0.065;  k_1se <- 7
n_cand       <- 60

cat(sprintf("Candidate factors             : %d\n", n_cand))

#> Candidate factors             : 60

cat(sprintf("λ_min  = %.3f  ⇒  %d factors retained\n", lambda_min, k_min))

#> λ_min  = 0.030  ⇒  14 factors retained

cat(sprintf("λ_1SE  = %.3f  ⇒  %d factors retained\n", lambda_1se, k_1se))

#> λ_1SE  = 0.065  ⇒  7 factors retained

cat(sprintf("\nRECOMMENDATION: Deploy λ_1SE = %.3f (%d factors)\n\n", lambda_1se, k_1se))

#> 
#> RECOMMENDATION: Deploy λ_1SE = 0.065 (7 factors)

cat("Rationale:\n")

#> Rationale:

cat("  1. Overfitting risk: λ_min retains 14/60 factors; with limited history many\n")

#>   1. Overfitting risk: λ_min retains 14/60 factors; with limited history many

cat("     are spurious correlations that won't persist out-of-sample (multiple testing).\n")

#>      are spurious correlations that won't persist out-of-sample (multiple testing).

cat("  2. The 1-SE rule: difference in CV error between the two λ values is\n")

#>   2. The 1-SE rule: difference in CV error between the two λ values is

cat("     within one SE — statistically indistinguishable, so prefer parsimony.\n")

#>      within one SE — statistically indistinguishable, so prefer parsimony.

cat("  3. Generalization: 7-factor model is more stable, more interpretable,\n")

#>   3. Generalization: 7-factor model is more stable, more interpretable,

cat("     and more robust to structural breaks in factor premia.\n")

#>      and more robust to structural breaks in factor premia.

cat("  4. Transaction costs: fewer factors mean fewer required trades on rebalance.\n")

#>   4. Transaction costs: fewer factors mean fewer required trades on rebalance.

# Simulate a LASSO regularisation path for illustration
set.seed(99)
lambda_seq <- exp(seq(log(.005), log(.2), length.out=100))

# Simulate number of active factors as function of lambda
k_path <- pmax(0, round(20 * exp(-12*(lambda_seq - .01)) + 2))

# Simulate CV error curve (U-shaped)
cv_mean <- .45 + .6*(lambda_seq - .03)^2 / .002 +
           .015*exp(-25*(lambda_seq - .12)^2)
# add noise
set.seed(99)
cv_se   <- abs(rnorm(length(lambda_seq), .02, .004))
cv_up   <- cv_mean + cv_se
cv_lo   <- cv_mean - cv_se

# Identify min and 1se positions
idx_min <- which.min(cv_mean)
cv_at_min <- cv_mean[idx_min]
idx_1se   <- max(which(cv_mean <= cv_at_min + cv_se[idx_min]))
l_min_val <- lambda_seq[idx_min]
l_1se_val <- lambda_seq[idx_1se]

df_cv <- tibble(lambda=lambda_seq, cv=cv_mean, lo=cv_lo, hi=cv_up, k=k_path)

p1 <- ggplot(df_cv, aes(log(lambda), cv)) +
  geom_ribbon(aes(ymin=lo,ymax=hi), fill=clr_accent, alpha=.18) +
  geom_line(color=clr_accent, linewidth=1) +
  geom_vline(xintercept=log(l_min_val), color=clr_accent2, linetype="solid",  linewidth=.9) +
  geom_vline(xintercept=log(l_1se_val), color=clr_warn,    linetype="dashed", linewidth=.9) +
  annotate("label", x=log(l_min_val), y=max(cv_mean)*.9,
           label=sprintf("λ_min\n= %.3f", l_min_val),
           color=clr_accent2, fill=clr_surface, size=3,
           label.r=unit(.2,"lines"), label.size=.3, family="mono") +
  annotate("label", x=log(l_1se_val), y=max(cv_mean)*.75,
           label=sprintf("λ_1SE\n= %.3f\n[DEPLOY]", l_1se_val),
           color=clr_warn, fill=clr_surface, size=3,
           label.r=unit(.2,"lines"), label.size=.3, family="mono") +
  labs(title="Cross-Validation Error vs log(λ)", subtitle="Ribbon = ±1 SE band",
       x="log(λ)", y="CV Error")

p2 <- ggplot(df_cv, aes(log(lambda), k)) +
  geom_line(color=clr_danger, linewidth=1) +
  geom_vline(xintercept=log(l_min_val), color=clr_accent2, linetype="solid",  linewidth=.9) +
  geom_vline(xintercept=log(l_1se_val), color=clr_warn,    linetype="dashed", linewidth=.9) +
  annotate("point", x=log(l_min_val), y=k_path[idx_min],
           color=clr_accent2, size=4) +
  annotate("point", x=log(l_1se_val), y=k_path[idx_1se],
           color=clr_warn, size=4) +
  annotate("label", x=log(l_min_val), y=k_path[idx_min]+1.5,
           label=sprintf("k=%d", k_path[idx_min]),
           color=clr_accent2, fill=clr_surface, size=3,
           label.r=unit(.2,"lines"), label.size=.3, family="mono") +
  annotate("label", x=log(l_1se_val), y=k_path[idx_1se]+1.5,
           label=sprintf("k=%d", k_path[idx_1se]),
           color=clr_warn, fill=clr_surface, size=3,
           label.r=unit(.2,"lines"), label.size=.3, family="mono") +
  labs(title="Active Factors vs log(λ)", subtitle="1-SE rule gives sparser, more robust model",
       x="log(λ)", y="# Active Factors")

p1 / p2

✓ Recommendation: Deploy λ_1SE = 0.065 (7 factors)

The 1-SE rule chooses the most regularised model whose CV error is statistically indistinguishable from the minimum-error model. With 60 candidate factors and limited data, λ_min’s 14-factor model is at serious risk of in-sample overfitting. The sparser 7-factor model generalises better, costs less to trade, and is easier to monitor and interpret.

4.4 Part (q) — Walk-Forward Cross-Validation

total_mo   <- 60
init_train <- 36
step_sz    <- 1

cat("=== Walk-Forward (Expanding Window) Scheme ===\n")

#> === Walk-Forward (Expanding Window) Scheme ===

cat(sprintf("Total months    : %d\n", total_mo))

#> Total months    : 60

cat(sprintf("Initial window  : %d months (months 1–36 → predict month 37)\n", init_train))

#> Initial window  : 36 months (months 1–36 → predict month 37)

cat(sprintf("Step            : %d month(s)\n\n", step_sz))

#> Step            : 1 month(s)

cat(sprintf("%-6s  %-28s  %-18s\n", "Fold", "Train (months)", "Test (month)"))

#> Fold    Train (months)                Test (month)

cat(strrep("─", 56), "\n")

#> ────────────────────────────────────────────────────────

for (fold in 1:(total_mo - init_train)) {
  train_end <- init_train + fold - 1
  test_t    <- init_train + fold
  if (test_t <= total_mo)
    cat(sprintf("%-6d  1 – %-24d  Month %d\n", fold, train_end, test_t))
}

#> 1       1 – 36                        Month 37
#> 2       1 – 37                        Month 38
#> 3       1 – 38                        Month 39
#> 4       1 – 39                        Month 40
#> 5       1 – 40                        Month 41
#> 6       1 – 41                        Month 42
#> 7       1 – 42                        Month 43
#> 8       1 – 43                        Month 44
#> 9       1 – 44                        Month 45
#> 10      1 – 45                        Month 46
#> 11      1 – 46                        Month 47
#> 12      1 – 47                        Month 48
#> 13      1 – 48                        Month 49
#> 14      1 – 49                        Month 50
#> 15      1 – 50                        Month 51
#> 16      1 – 51                        Month 52
#> 17      1 – 52                        Month 53
#> 18      1 – 53                        Month 54
#> 19      1 – 54                        Month 55
#> 20      1 – 55                        Month 56
#> 21      1 – 56                        Month 57
#> 22      1 – 57                        Month 58
#> 23      1 – 58                        Month 59
#> 24      1 – 59                        Month 60

cat(strrep("─", 56), "\n")

#> ────────────────────────────────────────────────────────

cat("\nWhy k-fold random CV is UNSAFE for time series:\n")

#> 
#> Why k-fold random CV is UNSAFE for time series:

cat("  1. LOOK-AHEAD BIAS: future obs land in training folds,\n")

#>   1. LOOK-AHEAD BIAS: future obs land in training folds,

cat("     model 'sees the future' => inflated OOS performance.\n")

#>      model 'sees the future' => inflated OOS performance.

cat("  2. TEMPORAL STRUCTURE BROKEN: autocorrelation and regime\n")

#>   2. TEMPORAL STRUCTURE BROKEN: autocorrelation and regime

cat("     dynamics are destroyed by random shuffling.\n")

#>      dynamics are destroyed by random shuffling.

cat("  3. DEPLOYMENT MISMATCH: live trading always trains on past,\n")

#>   3. DEPLOYMENT MISMATCH: live trading always trains on past,

cat("     predicts future — k-fold does not replicate this.\n")

#>      predicts future — k-fold does not replicate this.

# Visual timeline of walk-forward folds
n_folds_show <- 6
df_wf <- map_dfr(1:n_folds_show, function(fold) {
  train_end <- init_train + fold - 1
  test_t    <- init_train + fold
  bind_rows(
    tibble(fold=fold, month=1:train_end,         type="Train"),
    tibble(fold=fold, month=test_t,               type="Test"),
    tibble(fold=fold, month=(test_t+1):total_mo,  type="Unused")
  )
}) %>% mutate(
  fold = factor(paste0("Fold ", fold), levels=paste0("Fold ", n_folds_show:1)),
  type = factor(type, levels=c("Train","Test","Unused"))
)

ggplot(df_wf, aes(month, fold, fill=type)) +
  geom_tile(color=clr_bg, linewidth=.5, height=.7) +
  geom_vline(xintercept=init_train+.5,
             color=clr_muted, linetype="dashed", linewidth=.7) +
  annotate("label", x=init_train+.5, y=n_folds_show+.6,
           label="Initial\ntrain end",
           color=clr_muted, fill=clr_surface, size=3,
           label.r=unit(.2,"lines"), label.size=.3, family="mono") +
  scale_fill_manual(values=c("Train"=clr_accent,"Test"=clr_accent2,"Unused"=clr_border),
                    name="Month role") +
  scale_x_continuous(breaks=seq(1,total_mo,5)) +
  labs(title    = "Part (q) — Walk-Forward (Expanding Window) Cross-Validation",
       subtitle = "Test window always AFTER training window — no look-ahead bias",
       x = "Month index", y = NULL) +
  theme(legend.position="bottom",
        axis.text.y=element_text(size=10))

✗ Why random k-fold CV is unsafe for financial time series

Look-ahead bias: future observations randomly enter the training fold — the model implicitly “knows the future,” producing over-optimistic Sharpe ratios that vanish in live trading.
Temporal structure destroyed: autocorrelation, volatility clustering, and regime structure are broken by random shuffling.
Deployment mismatch: a live strategy is always trained on the past and predicts the future — walk-forward replicates this exactly; random k-fold does not.

Walk-forward scheme: fix an initial training window → fit & select λ → predict one step ahead → expand by one month → repeat. Evaluate performance only on the concatenated out-of-sample returns.

5 Summary of All Key Results

summary_tbl <- tribble(
  ~Question,                    ~Formula,                            ~Value,         ~Decision,
  "Q1(a) t(β=0)",               "β̂/SE(β̂)",                         "5.7647",       "Reject H₀ ✓",
  "Q1(b) t(β=1)",               "(β̂−1)/SE(β̂)",                     "−0.1176",      "Fail to reject H₀",
  "Q1(c) t(α)",                 "α̂/SE(α̂)",                          "0.8500",       "Fail to reject H₀",
  "Q1(d) R² split",             "R² vs 1−R²",                        "50% / 50%",    "Systematic / Idiosyncratic",
  "Q1(e) E[Ri−Rf] CAPM",       "β×E[MKT]",                          "0.6860%/mo",   "—",
  "Q2(f) t: α",                 "α̂/SE(α̂)",                          "1.6111",       "Significant ✓",
  "Q2(f) t: MKT",               "b/SE(b)",                           "12.1250",      "Significant ✓",
  "Q2(f) t: SMB",               "s/SE(s)",                           "6.8182",       "Significant ✓",
  "Q2(f) t: HML",               "h/SE(h)",                           "−1.0000",      "Not significant",
  "Q2(g) Style",                "sign(s), sign(h)",                  "s>0, h<0",     "Small-cap Growth",
  "Q2(h) Manager alpha",        "α̂ = 0.29%/mo",                     "t = 1.6111",   "Significant → skill ✓",
  "Q2(i) ΔR²",                  "0.92 − 0.75",                       "+0.17",        "SMB+HML explain more",
  "Q3(j) P(Up)",                "σ(η), η=−0.536",                    "0.3691",       "Predict: DOWN",
  "Q3(l) Accuracy",             "(TP+TN)/N",                         "0.6150",       "Beats naive (50%) ✓",
  "Q3(l) Sensitivity",          "TP/(TP+FN)",                        "0.6700",       "67% of Up days caught",
  "Q3(l) Specificity",          "TN/(TN+FP)",                        "0.5600",       "56% of Down caught",
  "Q3(l) Precision",            "TP/(TP+FP)",                        "0.6036",       "60.4% Buy signals correct",
  "Q4(n) SR monthly",           "μ/σ",                               "0.1273",       "—",
  "Q4(n) SR annualised",        "SR×√12",                            "0.4409",       "Scaling = √12",
  "Q4(o) Bootstrap",            "Block bootstrap (l=3)",             "SE≈0.15",      "Block > i.i.d. for TS",
  "Q4(p) LASSO λ",              "1-SE rule",                         "λ=0.065, k=7", "More robust, less overfit",
  "Q4(q) CV scheme",            "Walk-forward expanding window",     "No leakage",   "Random k-fold is unsafe"
)

kbl(summary_tbl,
    caption = "Complete Answer Summary — All 22 Key Results",
    col.names = c("Question", "Formula / Method", "Value", "Decision / Interpretation"),
    align = "llll") %>%
  kable_styling(bootstrap_options=c("striped","hover","condensed"),
                full_width=TRUE, font_size=13) %>%
  row_spec(0, bold=TRUE, color="white", background="#1e2333") %>%
  row_spec(c(1:5),   background="#0f1117", color="#e2e8f0") %>%
  row_spec(c(6:12),  background="#181c25", color="#e2e8f0") %>%
  row_spec(c(13:17), background="#0f1117", color="#e2e8f0") %>%
  row_spec(c(18:22), background="#181c25", color="#e2e8f0") %>%
  column_spec(3, bold=TRUE, color="#6ee7b7") %>%
  column_spec(4, color="#94a3b8")

Complete Answer Summary — All 22 Key Results
Question	Formula / Method	Value	Decision / Interpretation
Q1(a) t(β=0)	β̂/SE(β̂)	5.7647	Reject H₀ ✓
Q1(b) t(β=1)	(β̂−1)/SE(β̂)	−0.1176	Fail to reject H₀
Q1(c) t(α)	α̂/SE(α̂)	0.8500	Fail to reject H₀
Q1(d) R² split	R² vs 1−R²	50% / 50%	Systematic / Idiosyncratic
Q1(e) E[Ri−Rf] CAPM	β×E[MKT]	0.6860%/mo	—
Q2(f) t: α	α̂/SE(α̂)	1.6111	Significant ✓
Q2(f) t: MKT	b/SE(b)	12.1250	Significant ✓
Q2(f) t: SMB	s/SE(s)	6.8182	Significant ✓
Q2(f) t: HML	h/SE(h)	−1.0000	Not significant
Q2(g) Style	sign(s), sign(h)	s>0, h<0	Small-cap Growth
Q2(h) Manager alpha	α̂ = 0.29%/mo	t = 1.6111	Significant → skill ✓
Q2(i) ΔR²	0.92 − 0.75	+0.17	SMB+HML explain more
Q3(j) P(Up)	σ(η), η=−0.536	0.3691	Predict: DOWN
Q3(l) Accuracy	(TP+TN)/N	0.6150	Beats naive (50%) ✓
Q3(l) Sensitivity	TP/(TP+FN)	0.6700	67% of Up days caught
Q3(l) Specificity	TN/(TN+FP)	0.5600	56% of Down caught
Q3(l) Precision	TP/(TP+FP)	0.6036	60.4% Buy signals correct
Q4(n) SR monthly	μ/σ	0.1273	—
Q4(n) SR annualised	SR×√12	0.4409	Scaling = √12
Q4(o) Bootstrap	Block bootstrap (l=3)	SE≈0.15	Block > i.i.d. for TS
Q4(p) LASSO λ	1-SE rule	λ=0.065, k=7	More robust, less overfit
Q4(q) CV scheme	Walk-forward expanding window	No leakage	Random k-fold is unsafe

Machine Learning Applications in Finance — Final Examination
Rendered: June 08, 2026 at 13:44

Machine Learning Applications in Finance

Final Examination — Complete Solutions

Minjin

June 08, 2026