# ── IMPORTANT: MASS is loaded FIRST so that dplyr::select wins when dplyr loads
# ── Data & Modelling ───────────────────────────────────────────────────────────
library(ISLR2)       # Auto, Weekly, Boston
## Warning: package 'ISLR2' was built under R version 4.5.3
library(MASS)        # LDA  — must precede dplyr
## Warning: package 'MASS' was built under R version 4.5.3
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
library(car)         # VIF calculation
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.3
library(class)       # KNN
## Warning: package 'class' was built under R version 4.5.3
# ── Visualisation ──────────────────────────────────────────────────────────────
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(GGally)
## Warning: package 'GGally' was built under R version 4.5.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
library(patchwork)   # Compose multi-panel plots
## Warning: package 'patchwork' was built under R version 4.5.3
## 
## Attaching package: 'patchwork'
## The following object is masked from 'package:MASS':
## 
##     area
# ── Data Wrangling — loaded AFTER MASS so dplyr::select takes precedence ───────
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(broom)
## Warning: package 'broom' was built under R version 4.5.3
library(purrr)       # map_dfr
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:car':
## 
##     some
# Explicitly bind dplyr verbs to guard against any residual MASS masking
select    <- dplyr::select
filter    <- dplyr::filter
mutate    <- dplyr::mutate
summarise <- dplyr::summarise

# ── Model Evaluation ──────────────────────────────────────────────────────────
library(pROC)
## Warning: package 'pROC' was built under R version 4.5.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# ── Presentation ──────────────────────────────────────────────────────────────
library(knitr)
## Warning: package 'knitr' was built under R version 4.5.3
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.3
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(scales)
## Warning: package 'scales' was built under R version 4.5.3
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
library(ggrepel)     # geom_text_repel for Figure 9
## Warning: package 'ggrepel' was built under R version 4.5.3
# ── Consistent plot theme ─────────────────────────────────────────────────────
theme_afs <- function(base_size = 12) {
  theme_minimal(base_size = base_size) %+replace%
    theme(
      plot.title    = element_text(face = "bold", size = base_size + 1,
                                   margin = margin(b = 6)),
      plot.subtitle = element_text(colour = "grey40", size = base_size - 1,
                                   margin = margin(b = 10)),
      plot.caption  = element_text(colour = "grey55", size = base_size - 2,
                                   hjust = 0),
      panel.grid.minor = element_blank(),
      strip.text    = element_text(face = "bold")
    )
}

PALETTE <- c(American = "#E74C3C", European = "#3498DB", Japanese = "#27AE60")
data(Auto)
Auto <- na.omit(Auto)
Auto$origin <- factor(Auto$origin, 1:3,
                      c("American", "European", "Japanese"))
cat(sprintf("Dimensions: %d observations × %d variables\n",
            nrow(Auto), ncol(Auto)))
## Dimensions: 392 observations × 9 variables
var_meta <- data.frame(
  Variable = names(Auto),
  Scale    = c("Quantitative (ratio)", "Quantitative (discrete)",
               "Quantitative (ratio)", "Quantitative (ratio)",
               "Quantitative (ratio)", "Quantitative (ratio)",
               "Qualitative (ordinal)", "Qualitative (nominal)",
               "Qualitative (nominal)"),
  Role     = c("Response (fuel efficiency, USD-linked operating cost)",
               "Engine architecture proxy",
               "Engine displacement — cubic inches",
               "Rated engine power",
               "Curb weight — lbs",
               "0–60 mph elapsed time — seconds",
               "Model year; encodes regulatory era, not magnitude",
               "Manufacturing region (1 = US, 2 = EU, 3 = JP)",
               "Unique car identifier — excluded from modelling")
)
kable(var_meta, caption = "Table 1. Variable Taxonomy — Auto Dataset") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(7:9, background = "#FFF3CD") %>%
  column_spec(2, italic = TRUE)
Table 1. Variable Taxonomy — Auto Dataset
Variable Scale Role
mpg Quantitative (ratio) Response (fuel efficiency, USD-linked operating cost)
cylinders Quantitative (discrete) Engine architecture proxy
displacement Quantitative (ratio) Engine displacement — cubic inches
horsepower Quantitative (ratio) Rated engine power
weight Quantitative (ratio) Curb weight — lbs
acceleration Quantitative (ratio) 0–60 mph elapsed time — seconds
year Qualitative (ordinal) Model year; encodes regulatory era, not magnitude
origin Qualitative (nominal) Manufacturing region (1 = US, 2 = EU, 3 = JP)
name Qualitative (nominal) Unique car identifier — excluded from modelling
q_vars <- c("mpg","cylinders","displacement","horsepower","weight",
            "acceleration","year")

range_tbl <- Auto[, q_vars] %>%
  summarise(across(everything(),
    list(Min = min, Max = max, Range = ~ diff(range(.)),
         CV  = ~ sd(.) / mean(.) * 100))) %>%       # Coefficient of variation
  pivot_longer(everything(),
               names_to  = c("Variable", ".value"),
               names_sep = "_")

kable(range_tbl, digits = 2,
      caption = "Table 2. Range and Coefficient of Variation (CV, %)") %>%
  kable_styling(bootstrap_options = c("striped","hover"),
                full_width = FALSE) %>%
  column_spec(5, bold = TRUE,
              color = ifelse(range_tbl$CV > 30, "#C0392B", "black"))
Table 2. Range and Coefficient of Variation (CV, %)
Variable Min Max Range CV
mpg 9 46.6 37.6 33.29
cylinders 3 8.0 5.0 31.17
displacement 68 455.0 387.0 53.83
horsepower 46 230.0 184.0 36.84
weight 1613 5140.0 3527.0 28.53
acceleration 8 24.8 16.8 17.75
year 70 82.0 12.0 4.85
stats_tbl <- Auto[, q_vars] %>%
  summarise(across(everything(), list(Mean = mean, SD = sd, Median = median,
                                      IQR = IQR))) %>%
  pivot_longer(everything(), names_to = c("Variable",".value"),
               names_sep = "_") %>%
  mutate(Skew_flag = ifelse(abs(Mean - Median) / SD > 0.2, "Skewed ⚠", ""))

kable(stats_tbl, digits = 3,
      caption = "Table 3. Descriptive Statistics — Quantitative Predictors") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(stats_tbl$Skew_flag != ""), background = "#FCF3CF")
Table 3. Descriptive Statistics — Quantitative Predictors
Variable Mean SD Median IQR Skew_flag
mpg 23.446 7.805 22.75 12.00
cylinders 5.472 1.706 4.00 4.00 Skewed ⚠
displacement 194.412 104.644 151.00 170.75 Skewed ⚠
horsepower 104.469 38.491 93.50 51.00 Skewed ⚠
weight 2977.584 849.403 2803.50 1389.50 Skewed ⚠
acceleration 15.541 2.759 15.50 3.25
year 75.980 3.684 76.00 6.00
Auto_sub <- Auto[-(10:85), ]

compare_stats <- bind_rows(
  Auto[, q_vars] %>%
    summarise(across(everything(), list(Mean = mean, SD = sd))) %>%
    pivot_longer(everything(), names_to = c("Variable",".value"),
                 names_sep = "_") %>%
    mutate(Sample = "Full (n = 392)"),
  Auto_sub[, q_vars] %>%
    summarise(across(everything(), list(Mean = mean, SD = sd))) %>%
    pivot_longer(everything(), names_to = c("Variable",".value"),
                 names_sep = "_") %>%
    mutate(Sample = sprintf("Subset (n = %d)", nrow(Auto_sub)))
) %>%
  pivot_wider(names_from = Sample,
              values_from = c(Mean, SD), names_glue = "{Sample}_{.value}")

kable(compare_stats, digits = 3,
      caption = "Table 4. Full vs. Subset Descriptive Statistics") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE)
Table 4. Full vs. Subset Descriptive Statistics
Variable Full (n = 392)_Mean Subset (n = 316)_Mean Full (n = 392)_SD Subset (n = 316)_SD
mpg 23.446 24.404 7.805 7.867
cylinders 5.472 5.373 1.706 1.654
displacement 194.412 187.241 104.644 99.678
horsepower 104.469 100.722 38.491 35.709
weight 2977.584 2935.972 849.403 811.300
acceleration 15.541 15.727 2.759 2.694
year 75.980 77.146 3.684 3.106
ggpairs(
  Auto[, c("mpg","displacement","horsepower","weight","acceleration")],
  aes(colour = Auto$origin, alpha = 0.35),
  upper = list(continuous = wrap("cor", size = 3.2, alignPercent = 0.8)),
  lower = list(continuous = wrap("points", size = 0.7)),
  diag  = list(continuous = wrap("densityDiag", alpha = 0.5))
) +
  scale_colour_manual(values = PALETTE) +
  scale_fill_manual(values  = PALETTE) +
  labs(title = "Figure 1. Scatterplot Matrix — Auto Dataset",
       subtitle = "Colour encodes manufacturing origin; upper triangle shows Pearson r") +
  theme_afs(10)
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.

cor_mat <- cor(Auto[, q_vars])
corrplot(cor_mat,
         method = "color", type = "upper", order = "hclust",
         addCoef.col = "black", number.cex = 0.70,
         tl.col = "black", tl.srt = 45,
         col = colorRampPalette(c("#2980B9","white","#C0392B"))(200),
         title = "Figure 2. Correlation Matrix (hierarchical clustering order)",
         mar = c(0, 0, 2.5, 0))

Auto %>%
  select(mpg, displacement, horsepower, weight, acceleration, origin) %>%
  pivot_longer(c(displacement, horsepower, weight, acceleration),
               names_to = "predictor", values_to = "value") %>%
  ggplot(aes(x = value, y = mpg)) +
  geom_point(aes(colour = origin), alpha = 0.22, size = 1) +
  geom_smooth(method = "loess", se = TRUE, colour = "#2C3E50",
              linewidth = 1.1, fill = "grey80") +
  geom_smooth(method = "lm",   se = FALSE, colour = "#E74C3C",
              linewidth = 0.7, linetype = "dashed") +
  scale_colour_manual(values = PALETTE) +
  facet_wrap(~ predictor, scales = "free_x") +
  labs(title    = "Figure 3. MPG vs. Key Predictors — LOESS vs. Linear Fit",
       subtitle = "Dashed red = linear fit; solid black = LOESS. Curvature indicates non-linearity.",
       x = NULL, y = "MPG", colour = "Origin",
       caption  = "Source: ISLR2::Auto") +
  theme_afs()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Auto %>%
  group_by(year, origin) %>%
  summarise(mean_mpg = mean(mpg), se = sd(mpg)/sqrt(n()), .groups = "drop") %>%
  ggplot(aes(x = year, y = mean_mpg, colour = origin, fill = origin)) +
  geom_ribbon(aes(ymin = mean_mpg - se, ymax = mean_mpg + se),
              alpha = 0.15, colour = NA) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2.5) +
  geom_vline(xintercept = 73, linetype = "dotted", colour = "grey40") +
  annotate("text", x = 73.3, y = 14, label = "1973 Oil Crisis →",
           hjust = 0, size = 3.2, colour = "grey30") +
  scale_colour_manual(values = PALETTE) +
  scale_fill_manual(values   = PALETTE) +
  labs(title    = "Figure 4. Mean MPG Trend by Origin and Year (±1 SE)",
       x = "Model Year (19xx)", y = "Mean MPG",
       colour = "Origin", fill = "Origin") +
  theme_afs()

pairs(Auto[, q_vars],
      main = "Figure 5. Scatterplot Matrix — All Quantitative Predictors",
      pch = 19, cex = 0.4,
      col = adjustcolor("#2C3E50", 0.35),
      gap = 0.3)

cor_df <- round(cor(Auto[, q_vars]), 3)
kable(cor_df, caption = "Table 5. Pearson Correlation Matrix — Auto") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE, font_size = 11) %>%
  column_spec(1, bold = TRUE)
Table 5. Pearson Correlation Matrix — Auto
mpg cylinders displacement horsepower weight acceleration year
mpg 1.000 -0.778 -0.805 -0.778 -0.832 0.423 0.581
cylinders -0.778 1.000 0.951 0.843 0.898 -0.505 -0.346
displacement -0.805 0.951 1.000 0.897 0.933 -0.544 -0.370
horsepower -0.778 0.843 0.897 1.000 0.865 -0.689 -0.416
weight -0.832 0.898 0.933 0.865 1.000 -0.417 -0.309
acceleration 0.423 -0.505 -0.544 -0.689 -0.417 1.000 0.290
year 0.581 -0.346 -0.370 -0.416 -0.309 0.290 1.000
lm_vif_check <- lm(mpg ~ displacement + horsepower + weight +
                     acceleration + year, data = Auto)
vif_vals <- car::vif(lm_vif_check)
kable(data.frame(Predictor = names(vif_vals), VIF = round(vif_vals, 2)),
      caption = "Table 6. Variance Inflation Factors") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(vif_vals > 5), background = "#FADBD8",
           bold = TRUE)
Table 6. Variance Inflation Factors
Predictor VIF
displacement displacement 10.82
horsepower horsepower 9.30
weight weight 10.58
acceleration acceleration 2.62
year year 1.24
lm_auto <- lm(mpg ~ . - name, data = Auto)
tidy_auto <- tidy(lm_auto, conf.int = TRUE)
glance_auto <- glance(lm_auto)

# Formatted coefficient table
tidy_auto %>%
  mutate(
    sig = case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01  ~ "**",
      p.value < 0.05  ~ "*",
      p.value < 0.1   ~ ".",
      TRUE            ~ ""
    ),
    estimate   = round(estimate,   4),
    std.error  = round(std.error,  4),
    statistic  = round(statistic,  3),
    p.value    = format.pval(p.value, digits = 3, eps = 0.001),
    conf.low   = round(conf.low,   4),
    conf.high  = round(conf.high,  4)
  ) %>%
  select(term, estimate, std.error, statistic, p.value, sig,
         conf.low, conf.high) %>%
  kable(caption = "Table 7. OLS Coefficients — mpg ~ All Predictors (except name)",
        col.names = c("Term","Estimate","SE","t","p-value","Sig.",
                      "95% CI Low","95% CI High")) %>%
  kable_styling(bootstrap_options = c("striped","hover"),
                full_width = FALSE, font_size = 11) %>%
  row_spec(which(tidy_auto$p.value < 0.05), background = "#D5F5E3")
Table 7. OLS Coefficients — mpg ~ All Predictors (except name)
Term Estimate SE t p-value Sig. 95% CI Low 95% CI High
(Intercept) -17.9546 4.6769 -3.839 < 0.001 *** -27.1503 -8.7589
cylinders -0.4897 0.3212 -1.524 0.12821 -1.1213 0.1419
displacement 0.0240 0.0077 3.133 0.00186 ** 0.0089 0.0390
horsepower -0.0182 0.0137 -1.326 0.18549 -0.0451 0.0088
weight -0.0067 0.0007 -10.243 < 0.001 *** -0.0080 -0.0054
acceleration 0.0791 0.0982 0.805 0.42110 -0.1140 0.2722
year 0.7770 0.0518 15.005 < 0.001 *** 0.6752 0.8788
originEuropean 2.6300 0.5664 4.643 < 0.001 *** 1.5163 3.7437
originJapanese 2.8532 0.5527 5.162 < 0.001 *** 1.7665 3.9400
cat(sprintf(
  "R²: %.4f | Adj. R²: %.4f | F(%d, %d) = %.1f | p < 2.2e-16\n",
  glance_auto$r.squared, glance_auto$adj.r.squared,
  glance_auto$df, glance_auto$df.residual, glance_auto$statistic
))
## R²: 0.8242 | Adj. R²: 0.8205 | F(8, 383) = 224.5 | p < 2.2e-16
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))
plot(lm_auto, pch = 19, cex = 0.55,
     col = adjustcolor("#2C3E50", 0.5),
     which = 1:4)

par(mfrow = c(1, 1))
lm_interact <- lm(mpg ~ displacement + horsepower + weight * year +
                    acceleration + origin, data = Auto)

tidy(lm_interact, conf.int = TRUE) %>%
  mutate(
    sig = case_when(p.value < 0.001 ~ "***", p.value < 0.01 ~ "**",
                    p.value < 0.05 ~ "*",   p.value < 0.1  ~ ".", TRUE ~ ""),
    across(where(is.numeric), ~ round(., 4)),
    p.value = format.pval(p.value, digits = 3, eps = 0.001)
  ) %>%
  kable(caption = "Table 8. Regression with weight × year Interaction") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(tidy(lm_interact)$p.value < 0.05), background = "#D5F5E3")
Table 8. Regression with weight × year Interaction
term estimate std.error statistic p.value conf.low conf.high sig
(Intercept) -124.8020 13.0660 -9.5516 <0.001 -150.4921 -99.1119 ***
displacement 0.0163 0.0054 3.0357 0.0026 0.0057 0.0268 **
horsepower -0.0303 0.0126 -2.4017 0.0168 -0.0551 -0.0055
weight 0.0317 0.0046 6.9624 <0.001 0.0227 0.0406 ***
year 2.1754 0.1704 12.7650 <0.001 1.8403 2.5105 ***
acceleration 0.1472 0.0905 1.6269 0.1046 -0.0307 0.3250
originEuropean 2.6955 0.5204 5.1801 <0.001 1.6724 3.7187 ***
originJapanese 2.3094 0.5085 4.5412 <0.001 1.3095 3.3093 ***
weight:year -0.0005 0.0001 -8.5380 <0.001 -0.0006 -0.0004 ***
# Visualise the weight × year interaction
year_cuts <- quantile(Auto$year, c(0.1, 0.5, 0.9))
Auto_interact <- Auto %>%
  mutate(year_group = cut(year,
                          breaks = c(-Inf, 73, 76, Inf),
                          labels = c("Early (≤73)","Mid (74–76)","Late (77+)")))

ggplot(Auto_interact, aes(x = weight, y = mpg, colour = year_group)) +
  geom_point(alpha = 0.3, size = 1.3) +
  geom_smooth(method = "lm", se = TRUE, linewidth = 1.1) +
  scale_colour_manual(values = c("#E74C3C","#F39C12","#27AE60")) +
  labs(title    = "Figure 6. Weight × Year Interaction: Slopes Flatten Over Time",
       subtitle = "The negative weight–mpg relationship weakens in later model years",
       x = "Weight (lbs)", y = "MPG", colour = "Era") +
  theme_afs()
## `geom_smooth()` using formula = 'y ~ x'

lm_base <- lm(mpg ~ weight + horsepower + year + origin, data = Auto)
lm_log  <- lm(mpg ~ log(weight) + log(horsepower) + year + origin, data = Auto)
lm_sqrt <- lm(mpg ~ sqrt(weight) + sqrt(horsepower) + year + origin, data = Auto)
lm_sq   <- lm(mpg ~ I(weight^2) + I(horsepower^2) + year + origin, data = Auto)

transform_tbl <- map_dfr(
  list(Base = lm_base, Log = lm_log, Sqrt = lm_sqrt, Quadratic = lm_sq),
  ~ glance(.x)[c("r.squared","adj.r.squared","sigma","AIC","BIC")],
  .id = "Transformation"
) %>%
  arrange(AIC)

kable(transform_tbl, digits = 4,
      caption = "Table 9. Goodness-of-Fit Comparison Across Transformations") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(1, bold = TRUE, background = "#D5F5E3")
Table 9. Goodness-of-Fit Comparison Across Transformations
Transformation r.squared adj.r.squared sigma AIC BIC
Log 0.8460 0.8440 3.0824 2002.949 2030.748
Sqrt 0.8339 0.8318 3.2011 2032.570 2060.369
Base 0.8194 0.8171 3.3381 2065.442 2093.241
Quadratic 0.7869 0.7841 3.6266 2130.433 2158.232
# Residual comparison between base and log models
resid_compare <- data.frame(
  Fitted_base = fitted(lm_base), Resid_base = resid(lm_base),
  Fitted_log  = fitted(lm_log),  Resid_log  = resid(lm_log)
)

p1 <- ggplot(resid_compare, aes(Fitted_base, Resid_base)) +
  geom_point(alpha = 0.3, size = 1, colour = "#E74C3C") +
  geom_smooth(se = FALSE, colour = "black", linewidth = 0.8) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Base Model", x = "Fitted", y = "Residuals") + theme_afs(11)

p2 <- ggplot(resid_compare, aes(Fitted_log, Resid_log)) +
  geom_point(alpha = 0.3, size = 1, colour = "#27AE60") +
  geom_smooth(se = FALSE, colour = "black", linewidth = 0.8) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Log-transformed Model", x = "Fitted", y = "Residuals") +
  theme_afs(11)

(p1 | p2) +
  plot_annotation(
    title    = "Figure 7. Residual Plots: Base vs. Log Transformation",
    subtitle = "Log transformation substantially reduces the non-linear pattern"
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

data(Boston)
predictors_b <- setdiff(names(Boston), "crim")
slr_results <- lapply(predictors_b, function(v) {
  fit <- lm(reformulate(v, "crim"), data = Boston)
  s   <- summary(fit)
  tidy(fit)[2, ] %>%
    mutate(Predictor = v,
           R2        = s$r.squared,
           Direction = ifelse(estimate > 0, "Positive", "Negative"))
}) %>%
  bind_rows() %>%
  arrange(p.value) %>%
  mutate(Sig = case_when(p.value < 0.001 ~ "***", p.value < 0.01 ~ "**",
                         p.value < 0.05  ~ "*",   TRUE           ~ "n.s."))

kable(slr_results %>%
        select(Predictor, estimate, std.error, statistic, p.value, R2, Sig),
      digits = 4,
      caption = "Table 10. Simple Regression: Each Predictor vs. crim (sorted by p-value)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE, font_size = 11) %>%
  row_spec(which(slr_results$p.value < 0.05), background = "#D5F5E3")
Table 10. Simple Regression: Each Predictor vs. crim (sorted by p-value)
Predictor estimate std.error statistic p.value R2 Sig
rad 0.6179 0.0343 17.9982 0.0000 0.3913 ***
tax 0.0297 0.0018 16.0994 0.0000 0.3396 ***
lstat 0.5488 0.0478 11.4907 0.0000 0.2076 ***
nox 31.2485 2.9992 10.4190 0.0000 0.1772 ***
indus 0.5098 0.0510 9.9908 0.0000 0.1653 ***
medv -0.3632 0.0384 -9.4597 0.0000 0.1508 ***
black -0.0363 0.0039 -9.3670 0.0000 0.1483 ***
dis -1.5509 0.1683 -9.2135 0.0000 0.1441 ***
age 0.1078 0.0127 8.4628 0.0000 0.1244 ***
ptratio 1.1520 0.1694 6.8014 0.0000 0.0841 ***
rm -2.6841 0.5320 -5.0448 0.0000 0.0481 ***
zn -0.0739 0.0161 -4.5938 0.0000 0.0402 ***
chas -1.8928 1.5061 -1.2567 0.2094 0.0031 n.s.
# Plot top 4 predictors by R²
top4 <- slr_results %>% slice_max(R2, n = 4) %>% pull(Predictor)

Boston %>%
  select(crim, all_of(top4)) %>%
  pivot_longer(-crim, names_to = "predictor", values_to = "value") %>%
  ggplot(aes(x = value, y = crim)) +
  geom_point(alpha = 0.2, size = 1, colour = "#2C3E50") +
  geom_smooth(method = "lm",    colour = "#E74C3C",  se = FALSE,
              linewidth = 0.9, linetype = "dashed") +
  geom_smooth(method = "loess", colour = "#27AE60",  se = FALSE,
              linewidth = 0.9) +
  facet_wrap(~ predictor, scales = "free_x") +
  labs(title    = "Figure 8. Top-4 Predictors of Crime Rate",
       subtitle = "Red dashed = linear fit; green = LOESS. Divergence signals non-linearity.",
       x = NULL, y = "Per-capita crime rate") +
  theme_afs()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

lm_boston <- lm(crim ~ ., data = Boston)
tidy_boston <- tidy(lm_boston, conf.int = TRUE) %>%
  mutate(Sig = case_when(p.value < 0.001 ~ "***", p.value < 0.01 ~ "**",
                         p.value < 0.05  ~ "*",   p.value < 0.1  ~ ".", TRUE ~ ""))

kable(tidy_boston %>%
        select(term, estimate, std.error, statistic, p.value, Sig),
      digits = 4,
      caption = "Table 11. Multiple Regression: crim ~ All Predictors") %>%
  kable_styling(bootstrap_options = c("striped","hover"),
                full_width = FALSE, font_size = 11) %>%
  row_spec(which(tidy_boston$p.value < 0.05), background = "#D5F5E3")
Table 11. Multiple Regression: crim ~ All Predictors
term estimate std.error statistic p.value Sig
(Intercept) 17.0332 7.2349 2.3543 0.0189
zn 0.0449 0.0187 2.3943 0.0170
indus -0.0639 0.0834 -0.7656 0.4443
chas -0.7491 1.1801 -0.6348 0.5259
nox -10.3135 5.2755 -1.9550 0.0512 .
rm 0.4301 0.6128 0.7019 0.4831
age 0.0015 0.0179 0.0810 0.9355
dis -0.9872 0.2818 -3.5029 0.0005 ***
rad 0.5882 0.0880 6.6804 0.0000 ***
tax -0.0038 0.0052 -0.7332 0.4638
ptratio -0.2711 0.1865 -1.4539 0.1466
black -0.0075 0.0037 -2.0520 0.0407
lstat 0.1262 0.0757 1.6667 0.0962 .
medv -0.1989 0.0605 -3.2865 0.0011 **
mlr_coef <- setNames(tidy_boston$estimate[-1], tidy_boston$term[-1])
slr_coef <- setNames(slr_results$estimate,     slr_results$Predictor)
common   <- intersect(names(slr_coef), names(mlr_coef))

coef_df <- data.frame(
  Predictor = common,
  SLR       = slr_coef[common],
  MLR       = mlr_coef[common]
) %>%
  mutate(
    Shift     = MLR - SLR,
    Direction = ifelse(Shift > 0, "Upward", "Downward"),
    label_pos = ifelse(abs(Shift) > 0.3 * max(abs(SLR)), Predictor, "")
  )

ggplot(coef_df, aes(x = SLR, y = MLR, colour = Direction, label = Predictor)) +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey60") +
  geom_vline(xintercept = 0, linetype = "dashed", colour = "grey60") +
  geom_abline(slope = 1, intercept = 0, colour = "grey80", linewidth = 0.8) +
  geom_point(size = 3.5, alpha = 0.85) +
  ggrepel::geom_text_repel(size = 3.2, colour = "#2C3E50",
                            max.overlaps = 15) +
  scale_colour_manual(values = c(Upward = "#27AE60", Downward = "#E74C3C")) +
  labs(title    = "Figure 9. Univariate vs. Multiple Regression Coefficients",
       subtitle  = "Points on the diagonal (grey) indicate unchanged estimates; deviations reveal confounding.",
       x = "Simple Regression Coefficient",
       y = "Multiple Regression Coefficient",
       colour = "Direction of Shift",
       caption = "Source: ISLR2::Boston") +
  theme_afs()

poly_tbl <- map_dfr(predictors_b, function(v) {
  fit_poly <- lm(reformulate(
    c(v, paste0("I(",v,"^2)"), paste0("I(",v,"^3)")), "crim"),
    data = Boston)
  s     <- summary(fit_poly)
  c_mat <- coef(s)
  nrows <- nrow(c_mat)
  
  p2 <- ifelse(nrows >= 3, round(c_mat[3, 4], 4), NA_real_)
  p3 <- ifelse(nrows >= 4, round(c_mat[4, 4], 4), NA_real_)
  
  data.frame(
    Predictor   = v,
    p_linear    = round(c_mat[2, 4], 4),
    p_quadratic = p2,
    p_cubic     = p3,
    R2_poly     = round(s$r.squared, 4),
    NonLinear   = ifelse(!is.na(p2) & p2 < 0.05 |
                         !is.na(p3) & p3 < 0.05, "✓", "")
  )
}) %>% arrange(desc(NonLinear), p_quadratic)

kable(poly_tbl,
      caption = "Table 12. Non-linear Association Test (Cubic Polynomial)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE, font_size = 11) %>%
  row_spec(which(poly_tbl$NonLinear == "✓"), background = "#FCF3CF")
Table 12. Non-linear Association Test (Cubic Polynomial)
Predictor p_linear p_quadratic p_cubic R2_poly NonLinear
indus 0.0001 0.0000 0.0000 0.2597
nox 0.0000 0.0000 0.0000 0.2970
dis 0.0000 0.0000 0.0000 0.2778
medv 0.0000 0.0000 0.0000 0.4202
ptratio 0.0030 0.0041 0.0063 0.1138
age 0.1427 0.0474 0.0067 0.1742
lstat 0.3345 0.0646 0.1299 0.2179
zn 0.0026 0.0938 0.2295 0.0582
tax 0.1097 0.1375 0.2439 0.3689
rm 0.2118 0.3641 0.5086 0.0678
black 0.1386 0.4742 0.5436 0.1498
rad 0.6234 0.6130 0.4823 0.4000
chas 0.2094 NA NA 0.0031
data(Weekly)
cat("Dimensions:", dim(Weekly), "\n")
## Dimensions: 1089 9
prop_tbl <- prop.table(table(Weekly$Direction))
cat(sprintf("Up: %.1f%% | Down: %.1f%%\n",
            prop_tbl["Up"]*100, prop_tbl["Down"]*100))
## Up: 55.6% | Down: 44.4%
cat(sprintf("Naïve 'always-Up' baseline accuracy: %.1f%%\n",
            prop_tbl["Up"]*100))
## Naïve 'always-Up' baseline accuracy: 55.6%
summary(Weekly[, c("Lag1","Lag2","Lag3","Lag4","Lag5","Volume","Today")])
##       Lag1               Lag2               Lag3               Lag4         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580   1st Qu.: -1.1580  
##  Median :  0.2410   Median :  0.2410   Median :  0.2410   Median :  0.2380  
##  Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472   Mean   :  0.1458  
##  3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260
p_vol <- ggplot(Weekly, aes(x = Year, y = Volume)) +
  geom_point(alpha = 0.12, size = 0.8, colour = "#2C3E50") +
  geom_smooth(method = "loess", colour = "#E74C3C", se = TRUE,
              fill = "#FADBD8", linewidth = 1) +
  geom_vline(xintercept = c(2000, 2008), linetype = "dotted",
             colour = "grey40") +
  annotate("text", x = c(2000.2, 2008.2), y = c(7, 7),
           label = c("Dot-com →","GFC →"),
           hjust = 0, size = 3, colour = "grey30") +
  labs(title = "Figure 10. Weekly Trading Volume (1990–2010)",
       subtitle = "Structural breaks coincide with major market crises",
       y = "Avg. daily volume (billion shares)", x = NULL) +
  theme_afs()

p_ret <- Weekly %>%
  select(Direction, Lag1:Lag5) %>%
  pivot_longer(-Direction, names_to = "Lag", values_to = "Return") %>%
  ggplot(aes(x = Return, fill = Direction)) +
  geom_histogram(alpha = 0.6, bins = 50, position = "identity") +
  scale_fill_manual(values = c(Down = "#E74C3C", Up = "#27AE60")) +
  facet_wrap(~ Lag, nrow = 1) +
  labs(title = "Figure 11. Distribution of Lagged Returns by Direction",
       subtitle = "Distributions are nearly identical — consistent with EMH",
       x = "Lag Return (%)", y = "Count", fill = "Direction") +
  theme_afs(10)

p_cor <- ggcorr(Weekly[, c("Lag1","Lag2","Lag3","Lag4","Lag5",
                             "Volume","Today")],
                label = TRUE, label_round = 2,
                low = "#2980B9", mid = "white", high = "#C0392B",
                name = "r") +
  labs(title = "Figure 12. Correlation Matrix — Weekly Variables") +
  theme_afs(10)

p_vol / p_ret / p_cor
## `geom_smooth()` using formula = 'y ~ x'

logit_full <- glm(Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
                  data = Weekly, family = binomial)
tidy_logit <- tidy(logit_full)

kable(tidy_logit %>%
        mutate(OR = exp(estimate),
               across(where(is.numeric), ~ round(., 4)),
               p.value = format.pval(p.value, digits = 3, eps = 0.001),
               Sig = case_when(as.numeric(p.value) < 0.05 ~ "*",
                               TRUE ~ "")) %>%
        select(term, estimate, OR, std.error, statistic, p.value, Sig),
      caption = "Table 13. Logistic Regression Coefficients — Full Model",
      col.names = c("Term","Log-Odds","Odds Ratio","SE","z","p-value","Sig.")) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(tidy_logit$p.value < 0.05), background = "#D5F5E3")
Table 13. Logistic Regression Coefficients — Full Model
Term Log-Odds Odds Ratio SE z p-value Sig.
(Intercept) 0.2669 1.3059 0.0859 3.1056 0.0019
Lag1 -0.0413 0.9596 0.0264 -1.5626 0.1181
Lag2 0.0584 1.0602 0.0269 2.1754 0.0296
Lag3 -0.0161 0.9841 0.0267 -0.6024 0.5469
Lag4 -0.0278 0.9726 0.0265 -1.0501 0.2937
Lag5 -0.0145 0.9856 0.0264 -0.5485 0.5833
Volume -0.0227 0.9775 0.0369 -0.6163 0.5377
pred_prob_full <- predict(logit_full, type = "response")
pred_class_full <- factor(ifelse(pred_prob_full > 0.5, "Up","Down"),
                          levels = c("Down","Up"))

cm_full <- confusionMatrix(pred_class_full,
                           factor(Weekly$Direction, c("Down","Up")),
                           positive = "Up")

# Heatmap
cm_df <- as.data.frame(cm_full$table) %>%
  group_by(Reference) %>%
  mutate(Rate = Freq / sum(Freq))

ggplot(cm_df, aes(x = Reference, y = Prediction, fill = Rate)) +
  geom_tile(colour = "white", linewidth = 1.5) +
  geom_text(aes(label = sprintf("%d\n(%.1f%%)", Freq, Rate*100)),
            size = 5.5, fontface = "bold") +
  scale_fill_gradient2(low = "#EBF5FB", mid = "#AED6F1",
                       high = "#1A5276", midpoint = 0.5) +
  labs(title    = "Figure 13. Confusion Matrix — Full Logistic Regression (In-Sample)",
       subtitle = "Row-wise rates show severe class asymmetry in predictions",
       x = "Actual Direction", y = "Predicted Direction") +
  theme_afs(13) + theme(legend.position = "none")

metrics_full <- data.frame(
  Metric = c("Overall Accuracy","Sensitivity (Up recall)",
             "Specificity (Down recall)","Positive Predictive Value",
             "Negative Predictive Value","Balanced Accuracy"),
  Value  = c(cm_full$overall["Accuracy"],
             cm_full$byClass["Sensitivity"],
             cm_full$byClass["Specificity"],
             cm_full$byClass["Pos Pred Value"],
             cm_full$byClass["Neg Pred Value"],
             cm_full$byClass["Balanced Accuracy"])
)
kable(metrics_full, digits = 4,
      caption = "Table 14. Full Model Classification Metrics") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(metrics_full$Value < 0.15 | metrics_full$Value > 0.90),
           background = "#FADBD8")
Table 14. Full Model Classification Metrics
Metric Value
Accuracy Overall Accuracy 0.5611
Sensitivity Sensitivity (Up recall) 0.9207
Specificity Specificity (Down recall) 0.1116
Pos Pred Value Positive Predictive Value 0.5643
Neg Pred Value Negative Predictive Value 0.5294
Balanced Accuracy Balanced Accuracy 0.5161
train_idx     <- Weekly$Year <= 2008
Weekly_train  <- Weekly[train_idx, ]
Weekly_test   <- Weekly[!train_idx, ]

logit_lag2    <- glm(Direction ~ Lag2, data = Weekly_train, family = binomial)
pred_prob_l2  <- predict(logit_lag2, Weekly_test, type = "response")
pred_class_l2 <- factor(ifelse(pred_prob_l2 > 0.5,"Up","Down"),
                         levels = c("Down","Up"))

cm_l2 <- confusionMatrix(pred_class_l2,
                          factor(Weekly_test$Direction, c("Down","Up")),
                          positive = "Up")

cat(sprintf("Test Accuracy: %.4f | Sensitivity: %.4f | Specificity: %.4f\n",
            cm_l2$overall["Accuracy"],
            cm_l2$byClass["Sensitivity"],
            cm_l2$byClass["Specificity"]))
## Test Accuracy: 0.6250 | Sensitivity: 0.9180 | Specificity: 0.2093
# LDA
lda_fit  <- lda(Direction ~ Lag2, data = Weekly_train)
lda_pred <- predict(lda_fit, Weekly_test)

# KNN for K = 1, 3, 5, 7, 11, 21
set.seed(42)
train_X <- matrix(Weekly_train$Lag2, ncol = 1)
test_X  <- matrix(Weekly_test$Lag2,  ncol = 1)
train_Y <- Weekly_train$Direction

knn_results <- lapply(c(1, 3, 5, 7, 11, 21), function(k) {
  pred <- knn(train_X, test_X, train_Y, k = k)
  data.frame(K = k,
             Accuracy    = mean(pred == Weekly_test$Direction),
             Sensitivity = mean(pred[Weekly_test$Direction=="Up"] == "Up"),
             Specificity = mean(pred[Weekly_test$Direction=="Down"] == "Down"))
}) %>% bind_rows()
knn_results %>%
  pivot_longer(c(Accuracy, Sensitivity, Specificity),
               names_to = "Metric", values_to = "Value") %>%
  ggplot(aes(x = K, y = Value, colour = Metric, group = Metric)) +
  geom_line(linewidth = 1.1) +
  geom_point(size = 2.5) +
  geom_hline(yintercept = mean(Weekly_test$Direction == "Up"),
             linetype = "dashed", colour = "grey40") +
  annotate("text", x = 3, y = mean(Weekly_test$Direction == "Up") + 0.02,
           label = "Naïve baseline", colour = "grey40", size = 3.2) +
  scale_colour_manual(values = c(Accuracy="#2C3E50",
                                 Sensitivity="#27AE60",
                                 Specificity="#E74C3C")) +
  scale_x_continuous(breaks = c(1,3,5,7,11,21)) +
  labs(title    = "Figure 14. KNN Bias-Variance Tradeoff",
       subtitle = "As K increases, variance decreases but the model converges toward the baseline",
       x = "K (number of neighbours)", y = "Performance",
       colour = "Metric") +
  theme_afs()

test_Y_bin <- as.numeric(Weekly_test$Direction == "Up")

roc_logit <- roc(test_Y_bin, pred_prob_l2,                        quiet = TRUE)
roc_lda   <- roc(test_Y_bin, lda_pred$posterior[,"Up"],           quiet = TRUE)

# KNN=3 "probability" approximation
k3_pred   <- knn(train_X, test_X, train_Y, k=3, prob=TRUE)
k3_prob   <- ifelse(k3_pred=="Up", attr(k3_pred,"prob"), 1-attr(k3_pred,"prob"))
roc_knn3  <- roc(test_Y_bin, k3_prob, quiet = TRUE)

auc_logit <- round(auc(roc_logit), 4)
auc_lda   <- round(auc(roc_lda),   4)
auc_knn3  <- round(auc(roc_knn3),  4)

# Optimal threshold (Youden's J)
youden_logit <- coords(roc_logit, "best", ret=c("threshold","sensitivity","specificity"))

ggroc(list(
  `Logistic Regression` = roc_logit,
  LDA                   = roc_lda,
  `KNN (K=3)`           = roc_knn3
), legacy.axes = TRUE, linewidth = 1.2) +
  geom_abline(slope = 1, intercept = 0,
              linetype = "dashed", colour = "grey60", linewidth = 0.7) +
  geom_point(aes(x = 1 - youden_logit$specificity,
                 y = youden_logit$sensitivity),
             colour = "#E74C3C", size = 4, shape = 18) +
  annotate("text",
           x = 1 - youden_logit$specificity + 0.05,
           y = youden_logit$sensitivity - 0.03,
           label = sprintf("Youden Optimal\n(τ = %.2f)", youden_logit$threshold),
           size = 3, colour = "#E74C3C") +
  scale_colour_manual(values = c("#E74C3C","#3498DB","#27AE60")) +
  labs(
    title    = "Figure 15. ROC Curves — Model Comparison (Test Set: 2009–2010)",
    subtitle = sprintf("AUC: Logistic = %s | LDA = %s | KNN(K=3) = %s  |  Diamond = Youden-optimal threshold",
                       auc_logit, auc_lda, auc_knn3),
    x = "False Positive Rate  (1 – Specificity)",
    y = "True Positive Rate  (Sensitivity)",
    colour = "Model"
  ) +
  theme_afs(12) +
  theme(legend.position = c(0.72, 0.22))

acc_fn  <- function(p, a) mean(p == a)
sens_fn <- function(p, a) mean(p[a=="Up"] == "Up")
spec_fn <- function(p, a) mean(p[a=="Down"] == "Down")

act <- Weekly_test$Direction

perf_df <- bind_rows(
  data.frame(Model="Logistic (Lag2)",
             Accuracy=acc_fn(pred_class_l2,act),
             Sensitivity=sens_fn(pred_class_l2,act),
             Specificity=spec_fn(pred_class_l2,act),
             AUC=auc_logit,
             BV_Profile="Low variance · Some bias"),
  data.frame(Model="LDA (Lag2)",
             Accuracy=acc_fn(as.character(lda_pred$class),act),
             Sensitivity=sens_fn(as.character(lda_pred$class),act),
             Specificity=spec_fn(as.character(lda_pred$class),act),
             AUC=auc_lda,
             BV_Profile="Low variance · Some bias"),
  knn_results %>%
    filter(K %in% c(1, 3, 7, 21)) %>%
    transmute(Model=paste0("KNN (K=",K,")"),
              Accuracy, Sensitivity, Specificity,
              AUC=NA_real_,
              BV_Profile=case_when(
                K==1  ~ "Zero bias · Max variance (overfit)",
                K==3  ~ "Low bias · High variance",
                K==7  ~ "Moderate bias · Moderate variance",
                K==21 ~ "High bias · Low variance (near-baseline)"
              ))
) %>% arrange(desc(Accuracy))

kable(perf_df, digits = 4,
      caption = "Table 15. Full Model Comparison — Test Set (2009–2010)") %>%
  kable_styling(bootstrap_options = c("striped","hover"),
                full_width = FALSE, font_size = 11) %>%
  row_spec(1, bold = TRUE, background = "#D5F5E3") %>%
  column_spec(6, italic = TRUE, color = "grey40")
Table 15. Full Model Comparison — Test Set (2009–2010)
Model Accuracy Sensitivity Specificity AUC BV_Profile
Logistic (Lag2) 0.6250 0.9180 0.2093 0.4537 Low variance · Some bias
LDA (Lag2) 0.6250 0.9180 0.2093 0.4537 Low variance · Some bias
KNN (K=21) 0.5577 0.6557 0.4186 NA High bias · Low variance (near-baseline)
KNN (K=3) 0.5481 0.6885 0.3488 NA Low bias · High variance
KNN (K=7) 0.5481 0.6885 0.3488 NA Moderate bias · Moderate variance
KNN (K=1) 0.5000 0.5082 0.4884 NA Zero bias · Max variance (overfit)
cat("R version:", R.version$version.string, "\n")
## R version: R version 4.5.1 (2025-06-13 ucrt)
cat("Key packages: ISLR2", as.character(packageVersion("ISLR2")),
    "| pROC", as.character(packageVersion("pROC")),
    "| caret", as.character(packageVersion("caret")), "\n")
## Key packages: ISLR2 1.3.2 | pROC 1.19.0.1 | caret 7.0.1