# ── IMPORTANT: MASS is loaded FIRST so that dplyr::select wins when dplyr loads
# ── Data & Modelling ───────────────────────────────────────────────────────────
library(ISLR2) # Auto, Weekly, Boston
## Warning: package 'ISLR2' was built under R version 4.5.3
library(MASS) # LDA — must precede dplyr
## Warning: package 'MASS' was built under R version 4.5.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(car) # VIF calculation
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.3
library(class) # KNN
## Warning: package 'class' was built under R version 4.5.3
# ── Visualisation ──────────────────────────────────────────────────────────────
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(GGally)
## Warning: package 'GGally' was built under R version 4.5.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
library(patchwork) # Compose multi-panel plots
## Warning: package 'patchwork' was built under R version 4.5.3
##
## Attaching package: 'patchwork'
## The following object is masked from 'package:MASS':
##
## area
# ── Data Wrangling — loaded AFTER MASS so dplyr::select takes precedence ───────
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(broom)
## Warning: package 'broom' was built under R version 4.5.3
library(purrr) # map_dfr
##
## Attaching package: 'purrr'
## The following object is masked from 'package:car':
##
## some
# Explicitly bind dplyr verbs to guard against any residual MASS masking
select <- dplyr::select
filter <- dplyr::filter
mutate <- dplyr::mutate
summarise <- dplyr::summarise
# ── Model Evaluation ──────────────────────────────────────────────────────────
library(pROC)
## Warning: package 'pROC' was built under R version 4.5.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# ── Presentation ──────────────────────────────────────────────────────────────
library(knitr)
## Warning: package 'knitr' was built under R version 4.5.3
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.3
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(scales)
## Warning: package 'scales' was built under R version 4.5.3
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
library(ggrepel) # geom_text_repel for Figure 9
## Warning: package 'ggrepel' was built under R version 4.5.3
# ── Consistent plot theme ─────────────────────────────────────────────────────
theme_afs <- function(base_size = 12) {
theme_minimal(base_size = base_size) %+replace%
theme(
plot.title = element_text(face = "bold", size = base_size + 1,
margin = margin(b = 6)),
plot.subtitle = element_text(colour = "grey40", size = base_size - 1,
margin = margin(b = 10)),
plot.caption = element_text(colour = "grey55", size = base_size - 2,
hjust = 0),
panel.grid.minor = element_blank(),
strip.text = element_text(face = "bold")
)
}
PALETTE <- c(American = "#E74C3C", European = "#3498DB", Japanese = "#27AE60")
data(Auto)
Auto <- na.omit(Auto)
Auto$origin <- factor(Auto$origin, 1:3,
c("American", "European", "Japanese"))
cat(sprintf("Dimensions: %d observations × %d variables\n",
nrow(Auto), ncol(Auto)))
## Dimensions: 392 observations × 9 variables
var_meta <- data.frame(
Variable = names(Auto),
Scale = c("Quantitative (ratio)", "Quantitative (discrete)",
"Quantitative (ratio)", "Quantitative (ratio)",
"Quantitative (ratio)", "Quantitative (ratio)",
"Qualitative (ordinal)", "Qualitative (nominal)",
"Qualitative (nominal)"),
Role = c("Response (fuel efficiency, USD-linked operating cost)",
"Engine architecture proxy",
"Engine displacement — cubic inches",
"Rated engine power",
"Curb weight — lbs",
"0–60 mph elapsed time — seconds",
"Model year; encodes regulatory era, not magnitude",
"Manufacturing region (1 = US, 2 = EU, 3 = JP)",
"Unique car identifier — excluded from modelling")
)
kable(var_meta, caption = "Table 1. Variable Taxonomy — Auto Dataset") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE, font_size = 12) %>%
row_spec(7:9, background = "#FFF3CD") %>%
column_spec(2, italic = TRUE)
Table 1. Variable Taxonomy — Auto Dataset
|
Variable
|
Scale
|
Role
|
|
mpg
|
Quantitative (ratio)
|
Response (fuel efficiency, USD-linked operating cost)
|
|
cylinders
|
Quantitative (discrete)
|
Engine architecture proxy
|
|
displacement
|
Quantitative (ratio)
|
Engine displacement — cubic inches
|
|
horsepower
|
Quantitative (ratio)
|
Rated engine power
|
|
weight
|
Quantitative (ratio)
|
Curb weight — lbs
|
|
acceleration
|
Quantitative (ratio)
|
0–60 mph elapsed time — seconds
|
|
year
|
Qualitative (ordinal)
|
Model year; encodes regulatory era, not magnitude
|
|
origin
|
Qualitative (nominal)
|
Manufacturing region (1 = US, 2 = EU, 3 = JP)
|
|
name
|
Qualitative (nominal)
|
Unique car identifier — excluded from modelling
|
q_vars <- c("mpg","cylinders","displacement","horsepower","weight",
"acceleration","year")
range_tbl <- Auto[, q_vars] %>%
summarise(across(everything(),
list(Min = min, Max = max, Range = ~ diff(range(.)),
CV = ~ sd(.) / mean(.) * 100))) %>% # Coefficient of variation
pivot_longer(everything(),
names_to = c("Variable", ".value"),
names_sep = "_")
kable(range_tbl, digits = 2,
caption = "Table 2. Range and Coefficient of Variation (CV, %)") %>%
kable_styling(bootstrap_options = c("striped","hover"),
full_width = FALSE) %>%
column_spec(5, bold = TRUE,
color = ifelse(range_tbl$CV > 30, "#C0392B", "black"))
Table 2. Range and Coefficient of Variation (CV, %)
|
Variable
|
Min
|
Max
|
Range
|
CV
|
|
mpg
|
9
|
46.6
|
37.6
|
33.29
|
|
cylinders
|
3
|
8.0
|
5.0
|
31.17
|
|
displacement
|
68
|
455.0
|
387.0
|
53.83
|
|
horsepower
|
46
|
230.0
|
184.0
|
36.84
|
|
weight
|
1613
|
5140.0
|
3527.0
|
28.53
|
|
acceleration
|
8
|
24.8
|
16.8
|
17.75
|
|
year
|
70
|
82.0
|
12.0
|
4.85
|
stats_tbl <- Auto[, q_vars] %>%
summarise(across(everything(), list(Mean = mean, SD = sd, Median = median,
IQR = IQR))) %>%
pivot_longer(everything(), names_to = c("Variable",".value"),
names_sep = "_") %>%
mutate(Skew_flag = ifelse(abs(Mean - Median) / SD > 0.2, "Skewed ⚠", ""))
kable(stats_tbl, digits = 3,
caption = "Table 3. Descriptive Statistics — Quantitative Predictors") %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
row_spec(which(stats_tbl$Skew_flag != ""), background = "#FCF3CF")
Table 3. Descriptive Statistics — Quantitative Predictors
|
Variable
|
Mean
|
SD
|
Median
|
IQR
|
Skew_flag
|
|
mpg
|
23.446
|
7.805
|
22.75
|
12.00
|
|
|
cylinders
|
5.472
|
1.706
|
4.00
|
4.00
|
Skewed ⚠
|
|
displacement
|
194.412
|
104.644
|
151.00
|
170.75
|
Skewed ⚠
|
|
horsepower
|
104.469
|
38.491
|
93.50
|
51.00
|
Skewed ⚠
|
|
weight
|
2977.584
|
849.403
|
2803.50
|
1389.50
|
Skewed ⚠
|
|
acceleration
|
15.541
|
2.759
|
15.50
|
3.25
|
|
|
year
|
75.980
|
3.684
|
76.00
|
6.00
|
|
Auto_sub <- Auto[-(10:85), ]
compare_stats <- bind_rows(
Auto[, q_vars] %>%
summarise(across(everything(), list(Mean = mean, SD = sd))) %>%
pivot_longer(everything(), names_to = c("Variable",".value"),
names_sep = "_") %>%
mutate(Sample = "Full (n = 392)"),
Auto_sub[, q_vars] %>%
summarise(across(everything(), list(Mean = mean, SD = sd))) %>%
pivot_longer(everything(), names_to = c("Variable",".value"),
names_sep = "_") %>%
mutate(Sample = sprintf("Subset (n = %d)", nrow(Auto_sub)))
) %>%
pivot_wider(names_from = Sample,
values_from = c(Mean, SD), names_glue = "{Sample}_{.value}")
kable(compare_stats, digits = 3,
caption = "Table 4. Full vs. Subset Descriptive Statistics") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE)
Table 4. Full vs. Subset Descriptive Statistics
|
Variable
|
Full (n = 392)_Mean
|
Subset (n = 316)_Mean
|
Full (n = 392)_SD
|
Subset (n = 316)_SD
|
|
mpg
|
23.446
|
24.404
|
7.805
|
7.867
|
|
cylinders
|
5.472
|
5.373
|
1.706
|
1.654
|
|
displacement
|
194.412
|
187.241
|
104.644
|
99.678
|
|
horsepower
|
104.469
|
100.722
|
38.491
|
35.709
|
|
weight
|
2977.584
|
2935.972
|
849.403
|
811.300
|
|
acceleration
|
15.541
|
15.727
|
2.759
|
2.694
|
|
year
|
75.980
|
77.146
|
3.684
|
3.106
|
ggpairs(
Auto[, c("mpg","displacement","horsepower","weight","acceleration")],
aes(colour = Auto$origin, alpha = 0.35),
upper = list(continuous = wrap("cor", size = 3.2, alignPercent = 0.8)),
lower = list(continuous = wrap("points", size = 0.7)),
diag = list(continuous = wrap("densityDiag", alpha = 0.5))
) +
scale_colour_manual(values = PALETTE) +
scale_fill_manual(values = PALETTE) +
labs(title = "Figure 1. Scatterplot Matrix — Auto Dataset",
subtitle = "Colour encodes manufacturing origin; upper triangle shows Pearson r") +
theme_afs(10)
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.

cor_mat <- cor(Auto[, q_vars])
corrplot(cor_mat,
method = "color", type = "upper", order = "hclust",
addCoef.col = "black", number.cex = 0.70,
tl.col = "black", tl.srt = 45,
col = colorRampPalette(c("#2980B9","white","#C0392B"))(200),
title = "Figure 2. Correlation Matrix (hierarchical clustering order)",
mar = c(0, 0, 2.5, 0))

Auto %>%
select(mpg, displacement, horsepower, weight, acceleration, origin) %>%
pivot_longer(c(displacement, horsepower, weight, acceleration),
names_to = "predictor", values_to = "value") %>%
ggplot(aes(x = value, y = mpg)) +
geom_point(aes(colour = origin), alpha = 0.22, size = 1) +
geom_smooth(method = "loess", se = TRUE, colour = "#2C3E50",
linewidth = 1.1, fill = "grey80") +
geom_smooth(method = "lm", se = FALSE, colour = "#E74C3C",
linewidth = 0.7, linetype = "dashed") +
scale_colour_manual(values = PALETTE) +
facet_wrap(~ predictor, scales = "free_x") +
labs(title = "Figure 3. MPG vs. Key Predictors — LOESS vs. Linear Fit",
subtitle = "Dashed red = linear fit; solid black = LOESS. Curvature indicates non-linearity.",
x = NULL, y = "MPG", colour = "Origin",
caption = "Source: ISLR2::Auto") +
theme_afs()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Auto %>%
group_by(year, origin) %>%
summarise(mean_mpg = mean(mpg), se = sd(mpg)/sqrt(n()), .groups = "drop") %>%
ggplot(aes(x = year, y = mean_mpg, colour = origin, fill = origin)) +
geom_ribbon(aes(ymin = mean_mpg - se, ymax = mean_mpg + se),
alpha = 0.15, colour = NA) +
geom_line(linewidth = 1.2) +
geom_point(size = 2.5) +
geom_vline(xintercept = 73, linetype = "dotted", colour = "grey40") +
annotate("text", x = 73.3, y = 14, label = "1973 Oil Crisis →",
hjust = 0, size = 3.2, colour = "grey30") +
scale_colour_manual(values = PALETTE) +
scale_fill_manual(values = PALETTE) +
labs(title = "Figure 4. Mean MPG Trend by Origin and Year (±1 SE)",
x = "Model Year (19xx)", y = "Mean MPG",
colour = "Origin", fill = "Origin") +
theme_afs()

pairs(Auto[, q_vars],
main = "Figure 5. Scatterplot Matrix — All Quantitative Predictors",
pch = 19, cex = 0.4,
col = adjustcolor("#2C3E50", 0.35),
gap = 0.3)

cor_df <- round(cor(Auto[, q_vars]), 3)
kable(cor_df, caption = "Table 5. Pearson Correlation Matrix — Auto") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE, font_size = 11) %>%
column_spec(1, bold = TRUE)
Table 5. Pearson Correlation Matrix — Auto
|
|
mpg
|
cylinders
|
displacement
|
horsepower
|
weight
|
acceleration
|
year
|
|
mpg
|
1.000
|
-0.778
|
-0.805
|
-0.778
|
-0.832
|
0.423
|
0.581
|
|
cylinders
|
-0.778
|
1.000
|
0.951
|
0.843
|
0.898
|
-0.505
|
-0.346
|
|
displacement
|
-0.805
|
0.951
|
1.000
|
0.897
|
0.933
|
-0.544
|
-0.370
|
|
horsepower
|
-0.778
|
0.843
|
0.897
|
1.000
|
0.865
|
-0.689
|
-0.416
|
|
weight
|
-0.832
|
0.898
|
0.933
|
0.865
|
1.000
|
-0.417
|
-0.309
|
|
acceleration
|
0.423
|
-0.505
|
-0.544
|
-0.689
|
-0.417
|
1.000
|
0.290
|
|
year
|
0.581
|
-0.346
|
-0.370
|
-0.416
|
-0.309
|
0.290
|
1.000
|
lm_vif_check <- lm(mpg ~ displacement + horsepower + weight +
acceleration + year, data = Auto)
vif_vals <- car::vif(lm_vif_check)
kable(data.frame(Predictor = names(vif_vals), VIF = round(vif_vals, 2)),
caption = "Table 6. Variance Inflation Factors") %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
row_spec(which(vif_vals > 5), background = "#FADBD8",
bold = TRUE)
Table 6. Variance Inflation Factors
|
|
Predictor
|
VIF
|
|
displacement
|
displacement
|
10.82
|
|
horsepower
|
horsepower
|
9.30
|
|
weight
|
weight
|
10.58
|
|
acceleration
|
acceleration
|
2.62
|
|
year
|
year
|
1.24
|
lm_auto <- lm(mpg ~ . - name, data = Auto)
tidy_auto <- tidy(lm_auto, conf.int = TRUE)
glance_auto <- glance(lm_auto)
# Formatted coefficient table
tidy_auto %>%
mutate(
sig = case_when(
p.value < 0.001 ~ "***",
p.value < 0.01 ~ "**",
p.value < 0.05 ~ "*",
p.value < 0.1 ~ ".",
TRUE ~ ""
),
estimate = round(estimate, 4),
std.error = round(std.error, 4),
statistic = round(statistic, 3),
p.value = format.pval(p.value, digits = 3, eps = 0.001),
conf.low = round(conf.low, 4),
conf.high = round(conf.high, 4)
) %>%
select(term, estimate, std.error, statistic, p.value, sig,
conf.low, conf.high) %>%
kable(caption = "Table 7. OLS Coefficients — mpg ~ All Predictors (except name)",
col.names = c("Term","Estimate","SE","t","p-value","Sig.",
"95% CI Low","95% CI High")) %>%
kable_styling(bootstrap_options = c("striped","hover"),
full_width = FALSE, font_size = 11) %>%
row_spec(which(tidy_auto$p.value < 0.05), background = "#D5F5E3")
Table 7. OLS Coefficients — mpg ~ All Predictors (except name)
|
Term
|
Estimate
|
SE
|
t
|
p-value
|
Sig.
|
95% CI Low
|
95% CI High
|
|
(Intercept)
|
-17.9546
|
4.6769
|
-3.839
|
< 0.001
|
***
|
-27.1503
|
-8.7589
|
|
cylinders
|
-0.4897
|
0.3212
|
-1.524
|
0.12821
|
|
-1.1213
|
0.1419
|
|
displacement
|
0.0240
|
0.0077
|
3.133
|
0.00186
|
**
|
0.0089
|
0.0390
|
|
horsepower
|
-0.0182
|
0.0137
|
-1.326
|
0.18549
|
|
-0.0451
|
0.0088
|
|
weight
|
-0.0067
|
0.0007
|
-10.243
|
< 0.001
|
***
|
-0.0080
|
-0.0054
|
|
acceleration
|
0.0791
|
0.0982
|
0.805
|
0.42110
|
|
-0.1140
|
0.2722
|
|
year
|
0.7770
|
0.0518
|
15.005
|
< 0.001
|
***
|
0.6752
|
0.8788
|
|
originEuropean
|
2.6300
|
0.5664
|
4.643
|
< 0.001
|
***
|
1.5163
|
3.7437
|
|
originJapanese
|
2.8532
|
0.5527
|
5.162
|
< 0.001
|
***
|
1.7665
|
3.9400
|
cat(sprintf(
"R²: %.4f | Adj. R²: %.4f | F(%d, %d) = %.1f | p < 2.2e-16\n",
glance_auto$r.squared, glance_auto$adj.r.squared,
glance_auto$df, glance_auto$df.residual, glance_auto$statistic
))
## R²: 0.8242 | Adj. R²: 0.8205 | F(8, 383) = 224.5 | p < 2.2e-16
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))
plot(lm_auto, pch = 19, cex = 0.55,
col = adjustcolor("#2C3E50", 0.5),
which = 1:4)

par(mfrow = c(1, 1))
lm_interact <- lm(mpg ~ displacement + horsepower + weight * year +
acceleration + origin, data = Auto)
tidy(lm_interact, conf.int = TRUE) %>%
mutate(
sig = case_when(p.value < 0.001 ~ "***", p.value < 0.01 ~ "**",
p.value < 0.05 ~ "*", p.value < 0.1 ~ ".", TRUE ~ ""),
across(where(is.numeric), ~ round(., 4)),
p.value = format.pval(p.value, digits = 3, eps = 0.001)
) %>%
kable(caption = "Table 8. Regression with weight × year Interaction") %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
row_spec(which(tidy(lm_interact)$p.value < 0.05), background = "#D5F5E3")
Table 8. Regression with weight × year Interaction
|
term
|
estimate
|
std.error
|
statistic
|
p.value
|
conf.low
|
conf.high
|
sig
|
|
(Intercept)
|
-124.8020
|
13.0660
|
-9.5516
|
<0.001
|
-150.4921
|
-99.1119
|
***
|
|
displacement
|
0.0163
|
0.0054
|
3.0357
|
0.0026
|
0.0057
|
0.0268
|
**
|
|
horsepower
|
-0.0303
|
0.0126
|
-2.4017
|
0.0168
|
-0.0551
|
-0.0055
|
|
|
weight
|
0.0317
|
0.0046
|
6.9624
|
<0.001
|
0.0227
|
0.0406
|
***
|
|
year
|
2.1754
|
0.1704
|
12.7650
|
<0.001
|
1.8403
|
2.5105
|
***
|
|
acceleration
|
0.1472
|
0.0905
|
1.6269
|
0.1046
|
-0.0307
|
0.3250
|
|
|
originEuropean
|
2.6955
|
0.5204
|
5.1801
|
<0.001
|
1.6724
|
3.7187
|
***
|
|
originJapanese
|
2.3094
|
0.5085
|
4.5412
|
<0.001
|
1.3095
|
3.3093
|
***
|
|
weight:year
|
-0.0005
|
0.0001
|
-8.5380
|
<0.001
|
-0.0006
|
-0.0004
|
***
|
# Visualise the weight × year interaction
year_cuts <- quantile(Auto$year, c(0.1, 0.5, 0.9))
Auto_interact <- Auto %>%
mutate(year_group = cut(year,
breaks = c(-Inf, 73, 76, Inf),
labels = c("Early (≤73)","Mid (74–76)","Late (77+)")))
ggplot(Auto_interact, aes(x = weight, y = mpg, colour = year_group)) +
geom_point(alpha = 0.3, size = 1.3) +
geom_smooth(method = "lm", se = TRUE, linewidth = 1.1) +
scale_colour_manual(values = c("#E74C3C","#F39C12","#27AE60")) +
labs(title = "Figure 6. Weight × Year Interaction: Slopes Flatten Over Time",
subtitle = "The negative weight–mpg relationship weakens in later model years",
x = "Weight (lbs)", y = "MPG", colour = "Era") +
theme_afs()
## `geom_smooth()` using formula = 'y ~ x'

lm_base <- lm(mpg ~ weight + horsepower + year + origin, data = Auto)
lm_log <- lm(mpg ~ log(weight) + log(horsepower) + year + origin, data = Auto)
lm_sqrt <- lm(mpg ~ sqrt(weight) + sqrt(horsepower) + year + origin, data = Auto)
lm_sq <- lm(mpg ~ I(weight^2) + I(horsepower^2) + year + origin, data = Auto)
transform_tbl <- map_dfr(
list(Base = lm_base, Log = lm_log, Sqrt = lm_sqrt, Quadratic = lm_sq),
~ glance(.x)[c("r.squared","adj.r.squared","sigma","AIC","BIC")],
.id = "Transformation"
) %>%
arrange(AIC)
kable(transform_tbl, digits = 4,
caption = "Table 9. Goodness-of-Fit Comparison Across Transformations") %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
row_spec(1, bold = TRUE, background = "#D5F5E3")
Table 9. Goodness-of-Fit Comparison Across Transformations
|
Transformation
|
r.squared
|
adj.r.squared
|
sigma
|
AIC
|
BIC
|
|
Log
|
0.8460
|
0.8440
|
3.0824
|
2002.949
|
2030.748
|
|
Sqrt
|
0.8339
|
0.8318
|
3.2011
|
2032.570
|
2060.369
|
|
Base
|
0.8194
|
0.8171
|
3.3381
|
2065.442
|
2093.241
|
|
Quadratic
|
0.7869
|
0.7841
|
3.6266
|
2130.433
|
2158.232
|
# Residual comparison between base and log models
resid_compare <- data.frame(
Fitted_base = fitted(lm_base), Resid_base = resid(lm_base),
Fitted_log = fitted(lm_log), Resid_log = resid(lm_log)
)
p1 <- ggplot(resid_compare, aes(Fitted_base, Resid_base)) +
geom_point(alpha = 0.3, size = 1, colour = "#E74C3C") +
geom_smooth(se = FALSE, colour = "black", linewidth = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed") +
labs(title = "Base Model", x = "Fitted", y = "Residuals") + theme_afs(11)
p2 <- ggplot(resid_compare, aes(Fitted_log, Resid_log)) +
geom_point(alpha = 0.3, size = 1, colour = "#27AE60") +
geom_smooth(se = FALSE, colour = "black", linewidth = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed") +
labs(title = "Log-transformed Model", x = "Fitted", y = "Residuals") +
theme_afs(11)
(p1 | p2) +
plot_annotation(
title = "Figure 7. Residual Plots: Base vs. Log Transformation",
subtitle = "Log transformation substantially reduces the non-linear pattern"
)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

data(Boston)
predictors_b <- setdiff(names(Boston), "crim")
slr_results <- lapply(predictors_b, function(v) {
fit <- lm(reformulate(v, "crim"), data = Boston)
s <- summary(fit)
tidy(fit)[2, ] %>%
mutate(Predictor = v,
R2 = s$r.squared,
Direction = ifelse(estimate > 0, "Positive", "Negative"))
}) %>%
bind_rows() %>%
arrange(p.value) %>%
mutate(Sig = case_when(p.value < 0.001 ~ "***", p.value < 0.01 ~ "**",
p.value < 0.05 ~ "*", TRUE ~ "n.s."))
kable(slr_results %>%
select(Predictor, estimate, std.error, statistic, p.value, R2, Sig),
digits = 4,
caption = "Table 10. Simple Regression: Each Predictor vs. crim (sorted by p-value)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE, font_size = 11) %>%
row_spec(which(slr_results$p.value < 0.05), background = "#D5F5E3")
Table 10. Simple Regression: Each Predictor vs. crim (sorted by p-value)
|
Predictor
|
estimate
|
std.error
|
statistic
|
p.value
|
R2
|
Sig
|
|
rad
|
0.6179
|
0.0343
|
17.9982
|
0.0000
|
0.3913
|
***
|
|
tax
|
0.0297
|
0.0018
|
16.0994
|
0.0000
|
0.3396
|
***
|
|
lstat
|
0.5488
|
0.0478
|
11.4907
|
0.0000
|
0.2076
|
***
|
|
nox
|
31.2485
|
2.9992
|
10.4190
|
0.0000
|
0.1772
|
***
|
|
indus
|
0.5098
|
0.0510
|
9.9908
|
0.0000
|
0.1653
|
***
|
|
medv
|
-0.3632
|
0.0384
|
-9.4597
|
0.0000
|
0.1508
|
***
|
|
black
|
-0.0363
|
0.0039
|
-9.3670
|
0.0000
|
0.1483
|
***
|
|
dis
|
-1.5509
|
0.1683
|
-9.2135
|
0.0000
|
0.1441
|
***
|
|
age
|
0.1078
|
0.0127
|
8.4628
|
0.0000
|
0.1244
|
***
|
|
ptratio
|
1.1520
|
0.1694
|
6.8014
|
0.0000
|
0.0841
|
***
|
|
rm
|
-2.6841
|
0.5320
|
-5.0448
|
0.0000
|
0.0481
|
***
|
|
zn
|
-0.0739
|
0.0161
|
-4.5938
|
0.0000
|
0.0402
|
***
|
|
chas
|
-1.8928
|
1.5061
|
-1.2567
|
0.2094
|
0.0031
|
n.s.
|
# Plot top 4 predictors by R²
top4 <- slr_results %>% slice_max(R2, n = 4) %>% pull(Predictor)
Boston %>%
select(crim, all_of(top4)) %>%
pivot_longer(-crim, names_to = "predictor", values_to = "value") %>%
ggplot(aes(x = value, y = crim)) +
geom_point(alpha = 0.2, size = 1, colour = "#2C3E50") +
geom_smooth(method = "lm", colour = "#E74C3C", se = FALSE,
linewidth = 0.9, linetype = "dashed") +
geom_smooth(method = "loess", colour = "#27AE60", se = FALSE,
linewidth = 0.9) +
facet_wrap(~ predictor, scales = "free_x") +
labs(title = "Figure 8. Top-4 Predictors of Crime Rate",
subtitle = "Red dashed = linear fit; green = LOESS. Divergence signals non-linearity.",
x = NULL, y = "Per-capita crime rate") +
theme_afs()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

lm_boston <- lm(crim ~ ., data = Boston)
tidy_boston <- tidy(lm_boston, conf.int = TRUE) %>%
mutate(Sig = case_when(p.value < 0.001 ~ "***", p.value < 0.01 ~ "**",
p.value < 0.05 ~ "*", p.value < 0.1 ~ ".", TRUE ~ ""))
kable(tidy_boston %>%
select(term, estimate, std.error, statistic, p.value, Sig),
digits = 4,
caption = "Table 11. Multiple Regression: crim ~ All Predictors") %>%
kable_styling(bootstrap_options = c("striped","hover"),
full_width = FALSE, font_size = 11) %>%
row_spec(which(tidy_boston$p.value < 0.05), background = "#D5F5E3")
Table 11. Multiple Regression: crim ~ All Predictors
|
term
|
estimate
|
std.error
|
statistic
|
p.value
|
Sig
|
|
(Intercept)
|
17.0332
|
7.2349
|
2.3543
|
0.0189
|
|
|
zn
|
0.0449
|
0.0187
|
2.3943
|
0.0170
|
|
|
indus
|
-0.0639
|
0.0834
|
-0.7656
|
0.4443
|
|
|
chas
|
-0.7491
|
1.1801
|
-0.6348
|
0.5259
|
|
|
nox
|
-10.3135
|
5.2755
|
-1.9550
|
0.0512
|
.
|
|
rm
|
0.4301
|
0.6128
|
0.7019
|
0.4831
|
|
|
age
|
0.0015
|
0.0179
|
0.0810
|
0.9355
|
|
|
dis
|
-0.9872
|
0.2818
|
-3.5029
|
0.0005
|
***
|
|
rad
|
0.5882
|
0.0880
|
6.6804
|
0.0000
|
***
|
|
tax
|
-0.0038
|
0.0052
|
-0.7332
|
0.4638
|
|
|
ptratio
|
-0.2711
|
0.1865
|
-1.4539
|
0.1466
|
|
|
black
|
-0.0075
|
0.0037
|
-2.0520
|
0.0407
|
|
|
lstat
|
0.1262
|
0.0757
|
1.6667
|
0.0962
|
.
|
|
medv
|
-0.1989
|
0.0605
|
-3.2865
|
0.0011
|
**
|
mlr_coef <- setNames(tidy_boston$estimate[-1], tidy_boston$term[-1])
slr_coef <- setNames(slr_results$estimate, slr_results$Predictor)
common <- intersect(names(slr_coef), names(mlr_coef))
coef_df <- data.frame(
Predictor = common,
SLR = slr_coef[common],
MLR = mlr_coef[common]
) %>%
mutate(
Shift = MLR - SLR,
Direction = ifelse(Shift > 0, "Upward", "Downward"),
label_pos = ifelse(abs(Shift) > 0.3 * max(abs(SLR)), Predictor, "")
)
ggplot(coef_df, aes(x = SLR, y = MLR, colour = Direction, label = Predictor)) +
geom_hline(yintercept = 0, linetype = "dashed", colour = "grey60") +
geom_vline(xintercept = 0, linetype = "dashed", colour = "grey60") +
geom_abline(slope = 1, intercept = 0, colour = "grey80", linewidth = 0.8) +
geom_point(size = 3.5, alpha = 0.85) +
ggrepel::geom_text_repel(size = 3.2, colour = "#2C3E50",
max.overlaps = 15) +
scale_colour_manual(values = c(Upward = "#27AE60", Downward = "#E74C3C")) +
labs(title = "Figure 9. Univariate vs. Multiple Regression Coefficients",
subtitle = "Points on the diagonal (grey) indicate unchanged estimates; deviations reveal confounding.",
x = "Simple Regression Coefficient",
y = "Multiple Regression Coefficient",
colour = "Direction of Shift",
caption = "Source: ISLR2::Boston") +
theme_afs()

poly_tbl <- map_dfr(predictors_b, function(v) {
fit_poly <- lm(reformulate(
c(v, paste0("I(",v,"^2)"), paste0("I(",v,"^3)")), "crim"),
data = Boston)
s <- summary(fit_poly)
c_mat <- coef(s)
nrows <- nrow(c_mat)
p2 <- ifelse(nrows >= 3, round(c_mat[3, 4], 4), NA_real_)
p3 <- ifelse(nrows >= 4, round(c_mat[4, 4], 4), NA_real_)
data.frame(
Predictor = v,
p_linear = round(c_mat[2, 4], 4),
p_quadratic = p2,
p_cubic = p3,
R2_poly = round(s$r.squared, 4),
NonLinear = ifelse(!is.na(p2) & p2 < 0.05 |
!is.na(p3) & p3 < 0.05, "✓", "")
)
}) %>% arrange(desc(NonLinear), p_quadratic)
kable(poly_tbl,
caption = "Table 12. Non-linear Association Test (Cubic Polynomial)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE, font_size = 11) %>%
row_spec(which(poly_tbl$NonLinear == "✓"), background = "#FCF3CF")
Table 12. Non-linear Association Test (Cubic Polynomial)
|
Predictor
|
p_linear
|
p_quadratic
|
p_cubic
|
R2_poly
|
NonLinear
|
|
indus
|
0.0001
|
0.0000
|
0.0000
|
0.2597
|
✓
|
|
nox
|
0.0000
|
0.0000
|
0.0000
|
0.2970
|
✓
|
|
dis
|
0.0000
|
0.0000
|
0.0000
|
0.2778
|
✓
|
|
medv
|
0.0000
|
0.0000
|
0.0000
|
0.4202
|
✓
|
|
ptratio
|
0.0030
|
0.0041
|
0.0063
|
0.1138
|
✓
|
|
age
|
0.1427
|
0.0474
|
0.0067
|
0.1742
|
✓
|
|
lstat
|
0.3345
|
0.0646
|
0.1299
|
0.2179
|
|
|
zn
|
0.0026
|
0.0938
|
0.2295
|
0.0582
|
|
|
tax
|
0.1097
|
0.1375
|
0.2439
|
0.3689
|
|
|
rm
|
0.2118
|
0.3641
|
0.5086
|
0.0678
|
|
|
black
|
0.1386
|
0.4742
|
0.5436
|
0.1498
|
|
|
rad
|
0.6234
|
0.6130
|
0.4823
|
0.4000
|
|
|
chas
|
0.2094
|
NA
|
NA
|
0.0031
|
|
data(Weekly)
cat("Dimensions:", dim(Weekly), "\n")
## Dimensions: 1089 9
prop_tbl <- prop.table(table(Weekly$Direction))
cat(sprintf("Up: %.1f%% | Down: %.1f%%\n",
prop_tbl["Up"]*100, prop_tbl["Down"]*100))
## Up: 55.6% | Down: 44.4%
cat(sprintf("Naïve 'always-Up' baseline accuracy: %.1f%%\n",
prop_tbl["Up"]*100))
## Naïve 'always-Up' baseline accuracy: 55.6%
summary(Weekly[, c("Lag1","Lag2","Lag3","Lag4","Lag5","Volume","Today")])
## Lag1 Lag2 Lag3 Lag4
## Min. :-18.1950 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580 1st Qu.: -1.1580
## Median : 0.2410 Median : 0.2410 Median : 0.2410 Median : 0.2380
## Mean : 0.1506 Mean : 0.1511 Mean : 0.1472 Mean : 0.1458
## 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. : 12.0260 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag5 Volume Today
## Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. :9.32821 Max. : 12.0260
p_vol <- ggplot(Weekly, aes(x = Year, y = Volume)) +
geom_point(alpha = 0.12, size = 0.8, colour = "#2C3E50") +
geom_smooth(method = "loess", colour = "#E74C3C", se = TRUE,
fill = "#FADBD8", linewidth = 1) +
geom_vline(xintercept = c(2000, 2008), linetype = "dotted",
colour = "grey40") +
annotate("text", x = c(2000.2, 2008.2), y = c(7, 7),
label = c("Dot-com →","GFC →"),
hjust = 0, size = 3, colour = "grey30") +
labs(title = "Figure 10. Weekly Trading Volume (1990–2010)",
subtitle = "Structural breaks coincide with major market crises",
y = "Avg. daily volume (billion shares)", x = NULL) +
theme_afs()
p_ret <- Weekly %>%
select(Direction, Lag1:Lag5) %>%
pivot_longer(-Direction, names_to = "Lag", values_to = "Return") %>%
ggplot(aes(x = Return, fill = Direction)) +
geom_histogram(alpha = 0.6, bins = 50, position = "identity") +
scale_fill_manual(values = c(Down = "#E74C3C", Up = "#27AE60")) +
facet_wrap(~ Lag, nrow = 1) +
labs(title = "Figure 11. Distribution of Lagged Returns by Direction",
subtitle = "Distributions are nearly identical — consistent with EMH",
x = "Lag Return (%)", y = "Count", fill = "Direction") +
theme_afs(10)
p_cor <- ggcorr(Weekly[, c("Lag1","Lag2","Lag3","Lag4","Lag5",
"Volume","Today")],
label = TRUE, label_round = 2,
low = "#2980B9", mid = "white", high = "#C0392B",
name = "r") +
labs(title = "Figure 12. Correlation Matrix — Weekly Variables") +
theme_afs(10)
p_vol / p_ret / p_cor
## `geom_smooth()` using formula = 'y ~ x'

logit_full <- glm(Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
data = Weekly, family = binomial)
tidy_logit <- tidy(logit_full)
kable(tidy_logit %>%
mutate(OR = exp(estimate),
across(where(is.numeric), ~ round(., 4)),
p.value = format.pval(p.value, digits = 3, eps = 0.001),
Sig = case_when(as.numeric(p.value) < 0.05 ~ "*",
TRUE ~ "")) %>%
select(term, estimate, OR, std.error, statistic, p.value, Sig),
caption = "Table 13. Logistic Regression Coefficients — Full Model",
col.names = c("Term","Log-Odds","Odds Ratio","SE","z","p-value","Sig.")) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
row_spec(which(tidy_logit$p.value < 0.05), background = "#D5F5E3")
Table 13. Logistic Regression Coefficients — Full Model
|
Term
|
Log-Odds
|
Odds Ratio
|
SE
|
z
|
p-value
|
Sig.
|
|
(Intercept)
|
0.2669
|
1.3059
|
0.0859
|
3.1056
|
0.0019
|
|
|
Lag1
|
-0.0413
|
0.9596
|
0.0264
|
-1.5626
|
0.1181
|
|
|
Lag2
|
0.0584
|
1.0602
|
0.0269
|
2.1754
|
0.0296
|
|
|
Lag3
|
-0.0161
|
0.9841
|
0.0267
|
-0.6024
|
0.5469
|
|
|
Lag4
|
-0.0278
|
0.9726
|
0.0265
|
-1.0501
|
0.2937
|
|
|
Lag5
|
-0.0145
|
0.9856
|
0.0264
|
-0.5485
|
0.5833
|
|
|
Volume
|
-0.0227
|
0.9775
|
0.0369
|
-0.6163
|
0.5377
|
|
pred_prob_full <- predict(logit_full, type = "response")
pred_class_full <- factor(ifelse(pred_prob_full > 0.5, "Up","Down"),
levels = c("Down","Up"))
cm_full <- confusionMatrix(pred_class_full,
factor(Weekly$Direction, c("Down","Up")),
positive = "Up")
# Heatmap
cm_df <- as.data.frame(cm_full$table) %>%
group_by(Reference) %>%
mutate(Rate = Freq / sum(Freq))
ggplot(cm_df, aes(x = Reference, y = Prediction, fill = Rate)) +
geom_tile(colour = "white", linewidth = 1.5) +
geom_text(aes(label = sprintf("%d\n(%.1f%%)", Freq, Rate*100)),
size = 5.5, fontface = "bold") +
scale_fill_gradient2(low = "#EBF5FB", mid = "#AED6F1",
high = "#1A5276", midpoint = 0.5) +
labs(title = "Figure 13. Confusion Matrix — Full Logistic Regression (In-Sample)",
subtitle = "Row-wise rates show severe class asymmetry in predictions",
x = "Actual Direction", y = "Predicted Direction") +
theme_afs(13) + theme(legend.position = "none")

metrics_full <- data.frame(
Metric = c("Overall Accuracy","Sensitivity (Up recall)",
"Specificity (Down recall)","Positive Predictive Value",
"Negative Predictive Value","Balanced Accuracy"),
Value = c(cm_full$overall["Accuracy"],
cm_full$byClass["Sensitivity"],
cm_full$byClass["Specificity"],
cm_full$byClass["Pos Pred Value"],
cm_full$byClass["Neg Pred Value"],
cm_full$byClass["Balanced Accuracy"])
)
kable(metrics_full, digits = 4,
caption = "Table 14. Full Model Classification Metrics") %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
row_spec(which(metrics_full$Value < 0.15 | metrics_full$Value > 0.90),
background = "#FADBD8")
Table 14. Full Model Classification Metrics
|
|
Metric
|
Value
|
|
Accuracy
|
Overall Accuracy
|
0.5611
|
|
Sensitivity
|
Sensitivity (Up recall)
|
0.9207
|
|
Specificity
|
Specificity (Down recall)
|
0.1116
|
|
Pos Pred Value
|
Positive Predictive Value
|
0.5643
|
|
Neg Pred Value
|
Negative Predictive Value
|
0.5294
|
|
Balanced Accuracy
|
Balanced Accuracy
|
0.5161
|
train_idx <- Weekly$Year <= 2008
Weekly_train <- Weekly[train_idx, ]
Weekly_test <- Weekly[!train_idx, ]
logit_lag2 <- glm(Direction ~ Lag2, data = Weekly_train, family = binomial)
pred_prob_l2 <- predict(logit_lag2, Weekly_test, type = "response")
pred_class_l2 <- factor(ifelse(pred_prob_l2 > 0.5,"Up","Down"),
levels = c("Down","Up"))
cm_l2 <- confusionMatrix(pred_class_l2,
factor(Weekly_test$Direction, c("Down","Up")),
positive = "Up")
cat(sprintf("Test Accuracy: %.4f | Sensitivity: %.4f | Specificity: %.4f\n",
cm_l2$overall["Accuracy"],
cm_l2$byClass["Sensitivity"],
cm_l2$byClass["Specificity"]))
## Test Accuracy: 0.6250 | Sensitivity: 0.9180 | Specificity: 0.2093
# LDA
lda_fit <- lda(Direction ~ Lag2, data = Weekly_train)
lda_pred <- predict(lda_fit, Weekly_test)
# KNN for K = 1, 3, 5, 7, 11, 21
set.seed(42)
train_X <- matrix(Weekly_train$Lag2, ncol = 1)
test_X <- matrix(Weekly_test$Lag2, ncol = 1)
train_Y <- Weekly_train$Direction
knn_results <- lapply(c(1, 3, 5, 7, 11, 21), function(k) {
pred <- knn(train_X, test_X, train_Y, k = k)
data.frame(K = k,
Accuracy = mean(pred == Weekly_test$Direction),
Sensitivity = mean(pred[Weekly_test$Direction=="Up"] == "Up"),
Specificity = mean(pred[Weekly_test$Direction=="Down"] == "Down"))
}) %>% bind_rows()
knn_results %>%
pivot_longer(c(Accuracy, Sensitivity, Specificity),
names_to = "Metric", values_to = "Value") %>%
ggplot(aes(x = K, y = Value, colour = Metric, group = Metric)) +
geom_line(linewidth = 1.1) +
geom_point(size = 2.5) +
geom_hline(yintercept = mean(Weekly_test$Direction == "Up"),
linetype = "dashed", colour = "grey40") +
annotate("text", x = 3, y = mean(Weekly_test$Direction == "Up") + 0.02,
label = "Naïve baseline", colour = "grey40", size = 3.2) +
scale_colour_manual(values = c(Accuracy="#2C3E50",
Sensitivity="#27AE60",
Specificity="#E74C3C")) +
scale_x_continuous(breaks = c(1,3,5,7,11,21)) +
labs(title = "Figure 14. KNN Bias-Variance Tradeoff",
subtitle = "As K increases, variance decreases but the model converges toward the baseline",
x = "K (number of neighbours)", y = "Performance",
colour = "Metric") +
theme_afs()

test_Y_bin <- as.numeric(Weekly_test$Direction == "Up")
roc_logit <- roc(test_Y_bin, pred_prob_l2, quiet = TRUE)
roc_lda <- roc(test_Y_bin, lda_pred$posterior[,"Up"], quiet = TRUE)
# KNN=3 "probability" approximation
k3_pred <- knn(train_X, test_X, train_Y, k=3, prob=TRUE)
k3_prob <- ifelse(k3_pred=="Up", attr(k3_pred,"prob"), 1-attr(k3_pred,"prob"))
roc_knn3 <- roc(test_Y_bin, k3_prob, quiet = TRUE)
auc_logit <- round(auc(roc_logit), 4)
auc_lda <- round(auc(roc_lda), 4)
auc_knn3 <- round(auc(roc_knn3), 4)
# Optimal threshold (Youden's J)
youden_logit <- coords(roc_logit, "best", ret=c("threshold","sensitivity","specificity"))
ggroc(list(
`Logistic Regression` = roc_logit,
LDA = roc_lda,
`KNN (K=3)` = roc_knn3
), legacy.axes = TRUE, linewidth = 1.2) +
geom_abline(slope = 1, intercept = 0,
linetype = "dashed", colour = "grey60", linewidth = 0.7) +
geom_point(aes(x = 1 - youden_logit$specificity,
y = youden_logit$sensitivity),
colour = "#E74C3C", size = 4, shape = 18) +
annotate("text",
x = 1 - youden_logit$specificity + 0.05,
y = youden_logit$sensitivity - 0.03,
label = sprintf("Youden Optimal\n(τ = %.2f)", youden_logit$threshold),
size = 3, colour = "#E74C3C") +
scale_colour_manual(values = c("#E74C3C","#3498DB","#27AE60")) +
labs(
title = "Figure 15. ROC Curves — Model Comparison (Test Set: 2009–2010)",
subtitle = sprintf("AUC: Logistic = %s | LDA = %s | KNN(K=3) = %s | Diamond = Youden-optimal threshold",
auc_logit, auc_lda, auc_knn3),
x = "False Positive Rate (1 – Specificity)",
y = "True Positive Rate (Sensitivity)",
colour = "Model"
) +
theme_afs(12) +
theme(legend.position = c(0.72, 0.22))

acc_fn <- function(p, a) mean(p == a)
sens_fn <- function(p, a) mean(p[a=="Up"] == "Up")
spec_fn <- function(p, a) mean(p[a=="Down"] == "Down")
act <- Weekly_test$Direction
perf_df <- bind_rows(
data.frame(Model="Logistic (Lag2)",
Accuracy=acc_fn(pred_class_l2,act),
Sensitivity=sens_fn(pred_class_l2,act),
Specificity=spec_fn(pred_class_l2,act),
AUC=auc_logit,
BV_Profile="Low variance · Some bias"),
data.frame(Model="LDA (Lag2)",
Accuracy=acc_fn(as.character(lda_pred$class),act),
Sensitivity=sens_fn(as.character(lda_pred$class),act),
Specificity=spec_fn(as.character(lda_pred$class),act),
AUC=auc_lda,
BV_Profile="Low variance · Some bias"),
knn_results %>%
filter(K %in% c(1, 3, 7, 21)) %>%
transmute(Model=paste0("KNN (K=",K,")"),
Accuracy, Sensitivity, Specificity,
AUC=NA_real_,
BV_Profile=case_when(
K==1 ~ "Zero bias · Max variance (overfit)",
K==3 ~ "Low bias · High variance",
K==7 ~ "Moderate bias · Moderate variance",
K==21 ~ "High bias · Low variance (near-baseline)"
))
) %>% arrange(desc(Accuracy))
kable(perf_df, digits = 4,
caption = "Table 15. Full Model Comparison — Test Set (2009–2010)") %>%
kable_styling(bootstrap_options = c("striped","hover"),
full_width = FALSE, font_size = 11) %>%
row_spec(1, bold = TRUE, background = "#D5F5E3") %>%
column_spec(6, italic = TRUE, color = "grey40")
Table 15. Full Model Comparison — Test Set (2009–2010)
|
Model
|
Accuracy
|
Sensitivity
|
Specificity
|
AUC
|
BV_Profile
|
|
Logistic (Lag2)
|
0.6250
|
0.9180
|
0.2093
|
0.4537
|
Low variance · Some bias
|
|
LDA (Lag2)
|
0.6250
|
0.9180
|
0.2093
|
0.4537
|
Low variance · Some bias
|
|
KNN (K=21)
|
0.5577
|
0.6557
|
0.4186
|
NA
|
High bias · Low variance (near-baseline)
|
|
KNN (K=3)
|
0.5481
|
0.6885
|
0.3488
|
NA
|
Low bias · High variance
|
|
KNN (K=7)
|
0.5481
|
0.6885
|
0.3488
|
NA
|
Moderate bias · Moderate variance
|
|
KNN (K=1)
|
0.5000
|
0.5082
|
0.4884
|
NA
|
Zero bias · Max variance (overfit)
|
cat("R version:", R.version$version.string, "\n")
## R version: R version 4.5.1 (2025-06-13 ucrt)
cat("Key packages: ISLR2", as.character(packageVersion("ISLR2")),
"| pROC", as.character(packageVersion("pROC")),
"| caret", as.character(packageVersion("caret")), "\n")
## Key packages: ISLR2 1.3.2 | pROC 1.19.0.1 | caret 7.0.1