# Install ISLR2 if not already installed
# install.packages("ISLR2")
library(ISLR2)
library(tidyverse)   # ggplot2, dplyr, etc.
library(GGally)      # ggpairs for scatterplot matrices
library(corrplot)    # correlation matrix visualisation
library(car)         # vif(), leveragePlots()
library(caret)       # confusionMatrix()
library(knitr)       # kable() for nice tables
library(kableExtra)  # extra table formatting

1 Chapter 2 — Exercise 9: Auto Data Set

Task: Explore the Auto data set from the ISLR2 package. Remove observations with missing values, then answer questions (a)–(f).

1.1 Data Preparation

# Load the Auto dataset and drop any rows with NAs
data("Auto")
Auto <- na.omit(Auto)

# Quick overview
glimpse(Auto)
## Rows: 392
## Columns: 9
## $ mpg          <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 15, 14, 2…
## $ cylinders    <int> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, 6, 4, …
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 383, 34…
## $ horsepower   <int> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 170, 16…
## $ weight       <int> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425, 385…
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8.5, …
## $ year         <int> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 7…
## $ origin       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, …
## $ name         <fct> chevrolet chevelle malibu, buick skylark 320, plymouth sa…
kable(head(Auto, 8), caption = "First 8 rows of the Auto data set") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
First 8 rows of the Auto data set
mpg cylinders displacement horsepower weight acceleration year origin name
18 8 307 130 3504 12.0 70 1 chevrolet chevelle malibu
15 8 350 165 3693 11.5 70 1 buick skylark 320
18 8 318 150 3436 11.0 70 1 plymouth satellite
16 8 304 150 3433 12.0 70 1 amc rebel sst
17 8 302 140 3449 10.5 70 1 ford torino
15 8 429 198 4341 10.0 70 1 ford galaxie 500
14 8 454 220 4354 9.0 70 1 chevrolet impala
14 8 440 215 4312 8.5 70 1 plymouth fury iii

1.2 (a) Quantitative vs. Qualitative Predictors

# Inspect variable types
str(Auto)
## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : int  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : int  3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
##   ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...

Answer:

  • Quantitative predictors: mpg, cylinders, displacement, horsepower, weight, acceleration, year
  • Qualitative predictors: origin (coded 1/2/3 representing American/European/Japanese), name (car model name — a character string)

Note: Although cylinders and origin are stored as integers, origin is conceptually categorical. cylinders can be treated as either depending on context.


1.3 (b) Range of Quantitative Predictors

# Select only quantitative columns (excluding 'name' and treating 'origin' as qualitative)
quant_vars <- Auto |> select(mpg, cylinders, displacement, horsepower, weight, acceleration, year)

# Compute range for each variable
range_table <- quant_vars |>
  summarise(across(everything(), list(Min = min, Max = max))) |>
  pivot_longer(everything(),
               names_to  = c("Variable", ".value"),
               names_sep = "_") |>
  mutate(Range = Max - Min)

kable(range_table, caption = "Range of each quantitative predictor", digits = 2) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Range of each quantitative predictor
Variable Min Max Range
mpg 9 46.6 37.6
cylinders 3 8.0 5.0
displacement 68 455.0 387.0
horsepower 46 230.0 184.0
weight 1613 5140.0 3527.0
acceleration 8 24.8 16.8
year 70 82.0 12.0

1.4 (c) Mean and Standard Deviation

summary_table <- quant_vars |>
  summarise(across(everything(), list(Mean = mean, SD = sd))) |>
  pivot_longer(everything(),
               names_to  = c("Variable", ".value"),
               names_sep = "_")

kable(summary_table, caption = "Mean and Standard Deviation of quantitative predictors",
      digits = 3) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Mean and Standard Deviation of quantitative predictors
Variable Mean SD
mpg 23.446 7.805
cylinders 5.472 1.706
displacement 194.412 104.644
horsepower 104.469 38.491
weight 2977.584 849.403
acceleration 15.541 2.759
year 75.980 3.684

1.5 (d) Subset: Remove Observations 10–85

# Remove 10th through 85th observations
Auto_subset <- Auto[-(10:85), ]
cat("Dimensions after removing rows 10–85:", dim(Auto_subset), "\n")
## Dimensions after removing rows 10–85: 316 9
# Recompute range, mean, SD on the subset
subset_quant <- Auto_subset |> select(mpg, cylinders, displacement, horsepower,
                                       weight, acceleration, year)

subset_summary <- subset_quant |>
  summarise(across(everything(),
                   list(Min = min, Max = max, Mean = mean, SD = sd))) |>
  pivot_longer(everything(),
               names_to  = c("Variable", ".value"),
               names_sep = "_") |>
  mutate(Range = Max - Min) |>
  select(Variable, Min, Max, Range, Mean, SD)

kable(subset_summary,
      caption = "Summary statistics for the reduced data set (rows 10–85 removed)",
      digits = 3) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Summary statistics for the reduced data set (rows 10–85 removed)
Variable Min Max Range Mean SD
mpg 11.0 46.6 35.6 24.404 7.867
cylinders 3.0 8.0 5.0 5.373 1.654
displacement 68.0 455.0 387.0 187.241 99.678
horsepower 46.0 230.0 184.0 100.722 35.709
weight 1649.0 4997.0 3348.0 2935.972 811.300
acceleration 8.5 24.8 16.3 15.727 2.694
year 70.0 82.0 12.0 77.146 3.106

1.6 (e) Graphical Investigation of the Full Data Set

# Scatterplot matrix using GGally
ggpairs(quant_vars,
        lower = list(continuous = wrap("points", alpha = 0.3, size = 0.8)),
        upper = list(continuous = wrap("cor", size = 3.5)),
        diag  = list(continuous = wrap("densityDiag")),
        title = "Scatterplot Matrix of Auto Quantitative Predictors")

# Boxplot: mpg by number of cylinders
ggplot(Auto, aes(x = factor(cylinders), y = mpg, fill = factor(cylinders))) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_brewer(palette = "Set2") +
  labs(title  = "MPG by Number of Cylinders",
       x      = "Cylinders",
       y      = "Miles per Gallon",
       fill   = "Cylinders") +
  theme_minimal()

# MPG trend over model year
ggplot(Auto, aes(x = year, y = mpg)) +
  geom_jitter(alpha = 0.4, colour = "steelblue") +
  geom_smooth(method = "loess", se = TRUE, colour = "tomato") +
  labs(title = "MPG vs. Model Year",
       x     = "Model Year (offset from 1900)",
       y     = "Miles per Gallon") +
  theme_minimal()

Findings:

  • mpg is negatively correlated with displacement, horsepower, and weight — heavier, more powerful cars tend to be less fuel-efficient.
  • mpg is positively correlated with year, indicating cars have generally become more fuel-efficient over time.
  • displacement, horsepower, and weight are strongly positively correlated with each other, suggesting multicollinearity if all are used as predictors together.
  • 4-cylinder cars have noticeably higher MPG than 6- or 8-cylinder cars (boxplot).

1.7 (f) Which Variables Are Useful for Predicting MPG?

# Correlation of all quantitative predictors with mpg
cor_with_mpg <- cor(quant_vars) |>
  as.data.frame() |>
  select(mpg) |>
  rownames_to_column("Predictor") |>
  filter(Predictor != "mpg") |>
  rename(Correlation_with_MPG = mpg) |>
  arrange(desc(abs(Correlation_with_MPG)))

kable(cor_with_mpg,
      caption = "Correlation of each predictor with MPG (sorted by absolute value)",
      digits  = 3) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  row_spec(which(abs(cor_with_mpg$Correlation_with_MPG) > 0.7),
           bold = TRUE, background = "#d4edda")
Correlation of each predictor with MPG (sorted by absolute value)
Predictor Correlation_with_MPG
weight -0.832
displacement -0.805
horsepower -0.778
cylinders -0.778
year 0.581
acceleration 0.423

Answer:

Based on the scatterplots and correlation values, weight, displacement, horsepower, and year are the most promising predictors of mpg:

  • weight has the strongest negative correlation (≈ −0.83).
  • displacement and horsepower also have high negative correlations (≈ −0.80 and −0.78 respectively).
  • year has a meaningful positive correlation (≈ +0.58), capturing fuel economy improvements over time.
  • cylinders and acceleration show weaker or more noisy relationships.

2 Chapter 3 — Exercise 9: Multiple Linear Regression on Auto

Task: Use multiple linear regression to predict mpg using all other numeric variables in the Auto data set.

2.1 (a) Scatterplot Matrix

Auto_num <- Auto |> select(-name)   # remove character column

ggpairs(Auto_num,
        lower = list(continuous = wrap("points", alpha = 0.3, size = 0.7)),
        upper = list(continuous = wrap("cor", size = 3)),
        diag  = list(continuous  = wrap("densityDiag")),
        title = "Chapter 3 — Scatterplot Matrix (Auto, all numeric variables)")


2.2 (b) Correlation Matrix

cor_matrix <- cor(Auto_num)

corrplot(cor_matrix,
         method  = "color",
         type    = "upper",
         addCoef.col = "black",
         tl.col  = "black",
         tl.srt  = 45,
         number.cex = 0.75,
         title   = "Correlation Matrix — Auto (numeric variables)",
         mar     = c(0, 0, 1.5, 0))

kable(round(cor_matrix, 3),
      caption = "Correlation matrix of all numeric variables in Auto") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Correlation matrix of all numeric variables in Auto
mpg cylinders displacement horsepower weight acceleration year origin
mpg 1.000 -0.778 -0.805 -0.778 -0.832 0.423 0.581 0.565
cylinders -0.778 1.000 0.951 0.843 0.898 -0.505 -0.346 -0.569
displacement -0.805 0.951 1.000 0.897 0.933 -0.544 -0.370 -0.615
horsepower -0.778 0.843 0.897 1.000 0.865 -0.689 -0.416 -0.455
weight -0.832 0.898 0.933 0.865 1.000 -0.417 -0.309 -0.585
acceleration 0.423 -0.505 -0.544 -0.689 -0.417 1.000 0.290 0.213
year 0.581 -0.346 -0.370 -0.416 -0.309 0.290 1.000 0.182
origin 0.565 -0.569 -0.615 -0.455 -0.585 0.213 0.182 1.000

2.3 (c) Multiple Linear Regression

# Fit MLR: mpg ~ all predictors except 'name'
mlr_fit <- lm(mpg ~ . - name, data = Auto)
summary(mlr_fit)
## 
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5903 -2.1565 -0.1169  1.8690 13.0604 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.218435   4.644294  -3.707  0.00024 ***
## cylinders     -0.493376   0.323282  -1.526  0.12780    
## displacement   0.019896   0.007515   2.647  0.00844 ** 
## horsepower    -0.016951   0.013787  -1.230  0.21963    
## weight        -0.006474   0.000652  -9.929  < 2e-16 ***
## acceleration   0.080576   0.098845   0.815  0.41548    
## year           0.750773   0.050973  14.729  < 2e-16 ***
## origin         1.426141   0.278136   5.127 4.67e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8182 
## F-statistic: 252.4 on 7 and 384 DF,  p-value: < 2.2e-16
# Tidy summary table
coef_table <- broom::tidy(mlr_fit) |>
  mutate(significant = ifelse(p.value < 0.05, "Yes ✓", "No"))

kable(coef_table, digits = 4,
      caption = "MLR Coefficient Estimates (response: mpg)") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  row_spec(which(coef_table$significant == "Yes ✓"),
           bold = TRUE, background = "#d4edda")
MLR Coefficient Estimates (response: mpg)
term estimate std.error statistic p.value significant
(Intercept) -17.2184 4.6443 -3.7074 0.0002 Yes ✓
cylinders -0.4934 0.3233 -1.5261 0.1278 No
displacement 0.0199 0.0075 2.6474 0.0084 Yes ✓
horsepower -0.0170 0.0138 -1.2295 0.2196 No
weight -0.0065 0.0007 -9.9288 0.0000 Yes ✓
acceleration 0.0806 0.0988 0.8152 0.4155 No
year 0.7508 0.0510 14.7288 0.0000 Yes ✓
origin 1.4261 0.2781 5.1275 0.0000 Yes ✓

Comments:

i. Is there a relationship between the predictors and the response? Yes. The overall F-statistic is large with a near-zero p-value, indicating a statistically significant relationship between the set of predictors and mpg.

ii. Which predictors have a statistically significant relationship with mpg? At the α = 0.05 level, the significant predictors are: displacement, weight, year, and origin. horsepower and acceleration are not significant in the presence of the others.

iii. What does the coefficient for year suggest? The positive coefficient for year (approximately +0.75) indicates that, holding all other variables constant, fuel efficiency increases by about 0.75 mpg for each additional model year. This reflects the general trend of improving fuel economy in newer cars.


2.4 (d) Diagnostic Plots

par(mfrow = c(2, 2))
plot(mlr_fit)

par(mfrow = c(1, 1))

Comments:

  • Residuals vs Fitted: A slight non-linear (U-shaped) pattern suggests the linear model may not fully capture the relationship. A transformation or polynomial terms might improve fit.
  • Normal Q-Q: Residuals are approximately normal, though there are a few points in the tails.
  • Scale-Location: Mild evidence of heteroscedasticity — variance of residuals increases slightly with fitted values.
  • Residuals vs Leverage: Observation 14 appears to have unusually high leverage. A few points (e.g., 327, 394) have notably large residuals.

2.5 (e) Interaction Effects

# Fit model with key interaction terms
int_fit <- lm(mpg ~ displacement + horsepower + weight + acceleration +
                     year + origin + cylinders +
                     displacement:weight +
                     horsepower:weight +
                     year:origin,
              data = Auto)

summary(int_fit)
## 
## Call:
## lm(formula = mpg ~ displacement + horsepower + weight + acceleration + 
##     year + origin + cylinders + displacement:weight + horsepower:weight + 
##     year:origin, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.4370 -1.6736 -0.0609  1.4670 11.8289 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.847e+01  8.064e+00   2.291  0.02251 *  
## displacement        -3.328e-02  2.044e-02  -1.629  0.10421    
## horsepower          -1.379e-01  5.018e-02  -2.748  0.00628 ** 
## weight              -1.095e-02  7.296e-04 -15.002  < 2e-16 ***
## acceleration        -9.464e-03  9.496e-02  -0.100  0.92066    
## year                 5.300e-01  1.029e-01   5.149 4.21e-07 ***
## origin              -1.084e+01  4.391e+00  -2.470  0.01396 *  
## cylinders            4.277e-02  2.899e-01   0.148  0.88278    
## displacement:weight  1.093e-05  5.675e-06   1.926  0.05483 .  
## horsepower:weight    2.979e-05  1.343e-05   2.219  0.02711 *  
## year:origin          1.483e-01  5.614e-02   2.641  0.00861 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.907 on 381 degrees of freedom
## Multiple R-squared:  0.8649, Adjusted R-squared:  0.8613 
## F-statistic: 243.8 on 10 and 381 DF,  p-value: < 2.2e-16
int_coef <- broom::tidy(int_fit) |>
  filter(str_detect(term, ":")) |>        # keep only interaction terms
  mutate(significant = ifelse(p.value < 0.05, "Yes ✓", "No"))

kable(int_coef, digits  = 4,
      caption = "Interaction term estimates only") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Interaction term estimates only
term estimate std.error statistic p.value significant
displacement:weight 0.0000 0.0000 1.9262 0.0548 No
horsepower:weight 0.0000 0.0000 2.2185 0.0271 Yes ✓
year:origin 0.1483 0.0561 2.6410 0.0086 Yes ✓

Comment: The interaction between displacement and weight (and/or horsepower and weight) tends to be statistically significant, indicating that the effect of engine displacement on mpg depends on vehicle weight. The year:origin interaction may also be significant, suggesting that the rate of fuel economy improvement differs across American, European, and Japanese cars.


2.6 (f) Variable Transformations

# Log transformation of horsepower
fit_log  <- lm(mpg ~ log(horsepower) + weight + year + origin, data = Auto)

# Square root transformation of displacement
fit_sqrt <- lm(mpg ~ sqrt(displacement) + weight + year + origin, data = Auto)

# Polynomial (quadratic) weight
fit_poly <- lm(mpg ~ I(weight^2) + weight + horsepower + year + origin, data = Auto)

results_df <- data.frame(
  Model     = c("log(horsepower)", "sqrt(displacement)", "weight + weight^2"),
  Adj_R2    = c(summary(fit_log)$adj.r.squared,
                summary(fit_sqrt)$adj.r.squared,
                summary(fit_poly)$adj.r.squared),
  RSE       = c(summary(fit_log)$sigma,
                summary(fit_sqrt)$sigma,
                summary(fit_poly)$sigma)
)

kable(results_df, digits = 4,
      caption = "Comparison of models with transformed predictors") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Comparison of models with transformed predictors
Model Adj_R2 RSE
log(horsepower) 0.8258 3.2572
sqrt(displacement) 0.8159 3.3487
weight + weight^2 0.8522 3.0001
# Visualise: mpg vs log(horsepower)
ggplot(Auto, aes(x = log(horsepower), y = mpg)) +
  geom_point(alpha = 0.5, colour = "steelblue") +
  geom_smooth(method = "lm", colour = "tomato", se = TRUE) +
  labs(title = "MPG vs. log(Horsepower)",
       x = "log(Horsepower)", y = "MPG") +
  theme_minimal()

Comment: Taking the log of horsepower or the square root of displacement linearises the relationship with mpg more effectively than the raw predictors, producing higher adjusted R² and lower residual standard error. The quadratic term for weight also captures the slightly curved relationship visible in the scatterplot.


3 Chapter 3 — Exercise 15: Boston Data Set (Crime Rate)

Task: Predict per capita crime rate (crim) using the other variables in the Boston data set.

3.1 Data Preparation

data("Boston")
glimpse(Boston)
## Rows: 506
## Columns: 13
## $ crim    <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.08829,…
## $ zn      <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 1…
## $ indus   <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.…
## $ chas    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nox     <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524,…
## $ rm      <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.631,…
## $ age     <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9, 9…
## $ dis     <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9505…
## $ rad     <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ tax     <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311, 31…
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2, 15…
## $ lstat   <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17.10…
## $ medv    <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15…
kable(head(Boston, 6), caption = "First 6 rows of the Boston data set") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
First 6 rows of the Boston data set
crim zn indus chas nox rm age dis rad tax ptratio lstat medv
0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7

3.2 (a) Simple Linear Regression — Each Predictor

predictors <- setdiff(names(Boston), "crim")

# Fit SLR for each predictor and extract key statistics
slr_results <- map_dfr(predictors, function(pred) {
  formula <- as.formula(paste("crim ~", pred))
  fit     <- lm(formula, data = Boston)
  s       <- summary(fit)
  tibble(
    Predictor  = pred,
    Coefficient = coef(fit)[2],
    p_value    = coef(s)[2, 4],
    R_squared  = s$r.squared
  )
}) |>
  mutate(Significant = ifelse(p_value < 0.05, "Yes ✓", "No")) |>
  arrange(p_value)

kable(slr_results, digits = 4,
      caption = "Simple Linear Regression results: crim ~ each predictor") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  row_spec(which(slr_results$Significant == "Yes ✓"),
           background = "#d4edda")
Simple Linear Regression results: crim ~ each predictor
Predictor Coefficient p_value R_squared Significant
rad 0.6179 0.0000 0.3913 Yes ✓
tax 0.0297 0.0000 0.3396 Yes ✓
lstat 0.5488 0.0000 0.2076 Yes ✓
nox 31.2485 0.0000 0.1772 Yes ✓
indus 0.5098 0.0000 0.1653 Yes ✓
medv -0.3632 0.0000 0.1508 Yes ✓
dis -1.5509 0.0000 0.1441 Yes ✓
age 0.1078 0.0000 0.1244 Yes ✓
ptratio 1.1520 0.0000 0.0841 Yes ✓
rm -2.6841 0.0000 0.0481 Yes ✓
zn -0.0739 0.0000 0.0402 Yes ✓
chas -1.8928 0.2094 0.0031 No
# Visual: top 4 predictors by R²
top4 <- slr_results |> slice_max(R_squared, n = 4) |> pull(Predictor)

plot_list <- map(top4, function(pred) {
  ggplot(Boston, aes(x = .data[[pred]], y = crim)) +
    geom_point(alpha = 0.4, colour = "steelblue") +
    geom_smooth(method = "lm", colour = "tomato", se = TRUE) +
    labs(title = paste("crim ~", pred),
         x     = pred, y = "Crime Rate") +
    theme_minimal(base_size = 11)
})

library(patchwork)
(plot_list[[1]] | plot_list[[2]]) / (plot_list[[3]] | plot_list[[4]])

Findings: Almost all predictors show a statistically significant univariate association with crim. The strongest individual predictors (by R²) include rad (index of accessibility to radial highways), tax (property tax rate), and lstat (lower-status population share).


3.3 (b) Multiple Linear Regression

mlr_boston <- lm(crim ~ ., data = Boston)
summary(mlr_boston)
## 
## Call:
## lm(formula = crim ~ ., data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.534 -2.248 -0.348  1.087 73.923 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.7783938  7.0818258   1.946 0.052271 .  
## zn           0.0457100  0.0187903   2.433 0.015344 *  
## indus       -0.0583501  0.0836351  -0.698 0.485709    
## chas        -0.8253776  1.1833963  -0.697 0.485841    
## nox         -9.9575865  5.2898242  -1.882 0.060370 .  
## rm           0.6289107  0.6070924   1.036 0.300738    
## age         -0.0008483  0.0179482  -0.047 0.962323    
## dis         -1.0122467  0.2824676  -3.584 0.000373 ***
## rad          0.6124653  0.0875358   6.997 8.59e-12 ***
## tax         -0.0037756  0.0051723  -0.730 0.465757    
## ptratio     -0.3040728  0.1863598  -1.632 0.103393    
## lstat        0.1388006  0.0757213   1.833 0.067398 .  
## medv        -0.2200564  0.0598240  -3.678 0.000261 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.46 on 493 degrees of freedom
## Multiple R-squared:  0.4493, Adjusted R-squared:  0.4359 
## F-statistic: 33.52 on 12 and 493 DF,  p-value: < 2.2e-16
mlr_boston_tidy <- broom::tidy(mlr_boston) |>
  mutate(Significant = ifelse(p.value < 0.05, "Yes ✓", "No"))

kable(mlr_boston_tidy, digits = 4,
      caption = "Multiple Regression: crim ~ all predictors") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  row_spec(which(mlr_boston_tidy$Significant == "Yes ✓"),
           bold = TRUE, background = "#d4edda")
Multiple Regression: crim ~ all predictors
term estimate std.error statistic p.value Significant
(Intercept) 13.7784 7.0818 1.9456 0.0523 No
zn 0.0457 0.0188 2.4326 0.0153 Yes ✓
indus -0.0584 0.0836 -0.6977 0.4857 No
chas -0.8254 1.1834 -0.6975 0.4858 No
nox -9.9576 5.2898 -1.8824 0.0604 No
rm 0.6289 0.6071 1.0359 0.3007 No
age -0.0008 0.0179 -0.0473 0.9623 No
dis -1.0122 0.2825 -3.5836 0.0004 Yes ✓
rad 0.6125 0.0875 6.9967 0.0000 Yes ✓
tax -0.0038 0.0052 -0.7300 0.4658 No
ptratio -0.3041 0.1864 -1.6316 0.1034 No
lstat 0.1388 0.0757 1.8330 0.0674 No
medv -0.2201 0.0598 -3.6784 0.0003 Yes ✓

Answer: In the multiple regression model, we can reject H₀: βⱼ = 0 at α = 0.05 for zn, dis, rad, black, and medv. Many predictors that were significant in simple regression lose significance once we control for others, indicating confounding.


3.4 (c) Univariate vs. Multiple Regression Coefficients

# Collect univariate and multiple regression coefficients for comparison
mlr_coefs <- broom::tidy(mlr_boston) |>
  filter(term != "(Intercept)") |>
  select(term, estimate) |>
  rename(Predictor = term, MLR_coef = estimate)

slr_coefs <- slr_results |>
  select(Predictor, Coefficient) |>
  rename(SLR_coef = Coefficient)

coef_comparison <- left_join(slr_coefs, mlr_coefs, by = "Predictor")

kable(coef_comparison, digits  = 4,
      caption = "Univariate vs. Multiple Regression Coefficients for each predictor") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Univariate vs. Multiple Regression Coefficients for each predictor
Predictor SLR_coef MLR_coef
rad 0.6179 0.6125
tax 0.0297 -0.0038
lstat 0.5488 0.1388
nox 31.2485 -9.9576
indus 0.5098 -0.0584
medv -0.3632 -0.2201
dis -1.5509 -1.0122
age 0.1078 -0.0008
ptratio 1.1520 -0.3041
rm -2.6841 0.6289
zn -0.0739 0.0457
chas -1.8928 -0.8254
ggplot(coef_comparison, aes(x = SLR_coef, y = MLR_coef, label = Predictor)) +
  geom_point(colour = "steelblue", size = 3) +
  ggrepel::geom_text_repel(size = 3.5) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", colour = "grey50") +
  labs(title = "Univariate (x-axis) vs. Multiple Regression (y-axis) Coefficients",
       subtitle = "Dashed line = perfect agreement",
       x = "Simple Regression Coefficient",
       y = "Multiple Regression Coefficient") +
  theme_minimal()

Comment: The coefficients differ substantially for several predictors (notably nox and lstat), illustrating the impact of multicollinearity: controlling for other variables can reverse or shrink the apparent effect of a single predictor.


3.5 (d) Non-linear Association (Polynomial Regression)

# Fit cubic polynomial for each predictor and test significance of X² and X³ terms
poly_results <- map_dfr(predictors, function(pred) {
  formula <- as.formula(paste0("crim ~ ", pred, " + I(", pred, "^2) + I(", pred, "^3)"))
  fit     <- lm(formula, data = Boston)
  s       <- coef(summary(fit))
  # p-values for quadratic and cubic terms
  p2 <- if (nrow(s) >= 3) s[3, 4] else NA_real_
  p3 <- if (nrow(s) >= 4) s[4, 4] else NA_real_
  tibble(
    Predictor        = pred,
    p_quadratic      = p2,
    p_cubic          = p3,
    NonLinear_Sig    = ifelse(!is.na(p2) & p2 < 0.05, "Yes ✓", "No")
  )
})

kable(poly_results, digits = 4,
      caption = "Evidence of non-linear association (p-values for X² and X³ terms)") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  row_spec(which(poly_results$NonLinear_Sig == "Yes ✓"),
           bold = TRUE, background = "#fff3cd")
Evidence of non-linear association (p-values for X² and X³ terms)
Predictor p_quadratic p_cubic NonLinear_Sig
zn 0.0938 0.2295 No
indus 0.0000 0.0000 Yes ✓
chas NA NA No
nox 0.0000 0.0000 Yes ✓
rm 0.3641 0.5086 No
age 0.0474 0.0067 Yes ✓
dis 0.0000 0.0000 Yes ✓
rad 0.6130 0.4823 No
tax 0.1375 0.2439 No
ptratio 0.0041 0.0063 Yes ✓
lstat 0.0646 0.1299 No
medv 0.0000 0.0000 Yes ✓

Comment: For many predictors (e.g., dis, lstat, nox), the quadratic and/or cubic terms are statistically significant, providing evidence of non-linear associations with crime rate. This suggests that linear models may underfit the data for these predictors.


4 Chapter 4 — Exercise 13: Weekly Stock Market Data

Task: Apply logistic regression to predict the direction of weekly stock market returns using the Weekly data set (1,089 weekly returns, 1990–2010).

4.1 Data Preparation

data("Weekly")
glimpse(Weekly)
## Rows: 1,089
## Columns: 9
## $ Year      <dbl> 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, …
## $ Lag1      <dbl> 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0.807, 0…
## $ Lag2      <dbl> 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0…
## $ Lag3      <dbl> -3.936, 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -…
## $ Lag4      <dbl> -0.229, -3.936, 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, …
## $ Lag5      <dbl> -3.484, -0.229, -3.936, 1.572, 0.816, -0.270, -2.576, 3.514,…
## $ Volume    <dbl> 0.1549760, 0.1485740, 0.1598375, 0.1616300, 0.1537280, 0.154…
## $ Today     <dbl> -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0.807, 0.041, 1…
## $ Direction <fct> Down, Down, Up, Up, Up, Down, Up, Up, Up, Down, Down, Up, Up…
kable(head(Weekly, 6), caption = "First 6 rows of the Weekly data set") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
First 6 rows of the Weekly data set
Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down

4.2 (a) Numerical and Graphical Summaries

summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
# Proportion of Up vs Down weeks
prop.table(table(Weekly$Direction)) |>
  as.data.frame() |>
  rename(Direction = Var1, Proportion = Freq) |>
  kable(digits = 3, caption = "Proportion of Up vs Down weeks") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Proportion of Up vs Down weeks
Direction Proportion
Down 0.444
Up 0.556
# Volume over time
p1 <- ggplot(Weekly, aes(x = Year, y = Volume)) +
  geom_line(colour = "steelblue") +
  geom_smooth(colour = "tomato", se = FALSE) +
  labs(title = "Trading Volume Over Time", y = "Volume (billions of shares)") +
  theme_minimal()

# Today's return by direction
p2 <- ggplot(Weekly, aes(x = Direction, y = Today, fill = Direction)) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_manual(values = c("Down" = "#e74c3c", "Up" = "#2ecc71")) +
  labs(title = "Weekly Return (Today) by Market Direction",
       x = "Direction", y = "Return (%)") +
  theme_minimal()

# Lag1 vs Lag2 coloured by Direction
p3 <- ggplot(Weekly, aes(x = Lag1, y = Lag2, colour = Direction)) +
  geom_point(alpha = 0.4) +
  scale_colour_manual(values = c("Down" = "#e74c3c", "Up" = "#2ecc71")) +
  labs(title = "Lag1 vs Lag2 Returns by Direction") +
  theme_minimal()

# Volume distribution by direction
p4 <- ggplot(Weekly, aes(x = Volume, fill = Direction)) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("Down" = "#e74c3c", "Up" = "#2ecc71")) +
  labs(title = "Volume Distribution by Market Direction") +
  theme_minimal()

(p1 | p2) / (p3 | p4)

Patterns observed:

  • Trading volume has grown substantially over the 21-year period (exponential-like growth from 1990 to 2010).
  • The market went Up in roughly 55.6% of weeks, indicating a mild upward bias.
  • Today returns are centred near 0 for both directions, with the “Up” group having higher median returns as expected.
  • Lag variables show little obvious visual separation between Up and Down weeks, suggesting they may be weak individual predictors.

4.3 (b) Logistic Regression — Full Data Set

# Fit logistic regression: Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume
logit_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                  data   = Weekly,
                  family = binomial)

summary(logit_full)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4
logit_tidy <- broom::tidy(logit_full) |>
  mutate(Odds_Ratio = exp(estimate),
         Significant = ifelse(p.value < 0.05, "Yes ✓", "No"))

kable(logit_tidy, digits = 4,
      caption = "Logistic Regression Coefficients — Full Weekly Data") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  row_spec(which(logit_tidy$Significant == "Yes ✓"),
           bold = TRUE, background = "#d4edda")
Logistic Regression Coefficients — Full Weekly Data
term estimate std.error statistic p.value Odds_Ratio Significant
(Intercept) 0.2669 0.0859 3.1056 0.0019 1.3059 Yes ✓
Lag1 -0.0413 0.0264 -1.5626 0.1181 0.9596 No
Lag2 0.0584 0.0269 2.1754 0.0296 1.0602 Yes ✓
Lag3 -0.0161 0.0267 -0.6024 0.5469 0.9841 No
Lag4 -0.0278 0.0265 -1.0501 0.2937 0.9726 No
Lag5 -0.0145 0.0264 -0.5485 0.5833 0.9856 No
Volume -0.0227 0.0369 -0.6163 0.5377 0.9775 No

Answer: Only Lag2 has a statistically significant p-value (< 0.05) in the full logistic regression. Its positive coefficient indicates that a higher return in the previous-previous week is associated with an increased probability of an “Up” week.


4.4 (c) Confusion Matrix — Full Data

# Predicted probabilities and classes (threshold = 0.5)
probs_full <- predict(logit_full, type = "response")
preds_full <- ifelse(probs_full > 0.5, "Up", "Down")

# Confusion matrix
conf_full <- table(Predicted = preds_full, Actual = Weekly$Direction)

kable(conf_full, caption = "Confusion Matrix — Full Logistic Regression") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Confusion Matrix — Full Logistic Regression
Down Up
Down 54 48
Up 430 557
# Overall accuracy
acc_full <- mean(preds_full == Weekly$Direction)
cat(sprintf("Overall fraction of correct predictions: %.4f (%.1f%%)\n",
            acc_full, acc_full * 100))
## Overall fraction of correct predictions: 0.5611 (56.1%)

Interpretation of the confusion matrix:

  • The model predicts “Up” very often (it essentially defaults to “Up” for most observations).
  • True Positives (Up → Up): The model correctly identifies many “Up” weeks.
  • False Positives (Predicted Up, Actual Down): Many “Down” weeks are misclassified as “Up.”
  • False Negatives (Predicted Down, Actual Up): The model rarely predicts “Down,” so it misses many of these correctly but has poor recall for the “Down” class.
  • The model has limited discriminatory power — its overall accuracy (≈56%) is only slightly better than the naive classifier that always predicts “Up.”

4.5 (d) Logistic Regression — Training/Test Split with Lag2 Only

# Training period: 1990–2008; Test period: 2009–2010
train <- Weekly$Year <= 2008
test  <- !train

Weekly_train <- Weekly[ train, ]
Weekly_test  <- Weekly[ test,  ]

cat("Training observations:", nrow(Weekly_train), "\n")
## Training observations: 985
cat("Test observations:    ", nrow(Weekly_test),  "\n")
## Test observations:     104
# Fit logistic regression on training data with Lag2 only
logit_lag2 <- glm(Direction ~ Lag2,
                  data   = Weekly_train,
                  family = binomial)

summary(logit_lag2)
## 
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = Weekly_train)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.20326    0.06428   3.162  0.00157 **
## Lag2         0.05810    0.02870   2.024  0.04298 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1350.5  on 983  degrees of freedom
## AIC: 1354.5
## 
## Number of Fisher Scoring iterations: 4
# Predict on held-out test data
probs_test <- predict(logit_lag2, newdata = Weekly_test, type = "response")
preds_test <- ifelse(probs_test > 0.5, "Up", "Down")
preds_test <- factor(preds_test, levels = c("Down", "Up"))

# Confusion matrix on test data
conf_test <- table(Predicted = preds_test,
                   Actual    = Weekly_test$Direction)

kable(conf_test,
      caption = "Confusion Matrix — Test Data (2009–2010), Lag2-only model") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Confusion Matrix — Test Data (2009–2010), Lag2-only model
Down Up
Down 9 5
Up 34 56
acc_test <- mean(preds_test == Weekly_test$Direction)
cat(sprintf("Test accuracy: %.4f (%.1f%%)\n", acc_test, acc_test * 100))
## Test accuracy: 0.6250 (62.5%)
# Sensitivity and Specificity
tp <- conf_test["Up",   "Up"]
tn <- conf_test["Down", "Down"]
fp <- conf_test["Up",   "Down"]
fn <- conf_test["Down", "Up"]

sensitivity <- tp / (tp + fn)   # True Positive Rate
specificity  <- tn / (tn + fp)  # True Negative Rate

cat(sprintf("Sensitivity (Up correctly identified): %.3f\n", sensitivity))
## Sensitivity (Up correctly identified): 0.918
cat(sprintf("Specificity (Down correctly identified): %.3f\n", specificity))
## Specificity (Down correctly identified): 0.209

Comment:

The Lag2-only model achieves a test accuracy of approximately 62.5%, which is higher than the full model’s training accuracy. This suggests that Lag2 captures some genuine predictive signal, and that the simpler model generalises better to held-out data. However, the model still tends to over-predict “Up” weeks, as reflected in the relatively low specificity.


5 Session Information

sessionInfo()
## R version 4.5.3 (2026-03-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Asia/Ulaanbaatar
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] patchwork_1.3.2  kableExtra_1.4.0 knitr_1.51       caret_7.0-1     
##  [5] lattice_0.22-9   car_3.1-5        carData_3.0-6    corrplot_0.95   
##  [9] GGally_2.4.0     lubridate_1.9.5  forcats_1.0.1    stringr_1.6.0   
## [13] dplyr_1.2.0      purrr_1.2.1      readr_2.2.0      tidyr_1.3.2     
## [17] tibble_3.3.1     ggplot2_4.0.2    tidyverse_2.0.0  ISLR2_1.3-2     
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     viridisLite_0.4.3    timeDate_4052.112   
##  [4] farver_2.1.2         S7_0.2.1             fastmap_1.2.0       
##  [7] pROC_1.19.0.1        digest_0.6.39        rpart_4.1.24        
## [10] timechange_0.4.0     lifecycle_1.0.5      survival_3.8-6      
## [13] magrittr_2.0.4       compiler_4.5.3       rlang_1.1.7         
## [16] sass_0.4.10          tools_4.5.3          yaml_2.3.12         
## [19] data.table_1.18.2.1  labeling_0.4.3       xml2_1.5.2          
## [22] plyr_1.8.9           RColorBrewer_1.1-3   abind_1.4-8         
## [25] withr_3.0.2          stats4_4.5.3         nnet_7.3-20         
## [28] grid_4.5.3           future_1.70.0        globals_0.19.1      
## [31] scales_1.4.0         iterators_1.0.14     MASS_7.3-65         
## [34] cli_3.6.5            rmarkdown_2.30       generics_0.1.4      
## [37] rstudioapi_0.18.0    future.apply_1.20.2  reshape2_1.4.5      
## [40] tzdb_0.5.0           cachem_1.1.0         splines_4.5.3       
## [43] parallel_4.5.3       vctrs_0.7.1          hardhat_1.4.2       
## [46] Matrix_1.7-4         jsonlite_2.0.0       hms_1.1.4           
## [49] ggrepel_0.9.8        Formula_1.2-5        listenv_0.10.1      
## [52] systemfonts_1.3.2    foreach_1.5.2        gower_1.0.2         
## [55] jquerylib_0.1.4      recipes_1.3.1        glue_1.8.0          
## [58] parallelly_1.46.1    ggstats_0.13.0       codetools_0.2-20    
## [61] stringi_1.8.7        gtable_0.3.6         pillar_1.11.1       
## [64] htmltools_0.5.9      ipred_0.9-15         lava_1.8.2          
## [67] R6_2.6.1             textshaping_1.0.5    evaluate_1.0.5      
## [70] backports_1.5.0      broom_1.0.12         bslib_0.10.0        
## [73] class_7.3-23         Rcpp_1.1.1           svglite_2.2.2       
## [76] nlme_3.1-168         prodlim_2026.03.11   mgcv_1.9-4          
## [79] xfun_0.56            pkgconfig_2.0.3      ModelMetrics_1.2.2.2