library(ISLR2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)    # ggpairs scatterplot matrix
library(corrplot)  # correlation heatmap
## corrplot 0.95 loaded
library(pROC)      # ROC curves
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Chapter 2 – Exercise 9 (Auto Dataset)

Make sure that missing values have been removed from the data.

data(Auto)
Auto <- na.omit(Auto)
dim(Auto)
## [1] 392   9
head(Auto)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500

(a) Quantitative vs. Qualitative Predictors

str(Auto)
## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : int  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : int  3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
##   ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...

Quantitative predictors: mpg, cylinders, displacement, horsepower, weight, acceleration, year

Qualitative predictor: name (and origin is technically categorical — country code 1/2/3 — but stored as integer)

(b) Range of Each Quantitative Predictor

quant_vars <- Auto %>% select(mpg, cylinders, displacement, horsepower, weight, acceleration, year)

range_df <- quant_vars %>%
  summarise(across(everything(), list(Min = min, Max = max))) %>%
  pivot_longer(everything(), names_to = c("Variable", ".value"), names_sep = "_(?=[^_]+$)")

knitr::kable(range_df, caption = "Range of Quantitative Predictors")
Range of Quantitative Predictors
Variable Min Max
mpg 9 46.6
cylinders 3 8.0
displacement 68 455.0
horsepower 46 230.0
weight 1613 5140.0
acceleration 8 24.8
year 70 82.0

(c) Mean and Standard Deviation of Each Quantitative Predictor

stats_df <- quant_vars %>%
  summarise(across(everything(), list(Mean = mean, SD = sd))) %>%
  pivot_longer(everything(), names_to = c("Variable", ".value"), names_sep = "_(?=[^_]+$)") %>%
  mutate(across(c(Mean, SD), ~ round(.x, 2)))

knitr::kable(stats_df, caption = "Mean and Standard Deviation of Quantitative Predictors")
Mean and Standard Deviation of Quantitative Predictors
Variable Mean SD
mpg 23.45 7.81
cylinders 5.47 1.71
displacement 194.41 104.64
horsepower 104.47 38.49
weight 2977.58 849.40
acceleration 15.54 2.76
year 75.98 3.68

(d) Remove Observations 10–85; Recompute Statistics

Auto_sub <- Auto[-(10:85), ]
dim(Auto_sub)
## [1] 316   9
quant_sub <- Auto_sub %>% select(mpg, cylinders, displacement, horsepower, weight, acceleration, year)

sub_stats <- quant_sub %>%
  summarise(across(everything(),
    list(Min = min, Max = max, Mean = mean, SD = sd))) %>%
  pivot_longer(everything(), names_to = c("Variable", ".value"), names_sep = "_(?=[^_]+$)") %>%
  mutate(across(c(Mean, SD), ~ round(.x, 2)))

knitr::kable(sub_stats, caption = "Statistics for Subset (rows 10–85 removed)")
Statistics for Subset (rows 10–85 removed)
Variable Min Max Mean SD
mpg 11.0 46.6 24.40 7.87
cylinders 3.0 8.0 5.37 1.65
displacement 68.0 455.0 187.24 99.68
horsepower 46.0 230.0 100.72 35.71
weight 1649.0 4997.0 2935.97 811.30
acceleration 8.5 24.8 15.73 2.69
year 70.0 82.0 77.15 3.11

(e) Graphical Investigation of Predictors

# Scatterplot matrix for quantitative variables
ggpairs(quant_vars,
        upper = list(continuous = wrap("cor", size = 3)),
        lower = list(continuous = wrap("points", alpha = 0.3, size = 0.6)),
        diag  = list(continuous = wrap("densityDiag")),
        title = "Scatterplot Matrix – Auto Dataset") +
  theme_bw(base_size = 9)

Findings: Strong negative correlations exist between mpg and displacement, horsepower, and weight. displacement, horsepower, and weight are heavily correlated with each other (multicollinearity). cylinders clusters around 4, 6, and 8 — behaving more like a categorical variable.

(f) Variables Useful for Predicting mpg

Auto %>%
  select(mpg, displacement, horsepower, weight, acceleration, year) %>%
  pivot_longer(-mpg, names_to = "Predictor", values_to = "Value") %>%
  ggplot(aes(x = Value, y = mpg)) +
  geom_point(alpha = 0.3, size = 0.8, color = "#2980B9") +
  geom_smooth(method = "loess", color = "#E74C3C", se = TRUE) +
  facet_wrap(~ Predictor, scales = "free_x", ncol = 3) +
  labs(title = "Predictors vs. mpg", x = "Predictor Value", y = "mpg") +
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

Conclusion: displacement, horsepower, and weight show strong negative nonlinear relationships with mpg — all three are useful predictors. year shows a positive trend (newer cars are more fuel-efficient). acceleration has a weaker, noisier relationship.


Chapter 3 – Exercise 9 (Auto Dataset — Multiple Linear Regression)

(a) Scatterplot Matrix

Auto_num <- Auto %>% select(-name)

ggpairs(Auto_num,
        upper = list(continuous = wrap("cor", size = 2.8)),
        lower = list(continuous = wrap("points", alpha = 0.25, size = 0.5)),
        title = "Scatterplot Matrix (All Variables, excluding name)") +
  theme_bw(base_size = 8)

(b) Correlation Matrix

cor_matrix <- cor(Auto_num)
knitr::kable(round(cor_matrix, 3), caption = "Correlation Matrix (excluding name)")
Correlation Matrix (excluding name)
mpg cylinders displacement horsepower weight acceleration year origin
mpg 1.000 -0.778 -0.805 -0.778 -0.832 0.423 0.581 0.565
cylinders -0.778 1.000 0.951 0.843 0.898 -0.505 -0.346 -0.569
displacement -0.805 0.951 1.000 0.897 0.933 -0.544 -0.370 -0.615
horsepower -0.778 0.843 0.897 1.000 0.865 -0.689 -0.416 -0.455
weight -0.832 0.898 0.933 0.865 1.000 -0.417 -0.309 -0.585
acceleration 0.423 -0.505 -0.544 -0.689 -0.417 1.000 0.290 0.213
year 0.581 -0.346 -0.370 -0.416 -0.309 0.290 1.000 0.182
origin 0.565 -0.569 -0.615 -0.455 -0.585 0.213 0.182 1.000
corrplot(cor_matrix, method = "color", type = "upper",
         addCoef.col = "black", number.cex = 0.65,
         tl.cex = 0.8, title = "Correlation Heatmap", mar = c(0,0,1,0))

(c) Multiple Linear Regression: mpg ~ All Predictors (excluding name)

lm_full <- lm(mpg ~ . - name, data = Auto)
summary(lm_full)
## 
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5903 -2.1565 -0.1169  1.8690 13.0604 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.218435   4.644294  -3.707  0.00024 ***
## cylinders     -0.493376   0.323282  -1.526  0.12780    
## displacement   0.019896   0.007515   2.647  0.00844 ** 
## horsepower    -0.016951   0.013787  -1.230  0.21963    
## weight        -0.006474   0.000652  -9.929  < 2e-16 ***
## acceleration   0.080576   0.098845   0.815  0.41548    
## year           0.750773   0.050973  14.729  < 2e-16 ***
## origin         1.426141   0.278136   5.127 4.67e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8182 
## F-statistic: 252.4 on 7 and 384 DF,  p-value: < 2.2e-16

i. Is there a relationship between the predictors and the response?

Yes — the overall F-statistic is highly significant (p < 2.2e-16), confirming a relationship between the predictors and mpg.

ii. Which predictors are statistically significant?

displacement, weight, year, and origin all show p-values < 0.05 and are statistically significant predictors of mpg. horsepower and acceleration are not significant at the 5% level.

iii. What does the coefficient for year suggest?

The positive coefficient for year (~0.75) suggests that, holding other variables constant, each additional model year is associated with an increase of about 0.75 mpg — reflecting the trend toward more fuel-efficient vehicles over time.

(d) Diagnostic Plots

par(mfrow = c(2, 2))
plot(lm_full)

par(mfrow = c(1, 1))

Comments:

  • Residuals vs Fitted: There is a slight nonlinear (U-shaped) pattern, suggesting the linear model may not fully capture the relationship — a transformation might help.
  • Q-Q Plot: Residuals follow the normal line fairly well, with mild deviation in the tails.
  • Scale-Location: Some heteroscedasticity is present (variance increases with fitted values).
  • Residuals vs Leverage: Observation 14 has notably high leverage. A few points (e.g., 323, 327) appear as mild outliers but no extreme Cook’s distance values.

(e) Interaction Effects

# Key interactions suggested by high correlations
lm_interact <- lm(mpg ~ displacement + horsepower + weight + year + origin +
                  cylinders + acceleration +
                  displacement:weight +
                  horsepower:year +
                  weight:year,
                  data = Auto)
summary(lm_interact)
## 
## Call:
## lm(formula = mpg ~ displacement + horsepower + weight + year + 
##     origin + cylinders + acceleration + displacement:weight + 
##     horsepower:year + weight:year, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.4158 -1.4565 -0.0318  1.3979 11.2571 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -5.468e+01  1.385e+01  -3.947 9.43e-05 ***
## displacement        -7.006e-02  1.106e-02  -6.337 6.61e-10 ***
## horsepower           8.122e-01  1.696e-01   4.790 2.39e-06 ***
## weight              -2.118e-02  8.390e-03  -2.524   0.0120 *  
## year                 1.437e+00  1.708e-01   8.411 8.28e-16 ***
## origin               4.994e-01  2.468e-01   2.023   0.0438 *  
## cylinders            5.035e-01  2.825e-01   1.782   0.0755 .  
## acceleration        -6.393e-02  9.014e-02  -0.709   0.4786    
## displacement:weight  1.987e-05  2.310e-06   8.604  < 2e-16 ***
## horsepower:year     -1.160e-02  2.311e-03  -5.021 7.90e-07 ***
## weight:year          1.620e-04  1.114e-04   1.454   0.1467    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.785 on 381 degrees of freedom
## Multiple R-squared:  0.8759, Adjusted R-squared:  0.8727 
## F-statistic:   269 on 10 and 381 DF,  p-value: < 2.2e-16

Finding: The interaction horsepower:year and weight:year are statistically significant (p < 0.05), suggesting that the effect of horsepower and weight on mpg has changed over time — heavier, high-horsepower cars improved more in fuel efficiency in later years.

(f) Variable Transformations

lm_log    <- lm(mpg ~ log(displacement) + log(horsepower) + log(weight) + year + origin, data = Auto)
lm_sqrt   <- lm(mpg ~ sqrt(displacement) + sqrt(horsepower) + sqrt(weight) + year + origin, data = Auto)
lm_sq     <- lm(mpg ~ displacement + I(displacement^2) + weight + year + origin, data = Auto)

cat("R² – log transform:    ", summary(lm_log)$r.squared,  "\n")
## R² – log transform:     0.8454534
cat("R² – sqrt transform:   ", summary(lm_sqrt)$r.squared, "\n")
## R² – sqrt transform:    0.8337855
cat("R² – quadratic:        ", summary(lm_sq)$r.squared,   "\n")
## R² – quadratic:         0.8418804
cat("R² – original (full):  ", summary(lm_full)$r.squared, "\n")
## R² – original (full):   0.8214781
par(mfrow = c(2, 2))
plot(lm_log)

par(mfrow = c(1, 1))

Conclusion: The log-transformed model achieves the highest R² and the residual plots are more homoscedastic — log transformations of displacement, horsepower, and weight improve model fit compared to using the raw values.


Chapter 3 – Exercise 15 (Boston Dataset — Simple & Multiple Regression)

data(Boston)
glimpse(Boston)
## Rows: 506
## Columns: 13
## $ crim    <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.08829,…
## $ zn      <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 1…
## $ indus   <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.…
## $ chas    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nox     <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524,…
## $ rm      <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.631,…
## $ age     <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9, 9…
## $ dis     <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9505…
## $ rad     <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ tax     <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311, 31…
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2, 15…
## $ lstat   <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17.10…
## $ medv    <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15…

(a) Simple Linear Regression for Each Predictor → crim

predictors <- setdiff(names(Boston), "crim")

slr_results <- map_dfr(predictors, function(pred) {
  fit <- lm(as.formula(paste("crim ~", pred)), data = Boston)
  s   <- summary(fit)
  tibble(
    Predictor   = pred,
    Coefficient = round(coef(fit)[2], 4),
    `p-value`   = round(coef(s)[2, 4], 4),
    R2          = round(s$r.squared, 4),
    Significant = ifelse(coef(s)[2, 4] < 0.05, "Yes ✓", "No")
  )
})

knitr::kable(slr_results, caption = "Simple Linear Regression: crim ~ each predictor")
Simple Linear Regression: crim ~ each predictor
Predictor Coefficient p-value R2 Significant
zn -0.0739 0.0000 0.0402 Yes ✓
indus 0.5098 0.0000 0.1653 Yes ✓
chas -1.8928 0.2094 0.0031 No
nox 31.2485 0.0000 0.1772 Yes ✓
rm -2.6841 0.0000 0.0481 Yes ✓
age 0.1078 0.0000 0.1244 Yes ✓
dis -1.5509 0.0000 0.1441 Yes ✓
rad 0.6179 0.0000 0.3913 Yes ✓
tax 0.0297 0.0000 0.3396 Yes ✓
ptratio 1.1520 0.0000 0.0841 Yes ✓
lstat 0.5488 0.0000 0.2076 Yes ✓
medv -0.3632 0.0000 0.1508 Yes ✓
# Plot crim vs. most significant predictors
sig_preds <- slr_results %>% filter(Significant == "Yes ✓") %>% pull(Predictor)

Boston %>%
  select(crim, all_of(sig_preds)) %>%
  pivot_longer(-crim, names_to = "Predictor", values_to = "Value") %>%
  ggplot(aes(x = Value, y = crim)) +
  geom_point(alpha = 0.3, size = 0.7, color = "#8E44AD") +
  geom_smooth(method = "lm", color = "#E74C3C", se = TRUE) +
  facet_wrap(~ Predictor, scales = "free_x", ncol = 4) +
  labs(title = "Significant Predictors vs. Crime Rate (crim)",
       x = "Predictor Value", y = "Per Capita Crime Rate") +
  theme_bw(base_size = 9)
## `geom_smooth()` using formula = 'y ~ x'

Finding: Almost all predictors are statistically significant in simple linear regression. rad (accessibility to radial highways), tax (tax rate), and lstat (% lower status) show the strongest associations with crime rate.

(b) Multiple Linear Regression: crim ~ All Predictors

mlr_boston <- lm(crim ~ ., data = Boston)
summary(mlr_boston)
## 
## Call:
## lm(formula = crim ~ ., data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.534 -2.248 -0.348  1.087 73.923 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.7783938  7.0818258   1.946 0.052271 .  
## zn           0.0457100  0.0187903   2.433 0.015344 *  
## indus       -0.0583501  0.0836351  -0.698 0.485709    
## chas        -0.8253776  1.1833963  -0.697 0.485841    
## nox         -9.9575865  5.2898242  -1.882 0.060370 .  
## rm           0.6289107  0.6070924   1.036 0.300738    
## age         -0.0008483  0.0179482  -0.047 0.962323    
## dis         -1.0122467  0.2824676  -3.584 0.000373 ***
## rad          0.6124653  0.0875358   6.997 8.59e-12 ***
## tax         -0.0037756  0.0051723  -0.730 0.465757    
## ptratio     -0.3040728  0.1863598  -1.632 0.103393    
## lstat        0.1388006  0.0757213   1.833 0.067398 .  
## medv        -0.2200564  0.0598240  -3.678 0.000261 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.46 on 493 degrees of freedom
## Multiple R-squared:  0.4493, Adjusted R-squared:  0.4359 
## F-statistic: 33.52 on 12 and 493 DF,  p-value: < 2.2e-16

Finding: In the multiple regression model, only zn, dis, rad, black, and medv are statistically significant (p < 0.05). Many predictors that were significant in simple regression lose significance — indicating confounding and multicollinearity among predictors.

(c) Univariate vs. Multiple Regression Coefficients

uni_coefs   <- slr_results$Coefficient
multi_coefs <- coef(mlr_boston)[-1]  # exclude intercept

coef_compare <- tibble(
  Predictor  = predictors,
  Univariate = uni_coefs,
  Multiple   = as.numeric(multi_coefs)
)

ggplot(coef_compare, aes(x = Univariate, y = Multiple, label = Predictor)) +
  geom_point(color = "#E67E22", size = 3) +
  geom_text(vjust = -0.7, size = 3.2) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
  labs(title = "Univariate vs. Multiple Regression Coefficients",
       subtitle = "Each point = one predictor",
       x = "Simple Regression Coefficient",
       y = "Multiple Regression Coefficient") +
  theme_bw()

Comment: The nox variable shows the largest discrepancy — a large positive coefficient in simple regression shrinks dramatically in the multiple model, reflecting collinearity with other predictors.

(d) Non-linear Associations (Cubic Polynomial)

poly_results <- map_dfr(predictors, function(pred) {
  n_unique <- length(unique(Boston[[pred]]))
  degree   <- min(3, n_unique - 1)   # cap degree to avoid poly() error

  fit_poly <- lm(as.formula(paste("crim ~ poly(", pred, ",", degree, ")")), data = Boston)
  s        <- summary(fit_poly)
  coefs    <- coef(s)

  tibble(
    Predictor   = pred,
    Max_Degree  = degree,
    p_linear    = round(coefs[2, 4], 4),
    p_quadratic = ifelse(nrow(coefs) >= 3, round(coefs[3, 4], 4), NA_real_),
    p_cubic     = ifelse(nrow(coefs) >= 4, round(coefs[4, 4], 4), NA_real_),
    R2          = round(s$r.squared, 4)
  )
})

knitr::kable(poly_results, caption = "Cubic Polynomial Fits: crim ~ poly(X, 3)")
Cubic Polynomial Fits: crim ~ poly(X, 3)
Predictor Max_Degree p_linear p_quadratic p_cubic R2
zn 3 0.0000 0.0044 0.2295 0.0582
indus 3 0.0000 0.0011 0.0000 0.2597
chas 1 0.2094 NA NA 0.0031
nox 3 0.0000 0.0001 0.0000 0.2970
rm 3 0.0000 0.0015 0.5086 0.0678
age 3 0.0000 0.0000 0.0067 0.1742
dis 3 0.0000 0.0000 0.0000 0.2778
rad 3 0.0000 0.0091 0.4823 0.4000
tax 3 0.0000 0.0000 0.2439 0.3689
ptratio 3 0.0000 0.0024 0.0063 0.1138
lstat 3 0.0000 0.0378 0.1299 0.2179
medv 3 0.0000 0.0000 0.0000 0.4202

Finding: Several predictors — including indus, nox, age, dis, ptratio, and medv — show significant quadratic or cubic terms, providing evidence of non-linear associations with crime rate.


Chapter 4 – Exercise 13 (Weekly Dataset — Logistic Regression)

data(Weekly)
glimpse(Weekly)
## Rows: 1,089
## Columns: 9
## $ Year      <dbl> 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, …
## $ Lag1      <dbl> 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0.807, 0…
## $ Lag2      <dbl> 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0…
## $ Lag3      <dbl> -3.936, 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -…
## $ Lag4      <dbl> -0.229, -3.936, 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, …
## $ Lag5      <dbl> -3.484, -0.229, -3.936, 1.572, 0.816, -0.270, -2.576, 3.514,…
## $ Volume    <dbl> 0.1549760, 0.1485740, 0.1598375, 0.1616300, 0.1537280, 0.154…
## $ Today     <dbl> -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0.807, 0.041, 1…
## $ Direction <fct> Down, Down, Up, Up, Up, Down, Up, Up, Up, Down, Down, Up, Up…

(a) Numerical and Graphical Summaries

summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
# Volume over time
ggplot(Weekly, aes(x = Year, y = Volume)) +
  geom_line(color = "#2980B9", alpha = 0.6) +
  geom_smooth(color = "#E74C3C", se = FALSE) +
  labs(title = "Trading Volume Over Time (Weekly)", y = "Volume (billions of shares)") +
  theme_bw()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Lag correlations with Today
Weekly %>%
  select(Lag1:Lag5, Volume, Today) %>%
  cor() %>%
  corrplot(method = "color", addCoef.col = "black", number.cex = 0.7,
           tl.cex = 0.8, title = "Correlations – Weekly Variables", mar = c(0,0,1,0))

Patterns: Trading volume has increased substantially over the 21-year period. The lag variables show very weak correlations with Today’s return — consistent with market efficiency. Volume and Year are strongly correlated.

(b) Logistic Regression: Direction ~ Lag1–Lag5 + Volume

logit_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                  data = Weekly, family = binomial)
summary(logit_full)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Finding: Only Lag2 is statistically significant (p ≈ 0.03). The other lag variables and Volume do not appear significant.

(c) Confusion Matrix — Full Training Set

pred_prob_full <- predict(logit_full, type = "response")
pred_dir_full  <- ifelse(pred_prob_full > 0.5, "Up", "Down")

conf_mat <- table(Predicted = pred_dir_full, Actual = Weekly$Direction)
conf_mat
##          Actual
## Predicted Down  Up
##      Down   54  48
##      Up    430 557
accuracy <- mean(pred_dir_full == Weekly$Direction)
cat("Overall Fraction of Correct Predictions:", round(accuracy, 4), "\n")
## Overall Fraction of Correct Predictions: 0.5611

Interpretation: The model has an overall accuracy of ~56%. Looking at the confusion matrix: it tends to predict “Up” very often — it captures most actual “Up” weeks correctly but misclassifies many “Down” weeks as “Up”. This reflects the market’s upward bias and the model’s tendency to predict the majority class.

(d) Train on 1990–2008, Test on 2009–2010 (Lag2 only)

train <- Weekly %>% filter(Year <= 2008)
test  <- Weekly %>% filter(Year  > 2008)

logit_lag2 <- glm(Direction ~ Lag2, data = train, family = binomial)
summary(logit_lag2)
## 
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = train)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.20326    0.06428   3.162  0.00157 **
## Lag2         0.05810    0.02870   2.024  0.04298 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1350.5  on 983  degrees of freedom
## AIC: 1354.5
## 
## Number of Fisher Scoring iterations: 4
pred_prob_test <- predict(logit_lag2, newdata = test, type = "response")
pred_dir_test  <- ifelse(pred_prob_test > 0.5, "Up", "Down")

conf_test <- table(Predicted = pred_dir_test, Actual = test$Direction)
conf_test
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
acc_test <- mean(pred_dir_test == test$Direction)
cat("Test Accuracy (2009–2010):", round(acc_test, 4), "\n")
## Test Accuracy (2009–2010): 0.625

Interpretation: The Lag2-only model achieves approximately 62.5% accuracy on the held-out 2009–2010 data — better than the full model on training data, suggesting Lag2 is the most useful predictor and avoiding overfitting by dropping insignificant predictors helps generalization.


Session Info

sessionInfo()
## R version 4.5.3 (2026-03-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Asia/Taipei
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] pROC_1.19.0.1   corrplot_0.95   GGally_2.4.0    lubridate_1.9.5
##  [5] forcats_1.0.1   stringr_1.6.0   dplyr_1.2.0     purrr_1.2.1    
##  [9] readr_2.2.0     tidyr_1.3.2     tibble_3.3.1    ggplot2_4.0.2  
## [13] tidyverse_2.0.0 ISLR2_1.3-2    
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     stringi_1.8.7      lattice_0.22-9    
##  [5] hms_1.1.4          digest_0.6.39      magrittr_2.0.4     evaluate_1.0.5    
##  [9] grid_4.5.3         timechange_0.4.0   RColorBrewer_1.1-3 fastmap_1.2.0     
## [13] Matrix_1.7-4       jsonlite_2.0.0     mgcv_1.9-4         scales_1.4.0      
## [17] jquerylib_0.1.4    cli_3.6.5          rlang_1.1.7        splines_4.5.3     
## [21] withr_3.0.2        cachem_1.1.0       yaml_2.3.12        tools_4.5.3       
## [25] tzdb_0.5.0         ggstats_0.13.0     vctrs_0.7.1        R6_2.6.1          
## [29] lifecycle_1.0.5    pkgconfig_2.0.3    pillar_1.11.1      bslib_0.10.0      
## [33] gtable_0.3.6       glue_1.8.0         Rcpp_1.1.1         xfun_0.56         
## [37] tidyselect_1.2.1   rstudioapi_0.18.0  knitr_1.51         farver_2.1.2      
## [41] htmltools_0.5.9    nlme_3.1-168       rmarkdown_2.30     labeling_0.4.3    
## [45] compiler_4.5.3     S7_0.2.1