library(ISLR2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally) # ggpairs scatterplot matrix
library(corrplot) # correlation heatmap
## corrplot 0.95 loaded
library(pROC) # ROC curves
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
Auto Dataset)Make sure that missing values have been removed from the data.
data(Auto)
Auto <- na.omit(Auto)
dim(Auto)
## [1] 392 9
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
Quantitative predictors: mpg,
cylinders, displacement,
horsepower, weight, acceleration,
year
Qualitative predictor: name (and
origin is technically categorical — country code 1/2/3 —
but stored as integer)
quant_vars <- Auto %>% select(mpg, cylinders, displacement, horsepower, weight, acceleration, year)
range_df <- quant_vars %>%
summarise(across(everything(), list(Min = min, Max = max))) %>%
pivot_longer(everything(), names_to = c("Variable", ".value"), names_sep = "_(?=[^_]+$)")
knitr::kable(range_df, caption = "Range of Quantitative Predictors")
| Variable | Min | Max |
|---|---|---|
| mpg | 9 | 46.6 |
| cylinders | 3 | 8.0 |
| displacement | 68 | 455.0 |
| horsepower | 46 | 230.0 |
| weight | 1613 | 5140.0 |
| acceleration | 8 | 24.8 |
| year | 70 | 82.0 |
stats_df <- quant_vars %>%
summarise(across(everything(), list(Mean = mean, SD = sd))) %>%
pivot_longer(everything(), names_to = c("Variable", ".value"), names_sep = "_(?=[^_]+$)") %>%
mutate(across(c(Mean, SD), ~ round(.x, 2)))
knitr::kable(stats_df, caption = "Mean and Standard Deviation of Quantitative Predictors")
| Variable | Mean | SD |
|---|---|---|
| mpg | 23.45 | 7.81 |
| cylinders | 5.47 | 1.71 |
| displacement | 194.41 | 104.64 |
| horsepower | 104.47 | 38.49 |
| weight | 2977.58 | 849.40 |
| acceleration | 15.54 | 2.76 |
| year | 75.98 | 3.68 |
Auto_sub <- Auto[-(10:85), ]
dim(Auto_sub)
## [1] 316 9
quant_sub <- Auto_sub %>% select(mpg, cylinders, displacement, horsepower, weight, acceleration, year)
sub_stats <- quant_sub %>%
summarise(across(everything(),
list(Min = min, Max = max, Mean = mean, SD = sd))) %>%
pivot_longer(everything(), names_to = c("Variable", ".value"), names_sep = "_(?=[^_]+$)") %>%
mutate(across(c(Mean, SD), ~ round(.x, 2)))
knitr::kable(sub_stats, caption = "Statistics for Subset (rows 10–85 removed)")
| Variable | Min | Max | Mean | SD |
|---|---|---|---|---|
| mpg | 11.0 | 46.6 | 24.40 | 7.87 |
| cylinders | 3.0 | 8.0 | 5.37 | 1.65 |
| displacement | 68.0 | 455.0 | 187.24 | 99.68 |
| horsepower | 46.0 | 230.0 | 100.72 | 35.71 |
| weight | 1649.0 | 4997.0 | 2935.97 | 811.30 |
| acceleration | 8.5 | 24.8 | 15.73 | 2.69 |
| year | 70.0 | 82.0 | 77.15 | 3.11 |
# Scatterplot matrix for quantitative variables
ggpairs(quant_vars,
upper = list(continuous = wrap("cor", size = 3)),
lower = list(continuous = wrap("points", alpha = 0.3, size = 0.6)),
diag = list(continuous = wrap("densityDiag")),
title = "Scatterplot Matrix – Auto Dataset") +
theme_bw(base_size = 9)
Findings: Strong negative correlations exist between
mpg and displacement, horsepower,
and weight. displacement,
horsepower, and weight are heavily correlated
with each other (multicollinearity). cylinders clusters
around 4, 6, and 8 — behaving more like a categorical variable.
mpgAuto %>%
select(mpg, displacement, horsepower, weight, acceleration, year) %>%
pivot_longer(-mpg, names_to = "Predictor", values_to = "Value") %>%
ggplot(aes(x = Value, y = mpg)) +
geom_point(alpha = 0.3, size = 0.8, color = "#2980B9") +
geom_smooth(method = "loess", color = "#E74C3C", se = TRUE) +
facet_wrap(~ Predictor, scales = "free_x", ncol = 3) +
labs(title = "Predictors vs. mpg", x = "Predictor Value", y = "mpg") +
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
Conclusion: displacement,
horsepower, and weight show strong negative
nonlinear relationships with mpg — all three are useful
predictors. year shows a positive trend (newer cars are
more fuel-efficient). acceleration has a weaker, noisier
relationship.
Auto Dataset — Multiple Linear
Regression)Auto_num <- Auto %>% select(-name)
ggpairs(Auto_num,
upper = list(continuous = wrap("cor", size = 2.8)),
lower = list(continuous = wrap("points", alpha = 0.25, size = 0.5)),
title = "Scatterplot Matrix (All Variables, excluding name)") +
theme_bw(base_size = 8)
cor_matrix <- cor(Auto_num)
knitr::kable(round(cor_matrix, 3), caption = "Correlation Matrix (excluding name)")
| mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | |
|---|---|---|---|---|---|---|---|---|
| mpg | 1.000 | -0.778 | -0.805 | -0.778 | -0.832 | 0.423 | 0.581 | 0.565 |
| cylinders | -0.778 | 1.000 | 0.951 | 0.843 | 0.898 | -0.505 | -0.346 | -0.569 |
| displacement | -0.805 | 0.951 | 1.000 | 0.897 | 0.933 | -0.544 | -0.370 | -0.615 |
| horsepower | -0.778 | 0.843 | 0.897 | 1.000 | 0.865 | -0.689 | -0.416 | -0.455 |
| weight | -0.832 | 0.898 | 0.933 | 0.865 | 1.000 | -0.417 | -0.309 | -0.585 |
| acceleration | 0.423 | -0.505 | -0.544 | -0.689 | -0.417 | 1.000 | 0.290 | 0.213 |
| year | 0.581 | -0.346 | -0.370 | -0.416 | -0.309 | 0.290 | 1.000 | 0.182 |
| origin | 0.565 | -0.569 | -0.615 | -0.455 | -0.585 | 0.213 | 0.182 | 1.000 |
corrplot(cor_matrix, method = "color", type = "upper",
addCoef.col = "black", number.cex = 0.65,
tl.cex = 0.8, title = "Correlation Heatmap", mar = c(0,0,1,0))
mpg ~ All Predictors
(excluding name)lm_full <- lm(mpg ~ . - name, data = Auto)
summary(lm_full)
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
i. Is there a relationship between the predictors and the response?
Yes — the overall F-statistic is highly significant (p < 2.2e-16),
confirming a relationship between the predictors and
mpg.
ii. Which predictors are statistically significant?
displacement, weight, year,
and origin all show p-values < 0.05 and are
statistically significant predictors of mpg.
horsepower and acceleration are not
significant at the 5% level.
iii. What does the coefficient for year
suggest?
The positive coefficient for year (~0.75) suggests that,
holding other variables constant, each additional model year is
associated with an increase of about 0.75 mpg — reflecting the trend
toward more fuel-efficient vehicles over time.
par(mfrow = c(2, 2))
plot(lm_full)
par(mfrow = c(1, 1))
Comments:
# Key interactions suggested by high correlations
lm_interact <- lm(mpg ~ displacement + horsepower + weight + year + origin +
cylinders + acceleration +
displacement:weight +
horsepower:year +
weight:year,
data = Auto)
summary(lm_interact)
##
## Call:
## lm(formula = mpg ~ displacement + horsepower + weight + year +
## origin + cylinders + acceleration + displacement:weight +
## horsepower:year + weight:year, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.4158 -1.4565 -0.0318 1.3979 11.2571
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.468e+01 1.385e+01 -3.947 9.43e-05 ***
## displacement -7.006e-02 1.106e-02 -6.337 6.61e-10 ***
## horsepower 8.122e-01 1.696e-01 4.790 2.39e-06 ***
## weight -2.118e-02 8.390e-03 -2.524 0.0120 *
## year 1.437e+00 1.708e-01 8.411 8.28e-16 ***
## origin 4.994e-01 2.468e-01 2.023 0.0438 *
## cylinders 5.035e-01 2.825e-01 1.782 0.0755 .
## acceleration -6.393e-02 9.014e-02 -0.709 0.4786
## displacement:weight 1.987e-05 2.310e-06 8.604 < 2e-16 ***
## horsepower:year -1.160e-02 2.311e-03 -5.021 7.90e-07 ***
## weight:year 1.620e-04 1.114e-04 1.454 0.1467
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.785 on 381 degrees of freedom
## Multiple R-squared: 0.8759, Adjusted R-squared: 0.8727
## F-statistic: 269 on 10 and 381 DF, p-value: < 2.2e-16
Finding: The interaction
horsepower:year and weight:year are
statistically significant (p < 0.05), suggesting that the effect of
horsepower and weight on mpg has changed over time — heavier,
high-horsepower cars improved more in fuel efficiency in later
years.
lm_log <- lm(mpg ~ log(displacement) + log(horsepower) + log(weight) + year + origin, data = Auto)
lm_sqrt <- lm(mpg ~ sqrt(displacement) + sqrt(horsepower) + sqrt(weight) + year + origin, data = Auto)
lm_sq <- lm(mpg ~ displacement + I(displacement^2) + weight + year + origin, data = Auto)
cat("R² – log transform: ", summary(lm_log)$r.squared, "\n")
## R² – log transform: 0.8454534
cat("R² – sqrt transform: ", summary(lm_sqrt)$r.squared, "\n")
## R² – sqrt transform: 0.8337855
cat("R² – quadratic: ", summary(lm_sq)$r.squared, "\n")
## R² – quadratic: 0.8418804
cat("R² – original (full): ", summary(lm_full)$r.squared, "\n")
## R² – original (full): 0.8214781
par(mfrow = c(2, 2))
plot(lm_log)
par(mfrow = c(1, 1))
Conclusion: The log-transformed model achieves the
highest R² and the residual plots are more homoscedastic — log
transformations of displacement, horsepower,
and weight improve model fit compared to using the raw
values.
Boston Dataset — Simple &
Multiple Regression)data(Boston)
glimpse(Boston)
## Rows: 506
## Columns: 13
## $ crim <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.08829,…
## $ zn <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 1…
## $ indus <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.…
## $ chas <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nox <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524,…
## $ rm <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.631,…
## $ age <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9, 9…
## $ dis <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9505…
## $ rad <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ tax <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311, 31…
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2, 15…
## $ lstat <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17.10…
## $ medv <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15…
crimpredictors <- setdiff(names(Boston), "crim")
slr_results <- map_dfr(predictors, function(pred) {
fit <- lm(as.formula(paste("crim ~", pred)), data = Boston)
s <- summary(fit)
tibble(
Predictor = pred,
Coefficient = round(coef(fit)[2], 4),
`p-value` = round(coef(s)[2, 4], 4),
R2 = round(s$r.squared, 4),
Significant = ifelse(coef(s)[2, 4] < 0.05, "Yes ✓", "No")
)
})
knitr::kable(slr_results, caption = "Simple Linear Regression: crim ~ each predictor")
| Predictor | Coefficient | p-value | R2 | Significant |
|---|---|---|---|---|
| zn | -0.0739 | 0.0000 | 0.0402 | Yes ✓ |
| indus | 0.5098 | 0.0000 | 0.1653 | Yes ✓ |
| chas | -1.8928 | 0.2094 | 0.0031 | No |
| nox | 31.2485 | 0.0000 | 0.1772 | Yes ✓ |
| rm | -2.6841 | 0.0000 | 0.0481 | Yes ✓ |
| age | 0.1078 | 0.0000 | 0.1244 | Yes ✓ |
| dis | -1.5509 | 0.0000 | 0.1441 | Yes ✓ |
| rad | 0.6179 | 0.0000 | 0.3913 | Yes ✓ |
| tax | 0.0297 | 0.0000 | 0.3396 | Yes ✓ |
| ptratio | 1.1520 | 0.0000 | 0.0841 | Yes ✓ |
| lstat | 0.5488 | 0.0000 | 0.2076 | Yes ✓ |
| medv | -0.3632 | 0.0000 | 0.1508 | Yes ✓ |
# Plot crim vs. most significant predictors
sig_preds <- slr_results %>% filter(Significant == "Yes ✓") %>% pull(Predictor)
Boston %>%
select(crim, all_of(sig_preds)) %>%
pivot_longer(-crim, names_to = "Predictor", values_to = "Value") %>%
ggplot(aes(x = Value, y = crim)) +
geom_point(alpha = 0.3, size = 0.7, color = "#8E44AD") +
geom_smooth(method = "lm", color = "#E74C3C", se = TRUE) +
facet_wrap(~ Predictor, scales = "free_x", ncol = 4) +
labs(title = "Significant Predictors vs. Crime Rate (crim)",
x = "Predictor Value", y = "Per Capita Crime Rate") +
theme_bw(base_size = 9)
## `geom_smooth()` using formula = 'y ~ x'
Finding: Almost all predictors are statistically
significant in simple linear regression. rad (accessibility
to radial highways), tax (tax rate), and lstat
(% lower status) show the strongest associations with crime rate.
crim ~ All
Predictorsmlr_boston <- lm(crim ~ ., data = Boston)
summary(mlr_boston)
##
## Call:
## lm(formula = crim ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.534 -2.248 -0.348 1.087 73.923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.7783938 7.0818258 1.946 0.052271 .
## zn 0.0457100 0.0187903 2.433 0.015344 *
## indus -0.0583501 0.0836351 -0.698 0.485709
## chas -0.8253776 1.1833963 -0.697 0.485841
## nox -9.9575865 5.2898242 -1.882 0.060370 .
## rm 0.6289107 0.6070924 1.036 0.300738
## age -0.0008483 0.0179482 -0.047 0.962323
## dis -1.0122467 0.2824676 -3.584 0.000373 ***
## rad 0.6124653 0.0875358 6.997 8.59e-12 ***
## tax -0.0037756 0.0051723 -0.730 0.465757
## ptratio -0.3040728 0.1863598 -1.632 0.103393
## lstat 0.1388006 0.0757213 1.833 0.067398 .
## medv -0.2200564 0.0598240 -3.678 0.000261 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.46 on 493 degrees of freedom
## Multiple R-squared: 0.4493, Adjusted R-squared: 0.4359
## F-statistic: 33.52 on 12 and 493 DF, p-value: < 2.2e-16
Finding: In the multiple regression model, only
zn, dis, rad, black,
and medv are statistically significant (p < 0.05). Many
predictors that were significant in simple regression lose significance
— indicating confounding and multicollinearity among predictors.
uni_coefs <- slr_results$Coefficient
multi_coefs <- coef(mlr_boston)[-1] # exclude intercept
coef_compare <- tibble(
Predictor = predictors,
Univariate = uni_coefs,
Multiple = as.numeric(multi_coefs)
)
ggplot(coef_compare, aes(x = Univariate, y = Multiple, label = Predictor)) +
geom_point(color = "#E67E22", size = 3) +
geom_text(vjust = -0.7, size = 3.2) +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
labs(title = "Univariate vs. Multiple Regression Coefficients",
subtitle = "Each point = one predictor",
x = "Simple Regression Coefficient",
y = "Multiple Regression Coefficient") +
theme_bw()
Comment: The nox variable shows the
largest discrepancy — a large positive coefficient in simple regression
shrinks dramatically in the multiple model, reflecting collinearity with
other predictors.
poly_results <- map_dfr(predictors, function(pred) {
n_unique <- length(unique(Boston[[pred]]))
degree <- min(3, n_unique - 1) # cap degree to avoid poly() error
fit_poly <- lm(as.formula(paste("crim ~ poly(", pred, ",", degree, ")")), data = Boston)
s <- summary(fit_poly)
coefs <- coef(s)
tibble(
Predictor = pred,
Max_Degree = degree,
p_linear = round(coefs[2, 4], 4),
p_quadratic = ifelse(nrow(coefs) >= 3, round(coefs[3, 4], 4), NA_real_),
p_cubic = ifelse(nrow(coefs) >= 4, round(coefs[4, 4], 4), NA_real_),
R2 = round(s$r.squared, 4)
)
})
knitr::kable(poly_results, caption = "Cubic Polynomial Fits: crim ~ poly(X, 3)")
| Predictor | Max_Degree | p_linear | p_quadratic | p_cubic | R2 |
|---|---|---|---|---|---|
| zn | 3 | 0.0000 | 0.0044 | 0.2295 | 0.0582 |
| indus | 3 | 0.0000 | 0.0011 | 0.0000 | 0.2597 |
| chas | 1 | 0.2094 | NA | NA | 0.0031 |
| nox | 3 | 0.0000 | 0.0001 | 0.0000 | 0.2970 |
| rm | 3 | 0.0000 | 0.0015 | 0.5086 | 0.0678 |
| age | 3 | 0.0000 | 0.0000 | 0.0067 | 0.1742 |
| dis | 3 | 0.0000 | 0.0000 | 0.0000 | 0.2778 |
| rad | 3 | 0.0000 | 0.0091 | 0.4823 | 0.4000 |
| tax | 3 | 0.0000 | 0.0000 | 0.2439 | 0.3689 |
| ptratio | 3 | 0.0000 | 0.0024 | 0.0063 | 0.1138 |
| lstat | 3 | 0.0000 | 0.0378 | 0.1299 | 0.2179 |
| medv | 3 | 0.0000 | 0.0000 | 0.0000 | 0.4202 |
Finding: Several predictors — including
indus, nox, age,
dis, ptratio, and medv — show
significant quadratic or cubic terms, providing evidence of non-linear
associations with crime rate.
Weekly Dataset — Logistic
Regression)data(Weekly)
glimpse(Weekly)
## Rows: 1,089
## Columns: 9
## $ Year <dbl> 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, …
## $ Lag1 <dbl> 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0.807, 0…
## $ Lag2 <dbl> 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0…
## $ Lag3 <dbl> -3.936, 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -…
## $ Lag4 <dbl> -0.229, -3.936, 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, …
## $ Lag5 <dbl> -3.484, -0.229, -3.936, 1.572, 0.816, -0.270, -2.576, 3.514,…
## $ Volume <dbl> 0.1549760, 0.1485740, 0.1598375, 0.1616300, 0.1537280, 0.154…
## $ Today <dbl> -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0.807, 0.041, 1…
## $ Direction <fct> Down, Down, Up, Up, Up, Down, Up, Up, Up, Down, Down, Up, Up…
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
# Volume over time
ggplot(Weekly, aes(x = Year, y = Volume)) +
geom_line(color = "#2980B9", alpha = 0.6) +
geom_smooth(color = "#E74C3C", se = FALSE) +
labs(title = "Trading Volume Over Time (Weekly)", y = "Volume (billions of shares)") +
theme_bw()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Lag correlations with Today
Weekly %>%
select(Lag1:Lag5, Volume, Today) %>%
cor() %>%
corrplot(method = "color", addCoef.col = "black", number.cex = 0.7,
tl.cex = 0.8, title = "Correlations – Weekly Variables", mar = c(0,0,1,0))
Patterns: Trading volume has increased substantially
over the 21-year period. The lag variables show very weak correlations
with Today’s return — consistent with market efficiency.
Volume and Year are strongly correlated.
Direction ~ Lag1–Lag5 +
Volumelogit_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly, family = binomial)
summary(logit_full)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Finding: Only Lag2 is statistically
significant (p ≈ 0.03). The other lag variables and Volume
do not appear significant.
pred_prob_full <- predict(logit_full, type = "response")
pred_dir_full <- ifelse(pred_prob_full > 0.5, "Up", "Down")
conf_mat <- table(Predicted = pred_dir_full, Actual = Weekly$Direction)
conf_mat
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
accuracy <- mean(pred_dir_full == Weekly$Direction)
cat("Overall Fraction of Correct Predictions:", round(accuracy, 4), "\n")
## Overall Fraction of Correct Predictions: 0.5611
Interpretation: The model has an overall accuracy of ~56%. Looking at the confusion matrix: it tends to predict “Up” very often — it captures most actual “Up” weeks correctly but misclassifies many “Down” weeks as “Up”. This reflects the market’s upward bias and the model’s tendency to predict the majority class.
train <- Weekly %>% filter(Year <= 2008)
test <- Weekly %>% filter(Year > 2008)
logit_lag2 <- glm(Direction ~ Lag2, data = train, family = binomial)
summary(logit_lag2)
##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
pred_prob_test <- predict(logit_lag2, newdata = test, type = "response")
pred_dir_test <- ifelse(pred_prob_test > 0.5, "Up", "Down")
conf_test <- table(Predicted = pred_dir_test, Actual = test$Direction)
conf_test
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
acc_test <- mean(pred_dir_test == test$Direction)
cat("Test Accuracy (2009–2010):", round(acc_test, 4), "\n")
## Test Accuracy (2009–2010): 0.625
Interpretation: The Lag2-only model achieves approximately 62.5% accuracy on the held-out 2009–2010 data — better than the full model on training data, suggesting Lag2 is the most useful predictor and avoiding overfitting by dropping insignificant predictors helps generalization.
sessionInfo()
## R version 4.5.3 (2026-03-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Asia/Taipei
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] pROC_1.19.0.1 corrplot_0.95 GGally_2.4.0 lubridate_1.9.5
## [5] forcats_1.0.1 stringr_1.6.0 dplyr_1.2.0 purrr_1.2.1
## [9] readr_2.2.0 tidyr_1.3.2 tibble_3.3.1 ggplot2_4.0.2
## [13] tidyverse_2.0.0 ISLR2_1.3-2
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.10 generics_0.1.4 stringi_1.8.7 lattice_0.22-9
## [5] hms_1.1.4 digest_0.6.39 magrittr_2.0.4 evaluate_1.0.5
## [9] grid_4.5.3 timechange_0.4.0 RColorBrewer_1.1-3 fastmap_1.2.0
## [13] Matrix_1.7-4 jsonlite_2.0.0 mgcv_1.9-4 scales_1.4.0
## [17] jquerylib_0.1.4 cli_3.6.5 rlang_1.1.7 splines_4.5.3
## [21] withr_3.0.2 cachem_1.1.0 yaml_2.3.12 tools_4.5.3
## [25] tzdb_0.5.0 ggstats_0.13.0 vctrs_0.7.1 R6_2.6.1
## [29] lifecycle_1.0.5 pkgconfig_2.0.3 pillar_1.11.1 bslib_0.10.0
## [33] gtable_0.3.6 glue_1.8.0 Rcpp_1.1.1 xfun_0.56
## [37] tidyselect_1.2.1 rstudioapi_0.18.0 knitr_1.51 farver_2.1.2
## [41] htmltools_0.5.9 nlme_3.1-168 rmarkdown_2.30 labeling_0.4.3
## [45] compiler_4.5.3 S7_0.2.1