2026Make sure missing values have been removed.
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
# Quantitative: mpg, cylinders, displacement, horsepower, weight, acceleration, year
# Qualitative: name, origin (though origin is coded as integer, it is categorical)
cat("Quantitative predictors: mpg, cylinders, displacement, horsepower, weight, acceleration, year\n")## Quantitative predictors: mpg, cylinders, displacement, horsepower, weight, acceleration, year
## Qualitative predictors: origin (categorical: 1=American, 2=European, 3=Japanese), name
quant_vars <- c("mpg", "cylinders", "displacement", "horsepower",
"weight", "acceleration", "year")
ranges <- sapply(Auto[, quant_vars], range)
rownames(ranges) <- c("Min", "Max")
knitr::kable(ranges, caption = "Range of Quantitative Predictors")| mpg | cylinders | displacement | horsepower | weight | acceleration | year | |
|---|---|---|---|---|---|---|---|
| Min | 9.0 | 3 | 68 | 46 | 1613 | 8.0 | 70 |
| Max | 46.6 | 8 | 455 | 230 | 5140 | 24.8 | 82 |
means <- sapply(Auto[, quant_vars], mean)
sds <- sapply(Auto[, quant_vars], sd)
summary_stats <- rbind(Mean = round(means, 2), SD = round(sds, 2))
knitr::kable(summary_stats, caption = "Mean and SD of Quantitative Predictors")| mpg | cylinders | displacement | horsepower | weight | acceleration | year | |
|---|---|---|---|---|---|---|---|
| Mean | 23.45 | 5.47 | 194.41 | 104.47 | 2977.58 | 15.54 | 75.98 |
| SD | 7.81 | 1.71 | 104.64 | 38.49 | 849.40 | 2.76 | 3.68 |
Auto_sub <- Auto[-(10:85), ]
ranges_sub <- sapply(Auto_sub[, quant_vars], range)
rownames(ranges_sub) <- c("Min", "Max")
means_sub <- sapply(Auto_sub[, quant_vars], mean)
sds_sub <- sapply(Auto_sub[, quant_vars], sd)
knitr::kable(ranges_sub, caption = "Range after removing obs 10–85")| mpg | cylinders | displacement | horsepower | weight | acceleration | year | |
|---|---|---|---|---|---|---|---|
| Min | 11.0 | 3 | 68 | 46 | 1649 | 8.5 | 70 |
| Max | 46.6 | 8 | 455 | 230 | 4997 | 24.8 | 82 |
summary_sub <- rbind(Mean = round(means_sub, 2), SD = round(sds_sub, 2))
knitr::kable(summary_sub, caption = "Mean and SD after removing obs 10–85")| mpg | cylinders | displacement | horsepower | weight | acceleration | year | |
|---|---|---|---|---|---|---|---|
| Mean | 24.40 | 5.37 | 187.24 | 100.72 | 2935.97 | 15.73 | 77.15 |
| SD | 7.87 | 1.65 | 99.68 | 35.71 | 811.30 | 2.69 | 3.11 |
ggpairs(Auto[, quant_vars],
lower = list(continuous = wrap("points", alpha = 0.3, size = 0.8)),
diag = list(continuous = wrap("densityDiag")),
upper = list(continuous = wrap("cor", size = 3))) +
theme_bw(base_size = 9) +
ggtitle("Scatterplot Matrix – Auto Dataset (Quantitative Variables)")Findings: Strong negative correlations exist between
mpg and displacement, horsepower,
and weight. cylinders,
displacement, horsepower, and
weight are all highly positively correlated with each
other. year shows a moderate positive correlation with
mpg, suggesting fuel efficiency has improved over model
years.
par(mfrow = c(2, 3))
for (v in setdiff(quant_vars, "mpg")) {
plot(Auto[[v]], Auto$mpg,
xlab = v, ylab = "mpg",
main = paste("mpg vs", v),
pch = 20, col = rgb(0.2, 0.4, 0.8, 0.4))
abline(lm(mpg ~ Auto[[v]], data = Auto), col = "red", lwd = 2)
}Conclusion: displacement,
horsepower, and weight show the strongest
(negative) linear relationships with mpg and are the most
useful predictors. year has a moderate positive
relationship, indicating newer cars are more efficient.
cylinders also tracks closely with engine size
predictors.
pairs(Auto[, quant_vars],
pch = 20, col = rgb(0.3, 0.5, 0.7, 0.3),
main = "Scatterplot Matrix – Auto (All Quantitative Variables)")name)| mpg | cylinders | displacement | horsepower | weight | acceleration | year | |
|---|---|---|---|---|---|---|---|
| mpg | 1.000 | -0.778 | -0.805 | -0.778 | -0.832 | 0.423 | 0.581 |
| cylinders | -0.778 | 1.000 | 0.951 | 0.843 | 0.898 | -0.505 | -0.346 |
| displacement | -0.805 | 0.951 | 1.000 | 0.897 | 0.933 | -0.544 | -0.370 |
| horsepower | -0.778 | 0.843 | 0.897 | 1.000 | 0.865 | -0.689 | -0.416 |
| weight | -0.832 | 0.898 | 0.933 | 0.865 | 1.000 | -0.417 | -0.309 |
| acceleration | 0.423 | -0.505 | -0.544 | -0.689 | -0.417 | 1.000 | 0.290 |
| year | 0.581 | -0.346 | -0.370 | -0.416 | -0.309 | 0.290 | 1.000 |
corrplot(cor_mat, method = "color", type = "upper",
addCoef.col = "black", tl.col = "black",
number.cex = 0.7, title = "Correlation Matrix", mar = c(0,0,1,0))##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
i. Relationship between predictors and
response?
Yes – the F-statistic is very large with a near-zero p-value, indicating
a statistically significant overall relationship.
ii. Which predictors are statistically
significant?
displacement, weight, year, and
origin all have p-values < 0.05 and appear statistically
significant.
iii. What does the coefficient for year
suggest?
The positive coefficient (~0.75) suggests that, holding other variables
constant, each additional model year is associated with about 0.75 more
miles per gallon – fuel efficiency has improved over time.
Comments:
lm_inter <- lm(mpg ~ displacement * weight + year * origin + horsepower * weight, data = Auto)
summary(lm_inter)##
## Call:
## lm(formula = mpg ~ displacement * weight + year * origin + horsepower *
## weight, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.4474 -1.6780 -0.0525 1.4634 11.8047
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.841e+01 7.914e+00 2.326 0.02056 *
## displacement -3.282e-02 1.798e-02 -1.825 0.06874 .
## weight -1.094e-02 6.996e-04 -15.641 < 2e-16 ***
## year 5.288e-01 1.021e-01 5.177 3.65e-07 ***
## origin -1.091e+01 4.319e+00 -2.525 0.01198 *
## horsepower -1.357e-01 4.204e-02 -3.229 0.00135 **
## displacement:weight 1.102e-05 5.187e-06 2.125 0.03421 *
## year:origin 1.491e-01 5.528e-02 2.698 0.00729 **
## weight:horsepower 2.936e-05 1.216e-05 2.415 0.01622 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.899 on 383 degrees of freedom
## Multiple R-squared: 0.8648, Adjusted R-squared: 0.862
## F-statistic: 306.3 on 8 and 383 DF, p-value: < 2.2e-16
Finding: The interaction
horsepower:weight is statistically significant (p <
0.05), suggesting the combined effect of engine power and vehicle weight
on fuel economy is greater than each factor alone.
par(mfrow = c(1, 3))
# log transformation
lm_log <- lm(mpg ~ log(horsepower), data = Auto)
plot(log(Auto$horsepower), Auto$mpg,
xlab = "log(horsepower)", ylab = "mpg", main = "mpg vs log(hp)", pch = 20,
col = rgb(0.2, 0.4, 0.8, 0.5))
abline(lm_log, col = "red", lwd = 2)
# sqrt transformation
lm_sqrt <- lm(mpg ~ sqrt(horsepower), data = Auto)
plot(sqrt(Auto$horsepower), Auto$mpg,
xlab = "sqrt(horsepower)", ylab = "mpg", main = "mpg vs sqrt(hp)", pch = 20,
col = rgb(0.2, 0.7, 0.4, 0.5))
abline(lm_sqrt, col = "red", lwd = 2)
# quadratic transformation
lm_sq <- lm(mpg ~ horsepower + I(horsepower^2), data = Auto)
hp_seq <- seq(min(Auto$horsepower), max(Auto$horsepower), length.out = 200)
pred_sq <- predict(lm_sq, newdata = data.frame(horsepower = hp_seq))
plot(Auto$horsepower, Auto$mpg,
xlab = "horsepower", ylab = "mpg", main = "mpg vs hp + hp²", pch = 20,
col = rgb(0.8, 0.3, 0.2, 0.5))
lines(hp_seq, pred_sq, col = "red", lwd = 2)Findings: The log(horsepower) and
sqrt(horsepower) transformations linearize the relationship
better than the raw variable. The quadratic fit also captures the
non-linearity well, producing a lower residual standard error than the
simple linear model.
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
predictors <- setdiff(names(Boston), "crim")
# Fit univariate models and collect results
uni_results <- lapply(predictors, function(p) {
fit <- lm(as.formula(paste("crim ~", p)), data = Boston)
s <- summary(fit)
data.frame(
predictor = p,
coef = round(coef(fit)[2], 4),
p_value = round(s$coefficients[2, 4], 4),
r_squared = round(s$r.squared, 4)
)
})
uni_df <- do.call(rbind, uni_results)
uni_df$significant <- ifelse(uni_df$p_value < 0.05, "Yes", "No")
knitr::kable(uni_df, row.names = FALSE,
caption = "Univariate Regression Results (response = crim)")| predictor | coef | p_value | r_squared | significant |
|---|---|---|---|---|
| zn | -0.0739 | 0.0000 | 0.0402 | Yes |
| indus | 0.5098 | 0.0000 | 0.1653 | Yes |
| chas | -1.8928 | 0.2094 | 0.0031 | No |
| nox | 31.2485 | 0.0000 | 0.1772 | Yes |
| rm | -2.6841 | 0.0000 | 0.0481 | Yes |
| age | 0.1078 | 0.0000 | 0.1244 | Yes |
| dis | -1.5509 | 0.0000 | 0.1441 | Yes |
| rad | 0.6179 | 0.0000 | 0.3913 | Yes |
| tax | 0.0297 | 0.0000 | 0.3396 | Yes |
| ptratio | 1.1520 | 0.0000 | 0.0841 | Yes |
| lstat | 0.5488 | 0.0000 | 0.2076 | Yes |
| medv | -0.3632 | 0.0000 | 0.1508 | Yes |
par(mfrow = c(4, 4))
for (p in predictors) {
fit <- lm(as.formula(paste("crim ~", p)), data = Boston)
plot(Boston[[p]], Boston$crim,
xlab = p, ylab = "crim", pch = 20,
col = rgb(0.3, 0.5, 0.8, 0.4), cex = 0.6)
abline(fit, col = "red", lwd = 2)
}
par(mfrow = c(1, 1))Finding: Most predictors show a statistically
significant association with crim. Exceptions include
chas (the Charles River dummy variable), which is not
significant. Variables like rad (accessibility to radial
highways) and tax show the strongest associations.
##
## Call:
## lm(formula = crim ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.534 -2.248 -0.348 1.087 73.923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.7783938 7.0818258 1.946 0.052271 .
## zn 0.0457100 0.0187903 2.433 0.015344 *
## indus -0.0583501 0.0836351 -0.698 0.485709
## chas -0.8253776 1.1833963 -0.697 0.485841
## nox -9.9575865 5.2898242 -1.882 0.060370 .
## rm 0.6289107 0.6070924 1.036 0.300738
## age -0.0008483 0.0179482 -0.047 0.962323
## dis -1.0122467 0.2824676 -3.584 0.000373 ***
## rad 0.6124653 0.0875358 6.997 8.59e-12 ***
## tax -0.0037756 0.0051723 -0.730 0.465757
## ptratio -0.3040728 0.1863598 -1.632 0.103393
## lstat 0.1388006 0.0757213 1.833 0.067398 .
## medv -0.2200564 0.0598240 -3.678 0.000261 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.46 on 493 degrees of freedom
## Multiple R-squared: 0.4493, Adjusted R-squared: 0.4359
## F-statistic: 33.52 on 12 and 493 DF, p-value: < 2.2e-16
Finding: In the multiple regression, we can reject
H₀: βⱼ = 0 for zn, dis, rad,
black, and medv at α = 0.05. Many predictors
that were significant in simple regression are no longer significant
when controlling for others, suggesting multicollinearity.
multi_coefs <- coef(lm_multi)[-1] # exclude intercept
uni_coefs <- setNames(uni_df$coef, uni_df$predictor)
# Align by name
common <- intersect(names(uni_coefs), names(multi_coefs))
plot_df <- data.frame(
univariate = uni_coefs[common],
multiple = multi_coefs[common],
predictor = common
)
ggplot(plot_df, aes(x = univariate, y = multiple, label = predictor)) +
geom_point(color = "steelblue", size = 3) +
geom_text(vjust = -0.6, size = 3) +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
labs(title = "Univariate vs Multiple Regression Coefficients",
subtitle = "Response: per capita crime rate (crim)",
x = "Simple Regression Coefficient",
y = "Multiple Regression Coefficient") +
theme_bw()Observation: The coefficients differ substantially
for several predictors (notably nox), indicating that
simple regression conflates the effect of correlated predictors.
poly_results <- lapply(setdiff(predictors, "chas"), function(p) {
fit3 <- lm(as.formula(paste("crim ~ poly(", p, ", 3)")), data = Boston)
s <- summary(fit3)
coef_p <- s$coefficients
sig2 <- ifelse(nrow(coef_p) >= 3 && coef_p[3, 4] < 0.05, "Yes", "No")
sig3 <- ifelse(nrow(coef_p) >= 4 && coef_p[4, 4] < 0.05, "Yes", "No")
data.frame(predictor = p,
quad_sig = sig2,
cubic_sig = sig3)
})
poly_df <- do.call(rbind, poly_results)
knitr::kable(poly_df, row.names = FALSE,
caption = "Evidence of Non-Linear Association (cubic polynomial)")| predictor | quad_sig | cubic_sig |
|---|---|---|
| zn | Yes | No |
| indus | Yes | Yes |
| nox | Yes | Yes |
| rm | Yes | No |
| age | Yes | Yes |
| dis | Yes | Yes |
| rad | Yes | No |
| tax | Yes | No |
| ptratio | Yes | Yes |
| lstat | Yes | No |
| medv | Yes | Yes |
Conclusion: Several predictors (including
indus, nox, age,
dis, ptratio, medv) show
statistically significant quadratic or cubic terms, indicating
non-linear relationships with crime rate.
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
## 'data.frame': 1089 obs. of 9 variables:
## $ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
par(mfrow = c(2, 3))
for (v in c("Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume")) {
hist(Weekly[[v]], main = v, xlab = v,
col = "steelblue", border = "white", breaks = 30)
}plot(Weekly$Year, Weekly$Volume,
type = "l", col = "steelblue", lwd = 1.5,
xlab = "Year", ylab = "Volume (avg shares traded, billions)",
main = "Weekly Trading Volume Over Time")Patterns: Trading volume has increased substantially over time, especially from the late 1990s through 2008. The lag variables show little obvious pattern on their own.
glm_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly,
family = binomial)
summary(glm_full)##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Finding: Only Lag2 has a statistically
significant p-value (< 0.05). The other lag variables and Volume are
not significant in this model.
prob_full <- predict(glm_full, type = "response")
pred_full <- ifelse(prob_full > 0.5, "Up", "Down")
conf_full <- table(Predicted = pred_full, Actual = Weekly$Direction)
conf_full## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
# Overall fraction correct
acc_full <- mean(pred_full == Weekly$Direction)
cat("Overall accuracy (full model):", round(acc_full, 4), "\n")## Overall accuracy (full model): 0.5611
Interpretation: The model correctly predicts about 56.1% of observations. However, the confusion matrix shows the model is biased toward predicting “Up” – it frequently misclassifies “Down” weeks. The high accuracy largely reflects the market’s upward bias over the sample period.
train <- Weekly$Year <= 2008
test <- Weekly[!train, ]
glm_lag2 <- glm(Direction ~ Lag2,
data = Weekly,
subset = train,
family = binomial)
summary(glm_lag2)##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = Weekly,
## subset = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
prob_lag2 <- predict(glm_lag2, newdata = test, type = "response")
pred_lag2 <- ifelse(prob_lag2 > 0.5, "Up", "Down")
conf_lag2 <- table(Predicted = pred_lag2, Actual = test$Direction)
conf_lag2## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
acc_lag2 <- mean(pred_lag2 == test$Direction)
cat("Test accuracy (Lag2 only, 2009–2010):", round(acc_lag2, 4), "\n")## Test accuracy (Lag2 only, 2009–2010): 0.625
Finding: The model using only Lag2
achieves approximately 62.5% accuracy on held-out data
from 2009–2010. This is better than random chance and better than the
full model on the training data, suggesting Lag2 is a
useful predictor.
End of Report
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Asia/Taipei
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] corrplot_0.95 dplyr_1.1.4 GGally_2.4.0 ggplot2_4.0.0 ISLR2_1.3-2
##
## loaded via a namespace (and not attached):
## [1] vctrs_0.6.5 cli_3.6.5 knitr_1.50 rlang_1.1.6
## [5] xfun_0.52 purrr_1.1.0 generics_0.1.4 S7_0.2.0
## [9] jsonlite_2.0.0 labeling_0.4.3 glue_1.8.0 htmltools_0.5.8.1
## [13] sass_0.4.10 scales_1.4.0 rmarkdown_2.29 ggstats_0.13.0
## [17] grid_4.5.1 tibble_3.3.0 evaluate_1.0.5 jquerylib_0.1.4
## [21] fastmap_1.2.0 yaml_2.3.10 lifecycle_1.0.4 compiler_4.5.1
## [25] RColorBrewer_1.1-3 pkgconfig_2.0.3 tidyr_1.3.1 rstudioapi_0.17.1
## [29] farver_2.1.2 digest_0.6.37 R6_2.6.1 tidyselect_1.2.1
## [33] pillar_1.11.1 magrittr_2.0.4 bslib_0.9.0 withr_3.0.2
## [37] tools_4.5.1 gtable_0.3.6 cachem_1.1.0