Auto dataset)Auto <- na.omit(Auto)
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
Quantitative: mpg,
cylinders, displacement,
horsepower, weight, acceleration,
year
Qualitative: origin,
name
(Note: cylinders and origin are
sometimes treated as qualitative depending on context, but are stored as
integers here.)
quant_vars <- Auto[, c("mpg", "cylinders", "displacement",
"horsepower", "weight", "acceleration", "year")]
sapply(quant_vars, range)
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 9.0 3 68 46 1613 8.0 70
## [2,] 46.6 8 455 230 5140 24.8 82
rbind(Mean = sapply(quant_vars, mean),
SD = sapply(quant_vars, sd))
## mpg cylinders displacement horsepower weight acceleration
## Mean 23.445918 5.471939 194.412 104.46939 2977.5842 15.541327
## SD 7.805007 1.705783 104.644 38.49116 849.4026 2.758864
## year
## Mean 75.979592
## SD 3.683737
Auto_sub <- Auto[-(10:85), ]
quant_sub <- Auto_sub[, c("mpg", "cylinders", "displacement",
"horsepower", "weight", "acceleration", "year")]
rbind(Range_min = sapply(quant_sub, min),
Range_max = sapply(quant_sub, max),
Mean = sapply(quant_sub, mean),
SD = sapply(quant_sub, sd))
## mpg cylinders displacement horsepower weight acceleration
## Range_min 11.000000 3.000000 68.00000 46.00000 1649.0000 8.500000
## Range_max 46.600000 8.000000 455.00000 230.00000 4997.0000 24.800000
## Mean 24.404430 5.373418 187.24051 100.72152 2935.9715 15.726899
## SD 7.867283 1.654179 99.67837 35.70885 811.3002 2.693721
## year
## Range_min 70.000000
## Range_max 82.000000
## Mean 77.145570
## SD 3.106217
ggpairs(quant_vars,
lower = list(continuous = wrap("points", alpha = 0.3, size = 0.8)),
upper = list(continuous = wrap("cor", size = 3)),
diag = list(continuous = wrap("densityDiag"))) +
theme_minimal(base_size = 10) +
labs(title = "Scatterplot matrix of Auto quantitative predictors")
Findings: Strong negative correlations exist between
mpg and displacement, weight,
horsepower, and cylinders. weight
and displacement are highly positively correlated with each
other. year shows a moderate positive association with
mpg, suggesting fuel efficiency improved over time.
mpg?par(mfrow = c(2, 3))
for (v in c("cylinders", "displacement", "horsepower",
"weight", "acceleration", "year")) {
x <- Auto[[v]]
y <- Auto$mpg
ok <- is.finite(x) & is.finite(y)
xr <- range(x[ok])
yr <- range(y[ok])
tryCatch(
{
plot(x[ok], y[ok], xlim = xr, ylim = yr,
xlab = v, ylab = "mpg",
main = paste("mpg vs", v),
pch = 19, cex = 0.5, col = "steelblue")
abline(lm(y[ok] ~ x[ok]), col = "red", lwd = 2)
},
error = function(e) {
plot.new(); title(main = paste("FAILED:", v))
cat("Plot failed for", v, ":", conditionMessage(e), "\n")
}
)
}
par(mfrow = c(1, 1))
Conclusion: weight,
displacement, horsepower, and
cylinders all show strong negative linear (or slightly
curved) relationships with mpg and are likely useful
predictors. year also has a positive association and should
be included. acceleration shows a weaker relationship.
Auto dataset)pairs(Auto[, sapply(Auto, is.numeric)],
pch = 19, cex = 0.4, col = "steelblue",
main = "Scatterplot matrix — Auto dataset")
name)cor(Auto[, sapply(Auto, is.numeric)])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
mpg ~ all predictors
except namelm_auto <- lm(mpg ~ . - name, data = Auto)
summary(lm_auto)
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
i. Is there a relationship between the predictors and the
response? Yes. The overall F-statistic is large with a very
small p-value, indicating a statistically significant relationship
between the predictors and mpg.
ii. Which predictors are statistically significant?
displacement, weight, year, and
origin have p-values below 0.05 and appear to have
significant relationships with mpg. horsepower
and acceleration are not significant at the 0.05 level when
other variables are included.
iii. What does the coefficient for year
suggest? The positive coefficient for year (~0.75)
suggests that, holding other variables constant, fuel efficiency
(mpg) increased by approximately 0.75 miles per gallon each
year, reflecting improvements in automotive technology over time.
par(mfrow = c(2, 2))
plot(lm_auto)
par(mfrow = c(1, 1))
Comments:
lm_interact <- lm(mpg ~ (cylinders + displacement + horsepower +
weight + acceleration + year + origin)^2 - name,
data = Auto)
summary(lm_interact)
##
## Call:
## lm(formula = mpg ~ (cylinders + displacement + horsepower + weight +
## acceleration + year + origin)^2 - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6303 -1.4481 0.0596 1.2739 11.1386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.548e+01 5.314e+01 0.668 0.50475
## cylinders 6.989e+00 8.248e+00 0.847 0.39738
## displacement -4.785e-01 1.894e-01 -2.527 0.01192 *
## horsepower 5.034e-01 3.470e-01 1.451 0.14769
## weight 4.133e-03 1.759e-02 0.235 0.81442
## acceleration -5.859e+00 2.174e+00 -2.696 0.00735 **
## year 6.974e-01 6.097e-01 1.144 0.25340
## origin -2.090e+01 7.097e+00 -2.944 0.00345 **
## cylinders:displacement -3.383e-03 6.455e-03 -0.524 0.60051
## cylinders:horsepower 1.161e-02 2.420e-02 0.480 0.63157
## cylinders:weight 3.575e-04 8.955e-04 0.399 0.69000
## cylinders:acceleration 2.779e-01 1.664e-01 1.670 0.09584 .
## cylinders:year -1.741e-01 9.714e-02 -1.793 0.07389 .
## cylinders:origin 4.022e-01 4.926e-01 0.816 0.41482
## displacement:horsepower -8.491e-05 2.885e-04 -0.294 0.76867
## displacement:weight 2.472e-05 1.470e-05 1.682 0.09342 .
## displacement:acceleration -3.479e-03 3.342e-03 -1.041 0.29853
## displacement:year 5.934e-03 2.391e-03 2.482 0.01352 *
## displacement:origin 2.398e-02 1.947e-02 1.232 0.21875
## horsepower:weight -1.968e-05 2.924e-05 -0.673 0.50124
## horsepower:acceleration -7.213e-03 3.719e-03 -1.939 0.05325 .
## horsepower:year -5.838e-03 3.938e-03 -1.482 0.13916
## horsepower:origin 2.233e-03 2.930e-02 0.076 0.93931
## weight:acceleration 2.346e-04 2.289e-04 1.025 0.30596
## weight:year -2.245e-04 2.127e-04 -1.056 0.29182
## weight:origin -5.789e-04 1.591e-03 -0.364 0.71623
## acceleration:year 5.562e-02 2.558e-02 2.174 0.03033 *
## acceleration:origin 4.583e-01 1.567e-01 2.926 0.00365 **
## year:origin 1.393e-01 7.399e-02 1.882 0.06062 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.695 on 363 degrees of freedom
## Multiple R-squared: 0.8893, Adjusted R-squared: 0.8808
## F-statistic: 104.2 on 28 and 363 DF, p-value: < 2.2e-16
Several interaction terms are statistically significant (e.g.,
displacement:year, acceleration:origin),
suggesting that some predictors jointly influence mpg
beyond their individual effects.
par(mfrow = c(1, 3))
hp <- Auto$horsepower
mpg <- Auto$mpg
ok_hp <- is.finite(hp) & is.finite(mpg)
tryCatch({
x <- log(hp[ok_hp])
plot(x, mpg[ok_hp], xlim = range(x), ylim = range(mpg[ok_hp]),
xlab = "log(horsepower)", ylab = "mpg", main = "log(X)",
pch = 19, cex = 0.5, col = "steelblue")
abline(lm(mpg[ok_hp] ~ x), col = "red", lwd = 2)
}, error = function(e) { plot.new(); title("log(X) failed") })
tryCatch({
x <- sqrt(hp[ok_hp])
plot(x, mpg[ok_hp], xlim = range(x), ylim = range(mpg[ok_hp]),
xlab = "sqrt(horsepower)", ylab = "mpg", main = "sqrt(X)",
pch = 19, cex = 0.5, col = "steelblue")
abline(lm(mpg[ok_hp] ~ x), col = "red", lwd = 2)
}, error = function(e) { plot.new(); title("sqrt(X) failed") })
tryCatch({
x <- hp[ok_hp]^2
plot(x, mpg[ok_hp], xlim = range(x), ylim = range(mpg[ok_hp]),
xlab = "horsepower^2", ylab = "mpg", main = "X^2",
pch = 19, cex = 0.5, col = "steelblue")
abline(lm(mpg[ok_hp] ~ x), col = "red", lwd = 2)
}, error = function(e) { plot.new(); title("X^2 failed") })
par(mfrow = c(1, 1))
Findings: The log(horsepower)
transformation produces the most linear relationship with
mpg, improving model fit compared to the untransformed
predictor. sqrt(horsepower) also helps;
horsepower^2 introduces a slight curve in the wrong
direction.
Boston dataset)crimBoston_df <- Boston
predictors <- setdiff(names(Boston_df), "crim")
# Store simple regression coefficients for part (c)
simple_coefs <- sapply(predictors, function(p) {
coef(lm(reformulate(p, response = "crim"), data = Boston_df))[2]
})
names(simple_coefs) <- predictors # ensure names are always clean
# Show p-values for each simple regression
pvals <- sapply(predictors, function(p) {
summary(lm(reformulate(p, response = "crim"), data = Boston_df))$coefficients[2, 4]
})
data.frame(Predictor = predictors,
Coefficient = round(simple_coefs, 4),
P_value = signif(pvals, 3))
## Predictor Coefficient P_value
## zn zn -0.0739 5.51e-06
## indus indus 0.5098 1.45e-21
## chas chas -1.8928 2.09e-01
## nox nox 31.2485 3.75e-23
## rm rm -2.6841 6.35e-07
## age age 0.1078 2.85e-16
## dis dis -1.5509 8.52e-19
## rad rad 0.6179 2.69e-56
## tax tax 0.0297 2.36e-47
## ptratio ptratio 1.1520 2.94e-11
## lstat lstat 0.5488 2.65e-27
## medv medv -0.3632 1.17e-19
par(mfrow = c(4, 4))
for (p in predictors) {
x <- Boston_df[[p]]
y <- Boston_df$crim
ok <- is.finite(x) & is.finite(y)
if (sum(ok) < 2) next
xr <- range(x[ok])
yr <- range(y[ok])
if (!all(is.finite(xr)) || !all(is.finite(yr))) next
tryCatch(
{
plot(x[ok], y[ok], xlim = xr, ylim = yr,
xlab = p, ylab = "crim",
pch = 19, cex = 0.4, col = "steelblue")
abline(lm(y[ok] ~ x[ok]), col = "red", lwd = 1.5)
},
error = function(e) { plot.new(); title(main = paste("FAILED:", p)) }
)
}
par(mfrow = c(1, 1))
Findings: Almost all predictors show a statistically
significant association with crim in simple linear
regression. Notable predictors include rad (accessibility
to highways) and tax (property tax rate), both of which
show strong positive associations with crime rate.
lm_boston <- lm(crim ~ ., data = Boston_df)
summary(lm_boston)
##
## Call:
## lm(formula = crim ~ ., data = Boston_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.534 -2.248 -0.348 1.087 73.923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.7783938 7.0818258 1.946 0.052271 .
## zn 0.0457100 0.0187903 2.433 0.015344 *
## indus -0.0583501 0.0836351 -0.698 0.485709
## chas -0.8253776 1.1833963 -0.697 0.485841
## nox -9.9575865 5.2898242 -1.882 0.060370 .
## rm 0.6289107 0.6070924 1.036 0.300738
## age -0.0008483 0.0179482 -0.047 0.962323
## dis -1.0122467 0.2824676 -3.584 0.000373 ***
## rad 0.6124653 0.0875358 6.997 8.59e-12 ***
## tax -0.0037756 0.0051723 -0.730 0.465757
## ptratio -0.3040728 0.1863598 -1.632 0.103393
## lstat 0.1388006 0.0757213 1.833 0.067398 .
## medv -0.2200564 0.0598240 -3.678 0.000261 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.46 on 493 degrees of freedom
## Multiple R-squared: 0.4493, Adjusted R-squared: 0.4359
## F-statistic: 33.52 on 12 and 493 DF, p-value: < 2.2e-16
In the multiple regression model, we can reject H₀ : βⱼ = 0 for
zn, dis,
rad, black,
and medv at the 0.05 significance level.
Many predictors that were individually significant lose significance
when controlling for the others.
multi_coefs <- coef(lm_boston)[-1]
x <- simple_coefs
y <- multi_coefs[names(simple_coefs)]
plot(x, y,
xlab = "Simple regression coefficient",
ylab = "Multiple regression coefficient",
main = "Univariate vs. multiple regression coefficients",
pch = 19, col = "steelblue")
abline(0, 1, col = "red", lty = 2)
text(x, y, labels = names(x), cex = 0.65, pos = 3)
The coefficients differ substantially between simple and multiple
regression, especially for nox. This reflects
multicollinearity: predictors are correlated with each other, so their
individual effects change when jointly estimated.
poly_results <- lapply(predictors, function(p) {
df <- Boston_df[is.finite(Boston_df[[p]]) & is.finite(Boston_df$crim), ]
x <- df[[p]]
if (length(unique(x)) <= 3) {
return(data.frame(Predictor = p,
X2_or_X3_sig = NA,
X2_pval = NA,
X3_pval = NA,
Note = "skipped: too few unique values"))
}
fit <- lm(crim ~ poly(df[[p]], 3), data = df)
s <- summary(fit)$coefficients
sig <- nrow(s) >= 3 && any(s[3:nrow(s), 4] < 0.05)
data.frame(Predictor = p,
X2_or_X3_sig = sig,
X2_pval = ifelse(nrow(s) >= 3, round(s[3, 4], 4), NA),
X3_pval = ifelse(nrow(s) >= 4, round(s[4, 4], 4), NA),
Note = "")
})
do.call(rbind, poly_results)
## Predictor X2_or_X3_sig X2_pval X3_pval Note
## 1 zn TRUE 0.0044 0.2295
## 2 indus TRUE 0.0011 0.0000
## 3 chas NA NA NA skipped: too few unique values
## 4 nox TRUE 0.0001 0.0000
## 5 rm TRUE 0.0015 0.5086
## 6 age TRUE 0.0000 0.0067
## 7 dis TRUE 0.0000 0.0000
## 8 rad TRUE 0.0091 0.4823
## 9 tax TRUE 0.0000 0.2439
## 10 ptratio TRUE 0.0024 0.0063
## 11 lstat TRUE 0.0378 0.1299
## 12 medv TRUE 0.0000 0.0000
Findings: For many predictors (e.g.,
indus, nox, age,
dis, ptratio, medv), the
quadratic or cubic terms are statistically significant, providing
evidence of non-linear associations with per capita crime rate.
Weekly dataset)data(Weekly)
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
ggplot(Weekly, aes(x = seq_along(Volume), y = Volume)) +
geom_line(color = "steelblue") +
labs(title = "Trading volume over time (Weekly)",
x = "Observation index", y = "Volume") +
theme_minimal()
ggpairs(Weekly[, c("Lag1","Lag2","Lag3","Lag4","Lag5","Volume","Today")],
lower = list(continuous = wrap("points", alpha = 0.2, size = 0.5)),
upper = list(continuous = wrap("cor", size = 3))) +
theme_minimal(base_size = 9) +
labs(title = "Pairwise relationships — Weekly dataset")
Patterns: Volume has increased substantially over
time. The lag variables show very little correlation with
Today, suggesting limited linear predictability.
Year and Volume are positively correlated.
glm_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly,
family = binomial)
summary(glm_full)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Significant predictors: Only
Lag2 has a p-value below 0.05 (p ≈ 0.03),
suggesting it has a statistically significant relationship with
Direction. The other lag variables and Volume
are not significant.
prob_full <- predict(glm_full, type = "response")
pred_full <- ifelse(prob_full > 0.5, "Up", "Down")
conf_mat <- table(Predicted = pred_full, Actual = Weekly$Direction)
conf_mat
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
mean(pred_full == Weekly$Direction)
## [1] 0.5610652
Interpretation: The confusion matrix shows the model predicts “Up” the vast majority of the time. It correctly identifies most “Up” weeks but performs poorly on “Down” weeks (high false positive rate). The overall accuracy is approximately 56%, only marginally better than random guessing. The model is over-predicting the “Up” direction.
Lag2train <- Weekly$Year <= 2008
test <- Weekly[!train, ]
glm_lag2 <- glm(Direction ~ Lag2,
data = Weekly,
subset = train,
family = binomial)
summary(glm_lag2)
##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = Weekly,
## subset = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
prob_test <- predict(glm_lag2, newdata = test, type = "response")
pred_test <- ifelse(prob_test > 0.5, "Up", "Down")
conf_test <- table(Predicted = pred_test, Actual = test$Direction)
conf_test
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
cat("Test accuracy:", round(mean(pred_test == test$Direction), 4), "\n")
## Test accuracy: 0.625
Findings: Using only Lag2 as a
predictor and evaluating on the 2009–2010 held-out data, the model
achieves approximately 62.5% accuracy — an improvement
over the full model trained on all data. This suggests that
Lag2 is a more parsimonious and generalizable predictor of
weekly market direction than including all lag variables.