## [1] 392 9
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
Quantitative predictors: mpg,
cylinders, displacement,
horsepower, weight, acceleration,
year
Qualitative predictor: name (and
origin is coded as integer 1/2/3, representing
American/European/Japanese — it is best treated as
qualitative/categorical)
quant_vars <- c("mpg", "cylinders", "displacement", "horsepower",
"weight", "acceleration", "year")
sapply(Auto[, quant_vars], range)## mpg cylinders displacement horsepower weight acceleration year
## [1,] 9.0 3 68 46 1613 8.0 70
## [2,] 46.6 8 455 230 5140 24.8 82
summary_stats <- data.frame(
Mean = sapply(Auto[, quant_vars], mean),
SD = sapply(Auto[, quant_vars], sd)
)
round(summary_stats, 2)## Mean SD
## mpg 23.45 7.81
## cylinders 5.47 1.71
## displacement 194.41 104.64
## horsepower 104.47 38.49
## weight 2977.58 849.40
## acceleration 15.54 2.76
## year 75.98 3.68
## Remaining observations: 316
sub_stats <- data.frame(
Range_Min = sapply(Auto_sub[, quant_vars], min),
Range_Max = sapply(Auto_sub[, quant_vars], max),
Mean = sapply(Auto_sub[, quant_vars], mean),
SD = sapply(Auto_sub[, quant_vars], sd)
)
round(sub_stats, 2)## Range_Min Range_Max Mean SD
## mpg 11.0 46.6 24.40 7.87
## cylinders 3.0 8.0 5.37 1.65
## displacement 68.0 455.0 187.24 99.68
## horsepower 46.0 230.0 100.72 35.71
## weight 1649.0 4997.0 2935.97 811.30
## acceleration 8.5 24.8 15.73 2.69
## year 70.0 82.0 77.15 3.11
# Scatterplot matrix (quantitative variables only)
pairs(Auto[, quant_vars],
main = "Scatterplot Matrix – Auto Dataset",
pch = 20, col = "steelblue", cex = 0.5)# Correlation heatmap
cor_mat <- cor(Auto[, quant_vars])
# Base R image-style correlation plot
corrplot_data <- as.data.frame(as.table(cor_mat))
ggplot(corrplot_data, aes(Var1, Var2, fill = Freq)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "firebrick", mid = "white", high = "steelblue",
midpoint = 0, limits = c(-1, 1), name = "Correlation") +
geom_text(aes(label = round(Freq, 2)), size = 3) +
theme_minimal() +
labs(title = "Correlation Matrix – Auto Quantitative Predictors",
x = NULL, y = NULL) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))Findings:
displacement, horsepower, and
weight are strongly positively correlated with each
other.mpg is strongly negatively correlated with
displacement, horsepower, and
weight.year has a moderate positive correlation with
mpg, suggesting fuel efficiency improved over time.cylinders is highly correlated with displacement and
weight, reflecting engine size.mpg?par(mfrow = c(2, 3))
for (v in setdiff(quant_vars, "mpg")) {
plot(Auto[[v]], Auto$mpg,
xlab = v, ylab = "mpg",
main = paste("mpg vs", v),
pch = 20, col = "steelblue", cex = 0.6)
abline(lm(mpg ~ Auto[[v]], data = Auto), col = "firebrick", lwd = 2)
}Conclusion: Based on the plots and correlations:
displacement, horsepower,
weight show strong negative relationships with
mpg — heavier, more powerful cars get lower mileage. These
are the most useful predictors.cylinders also correlates negatively,
though it is often redundant with displacement.year shows a positive relationship —
newer cars are more fuel-efficient.acceleration has a weaker, positive
relationship with mpg.This question involves multiple linear regression on the
Autodata set.
pairs(Auto[, quant_vars],
main = "Scatterplot Matrix – Auto Dataset (Ch. 3)",
pch = 20, col = "darkorange", cex = 0.5)name)## mpg cylinders displacement horsepower weight acceleration
## mpg 1.000 -0.778 -0.805 -0.778 -0.832 0.423
## cylinders -0.778 1.000 0.951 0.843 0.898 -0.505
## displacement -0.805 0.951 1.000 0.897 0.933 -0.544
## horsepower -0.778 0.843 0.897 1.000 0.865 -0.689
## weight -0.832 0.898 0.933 0.865 1.000 -0.417
## acceleration 0.423 -0.505 -0.544 -0.689 -0.417 1.000
## year 0.581 -0.346 -0.370 -0.416 -0.309 0.290
## year
## mpg 0.581
## cylinders -0.346
## displacement -0.370
## horsepower -0.416
## weight -0.309
## acceleration 0.290
## year 1.000
mpg ~ All Except name##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
Comments:
Is there a relationship between the predictors and the
response?
Yes. The overall F-statistic is very large with a p-value near zero,
confirming a statistically significant relationship between the
predictors and mpg.
Which predictors have a statistically significant
relationship?
At the 5% significance level: displacement,
weight, year, and origin are
significant. cylinders, horsepower, and
acceleration are not statistically significant when all
variables are included (due to multicollinearity).
What does the coefficient for year
suggest?
The positive coefficient for year (~0.75) indicates that,
holding all other predictors constant, each additional model year is
associated with roughly 0.75 more miles per gallon on average. This
reflects the trend of improving fuel efficiency over time (e.g., due to
fuel economy regulations and engine improvements).
Comments:
# Test some interactions suggested by domain knowledge
lm_int <- lm(mpg ~ displacement + weight + year + origin +
displacement:weight + horsepower:weight, data = Auto)
summary(lm_int)##
## Call:
## lm(formula = mpg ~ displacement + weight + year + origin + displacement:weight +
## horsepower:weight, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4889 -1.7223 -0.0711 1.6587 12.3700
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.716e+00 3.828e+00 -1.493 0.13613
## displacement -7.896e-02 9.389e-03 -8.410 8.07e-16 ***
## weight -1.021e-02 6.546e-04 -15.598 < 2e-16 ***
## year 7.922e-01 4.554e-02 17.397 < 2e-16 ***
## origin 4.897e-01 2.582e-01 1.897 0.05856 .
## displacement:weight 2.562e-05 2.638e-06 9.711 < 2e-16 ***
## weight:horsepower -8.823e-06 2.839e-06 -3.108 0.00202 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.983 on 385 degrees of freedom
## Multiple R-squared: 0.8562, Adjusted R-squared: 0.854
## F-statistic: 382.1 on 6 and 385 DF, p-value: < 2.2e-16
Comments: The interaction
displacement:weight is statistically significant (p <
0.05), suggesting that the joint effect of engine size and vehicle
weight on fuel efficiency is not simply additive. The
horsepower:weight interaction may also be meaningful
depending on the model specification.
# Log, sqrt, and squared transformations
lm_log <- lm(mpg ~ log(displacement) + log(horsepower) + log(weight) +
acceleration + year + origin, data = Auto)
lm_sqrt <- lm(mpg ~ sqrt(displacement) + sqrt(horsepower) + sqrt(weight) +
acceleration + year + origin, data = Auto)
lm_sq <- lm(mpg ~ displacement + I(displacement^2) + weight +
horsepower + year + origin, data = Auto)
cat("R² (log transform): ", round(summary(lm_log)$r.squared, 4), "\n")## R² (log transform): 0.8469
## R² (sqrt transform): 0.8339
## R² (quadratic): 0.8527
## R² (original): 0.8215
par(mfrow = c(1, 2))
plot(lm_log, which = 1, main = "Log Transform – Residuals vs Fitted")
plot(lm_log, which = 2, main = "Log Transform – Q-Q Plot")Findings: Log-transforming
displacement, horsepower, and
weight improves the R² and reduces the non-linear pattern
in residuals, suggesting a multiplicative relationship between these
predictors and mpg. The quadratic term for
displacement also improves fit, but the log model tends to
be cleaner.
We will try to predict per capita crime rate (
crim) using the other variables in theBostondataset.
## [1] 506 13
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "lstat" "medv"
predictors <- setdiff(names(Boston), "crim")
# Fit simple linear regression for each predictor
slr_results <- lapply(predictors, function(p) {
fit <- lm(as.formula(paste("crim ~", p)), data = Boston)
s <- summary(fit)
data.frame(
Predictor = p,
Coefficient = coef(fit)[2],
P_value = coef(s)[2, 4],
R_squared = s$r.squared
)
})
slr_df <- do.call(rbind, slr_results)
slr_df$Significant <- ifelse(slr_df$P_value < 0.05, "Yes", "No")
slr_df <- slr_df[order(slr_df$P_value), ]
rownames(slr_df) <- NULL
# Round only numeric columns
slr_df_display <- slr_df
slr_df_display[, c("Coefficient","P_value","R_squared")] <-
round(slr_df[, c("Coefficient","P_value","R_squared")], 5)
slr_df_display## Predictor Coefficient P_value R_squared Significant
## 1 rad 0.61791 0.00000 0.39126 Yes
## 2 tax 0.02974 0.00000 0.33961 Yes
## 3 lstat 0.54880 0.00000 0.20759 Yes
## 4 nox 31.24853 0.00000 0.17722 Yes
## 5 indus 0.50978 0.00000 0.16531 Yes
## 6 medv -0.36316 0.00000 0.15078 Yes
## 7 dis -1.55090 0.00000 0.14415 Yes
## 8 age 0.10779 0.00000 0.12442 Yes
## 9 ptratio 1.15198 0.00000 0.08407 Yes
## 10 rm -2.68405 0.00000 0.04807 Yes
## 11 zn -0.07393 0.00001 0.04019 Yes
## 12 chas -1.89278 0.20943 0.00312 No
# Plot a few significant relationships
sig_preds <- slr_df$Predictor[slr_df$Significant == "Yes"][1:4]
par(mfrow = c(2, 2))
for (p in sig_preds) {
plot(Boston[[p]], Boston$crim,
xlab = p, ylab = "crim",
main = paste("crim vs", p),
pch = 20, col = "steelblue", cex = 0.6)
abline(lm(as.formula(paste("crim ~", p)), data = Boston),
col = "firebrick", lwd = 2)
}Results: Almost all predictors show a statistically
significant simple linear association with crim. Notable
exceptions (if any) are chas (Charles River dummy
variable), which is not significant. Predictors like rad
(accessibility to highways), tax, lstat, and
nox show particularly strong associations.
##
## Call:
## lm(formula = crim ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.534 -2.248 -0.348 1.087 73.923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.7783938 7.0818258 1.946 0.052271 .
## zn 0.0457100 0.0187903 2.433 0.015344 *
## indus -0.0583501 0.0836351 -0.698 0.485709
## chas -0.8253776 1.1833963 -0.697 0.485841
## nox -9.9575865 5.2898242 -1.882 0.060370 .
## rm 0.6289107 0.6070924 1.036 0.300738
## age -0.0008483 0.0179482 -0.047 0.962323
## dis -1.0122467 0.2824676 -3.584 0.000373 ***
## rad 0.6124653 0.0875358 6.997 8.59e-12 ***
## tax -0.0037756 0.0051723 -0.730 0.465757
## ptratio -0.3040728 0.1863598 -1.632 0.103393
## lstat 0.1388006 0.0757213 1.833 0.067398 .
## medv -0.2200564 0.0598240 -3.678 0.000261 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.46 on 493 degrees of freedom
## Multiple R-squared: 0.4493, Adjusted R-squared: 0.4359
## F-statistic: 33.52 on 12 and 493 DF, p-value: < 2.2e-16
Results: In the multiple regression, we can reject
H₀: βⱼ = 0 at the 5% level for: zn, dis,
rad, black, and medv. Many
predictors that were significant individually become non-significant in
the multiple regression due to multicollinearity.
# Extract multiple regression coefficients (excluding intercept)
mlr_coefs <- coef(mlr_boston)[-1]
# Match with univariate coefficients
slr_coefs_named <- setNames(slr_df$Coefficient, slr_df$Predictor)
slr_coefs_named <- slr_coefs_named[names(mlr_coefs)]
plot(slr_coefs_named, mlr_coefs,
xlab = "Simple Regression Coefficient",
ylab = "Multiple Regression Coefficient",
main = "Univariate vs. Multiple Regression Coefficients\n(Boston Dataset)",
pch = 19, col = "steelblue", cex = 1.2)
abline(0, 1, lty = 2, col = "gray50")
text(slr_coefs_named, mlr_coefs, labels = names(mlr_coefs),
cex = 0.7, pos = 3, col = "darkred")Comment: For most predictors, the coefficients
differ substantially between simple and multiple regression. The most
dramatic difference is for nox — its large positive simple
regression coefficient becomes negative in the multiple model,
reflecting confounding with other variables.
poly_results <- lapply(predictors, function(p) {
# Skip binary variable chas
if (p == "chas") return(NULL)
fit3 <- lm(as.formula(paste("crim ~ poly(", p, ", 3)")), data = Boston)
s <- summary(fit3)
coefs <- coef(s)
# Check significance of quadratic and cubic terms
data.frame(
Predictor = p,
P_quadratic = ifelse(nrow(coefs) >= 3, coefs[3, 4], NA),
P_cubic = ifelse(nrow(coefs) >= 4, coefs[4, 4], NA)
)
})
poly_df <- do.call(rbind, Filter(Negate(is.null), poly_results))
poly_df$NonLinear <- ifelse(poly_df$P_quadratic < 0.05 |
(!is.na(poly_df$P_cubic) & poly_df$P_cubic < 0.05),
"Yes", "No")
# Round only numeric columns
poly_df_display <- poly_df
poly_df_display[, c("P_quadratic","P_cubic")] <-
round(poly_df[, c("P_quadratic","P_cubic")], 5)
poly_df_display## Predictor P_quadratic P_cubic NonLinear
## 1 zn 0.00442 0.22954 Yes
## 2 indus 0.00109 0.00000 Yes
## 3 nox 0.00008 0.00000 Yes
## 4 rm 0.00151 0.50858 Yes
## 5 age 0.00000 0.00668 Yes
## 6 dis 0.00000 0.00000 Yes
## 7 rad 0.00912 0.48231 Yes
## 8 tax 0.00000 0.24385 Yes
## 9 ptratio 0.00241 0.00630 Yes
## 10 lstat 0.03780 0.12989 Yes
## 11 medv 0.00000 0.00000 Yes
Conclusion: There is evidence of non-linear
associations for several predictors. Variables such as
indus, nox, age,
dis, ptratio, and medv show
significant quadratic or cubic terms, indicating that the relationship
between these predictors and per-capita crime rate is not purely
linear.
This question uses the
Weeklydata set, which contains 1,089 weekly stock returns from 1990 to 2010.
## [1] 1089 9
## 'data.frame': 1089 obs. of 9 variables:
## $ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
# Distribution of returns by Direction
par(mfrow = c(2, 3))
for (v in c("Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume")) {
boxplot(Weekly[[v]] ~ Weekly$Direction,
xlab = "Direction", ylab = v,
main = paste(v, "by Direction"),
col = c("salmon", "steelblue"))
}# Volume over time
plot(Weekly$Year, Weekly$Volume,
type = "l", col = "steelblue", lwd = 1.5,
xlab = "Year", ylab = "Volume",
main = "Trading Volume Over Time (1990–2010)")Patterns observed:
Today (weekly return) has a clear difference between Up
and Down weeks, as expected.glm_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly,
family = binomial)
summary(glm_full)##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Comment: Only Lag2
appears to be statistically significant (p ≈ 0.03). The other lag
variables and Volume are not significant. This is consistent with the
efficient market hypothesis — past returns have very limited predictive
power for future returns.
# Predicted probabilities on training data
prob_full <- predict(glm_full, type = "response")
pred_full <- ifelse(prob_full > 0.5, "Up", "Down")
# Confusion matrix
conf_mat <- table(Predicted = pred_full, Actual = Weekly$Direction)
conf_mat## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
# Overall fraction correct
accuracy <- mean(pred_full == Weekly$Direction)
cat("\nOverall fraction correct:", round(accuracy, 4), "\n")##
## Overall fraction correct: 0.5611
Interpretation of confusion matrix:
# Split data
train <- Weekly$Year <= 2008
Weekly_train <- Weekly[train, ]
Weekly_test <- Weekly[!train, ]
# Fit on training data using Lag2 only
glm_lag2 <- glm(Direction ~ Lag2,
data = Weekly_train,
family = binomial)
summary(glm_lag2)##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = Weekly_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
# Predict on test data (2009–2010)
prob_test <- predict(glm_lag2, newdata = Weekly_test, type = "response")
pred_test <- ifelse(prob_test > 0.5, "Up", "Down")
# Confusion matrix on held-out data
conf_test <- table(Predicted = pred_test, Actual = Weekly_test$Direction)
conf_test## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
# Accuracy on test data
acc_test <- mean(pred_test == Weekly_test$Direction)
cat("\nTest set accuracy (2009–2010):", round(acc_test, 4), "\n")##
## Test set accuracy (2009–2010): 0.625
Using only Lag2 as a predictor and evaluating on the
held-out 2009–2010 data, the model achieves approximately
62.5% accuracy. This is a modest improvement over the
naive baseline, suggesting that Lag2 does contain some weak
predictive signal for the direction of weekly market returns.