data(Auto)
Auto <- na.omit(Auto)
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
# Quantitative: mpg, cylinders, displacement, horsepower, weight, acceleration, year
# Qualitative: origin, name
cat("Quantitative predictors:\n")
## Quantitative predictors:
cat("mpg, cylinders, displacement, horsepower, weight, acceleration, year\n\n")
## mpg, cylinders, displacement, horsepower, weight, acceleration, year
cat("Qualitative predictors:\n")
## Qualitative predictors:
cat("origin (1=American, 2=European, 3=Japanese), name\n")
## origin (1=American, 2=European, 3=Japanese), name
quant_vars <- c("mpg", "cylinders", "displacement", "horsepower",
"weight", "acceleration", "year")
sapply(Auto[, quant_vars], range)
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 9.0 3 68 46 1613 8.0 70
## [2,] 46.6 8 455 230 5140 24.8 82
cat("=== Mean ===\n")
## === Mean ===
print(sapply(Auto[, quant_vars], mean))
## mpg cylinders displacement horsepower weight acceleration
## 23.445918 5.471939 194.411990 104.469388 2977.584184 15.541327
## year
## 75.979592
cat("\n=== Standard Deviation ===\n")
##
## === Standard Deviation ===
print(sapply(Auto[, quant_vars], sd))
## mpg cylinders displacement horsepower weight acceleration
## 7.805007 1.705783 104.644004 38.491160 849.402560 2.758864
## year
## 3.683737
Auto_subset <- Auto[-(10:85), ]
cat("=== Range (subset) ===\n")
## === Range (subset) ===
print(sapply(Auto_subset[, quant_vars], range))
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 11.0 3 68 46 1649 8.5 70
## [2,] 46.6 8 455 230 4997 24.8 82
cat("\n=== Mean (subset) ===\n")
##
## === Mean (subset) ===
print(sapply(Auto_subset[, quant_vars], mean))
## mpg cylinders displacement horsepower weight acceleration
## 24.404430 5.373418 187.240506 100.721519 2935.971519 15.726899
## year
## 77.145570
cat("\n=== Standard Deviation (subset) ===\n")
##
## === Standard Deviation (subset) ===
print(sapply(Auto_subset[, quant_vars], sd))
## mpg cylinders displacement horsepower weight acceleration
## 7.867283 1.654179 99.678367 35.708853 811.300208 2.693721
## year
## 3.106217
# Scatterplot matrix of all quantitative variables
pairs(Auto[, quant_vars],
main = "Scatterplot Matrix of Auto Dataset",
col = "steelblue", pch = 19, cex = 0.5)
# Correlation heatmap
library(corrplot)
corr_matrix <- cor(Auto[, quant_vars])
corrplot(corr_matrix, method = "color", type = "upper",
addCoef.col = "black", number.cex = 0.7,
tl.col = "black", tl.srt = 45,
title = "Correlation Matrix", mar = c(0,0,1,0))
Findings: Strong negative correlations exist between
mpg and displacement, horsepower,
and weight. cylinders,
displacement, horsepower, and
weight are strongly positively correlated with each other.
year has a moderate positive correlation with
mpg, suggesting newer cars are more fuel-efficient.
par(mfrow = c(2, 3))
for (v in setdiff(quant_vars, "mpg")) {
plot(Auto[[v]], Auto$mpg,
xlab = v, ylab = "mpg",
main = paste("mpg vs", v),
col = "tomato", pch = 19, cex = 0.6)
abline(lm(mpg ~ Auto[[v]], data = Auto), col = "blue", lwd = 2)
}
par(mfrow = c(1, 1))
Conclusion: displacement,
horsepower, and weight show strong negative
relationships with mpg and are the most useful predictors.
year shows a positive trend (newer cars = better mileage).
cylinders and acceleration also have
associations but are slightly weaker.
pairs(Auto, main = "Scatterplot Matrix — Full Auto Dataset",
col = "steelblue", pch = 19, cex = 0.4)
name)Auto_num <- Auto[, sapply(Auto, is.numeric)] # drops 'name'
cor(Auto_num)
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
lm_fit <- lm(mpg ~ . - name, data = Auto)
summary(lm_fit)
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
Comments:
displacement, weight, year, and
origin have p-values < 0.05.year? The
positive coefficient (~0.75) means that, holding all else constant, each
additional model year is associated with about 0.75 more mpg — cars
became more fuel-efficient over time.par(mfrow = c(2, 2))
plot(lm_fit)
par(mfrow = c(1, 1))
Comments: The Residuals vs Fitted plot shows a slight non-linear pattern, suggesting the linear model may not fully capture the relationship. Observation 14 appears to have high leverage. No extremely large outliers, but the Scale-Location plot hints at mild heteroscedasticity.
# Test a few meaningful interactions
lm_interact <- lm(mpg ~ displacement * weight + horsepower * acceleration +
year * origin, data = Auto)
summary(lm_interact)
##
## Call:
## lm(formula = mpg ~ displacement * weight + horsepower * acceleration +
## year * origin, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8865 -1.7324 0.0457 1.4222 12.5838
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.183e+00 8.556e+00 0.840 0.401711
## displacement -6.754e-02 8.951e-03 -7.546 3.32e-13 ***
## weight -8.527e-03 8.517e-04 -10.013 < 2e-16 ***
## horsepower 4.468e-02 2.358e-02 1.895 0.058864 .
## acceleration 5.384e-01 1.514e-01 3.555 0.000425 ***
## year 5.044e-01 9.881e-02 5.104 5.25e-07 ***
## origin -1.241e+01 4.128e+00 -3.006 0.002822 **
## displacement:weight 1.916e-05 2.309e-06 8.297 1.85e-15 ***
## horsepower:acceleration -6.245e-03 1.692e-03 -3.690 0.000257 ***
## year:origin 1.662e-01 5.301e-02 3.136 0.001847 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.871 on 382 degrees of freedom
## Multiple R-squared: 0.8678, Adjusted R-squared: 0.8647
## F-statistic: 278.6 on 9 and 382 DF, p-value: < 2.2e-16
Comment: The interaction between
displacement:weight and year:origin tends to
be statistically significant, suggesting the combined effect of these
pairs matters beyond their individual contributions.
par(mfrow = c(1, 3))
# log(horsepower)
plot(log(Auto$horsepower), Auto$mpg,
xlab = "log(horsepower)", ylab = "mpg",
main = "mpg vs log(horsepower)", col = "steelblue", pch = 19, cex = 0.6)
abline(lm(mpg ~ log(horsepower), data = Auto), col = "red", lwd = 2)
# sqrt(horsepower)
plot(sqrt(Auto$horsepower), Auto$mpg,
xlab = "sqrt(horsepower)", ylab = "mpg",
main = "mpg vs sqrt(horsepower)", col = "darkgreen", pch = 19, cex = 0.6)
abline(lm(mpg ~ sqrt(horsepower), data = Auto), col = "red", lwd = 2)
# horsepower^2
plot(Auto$horsepower^2, Auto$mpg,
xlab = "horsepower^2", ylab = "mpg",
main = "mpg vs horsepower^2", col = "tomato", pch = 19, cex = 0.6)
abline(lm(mpg ~ I(horsepower^2), data = Auto), col = "blue", lwd = 2)
par(mfrow = c(1, 1))
# Compare R-squared for each transformation
lm_log <- lm(mpg ~ log(horsepower), data = Auto)
lm_sqrt <- lm(mpg ~ sqrt(horsepower), data = Auto)
lm_sq <- lm(mpg ~ I(horsepower^2), data = Auto)
cat("R-squared:\n")
## R-squared:
cat("log(hp): ", summary(lm_log)$r.squared, "\n")
## log(hp): 0.6683348
cat("sqrt(hp): ", summary(lm_sqrt)$r.squared, "\n")
## sqrt(hp): 0.6437036
cat("hp^2: ", summary(lm_sq)$r.squared, "\n")
## hp^2: 0.507367
Comment: The log transformation of
horsepower achieves the best linear fit with
mpg (highest R²), confirming a log-linear rather than
purely linear relationship.
data(Boston)
str(Boston)
## 'data.frame': 506 obs. of 13 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
predictors <- setdiff(names(Boston), "crim")
slr_results <- data.frame(predictor = predictors,
coef = NA,
p_value = NA)
for (i in seq_along(predictors)) {
fml <- as.formula(paste("crim ~", predictors[i]))
fit <- lm(fml, data = Boston)
s <- summary(fit)$coefficients
slr_results$coef[i] <- s[2, 1]
slr_results$p_value[i] <- s[2, 4]
}
slr_results$significant <- slr_results$p_value < 0.05
print(slr_results)
## predictor coef p_value significant
## 1 zn -0.07393498 5.506472e-06 TRUE
## 2 indus 0.50977633 1.450349e-21 TRUE
## 3 chas -1.89277655 2.094345e-01 FALSE
## 4 nox 31.24853120 3.751739e-23 TRUE
## 5 rm -2.68405122 6.346703e-07 TRUE
## 6 age 0.10778623 2.854869e-16 TRUE
## 7 dis -1.55090168 8.519949e-19 TRUE
## 8 rad 0.61791093 2.693844e-56 TRUE
## 9 tax 0.02974225 2.357127e-47 TRUE
## 10 ptratio 1.15198279 2.942922e-11 TRUE
## 11 lstat 0.54880478 2.654277e-27 TRUE
## 12 medv -0.36315992 1.173987e-19 TRUE
par(mfrow = c(4, 4))
for (p in predictors) {
plot(Boston[[p]], Boston$crim,
xlab = p, ylab = "crim",
main = paste("crim ~", p),
col = "steelblue", pch = 19, cex = 0.5)
abline(lm(as.formula(paste("crim ~", p)), data = Boston),
col = "red", lwd = 2)
}
par(mfrow = c(1, 1))
Finding: Almost all predictors show a statistically
significant association with crim in simple linear
regression. Notable ones include rad, tax,
lstat, and medv.
lm_boston <- lm(crim ~ ., data = Boston)
summary(lm_boston)
##
## Call:
## lm(formula = crim ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.534 -2.248 -0.348 1.087 73.923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.7783938 7.0818258 1.946 0.052271 .
## zn 0.0457100 0.0187903 2.433 0.015344 *
## indus -0.0583501 0.0836351 -0.698 0.485709
## chas -0.8253776 1.1833963 -0.697 0.485841
## nox -9.9575865 5.2898242 -1.882 0.060370 .
## rm 0.6289107 0.6070924 1.036 0.300738
## age -0.0008483 0.0179482 -0.047 0.962323
## dis -1.0122467 0.2824676 -3.584 0.000373 ***
## rad 0.6124653 0.0875358 6.997 8.59e-12 ***
## tax -0.0037756 0.0051723 -0.730 0.465757
## ptratio -0.3040728 0.1863598 -1.632 0.103393
## lstat 0.1388006 0.0757213 1.833 0.067398 .
## medv -0.2200564 0.0598240 -3.678 0.000261 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.46 on 493 degrees of freedom
## Multiple R-squared: 0.4493, Adjusted R-squared: 0.4359
## F-statistic: 33.52 on 12 and 493 DF, p-value: < 2.2e-16
Comment: In the multiple regression model, we can
reject H₀: βⱼ = 0 for predictors with p < 0.05: zn,
dis, rad, black, and
medv. Many predictors that were significant in simple
regression lose significance here due to collinearity.
mlr_coefs <- coef(lm_boston)[-1] # remove intercept
# Align
common_preds <- intersect(names(mlr_coefs), slr_results$predictor)
slr_coefs_aligned <- slr_results$coef[match(common_preds, slr_results$predictor)]
mlr_coefs_aligned <- mlr_coefs[common_preds]
plot(slr_coefs_aligned, mlr_coefs_aligned,
xlab = "Simple Regression Coefficients",
ylab = "Multiple Regression Coefficients",
main = "Univariate vs. Multiple Regression Coefficients",
col = "tomato", pch = 19, cex = 1.2)
abline(0, 1, lty = 2, col = "gray")
text(slr_coefs_aligned, mlr_coefs_aligned,
labels = common_preds, cex = 0.7, pos = 3)
Comment: Coefficients differ substantially,
especially for nox which has a large simple regression
coefficient but a very different multiple regression coefficient. This
reflects multicollinearity and confounding.
poly_results <- data.frame(predictor = predictors,
p_X2 = NA, p_X3 = NA)
for (i in seq_along(predictors)) {
p <- predictors[i]
fml <- as.formula(paste("crim ~", p, "+ I(", p, "^2) + I(", p, "^3)"))
fit <- lm(fml, data = Boston)
s <- summary(fit)$coefficients
if (nrow(s) >= 3) poly_results$p_X2[i] <- s[3, 4]
if (nrow(s) >= 4) poly_results$p_X3[i] <- s[4, 4]
}
poly_results$nonlinear <- (poly_results$p_X2 < 0.05 | poly_results$p_X3 < 0.05)
print(poly_results)
## predictor p_X2 p_X3 nonlinear
## 1 zn 9.375050e-02 2.295386e-01 FALSE
## 2 indus 3.420187e-10 1.196405e-12 TRUE
## 3 chas NA NA NA
## 4 nox 6.811300e-15 6.961110e-16 TRUE
## 5 rm 3.641094e-01 5.085751e-01 FALSE
## 6 age 4.737733e-02 6.679915e-03 TRUE
## 7 dis 4.941214e-12 1.088832e-08 TRUE
## 8 rad 6.130099e-01 4.823138e-01 FALSE
## 9 tax 1.374682e-01 2.438507e-01 FALSE
## 10 ptratio 4.119552e-03 6.300514e-03 TRUE
## 11 lstat 6.458736e-02 1.298906e-01 FALSE
## 12 medv 3.260523e-18 1.046510e-12 TRUE
Comment: Several predictors (e.g.,
indus, nox, age,
dis, ptratio, medv) show
significant non-linear (quadratic or cubic) terms, indicating non-linear
associations with crim.
data(Weekly)
str(Weekly)
## 'data.frame': 1089 obs. of 9 variables:
## $ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
pairs(Weekly[, -9],
col = ifelse(Weekly$Direction == "Up", "steelblue", "tomato"),
pch = 19, cex = 0.4,
main = "Weekly Data — Blue=Up, Red=Down")
# Volume over time
plot(Weekly$Volume,
type = "l", col = "steelblue", lwd = 1.5,
xlab = "Week Index", ylab = "Volume",
main = "Trading Volume Over Time")
Patterns: Volume has increased substantially over
time. The lag variables show very little autocorrelation with today’s
returns. Direction is slightly more often “Up” than
“Down”.
glm_fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly,
family = binomial)
summary(glm_fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Comment: Only Lag2 appears
statistically significant (p ≈ 0.03). The other predictors do not show
significant associations with Direction.
glm_probs <- predict(glm_fit, type = "response")
glm_pred <- ifelse(glm_probs > 0.5, "Up", "Down")
conf_mat <- table(Predicted = glm_pred, Actual = Weekly$Direction)
print(conf_mat)
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
# Overall fraction correct
accuracy <- mean(glm_pred == Weekly$Direction)
cat("\nOverall accuracy:", round(accuracy, 4), "\n")
##
## Overall accuracy: 0.5611
Interpretation: The model predicts “Up” most of the time. It correctly identifies most “Up” weeks but misclassifies most “Down” weeks — the model is biased toward predicting market increases.
train <- Weekly$Year <= 2008
test <- Weekly[!train, ]
glm_lag2 <- glm(Direction ~ Lag2,
data = Weekly,
subset = train,
family = binomial)
# Predictions on held-out data
lag2_probs <- predict(glm_lag2, newdata = test, type = "response")
lag2_pred <- ifelse(lag2_probs > 0.5, "Up", "Down")
conf_mat2 <- table(Predicted = lag2_pred, Actual = test$Direction)
print(conf_mat2)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
accuracy2 <- mean(lag2_pred == test$Direction)
cat("\nHeld-out accuracy (2009–2010):", round(accuracy2, 4), "\n")
##
## Held-out accuracy (2009–2010): 0.625
Comment: With only Lag2 as predictor
and evaluated on held-out data (2009–2010), the model achieves better
accuracy than using all predictors on the full data, suggesting that
Lag2 alone captures meaningful signal and the other lags
added noise. ```