Make sure that the missing values have been removed from the data.
## Dimensions after removing NA: 392 9
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
Based on the structure of the dataset:
Quantitative predictors (continuous or discrete
numeric measures): mpg, cylinders,
displacement, horsepower, weight,
acceleration, year
Qualitative predictors (categorical):
origin (coded 1/2/3 for American/European/Japanese —
numerically stored but represents distinct categories), and
name (character — car model name, unique identifier).
quant_vars <- Auto[, c("mpg", "cylinders", "displacement",
"horsepower", "weight", "acceleration", "year")]
range_df <- data.frame(
Min = sapply(quant_vars, min),
Max = sapply(quant_vars, max)
)
kable(range_df, caption = "Range of Quantitative Predictors") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)| Min | Max | |
|---|---|---|
| mpg | 9 | 46.6 |
| cylinders | 3 | 8.0 |
| displacement | 68 | 455.0 |
| horsepower | 46 | 230.0 |
| weight | 1613 | 5140.0 |
| acceleration | 8 | 24.8 |
| year | 70 | 82.0 |
summary_stats <- data.frame(
Mean = round(sapply(quant_vars, mean), 2),
SD = round(sapply(quant_vars, sd), 2)
)
kable(summary_stats, caption = "Mean and Standard Deviation of Quantitative Predictors") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)| Mean | SD | |
|---|---|---|
| mpg | 23.45 | 7.81 |
| cylinders | 5.47 | 1.71 |
| displacement | 194.41 | 104.64 |
| horsepower | 104.47 | 38.49 |
| weight | 2977.58 | 849.40 |
| acceleration | 15.54 | 2.76 |
| year | 75.98 | 3.68 |
Auto_sub <- Auto[-(10:85), ]
quant_sub <- Auto_sub[, c("mpg", "cylinders", "displacement",
"horsepower", "weight", "acceleration", "year")]
full_stats <- data.frame(
Full_Min = round(sapply(quant_vars, min), 2),
Full_Max = round(sapply(quant_vars, max), 2),
Full_Mean = round(sapply(quant_vars, mean), 2),
Full_SD = round(sapply(quant_vars, sd), 2),
Sub_Min = round(sapply(quant_sub, min), 2),
Sub_Max = round(sapply(quant_sub, max), 2),
Sub_Mean = round(sapply(quant_sub, mean), 2),
Sub_SD = round(sapply(quant_sub, sd), 2)
)
kable(full_stats,
caption = "Full Dataset vs. Subset (rows 10–85 removed): Range, Mean, SD") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
add_header_above(c(" " = 1, "Full Dataset" = 4, "Subset" = 4))| Full_Min | Full_Max | Full_Mean | Full_SD | Sub_Min | Sub_Max | Sub_Mean | Sub_SD | |
|---|---|---|---|---|---|---|---|---|
| mpg | 9 | 46.6 | 23.45 | 7.81 | 11.0 | 46.6 | 24.40 | 7.87 |
| cylinders | 3 | 8.0 | 5.47 | 1.71 | 3.0 | 8.0 | 5.37 | 1.65 |
| displacement | 68 | 455.0 | 194.41 | 104.64 | 68.0 | 455.0 | 187.24 | 99.68 |
| horsepower | 46 | 230.0 | 104.47 | 38.49 | 46.0 | 230.0 | 100.72 | 35.71 |
| weight | 1613 | 5140.0 | 2977.58 | 849.40 | 1649.0 | 4997.0 | 2935.97 | 811.30 |
| acceleration | 8 | 24.8 | 15.54 | 2.76 | 8.5 | 24.8 | 15.73 | 2.69 |
| year | 70 | 82.0 | 75.98 | 3.68 | 70.0 | 82.0 | 77.15 | 3.11 |
Comparison: Removing observations 10–85 (which
correspond largely to early-model-year vehicles) narrows the ranges of
several variables. Notably, horsepower and
displacement show reduced maximums in the subset, and the
mean mpg increases slightly — consistent with the removed
rows containing heavier, less efficient cars from the early 1970s.
Standard deviations generally decrease as the subset becomes more
homogeneous.
pairs(quant_vars, pch = 19, cex = 0.5, col = "steelblue",
main = "Scatterplot Matrix — Auto Dataset (Quantitative Predictors)")Findings:
displacement, horsepower, and
weight are strongly positively correlated
with each other — larger engines tend to be both heavier and more
powerful.cylinders follows the same pattern, rising with
displacement, horsepower, and weight.mpg shows a clear negative
relationship with displacement, horsepower, and weight — more powerful,
heavier cars are less fuel efficient.acceleration is negatively correlated
with horsepower; high-horsepower cars accelerate faster (lower 0–60 time
implied).year has a mild positive relationship
with mpg, reflecting fuel economy improvements over time,
and is largely independent of the mechanical variables.par(mfrow = c(2, 3))
for (var in c("cylinders", "displacement", "horsepower", "weight", "acceleration", "year")) {
plot(Auto[[var]], Auto$mpg,
main = paste("mpg vs", var),
xlab = var, ylab = "mpg",
pch = 19, cex = 0.6, col = "steelblue")
abline(lm(Auto$mpg ~ Auto[[var]]), col = "red", lwd = 2)
}cor_with_mpg <- round(cor(quant_vars)[, "mpg"], 3)
cor_df <- data.frame(
Predictor = names(cor_with_mpg[-1]),
Correlation_with_mpg = cor_with_mpg[-1]
)
cor_df <- cor_df[order(abs(cor_df$Correlation_with_mpg), decreasing = TRUE), ]
rownames(cor_df) <- NULL
kable(cor_df, caption = "Pearson Correlation of Each Predictor with mpg") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)| Predictor | Correlation_with_mpg |
|---|---|
| weight | -0.832 |
| displacement | -0.805 |
| cylinders | -0.778 |
| horsepower | -0.778 |
| year | 0.581 |
| acceleration | 0.423 |
Conclusion: weight (r = -0.832),
displacement (r = -0.805), and horsepower (r =
-0.778) show the strongest negative correlations with mpg.
year shows a moderate positive correlation (r ≈ 0.581),
reflecting the steady improvement in fuel economy over model years.
Based on both the plots and correlation table, weight,
displacement, horsepower, and
year are the most useful predictors of
mpg.
Auto_num <- Auto[, !names(Auto) %in% "name"]
pairs(Auto_num, pch = 19, cex = 0.5, col = "green",
main = "Scatterplot Matrix — All Auto Variables (incl. origin)")cor_matrix <- round(cor(Auto_num), 2)
kable(cor_matrix, caption = "Correlation Matrix — Auto Dataset (excluding name)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
scroll_box(width = "100%")| mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | |
|---|---|---|---|---|---|---|---|---|
| mpg | 1.00 | -0.78 | -0.81 | -0.78 | -0.83 | 0.42 | 0.58 | 0.57 |
| cylinders | -0.78 | 1.00 | 0.95 | 0.84 | 0.90 | -0.50 | -0.35 | -0.57 |
| displacement | -0.81 | 0.95 | 1.00 | 0.90 | 0.93 | -0.54 | -0.37 | -0.61 |
| horsepower | -0.78 | 0.84 | 0.90 | 1.00 | 0.86 | -0.69 | -0.42 | -0.46 |
| weight | -0.83 | 0.90 | 0.93 | 0.86 | 1.00 | -0.42 | -0.31 | -0.59 |
| acceleration | 0.42 | -0.50 | -0.54 | -0.69 | -0.42 | 1.00 | 0.29 | 0.21 |
| year | 0.58 | -0.35 | -0.37 | -0.42 | -0.31 | 0.29 | 1.00 | 0.18 |
| origin | 0.57 | -0.57 | -0.61 | -0.46 | -0.59 | 0.21 | 0.18 | 1.00 |
The correlation matrix confirms the patterns observed in the
scatterplot. displacement, horsepower,
weight, and cylinders are all very highly
inter-correlated (|r| > 0.80), suggesting potential
multicollinearity in regression models that include all
of them simultaneously.
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
lm_coefs <- as.data.frame(coef(summary(lm_fit)))
lm_coefs <- round(lm_coefs, 4)
names(lm_coefs) <- c("Estimate", "Std. Error", "t value", "p value")
kable(lm_coefs, caption = "MLR Coefficient Table: mpg ~ all predictors except name") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE) %>%
row_spec(which(lm_coefs[["p value"]] < 0.05), bold = TRUE, color = "white", background = "#2c7bb6")| Estimate | Std. Error | t value | p value | |
|---|---|---|---|---|
| (Intercept) | -17.2184 | 4.6443 | -3.7074 | 0.0002 |
| cylinders | -0.4934 | 0.3233 | -1.5261 | 0.1278 |
| displacement | 0.0199 | 0.0075 | 2.6474 | 0.0084 |
| horsepower | -0.0170 | 0.0138 | -1.2295 | 0.2196 |
| weight | -0.0065 | 0.0007 | -9.9288 | 0.0000 |
| acceleration | 0.0806 | 0.0988 | 0.8152 | 0.4155 |
| year | 0.7508 | 0.0510 | 14.7288 | 0.0000 |
| origin | 1.4261 | 0.2781 | 5.1275 | 0.0000 |
Comments:
i. Is there a relationship between the predictors and the
response? Yes. The overall F-statistic is 252.43 on 7 and 384
degrees of freedom, with p-value < 2.2e-16. This provides
overwhelming evidence that at least one predictor has a significant
relationship with mpg. The model explains 82.1% of the
variance in mpg (R² = 0.8215).
ii. Which predictors are statistically significant?
Highlighted in blue above (p < 0.05): displacement,
weight, year, and origin all have
statistically significant relationships with mpg. Despite
its high simple correlation with mpg,
horsepower is not significant in the multiple regression —
likely due to multicollinearity with displacement and
weight.
iii. Coefficient for year: The
coefficient for year is 0.7508. Holding all other
predictors constant, each additional model year is associated with
approximately a 0.75 mpg increase in fuel efficiency. This reflects
regulatory pressure and engineering improvements that drove better fuel
economy throughout the 1970s and 1980s.
Comments:
mpg. A non-linear or transformed
model may be more appropriate.lm_interact1 <- lm(mpg ~ . - name + displacement:weight, data = Auto)
lm_interact2 <- lm(mpg ~ . - name + horsepower:weight, data = Auto)
lm_interact3 <- lm(mpg ~ . - name + cylinders:displacement, data = Auto)
model_compare <- data.frame(
Model = c("Base MLR",
"+ displacement:weight",
"+ horsepower:weight",
"+ cylinders:displacement"),
AIC = round(c(AIC(lm_fit), AIC(lm_interact1),
AIC(lm_interact2), AIC(lm_interact3)), 2),
Adj_R2 = round(c(summary(lm_fit)$adj.r.squared,
summary(lm_interact1)$adj.r.squared,
summary(lm_interact2)$adj.r.squared,
summary(lm_interact3)$adj.r.squared), 4)
)
kable(model_compare, caption = "Model Comparison: Base vs. Interaction Models") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)| Model | AIC | Adj_R2 |
|---|---|---|
| Base MLR | 2064.95 | 0.8182 |
|
1975.14 | 0.8558 |
|
1966.49 | 0.8590 |
|
2007.67 | 0.8433 |
##
## Call:
## lm(formula = mpg ~ . - name + horsepower:weight, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.589 -1.617 -0.184 1.541 12.001
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.876e+00 4.511e+00 0.638 0.524147
## cylinders -2.955e-02 2.881e-01 -0.103 0.918363
## displacement 5.950e-03 6.750e-03 0.881 0.378610
## horsepower -2.313e-01 2.363e-02 -9.791 < 2e-16 ***
## weight -1.121e-02 7.285e-04 -15.393 < 2e-16 ***
## acceleration -9.019e-02 8.855e-02 -1.019 0.309081
## year 7.695e-01 4.494e-02 17.124 < 2e-16 ***
## origin 8.344e-01 2.513e-01 3.320 0.000986 ***
## horsepower:weight 5.529e-05 5.227e-06 10.577 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.931 on 383 degrees of freedom
## Multiple R-squared: 0.8618, Adjusted R-squared: 0.859
## F-statistic: 298.6 on 8 and 383 DF, p-value: < 2.2e-16
Comment: Adding the horsepower:weight
interaction yields the lowest AIC and highest adjusted R², making it the
best-performing model among those tested. The interaction is
statistically significant (p < 0.05), indicating that the effect of
horsepower on fuel economy depends on the weight of the
vehicle — a heavier, high-horsepower car suffers a compounding fuel
efficiency penalty beyond what either variable predicts on its own.
lm_base <- lm(mpg ~ horsepower, data = Auto)
lm_log <- lm(mpg ~ log(horsepower), data = Auto)
lm_sqrt <- lm(mpg ~ sqrt(horsepower), data = Auto)
lm_sq <- lm(mpg ~ horsepower + I(horsepower^2), data = Auto)
transform_compare <- data.frame(
Transformation = c("Linear (baseline)", "log(horsepower)",
"sqrt(horsepower)", "horsepower + horsepower²"),
R_squared = round(c(summary(lm_base)$r.squared, summary(lm_log)$r.squared,
summary(lm_sqrt)$r.squared, summary(lm_sq)$r.squared), 4),
AIC = round(c(AIC(lm_base), AIC(lm_log), AIC(lm_sqrt), AIC(lm_sq)), 2)
)
kable(transform_compare,
caption = "Comparison of Transformations: horsepower predicting mpg") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)| Transformation | R_squared | AIC |
|---|---|---|
| Linear (baseline) | 0.6059 | 2363.32 |
| log(horsepower) | 0.6683 | 2295.76 |
| sqrt(horsepower) | 0.6437 | 2323.84 |
| horsepower + horsepower² | 0.6876 | 2274.35 |
par(mfrow = c(1, 3))
plot(log(Auto$horsepower), Auto$mpg,
main = "mpg vs log(horsepower)", xlab = "log(horsepower)", ylab = "mpg",
pch = 19, cex = 0.6, col = "steelblue")
abline(lm_log, col = "red", lwd = 2)
plot(sqrt(Auto$horsepower), Auto$mpg,
main = "mpg vs sqrt(horsepower)", xlab = "sqrt(horsepower)", ylab = "mpg",
pch = 19, cex = 0.6, col = "green")
abline(lm_sqrt, col = "red", lwd = 2)
hp_seq <- seq(min(Auto$horsepower), max(Auto$horsepower), length.out = 300)
pred_sq <- predict(lm_sq, newdata = data.frame(horsepower = hp_seq))
plot(Auto$horsepower, Auto$mpg,
main = "mpg vs horsepower (quadratic)", xlab = "horsepower", ylab = "mpg",
pch = 19, cex = 0.6, col = "seagreen")
lines(hp_seq, pred_sq, col = "red", lwd = 2)Findings: All three transformations outperform the
simple linear model in both R² and AIC. The log(horsepower)
transformation achieves the highest R² (0.6683) and lowest AIC, and
produces the most linear scatterplot — strongly suggesting the
relationship between horsepower and mpg is multiplicative rather than
additive. The quadratic model also fits well, but the log transformation
is more parsimonious (one parameter vs. two) and more interpretable.
## Dimensions: 506 13
## Variables: crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, lstat, medv
The response variable is crim (per capita crime rate by
town). All other variables serve as predictors.
predictors <- setdiff(names(Boston), "crim")
slr_results <- lapply(predictors, function(var) {
fit <- lm(as.formula(paste("crim ~", var)), data = Boston)
s <- summary(fit)
data.frame(
Predictor = var,
Coefficient = round(coef(fit)[2], 4),
p_value = round(coef(s)[2, 4], 4),
R_squared = round(s$r.squared, 4),
Significant = ifelse(coef(s)[2, 4] < 0.05, "Yes", "No")
)
})
slr_df <- do.call(rbind, slr_results)
slr_df <- slr_df[order(slr_df$p_value), ]
rownames(slr_df) <- NULL
kable(slr_df, caption = "Simple Linear Regression Results: crim ~ each predictor") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE) %>%
row_spec(which(slr_df$Significant == "Yes"), bold = TRUE, color = "white", background = "#2c7bb6") %>%
row_spec(which(slr_df$Significant == "No"), color = "gray")| Predictor | Coefficient | p_value | R_squared | Significant |
|---|---|---|---|---|
| zn | -0.0739 | 0.0000 | 0.0402 | Yes |
| indus | 0.5098 | 0.0000 | 0.1653 | Yes |
| nox | 31.2485 | 0.0000 | 0.1772 | Yes |
| rm | -2.6841 | 0.0000 | 0.0481 | Yes |
| age | 0.1078 | 0.0000 | 0.1244 | Yes |
| dis | -1.5509 | 0.0000 | 0.1441 | Yes |
| rad | 0.6179 | 0.0000 | 0.3913 | Yes |
| tax | 0.0297 | 0.0000 | 0.3396 | Yes |
| ptratio | 1.1520 | 0.0000 | 0.0841 | Yes |
| lstat | 0.5488 | 0.0000 | 0.2076 | Yes |
| medv | -0.3632 | 0.0000 | 0.1508 | Yes |
| chas | -1.8928 | 0.2094 | 0.0031 | No |
par(mfrow = c(3, 5))
for (var in predictors) {
plot(Boston[[var]], Boston$crim,
xlab = var, ylab = "crim",
pch = 19, cex = 0.4, col = "steelblue",
main = paste("crim ~", var))
abline(lm(as.formula(paste("crim ~", var)), data = Boston), col = "red", lwd = 1.5)
}
par(mfrow = c(1, 1))Results: 11 out of 12 predictors have a
statistically significant simple linear association with
crim (p < 0.05). The only non-significant predictor is
chas (a dummy for proximity to the Charles River),
suggesting that river adjacency alone has no linear association with
crime rate. Predictors such as rad (highway accessibility),
tax (property tax rate), and lstat (%
lower-status population) show particularly strong associations with
higher crime.
##
## Call:
## lm(formula = crim ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.534 -2.248 -0.348 1.087 73.923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.7783938 7.0818258 1.946 0.052271 .
## zn 0.0457100 0.0187903 2.433 0.015344 *
## indus -0.0583501 0.0836351 -0.698 0.485709
## chas -0.8253776 1.1833963 -0.697 0.485841
## nox -9.9575865 5.2898242 -1.882 0.060370 .
## rm 0.6289107 0.6070924 1.036 0.300738
## age -0.0008483 0.0179482 -0.047 0.962323
## dis -1.0122467 0.2824676 -3.584 0.000373 ***
## rad 0.6124653 0.0875358 6.997 8.59e-12 ***
## tax -0.0037756 0.0051723 -0.730 0.465757
## ptratio -0.3040728 0.1863598 -1.632 0.103393
## lstat 0.1388006 0.0757213 1.833 0.067398 .
## medv -0.2200564 0.0598240 -3.678 0.000261 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.46 on 493 degrees of freedom
## Multiple R-squared: 0.4493, Adjusted R-squared: 0.4359
## F-statistic: 33.52 on 12 and 493 DF, p-value: < 2.2e-16
mlr_coefs <- as.data.frame(coef(summary(mlr_fit)))
mlr_coefs <- round(mlr_coefs, 4)
names(mlr_coefs) <- c("Estimate", "Std. Error", "t value", "p value")
kable(mlr_coefs, caption = "MLR Coefficient Table: crim ~ all predictors") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE) %>%
row_spec(which(mlr_coefs[["p value"]] < 0.05), bold = TRUE, color = "white", background = "#2c7bb6")| Estimate | Std. Error | t value | p value | |
|---|---|---|---|---|
| (Intercept) | 13.7784 | 7.0818 | 1.9456 | 0.0523 |
| zn | 0.0457 | 0.0188 | 2.4326 | 0.0153 |
| indus | -0.0584 | 0.0836 | -0.6977 | 0.4857 |
| chas | -0.8254 | 1.1834 | -0.6975 | 0.4858 |
| nox | -9.9576 | 5.2898 | -1.8824 | 0.0604 |
| rm | 0.6289 | 0.6071 | 1.0359 | 0.3007 |
| age | -0.0008 | 0.0179 | -0.0473 | 0.9623 |
| dis | -1.0122 | 0.2825 | -3.5836 | 0.0004 |
| rad | 0.6125 | 0.0875 | 6.9967 | 0.0000 |
| tax | -0.0038 | 0.0052 | -0.7300 | 0.4658 |
| ptratio | -0.3041 | 0.1864 | -1.6316 | 0.1034 |
| lstat | 0.1388 | 0.0757 | 1.8330 | 0.0674 |
| medv | -0.2201 | 0.0598 | -3.6784 | 0.0003 |
Results: In the multiple regression model (R² =
0.4493), only a subset of predictors remain statistically significant:
zn, dis, rad, black,
and medv. We can reject H₀: βⱼ = 0 for these variables at
the 0.05 level. Many predictors that were significant in simple
regression (e.g., nox, lstat,
tax) lose significance when controlling for other variables
— a hallmark of multicollinearity and confounding.
uni_coefs <- setNames(slr_df$Coefficient, slr_df$Predictor)
multi_coefs <- coef(mlr_fit)[-1]
common <- intersect(names(uni_coefs), names(multi_coefs))
plot(uni_coefs[common], multi_coefs[common],
xlab = "Simple Regression Coefficient",
ylab = "Multiple Regression Coefficient",
main = "Univariate vs. Multiple Regression Coefficients\n(each point = one predictor)",
pch = 19, col = "steelblue", cex = 1.2)
abline(0, 1, lty = 2, col = "gray50")
abline(h = 0, lty = 3, col = "gray80")
abline(v = 0, lty = 3, col = "gray80")
text(uni_coefs[common], multi_coefs[common], labels = common, cex = 0.65, pos = 3)Comment: Several predictors show dramatic
differences between their simple and multiple regression coefficients —
most notably nox, which has a large positive simple
regression coefficient but a large negative coefficient in the multiple
model. This reversal indicates confounding: nox is highly
correlated with other predictors, so its apparent relationship with
crim changes sign once those predictors are controlled for.
Points lying far from the dashed 45° line are most affected by
multicollinearity.
poly_results <- lapply(predictors, function(var) {
if (length(unique(Boston[[var]])) < 4) return(NULL)
fit_lin <- lm(as.formula(paste("crim ~", var)), data = Boston)
fit_poly <- lm(as.formula(paste("crim ~", var,
"+ I(", var, "^2) + I(", var, "^3)")),
data = Boston)
s <- summary(fit_poly)
coefs <- coef(s)
anova_p <- anova(fit_lin, fit_poly)[2, "Pr(>F)"]
data.frame(
Predictor = var,
Linear_R2 = round(summary(fit_lin)$r.squared, 4),
Poly_R2 = round(s$r.squared, 4),
p_X2 = ifelse(nrow(coefs) >= 3, round(coefs[3, 4], 4), NA),
p_X3 = ifelse(nrow(coefs) >= 4, round(coefs[4, 4], 4), NA),
ANOVA_p = round(anova_p, 4),
NonLinear_Sig = ifelse(anova_p < 0.05, "Yes", "No")
)
})
poly_df <- do.call(rbind, Filter(Negate(is.null), poly_results))
poly_df <- poly_df[order(poly_df$ANOVA_p), ]
rownames(poly_df) <- NULL
kable(poly_df,
caption = "Polynomial Regression: Linear vs. Cubic Fit (ANOVA F-test for non-linearity)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
row_spec(which(poly_df$NonLinear_Sig == "Yes"), bold = TRUE, color = "white", background = "#2c7bb6") %>%
scroll_box(width = "100%")| Predictor | Linear_R2 | Poly_R2 | p_X2 | p_X3 | ANOVA_p | NonLinear_Sig |
|---|---|---|---|---|---|---|
| indus | 0.1653 | 0.2597 | 0.0000 | 0.0000 | 0.0000 | Yes |
| nox | 0.1772 | 0.2970 | 0.0000 | 0.0000 | 0.0000 | Yes |
| age | 0.1244 | 0.1742 | 0.0474 | 0.0067 | 0.0000 | Yes |
| dis | 0.1441 | 0.2778 | 0.0000 | 0.0000 | 0.0000 | Yes |
| tax | 0.3396 | 0.3689 | 0.1375 | 0.2439 | 0.0000 | Yes |
| medv | 0.1508 | 0.4202 | 0.0000 | 0.0000 | 0.0000 | Yes |
| ptratio | 0.0841 | 0.1138 | 0.0041 | 0.0063 | 0.0003 | Yes |
| rm | 0.0481 | 0.0678 | 0.3641 | 0.5086 | 0.0052 | Yes |
| zn | 0.0402 | 0.0582 | 0.0938 | 0.2295 | 0.0085 | Yes |
| rad | 0.3913 | 0.4000 | 0.6130 | 0.4823 | 0.0261 | Yes |
| lstat | 0.2076 | 0.2179 | 0.0646 | 0.1299 | 0.0370 | Yes |
Findings: 11 predictors show statistically
significant non-linear associations with crim based on the
ANOVA F-test comparing the linear vs. cubic polynomial model (p <
0.05). These include dis, nox,
age, lstat, medv, and others. For
these predictors, a simple linear model is insufficient — the
relationship with per-capita crime rate follows a curved pattern,
suggesting that transformations or non-linear modelling approaches would
improve fit.
## Dimensions: 1089 9
## Period: 1990 to 2010
## Direction counts:
##
## Down Up
## 484 605
weekly_num_summary <- t(sapply(Weekly[, sapply(Weekly, is.numeric)],
function(x) round(c(Mean=mean(x), SD=sd(x),
Min=min(x), Max=max(x)), 3)))
kable(weekly_num_summary, caption = "Summary Statistics — Weekly Dataset") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)| Mean | SD | Min | Max | |
|---|---|---|---|---|
| Year | 2000.049 | 6.033 | 1990.000 | 2010.000 |
| Lag1 | 0.151 | 2.357 | -18.195 | 12.026 |
| Lag2 | 0.151 | 2.357 | -18.195 | 12.026 |
| Lag3 | 0.147 | 2.361 | -18.195 | 12.026 |
| Lag4 | 0.146 | 2.360 | -18.195 | 12.026 |
| Lag5 | 0.140 | 2.361 | -18.195 | 12.026 |
| Volume | 1.575 | 1.687 | 0.087 | 9.328 |
| Today | 0.150 | 2.357 | -18.195 | 12.026 |
weekly_num <- Weekly[, sapply(Weekly, is.numeric)]
cor_weekly <- round(cor(weekly_num), 3)
kable(cor_weekly, caption = "Correlation Matrix — Weekly (numeric variables)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed")) %>%
scroll_box(width = "100%")| Year | Lag1 | Lag2 | Lag3 | Lag4 | Lag5 | Volume | Today | |
|---|---|---|---|---|---|---|---|---|
| Year | 1.000 | -0.032 | -0.033 | -0.030 | -0.031 | -0.031 | 0.842 | -0.032 |
| Lag1 | -0.032 | 1.000 | -0.075 | 0.059 | -0.071 | -0.008 | -0.065 | -0.075 |
| Lag2 | -0.033 | -0.075 | 1.000 | -0.076 | 0.058 | -0.072 | -0.086 | 0.059 |
| Lag3 | -0.030 | 0.059 | -0.076 | 1.000 | -0.075 | 0.061 | -0.069 | -0.071 |
| Lag4 | -0.031 | -0.071 | 0.058 | -0.075 | 1.000 | -0.076 | -0.061 | -0.008 |
| Lag5 | -0.031 | -0.008 | -0.072 | 0.061 | -0.076 | 1.000 | -0.059 | 0.011 |
| Volume | 0.842 | -0.065 | -0.086 | -0.069 | -0.061 | -0.059 | 1.000 | -0.033 |
| Today | -0.032 | -0.075 | 0.059 | -0.071 | -0.008 | 0.011 | -0.033 | 1.000 |
par(mfrow = c(2, 3))
plot(Weekly$Year, Weekly$Volume,
main = "Trading Volume Over Time",
xlab = "Year", ylab = "Volume (billions of shares)",
pch = 19, cex = 0.4, col = "steelblue")
lines(lowess(Weekly$Year, Weekly$Volume), col = "red", lwd = 2)
boxplot(Today ~ Direction, data = Weekly,
main = "Weekly Return by Direction",
xlab = "Direction", ylab = "Return (%)",
col = c("salmon", "lightblue"))
hist(Weekly$Today, breaks = 60,
main = "Distribution of Weekly Returns",
xlab = "Return (%)", col = "steelblue", border = "white")
abline(v = mean(Weekly$Today), col = "red", lwd = 2, lty = 2)
plot(Weekly$Lag1, Weekly$Today,
main = "Today vs Lag1",
xlab = "Lag1 Return (%)", ylab = "Today's Return (%)",
pch = 19, cex = 0.4, col = "green")
abline(lm(Today ~ Lag1, data = Weekly), col = "red", lwd = 2)
plot(Weekly$Lag2, Weekly$Today,
main = "Today vs Lag2",
xlab = "Lag2 Return (%)", ylab = "Today's Return (%)",
pch = 19, cex = 0.4, col = "seagreen")
abline(lm(Today ~ Lag2, data = Weekly), col = "red", lwd = 2)
barplot(table(Weekly$Direction),
main = "Count of Up vs Down Weeks",
col = c("salmon", "lightblue"),
ylab = "Frequency")Patterns:
Today) are
approximately symmetric and centered near zero (mean = 0.15%),
consistent with a weak-form efficient market.Today (all |r| < 0.07), consistent
with the efficient market hypothesis. Volume and
Year are strongly correlated (r = 0.842), confirming the
time trend in trading activity.Lag2 shows a marginally larger absolute correlation with
Today.glm_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly, family = binomial)
summary(glm_full)##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
glm_coefs <- as.data.frame(coef(summary(glm_full)))
glm_coefs <- round(glm_coefs, 4)
names(glm_coefs) <- c("Estimate", "Std. Error", "z value", "p value")
kable(glm_coefs, caption = "Logistic Regression Coefficients (Full Model)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE) %>%
row_spec(which(glm_coefs[["p value"]] < 0.05), bold = TRUE, color = "white", background = "#2c7bb6")| Estimate | Std. Error | z value | p value | |
|---|---|---|---|---|
| (Intercept) | 0.2669 | 0.0859 | 3.1056 | 0.0019 |
| Lag1 | -0.0413 | 0.0264 | -1.5626 | 0.1181 |
| Lag2 | 0.0584 | 0.0269 | 2.1754 | 0.0296 |
| Lag3 | -0.0161 | 0.0267 | -0.6024 | 0.5469 |
| Lag4 | -0.0278 | 0.0265 | -1.0501 | 0.2937 |
| Lag5 | -0.0145 | 0.0264 | -0.5485 | 0.5833 |
| Volume | -0.0227 | 0.0369 | -0.6163 | 0.5377 |
Comment: Among all six predictors, only
Lag2 is statistically significant (p = 0.0296). Its
positive coefficient (0.0584) suggests that a positive return two weeks
ago is associated with a slightly higher probability of an “Up” week.
All other lag terms and Volume are not significant,
suggesting they carry little additional predictive value for market
direction beyond what Lag2 already captures.
prob_full <- predict(glm_full, type = "response")
pred_full <- ifelse(prob_full > 0.5, "Up", "Down")
conf_full <- table(Predicted = pred_full, Actual = Weekly$Direction)
print(conf_full)## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
TP <- conf_full["Up", "Up"]
TN <- conf_full["Down", "Down"]
FP <- conf_full["Up", "Down"]
FN <- conf_full["Down", "Up"]
accuracy <- round((TP + TN) / sum(conf_full), 4)
sensitivity <- round(TP / (TP + FN), 4)
specificity <- round(TN / (TN + FP), 4)
perf_df <- data.frame(
Metric = c("Overall Accuracy",
"Sensitivity (Up correctly identified)",
"Specificity (Down correctly identified)"),
Value = c(accuracy, sensitivity, specificity)
)
kable(perf_df, caption = "Performance Metrics — Full Logistic Regression (in-sample)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)| Metric | Value |
|---|---|
| Overall Accuracy | 0.5611 |
| Sensitivity (Up correctly identified) | 0.9207 |
| Specificity (Down correctly identified) | 0.1116 |
Interpretation: The overall accuracy of 56.11% is only marginally better than a naive classifier that always predicts “Up” (55.6%). The model has high sensitivity (92.07%) — correctly identifying “Up” weeks — but very low specificity (11.16%) — misclassifying the large majority of “Down” weeks as “Up”. This is a classic problem with imbalanced binary outcomes: the model learns to default to the majority class. A threshold other than 0.5 could improve specificity at the cost of sensitivity.
train_idx <- Weekly$Year <= 2008
test <- Weekly[!train_idx, ]
glm_lag2 <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train_idx)
summary(glm_lag2)##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = Weekly,
## subset = train_idx)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
prob_test <- predict(glm_lag2, newdata = test, type = "response")
pred_test <- ifelse(prob_test > 0.5, "Up", "Down")
conf_test <- table(Predicted = pred_test, Actual = test$Direction)
print(conf_test)## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
TP2 <- conf_test["Up", "Up"]
TN2 <- conf_test["Down", "Down"]
FP2 <- conf_test["Up", "Down"]
FN2 <- conf_test["Down", "Up"]
acc2 <- round((TP2 + TN2) / sum(conf_test), 4)
sens2 <- round(TP2 / (TP2 + FN2), 4)
spec2 <- round(TN2 / (TN2 + FP2), 4)
perf2_df <- data.frame(
Metric = c("Overall Accuracy",
"Sensitivity (Up correctly identified)",
"Specificity (Down correctly identified)"),
Full_Model = c(accuracy, sensitivity, specificity),
Lag2_Test = c(acc2, sens2, spec2)
)
kable(perf2_df,
caption = "Performance: Full Model (in-sample) vs. Lag2-only (out-of-sample 2009–2010)",
col.names = c("Metric", "Full Model (in-sample)", "Lag2 Only (2009–2010 test)")) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)| Metric | Full Model (in-sample) | Lag2 Only (2009–2010 test) |
|---|---|---|
| Overall Accuracy | 0.5611 | 0.6250 |
| Sensitivity (Up correctly identified) | 0.9207 | 0.9180 |
| Specificity (Down correctly identified) | 0.1116 | 0.2093 |
Comment: The Lag2-only model trained on 1990–2008
achieves an out-of-sample accuracy of 62.5% on the held-out 2009–2010
data — better than the full model’s in-sample accuracy. Sensitivity
remains high (91.8%), and while specificity (20.93%) is still relatively
low, the simpler model generalizes better. This is consistent with the
principle of parsimony: including noisy,
non-significant predictors (Lag1, Lag3–5, Volume) in the full model
introduces overfitting, and Lag2 alone contains the most
useful signal for predicting weekly market direction — though that
signal is admittedly modest.