# Load and clean data
data(Auto)
Auto <- na.omit(Auto)
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
Quantitative: mpg,
cylinders, displacement,
horsepower, weight, acceleration,
year
Qualitative: origin, name
quant_vars <- Auto[, c("mpg","cylinders","displacement","horsepower",
"weight","acceleration","year")]
sapply(quant_vars, range)
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 9.0 3 68 46 1613 8.0 70
## [2,] 46.6 8 455 230 5140 24.8 82
sapply(quant_vars, mean)
## mpg cylinders displacement horsepower weight acceleration
## 23.445918 5.471939 194.411990 104.469388 2977.584184 15.541327
## year
## 75.979592
sapply(quant_vars, sd)
## mpg cylinders displacement horsepower weight acceleration
## 7.805007 1.705783 104.644004 38.491160 849.402560 2.758864
## year
## 3.683737
Auto_sub <- Auto[-(10:85), ]
quant_sub <- Auto_sub[, c("mpg","cylinders","displacement","horsepower",
"weight","acceleration","year")]
cat("--- Range ---\n")
## --- Range ---
sapply(quant_sub, range)
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 11.0 3 68 46 1649 8.5 70
## [2,] 46.6 8 455 230 4997 24.8 82
cat("--- Mean ---\n")
## --- Mean ---
sapply(quant_sub, mean)
## mpg cylinders displacement horsepower weight acceleration
## 24.404430 5.373418 187.240506 100.721519 2935.971519 15.726899
## year
## 77.145570
cat("--- SD ---\n")
## --- SD ---
sapply(quant_sub, sd)
## mpg cylinders displacement horsepower weight acceleration
## 7.867283 1.654179 99.678367 35.708853 811.300208 2.693721
## year
## 3.106217
pairs(quant_vars, main = "Scatterplot Matrix of Auto Quantitative Predictors")
Findings: Strong negative relationships exist
between mpg and displacement,
horsepower, and weight. Cylinders
and displacement are highly correlated with each other.
par(mfrow = c(2, 3))
for (var in c("cylinders","displacement","horsepower","weight","acceleration","year")) {
plot(Auto[[var]], Auto$mpg,
xlab = var, ylab = "mpg",
main = paste("mpg vs", var),
col = "steelblue", pch = 16, cex = 0.6)
abline(lm(Auto$mpg ~ Auto[[var]]), col = "red", lwd = 2)
}
par(mfrow = c(1, 1))
Conclusion: displacement,
horsepower, weight, and cylinders
show strong negative correlations with mpg and are likely
useful predictors. year shows a positive trend (newer cars
are more fuel-efficient). acceleration has a weaker
relationship.
pairs(Auto[, -9], main = "Scatterplot Matrix — Auto Dataset")
cor(Auto[, -9]) # Exclude 'name' (qualitative)
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
lm_fit <- lm(mpg ~ . - name, data = Auto)
summary(lm_fit)
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
i. Relationship between predictors and
response:
Yes — the overall F-statistic is highly significant (p < 0.001),
indicating a relationship exists.
ii. Statistically significant predictors:
displacement, weight, year, and
origin have significant p-values (< 0.05).
iii. Coefficient for year:
The positive coefficient (~0.75) suggests that, holding other variables
constant, newer model year cars have higher mpg — reflecting
improvements in fuel efficiency over time.
par(mfrow = c(2, 2))
plot(lm_fit)
par(mfrow = c(1, 1))
Comments:
- The Residuals vs Fitted plot shows a slight non-linear pattern,
suggesting the linear model may not fully capture the
relationship.
- Observation 14 appears as a high-leverage point in
the leverage plot.
- A few potential outliers are visible in the Q-Q plot tails.
lm_interact <- lm(mpg ~ (. - name)^2, data = Auto)
summary(lm_interact)
##
## Call:
## lm(formula = mpg ~ (. - name)^2, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6303 -1.4481 0.0596 1.2739 11.1386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.548e+01 5.314e+01 0.668 0.50475
## cylinders 6.989e+00 8.248e+00 0.847 0.39738
## displacement -4.785e-01 1.894e-01 -2.527 0.01192 *
## horsepower 5.034e-01 3.470e-01 1.451 0.14769
## weight 4.133e-03 1.759e-02 0.235 0.81442
## acceleration -5.859e+00 2.174e+00 -2.696 0.00735 **
## year 6.974e-01 6.097e-01 1.144 0.25340
## origin -2.090e+01 7.097e+00 -2.944 0.00345 **
## cylinders:displacement -3.383e-03 6.455e-03 -0.524 0.60051
## cylinders:horsepower 1.161e-02 2.420e-02 0.480 0.63157
## cylinders:weight 3.575e-04 8.955e-04 0.399 0.69000
## cylinders:acceleration 2.779e-01 1.664e-01 1.670 0.09584 .
## cylinders:year -1.741e-01 9.714e-02 -1.793 0.07389 .
## cylinders:origin 4.022e-01 4.926e-01 0.816 0.41482
## displacement:horsepower -8.491e-05 2.885e-04 -0.294 0.76867
## displacement:weight 2.472e-05 1.470e-05 1.682 0.09342 .
## displacement:acceleration -3.479e-03 3.342e-03 -1.041 0.29853
## displacement:year 5.934e-03 2.391e-03 2.482 0.01352 *
## displacement:origin 2.398e-02 1.947e-02 1.232 0.21875
## horsepower:weight -1.968e-05 2.924e-05 -0.673 0.50124
## horsepower:acceleration -7.213e-03 3.719e-03 -1.939 0.05325 .
## horsepower:year -5.838e-03 3.938e-03 -1.482 0.13916
## horsepower:origin 2.233e-03 2.930e-02 0.076 0.93931
## weight:acceleration 2.346e-04 2.289e-04 1.025 0.30596
## weight:year -2.245e-04 2.127e-04 -1.056 0.29182
## weight:origin -5.789e-04 1.591e-03 -0.364 0.71623
## acceleration:year 5.562e-02 2.558e-02 2.174 0.03033 *
## acceleration:origin 4.583e-01 1.567e-01 2.926 0.00365 **
## year:origin 1.393e-01 7.399e-02 1.882 0.06062 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.695 on 363 degrees of freedom
## Multiple R-squared: 0.8893, Adjusted R-squared: 0.8808
## F-statistic: 104.2 on 28 and 363 DF, p-value: < 2.2e-16
Comments: Several interaction terms are
statistically significant, for example displacement:year
and acceleration:year, suggesting that the effect of some
predictors on mpg depends on the value of other predictors.
par(mfrow = c(1, 3))
plot(log(Auto$horsepower), Auto$mpg, main = "mpg vs log(horsepower)",
xlab = "log(horsepower)", ylab = "mpg", col = "steelblue", pch = 16, cex = 0.7)
plot(sqrt(Auto$horsepower), Auto$mpg, main = "mpg vs sqrt(horsepower)",
xlab = "sqrt(horsepower)", ylab = "mpg", col = "darkorange", pch = 16, cex = 0.7)
plot(Auto$horsepower^2, Auto$mpg, main = "mpg vs horsepower^2",
xlab = "horsepower^2", ylab = "mpg", col = "darkgreen", pch = 16, cex = 0.7)
par(mfrow = c(1, 1))
# Fit models with transformations
lm_log <- lm(mpg ~ log(horsepower), data = Auto)
lm_sqrt <- lm(mpg ~ sqrt(horsepower), data = Auto)
lm_sq <- lm(mpg ~ I(horsepower^2), data = Auto)
cat("R-squared — log(hp):", summary(lm_log)$r.squared, "\n")
## R-squared — log(hp): 0.6683348
cat("R-squared — sqrt(hp):", summary(lm_sqrt)$r.squared, "\n")
## R-squared — sqrt(hp): 0.6437036
cat("R-squared — hp^2:", summary(lm_sq)$r.squared, "\n")
## R-squared — hp^2: 0.507367
Comments: The log transformation of
horsepower yields the best linear fit with the highest R²,
linearizing the relationship with mpg more effectively than
the raw or squared values.
data(Boston)
predictors <- setdiff(names(Boston), "crim")
slr_results <- lapply(predictors, function(var) {
fit <- lm(as.formula(paste("crim ~", var)), data = Boston)
s <- summary(fit)
data.frame(
predictor = var,
coef = coef(fit)[2],
p_value = s$coefficients[2, 4],
r_squared = s$r.squared
)
})
slr_table <- do.call(rbind, slr_results)
slr_table[order(slr_table$p_value), ]
## predictor coef p_value r_squared
## rad rad 0.61791093 2.693844e-56 0.391256687
## tax tax 0.02974225 2.357127e-47 0.339614243
## lstat lstat 0.54880478 2.654277e-27 0.207590933
## nox nox 31.24853120 3.751739e-23 0.177217182
## indus indus 0.50977633 1.450349e-21 0.165310070
## medv medv -0.36315992 1.173987e-19 0.150780469
## dis dis -1.55090168 8.519949e-19 0.144149375
## age age 0.10778623 2.854869e-16 0.124421452
## ptratio ptratio 1.15198279 2.942922e-11 0.084068439
## rm rm -2.68405122 6.346703e-07 0.048069117
## zn zn -0.07393498 5.506472e-06 0.040187908
## chas chas -1.89277655 2.094345e-01 0.003123869
# Plot a few significant ones
sig_vars <- c("lstat","medv","nox","dis","rad")
par(mfrow = c(2, 3))
for (var in sig_vars) {
plot(Boston[[var]], Boston$crim,
xlab = var, ylab = "crim",
main = paste("crim vs", var),
col = "steelblue", pch = 16, cex = 0.6)
abline(lm(as.formula(paste("crim ~", var)), data = Boston), col = "red", lwd = 2)
}
par(mfrow = c(1, 1))
Finding: Almost all predictors show a statistically significant simple linear association with crime rate.
mlr_fit <- lm(crim ~ ., data = Boston)
summary(mlr_fit)
##
## Call:
## lm(formula = crim ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.534 -2.248 -0.348 1.087 73.923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.7783938 7.0818258 1.946 0.052271 .
## zn 0.0457100 0.0187903 2.433 0.015344 *
## indus -0.0583501 0.0836351 -0.698 0.485709
## chas -0.8253776 1.1833963 -0.697 0.485841
## nox -9.9575865 5.2898242 -1.882 0.060370 .
## rm 0.6289107 0.6070924 1.036 0.300738
## age -0.0008483 0.0179482 -0.047 0.962323
## dis -1.0122467 0.2824676 -3.584 0.000373 ***
## rad 0.6124653 0.0875358 6.997 8.59e-12 ***
## tax -0.0037756 0.0051723 -0.730 0.465757
## ptratio -0.3040728 0.1863598 -1.632 0.103393
## lstat 0.1388006 0.0757213 1.833 0.067398 .
## medv -0.2200564 0.0598240 -3.678 0.000261 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.46 on 493 degrees of freedom
## Multiple R-squared: 0.4493, Adjusted R-squared: 0.4359
## F-statistic: 33.52 on 12 and 493 DF, p-value: < 2.2e-16
Predictors where we can reject H₀ (β_j = 0) at α =
0.05: zn, dis, rad,
black, medv (and possibly others depending on
the dataset version).
slr_coefs <- setNames(slr_table$coef, slr_table$predictor)
mlr_coefs <- coef(mlr_fit)[-1] # Remove intercept
# Align by name
common <- intersect(names(slr_coefs), names(mlr_coefs))
plot(slr_coefs[common], mlr_coefs[common],
xlab = "Simple Regression Coefficients",
ylab = "Multiple Regression Coefficients",
main = "Univariate vs Multiple Regression Coefficients",
pch = 19, col = "steelblue")
abline(0, 1, col = "red", lty = 2)
text(slr_coefs[common], mlr_coefs[common], labels = common, cex = 0.65, pos = 3)
Comment: Coefficients differ substantially — notably
for nox, where the simple regression coefficient is large
but shrinks in the multiple regression, suggesting confounding with
other variables.
poly_results <- lapply(predictors, function(var) {
# Skip variables with fewer than 4 unique values (e.g. binary 'chas')
if (length(unique(Boston[[var]])) < 4) {
return(data.frame(predictor = var, p_X2 = NA, p_X3 = NA,
note = "skipped (too few unique values)"))
}
fit <- lm(as.formula(paste("crim ~ poly(", var, ", 3)")), data = Boston)
s <- summary(fit)
pvals <- s$coefficients[, 4]
data.frame(
predictor = var,
p_X2 = ifelse(length(pvals) >= 3, pvals[3], NA),
p_X3 = ifelse(length(pvals) >= 4, pvals[4], NA),
note = "fitted"
)
})
poly_table <- do.call(rbind, poly_results)
poly_table
## predictor p_X2 p_X3 note
## 1 zn 4.420507e-03 2.295386e-01 fitted
## 2 indus 1.086057e-03 1.196405e-12 fitted
## 3 chas NA NA skipped (too few unique values)
## 4 nox 7.736755e-05 6.961110e-16 fitted
## 5 rm 1.508545e-03 5.085751e-01 fitted
## 6 age 2.291156e-06 6.679915e-03 fitted
## 7 dis 7.869767e-14 1.088832e-08 fitted
## 8 rad 9.120558e-03 4.823138e-01 fitted
## 9 tax 3.665348e-06 2.438507e-01 fitted
## 10 ptratio 2.405468e-03 6.300514e-03 fitted
## 11 lstat 3.780418e-02 1.298906e-01 fitted
## 12 medv 2.928577e-35 1.046510e-12 fitted
Comment: For several predictors (e.g.,
dis, nox, medv), the quadratic
and/or cubic terms are statistically significant, providing evidence of
non-linear relationships with crime rate.
data(Weekly)
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
pairs(Weekly[, 1:8], col = ifelse(Weekly$Direction == "Up", "steelblue", "tomato"),
main = "Weekly Data — Blue=Up, Red=Down")
# Volume over time
plot(Weekly$Year, Weekly$Volume,
type = "l", col = "steelblue",
xlab = "Year", ylab = "Volume",
main = "Trading Volume Over Time")
Patterns: Trading volume has increased substantially over the 21-year period. Lag variables show little obvious visual pattern with Direction.
log_fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly,
family = binomial)
summary(log_fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Significant predictors: Lag2 has a
statistically significant p-value (< 0.05). The other lag variables
and Volume are not significant.
log_probs <- predict(log_fit, type = "response")
log_pred <- ifelse(log_probs > 0.5, "Up", "Down")
conf_mat <- table(Predicted = log_pred, Actual = Weekly$Direction)
conf_mat
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
accuracy <- mean(log_pred == Weekly$Direction)
cat("Overall Fraction Correct:", round(accuracy, 4), "\n")
## Overall Fraction Correct: 0.5611
Interpretation: The model predicts “Up” far more often than “Down”. It correctly classifies most “Up” weeks but misses many “Down” weeks — a common issue when one class dominates.
train <- Weekly$Year <= 2008
test <- Weekly[!train, ]
log_fit2 <- glm(Direction ~ Lag2,
data = Weekly,
subset = train,
family = binomial)
log_probs2 <- predict(log_fit2, newdata = test, type = "response")
log_pred2 <- ifelse(log_probs2 > 0.5, "Up", "Down")
conf_mat2 <- table(Predicted = log_pred2, Actual = test$Direction)
conf_mat2
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
accuracy2 <- mean(log_pred2 == test$Direction)
cat("Test Fraction Correct (2009-2010):", round(accuracy2, 4), "\n")
## Test Fraction Correct (2009-2010): 0.625
Comment: Using only Lag2 on the
held-out test set (2009–2010) achieves around 62.5% accuracy,
outperforming the full model on training data and suggesting
Lag2 has modest predictive power for future market
direction. ```