library(ISLR2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
data(Auto)
Auto <- na.omit(Auto)
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
Quantitative: mpg, cylinders, displacement, horsepower, weight, acceleration, year
Qualitative: name, origin
quant_vars <- c("mpg","cylinders","displacement",
"horsepower","weight","acceleration","year")
ranges <- sapply(Auto[, quant_vars], range)
rownames(ranges) <- c("Min", "Max")
print(ranges)
## mpg cylinders displacement horsepower weight acceleration year
## Min 9.0 3 68 46 1613 8.0 70
## Max 46.6 8 455 230 5140 24.8 82
means <- sapply(Auto[, quant_vars], mean)
sds <- sapply(Auto[, quant_vars], sd)
summary_stats <- rbind(Mean = round(means, 2), SD = round(sds, 2))
print(summary_stats)
## mpg cylinders displacement horsepower weight acceleration year
## Mean 23.45 5.47 194.41 104.47 2977.58 15.54 75.98
## SD 7.81 1.71 104.64 38.49 849.40 2.76 3.68
Auto_sub <- Auto[-(10:85), ]
ranges2 <- sapply(Auto_sub[, quant_vars], range)
rownames(ranges2) <- c("Min", "Max")
means2 <- sapply(Auto_sub[, quant_vars], mean)
sds2 <- sapply(Auto_sub[, quant_vars], sd)
cat("=== Range (subset) ===\n")
## === Range (subset) ===
print(ranges2)
## mpg cylinders displacement horsepower weight acceleration year
## Min 11.0 3 68 46 1649 8.5 70
## Max 46.6 8 455 230 4997 24.8 82
cat("\n=== Mean (subset) ===\n")
##
## === Mean (subset) ===
print(round(means2, 2))
## mpg cylinders displacement horsepower weight acceleration
## 24.40 5.37 187.24 100.72 2935.97 15.73
## year
## 77.15
cat("\n=== SD (subset) ===\n")
##
## === SD (subset) ===
print(round(sds2, 2))
## mpg cylinders displacement horsepower weight acceleration
## 7.87 1.65 99.68 35.71 811.30 2.69
## year
## 3.11
pairs(Auto[, quant_vars],
main = "Scatterplot Matrix - Auto Dataset",
pch = 19,
col = "steelblue",
cex = 0.5)
Findings: There are strong negative relationships
between mpg and displacement,
horsepower, and weight. Heavier and more
powerful cars tend to have lower fuel efficiency. year
shows a positive relationship with mpg, suggesting cars
became more fuel-efficient over time.
par(mfrow = c(2, 3))
plot(Auto$displacement, Auto$mpg,
xlab = "Displacement", ylab = "MPG",
pch = 19, col = "steelblue", cex = 0.6)
abline(lm(mpg ~ displacement, data = Auto), col = "red", lwd = 2)
plot(Auto$horsepower, Auto$mpg,
xlab = "Horsepower", ylab = "MPG",
pch = 19, col = "steelblue", cex = 0.6)
abline(lm(mpg ~ horsepower, data = Auto), col = "red", lwd = 2)
plot(Auto$weight, Auto$mpg,
xlab = "Weight", ylab = "MPG",
pch = 19, col = "steelblue", cex = 0.6)
abline(lm(mpg ~ weight, data = Auto), col = "red", lwd = 2)
plot(Auto$acceleration, Auto$mpg,
xlab = "Acceleration", ylab = "MPG",
pch = 19, col = "steelblue", cex = 0.6)
abline(lm(mpg ~ acceleration, data = Auto), col = "red", lwd = 2)
plot(Auto$year, Auto$mpg,
xlab = "Year", ylab = "MPG",
pch = 19, col = "steelblue", cex = 0.6)
abline(lm(mpg ~ year, data = Auto), col = "red", lwd = 2)
plot(Auto$cylinders, Auto$mpg,
xlab = "Cylinders", ylab = "MPG",
pch = 19, col = "steelblue", cex = 0.6)
abline(lm(mpg ~ cylinders, data = Auto), col = "red", lwd = 2)
par(mfrow = c(1,1))
Conclusion: displacement,
horsepower, weight, and cylinders
are the strongest predictors of mpg. year is
also useful — newer cars are more fuel-efficient.
pairs(Auto[, quant_vars],
main = "Scatterplot Matrix - Auto Dataset",
pch = 19, col = "steelblue", cex = 0.5)
cor_matrix <- cor(Auto[, quant_vars])
round(cor_matrix, 3)
## mpg cylinders displacement horsepower weight acceleration
## mpg 1.000 -0.778 -0.805 -0.778 -0.832 0.423
## cylinders -0.778 1.000 0.951 0.843 0.898 -0.505
## displacement -0.805 0.951 1.000 0.897 0.933 -0.544
## horsepower -0.778 0.843 0.897 1.000 0.865 -0.689
## weight -0.832 0.898 0.933 0.865 1.000 -0.417
## acceleration 0.423 -0.505 -0.544 -0.689 -0.417 1.000
## year 0.581 -0.346 -0.370 -0.416 -0.309 0.290
## year
## mpg 0.581
## cylinders -0.346
## displacement -0.370
## horsepower -0.416
## weight -0.309
## acceleration 0.290
## year 1.000
lm_auto <- lm(mpg ~ . - name, data = Auto)
summary(lm_auto)
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
i. Yes — the F-statistic is very large and p-value < 0.001, confirming a significant overall relationship between predictors and response.
ii. displacement, weight,
year, and origin are statistically significant
(p < 0.05).
iii. The positive coefficient for year
(~0.75) suggests each additional year is associated with ~0.75 more mpg
— cars became more fuel-efficient over time.
par(mfrow = c(2, 2))
plot(lm_auto)
par(mfrow = c(1, 1))
Comments:
lm_interact <- lm(mpg ~ displacement * weight + year * origin, data = Auto)
summary(lm_interact)
##
## Call:
## lm(formula = mpg ~ displacement * weight + year * origin, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5758 -1.6211 -0.0537 1.3264 13.3266
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.793e+01 8.044e+00 2.229 0.026394 *
## displacement -7.519e-02 9.091e-03 -8.271 2.19e-15 ***
## weight -1.035e-02 6.450e-04 -16.053 < 2e-16 ***
## year 4.864e-01 1.017e-01 4.782 2.47e-06 ***
## origin -1.503e+01 4.232e+00 -3.551 0.000432 ***
## displacement:weight 2.098e-05 2.179e-06 9.625 < 2e-16 ***
## year:origin 1.980e-01 5.436e-02 3.642 0.000308 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.969 on 385 degrees of freedom
## Multiple R-squared: 0.8575, Adjusted R-squared: 0.8553
## F-statistic: 386.2 on 6 and 385 DF, p-value: < 2.2e-16
Finding: The interaction between
displacement and weight is statistically
significant, suggesting their combined effect on mpg is
non-additive.
par(mfrow = c(1, 3))
plot(log(Auto$horsepower), Auto$mpg,
xlab = "log(horsepower)", ylab = "MPG",
pch = 19, col = "steelblue", cex = 0.6,
main = "log(X)")
plot(sqrt(Auto$horsepower), Auto$mpg,
xlab = "sqrt(horsepower)", ylab = "MPG",
pch = 19, col = "darkgreen", cex = 0.6,
main = "sqrt(X)")
plot(Auto$horsepower^2, Auto$mpg,
xlab = "horsepower^2", ylab = "MPG",
pch = 19, col = "tomato", cex = 0.6,
main = "X^2")
par(mfrow = c(1,1))
lm_log <- lm(mpg ~ log(horsepower), data = Auto)
lm_sqrt <- lm(mpg ~ sqrt(horsepower), data = Auto)
lm_sq <- lm(mpg ~ I(horsepower^2), data = Auto)
cat("R² log(hp) :", round(summary(lm_log)$r.squared, 4), "\n")
## R² log(hp) : 0.6683
cat("R² sqrt(hp) :", round(summary(lm_sqrt)$r.squared, 4), "\n")
## R² sqrt(hp) : 0.6437
cat("R² hp² :", round(summary(lm_sq)$r.squared, 4), "\n")
## R² hp² : 0.5074
Finding: log(horsepower) gives the best
linear fit (highest R²), confirming the non-linear relationship between
horsepower and mpg.
data(Boston)
predictors <- setdiff(names(Boston), "crim")
results <- data.frame(
Predictor = predictors,
Coefficient = NA,
P_value = NA
)
for (i in seq_along(predictors)) {
fit <- lm(Boston$crim ~ Boston[[ predictors[i] ]])
s <- summary(fit)
results$Coefficient[i] <- round(coef(fit)[2], 4)
results$P_value[i] <- round(s$coefficients[2, 4], 6)
}
results$Significant <- ifelse(results$P_value < 0.05, "Yes", "No")
print(results)
## Predictor Coefficient P_value Significant
## 1 zn -0.0739 0.000006 Yes
## 2 indus 0.5098 0.000000 Yes
## 3 chas -1.8928 0.209435 No
## 4 nox 31.2485 0.000000 Yes
## 5 rm -2.6841 0.000001 Yes
## 6 age 0.1078 0.000000 Yes
## 7 dis -1.5509 0.000000 Yes
## 8 rad 0.6179 0.000000 Yes
## 9 tax 0.0297 0.000000 Yes
## 10 ptratio 1.1520 0.000000 Yes
## 11 black -0.0363 0.000000 Yes
## 12 lstat 0.5488 0.000000 Yes
## 13 medv -0.3632 0.000000 Yes
par(mfrow = c(4, 4))
for (p in predictors) {
plot(Boston[[p]], Boston$crim,
xlab = p, ylab = "crim",
pch = 19, col = "steelblue", cex = 0.4)
abline(lm(Boston$crim ~ Boston[[p]]), col = "red", lwd = 1.5)
}
par(mfrow = c(1,1))
Finding: Almost all predictors have a statistically
significant association with crim. rad and
tax show the strongest positive associations.
lm_boston <- lm(crim ~ ., data = Boston)
summary(lm_boston)
##
## Call:
## lm(formula = crim ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.924 -2.120 -0.353 1.019 75.051
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.033228 7.234903 2.354 0.018949 *
## zn 0.044855 0.018734 2.394 0.017025 *
## indus -0.063855 0.083407 -0.766 0.444294
## chas -0.749134 1.180147 -0.635 0.525867
## nox -10.313535 5.275536 -1.955 0.051152 .
## rm 0.430131 0.612830 0.702 0.483089
## age 0.001452 0.017925 0.081 0.935488
## dis -0.987176 0.281817 -3.503 0.000502 ***
## rad 0.588209 0.088049 6.680 6.46e-11 ***
## tax -0.003780 0.005156 -0.733 0.463793
## ptratio -0.271081 0.186450 -1.454 0.146611
## black -0.007538 0.003673 -2.052 0.040702 *
## lstat 0.126211 0.075725 1.667 0.096208 .
## medv -0.198887 0.060516 -3.287 0.001087 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.439 on 492 degrees of freedom
## Multiple R-squared: 0.454, Adjusted R-squared: 0.4396
## F-statistic: 31.47 on 13 and 492 DF, p-value: < 2.2e-16
Finding: Only zn, dis,
rad, black, and medv remain
statistically significant. Many predictors that were significant
individually lose significance when controlling for others.
multi_coefs <- coef(lm_boston)[-1]
uni_coefs <- results$Coefficient
names(uni_coefs) <- predictors
common <- intersect(names(multi_coefs), names(uni_coefs))
plot(uni_coefs[common], multi_coefs[common],
xlab = "Simple Regression Coefficients",
ylab = "Multiple Regression Coefficients",
pch = 19, col = "steelblue",
main = "Univariate vs Multiple Regression Coefficients")
abline(0, 1, col = "red", lty = 2)
text(uni_coefs[common], multi_coefs[common],
labels = common, cex = 0.6, pos = 3)
Finding: Coefficients differ substantially between models, confirming multicollinearity among predictors.
# Exclude binary variable 'chas' — cannot fit poly degree 3 with only 2 values
poly_predictors <- setdiff(predictors, "chas")
poly_results <- data.frame(
Predictor = poly_predictors,
Beta2_pval = NA,
Beta3_pval = NA
)
for (i in seq_along(poly_predictors)) {
p <- poly_predictors[i]
fit <- lm(crim ~ poly(Boston[[p]], 3), data = Boston)
s <- summary(fit)$coefficients
poly_results$Beta2_pval[i] <- round(s[3, 4], 4)
poly_results$Beta3_pval[i] <- round(s[4, 4], 4)
}
print(poly_results)
## Predictor Beta2_pval Beta3_pval
## 1 zn 0.0044 0.2295
## 2 indus 0.0011 0.0000
## 3 nox 0.0001 0.0000
## 4 rm 0.0015 0.5086
## 5 age 0.0000 0.0067
## 6 dis 0.0000 0.0000
## 7 rad 0.0091 0.4823
## 8 tax 0.0000 0.2439
## 9 ptratio 0.0024 0.0063
## 10 black 0.4566 0.5436
## 11 lstat 0.0378 0.1299
## 12 medv 0.0000 0.0000
Finding: Several predictors (e.g., dis,
nox, medv) show significant quadratic or cubic
terms, confirming non-linear relationships with crime rate.
data(Weekly)
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
pairs(Weekly[, -9],
pch = 19,
col = ifelse(Weekly$Direction == "Up", "steelblue", "tomato"),
cex = 0.4,
main = "Weekly Data - Blue=Up, Red=Down")
Pattern: Volume has increased steadily
over time. The lag variables show very little correlation with
Direction.
glm_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly,
family = binomial)
summary(glm_full)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Finding: Only Lag2 is statistically
significant (p < 0.05).
probs_full <- predict(glm_full, type = "response")
preds_full <- ifelse(probs_full > 0.5, "Up", "Down")
conf_matrix <- table(Predicted = preds_full, Actual = Weekly$Direction)
print(conf_matrix)
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
accuracy <- mean(preds_full == Weekly$Direction)
cat("\nOverall accuracy:", round(accuracy * 100, 2), "%\n")
##
## Overall accuracy: 56.11 %
Interpretation: The model predicts “Up” most of the time, capturing the overall upward market bias but performing poorly at predicting “Down” weeks.
train <- Weekly$Year <= 2008
test <- Weekly[!train, ]
glm_lag2 <- glm(Direction ~ Lag2,
data = Weekly,
subset = train,
family = binomial)
probs_test <- predict(glm_lag2, newdata = test, type = "response")
preds_test <- ifelse(probs_test > 0.5, "Up", "Down")
conf_test <- table(Predicted = preds_test, Actual = test$Direction)
print(conf_test)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
accuracy_test <- mean(preds_test == test$Direction)
cat("\nTest accuracy (2009-2010):", round(accuracy_test * 100, 2), "%\n")
##
## Test accuracy (2009-2010): 62.5 %
Finding: Using only Lag2 on the
held-out test set gives better accuracy than the full model, confirming
that a simpler model generalises better here.