library(ISLR)
library(ggplot2)
library(patchwork) # side-by-side plots
library(dplyr)
library(knitr)
library(kableExtra)This report examines the relationship between mpg
(miles per gallon) and horsepower in the
Auto dataset.
We fit two models — a simple linear and a
polynomial (degree 2) — and use a suite of diagnostic
plots to determine whether the relationship is linear or nonlinear.
kable(head(Auto), caption = "First 6 rows of the Auto dataset") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | name |
|---|---|---|---|---|---|---|---|---|
| 18 | 8 | 307 | 130 | 3504 | 12.0 | 70 | 1 | chevrolet chevelle malibu |
| 15 | 8 | 350 | 165 | 3693 | 11.5 | 70 | 1 | buick skylark 320 |
| 18 | 8 | 318 | 150 | 3436 | 11.0 | 70 | 1 | plymouth satellite |
| 16 | 8 | 304 | 150 | 3433 | 12.0 | 70 | 1 | amc rebel sst |
| 17 | 8 | 302 | 140 | 3449 | 10.5 | 70 | 1 | ford torino |
| 15 | 8 | 429 | 198 | 4341 | 10.0 | 70 | 1 | ford galaxie 500 |
## mpg horsepower weight displacement cylinders
## Min. : 9.00 Min. : 46.0 Min. :1613 Min. : 68.0 Min. :3.000
## 1st Qu.:17.00 1st Qu.: 75.0 1st Qu.:2225 1st Qu.:105.0 1st Qu.:4.000
## Median :22.75 Median : 93.5 Median :2804 Median :151.0 Median :4.000
## Mean :23.45 Mean :104.5 Mean :2978 Mean :194.4 Mean :5.472
## 3rd Qu.:29.00 3rd Qu.:126.0 3rd Qu.:3615 3rd Qu.:275.8 3rd Qu.:8.000
## Max. :46.60 Max. :230.0 Max. :5140 Max. :455.0 Max. :8.000
ggplot(Auto, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.45, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "red",
linetype = "dashed", linewidth = 1) +
geom_smooth(method = "loess", se = TRUE, color = "darkgreen",
linewidth = 1) +
labs(title = "MPG vs. Horsepower",
subtitle = "Red dashed = linear fit | Green = LOESS smoother",
x = "Horsepower", y = "MPG") +
theme_minimal(base_size = 13)Note: The LOESS smoother curves noticeably relative to the straight linear fit — an early visual hint of nonlinearity.
##
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 2), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.7135 -2.5943 -0.0859 2.2868 15.8961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.4459 0.2209 106.13 <2e-16 ***
## poly(horsepower, 2)1 -120.1377 4.3739 -27.47 <2e-16 ***
## poly(horsepower, 2)2 44.0895 4.3739 10.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
## F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16
# Build a combined diagnostic data frame for both models
diag_linear <- data.frame(
fitted = fitted(model_linear),
residuals = residuals(model_linear),
stdres = rstandard(model_linear),
leverage = hatvalues(model_linear),
model = "Linear"
)
diag_poly <- data.frame(
fitted = fitted(model_poly),
residuals = residuals(model_poly),
stdres = rstandard(model_poly),
leverage = hatvalues(model_poly),
model = "Polynomial (deg 2)"
)ggplot(diag_linear, aes(x = fitted, y = residuals)) +
geom_point(alpha = 0.5, color = "steelblue") +
geom_hline(yintercept = 0, color = "red", linetype = "dashed", linewidth = 1) +
geom_smooth(method = "loess", se = TRUE, color = "darkred", linewidth = 1) +
labs(title = "Residual vs. Fitted — Linear Model",
subtitle = "A U-shaped pattern signals nonlinearity",
x = "Fitted Values", y = "Residuals") +
theme_minimal(base_size = 13)ggplot(diag_poly, aes(x = fitted, y = residuals)) +
geom_point(alpha = 0.5, color = "darkorange") +
geom_hline(yintercept = 0, color = "red", linetype = "dashed", linewidth = 1) +
geom_smooth(method = "loess", se = TRUE, color = "darkred", linewidth = 1) +
labs(title = "Residual vs. Fitted — Polynomial Model",
subtitle = "Residuals are more randomly scattered around zero",
x = "Fitted Values", y = "Residuals") +
theme_minimal(base_size = 13)all_diag <- bind_rows(diag_linear, diag_poly)
ggplot(all_diag, aes(x = fitted, y = residuals, color = model)) +
geom_point(alpha = 0.4) +
geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
geom_smooth(method = "loess", se = TRUE, linewidth = 1, color = "darkred") +
facet_wrap(~model, scales = "free_x") +
scale_color_manual(values = c("steelblue", "darkorange")) +
labs(title = "Residual vs. Fitted — Side-by-Side Comparison",
x = "Fitted Values", y = "Residuals") +
theme_minimal(base_size = 13) +
theme(legend.position = "none")all_diag$sqrt_abs_stdres <- sqrt(abs(all_diag$stdres))
ggplot(all_diag, aes(x = fitted, y = sqrt_abs_stdres, color = model)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "loess", se = TRUE, color = "darkred", linewidth = 1) +
facet_wrap(~model, scales = "free_x") +
scale_color_manual(values = c("steelblue", "darkorange")) +
labs(title = "Scale-Location Plot — Side-by-Side",
subtitle = "Checks homoscedasticity (equal spread of residuals)",
x = "Fitted Values",
y = expression(sqrt("|Standardized Residuals|"))) +
theme_minimal(base_size = 13) +
theme(legend.position = "none")all_diag$actual <- c(Auto$mpg, Auto$mpg)
ggplot(all_diag, aes(x = actual, y = fitted, color = model)) +
geom_point(alpha = 0.4) +
geom_abline(slope = 1, intercept = 0, color = "red",
linetype = "dashed", linewidth = 1) +
facet_wrap(~model) +
scale_color_manual(values = c("steelblue", "darkorange")) +
labs(title = "Fitted vs. Actual Values — Side-by-Side",
subtitle = "Points closer to the red diagonal = better fit",
x = "Actual MPG", y = "Fitted MPG") +
theme_minimal(base_size = 13) +
theme(legend.position = "none")hp_seq <- data.frame(horsepower = seq(min(Auto$horsepower),
max(Auto$horsepower), length.out = 300))
hp_seq$pred_linear <- predict(model_linear, newdata = hp_seq)
hp_seq$pred_poly <- predict(model_poly, newdata = hp_seq)
ggplot(Auto, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.35, color = "gray40") +
geom_line(data = hp_seq, aes(y = pred_linear, color = "Linear"),
linewidth = 1.2) +
geom_line(data = hp_seq, aes(y = pred_poly, color = "Polynomial (deg 2)"),
linewidth = 1.2) +
scale_color_manual(name = "Model",
values = c("Linear" = "steelblue",
"Polynomial (deg 2)" = "darkorange")) +
labs(title = "Regression Curves: Linear vs. Polynomial",
subtitle = "Polynomial curve follows the data more closely",
x = "Horsepower", y = "MPG") +
theme_minimal(base_size = 13)r2_lin <- summary(model_linear)$r.squared
r2_poly <- summary(model_poly)$r.squared
rmse <- function(m) sqrt(mean(residuals(m)^2))
comparison <- data.frame(
Model = c("Linear", "Polynomial (deg 2)"),
R_squared = round(c(r2_lin, r2_poly), 4),
Adj_R2 = round(c(summary(model_linear)$adj.r.squared,
summary(model_poly)$adj.r.squared), 4),
AIC = round(c(AIC(model_linear), AIC(model_poly)), 2),
RMSE = round(c(rmse(model_linear), rmse(model_poly)), 4),
Residual_Pattern = c("U-shaped (nonlinear)", "Random scatter (better)")
)
kable(comparison, caption = "Model Comparison Summary") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) |>
column_spec(2:5, bold = TRUE) |>
row_spec(2, background = "#d4edda")| Model | R_squared | Adj_R2 | AIC | RMSE | Residual_Pattern |
|---|---|---|---|---|---|
| Linear | 0.6059 | 0.6049 | 2363.32 | 4.8932 | U-shaped (nonlinear) |
| Polynomial (deg 2) | 0.6876 | 0.6860 | 2274.35 | 4.3572 | Random scatter (better) |