# Install ISLR if needed: install.packages("ISLR")
library(ISLR)
library(ggplot2)
data(Auto)
# Quick look
head(Auto[, c("mpg", "horsepower", "weight", "displacement", "cylinders")])
## mpg horsepower weight displacement cylinders
## 1 18 130 3504 307 8
## 2 15 165 3693 350 8
## 3 18 150 3436 318 8
## 4 16 150 3433 304 8
## 5 17 140 3449 302 8
## 6 15 198 4341 429 8
Before modelling, visualise the raw relationship between
mpg and horsepower.
ggplot(Auto, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.5, colour = "#2c7bb6") +
geom_smooth(method = "lm", se = TRUE, colour = "red", linetype = "dashed", linewidth = 0.9) +
geom_smooth(method = "loess", se = FALSE, colour = "darkgreen", linewidth = 1.1) +
labs(
title = "MPG vs. Horsepower",
subtitle = "Red dashed = linear fit | Green = LOESS smooth",
x = "Horsepower",
y = "Miles Per Gallon (MPG)"
) +
theme_minimal(base_size = 13)
The LOESS curve (green) bends away from the linear fit (red), hinting at a nonlinear relationship.
\[\text{mpg} = \beta_0 + \beta_1 \cdot \text{horsepower} + \varepsilon\]
lm_fit <- lm(mpg ~ horsepower, data = Auto)
summary(lm_fit)
##
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
# Extract fitted values and residuals
resid_df_lm <- data.frame(
fitted = fitted(lm_fit),
residuals = residuals(lm_fit)
)
ggplot(resid_df_lm, aes(x = fitted, y = residuals)) +
geom_point(alpha = 0.5, colour = "#2c7bb6") +
geom_hline(yintercept = 0, colour = "red", linetype = "dashed", linewidth = 1) +
geom_smooth(method = "loess", se = FALSE, colour = "darkorange", linewidth = 1.1) +
labs(
title = "Residual vs. Fitted — Linear Model",
subtitle = "A systematic 'U' shape indicates a missed nonlinear trend",
x = "Fitted Values",
y = "Residuals"
) +
theme_minimal(base_size = 13)
Interpretation: The LOESS curve (orange) shows a clear “U” shape, confirming that the linear model fails to capture the curvature in the data. The residuals are not randomly scattered — they are negative for low and high fitted values, and positive in the middle.
To address the nonlinearity, we add a quadratic term:
\[\text{mpg} = \beta_0 + \beta_1 \cdot \text{horsepower} + \beta_2 \cdot \text{horsepower}^2 + \varepsilon\]
poly2_fit <- lm(mpg ~ poly(horsepower, 2), data = Auto)
summary(poly2_fit)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 2), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.7135 -2.5943 -0.0859 2.2868 15.8961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.4459 0.2209 106.13 <2e-16 ***
## poly(horsepower, 2)1 -120.1377 4.3739 -27.47 <2e-16 ***
## poly(horsepower, 2)2 44.0895 4.3739 10.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
## F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16
resid_df_poly2 <- data.frame(
fitted = fitted(poly2_fit),
residuals = residuals(poly2_fit)
)
ggplot(resid_df_poly2, aes(x = fitted, y = residuals)) +
geom_point(alpha = 0.5, colour = "#1a9641") +
geom_hline(yintercept = 0, colour = "red", linetype = "dashed", linewidth = 1) +
geom_smooth(method = "loess", se = FALSE, colour = "darkorange", linewidth = 1.1) +
labs(
title = "Residual vs. Fitted — Quadratic Model",
subtitle = "Residuals should now scatter randomly around zero",
x = "Fitted Values",
y = "Residuals"
) +
theme_minimal(base_size = 13)
Interpretation: The “U” shape largely disappears. Residuals scatter more randomly around zero, indicating the quadratic term has captured the nonlinear trend.
# R-squared and RMSE comparison
models <- list(Linear = lm_fit, Quadratic = poly2_fit)
comparison <- data.frame(
Model = names(models),
R2 = sapply(models, function(m) round(summary(m)$r.squared, 4)),
Adj_R2 = sapply(models, function(m) round(summary(m)$adj.r.squared, 4)),
RMSE = sapply(models, function(m) round(sqrt(mean(residuals(m)^2)), 4))
)
knitr::kable(comparison, caption = "Model Comparison: Linear vs. Quadratic")
| Model | R2 | Adj_R2 | RMSE | |
|---|---|---|---|---|
| Linear | Linear | 0.6059 | 0.6049 | 4.8932 |
| Quadratic | Quadratic | 0.6876 | 0.6860 | 4.3572 |
library(patchwork) # install.packages("patchwork") if needed
p1 <- ggplot(resid_df_lm, aes(x = fitted, y = residuals)) +
geom_point(alpha = 0.4, colour = "#2c7bb6") +
geom_hline(yintercept = 0, colour = "red", linetype = "dashed") +
geom_smooth(method = "loess", se = FALSE, colour = "darkorange", linewidth = 1) +
labs(title = "Linear Model", x = "Fitted", y = "Residuals") +
theme_minimal(base_size = 12)
p2 <- ggplot(resid_df_poly2, aes(x = fitted, y = residuals)) +
geom_point(alpha = 0.4, colour = "#1a9641") +
geom_hline(yintercept = 0, colour = "red", linetype = "dashed") +
geom_smooth(method = "loess", se = FALSE, colour = "darkorange", linewidth = 1) +
labs(title = "Quadratic Model", x = "Fitted", y = "Residuals") +
theme_minimal(base_size = 12)
p1 + p2 +
plot_annotation(
title = "Residual vs. Fitted: Linear vs. Quadratic",
subtitle = "Left: 'U' shape → nonlinearity detected | Right: random scatter → nonlinearity addressed"
)
| Diagnostic | Linear Model | Quadratic Model |
|---|---|---|
| Residual vs. Fitted shape | Clear “U” curve | Approximately random |
| Evidence of nonlinearity | Yes | Largely resolved |
| Adjusted R² | 0.605 | 0.686 |
The Residual vs. Fitted plot is a powerful, model-agnostic diagnostic:
*Analysis performed in R using the ISLR,
ggplot2, and patchwork packages.