We use the Auto dataset from the ISLR
package to demonstrate how to detect nonlinear relationships using
Residual vs. Fitted plots.
# Install ISLR if not already installed
if (!require(ISLR)) install.packages("ISLR")
library(ISLR)
# Inspect the data
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
dim(Auto)
## [1] 392 9
summary(Auto[, c("mpg", "horsepower", "weight", "displacement")])
## mpg horsepower weight displacement
## Min. : 9.00 Min. : 46.0 Min. :1613 Min. : 68.0
## 1st Qu.:17.00 1st Qu.: 75.0 1st Qu.:2225 1st Qu.:105.0
## Median :22.75 Median : 93.5 Median :2804 Median :151.0
## Mean :23.45 Mean :104.5 Mean :2978 Mean :194.4
## 3rd Qu.:29.00 3rd Qu.:126.0 3rd Qu.:3615 3rd Qu.:275.8
## Max. :46.60 Max. :230.0 Max. :5140 Max. :455.0
# Fit linear model
lm_fit <- lm(mpg ~ horsepower, data = Auto)
summary(lm_fit)
##
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
# Base R residual vs fitted plot
plot(lm_fit$fitted.values, lm_fit$residuals,
xlab = "Fitted Values",
ylab = "Residuals",
main = "Residual vs. Fitted: Linear Model (mpg ~ horsepower)",
pch = 16, col = "steelblue", cex = 0.7)
abline(h = 0, col = "red", lwd = 2, lty = 2)
lines(lowess(lm_fit$fitted.values, lm_fit$residuals), col = "darkorange", lwd = 2)
legend("topright", legend = c("Zero line", "Lowess smoother"),
col = c("red", "darkorange"), lty = c(2, 1), lwd = 2)
Interpretation: A clear “U”-shaped (curved) pattern in the residuals indicates the linear model has failed to capture a nonlinear trend between
mpgandhorsepower.
# Fit quadratic (polynomial degree 2) model
poly_fit <- lm(mpg ~ poly(horsepower, 2), data = Auto)
summary(poly_fit)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 2), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.7135 -2.5943 -0.0859 2.2868 15.8961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.4459 0.2209 106.13 <2e-16 ***
## poly(horsepower, 2)1 -120.1377 4.3739 -27.47 <2e-16 ***
## poly(horsepower, 2)2 44.0895 4.3739 10.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
## F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16
plot(poly_fit$fitted.values, poly_fit$residuals,
xlab = "Fitted Values",
ylab = "Residuals",
main = "Residual vs. Fitted: Polynomial Model (mpg ~ horsepower²)",
pch = 16, col = "forestgreen", cex = 0.7)
abline(h = 0, col = "red", lwd = 2, lty = 2)
lines(lowess(poly_fit$fitted.values, poly_fit$residuals), col = "darkorange", lwd = 2)
legend("topright", legend = c("Zero line", "Lowess smoother"),
col = c("red", "darkorange"), lty = c(2, 1), lwd = 2)
Interpretation: After adding the quadratic term, residuals are more randomly scattered around zero — the nonlinear pattern has been captured by the model.
par(mfrow = c(1, 2))
# Linear
plot(lm_fit$fitted.values, lm_fit$residuals,
xlab = "Fitted Values", ylab = "Residuals",
main = "Linear Model\n(Shows U-shaped pattern)",
pch = 16, col = "steelblue", cex = 0.7)
abline(h = 0, col = "red", lwd = 2, lty = 2)
lines(lowess(lm_fit$fitted.values, lm_fit$residuals), col = "darkorange", lwd = 2)
# Polynomial
plot(poly_fit$fitted.values, poly_fit$residuals,
xlab = "Fitted Values", ylab = "Residuals",
main = "Polynomial Model\n(Residuals randomly scattered)",
pch = 16, col = "forestgreen", cex = 0.7)
abline(h = 0, col = "red", lwd = 2, lty = 2)
lines(lowess(poly_fit$fitted.values, poly_fit$residuals), col = "darkorange", lwd = 2)
par(mfrow = c(1, 1))
if (!require(ggplot2)) install.packages("ggplot2")
if (!require(patchwork)) install.packages("patchwork")
library(ggplot2)
library(patchwork)
# Build data frames for each model
df_lm <- data.frame(fitted = lm_fit$fitted.values, residuals = lm_fit$residuals, model = "Linear")
df_poly <- data.frame(fitted = poly_fit$fitted.values, residuals = poly_fit$residuals, model = "Polynomial")
p1 <- ggplot(df_lm, aes(x = fitted, y = residuals)) +
geom_point(color = "steelblue", alpha = 0.6, size = 1.5) +
geom_hline(yintercept = 0, color = "red", linetype = "dashed", linewidth = 1) +
geom_smooth(method = "loess", se = FALSE, color = "darkorange", linewidth = 1) +
labs(title = "Linear Model: mpg ~ horsepower",
subtitle = "U-shaped pattern → nonlinearity not captured",
x = "Fitted Values", y = "Residuals") +
theme_bw(base_size = 13)
p2 <- ggplot(df_poly, aes(x = fitted, y = residuals)) +
geom_point(color = "forestgreen", alpha = 0.6, size = 1.5) +
geom_hline(yintercept = 0, color = "red", linetype = "dashed", linewidth = 1) +
geom_smooth(method = "loess", se = FALSE, color = "darkorange", linewidth = 1) +
labs(title = "Polynomial Model: mpg ~ horsepower²",
subtitle = "Random scatter → nonlinearity captured",
x = "Fitted Values", y = "Residuals") +
theme_bw(base_size = 13)
p1 + p2
# Compare R-squared and RSE
lm_summary <- summary(lm_fit)
poly_summary <- summary(poly_fit)
comparison <- data.frame(
Model = c("Linear (degree 1)", "Polynomial (degree 2)"),
R_squared = round(c(lm_summary$r.squared, poly_summary$r.squared), 4),
Adj_R_squared = round(c(lm_summary$adj.r.squared, poly_summary$adj.r.squared), 4),
RSE = round(c(lm_summary$sigma, poly_summary$sigma), 4)
)
knitr::kable(comparison, caption = "Model Comparison: Linear vs Polynomial Regression")
| Model | R_squared | Adj_R_squared | RSE |
|---|---|---|---|
| Linear (degree 1) | 0.6059 | 0.6049 | 4.9058 |
| Polynomial (degree 2) | 0.6876 | 0.6860 | 4.3739 |
| Condition | Residual vs. Fitted Appearance |
|---|---|
| Linear relationship | Residuals randomly scattered around 0 — no pattern |
| Nonlinear relationship | “U”-shape, inverted “U”, or systematic curvature |
mpg and horsepower.