library(ISLR2)
library(ggplot2)
library(patchwork)
data(Auto)
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
dim(Auto) # 392 obs, 9 variables
## [1] 392 9
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
# Quick correlation: mpg vs horsepower
cor(Auto$mpg, Auto$horsepower)
## [1] -0.7784268
lm_fit <- lm(mpg ~ horsepower, data = Auto)
summary(lm_fit)
##
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
# Key output:
# Coefficients, R-squared, RSE, F-statistic
# Predicted value at horsepower = 98
predict(lm_fit, data.frame(horsepower = 98))
## 1
## 24.46708
# 95% confidence interval
predict(lm_fit, data.frame(horsepower = 98), interval = "confidence")
## fit lwr upr
## 1 24.46708 23.97308 24.96108
# 95% prediction interval
predict(lm_fit, data.frame(horsepower = 98), interval = "prediction")
## fit lwr upr
## 1 24.46708 14.8094 34.12476
p_scatter <- ggplot(Auto, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.4, colour = "#3266ad") +
geom_smooth(method = "lm", se = TRUE,
colour = "#c0392b", fill = "#f5b7b1") +
labs(title = "Linear Fit: mpg ~ horsepower",
x = "Horsepower", y = "MPG") +
theme_minimal()
# Extract fitted values and residuals
Auto$fitted_lm <- fitted(lm_fit)
Auto$resid_lm <- residuals(lm_fit)
p_resid_lm <- ggplot(Auto, aes(x = fitted_lm, y = resid_lm)) +
geom_point(alpha = 0.4, colour = "#3266ad") +
geom_hline(yintercept = 0, linetype = "dashed",
colour = "#c0392b", linewidth = 0.8) +
geom_smooth(method = "loess", se = FALSE,
colour = "#e67e22", linewidth = 1) +
labs(title = "Residuals vs. Fitted — Linear",
x = "Fitted Values", y = "Residuals") +
theme_minimal()
# >>> Expect a clear "U" shape → nonlinearity signal <<<
lm_quad <- lm(mpg ~ horsepower + I(horsepower^2), data = Auto)
summary(lm_quad)
##
## Call:
## lm(formula = mpg ~ horsepower + I(horsepower^2), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.7135 -2.5943 -0.0859 2.2868 15.8961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.9000997 1.8004268 31.60 <2e-16 ***
## horsepower -0.4661896 0.0311246 -14.98 <2e-16 ***
## I(horsepower^2) 0.0012305 0.0001221 10.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
## F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16
# R² improves from ~0.606 to ~0.688
Auto$fitted_q <- fitted(lm_quad)
Auto$resid_q <- residuals(lm_quad)
p_resid_quad <- ggplot(Auto, aes(x = fitted_q, y = resid_q)) +
geom_point(alpha = 0.4, colour = "#3266ad") +
geom_hline(yintercept = 0, linetype = "dashed",
colour = "#c0392b", linewidth = 0.8) +
geom_smooth(method = "loess", se = FALSE,
colour = "#27ae60", linewidth = 1) +
labs(title = "Residuals vs. Fitted — Quadratic",
x = "Fitted Values", y = "Residuals") +
theme_minimal()
# >>> Residuals now scatter randomly → nonlinearity resolved <<<
(p_scatter | p_resid_lm) /
(p_resid_quad + plot_spacer()) +
plot_annotation(
title = "Auto Dataset — Regression Diagnostics",
subtitle = "Left: Scatter + linear fit | Top-right: Residuals (linear) | Bottom-right: Residuals (quadratic)"
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# Plot all four standard diagnostics at once:
par(mfrow = c(2, 2))
plot(lm_fit) # Linear model diagnostics

par(mfrow = c(1, 1))
par(mfrow = c(2, 2))
plot(lm_quad) # Quadratic model diagnostics

par(mfrow = c(1, 1))
# Compare other predictors
lm_weight <- lm(mpg ~ weight, data = Auto)
lm_disp <- lm(mpg ~ displacement, data = Auto)
lm_accel <- lm(mpg ~ acceleration, data = Auto)
sapply(list(
linear_hp = lm_fit,
quadratic_hp = lm_quad,
weight = lm_weight,
displacement = lm_disp,
acceleration = lm_accel
), function(m) round(summary(m)$r.squared, 3))
## linear_hp quadratic_hp weight displacement acceleration
## 0.606 0.688 0.693 0.648 0.179