install.packages("ISLR2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ISLR2)
library(ggplot2)
data(Auto)
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
ggplot(Auto, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.5, colour = "steelblue") +
geom_smooth(method = "lm", colour = "red", se = FALSE, linetype = "dashed") +
geom_smooth(method = "loess", colour = "darkgreen", se = FALSE) +
labs(
title = "MPG vs. Horsepower",
subtitle = "Red = linear fit | Green = LOESS fit",
x = "Horsepower", y = "Miles Per Gallon"
) +
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

lm_fit <- lm(mpg ~ horsepower, data = Auto)
summary(lm_fit)
##
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
Auto$fitted_lm <- fitted(lm_fit)
Auto$resid_lm <- residuals(lm_fit)
ggplot(Auto, aes(x = fitted_lm, y = resid_lm)) +
geom_point(alpha = 0.4, colour = "steelblue") +
geom_hline(yintercept = 0, colour = "red", linewidth = 0.8) +
geom_smooth(method = "loess", colour = "orange", se = FALSE) +
labs(
title = "Residual vs. Fitted — Linear Model",
subtitle = "U-shape = nonlinearity not captured",
x = "Fitted Values", y = "Residuals"
) +
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

poly_fit <- lm(mpg ~ poly(horsepower, 2), data = Auto)
summary(poly_fit)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 2), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.7135 -2.5943 -0.0859 2.2868 15.8961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.4459 0.2209 106.13 <2e-16 ***
## poly(horsepower, 2)1 -120.1377 4.3739 -27.47 <2e-16 ***
## poly(horsepower, 2)2 44.0895 4.3739 10.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
## F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16
Auto$fitted_poly <- fitted(poly_fit)
Auto$resid_poly <- residuals(poly_fit)
ggplot(Auto, aes(x = fitted_poly, y = resid_poly)) +
geom_point(alpha = 0.4, colour = "steelblue") +
geom_hline(yintercept = 0, colour = "red", linewidth = 0.8) +
geom_smooth(method = "loess", colour = "orange", se = FALSE) +
labs(
title = "Residual vs. Fitted — Polynomial (degree 2) Model",
subtitle = "Random scatter = nonlinearity captured",
x = "Fitted Values", y = "Residuals"
) +
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

par(mfrow = c(1, 2))
plot(fitted(lm_fit), residuals(lm_fit),
main = "Linear Model",
xlab = "Fitted Values", ylab = "Residuals",
pch = 20, col = "steelblue")
abline(h = 0, col = "red", lwd = 2)
lines(lowess(fitted(lm_fit), residuals(lm_fit)), col = "orange", lwd = 2)
plot(fitted(poly_fit), residuals(poly_fit),
main = "Polynomial (degree 2) Model",
xlab = "Fitted Values", ylab = "Residuals",
pch = 20, col = "steelblue")
abline(h = 0, col = "red", lwd = 2)
lines(lowess(fitted(poly_fit), residuals(poly_fit)), col = "orange", lwd = 2)
