install.packages("ISLR2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ISLR2)
library(ggplot2)
data(Auto)
head(Auto)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500
ggplot(Auto, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.5, colour = "steelblue") +
  geom_smooth(method = "lm",    colour = "red",       se = FALSE, linetype = "dashed") +
  geom_smooth(method = "loess", colour = "darkgreen", se = FALSE) +
  labs(
    title    = "MPG vs. Horsepower",
    subtitle = "Red = linear fit  |  Green = LOESS fit",
    x = "Horsepower", y = "Miles Per Gallon"
  ) +
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

lm_fit <- lm(mpg ~ horsepower, data = Auto)
summary(lm_fit)
## 
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5710  -3.2592  -0.3435   2.7630  16.9240 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.935861   0.717499   55.66   <2e-16 ***
## horsepower  -0.157845   0.006446  -24.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6049 
## F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16
Auto$fitted_lm <- fitted(lm_fit)
Auto$resid_lm  <- residuals(lm_fit)

ggplot(Auto, aes(x = fitted_lm, y = resid_lm)) +
  geom_point(alpha = 0.4, colour = "steelblue") +
  geom_hline(yintercept = 0, colour = "red", linewidth = 0.8) +
  geom_smooth(method = "loess", colour = "orange", se = FALSE) +
  labs(
    title    = "Residual vs. Fitted — Linear Model",
    subtitle = "U-shape = nonlinearity not captured",
    x = "Fitted Values", y = "Residuals"
  ) +
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

poly_fit <- lm(mpg ~ poly(horsepower, 2), data = Auto)
summary(poly_fit)
## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 2), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7135  -2.5943  -0.0859   2.2868  15.8961 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            23.4459     0.2209  106.13   <2e-16 ***
## poly(horsepower, 2)1 -120.1377     4.3739  -27.47   <2e-16 ***
## poly(horsepower, 2)2   44.0895     4.3739   10.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared:  0.6876, Adjusted R-squared:  0.686 
## F-statistic:   428 on 2 and 389 DF,  p-value: < 2.2e-16
Auto$fitted_poly <- fitted(poly_fit)
Auto$resid_poly  <- residuals(poly_fit)

ggplot(Auto, aes(x = fitted_poly, y = resid_poly)) +
  geom_point(alpha = 0.4, colour = "steelblue") +
  geom_hline(yintercept = 0, colour = "red", linewidth = 0.8) +
  geom_smooth(method = "loess", colour = "orange", se = FALSE) +
  labs(
    title    = "Residual vs. Fitted — Polynomial (degree 2) Model",
    subtitle = "Random scatter = nonlinearity captured",
    x = "Fitted Values", y = "Residuals"
  ) +
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

par(mfrow = c(1, 2))

plot(fitted(lm_fit), residuals(lm_fit),
     main = "Linear Model",
     xlab = "Fitted Values", ylab = "Residuals",
     pch = 20, col = "steelblue")
abline(h = 0, col = "red", lwd = 2)
lines(lowess(fitted(lm_fit), residuals(lm_fit)), col = "orange", lwd = 2)

plot(fitted(poly_fit), residuals(poly_fit),
     main = "Polynomial (degree 2) Model",
     xlab = "Fitted Values", ylab = "Residuals",
     pch = 20, col = "steelblue")
abline(h = 0, col = "red", lwd = 2)
lines(lowess(fitted(poly_fit), residuals(poly_fit)), col = "orange", lwd = 2)