library(ISLR)
library(ggplot2)
library(patchwork)   # side-by-side plots
library(dplyr)
library(knitr)
library(kableExtra)

1. Overview

This report examines the relationship between mpg (miles per gallon) and horsepower in the Auto dataset.
We fit two models — a simple linear and a polynomial (degree 2) — and use a suite of diagnostic plots to determine whether the relationship is linear or nonlinear.


2. Data Exploration

data(Auto)

2.1 First Rows

kable(head(Auto), caption = "First 6 rows of the Auto dataset") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
First 6 rows of the Auto dataset
mpg cylinders displacement horsepower weight acceleration year origin name
18 8 307 130 3504 12.0 70 1 chevrolet chevelle malibu
15 8 350 165 3693 11.5 70 1 buick skylark 320
18 8 318 150 3436 11.0 70 1 plymouth satellite
16 8 304 150 3433 12.0 70 1 amc rebel sst
17 8 302 140 3449 10.5 70 1 ford torino
15 8 429 198 4341 10.0 70 1 ford galaxie 500

2.2 Summary Statistics

Auto |>
  select(mpg, horsepower, weight, displacement, cylinders) |>
  summary()
##       mpg          horsepower        weight      displacement     cylinders    
##  Min.   : 9.00   Min.   : 46.0   Min.   :1613   Min.   : 68.0   Min.   :3.000  
##  1st Qu.:17.00   1st Qu.: 75.0   1st Qu.:2225   1st Qu.:105.0   1st Qu.:4.000  
##  Median :22.75   Median : 93.5   Median :2804   Median :151.0   Median :4.000  
##  Mean   :23.45   Mean   :104.5   Mean   :2978   Mean   :194.4   Mean   :5.472  
##  3rd Qu.:29.00   3rd Qu.:126.0   3rd Qu.:3615   3rd Qu.:275.8   3rd Qu.:8.000  
##  Max.   :46.60   Max.   :230.0   Max.   :5140   Max.   :455.0   Max.   :8.000

2.3 Scatter Plot: mpg vs. horsepower

ggplot(Auto, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.45, color = "steelblue") +
  geom_smooth(method = "lm",    se = TRUE, color = "red",
              linetype = "dashed", linewidth = 1) +
  geom_smooth(method = "loess", se = TRUE, color = "darkgreen",
              linewidth = 1) +
  labs(title    = "MPG vs. Horsepower",
       subtitle = "Red dashed = linear fit  |  Green = LOESS smoother",
       x = "Horsepower", y = "MPG") +
  theme_minimal(base_size = 13)

Note: The LOESS smoother curves noticeably relative to the straight linear fit — an early visual hint of nonlinearity.


3. Model Fitting

3.1 Linear Model

model_linear <- lm(mpg ~ horsepower, data = Auto)
summary(model_linear)
## 
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5710  -3.2592  -0.3435   2.7630  16.9240 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.935861   0.717499   55.66   <2e-16 ***
## horsepower  -0.157845   0.006446  -24.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6049 
## F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16

3.2 Polynomial Model (Degree 2)

model_poly <- lm(mpg ~ poly(horsepower, 2), data = Auto)
summary(model_poly)
## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 2), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7135  -2.5943  -0.0859   2.2868  15.8961 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            23.4459     0.2209  106.13   <2e-16 ***
## poly(horsepower, 2)1 -120.1377     4.3739  -27.47   <2e-16 ***
## poly(horsepower, 2)2   44.0895     4.3739   10.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared:  0.6876, Adjusted R-squared:  0.686 
## F-statistic:   428 on 2 and 389 DF,  p-value: < 2.2e-16

4. Individual Diagnostic Plots

# Build a combined diagnostic data frame for both models
diag_linear <- data.frame(
  fitted    = fitted(model_linear),
  residuals = residuals(model_linear),
  stdres    = rstandard(model_linear),
  leverage  = hatvalues(model_linear),
  model     = "Linear"
)

diag_poly <- data.frame(
  fitted    = fitted(model_poly),
  residuals = residuals(model_poly),
  stdres    = rstandard(model_poly),
  leverage  = hatvalues(model_poly),
  model     = "Polynomial (deg 2)"
)

4.1 Residual vs. Fitted — Linear Model

ggplot(diag_linear, aes(x = fitted, y = residuals)) +
  geom_point(alpha = 0.5, color = "steelblue") +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed", linewidth = 1) +
  geom_smooth(method = "loess", se = TRUE, color = "darkred", linewidth = 1) +
  labs(title    = "Residual vs. Fitted — Linear Model",
       subtitle = "A U-shaped pattern signals nonlinearity",
       x = "Fitted Values", y = "Residuals") +
  theme_minimal(base_size = 13)

4.2 Residual vs. Fitted — Polynomial Model

ggplot(diag_poly, aes(x = fitted, y = residuals)) +
  geom_point(alpha = 0.5, color = "darkorange") +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed", linewidth = 1) +
  geom_smooth(method = "loess", se = TRUE, color = "darkred", linewidth = 1) +
  labs(title    = "Residual vs. Fitted — Polynomial Model",
       subtitle = "Residuals are more randomly scattered around zero",
       x = "Fitted Values", y = "Residuals") +
  theme_minimal(base_size = 13)


5. Side-by-Side Diagnostic Summary

5.1 Residual vs. Fitted (Side-by-Side)

all_diag <- bind_rows(diag_linear, diag_poly)

ggplot(all_diag, aes(x = fitted, y = residuals, color = model)) +
  geom_point(alpha = 0.4) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  geom_smooth(method = "loess", se = TRUE, linewidth = 1, color = "darkred") +
  facet_wrap(~model, scales = "free_x") +
  scale_color_manual(values = c("steelblue", "darkorange")) +
  labs(title = "Residual vs. Fitted — Side-by-Side Comparison",
       x = "Fitted Values", y = "Residuals") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

5.2 Scale-Location (Spread of Residuals)

all_diag$sqrt_abs_stdres <- sqrt(abs(all_diag$stdres))

ggplot(all_diag, aes(x = fitted, y = sqrt_abs_stdres, color = model)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "loess", se = TRUE, color = "darkred", linewidth = 1) +
  facet_wrap(~model, scales = "free_x") +
  scale_color_manual(values = c("steelblue", "darkorange")) +
  labs(title    = "Scale-Location Plot — Side-by-Side",
       subtitle = "Checks homoscedasticity (equal spread of residuals)",
       x = "Fitted Values",
       y = expression(sqrt("|Standardized Residuals|"))) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

5.3 Fitted vs. Actual (Predicted vs. Observed)

all_diag$actual <- c(Auto$mpg, Auto$mpg)

ggplot(all_diag, aes(x = actual, y = fitted, color = model)) +
  geom_point(alpha = 0.4) +
  geom_abline(slope = 1, intercept = 0, color = "red",
              linetype = "dashed", linewidth = 1) +
  facet_wrap(~model) +
  scale_color_manual(values = c("steelblue", "darkorange")) +
  labs(title    = "Fitted vs. Actual Values — Side-by-Side",
       subtitle = "Points closer to the red diagonal = better fit",
       x = "Actual MPG", y = "Fitted MPG") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

5.7 Regression Curves Overlaid on Data

hp_seq <- data.frame(horsepower = seq(min(Auto$horsepower),
                                      max(Auto$horsepower), length.out = 300))

hp_seq$pred_linear <- predict(model_linear, newdata = hp_seq)
hp_seq$pred_poly   <- predict(model_poly,   newdata = hp_seq)

ggplot(Auto, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.35, color = "gray40") +
  geom_line(data = hp_seq, aes(y = pred_linear, color = "Linear"),
            linewidth = 1.2) +
  geom_line(data = hp_seq, aes(y = pred_poly, color = "Polynomial (deg 2)"),
            linewidth = 1.2) +
  scale_color_manual(name   = "Model",
                     values = c("Linear" = "steelblue",
                                "Polynomial (deg 2)" = "darkorange")) +
  labs(title    = "Regression Curves: Linear vs. Polynomial",
       subtitle = "Polynomial curve follows the data more closely",
       x = "Horsepower", y = "MPG") +
  theme_minimal(base_size = 13)


6. Model Comparison Table

r2_lin  <- summary(model_linear)$r.squared
r2_poly <- summary(model_poly)$r.squared

rmse <- function(m) sqrt(mean(residuals(m)^2))

comparison <- data.frame(
  Model            = c("Linear", "Polynomial (deg 2)"),
  R_squared        = round(c(r2_lin, r2_poly), 4),
  Adj_R2           = round(c(summary(model_linear)$adj.r.squared,
                              summary(model_poly)$adj.r.squared), 4),
  AIC              = round(c(AIC(model_linear), AIC(model_poly)), 2),
  RMSE             = round(c(rmse(model_linear), rmse(model_poly)), 4),
  Residual_Pattern = c("U-shaped (nonlinear)", "Random scatter (better)")
)

kable(comparison, caption = "Model Comparison Summary") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) |>
  column_spec(2:5, bold = TRUE) |>
  row_spec(2, background = "#d4edda")
Model Comparison Summary
Model R_squared Adj_R2 AIC RMSE Residual_Pattern
Linear 0.6059 0.6049 2363.32 4.8932 U-shaped (nonlinear)
Polynomial (deg 2) 0.6876 0.6860 2274.35 4.3572 Random scatter (better)