library(ISLR2)
library(ggplot2)
library(patchwork)

data(Auto)
head(Auto)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500
dim(Auto)          # 392 obs, 9 variables
## [1] 392   9
summary(Auto)
##       mpg          cylinders      displacement     horsepower        weight    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
##                                                                                
##   acceleration        year           origin                      name    
##  Min.   : 8.00   Min.   :70.00   Min.   :1.000   amc matador       :  5  
##  1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5  
##  Median :15.50   Median :76.00   Median :1.000   toyota corolla    :  5  
##  Mean   :15.54   Mean   :75.98   Mean   :1.577   amc gremlin       :  4  
##  3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4  
##  Max.   :24.80   Max.   :82.00   Max.   :3.000   chevrolet chevette:  4  
##                                                  (Other)           :365
# Quick correlation: mpg vs horsepower
cor(Auto$mpg, Auto$horsepower)
## [1] -0.7784268
lm_fit <- lm(mpg ~ horsepower, data = Auto)
summary(lm_fit)
## 
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5710  -3.2592  -0.3435   2.7630  16.9240 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.935861   0.717499   55.66   <2e-16 ***
## horsepower  -0.157845   0.006446  -24.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6049 
## F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16
# Key output:
#   Coefficients, R-squared, RSE, F-statistic

# Predicted value at horsepower = 98
predict(lm_fit, data.frame(horsepower = 98))
##        1 
## 24.46708
# 95% confidence interval
predict(lm_fit, data.frame(horsepower = 98), interval = "confidence")
##        fit      lwr      upr
## 1 24.46708 23.97308 24.96108
# 95% prediction interval
predict(lm_fit, data.frame(horsepower = 98), interval = "prediction")
##        fit     lwr      upr
## 1 24.46708 14.8094 34.12476
p_scatter <- ggplot(Auto, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.4, colour = "#3266ad") +
  geom_smooth(method = "lm", se = TRUE,
              colour = "#c0392b", fill = "#f5b7b1") +
  labs(title = "Linear Fit: mpg ~ horsepower",
       x = "Horsepower", y = "MPG") +
  theme_minimal()
# Extract fitted values and residuals
Auto$fitted_lm  <- fitted(lm_fit)
Auto$resid_lm   <- residuals(lm_fit)

p_resid_lm <- ggplot(Auto, aes(x = fitted_lm, y = resid_lm)) +
  geom_point(alpha = 0.4, colour = "#3266ad") +
  geom_hline(yintercept = 0, linetype = "dashed",
             colour = "#c0392b", linewidth = 0.8) +
  geom_smooth(method = "loess", se = FALSE,
              colour = "#e67e22", linewidth = 1) +
  labs(title = "Residuals vs. Fitted — Linear",
       x = "Fitted Values", y = "Residuals") +
  theme_minimal()
# >>> Expect a clear "U" shape → nonlinearity signal <<<
lm_quad <- lm(mpg ~ horsepower + I(horsepower^2), data = Auto)
summary(lm_quad)
## 
## Call:
## lm(formula = mpg ~ horsepower + I(horsepower^2), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7135  -2.5943  -0.0859   2.2868  15.8961 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     56.9000997  1.8004268   31.60   <2e-16 ***
## horsepower      -0.4661896  0.0311246  -14.98   <2e-16 ***
## I(horsepower^2)  0.0012305  0.0001221   10.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared:  0.6876, Adjusted R-squared:  0.686 
## F-statistic:   428 on 2 and 389 DF,  p-value: < 2.2e-16
# R² improves from ~0.606 to ~0.688

Auto$fitted_q <- fitted(lm_quad)
Auto$resid_q  <- residuals(lm_quad)

p_resid_quad <- ggplot(Auto, aes(x = fitted_q, y = resid_q)) +
  geom_point(alpha = 0.4, colour = "#3266ad") +
  geom_hline(yintercept = 0, linetype = "dashed",
             colour = "#c0392b", linewidth = 0.8) +
  geom_smooth(method = "loess", se = FALSE,
              colour = "#27ae60", linewidth = 1) +
  labs(title = "Residuals vs. Fitted — Quadratic",
       x = "Fitted Values", y = "Residuals") +
  theme_minimal()
# >>> Residuals now scatter randomly → nonlinearity resolved <<<
(p_scatter | p_resid_lm) /
  (p_resid_quad + plot_spacer()) +
  plot_annotation(
    title = "Auto Dataset — Regression Diagnostics",
    subtitle = "Left: Scatter + linear fit  |  Top-right: Residuals (linear)  |  Bottom-right: Residuals (quadratic)"
  )
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# Plot all four standard diagnostics at once:
par(mfrow = c(2, 2))
plot(lm_fit)        # Linear model diagnostics

par(mfrow = c(1, 1))

par(mfrow = c(2, 2))
plot(lm_quad)       # Quadratic model diagnostics

par(mfrow = c(1, 1))
# Compare other predictors
lm_weight <- lm(mpg ~ weight, data = Auto)
lm_disp   <- lm(mpg ~ displacement, data = Auto)
lm_accel  <- lm(mpg ~ acceleration, data = Auto)

sapply(list(
  linear_hp   = lm_fit,
  quadratic_hp = lm_quad,
  weight       = lm_weight,
  displacement = lm_disp,
  acceleration = lm_accel
), function(m) round(summary(m)$r.squared, 3))
##    linear_hp quadratic_hp       weight displacement acceleration 
##        0.606        0.688        0.693        0.648        0.179