Multiple Linear Regression Week 3

Set up

data("mtcars")
str(mtcars)

## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

summary(mtcars)

##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000

head(mtcars)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

Let’s review: Simple Linear Regression

mpg ~ disp

m_disp <- lm(mpg ~ disp, data = mtcars)
summary(m_disp)

## 
## Call:
## lm(formula = mpg ~ disp, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8922 -2.2022 -0.9631  1.6272  7.2305 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.599855   1.229720  24.070  < 2e-16 ***
## disp        -0.041215   0.004712  -8.747 9.38e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.251 on 30 degrees of freedom
## Multiple R-squared:  0.7183, Adjusted R-squared:  0.709 
## F-statistic: 76.51 on 1 and 30 DF,  p-value: 9.38e-10

coef(m_disp)

## (Intercept)        disp 
## 29.59985476 -0.04121512

Visualization

library(ggplot2)

# Fit the model
m_disp <- lm(mpg ~ disp, data = mtcars)

# Scatterplot with regression line
ggplot(mtcars, aes(x = disp, y = mpg)) +
  geom_point(color = "blue", size = 3) +
  geom_smooth(method = "lm", se = TRUE, color = "darkred") +
  labs(
    title = "Simple Linear Regression: mpg ~ disp",
    x = "Displacement (cu.in.)",
    y = "Miles per Gallon (mpg)"
  )

2) Multiple Linear Regression (all predictors)

m_full <- lm(mpg ~ hp + wt + disp, data = mtcars)
summary(m_full)

## 
## Call:
## lm(formula = mpg ~ hp + wt + disp, data = mtcars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.891 -1.640 -0.172  1.061  5.861 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 37.105505   2.110815  17.579  < 2e-16 ***
## hp          -0.031157   0.011436  -2.724  0.01097 *  
## wt          -3.800891   1.066191  -3.565  0.00133 ** 
## disp        -0.000937   0.010350  -0.091  0.92851    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.639 on 28 degrees of freedom
## Multiple R-squared:  0.8268, Adjusted R-squared:  0.8083 
## F-statistic: 44.57 on 3 and 28 DF,  p-value: 8.65e-11

coef(m_full)

##   (Intercept)            hp            wt          disp 
## 37.1055052690 -0.0311565508 -3.8008905826 -0.0009370091

Interpret β_j: Average effect on mpg of a one-unit increase in X_j, holding the other predictors fixed.
Compare coefficients/p-values here vs. simple regressions above.

3) Visual Relationship

To begin with,

ggplot(mtcars, aes(x = hp, y = wt, color = mpg, size = disp)) +
  geom_point(alpha = 0.9) +
  scale_size_continuous(range = c(2, 6)) +
  labs(
    title = "Multivariate view: hp vs wt ",
    x = "Horsepower (hp)",
    y = "Weight (1000 lbs)",
    color = "mpg",
    size  = "disp"
  ) +
  theme_minimal(base_size = 13)

Or in another way,

mtcars$mpg_fitted <- fitted(m_full)

ggplot(mtcars, aes(x = mpg_fitted, y = mpg)) +
  geom_point(color = "blue", size = 3) +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  labs(
    title = "Fitted vs Actual mpg",
    x = "Fitted mpg (from model)",
    y = "Observed mpg"
  )

- Note:When you fit a regression model in R with lm(), the function fitted() (or its synonym fitted.values()) extracts the predicted values of the response variable from that model.

-Note: You’ve created a new column called mpg_fitted inside the mtcars dataset. Each row of this column stores the fitted value (predicted mpg) for that car from the model. The function fitted(m_full) returns the predicted mpg for each row in mtcars, using the coefficients from your model and each car’s hp, wt, and disp.

4) Correlation Check

vars <- mtcars[, c("mpg", "hp", "wt", "disp")]
round(cor(vars), 3)

##         mpg     hp     wt   disp
## mpg   1.000 -0.776 -0.868 -0.848
## hp   -0.776  1.000  0.659  0.791
## wt   -0.868  0.659  1.000  0.888
## disp -0.848  0.791  0.888  1.000

Note: If hp and disp correlate with wt, a simple regression like mpg ~ disp may partly capture weight’s effect.
Note: Predictors are not independent, which imply multicollinearity
hp and disp: 0.79 (high).
wt and disp: 0.89 (very high).
hp and wt: 0.66 (moderate-high).

5) Global Relationship Test (Overall F-test)

Question: Is at least one predictor useful?
Hypotheses:
- H0: β_hp = β_wt = β_disp = 0
- Ha: at least one β_j ≠ 0

The overall F-statistic and p-value appear in summary(m_full)

summary(m_full)$fstatistic

##    value    numdf    dendf 
## 44.56552  3.00000 28.00000

summary(m_full)$coefficients

##                  Estimate Std. Error     t value     Pr(>|t|)
## (Intercept) 37.1055052690 2.11081525 17.57875558 1.161936e-16
## hp          -0.0311565508 0.01143579 -2.72447633 1.097103e-02
## wt          -3.8008905826 1.06619064 -3.56492586 1.330991e-03
## disp        -0.0009370091 0.01034974 -0.09053451 9.285070e-01

pf(summary(m_full)$fstatistic[1],
   summary(m_full)$fstatistic[2],
   summary(m_full)$fstatistic[3],
   lower.tail = FALSE)

##        value 
## 8.649588e-11

Interpretation: Very small p-value → reject H0 → at least one predictor relates to mpg.

6) Partial Tests & Subset F-tests

Individual t-tests in summary(m_full) evaluate a single predictor given others.
Subset F-test (drop one or more variables):

m_hp_wt <- lm(mpg ~ hp + wt, data = mtcars)   # drop disp
anova(m_hp_wt, m_full)

## Analysis of Variance Table
## 
## Model 1: mpg ~ hp + wt
## Model 2: mpg ~ hp + wt + disp
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     29 195.05                           
## 2     28 194.99  1   0.05708 0.0082 0.9285

Point: Tests whether the dropped set (here disp) has coefficients equal to zero jointly, conditional on variables kept.

7) Model Fit: RSE and R²

Individual t-tests in summary(m_full) evaluate a single predictor given others.
Subset F-test (drop one or more variables):

summ_full <- summary(m_full)
summ_hp_wt <- summary(m_hp_wt)

c(RSE_full = summ_full$sigma, R2_full = summ_full$r.squared,
  RSE_hp_wt = summ_hp_wt$sigma, R2_hp_wt = summ_hp_wt$r.squared)

##  RSE_full   R2_full RSE_hp_wt  R2_hp_wt 
## 2.6389302 0.8268361 2.5934118 0.8267855

8) Predictions: Point, Confidence Interval, Prediction Interval

new_car <- data.frame(hp = 150, wt = 3.0, disp = 200)

# Point prediction from the model without disp (often competitive here)
predict(m_hp_wt, newdata = new_car, interval = "none")

##        1 
## 20.82784

# Confidence interval for the **mean** mpg at these predictors
predict(m_hp_wt, newdata = new_car, interval = "confidence", level = 0.95)

##        fit      lwr      upr
## 1 20.82784 19.83556 21.82012

# Prediction interval for **an individual** car's mpg
predict(m_hp_wt, newdata = new_car, interval = "prediction", level = 0.95)

##        fit      lwr      upr
## 1 20.82784 15.43169 26.22398

9) Visual Diagnostics

par(mfrow = c(2, 2))
plot(m_full)  # residuals vs fitted, QQ-plot, scale-location, Cook's distance/leverage

par(mfrow = c(1, 1))