data("mtcars")
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
summary(mtcars)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
mpg ~ disp
m_disp <- lm(mpg ~ disp, data = mtcars)
summary(m_disp)
##
## Call:
## lm(formula = mpg ~ disp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8922 -2.2022 -0.9631 1.6272 7.2305
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.599855 1.229720 24.070 < 2e-16 ***
## disp -0.041215 0.004712 -8.747 9.38e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.251 on 30 degrees of freedom
## Multiple R-squared: 0.7183, Adjusted R-squared: 0.709
## F-statistic: 76.51 on 1 and 30 DF, p-value: 9.38e-10
coef(m_disp)
## (Intercept) disp
## 29.59985476 -0.04121512
Visualization
library(ggplot2)
# Fit the model
m_disp <- lm(mpg ~ disp, data = mtcars)
# Scatterplot with regression line
ggplot(mtcars, aes(x = disp, y = mpg)) +
geom_point(color = "blue", size = 3) +
geom_smooth(method = "lm", se = TRUE, color = "darkred") +
labs(
title = "Simple Linear Regression: mpg ~ disp",
x = "Displacement (cu.in.)",
y = "Miles per Gallon (mpg)"
)
m_full <- lm(mpg ~ hp + wt + disp, data = mtcars)
summary(m_full)
##
## Call:
## lm(formula = mpg ~ hp + wt + disp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.891 -1.640 -0.172 1.061 5.861
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.105505 2.110815 17.579 < 2e-16 ***
## hp -0.031157 0.011436 -2.724 0.01097 *
## wt -3.800891 1.066191 -3.565 0.00133 **
## disp -0.000937 0.010350 -0.091 0.92851
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.639 on 28 degrees of freedom
## Multiple R-squared: 0.8268, Adjusted R-squared: 0.8083
## F-statistic: 44.57 on 3 and 28 DF, p-value: 8.65e-11
coef(m_full)
## (Intercept) hp wt disp
## 37.1055052690 -0.0311565508 -3.8008905826 -0.0009370091
mpg
of a one-unit increase in X_j, holding the other
predictors fixed.To begin with,
ggplot(mtcars, aes(x = hp, y = wt, color = mpg, size = disp)) +
geom_point(alpha = 0.9) +
scale_size_continuous(range = c(2, 6)) +
labs(
title = "Multivariate view: hp vs wt ",
x = "Horsepower (hp)",
y = "Weight (1000 lbs)",
color = "mpg",
size = "disp"
) +
theme_minimal(base_size = 13)
Or in another way,
mtcars$mpg_fitted <- fitted(m_full)
ggplot(mtcars, aes(x = mpg_fitted, y = mpg)) +
geom_point(color = "blue", size = 3) +
geom_abline(slope = 1, intercept = 0, color = "red") +
labs(
title = "Fitted vs Actual mpg",
x = "Fitted mpg (from model)",
y = "Observed mpg"
)
- Note:When you fit a regression model in R with lm(),
the function fitted() (or its synonym fitted.values()) extracts the
predicted values of the response variable from that model.
-Note: You’ve created a new column called mpg_fitted inside the mtcars dataset. Each row of this column stores the fitted value (predicted mpg) for that car from the model. The function fitted(m_full) returns the predicted mpg for each row in mtcars, using the coefficients from your model and each car’s hp, wt, and disp.
vars <- mtcars[, c("mpg", "hp", "wt", "disp")]
round(cor(vars), 3)
## mpg hp wt disp
## mpg 1.000 -0.776 -0.868 -0.848
## hp -0.776 1.000 0.659 0.791
## wt -0.868 0.659 1.000 0.888
## disp -0.848 0.791 0.888 1.000
Note: If hp and disp
correlate with wt, a simple regression like
mpg ~ disp may partly capture weight’s effect.
Note: Predictors are not independent, which imply multicollinearity
hp and disp: 0.79 (high).
wt and disp: 0.89 (very high).
hp and wt: 0.66 (moderate-high).
Question: Is at least one predictor
useful?
Hypotheses:
- H0: β_hp = β_wt = β_disp = 0
- Ha: at least one β_j ≠0
The overall F-statistic and p-value appear in summary(m_full)
summary(m_full)$fstatistic
## value numdf dendf
## 44.56552 3.00000 28.00000
summary(m_full)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.1055052690 2.11081525 17.57875558 1.161936e-16
## hp -0.0311565508 0.01143579 -2.72447633 1.097103e-02
## wt -3.8008905826 1.06619064 -3.56492586 1.330991e-03
## disp -0.0009370091 0.01034974 -0.09053451 9.285070e-01
pf(summary(m_full)$fstatistic[1],
summary(m_full)$fstatistic[2],
summary(m_full)$fstatistic[3],
lower.tail = FALSE)
## value
## 8.649588e-11
Interpretation: Very small p-value → reject H0 → at least one predictor relates to
mpg.
summary(m_full)
evaluate a single predictor given others.m_hp_wt <- lm(mpg ~ hp + wt, data = mtcars) # drop disp
anova(m_hp_wt, m_full)
## Analysis of Variance Table
##
## Model 1: mpg ~ hp + wt
## Model 2: mpg ~ hp + wt + disp
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 29 195.05
## 2 28 194.99 1 0.05708 0.0082 0.9285
Point: Tests whether the dropped set (here
disp) has coefficients equal to zero jointly, conditional on variables kept.
summary(m_full)
evaluate a single predictor given others.summ_full <- summary(m_full)
summ_hp_wt <- summary(m_hp_wt)
c(RSE_full = summ_full$sigma, R2_full = summ_full$r.squared,
RSE_hp_wt = summ_hp_wt$sigma, R2_hp_wt = summ_hp_wt$r.squared)
## RSE_full R2_full RSE_hp_wt R2_hp_wt
## 2.6389302 0.8268361 2.5934118 0.8267855
new_car <- data.frame(hp = 150, wt = 3.0, disp = 200)
# Point prediction from the model without disp (often competitive here)
predict(m_hp_wt, newdata = new_car, interval = "none")
## 1
## 20.82784
# Confidence interval for the **mean** mpg at these predictors
predict(m_hp_wt, newdata = new_car, interval = "confidence", level = 0.95)
## fit lwr upr
## 1 20.82784 19.83556 21.82012
# Prediction interval for **an individual** car's mpg
predict(m_hp_wt, newdata = new_car, interval = "prediction", level = 0.95)
## fit lwr upr
## 1 20.82784 15.43169 26.22398
par(mfrow = c(2, 2))
plot(m_full) # residuals vs fitted, QQ-plot, scale-location, Cook's distance/leverage
par(mfrow = c(1, 1))