- Goal: Model the relationship between car speed (mph) and stopping distance (ft)
- Tool: Simple Linear Regression (SLR)
- Dataset: built-in `cars` (n = 50)
- We’ll fit the model, check assumptions, do inference, and make predictions
\[\text{dist}_i = \beta_0 + \beta_1\,\text{speed}_i + \varepsilon_i,\quad \varepsilon_i \sim \mathcal{N}(0,\sigma^2)\]
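The model's assumptions (a linear mean function, constant error variance, and normally distributed errors) will be assessed via residual plots and a Q-Q plot.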
# Preview the data: render the first six rows of `cars` as a captioned table.
kable(
  head(cars, n = 6L),
  caption = "First 6 rows of the cars data"
)
| speed | dist |
|---|---|
| 4 | 2 |
| 4 | 10 |
| 7 | 4 |
| 7 | 22 |
| 8 | 16 |
| 9 | 10 |
# Fit the simple linear regression of stopping distance (dist) on speed.
fit <- lm(dist ~ speed, data = cars)

# Print the model summary: residual quantiles, the coefficient table, the
# residual standard error, R-squared, and the overall F-test.
summary(fit)
## 
## Call:
## lm(formula = dist ~ speed, data = cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.069  -9.525  -2.272   9.215  43.201 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17.5791     6.7584  -2.601   0.0123 *  
## speed         3.9324     0.4155   9.464 1.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared:  0.6511, Adjusted R-squared:  0.6438 
## F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12
# Scatterplot of stopping distance against speed, with the fitted
# least-squares line and its standard-error band (se = TRUE) drawn on top.
base_scatter <- ggplot(data = cars, mapping = aes(x = speed, y = dist)) +
  geom_point(alpha = 0.8) +
  geom_smooth(method = "lm", se = TRUE, linewidth = 1) +
  labs(
    title = "Stopping Distance vs Speed",
    x = "Speed (mph)",
    y = "Stopping Distance (ft)"
  ) +
  theme_minimal()

# Evaluating the plot object at top level renders it.
base_scatter
## `geom_smooth()` using formula = 'y ~ x'
# Build a per-observation diagnostics table from the fitted model; the plots
# below read its .fitted, .resid and .std.resid columns.
# NOTE(review): augment() is presumably broom::augment -- the library() calls
# are not visible in this excerpt.
aug <- augment(fit)

# Residuals vs fitted values, with a dashed reference line at zero. Used to
# look for curvature (non-linearity) or a funnel shape (non-constant variance).
rvf <- ggplot(aug, aes(.fitted, .resid)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_point(alpha = 0.8) +
  labs(
    title = "Residuals vs Fitted",
    x = "Fitted values",
    y = "Residuals"
  ) +
  theme_minimal()

rvf
# Normal Q-Q plot of the standardized residuals stored in `aug`, with a
# reference line; points close to the line support the normal-errors
# assumption.
qq <- ggplot(aug, aes(sample = .std.resid)) +
  stat_qq() +
  stat_qq_line() +
  labs(
    title = "Normal Q–Q Plot",
    x = "Theoretical Quantiles",
    y = "Standardized Residuals"
  ) +
  theme_minimal()

qq
# Static version of the scatterplot with the fitted line only (no band,
# se = FALSE); it is converted to an interactive widget below.
plt <- ggplot(data = cars, mapping = aes(x = speed, y = dist)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Interactive: Distance vs Speed",
    x = "Speed (mph)",
    y = "Stopping Distance (ft)"
  ) +
  theme_minimal()

# Hand the ggplot object to ggplotly() to produce the interactive plot.
ggplotly(plt)
## `geom_smooth()` using formula = 'y ~ x'
# Coefficient table as a data frame, one row per term, with confidence bounds
# (conf.low / conf.high) appended by conf.int = TRUE.
# NOTE(review): the caption labels the intervals as 95%; no conf.level is
# passed, so this relies on tidy()'s default -- confirm.
coef_tab <- tidy(fit, conf.int = TRUE)

# Render the table, rounded to three decimals.
kable(coef_tab, digits = 3, caption = "SLR coefficients with 95% CIs")
| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | -17.579 | 6.758 | -2.601 | 0.012 | -31.168 | -3.990 |
| speed | 3.932 | 0.416 | 9.464 | 0.000 | 3.097 | 4.768 |
# Speeds (mph) at which to predict stopping distance.
new_speeds <- data.frame(speed = c(10, 15, 20, 25))

# Point predictions with 95% prediction intervals. These are intervals for a
# single new observation, so they are wider than confidence intervals for the
# mean response.
preds <- predict(
  fit,
  newdata = new_speeds,
  interval = "prediction",
  level = 0.95
)

# Combine the inputs with the rounded fit / lwr / upr columns. The interval is
# not constrained to be non-negative, so the lower bound can fall below zero
# at low speeds.
results <- cbind(new_speeds, round(preds, 1))

kable(
  results,
  caption = "Predicted stopping distance with 95% prediction intervals"
)
| speed | fit | lwr | upr |
|---|---|---|---|
| 10 | 21.7 | -9.8 | 53.3 |
| 15 | 41.4 | 10.2 | 72.6 |
| 20 | 61.1 | 29.6 | 92.5 |
| 25 | 80.7 | 48.5 | 113.0 |
# Minimal base-R recap of the analysis: fit, plot, predict.

# 1) Fit SLR
fit <- lm(dist ~ speed, data = cars)

# 2) Quick plot: raw data with the fitted regression line overlaid in red
plot(
  cars$speed, cars$dist,
  xlab = "Speed (mph)",
  ylab = "Stopping Distance (ft)",
  main = "cars: dist vs speed"
)
abline(fit, col = "red", lwd = 2)

# 3) Predict stopping distance at 20 mph, with a prediction interval
predict(fit, newdata = data.frame(speed = 20), interval = "prediction")
##        fit      lwr      upr
## 1 61.06908 29.60309 92.53507