- What is the goal?
- The model in one line
- OLS estimators (math)
- Inference for slope (math)
- A real example (mtcars)
- Plot the relationship (ggplot2)
- Check the fit (residuals)
- Predictions with uncertainty
- Bonus: Plotly 3D
- Key takeaways + code
(mtcars)
\[ \text{mpg} = \beta_0 + \beta_1 \cdot \text{wt} + \varepsilon, \quad \varepsilon \sim \mathcal{N}(0,\sigma^2) \]
\[ \hat{\beta}_1 = \frac{\sum_{i=1}^n (x_i-\bar{x})(y_i-\bar{y})}{\sum_{i=1}^n (x_i-\bar{x})^2}, \qquad \hat{\beta}_0 = \bar{y} - \hat{\beta}_1\,\bar{x}. \]
\[ \hat{\beta}_1 \pm t_{n-2,\,1-\alpha/2}\;\mathrm{SE}(\hat{\beta}_1), \qquad \mathrm{SE}(\hat{\beta}_1) = \sqrt{\frac{\hat{\sigma}^2}{\sum_{i=1}^n (x_i-\bar{x})^2}}, \quad \hat{\sigma}^2 = \frac{1}{n-2}\sum_{i=1}^n (y_i-\hat{y}_i)^2. \]
# Example data: mtcars ----

# Packages attached for the rest of the script.
library(dplyr)
library(ggplot2)
library(plotly)

# Convert mtcars to a tibble, moving the row names into a `car` column, and
# keep only the columns selected below.
df <- tibble::as_tibble(mtcars, rownames = "car") |>
  select(car, mpg, wt, hp, disp)

# Compact overview: one line per column with its type and first values.
dplyr::glimpse(df)
## Rows: 32
## Columns: 5
## $ car  <chr> "Mazda RX4", "Mazda RX4 Wag", "Datsun 710", "Hornet 4 Drive", "Ho…
## $ mpg  <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2, 17.8,…
## $ wt   <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3.150, 3.…
## $ hp   <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 180, 180…
## $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140.8, 16…
# Scatter plot of mpg against weight, overlaid with a linear-model smoother
# and its standard-error band.
p1 <- ggplot(data = df, mapping = aes(wt, mpg)) +
  geom_point(alpha = 0.85) +
  geom_smooth(method = "lm", se = TRUE) +
  ggtitle("Heavier cars generally get lower mpg") +
  xlab("Weight (1000 lbs)") +
  ylab("Miles per Gallon") +
  theme_minimal()
p1
# Fit the linear model of mpg on weight.
fit_slr <- lm(mpg ~ wt, data = df)

# Coefficient table: estimate, standard error, t value and p-value per term.
coef(summary(fit_slr))
##              Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 37.285126   1.877627 19.857575 8.241799e-19
## wt          -5.344472   0.559101 -9.559044 1.293959e-10
# Fitted values and residuals of the model, side by side, as input for the
# diagnostic plots.
aug <- tibble::tibble(fitted = stats::fitted(fit_slr),
                      resid = stats::residuals(fit_slr))
# Residuals against fitted values, with a dashed reference line at zero drawn
# underneath the points.
p2 <- ggplot(data = aug, mapping = aes(fitted, resid)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_point(alpha = 0.85) +
  ggtitle("Residuals vs Fitted") +
  xlab("Fitted mpg") +
  ylab("Residual (actual - fitted)") +
  theme_minimal()
p2
# Quantile-quantile plot of the residuals with a reference line.
p3 <- ggplot(data = aug, mapping = aes(sample = resid)) +
  stat_qq(alpha = 0.85) +
  stat_qq_line() +
  ggtitle("QQ Plot of Residuals") +
  xlab("Theoretical Quantiles") +
  ylab("Sample Quantiles") +
  theme_minimal()
p3
# Weights at which to predict mpg.
new_wt <- tibble::tibble(wt = c(2.0, 3.0, 4.0))

# 95% confidence interval at each weight.
pred_ci <- predict(fit_slr, newdata = new_wt,
                   interval = "confidence", level = 0.95)

# 95% prediction interval at each weight.
pred_pi <- predict(fit_slr, newdata = new_wt,
                   interval = "prediction", level = 0.95)

# One row per weight: the fit, the confidence bounds (lwr/upr) and the
# prediction bounds (PI_L/PI_U).
out <- cbind(
  new_wt,
  as.data.frame(pred_ci),
  PI_L = pred_pi[, "lwr"],
  PI_U = pred_pi[, "upr"]
)
out
##   wt      fit      lwr      upr      PI_L     PI_U
## 1  2 26.59618 24.82389 28.36848 20.128114 33.06425
## 2  3 21.25171 20.12444 22.37899 14.929874 27.57355
## 3  4 15.90724 14.49018 17.32429  9.527355 22.28712
#' Fit a linear model from a response name and a predictor name
#'
#' @param data Passed to `lm()` as its `data` argument.
#' @param response String used as the left-hand side of the formula.
#' @param predictor String used as the right-hand side of the formula.
#' @return A list with two elements: `summary`, the `summary()` of the fit,
#'   and `coef`, the `coef()` of the fit.
simple_slr <- function(data, response, predictor) {
  # The formula is held in a local called `fo`; lm() records the call as
  # written, so the printed summary shows "lm(formula = fo, data = data)".
  fo <- as.formula(paste0(response, " ~ ", predictor))
  model <- lm(formula = fo, data = data)
  list(
    summary = summary(model),
    coef = coef(model)
  )
}

# Demo on mtcars: print the model summary.
simple_slr(mtcars, "mpg", "wt")[["summary"]]
##
## Call:
## lm(formula = fo, data = data)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -4.5432 -2.3647 -0.1252  1.4096  6.8727
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  37.2851     1.8776  19.858  < 2e-16 ***
## wt           -5.3445     0.5591  -9.559 1.29e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared:  0.7528, Adjusted R-squared:  0.7446
## F-statistic: 91.38 on 1 and 30 DF,  p-value: 1.294e-10