- A means of interpreting the relationships between variables (how do changes in X affect Y?)
- Allows for the prediction of numeric outcomes based on observable features
- Foundation for many machine learning models
Given some predictor, \(x\), and its response, \(y\), their presumed linear relationship is modeled as follows: \[ y_i = \beta_0 + \beta_1 x_i + \varepsilon_i,\quad i=1,\dots,n, \] where the errors are \[\varepsilon_i \sim {N}(0,\sigma^2)\quad\].
library(ggplot2) library(dplyr) library(tibble) library(plotly) # Create dataset x <- runif(100, 0, 10) y <- 3 + 2*x + rnorm(100, 0, 2) data <- tibble(x, y) # Fit linear regression model <- lm(y ~ x, data = data)
ggplot(data, aes(x, y)) + geom_point() + geom_smooth(method = "lm", se = TRUE) + labs(title = "Data with Fitted Line", x = "X", y = "Y")
residuals <- resid(model) fitted_vals <- fitted(model) resid_data <- tibble(fitted = fitted_vals, resid = residuals) ggplot(resid_data, aes(fitted, resid)) + geom_point() + geom_hline(yintercept = 0, linetype = "dashed") + labs(title = "Residuals vs Fitted", x = "Fitted", y = "Residuals")
summary(model)$coefficients
## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 2.716664 0.46246530 5.874308 5.835836e-08 ## x 2.078672 0.07760014 26.786965 6.944611e-47
summary(model)$r.squared
## [1] 0.8798344
# Make predictions for a sequence of x values new_x <- tibble(x = seq(0, 10, length.out = 100)) # Get prediction intervals pred_mat <- predict(model, newdata = new_x, interval = "prediction") pred <- cbind(new_x, as.data.frame(pred_mat))
ggplot() + geom_point(data = data, aes(x, y)) + geom_line(data = pred, aes(x, fit)) + geom_ribbon(data = pred, aes(x, ymin = lwr, ymax = upr), alpha = 0.2) + labs(title = "Predictions with 95% Bands", x = "X", y = "Y")
Linear models can also be produced using multiple predictors. For example using \(x_1\) and \(x_2\).
set.seed(2) x1 <- runif(80, 0, 10) x2 <- runif(80, -5, 5) y2 <- 1 + 1.2*x1 - 0.7*x2 + rnorm(80, 0, 2) data2 <- tibble(x1, x2, y2) model2 <- lm(y2 ~ x1 + x2, data = data2) # Build grid g1 <- seq(min(x1), max(x1), length.out = 25) g2 <- seq(min(x2), max(x2), length.out = 25) grid <- expand.grid(x1 = g1, x2 = g2) grid$yhat <- coef(model2)[1] + coef(model2)[2]*grid$x1 + coef(model2)[3]*grid$x2
plot_ly() |> add_markers(data = data2, x = ~x1, y = ~x2, z = ~y2, opacity = 0.6) |> add_surface(x=~g1, y=~g2, z=~matrix(grid$yhat, nrow=length(g1), ncol=length(g2)), showscale=FALSE, opacity=0.5) |> layout(title = "3D Regression Plane")