- Linear regression is when you fit a line to data
- We use it to predict one variable from another
- Example: predicting exam scores from study hours for ASU students
- This is a simple example to understand the basics
The equation for simple linear regression is:
\[ y = \beta_0 + \beta_1 x + \varepsilon \]
Where:
- \(y\) is the dependent variable (what we’re predicting)
- \(x\) is the independent variable
- \(\beta_0\) is the intercept
- \(\beta_1\) is the slope
- \(\varepsilon\) is the error term
In our example: \(y\) = Exam Score, \(x\) = Study Hours
We use Ordinary Least Squares (OLS) to find the best line, i.e. the line that minimizes the sum of squared residuals:
\[ \hat{\beta}_1 = \frac{\sum (x_i - \bar{x})(y_i - \bar{y})}{\sum (x_i - \bar{x})^2} \]
\[ \hat{\beta}_0 = \bar{y} - \hat{\beta}_1 \bar{x} \]
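The R-squared tells us how good the fit is, as the proportion of the variation in \(y\) that the line explains:
\[ R^2 = 1 - \frac{SS_{res}}{SS_{tot}} \]
where \(SS_{res} = \sum (y_i - \hat{y}_i)^2\) is the residual sum of squares and \(SS_{tot} = \sum (y_i - \bar{y})^2\) is the total sum of squares.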
# Load packages
library(ggplot2)
library(plotly)

# Study hours and exam scores; the two vectors are paired by position
study_hours <- c(
  2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 3, 5,
  7, 9, 11, 13, 15, 4, 6, 8, 10, 12, 14, 16, 5, 7, 9
)
exam_scores <- c(
  52, 61, 68, 71, 74, 77, 79, 81, 84, 87, 89, 91, 93, 58, 66,
  73, 78, 83, 88, 92, 63, 70, 76, 80, 85, 90, 95, 67, 75, 82
)

# Put all of the data in one data frame
data <- data.frame(study_hours = study_hours, exam_scores = exam_scores)

# Display the first rows as a quick check of the data
head(data)
##   study_hours exam_scores
## 1           2          52
## 2           4          61
## 3           5          68
## 4           6          71
## 5           7          74
## 6           8          77
# Fit a linear regression of exam score on study hours
model <- lm(exam_scores ~ study_hours, data = data)

# Look at the results: coefficient table, residual summary, fit statistics
summary(model)
##
## Call:
## lm(formula = exam_scores ~ study_hours, data = data)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -6.0910 -0.9077  0.3260  1.1500  4.4000
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  52.5170     0.9556   54.95   <2e-16 ***
## study_hours   2.7870     0.0976   28.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.061 on 28 degrees of freedom
## Multiple R-squared:  0.9668, Adjusted R-squared:  0.9656
## F-statistic: 815.4 on 1 and 28 DF,  p-value: < 2.2e-16
# Get the estimated coefficients (intercept and slope)
coef(model)
## (Intercept) study_hours
##   52.517040    2.786996
# Pull R-squared out of the model summary
summary(model)$r.squared
## [1] 0.9667993
# Scatterplot of exam score against study hours, with the fitted
# regression line (and its standard-error band) drawn over the points
ggplot(data = data, mapping = aes(x = study_hours, y = exam_scores)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    x = "Study Hours",
    y = "Exam Score",
    title = "Exam Score vs Study Hours"
  ) +
  theme_minimal()
# Residuals by hand: observed exam score minus the model's prediction
fitted_values <- predict(model)
residuals_calc <- data[["exam_scores"]] - fitted_values

# Pair each fitted value with its residual for plotting
plot_data <- data.frame(fitted = fitted_values, residuals = residuals_calc)
# Residuals against fitted values; the dashed red line marks a residual of 0
ggplot(data = plot_data, mapping = aes(x = fitted, y = residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(
    x = "Fitted Values",
    y = "Residuals",
    title = "Residual Plot"
  ) +
  theme_minimal()
# Store the residuals (mapped to the z axis of the 3D plot) and the fitted
# values as extra columns of the data frame
data$residuals <- residuals_calc
data$fitted_vals <- fitted_values
# Build the 3D scatter: study hours on x, exam score on y, residual on z,
# with each marker coloured by its residual
p <- plot_ly(
  data = data,
  type = "scatter3d",
  x = ~study_hours,
  y = ~exam_scores,
  z = ~residuals,
  mode = "markers",
  marker = list(size = 4, color = ~residuals)
)
# Add the plot title and axis labels, then display the figure
p <- layout(
  p,
  title = "3D Plot: Study Hours, Exam Scores, and Residuals",
  scene = list(
    xaxis = list(title = "Study Hours"),
    yaxis = list(title = "Exam Score"),
    zaxis = list(title = "Residuals")
  )
)
p
library(ggplot2)
library(plotly)
# Observations: hours studied and exam score, paired by position
study_hours <- c(
  2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 3, 5,
  7, 9, 11, 13, 15, 4, 6, 8, 10, 12, 14, 16, 5, 7, 9
)
exam_scores <- c(
  52, 61, 68, 71, 74, 77, 79, 81, 84, 87, 89, 91, 93, 58, 66,
  73, 78, 83, 88, 92, 63, 70, 76, 80, 85, 90, 95, 67, 75, 82
)
# Column names are taken from the variable names
data <- data.frame(study_hours, exam_scores)

# Least-squares fit of exam score on study hours
model <- lm(formula = exam_scores ~ study_hours, data = data)
# Scatterplot of the data with the fitted regression line
ggplot(data = data, mapping = aes(x = study_hours, y = exam_scores)) +
  geom_point() +
  geom_smooth(method = "lm")
# Residuals (observed minus predicted) plotted against the fitted values
residuals_calc <- data[["exam_scores"]] - predict(model)
plot_data <- data.frame(
  fitted = predict(model),
  residuals = residuals_calc
)
ggplot(data = plot_data, mapping = aes(x = fitted, y = residuals)) +
  geom_point()
# 3D scatter of study hours, exam score, and residual; the residuals are
# stored as a column first so the plot can refer to them by name
data$residuals <- residuals_calc
plot_ly(
  data = data,
  type = "scatter3d",
  x = ~study_hours,
  y = ~exam_scores,
  z = ~residuals,
  mode = "markers"
)