We start with study hours and exam scores.
# Sample data: hours studied and the matching exam score (nine observations)
hours <- c(1, 2, 3, 4, 5, 6, 7, 8, 9)
scores <- c(35, 40, 50, 55, 60, 65, 70, 78, 85)

# Pair the two vectors column-wise so that each row is one observation
df <- data.frame(
  Hours = hours,
  Score = scores
)

# Show the first six rows (six is the default for head())
head(df, n = 6L)
##   Hours Score
## 1     1    35
## 2     2    40
## 3     3    50
## 4     4    55
## 5     5    60
## 6     6    65
Plot hours vs. score to see if there is a linear trend.
# Scatter plot of score against study hours, drawn with solid blue points,
# to check visually whether a straight line is a reasonable description
with(
  df,
  plot(
    x = Hours, y = Score,
    xlab = "Study Hours",
    ylab = "Score",
    main = "Hours vs Score",
    pch = 19, col = "blue"
  )
)
Use lm() to fit a linear model that estimates the relationship between study hours and score.
# Fit a simple linear regression of Score on Hours by ordinary least squares
model <- lm(formula = Score ~ Hours, data = df)

# Print the coefficient table, residual summary, and R-squared
summary(model)
##
## Call:
## lm(formula = Score ~ Hours, data = df)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -1.91111 -0.84444  0.02222  0.95556  2.35556
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  29.4444     1.0737   27.43 2.20e-08 ***
## Hours         6.0667     0.1908   31.80 7.87e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.478 on 7 degrees of freedom
## Multiple R-squared:  0.9931, Adjusted R-squared:  0.9921
## F-statistic:  1011 on 1 and 7 DF,  p-value: 7.866e-09
# Redraw the scatter plot (this time without a title) ...
with(
  df,
  plot(
    x = Hours, y = Score,
    xlab = "Study Hours",
    ylab = "Score",
    pch = 19, col = "blue"
  )
)

# ... and overlay the fitted regression line as the line of best fit
abline(reg = model, col = "red", lwd = 2)
Compute predicted scores, Mean Squared Error, and R².
# Store the model's prediction for every observed row alongside the data
df[["Predicted"]] <- predict(model, newdata = df)

# Mean Squared Error: the average squared gap between observed and predicted
mse <- with(df, mean((Score - Predicted)^2))
cat("Mean Squared Error:", mse, "\n")
## Mean Squared Error: 1.698765
# R-squared (goodness of fit), pulled from the model summary
summary(model)[["r.squared"]]
## [1] 0.9931241
Example: what score does the model predict for 7.5 study hours?
# predict() looks up predictors by column name, so the new value must sit in
# a data frame whose column is called Hours, as in the model formula
new_hours <- data.frame(Hours = 7.5)
predict(model, newdata = new_hours)
## 1
## 74.94444
# Same picture with ggplot2: blue points plus a red least-squares line
# (se = FALSE leaves out the confidence band around the line)
library(ggplot2)

ggplot(data = df, mapping = aes(x = Hours, y = Score)) +
  geom_point(color = "blue", size = 3) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Study Hours vs Score",
    x = "Hours",
    y = "Score"
  )
## `geom_smooth()` using formula = 'y ~ x'