library(alr4); library(ggplot2)
## Warning: package 'car' was built under R version 4.3.3
## Warning: package 'carData' was built under R version 4.3.3
## Warning in check_dep_version(): ABI version mismatch:
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package
# Fit a simple linear regression of Y on X using the `snake` data from alr4,
# then pull out the two estimated coefficients by name.
dd <- alr4::snake
lm_fit <- lm(Y ~ X, data = dd)
beta0 <- coef(lm_fit)["(Intercept)"]  # estimated intercept
beta1 <- coef(lm_fit)["X"]            # estimated slope on X
beta0
## (Intercept)
## 0.7253804
beta1
## X
## 0.4980812
# Scatter plot of Y against X (blue points) with the least-squares line
# overlaid in red and no confidence band.
ggplot(data = dd, mapping = aes(x = X, y = Y)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Scatter Plot with Regression Line",
    x = "Water Content of Snow",
    y = "Water Yield"
  ) +
  theme_minimal()
# Add up the residuals of the fitted model and display the total.
residuals_sum <- sum(resid(lm_fit))
residuals_sum
## [1] -6.661338e-16
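The sum of the residuals is essentially zero (the tiny nonzero value is floating-point rounding error), which confirms the expected property that least-squares residuals sum to zero when the model includes an intercept.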
# Build the ANOVA table for the fitted model and read the p-value of the
# F-test from its first row.
anova_result <- anova(lm_fit)
p_value_f <- anova_result[["Pr(>F)"]][1]
p_value_f
## [1] 4.63157e-08
The p-value that we derived from the ANOVA F-test is < 0.05, meaning we reject the null hypothesis.
# Summarise the fitted model and read the p-value of the t-test on the
# X coefficient (second row of the coefficient table).
summary_lm_fit <- summary(lm_fit)
p_value_t <- summary_lm_fit$coefficients["X", "Pr(>|t|)"]
p_value_t
## [1] 4.63157e-08
This p-value, from the t-test on the slope, is also < 0.05, so we can again reject the null hypothesis.
# Compare the ANOVA F statistic with the square of the slope's t statistic;
# all.equal() checks that they agree up to numerical tolerance.
f_statistic <- anova_result[["F value"]][1]
t_statistic <- summary_lm_fit$coefficients["X", "t value"]
f_statistic
## [1] 101.16
t_statistic
## [1] 10.05783
t_statistic^2
## [1] 101.16
all.equal(f_statistic, t_statistic^2)
## [1] TRUE
# Evaluate the fitted model at a single new point, X = 42, and compute both
# the confidence interval and the prediction interval there.
X0 <- data.frame(X = 42)
confidence_interval <- predict(lm_fit, newdata = X0, interval = "confidence")
prediction_interval <- predict(lm_fit, newdata = X0, interval = "prediction")
confidence_interval
## fit lwr upr
## 1 21.64479 20.098 23.19158
prediction_interval
## fit lwr upr
## 1 21.64479 17.62093 25.66865
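The confidence interval is narrower than the prediction interval because the confidence interval reflects only the uncertainty in the estimated mean response at X0, whereas the prediction interval also accounts for the random error of an individual new observation. The intervals here are relatively narrow, meaning there is a fair level of certainty in this fitted relationship. The further X0 is from the mean of X, the wider the intervals would be, because the fitted line is estimated less precisely there.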