enrollment = read.csv("enrollmentForecast.csv")
str(enrollment)
## 'data.frame': 29 obs. of 5 variables:
## $ YEAR : int 1 2 3 4 5 6 7 8 9 10 ...
## $ ROLL : int 5501 5945 6629 7556 8716 9369 9920 10167 11084 12504 ...
## $ UNEM : num 8.1 7 7.3 7.5 7 6.4 6.5 6.4 6.3 7.7 ...
## $ HGRAD: int 9552 9680 9731 11666 14675 15265 15484 15723 16501 16890 ...
## $ INC : int 1923 1961 1979 2030 2112 2192 2235 2351 2411 2475 ...
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
ggplot(enrollment, aes(x = ROLL, y = YEAR)) + geom_point() ## Vs YEAR
ggplot(enrollment, aes(x = ROLL, y = UNEM)) + geom_point() ## Vs UNEM
ggplot(enrollment, aes(x = ROLL, y = HGRAD)) + geom_point() ## Vs HGRAD
ggplot(enrollment, aes(x = ROLL, y = INC)) + geom_point() ## Vs INC
enrollment$UNEM.cen = enrollment$UNEM - mean(enrollment$UNEM)
enrollment$HGRAD.cen = enrollment$HGRAD - mean(enrollment$HGRAD)
fit1 = lm(ROLL ~ UNEM.cen + HGRAD.cen, data = enrollment)
summary(fit1)
##
## Call:
## lm(formula = ROLL ~ UNEM.cen + HGRAD.cen, data = enrollment)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2102.2 -861.6 -349.4 374.5 3603.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.271e+04 2.438e+02 52.127 < 2e-16 ***
## UNEM.cen 6.983e+02 2.244e+02 3.111 0.00449 **
## HGRAD.cen 9.423e-01 8.613e-02 10.941 3.16e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1313 on 26 degrees of freedom
## Multiple R-squared: 0.8489, Adjusted R-squared: 0.8373
## F-statistic: 73.03 on 2 and 26 DF, p-value: 2.144e-11
anova(fit1)
## Analysis of Variance Table
##
## Response: ROLL
## Df Sum Sq Mean Sq F value Pr(>F)
## UNEM.cen 1 45407767 45407767 26.349 2.366e-05 ***
## HGRAD.cen 1 206279143 206279143 119.701 3.157e-11 ***
## Residuals 26 44805568 1723291
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
hist(residuals(fit1))
plot(fit1, which = 1)
### Based on the anova and summary, high school graduates is more related to enrollment than unemployment. HGRAD has a lower p value meaning the null of them not being related must go and shows they are more related. Based on the histogram of risiduals there is some bias as it is not perfectly symmetical with more residulals on the negative side.
newROLL = data.frame(UNEM.cen = 0.09 - mean(enrollment$UNEM), HGRAD.cen = 25000 - mean(enrollment$HGRAD))
predict(fit1, newdata = newROLL, interval = "pred")
## fit lwr upr
## 1 15364.01 10461.38 20266.65
enrollment$INC.cen = enrollment$INC - mean(enrollment$INC)
fit2 = lm(ROLL ~ UNEM.cen + HGRAD.cen + INC.cen, data = enrollment)
anova(fit1)
## Analysis of Variance Table
##
## Response: ROLL
## Df Sum Sq Mean Sq F value Pr(>F)
## UNEM.cen 1 45407767 45407767 26.349 2.366e-05 ***
## HGRAD.cen 1 206279143 206279143 119.701 3.157e-11 ***
## Residuals 26 44805568 1723291
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit2)
## Analysis of Variance Table
##
## Response: ROLL
## Df Sum Sq Mean Sq F value Pr(>F)
## UNEM.cen 1 45407767 45407767 101.02 2.894e-10 ***
## HGRAD.cen 1 206279143 206279143 458.92 < 2.2e-16 ***
## INC.cen 1 33568255 33568255 74.68 5.594e-09 ***
## Residuals 25 11237313 449493
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1