enroll <- read.csv("enrollmentForecast.csv")
str(enroll)
## 'data.frame': 29 obs. of 5 variables:
## $ YEAR : int 1 2 3 4 5 6 7 8 9 10 ...
## $ ROLL : int 5501 5945 6629 7556 8716 9369 9920 10167 11084 12504 ...
## $ UNEM : num 8.1 7 7.3 7.5 7 6.4 6.5 6.4 6.3 7.7 ...
## $ HGRAD: int 9552 9680 9731 11666 14675 15265 15484 15723 16501 16890 ...
## $ INC : int 1923 1961 1979 2030 2112 2192 2235 2351 2411 2475 ...
summary(enroll)
## YEAR ROLL UNEM HGRAD INC
## Min. : 1 Min. : 5501 Min. : 5.700 Min. : 9552 Min. :1923
## 1st Qu.: 8 1st Qu.:10167 1st Qu.: 7.000 1st Qu.:15723 1st Qu.:2351
## Median :15 Median :14395 Median : 7.500 Median :17203 Median :2863
## Mean :15 Mean :12707 Mean : 7.717 Mean :16528 Mean :2729
## 3rd Qu.:22 3rd Qu.:14969 3rd Qu.: 8.200 3rd Qu.:18266 3rd Qu.:3127
## Max. :29 Max. :16081 Max. :10.100 Max. :19800 Max. :3345
library(ggplot2)
roll_year <- ggplot(enroll, aes(x = ROLL, y = YEAR)) + geom_point() +
ggtitle("Enrollment and Year")
roll_year
roll_unem <- ggplot(enroll, aes(x = ROLL, y = UNEM)) + geom_point() +
ggtitle("Enrollment and Unemployment Rate")
roll_unem
roll_hgrad <- ggplot(enroll, aes(x = ROLL, y = HGRAD)) + geom_point() +
ggtitle("Enrollment and Number of Graduates")
roll_hgrad
roll_inc <- ggplot(enroll, aes(x= ROLL, y = INC)) + geom_point() +
ggtitle("Enrollment and Income")
roll_inc
enroll$UNEM.cen = enroll$UNEM - mean(enroll$UNEM)
enroll_fit = lm(ROLL ~ UNEM.cen + HGRAD, data = enroll)
summary(enroll_fit)
##
## Call:
## lm(formula = ROLL ~ UNEM.cen + HGRAD, data = enroll)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2102.2 -861.6 -349.4 374.5 3603.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.867e+03 1.444e+03 -1.985 0.05777 .
## UNEM.cen 6.983e+02 2.244e+02 3.111 0.00449 **
## HGRAD 9.423e-01 8.613e-02 10.941 3.16e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1313 on 26 degrees of freedom
## Multiple R-squared: 0.8489, Adjusted R-squared: 0.8373
## F-statistic: 73.03 on 2 and 26 DF, p-value: 2.144e-11
anova(enroll_fit)
## Analysis of Variance Table
##
## Response: ROLL
## Df Sum Sq Mean Sq F value Pr(>F)
## UNEM.cen 1 45407767 45407767 26.349 2.366e-05 ***
## HGRAD 1 206279143 206279143 119.701 3.157e-11 ***
## Residuals 26 44805568 1723291
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(enroll_fit, which = 1)
predict_enroll = data.frame(UNEM.cen = 0.09, HGRAD = 25000)
predict(enroll_fit, predict_enroll)
## 1
## 20752.72
enroll_inc = lm(ROLL~ UNEM.cen + HGRAD + INC, data = enroll)
anova(enroll_fit)
## Analysis of Variance Table
##
## Response: ROLL
## Df Sum Sq Mean Sq F value Pr(>F)
## UNEM.cen 1 45407767 45407767 26.349 2.366e-05 ***
## HGRAD 1 206279143 206279143 119.701 3.157e-11 ***
## Residuals 26 44805568 1723291
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(enroll_inc)
## Analysis of Variance Table
##
## Response: ROLL
## Df Sum Sq Mean Sq F value Pr(>F)
## UNEM.cen 1 45407767 45407767 101.02 2.894e-10 ***
## HGRAD 1 206279143 206279143 458.92 < 2.2e-16 ***
## INC 1 33568255 33568255 74.68 5.594e-09 ***
## Residuals 25 11237313 449493
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(enroll_fit)
##
## Call:
## lm(formula = ROLL ~ UNEM.cen + HGRAD, data = enroll)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2102.2 -861.6 -349.4 374.5 3603.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.867e+03 1.444e+03 -1.985 0.05777 .
## UNEM.cen 6.983e+02 2.244e+02 3.111 0.00449 **
## HGRAD 9.423e-01 8.613e-02 10.941 3.16e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1313 on 26 degrees of freedom
## Multiple R-squared: 0.8489, Adjusted R-squared: 0.8373
## F-statistic: 73.03 on 2 and 26 DF, p-value: 2.144e-11
summary(enroll_inc)
##
## Call:
## lm(formula = ROLL ~ UNEM.cen + HGRAD + INC, data = enroll)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1148.84 -489.71 -1.88 387.40 1425.75
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.680e+03 8.062e+02 -7.045 2.20e-07 ***
## UNEM.cen 4.501e+02 1.182e+02 3.809 0.000807 ***
## HGRAD 4.065e-01 7.602e-02 5.347 1.52e-05 ***
## INC 4.275e+00 4.947e-01 8.642 5.59e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 670.4 on 25 degrees of freedom
## Multiple R-squared: 0.9621, Adjusted R-squared: 0.9576
## F-statistic: 211.5 on 3 and 25 DF, p-value: < 2.2e-16
Including the income (INC) variable does seem to improve the model. The R2 value increased, indicating that INC contributes to the model’s ability to explain the relationship to enrollment.