Read in the data and look at the data structure.
enroll <- read.csv("enrollmentForecast.csv")
str(enroll)
## 'data.frame':    29 obs. of  5 variables:
##  $ YEAR : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ ROLL : int  5501 5945 6629 7556 8716 9369 9920 10167 11084 12504 ...
##  $ UNEM : num  8.1 7 7.3 7.5 7 6.4 6.5 6.4 6.3 7.7 ...
##  $ HGRAD: int  9552 9680 9731 11666 14675 15265 15484 15723 16501 16890 ...
##  $ INC  : int  1923 1961 1979 2030 2112 2192 2235 2351 2411 2475 ...
summary(enroll)
##       YEAR         ROLL            UNEM            HGRAD            INC      
##  Min.   : 1   Min.   : 5501   Min.   : 5.700   Min.   : 9552   Min.   :1923  
##  1st Qu.: 8   1st Qu.:10167   1st Qu.: 7.000   1st Qu.:15723   1st Qu.:2351  
##  Median :15   Median :14395   Median : 7.500   Median :17203   Median :2863  
##  Mean   :15   Mean   :12707   Mean   : 7.717   Mean   :16528   Mean   :2729  
##  3rd Qu.:22   3rd Qu.:14969   3rd Qu.: 8.200   3rd Qu.:18266   3rd Qu.:3127  
##  Max.   :29   Max.   :16081   Max.   :10.100   Max.   :19800   Max.   :3345

Make scatterplots of ROLL against the other variables.
library(ggplot2)

roll_year <- ggplot(enroll, aes(x = ROLL, y = YEAR)) + geom_point() + 
  ggtitle("Enrollment and Year")
roll_year

roll_unem <- ggplot(enroll, aes(x = ROLL, y = UNEM)) + geom_point() +
  ggtitle("Enrollment and Unemployment Rate")
roll_unem

roll_hgrad <- ggplot(enroll, aes(x = ROLL, y = HGRAD)) + geom_point() +
  ggtitle("Enrollment and Number of Graduates")
roll_hgrad

roll_inc <- ggplot(enroll, aes(x= ROLL, y = INC)) + geom_point() + 
  ggtitle("Enrollment and Income")
roll_inc


Build a linear model using the unemployment rate (UNEM) and number of spring high school graduates (HGRAD) to predict the fall enrollment.
enroll$UNEM.cen = enroll$UNEM - mean(enroll$UNEM)
enroll_fit = lm(ROLL ~ UNEM.cen + HGRAD, data = enroll)

Use the summary() and anova() functions to investigate the model.
summary(enroll_fit)
## 
## Call:
## lm(formula = ROLL ~ UNEM.cen + HGRAD, data = enroll)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2102.2  -861.6  -349.4   374.5  3603.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.867e+03  1.444e+03  -1.985  0.05777 .  
## UNEM.cen     6.983e+02  2.244e+02   3.111  0.00449 ** 
## HGRAD        9.423e-01  8.613e-02  10.941 3.16e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1313 on 26 degrees of freedom
## Multiple R-squared:  0.8489, Adjusted R-squared:  0.8373 
## F-statistic: 73.03 on 2 and 26 DF,  p-value: 2.144e-11
anova(enroll_fit)
## Analysis of Variance Table
## 
## Response: ROLL
##           Df    Sum Sq   Mean Sq F value    Pr(>F)    
## UNEM.cen   1  45407767  45407767  26.349 2.366e-05 ***
## HGRAD      1 206279143 206279143 119.701 3.157e-11 ***
## Residuals 26  44805568   1723291                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Make a residual plot and check for any bias in the model.
plot(enroll_fit, which = 1)


Use the predict() function to estimate the expected fall enrollment, if the current year’s unemployment rate is 9% and the size of the spring high school graduating class is 25,000 students.
predict_enroll = data.frame(UNEM.cen = 0.09, HGRAD = 25000)
predict(enroll_fit, predict_enroll)
##        1 
## 20752.72

Build a second model which includes per capita income (INC).
enroll_inc = lm(ROLL~ UNEM.cen + HGRAD + INC, data = enroll)

Compare the two models with anova(). Does including this variable improve the model?
anova(enroll_fit)
## Analysis of Variance Table
## 
## Response: ROLL
##           Df    Sum Sq   Mean Sq F value    Pr(>F)    
## UNEM.cen   1  45407767  45407767  26.349 2.366e-05 ***
## HGRAD      1 206279143 206279143 119.701 3.157e-11 ***
## Residuals 26  44805568   1723291                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(enroll_inc)
## Analysis of Variance Table
## 
## Response: ROLL
##           Df    Sum Sq   Mean Sq F value    Pr(>F)    
## UNEM.cen   1  45407767  45407767  101.02 2.894e-10 ***
## HGRAD      1 206279143 206279143  458.92 < 2.2e-16 ***
## INC        1  33568255  33568255   74.68 5.594e-09 ***
## Residuals 25  11237313    449493                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(enroll_fit)
## 
## Call:
## lm(formula = ROLL ~ UNEM.cen + HGRAD, data = enroll)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2102.2  -861.6  -349.4   374.5  3603.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.867e+03  1.444e+03  -1.985  0.05777 .  
## UNEM.cen     6.983e+02  2.244e+02   3.111  0.00449 ** 
## HGRAD        9.423e-01  8.613e-02  10.941 3.16e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1313 on 26 degrees of freedom
## Multiple R-squared:  0.8489, Adjusted R-squared:  0.8373 
## F-statistic: 73.03 on 2 and 26 DF,  p-value: 2.144e-11
summary(enroll_inc)
## 
## Call:
## lm(formula = ROLL ~ UNEM.cen + HGRAD + INC, data = enroll)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1148.84  -489.71    -1.88   387.40  1425.75 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.680e+03  8.062e+02  -7.045 2.20e-07 ***
## UNEM.cen     4.501e+02  1.182e+02   3.809 0.000807 ***
## HGRAD        4.065e-01  7.602e-02   5.347 1.52e-05 ***
## INC          4.275e+00  4.947e-01   8.642 5.59e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 670.4 on 25 degrees of freedom
## Multiple R-squared:  0.9621, Adjusted R-squared:  0.9576 
## F-statistic: 211.5 on 3 and 25 DF,  p-value: < 2.2e-16

Including the income (INC) variable does seem to improve the model. The R2 value increased, indicating that INC contributes to the model’s ability to explain the relationship to enrollment.