read in data

library(ggplot2)
unm <- read.csv("enrollmentForecast.csv")
str(unm)
## 'data.frame':    29 obs. of  5 variables:
##  $ YEAR : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ ROLL : int  5501 5945 6629 7556 8716 9369 9920 10167 11084 12504 ...
##  $ UNEM : num  8.1 7 7.3 7.5 7 6.4 6.5 6.4 6.3 7.7 ...
##  $ HGRAD: int  9552 9680 9731 11666 14675 15265 15484 15723 16501 16890 ...
##  $ INC  : int  1923 1961 1979 2030 2112 2192 2235 2351 2411 2475 ...
summary(unm)
##       YEAR         ROLL            UNEM            HGRAD            INC      
##  Min.   : 1   Min.   : 5501   Min.   : 5.700   Min.   : 9552   Min.   :1923  
##  1st Qu.: 8   1st Qu.:10167   1st Qu.: 7.000   1st Qu.:15723   1st Qu.:2351  
##  Median :15   Median :14395   Median : 7.500   Median :17203   Median :2863  
##  Mean   :15   Mean   :12707   Mean   : 7.717   Mean   :16528   Mean   :2729  
##  3rd Qu.:22   3rd Qu.:14969   3rd Qu.: 8.200   3rd Qu.:18266   3rd Qu.:3127  
##  Max.   :29   Max.   :16081   Max.   :10.100   Max.   :19800   Max.   :3345

#make scatterplots of ROLL against the other variables

ggplot(unm, aes(x = ROLL, y = UNEM)) + geom_point()

ggplot(unm, aes(x = ROLL, y = HGRAD)) + geom_point()

ggplot(unm, aes(x = ROLL, y = INC)) + geom_point()

#build linear model using UNEM and number of spring HGRAD to predict fall ROLL (ROLL ~ UNEM + HGRAD)

lm(ROLL ~ UNEM + HGRAD, data = unm)
## 
## Call:
## lm(formula = ROLL ~ UNEM + HGRAD, data = unm)
## 
## Coefficients:
## (Intercept)         UNEM        HGRAD  
##  -8255.7511     698.2681       0.9423
fall <- lm(ROLL ~ UNEM + HGRAD, data = unm)
class(fall)
## [1] "lm"
str(fall)
## List of 12
##  $ coefficients : Named num [1:3] -8255.751 698.268 0.942
##   ..- attr(*, "names")= chr [1:3] "(Intercept)" "UNEM" "HGRAD"
##  $ residuals    : Named num [1:29] -900 192 618 -418 -1744 ...
##   ..- attr(*, "names")= chr [1:29] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:29] -68429 6739 14362 -414 -1634 ...
##   ..- attr(*, "names")= chr [1:29] "(Intercept)" "UNEM" "HGRAD" "" ...
##  $ rank         : int 3
##  $ fitted.values: Named num [1:29] 6401 5753 6011 7974 10460 ...
##   ..- attr(*, "names")= chr [1:29] "1" "2" "3" "4" ...
##  $ assign       : int [1:3] 0 1 2
##  $ qr           :List of 5
##   ..$ qr   : num [1:29, 1:3] -5.385 0.186 0.186 0.186 0.186 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:29] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:3] "(Intercept)" "UNEM" "HGRAD"
##   .. ..- attr(*, "assign")= int [1:3] 0 1 2
##   ..$ qraux: num [1:3] 1.19 1.13 1.33
##   ..$ pivot: int [1:3] 1 2 3
##   ..$ tol  : num 1e-07
##   ..$ rank : int 3
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 26
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = ROLL ~ UNEM + HGRAD, data = unm)
##  $ terms        :Classes 'terms', 'formula'  language ROLL ~ UNEM + HGRAD
##   .. ..- attr(*, "variables")= language list(ROLL, UNEM, HGRAD)
##   .. ..- attr(*, "factors")= int [1:3, 1:2] 0 1 0 0 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:3] "ROLL" "UNEM" "HGRAD"
##   .. .. .. ..$ : chr [1:2] "UNEM" "HGRAD"
##   .. ..- attr(*, "term.labels")= chr [1:2] "UNEM" "HGRAD"
##   .. ..- attr(*, "order")= int [1:2] 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(ROLL, UNEM, HGRAD)
##   .. ..- attr(*, "dataClasses")= Named chr [1:3] "numeric" "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:3] "ROLL" "UNEM" "HGRAD"
##  $ model        :'data.frame':   29 obs. of  3 variables:
##   ..$ ROLL : int [1:29] 5501 5945 6629 7556 8716 9369 9920 10167 11084 12504 ...
##   ..$ UNEM : num [1:29] 8.1 7 7.3 7.5 7 6.4 6.5 6.4 6.3 7.7 ...
##   ..$ HGRAD: int [1:29] 9552 9680 9731 11666 14675 15265 15484 15723 16501 16890 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language ROLL ~ UNEM + HGRAD
##   .. .. ..- attr(*, "variables")= language list(ROLL, UNEM, HGRAD)
##   .. .. ..- attr(*, "factors")= int [1:3, 1:2] 0 1 0 0 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:3] "ROLL" "UNEM" "HGRAD"
##   .. .. .. .. ..$ : chr [1:2] "UNEM" "HGRAD"
##   .. .. ..- attr(*, "term.labels")= chr [1:2] "UNEM" "HGRAD"
##   .. .. ..- attr(*, "order")= int [1:2] 1 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(ROLL, UNEM, HGRAD)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:3] "numeric" "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:3] "ROLL" "UNEM" "HGRAD"
##  - attr(*, "class")= chr "lm"

#use summary() and anova() to investigate the model. which variable is most closely related to enrollment? make residual plot and check for any bias in model

summary(fall)
## 
## Call:
## lm(formula = ROLL ~ UNEM + HGRAD, data = unm)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2102.2  -861.6  -349.4   374.5  3603.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.256e+03  2.052e+03  -4.023  0.00044 ***
## UNEM         6.983e+02  2.244e+02   3.111  0.00449 ** 
## HGRAD        9.423e-01  8.613e-02  10.941 3.16e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1313 on 26 degrees of freedom
## Multiple R-squared:  0.8489, Adjusted R-squared:  0.8373 
## F-statistic: 73.03 on 2 and 26 DF,  p-value: 2.144e-11
hist(residuals(fall))

plot(fall, which = 1)

plot(fall, which = 4)

#use predict() to estimate the expected fall enrollment, current year’s UNEM is 9% and HGRAD is 25,000

fallpredict <- data.frame(UNEM = 0.09, HGRAD = 25000)
predict(fall, fallpredict)
##        1 
## 15364.01

#build second model that includes INC

allfall <- data.frame(UNEM = 0.09, HGRAD = 25000, INC = "prediction")
predict(fall, allfall)
##        1 
## 15364.01
predictINC <- predict(fall, allfall)
predictnoINC <- predict(fall, fallpredict)
aov(predictINC ~ predictnoINC)
## Call:
##    aov(formula = predictINC ~ predictnoINC)
## 
## Terms:
##                 <empty>
## Sum of Squares        0
## Deg. of Freedom       0