Module 10: Linear model of student enrollment

UNM enrollment data

Start by:

Read in the data
Look at the data structure

enroll = read.csv("enrollmentForecast.csv")
str(enroll)

## 'data.frame':    29 obs. of  5 variables:
##  $ YEAR : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ ROLL : int  5501 5945 6629 7556 8716 9369 9920 10167 11084 12504 ...
##  $ UNEM : num  8.1 7 7.3 7.5 7 6.4 6.5 6.4 6.3 7.7 ...
##  $ HGRAD: int  9552 9680 9731 11666 14675 15265 15484 15723 16501 16890 ...
##  $ INC  : int  1923 1961 1979 2030 2112 2192 2235 2351 2411 2475 ...

summary(enroll)

##       YEAR         ROLL            UNEM            HGRAD            INC      
##  Min.   : 1   Min.   : 5501   Min.   : 5.700   Min.   : 9552   Min.   :1923  
##  1st Qu.: 8   1st Qu.:10167   1st Qu.: 7.000   1st Qu.:15723   1st Qu.:2351  
##  Median :15   Median :14395   Median : 7.500   Median :17203   Median :2863  
##  Mean   :15   Mean   :12707   Mean   : 7.717   Mean   :16528   Mean   :2729  
##  3rd Qu.:22   3rd Qu.:14969   3rd Qu.: 8.200   3rd Qu.:18266   3rd Qu.:3127  
##  Max.   :29   Max.   :16081   Max.   :10.100   Max.   :19800   Max.   :3345

Make scatterplots of ROLL against the other variables

library(ggplot2)
library(ggthemes)
myplot = ggplot(enroll, aes(x=UNEM, y=ROLL)) + ggtitle("Unemployment Rate vs Enrollment") + xlab("Unemployment Rate") + ylab("Fall Enrollment")
myplot + geom_point(color='lightblue') + geom_line(color='lightblue') + theme_bw()

library(ggplot2)
library(ggthemes)
myplot = ggplot(enroll, aes(x=YEAR, y=ROLL)) + ggtitle("Year vs Enrollment") + xlab("Year") + ylab("Fall Enrollment")
myplot + geom_point(color='lightpink') + geom_line(color='lightpink') + theme_bw()

library(ggplot2)
library(ggthemes)
myplot = ggplot(enroll, aes(x=HGRAD, y=ROLL)) + ggtitle("High School Graduates vs Enrollment") + xlab("High School Graduates") + ylab("Fall Enrollment")
myplot + geom_point(color='goldenrod4') + geom_line(color='goldenrod4') + theme_bw()

library(ggplot2)
library(ggthemes)
myplot = ggplot(enroll, aes(x=INC, y=ROLL)) + ggtitle("Monthly Per Capita Income vs Enrollment") + xlab("Monthly per Capita Income in 1961 US$") + ylab("Fall Enrollment")
myplot + geom_point(color='forestgreen') + geom_line(color='forestgreen') + theme_bw()

Build a linear model using the unemployment rate (UNEM) and number of spring high school graduates (HGRAD) to predict the fall enrollment (ROLL)

ROLL ~ UNEM + HGRAD

lm(ROLL ~ UNEM + HGRAD, data = enroll)

## 
## Call:
## lm(formula = ROLL ~ UNEM + HGRAD, data = enroll)
## 
## Coefficients:
## (Intercept)         UNEM        HGRAD  
##  -8255.7511     698.2681       0.9423

fall_enroll = lm(ROLL ~ UNEM + HGRAD, data = enroll)
class(fall_enroll)

## [1] "lm"

str(fall_enroll)

## List of 12
##  $ coefficients : Named num [1:3] -8255.751 698.268 0.942
##   ..- attr(*, "names")= chr [1:3] "(Intercept)" "UNEM" "HGRAD"
##  $ residuals    : Named num [1:29] -900 192 618 -418 -1744 ...
##   ..- attr(*, "names")= chr [1:29] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:29] -68429 6739 14362 -414 -1634 ...
##   ..- attr(*, "names")= chr [1:29] "(Intercept)" "UNEM" "HGRAD" "" ...
##  $ rank         : int 3
##  $ fitted.values: Named num [1:29] 6401 5753 6011 7974 10460 ...
##   ..- attr(*, "names")= chr [1:29] "1" "2" "3" "4" ...
##  $ assign       : int [1:3] 0 1 2
##  $ qr           :List of 5
##   ..$ qr   : num [1:29, 1:3] -5.385 0.186 0.186 0.186 0.186 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:29] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:3] "(Intercept)" "UNEM" "HGRAD"
##   .. ..- attr(*, "assign")= int [1:3] 0 1 2
##   ..$ qraux: num [1:3] 1.19 1.13 1.33
##   ..$ pivot: int [1:3] 1 2 3
##   ..$ tol  : num 1e-07
##   ..$ rank : int 3
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 26
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = ROLL ~ UNEM + HGRAD, data = enroll)
##  $ terms        :Classes 'terms', 'formula'  language ROLL ~ UNEM + HGRAD
##   .. ..- attr(*, "variables")= language list(ROLL, UNEM, HGRAD)
##   .. ..- attr(*, "factors")= int [1:3, 1:2] 0 1 0 0 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:3] "ROLL" "UNEM" "HGRAD"
##   .. .. .. ..$ : chr [1:2] "UNEM" "HGRAD"
##   .. ..- attr(*, "term.labels")= chr [1:2] "UNEM" "HGRAD"
##   .. ..- attr(*, "order")= int [1:2] 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(ROLL, UNEM, HGRAD)
##   .. ..- attr(*, "dataClasses")= Named chr [1:3] "numeric" "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:3] "ROLL" "UNEM" "HGRAD"
##  $ model        :'data.frame':   29 obs. of  3 variables:
##   ..$ ROLL : int [1:29] 5501 5945 6629 7556 8716 9369 9920 10167 11084 12504 ...
##   ..$ UNEM : num [1:29] 8.1 7 7.3 7.5 7 6.4 6.5 6.4 6.3 7.7 ...
##   ..$ HGRAD: int [1:29] 9552 9680 9731 11666 14675 15265 15484 15723 16501 16890 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language ROLL ~ UNEM + HGRAD
##   .. .. ..- attr(*, "variables")= language list(ROLL, UNEM, HGRAD)
##   .. .. ..- attr(*, "factors")= int [1:3, 1:2] 0 1 0 0 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:3] "ROLL" "UNEM" "HGRAD"
##   .. .. .. .. ..$ : chr [1:2] "UNEM" "HGRAD"
##   .. .. ..- attr(*, "term.labels")= chr [1:2] "UNEM" "HGRAD"
##   .. .. ..- attr(*, "order")= int [1:2] 1 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(ROLL, UNEM, HGRAD)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:3] "numeric" "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:3] "ROLL" "UNEM" "HGRAD"
##  - attr(*, "class")= chr "lm"

Notes (unsure where this could go): new_enroll = 16081 - mean(enroll$HGRAD) -8255.7511 + (698.2681 * new_enroll)

predict_fall = data.frame(UNEM + HGRAD = new_enroll) predict(fall_enroll, predict_fall)

predict(fall_enroll, predict_fall, interval = “prediction”)

Use the summary() and anova() functions to investigate the model:

Which variable is the most closely related to enrollment? It seems that high school graduates is more related to enrollment since it has the 3 star signif. code on both the annova and summary diagnostics
Make a residual plot and check for any bias in the model Due to the U-shaped pattern in the graph, we can conclude that a linear model may not be appropriate for this data

summary(fall_enroll)

## 
## Call:
## lm(formula = ROLL ~ UNEM + HGRAD, data = enroll)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2102.2  -861.6  -349.4   374.5  3603.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.256e+03  2.052e+03  -4.023  0.00044 ***
## UNEM         6.983e+02  2.244e+02   3.111  0.00449 ** 
## HGRAD        9.423e-01  8.613e-02  10.941 3.16e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1313 on 26 degrees of freedom
## Multiple R-squared:  0.8489, Adjusted R-squared:  0.8373 
## F-statistic: 73.03 on 2 and 26 DF,  p-value: 2.144e-11

anova(fall_enroll)

## Analysis of Variance Table
## 
## Response: ROLL
##           Df    Sum Sq   Mean Sq F value    Pr(>F)    
## UNEM       1  45407767  45407767  26.349 2.366e-05 ***
## HGRAD      1 206279143 206279143 119.701 3.157e-11 ***
## Residuals 26  44805568   1723291                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

hist(residuals(fall_enroll))

plot(fall_enroll, which = 1)

Use the predict() function to estimate the expected fall enrollment, if the current year’s unemployment rate is 9% and the size of the spring high school graduating class is 25,000 students

predict_fall = data.frame(UNEM = 0.09, HGRAD = 25000)
predict(fall_enroll, predict_fall)

##        1 
## 15364.01

Build a second model which includes per capita income (INC).

Compare the two models with anova(). Does including this variable improve the model? Including this variable does improve the model as seen in the significance code where all three variables are now coded as three stars

lm(ROLL ~ UNEM + HGRAD + INC, data = enroll)

## 
## Call:
## lm(formula = ROLL ~ UNEM + HGRAD + INC, data = enroll)
## 
## Coefficients:
## (Intercept)         UNEM        HGRAD          INC  
##  -9153.2545     450.1245       0.4065       4.2749

fall_enroll2 = lm(ROLL ~ UNEM + HGRAD + INC, data = enroll)
summary(fall_enroll2)

## 
## Call:
## lm(formula = ROLL ~ UNEM + HGRAD + INC, data = enroll)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1148.84  -489.71    -1.88   387.40  1425.75 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -9.153e+03  1.053e+03  -8.691 5.02e-09 ***
## UNEM         4.501e+02  1.182e+02   3.809 0.000807 ***
## HGRAD        4.065e-01  7.602e-02   5.347 1.52e-05 ***
## INC          4.275e+00  4.947e-01   8.642 5.59e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 670.4 on 25 degrees of freedom
## Multiple R-squared:  0.9621, Adjusted R-squared:  0.9576 
## F-statistic: 211.5 on 3 and 25 DF,  p-value: < 2.2e-16

anova(fall_enroll)

## Analysis of Variance Table
## 
## Response: ROLL
##           Df    Sum Sq   Mean Sq F value    Pr(>F)    
## UNEM       1  45407767  45407767  26.349 2.366e-05 ***
## HGRAD      1 206279143 206279143 119.701 3.157e-11 ***
## Residuals 26  44805568   1723291                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(fall_enroll2)

## Analysis of Variance Table
## 
## Response: ROLL
##           Df    Sum Sq   Mean Sq F value    Pr(>F)    
## UNEM       1  45407767  45407767  101.02 2.894e-10 ***
## HGRAD      1 206279143 206279143  458.92 < 2.2e-16 ***
## INC        1  33568255  33568255   74.68 5.594e-09 ***
## Residuals 25  11237313    449493                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1