Module 10: Statistical Modeling in R

For this lab we will be utilizing the enrollmentForecast.csv file, which contains information used to estimate undergraduate enrollment at the University of New Mexico.
First we load in the data.

# Attach the plotting package, then read the enrollment data from the
# working directory into a data frame.
library(ggplot2)
enroll <- read.csv(file = "enrollmentForecast.csv")

Then we look at the data structure.

# Per-column summary statistics for the enrollment data.
summary(object = enroll)

##       YEAR         ROLL            UNEM            HGRAD            INC      
##  Min.   : 1   Min.   : 5501   Min.   : 5.700   Min.   : 9552   Min.   :1923  
##  1st Qu.: 8   1st Qu.:10167   1st Qu.: 7.000   1st Qu.:15723   1st Qu.:2351  
##  Median :15   Median :14395   Median : 7.500   Median :17203   Median :2863  
##  Mean   :15   Mean   :12707   Mean   : 7.717   Mean   :16528   Mean   :2729  
##  3rd Qu.:22   3rd Qu.:14969   3rd Qu.: 8.200   3rd Qu.:18266   3rd Qu.:3127  
##  Max.   :29   Max.   :16081   Max.   :10.100   Max.   :19800   Max.   :3345

# Column names of the data frame, in alphabetical order.
sort(names(enroll))

## [1] "HGRAD" "INC"   "ROLL"  "UNEM"  "YEAR"

# First six rows of the data.
head(enroll, n = 6L)

##   YEAR ROLL UNEM HGRAD  INC
## 1    1 5501  8.1  9552 1923
## 2    2 5945  7.0  9680 1961
## 3    3 6629  7.3  9731 1979
## 4    4 7556  7.5 11666 2030
## 5    5 8716  7.0 14675 2112
## 6    6 9369  6.4 15265 2192

Then we make scatterplots of ROLL against the other variables.

# Scatterplots of fall enrollment (ROLL) against each candidate predictor.
# ROLL is the response in the models fitted below, so it goes on the y-axis
# and the predictor goes on the x-axis.
ggplot(enroll, aes(x = UNEM, y = ROLL)) + geom_point()

ggplot(enroll, aes(x = HGRAD, y = ROLL)) + geom_point()

ggplot(enroll, aes(x = INC, y = ROLL)) + geom_point()

- Then we build a linear model using the unemployment rate and the number of spring high school graduates to predict fall enrollment.

# Regress fall enrollment (ROLL) on UNEM and HGRAD, then print the fitted
# call and coefficients.
roll1 <- lm(formula = ROLL ~ UNEM + HGRAD, data = enroll)
print(roll1)

## 
## Call:
## lm(formula = ROLL ~ UNEM + HGRAD, data = enroll)
## 
## Coefficients:
## (Intercept)         UNEM        HGRAD  
##  -8255.7511     698.2681       0.9423

# Coefficient table, residual summary and fit statistics for roll1.
summary(object = roll1)

## 
## Call:
## lm(formula = ROLL ~ UNEM + HGRAD, data = enroll)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2102.2  -861.6  -349.4   374.5  3603.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.256e+03  2.052e+03  -4.023  0.00044 ***
## UNEM         6.983e+02  2.244e+02   3.111  0.00449 ** 
## HGRAD        9.423e-01  8.613e-02  10.941 3.16e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1313 on 26 degrees of freedom
## Multiple R-squared:  0.8489, Adjusted R-squared:  0.8373 
## F-statistic: 73.03 on 2 and 26 DF,  p-value: 2.144e-11

# Sequential analysis-of-variance table for roll1.
anova(object = roll1)

## Analysis of Variance Table
## 
## Response: ROLL
##           Df    Sum Sq   Mean Sq F value    Pr(>F)    
## UNEM       1  45407767  45407767  26.349 2.366e-05 ***
## HGRAD      1 206279143 206279143 119.701 3.157e-11 ***
## Residuals 26  44805568   1723291                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Diagnostic panels for roll1: panel 1 (residuals vs fitted values) and
# panel 4 (Cook's distance), drawn one after the other.
for (panel in c(1, 4)) {
  plot(roll1, which = panel)
}

# Distribution of the model residuals.
hist(x = residuals(roll1))

# Fit the same two-predictor enrollment model under a second name.
fall_enroll <- lm(formula = ROLL ~ UNEM + HGRAD, data = enroll)
# NOTE(review): 16081 matches the maximum of ROLL shown by summary(enroll),
# so newvar is max(ROLL) minus mean(HGRAD) -- confirm this is intended.
newvar <- 16081 - mean(enroll[["HGRAD"]])
# NOTE(review): the constants are the intercept and UNEM coefficient printed
# for this model; the HGRAD term is not included -- confirm the intended
# formula.
(698.2681 * newvar) - 8255.7511

## [1] -320477.9

# Residuals-vs-fitted panel for fall_enroll.
plot(x = fall_enroll, which = 1L)

# Cook's distance panel for fall_enroll.
plot(x = fall_enroll, which = 4L)

# Histogram of the fall_enroll residuals.
hist(x = residuals(fall_enroll))

- Use the predict function to estimate fall enrollment if the unemployment rate is 9% and the size of the graduating class is 25,000.

# Predict fall enrollment for a 9% unemployment rate and a graduating class
# of 25,000. UNEM is stored in percentage points (observed range 5.7 to
# 10.1), so 9% is entered as 9, not .09.
# NOTE: HGRAD = 25000 lies above the largest observed value (19800), so this
# prediction is an extrapolation.
# NOTE: the value printed below was produced with UNEM = .09; re-knit to
# refresh it (the displayed coefficients give roughly 21,586).
predict_fall <- data.frame(UNEM = 9, HGRAD = 25000)
predict(fall_enroll, newdata = predict_fall)

##        1 
## 15364.01

Build a second model which includes per capita income.

# Fit and print the three-predictor model (adds INC) without storing it.
print(lm(formula = ROLL ~ UNEM + HGRAD + INC, data = enroll))

## 
## Call:
## lm(formula = ROLL ~ UNEM + HGRAD + INC, data = enroll)
## 
## Coefficients:
## (Intercept)         UNEM        HGRAD          INC  
##  -9153.2545     450.1245       0.4065       4.2749

# Store the three-predictor model and report the class of the fitted object.
fall_enroll2 <- lm(formula = ROLL ~ UNEM + HGRAD + INC, data = enroll)
print(class(fall_enroll2))

## [1] "lm"

# Show the internal structure of the fitted model object.
str(object = fall_enroll2)

## List of 12
##  $ coefficients : Named num [1:4] -9153.254 450.125 0.406 4.275
##   ..- attr(*, "names")= chr [1:4] "(Intercept)" "UNEM" "HGRAD" "INC"
##  $ residuals    : Named num [1:29] -1095 -370.4 80.9 -86.7 -275.3 ...
##   ..- attr(*, "names")= chr [1:29] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:29] -68429.5 6738.5 14362.4 5793.8 -89.8 ...
##   ..- attr(*, "names")= chr [1:29] "(Intercept)" "UNEM" "HGRAD" "INC" ...
##  $ rank         : int 4
##  $ fitted.values: Named num [1:29] 6596 6315 6548 7643 8991 ...
##   ..- attr(*, "names")= chr [1:29] "1" "2" "3" "4" ...
##  $ assign       : int [1:4] 0 1 2 3
##  $ qr           :List of 5
##   ..$ qr   : num [1:29, 1:4] -5.385 0.186 0.186 0.186 0.186 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:29] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:4] "(Intercept)" "UNEM" "HGRAD" "INC"
##   .. ..- attr(*, "assign")= int [1:4] 0 1 2 3
##   ..$ qraux: num [1:4] 1.19 1.13 1.33 1.08
##   ..$ pivot: int [1:4] 1 2 3 4
##   ..$ tol  : num 1e-07
##   ..$ rank : int 4
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 25
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = ROLL ~ UNEM + HGRAD + INC, data = enroll)
##  $ terms        :Classes 'terms', 'formula'  language ROLL ~ UNEM + HGRAD + INC
##   .. ..- attr(*, "variables")= language list(ROLL, UNEM, HGRAD, INC)
##   .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:4] "ROLL" "UNEM" "HGRAD" "INC"
##   .. .. .. ..$ : chr [1:3] "UNEM" "HGRAD" "INC"
##   .. ..- attr(*, "term.labels")= chr [1:3] "UNEM" "HGRAD" "INC"
##   .. ..- attr(*, "order")= int [1:3] 1 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(ROLL, UNEM, HGRAD, INC)
##   .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "numeric" "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:4] "ROLL" "UNEM" "HGRAD" "INC"
##  $ model        :'data.frame':   29 obs. of  4 variables:
##   ..$ ROLL : int [1:29] 5501 5945 6629 7556 8716 9369 9920 10167 11084 12504 ...
##   ..$ UNEM : num [1:29] 8.1 7 7.3 7.5 7 6.4 6.5 6.4 6.3 7.7 ...
##   ..$ HGRAD: int [1:29] 9552 9680 9731 11666 14675 15265 15484 15723 16501 16890 ...
##   ..$ INC  : int [1:29] 1923 1961 1979 2030 2112 2192 2235 2351 2411 2475 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language ROLL ~ UNEM + HGRAD + INC
##   .. .. ..- attr(*, "variables")= language list(ROLL, UNEM, HGRAD, INC)
##   .. .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:4] "ROLL" "UNEM" "HGRAD" "INC"
##   .. .. .. .. ..$ : chr [1:3] "UNEM" "HGRAD" "INC"
##   .. .. ..- attr(*, "term.labels")= chr [1:3] "UNEM" "HGRAD" "INC"
##   .. .. ..- attr(*, "order")= int [1:3] 1 1 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(ROLL, UNEM, HGRAD, INC)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "numeric" "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:4] "ROLL" "UNEM" "HGRAD" "INC"
##  - attr(*, "class")= chr "lm"

# Re-display the summary statistics of the data for reference.
summary(object = enroll)

##       YEAR         ROLL            UNEM            HGRAD            INC      
##  Min.   : 1   Min.   : 5501   Min.   : 5.700   Min.   : 9552   Min.   :1923  
##  1st Qu.: 8   1st Qu.:10167   1st Qu.: 7.000   1st Qu.:15723   1st Qu.:2351  
##  Median :15   Median :14395   Median : 7.500   Median :17203   Median :2863  
##  Mean   :15   Mean   :12707   Mean   : 7.717   Mean   :16528   Mean   :2729  
##  3rd Qu.:22   3rd Qu.:14969   3rd Qu.: 8.200   3rd Qu.:18266   3rd Qu.:3127  
##  Max.   :29   Max.   :16081   Max.   :10.100   Max.   :19800   Max.   :3345

# Predict with the three-predictor model for the same scenario used for the
# two-predictor model (9% unemployment, graduating class of 25,000).
# Previously newvar, a single negative number, was supplied for all three
# predictors, which produced a negative enrollment.
# NOTE(review): no income scenario is given, so INC is held at its observed
# mean as a placeholder -- confirm the intended value.
# NOTE: the value printed below predates this change; re-knit to refresh it.
predict_fall2 <- data.frame(
  UNEM = 9,
  HGRAD = 25000,
  INC = mean(enroll$INC)
)
predict(fall_enroll2, newdata = predict_fall2)

##         1 
## -212514.2

I do think including the INC variable improves the model.

Module 10 Exercise RMD

Lucas Brizolara

2022-06-21

Module 10: Statistical Modeling in R