library(ggplot2)
unm <- read.csv("enrollmentForecast.csv")
str(unm)
## 'data.frame': 29 obs. of 5 variables:
## $ YEAR : int 1 2 3 4 5 6 7 8 9 10 ...
## $ ROLL : int 5501 5945 6629 7556 8716 9369 9920 10167 11084 12504 ...
## $ UNEM : num 8.1 7 7.3 7.5 7 6.4 6.5 6.4 6.3 7.7 ...
## $ HGRAD: int 9552 9680 9731 11666 14675 15265 15484 15723 16501 16890 ...
## $ INC : int 1923 1961 1979 2030 2112 2192 2235 2351 2411 2475 ...
summary(unm)
## YEAR ROLL UNEM HGRAD INC
## Min. : 1 Min. : 5501 Min. : 5.700 Min. : 9552 Min. :1923
## 1st Qu.: 8 1st Qu.:10167 1st Qu.: 7.000 1st Qu.:15723 1st Qu.:2351
## Median :15 Median :14395 Median : 7.500 Median :17203 Median :2863
## Mean :15 Mean :12707 Mean : 7.717 Mean :16528 Mean :2729
## 3rd Qu.:22 3rd Qu.:14969 3rd Qu.: 8.200 3rd Qu.:18266 3rd Qu.:3127
## Max. :29 Max. :16081 Max. :10.100 Max. :19800 Max. :3345
#make scatterplots of ROLL against the other variables
ggplot(unm, aes(x = ROLL, y = UNEM)) + geom_point()
ggplot(unm, aes(x = ROLL, y = HGRAD)) + geom_point()
ggplot(unm, aes(x = ROLL, y = INC)) + geom_point()
#build linear model using UNEM and number of spring HGRAD to predict fall ROLL (ROLL ~ UNEM + HGRAD)
lm(ROLL ~ UNEM + HGRAD, data = unm)
##
## Call:
## lm(formula = ROLL ~ UNEM + HGRAD, data = unm)
##
## Coefficients:
## (Intercept) UNEM HGRAD
## -8255.7511 698.2681 0.9423
fall <- lm(ROLL ~ UNEM + HGRAD, data = unm)
class(fall)
## [1] "lm"
str(fall)
## List of 12
## $ coefficients : Named num [1:3] -8255.751 698.268 0.942
## ..- attr(*, "names")= chr [1:3] "(Intercept)" "UNEM" "HGRAD"
## $ residuals : Named num [1:29] -900 192 618 -418 -1744 ...
## ..- attr(*, "names")= chr [1:29] "1" "2" "3" "4" ...
## $ effects : Named num [1:29] -68429 6739 14362 -414 -1634 ...
## ..- attr(*, "names")= chr [1:29] "(Intercept)" "UNEM" "HGRAD" "" ...
## $ rank : int 3
## $ fitted.values: Named num [1:29] 6401 5753 6011 7974 10460 ...
## ..- attr(*, "names")= chr [1:29] "1" "2" "3" "4" ...
## $ assign : int [1:3] 0 1 2
## $ qr :List of 5
## ..$ qr : num [1:29, 1:3] -5.385 0.186 0.186 0.186 0.186 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:29] "1" "2" "3" "4" ...
## .. .. ..$ : chr [1:3] "(Intercept)" "UNEM" "HGRAD"
## .. ..- attr(*, "assign")= int [1:3] 0 1 2
## ..$ qraux: num [1:3] 1.19 1.13 1.33
## ..$ pivot: int [1:3] 1 2 3
## ..$ tol : num 1e-07
## ..$ rank : int 3
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 26
## $ xlevels : Named list()
## $ call : language lm(formula = ROLL ~ UNEM + HGRAD, data = unm)
## $ terms :Classes 'terms', 'formula' language ROLL ~ UNEM + HGRAD
## .. ..- attr(*, "variables")= language list(ROLL, UNEM, HGRAD)
## .. ..- attr(*, "factors")= int [1:3, 1:2] 0 1 0 0 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:3] "ROLL" "UNEM" "HGRAD"
## .. .. .. ..$ : chr [1:2] "UNEM" "HGRAD"
## .. ..- attr(*, "term.labels")= chr [1:2] "UNEM" "HGRAD"
## .. ..- attr(*, "order")= int [1:2] 1 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(ROLL, UNEM, HGRAD)
## .. ..- attr(*, "dataClasses")= Named chr [1:3] "numeric" "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:3] "ROLL" "UNEM" "HGRAD"
## $ model :'data.frame': 29 obs. of 3 variables:
## ..$ ROLL : int [1:29] 5501 5945 6629 7556 8716 9369 9920 10167 11084 12504 ...
## ..$ UNEM : num [1:29] 8.1 7 7.3 7.5 7 6.4 6.5 6.4 6.3 7.7 ...
## ..$ HGRAD: int [1:29] 9552 9680 9731 11666 14675 15265 15484 15723 16501 16890 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language ROLL ~ UNEM + HGRAD
## .. .. ..- attr(*, "variables")= language list(ROLL, UNEM, HGRAD)
## .. .. ..- attr(*, "factors")= int [1:3, 1:2] 0 1 0 0 0 1
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:3] "ROLL" "UNEM" "HGRAD"
## .. .. .. .. ..$ : chr [1:2] "UNEM" "HGRAD"
## .. .. ..- attr(*, "term.labels")= chr [1:2] "UNEM" "HGRAD"
## .. .. ..- attr(*, "order")= int [1:2] 1 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(ROLL, UNEM, HGRAD)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:3] "numeric" "numeric" "numeric"
## .. .. .. ..- attr(*, "names")= chr [1:3] "ROLL" "UNEM" "HGRAD"
## - attr(*, "class")= chr "lm"
#use summary() and anova() to investigate the model. which variable is most closely related to enrollment? make residual plot and check for any bias in model
summary(fall)
##
## Call:
## lm(formula = ROLL ~ UNEM + HGRAD, data = unm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2102.2 -861.6 -349.4 374.5 3603.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.256e+03 2.052e+03 -4.023 0.00044 ***
## UNEM 6.983e+02 2.244e+02 3.111 0.00449 **
## HGRAD 9.423e-01 8.613e-02 10.941 3.16e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1313 on 26 degrees of freedom
## Multiple R-squared: 0.8489, Adjusted R-squared: 0.8373
## F-statistic: 73.03 on 2 and 26 DF, p-value: 2.144e-11
hist(residuals(fall))
plot(fall, which = 1)
plot(fall, which = 4)
#use predict() to estimate the expected fall enrollment, current year’s UNEM is 9% and HGRAD is 25,000
fallpredict <- data.frame(UNEM = 0.09, HGRAD = 25000)
predict(fall, fallpredict)
## 1
## 15364.01
#build second model that includes INC
allfall <- data.frame(UNEM = 0.09, HGRAD = 25000, INC = "prediction")
predict(fall, allfall)
## 1
## 15364.01
predictINC <- predict(fall, allfall)
predictnoINC <- predict(fall, fallpredict)
aov(predictINC ~ predictnoINC)
## Call:
## aov(formula = predictINC ~ predictnoINC)
##
## Terms:
## <empty>
## Sum of Squares 0
## Deg. of Freedom 0