Project 10

q1 and q2

# Read the enrollment data set from the working directory, then show each
# column's type and leading values.
enroll <- read.csv(file = "enrollmentForecast.csv")

str(enroll)

## 'data.frame':    29 obs. of  5 variables:
##  $ YEAR : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ ROLL : int  5501 5945 6629 7556 8716 9369 9920 10167 11084 12504 ...
##  $ UNEM : num  8.1 7 7.3 7.5 7 6.4 6.5 6.4 6.3 7.7 ...
##  $ HGRAD: int  9552 9680 9731 11666 14675 15265 15484 15723 16501 16890 ...
##  $ INC  : int  1923 1961 1979 2030 2112 2192 2235 2351 2411 2475 ...

# List the column names of the data frame.
colnames(enroll)

## [1] "YEAR"  "ROLL"  "UNEM"  "HGRAD" "INC"

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Shared canvas: every scatter plot below puts ROLL on the x-axis.
roll <- ggplot(data = enroll, mapping = aes(x = ROLL))

# One scatter plot per remaining column, each drawn against ROLL.
rollyear <- roll + geom_point(mapping = aes(y = YEAR))
rollunem <- roll + geom_point(mapping = aes(y = UNEM))
rollhgrad <- roll + geom_point(mapping = aes(y = HGRAD))
rollinc <- roll + geom_point(mapping = aes(y = INC))

# Render the four plots in the order they were built.
print(rollyear)

print(rollunem)

print(rollhgrad)

print(rollinc)

q4 and q5

library(ggplot2)
library(dplyr)


# Fit ROLL as a linear function of UNEM and HGRAD by least squares.
econroll <- lm(formula = ROLL ~ UNEM + HGRAD, data = enroll)

# Show the call and the estimated coefficients.
print(econroll)

## 
## Call:
## lm(formula = ROLL ~ UNEM + HGRAD, data = enroll)
## 
## Coefficients:
## (Intercept)         UNEM        HGRAD  
##  -8255.7511     698.2681       0.9423

#To really understand the data, a visual helps me:

# Scatter of UNEM against ROLL with a least-squares line.
# Plot the raw data frame rather than the fitted model: handing an lm object
# to ggplot() only works through the implicit fortify.lm() conversion. The fit
# used all 29 rows of `enroll`, so the points drawn are the same.
# The smoother regresses y on x (here UNEM on ROLL), which is the reverse of
# the orientation used in the econroll model.
ggplot(enroll, aes(x = ROLL, y = UNEM)) +
  geom_point() +
  geom_smooth(method = "lm")

## `geom_smooth()` using formula 'y ~ x'

# Scatter of HGRAD against ROLL with a least-squares line.
# Plot the raw data frame rather than the fitted model: handing an lm object
# to ggplot() only works through the implicit fortify.lm() conversion. The fit
# used all 29 rows of `enroll`, so the points drawn are the same.
# The smoother regresses y on x (here HGRAD on ROLL), which is the reverse of
# the orientation used in the econroll model.
ggplot(enroll, aes(x = ROLL, y = HGRAD)) +
  geom_point() +
  geom_smooth(method = "lm")

## `geom_smooth()` using formula 'y ~ x'

# Coefficient table, residual standard error, R-squared and overall F-test
# for the two-predictor model.
summary(econroll)

## 
## Call:
## lm(formula = ROLL ~ UNEM + HGRAD, data = enroll)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2102.2  -861.6  -349.4   374.5  3603.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.256e+03  2.052e+03  -4.023  0.00044 ***
## UNEM         6.983e+02  2.244e+02   3.111  0.00449 ** 
## HGRAD        9.423e-01  8.613e-02  10.941 3.16e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1313 on 26 degrees of freedom
## Multiple R-squared:  0.8489, Adjusted R-squared:  0.8373 
## F-statistic: 73.03 on 2 and 26 DF,  p-value: 2.144e-11

# Sequential analysis-of-variance table: terms are entered in formula order,
# so the HGRAD row measures what HGRAD adds once UNEM is already in the model.
anova(econroll)

## Analysis of Variance Table
## 
## Response: ROLL
##           Df    Sum Sq   Mean Sq F value    Pr(>F)    
## UNEM       1  45407767  45407767  26.349 2.366e-05 ***
## HGRAD      1 206279143 206279143 119.701 3.157e-11 ***
## Residuals 26  44805568   1723291                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

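#HGRAD has a much higher F and t value, and a much lower p-value. HGRAD's best-fit line is much steeper. However, the slope (coefficient) for UNEM is much, much higher (698.27 vs. 0.94). This implies a much larger change in ROLL per unit of UNEM, though the two predictors are measured on very different scales.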

# Histogram of the residuals from the two-predictor model.
hist(residuals(econroll))

# Residuals-versus-fitted diagnostic plot (panel 1 of plot.lm).
plot(econroll, which = 1)

#However, examining the residuals tells us that there may be something missing in our model. It may not be the right time to come to a conclusion, given the unevenness of both the histogram and the line graph.

library(ggplot2)
library(dplyr)


# NOTE(review): predict() on an lm has no HGRAD or UNEM parameters, so these
# two values are absorbed by `...` and ignored. With no `newdata` supplied the
# call returns prediction intervals for the 29 rows the model was fitted on
# (hence the warning and the 29-row table below), not for HGRAD = 25000.
# The later call that passes a `newdata` data frame is the one that produces
# the single-scenario prediction.
predict(econroll, HGRAD = 25000 , UNEM = 0.09, interval = "prediction")

## Warning in predict.lm(econroll, HGRAD = 25000, UNEM = 0.09, interval = "prediction"): predictions on current data refer to _future_ responses

##          fit       lwr       upr
## 1   6400.849  3373.290  9428.409
## 2   5753.366  2758.488  8748.243
## 3   6010.902  3021.725  9000.080
## 4   7973.862  5101.115 10846.609
## 5  10460.039  7683.181 13236.896
## 6  10597.021  7785.717 13408.325
## 7  10873.207  8072.308 13674.105
## 8  11028.584  8219.452 13837.716
## 9  11691.849  8870.724 14512.973
## 10 13035.970 10290.669 15781.271
## 11 13680.036 10925.628 16434.445
## 12 13666.156 10910.553 16421.759
## 13 13974.182 11208.952 16739.412
## 14 14681.677 11915.401 17447.952
## 15 16990.239 14025.275 19955.202
## 16 15340.369 12508.841 18171.897
## 17 15021.347 12242.020 17800.673
## 18 14103.487 11128.567 17078.407
## 19 14940.074 12059.347 17820.800
## 20 15399.003 12597.789 18200.218
## 21 14855.113 12060.344 17649.881
## 22 15858.621 13019.279 18697.964
## 23 15581.535 12631.320 18531.750
## 24 13286.419 10535.760 16037.077
## 25 13677.799 10889.192 16466.407
## 26 13880.684 11064.310 16697.058
## 27 13837.044 11048.794 16625.295
## 28 13427.113 10679.826 16174.400
## 29 12477.453  9711.518 15243.389

# Predict ROLL for a single hypothetical observation, with a prediction
# interval at predict()'s default level.
# NOTE(review): the UNEM values in the data look like percentages
# (8.1, 7, 7.3, ...), so 0.09 lies far below anything observed. If a 9%
# unemployment rate was intended this should be UNEM = 9 -- confirm before
# relying on the result.
newtemp <- data.frame(
  HGRAD = 25000,
  UNEM = 0.09
)

predict(
  econroll,
  newdata = newtemp,
  interval = "prediction"
)

##        fit      lwr      upr
## 1 15364.01 10461.38 20266.65

#Prediction is 15364.01 students enrolled which is

# Average ROLL across all rows, used as the baseline for the prediction above.
mean(enroll[["ROLL"]])

## [1] 12707.03

#higher than the mean.

library(ggplot2)
library(dplyr)

# Extend the two-predictor model with INC as a third predictor.
econroll2 <- lm(formula = ROLL ~ UNEM + HGRAD + INC, data = enroll)

# Re-display the two-predictor fit so both models can be compared.
summary(econroll)

## 
## Call:
## lm(formula = ROLL ~ UNEM + HGRAD, data = enroll)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2102.2  -861.6  -349.4   374.5  3603.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.256e+03  2.052e+03  -4.023  0.00044 ***
## UNEM         6.983e+02  2.244e+02   3.111  0.00449 ** 
## HGRAD        9.423e-01  8.613e-02  10.941 3.16e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1313 on 26 degrees of freedom
## Multiple R-squared:  0.8489, Adjusted R-squared:  0.8373 
## F-statistic: 73.03 on 2 and 26 DF,  p-value: 2.144e-11

# Sequential ANOVA table for the two-predictor model, repeated here as the
# baseline for the comparison with econroll2.
anova(econroll)

## Analysis of Variance Table
## 
## Response: ROLL
##           Df    Sum Sq   Mean Sq F value    Pr(>F)    
## UNEM       1  45407767  45407767  26.349 2.366e-05 ***
## HGRAD      1 206279143 206279143 119.701 3.157e-11 ***
## Residuals 26  44805568   1723291                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Multiple R-squared:  0.8489,   Adjusted R-squared:  0.8373 

# Coefficient table, residual standard error, R-squared and overall F-test
# for the three-predictor model (UNEM, HGRAD and INC).
summary(econroll2)

## 
## Call:
## lm(formula = ROLL ~ UNEM + HGRAD + INC, data = enroll)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1148.84  -489.71    -1.88   387.40  1425.75 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -9.153e+03  1.053e+03  -8.691 5.02e-09 ***
## UNEM         4.501e+02  1.182e+02   3.809 0.000807 ***
## HGRAD        4.065e-01  7.602e-02   5.347 1.52e-05 ***
## INC          4.275e+00  4.947e-01   8.642 5.59e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 670.4 on 25 degrees of freedom
## Multiple R-squared:  0.9621, Adjusted R-squared:  0.9576 
## F-statistic: 211.5 on 3 and 25 DF,  p-value: < 2.2e-16

# Sequential ANOVA table for the three-predictor model. The UNEM and HGRAD
# sums of squares match the two-predictor table; their F values differ only
# because the residual mean square in the denominator is smaller.
anova(econroll2)

## Analysis of Variance Table
## 
## Response: ROLL
##           Df    Sum Sq   Mean Sq F value    Pr(>F)    
## UNEM       1  45407767  45407767  101.02 2.894e-10 ***
## HGRAD      1 206279143 206279143  458.92 < 2.2e-16 ***
## INC        1  33568255  33568255   74.68 5.594e-09 ***
## Residuals 25  11237313    449493                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Multiple R-squared:  0.9621,   Adjusted R-squared:  0.9576 

#Adding the per capita income data makes this a pretty impressive model, accounting for almost all of the variation in ROLL (R-squared of about 0.96).

#In the ANOVA tables, the p-values for UNEM and HGRAD dropped when INC was added, which is a good sign as well. UNEM's F value roughly quadrupled (26.35 to 101.02).

# Histogram of the residuals from the three-predictor model.
hist(residuals(econroll2))

# Residuals-versus-fitted diagnostic plot (panel 1 of plot.lm).
plot(econroll2, which = 1)

#The residuals histogram has calmed down and the line graph is no longer as chaotic.

Project 10

Scott Udall

6/15/2021