#Read in the data/ load libraries
getwd()
## [1] "C:/GEOG 5680/module10/module10"
list.files()
## [1] "enrollmentForecast.csv" "module10.Rmd"           "module10.Rproj"
enf = read.csv("enrollmentForecast.csv")
library(ggplot2)
#look at the data structure
str(enf)
## 'data.frame':    29 obs. of  5 variables:
##  $ YEAR : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ ROLL : int  5501 5945 6629 7556 8716 9369 9920 10167 11084 12504 ...
##  $ UNEM : num  8.1 7 7.3 7.5 7 6.4 6.5 6.4 6.3 7.7 ...
##  $ HGRAD: int  9552 9680 9731 11666 14675 15265 15484 15723 16501 16890 ...
##  $ INC  : int  1923 1961 1979 2030 2112 2192 2235 2351 2411 2475 ...
summary(enf)
##       YEAR         ROLL            UNEM            HGRAD            INC      
##  Min.   : 1   Min.   : 5501   Min.   : 5.700   Min.   : 9552   Min.   :1923  
##  1st Qu.: 8   1st Qu.:10167   1st Qu.: 7.000   1st Qu.:15723   1st Qu.:2351  
##  Median :15   Median :14395   Median : 7.500   Median :17203   Median :2863  
##  Mean   :15   Mean   :12707   Mean   : 7.717   Mean   :16528   Mean   :2729  
##  3rd Qu.:22   3rd Qu.:14969   3rd Qu.: 8.200   3rd Qu.:18266   3rd Qu.:3127  
##  Max.   :29   Max.   :16081   Max.   :10.100   Max.   :19800   Max.   :3345
#Make scatterplots of ROLL against the other variables
ggplot(enf, aes(x = UNEM, y = ROLL)) +
  geom_point() +
  geom_smooth(method = "lm", col = "red") +
  labs(title = "Scatterplot of Enrollment (ROLL) vs Unemployment Rate (UNEM)", 
       x = "Unemployment Rate (UNEM)", y = "Enrollment (ROLL)")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(enf, aes(x = HGRAD, y = ROLL)) +
  geom_point() +
  geom_smooth(method = "lm", col = "pink") +
  labs(title = "Scatterplot of Enrollment (ROLL) vs 
       High School Graduates (HGRAD)", x = "High School Graduates (HGRAD)", 
       y = "Enrollment (ROLL)")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(enf, aes(x = INC, y = ROLL)) +
  geom_point() +
  geom_smooth(method = "lm", col = "lightgreen") +
  labs(title = "Scatterplot of Enrollment (ROLL) vs Per Capita Income (INC)",
       x = "Per Capita Income (INC)", y = "Enrollment (ROLL)")
## `geom_smooth()` using formula = 'y ~ x'

#build the linear model using the UNEM and HGRAD to predcit the fall enrollment
model1 <- lm(ROLL ~ UNEM + HGRAD, data = enf)
#Invesitgate the model using summary() and anova()
summary(model1)
## 
## Call:
## lm(formula = ROLL ~ UNEM + HGRAD, data = enf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2102.2  -861.6  -349.4   374.5  3603.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.256e+03  2.052e+03  -4.023  0.00044 ***
## UNEM         6.983e+02  2.244e+02   3.111  0.00449 ** 
## HGRAD        9.423e-01  8.613e-02  10.941 3.16e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1313 on 26 degrees of freedom
## Multiple R-squared:  0.8489, Adjusted R-squared:  0.8373 
## F-statistic: 73.03 on 2 and 26 DF,  p-value: 2.144e-11
anova(model1)
## Analysis of Variance Table
## 
## Response: ROLL
##           Df    Sum Sq   Mean Sq F value    Pr(>F)    
## UNEM       1  45407767  45407767  26.349 2.366e-05 ***
## HGRAD      1 206279143 206279143 119.701 3.157e-11 ***
## Residuals 26  44805568   1723291                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#which variable is most closely related to enrollment
#UNEM becasue the t value is closer to zero
#make a residual plot
residual = resid(model1)
fitted = fitted(model1)
ggplot(enf, aes(x = fitted, y = residual)) +
  geom_point() +
  geom_hline(yintercept = 0, col = "green")+
  ggtitle("Residual Plot") + xlab("Fitted Values") + ylab("Residuals")

#Use the predict() function to estimate expected fall enrollment
new_data = data.frame(UNEM = 9, HGRAD = 25000)
predicted_enrollment = predict(model1, newdata = new_data)
predicted_enrollment
##        1 
## 21585.58
#build a second model which includes per capita income (INC)
model2 = lm(ROLL ~ UNEM + HGRAD + INC, data = enf)
#compare the two models with ANOVA()
anova(model1, model2)
## Analysis of Variance Table
## 
## Model 1: ROLL ~ UNEM + HGRAD
## Model 2: ROLL ~ UNEM + HGRAD + INC
##   Res.Df      RSS Df Sum of Sq     F    Pr(>F)    
## 1     26 44805568                                 
## 2     25 11237313  1  33568255 74.68 5.594e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1