library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pastecs)
## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(lubridate)
library(dplyr)
library(ggplot2)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
zipcode_df <- read_excel("final_zipcode_data.xlsx")
clean_zipcode_df <- zipcode_df %>% select(enrolled, cwidd, mhi, pw, ph, pb, bfpl) %>% na.omit(.)
zipcode_model <- lm(enrolled~cwidd+mhi+pw+ph+pb+bfpl, data=clean_zipcode_df)
summary(zipcode_model)
## 
## Call:
## lm(formula = enrolled ~ cwidd + mhi + pw + ph + pb + bfpl, data = clean_zipcode_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6880 -1.4806  0.2274  1.7152 16.5683 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.744e-01  2.101e+00  -0.416  0.67908    
## cwidd        6.989e-03  2.690e-03   2.598  0.01223 *  
## mhi         -1.472e-06  2.490e-05  -0.059  0.95311    
## pw           3.118e-04  1.012e-04   3.082  0.00332 ** 
## ph           1.093e-03  2.355e-04   4.640 2.47e-05 ***
## pb           5.075e-04  2.705e-04   1.876  0.06636 .  
## bfpl        -4.078e-03  8.960e-04  -4.551 3.34e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.89 on 51 degrees of freedom
## Multiple R-squared:  0.7381, Adjusted R-squared:  0.7073 
## F-statistic: 23.96 on 6 and 51 DF,  p-value: 2.954e-13
plot(zipcode_model)

raintest(zipcode_model)
## 
##  Rainbow test
## 
## data:  zipcode_model
## Rain = 2.5745, df1 = 29, df2 = 22, p-value = 0.01265
durbinWatsonTest(zipcode_model)
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.08515633      2.169274   0.576
##  Alternative hypothesis: rho != 0
vif(zipcode_model)
##    cwidd      mhi       pw       ph       pb     bfpl 
## 3.853469 1.983106 1.967901 6.672831 1.686193 8.045801
bptest(zipcode_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  zipcode_model
## BP = 28.793, df = 6, p-value = 6.657e-05
log_zipcode_model <-lm(enrolled~log(cwidd)+mhi+pw+ph+pb, data=clean_zipcode_df)
plot(log_zipcode_model)

glm_zipcode_model <-glm(enrolled~cwidd+mhi+pw+ph+pb, data=clean_zipcode_df)
summary(glm_zipcode_model)
## 
## Call:
## glm(formula = enrolled ~ cwidd + mhi + pw + ph + pb, data = clean_zipcode_df)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -6.340e+00  2.025e+00  -3.131  0.00286 **
## cwidd        2.916e-03  2.979e-03   0.979  0.33225   
## mhi          5.228e-05  2.575e-05   2.031  0.04741 * 
## pw           3.943e-04  1.169e-04   3.373  0.00141 **
## ph           2.976e-04  1.854e-04   1.605  0.11452   
## pb           4.496e-04  3.173e-04   1.417  0.16249   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 20.86625)
## 
##     Null deviance: 2946.9  on 57  degrees of freedom
## Residual deviance: 1085.0  on 52  degrees of freedom
## AIC: 348.48
## 
## Number of Fisher Scoring iterations: 2
clean_zipcode_df$predicted_enroll <- predict(zipcode_model, type = "response")
ggplot(clean_zipcode_df, aes(x = predicted_enroll, y = enrolled)) +
  geom_point(color = "#0072B2", size = 3) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(
    title = "GLM Predicted vs. Actual Enrollment",
    x = "Predicted Enrollment",
    y = "Actual Enrollment"
  ) +
  theme_minimal()