library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(dplyr)
library(stringr)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(ggplot2)
library(ggrepel)
library(stargazer)
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(readxl)
districts <- read_excel("district.xls")

Cleaning the Data

# After loading the respective packages and loading the data, I filtered for districts in Bexar County:
filtered_districts <- districts |> filter(DZCNTYNM == "015 BEXAR")
# Cleaned the data further by filtering for districts rated A and B and dropped the NAs:
cleaned_districts <- filtered_districts |> 
  filter(DZRATING == "A" | DZRATING == "B") |>
  drop_na()
# Then, I filtered out districts that are considered non-taxing entities from the variable indicating property wealth:
property_districts <- cleaned_districts |>
  filter(PROPWLTH != "Non-taxing entities")
#Finally, I created a new variable by mutating the property wealth variable (PROPWLTH) into PROPRATING:
new_property_districts <- property_districts |> mutate(PROPRATING = case_when(PROPWLTH %in% c("$234,712 to < $298,152", "Under $164,606", "$164,606 to < $234,712", "$298,152 to < $340,843", "$340,843 to < $359,962", "$359,962 to < $411,857", "$411,857 to < $427,868") ~ "LOW", PROPWLTH %in% c("$456,750 to < $479,670", "$479,670 to < $526,224", "$526,224 to < $539,089") ~ "MEDIUM", PROPWLTH == "Non-taxing entities" ~ "NO_TAX", TRUE ~ "HIGH"))
view(new_property_districts)

Comparing the Data

#This model is not linear:
lr_districts <- lm(DAGC4X21R~DPFPACOMP+DA0912DR21R+DPETECOP, data=new_property_districts)
plot(lr_districts,which=1)

# lr_districts model likely contains a correlation of residuals:
durbinWatsonTest(lr_districts)
##  lag Autocorrelation D-W Statistic p-value
##    1      -0.5136738      2.844957   0.022
##  Alternative hypothesis: rho != 0
# The model lr_districts is not homoscedastic:
plot(lr_districts, which=3)

# However according to the Breusch-Pagan test indicates the model is homoscedastic:
bptest(lr_districts)
## 
##  studentized Breusch-Pagan test
## 
## data:  lr_districts
## BP = 2.733, df = 3, p-value = 0.4346
# The linear model lr_districts has errors that are somewhat less normal
plot(lr_districts,which=2)

# However, according to the Shapiro-Wilk Test, the variables within lr_districts are probably normal.
shapiro.test(lr_districts$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lr_districts$residuals
## W = 0.94505, p-value = 0.6846
# In the model, lr_districts, % of economically disadvantaged students is strongly correlated with the expenditure % of state compensatory education: 
vif(lr_districts)
##   DPFPACOMP DA0912DR21R    DPETECOP 
##    28.86613    10.97407    44.39489
districts_variables <- new_property_districts |> dplyr::select(DAGC4X21R, DPFPACOMP, DA0912DR21R, DPETECOP)
cor(districts_variables)
##              DAGC4X21R  DPFPACOMP DA0912DR21R   DPETECOP
## DAGC4X21R    1.0000000 -0.1955098  -0.9139144 -0.5375277
## DPFPACOMP   -0.1955098  1.0000000   0.1686857  0.8716889
## DA0912DR21R -0.9139144  0.1686857   1.0000000  0.6068681
## DPETECOP    -0.5375277  0.8716889   0.6068681  1.0000000

Mitigation of Assumptions

# In order to address no linearity, I performed a log transform on the model:
districts_log <- lm(log(DAGC4X21R + 0.001) ~ 
                           log(DPFPACOMP + 0.001) + 
                           log(DA0912DR21R + 0.001) + 
                           log(DPETECOP + 0.001), 
                           data = districts_variables)
plot(districts_log,which=1)

# In order to address the the model homoscedascity violation, I performed a log transform that resulted in the following plot:
plot(districts_log,which=3)

#The Homoscedascity decreased after the log transformation, but it is still considered homoscedastic according to the bp-test: 
bptest(districts_log)
## 
##  studentized Breusch-Pagan test
## 
## data:  districts_log
## BP = 4.2582, df = 3, p-value = 0.2349
#To mitigate the normality of Residuals Violation; the transformation led to fairly normalized model:
plot(districts_log,which=2)

# Which is confirmed by the new Shapiro-Wilk test:
shapiro.test(districts_log$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  districts_log$residuals
## W = 0.9391, p-value = 0.6307
# Removing DPETECOP addressed the Multicolinearity Violation, where VIF of the independent variables is below 10:
new_districts_variables <- lm(DAGC4X21R~DPFPACOMP+DA0912DR21R, data=new_property_districts)
vif(new_districts_variables)
##   DPFPACOMP DA0912DR21R 
##    1.029288    1.029288

Resolution

# Switched to GLM to show model significance:
glm_districts<-glm(DAGC4X21R~DPFPACOMP+DA0912DR21R+DPETECOP, data=new_property_districts,family = gaussian(link=log))

summary(glm_districts)
## 
## Call:
## glm(formula = DAGC4X21R ~ DPFPACOMP + DA0912DR21R + DPETECOP, 
##     family = gaussian(link = log), data = new_property_districts)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.603708   0.018557 248.083 1.44e-07 ***
## DPFPACOMP   -0.021556   0.008321  -2.590   0.0810 .  
## DA0912DR21R -0.064857   0.013995  -4.634   0.0189 *  
## DPETECOP     0.003903   0.001499   2.603   0.0802 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.560691)
## 
##     Null deviance: 93.6886  on 6  degrees of freedom
## Residual deviance:  4.6821  on 3  degrees of freedom
## AIC: 27.05
## 
## Number of Fisher Scoring iterations: 3
# Model is significant as the difference between the Null Deviance and the Residual Deviance is greater than the difference between the Null deviance DF and the Residual deviance DF (89>3):
stargazer(glm_districts,
          type = "text",
          title = "GLM Results",
          dep.var.labels = "Graduation Rate",
          covariate.labels = c("% of State Comp Ed Exp","% of Econ DA Stud", "Dropout Rate"),
          report = "vc*p",                  
          single.row = TRUE,               
          no.space = TRUE)                 
## 
## GLM Results
## ==================================================
##                            Dependent variable:    
##                        ---------------------------
##                              Graduation Rate      
## --------------------------------------------------
## % of State Comp Ed Exp           -0.022*          
##                                 p = 0.082         
## % of Econ DA Stud               -0.065**          
##                                 p = 0.019         
## Dropout Rate                     0.004*           
##                                 p = 0.081         
## Constant                        4.604***          
##                                p = 0.00000        
## --------------------------------------------------
## Observations                        7             
## Log Likelihood                   -9.525           
## Akaike Inf. Crit.                27.050           
## ==================================================
## Note:                  *p<0.1; **p<0.05; ***p<0.01
# Despite the primary independent variable (% of compensatory education expenditure) demonstrating a paradoxical negative relationship, the negative relationship between graduation rates and dropout rates follow the literature; The affect of the % of economically disadvantaged students has little significance:
coef(glm_districts)
##  (Intercept)    DPFPACOMP  DA0912DR21R     DPETECOP 
##  4.603708430 -0.021555554 -0.064857252  0.003903396
# The same can be said when you exponent the coefficients:
exp(coef(glm_districts))
## (Intercept)   DPFPACOMP DA0912DR21R    DPETECOP 
##  99.8539312   0.9786751   0.9372012   1.0039110

Final Determination

# Ultimately, the percentage of compensatory education expenditure has no positive effect on graduation rates:
ggplot(new_property_districts,aes(x=DAGC4X21R,y=DPFPACOMP,color=as.factor(PROPRATING), shape=as.factor(DZRATING))) + 
  geom_point() + 
  geom_smooth(method="lm", se=TRUE, aes(group=1), color="black") + 
  labs(title="Compensatory Education Spending on Graduation Rates:", 
  subtitle = "A Paradoxically Negative Relationship") +
  labs(
    x = "4-Year Longitudinal Graduation Rate",
    y = "% Compensatory Education Expenditure",
    color = "Property Rating",
    shape = "District Rating"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: shape.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?