library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(dplyr)
library(stringr)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(ggplot2)
library(ggrepel)
library(stargazer)
##
## Please cite as:
##
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(readxl)
districts <- read_excel("district.xls")
Cleaning the Data
# After loading the packages and data, I filtered for districts in Bexar County:
filtered_districts <- districts |> filter(DZCNTYNM == "015 BEXAR")
# I cleaned the data further by keeping districts rated A or B and dropping rows with NAs:
cleaned_districts <- filtered_districts |>
filter(DZRATING == "A" | DZRATING == "B") |>
drop_na()
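# A quick sanity check (a sketch, not part of the original pipeline): drop_na()
# with no columns specified removes every row containing an NA in ANY column,
# which can shrink the sample considerably:
c(before = nrow(filtered_districts), after = nrow(cleaned_districts))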
# Then I removed districts flagged as non-taxing entities in the property wealth variable:
property_districts <- cleaned_districts |>
filter(PROPWLTH != "Non-taxing entities")
# Finally, I created a new variable by mutating the property wealth variable (PROPWLTH) into PROPRATING:
new_property_districts <- property_districts |>
  mutate(PROPRATING = case_when(
    PROPWLTH %in% c("Under $164,606", "$164,606 to < $234,712",
                    "$234,712 to < $298,152", "$298,152 to < $340,843",
                    "$340,843 to < $359,962", "$359,962 to < $411,857",
                    "$411,857 to < $427,868") ~ "LOW",
    PROPWLTH %in% c("$456,750 to < $479,670", "$479,670 to < $526,224",
                    "$526,224 to < $539,089") ~ "MEDIUM",
    # Non-taxing entities were already filtered out above, so this branch is defensive:
    PROPWLTH == "Non-taxing entities" ~ "NO_TAX",
    TRUE ~ "HIGH"
  ))
view(new_property_districts)
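# An alternative sketch (an assumption, not the approach used above): each
# PROPWLTH band begins with its lower dollar bound, so readr::parse_number()
# can extract that bound and cut() can bin it. The 456,750 and 539,089 cut
# points are inferred from the bands enumerated in the case_when() above:
property_districts |>
  mutate(wealth_floor = parse_number(PROPWLTH),
         PROPRATING2  = cut(wealth_floor,
                            breaks = c(-Inf, 456750, 539089, Inf),
                            labels = c("LOW", "MEDIUM", "HIGH"),
                            right  = FALSE))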
Comparing the Data
# The residuals-vs-fitted plot below suggests the linearity assumption does not hold for this model:
lr_districts <- lm(DAGC4X21R~DPFPACOMP+DA0912DR21R+DPETECOP, data=new_property_districts)
plot(lr_districts,which=1)

# The Durbin-Watson test suggests the residuals of lr_districts are autocorrelated (p = 0.022):
durbinWatsonTest(lr_districts)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.5136738 2.844957 0.022
## Alternative hypothesis: rho != 0
# The scale-location plot suggests lr_districts is not homoscedastic:
plot(lr_districts, which=3)

# However, the Breusch-Pagan test fails to reject homoscedasticity (p = 0.43):
bptest(lr_districts)
##
## studentized Breusch-Pagan test
##
## data: lr_districts
## BP = 2.733, df = 3, p-value = 0.4346
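# If the visual pattern were still a concern despite the BP result, a common
# precaution (a sketch; it requires the sandwich package, which is not loaded
# above) is to recompute the coefficient tests with
# heteroscedasticity-consistent standard errors:
library(sandwich)
coeftest(lr_districts, vcov = vcovHC(lr_districts, type = "HC1"))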
# The Q-Q plot suggests the residuals of lr_districts deviate somewhat from normality:
plot(lr_districts,which=2)

# However, the Shapiro-Wilk test fails to reject normality of the residuals (p = 0.68):
shapiro.test(lr_districts$residuals)
##
## Shapiro-Wilk normality test
##
## data: lr_districts$residuals
## W = 0.94505, p-value = 0.6846
# The VIFs below reveal severe multicollinearity in lr_districts: the % of economically disadvantaged students (DPETECOP) is strongly correlated with the % of state compensatory education expenditure (DPFPACOMP):
vif(lr_districts)
## DPFPACOMP DA0912DR21R DPETECOP
## 28.86613 10.97407 44.39489
districts_variables <- new_property_districts |> dplyr::select(DAGC4X21R, DPFPACOMP, DA0912DR21R, DPETECOP)
cor(districts_variables)
## DAGC4X21R DPFPACOMP DA0912DR21R DPETECOP
## DAGC4X21R 1.0000000 -0.1955098 -0.9139144 -0.5375277
## DPFPACOMP -0.1955098 1.0000000 0.1686857 0.8716889
## DA0912DR21R -0.9139144 0.1686857 1.0000000 0.6068681
## DPETECOP -0.5375277 0.8716889 0.6068681 1.0000000
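# To see where those VIFs come from (a sketch): VIF_j = 1 / (1 - R^2_j), where
# R^2_j comes from regressing predictor j on the remaining predictors. For
# DPETECOP this reproduces the ~44.4 reported by vif():
aux <- lm(DPETECOP ~ DPFPACOMP + DA0912DR21R, data = districts_variables)
1 / (1 - summary(aux)$r.squared)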
Mitigation of Assumptions
# To address the linearity violation, I log-transformed the variables (adding 0.001 to avoid taking the log of zero):
districts_log <- lm(log(DAGC4X21R + 0.001) ~
log(DPFPACOMP + 0.001) +
log(DA0912DR21R + 0.001) +
log(DPETECOP + 0.001),
data = districts_variables)
plot(districts_log,which=1)

# The same log transform also targets the heteroscedasticity seen earlier, yielding the following scale-location plot:
plot(districts_log,which=3)

# The Breusch-Pagan p-value decreased after the log transformation, but the test still fails to reject homoscedasticity (p = 0.23):
bptest(districts_log)
##
## studentized Breusch-Pagan test
##
## data: districts_log
## BP = 4.2582, df = 3, p-value = 0.2349
# The transformation also mitigated the normality-of-residuals concern; the Q-Q plot now looks fairly normal:
plot(districts_log,which=2)

# This is supported by the Shapiro-Wilk test on the transformed model (p = 0.63):
shapiro.test(districts_log$residuals)
##
## Shapiro-Wilk normality test
##
## data: districts_log$residuals
## W = 0.9391, p-value = 0.6307
# Removing DPETECOP addressed the multicollinearity violation, bringing the VIFs of the remaining predictors well below 10:
new_districts_variables <- lm(DAGC4X21R~DPFPACOMP+DA0912DR21R, data=new_property_districts)
vif(new_districts_variables)
## DPFPACOMP DA0912DR21R
## 1.029288 1.029288
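# A sketch combining both mitigations (the log transform and dropping
# DPETECOP), so the earlier diagnostics can be re-run on the reduced
# specification:
reduced_log <- lm(log(DAGC4X21R + 0.001) ~ log(DPFPACOMP + 0.001) +
                    log(DA0912DR21R + 0.001), data = new_property_districts)
bptest(reduced_log)
shapiro.test(residuals(reduced_log))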
Resolution
# Finally, I switched to a Gaussian GLM with a log link to assess model significance:
glm_districts<-glm(DAGC4X21R~DPFPACOMP+DA0912DR21R+DPETECOP, data=new_property_districts,family = gaussian(link=log))
summary(glm_districts)
##
## Call:
## glm(formula = DAGC4X21R ~ DPFPACOMP + DA0912DR21R + DPETECOP,
## family = gaussian(link = log), data = new_property_districts)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.603708 0.018557 248.083 1.44e-07 ***
## DPFPACOMP -0.021556 0.008321 -2.590 0.0810 .
## DA0912DR21R -0.064857 0.013995 -4.634 0.0189 *
## DPETECOP 0.003903 0.001499 2.603 0.0802 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.560691)
##
## Null deviance: 93.6886 on 6 degrees of freedom
## Residual deviance: 4.6821 on 3 degrees of freedom
## AIC: 27.05
##
## Number of Fisher Scoring iterations: 3
# The model is significant: the drop in deviance from the null model (93.69 - 4.68 ≈ 89.0) far exceeds the corresponding drop in degrees of freedom (6 - 3 = 3); a formal version of this comparison is sketched below:
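# A sketch of that formal comparison (with a Gaussian family, the deviance
# drop relative to the null model can be tested with an F test):
null_glm <- glm(DAGC4X21R ~ 1, data = new_property_districts,
                family = gaussian(link = log))
anova(null_glm, glm_districts, test = "F")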
stargazer(glm_districts,
type = "text",
title = "GLM Results",
dep.var.labels = "Graduation Rate",
# Labels must follow the model's term order (DPFPACOMP, DA0912DR21R, DPETECOP):
covariate.labels = c("% of State Comp Ed Exp", "Dropout Rate", "% of Econ DA Stud"),
report = "vc*p",
single.row = TRUE,
no.space = TRUE)
##
## GLM Results
## ==================================================
## Dependent variable:
## ---------------------------
## Graduation Rate
## --------------------------------------------------
## % of State Comp Ed Exp -0.022*
## p = 0.082
## Dropout Rate -0.065**
## p = 0.019
## % of Econ DA Stud 0.004*
## p = 0.081
## Constant 4.604***
## p = 0.00000
## --------------------------------------------------
## Observations 7
## Log Likelihood -9.525
## Akaike Inf. Crit. 27.050
## ==================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Although the primary independent variable (% of compensatory education expenditure) shows a paradoxical negative relationship, the negative relationship between graduation rates and dropout rates follows the literature; the effect of the % of economically disadvantaged students is small and only marginally significant:
coef(glm_districts)
## (Intercept) DPFPACOMP DA0912DR21R DPETECOP
## 4.603708430 -0.021555554 -0.064857252 0.003903396
# The same pattern holds when the coefficients are exponentiated:
exp(coef(glm_districts))
## (Intercept) DPFPACOMP DA0912DR21R DPETECOP
## 99.8539312 0.9786751 0.9372012 1.0039110
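# With a log link, exponentiated coefficients act multiplicatively; a sketch
# converting them to approximate percent changes in the expected graduation
# rate per one-unit increase in each predictor (the intercept is a baseline
# level, not a percent change):
round((exp(coef(glm_districts)) - 1) * 100, 2)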
Final Determination
# Ultimately, the percentage of compensatory education expenditure shows no positive association with graduation rates:
ggplot(new_property_districts,
       aes(x = DAGC4X21R, y = DPFPACOMP, color = as.factor(PROPRATING))) +
  # Mapping shape inside geom_point() keeps geom_smooth() from inheriting (and
  # warning about dropping) the shape aesthetic:
  geom_point(aes(shape = as.factor(DZRATING))) +
  geom_smooth(method = "lm", se = TRUE, aes(group = 1), color = "black") +
  labs(title = "Compensatory Education Spending on Graduation Rates:",
       subtitle = "A Paradoxically Negative Relationship",
       x = "4-Year Longitudinal Graduation Rate",
       y = "% Compensatory Education Expenditure",
       color = "Property Rating",
       shape = "District Rating")
## `geom_smooth()` using formula = 'y ~ x'
