library(spatialreg) # Spatial autoregressive models
library(spdep)      # Constructing neighbors and weights
library(sf)         # Manipulating spatial objects
library(tidycensus) # Bringing in data from the Census API
library(dplyr)      # Manipulating data frames
library(stringr)    # Manipulating strings
library(ggplot2)    # Plotting
df.r <- read.csv("Data/County_Student_Teacher.csv",
header = TRUE,
skip = 6)
Creating a subset of variables of interest
df <- df.r %>%
select(Full.Time.Equivalent..FTE..Teachers..Public.School..2019.20,
County.Number..Public.School..2019.20,
Total.Number.of.Public.Schools..Public.School..2019.20,
Total.Students.All.Grades..Excludes.AE...Public.School..2019.20,
Pupil.Teacher.Ratio..Public.School..2019.20) %>%
rename(Teachers_19 = Full.Time.Equivalent..FTE..Teachers..Public.School..2019.20,
FIPS = County.Number..Public.School..2019.20,
PublicSchools_19 = Total.Number.of.Public.Schools..Public.School..2019.20,
Total_Students_19 = Total.Students.All.Grades..Excludes.AE...Public.School..2019.20,
Teacher_Pupil_Ratio_19 = Pupil.Teacher.Ratio..Public.School..2019.20)
Removing NAs
df <- df %>%
  mutate(across(everything(), ~str_trim(str_replace(., "â€", "NA")))) %>% # Mis-encoded special characters in the source file show up as "â€"; recode them to "NA"
  mutate(across(everything(), ~na_if(., "NA"))) %>% # Turn the "NA" strings into real NA values
  na.omit() # Drop rows with missing values
df.census <- get_acs(geography = "county",
                     state = "TX",
                     year = 2019,
                     variables = c("DP03_0086"), # Median family income
                     geometry = TRUE,
                     output = "wide")
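One thing to watch before joining: GEOID from tidycensus is a five-character string, and the join below will fail or silently drop rows if FIPS does not match it in type and padding. A minimal guard, assuming the NCES county number is a bare five-digit FIPS code (the str_pad step is my addition, not part of the original pipeline):
df <- df %>%
  mutate(FIPS = str_pad(as.character(FIPS), width = 5, pad = "0")) # GEOID is a 5-character string; make the join key match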
df.merged <- df.census %>%
left_join(df, by = c("GEOID" = "FIPS")) %>%
na.omit()
Now that our data are merged, we will use spatial regression to examine whether the teacher-pupil ratio is related to median family income.
Queen Neighborhood
q.w <- poly2nb(df.merged, queen = TRUE) %>%
nb2listw(style = "W")
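As a quick sanity check, the contiguity graph can be drawn over the counties; the plot method for nb objects takes the neighbour list (stored in q.w$neighbours) and a coordinate matrix:
plot(st_geometry(df.merged), border = "grey") # County outlines
plot(q.w$neighbours, st_coordinates(st_centroid(df.merged)), add = TRUE, col = "red") # Overlay the queen-contiguity graph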
k = 2, 3, and 4 Nearest Neighbors
k.2.w <- knearneigh(st_centroid(df.merged), k = 2) %>%
knn2nb() %>%
nb2listw(style = "W")
k.3.w <- knearneigh(st_centroid(df.merged), k = 3) %>%
knn2nb() %>%
nb2listw(style = "W")
k.4.w <- knearneigh(st_centroid(df.merged), k = 4) %>%
knn2nb() %>%
nb2listw(style = "W")
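Because df.merged arrives from tidycensus in geographic (longitude/latitude) coordinates, st_centroid will warn that centroids may be slightly off. Projecting first silences the warning and is more defensible; a sketch, with EPSG 3083 (Texas Centric Albers) as my assumed choice of CRS:
centroids <- st_centroid(st_transform(df.merged, crs = 3083)) # Planar centroids; could replace st_centroid(df.merged) in the chunks above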
fit.ols <- lm(Teacher_Pupil_Ratio_19 ~ DP03_0086E, data = df.merged)
w.vector <- list(q.w, k.2.w, k.3.w, k.4.w)
total <- mapply(lm.morantest, listw = w.vector, MoreArgs = list(model = fit.ols)) # Apply the Moran test to each weights matrix with the same OLS model
estimates <- total %>%
  apply(2, function(x) c(x$estimate[1], "p.value" = x$p.value)) # Each column is one Moran test; keep the observed statistic and its p-value
plot(estimates[1, ], type = "b", xlab = "Weights matrix (1 = queen, 2-4 = k nearest neighbors)", ylab = "Observed Moran's I")
Queen adjacency has the highest Moran's I estimate, so it will be used in the analysis that follows.
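Before fitting the models, a LISA cluster map shows where high and low ratios cluster. The map below fills by a quad_sig column that is never built in the chunks above; here is a minimal sketch of that step, assuming the usual quadrant classification of scaled values against their spatial lag at p < 0.05 (the position and name of the p-value column in localmoran() output varies across spdep versions):
lisa <- localmoran(as.numeric(df.merged$Teacher_Pupil_Ratio_19), q.w) # Local Moran's I per county
df.merged <- df.merged %>%
  mutate(ratio_std = as.numeric(scale(as.numeric(Teacher_Pupil_Ratio_19))), # Standardized ratio
         ratio_lag = lag.listw(q.w, ratio_std), # Spatial lag of the standardized ratio
         quad_sig = case_when(lisa[, 5] > 0.05 ~ "Not significant",
                              ratio_std >= 0 & ratio_lag >= 0 ~ "High-High",
                              ratio_std < 0 & ratio_lag < 0 ~ "Low-Low",
                              ratio_std >= 0 & ratio_lag < 0 ~ "High-Low",
                              TRUE ~ "Low-High"))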
df.merged %>%
  ggplot() +
  geom_sf(aes(fill = quad_sig)) +
  ggtitle("Moran LISA Cluster Map -\nTeacher Pupil Ratio",
          subtitle = "Texas")
Fitting the Lag Model
fit.lag <- lagsarlm(as.numeric(Teacher_Pupil_Ratio_19) ~ DP03_0086E,
data = df.merged,
listw = q.w,
type = "lag")
summary(fit.lag)
##
## Call:lagsarlm(formula = as.numeric(Teacher_Pupil_Ratio_19) ~ DP03_0086E,
## data = df.merged, listw = q.w, type = "lag")
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.33621 -1.07543 0.16108 1.18267 8.09314
##
## Type: lag
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.3656e+00 8.9825e-01 4.8601 1.173e-06
## DP03_0086E 2.8210e-05 8.7931e-06 3.2081 0.001336
##
## Rho: 0.51997, LR test value: 50.098, p-value: 1.4627e-12
## Approximate (numerical Hessian) standard error: 0.065115
## z-value: 7.9853, p-value: 1.3323e-15
## Wald statistic: 63.766, p-value: 1.4433e-15
##
## Log likelihood: -529.6317 for lag model
## ML residual variance (sigma squared): 3.6359, (sigma: 1.9068)
## Number of observations: 253
## Number of parameters estimated: 4
## AIC: 1067.3, (AIC for lm: 1115.4)
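A caveat when reading the coefficient table above: in a lag model the coefficients are not marginal effects, because a change in one county's income feeds back through its neighbors via rho. spatialreg's impacts() decomposes the total effect into direct and indirect parts; a sketch using the trace approximation:
W <- as(q.w, "CsparseMatrix") # Weights as a sparse matrix
impacts(fit.lag, tr = trW(W, type = "mult")) # Direct, indirect, and total effects of DP03_0086E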
moran.test(fit.lag$residuals, q.w)
##
## Moran I test under randomisation
##
## data: fit.lag$residuals
## weights: q.w
##
## Moran I statistic standard deviate = -0.98313, p-value = 0.8372
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## -0.040922921 -0.003968254 0.001412920
Using Moran's I on the residuals, we can see that the lag model has accounted for the spatial autocorrelation: the statistic is essentially zero and not significant (p = 0.8372).
Fitting the Error Model
fit.err <- errorsarlm(as.numeric(Teacher_Pupil_Ratio_19) ~ DP03_0086E,
data = df.merged,
listw = q.w)
summary(fit.err)
##
## Call:errorsarlm(formula = as.numeric(Teacher_Pupil_Ratio_19) ~ DP03_0086E,
## data = df.merged, listw = q.w)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.63896 -1.10592 0.17696 1.18362 8.05609
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.0894e+01 7.2056e-01 15.1193 < 2.2e-16
## DP03_0086E 3.0683e-05 1.0502e-05 2.9215 0.003483
##
## Lambda: 0.5236, LR test value: 47.899, p-value: 4.4871e-12
## Approximate (numerical Hessian) standard error: 0.066769
## z-value: 7.8419, p-value: 4.4409e-15
## Wald statistic: 61.496, p-value: 4.4409e-15
##
## Log likelihood: -530.731 for error model
## ML residual variance (sigma squared): 3.6642, (sigma: 1.9142)
## Number of observations: 253
## Number of parameters estimated: 4
## AIC: 1069.5, (AIC for lm: 1115.4)
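For the error specification, spatialreg also offers a spatial Hausman test, which asks whether the OLS and error-model coefficients diverge by more than sampling error would allow; a significant result would cast doubt on both specifications:
Hausman.test(fit.err) # Spatial Hausman test comparing OLS and error-model coefficients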
moran.test(fit.err$residuals, q.w)
##
## Moran I test under randomisation
##
## data: fit.err$residuals
## weights: q.w
##
## Moran I statistic standard deviate = -0.93947, p-value = 0.8263
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## -0.039287504 -0.003968254 0.001413382
The error model likewise removes the spatial autocorrelation from the residuals.
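Beyond comparing AIC values, the standard way to choose between the lag and error specifications is the (robust) Lagrange multiplier tests computed on the OLS fit; a sketch (in recent spdep releases this function is being renamed lm.RStests):
lm.LMtests(fit.ols, listw = q.w, test = c("LMlag", "LMerr", "RLMlag", "RLMerr")) # Whichever robust statistic is significant points to the preferred model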
The spatial lag model has the lowest AIC of the three models, although the error model's AIC is very close. The coefficient estimates do not vary much across the three models, and all remain significant. Both the lag and error models remove the autocorrelation from the residuals. Because it has the lowest AIC, the lag model is the best choice here.
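To make that comparison concrete, all three AICs can be pulled in one call:
AIC(fit.ols, fit.lag, fit.err) # Lower is better; matches the AIC lines in the summaries above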