Use the data of your choice for this homework
Make a series of spatial weights for the data
Define your outcome variable and all predictors
Fit the OLS model for your outcome :
Test for autocorrelation in the residuals using each neighbor specification, which specification showed the largest degree of dependence. Use that specification for the rest of the assignment.
Produce a map of the studentized residuals from the model fit and a map of the local Moran statistic for the residuals.
Examine which alternative SAR model specification would best fit this data using the minimum AIC criteria and specification tests.
Fit that model (the one you identify as being “best”), and describe the results, compared to the OLS results.
Publish your results to Rpubs and email me the link
For this project I am attempting to examine the relationship between housing tenure, race of householder, and real estate tax base using ACS 2019 5 Year data at the School District (unified) level for the state of Texas. School districts with fewer than 5,000 population have been removed.
My outcome variable is the Aggregate Real Estate Taxes Paid Per School Aged Child under 18. It’s a confusing measure because it is not just the taxes paid to the school distruct, but the real estate taxes paid overall by the residents/property owners in that district as estimated by the ACS (I know, messy, consider this just a thought exercise for the purpose of completing this assignment). To improve this measure, I could download the actual real estate taxes paid by district through the Census of Governments (latest 2017) and then factor in the “robin hood” law Texas takes into effect and perhaps school district quality through other data sources. For the purpose of this assignment, I’m just using ACS data.
My predictor variables are (1) the percent of White homeowners, (2) the percent of White renters, (3) the percent of Black homeowners, and (3) the Percent of Black Renters (with total renters and total homeowners as the denominator, does not take into account vacant or seasonal units).
#Download ACS data for the year 2015 for Los Angeles County, California census tracts in R using the tidycensus library
#In this extract, request both the proportion of the population that is Non-Hispanic Black and the proportion of the population that is Hispanic
#Use this to search for variables
#acs2019 <- load_variables(2019 , "acs5", cache = TRUE)
#ec2019 <- load_variables(2019 , "census of governments", cache = TRUE)
#demographic profile tables
#Use this to search for variables
#acs2019 <- load_variables(2019 , "acs5", cache = TRUE) #demographic profile tables
tx_sd_acs<-get_acs(geography = "school district (unified)",
state="TX",
year = 2019,
variables=c("DP05_0001E",
"DP03_0119PE",
"DP03_0062E",
"B25003_003E",
"B25003H_003E",
"B25003I_003E",
"B25003B_003E",
"B25003_002E",
"B25003H_002E",
"B25003I_002E",
"B25003B_002E",
"B25090_001E",
"B05003_003E",
"B05003_014E") ,
geometry = T, output = "wide")
## Getting data from the 2015-2019 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Fetching data by table type ("B/C", "S", "DP") and combining the result.
#rename variables and filter missing cases
tx_sd_acs2<-tx_sd_acs%>%
mutate(totpop=DP05_0001E,
ppov=DP03_0119PE,
MHI=DP03_0062E,
totRent = B25003_003E,
whiteRent = B25003H_003E,
hispRent = B25003I_003E,
blackRent = B25003B_003E,
totOwn = B25003_002E,
whiteOwn = B25003H_002E,
hispOwn = B25003I_002E,
blackOwn = B25003B_002E,
totRentOwn = totRent + totOwn,
pRent = totRent/totRentOwn,
pOwn = totOwn/totRentOwn,
pwhiteOwn = whiteOwn/totRentOwn,
phispOwn = hispOwn/totRentOwn,
pblackOwn = blackOwn/totRentOwn,
pwhiteRent = whiteRent/totRentOwn,
phispRent = hispRent/totRentOwn,
pblackRent = blackRent/totRentOwn,
aggREtax = B25090_001E,
totUnder18 = B05003_014E + B05003_003E,
REtaxPCapSchAge = aggREtax / totUnder18)
tx_sd_acs2<-na.omit(tx_sd_acs2)
#Select districts with pop > 5,000
tx_sd_acs2 <- subset(tx_sd_acs2, totpop >= 5000)
#pull texas shapefile for context
state<-get_acs(geography = "state", variables=c("DP05_0001E"), state="TX", year=2019, geometry = T, output = "wide")
## Getting data from the 2015-2019 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Using the ACS Data Profile
library(tigris)
## To enable
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
MSAs<-core_based_statistical_areas(cb=T,year=2019)
SANB<-MSAs[MSAs$GEOID=="41700",]
DFW<-MSAs[MSAs$GEOID=="19100",]
HOU<-MSAs[MSAs$GEOID=="26420",]
AUS<-MSAs[MSAs$GEOID=="12420",]
#MHI by School District
library(tmap)
tm_shape(tx_sd_acs2)+
tm_polygons("MHI",
title="MHI",
palette="YlGnBu",
border.alpha = 0,
style="fixed",
breaks= c(0,25000,50000,75000,100000,250000),
labels=c("$0 to 25K","$25 to 50K","$50 to 75K","$75 to 100K","$100K +"),
legend.hist=T )+
tm_format("World", title="Texas School Districts - MHI", legend.outside=T)+
tm_scale_bar(position="LEFT", breaks=c(0,2.5,5))+
tm_compass()
#Aggregate Real Estate Tax
library(tmap)
tm_shape(tx_sd_acs2)+
tm_polygons("aggREtax",
title="Aggregate Real Estate Tax (paid to all jurisdictions, not just School District)",
palette="YlGnBu",
border.alpha = 0,
style="quantile",
legend.hist=T )+
tm_format("World", title="Agg. Real Estate Tax Paid", legend.outside=T)+
tm_scale_bar(position="LEFT", breaks=c(0,2.5,5))+
tm_compass()
#% Black Renters by SD
tm_shape(tx_sd_acs2)+
tm_polygons("pblackRent",
title="% Black Renters (includes Hispanic)",
palette="YlGnBu",
border.alpha = 0,
style="pretty",
legend.hist=T )+
tm_format("World", title="Texas School Districts - Percent Black Renters", legend.outside=T)+
tm_scale_bar()+
tm_compass()
#% Black Owners by SD
tm_shape(tx_sd_acs2)+
tm_polygons("pblackOwn",
title="% Black Homeowners (includes Hispanic)",
palette="YlGnBu",
border.alpha = 0,
style="pretty",
legend.hist=T )+
tm_format("World", title="Texas School Districts - Percent Black Homeowners", legend.outside=T)+
tm_scale_bar()+
tm_compass()
#% Hispanic Renters by SD
tm_shape(tx_sd_acs2)+
tm_polygons("phispRent",
title="% Hispanic Renters (any race)",
palette="YlGnBu",
border.alpha = 0,
style="pretty",
legend.hist=T )+
tm_format("World", title="Texas School Districts - Percent Hispanic Renters", legend.outside=T)+
tm_scale_bar()+
tm_compass()
#% Hispanic Owners by SD
tm_shape(tx_sd_acs2)+
tm_polygons("phispOwn",
title="% Hispanic Homeowners (any race)",
palette="YlGnBu",
border.alpha = 0,
style="pretty",
legend.hist=T )+
tm_format("World", title="Texas School Districts - Percent Hispanic Homeowners", legend.outside=T)+
tm_scale_bar()+
tm_compass()
#Total Population by School District
tm_shape(state)+
tm_polygons(alpha=1)+
tm_shape(tx_sd_acs2)+
tm_polygons("totpop",
title="Total Pop (Quantile)",
palette="YlGnBu",
border.alpha = 0,
style="quantile",
legend.hist=T )+
tm_shape(SANB)+
tm_polygons(alpha=0, border.col = "black")+
tm_shape(DFW)+
tm_polygons(alpha=0, border.col = "black")+
tm_shape(HOU)+
tm_polygons(alpha=0, border.col = "black")+
tm_shape(AUS)+
tm_polygons(alpha=0, border.col = "black")+
tm_format("World", title="Texas School Districts with Pop > 5,000", legend.outside=T)+
tm_scale_bar(position="LEFT", breaks=c(0,2.5,5))+
tm_compass()
#% White Homeowners by SD
tm_shape(state)+
tm_polygons(alpha=1)+
tm_shape(tx_sd_acs2)+
tm_polygons("pwhiteOwn",
title="% White NH Homeowners",
palette="YlGnBu",
border.alpha = 0,
style="quantile",
legend.hist=T )+
tm_shape(SANB)+
tm_polygons(alpha=0, border.col = "black")+
tm_shape(DFW)+
tm_polygons(alpha=0, border.col = "black")+
tm_shape(HOU)+
tm_polygons(alpha=0, border.col = "black")+
tm_shape(AUS)+
tm_polygons(alpha=0, border.col = "black")+
tm_format("World", title="Texas School Districts - Percent White, NH Homeowners", legend.outside=T)+
tm_scale_bar()+
tm_compass()
#% White Renters by SD
tm_shape(state)+
tm_polygons(alpha=1)+
tm_shape(tx_sd_acs2)+
tm_polygons("pwhiteRent",
title="% White NH renters",
palette="YlGnBu",
border.alpha = 0,
style="quantile",
legend.hist=T )+
tm_shape(SANB)+
tm_polygons(alpha=0, border.col = "black")+
tm_shape(DFW)+
tm_polygons(alpha=0, border.col = "black")+
tm_shape(HOU)+
tm_polygons(alpha=0, border.col = "black")+
tm_shape(AUS)+
tm_polygons(alpha=0, border.col = "black")+
tm_format("World", title="Texas School Districts - Percent White, NH Renters", legend.outside=T)+
tm_scale_bar()+
tm_compass()
#% Black Renters by SD
tm_shape(state)+
tm_polygons(alpha=1)+
tm_shape(tx_sd_acs2)+
tm_polygons("pblackRent",
title="% Black Renters (includes Hispanic)",
palette="YlGnBu",
border.alpha = 0,
style="quantile",
legend.hist=T )+
tm_shape(SANB)+
tm_polygons(alpha=0, border.col = "black")+
tm_shape(DFW)+
tm_polygons(alpha=0, border.col = "black")+
tm_shape(HOU)+
tm_polygons(alpha=0, border.col = "black")+
tm_shape(AUS)+
tm_polygons(alpha=0, border.col = "black")+
tm_format("World", title="Texas School Districts - Percent Black Renters", legend.outside=T)+
tm_scale_bar()+
tm_compass()
#% Black Owners by SD
tm_shape(state)+
tm_polygons(alpha=1)+
tm_shape(tx_sd_acs2)+
tm_polygons("pblackOwn",
title="% Black Homeowners (includes Hispanic)",
palette="YlGnBu",
border.alpha = 0,
style="quantile",
legend.hist=T )+
tm_shape(SANB)+
tm_polygons(alpha=0, border.col = "black")+
tm_shape(DFW)+
tm_polygons(alpha=0, border.col = "black")+
tm_shape(HOU)+
tm_polygons(alpha=0, border.col = "black")+
tm_shape(AUS)+
tm_polygons(alpha=0, border.col = "black")+
tm_format("World", title="Texas School Districts - Percent Black Homeowners", legend.outside=T)+
tm_scale_bar()+
tm_compass()
#Aggregate Real Estate Tax Per Child Under 18 (includes homeschool)
tm_shape(state)+
tm_polygons(alpha=1)+
tm_shape(tx_sd_acs2)+
tm_polygons("REtaxPCapSchAge",
title="Averaged Real Estate Tax Per Child under 18y",
palette="YlGnBu",
border.alpha = 0,
style="quantile",
legend.hist=T )+
tm_shape(SANB)+
tm_polygons(alpha=0, border.col = "black")+
tm_shape(DFW)+
tm_polygons(alpha=0, border.col = "black")+
tm_shape(HOU)+
tm_polygons(alpha=0, border.col = "black")+
tm_shape(AUS)+
tm_polygons(alpha=0, border.col = "black")+
tm_format("World", title="Per cap. Real Estate Tax Paid", legend.outside=T)+
tm_scale_bar(position="LEFT", breaks=c(0,2.5,5))+
tm_compass()
#Scatterplot of PRent by POwn (of course linear because it's defined that way)
library(ggplot2)
qplot(y = tx_sd_acs2$pRent,
x= tx_sd_acs2$pOwn )
#Scatterplot of PWhite Homeowners by PRenters (all races)
qplot(y = tx_sd_acs2$pwhiteOwn,
x= tx_sd_acs2$pRent )
#Scatterplot of PWhite Homeowners by PBlack Renters
qplot(y = tx_sd_acs2$pwhiteOwn,
x= tx_sd_acs2$pblackRent )
#Scatterplot of PWhite Homeowners by PWhite Renters
qplot(y = tx_sd_acs2$pwhiteOwn,
x= tx_sd_acs2$pwhiteRent )
#Scatterplot of PWhite Homeowners by Real Estate Tax Paid Per School Age Child (under 18)
qplot(y = tx_sd_acs2$pwhiteOwn,
x= tx_sd_acs2$REtaxPCapSchAge )
us.nb6<-knearneigh(st_centroid(tx_sd_acs2), k=6)
## Warning in st_centroid.sf(tx_sd_acs2): st_centroid assumes attributes are
## constant over geometries of x
## Warning in st_centroid.sfc(st_geometry(x), of_largest_polygon =
## of_largest_polygon): st_centroid does not give correct centroids for longitude/
## latitude data
us.nb6<-knn2nb(us.nb6)
us.wt6<-nb2listw(us.nb6, style="W")
us.nb5<-knearneigh(st_centroid(tx_sd_acs2), k=5)
## Warning in st_centroid.sf(tx_sd_acs2): st_centroid assumes attributes are
## constant over geometries of x
## Warning in st_centroid.sf(tx_sd_acs2): st_centroid does not give correct
## centroids for longitude/latitude data
us.nb5<-knn2nb(us.nb5)
us.wt5<-nb2listw(us.nb5, style="W")
us.nb4<-knearneigh(st_centroid(tx_sd_acs2), k=4)
## Warning in st_centroid.sf(tx_sd_acs2): st_centroid assumes attributes are
## constant over geometries of x
## Warning in st_centroid.sf(tx_sd_acs2): st_centroid does not give correct
## centroids for longitude/latitude data
us.nb4<-knn2nb(us.nb4)
us.wt4<-nb2listw(us.nb4, style="W")
us.nb3<-knearneigh(st_centroid(tx_sd_acs2), k=3)
## Warning in st_centroid.sf(tx_sd_acs2): st_centroid assumes attributes are
## constant over geometries of x
## Warning in st_centroid.sf(tx_sd_acs2): st_centroid does not give correct
## centroids for longitude/latitude data
us.nb3<-knn2nb(us.nb3)
us.wt3<-nb2listw(us.nb3,style="W")
us.nb2<-knearneigh(st_centroid(tx_sd_acs2) , k=2)
## Warning in st_centroid.sf(tx_sd_acs2): st_centroid assumes attributes are
## constant over geometries of x
## Warning in st_centroid.sf(tx_sd_acs2): st_centroid does not give correct
## centroids for longitude/latitude data
us.nb2<-knn2nb(us.nb2)
us.wt2<-nb2listw(us.nb2,style="W")
us.nbr<-poly2nb(tx_sd_acs2, queen=F)
us.wtr<-nb2listw(us.nbr, zero.policy=T)
us.nbq<-poly2nb(tx_sd_acs2, queen=T)
us.wtq<-nb2listw(us.nbr, style="W", zero.policy=T)
tx_sd_acs2$PerCapTaxZ<-as.numeric(scale(tx_sd_acs2$REtaxPCapSchAge, center=T, scale=T))
tx_sd_acs2$pwhiteOwnZ<-as.numeric(scale(tx_sd_acs2$pwhiteOwn, center=T, scale=T))
tx_sd_acs2$pblackOwnZ<-as.numeric(scale(tx_sd_acs2$pblackOwn, center=T, scale=T))
tx_sd_acs2$pwhiteRentZ<-as.numeric(scale(tx_sd_acs2$pwhiteRent, center=T, scale=T))
tx_sd_acs2$pblackRentZ<-as.numeric(scale(tx_sd_acs2$pblackRent, center=T, scale=T))
qplot(tx_sd_acs2$PerCapTaxZ,
geom="histogram" )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Based on the below, the K=2 nearest neighbor weighting has the highest residual moran’s I value.
fit.ols<-lm(PerCapTaxZ~pwhiteOwnZ+pwhiteRentZ+pblackOwnZ+pblackRentZ,
data=tx_sd_acs2)
summary(fit.ols)
##
## Call:
## lm(formula = PerCapTaxZ ~ pwhiteOwnZ + pwhiteRentZ + pblackOwnZ +
## pblackRentZ, data = tx_sd_acs2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4160 -0.5254 -0.2381 0.2143 6.8791
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.715e-17 3.992e-02 0.000 1.000
## pwhiteOwnZ 4.797e-01 4.806e-02 9.981 <2e-16 ***
## pwhiteRentZ -7.544e-02 4.651e-02 -1.622 0.105
## pblackOwnZ -5.865e-02 5.988e-02 -0.980 0.328
## pblackRentZ 3.123e-02 6.450e-02 0.484 0.629
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8936 on 496 degrees of freedom
## Multiple R-squared: 0.2079, Adjusted R-squared: 0.2015
## F-statistic: 32.55 on 4 and 496 DF, p-value: < 2.2e-16
resi<-c(lm.morantest(fit.ols, listw=us.wt2)$estimate[1],
lm.morantest(fit.ols, listw=us.wt3)$estimate[1],
lm.morantest(fit.ols, listw=us.wt4)$estimate[1],
lm.morantest(fit.ols, listw=us.wt5)$estimate[1],
lm.morantest(fit.ols, listw=us.wt6)$estimate[1],
lm.morantest(fit.ols, listw=us.wtq,zero.policy=T)$estimate[1],
lm.morantest(fit.ols, listw=us.wtr,zero.policy=T)$estimate[1])
plot(resi, type="l")
tx_sd_acs2$olsresid<-rstudent(fit.ols)
tx_sd_acs2<-st_as_sf(tx_sd_acs2)
tx_sd_acs2%>%
mutate(rquant=cut(olsresid,
breaks = quantile(tx_sd_acs2$olsresid,
p=seq(0,1,length.out = 8)),
include.lowest = T))%>%
ggplot(aes(color=rquant, fill=rquant))+
geom_sf()+
scale_fill_viridis_d()+
scale_color_viridis_d()+
geom_sf(data=state, fill=NA, color="black")+
ggtitle("OLS Model Residuals")
#Moran's I on residuals from model
lm.morantest(fit.ols, listw = us.wt2)
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = PerCapTaxZ ~ pwhiteOwnZ + pwhiteRentZ + pblackOwnZ
## + pblackRentZ, data = tx_sd_acs2)
## weights: us.wt2
##
## Moran I statistic standard deviate = 15.084, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.599164861 -0.005037236 0.001604385
locali<-localmoran(tx_sd_acs2$PerCapTaxZ, listw = us.wt2, alternative = "two.sided" )
tx_sd_acs2$locali<-locali[,1]
tx_sd_acs2$localp<-locali[,5]
tx_sd_acs2$sinc<-scale(tx_sd_acs2$PerCapTaxZ)
tx_sd_acs2$lag_inc<-lag.listw(var=tx_sd_acs2$sinc, x = us.wt2)
tx_sd_acs2$quad_sig <- NA
tx_sd_acs2$quad_sig[(tx_sd_acs2$sinc >= 0 & tx_sd_acs2$lag_inc >= 0) & (tx_sd_acs2$localp <= 0.05)] <- "H-H" #high high
tx_sd_acs2$quad_sig[(tx_sd_acs2$sinc <= 0 & tx_sd_acs2$lag_inc <= 0) & (tx_sd_acs2$localp <= 0.05)] <- "L-L" #low low
tx_sd_acs2$quad_sig[(tx_sd_acs2$sinc >= 0 & tx_sd_acs2$lag_inc <= 0) & (tx_sd_acs2$localp <= 0.05)] <- "H-L" #high low
tx_sd_acs2$quad_sig[(tx_sd_acs2$sinc <= 0 & tx_sd_acs2$lag_inc >= 0) & (tx_sd_acs2$localp <= 0.05)] <- "L-H" #low high
#WE ASSIGN A # Set the breaks for the thematic map classes
breaks <- seq(1, 5, 1)
# Set the corresponding labels for the thematic map classes
labels <- c("High-High", "Low-Low", "High-Low", "Low-High", "Not Clustered")
# see ?findInterval - This is necessary for making a map
np <- findInterval(tx_sd_acs2$quad_sig, breaks)
## Warning in findInterval(tx_sd_acs2$quad_sig, breaks): NAs introduced by coercion
colors <- c("red", "blue", "lightpink", "skyblue2", "white")
tx_sd_acs2%>%
ggplot()+
geom_sf(aes(fill = quad_sig))+
ggtitle("Moran LISA Cluster Map -\nAveraged Real Estate Taxes Per Child Z Score",
sub="For the population under 18")