Description:

In this assignment you will:

A. Construct a regression model for spatial data B. Examine the model for non-stationarity using a spatial regimes specification C. Describe the results of your analysis

If you use your own data, follow these steps

Fit the OLS model for your outcome :

Be sure the document: Presents the results of your analysis, including figures and output tables to adequately describe the analysis. Contains a short (2 paragraph) description of the analysis results.

Reporting: Please construct either a HTML document, using Rstudio’s “knit” button and submit the Rpubs link on Blackboard

Spatial Regime

For this HW3 I am using the same data as in HW2 (Texas unified school districts) but first I tried classifying them by the 4 “Texas Triangle” metropolitan areas (Austin-RR, DFW, San Antonio-NB, and Houston-Woodlands) as my spatial regimes. However, this spatial regime didn’t turn out to be very interesting, so I decided to split them as having MHI’s above or below the state MHI.

Download ACS Data

#Use this to search for variables
#acs2019 <- load_variables(2019 , "acs5", cache = TRUE) 
#ec2019 <- load_variables(2019 , "census of governments", cache = TRUE) 

#demographic profile tables

#Use this to search for variables
#acs2019 <- load_variables(2019 , "acs5", cache = TRUE) #demographic profile tables

tx_sd_acs<-get_acs(geography = "school district (unified)",
                state="TX",
                year = 2019,
                variables=c("DP05_0001E", 
                            "DP03_0119PE",
                            "DP03_0062E",
                            "B25003_003E",
                            "B25003H_003E",
                            "B25003I_003E",
                            "B25003B_003E",
                            "B25003_002E",
                            "B25003H_002E",
                            "B25003I_002E",
                            "B25003B_002E",
                            "B25090_001E",
                            "B05003_003E",
                            "B05003_014E") ,
                geometry = T, output = "wide")
## Getting data from the 2015-2019 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Fetching data by table type ("B/C", "S", "DP") and combining the result.
#rename variables and filter missing cases
tx_sd_acs2<-tx_sd_acs%>%
  mutate(totpop=DP05_0001E, 
         ppov=DP03_0119PE, 
         MHI=DP03_0062E, 
         totRent = B25003_003E,
         whiteRent = B25003H_003E,
         hispRent = B25003I_003E,
         blackRent = B25003B_003E,
         totOwn = B25003_002E,
         whiteOwn = B25003H_002E,
         hispOwn = B25003I_002E,
         blackOwn = B25003B_002E,
         totRentOwn = totRent + totOwn,
         pRent = totRent/totRentOwn,
         pOwn = totOwn/totRentOwn,
         pwhiteOwn = whiteOwn/totRentOwn,
         phispOwn = hispOwn/totRentOwn,
         pblackOwn = blackOwn/totRentOwn,
         pwhiteRent = whiteRent/totRentOwn,
         phispRent = hispRent/totRentOwn,
         pblackRent = blackRent/totRentOwn,
         aggREtax = B25090_001E,
         totUnder18 = B05003_014E + B05003_003E,
         REtaxPCapSchAge = aggREtax / totUnder18)
tx_sd_acs2<-na.omit(tx_sd_acs2)


#Select districts with pop > 5,000
tx_sd_acs2 <- subset(tx_sd_acs2, totpop >= 5000)

#pull texas shapefile for context
state<-get_acs(geography = "state", variables=c("DP05_0001E"), state="TX", year=2019, geometry = T, output = "wide")
## Getting data from the 2015-2019 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Using the ACS Data Profile
library(tigris)
## To enable 
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
MSAs<-core_based_statistical_areas(cb=T,year=2019)
SANB<-MSAs[MSAs$GEOID=="41700",]
DFW<-MSAs[MSAs$GEOID=="19100",]
HOU<-MSAs[MSAs$GEOID=="26420",]
AUS<-MSAs[MSAs$GEOID=="12420",]

#Merge MSAs into new shapefile (for some reason couldn't query them all together)
TX_MSAs <- rbind(SANB, DFW, HOU, AUS)

#Spatial Join to show School Districts by MSA
# not completely happy with how the districts were assigned to the MSAs (i tried all the options from intersects to covered_by etc) but that can be dealt with later. 
tx_sd_acs_joined <- st_join(tx_sd_acs2, left = TRUE, join=st_intersects, largest=TRUE, sparse=TRUE, TX_MSAs["NAME"])
## although coordinates are longitude/latitude, st_intersection assumes that they are planar
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
#Examining MSA data
library(tmap)
tm_shape(state)+
      tm_polygons(alpha=1)+
tm_shape(tx_sd_acs_joined)+
  tm_polygons("NAME.y", title="School Districts by MSA",  border.alpha = 1)+
    tm_shape(TX_MSAs)+
      tm_polygons(alpha=0, border.col = "black")+
  tm_format("World", title="Texas School Districts with Pop > 5,000 by assigned MSA", legend.outside=T)+
  tm_scale_bar(position="LEFT", breaks=c(0,2.5,5))+
  tm_compass()

library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:huxtable':
## 
##     number_format
## The following object is masked from 'package:arm':
## 
##     rescale
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
tx_sd_acs_joined$MHI_code <- case_when(tx_sd_acs_joined$MHI >= 64034 ~ 'Higher than TX MHI', TRUE ~ 'Lower than TX MHI')

#Examining MHI data
tm_shape(state)+
      tm_polygons(alpha=1)+
tm_shape(tx_sd_acs_joined)+
  tm_polygons("MHI_code", title="District MHI Compared to TX MHI",  border.alpha = 1)+
    tm_shape(TX_MSAs)+
      tm_polygons(alpha=0, border.col = "black")+
  tm_format("World", title="Texas School Districts with Pop > 5,000 by assigned MSA", legend.outside=T)+
  tm_scale_bar(position="LEFT", breaks=c(0,2.5,5))+
  tm_compass()

Convert Variables to Z Scores

tx_sd_acs_joined$PerCapTaxZ<-as.numeric(scale(tx_sd_acs_joined$REtaxPCapSchAge, center=T, scale=T))
qplot(tx_sd_acs_joined$PerCapTaxZ,
      geom="histogram" )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Spatial Regime - MSAs

When splitting by the MSAs, the data is not that interesting, and is really just telling us that the San Antonio, Houston, and Dallas regions all have lower per capita tax rates than the Austin region. Based on this, I’m going to switch to a different Spatial Regime focused on MHI per tract.

fit.0<-lm(PerCapTaxZ~NAME.y+MHI_code+scale(pwhiteOwn)+scale(pwhiteRent)+scale(pblackOwn)+scale(pblackRent),data=tx_sd_acs_joined)
summary(fit.0)
## 
## Call:
## lm(formula = PerCapTaxZ ~ NAME.y + MHI_code + scale(pwhiteOwn) + 
##     scale(pwhiteRent) + scale(pblackOwn) + scale(pblackRent), 
##     data = tx_sd_acs_joined)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5093 -0.4959 -0.0995  0.3098  6.0923 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                 1.48211    0.17111   8.662 9.15e-16
## NAME.yDallas-Fort Worth-Arlington, TX      -0.68920    0.18651  -3.695 0.000276
## NAME.yHouston-The Woodlands-Sugar Land, TX -0.76853    0.19843  -3.873 0.000141
## NAME.ySan Antonio-New Braunfels, TX        -0.39610    0.24653  -1.607 0.109518
## MHI_codeLower than TX MHI                  -1.11799    0.12941  -8.639 1.06e-15
## scale(pwhiteOwn)                            0.53706    0.09738   5.515 9.52e-08
## scale(pwhiteRent)                          -0.04833    0.08315  -0.581 0.561608
## scale(pblackOwn)                           -0.12381    0.08206  -1.509 0.132776
## scale(pblackRent)                           0.23913    0.09176   2.606 0.009773
##                                               
## (Intercept)                                ***
## NAME.yDallas-Fort Worth-Arlington, TX      ***
## NAME.yHouston-The Woodlands-Sugar Land, TX ***
## NAME.ySan Antonio-New Braunfels, TX           
## MHI_codeLower than TX MHI                  ***
## scale(pwhiteOwn)                           ***
## scale(pwhiteRent)                             
## scale(pblackOwn)                              
## scale(pblackRent)                          ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9021 on 225 degrees of freedom
##   (267 observations deleted due to missingness)
## Multiple R-squared:  0.4566, Adjusted R-squared:  0.4373 
## F-statistic: 23.63 on 8 and 225 DF,  p-value: < 2.2e-16

Spatial Regime - MHI

Instead of doing the MSA’s I decided to split the data by the MHI for the school district and whether it was higher or lower than the State MHI of $64,034.

fit.1<-lm(PerCapTaxZ~MHI_code+NAME.y+scale(pwhiteOwn)+scale(pwhiteRent)+scale(pblackOwn)+scale(pblackRent),data=tx_sd_acs_joined)
summary(fit.1)
## 
## Call:
## lm(formula = PerCapTaxZ ~ MHI_code + NAME.y + scale(pwhiteOwn) + 
##     scale(pwhiteRent) + scale(pblackOwn) + scale(pblackRent), 
##     data = tx_sd_acs_joined)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5093 -0.4959 -0.0995  0.3098  6.0923 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                 1.48211    0.17111   8.662 9.15e-16
## MHI_codeLower than TX MHI                  -1.11799    0.12941  -8.639 1.06e-15
## NAME.yDallas-Fort Worth-Arlington, TX      -0.68920    0.18651  -3.695 0.000276
## NAME.yHouston-The Woodlands-Sugar Land, TX -0.76853    0.19843  -3.873 0.000141
## NAME.ySan Antonio-New Braunfels, TX        -0.39610    0.24653  -1.607 0.109518
## scale(pwhiteOwn)                            0.53706    0.09738   5.515 9.52e-08
## scale(pwhiteRent)                          -0.04833    0.08315  -0.581 0.561608
## scale(pblackOwn)                           -0.12381    0.08206  -1.509 0.132776
## scale(pblackRent)                           0.23913    0.09176   2.606 0.009773
##                                               
## (Intercept)                                ***
## MHI_codeLower than TX MHI                  ***
## NAME.yDallas-Fort Worth-Arlington, TX      ***
## NAME.yHouston-The Woodlands-Sugar Land, TX ***
## NAME.ySan Antonio-New Braunfels, TX           
## scale(pwhiteOwn)                           ***
## scale(pwhiteRent)                             
## scale(pblackOwn)                              
## scale(pblackRent)                          ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9021 on 225 degrees of freedom
##   (267 observations deleted due to missingness)
## Multiple R-squared:  0.4566, Adjusted R-squared:  0.4373 
## F-statistic: 23.63 on 8 and 225 DF,  p-value: < 2.2e-16

Brazil Test for Stationarity

Based on the results of this test, which are significant, we can conclude that the models are stationary.

fit.a<-lm(PerCapTaxZ~MHI_code+scale(ppov)+scale(totRent)+scale(whiteOwn)+scale(blackOwn)+scale(hispOwn),data=tx_sd_acs_joined)
summary(fit.a)
## 
## Call:
## lm(formula = PerCapTaxZ ~ MHI_code + scale(ppov) + scale(totRent) + 
##     scale(whiteOwn) + scale(blackOwn) + scale(hispOwn), data = tx_sd_acs_joined)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5935 -0.3627 -0.1076  0.1932  6.4611 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                0.46193    0.06633   6.964 1.06e-11 ***
## MHI_codeLower than TX MHI -0.69706    0.08729  -7.985 9.92e-15 ***
## scale(ppov)               -0.33705    0.04236  -7.958 1.21e-14 ***
## scale(totRent)            -0.04524    0.09250  -0.489    0.625    
## scale(whiteOwn)            0.28609    0.06539   4.375 1.48e-05 ***
## scale(blackOwn)           -0.07942    0.06053  -1.312    0.190    
## scale(hispOwn)            -0.04963    0.05792  -0.857    0.392    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7264 on 494 degrees of freedom
## Multiple R-squared:  0.4786, Adjusted R-squared:  0.4723 
## F-statistic: 75.58 on 6 and 494 DF,  p-value: < 2.2e-16
fit.b<-lm(PerCapTaxZ~MHI_code/scale(ppov)+scale(totRent)+scale(whiteOwn)+scale(blackOwn)+scale(hispOwn),data=tx_sd_acs_joined)
summary(fit.b)
## 
## Call:
## lm(formula = PerCapTaxZ ~ MHI_code/scale(ppov) + scale(totRent) + 
##     scale(whiteOwn) + scale(blackOwn) + scale(hispOwn), data = tx_sd_acs_joined)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7867 -0.3229 -0.0967  0.1885  5.6996 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            -0.41620    0.10857  -3.834 0.000143 ***
## MHI_codeLower than TX MHI               0.12874    0.11643   1.106 0.269364    
## scale(totRent)                          0.02928    0.08510   0.344 0.730955    
## scale(whiteOwn)                         0.23787    0.06012   3.956 8.73e-05 ***
## scale(blackOwn)                        -0.09195    0.05548  -1.657 0.098110 .  
## scale(hispOwn)                         -0.07319    0.05313  -1.377 0.168996    
## MHI_codeHigher than TX MHI:scale(ppov) -1.46342    0.12174 -12.021  < 2e-16 ***
## MHI_codeLower than TX MHI:scale(ppov)  -0.22772    0.04040  -5.637 2.91e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6657 on 493 degrees of freedom
## Multiple R-squared:  0.5631, Adjusted R-squared:  0.5569 
## F-statistic: 90.76 on 7 and 493 DF,  p-value: < 2.2e-16
anova(fit.a, fit.b, test="F")
Res.DfRSSDfSum of SqFPr(>F)
494261           
493218142.295.31.07e-20

State-specific models

Nothing is very specific and I’m not very impressed with these results. I think I should have used a different outcome variable… Will keep working on this.

#higher than TX MHI
fit.H<-lm(PerCapTaxZ~scale(ppov)+scale(totRent)+scale(whiteOwn)+scale(blackOwn)+scale(hispOwn),data=tx_sd_acs_joined, subset=MHI_code=="Higher than TX MHI")
summary(fit.H)
## 
## Call:
## lm(formula = PerCapTaxZ ~ scale(ppov) + scale(totRent) + scale(whiteOwn) + 
##     scale(blackOwn) + scale(hispOwn), data = tx_sd_acs_joined, 
##     subset = MHI_code == "Higher than TX MHI")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7758 -0.6027 -0.2176  0.2268  5.6832 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.405244   0.165221  -2.453   0.0152 *  
## scale(ppov)     -1.422326   0.193223  -7.361 8.52e-12 ***
## scale(totRent)   0.002863   0.243352   0.012   0.9906    
## scale(whiteOwn)  0.280968   0.153212   1.834   0.0685 .  
## scale(blackOwn) -0.073039   0.126261  -0.578   0.5637    
## scale(hispOwn)  -0.210388   0.171200  -1.229   0.2209    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.012 on 163 degrees of freedom
## Multiple R-squared:  0.3231, Adjusted R-squared:  0.3023 
## F-statistic: 15.56 on 5 and 163 DF,  p-value: 1.673e-12
#Lower than TX MHI
fit.L<-lm(PerCapTaxZ~scale(ppov)+scale(totRent)+scale(whiteOwn)+scale(blackOwn)+scale(hispOwn),data=tx_sd_acs_joined, subset=MHI_code=="Lower than TX MHI")
summary(fit.L)
## 
## Call:
## lm(formula = PerCapTaxZ ~ scale(ppov) + scale(totRent) + scale(whiteOwn) + 
##     scale(blackOwn) + scale(hispOwn), data = tx_sd_acs_joined, 
##     subset = MHI_code == "Lower than TX MHI")
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.72304 -0.24957 -0.06658  0.14411  2.26340 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.26262    0.02579 -10.184  < 2e-16 ***
## scale(ppov)     -0.22472    0.02461  -9.132  < 2e-16 ***
## scale(totRent)  -0.10523    0.10359  -1.016    0.310    
## scale(whiteOwn)  0.38890    0.07572   5.136 4.83e-07 ***
## scale(blackOwn) -0.08582    0.06472  -1.326    0.186    
## scale(hispOwn)  -0.01718    0.03968  -0.433    0.665    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3898 on 326 degrees of freedom
## Multiple R-squared:  0.3594, Adjusted R-squared:  0.3496 
## F-statistic: 36.58 on 5 and 326 DF,  p-value: < 2.2e-16