In this assignment you will:
A. Construct a regression model for spatial data B. Examine the model for non-stationarity using a spatial regimes specification C. Describe the results of your analysis
If you use your own data, follow these steps
A. Use R for all analysis Define your outcome variable and all predictors.
B. Generate spatial regimes in your own data, how do you do this?
C. Estimate a linear regression model your outcome using the predictors you describe from part A
Fit the OLS model for your outcome :
D. Test for non-stationarity in the model by interacting the base model from step C with the indicator variable for the spatial regime using the process described by Brazil from the lecture.
E. From part D, does it appear that the model is stationary with respect to the regimes?
F. Consider state-specific models and describe the results for each regime separately.
Be sure the document: Presents the results of your analysis, including figures and output tables to adequately describe the analysis. Contains a short (2 paragraph) description of the analysis results.
Reporting: Please construct either a HTML document, using Rstudio’s “knit” button and submit the Rpubs link on Blackboard
For this HW3 I am using the same data as in HW2 (Texas unified school districts) but first I tried classifying them by the 4 “Texas Triangle” metropolitan areas (Austin-RR, DFW, San Antonio-NB, and Houston-Woodlands) as my spatial regimes. However, this spatial regime didn’t turn out to be very interesting, so I decided to split them as having MHI’s above or below the state MHI.
#Use this to search for variables
#acs2019 <- load_variables(2019 , "acs5", cache = TRUE)
#ec2019 <- load_variables(2019 , "census of governments", cache = TRUE)
#demographic profile tables
#Use this to search for variables
#acs2019 <- load_variables(2019 , "acs5", cache = TRUE) #demographic profile tables
tx_sd_acs<-get_acs(geography = "school district (unified)",
state="TX",
year = 2019,
variables=c("DP05_0001E",
"DP03_0119PE",
"DP03_0062E",
"B25003_003E",
"B25003H_003E",
"B25003I_003E",
"B25003B_003E",
"B25003_002E",
"B25003H_002E",
"B25003I_002E",
"B25003B_002E",
"B25090_001E",
"B05003_003E",
"B05003_014E") ,
geometry = T, output = "wide")
## Getting data from the 2015-2019 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Fetching data by table type ("B/C", "S", "DP") and combining the result.
#rename variables and filter missing cases
tx_sd_acs2<-tx_sd_acs%>%
mutate(totpop=DP05_0001E,
ppov=DP03_0119PE,
MHI=DP03_0062E,
totRent = B25003_003E,
whiteRent = B25003H_003E,
hispRent = B25003I_003E,
blackRent = B25003B_003E,
totOwn = B25003_002E,
whiteOwn = B25003H_002E,
hispOwn = B25003I_002E,
blackOwn = B25003B_002E,
totRentOwn = totRent + totOwn,
pRent = totRent/totRentOwn,
pOwn = totOwn/totRentOwn,
pwhiteOwn = whiteOwn/totRentOwn,
phispOwn = hispOwn/totRentOwn,
pblackOwn = blackOwn/totRentOwn,
pwhiteRent = whiteRent/totRentOwn,
phispRent = hispRent/totRentOwn,
pblackRent = blackRent/totRentOwn,
aggREtax = B25090_001E,
totUnder18 = B05003_014E + B05003_003E,
REtaxPCapSchAge = aggREtax / totUnder18)
tx_sd_acs2<-na.omit(tx_sd_acs2)
#Select districts with pop > 5,000
tx_sd_acs2 <- subset(tx_sd_acs2, totpop >= 5000)
#pull texas shapefile for context
state<-get_acs(geography = "state", variables=c("DP05_0001E"), state="TX", year=2019, geometry = T, output = "wide")
## Getting data from the 2015-2019 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Using the ACS Data Profile
library(tigris)
## To enable
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
MSAs<-core_based_statistical_areas(cb=T,year=2019)
SANB<-MSAs[MSAs$GEOID=="41700",]
DFW<-MSAs[MSAs$GEOID=="19100",]
HOU<-MSAs[MSAs$GEOID=="26420",]
AUS<-MSAs[MSAs$GEOID=="12420",]
#Merge MSAs into new shapefile (for some reason couldn't query them all together)
TX_MSAs <- rbind(SANB, DFW, HOU, AUS)
#Spatial Join to show School Districts by MSA
# not completely happy with how the districts were assigned to the MSAs (i tried all the options from intersects to covered_by etc) but that can be dealt with later.
tx_sd_acs_joined <- st_join(tx_sd_acs2, left = TRUE, join=st_intersects, largest=TRUE, sparse=TRUE, TX_MSAs["NAME"])
## although coordinates are longitude/latitude, st_intersection assumes that they are planar
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
#Examining MSA data
library(tmap)
tm_shape(state)+
tm_polygons(alpha=1)+
tm_shape(tx_sd_acs_joined)+
tm_polygons("NAME.y", title="School Districts by MSA", border.alpha = 1)+
tm_shape(TX_MSAs)+
tm_polygons(alpha=0, border.col = "black")+
tm_format("World", title="Texas School Districts with Pop > 5,000 by assigned MSA", legend.outside=T)+
tm_scale_bar(position="LEFT", breaks=c(0,2.5,5))+
tm_compass()
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:huxtable':
##
## number_format
## The following object is masked from 'package:arm':
##
## rescale
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
tx_sd_acs_joined$MHI_code <- case_when(tx_sd_acs_joined$MHI >= 64034 ~ 'Higher than TX MHI', TRUE ~ 'Lower than TX MHI')
#Examining MHI data
tm_shape(state)+
tm_polygons(alpha=1)+
tm_shape(tx_sd_acs_joined)+
tm_polygons("MHI_code", title="District MHI Compared to TX MHI", border.alpha = 1)+
tm_shape(TX_MSAs)+
tm_polygons(alpha=0, border.col = "black")+
tm_format("World", title="Texas School Districts with Pop > 5,000 by assigned MSA", legend.outside=T)+
tm_scale_bar(position="LEFT", breaks=c(0,2.5,5))+
tm_compass()
tx_sd_acs_joined$PerCapTaxZ<-as.numeric(scale(tx_sd_acs_joined$REtaxPCapSchAge, center=T, scale=T))
qplot(tx_sd_acs_joined$PerCapTaxZ,
geom="histogram" )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
When splitting by the MSAs, the data is not that interesting, and is really just telling us that the San Antonio, Houston, and Dallas regions all have lower per capita tax rates than the Austin region. Based on this, I’m going to switch to a different Spatial Regime focused on MHI per tract.
fit.0<-lm(PerCapTaxZ~NAME.y+MHI_code+scale(pwhiteOwn)+scale(pwhiteRent)+scale(pblackOwn)+scale(pblackRent),data=tx_sd_acs_joined)
summary(fit.0)
##
## Call:
## lm(formula = PerCapTaxZ ~ NAME.y + MHI_code + scale(pwhiteOwn) +
## scale(pwhiteRent) + scale(pblackOwn) + scale(pblackRent),
## data = tx_sd_acs_joined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5093 -0.4959 -0.0995 0.3098 6.0923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.48211 0.17111 8.662 9.15e-16
## NAME.yDallas-Fort Worth-Arlington, TX -0.68920 0.18651 -3.695 0.000276
## NAME.yHouston-The Woodlands-Sugar Land, TX -0.76853 0.19843 -3.873 0.000141
## NAME.ySan Antonio-New Braunfels, TX -0.39610 0.24653 -1.607 0.109518
## MHI_codeLower than TX MHI -1.11799 0.12941 -8.639 1.06e-15
## scale(pwhiteOwn) 0.53706 0.09738 5.515 9.52e-08
## scale(pwhiteRent) -0.04833 0.08315 -0.581 0.561608
## scale(pblackOwn) -0.12381 0.08206 -1.509 0.132776
## scale(pblackRent) 0.23913 0.09176 2.606 0.009773
##
## (Intercept) ***
## NAME.yDallas-Fort Worth-Arlington, TX ***
## NAME.yHouston-The Woodlands-Sugar Land, TX ***
## NAME.ySan Antonio-New Braunfels, TX
## MHI_codeLower than TX MHI ***
## scale(pwhiteOwn) ***
## scale(pwhiteRent)
## scale(pblackOwn)
## scale(pblackRent) **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9021 on 225 degrees of freedom
## (267 observations deleted due to missingness)
## Multiple R-squared: 0.4566, Adjusted R-squared: 0.4373
## F-statistic: 23.63 on 8 and 225 DF, p-value: < 2.2e-16
Instead of doing the MSA’s I decided to split the data by the MHI for the school district and whether it was higher or lower than the State MHI of $64,034.
fit.1<-lm(PerCapTaxZ~MHI_code+NAME.y+scale(pwhiteOwn)+scale(pwhiteRent)+scale(pblackOwn)+scale(pblackRent),data=tx_sd_acs_joined)
summary(fit.1)
##
## Call:
## lm(formula = PerCapTaxZ ~ MHI_code + NAME.y + scale(pwhiteOwn) +
## scale(pwhiteRent) + scale(pblackOwn) + scale(pblackRent),
## data = tx_sd_acs_joined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5093 -0.4959 -0.0995 0.3098 6.0923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.48211 0.17111 8.662 9.15e-16
## MHI_codeLower than TX MHI -1.11799 0.12941 -8.639 1.06e-15
## NAME.yDallas-Fort Worth-Arlington, TX -0.68920 0.18651 -3.695 0.000276
## NAME.yHouston-The Woodlands-Sugar Land, TX -0.76853 0.19843 -3.873 0.000141
## NAME.ySan Antonio-New Braunfels, TX -0.39610 0.24653 -1.607 0.109518
## scale(pwhiteOwn) 0.53706 0.09738 5.515 9.52e-08
## scale(pwhiteRent) -0.04833 0.08315 -0.581 0.561608
## scale(pblackOwn) -0.12381 0.08206 -1.509 0.132776
## scale(pblackRent) 0.23913 0.09176 2.606 0.009773
##
## (Intercept) ***
## MHI_codeLower than TX MHI ***
## NAME.yDallas-Fort Worth-Arlington, TX ***
## NAME.yHouston-The Woodlands-Sugar Land, TX ***
## NAME.ySan Antonio-New Braunfels, TX
## scale(pwhiteOwn) ***
## scale(pwhiteRent)
## scale(pblackOwn)
## scale(pblackRent) **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9021 on 225 degrees of freedom
## (267 observations deleted due to missingness)
## Multiple R-squared: 0.4566, Adjusted R-squared: 0.4373
## F-statistic: 23.63 on 8 and 225 DF, p-value: < 2.2e-16
Based on the results of this test, which are significant, we can conclude that the models are stationary.
fit.a<-lm(PerCapTaxZ~MHI_code+scale(ppov)+scale(totRent)+scale(whiteOwn)+scale(blackOwn)+scale(hispOwn),data=tx_sd_acs_joined)
summary(fit.a)
##
## Call:
## lm(formula = PerCapTaxZ ~ MHI_code + scale(ppov) + scale(totRent) +
## scale(whiteOwn) + scale(blackOwn) + scale(hispOwn), data = tx_sd_acs_joined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5935 -0.3627 -0.1076 0.1932 6.4611
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.46193 0.06633 6.964 1.06e-11 ***
## MHI_codeLower than TX MHI -0.69706 0.08729 -7.985 9.92e-15 ***
## scale(ppov) -0.33705 0.04236 -7.958 1.21e-14 ***
## scale(totRent) -0.04524 0.09250 -0.489 0.625
## scale(whiteOwn) 0.28609 0.06539 4.375 1.48e-05 ***
## scale(blackOwn) -0.07942 0.06053 -1.312 0.190
## scale(hispOwn) -0.04963 0.05792 -0.857 0.392
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7264 on 494 degrees of freedom
## Multiple R-squared: 0.4786, Adjusted R-squared: 0.4723
## F-statistic: 75.58 on 6 and 494 DF, p-value: < 2.2e-16
fit.b<-lm(PerCapTaxZ~MHI_code/scale(ppov)+scale(totRent)+scale(whiteOwn)+scale(blackOwn)+scale(hispOwn),data=tx_sd_acs_joined)
summary(fit.b)
##
## Call:
## lm(formula = PerCapTaxZ ~ MHI_code/scale(ppov) + scale(totRent) +
## scale(whiteOwn) + scale(blackOwn) + scale(hispOwn), data = tx_sd_acs_joined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7867 -0.3229 -0.0967 0.1885 5.6996
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.41620 0.10857 -3.834 0.000143 ***
## MHI_codeLower than TX MHI 0.12874 0.11643 1.106 0.269364
## scale(totRent) 0.02928 0.08510 0.344 0.730955
## scale(whiteOwn) 0.23787 0.06012 3.956 8.73e-05 ***
## scale(blackOwn) -0.09195 0.05548 -1.657 0.098110 .
## scale(hispOwn) -0.07319 0.05313 -1.377 0.168996
## MHI_codeHigher than TX MHI:scale(ppov) -1.46342 0.12174 -12.021 < 2e-16 ***
## MHI_codeLower than TX MHI:scale(ppov) -0.22772 0.04040 -5.637 2.91e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6657 on 493 degrees of freedom
## Multiple R-squared: 0.5631, Adjusted R-squared: 0.5569
## F-statistic: 90.76 on 7 and 493 DF, p-value: < 2.2e-16
anova(fit.a, fit.b, test="F")
| Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
|---|---|---|---|---|---|
| 494 | 261 | ||||
| 493 | 218 | 1 | 42.2 | 95.3 | 1.07e-20 |
Nothing is very specific and I’m not very impressed with these results. I think I should have used a different outcome variable… Will keep working on this.
#higher than TX MHI
fit.H<-lm(PerCapTaxZ~scale(ppov)+scale(totRent)+scale(whiteOwn)+scale(blackOwn)+scale(hispOwn),data=tx_sd_acs_joined, subset=MHI_code=="Higher than TX MHI")
summary(fit.H)
##
## Call:
## lm(formula = PerCapTaxZ ~ scale(ppov) + scale(totRent) + scale(whiteOwn) +
## scale(blackOwn) + scale(hispOwn), data = tx_sd_acs_joined,
## subset = MHI_code == "Higher than TX MHI")
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7758 -0.6027 -0.2176 0.2268 5.6832
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.405244 0.165221 -2.453 0.0152 *
## scale(ppov) -1.422326 0.193223 -7.361 8.52e-12 ***
## scale(totRent) 0.002863 0.243352 0.012 0.9906
## scale(whiteOwn) 0.280968 0.153212 1.834 0.0685 .
## scale(blackOwn) -0.073039 0.126261 -0.578 0.5637
## scale(hispOwn) -0.210388 0.171200 -1.229 0.2209
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.012 on 163 degrees of freedom
## Multiple R-squared: 0.3231, Adjusted R-squared: 0.3023
## F-statistic: 15.56 on 5 and 163 DF, p-value: 1.673e-12
#Lower than TX MHI
fit.L<-lm(PerCapTaxZ~scale(ppov)+scale(totRent)+scale(whiteOwn)+scale(blackOwn)+scale(hispOwn),data=tx_sd_acs_joined, subset=MHI_code=="Lower than TX MHI")
summary(fit.L)
##
## Call:
## lm(formula = PerCapTaxZ ~ scale(ppov) + scale(totRent) + scale(whiteOwn) +
## scale(blackOwn) + scale(hispOwn), data = tx_sd_acs_joined,
## subset = MHI_code == "Lower than TX MHI")
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.72304 -0.24957 -0.06658 0.14411 2.26340
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.26262 0.02579 -10.184 < 2e-16 ***
## scale(ppov) -0.22472 0.02461 -9.132 < 2e-16 ***
## scale(totRent) -0.10523 0.10359 -1.016 0.310
## scale(whiteOwn) 0.38890 0.07572 5.136 4.83e-07 ***
## scale(blackOwn) -0.08582 0.06472 -1.326 0.186
## scale(hispOwn) -0.01718 0.03968 -0.433 0.665
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3898 on 326 degrees of freedom
## Multiple R-squared: 0.3594, Adjusted R-squared: 0.3496
## F-statistic: 36.58 on 5 and 326 DF, p-value: < 2.2e-16