Description:

Use a data set of your choice.

Backstory

TL;DR - I tried to model change over time and didn’t realize that negative values won’t work as outcomes in binomial or Poisson regressions… Whoops.

Longer version: For this assignment I went down the wrong path hoping to look at change between two years, only to get to the code and have it tell me “negative values not allowed.” Before that I had some lofty ideas about a binary outcome of whether school districts increased or decreased in white households between 2010 and 2020, but there is (1) no school district data through tidycensus for years before 2011 (a specific error message I received) and (2) some sort of bug in the 2020 data that wouldn’t let me download certain variables. I tried switching to counties to look at change over the same period, and that didn’t work either (I even tried going back to just the change between 2000 and 2010). And no, I was not trying to download any ACS variables; I made sure they were all SF1 variables.

Actual Data Used

Time Period: 2007-2011 and 2015-2019 5-Year ACS by county for the State of Texas

#Use these to search for variables (demographic profile tables)
#acs2011 <- load_variables(2011, "acs5", cache = TRUE)
#vars19 <- load_variables(2019, "acs5", cache = TRUE)
#View(vars19)
#Install (if needed) and load packages
options(repos="https://cran.rstudio.com" )
install.packages("tidycensus")
library(tidycensus)
install.packages("dplyr")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
install.packages("magrittr")
library(magrittr)
install.packages("sf")
library(sf)
## Linking to GEOS 3.8.1, GDAL 3.2.1, PROJ 7.2.1
tx_19_raw<-get_acs(geography = "county",
                state="TX",
                year = 2019,
                variables=c("DP05_0001E", 
                            "B25107_001E", 
                            "DP03_0062E",
                            "B11001_001E",
                            "B11001H_001E",
                            "B11001B_001E",
                            "B11001I_001E",
                            "B25003_003E",
                            "B25003H_002E",
                            "B25002_003E",
                            "B05003_014E",
                            "B05003_003E"),
                geometry = T, output = "wide")
## Getting data from the 2015-2019 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Fetching data by table type ("B/C", "S", "DP") and combining the result.
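Per the message above, caching the downloaded shapefiles avoids re-fetching the geometry on every run:

#Cache TIGER shapefiles between sessions (suggested by the tigris message above)
options(tigris_use_cache = TRUE)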
tx_11_raw<-get_acs(geography = "county",
                state="TX",
                year = 2011,
                variables=c("DP05_0001E", 
                            "B25107_001E",
                            "DP03_0062E",
                            "B11001_001E",
                            "B11001H_001E",
                            "B11001B_001E",
                            "B11001I_001E",
                            "B25003_003E",
                            "B25003H_002E",
                            "B25002_003E",
                            "B05003_014E",
                            "B05003_003E"),
                geometry = T, output = "wide")
## Getting data from the 2007-2011 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Fetching data by table type ("B/C", "S", "DP") and combining the result.
#Rename variables, adjust 2011 dollars for inflation, and derive household shares
tx_11<-tx_11_raw %>% 
  mutate(totpop_11=DP05_0001E, 
         medHVal_11=(B25107_001E *1.14),  #inflate 2011 dollars to approximate 2019 dollars
         MHI_11=DP03_0062E*1.14,
         totHH_11 = B11001_001E,
         whiteHH_11 = B11001H_001E,
         pwhiteHH_11 = whiteHH_11/(totHH_11+0.000001),  #small constant guards against divide-by-zero
         blackHH_11 =B11001B_001E,
         pblackHH_11 = blackHH_11/(totHH_11+0.000001),
         hispHH_11=B11001I_001E,
         phispHH_11=hispHH_11/(totHH_11+0.000001),
         totRent_11 = B25003_003E,
         whiteOwn_11 = B25003H_002E,
         vacantHU_11 =  B25002_003E,
         totUnder18_11 = B05003_014E + B05003_003E)  #male + female under 18
tx_11<-subset(tx_11, select=c(GEOID,NAME,totpop_11,medHVal_11,MHI_11,totHH_11,whiteHH_11,blackHH_11,hispHH_11, pwhiteHH_11,pblackHH_11,phispHH_11,totRent_11,whiteOwn_11,vacantHU_11,totUnder18_11))
tx_11<-na.omit(tx_11)
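Quick sanity check (a sketch) that the derived household shares land between 0 and 1:

#Sanity check: household shares should fall in [0, 1]
summary(st_drop_geometry(tx_11)[, c("pwhiteHH_11", "pblackHH_11", "phispHH_11")])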

#Same renaming for 2019 (2019 dollars are the base year, so no inflation adjustment)
tx_19<-tx_19_raw%>%
  mutate(totpop_19=DP05_0001E, 
         medHVal_19=B25107_001E, 
         MHI_19=DP03_0062E,
         totHH_19 = B11001_001E,
         whiteHH_19 = B11001H_001E,
         pwhiteHH_19 = whiteHH_19/(totHH_19+0.000001),
         blackHH_19 =B11001B_001E,
         pblackHH_19 = blackHH_19/(totHH_19+0.000001),
         hispHH_19=B11001I_001E,
         phispHH_19=hispHH_19/(totHH_19+0.000001),
         totRent_19 = B25003_003E,
         whiteOwn_19 = B25003H_002E,
         vacantHU_19 =  B25002_003E,
         totUnder18_19 = B05003_014E + B05003_003E)
tx_19<-subset(tx_19, select=c(GEOID,NAME,totpop_19,medHVal_19,MHI_19,totHH_19,whiteHH_19,blackHH_19,hispHH_19,pwhiteHH_19,pblackHH_19,phispHH_19,totRent_19,whiteOwn_19,vacantHU_19,totUnder18_19))
tx_19<-na.omit(tx_19)


#Join 2011 to 2019 data (drop the 2011 geometry so the join is attribute-only)

tx11_nosp<-tx_11
st_geometry(tx11_nosp)<-NULL
tx_1119 <- inner_join(tx_19, tx11_nosp, by="GEOID")

#Change and percent change in white homeowners (small constant guards against divide-by-zero)
tx_1119$whiteOwn_11 <- tx_1119$whiteOwn_11+0.000001
tx_1119$chWhiteOwn1119 <- tx_1119$whiteOwn_19 - tx_1119$whiteOwn_11
tx_1119$PchWhiteOwn1119 <- tx_1119$chWhiteOwn1119/tx_1119$whiteOwn_11
#Change and percent change in median home value and total households
tx_1119$medHVal_11 <- tx_1119$medHVal_11 +0.000001
tx_1119$chMHval <- tx_1119$medHVal_19 - tx_1119$medHVal_11
tx_1119$pchMHval <- tx_1119$chMHval/tx_1119$medHVal_11
tx_1119$chtotHH <- tx_1119$totHH_19 - tx_1119$totHH_11
tx_1119$pchtotHH <- tx_1119$chtotHH/(tx_1119$totHH_11 +0.000001)
tx_1119<-na.omit(tx_1119)
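Before mapping, a quick look at the change variables (a sketch; just summaries to catch anything wild):

#Check the derived change variables before mapping
summary(st_drop_geometry(tx_1119)[, c("chWhiteOwn1119", "PchWhiteOwn1119", "chMHval", "pchMHval")])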


#Leftover from the abandoned school-district approach: select districts with pop > 5,000
#tx_sd_acs2 <- subset(tx_sd_acs2, totpop >= 5000)

#pull texas shapefile for context
state<-get_acs(geography = "state", variables=c("DP05_0001E"), state="TX", year=2019, geometry = T, output = "wide")
## Getting data from the 2015-2019 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Using the ACS Data Profile
library(tigris)
## To enable 
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
## 
## Attaching package: 'tigris'
## The following object is masked from 'package:tidycensus':
## 
##     fips_codes
MSAs<-core_based_statistical_areas(cb=T,year=2019)
SANB<-MSAs[MSAs$GEOID=="41700",]
DFW<-MSAs[MSAs$GEOID=="19100",]
HOU<-MSAs[MSAs$GEOID=="26420",]
AUS<-MSAs[MSAs$GEOID=="12420",]

#Merge MSAs into new shapefile (for some reason couldn't query them all together)
TX_MSAs <- rbind(SANB, DFW, HOU, AUS)
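For what it’s worth, the four MSAs can also be pulled in one subset with %in% instead of rbind-ing them (a sketch; same result):

#Alternative: filter all four MSAs at once
TX_MSAs <- MSAs[MSAs$GEOID %in% c("41700", "19100", "26420", "12420"), ]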

#Spatial join to assign counties to MSAs
# not completely happy with how the counties were assigned to the MSAs (I tried all the join options from st_intersects to st_covered_by, etc.), but largest=TRUE keeps the MSA with the greatest overlap and the rest can be dealt with later.
tx_1119 <- st_join(tx_1119, left = TRUE, join=st_intersects, largest=TRUE, TX_MSAs["NAME"])
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
library(tmap)
## Registered S3 methods overwritten by 'stars':
##   method             from
##   st_bbox.SpatRaster sf  
##   st_crs.SpatRaster  sf
tm_shape(state)+
      tm_polygons(alpha=1)+
tm_shape(tx_1119) +
  tm_polygons("chWhiteOwn1119", title="Change in White Homeowners, 2011-2019",  border.alpha = 0.5)+
    tm_shape(TX_MSAs)+
      tm_polygons(alpha=0, border.col = "black")+
  tm_format("World", title="Texas Counties", legend.outside=T)+
  tm_scale_bar(position="LEFT", breaks=c(0,2.5,5))+
  tm_compass()
## Variable(s) "chWhiteOwn1119" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

tm_shape(state)+
      tm_polygons(alpha=1)+
tm_shape(tx_1119) +
  tm_polygons("PchWhiteOwn1119", title="Percent Change in White Homeowners, 2011-2019",  border.alpha = 0.5)+
    tm_shape(TX_MSAs)+
      tm_polygons(alpha=0, border.col = "black")+
  tm_format("World", title="Texas Counties", legend.outside=T)+
  tm_scale_bar(position="LEFT", breaks=c(0,2.5,5))+
  tm_compass()
## Variable(s) "PchWhiteOwn1119" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

tm_shape(state)+
      tm_polygons(alpha=1)+
tm_shape(tx_1119) +
  tm_polygons("chMHval", title="Change in Median Home Value, 2011-2019",  border.alpha = 0.5)+
    tm_shape(TX_MSAs)+
      tm_polygons(alpha=0, border.col = "black")+
  tm_format("World", title="Texas Counties", legend.outside=T)+
  tm_scale_bar(position="LEFT", breaks=c(0,2.5,5))+
  tm_compass()
## Variable(s) "chMHval" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

tm_shape(state)+
      tm_polygons(alpha=1)+
tm_shape(tx_1119) +
  tm_polygons("pchMHval", title="Percent Change in Median Home Value, 2011-2019",  border.alpha = 0.5)+
    tm_shape(TX_MSAs)+
      tm_polygons(alpha=0, border.col = "black")+
  tm_format("World", title="Texas Counties", legend.outside=T)+
  tm_scale_bar(position="LEFT", breaks=c(0,2.5,5))+
  tm_compass()
## Variable(s) "pchMHval" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

Poisson Regression

RISK RATIOS: I don’t think I set this regression up correctly and am having a hard time understanding what the risk ratios are telling me. Should I have used categorical data as you did in your example instead of the percentages? Since the predictors are z-scored, I think a risk ratio of 0.90 for percent Hispanic would mean that a county one standard deviation higher in percent Hispanic households in 2011 had an expected rate of white homeowners per household about 10% lower in 2019, holding the other predictors constant - is that right?

OVERDISPERSION: My overdispersion check returns 12.73, the square root of the residual deviance divided by its degrees of freedom. From the notes, if the Poisson model were fitting the data well this number would be close to 1, so the counts are far more variable than the Poisson model assumes. This model is junk, but it’s a learning experience I guess.

#Standardize the 2011 predictors (z-scores) so coefficients reflect a 1-SD change
tx_1119$pwhiteHH_11Z<-as.numeric(scale(tx_1119$pwhiteHH_11, center=T, scale=T))
tx_1119$pblackHH_11Z<-as.numeric(scale(tx_1119$pblackHH_11, center=T, scale=T))
tx_1119$phispHH_11Z<-as.numeric(scale(tx_1119$phispHH_11, center=T, scale=T))
tx_1119$medHVal_11Z<-as.numeric(scale(tx_1119$medHVal_11, center=T, scale=T))
tx_1119$vacantHU_11Z<-as.numeric(scale(tx_1119$vacantHU_11, center=T, scale=T))


#Poisson model of 2019 white-homeowner counts, with total households as the exposure offset
fit_pois<- glm(whiteOwn_19 ~ offset(log(totHH_19)) + pwhiteHH_11Z + pblackHH_11Z + phispHH_11Z + medHVal_11Z + vacantHU_11Z, 
               family=poisson, 
               data=tx_1119)
summary(fit_pois)
## 
## Call:
## glm(formula = whiteOwn_19 ~ offset(log(totHH_19)) + pwhiteHH_11Z + 
##     pblackHH_11Z + phispHH_11Z + medHVal_11Z + vacantHU_11Z, 
##     family = poisson, data = tx_1119)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -57.069   -3.161    1.978    7.408   51.541  
## 
## Coefficients:
##                Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)  -0.8715038  0.0009465 -920.748  < 2e-16 ***
## pwhiteHH_11Z  0.4362135  0.0058613   74.422  < 2e-16 ***
## pblackHH_11Z  0.0110620  0.0022886    4.834 1.34e-06 ***
## phispHH_11Z  -0.0991869  0.0064234  -15.441  < 2e-16 ***
## medHVal_11Z  -0.0006016  0.0007816   -0.770    0.441    
## vacantHU_11Z -0.0001856  0.0001959   -0.947    0.343    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 668765  on 252  degrees of freedom
## Residual deviance:  40062  on 247  degrees of freedom
## AIC: 42565
## 
## Number of Fisher Scoring iterations: 4
#Exponentiate Coefficients to get Risk Ratios
exp(coef(fit_pois))
##  (Intercept) pwhiteHH_11Z pblackHH_11Z  phispHH_11Z  medHVal_11Z vacantHU_11Z 
##    0.4183220    1.5468391    1.0111234    0.9055734    0.9993985    0.9998144
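To translate the risk ratios into percent changes per one-standard-deviation increase in each predictor (a sketch, using quick Wald intervals):

#Rate ratios as percent change per 1-SD increase in each predictor
round((exp(coef(fit_pois)) - 1) * 100, 1)
#Wald 95% confidence intervals on the rate-ratio scale
round(exp(confint.default(fit_pois)), 3)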
#Test for overdispersion: sqrt(residual deviance / residual df), ~1 if the Poisson model fits
scale<-sqrt(fit_pois$deviance/fit_pois$df.residual)
scale
## [1] 12.7355
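A quick cross-check of the overdispersion (a sketch): a quasi-Poisson fit estimates the dispersion parameter directly, and it should be near 1 if the Poisson assumption held.

#Cross-check: quasi-Poisson estimates the dispersion instead of fixing it at 1
fit_qpois <- glm(whiteOwn_19 ~ offset(log(totHH_19)) + pwhiteHH_11Z + pblackHH_11Z + phispHH_11Z + medHVal_11Z + vacantHU_11Z,
               family = quasipoisson,
               data = tx_1119)
summary(fit_qpois)$dispersion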

Negative Binomial

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
#Negative binomial model with the same predictors and exposure offset
fit_nb<- glm.nb(whiteOwn_19 ~ offset(log(totHH_19)) + pwhiteHH_11Z + pblackHH_11Z + phispHH_11Z + medHVal_11Z + vacantHU_11Z,
               data=tx_1119)
summary(fit_nb)
## 
## Call:
## glm.nb(formula = whiteOwn_19 ~ offset(log(totHH_19)) + pwhiteHH_11Z + 
##     pblackHH_11Z + phispHH_11Z + medHVal_11Z + vacantHU_11Z, 
##     data = tx_1119, init.theta = 17.75159187, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -7.4892  -0.4026  -0.0065   0.4165   2.3529  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.82842    0.01506 -55.024   <2e-16 ***
## pwhiteHH_11Z  0.27161    0.20553   1.322    0.186    
## pblackHH_11Z -0.03572    0.07063  -0.506    0.613    
## phispHH_11Z  -0.28307    0.22185  -1.276    0.202    
## medHVal_11Z  -0.02082    0.01799  -1.157    0.247    
## vacantHU_11Z -0.01571    0.01700  -0.924    0.355    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(17.7516) family taken to be 1)
## 
##     Null deviance: 1162.44  on 252  degrees of freedom
## Residual deviance:  271.73  on 247  degrees of freedom
## AIC: 4084.2
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  17.75 
##           Std. Err.:  1.67 
## 
##  2 x log-likelihood:  -4070.199
#Exponentiate coefficients to get risk ratios
exp(coef(fit_nb))
##  (Intercept) pwhiteHH_11Z pblackHH_11Z  phispHH_11Z  medHVal_11Z vacantHU_11Z 
##    0.4367371    1.3120770    0.9649143    0.7534665    0.9794001    0.9844169
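Putting the two models side by side (both AICs already appear in the summaries above) confirms the negative binomial is the better fit:

#Compare fits: lower AIC is better
AIC(fit_pois, fit_nb)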

Test for Autocorrelation

If I’m interpreting the Moran’s I correctly, the residuals are in fact significantly spatially autocorrelated: the observed Moran’s I is 0.306 against an expectation of essentially zero (variance 0.0018), with a p-value of 3.67e-13.

install.packages("spdep")
library(spdep)
## Loading required package: sp
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
#k = 4 nearest-neighbor weights from county centroids (great-circle distances)
nbs<-knearneigh(coordinates(as(tx_1119, "Spatial")), k = 4, longlat = T)
nbs<-knn2nb(nbs, sym = T)
us.wt4<-nb2listw(nbs, style = "W")
#Moran's I test on the Poisson model residuals
lm.morantest(fit_pois, listw = us.wt4)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: glm(formula = whiteOwn_19 ~ offset(log(totHH_19)) + pwhiteHH_11Z
## + pblackHH_11Z + phispHH_11Z + medHVal_11Z + vacantHU_11Z, family =
## poisson, data = tx_1119)
## weights: us.wt4
## 
## Moran I statistic standard deviate = 7.1729, p-value = 3.671e-13
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     3.059215e-01    -2.024894e-06     1.819007e-03
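For reference, the same neighbor weights can be built without converting to sp, using sf centroids (a sketch; st_centroid will warn about lon/lat geometries, but the k = 4 structure should match):

#Alternative: k-nearest-neighbor weights from sf centroids, skipping the sp conversion
cents <- st_coordinates(st_centroid(st_geometry(tx_1119)))
nbs_sf <- knn2nb(knearneigh(cents, k = 4, longlat = TRUE), sym = TRUE)
wt4_sf <- nb2listw(nbs_sf, style = "W")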

Spatial Filtering

Okay, I am lost in this section. Should I have used categorical data? Is that why this is not making sense?
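In case it helps, here is a minimal sketch of eigenvector spatial filtering with ME() (assuming the spatialreg package, where ME() lives; the variable names match the models above, but this is a starting point, not a vetted analysis):

library(spatialreg)
#Select Moran eigenvectors of the spatial weights that absorb residual autocorrelation
me_fit <- ME(whiteOwn_19 ~ pwhiteHH_11Z + pblackHH_11Z + phispHH_11Z + medHVal_11Z + vacantHU_11Z,
             offset = log(tx_1119$totHH_19), family = poisson,
             data = tx_1119, listw = us.wt4, alpha = 0.05, nsim = 99)
#Refit the Poisson model with the selected eigenvectors as extra covariates
fit_sf <- glm(whiteOwn_19 ~ offset(log(totHH_19)) + pwhiteHH_11Z + pblackHH_11Z + phispHH_11Z + medHVal_11Z + vacantHU_11Z + fitted(me_fit),
              family = poisson, data = tx_1119)
#Check whether the filtering removed the residual autocorrelation
lm.morantest(fit_sf, listw = us.wt4)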