library(tidycensus)
library(tidyverse)
library(sf)
library(classInt)
library(sandwich)
library(tidyverse)
library(spdep)
library(nortest)
library(car)
library(lmtest)
library(ggplot2)
library(dplyr)

v15_Profile <- load_variables(2017, "acs5/profile", cache = TRUE)

search for variables by keywords in the label

v15_Profile[grep(x = v15_Profile$label, "POVERTY"), c("name", "label")]
## # A tibble: 38 x 2
##    name       label                                                             
##    <chr>      <chr>                                                             
##  1 DP03_0119  Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
##  2 DP03_0119P Percent Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME ~
##  3 DP03_0120  Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
##  4 DP03_0120P Percent Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME ~
##  5 DP03_0121  Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
##  6 DP03_0121P Percent Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME ~
##  7 DP03_0122  Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
##  8 DP03_0122P Percent Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME ~
##  9 DP03_0123  Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
## 10 DP03_0123P Percent Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME ~
## # ... with 28 more rows
v15_Profile[grep(x = v15_Profile$label, "Built 2000 to 2009"), c("name", "label")]
## # A tibble: 2 x 2
##   name       label                                                              
##   <chr>      <chr>                                                              
## 1 DP04_0019  Estimate!!YEAR STRUCTURE BUILT!!Total housing units!!Built 2000 to~
## 2 DP04_0019P Percent Estimate!!YEAR STRUCTURE BUILT!!Total housing units!!Built~

Extract from ACS summary file data profile variables from 2019 for Bexar County, TX Census Tracts

sa_acs<-get_acs(geography = "tract",
                state="TX",
                county = c("Bexar"),
                year = 2017,
                variables=c( "DP05_0001E","DP03_0119PE", "DP05_0077PE", "DP05_0078PE", "DP05_0071PE") ,
                geometry = T, output = "wide")
## Getting data from the 2013-2017 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Using the ACS Data Profile
#create a county FIPS code - 5 digit
sa_acs$county<-substr(sa_acs$GEOID, 1, 5)

#rename variables and filter missing cases
dat<-sa_acs%>%
  mutate(totpop= DP05_0001E, 
         ppov=DP03_0119PE,
         pwhite=DP05_0077PE,
         pblack=DP05_0078PE,
         phisp=DP05_0071PE) %>%
  na.omit()

metro<- tigris::core_based_statistical_areas(cb=T,
                                             year = 2017)
metro<-metro%>%
  st_as_sf()%>%
  #st_boundary()%>%
  filter(grepl(NAME,
               pattern="San Antonio"))

dat%>%
  mutate(povquant=cut(ppov,
                      breaks = quantile(dat$ppov,
                                        p=seq(0,1,length.out = 8)),
                      include.lowest = T))%>%
  ggplot(aes(color=povquant,
             fill=povquant))+
  geom_sf()+
  scale_fill_viridis_d()+
  scale_color_viridis_d()+
  ggtitle(label = "Poverty Rate in Census Tracts - Bexar County, 2017")+
  geom_sf(data=metro,
          fill=NA,
          color="black")

1. Make a series of Spatial weights for the data

Queen-based contiquity weight matrix

## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 364 
## Number of nonzero links: 2308 
## Percentage nonzero weights: 1.741939 
## Average number of links: 6.340659 
## 
## Weights style: W 
## Weights constants summary:
##     n     nn  S0       S1       S2
## W 364 132496 364 119.2547 1489.505

Rook style weight matrix

tabsr<-poly2nb(dat, queen=F)
salw<-nb2listw(tabsr, style="W")
salw
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 364 
## Number of nonzero links: 1916 
## Percentage nonzero weights: 1.446081 
## Average number of links: 5.263736 
## 
## Weights style: W 
## Weights constants summary:
##     n     nn  S0       S1       S2
## W 364 132496 364 142.9955 1491.795

k-nearest neighbor adjacency weight matrix, with k=4

knn<-knearneigh(x = coordinates(as(dat, "Spatial")), k = 4)
knn4<-knn2nb(knn = knn)
salwk<-nb2listw(knn4, style="W")
salwk
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 364 
## Number of nonzero links: 1456 
## Percentage nonzero weights: 1.098901 
## Average number of links: 4 
## Non-symmetric neighbours list
## 
## Weights style: W 
## Weights constants summary:
##     n     nn  S0      S1       S2
## W 364 132496 364 160.625 1498.125

2. Define your outcome variable and all predictors

My outcome variable for this assignment is poverty rate and my predictors is race.

3. Fit the OLS model for your outcome

Linear Regression Model

fit<- lm(ppov ~ pwhite + phisp + pblack, data=dat)
summary(fit)
## 
## Call:
## lm(formula = ppov ~ pwhite + phisp + pblack, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.104  -4.926  -0.417   3.310  45.396 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.08185    8.71130  -0.928 0.354163    
## pwhite      -0.00991    0.09897  -0.100 0.920299    
## phisp        0.33186    0.08765   3.786 0.000179 ***
## pblack       0.38189    0.11078   3.447 0.000634 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.228 on 360 degrees of freedom
## Multiple R-squared:  0.4643, Adjusted R-squared:  0.4598 
## F-statistic:   104 on 3 and 360 DF,  p-value: < 2.2e-16

3a) Test for autocorrelation in the residuals using each neighbor specification, which specification showed the largest degree of dependence. Use that specification for the rest of the assignment.

## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = ppov ~ pwhite + phisp + pblack, data = dat)
## weights: wts
## 
## Moran I statistic standard deviate = 8.2174, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     0.2342658679    -0.0073219155     0.0008643267
tabsr<-poly2nb(dat, queen=F)      # Rook adjacency matrix
salw<-nb2listw(tabsr, style="W")
lm.morantest(fit, listw = salw)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = ppov ~ pwhite + phisp + pblack, data = dat)
## weights: salw
## 
## Moran I statistic standard deviate = 7.9659, p-value = 8.201e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##      0.249981970     -0.007446667      0.001044344
knn<-knearneigh(x = coordinates(as(dat, "Spatial")), k = 4)  #  k-nearest neighbor adjacency matrix
knn4<-knn2nb(knn = knn)
salwk<-nb2listw(knn4, style="W")
lm.morantest(fit, listw = salwk)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = ppov ~ pwhite + phisp + pblack, data = dat)
## weights: salwk
## 
## Moran I statistic standard deviate = 7.553, p-value = 2.126e-14
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##      0.251721985     -0.007538145      0.001178227

The k- nearest neighborhood adjacency matrix provided the highest observed Moran I value of 0.2517, thus, providing a slightly higher degree of dependence compared to the other two neighbor specifications. With the Moran I observed value of 0.2517 i conclude that there is a weak positive autocorrelation in the model residuals.

3b). Produce a map of the studentized residuals from the model fit and a map of the local Moran statistic for the residuals.

nbs<-poly2nb(dat, queen = T)
wts<-nb2listw(nbs, style = "W")
dat$olsresid<-rstudent(fit)
dat<-st_as_sf(dat)
dat%>%
  mutate(rquant=cut(olsresid,
                    breaks = quantile(dat$olsresid,
                                      p=seq(0,1,length.out = 8)),
                    include.lowest = T))%>%
  ggplot(aes(color=rquant, fill=rquant))+
  geom_sf()+
  scale_fill_viridis_d()+
  scale_color_viridis_d()+
  geom_sf(data=metro, fill=NA, color="black")+
  ggtitle("OLS Model Residuals")

## Local Moran’s I ( K-Nearest Neighbor) # Local Moran I Statistics for the Residuals

sot<-nb2listw(neighbours = knn4, style = "W")

locali<-localmoran(dat$ppov, listw = sot, alternative = "two.sided" )
dat$locali<-locali[,1]
dat$localp<-locali[,5]

dat$ppovc<-scale(dat$ppov)
dat$lag_ppovc<-lag.listw(var=dat$ppovc, x = sot)
dat$quad_sig <- NA
dat$quad_sig[(dat$ppovc >= 0 & dat$lag_ppovc >= 0) & (dat$localp <= 0.05)] <- "H-H" #high high
dat$quad_sig[(dat$ppovc <= 0 & dat$lag_ppovc <= 0) & (dat$localp <= 0.05)] <- "L-L" #low low
dat$quad_sig[(dat$ppovc >= 0 & dat$lag_ppovc <= 0) & (dat$localp <= 0.05)] <- "H-L" #high low
dat$quad_sig[(dat$ppovc <= 0 & dat$lag_ppovc >= 0) & (dat$localp <= 0.05)] <- "L-H" #low high

#WE ASSIGN A # Set the breaks for the thematic map classes
breaks <- seq(1, 5, 1)

# Set the corresponding labels for the thematic map classes
labels <- c("High-High", "Low-Low", "High-Low", "Low-High", "Not Clustered")

# find Interval - This is necessary for making a map
np <- findInterval(dat$quad_sig, breaks)
## Warning in findInterval(dat$quad_sig, breaks): NAs introduced by coercion
# Assign colors to each map class
colors <- c("red", "blue", "lightpink", "skyblue2", "white")

dat%>%
  ggplot()+
  geom_sf(aes(fill = quad_sig))+
  ggtitle("Local Moran's I (K-Nearest Neighbor",
          sub=" Bexar County, TX")+labs(fill="Proportion of Poverty", color="Proportion of Poverty")

4). Examine which alternative SAR model specification would best fit this data using the minimum AIC criteria and specification tests.

library(spatialreg)
lm.LMtests(model = fit,
           listw=wts,
           test = c("LMerr", "LMlag", "RLMerr", "RLMlag"))
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = ppov ~ pwhite + phisp + pblack, data = dat)
## weights: wts
## 
## LMerr = 60.974, df = 1, p-value = 5.773e-15
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = ppov ~ pwhite + phisp + pblack, data = dat)
## weights: wts
## 
## LMlag = 70.659, df = 1, p-value < 2.2e-16
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = ppov ~ pwhite + phisp + pblack, data = dat)
## weights: wts
## 
## RLMerr = 1.7825, df = 1, p-value = 0.1818
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = ppov ~ pwhite + phisp + pblack, data = dat)
## weights: wts
## 
## RLMlag = 11.468, df = 1, p-value = 0.0007081

Examining the results of this model, the lag model has more support, with both the regular and the robust LMT showing larger values for the model, compared to the error model specification.

5). Fit that model (the one you identify as being “best”), and describe the results, compared to the OLS results.

#Spatial Lag Model
fit.lag<-lagsarlm(ppov ~ pwhite + phisp + pblack,
                  data=dat,
                  listw=wts,
                  type="lag")
summary(fit.lag)
## 
## Call:lagsarlm(formula = ppov ~ pwhite + phisp + pblack, data = dat, 
##     listw = wts, type = "lag")
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -24.85450  -4.71342  -0.43258   3.32010  45.98692 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.173897   7.834431 -1.1710 0.241610
## pwhite       0.025884   0.089040  0.2907 0.771281
## phisp        0.226805   0.080140  2.8301 0.004653
## pblack       0.308001   0.100847  3.0541 0.002257
## 
## Rho: 0.47949, LR test value: 57.745, p-value: 2.9865e-14
## Asymptotic standard error: 0.058963
##     z-value: 8.132, p-value: 4.4409e-16
## Wald statistic: 66.13, p-value: 4.4409e-16
## 
## Log likelihood: -1252.735 for lag model
## ML residual variance (sigma squared): 54.746, (sigma: 7.3991)
## Number of observations: 364 
## Number of parameters estimated: 6 
## AIC: 2517.5, (AIC for lm: 2573.2)
## LM test for residual autocorrelation
## test value: 0.64498, p-value: 0.42191

This model shows that the regression effects reveal the coefficient changed compared to that of the OLS model, showing changes in the values of the coefficients of the three specifications. Specifically, the lag model has the slightest effect on the % Hispanic and the % black variables.

The auto-regressive coefficient, rho for the lag model, is also significant and positive, indicating that poverty rates are significantly associated with poverty rates in neighboring areas. The model fit is better than the OLS model, as measured by the AIC value of 2517.5 versus 2573.2 for the OLS model. Finally, in comparing the AIC values for the models, the lag model based on their values is a better measure of model fit.