library(tidycensus)
library(tidyverse)
library(sf)
library(classInt)
library(sandwich)
library(tidyverse)
library(spdep)
library(nortest)
library(car)
library(lmtest)
library(ggplot2)
library(dplyr)
v15_Profile <- load_variables(2017, "acs5/profile", cache = TRUE)
v15_Profile[grep(x = v15_Profile$label, "POVERTY"), c("name", "label")]
## # A tibble: 38 x 2
## name label
## <chr> <chr>
## 1 DP03_0119 Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
## 2 DP03_0119P Percent Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME ~
## 3 DP03_0120 Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
## 4 DP03_0120P Percent Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME ~
## 5 DP03_0121 Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
## 6 DP03_0121P Percent Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME ~
## 7 DP03_0122 Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
## 8 DP03_0122P Percent Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME ~
## 9 DP03_0123 Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE P~
## 10 DP03_0123P Percent Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME ~
## # ... with 28 more rows
v15_Profile[grep(x = v15_Profile$label, "Built 2000 to 2009"), c("name", "label")]
## # A tibble: 2 x 2
## name label
## <chr> <chr>
## 1 DP04_0019 Estimate!!YEAR STRUCTURE BUILT!!Total housing units!!Built 2000 to~
## 2 DP04_0019P Percent Estimate!!YEAR STRUCTURE BUILT!!Total housing units!!Built~
sa_acs<-get_acs(geography = "tract",
state="TX",
county = c("Bexar"),
year = 2017,
variables=c( "DP05_0001E","DP03_0119PE", "DP05_0077PE", "DP05_0078PE", "DP05_0071PE") ,
geometry = T, output = "wide")
## Getting data from the 2013-2017 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Using the ACS Data Profile
#create a county FIPS code - 5 digit
sa_acs$county<-substr(sa_acs$GEOID, 1, 5)
#rename variables and filter missing cases
dat<-sa_acs%>%
mutate(totpop= DP05_0001E,
ppov=DP03_0119PE,
pwhite=DP05_0077PE,
pblack=DP05_0078PE,
phisp=DP05_0071PE) %>%
na.omit()
metro<- tigris::core_based_statistical_areas(cb=T,
year = 2017)
metro<-metro%>%
st_as_sf()%>%
#st_boundary()%>%
filter(grepl(NAME,
pattern="San Antonio"))
dat%>%
mutate(povquant=cut(ppov,
breaks = quantile(dat$ppov,
p=seq(0,1,length.out = 8)),
include.lowest = T))%>%
ggplot(aes(color=povquant,
fill=povquant))+
geom_sf()+
scale_fill_viridis_d()+
scale_color_viridis_d()+
ggtitle(label = "Poverty Rate in Census Tracts - Bexar County, 2017")+
geom_sf(data=metro,
fill=NA,
color="black")
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 364
## Number of nonzero links: 2308
## Percentage nonzero weights: 1.741939
## Average number of links: 6.340659
##
## Weights style: W
## Weights constants summary:
## n nn S0 S1 S2
## W 364 132496 364 119.2547 1489.505
tabsr<-poly2nb(dat, queen=F)
salw<-nb2listw(tabsr, style="W")
salw
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 364
## Number of nonzero links: 1916
## Percentage nonzero weights: 1.446081
## Average number of links: 5.263736
##
## Weights style: W
## Weights constants summary:
## n nn S0 S1 S2
## W 364 132496 364 142.9955 1491.795
knn<-knearneigh(x = coordinates(as(dat, "Spatial")), k = 4)
knn4<-knn2nb(knn = knn)
salwk<-nb2listw(knn4, style="W")
salwk
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 364
## Number of nonzero links: 1456
## Percentage nonzero weights: 1.098901
## Average number of links: 4
## Non-symmetric neighbours list
##
## Weights style: W
## Weights constants summary:
## n nn S0 S1 S2
## W 364 132496 364 160.625 1498.125
My outcome variable for this assignment is poverty rate and my predictors is race.
fit<- lm(ppov ~ pwhite + phisp + pblack, data=dat)
summary(fit)
##
## Call:
## lm(formula = ppov ~ pwhite + phisp + pblack, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.104 -4.926 -0.417 3.310 45.396
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.08185 8.71130 -0.928 0.354163
## pwhite -0.00991 0.09897 -0.100 0.920299
## phisp 0.33186 0.08765 3.786 0.000179 ***
## pblack 0.38189 0.11078 3.447 0.000634 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.228 on 360 degrees of freedom
## Multiple R-squared: 0.4643, Adjusted R-squared: 0.4598
## F-statistic: 104 on 3 and 360 DF, p-value: < 2.2e-16
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = ppov ~ pwhite + phisp + pblack, data = dat)
## weights: wts
##
## Moran I statistic standard deviate = 8.2174, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.2342658679 -0.0073219155 0.0008643267
tabsr<-poly2nb(dat, queen=F) # Rook adjacency matrix
salw<-nb2listw(tabsr, style="W")
lm.morantest(fit, listw = salw)
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = ppov ~ pwhite + phisp + pblack, data = dat)
## weights: salw
##
## Moran I statistic standard deviate = 7.9659, p-value = 8.201e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.249981970 -0.007446667 0.001044344
knn<-knearneigh(x = coordinates(as(dat, "Spatial")), k = 4) # k-nearest neighbor adjacency matrix
knn4<-knn2nb(knn = knn)
salwk<-nb2listw(knn4, style="W")
lm.morantest(fit, listw = salwk)
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = ppov ~ pwhite + phisp + pblack, data = dat)
## weights: salwk
##
## Moran I statistic standard deviate = 7.553, p-value = 2.126e-14
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.251721985 -0.007538145 0.001178227
The k- nearest neighborhood adjacency matrix provided the highest observed Moran I value of 0.2517, thus, providing a slightly higher degree of dependence compared to the other two neighbor specifications. With the Moran I observed value of 0.2517 i conclude that there is a weak positive autocorrelation in the model residuals.
nbs<-poly2nb(dat, queen = T)
wts<-nb2listw(nbs, style = "W")
dat$olsresid<-rstudent(fit)
dat<-st_as_sf(dat)
dat%>%
mutate(rquant=cut(olsresid,
breaks = quantile(dat$olsresid,
p=seq(0,1,length.out = 8)),
include.lowest = T))%>%
ggplot(aes(color=rquant, fill=rquant))+
geom_sf()+
scale_fill_viridis_d()+
scale_color_viridis_d()+
geom_sf(data=metro, fill=NA, color="black")+
ggtitle("OLS Model Residuals")
## Local Moran’s I ( K-Nearest Neighbor) # Local Moran I Statistics for the Residuals
sot<-nb2listw(neighbours = knn4, style = "W")
locali<-localmoran(dat$ppov, listw = sot, alternative = "two.sided" )
dat$locali<-locali[,1]
dat$localp<-locali[,5]
dat$ppovc<-scale(dat$ppov)
dat$lag_ppovc<-lag.listw(var=dat$ppovc, x = sot)
dat$quad_sig <- NA
dat$quad_sig[(dat$ppovc >= 0 & dat$lag_ppovc >= 0) & (dat$localp <= 0.05)] <- "H-H" #high high
dat$quad_sig[(dat$ppovc <= 0 & dat$lag_ppovc <= 0) & (dat$localp <= 0.05)] <- "L-L" #low low
dat$quad_sig[(dat$ppovc >= 0 & dat$lag_ppovc <= 0) & (dat$localp <= 0.05)] <- "H-L" #high low
dat$quad_sig[(dat$ppovc <= 0 & dat$lag_ppovc >= 0) & (dat$localp <= 0.05)] <- "L-H" #low high
#WE ASSIGN A # Set the breaks for the thematic map classes
breaks <- seq(1, 5, 1)
# Set the corresponding labels for the thematic map classes
labels <- c("High-High", "Low-Low", "High-Low", "Low-High", "Not Clustered")
# find Interval - This is necessary for making a map
np <- findInterval(dat$quad_sig, breaks)
## Warning in findInterval(dat$quad_sig, breaks): NAs introduced by coercion
# Assign colors to each map class
colors <- c("red", "blue", "lightpink", "skyblue2", "white")
dat%>%
ggplot()+
geom_sf(aes(fill = quad_sig))+
ggtitle("Local Moran's I (K-Nearest Neighbor",
sub=" Bexar County, TX")+labs(fill="Proportion of Poverty", color="Proportion of Poverty")
library(spatialreg)
lm.LMtests(model = fit,
listw=wts,
test = c("LMerr", "LMlag", "RLMerr", "RLMlag"))
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = ppov ~ pwhite + phisp + pblack, data = dat)
## weights: wts
##
## LMerr = 60.974, df = 1, p-value = 5.773e-15
##
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = ppov ~ pwhite + phisp + pblack, data = dat)
## weights: wts
##
## LMlag = 70.659, df = 1, p-value < 2.2e-16
##
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = ppov ~ pwhite + phisp + pblack, data = dat)
## weights: wts
##
## RLMerr = 1.7825, df = 1, p-value = 0.1818
##
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = ppov ~ pwhite + phisp + pblack, data = dat)
## weights: wts
##
## RLMlag = 11.468, df = 1, p-value = 0.0007081
Examining the results of this model, the lag model has more support, with both the regular and the robust LMT showing larger values for the model, compared to the error model specification.
#Spatial Lag Model
fit.lag<-lagsarlm(ppov ~ pwhite + phisp + pblack,
data=dat,
listw=wts,
type="lag")
summary(fit.lag)
##
## Call:lagsarlm(formula = ppov ~ pwhite + phisp + pblack, data = dat,
## listw = wts, type = "lag")
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.85450 -4.71342 -0.43258 3.32010 45.98692
##
## Type: lag
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.173897 7.834431 -1.1710 0.241610
## pwhite 0.025884 0.089040 0.2907 0.771281
## phisp 0.226805 0.080140 2.8301 0.004653
## pblack 0.308001 0.100847 3.0541 0.002257
##
## Rho: 0.47949, LR test value: 57.745, p-value: 2.9865e-14
## Asymptotic standard error: 0.058963
## z-value: 8.132, p-value: 4.4409e-16
## Wald statistic: 66.13, p-value: 4.4409e-16
##
## Log likelihood: -1252.735 for lag model
## ML residual variance (sigma squared): 54.746, (sigma: 7.3991)
## Number of observations: 364
## Number of parameters estimated: 6
## AIC: 2517.5, (AIC for lm: 2573.2)
## LM test for residual autocorrelation
## test value: 0.64498, p-value: 0.42191
This model shows that the regression effects reveal the coefficient changed compared to that of the OLS model, showing changes in the values of the coefficients of the three specifications. Specifically, the lag model has the slightest effect on the % Hispanic and the % black variables.
The auto-regressive coefficient, rho for the lag model, is also significant and positive, indicating that poverty rates are significantly associated with poverty rates in neighboring areas. The model fit is better than the OLS model, as measured by the AIC value of 2517.5 versus 2573.2 for the OLS model. Finally, in comparing the AIC values for the models, the lag model based on their values is a better measure of model fit.