title: “Dem 7263-Spatial regression-HW” name: Sarah Sharmin instructor: Dr. Corey Sparks output:
library(nortest)
library(car)
library(lmtest)
library(classInt)
library(sandwich)
library(tidyverse)
library(spdep)
library(tidycensus)
library(tidyverse)
library(dplyr)
library(haven)
library(ggplot2)
library(spatialreg)
library(tidycensus)
library(sf)
options(tigris_class = "sf")
sa_acs<-get_acs(geography = "tract",
state="TX",
county = c("029", "013","255","091","187","493","163","311","325","019","265","171","259"),
year = 2015,
variables=c("DP05_0001E", "DP03_0009P", "DP03_0062E", "DP03_0119PE",
"DP05_0001E", "DP02_0030E", "DP02_0031PE", "DP02_0004PE",
"DP02_0037PE","DP02_0009PE","DP02_0008PE", "DP02_0040E","DP02_0038E",
"DP02_0066PE","DP02_0067PE","DP02_0080PE","DP02_0092PE",
"DP03_0005PE","DP03_0028PE","DP03_0062E","DP03_0099PE","DP03_0101PE",
"DP03_0119PE","DP04_0046PE","DP04_0078PE","DP05_0072PE","DP05_0073PE",
"DP05_0066PE", "DP05_0072PE", "DP02_0113PE") ,
geometry = T,
output = "wide")
sa_acs$county<-substr(sa_acs$GEOID, 1, 5)
sa_acs2<-sa_acs%>%
mutate(totpop= DP05_0001E,
pcohab = DP02_0004PE,
pmdiv = DP02_0030E,
pfdiv = DP02_0031PE,
pforn=DP02_0092PE) %>%
na.omit()
metro<- tigris::core_based_statistical_areas(cb=T,
year = 2015)
metro<-metro%>%
st_as_sf()%>%
#st_boundary()%>%
filter(grepl(NAME,
pattern="San Antonio"))
sa_acs2%>%
mutate(cohab=cut(pcohab,
breaks = quantile(pcohab,
p=seq(0,1,length.out = 5)),
include.lowest = T))%>%
ggplot(aes(color=cohab,
fill=cohab))+
geom_sf()+
scale_fill_viridis_d()+
scale_color_viridis_d()+
ggtitle(label = "Percentage of cohabiting couple in Census Tracts, San Antonio - 2019")+
geom_sf(data=metro,
fill=NA,
color="black")
#Creating neighborhood list
nbs<-poly2nb(sa_acs2, queen = T)
wtsq<-nb2listw(nbs, style = "W")
nb4<-knearneigh(st_centroid(sa_acs2), k=4)
nb4<-knn2nb(nb4)
wt4<-nb2listw(nb4, style="W")
nb5<-knearneigh(st_centroid(sa_acs2), k=5)
nb5<-knn2nb(nb5)
wt5<-nb2listw(nb5, style="W")
#Determining the correlation of potential independent variables
fit <- lm( pcohab ~ pforn+pmdiv+pfdiv,
data=sa_acs2)
summary(fit)
##
## Call:
## lm(formula = pcohab ~ pforn + pmdiv + pfdiv, data = sa_acs2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.188 -7.814 -1.107 7.293 58.874
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76.5424304 2.2567343 33.917 < 2e-16 ***
## pforn -0.4709358 0.0904736 -5.205 2.95e-07 ***
## pmdiv 0.0017168 0.0006859 2.503 0.0127 *
## pfdiv -0.8762643 0.0581407 -15.071 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.54 on 452 degrees of freedom
## Multiple R-squared: 0.4437, Adjusted R-squared: 0.4401
## F-statistic: 120.2 on 3 and 452 DF, p-value: < 2.2e-16
We see that foreign born status, divorced status of male and female is significantly associated with cohabitation status.
#Auto correlation of the neighborhood by queen, k4, and k5
#Moran's I on residuals from model
lm.morantest(fit, listw = wtsq)
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = pcohab ~ pforn + pmdiv + pfdiv, data = sa_acs2)
## weights: wtsq
##
## Moran I statistic standard deviate = 13.262, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.3599809352 -0.0039710613 0.0007531413
lm.morantest(fit, listw = wt4)
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = pcohab ~ pforn + pmdiv + pfdiv, data = sa_acs2)
## weights: wt4
##
## Moran I statistic standard deviate = 12.582, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.3800461246 -0.0040766727 0.0009319825
lm.morantest(fit, listw = wt5)
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = pcohab ~ pforn + pmdiv + pfdiv, data = sa_acs2)
## weights: wt5
##
## Moran I statistic standard deviate = 13.513, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.3663705604 -0.0039819570 0.0007511235
Based on the Moran’s I value, we see that k4 neighborhood has the largest and queen neighborhood has the lowest autocorrelation among three neighborhoods.
#Creating a model fit with rstudent model for San antonio
#nbs<-poly2nb(sa_acs2, queen = T)
#wtsq<-nb2listw(nbs, style = "W")
sa_acs2$olsresid<-rstudent(fit)
Now we will see residuls in a linear regression model for San Antonio
Visually we see that residuals are clustered. The moran’s I value is 0.36, z value is 13.3, and a very small p value (< 2.2e-16) indicates that there is autcorrelation exists in the dataset.
#Alternative SAR model specification would best fit this data using the minimum AIC criteria and specification tests
fit.lag <- lagsarlm(pcohab ~ pforn+pmdiv+pfdiv,
data=sa_acs2,
listw = wtsq,
type = "lag")
summary(fit.lag)
##
## Call:lagsarlm(formula = pcohab ~ pforn + pmdiv + pfdiv, data = sa_acs2,
## listw = wtsq, type = "lag")
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.48749 -5.83430 -0.48325 5.54986 49.86385
##
## Type: lag
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.0149e+01 3.3096e+00 12.1310 < 2e-16
## pforn -1.9151e-01 7.7399e-02 -2.4743 0.01335
## pmdiv 4.9154e-05 5.6042e-04 0.0877 0.93011
## pfdiv -5.9465e-01 5.1294e-02 -11.5930 < 2e-16
##
## Rho: 0.58009, LR test value: 154.07, p-value: < 2.22e-16
## Asymptotic standard error: 0.041276
## z-value: 14.054, p-value: < 2.22e-16
## Wald statistic: 197.51, p-value: < 2.22e-16
##
## Log likelihood: -1683.272 for lag model
## ML residual variance (sigma squared): 87.786, (sigma: 9.3694)
## Number of observations: 456
## Number of parameters estimated: 6
## AIC: 3378.5, (AIC for lm: 3530.6)
## LM test for residual autocorrelation
## test value: 0.00054407, p-value: 0.98139
fit.err <- errorsarlm (pcohab ~ pforn+pmdiv+pfdiv,
data=sa_acs2,
listw = wtsq)
summary(fit.err)
##
## Call:errorsarlm(formula = pcohab ~ pforn + pmdiv + pfdiv, data = sa_acs2,
## listw = wtsq)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.52696 -5.89107 -0.48133 5.63622 48.25578
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 72.23484084 2.42056331 29.8422 < 2.2e-16
## pforn -0.36127589 0.08743047 -4.1322 3.594e-05
## pmdiv -0.00016047 0.00055577 -0.2887 0.7728
## pfdiv -0.64501177 0.05352076 -12.0516 < 2.2e-16
##
## Lambda: 0.6849, LR test value: 144.9, p-value: < 2.22e-16
## Asymptotic standard error: 0.043946
## z-value: 15.585, p-value: < 2.22e-16
## Wald statistic: 242.89, p-value: < 2.22e-16
##
## Log likelihood: -1687.853 for error model
## ML residual variance (sigma squared): 86.515, (sigma: 9.3013)
## Number of observations: 456
## Number of parameters estimated: 6
## AIC: 3387.7, (AIC for lm: 3530.6)
moran.test(fit.lag$residuals, wtsq)
##
## Moran I test under randomisation
##
## data: fit.lag$residuals
## weights: wtsq
##
## Moran I statistic standard deviate = 0.095163, p-value = 0.4621
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.0004195129 -0.0021978022 0.0007564407
moran.test(fit.err$residuals, wtsq)
##
## Moran I test under randomisation
##
## data: fit.err$residuals
## weights: wtsq
##
## Moran I statistic standard deviate = -1.45, p-value = 0.9265
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## -0.0420927199 -0.0021978022 0.0007570574
#Conclusion: We see that both the lag and error model have lower autocorrelation compared to linear model. But the AIC value of the lag model is lower than the error model. Hence, the lag model would be the best model for this data.