title: “Dem 7263-Spatial regression-HW” name: Sarah Sharmin instructor: Dr. Corey Sparks output:

library(nortest)
library(car)
library(lmtest)
library(classInt)
library(sandwich)
library(tidyverse)
library(spdep)
library(tidycensus)
library(tidyverse)
library(dplyr)
library(haven)
library(ggplot2)
library(spatialreg)
library(tidycensus)
library(sf)
options(tigris_class = "sf")

sa_acs<-get_acs(geography = "tract",
                state="TX",
                county = c("029", "013","255","091","187","493","163","311","325","019","265","171","259"),
                year = 2015,
                variables=c("DP05_0001E", "DP03_0009P", "DP03_0062E", "DP03_0119PE",
                            "DP05_0001E", "DP02_0030E", "DP02_0031PE", "DP02_0004PE",
                            "DP02_0037PE","DP02_0009PE","DP02_0008PE", "DP02_0040E","DP02_0038E",
                            "DP02_0066PE","DP02_0067PE","DP02_0080PE","DP02_0092PE",
                            "DP03_0005PE","DP03_0028PE","DP03_0062E","DP03_0099PE","DP03_0101PE",
                            "DP03_0119PE","DP04_0046PE","DP04_0078PE","DP05_0072PE","DP05_0073PE",
                            "DP05_0066PE", "DP05_0072PE", "DP02_0113PE") ,
                geometry = T,
                output = "wide")


sa_acs$county<-substr(sa_acs$GEOID, 1, 5)

sa_acs2<-sa_acs%>%
  mutate(totpop= DP05_0001E,
         pcohab = DP02_0004PE,
         pmdiv = DP02_0030E,
         pfdiv = DP02_0031PE,
         pforn=DP02_0092PE) %>%
  na.omit()

metro<- tigris::core_based_statistical_areas(cb=T,
                                             year = 2015)
metro<-metro%>%
  st_as_sf()%>%
  #st_boundary()%>%
  filter(grepl(NAME,
               pattern="San Antonio"))
sa_acs2%>%
  mutate(cohab=cut(pcohab,
                      breaks = quantile(pcohab,
                                       p=seq(0,1,length.out = 5)),
                      include.lowest = T))%>%
  ggplot(aes(color=cohab,
             fill=cohab))+
  geom_sf()+
  scale_fill_viridis_d()+
  scale_color_viridis_d()+
  ggtitle(label = "Percentage of cohabiting couple in Census Tracts, San Antonio - 2019")+
  geom_sf(data=metro,
          fill=NA,
          color="black")

#Creating neighborhood list

nbs<-poly2nb(sa_acs2, queen = T)
wtsq<-nb2listw(nbs, style = "W")

nb4<-knearneigh(st_centroid(sa_acs2), k=4)
nb4<-knn2nb(nb4)
wt4<-nb2listw(nb4, style="W")

nb5<-knearneigh(st_centroid(sa_acs2), k=5)
nb5<-knn2nb(nb5)
wt5<-nb2listw(nb5, style="W")

#Determining the correlation of potential independent variables

fit <- lm( pcohab ~ pforn+pmdiv+pfdiv,
           data=sa_acs2)
summary(fit)
## 
## Call:
## lm(formula = pcohab ~ pforn + pmdiv + pfdiv, data = sa_acs2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -32.188  -7.814  -1.107   7.293  58.874 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 76.5424304  2.2567343  33.917  < 2e-16 ***
## pforn       -0.4709358  0.0904736  -5.205 2.95e-07 ***
## pmdiv        0.0017168  0.0006859   2.503   0.0127 *  
## pfdiv       -0.8762643  0.0581407 -15.071  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.54 on 452 degrees of freedom
## Multiple R-squared:  0.4437, Adjusted R-squared:  0.4401 
## F-statistic: 120.2 on 3 and 452 DF,  p-value: < 2.2e-16

We see that foreign born status, divorced status of male and female is significantly associated with cohabitation status.

#Auto correlation of the neighborhood by queen, k4, and k5

#Moran's I on residuals from model
lm.morantest(fit, listw = wtsq)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = pcohab ~ pforn + pmdiv + pfdiv, data = sa_acs2)
## weights: wtsq
## 
## Moran I statistic standard deviate = 13.262, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     0.3599809352    -0.0039710613     0.0007531413
lm.morantest(fit, listw = wt4)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = pcohab ~ pforn + pmdiv + pfdiv, data = sa_acs2)
## weights: wt4
## 
## Moran I statistic standard deviate = 12.582, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     0.3800461246    -0.0040766727     0.0009319825
lm.morantest(fit, listw = wt5)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = pcohab ~ pforn + pmdiv + pfdiv, data = sa_acs2)
## weights: wt5
## 
## Moran I statistic standard deviate = 13.513, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     0.3663705604    -0.0039819570     0.0007511235

Based on the Moran’s I value, we see that k4 neighborhood has the largest and queen neighborhood has the lowest autocorrelation among three neighborhoods.

#Creating a model fit with rstudent model for San antonio

#nbs<-poly2nb(sa_acs2, queen = T)
#wtsq<-nb2listw(nbs, style = "W")
sa_acs2$olsresid<-rstudent(fit)

Now we will see residuls in a linear regression model for San Antonio

Visually we see that residuals are clustered. The moran’s I value is 0.36, z value is 13.3, and a very small p value (< 2.2e-16) indicates that there is autcorrelation exists in the dataset.

#Alternative SAR model specification would best fit this data using the minimum AIC criteria and specification tests

fit.lag <- lagsarlm(pcohab ~ pforn+pmdiv+pfdiv,
           data=sa_acs2,
                    listw = wtsq,
                    type = "lag")
summary(fit.lag)
## 
## Call:lagsarlm(formula = pcohab ~ pforn + pmdiv + pfdiv, data = sa_acs2, 
##     listw = wtsq, type = "lag")
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -29.48749  -5.83430  -0.48325   5.54986  49.86385 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##                Estimate  Std. Error  z value Pr(>|z|)
## (Intercept)  4.0149e+01  3.3096e+00  12.1310  < 2e-16
## pforn       -1.9151e-01  7.7399e-02  -2.4743  0.01335
## pmdiv        4.9154e-05  5.6042e-04   0.0877  0.93011
## pfdiv       -5.9465e-01  5.1294e-02 -11.5930  < 2e-16
## 
## Rho: 0.58009, LR test value: 154.07, p-value: < 2.22e-16
## Asymptotic standard error: 0.041276
##     z-value: 14.054, p-value: < 2.22e-16
## Wald statistic: 197.51, p-value: < 2.22e-16
## 
## Log likelihood: -1683.272 for lag model
## ML residual variance (sigma squared): 87.786, (sigma: 9.3694)
## Number of observations: 456 
## Number of parameters estimated: 6 
## AIC: 3378.5, (AIC for lm: 3530.6)
## LM test for residual autocorrelation
## test value: 0.00054407, p-value: 0.98139
fit.err <- errorsarlm (pcohab ~ pforn+pmdiv+pfdiv,
           data=sa_acs2,
           listw = wtsq)
summary(fit.err)
## 
## Call:errorsarlm(formula = pcohab ~ pforn + pmdiv + pfdiv, data = sa_acs2, 
##     listw = wtsq)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -31.52696  -5.89107  -0.48133   5.63622  48.25578 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                Estimate  Std. Error  z value  Pr(>|z|)
## (Intercept) 72.23484084  2.42056331  29.8422 < 2.2e-16
## pforn       -0.36127589  0.08743047  -4.1322 3.594e-05
## pmdiv       -0.00016047  0.00055577  -0.2887    0.7728
## pfdiv       -0.64501177  0.05352076 -12.0516 < 2.2e-16
## 
## Lambda: 0.6849, LR test value: 144.9, p-value: < 2.22e-16
## Asymptotic standard error: 0.043946
##     z-value: 15.585, p-value: < 2.22e-16
## Wald statistic: 242.89, p-value: < 2.22e-16
## 
## Log likelihood: -1687.853 for error model
## ML residual variance (sigma squared): 86.515, (sigma: 9.3013)
## Number of observations: 456 
## Number of parameters estimated: 6 
## AIC: 3387.7, (AIC for lm: 3530.6)
moran.test(fit.lag$residuals, wtsq)
## 
##  Moran I test under randomisation
## 
## data:  fit.lag$residuals  
## weights: wtsq    
## 
## Moran I statistic standard deviate = 0.095163, p-value = 0.4621
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.0004195129     -0.0021978022      0.0007564407
moran.test(fit.err$residuals, wtsq)
## 
##  Moran I test under randomisation
## 
## data:  fit.err$residuals  
## weights: wtsq    
## 
## Moran I statistic standard deviate = -1.45, p-value = 0.9265
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##     -0.0420927199     -0.0021978022      0.0007570574

#Conclusion: We see that both the lag and error model have lower autocorrelation compared to linear model. But the AIC value of the lag model is lower than the error model. Hence, the lag model would be the best model for this data.