library(haven)
library(foreign) 
library(readr)
library(dplyr)
library(ggplot2)
library(broom)
library(car)
library(MASS) 
library(lmtest)
library(zoo)
library(nortest)
library(plotrix)
library(scales)
library(tableone)
library(Weighted.Desc.Stat)
library(mitools)
library(survey)
library(VGAM)
library(stargazer)
library(sandwich)
library(pastecs)
library(muhaz)
library(ggpubr)
library(survminer)
library(eha)
library(reshape2)
library(data.table)
library(magrittr)
library(tidyverse)
library(sjmisc)
library(sjPlot)
library(sjmisc)
library(sjlabelled)
library(weights)
library(GGally)
library(tigris)
library(RColorBrewer)
library(patchwork)
library(tidycensus)
library(censusapi)
library(spdep)
library(classInt)
library(sf)
library(spatialreg)

A. Data extraction, define outcome and predictors (AHRF 2018-2019)

arf<-read_sas("C://Users//Jaire//OneDrive//Desktop//Spatial Demography//Data//ahrf2019.sas7bdat")
# Outcome - deaths  (2015-2017) arf$deaths1517

# Grouping.V -  foreign-born arf$fb

# Predictors:

# number of Hispanic families arf$hfam
# number of males ages 18-64  arf$m_18.64
# Median household income     arf$minc
set.seed(123)
drop_na(arf)
arf$cofips=arf$f00002
arf$coname=arf$f00010
arf$state = arf$f00008

arf$fb=arf$f1433613
arf$tpop=arf$f1332510
arf$deaths1517=arf$f1194115
arf$hfam=arf$f1488710
arf$m_18.64=arf$f1476410
arf$minc=arf$f1322610
scale(arf$deaths1517)
scale(arf$hfam)
scale(arf$m_18.64)
scale(arf$minc)

B. Generate spatial regimes

# I construct a grouping variable based on the estimate of immigrants within counties. I divide the estimate of foreign-born by the total population in 2010 to calculate the percent estimate of immigrants within counties. 

# I then create regimes based on the percent of foreign-born residents (AHRF); counties whose foreign-born population makes up 14% or more of the county's total population are "High foreign-born" and counties with less than 14% are "Low foreign-born".

 
arf$p.fb<-c(arf$fb/arf$tpop)

arf$fb_cut<-cut(as.numeric(arf$p.fb),
                    breaks = c(0,.139,.59),
                    include.lowest = T,
                    labels = c("Low", "High"))
summary(arf$fb_cut)
##  Low High NA's 
## 2996  224   10
options(tigris_class="sf")
usco<-counties(cb=T, year=2010)
usco$cofips<-substr(usco$GEO_ID, 10, 15)
sts<-states(cb = T, year=2010)
sts<-st_boundary(sts)

arf.g<-geo_join(usco, arf, by_sp="cofips", by_df="cofips",how="left" )
## Warning: We recommend using the dplyr::*_join() family of functions instead.
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help

C. Basic OLS of mortality

fit.0<-lm(arf$deaths1517~arf$fb_cut+arf$hfam+arf$m_18.64+arf$minc,
          data=arf.g)

summary(fit.0)
## 
## Call:
## lm(formula = arf$deaths1517 ~ arf$fb_cut + arf$hfam + arf$m_18.64 + 
##     arf$minc, data = arf.g)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4885.9  -115.3   -57.9    35.5  6906.2 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4.102e+02  3.474e+01  11.809  < 2e-16 ***
## arf$fb_cutHigh -1.397e+02  3.507e+01  -3.984 6.92e-05 ***
## arf$hfam       -2.078e-02  7.318e-04 -28.395  < 2e-16 ***
## arf$m_18.64     2.816e-02  1.886e-04 149.320  < 2e-16 ***
## arf$minc       -6.609e-03  8.136e-04  -8.123 6.49e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 449.5 on 3098 degrees of freedom
##   (127 observations deleted due to missingness)
## Multiple R-squared:  0.963,  Adjusted R-squared:  0.9629 
## F-statistic: 2.014e+04 on 4 and 3098 DF,  p-value: < 2.2e-16

Areas where less than 14% of the population is foreign-born experience more deaths than areas with 14% or more, based on the first coefficient in the model (p<0.05).

The number of Hispanic families in a county is associated with a decrease in mortality (p<0.05). The number of males age 18 to 64 in a county is associated with an increase in mortality (p<0.05). Median household income is associated with an decrease in mortality (p<0.05).

D. Testing spatial regimes

fit.a<-lm(arf$deaths1517~arf$fb_cut+arf$hfam+arf$m_18.64+arf$minc,
          data=arf.g)

fit.h<-lm(arf$deaths1517~arf$hfam+arf$m_18.64+arf$minc,
          data=arf.g, subset=fb_cut=="High")

fit.l<-lm(arf$deaths1517~arf$hfam+arf$m_18.64+arf$minc,
          data=arf.g, subset=fb_cut=="Low")
#Chow Test

RSS2<-sum(resid(fit.l)^2) + sum(resid(fit.h)^2)
RSS0<-sum(resid(fit.a)^2)
k<-length(coef(fit.a))
n1<-as.numeric(table(arf$fb_cut)[1])
n2<-as.numeric(table(arf$fb_cut)[2])
ctest<-((RSS0 - RSS2)/k) / (RSS2 / (n1+n2- 2*k))
ctest
## [1] 39.61947
df(ctest, df1 = k, df2=n1+n2-2*k)
## [1] 2.899985e-39

The test is significant, it seem plausible that non-stationarity is present.

summary(fit.h)
## 
## Call:
## lm(formula = arf$deaths1517 ~ arf$hfam + arf$m_18.64 + arf$minc, 
##     data = arf.g, subset = fb_cut == "High")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2792.6  -267.4  -182.6     8.7  4079.4 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.784e+02  1.837e+02   0.971    0.332    
## arf$hfam    -2.408e-03  2.529e-03  -0.952    0.342    
## arf$m_18.64  2.175e-02  7.956e-04  27.336   <2e-16 ***
## arf$minc     3.144e-03  3.998e-03   0.786    0.433    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 705 on 215 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.9827, Adjusted R-squared:  0.9824 
## F-statistic:  4063 on 3 and 215 DF,  p-value: < 2.2e-16
summary(fit.l)
## 
## Call:
## lm(formula = arf$deaths1517 ~ arf$hfam + arf$m_18.64 + arf$minc, 
##     data = arf.g, subset = fb_cut == "Low")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4957.4   -99.5   -38.4    44.5  5314.7 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.546e+02  3.306e+01   13.75   <2e-16 ***
## arf$hfam    -2.283e-02  1.100e-03  -20.75   <2e-16 ***
## arf$m_18.64  2.892e-02  2.011e-04  143.84   <2e-16 ***
## arf$minc    -8.360e-03  7.733e-04  -10.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 409.6 on 2878 degrees of freedom
##   (124 observations deleted due to missingness)
## Multiple R-squared:  0.9541, Adjusted R-squared:  0.9541 
## F-statistic: 1.994e+04 on 3 and 2878 DF,  p-value: < 2.2e-16

The number of males ages 18 to 64 is the only significant predictor of mortality in areas where more than 14% of the population is foreign-born (p<0.05). In the “low” model all terms are significant predictors of area mortality (p<0.05).

a. Spatial regime model building through interactions

fit.i1<-lm(arf$deaths1517~arf$fb_cut+arf$hfam+arf$m_18.64+arf$minc,
          data=arf.g)
fit.i2<-lm(arf$deaths1517~arf$fb_cut/arf$hfam+arf$m_18.64+arf$minc,
          data=arf.g)


anova(fit.i1, fit.i2, test="F")
## Analysis of Variance Table
## 
## Model 1: arf$deaths1517 ~ arf$fb_cut + arf$hfam + arf$m_18.64 + arf$minc
## Model 2: arf$deaths1517 ~ arf$fb_cut/arf$hfam + arf$m_18.64 + arf$minc
##   Res.Df       RSS Df Sum of Sq      F    Pr(>F)    
## 1   3098 626024545                                  
## 2   3097 565452672  1  60571873 331.75 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The ANOVA from the interaction models does confirm non-stationarity (F=331.75, p<0.05). This method provided stronger evidence than the Chow Test.

Spatially auto-regressive model for spatial regimes analysis

arf.use<-filter(arf.g, is.na(arf.g$fb_cut)==F)
#k= 4 nearest neighbors for the whole dataset
nbs<-knearneigh(coordinates(as_Spatial(arf.use)) , k=4)
nbs<-knn2nb(nbs,sym=T)
wts<-nb2listw(nbs)

# k = 4  nearest neighbors for the High foreign-born counties

nbsmg_h<-knearneigh(coordinates(as_Spatial(arf.use[arf.use$fb_cut=="High",])), k=4)
nbs_h<-knn2nb(nbsmg_h, sym=T)
wts_h<-nb2listw(nbs_h)

# k = 4  nearest neighbors for the High foreign-born counties
nbsmg_l<-knearneigh(coordinates(as_Spatial(arf.use[arf.use$fb_cut=="Low",])), k=4)
nbs_l<-knn2nb(nbsmg_l, sym=T)
wts_l<-nb2listw(nbs_l)

Global model

efit<-errorsarlm(deaths1517~fb_cut+hfam+m_18.64+minc,
          data=arf.use,
          listw=wts,
          method= "SE_classic")

summary(efit, Nagelkerke=T)
## 
## Call:errorsarlm(formula = deaths1517 ~ fb_cut + hfam + m_18.64 + minc, 
##     data = arf.use, listw = wts, method = "SE_classic")
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -4264.400  -106.541   -49.384    37.378  5580.661 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                Estimate  Std. Error  z value  Pr(>|z|)
## (Intercept)  4.7086e+02  4.1365e+01  11.3829 < 2.2e-16
## fb_cutHigh  -1.5480e+02  3.6144e+01  -4.2827 1.846e-05
## hfam        -1.9225e-02  6.9126e-04 -27.8111 < 2.2e-16
## m_18.64      2.7529e-02  1.8256e-04 150.7913 < 2.2e-16
## minc        -7.5864e-03  9.1698e-04  -8.2733 2.220e-16
## 
## Lambda: 0.475, LR test value: 451.8, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 4.494e-06
##     z-value: 105700, p-value: < 2.22e-16
## Wald statistic: 1.1172e+10, p-value: < 2.22e-16
## 
## Log likelihood: -23114.06 for error model
## ML residual variance (sigma squared): 164900, (sigma: 406.08)
## Nagelkerke pseudo-R-squared: 0.96799 
## Number of observations: 3101 
## Number of parameters estimated: 7 
## AIC: 46242, (AIC for lm: 46692)

Saturated model

efit2<-errorsarlm(deaths1517~fb_cut/hfam+m_18.64+minc,
          data=arf.use,
          listw=wts,
          method= "SE_classic")

summary(efit2, Nagelkerke=T)
## 
## Call:errorsarlm(formula = deaths1517 ~ fb_cut/hfam + m_18.64 + minc, 
##     data = arf.use, listw = wts, method = "SE_classic")
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -3880.759   -91.576   -36.256    37.361  5294.340 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                    Estimate  Std. Error  z value Pr(>|z|)
## (Intercept)      4.5108e+02  3.9341e+01  11.4660   <2e-16
## fb_cutHigh      -1.0609e+01  3.5536e+01  -0.2985   0.7653
## m_18.64          2.6783e-02  1.8023e-04 148.6002   <2e-16
## minc            -7.8772e-03  8.7401e-04  -9.0126   <2e-16
## fb_cutLow:hfam   2.4991e-02  2.6740e-03   9.3458   <2e-16
## fb_cutHigh:hfam -1.7304e-02  6.7150e-04 -25.7698   <2e-16
## 
## Lambda: 0.464, LR test value: 414.61, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 4.4623e-06
##     z-value: 103980, p-value: < 2.22e-16
## Wald statistic: 1.0812e+10, p-value: < 2.22e-16
## 
## Log likelihood: -22974.89 for error model
## ML residual variance (sigma squared): 151180, (sigma: 388.81)
## Nagelkerke pseudo-R-squared: 0.97074 
## Number of observations: 3101 
## Number of parameters estimated: 8 
## AIC: 45966, (AIC for lm: 46378)
anova(efit, efit2)
##       Model df   AIC logLik Test L.Ratio p-value
## efit      1  7 46242 -23114    1                
## efit2     2  8 45966 -22975    2  278.34       0

E.

The test indicates that the two regimes have different regression effects.

F. Spatial regime models - results

efit1a<-errorsarlm(deaths1517~hfam+m_18.64+minc,
            data=arf.use[arf.use$fb_cut=="High",], 
            listw=wts_h)
## Warning in errorsarlm(deaths1517 ~ hfam + m_18.64 + minc, data = arf.use[arf.use$fb_cut == : inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16 
##   reciprocal condition number = 3.75352e-18 - using numerical Hessian.
efit2a<-errorsarlm(deaths1517~hfam+m_18.64+minc,
            data=arf.use[arf.use$fb_cut=="Low",], 
            listw=wts_l)
summary(efit1a, Nagelkerke=T)
## 
## Call:
## errorsarlm(formula = deaths1517 ~ hfam + m_18.64 + minc, data = arf.use[arf.use$fb_cut == 
##     "High", ], listw = wts_h)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -4028.11  -331.03  -151.31   185.06  5265.42 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                Estimate  Std. Error z value  Pr(>|z|)
## (Intercept)  1.0147e+03  3.0833e+02  3.2908  0.000999
## hfam        -8.3487e-03  2.1396e-03 -3.9020  9.54e-05
## m_18.64      2.4016e-02  5.9988e-04 40.0348 < 2.2e-16
## minc        -1.5147e-02  5.4460e-03 -2.7813  0.005413
## 
## Lambda: 0.43451, LR test value: 25.355, p-value: 4.7699e-07
## Approximate (numerical Hessian) standard error: 0.077237
##     z-value: 5.6257, p-value: 1.8475e-08
## Wald statistic: 31.649, p-value: 1.8475e-08
## 
## Log likelihood: -1826.617 for error model
## ML residual variance (sigma squared): 1059700, (sigma: 1029.4)
## Nagelkerke pseudo-R-squared: 0.97669 
## Number of observations: 218 
## Number of parameters estimated: 6 
## AIC: 3665.2, (AIC for lm: 3688.6)

In the “High” regime model all predictors are significant (p<0.05). The number of Hispanic families is negatively associated with all-cause morality and is more significant in the global model. The number of males 18 to 64 is positively associated with all-cause morality and holds the same level of statistical significance as in the global model. Median household income is negatively associated with all-cause morality and is more significant in the global model.There is less spatial autocorrelation (0.43451) in the “High” regime model and its AIC (3665.2) is substantially lower than the global model. Overall, this model performs much better than the global models.

summary(efit2a, Nagelkerke=T)
## 
## Call:
## errorsarlm(formula = deaths1517 ~ hfam + m_18.64 + minc, data = arf.use[arf.use$fb_cut == 
##     "Low", ], listw = wts_l)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -2893.9091   -54.1219    -8.9297    45.8253  2554.8203 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                Estimate  Std. Error  z value  Pr(>|z|)
## (Intercept)  3.6675e+02  2.4662e+01  14.8709 < 2.2e-16
## hfam        -1.5396e-02  1.9418e-03  -7.9287  2.22e-15
## m_18.64      3.2368e-02  1.7858e-04 181.2458 < 2.2e-16
## minc        -7.5780e-03  5.5794e-04 -13.5821 < 2.2e-16
## 
## Lambda: 0.45618, LR test value: 396.55, p-value: < 2.22e-16
## Asymptotic standard error: 0.021763
##     z-value: 20.962, p-value: < 2.22e-16
## Wald statistic: 439.39, p-value: < 2.22e-16
## 
## Log likelihood: -19802.56 for error model
## ML residual variance (sigma squared): 51418, (sigma: 226.76)
## Nagelkerke pseudo-R-squared: 0.96192 
## Number of observations: 2883 
## Number of parameters estimated: 6 
## AIC: 39617, (AIC for lm: 40012)

In the “Low” regime model all predictors are significant (p<0.05). The number of Hispanic families is negatively associated with all-cause morality and has statistical significance similar to the saturated global model. The number of males 18 to 64 is positively associated with all-cause morality and holds the same level of statistical significance as in the saturated global model. Median household income is negatively associated with all-cause morality.There is less spatial autocorrelation (0.45618) in the “Low” regime model and its AIC (39617) is substantially lower than the global models. Overall, this model performs somewhat better than the global models.

Overall, the “High” regime model performed well.