library(haven)
library(foreign)
library(readr)
library(dplyr)
library(ggplot2)
library(broom)
library(car)
library(MASS)
library(lmtest)
library(zoo)
library(nortest)
library(plotrix)
library(scales)
library(tableone)
library(Weighted.Desc.Stat)
library(mitools)
library(survey)
library(VGAM)
library(stargazer)
library(sandwich)
library(pastecs)
library(muhaz)
library(ggpubr)
library(survminer)
library(eha)
library(reshape2)
library(data.table)
library(magrittr)
library(tidyverse)
library(sjmisc)
library(sjPlot)
library(sjmisc)
library(sjlabelled)
library(weights)
library(GGally)
library(tigris)
library(RColorBrewer)
library(patchwork)
library(tidycensus)
library(censusapi)
library(spdep)
library(classInt)
library(sf)
library(spatialreg)
arf<-read_sas("C://Users//Jaire//OneDrive//Desktop//Spatial Demography//Data//ahrf2019.sas7bdat")
# Outcome - deaths (2015-2017) arf$deaths1517
# Grouping.V - foreign-born arf$fb
# Predictors:
# number of Hispanic families arf$hfam
# number of males ages 18-64 arf$m_18.64
# Median household income arf$minc
set.seed(123)
drop_na(arf)
arf$cofips=arf$f00002
arf$coname=arf$f00010
arf$state = arf$f00008
arf$fb=arf$f1433613
arf$tpop=arf$f1332510
arf$deaths1517=arf$f1194115
arf$hfam=arf$f1488710
arf$m_18.64=arf$f1476410
arf$minc=arf$f1322610
scale(arf$deaths1517)
scale(arf$hfam)
scale(arf$m_18.64)
scale(arf$minc)
# I construct a grouping variable based on the estimate of immigrants within counties. I divide the estimate of foreign-born by the total population in 2010 to calculate the percent estimate of immigrants within counties.
# I then create regimes based on the percent of foreign-born residents (AHRF); counties whose foreign-born population makes up 14% or more of the county's total population are "High foreign-born" and counties with less than 14% are "Low foreign-born".
arf$p.fb<-c(arf$fb/arf$tpop)
arf$fb_cut<-cut(as.numeric(arf$p.fb),
breaks = c(0,.139,.59),
include.lowest = T,
labels = c("Low", "High"))
summary(arf$fb_cut)
## Low High NA's
## 2996 224 10
options(tigris_class="sf")
usco<-counties(cb=T, year=2010)
usco$cofips<-substr(usco$GEO_ID, 10, 15)
sts<-states(cb = T, year=2010)
sts<-st_boundary(sts)
arf.g<-geo_join(usco, arf, by_sp="cofips", by_df="cofips",how="left" )
## Warning: We recommend using the dplyr::*_join() family of functions instead.
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
fit.0<-lm(arf$deaths1517~arf$fb_cut+arf$hfam+arf$m_18.64+arf$minc,
data=arf.g)
summary(fit.0)
##
## Call:
## lm(formula = arf$deaths1517 ~ arf$fb_cut + arf$hfam + arf$m_18.64 +
## arf$minc, data = arf.g)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4885.9 -115.3 -57.9 35.5 6906.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.102e+02 3.474e+01 11.809 < 2e-16 ***
## arf$fb_cutHigh -1.397e+02 3.507e+01 -3.984 6.92e-05 ***
## arf$hfam -2.078e-02 7.318e-04 -28.395 < 2e-16 ***
## arf$m_18.64 2.816e-02 1.886e-04 149.320 < 2e-16 ***
## arf$minc -6.609e-03 8.136e-04 -8.123 6.49e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 449.5 on 3098 degrees of freedom
## (127 observations deleted due to missingness)
## Multiple R-squared: 0.963, Adjusted R-squared: 0.9629
## F-statistic: 2.014e+04 on 4 and 3098 DF, p-value: < 2.2e-16
Areas where less than 14% of the population is foreign-born experience more deaths than areas with 14% or more, based on the first coefficient in the model (p<0.05).
The number of Hispanic families in a county is associated with a decrease in mortality (p<0.05). The number of males age 18 to 64 in a county is associated with an increase in mortality (p<0.05). Median household income is associated with an decrease in mortality (p<0.05).
fit.a<-lm(arf$deaths1517~arf$fb_cut+arf$hfam+arf$m_18.64+arf$minc,
data=arf.g)
fit.h<-lm(arf$deaths1517~arf$hfam+arf$m_18.64+arf$minc,
data=arf.g, subset=fb_cut=="High")
fit.l<-lm(arf$deaths1517~arf$hfam+arf$m_18.64+arf$minc,
data=arf.g, subset=fb_cut=="Low")
#Chow Test
RSS2<-sum(resid(fit.l)^2) + sum(resid(fit.h)^2)
RSS0<-sum(resid(fit.a)^2)
k<-length(coef(fit.a))
n1<-as.numeric(table(arf$fb_cut)[1])
n2<-as.numeric(table(arf$fb_cut)[2])
ctest<-((RSS0 - RSS2)/k) / (RSS2 / (n1+n2- 2*k))
ctest
## [1] 39.61947
df(ctest, df1 = k, df2=n1+n2-2*k)
## [1] 2.899985e-39
The test is significant, it seem plausible that non-stationarity is present.
summary(fit.h)
##
## Call:
## lm(formula = arf$deaths1517 ~ arf$hfam + arf$m_18.64 + arf$minc,
## data = arf.g, subset = fb_cut == "High")
##
## Residuals:
## Min 1Q Median 3Q Max
## -2792.6 -267.4 -182.6 8.7 4079.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.784e+02 1.837e+02 0.971 0.332
## arf$hfam -2.408e-03 2.529e-03 -0.952 0.342
## arf$m_18.64 2.175e-02 7.956e-04 27.336 <2e-16 ***
## arf$minc 3.144e-03 3.998e-03 0.786 0.433
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 705 on 215 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.9827, Adjusted R-squared: 0.9824
## F-statistic: 4063 on 3 and 215 DF, p-value: < 2.2e-16
summary(fit.l)
##
## Call:
## lm(formula = arf$deaths1517 ~ arf$hfam + arf$m_18.64 + arf$minc,
## data = arf.g, subset = fb_cut == "Low")
##
## Residuals:
## Min 1Q Median 3Q Max
## -4957.4 -99.5 -38.4 44.5 5314.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.546e+02 3.306e+01 13.75 <2e-16 ***
## arf$hfam -2.283e-02 1.100e-03 -20.75 <2e-16 ***
## arf$m_18.64 2.892e-02 2.011e-04 143.84 <2e-16 ***
## arf$minc -8.360e-03 7.733e-04 -10.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 409.6 on 2878 degrees of freedom
## (124 observations deleted due to missingness)
## Multiple R-squared: 0.9541, Adjusted R-squared: 0.9541
## F-statistic: 1.994e+04 on 3 and 2878 DF, p-value: < 2.2e-16
The number of males ages 18 to 64 is the only significant predictor of mortality in areas where more than 14% of the population is foreign-born (p<0.05). In the “low” model all terms are significant predictors of area mortality (p<0.05).
fit.i1<-lm(arf$deaths1517~arf$fb_cut+arf$hfam+arf$m_18.64+arf$minc,
data=arf.g)
fit.i2<-lm(arf$deaths1517~arf$fb_cut/arf$hfam+arf$m_18.64+arf$minc,
data=arf.g)
anova(fit.i1, fit.i2, test="F")
## Analysis of Variance Table
##
## Model 1: arf$deaths1517 ~ arf$fb_cut + arf$hfam + arf$m_18.64 + arf$minc
## Model 2: arf$deaths1517 ~ arf$fb_cut/arf$hfam + arf$m_18.64 + arf$minc
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 3098 626024545
## 2 3097 565452672 1 60571873 331.75 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The ANOVA from the interaction models does confirm non-stationarity (F=331.75, p<0.05). This method provided stronger evidence than the Chow Test.
arf.use<-filter(arf.g, is.na(arf.g$fb_cut)==F)
#k= 4 nearest neighbors for the whole dataset
nbs<-knearneigh(coordinates(as_Spatial(arf.use)) , k=4)
nbs<-knn2nb(nbs,sym=T)
wts<-nb2listw(nbs)
# k = 4 nearest neighbors for the High foreign-born counties
nbsmg_h<-knearneigh(coordinates(as_Spatial(arf.use[arf.use$fb_cut=="High",])), k=4)
nbs_h<-knn2nb(nbsmg_h, sym=T)
wts_h<-nb2listw(nbs_h)
# k = 4 nearest neighbors for the High foreign-born counties
nbsmg_l<-knearneigh(coordinates(as_Spatial(arf.use[arf.use$fb_cut=="Low",])), k=4)
nbs_l<-knn2nb(nbsmg_l, sym=T)
wts_l<-nb2listw(nbs_l)
efit<-errorsarlm(deaths1517~fb_cut+hfam+m_18.64+minc,
data=arf.use,
listw=wts,
method= "SE_classic")
summary(efit, Nagelkerke=T)
##
## Call:errorsarlm(formula = deaths1517 ~ fb_cut + hfam + m_18.64 + minc,
## data = arf.use, listw = wts, method = "SE_classic")
##
## Residuals:
## Min 1Q Median 3Q Max
## -4264.400 -106.541 -49.384 37.378 5580.661
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.7086e+02 4.1365e+01 11.3829 < 2.2e-16
## fb_cutHigh -1.5480e+02 3.6144e+01 -4.2827 1.846e-05
## hfam -1.9225e-02 6.9126e-04 -27.8111 < 2.2e-16
## m_18.64 2.7529e-02 1.8256e-04 150.7913 < 2.2e-16
## minc -7.5864e-03 9.1698e-04 -8.2733 2.220e-16
##
## Lambda: 0.475, LR test value: 451.8, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 4.494e-06
## z-value: 105700, p-value: < 2.22e-16
## Wald statistic: 1.1172e+10, p-value: < 2.22e-16
##
## Log likelihood: -23114.06 for error model
## ML residual variance (sigma squared): 164900, (sigma: 406.08)
## Nagelkerke pseudo-R-squared: 0.96799
## Number of observations: 3101
## Number of parameters estimated: 7
## AIC: 46242, (AIC for lm: 46692)
efit2<-errorsarlm(deaths1517~fb_cut/hfam+m_18.64+minc,
data=arf.use,
listw=wts,
method= "SE_classic")
summary(efit2, Nagelkerke=T)
##
## Call:errorsarlm(formula = deaths1517 ~ fb_cut/hfam + m_18.64 + minc,
## data = arf.use, listw = wts, method = "SE_classic")
##
## Residuals:
## Min 1Q Median 3Q Max
## -3880.759 -91.576 -36.256 37.361 5294.340
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.5108e+02 3.9341e+01 11.4660 <2e-16
## fb_cutHigh -1.0609e+01 3.5536e+01 -0.2985 0.7653
## m_18.64 2.6783e-02 1.8023e-04 148.6002 <2e-16
## minc -7.8772e-03 8.7401e-04 -9.0126 <2e-16
## fb_cutLow:hfam 2.4991e-02 2.6740e-03 9.3458 <2e-16
## fb_cutHigh:hfam -1.7304e-02 6.7150e-04 -25.7698 <2e-16
##
## Lambda: 0.464, LR test value: 414.61, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 4.4623e-06
## z-value: 103980, p-value: < 2.22e-16
## Wald statistic: 1.0812e+10, p-value: < 2.22e-16
##
## Log likelihood: -22974.89 for error model
## ML residual variance (sigma squared): 151180, (sigma: 388.81)
## Nagelkerke pseudo-R-squared: 0.97074
## Number of observations: 3101
## Number of parameters estimated: 8
## AIC: 45966, (AIC for lm: 46378)
anova(efit, efit2)
## Model df AIC logLik Test L.Ratio p-value
## efit 1 7 46242 -23114 1
## efit2 2 8 45966 -22975 2 278.34 0
The test indicates that the two regimes have different regression effects.
efit1a<-errorsarlm(deaths1517~hfam+m_18.64+minc,
data=arf.use[arf.use$fb_cut=="High",],
listw=wts_h)
## Warning in errorsarlm(deaths1517 ~ hfam + m_18.64 + minc, data = arf.use[arf.use$fb_cut == : inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16
## reciprocal condition number = 3.75352e-18 - using numerical Hessian.
efit2a<-errorsarlm(deaths1517~hfam+m_18.64+minc,
data=arf.use[arf.use$fb_cut=="Low",],
listw=wts_l)
summary(efit1a, Nagelkerke=T)
##
## Call:
## errorsarlm(formula = deaths1517 ~ hfam + m_18.64 + minc, data = arf.use[arf.use$fb_cut ==
## "High", ], listw = wts_h)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4028.11 -331.03 -151.31 185.06 5265.42
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.0147e+03 3.0833e+02 3.2908 0.000999
## hfam -8.3487e-03 2.1396e-03 -3.9020 9.54e-05
## m_18.64 2.4016e-02 5.9988e-04 40.0348 < 2.2e-16
## minc -1.5147e-02 5.4460e-03 -2.7813 0.005413
##
## Lambda: 0.43451, LR test value: 25.355, p-value: 4.7699e-07
## Approximate (numerical Hessian) standard error: 0.077237
## z-value: 5.6257, p-value: 1.8475e-08
## Wald statistic: 31.649, p-value: 1.8475e-08
##
## Log likelihood: -1826.617 for error model
## ML residual variance (sigma squared): 1059700, (sigma: 1029.4)
## Nagelkerke pseudo-R-squared: 0.97669
## Number of observations: 218
## Number of parameters estimated: 6
## AIC: 3665.2, (AIC for lm: 3688.6)
In the “High” regime model all predictors are significant (p<0.05). The number of Hispanic families is negatively associated with all-cause morality and is more significant in the global model. The number of males 18 to 64 is positively associated with all-cause morality and holds the same level of statistical significance as in the global model. Median household income is negatively associated with all-cause morality and is more significant in the global model.There is less spatial autocorrelation (0.43451) in the “High” regime model and its AIC (3665.2) is substantially lower than the global model. Overall, this model performs much better than the global models.
summary(efit2a, Nagelkerke=T)
##
## Call:
## errorsarlm(formula = deaths1517 ~ hfam + m_18.64 + minc, data = arf.use[arf.use$fb_cut ==
## "Low", ], listw = wts_l)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2893.9091 -54.1219 -8.9297 45.8253 2554.8203
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.6675e+02 2.4662e+01 14.8709 < 2.2e-16
## hfam -1.5396e-02 1.9418e-03 -7.9287 2.22e-15
## m_18.64 3.2368e-02 1.7858e-04 181.2458 < 2.2e-16
## minc -7.5780e-03 5.5794e-04 -13.5821 < 2.2e-16
##
## Lambda: 0.45618, LR test value: 396.55, p-value: < 2.22e-16
## Asymptotic standard error: 0.021763
## z-value: 20.962, p-value: < 2.22e-16
## Wald statistic: 439.39, p-value: < 2.22e-16
##
## Log likelihood: -19802.56 for error model
## ML residual variance (sigma squared): 51418, (sigma: 226.76)
## Nagelkerke pseudo-R-squared: 0.96192
## Number of observations: 2883
## Number of parameters estimated: 6
## AIC: 39617, (AIC for lm: 40012)
In the “Low” regime model all predictors are significant (p<0.05). The number of Hispanic families is negatively associated with all-cause morality and has statistical significance similar to the saturated global model. The number of males 18 to 64 is positively associated with all-cause morality and holds the same level of statistical significance as in the saturated global model. Median household income is negatively associated with all-cause morality.There is less spatial autocorrelation (0.45618) in the “Low” regime model and its AIC (39617) is substantially lower than the global models. Overall, this model performs somewhat better than the global models.
Overall, the “High” regime model performed well.