library(rgdal)
library(spdep)
library(INLA)
library(tigris)
library(tidycensus)
library(tidyverse)
library(spdep)
library(MASS)
library(spatialreg)
library(ggplot2)
library(rio)
library(sas7bdat)
library(haven)
library(dplyr)
library(sf)
library(stringr)
For this analysis,I am using Area Health Resource Files for year 2020 from the Health Resource and Service Administration (HRSA). I am going to use Crude Death Rate as the numerator and total population as the denominator/offset variable.
There are four predictors in this analysis: poverty rate in 2018, child poverty in 2018, and medium house value in 2014,and the median household income for 2014.
ahrf2018 <- read_sas("C:/Users/chris/OneDrive - University of Texas at San Antonio/FALL 2021/FALL 2021/SPATIAL DEMOGRAPHY DEM 7263/ACS DATA/ahrf2020.sas7bdat")
ahrf2018<-ahrf2018%>%
mutate(cofips=as.factor(f00004),
coname=f00010,
state = f00011,
pop = f1198418,
death = f1255818,
pov = f1332118,
childpov= f1332218,
medhouvl=f1461314,
rucc= as.numeric(f0002013),
medhinc= f1434514,
obgyn10_pc= 1000*(f1168410/ f0453010) )%>%
dplyr::select(pop, death, state, cofips, coname, pov, childpov,rucc, medhinc, medhouvl)%>%
filter(complete.cases(.))
#as.data.frame()
head(ahrf2018)
## # A tibble: 6 x 10
## pop death state cofips coname pov childpov rucc medhinc medhouvl
## <dbl> <dbl> <chr> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 55601 541 01 01001 Autauga 13.8 19.3 2 58786 147900
## 2 218022 2326 01 01003 Baldwin 9.8 13.9 3 55962 189800
## 3 24881 312 01 01005 Barbour 30.9 43.9 6 34186 92900
## 4 22400 252 01 01007 Bibb 21.8 27.8 1 45340 96500
## 5 57840 657 01 01009 Blount 13.2 18 1 48695 124700
## 6 10138 109 01 01011 Bullock 42.5 68.3 6 32152 77500
summary(ahrf2018)
## pop death state cofips
## Min. : 277 Min. : 0 Length:3139 02001 : 29
## 1st Qu.: 10958 1st Qu.: 120 Class :character 51015 : 3
## Median : 25776 Median : 282 Mode :character 51059 : 3
## Mean : 104214 Mean : 903 51153 : 3
## 3rd Qu.: 67980 3rd Qu.: 700 51161 : 3
## Max. :10105518 Max. :68164 51163 : 3
## (Other):3095
## coname pov childpov rucc
## Length:3139 Min. : 2.60 Min. : 2.50 Min. :1.000
## Class :character 1st Qu.:10.80 1st Qu.:14.50 1st Qu.:2.000
## Mode :character Median :14.10 Median :20.10 Median :6.000
## Mean :15.16 Mean :21.11 Mean :5.008
## 3rd Qu.:18.30 3rd Qu.:26.30 3rd Qu.:7.000
## Max. :54.00 Max. :68.30 Max. :9.000
##
## medhinc medhouvl
## Min. : 20188 Min. : 20700
## 1st Qu.: 42479 1st Qu.: 93750
## Median : 49887 Median : 122600
## Mean : 51570 Mean : 147156
## 3rd Qu.: 57585 3rd Qu.: 168300
## Max. :136268 Max. :1056500
##
options(tigris_class="sf")
usco<-counties(cb=T, year=2010)
usco$cofips<-substr(usco$GEO_ID, 10, 15)
sts<-states(cb = T, year=2010)
sts<-st_boundary(sts)%>%
filter(!STATE %in% c("02", "15", "60", "66", "69", "72", "78"))
ahrf2018_m<-geo_join(usco, ahrf2018, by_sp="cofips", by_df="cofips",how="left" )
## Warning: We recommend using the dplyr::*_join() family of functions instead.
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
ahrf2018_m%>%
filter(!STATE %in% c("02", "15", "60", "66", "69", "72", "78"))%>%
mutate(cdr=(death/pop)*1000)%>% #Calculating Crude death Rate
mutate(cdr_range = cut(cdr, breaks=quantile(cdr, p=seq(0,1,length.out = 6), na.rm=T ), include.lowest=T ))%>% #Making quantile map
ggplot()+
geom_sf(aes(fill=cdr_range, color=NA))+
scale_color_brewer(palette = "Blues")+
scale_fill_brewer(palette = "Blues",na.value = "grey50")+
geom_sf(data=sts, color="black")+
coord_sf(crs = 2163)+
ggtitle(label = "Crude Death Rates, United States, 2018")
## Application of Count Data Models ## Negative Binomial Model
ahrf2018wadt<-filter(ahrf2018_m, is.na(death)==F)
fit_nb<- glm.nb(death ~ offset(log(pop)) + scale(pov) + scale(childpov) + scale(medhouvl) + scale(medhinc),
data=ahrf2018wadt)
summary(fit_nb)
##
## Call:
## glm.nb(formula = death ~ offset(log(pop)) + scale(pov) + scale(childpov) +
## scale(medhouvl) + scale(medhinc), data = ahrf2018wadt, init.theta = 32.72776372,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.4867 -0.6130 0.0104 0.5821 3.7505
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.569172 0.003439 -1328.541 < 2e-16 ***
## scale(pov) -0.202713 0.010070 -20.130 < 2e-16 ***
## scale(childpov) 0.194451 0.010384 18.726 < 2e-16 ***
## scale(medhouvl) -0.022154 0.005003 -4.429 9.49e-06 ***
## scale(medhinc) -0.128960 0.007318 -17.624 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(32.7278) family taken to be 1)
##
## Null deviance: 5500.9 on 3071 degrees of freedom
## Residual deviance: 3297.3 on 3067 degrees of freedom
## AIC: 33880
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 32.73
## Std. Err.: 1.00
##
## 2 x log-likelihood: -33867.64
The result show that all the predictors are significant. However, a unit increase in childhood poverty led to 19.4% increase in mortality while a unit increase in the other predictors led to varying decrease in mortality.
scale<-sqrt(fit_nb$deviance/fit_nb$df.residual)
scale
## [1] 1.036872
This value of 1.036872 is greater than 1 which means the mean and the variance are not equal, which suggests that the variance is greater than the mean. Therefore, there’s an over-dispersion.
Key part here is to make numeric identifier for each geography!
#In INLA, we don't need FIPS codes, we need a simple numeric index for our counties
ahrf2018wadt$struct<-1:dim(ahrf2018wadt)[1]
nbs<-knearneigh(st_centroid(ahrf2018wadt), k = 5, longlat = T) #k=5 nearest neighbors
## Warning in st_centroid.sf(ahrf2018wadt): st_centroid assumes attributes are
## constant over geometries of x
## Warning in knearneigh(st_centroid(ahrf2018wadt), k = 5, longlat = T):
## dnearneigh: longlat argument overrides object
nbs<-knn2nb(nbs, row.names = ahrf2018wadt$struct, sym = T) #force symmetry!!
mat <- nb2mat(nbs, style="B",zero.policy=TRUE)
colnames(mat) <- rownames(mat)
mat <- as.matrix(mat[1:dim(mat)[1], 1:dim(mat)[1]])
nb2INLA("cl_graph",nbs)
am_adj <-paste(getwd(),"/cl_graph",sep="")
H<-inla.read.graph(filename="cl_graph")
f4 <- death ~ offset(log(pop)) + scale(pov) + scale(childpov) + scale(medhouvl) + scale(medhinc) +
f(struct, model = "bym",scale.model = T, constr = T, graph = H)
m_bym<-inla(formula = f4,data = ahrf2018wadt,
family = "poisson",
control.predictor = list(compute = TRUE),
control.compute = list(dic = TRUE, waic = TRUE),
num.threads = 3,
verbose = F)
summary(m_bym)
##
## Call:
## c("inla(formula = f4, family = \"poisson\", data = ahrf2018wadt,
## verbose = F, ", " control.compute = list(dic = TRUE, waic = TRUE),
## control.predictor = list(compute = TRUE), ", " num.threads = 3)")
## Time used:
## Pre = 0.845, Running = 29, Post = 0.967, Total = 30.8
## Fixed effects:
## mean sd 0.025quant 0.5quant 0.975quant mode kld
## (Intercept) -4.588 0.002 -4.592 -4.588 -4.583 -4.588 0
## scale(pov) -0.218 0.009 -0.236 -0.218 -0.200 -0.218 0
## scale(childpov) 0.216 0.010 0.197 0.216 0.235 0.216 0
## scale(medhouvl) -0.035 0.006 -0.048 -0.035 -0.022 -0.035 0
## scale(medhinc) -0.113 0.008 -0.129 -0.113 -0.098 -0.113 0
##
## Random effects:
## Name Model
## struct BYM model
##
## Model hyperparameters:
## mean sd 0.025quant 0.5quant
## Precision for struct (iid component) 121.62 15.30 94.96 120.39
## Precision for struct (spatial component) 28.49 2.84 23.20 28.39
## 0.975quant mode
## Precision for struct (iid component) 154.93 117.70
## Precision for struct (spatial component) 34.35 28.27
##
## Expected number of effective parameters(stdev): 2416.23(13.52)
## Number of equivalent replicates : 1.27
##
## Deviance Information Criterion (DIC) ...............: 28857.19
## Deviance Information Criterion (DIC, saturated) ....: 5728.98
## Effective number of parameters .....................: 2418.77
##
## Watanabe-Akaike information criterion (WAIC) ...: 28415.74
## Effective number of parameters .................: 1454.26
##
## Marginal log-Likelihood: -15474.15
## Posterior marginals for the linear predictor and
## the fitted values are computed
fit_nb$aic
## [1] 33879.64
m_bym$waic$waic
## [1] 28415.74
The bym model which is the spatial correlation looks the better fits for the data.