library(rgdal)
library(spdep)
library(INLA)
library(tigris)
library(tidycensus)
library(tidyverse)
library(spdep)
library(MASS)
library(spatialreg)
library(ggplot2)
library(rio)
library(sas7bdat)
library(haven)
library(dplyr)
library(sf)
library(stringr)

For this analysis,I am using Area Health Resource Files for year 2020 from the Health Resource and Service Administration (HRSA). I am going to use Crude Death Rate as the numerator and total population as the denominator/offset variable.

There are four predictors in this analysis: poverty rate in 2018, child poverty in 2018, and medium house value in 2014,and the median household income for 2014.

Getting the Data

ahrf2018 <- read_sas("C:/Users/chris/OneDrive - University of Texas at San Antonio/FALL 2021/FALL 2021/SPATIAL DEMOGRAPHY DEM 7263/ACS DATA/ahrf2020.sas7bdat")

REcoding Variables

ahrf2018<-ahrf2018%>%
  mutate(cofips=as.factor(f00004),  
         coname=f00010,
         state = f00011,
         pop = f1198418,
         death = f1255818,
         pov = f1332118,
         childpov= f1332218,
         medhouvl=f1461314,
         rucc= as.numeric(f0002013),
         medhinc= f1434514,
         obgyn10_pc= 1000*(f1168410/ f0453010) )%>%
        dplyr::select(pop, death, state, cofips, coname, pov, childpov,rucc, medhinc, medhouvl)%>%
  filter(complete.cases(.))
  #as.data.frame()


head(ahrf2018)
## # A tibble: 6 x 10
##      pop death state cofips coname    pov childpov  rucc medhinc medhouvl
##    <dbl> <dbl> <chr> <fct>  <chr>   <dbl>    <dbl> <dbl>   <dbl>    <dbl>
## 1  55601   541 01    01001  Autauga  13.8     19.3     2   58786   147900
## 2 218022  2326 01    01003  Baldwin   9.8     13.9     3   55962   189800
## 3  24881   312 01    01005  Barbour  30.9     43.9     6   34186    92900
## 4  22400   252 01    01007  Bibb     21.8     27.8     1   45340    96500
## 5  57840   657 01    01009  Blount   13.2     18       1   48695   124700
## 6  10138   109 01    01011  Bullock  42.5     68.3     6   32152    77500
summary(ahrf2018)
##       pop               death          state               cofips    
##  Min.   :     277   Min.   :    0   Length:3139        02001  :  29  
##  1st Qu.:   10958   1st Qu.:  120   Class :character   51015  :   3  
##  Median :   25776   Median :  282   Mode  :character   51059  :   3  
##  Mean   :  104214   Mean   :  903                      51153  :   3  
##  3rd Qu.:   67980   3rd Qu.:  700                      51161  :   3  
##  Max.   :10105518   Max.   :68164                      51163  :   3  
##                                                        (Other):3095  
##     coname               pov           childpov          rucc      
##  Length:3139        Min.   : 2.60   Min.   : 2.50   Min.   :1.000  
##  Class :character   1st Qu.:10.80   1st Qu.:14.50   1st Qu.:2.000  
##  Mode  :character   Median :14.10   Median :20.10   Median :6.000  
##                     Mean   :15.16   Mean   :21.11   Mean   :5.008  
##                     3rd Qu.:18.30   3rd Qu.:26.30   3rd Qu.:7.000  
##                     Max.   :54.00   Max.   :68.30   Max.   :9.000  
##                                                                    
##     medhinc          medhouvl      
##  Min.   : 20188   Min.   :  20700  
##  1st Qu.: 42479   1st Qu.:  93750  
##  Median : 49887   Median : 122600  
##  Mean   : 51570   Mean   : 147156  
##  3rd Qu.: 57585   3rd Qu.: 168300  
##  Max.   :136268   Max.   :1056500  
## 

Mapping Crude Death Rate

options(tigris_class="sf")
usco<-counties(cb=T, year=2010)
usco$cofips<-substr(usco$GEO_ID, 10, 15)
sts<-states(cb = T, year=2010)
sts<-st_boundary(sts)%>%
  filter(!STATE %in% c("02", "15", "60", "66", "69", "72", "78"))

ahrf2018_m<-geo_join(usco, ahrf2018, by_sp="cofips", by_df="cofips",how="left" )
## Warning: We recommend using the dplyr::*_join() family of functions instead.
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
ahrf2018_m%>%
  filter(!STATE %in% c("02", "15", "60", "66", "69", "72", "78"))%>%
  mutate(cdr=(death/pop)*1000)%>% #Calculating Crude death Rate
  mutate(cdr_range = cut(cdr, breaks=quantile(cdr, p=seq(0,1,length.out = 6), na.rm=T ), include.lowest=T ))%>% #Making quantile map
  ggplot()+
  geom_sf(aes(fill=cdr_range, color=NA))+
  scale_color_brewer(palette = "Blues")+
  scale_fill_brewer(palette = "Blues",na.value = "grey50")+
  geom_sf(data=sts, color="black")+
  coord_sf(crs = 2163)+
  ggtitle(label = "Crude Death Rates, United States, 2018")

## Application of Count Data Models ## Negative Binomial Model

ahrf2018wadt<-filter(ahrf2018_m, is.na(death)==F)

fit_nb<- glm.nb(death ~ offset(log(pop)) + scale(pov) + scale(childpov) + scale(medhouvl) + scale(medhinc), 
                  data=ahrf2018wadt)
summary(fit_nb)
## 
## Call:
## glm.nb(formula = death ~ offset(log(pop)) + scale(pov) + scale(childpov) + 
##     scale(medhouvl) + scale(medhinc), data = ahrf2018wadt, init.theta = 32.72776372, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.4867  -0.6130   0.0104   0.5821   3.7505  
## 
## Coefficients:
##                  Estimate Std. Error   z value Pr(>|z|)    
## (Intercept)     -4.569172   0.003439 -1328.541  < 2e-16 ***
## scale(pov)      -0.202713   0.010070   -20.130  < 2e-16 ***
## scale(childpov)  0.194451   0.010384    18.726  < 2e-16 ***
## scale(medhouvl) -0.022154   0.005003    -4.429 9.49e-06 ***
## scale(medhinc)  -0.128960   0.007318   -17.624  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(32.7278) family taken to be 1)
## 
##     Null deviance: 5500.9  on 3071  degrees of freedom
## Residual deviance: 3297.3  on 3067  degrees of freedom
## AIC: 33880
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  32.73 
##           Std. Err.:  1.00 
## 
##  2 x log-likelihood:  -33867.64

The result show that all the predictors are significant. However, a unit increase in childhood poverty led to 19.4% increase in mortality while a unit increase in the other predictors led to varying decrease in mortality.

Checking out over-dispersion

scale<-sqrt(fit_nb$deviance/fit_nb$df.residual)
scale
## [1] 1.036872

This value of 1.036872 is greater than 1 which means the mean and the variance are not equal, which suggests that the variance is greater than the mean. Therefore, there’s an over-dispersion.

Construction of spatial relationships:

Key part here is to make numeric identifier for each geography!

#In INLA, we don't need FIPS codes, we need a simple numeric index for our counties
ahrf2018wadt$struct<-1:dim(ahrf2018wadt)[1]  
nbs<-knearneigh(st_centroid(ahrf2018wadt), k = 5, longlat = T) #k=5 nearest neighbors
## Warning in st_centroid.sf(ahrf2018wadt): st_centroid assumes attributes are
## constant over geometries of x
## Warning in knearneigh(st_centroid(ahrf2018wadt), k = 5, longlat = T):
## dnearneigh: longlat argument overrides object
nbs<-knn2nb(nbs, row.names = ahrf2018wadt$struct, sym = T) #force symmetry!!
mat <- nb2mat(nbs, style="B",zero.policy=TRUE)
colnames(mat) <- rownames(mat) 
mat <- as.matrix(mat[1:dim(mat)[1], 1:dim(mat)[1]])


nb2INLA("cl_graph",nbs)
am_adj <-paste(getwd(),"/cl_graph",sep="")
H<-inla.read.graph(filename="cl_graph")

Use INLA to estimate spatially correlated (besag or bym model) for your count outcome.

Bym Model

f4 <- death ~ offset(log(pop)) + scale(pov) + scale(childpov) + scale(medhouvl) + scale(medhinc) +
  f(struct, model = "bym",scale.model = T, constr = T, graph = H)

m_bym<-inla(formula = f4,data = ahrf2018wadt,
           family = "poisson",
           control.predictor = list(compute = TRUE),
           control.compute = list(dic = TRUE, waic = TRUE),
           num.threads = 3, 
               verbose = F)
summary(m_bym)
## 
## Call:
##    c("inla(formula = f4, family = \"poisson\", data = ahrf2018wadt, 
##    verbose = F, ", " control.compute = list(dic = TRUE, waic = TRUE), 
##    control.predictor = list(compute = TRUE), ", " num.threads = 3)") 
## Time used:
##     Pre = 0.845, Running = 29, Post = 0.967, Total = 30.8 
## Fixed effects:
##                   mean    sd 0.025quant 0.5quant 0.975quant   mode kld
## (Intercept)     -4.588 0.002     -4.592   -4.588     -4.583 -4.588   0
## scale(pov)      -0.218 0.009     -0.236   -0.218     -0.200 -0.218   0
## scale(childpov)  0.216 0.010      0.197    0.216      0.235  0.216   0
## scale(medhouvl) -0.035 0.006     -0.048   -0.035     -0.022 -0.035   0
## scale(medhinc)  -0.113 0.008     -0.129   -0.113     -0.098 -0.113   0
## 
## Random effects:
##   Name     Model
##     struct BYM model
## 
## Model hyperparameters:
##                                            mean    sd 0.025quant 0.5quant
## Precision for struct (iid component)     121.62 15.30      94.96   120.39
## Precision for struct (spatial component)  28.49  2.84      23.20    28.39
##                                          0.975quant   mode
## Precision for struct (iid component)         154.93 117.70
## Precision for struct (spatial component)      34.35  28.27
## 
## Expected number of effective parameters(stdev): 2416.23(13.52)
## Number of equivalent replicates : 1.27 
## 
## Deviance Information Criterion (DIC) ...............: 28857.19
## Deviance Information Criterion (DIC, saturated) ....: 5728.98
## Effective number of parameters .....................: 2418.77
## 
## Watanabe-Akaike information criterion (WAIC) ...: 28415.74
## Effective number of parameters .................: 1454.26
## 
## Marginal log-Likelihood:  -15474.15 
## Posterior marginals for the linear predictor and
##  the fitted values are computed

Model fit

fit_nb$aic
## [1] 33879.64
m_bym$waic$waic
## [1] 28415.74

The bym model which is the spatial correlation looks the better fits for the data.