DEM 7263 - Spatial GLMM(s) using the INLA Approximation

library(rgdal)
library(spdep)
library(INLA)
library(tigris)
library(tidycensus)
library(tidyverse)
library(spdep)
library(MASS)
library(spatialreg)
library(ggplot2)
library(rio)
library(sas7bdat)
library(haven)
library(dplyr)
library(sf)
library(stringr)

For this analysis,I am using Area Health Resource Files for year 2020 from the Health Resource and Service Administration (HRSA). I am going to use Crude Death Rate as the numerator and total population as the denominator/offset variable.

There are four predictors in this analysis: poverty rate in 2018, child poverty in 2018, and medium house value in 2014,and the median household income for 2014.

Getting the Data

ahrf2018 <- read_sas("C:/Users/chris/OneDrive - University of Texas at San Antonio/FALL 2021/FALL 2021/SPATIAL DEMOGRAPHY DEM 7263/ACS DATA/ahrf2020.sas7bdat")

REcoding Variables

ahrf2018<-ahrf2018%>%
  mutate(cofips=as.factor(f00004),  
         coname=f00010,
         state = f00011,
         pop = f1198418,
         death = f1255818,
         pov = f1332118,
         childpov= f1332218,
         medhouvl=f1461314,
         rucc= as.numeric(f0002013),
         medhinc= f1434514,
         obgyn10_pc= 1000*(f1168410/ f0453010) )%>%
        dplyr::select(pop, death, state, cofips, coname, pov, childpov,rucc, medhinc, medhouvl)%>%
  filter(complete.cases(.))
  #as.data.frame()


head(ahrf2018)

## # A tibble: 6 x 10
##      pop death state cofips coname    pov childpov  rucc medhinc medhouvl
##    <dbl> <dbl> <chr> <fct>  <chr>   <dbl>    <dbl> <dbl>   <dbl>    <dbl>
## 1  55601   541 01    01001  Autauga  13.8     19.3     2   58786   147900
## 2 218022  2326 01    01003  Baldwin   9.8     13.9     3   55962   189800
## 3  24881   312 01    01005  Barbour  30.9     43.9     6   34186    92900
## 4  22400   252 01    01007  Bibb     21.8     27.8     1   45340    96500
## 5  57840   657 01    01009  Blount   13.2     18       1   48695   124700
## 6  10138   109 01    01011  Bullock  42.5     68.3     6   32152    77500

summary(ahrf2018)

##       pop               death          state               cofips    
##  Min.   :     277   Min.   :    0   Length:3139        02001  :  29  
##  1st Qu.:   10958   1st Qu.:  120   Class :character   51015  :   3  
##  Median :   25776   Median :  282   Mode  :character   51059  :   3  
##  Mean   :  104214   Mean   :  903                      51153  :   3  
##  3rd Qu.:   67980   3rd Qu.:  700                      51161  :   3  
##  Max.   :10105518   Max.   :68164                      51163  :   3  
##                                                        (Other):3095  
##     coname               pov           childpov          rucc      
##  Length:3139        Min.   : 2.60   Min.   : 2.50   Min.   :1.000  
##  Class :character   1st Qu.:10.80   1st Qu.:14.50   1st Qu.:2.000  
##  Mode  :character   Median :14.10   Median :20.10   Median :6.000  
##                     Mean   :15.16   Mean   :21.11   Mean   :5.008  
##                     3rd Qu.:18.30   3rd Qu.:26.30   3rd Qu.:7.000  
##                     Max.   :54.00   Max.   :68.30   Max.   :9.000  
##                                                                    
##     medhinc          medhouvl      
##  Min.   : 20188   Min.   :  20700  
##  1st Qu.: 42479   1st Qu.:  93750  
##  Median : 49887   Median : 122600  
##  Mean   : 51570   Mean   : 147156  
##  3rd Qu.: 57585   3rd Qu.: 168300  
##  Max.   :136268   Max.   :1056500  
##

Mapping Crude Death Rate

options(tigris_class="sf")
usco<-counties(cb=T, year=2010)
usco$cofips<-substr(usco$GEO_ID, 10, 15)
sts<-states(cb = T, year=2010)
sts<-st_boundary(sts)%>%
  filter(!STATE %in% c("02", "15", "60", "66", "69", "72", "78"))

ahrf2018_m<-geo_join(usco, ahrf2018, by_sp="cofips", by_df="cofips",how="left" )

## Warning: We recommend using the dplyr::*_join() family of functions instead.

## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

ahrf2018_m%>%
  filter(!STATE %in% c("02", "15", "60", "66", "69", "72", "78"))%>%
  mutate(cdr=(death/pop)*1000)%>% #Calculating Crude death Rate
  mutate(cdr_range = cut(cdr, breaks=quantile(cdr, p=seq(0,1,length.out = 6), na.rm=T ), include.lowest=T ))%>% #Making quantile map
  ggplot()+
  geom_sf(aes(fill=cdr_range, color=NA))+
  scale_color_brewer(palette = "Blues")+
  scale_fill_brewer(palette = "Blues",na.value = "grey50")+
  geom_sf(data=sts, color="black")+
  coord_sf(crs = 2163)+
  ggtitle(label = "Crude Death Rates, United States, 2018")

## Application of Count Data Models ## Negative Binomial Model

ahrf2018wadt<-filter(ahrf2018_m, is.na(death)==F)

fit_nb<- glm.nb(death ~ offset(log(pop)) + scale(pov) + scale(childpov) + scale(medhouvl) + scale(medhinc), 
                  data=ahrf2018wadt)
summary(fit_nb)

## 
## Call:
## glm.nb(formula = death ~ offset(log(pop)) + scale(pov) + scale(childpov) + 
##     scale(medhouvl) + scale(medhinc), data = ahrf2018wadt, init.theta = 32.72776372, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.4867  -0.6130   0.0104   0.5821   3.7505  
## 
## Coefficients:
##                  Estimate Std. Error   z value Pr(>|z|)    
## (Intercept)     -4.569172   0.003439 -1328.541  < 2e-16 ***
## scale(pov)      -0.202713   0.010070   -20.130  < 2e-16 ***
## scale(childpov)  0.194451   0.010384    18.726  < 2e-16 ***
## scale(medhouvl) -0.022154   0.005003    -4.429 9.49e-06 ***
## scale(medhinc)  -0.128960   0.007318   -17.624  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(32.7278) family taken to be 1)
## 
##     Null deviance: 5500.9  on 3071  degrees of freedom
## Residual deviance: 3297.3  on 3067  degrees of freedom
## AIC: 33880
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  32.73 
##           Std. Err.:  1.00 
## 
##  2 x log-likelihood:  -33867.64

The result show that all the predictors are significant. However, a unit increase in childhood poverty led to 19.4% increase in mortality while a unit increase in the other predictors led to varying decrease in mortality.

Checking out over-dispersion

scale<-sqrt(fit_nb$deviance/fit_nb$df.residual)
scale

## [1] 1.036872

This value of 1.036872 is greater than 1 which means the mean and the variance are not equal, which suggests that the variance is greater than the mean. Therefore, there’s an over-dispersion.

Construction of spatial relationships:

Key part here is to make numeric identifier for each geography!

#In INLA, we don't need FIPS codes, we need a simple numeric index for our counties
ahrf2018wadt$struct<-1:dim(ahrf2018wadt)[1]  
nbs<-knearneigh(st_centroid(ahrf2018wadt), k = 5, longlat = T) #k=5 nearest neighbors

## Warning in st_centroid.sf(ahrf2018wadt): st_centroid assumes attributes are
## constant over geometries of x

## Warning in knearneigh(st_centroid(ahrf2018wadt), k = 5, longlat = T):
## dnearneigh: longlat argument overrides object

nbs<-knn2nb(nbs, row.names = ahrf2018wadt$struct, sym = T) #force symmetry!!
mat <- nb2mat(nbs, style="B",zero.policy=TRUE)
colnames(mat) <- rownames(mat) 
mat <- as.matrix(mat[1:dim(mat)[1], 1:dim(mat)[1]])


nb2INLA("cl_graph",nbs)
am_adj <-paste(getwd(),"/cl_graph",sep="")
H<-inla.read.graph(filename="cl_graph")

Use INLA to estimate spatially correlated (besag or bym model) for your count outcome.

Bym Model

f4 <- death ~ offset(log(pop)) + scale(pov) + scale(childpov) + scale(medhouvl) + scale(medhinc) +
  f(struct, model = "bym",scale.model = T, constr = T, graph = H)

m_bym<-inla(formula = f4,data = ahrf2018wadt,
           family = "poisson",
           control.predictor = list(compute = TRUE),
           control.compute = list(dic = TRUE, waic = TRUE),
           num.threads = 3, 
               verbose = F)
summary(m_bym)

## 
## Call:
##    c("inla(formula = f4, family = \"poisson\", data = ahrf2018wadt, 
##    verbose = F, ", " control.compute = list(dic = TRUE, waic = TRUE), 
##    control.predictor = list(compute = TRUE), ", " num.threads = 3)") 
## Time used:
##     Pre = 0.845, Running = 29, Post = 0.967, Total = 30.8 
## Fixed effects:
##                   mean    sd 0.025quant 0.5quant 0.975quant   mode kld
## (Intercept)     -4.588 0.002     -4.592   -4.588     -4.583 -4.588   0
## scale(pov)      -0.218 0.009     -0.236   -0.218     -0.200 -0.218   0
## scale(childpov)  0.216 0.010      0.197    0.216      0.235  0.216   0
## scale(medhouvl) -0.035 0.006     -0.048   -0.035     -0.022 -0.035   0
## scale(medhinc)  -0.113 0.008     -0.129   -0.113     -0.098 -0.113   0
## 
## Random effects:
##   Name     Model
##     struct BYM model
## 
## Model hyperparameters:
##                                            mean    sd 0.025quant 0.5quant
## Precision for struct (iid component)     121.62 15.30      94.96   120.39
## Precision for struct (spatial component)  28.49  2.84      23.20    28.39
##                                          0.975quant   mode
## Precision for struct (iid component)         154.93 117.70
## Precision for struct (spatial component)      34.35  28.27
## 
## Expected number of effective parameters(stdev): 2416.23(13.52)
## Number of equivalent replicates : 1.27 
## 
## Deviance Information Criterion (DIC) ...............: 28857.19
## Deviance Information Criterion (DIC, saturated) ....: 5728.98
## Effective number of parameters .....................: 2418.77
## 
## Watanabe-Akaike information criterion (WAIC) ...: 28415.74
## Effective number of parameters .................: 1454.26
## 
## Marginal log-Likelihood:  -15474.15 
## Posterior marginals for the linear predictor and
##  the fitted values are computed

Model fit

fit_nb$aic

## [1] 33879.64

m_bym$waic$waic

## [1] 28415.74

The bym model which is the spatial correlation looks the better fits for the data.