library(tidyverse, quietly = T)
library(ggplot2, quietly = T)
theme_set(theme_classic())
library(rstanarm)
library(tidycensus)
options(mc.cores = parallel::detectCores()-1)
library(extraoperators)
library(spdep)
library(MASS)
library(spatialreg)
library(ggplot2)
library(nortest)
library(car)
library(lmtest)
library(classInt)
library(sandwich)
library(tidyverse)
library(tidycensus)
library(dplyr)
library(haven)
library(spatialreg)
library(INLA)
library(haven)
AHRF2021 <- read_sas("G:/My Drive/PhD Demography/Fall 2021/Courses/Spatial Demography 7263/Term Paper/Data Analysis/AHRF_2020-2021_SAS/AHRF2021.sas7bdat", NULL)

Selecting data from AREA RESOURCE FILE

library(dplyr)
arf21<-AHRF2021%>%
  mutate(cofips=f00004, 
         coname=f00010,
         state = f00011,
         totfpop15=f1390715,
         totfpop17=f1390717,
         nhwm15= f1392615,
         nhwf15= f1392715,
         hispm=f1392415,
         hispf=f1392515,
         blkm= f1391015,
         blkf= f1391115,
         fnins=f1553015, 
         unemprate= f0679515, 
         obgyn15= f1168415,
         births1517= f1254615,
        childpov15= f1332215,
         ppoverty= f1332115,
         lowbw1517=f1255315,
         rucc= as.factor(f0002013),
         hpsa10= as.factor(f0978710))%>%
  dplyr:: select(totfpop15, totfpop17,births1517, lowbw1517,state, cofips,hpsa10, obgyn15, ppoverty, unemprate)%>%
  filter(complete.cases(.))%>%
  as.data.frame()

Create expected number of cases

# rates<-aggregate(dat$lbrate~1, dat, mean)

arf21$E<-arf21$births1517*(sum(arf21$lowbw1517)/sum(arf21$births1517))


arf21<-arf21[order(arf21$cofips),]
arf21$id<-1:dim(arf21)[1]

head(arf21)
##   totfpop15 totfpop17 births1517 lowbw1517 state cofips hpsa10 obgyn15 ppoverty
## 1     28490     28497        667        57    01  01001      1       1     12.7
## 2    104423    109403       2304       183    01  01003      1      22     12.9
## 3     12341     11935        278        32    01  01005      2       1     32.0
## 4     10413     10530        266        28    01  01007      1       0     22.2
## 5     29178     29406        684        60    01  01009      1       0     14.7
## 6      4839      4694        129        19    01  01011      1       0     39.6
##   unemprate         E id
## 1       5.2  54.58666  1
## 2       5.5 188.55721  2
## 3       8.9  22.75126  3
## 4       6.6  21.76919  4
## 5       5.4  55.97792  5
## 6       7.8  10.55724  6
options(scipen=999)

Load social capital data

soccap <- readxl::read_xlsx("G:/My Drive/MSc Demography/Summer 2021/Github_repository/Soc-Cap-Smoking/soccap2014.xlsx")
soccap$cofips <- stringr::str_pad(string=soccap$FIPS, width=5,side = "left", pad = 0 )
soccap2 <- soccap[, c(19, 2, 18)]

head(soccap2)
## # A tibble: 6 x 3
##   cofips County_Name        sk2014
##   <chr>  <chr>               <dbl>
## 1 01001  Autauga County, AL -0.631
## 2 01003  Baldwin County, AL -0.555
## 3 01005  Barbour County, AL -0.891
## 4 01007  Bibb County, AL    -0.907
## 5 01009  Blount County, AL  -1.01 
## 6 01011  Bullock County, AL -0.502

Merge ARF, and social capital data.

dat <- merge(soccap2, arf21, by="cofips")

#yahoo... it ran. lets see the heads

head(dat)
##   cofips        County_Name     sk2014 totfpop15 totfpop17 births1517 lowbw1517
## 1  01001 Autauga County, AL -0.6310033     28490     28497        667        57
## 2  01003 Baldwin County, AL -0.5553960    104423    109403       2304       183
## 3  01005 Barbour County, AL -0.8910361     12341     11935        278        32
## 4  01007    Bibb County, AL -0.9065815     10413     10530        266        28
## 5  01009  Blount County, AL -1.0132797     29178     29406        684        60
## 6  01011 Bullock County, AL -0.5024002      4839      4694        129        19
##   state hpsa10 obgyn15 ppoverty unemprate         E id
## 1    01      1       1     12.7       5.2  54.58666  1
## 2    01      1      22     12.9       5.5 188.55721  2
## 3    01      2       1     32.0       8.9  22.75126  3
## 4    01      1       0     22.2       6.6  21.76919  4
## 5    01      1       0     14.7       5.4  55.97792  5
## 6    01      1       0     39.6       7.8  10.55724  6
#creating a rate variable

dat$lbrate <- dat$lowbw1517/dat$births1517

Fit ols model to check county level characterisics are associated with low birth weight rate

Finding predictor variables

fit <- lm(lbrate ~ scale(ppoverty)+scale(sk2014)+ scale(unemprate), data=dat)

summary(fit)
## 
## Call:
## lm(formula = lbrate ~ scale(ppoverty) + scale(sk2014) + scale(unemprate), 
##     data = dat)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.064307 -0.010026 -0.000478  0.009338  0.080991 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      0.0848523  0.0003255 260.713 < 0.0000000000000002 ***
## scale(ppoverty)  0.0117155  0.0004392  26.674 < 0.0000000000000002 ***
## scale(sk2014)    0.0005711  0.0003396   1.682               0.0927 .  
## scale(unemprate) 0.0021064  0.0004344   4.849           0.00000132 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01574 on 2335 degrees of freedom
## Multiple R-squared:  0.4077, Adjusted R-squared:  0.4069 
## F-statistic: 535.8 on 3 and 2335 DF,  p-value: < 0.00000000000000022

Poission model

dat2<-filter(dat, is.na(lowbw1517)==F)


fit_pois<- glm(lowbw1517 ~ offset(log(births1517)) + scale(ppoverty)+scale(sk2014)+ scale(unemprate), 
               family=poisson, 
               data=dat2)
summary(fit_pois)
## 
## Call:
## glm(formula = lowbw1517 ~ offset(log(births1517)) + scale(ppoverty) + 
##     scale(sk2014) + scale(unemprate), family = poisson, data = dat2)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -12.1308   -0.9065   -0.0937    0.7824    9.9402  
## 
## Coefficients:
##                   Estimate Std. Error   z value             Pr(>|z|)    
## (Intercept)      -2.473276   0.002035 -1215.131 < 0.0000000000000002 ***
## scale(ppoverty)   0.119535   0.002663    44.887 < 0.0000000000000002 ***
## scale(sk2014)     0.024235   0.002256    10.743 < 0.0000000000000002 ***
## scale(unemprate) -0.014754   0.002546    -5.796         0.0000000068 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 9693.7  on 2338  degrees of freedom
## Residual deviance: 7147.3  on 2335  degrees of freedom
## AIC: 20565
## 
## Number of Fisher Scoring iterations: 4

Total number of birth is the offset term here.

To check overdispersion

scale<-sqrt(fit_pois$deviance/fit_pois$df.residual)
scale
## [1] 1.74956

This is more than 1 which indicates this model over dispersion

1-pchisq(fit_pois$deviance, df = fit_pois$df.residual)
## [1] 0

Quasipoisson model

fit_q_pois<- glm(lowbw1517 ~ offset(log(births1517)) + scale(ppoverty)+scale(sk2014)+ scale(unemprate), 
               family=quasipoisson, 
               data=dat2)
summary(fit_q_pois)
## 
## Call:
## glm(formula = lowbw1517 ~ offset(log(births1517)) + scale(ppoverty) + 
##     scale(sk2014) + scale(unemprate), family = quasipoisson, 
##     data = dat2)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -12.1308   -0.9065   -0.0937    0.7824    9.9402  
## 
## Coefficients:
##                   Estimate Std. Error  t value             Pr(>|t|)    
## (Intercept)      -2.473276   0.003561 -694.525 < 0.0000000000000002 ***
## scale(ppoverty)   0.119535   0.004659   25.656 < 0.0000000000000002 ***
## scale(sk2014)     0.024235   0.003947    6.140       0.000000000965 ***
## scale(unemprate) -0.014754   0.004454   -3.313             0.000938 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 3.061048)
## 
##     Null deviance: 9693.7  on 2338  degrees of freedom
## Residual deviance: 7147.3  on 2335  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4

The dispersion parameter is 3.06, in the regular Poisson model, it was assumed to be 1. This model is not a good fit for the data.

Alternative of poission model

Negative binomial

library(MASS)
fit_nb<- glm.nb(lowbw1517 ~ offset(log(births1517)) + scale(ppoverty)+scale(sk2014)+ scale(unemprate) , 
               data=dat2)
summary(fit_nb)
## 
## Call:
## glm.nb(formula = lowbw1517 ~ offset(log(births1517)) + scale(ppoverty) + 
##     scale(sk2014) + scale(unemprate), data = dat2, init.theta = 68.2992919, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.5877  -0.5865  -0.0461   0.5587   3.2064  
## 
## Coefficients:
##                   Estimate Std. Error  z value            Pr(>|z|)    
## (Intercept)      -2.479219   0.003927 -631.349 <0.0000000000000002 ***
## scale(ppoverty)   0.124490   0.005224   23.832 <0.0000000000000002 ***
## scale(sk2014)    -0.001734   0.004622   -0.375              0.7076    
## scale(unemprate)  0.011997   0.005045    2.378              0.0174 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(68.2993) family taken to be 1)
## 
##     Null deviance: 2979.8  on 2338  degrees of freedom
## Residual deviance: 1929.8  on 2335  degrees of freedom
## AIC: 16979
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  68.30 
##           Std. Err.:  3.65 
## 
##  2 x log-likelihood:  -16968.63

Creating INLA dataframe

library(tigris)
## To enable 
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
options(tigris_class="sf")
usco<-counties(cb=T, year=2010)
usco$cofips<-substr(usco$GEO_ID, 10, 15)
sts<-states(cb = T, year=2010)
sts<-st_boundary(sts) %>% 
 filter(!STATE %in% c("02", "15", "60", "66", "69", "72", "78"))
usco$struct<-1:dim(usco)[1]  
nbs<-knearneigh(st_centroid(usco), k = 5, longlat = T) #k=5 nearest neighbors
nbs<-knn2nb(nbs, row.names = usco$struct, sym = T) #force symmetry!!
mat <- nb2mat(nbs, style="B",zero.policy=TRUE)
colnames(mat) <- rownames(mat) 
mat <- as.matrix(mat[1:dim(mat)[1], 1:dim(mat)[1]])


nb2INLA("cl_graph",nbs)
am_adj <-paste(getwd(),"/cl_graph",sep="")
H<-inla.read.graph(filename="cl_graph")
library(sf)
usco<-st_as_sf(usco)
usco$cofips<-paste(usco$STATEFP, usco$COUNTYFP, sep="")
usco%>%
  ggplot()+
  geom_sf()+
  coord_sf(crs = 2163)

final.dat<-merge( usco, dat, by="cofips")
final.dat<-final.dat[order(final.dat$cofips),]

head(final.dat)
## Simple feature collection with 6 features and 24 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -88.02927 ymin: 30.22113 xmax: -85.05307 ymax: 34.26048
## Geodetic CRS:  NAD83
##   cofips         GEO_ID STATE COUNTY    NAME   LSAD CENSUSAREA COUNTYFP STATEFP
## 1  01001 0500000US01001    01    001 Autauga County    594.436      001      01
## 2  01003 0500000US01003    01    003 Baldwin County   1589.784      003      01
## 3  01005 0500000US01005    01    005 Barbour County    884.876      005      01
## 4  01007 0500000US01007    01    007    Bibb County    622.582      007      01
## 5  01009 0500000US01009    01    009  Blount County    644.776      009      01
## 6  01011 0500000US01011    01    011 Bullock County    622.805      011      01
##   struct        County_Name     sk2014 totfpop15 totfpop17 births1517 lowbw1517
## 1     15 Autauga County, AL -0.6310033     28490     28497        667        57
## 2     16 Baldwin County, AL -0.5553960    104423    109403       2304       183
## 3     17 Barbour County, AL -0.8910361     12341     11935        278        32
## 4     18    Bibb County, AL -0.9065815     10413     10530        266        28
## 5     19  Blount County, AL -1.0132797     29178     29406        684        60
## 6     20 Bullock County, AL -0.5024002      4839      4694        129        19
##   state hpsa10 obgyn15 ppoverty unemprate         E id     lbrate
## 1    01      1       1     12.7       5.2  54.58666  1 0.08545727
## 2    01      1      22     12.9       5.5 188.55721  2 0.07942708
## 3    01      2       1     32.0       8.9  22.75126  3 0.11510791
## 4    01      1       0     22.2       6.6  21.76919  4 0.10526316
## 5    01      1       0     14.7       5.4  55.97792  5 0.08771930
## 6    01      1       0     39.6       7.8  10.55724  6 0.14728682
##                         geometry
## 1 MULTIPOLYGON (((-86.52469 3...
## 2 MULTIPOLYGON (((-87.41247 3...
## 3 MULTIPOLYGON (((-85.13285 3...
## 4 MULTIPOLYGON (((-87.11632 3...
## 5 MULTIPOLYGON (((-86.73121 3...
## 6 MULTIPOLYGON (((-85.74209 3...
ggplot(data = final.dat)+geom_histogram(aes(x =log(lowbw1517/births1517) ,
                                            y=0.5*..density..))+
  #facet_wrap(~year)+
  ggtitle(label = "Distribution of low birthweight rate",
          subtitle = "US Counties, 2015-17")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Relative risk

ggplot(data = final.dat)+geom_histogram(aes(x =log(lowbw1517/E) ,
                                            y=0.5*..density..))+
  #facet_wrap(~year)+
  ggtitle(label = "Distribution of relative risk of low birthweight",
          subtitle = "US Counties, 2015-17")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

sts<- states(cb=T)%>%
  st_transform(crs= 2163)%>%
  st_boundary()%>%
  subset(!STATEFP%in%c("02", "15", "60", "66", "69", "72", "78"))
final.dat%>%
  mutate(qrr=cut(I(lowbw1517/E), 
                 breaks = quantile(I(lowbw1517/E),
                                   p=seq(0,1,length.out = 5)),
                 include.lowest = T,))%>%
  ggplot()+
  geom_sf(aes(fill=qrr, color=NA))+
  geom_sf(data=sts, color="black")+
  scale_colour_brewer(palette = "RdBu" )+
  scale_fill_brewer(palette = "RdBu", na.value="grey")+
  guides(fill=guide_legend(title="Relative Risk Quartile"))+
  ggtitle(label="Relative Risk Quartile - Low birth weight 2015-17")+
  coord_sf(crs = 2163)

#INLA model run (Besag)

f3<-lowbw1517~scale(ppoverty)+scale(unemprate)+
  f(struct, model = "besag",scale.model = T, constr = T, graph = H)

mod3<-inla(formula = f3,data = final.dat,
           family = "poisson",
           E = E, 
           control.compute = list(waic=T), 
           control.predictor = list(link=1), 
           num.threads = 3, 
               verbose = F)

#total model summary
summary(mod3)
## 
## Call:
##    c("inla(formula = f3, family = \"poisson\", data = final.dat, E = E, ", 
##    " verbose = F, control.compute = list(waic = T), control.predictor = 
##    list(link = 1), ", " num.threads = 3)") 
## Time used:
##     Pre = 1.22, Running = 5.38, Post = 0.531, Total = 7.13 
## Fixed effects:
##                   mean    sd 0.025quant 0.5quant 0.975quant  mode kld
## (Intercept)      0.007 0.004      0.000    0.007      0.015 0.007   0
## scale(ppoverty)  0.093 0.005      0.084    0.093      0.103 0.093   0
## scale(unemprate) 0.007 0.005     -0.002    0.007      0.016 0.007   0
## 
## Random effects:
##   Name     Model
##     struct Besags ICAR model
## 
## Model hyperparameters:
##                       mean   sd 0.025quant 0.5quant 0.975quant  mode
## Precision for struct 63.73 4.54      55.35    63.55      73.16 63.17
## 
## Expected number of effective parameters(stdev): 604.22(22.42)
## Number of equivalent replicates : 3.87 
## 
## Watanabe-Akaike information criterion (WAIC) ...: 15559.82
## Effective number of parameters .................: 400.16
## 
## Marginal log-Likelihood:  -9875.74 
## Posterior marginals for the linear predictor and
##  the fitted values are computed