library(spdep)
library(MASS)
library(spatialreg)
library(tidyverse)
library(ggplot2)
library(tigris)
library(dplyr)
library(sf)
library(ggplot2)
library(haven)
AHRF2021 <- read_sas("G:/My Drive/PhD Demography/Fall 2021/Courses/Spatial Demography 7263/Term Paper/Data Analysis/AHRF_2020-2021_SAS/AHRF2021.sas7bdat", 
    NULL)
#View(AHRF2021)
#head(AHRF2021)
library(dplyr)
arf21<-AHRF2021%>%
  mutate(cofips=f00004, 
         coname=f00010,
         state = f00011,
         totfpop15=f1390715,
         totfpop17=f1390717,
         obgyn15= f1168415,
         births1517= f1254615,
        childpov15= f1332215,
         ppoverty= f1332115,
         lowbw1517=f1255315,
         divf=f1443615,
         rucc= as.factor(f0002013),
         hpsa10= as.factor(f0978710))%>%
  dplyr:: select(totfpop15, totfpop17, births1517, lowbw1517,state, cofips, coname, rucc, hpsa10, obgyn15,ppoverty, childpov15, divf)%>%
  filter(complete.cases(.))%>%
  as.data.frame()


head(arf21)
  summary(arf21)
options(tigris_class="sf")
usco<-counties(cb=T, year=2010)
usco$cofips<-substr(usco$GEO_ID, 10, 15)
sts<-states(cb = T, year=2010)
sts<-st_boundary(sts) %>% 
 filter(!STATE %in% c("02", "15", "60", "66", "69", "72", "78"))
arf211<-left_join(usco, arf21, by = c("cofips" = "cofips"))
arf211%>%
   filter(!STATE %in% c("02", "15", "60", "66", "69", "72", "78"))%>%
  mutate(lbrate=lowbw1517/births1517)%>%
  mutate(lb_group = cut(lbrate, breaks=quantile(lbrate, p=seq(0,1,length.out = 6), na.rm=T ), include.lowest=T ))%>%
  ggplot()+
  geom_sf(aes(fill=lb_group, color=NA))+
  scale_color_brewer(palette = "Blues")+
  scale_fill_brewer(palette = "Blues",na.value = "grey50")+
  geom_sf(data=sts, color="black")+
  coord_sf(crs = 2163)+
  ggtitle(label = "Proportion of births that were low birthweight, 2015-2017")

arf211%>%
   filter(!STATE %in% c("02", "15", "60", "66", "69", "72", "78"))%>%
  mutate(ppov_group = cut(ppoverty, breaks=quantile(ppoverty, p=seq(0,1,length.out = 6), na.rm=T ), include.lowest=T ))%>%
  ggplot()+
  geom_sf(aes(fill=ppov_group, color=NA))+
  scale_color_brewer(palette = "Blues")+
  scale_fill_brewer(palette = "Blues",na.value = "grey50")+
  geom_sf(data=sts, color="black")+
  coord_sf(crs = 2163)+
  ggtitle(label = "Proportion of % poverty, 2015-2017")

I am taking ppoverty(% of poverty) as an independent variable for low-birth weight outcome.

arf21sub<-filter(arf211, is.na(lowbw1517)==F)


fit_pois<- glm(lowbw1517 ~ offset(log(births1517+.00001)) + ppoverty, 
               family=poisson, 
               data=arf21sub)
summary(fit_pois)
## 
## Call:
## glm(formula = lowbw1517 ~ offset(log(births1517 + 1e-05)) + ppoverty, 
##     family = poisson, data = arf21sub)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -13.9514   -0.8316   -0.0678    0.7927   10.5204  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.7502160  0.0053586 -513.24   <2e-16 ***
## ppoverty     0.0161420  0.0003269   49.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 9693.7  on 2338  degrees of freedom
## Residual deviance: 7307.4  on 2337  degrees of freedom
## AIC: 20721
## 
## Number of Fisher Scoring iterations: 4

We see that % of poverty is assoicated with low birth weight. But we also see that residual deviance is larger than degrees of freedom. That indicates model is overdisperesed.

A check of this is to examine the ratio of these to values:

scale<-sqrt(fit_pois$deviance/fit_pois$df.residual)
scale
## [1] 1.76828

This value is more than 1. this indicates model should be changed.

fit_q_pois<- glm(lowbw1517 ~ offset(log(births1517)) + ppoverty, 
               family=quasipoisson, 
               data=arf21sub)
summary(fit_q_pois)
## 
## Call:
## glm(formula = lowbw1517 ~ offset(log(births1517)) + ppoverty, 
##     family = quasipoisson, data = arf21sub)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -13.9514   -0.8316   -0.0678    0.7927   10.5204  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.7502160  0.0094890 -289.83   <2e-16 ***
## ppoverty     0.0161420  0.0005789   27.88   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 3.135773)
## 
##     Null deviance: 9693.7  on 2338  degrees of freedom
## Residual deviance: 7307.4  on 2337  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4

Both has difference in dispersion parameter and other remains same. In this case it is good to use quasi-poission model.

To findout expected number of low birth weight outcome we need to compared between observed and expected low birth weight. To do so, first we calculate ration of low birth weight birth and total birth.

lbrate<-(sum(arf21sub$lowbw1517, na.rm=T)/sum(arf21sub$births1517, na.rm=T)) 
lbrate
## [1] 0.08171312

% of low birth weight in 2015-17 is 8.2

Now we want to see the difference of number of low birth weight and expected low birth weight.

arf21sub$E <- lbrate* arf21sub$births1517

head(arf21sub[, c("coname", "births1517", "lowbw1517", "E")])
## Simple feature collection with 6 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -86.70279 ymin: 30.99324 xmax: -85.30444 ymax: 33.9642
## Geodetic CRS:  NAD83
##      coname births1517 lowbw1517         E                       geometry
## 1  Cleburne        170        13 13.891231 MULTIPOLYGON (((-85.38872 3...
## 2    Coffee        650        55 53.113530 MULTIPOLYGON (((-86.03044 3...
## 3     Coosa         92        13  7.517607 MULTIPOLYGON (((-86.00928 3...
## 4 Covington        450        43 36.770905 MULTIPOLYGON (((-86.34851 3...
## 5  Crenshaw        156        18 12.747247 MULTIPOLYGON (((-86.14699 3...
## 6      Dale        653        59 53.358669 MULTIPOLYGON (((-85.79043 3...
arf21sub%>%
  ggplot( aes(lowbw1517))+geom_histogram()+ggtitle("Distribution of low birthweight births", "y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

arf21sub%>%
  ggplot( aes(lowbw1517/E))+geom_histogram()+ggtitle("Distribution of standardized low birthweight births", "y/E")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

In order to estimate the model with the expected counts we need to change offset term to “E”(expected outcome)

fit_pois_E<-glm(lowbw1517 ~ offset(log(E)) + ppoverty, 
               family=poisson, 
               data=arf21sub)
summary(fit_pois_E)
## 
## Call:
## glm(formula = lowbw1517 ~ offset(log(E)) + ppoverty, family = poisson, 
##     data = arf21sub)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -13.9514   -0.8316   -0.0678    0.7927   10.5204  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.2456753  0.0053586  -45.85   <2e-16 ***
## ppoverty     0.0161420  0.0003269   49.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 9693.7  on 2338  degrees of freedom
## Residual deviance: 7307.4  on 2337  degrees of freedom
## AIC: 20721
## 
## Number of Fisher Scoring iterations: 4

Identifying model’s dinominator and numerator

In order to identify the numerator and the denominator, we have to specify the outcome in the model a little differently:

fit_bin<-glm(cbind(lowbw1517 , births1517)~  ppoverty, 
               family=binomial, 
               data=arf21sub)
summary(fit_bin)
## 
## Call:
## glm(formula = cbind(lowbw1517, births1517) ~ ppoverty, family = binomial, 
##     data = arf21sub)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -13.4278   -0.8028   -0.0652    0.7613   10.0394  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.7504407  0.0055800 -492.91   <2e-16 ***
## ppoverty     0.0161567  0.0003413   47.34   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8937.2  on 2338  degrees of freedom
## Residual deviance: 6733.5  on 2337  degrees of freedom
## AIC: 19957
## 
## Number of Fisher Scoring iterations: 4

It has dispersion 1 and lower AIC value than the Poission model. This model fits well in the model. Also the numerator is low birth weight and demonimator is births.

Test of autcorrelation in the model

library(spdep)
nbs<-knearneigh(coordinates(as(arf21sub, "Spatial")), k = 4, longlat = T)
## Warning in knearneigh(coordinates(as(arf21sub, "Spatial")), k = 4, longlat = T):
## knearneigh: identical points found
## Warning in knearneigh(coordinates(as(arf21sub, "Spatial")), k = 4, longlat = T):
## knearneigh: kd_tree not available for identical points
nbs<-knn2nb(nbs, sym = T)
us.wt4<-nb2listw(nbs, style = "W")
lm.morantest(fit_pois, listw = us.wt4)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: glm(formula = lowbw1517 ~ offset(log(births1517 + 1e-05)) +
## ppoverty, family = poisson, data = arf21sub)
## weights: us.wt4
## 
## Moran I statistic standard deviate = 33.994, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     4.630151e-01    -5.935293e-06     1.855223e-04
lm.morantest(fit_bin, listw = us.wt4)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: glm(formula = cbind(lowbw1517, births1517) ~ ppoverty, family =
## binomial, data = arf21sub)
## weights: us.wt4
## 
## Moran I statistic standard deviate = 31.255, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     0.4636689881    -0.0066329372     0.0002264163

To get theta value

library(MASS)
fit_nb<- glm.nb(lowbw1517 ~ offset(log(births1517)) + scale(ppoverty), 
               data=arf21sub)
summary(fit_nb)
## 
## Call:
## glm.nb(formula = lowbw1517 ~ offset(log(births1517)) + scale(ppoverty), 
##     data = arf21sub, init.theta = 68.71888079, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.8985  -0.5878  -0.0434   0.5591   3.2312  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -2.479078   0.003866 -641.32   <2e-16 ***
## scale(ppoverty)  0.132425   0.004045   32.74   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(68.7189) family taken to be 1)
## 
##     Null deviance: 2988.4  on 2338  degrees of freedom
## Residual deviance: 1941.2  on 2337  degrees of freedom
## AIC: 16980
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  68.72 
##           Std. Err.:  3.70 
## 
##  2 x log-likelihood:  -16974.01

It is 68.72

Now I include rucc, child poverty, and female divorce as my

library(spatialreg)

arf21sub<-arf21sub%>%
  filter(!STATEFP %in% c("02", "15", "60", "66", "69", "72", "78"))

nbs<-knearneigh(coordinates(as(arf21sub, "Spatial")), k = 4, longlat = T)
## Warning in knearneigh(coordinates(as(arf21sub, "Spatial")), k = 4, longlat = T):
## knearneigh: identical points found
## Warning in knearneigh(coordinates(as(arf21sub, "Spatial")), k = 4, longlat = T):
## knearneigh: kd_tree not available for identical points
nbs<-knn2nb(nbs, sym = T)
us.wt4<-nb2listw(nbs, style = "W")

fitnb<-glm.nb(lowbw1517 ~ offset(log(E)) + scale(ppoverty) + I(rucc%in%c("01", "02", "03"))+ scale(childpov15)+scale(divf),
           data=arf21sub)
fitnb$theta
## [1] 84.66024
me.fit<-ME(lowbw1517 ~ offset(log(E)) + scale(ppoverty) + I(rucc%in%c("01", "02", "03"))+ scale(childpov15)+ scale(divf),
           data=arf21sub,
           family=negative.binomial(68.72),
           listw = us.wt4,
           verbose=T,alpha=.05 )
## eV[,12], I: 0.279181 ZI: NA, pr(ZI): 0.01
## eV[,33], I: 0.1853194 ZI: NA, pr(ZI): 0.01
## eV[,89], I: 0.1261752 ZI: NA, pr(ZI): 0.01
## eV[,49], I: 0.07920538 ZI: NA, pr(ZI): 0.02
## eV[,34], I: 0.05088808 ZI: NA, pr(ZI): 0.02
## eV[,57], I: 0.03916846 ZI: NA, pr(ZI): 0.02
## eV[,8], I: 0.02874423 ZI: NA, pr(ZI): 0.05
## eV[,2], I: 0.01404392 ZI: NA, pr(ZI): 0.06
me.fit
##   Eigenvector ZI pr(ZI)
## 0          NA NA   0.01
## 1          12 NA   0.01
## 2          33 NA   0.01
## 3          89 NA   0.01
## 4          49 NA   0.02
## 5          34 NA   0.02
## 6          57 NA   0.02
## 7           8 NA   0.05
## 8           2 NA   0.06
fits<-data.frame(fitted(me.fit))
arf21sub$me1<-fits$vec1
arf21sub$me10<-fits$vec10

We want to see any autocorrelation is there:

arf21sub%>%
  ggplot()+
  geom_sf(aes(fill=arf21sub$me1))+
  scale_fill_viridis_c()+
  coord_sf(crs = 2163)+
  ggtitle("First Moran Eigenvector")

Then I plug the scores of each county on these Moran eigenvectors into the model to “clean” it:

clean.nb<-glm.nb(lowbw1517 ~ offset(log(E)) + scale(ppoverty)+ I(rucc%in%c("01", "02", "03"))+ scale(childpov15)+scale(divf)+fitted(me.fit), arf21sub)
summary(clean.nb)
## 
## Call:
## glm.nb(formula = lowbw1517 ~ offset(log(E)) + scale(ppoverty) + 
##     I(rucc %in% c("01", "02", "03")) + scale(childpov15) + scale(divf) + 
##     fitted(me.fit), data = arf21sub, init.theta = 118.0218917, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6598  -0.6154  -0.0674   0.5726   3.3635  
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          -0.008400   0.005960  -1.410 0.158686    
## scale(ppoverty)                      -0.026763   0.010979  -2.438 0.014781 *  
## I(rucc %in% c("01", "02", "03"))TRUE  0.063107   0.007956   7.932 2.16e-15 ***
## scale(childpov15)                     0.167424   0.010863  15.413  < 2e-16 ***
## scale(divf)                           0.012947   0.002405   5.384 7.28e-08 ***
## fitted(me.fit)vec12                   0.991298   0.148828   6.661 2.73e-11 ***
## fitted(me.fit)vec33                  -0.984162   0.156579  -6.285 3.27e-10 ***
## fitted(me.fit)vec89                   0.463320   0.151177   3.065 0.002178 ** 
## fitted(me.fit)vec49                   0.602736   0.155114   3.886 0.000102 ***
## fitted(me.fit)vec34                  -0.869372   0.162742  -5.342 9.19e-08 ***
## fitted(me.fit)vec57                  -0.390693   0.150531  -2.595 0.009447 ** 
## fitted(me.fit)vec8                    1.083461   0.157454   6.881 5.94e-12 ***
## fitted(me.fit)vec2                   -1.416336   0.158755  -8.922  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(118.0219) family taken to be 1)
## 
##     Null deviance: 3776.0  on 2334  degrees of freedom
## Residual deviance: 1909.4  on 2322  degrees of freedom
## AIC: 16469
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  118.02 
##           Std. Err.:  7.69 
## 
##  2 x log-likelihood:  -16441.14
lm.morantest(clean.nb, listw=us.wt4)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: glm.nb(formula = lowbw1517 ~ offset(log(E)) + scale(ppoverty) +
## I(rucc %in% c("01", "02", "03")) + scale(childpov15) + scale(divf) +
## fitted(me.fit), data = arf21sub, init.theta = 118.0218917, link = log)
## weights: us.wt4
## 
## Moran I statistic standard deviate = 29.908, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     0.4093638675    -0.0001062245     0.0001874446
arf21sub$lag_rate<-lag.listw(x=us.wt4, var=(arf21sub$lowbw1517/arf21sub$births1517))

fit.nb.auto<-glm.nb(lowbw1517 ~ offset(log(E)) +lag_rate+ scale(ppoverty) + I(rucc%in%c("01", "02", "03"))+ scale(childpov15)+scale(divf), 
                     data=arf21sub)
summary(fit.nb.auto)
## 
## Call:
## glm.nb(formula = lowbw1517 ~ offset(log(E)) + lag_rate + scale(ppoverty) + 
##     I(rucc %in% c("01", "02", "03")) + scale(childpov15) + scale(divf), 
##     data = arf21sub, init.theta = 169.8624647, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1972  -0.5498  -0.0425   0.4720   3.5540  
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          -0.608091   0.020354 -29.876  < 2e-16 ***
## lag_rate                              7.170923   0.231989  30.911  < 2e-16 ***
## scale(ppoverty)                      -0.038668   0.010252  -3.772 0.000162 ***
## I(rucc %in% c("01", "02", "03"))TRUE  0.042175   0.007487   5.633 1.77e-08 ***
## scale(childpov15)                     0.122172   0.010226  11.947  < 2e-16 ***
## scale(divf)                           0.008319   0.001908   4.361 1.30e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(169.8625) family taken to be 1)
## 
##     Null deviance: 4332.6  on 2334  degrees of freedom
## Residual deviance: 1615.0  on 2329  degrees of freedom
## AIC: 15912
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  169.9 
##           Std. Err.:  11.6 
## 
##  2 x log-likelihood:  -15898.13
AIC(fitnb)
## [1] 16706.45
AIC(clean.nb)
## [1] 16469.14
AIC(fit.nb.auto)
## [1] 15912.13