library(spdep)
library(MASS)
library(spatialreg)
library(tidyverse)
library(ggplot2)
library(tigris)
library(dplyr)
library(sf)
library(ggplot2)
library(haven)
AHRF2021 <- read_sas("G:/My Drive/PhD Demography/Fall 2021/Courses/Spatial Demography 7263/Term Paper/Data Analysis/AHRF_2020-2021_SAS/AHRF2021.sas7bdat",
NULL)
#View(AHRF2021)
#head(AHRF2021)
library(dplyr)
arf21<-AHRF2021%>%
mutate(cofips=f00004,
coname=f00010,
state = f00011,
totfpop15=f1390715,
totfpop17=f1390717,
obgyn15= f1168415,
births1517= f1254615,
childpov15= f1332215,
ppoverty= f1332115,
lowbw1517=f1255315,
divf=f1443615,
rucc= as.factor(f0002013),
hpsa10= as.factor(f0978710))%>%
dplyr:: select(totfpop15, totfpop17, births1517, lowbw1517,state, cofips, coname, rucc, hpsa10, obgyn15,ppoverty, childpov15, divf)%>%
filter(complete.cases(.))%>%
as.data.frame()
head(arf21)
summary(arf21)
options(tigris_class="sf")
usco<-counties(cb=T, year=2010)
usco$cofips<-substr(usco$GEO_ID, 10, 15)
sts<-states(cb = T, year=2010)
sts<-st_boundary(sts) %>%
filter(!STATE %in% c("02", "15", "60", "66", "69", "72", "78"))
arf211<-left_join(usco, arf21, by = c("cofips" = "cofips"))
arf211%>%
filter(!STATE %in% c("02", "15", "60", "66", "69", "72", "78"))%>%
mutate(lbrate=lowbw1517/births1517)%>%
mutate(lb_group = cut(lbrate, breaks=quantile(lbrate, p=seq(0,1,length.out = 6), na.rm=T ), include.lowest=T ))%>%
ggplot()+
geom_sf(aes(fill=lb_group, color=NA))+
scale_color_brewer(palette = "Blues")+
scale_fill_brewer(palette = "Blues",na.value = "grey50")+
geom_sf(data=sts, color="black")+
coord_sf(crs = 2163)+
ggtitle(label = "Proportion of births that were low birthweight, 2015-2017")
arf211%>%
filter(!STATE %in% c("02", "15", "60", "66", "69", "72", "78"))%>%
mutate(ppov_group = cut(ppoverty, breaks=quantile(ppoverty, p=seq(0,1,length.out = 6), na.rm=T ), include.lowest=T ))%>%
ggplot()+
geom_sf(aes(fill=ppov_group, color=NA))+
scale_color_brewer(palette = "Blues")+
scale_fill_brewer(palette = "Blues",na.value = "grey50")+
geom_sf(data=sts, color="black")+
coord_sf(crs = 2163)+
ggtitle(label = "Proportion of % poverty, 2015-2017")
I am taking ppoverty(% of poverty) as an independent variable for low-birth weight outcome.
arf21sub<-filter(arf211, is.na(lowbw1517)==F)
fit_pois<- glm(lowbw1517 ~ offset(log(births1517+.00001)) + ppoverty,
family=poisson,
data=arf21sub)
summary(fit_pois)
##
## Call:
## glm(formula = lowbw1517 ~ offset(log(births1517 + 1e-05)) + ppoverty,
## family = poisson, data = arf21sub)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -13.9514 -0.8316 -0.0678 0.7927 10.5204
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.7502160 0.0053586 -513.24 <2e-16 ***
## ppoverty 0.0161420 0.0003269 49.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 9693.7 on 2338 degrees of freedom
## Residual deviance: 7307.4 on 2337 degrees of freedom
## AIC: 20721
##
## Number of Fisher Scoring iterations: 4
We see that % of poverty is assoicated with low birth weight. But we also see that residual deviance is larger than degrees of freedom. That indicates model is overdisperesed.
A check of this is to examine the ratio of these to values:
scale<-sqrt(fit_pois$deviance/fit_pois$df.residual)
scale
## [1] 1.76828
This value is more than 1. this indicates model should be changed.
fit_q_pois<- glm(lowbw1517 ~ offset(log(births1517)) + ppoverty,
family=quasipoisson,
data=arf21sub)
summary(fit_q_pois)
##
## Call:
## glm(formula = lowbw1517 ~ offset(log(births1517)) + ppoverty,
## family = quasipoisson, data = arf21sub)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -13.9514 -0.8316 -0.0678 0.7927 10.5204
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.7502160 0.0094890 -289.83 <2e-16 ***
## ppoverty 0.0161420 0.0005789 27.88 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 3.135773)
##
## Null deviance: 9693.7 on 2338 degrees of freedom
## Residual deviance: 7307.4 on 2337 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 4
Both has difference in dispersion parameter and other remains same. In this case it is good to use quasi-poission model.
To findout expected number of low birth weight outcome we need to compared between observed and expected low birth weight. To do so, first we calculate ration of low birth weight birth and total birth.
lbrate<-(sum(arf21sub$lowbw1517, na.rm=T)/sum(arf21sub$births1517, na.rm=T))
lbrate
## [1] 0.08171312
% of low birth weight in 2015-17 is 8.2
Now we want to see the difference of number of low birth weight and expected low birth weight.
arf21sub$E <- lbrate* arf21sub$births1517
head(arf21sub[, c("coname", "births1517", "lowbw1517", "E")])
## Simple feature collection with 6 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -86.70279 ymin: 30.99324 xmax: -85.30444 ymax: 33.9642
## Geodetic CRS: NAD83
## coname births1517 lowbw1517 E geometry
## 1 Cleburne 170 13 13.891231 MULTIPOLYGON (((-85.38872 3...
## 2 Coffee 650 55 53.113530 MULTIPOLYGON (((-86.03044 3...
## 3 Coosa 92 13 7.517607 MULTIPOLYGON (((-86.00928 3...
## 4 Covington 450 43 36.770905 MULTIPOLYGON (((-86.34851 3...
## 5 Crenshaw 156 18 12.747247 MULTIPOLYGON (((-86.14699 3...
## 6 Dale 653 59 53.358669 MULTIPOLYGON (((-85.79043 3...
arf21sub%>%
ggplot( aes(lowbw1517))+geom_histogram()+ggtitle("Distribution of low birthweight births", "y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
arf21sub%>%
ggplot( aes(lowbw1517/E))+geom_histogram()+ggtitle("Distribution of standardized low birthweight births", "y/E")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
In order to estimate the model with the expected counts we need to change offset term to “E”(expected outcome)
fit_pois_E<-glm(lowbw1517 ~ offset(log(E)) + ppoverty,
family=poisson,
data=arf21sub)
summary(fit_pois_E)
##
## Call:
## glm(formula = lowbw1517 ~ offset(log(E)) + ppoverty, family = poisson,
## data = arf21sub)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -13.9514 -0.8316 -0.0678 0.7927 10.5204
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.2456753 0.0053586 -45.85 <2e-16 ***
## ppoverty 0.0161420 0.0003269 49.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 9693.7 on 2338 degrees of freedom
## Residual deviance: 7307.4 on 2337 degrees of freedom
## AIC: 20721
##
## Number of Fisher Scoring iterations: 4
Identifying model’s dinominator and numerator
In order to identify the numerator and the denominator, we have to specify the outcome in the model a little differently:
fit_bin<-glm(cbind(lowbw1517 , births1517)~ ppoverty,
family=binomial,
data=arf21sub)
summary(fit_bin)
##
## Call:
## glm(formula = cbind(lowbw1517, births1517) ~ ppoverty, family = binomial,
## data = arf21sub)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -13.4278 -0.8028 -0.0652 0.7613 10.0394
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.7504407 0.0055800 -492.91 <2e-16 ***
## ppoverty 0.0161567 0.0003413 47.34 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8937.2 on 2338 degrees of freedom
## Residual deviance: 6733.5 on 2337 degrees of freedom
## AIC: 19957
##
## Number of Fisher Scoring iterations: 4
It has dispersion 1 and lower AIC value than the Poission model. This model fits well in the model. Also the numerator is low birth weight and demonimator is births.
Test of autcorrelation in the model
library(spdep)
nbs<-knearneigh(coordinates(as(arf21sub, "Spatial")), k = 4, longlat = T)
## Warning in knearneigh(coordinates(as(arf21sub, "Spatial")), k = 4, longlat = T):
## knearneigh: identical points found
## Warning in knearneigh(coordinates(as(arf21sub, "Spatial")), k = 4, longlat = T):
## knearneigh: kd_tree not available for identical points
nbs<-knn2nb(nbs, sym = T)
us.wt4<-nb2listw(nbs, style = "W")
lm.morantest(fit_pois, listw = us.wt4)
##
## Global Moran I for regression residuals
##
## data:
## model: glm(formula = lowbw1517 ~ offset(log(births1517 + 1e-05)) +
## ppoverty, family = poisson, data = arf21sub)
## weights: us.wt4
##
## Moran I statistic standard deviate = 33.994, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 4.630151e-01 -5.935293e-06 1.855223e-04
lm.morantest(fit_bin, listw = us.wt4)
##
## Global Moran I for regression residuals
##
## data:
## model: glm(formula = cbind(lowbw1517, births1517) ~ ppoverty, family =
## binomial, data = arf21sub)
## weights: us.wt4
##
## Moran I statistic standard deviate = 31.255, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.4636689881 -0.0066329372 0.0002264163
To get theta value
library(MASS)
fit_nb<- glm.nb(lowbw1517 ~ offset(log(births1517)) + scale(ppoverty),
data=arf21sub)
summary(fit_nb)
##
## Call:
## glm.nb(formula = lowbw1517 ~ offset(log(births1517)) + scale(ppoverty),
## data = arf21sub, init.theta = 68.71888079, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.8985 -0.5878 -0.0434 0.5591 3.2312
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.479078 0.003866 -641.32 <2e-16 ***
## scale(ppoverty) 0.132425 0.004045 32.74 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(68.7189) family taken to be 1)
##
## Null deviance: 2988.4 on 2338 degrees of freedom
## Residual deviance: 1941.2 on 2337 degrees of freedom
## AIC: 16980
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 68.72
## Std. Err.: 3.70
##
## 2 x log-likelihood: -16974.01
It is 68.72
Now I include rucc, child poverty, and female divorce as my
library(spatialreg)
arf21sub<-arf21sub%>%
filter(!STATEFP %in% c("02", "15", "60", "66", "69", "72", "78"))
nbs<-knearneigh(coordinates(as(arf21sub, "Spatial")), k = 4, longlat = T)
## Warning in knearneigh(coordinates(as(arf21sub, "Spatial")), k = 4, longlat = T):
## knearneigh: identical points found
## Warning in knearneigh(coordinates(as(arf21sub, "Spatial")), k = 4, longlat = T):
## knearneigh: kd_tree not available for identical points
nbs<-knn2nb(nbs, sym = T)
us.wt4<-nb2listw(nbs, style = "W")
fitnb<-glm.nb(lowbw1517 ~ offset(log(E)) + scale(ppoverty) + I(rucc%in%c("01", "02", "03"))+ scale(childpov15)+scale(divf),
data=arf21sub)
fitnb$theta
## [1] 84.66024
me.fit<-ME(lowbw1517 ~ offset(log(E)) + scale(ppoverty) + I(rucc%in%c("01", "02", "03"))+ scale(childpov15)+ scale(divf),
data=arf21sub,
family=negative.binomial(68.72),
listw = us.wt4,
verbose=T,alpha=.05 )
## eV[,12], I: 0.279181 ZI: NA, pr(ZI): 0.01
## eV[,33], I: 0.1853194 ZI: NA, pr(ZI): 0.01
## eV[,89], I: 0.1261752 ZI: NA, pr(ZI): 0.01
## eV[,49], I: 0.07920538 ZI: NA, pr(ZI): 0.02
## eV[,34], I: 0.05088808 ZI: NA, pr(ZI): 0.02
## eV[,57], I: 0.03916846 ZI: NA, pr(ZI): 0.02
## eV[,8], I: 0.02874423 ZI: NA, pr(ZI): 0.05
## eV[,2], I: 0.01404392 ZI: NA, pr(ZI): 0.06
me.fit
## Eigenvector ZI pr(ZI)
## 0 NA NA 0.01
## 1 12 NA 0.01
## 2 33 NA 0.01
## 3 89 NA 0.01
## 4 49 NA 0.02
## 5 34 NA 0.02
## 6 57 NA 0.02
## 7 8 NA 0.05
## 8 2 NA 0.06
fits<-data.frame(fitted(me.fit))
arf21sub$me1<-fits$vec1
arf21sub$me10<-fits$vec10
We want to see any autocorrelation is there:
arf21sub%>%
ggplot()+
geom_sf(aes(fill=arf21sub$me1))+
scale_fill_viridis_c()+
coord_sf(crs = 2163)+
ggtitle("First Moran Eigenvector")
Then I plug the scores of each county on these Moran eigenvectors into the model to “clean” it:
clean.nb<-glm.nb(lowbw1517 ~ offset(log(E)) + scale(ppoverty)+ I(rucc%in%c("01", "02", "03"))+ scale(childpov15)+scale(divf)+fitted(me.fit), arf21sub)
summary(clean.nb)
##
## Call:
## glm.nb(formula = lowbw1517 ~ offset(log(E)) + scale(ppoverty) +
## I(rucc %in% c("01", "02", "03")) + scale(childpov15) + scale(divf) +
## fitted(me.fit), data = arf21sub, init.theta = 118.0218917,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.6598 -0.6154 -0.0674 0.5726 3.3635
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.008400 0.005960 -1.410 0.158686
## scale(ppoverty) -0.026763 0.010979 -2.438 0.014781 *
## I(rucc %in% c("01", "02", "03"))TRUE 0.063107 0.007956 7.932 2.16e-15 ***
## scale(childpov15) 0.167424 0.010863 15.413 < 2e-16 ***
## scale(divf) 0.012947 0.002405 5.384 7.28e-08 ***
## fitted(me.fit)vec12 0.991298 0.148828 6.661 2.73e-11 ***
## fitted(me.fit)vec33 -0.984162 0.156579 -6.285 3.27e-10 ***
## fitted(me.fit)vec89 0.463320 0.151177 3.065 0.002178 **
## fitted(me.fit)vec49 0.602736 0.155114 3.886 0.000102 ***
## fitted(me.fit)vec34 -0.869372 0.162742 -5.342 9.19e-08 ***
## fitted(me.fit)vec57 -0.390693 0.150531 -2.595 0.009447 **
## fitted(me.fit)vec8 1.083461 0.157454 6.881 5.94e-12 ***
## fitted(me.fit)vec2 -1.416336 0.158755 -8.922 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(118.0219) family taken to be 1)
##
## Null deviance: 3776.0 on 2334 degrees of freedom
## Residual deviance: 1909.4 on 2322 degrees of freedom
## AIC: 16469
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 118.02
## Std. Err.: 7.69
##
## 2 x log-likelihood: -16441.14
lm.morantest(clean.nb, listw=us.wt4)
##
## Global Moran I for regression residuals
##
## data:
## model: glm.nb(formula = lowbw1517 ~ offset(log(E)) + scale(ppoverty) +
## I(rucc %in% c("01", "02", "03")) + scale(childpov15) + scale(divf) +
## fitted(me.fit), data = arf21sub, init.theta = 118.0218917, link = log)
## weights: us.wt4
##
## Moran I statistic standard deviate = 29.908, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.4093638675 -0.0001062245 0.0001874446
arf21sub$lag_rate<-lag.listw(x=us.wt4, var=(arf21sub$lowbw1517/arf21sub$births1517))
fit.nb.auto<-glm.nb(lowbw1517 ~ offset(log(E)) +lag_rate+ scale(ppoverty) + I(rucc%in%c("01", "02", "03"))+ scale(childpov15)+scale(divf),
data=arf21sub)
summary(fit.nb.auto)
##
## Call:
## glm.nb(formula = lowbw1517 ~ offset(log(E)) + lag_rate + scale(ppoverty) +
## I(rucc %in% c("01", "02", "03")) + scale(childpov15) + scale(divf),
## data = arf21sub, init.theta = 169.8624647, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1972 -0.5498 -0.0425 0.4720 3.5540
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.608091 0.020354 -29.876 < 2e-16 ***
## lag_rate 7.170923 0.231989 30.911 < 2e-16 ***
## scale(ppoverty) -0.038668 0.010252 -3.772 0.000162 ***
## I(rucc %in% c("01", "02", "03"))TRUE 0.042175 0.007487 5.633 1.77e-08 ***
## scale(childpov15) 0.122172 0.010226 11.947 < 2e-16 ***
## scale(divf) 0.008319 0.001908 4.361 1.30e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(169.8625) family taken to be 1)
##
## Null deviance: 4332.6 on 2334 degrees of freedom
## Residual deviance: 1615.0 on 2329 degrees of freedom
## AIC: 15912
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 169.9
## Std. Err.: 11.6
##
## 2 x log-likelihood: -15898.13
AIC(fitnb)
## [1] 16706.45
AIC(clean.nb)
## [1] 16469.14
AIC(fit.nb.auto)
## [1] 15912.13