temp <- tempfile()
#Download the SAS dataset as a ZIP compressed archive
download.file("https://data.hrsa.gov/DataDownload/AHRF/AHRF_2019-2020_SAS.zip", temp)
#Read SAS data into R
ahrf<-haven::read_sas(unz(temp,
filename = "ahrf2020.sas7bdat"))
rm(temp)
Count outcome: Preterm birth; a number of preterm deliveries before 37 weeks of pregnancy.
Is there an association between preterm birth and hospitals shortages (0=no shortage, 1=shortage 2=partially shortage) in counties of Texas?
The offset term is how unequal population size is incorporated. Poission model is an integer count model where we cannot use the rate/probability. In my case, offset term is necessary because number of births or populations vary according to the county and we need to find the percentage of preterm birth in each county, so, offset term is necessary.
Rename the variables
library(dplyr)
ahrf2<-ahrf%>%
mutate(cofips=f00004,
coname=f00010,
state = f00011,
births1416=f1254614,
births0608=f1254608,
lowbw1416=f1255314,
lowbw0608=f1255308,
preterm = f1360814,
childpov10= f1332210,
rucc= as.factor(f0002013),
hpsa10= as.factor(f0978710),
obgyn10_pc= 1000*(f1168410/ f0453010) )%>%
dplyr::select(births1416, lowbw1416,births0608, lowbw0608, preterm, state, cofips, coname, childpov10, rucc, hpsa10, obgyn10_pc)%>%
filter(complete.cases(.))%>%
as.data.frame()
head(ahrf2)
summary(ahrf2)
## births1416 lowbw1416 births0608 lowbw0608
## Min. : 67 Min. : 11.00 Min. : 84 Min. : 11.0
## 1st Qu.: 268 1st Qu.: 22.00 1st Qu.: 287 1st Qu.: 23.0
## Median : 499 Median : 39.00 Median : 521 Median : 43.0
## Mean : 1742 Mean : 141.01 Mean : 1810 Mean : 147.9
## 3rd Qu.: 1254 3rd Qu.: 99.75 3rd Qu.: 1300 3rd Qu.: 103.0
## Max. :126007 Max. :8969.00 Max. :140251 Max. :10211.0
##
## preterm state cofips coname
## Min. : 11.0 Length:2238 Length:2238 Length:2238
## 1st Qu.: 32.0 Class :character Class :character Class :character
## Median : 58.0 Mode :character Mode :character Mode :character
## Mean : 197.6
## 3rd Qu.: 142.8
## Max. :11494.0
##
## childpov10 rucc hpsa10 obgyn10_pc
## Min. : 2.70 06 :483 : 0 Min. :0.00000
## 1st Qu.:17.70 01 :405 0:433 1st Qu.:0.00000
## Median :24.10 02 :355 1:834 Median :0.05731
## Mean :24.42 03 :301 2:971 Mean :0.06926
## 3rd Qu.:30.10 07 :275 3rd Qu.:0.10382
## Max. :61.10 04 :214 Max. :0.82115
## (Other):205
Make a map
library(tigris)
## To enable
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
library(sf)
## Linking to GEOS 3.8.1, GDAL 3.2.1, PROJ 7.2.1; sf_use_s2() is TRUE
library(ggplot2)
options(tigris_class="sf")
usco<-counties(cb=T, year=2015, state = "TX")
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 12%
|
|=========== | 15%
|
|=========== | 16%
|
|================== | 25%
|
|================== | 26%
|
|==================== | 29%
|
|============================== | 43%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================== | 57%
|
|========================================== | 59%
|
|==================================================== | 74%
|
|============================================================== | 88%
|
|======================================================================| 100%
usco$cofips<-usco$GEOID
sts<-states(cb = T, year=2015)
##
|
| | 0%
|
|= | 1%
|
|==== | 5%
|
|==== | 6%
|
|======= | 9%
|
|============ | 17%
|
|=============== | 21%
|
|================== | 26%
|
|======================= | 33%
|
|================================== | 49%
|
|======================================== | 57%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|========================================================= | 81%
|
|============================================================ | 85%
|
|============================================================== | 89%
|
|======================================================================| 100%
sts<-st_boundary(sts)%>%
filter(STATEFP==48)
ahrf2_m<-geo_join(usco, ahrf2, by_sp="cofips", by_df="cofips",how="left" )
## Warning: We recommend using the dplyr::*_join() family of functions instead.
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
ahrf2_m%>%
filter(STATEFP==48)%>%
mutate(ppb=preterm/births1416)%>%
mutate(pb_group = cut(ppb, breaks=quantile(ppb, p=seq(0,1,length.out = 6), na.rm=T ), include.lowest=T ))%>%
ggplot()+
geom_sf(aes(fill=pb_group, color=NA))+
scale_color_brewer(palette = "Blues")+
scale_fill_brewer(palette = "Blues",na.value = "grey50")+
geom_sf(data=sts["STATEFP"=="48",], color="black")+
coord_sf(crs = 2163)+
ggtitle(label = "Proportion of Preterm Births, 2014-2016")
ahrf3<-filter(ahrf2_m, is.na(preterm)==F)
fit_pois<- glm(preterm ~ offset(log(births1416)) + hpsa10,
family=poisson,
data=ahrf3)
summary(fit_pois)
##
## Call:
## glm(formula = preterm ~ offset(log(births1416)) + hpsa10, family = poisson,
## data = ahrf3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7.2377 -0.5404 0.1685 0.9559 10.4849
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.15211 0.01514 -142.154 < 2e-16 ***
## hpsa101 0.05352 0.01616 3.312 0.000928 ***
## hpsa102 0.05903 0.01756 3.361 0.000775 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 607.13 on 160 degrees of freedom
## Residual deviance: 594.63 on 158 degrees of freedom
## AIC: 1609.3
##
## Number of Fisher Scoring iterations: 4
Exponentiate the coefficients to get the risk ratio
exp(coef(fit_pois))
## (Intercept) hpsa101 hpsa102
## 0.1162382 1.0549800 1.0608022
While counties consisting of hospital shortages have a mean preterm birth rate of 5.4%, counties consisting of partially shortage of hospitals have a mean preterm birth rate of 6% . Both of these values are higher than no shortage area.
ahrf3$est_pois<- fitted(fit_pois)
ahrf3$est_rate<- ahrf3$est_pois/ahrf3$births1416
aggregate(est_rate ~ hpsa10, data=ahrf3, FUN = mean)
Difference between the mean of preterm birth rate of no shortage area and whole shortage area is 5.4% and the difference between the mean of preterm birth rate of no shortage area and partially shortage area is 6%.
(0.1226290-0.1162382)/0.1162382
## [1] 0.0549802
Cross-check the result through arithmetic
(0.1233057-0.1162382)/0.1162382
## [1] 0.06080187
Over-dispersion test
scale<-sqrt(fit_pois$deviance/fit_pois$df.residual)
scale
## [1] 1.939966
If there were no difference between mean and variance then it had to be 1, suggesting that there is over-dispersion in this model.
1-pchisq(fit_pois$deviance, df = fit_pois$df.residual)
## [1] 0
The value, here is lower than the significant level, hence, poisson model is not a appropriate one for this fit.
Now, let us see if quasi-poisson model yields p-value more than 1. Here, I am using quasi-poisson model since it includes overdisperson parameter.
fit_q_pois<- glm(preterm ~ offset(log(births1416)) + hpsa10,
family=quasipoisson,
data=ahrf3)
summary(fit_q_pois)
##
## Call:
## glm(formula = preterm ~ offset(log(births1416)) + hpsa10, family = quasipoisson,
## data = ahrf3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7.2377 -0.5404 0.1685 0.9559 10.4849
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.15211 0.02950 -72.960 <2e-16 ***
## hpsa101 0.05352 0.03149 1.700 0.0912 .
## hpsa102 0.05903 0.03421 1.725 0.0864 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 3.796133)
##
## Null deviance: 607.13 on 160 degrees of freedom
## Residual deviance: 594.63 on 158 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 4
Dispersion parameter for quasi poissan is 3.79 with similar substantive difference, hence, I would use this model.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:gtsummary':
##
## select
## The following object is masked from 'package:srvyr':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
fit_nb<- glm.nb(preterm ~ offset(log(births1416)) + hpsa10,
data=ahrf3)
summary(fit_nb)
##
## Call:
## glm.nb(formula = preterm ~ offset(log(births1416)) + hpsa10,
## data = ahrf3, init.theta = 119.2885051, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1721 -0.4639 0.0305 0.6846 2.4871
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.13528 0.02620 -81.491 <2e-16 ***
## hpsa101 0.06136 0.03155 1.945 0.0518 .
## hpsa102 0.03740 0.03165 1.181 0.2374
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(119.2885) family taken to be 1)
##
## Null deviance: 127.26 on 160 degrees of freedom
## Residual deviance: 123.44 on 158 degrees of freedom
## AIC: 1254.9
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 119.3
## Std. Err.: 23.5
##
## 2 x log-likelihood: -1246.861
exp(coef(fit_nb))
## (Intercept) hpsa101 hpsa102
## 0.1182118 1.0632823 1.0381035
Counties that have shortage of hospitals have 6.3% of increased risk of preterm birth whereas counties that have partial shortage of hospitals have 3.8% of increased risk of preterm birth.
Negetive binomial model has much lower AIC value compared to poisson model.
Relative risk model
pbrate<-(sum(ahrf3$preterm, na.rm=T)/sum(ahrf3$births1416, na.rm=T))
pbrate
## [1] 0.122197
12% (on average) of preterm births can be applied to each county’s births.
ahrf3$E <- pbrate*ahrf3$births1416
head(ahrf3[, c("coname", "births1416", "preterm", "E")])
ahrf3%>%
ggplot( aes(preterm))+geom_histogram()+ggtitle("Distribution of Preterm Births", "y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ahrf3%>%
ggplot( aes(preterm/E))+geom_histogram()+ggtitle("Distribution of Standardized Preterm Births", "y/E")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
fit_pois_E<-glm(preterm ~ offset(log(E+.000001)) + hpsa10,
family=poisson,
data=ahrf3)
summary(fit_pois_E)
##
## Call:
## glm(formula = preterm ~ offset(log(E + 1e-06)) + hpsa10, family = poisson,
## data = ahrf3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7.2377 -0.5404 0.1685 0.9559 10.4849
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.04999 0.01514 -3.302 0.000959 ***
## hpsa101 0.05352 0.01616 3.312 0.000928 ***
## hpsa102 0.05903 0.01756 3.361 0.000775 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 607.13 on 160 degrees of freedom
## Residual deviance: 594.63 on 158 degrees of freedom
## AIC: 1609.3
##
## Number of Fisher Scoring iterations: 4
fit_bin<-glm(cbind(preterm, births1416)~ hpsa10,
family=binomial,
data=ahrf3)
summary(fit_bin)
##
## Call:
## glm(formula = cbind(preterm, births1416) ~ hpsa10, family = binomial,
## data = ahrf3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.8548 -0.5108 0.1594 0.9005 9.8501
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.15211 0.01600 -134.549 < 2e-16 ***
## hpsa101 0.05352 0.01708 3.133 0.00173 **
## hpsa102 0.05903 0.01857 3.179 0.00148 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 540.39 on 160 degrees of freedom
## Residual deviance: 529.23 on 158 degrees of freedom
## AIC: 1525.1
##
## Number of Fisher Scoring iterations: 4
In this model, we see the same results as the Poisson model, except the difference in the AIC for the binomial model which is significantly lower. Hence, binomial count model is the best model for this study.