library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
temp<-tempfile()
download.file("https://data.hrsa.gov/DataDownload/AHRF/AHRF_2019-2020_SAS.zip",temp)
ahrf<-haven::read_sas(unz(temp,filename="ahrf2020.sas7bdat"))
rm(temp)
ahrf4<-ahrf%>%
mutate(cofips=f00004,
coname=f00010,
state=f00011,
popn=f1198416,
fampov=f1443214,
teenbirth=f1360613,
hpsa16=case_when(.$f0978716==0~'no shortage',
.$f0978716==1~'whole county shortage',
.$f0978716==2~'partial county shortage'))%>%
dplyr::select(cofips,coname,state,popn,fampov,teenbirth,hpsa16)%>%
filter(complete.cases(.))%>%
as.data.frame()
2) Consider a Poisson regression model for the outcome
library(gtsummary)
glmp<-glm(teenbirth~fampov+hpsa16,
data=ahrf4,
family=poisson)
glmp %>% tbl_regression(exp=TRUE)
| Characteristic | IRR1 | 95% CI1 | p-value |
|---|---|---|---|
| % Families Below Poverty Level 2014-18 | 1.02 | 1.02, 1.02 | <0.001 |
| hpsa16 | |||
| no shortage | — | — | |
| partial county shortage | 2.45 | 2.37, 2.54 | <0.001 |
| whole county shortage | 0.76 | 0.73, 0.80 | <0.001 |
| 1 IRR = Incidence Rate Ratio, CI = Confidence Interval | |||
a. Evaluate the level of dispersion in the outcome
The IRR for Whole County Shortage (of primary care physicians) was .01 higher than the original Poisson and was not significant in this case. According to the AIC, the dispersed Poisson model is a better fit than the Poisson.
b. Is the Poisson model a good choice? The model revealed somewhat what I expected in that having a higher portion of families living below the poverty line and a partial shortage of physicians leads to higher teen pregnancy. I was surprised to see that a total shortage of physicians shows a lower liklihood of teen pregnancy but that result was not significant in the dispersed model.
library(dispmod)
glmp_d<-glm.poisson.disp(glmp,verbose = F)
glmp_d%>%tbl_regression(exp=T)
| Characteristic | IRR1 | 95% CI1 | p-value |
|---|---|---|---|
| % Families Below Poverty Level 2014-18 | 1.02 | 1.00, 1.04 | 0.11 |
| hpsa16 | |||
| no shortage | — | — | |
| partial county shortage | 2.45 | 1.77, 3.49 | <0.001 |
| whole county shortage | 0.77 | 0.47, 1.26 | 0.3 |
| 1 IRR = Incidence Rate Ratio, CI = Confidence Interval | |||
AIC(glmp,glmp_d)
## df AIC
## glmp 4 103041.5477
## glmp_d 4 469.2604
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:gtsummary':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
3) Consider a Negative binomial model
glmnb<-glm.nb(teenbirth~fampov+hpsa16,
data=ahrf4)
glmnb%>%tbl_regression()
| Characteristic | log(IRR)1 | 95% CI1 | p-value |
|---|---|---|---|
| % Families Below Poverty Level 2014-18 | 0.02 | 0.01, 0.03 | 0.002 |
| hpsa16 | |||
| no shortage | — | — | |
| partial county shortage | 0.90 | 0.72, 1.1 | <0.001 |
| whole county shortage | -0.26 | -0.51, 0.00 | 0.046 |
| 1 IRR = Incidence Rate Ratio, CI = Confidence Interval | |||
4) Compare the model fits of the alternative models using AIC
AIC(glmp,glmnb)
## df AIC
## glmp 4 103041.55
## glmnb 5 11387.05
According to the AIC, the negative binomial model is a much better fit than the poisson given that the AIC of the negative binomial model is over 91,000 AIC points below the Poisson model.