library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
temp<-tempfile()
download.file("https://data.hrsa.gov/DataDownload/AHRF/AHRF_2019-2020_SAS.zip",temp)
ahrf<-haven::read_sas(unz(temp,filename="ahrf2020.sas7bdat"))
rm(temp)
ahrf4<-ahrf%>%
  mutate(cofips=f00004,
         coname=f00010,
         state=f00011,
         popn=f1198416,
         fampov=f1443214,
         teenbirth=f1360613,
         hpsa16=case_when(.$f0978716==0~'no shortage',
                          .$f0978716==1~'whole county shortage',
                          .$f0978716==2~'partial county shortage'))%>%
  dplyr::select(cofips,coname,state,popn,fampov,teenbirth,hpsa16)%>%
  filter(complete.cases(.))%>%
  as.data.frame()

2) Consider a Poisson regression model for the outcome

library(gtsummary)
glmp<-glm(teenbirth~fampov+hpsa16,
          data=ahrf4,
          family=poisson)
glmp %>% tbl_regression(exp=TRUE)
Characteristic IRR1 95% CI1 p-value
% Families Below Poverty Level 2014-18 1.02 1.02, 1.02 <0.001
hpsa16
no shortage
partial county shortage 2.45 2.37, 2.54 <0.001
whole county shortage 0.76 0.73, 0.80 <0.001
1 IRR = Incidence Rate Ratio, CI = Confidence Interval

a. Evaluate the level of dispersion in the outcome

The IRR for Whole County Shortage (of primary care physicians) was .01 higher than the original Poisson and was not significant in this case. According to the AIC, the dispersed Poisson model is a better fit than the Poisson.

b. Is the Poisson model a good choice? The model revealed somewhat what I expected in that having a higher portion of families living below the poverty line and a partial shortage of physicians leads to higher teen pregnancy. I was surprised to see that a total shortage of physicians shows a lower liklihood of teen pregnancy but that result was not significant in the dispersed model.

library(dispmod)
glmp_d<-glm.poisson.disp(glmp,verbose = F)
glmp_d%>%tbl_regression(exp=T)
Characteristic IRR1 95% CI1 p-value
% Families Below Poverty Level 2014-18 1.02 1.00, 1.04 0.11
hpsa16
no shortage
partial county shortage 2.45 1.77, 3.49 <0.001
whole county shortage 0.77 0.47, 1.26 0.3
1 IRR = Incidence Rate Ratio, CI = Confidence Interval
AIC(glmp,glmp_d)
##        df         AIC
## glmp    4 103041.5477
## glmp_d  4    469.2604
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:gtsummary':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select

3) Consider a Negative binomial model

glmnb<-glm.nb(teenbirth~fampov+hpsa16,
          data=ahrf4)
glmnb%>%tbl_regression()
Characteristic log(IRR)1 95% CI1 p-value
% Families Below Poverty Level 2014-18 0.02 0.01, 0.03 0.002
hpsa16
no shortage
partial county shortage 0.90 0.72, 1.1 <0.001
whole county shortage -0.26 -0.51, 0.00 0.046
1 IRR = Incidence Rate Ratio, CI = Confidence Interval

4) Compare the model fits of the alternative models using AIC

AIC(glmp,glmnb)
##       df       AIC
## glmp   4 103041.55
## glmnb  5  11387.05

According to the AIC, the negative binomial model is a much better fit than the poisson given that the AIC of the negative binomial model is over 91,000 AIC points below the Poisson model.