library(car)
library(stargazer)
library(survey)
library(sjPlot)
library(ggplot2)
library(tigris)
library(haven)
tb19 <- read_dta("state_19_working_pudf.dta")


nams<-names(tb19)
head(nams, n=10)
##  [1] "year"     "ststr"    "state"    "fmonth"   "dispcode" "orgseqno"
##  [7] "c01q01"   "fairpoor" "c02q01"   "phyng"
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(tb19)<-newnames

Recode

#recode

tb19$dia<-Recode(tb19$m02q04, recodes = "88=0; 77=NA; 99=NA")


#age
tb19$agegr4 = as.factor(tb19$agegr4)
tb19$age<-Recode(tb19$agegr4, recodes="1='18-19'; 2='30-44'; 3='45-64'; 4='65+'; else=NA", as.factor=T)

#insurance
tb19$c03q01 = as.factor(tb19$c03q01)
tb19$ins<-Recode(tb19$c03q01, recodes ="1=1;2=0")

#employment
tb19$c08q14 = as.factor(tb19$c08q14)
tb19$employ<-Recode(tb19$c08q14, recodes="1:2='Employed'; 3:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
 tb19$employ<-relevel(tb19$employ, ref='Employed')
 

#education level
 tb19$educat4 = as.factor(tb19$educat4)
tb19$educ<-Recode(tb19$educat4, recodes="1='leshs'; 2='hsgrad'; 3='sumcol'; 4='colgrad'", as.factor=T)
tb19$educ<-relevel(tb19$educ, ref='hsgrad')

#race/ethnicity
tb19$race = as.factor(tb19$race)

tb19$black<-Recode(tb19$race, recodes="2=1; 9=NA; else=0")
tb19$white<-Recode(tb19$race, recodes="1=1; 9=NA; else=0")
tb19$other<-Recode(tb19$race, recodes="3:7=1; 9=NA; else=0")
tb19$hispanic<-Recode(tb19$race, recodes="8=1; else=0")

tb19$racegr5 = as.factor(tb19$racegr5)
tb19$ethn<-Recode(tb19$racegr5, recodes="1='nhwhite'; 2='nh black'; 3='Hispanic';4='othernh'; 5='multinh'; else=NA", as.factor = T)

#smoking currently
tb19$smoker2 = as.factor(tb19$smoker2)

tb19$smoke<-Recode(tb19$smoker2, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor=T)
tb19$smoke<-relevel(tb19$smoke, ref = "NeverSmoked")

#Poor or fair self rated health
tb19$fairpoor = as.factor(tb19$fairpoor)
tb19$genhlth<-Recode(tb19$fairpoor, recodes="1='poor'; 2='good'")


#sex
tb19$sex = as.factor(tb19$sex)
tb19$sex<-Recode(tb19$sex, recodes="1='Male'; 2='Female'")


#marital status
tb19$c08q05 = as.factor(tb19$c08q05)
tb19$marital<-Recode(tb19$c08q05, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor=T)
tb19$marital<-relevel(tb19$marital, ref='married')

#income
tb19$c08q16 = as.factor(tb19$c08q16)
tb19$income<-Recode(tb19$c08q16, recodes="1='less10k';2='10-15k'; 3='15-20k'; 4='20-25k'; 5='25-35';6='35-50k';7='50-75';8='75k'; else=NA")

Hist and Summary

hist(tb19$dia)

summary(tb19$dia)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   1.000   3.000   3.935   4.000  76.000   10504

Define count Outcome:

#1) Define a count outcome for the data set of your choosing: count count outcome is the times a health professional has been seen for diabetes in the past 12 months. 

  # a. State a research question about your outcome
        #Which populations visit health professional for diabetes the most?


   #b. Is an offset term necessary? why or why not?
    # No, because the counts over time are the same for my outcome variable.

Subset Data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
sub19<-tb19%>%
  select(dia,smoke,genhlth,sex,age,ins,employ,educ,ethn,income,llcpwt, ststr) %>%
  filter( complete.cases(.))

#survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~llcpwt, data =sub19 )

Consider a Poisson regression model for the outcome

#2) #a. Evaluate the level of dispersion in the outcome
   #b. Is the Poisson model a good choice? The scale number is 2.013713, the Poisson model is not a good choice.


#simple descriptives
svyhist(~dia, des)

svyby(~dia, ~ethn+ins, des, svymean, na.rm=T)
svyby(~dia, ~genhlth, des, svymean, na.rm=T)
#Poisson glm fit to survey data
fit1<-svyglm(dia~factor(smoke)+factor(genhlth)+factor(sex)+factor(age)+factor(ins)+factor(employ)+factor(educ)+factor(ethn)+factor(income), design=des, family=poisson)

summary(fit1)
## 
## Call:
## svyglm(formula = dia ~ factor(smoke) + factor(genhlth) + factor(sex) + 
##     factor(age) + factor(ins) + factor(employ) + factor(educ) + 
##     factor(ethn) + factor(income), design = des, family = poisson)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = sub19)
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            0.09945    0.43877   0.227  0.82073    
## factor(smoke)Current  -0.04350    0.23157  -0.188  0.85103    
## factor(smoke)Former   -0.06422    0.13399  -0.479  0.63182    
## factor(genhlth)poor    0.47614    0.11949   3.985 7.15e-05 ***
## factor(sex)Male        0.08183    0.13830   0.592  0.55415    
## factor(age)30-44       0.43053    0.35624   1.209  0.22707    
## factor(age)45-64       0.25331    0.20071   1.262  0.20716    
## factor(age)65+        -0.02424    0.22935  -0.106  0.91584    
## factor(ins)1           0.61324    0.32719   1.874  0.06113 .  
## factor(employ)nilf     0.26355    0.23658   1.114  0.26550    
## factor(employ)retired  0.49501    0.17919   2.763  0.00582 ** 
## factor(employ)unable   0.50446    0.21102   2.391  0.01697 *  
## factor(educ)colgrad    0.29702    0.20665   1.437  0.15089    
## factor(educ)leshs      0.18601    0.18166   1.024  0.30605    
## factor(educ)sumcol     0.26037    0.17444   1.493  0.13579    
## factor(ethn)multinh   -0.51234    0.19779  -2.590  0.00970 ** 
## factor(ethn)nh black  -0.31056    0.17657  -1.759  0.07885 .  
## factor(ethn)nhwhite   -0.19995    0.13663  -1.463  0.14359    
## factor(ethn)othernh   -0.43052    0.25415  -1.694  0.09052 .  
## factor(income)15-20k  -0.06997    0.19351  -0.362  0.71773    
## factor(income)20-25k  -0.03118    0.26980  -0.116  0.90803    
## factor(income)25-35   -0.16057    0.23523  -0.683  0.49499    
## factor(income)35-50k  -0.19209    0.21457  -0.895  0.37084    
## factor(income)50-75   -0.24500    0.22381  -1.095  0.27387    
## factor(income)75k     -0.19404    0.27701  -0.700  0.48375    
## factor(income)less10k  0.17866    0.32567   0.549  0.58338    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 6.148811)
## 
## Number of Fisher Scoring iterations: 6
#poisson risk ratios
round(exp(summary(fit1)$coef[-1,1]), 3)
##  factor(smoke)Current   factor(smoke)Former   factor(genhlth)poor 
##                 0.957                 0.938                 1.610 
##       factor(sex)Male      factor(age)30-44      factor(age)45-64 
##                 1.085                 1.538                 1.288 
##        factor(age)65+          factor(ins)1    factor(employ)nilf 
##                 0.976                 1.846                 1.302 
## factor(employ)retired  factor(employ)unable   factor(educ)colgrad 
##                 1.641                 1.656                 1.346 
##     factor(educ)leshs    factor(educ)sumcol   factor(ethn)multinh 
##                 1.204                 1.297                 0.599 
##  factor(ethn)nh black   factor(ethn)nhwhite   factor(ethn)othernh 
##                 0.733                 0.819                 0.650 
##  factor(income)15-20k  factor(income)20-25k   factor(income)25-35 
##                 0.932                 0.969                 0.852 
##  factor(income)35-50k   factor(income)50-75     factor(income)75k 
##                 0.825                 0.783                 0.824 
## factor(income)less10k 
##                 1.196

Deviance

fit2<-glm(dia~factor(smoke)+factor(genhlth)+factor(sex)+factor(age)+factor(ins)+factor(employ)+factor(educ)+factor(ethn)+factor(income), data=tb19, family=poisson)
summary(fit2)
## 
## Call:
## glm(formula = dia ~ factor(smoke) + factor(genhlth) + factor(sex) + 
##     factor(age) + factor(ins) + factor(employ) + factor(educ) + 
##     factor(ethn) + factor(income), family = poisson, data = tb19)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.3483  -1.4197  -0.4070   0.3758  16.5912  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            0.529031   0.181763   2.911  0.00361 ** 
## factor(smoke)Current  -0.240140   0.046484  -5.166 2.39e-07 ***
## factor(smoke)Former   -0.135687   0.032190  -4.215 2.50e-05 ***
## factor(genhlth)poor    0.233341   0.030692   7.603 2.90e-14 ***
## factor(sex)Male       -0.056083   0.029970  -1.871  0.06130 .  
## factor(age)30-44       0.485152   0.179542   2.702  0.00689 ** 
## factor(age)45-64       0.384740   0.172650   2.228  0.02585 *  
## factor(age)65+         0.369221   0.174538   2.115  0.03439 *  
## factor(ins)1           0.293865   0.056902   5.164 2.41e-07 ***
## factor(employ)nilf     0.181283   0.056551   3.206  0.00135 ** 
## factor(employ)retired  0.227503   0.044671   5.093 3.53e-07 ***
## factor(employ)unable   0.632710   0.047352  13.362  < 2e-16 ***
## factor(educ)colgrad   -0.097366   0.041992  -2.319  0.02041 *  
## factor(educ)leshs     -0.078257   0.044221  -1.770  0.07678 .  
## factor(educ)sumcol    -0.022308   0.037676  -0.592  0.55378    
## factor(ethn)multinh   -0.158384   0.131809  -1.202  0.22951    
## factor(ethn)nh black  -0.242841   0.052035  -4.667 3.06e-06 ***
## factor(ethn)nhwhite   -0.092144   0.034679  -2.657  0.00788 ** 
## factor(ethn)othernh   -0.006529   0.076922  -0.085  0.93236    
## factor(income)15-20k   0.030472   0.058567   0.520  0.60286    
## factor(income)20-25k   0.049655   0.059272   0.838  0.40217    
## factor(income)25-35    0.051704   0.063690   0.812  0.41690    
## factor(income)35-50k  -0.078388   0.063553  -1.233  0.21742    
## factor(income)50-75   -0.075047   0.065047  -1.154  0.24861    
## factor(income)75k      0.018270   0.063795   0.286  0.77459    
## factor(income)less10k  0.322583   0.059490   5.422 5.88e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 6016.3  on 1346  degrees of freedom
## Residual deviance: 5356.7  on 1321  degrees of freedom
##   (10950 observations deleted due to missingness)
## AIC: 9016.5
## 
## Number of Fisher Scoring iterations: 6
scale<-sqrt(fit2$deviance/fit2$df.residual)
scale
## [1] 2.013713

Deviance as test model with chi square and degrees of freedom.

1-pchisq(fit2$deviance, df = fit2$df.residual)
## [1] 0
#the model does not fit the data because the p-value is zero. 

fit3<-glm(dia~factor(smoke) + factor(genhlth) + factor(sex) + 
    factor(age) + factor(ins) + factor(employ) + factor(educ) + 
    factor(ethn) + factor(income), data=tb19, family=quasipoisson)
summary(fit3)
## 
## Call:
## glm(formula = dia ~ factor(smoke) + factor(genhlth) + factor(sex) + 
##     factor(age) + factor(ins) + factor(employ) + factor(educ) + 
##     factor(ethn) + factor(income), family = quasipoisson, data = tb19)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.3483  -1.4197  -0.4070   0.3758  16.5912  
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            0.529031   0.493619   1.072  0.28403    
## factor(smoke)Current  -0.240140   0.126237  -1.902  0.05735 .  
## factor(smoke)Former   -0.135687   0.087420  -1.552  0.12087    
## factor(genhlth)poor    0.233341   0.083351   2.800  0.00519 ** 
## factor(sex)Male       -0.056083   0.081389  -0.689  0.49090    
## factor(age)30-44       0.485152   0.487585   0.995  0.31991    
## factor(age)45-64       0.384740   0.468869   0.821  0.41204    
## factor(age)65+         0.369221   0.473998   0.779  0.43615    
## factor(ins)1           0.293865   0.154530   1.902  0.05743 .  
## factor(employ)nilf     0.181283   0.153578   1.180  0.23805    
## factor(employ)retired  0.227503   0.121313   1.875  0.06097 .  
## factor(employ)unable   0.632710   0.128594   4.920 9.73e-07 ***
## factor(educ)colgrad   -0.097366   0.114038  -0.854  0.39336    
## factor(educ)leshs     -0.078257   0.120093  -0.652  0.51475    
## factor(educ)sumcol    -0.022308   0.102318  -0.218  0.82744    
## factor(ethn)multinh   -0.158384   0.357957  -0.442  0.65823    
## factor(ethn)nh black  -0.242841   0.141312  -1.718  0.08595 .  
## factor(ethn)nhwhite   -0.092144   0.094177  -0.978  0.32805    
## factor(ethn)othernh   -0.006529   0.208898  -0.031  0.97507    
## factor(income)15-20k   0.030472   0.159052   0.192  0.84810    
## factor(income)20-25k   0.049655   0.160965   0.308  0.75776    
## factor(income)25-35    0.051704   0.172963   0.299  0.76504    
## factor(income)35-50k  -0.078388   0.172593  -0.454  0.64978    
## factor(income)50-75   -0.075047   0.176650  -0.425  0.67102    
## factor(income)75k      0.018270   0.173250   0.105  0.91603    
## factor(income)less10k  0.322583   0.161559   1.997  0.04606 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 7.375155)
## 
##     Null deviance: 6016.3  on 1346  degrees of freedom
## Residual deviance: 5356.7  on 1321  degrees of freedom
##   (10950 observations deleted due to missingness)
## AIC: NA
## 
## Number of Fisher Scoring iterations: 6

#3) Consider a Negative binomial model

#use sample weights to fit negative binomial model
library(lmtest)
library(sandwich)
coeftest(fit2, vcov=vcovHC(fit2, type="HC1",cluster="orgseqno"))
## 
## z test of coefficients:
## 
##                        Estimate Std. Error z value  Pr(>|z|)    
## (Intercept)            0.529031   0.288499  1.8337 0.0666928 .  
## factor(smoke)Current  -0.240140   0.126352 -1.9006 0.0573598 .  
## factor(smoke)Former   -0.135687   0.092659 -1.4644 0.1430947    
## factor(genhlth)poor    0.233341   0.105647  2.2087 0.0271963 *  
## factor(sex)Male       -0.056083   0.084657 -0.6625 0.5076702    
## factor(age)30-44       0.485152   0.305325  1.5890 0.1120671    
## factor(age)45-64       0.384740   0.222489  1.7293 0.0837630 .  
## factor(age)65+         0.369221   0.241198  1.5308 0.1258227    
## factor(ins)1           0.293865   0.254163  1.1562 0.2475957    
## factor(employ)nilf     0.181283   0.172496  1.0509 0.2932857    
## factor(employ)retired  0.227503   0.118418  1.9212 0.0547087 .  
## factor(employ)unable   0.632710   0.182881  3.4597 0.0005408 ***
## factor(educ)colgrad   -0.097366   0.146647 -0.6639 0.5067225    
## factor(educ)leshs     -0.078257   0.135179 -0.5789 0.5626449    
## factor(educ)sumcol    -0.022308   0.118454 -0.1883 0.8506207    
## factor(ethn)multinh   -0.158384   0.208579 -0.7593 0.4476456    
## factor(ethn)nh black  -0.242841   0.112079 -2.1667 0.0302581 *  
## factor(ethn)nhwhite   -0.092144   0.106277 -0.8670 0.3859344    
## factor(ethn)othernh   -0.006529   0.152175 -0.0429 0.9657776    
## factor(income)15-20k   0.030472   0.162798  0.1872 0.8515232    
## factor(income)20-25k   0.049655   0.179288  0.2770 0.7818131    
## factor(income)25-35    0.051704   0.165020  0.3133 0.7540382    
## factor(income)35-50k  -0.078388   0.191578 -0.4092 0.6824152    
## factor(income)50-75   -0.075047   0.162132 -0.4629 0.6434525    
## factor(income)75k      0.018270   0.173912  0.1051 0.9163351    
## factor(income)less10k  0.322583   0.172403  1.8711 0.0613318 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#fit negative binomail glm
#library(MASS)
#fit_nb<- glm.nb(lowbw1416 ~ offset(log(births1416)) + hpsa10, 
               #data=arf2018sub)
#summary(fit_nb)


library(stargazer)
library(MASS)
fit.nb1<-glm.nb(dia~factor(smoke)+factor(genhlth)+factor(sex)+factor(age)+factor(ins)+factor(employ)+factor(educ)+factor(ethn)+factor(income),
              data=sub19,
              weights=llcpwt/mean(llcpwt, na.rm=T))

fit.nb2<-glm.nb(dia~factor(smoke)+factor(genhlth)+factor(sex)+factor(age)+factor(ins)+factor(employ)+factor(educ)+factor(ethn)+factor(income),
              data=sub19,
              weights=llcpwt/mean(llcpwt, na.rm=T))
#clx2(fit.nb2,cluster =sub$ststr)
tests1<-coeftest(fit.nb1, vcov=vcovHC(fit.nb2, type="HC1",cluster="orgseqno"))
tests<-coeftest(fit.nb2, vcov=vcovHC(fit.nb2, type="HC1",cluster="orgseqno"))


stargazer(fit.nb1, fit.nb2,style="demography", type = "text", t.auto=F,p.auto=F,coef=list(tests1[, 1],tests[,1]),  se =list(tests1[, 2], tests[, 2]), p=list(tests1[,4],tests[, 4])   )
## 
## -------------------------------------------------------
##                                      dia               
##                           Model 1          Model 2     
## -------------------------------------------------------
## factor(smoke)Current       -0.155           -0.155     
##                           (0.195)          (0.195)     
## factor(smoke)Former        -0.048           -0.048     
##                           (0.117)          (0.117)     
## factor(genhlth)poor       0.472***         0.472***    
##                           (0.104)          (0.104)     
## factor(sex)Male            0.045            0.045      
##                           (0.120)          (0.120)     
## factor(age)30-44           0.283            0.283      
##                           (0.312)          (0.312)     
## factor(age)45-64           0.298            0.298      
##                           (0.199)          (0.199)     
## factor(age)65+             0.125            0.125      
##                           (0.217)          (0.217)     
## factor(ins)1               0.508            0.508      
##                           (0.262)          (0.262)     
## factor(employ)nilf         0.207            0.207      
##                           (0.207)          (0.207)     
## factor(employ)retired      0.360*           0.360*     
##                           (0.155)          (0.155)     
## factor(employ)unable       0.452*           0.452*     
##                           (0.204)          (0.204)     
## factor(educ)colgrad        0.199            0.199      
##                           (0.172)          (0.172)     
## factor(educ)leshs          0.134            0.134      
##                           (0.162)          (0.162)     
## factor(educ)sumcol         0.234            0.234      
##                           (0.162)          (0.162)     
## factor(ethn)multinh       -0.431*          -0.431*     
##                           (0.191)          (0.191)     
## factor(ethn)nh black       -0.225           -0.225     
##                           (0.167)          (0.167)     
## factor(ethn)nhwhite        -0.156           -0.156     
##                           (0.123)          (0.123)     
## factor(ethn)othernh        -0.294           -0.294     
##                           (0.221)          (0.221)     
## factor(income)15-20k       -0.125           -0.125     
##                           (0.199)          (0.199)     
## factor(income)20-25k       0.029            0.029      
##                           (0.266)          (0.266)     
## factor(income)25-35        -0.197           -0.197     
##                           (0.226)          (0.226)     
## factor(income)35-50k       -0.220           -0.220     
##                           (0.211)          (0.211)     
## factor(income)50-75        -0.247           -0.247     
##                           (0.222)          (0.222)     
## factor(income)75k          -0.219           -0.219     
##                           (0.255)          (0.255)     
## factor(income)less10k      0.092            0.092      
##                           (0.349)          (0.349)     
## Constant                   0.259            0.259      
##                           (0.392)          (0.392)     
## N                          1,347            1,347      
## Log Likelihood           -3,164.959       -3,164.959   
## theta                 1.803*** (0.101) 1.803*** (0.101)
## AIC                      6,381.918        6,381.918    
## -------------------------------------------------------
## *p < .05; **p < .01; ***p < .001

#4) Compare the model fits of the alternative models using AIC

AIC(fit.nb1) 
## [1] 6381.918
AIC(fit.nb2)
## [1] 6381.918
AIC(fit1)
##      eff.p        AIC   deltabar 
##  535.44355 5919.10504   21.41774
AIC(fit2)
## [1] 9016.534
#Based on the model numbers, the model that best fits is fit1 with AIC of 5919.10504 with the most parsimonious fit.