This example will cover the use of R functions for fitting count data models to complex survey data.

For this example I am using 2011 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART county data. Link

#load brfss
library(car)
library(survey)
## Loading required package: grid
## 
## Attaching package: 'survey'
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(MASS)
library(pscl)
## Loading required package: lattice
## Classes and Methods for R developed in the
## 
## Political Science Computational Laboratory
## 
## Department of Political Science
## 
## Stanford University
## 
## Simon Jackman
## 
## hurdle and zeroinfl functions by Achim Zeileis
load("~/Google Drive/dem7283/data/brfss_11.Rdata")

#The names in the data are very ugly, so I make them less ugly
nams<-names(brfss_11)
head(nams, n=10)
##  [1] "_STATE"   "precall"  "fmonth"   "intvid"   "seqno"    "_PSU"    
##  [7] "ctelenum" "cellfon"  "pvtresid" "genhlth"
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
newnames<-gsub(pattern = "_",replacement =  "",x =  nams)
names(brfss_11)<-tolower(newnames)

#Outcome is: Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good?
brfss_11$healthydays<-ifelse(brfss_11$physhlth %in%c(77,99), NA,ifelse(brfss_11$physhlth==88,0, brfss_11$physhlth))


#race/ethnicity
brfss_11$black<-recode(brfss_11$racegr2, recodes="2=1; 9=NA; else=0")
brfss_11$white<-recode(brfss_11$racegr2, recodes="1=1; 9=NA; else=0")
brfss_11$other<-recode(brfss_11$racegr2, recodes="3:4=1; 9=NA; else=0")
brfss_11$hispanic<-recode(brfss_11$racegr2, recodes="5=1; 9=NA; else=0")
brfss_11$race_group<-recode(brfss_11$racegr2, recodes="1='NH white'; 2='NH black'; 3:4='NH other';5='hispanic'; else=NA", as.factor.result = T)
brfss_11$race_group<-relevel(brfss_11$race_group, ref = 'NH white')
#insurance
brfss_11$ins<-ifelse(brfss_11$hlthpln1==1,1,0)

#income grouping
brfss_11$inc<-ifelse(brfss_11$incomg==9, NA, brfss_11$incomg)

#education level
brfss_11$educ<-recode(brfss_11$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor.result=T)
#brfss_11$educ<-relevel(brfss_11$educ, ref='0Prim')

#employment
brfss_11$employ<-recode(brfss_11$employ, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor.result=T)
brfss_11$employ<-relevel(brfss_11$employ, ref='Employed')

#marital status
brfss_11$marst<-recode(brfss_11$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor.result=T)
brfss_11$marst<-relevel(brfss_11$marst, ref='married')

#Age cut into intervals
brfss_11$agec<-cut(brfss_11$age, breaks=c(0,24,39,59,79,99))

Analysis

First, we will subset our data to have complete cases for our variables in our model and make our survey design object

#Here I keep complete cases on my key variables, AND only cases for TX, just for speed (the suvey procedures can run for a long time)
brfss_11<-brfss_11[is.na(brfss_11$healthydays)==F&is.na(brfss_11$cntywt)==F&is.na(brfss_11$black)==F&is.na(brfss_11$educ)==F&brfss_11$state==48,]

#FOR THE BRFSS, R GAVE ME A WARNING AND I NEEDED TO ADD:
options(survey.lonely.psu = "adjust")
#YOU MAY NOT NEED TO DO THIS!!!!
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~psu, strata=~ststr, weights=~cntywt, data = brfss_11[is.na(brfss_11$cntywt)==F,], nest=T )

Poisson regression example

To fit a Poisson GLM to survey data in R, we use the svyglm fucntion in the survey library.

#First I do some simple descriptives
svyhist(~healthydays, des)

plot of chunk unnamed-chunk-3

svyby(~healthydays, ~race_group+educ, des, svymean)
##                   race_group     educ healthydays     se
## NH white.0Prim      NH white    0Prim      2.2098 0.7945
## hispanic.0Prim      hispanic    0Prim      4.4616 0.7622
## NH black.0Prim      NH black    0Prim      7.9453 3.7086
## NH other.0Prim      NH other    0Prim      2.3599 1.8957
## NH white.1somehs    NH white  1somehs      4.3014 1.2629
## hispanic.1somehs    hispanic  1somehs      2.7066 0.5954
## NH black.1somehs    NH black  1somehs      5.5027 1.4486
## NH other.1somehs    NH other  1somehs      4.9275 2.7237
## NH white.2hsgrad    NH white  2hsgrad      3.8612 0.5174
## hispanic.2hsgrad    hispanic  2hsgrad      3.2261 0.5848
## NH black.2hsgrad    NH black  2hsgrad      4.3885 0.9617
## NH other.2hsgrad    NH other  2hsgrad      6.1952 2.0821
## NH white.3somecol   NH white 3somecol      3.6084 0.3534
## hispanic.3somecol   hispanic 3somecol      2.9251 0.5488
## NH black.3somecol   NH black 3somecol      3.3241 0.5994
## NH other.3somecol   NH other 3somecol      2.0023 0.5217
## NH white.4colgrad   NH white 4colgrad      2.0947 0.1612
## hispanic.4colgrad   hispanic 4colgrad      2.2108 0.4453
## NH black.4colgrad   NH black 4colgrad      2.1182 0.3634
## NH other.4colgrad   NH other 4colgrad      0.9925 0.2178
#Poisson glm fit to survey data
fit1<-svyglm(healthydays~race_group+educ+agec, design=des, family=poisson)
summary(fit1)
## 
## Call:
## svyglm(formula = healthydays ~ race_group + educ + agec, design = des, 
##     family = poisson)
## 
## Survey design:
## svydesign(ids = ~psu, strata = ~ststr, weights = ~cntywt, data = brfss_11[is.na(brfss_11$cntywt) == 
##     F, ], nest = T)
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.84729    0.25721    3.29  0.00099 ***
## race_grouphispanic -0.00797    0.12196   -0.07  0.94788    
## race_groupNH black  0.15012    0.12216    1.23  0.21917    
## race_groupNH other -0.10501    0.18773   -0.56  0.57594    
## educ1somehs        -0.16561    0.20703   -0.80  0.42376    
## educ2hsgrad        -0.15005    0.18185   -0.83  0.40933    
## educ3somecol       -0.32257    0.17949   -1.80  0.07235 .  
## educ4colgrad       -0.84530    0.18183   -4.65  3.4e-06 ***
## agec(24,39]         0.36457    0.23305    1.56  0.11777    
## agec(39,59]         0.73850    0.21312    3.47  0.00053 ***
## agec(59,79]         1.07773    0.21148    5.10  3.6e-07 ***
## agec(79,99]         1.19155    0.23540    5.06  4.3e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 15.18)
## 
## Number of Fisher Scoring iterations: 7
#here are the poisson model "risk ratios", which just show the change in the mean
round(exp(summary(fit1)$coef[-1,1]), 3)
## race_grouphispanic race_groupNH black race_groupNH other 
##              0.992              1.162              0.900 
##        educ1somehs        educ2hsgrad       educ3somecol 
##              0.847              0.861              0.724 
##       educ4colgrad        agec(24,39]        agec(39,59] 
##              0.429              1.440              2.093 
##        agec(59,79]        agec(79,99] 
##              2.938              3.292

Now, R will not fit the other count data models described in the notes, so, we will fit them using sample weights only, then calculate the robust standard errors. We standardize the weights to equal the sample size, as opposed to the population size by dividing each person’s weight by the mean weight.

#First, I define a function to get the clustered, or robust standard errors. This function effectively controls for the within-strata homogeneity when calculateing the se's for the betas. 

#I stole this from: http://drewdimmery.com/robust-ses-in-r/
#and http://people.su.se/~ma/clustering.pdf
#I also added a correction to use this with the hurdle and zero-inflated models
#This is how stata gets robust se's

clx2 <- 
  function(fm, dfcw,  cluster){
    # R-codes (www.r-project.org) for computing
    # clustered-standard errors. Mahmood Arai, Jan 26, 2008.
    
    # The arguments of the function are:
    # fitted model, cluster1
    # You need to install libraries `sandwich' and `lmtest'
    
    # reweighting the var-cov matrix for the within model
    require(sandwich);require(lmtest)
    if(class(fm)=="zeroinfl"|class(fm)=="hurdle") {
    M <- length(unique(cluster))   
    N <- length(cluster)           
    K <- dim(fm$vcov)[1]        #here is the rank from the zero inflated fits             
    dfc <- (M/(M-1))*((N-1)/(N-K))  
    uj  <- apply(estfun(fm),2, function(x) tapply(x, cluster, sum));
    vcovCL <- dfc[1]*sandwich(fm, meat=crossprod(uj)/N)*dfcw #fix a length problem in dfc
    list(summary=coeftest(fm, vcovCL))}
    else if(class(fm)!="zeroinfl"){
    M <- length(unique(cluster))
 N <- length(cluster)
 K <- fm$rank
 dfc <- (M/(M - 1)) * ((N - 1)/(N - K))
 uj <- apply(estfun(fm), 2, function(x) tapply(x, cluster, sum));
 rcse.cov <- dfc * sandwich(fm, meat = crossprod(uj)/N)
 rcse.se <- coeftest(fm, rcse.cov)
 return(list( rcse.se))}

    }


#Fit poisson, and compare it to the fit from the survey design
#Fit the Poisson GLM
fit.pois<-glm(healthydays~race_group+educ+agec, data=brfss_11, weights=cntywt/mean(cntywt), family=poisson)
summary(fit.pois)
## 
## Call:
## glm(formula = healthydays ~ race_group + educ + agec, family = poisson, 
##     data = brfss_11, weights = cntywt/mean(cntywt))
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -9.484  -1.994  -0.958  -0.020  28.967  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         0.84729    0.03578   23.68  < 2e-16 ***
## race_grouphispanic -0.00797    0.01741   -0.46    0.647    
## race_groupNH black  0.15012    0.01978    7.59  3.2e-14 ***
## race_groupNH other -0.10501    0.03204   -3.28    0.001 ** 
## educ1somehs        -0.16561    0.03095   -5.35  8.8e-08 ***
## educ2hsgrad        -0.15005    0.02624   -5.72  1.1e-08 ***
## educ3somecol       -0.32257    0.02667  -12.09  < 2e-16 ***
## educ4colgrad       -0.84530    0.02880  -29.35  < 2e-16 ***
## agec(24,39]         0.36457    0.02664   13.69  < 2e-16 ***
## agec(39,59]         0.73850    0.02493   29.62  < 2e-16 ***
## agec(59,79]         1.07773    0.02661   40.50  < 2e-16 ***
## agec(79,99]         1.19155    0.03887   30.66  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 70053  on 7197  degrees of freedom
## Residual deviance: 65706  on 7186  degrees of freedom
## AIC: 74332
## 
## Number of Fisher Scoring iterations: 7
#here are the poisson model "risk ratios"
round(exp(summary(fit.pois)$coef[-1,1]), 3)
## race_grouphispanic race_groupNH black race_groupNH other 
##              0.992              1.162              0.900 
##        educ1somehs        educ2hsgrad       educ3somecol 
##              0.847              0.861              0.724 
##       educ4colgrad        agec(24,39]        agec(39,59] 
##              0.429              1.440              2.093 
##        agec(59,79]        agec(79,99] 
##              2.938              3.292
#I use psu as the clustering variable, this is legal.
#Here is the Poisson model
clx2(fit.pois, 1, brfss_11$psu)
## Loading required package: sandwich
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Warning: the condition has length > 1 and only the first element will be used
## Warning: the condition has length > 1 and only the first element will be used
## [[1]]
## 
## z test of coefficients:
## 
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         0.84729    0.25730    3.29  0.00099 ***
## race_grouphispanic -0.00797    0.12191   -0.07  0.94786    
## race_groupNH black  0.15012    0.12225    1.23  0.21947    
## race_groupNH other -0.10501    0.18779   -0.56  0.57606    
## educ1somehs        -0.16561    0.20712   -0.80  0.42393    
## educ2hsgrad        -0.15005    0.18199   -0.82  0.40966    
## educ3somecol       -0.32257    0.17950   -1.80  0.07233 .  
## educ4colgrad       -0.84530    0.18202   -4.64  3.4e-06 ***
## agec(24,39]         0.36457    0.23343    1.56  0.11833    
## agec(39,59]         0.73850    0.21314    3.46  0.00053 ***
## agec(59,79]         1.07773    0.21157    5.09  3.5e-07 ***
## agec(79,99]         1.19155    0.23548    5.06  4.2e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#which looks nearly identical to the output using the survey design from above

#Fit the Negative Binomial GLM
fit.nb<-glm.nb(healthydays~race_group+educ+agec, data=brfss_11, weights=cntywt/mean(cntywt))
summary(fit.nb)
## 
## Call:
## glm.nb(formula = healthydays ~ race_group + educ + agec, data = brfss_11, 
##     weights = cntywt/mean(cntywt), init.theta = 0.1415724034, 
##     link = log)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.442  -0.732  -0.340  -0.010   4.751  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         0.83253    0.17091    4.87  1.1e-06 ***
## race_grouphispanic  0.00791    0.08218    0.10   0.9233    
## race_groupNH black  0.04456    0.10010    0.45   0.6562    
## race_groupNH other -0.25798    0.13481   -1.91   0.0557 .  
## educ1somehs        -0.10227    0.16702   -0.61   0.5403    
## educ2hsgrad        -0.02241    0.14575   -0.15   0.8778    
## educ3somecol       -0.23319    0.14590   -1.60   0.1100    
## educ4colgrad       -0.74118    0.14829   -5.00  5.8e-07 ***
## agec(24,39]         0.28386    0.10316    2.75   0.0059 ** 
## agec(39,59]         0.66317    0.09964    6.66  2.8e-11 ***
## agec(59,79]         1.03391    0.11767    8.79  < 2e-16 ***
## agec(79,99]         1.14742    0.21767    5.27  1.4e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.1416) family taken to be 1)
## 
##     Null deviance: 5177.4  on 7197  degrees of freedom
## Residual deviance: 4985.6  on 7186  degrees of freedom
## AIC: 24366
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.14157 
##           Std. Err.:  0.00362 
## 
##  2 x log-likelihood:  -24339.97700
clx2(fit.nb, 1, brfss_11$psu)
## Warning: the condition has length > 1 and only the first element will be used
## Warning: the condition has length > 1 and only the first element will be used
## [[1]]
## 
## z test of coefficients:
## 
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         0.83253    0.25620    3.25  0.00116 ** 
## race_grouphispanic  0.00791    0.13839    0.06  0.95440    
## race_groupNH black  0.04456    0.13029    0.34  0.73232    
## race_groupNH other -0.25798    0.17526   -1.47  0.14103    
## educ1somehs        -0.10227    0.23849   -0.43  0.66804    
## educ2hsgrad        -0.02241    0.21376   -0.10  0.91651    
## educ3somecol       -0.23319    0.21359   -1.09  0.27494    
## educ4colgrad       -0.74118    0.21178   -3.50  0.00047 ***
## agec(24,39]         0.28386    0.21172    1.34  0.18001    
## agec(39,59]         0.66317    0.19239    3.45  0.00057 ***
## agec(59,79]         1.03391    0.19024    5.43  5.5e-08 ***
## agec(79,99]         1.14742    0.21532    5.33  9.9e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Fit the Zero Inflated Poisson Mixture Model
fit.zip<-zeroinfl(healthydays~race_group+educ+agec|agec, data=brfss_11, weights=cntywt/mean(cntywt), dist="poisson")
## Warning: non-integer #successes in a binomial glm!
summary(fit.zip)
## 
## Call:
## zeroinfl(formula = healthydays ~ race_group + educ + agec | agec, 
##     data = brfss_11, weights = cntywt/mean(cntywt), dist = "poisson")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4152 -0.5565 -0.2810 -0.0239 21.8913 
## 
## Count model coefficients (poisson with log link):
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.91423    0.03681   52.00  < 2e-16 ***
## race_grouphispanic -0.03890    0.01733   -2.25  0.02476 *  
## race_groupNH black -0.02766    0.01991   -1.39  0.16469    
## race_groupNH other  0.13441    0.03222    4.17  3.0e-05 ***
## educ1somehs        -0.00338    0.03119   -0.11  0.91363    
## educ2hsgrad         0.09429    0.02637    3.58  0.00035 ***
## educ3somecol       -0.11535    0.02671   -4.32  1.6e-05 ***
## educ4colgrad       -0.48553    0.02880  -16.86  < 2e-16 ***
## agec(24,39]         0.23496    0.02689    8.74  < 2e-16 ***
## agec(39,59]         0.51047    0.02534   20.14  < 2e-16 ***
## agec(59,79]         0.68021    0.02693   25.26  < 2e-16 ***
## agec(79,99]         0.70691    0.03916   18.05  < 2e-16 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.8767     0.0651   13.46  < 2e-16 ***
## agec(24,39]  -0.1046     0.0805   -1.30   0.1939    
## agec(39,59]  -0.2357     0.0768   -3.07   0.0021 ** 
## agec(59,79]  -0.4908     0.0889   -5.52  3.4e-08 ***
## agec(79,99]  -0.6823     0.1625   -4.20  2.7e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 25 
## Log-likelihood: -1.9e+04 on 17 Df
clx2(fit.zip, 1, brfss_11$psu)
## $summary
## 
## t test of coefficients:
## 
##                          Estimate Std. Error t value Pr(>|t|)    
## count_(Intercept)         1.91423    0.21788    8.79  < 2e-16 ***
## count_race_grouphispanic -0.03890    0.09573   -0.41  0.68448    
## count_race_groupNH black -0.02766    0.09706   -0.29  0.77564    
## count_race_groupNH other  0.13441    0.12932    1.04  0.29865    
## count_educ1somehs        -0.00338    0.16469   -0.02  0.98361    
## count_educ2hsgrad         0.09429    0.14245    0.66  0.50806    
## count_educ3somecol       -0.11535    0.14127   -0.82  0.41421    
## count_educ4colgrad       -0.48553    0.14344   -3.38  0.00072 ***
## count_agec(24,39]         0.23496    0.19254    1.22  0.22238    
## count_agec(39,59]         0.51047    0.17548    2.91  0.00364 ** 
## count_agec(59,79]         0.68021    0.17517    3.88  0.00010 ***
## count_agec(79,99]         0.70691    0.19222    3.68  0.00024 ***
## zero_(Intercept)          0.87669    0.15917    5.51  3.8e-08 ***
## zero_agec(24,39]         -0.10463    0.18535   -0.56  0.57242    
## zero_agec(39,59]         -0.23572    0.17246   -1.37  0.17173    
## zero_agec(59,79]         -0.49080    0.17359   -2.83  0.00471 ** 
## zero_agec(79,99]         -0.68231    0.21904   -3.11  0.00185 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Fit the Zero Inflated Negative Binomial Mixture Model
fit.zinb<-zeroinfl(healthydays~race_group+educ+agec|agec, data=brfss_11, weights=cntywt/mean(cntywt), dist="negbin")
## Warning: non-integer #successes in a binomial glm!
summary(fit.zinb)
## 
## Call:
## zeroinfl(formula = healthydays ~ race_group + educ + agec | agec, 
##     data = brfss_11, weights = cntywt/mean(cntywt), dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5311 -0.3493 -0.1741 -0.0121 14.5765 
## 
## Count model coefficients (negbin with log link):
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          1.6076     0.1455   11.05  < 2e-16 ***
## race_grouphispanic  -0.0180     0.0664   -0.27   0.7861    
## race_groupNH black  -0.0225     0.0791   -0.28   0.7759    
## race_groupNH other  -0.1060     0.1210   -0.88   0.3811    
## educ1somehs         -0.0323     0.1315   -0.25   0.8062    
## educ2hsgrad          0.0834     0.1149    0.73   0.4681    
## educ3somecol        -0.1781     0.1132   -1.57   0.1155    
## educ4colgrad        -0.6614     0.1162   -5.69  1.3e-08 ***
## agec(24,39]          0.2719     0.0912    2.98   0.0029 ** 
## agec(39,59]          0.6143     0.0885    6.94  3.9e-12 ***
## agec(59,79]          0.8682     0.1010    8.60  < 2e-16 ***
## agec(79,99]          0.9117     0.1715    5.32  1.1e-07 ***
## Log(theta)          -0.5979     0.0680   -8.79  < 2e-16 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.3452     0.0880    3.92  8.7e-05 ***
## agec(24,39]  -0.1195     0.1034   -1.16   0.2476    
## agec(39,59]  -0.1691     0.0986   -1.72   0.0862 .  
## agec(59,79]  -0.3881     0.1207   -3.22   0.0013 ** 
## agec(79,99]  -0.5821     0.2192   -2.66   0.0079 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 0.55 
## Number of iterations in BFGS optimization: 25 
## Log-likelihood: -1.21e+04 on 18 Df
clx2(fit.zinb, 1, brfss_11$psu)
## $summary
## 
## t test of coefficients:
## 
##                          Estimate Std. Error t value Pr(>|t|)    
## count_(Intercept)          1.6076     0.2703    5.95  2.8e-09 ***
## count_race_grouphispanic  -0.0180     0.1223   -0.15  0.88286    
## count_race_groupNH black  -0.0225     0.1127   -0.20  0.84165    
## count_race_groupNH other  -0.1060     0.1546   -0.69  0.49297    
## count_educ1somehs         -0.0323     0.2101   -0.15  0.87796    
## count_educ2hsgrad          0.0834     0.1883    0.44  0.65801    
## count_educ3somecol        -0.1781     0.1864   -0.96  0.33943    
## count_educ4colgrad        -0.6614     0.1887   -3.50  0.00046 ***
## count_agec(24,39]          0.2719     0.2150    1.26  0.20604    
## count_agec(39,59]          0.6143     0.1943    3.16  0.00158 ** 
## count_agec(59,79]          0.8682     0.1924    4.51  6.5e-06 ***
## count_agec(79,99]          0.9117     0.2153    4.24  2.3e-05 ***
## zero_(Intercept)           0.3452     0.2409    1.43  0.15191    
## zero_agec(24,39]          -0.1195     0.2415   -0.50  0.62058    
## zero_agec(39,59]          -0.1691     0.2221   -0.76  0.44649    
## zero_agec(59,79]          -0.3881     0.2346   -1.65  0.09814 .  
## zero_agec(79,99]          -0.5821     0.3154   -1.85  0.06499 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Fit the Poisson Hurdle Model
fit.hp<-hurdle(healthydays~race_group+educ+agec|agec, data=brfss_11, weights=cntywt/mean(cntywt), dist="poisson")
summary(fit.hp)
## 
## Call:
## hurdle(formula = healthydays ~ race_group + educ + agec | agec, 
##     data = brfss_11, weights = cntywt/mean(cntywt), dist = "poisson")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4146 -0.5564 -0.2811 -0.0239 21.9415 
## 
## Count model coefficients (truncated poisson with log link):
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          1.9128     0.0368   51.93  < 2e-16 ***
## race_grouphispanic  -0.0390     0.0173   -2.25  0.02461 *  
## race_groupNH black  -0.0275     0.0199   -1.38  0.16735    
## race_groupNH other   0.1351     0.0322    4.20  2.7e-05 ***
## educ1somehs         -0.0033     0.0312   -0.11  0.91563    
## educ2hsgrad          0.0943     0.0264    3.58  0.00035 ***
## educ3somecol        -0.1154     0.0267   -4.32  1.6e-05 ***
## educ4colgrad        -0.4859     0.0288  -16.87  < 2e-16 ***
## agec(24,39]          0.2366     0.0269    8.78  < 2e-16 ***
## agec(39,59]          0.5119     0.0254   20.16  < 2e-16 ***
## agec(59,79]          0.6817     0.0270   25.27  < 2e-16 ***
## agec(79,99]          0.7084     0.0392   18.07  < 2e-16 ***
## Zero hurdle model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.8808     0.0650  -13.54  < 2e-16 ***
## agec(24,39]   0.1057     0.0804    1.31   0.1887    
## agec(39,59]   0.2393     0.0767    3.12   0.0018 ** 
## agec(59,79]   0.4947     0.0889    5.57  2.6e-08 ***
## agec(79,99]   0.6863     0.1625    4.22  2.4e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 22 
## Log-likelihood: -1.9e+04 on 17 Df
clx2(fit.hp, 1, brfss_11$psu)
## $summary
## 
## t test of coefficients:
## 
##                          Estimate Std. Error t value Pr(>|t|)    
## count_(Intercept)          1.9128     0.2183    8.76  < 2e-16 ***
## count_race_grouphispanic  -0.0390     0.0958   -0.41  0.68432    
## count_race_groupNH black  -0.0275     0.0970   -0.28  0.77696    
## count_race_groupNH other   0.1351     0.1291    1.05  0.29526    
## count_educ1somehs         -0.0033     0.1646   -0.02  0.98399    
## count_educ2hsgrad          0.0943     0.1424    0.66  0.50783    
## count_educ3somecol        -0.1154     0.1412   -0.82  0.41402    
## count_educ4colgrad        -0.4859     0.1434   -3.39  0.00071 ***
## count_agec(24,39]          0.2366     0.1933    1.22  0.22100    
## count_agec(39,59]          0.5119     0.1764    2.90  0.00371 ** 
## count_agec(59,79]          0.6817     0.1760    3.87  0.00011 ***
## count_agec(79,99]          0.7084     0.1930    3.67  0.00024 ***
## zero_(Intercept)          -0.8808     0.1589   -5.54  3.1e-08 ***
## zero_agec(24,39]           0.1057     0.1850    0.57  0.56762    
## zero_agec(39,59]           0.2392     0.1722    1.39  0.16467    
## zero_agec(59,79]           0.4947     0.1733    2.86  0.00431 ** 
## zero_agec(79,99]           0.6863     0.2188    3.14  0.00172 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Fit the Negative Binomial Hurdle Model
fit.hnb<-hurdle(healthydays~race_group+educ+agec|agec, data=brfss_11, weights=cntywt/mean(cntywt), dist="negbin")
summary(fit.hnb)
## 
## Call:
## hurdle(formula = healthydays ~ race_group + educ + agec | agec, 
##     data = brfss_11, weights = cntywt/mean(cntywt), dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5007 -0.3473 -0.1740 -0.0164 14.4192 
## 
## Count model coefficients (truncated negbin with log link):
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          1.5306     0.1514   10.11  < 2e-16 ***
## race_grouphispanic  -0.0368     0.0700   -0.53    0.599    
## race_groupNH black  -0.0721     0.0825   -0.87    0.382    
## race_groupNH other   0.0391     0.1332    0.29    0.769    
## educ1somehs          0.0117     0.1356    0.09    0.932    
## educ2hsgrad          0.1712     0.1187    1.44    0.149    
## educ3somecol        -0.1393     0.1166   -1.19    0.232    
## educ4colgrad        -0.5894     0.1198   -4.92  8.6e-07 ***
## agec(24,39]          0.2764     0.0931    2.97    0.003 ** 
## agec(39,59]          0.6340     0.0903    7.02  2.2e-12 ***
## agec(59,79]          0.8821     0.1022    8.63  < 2e-16 ***
## agec(79,99]          0.8769     0.1707    5.14  2.8e-07 ***
## Log(theta)          -0.6479     0.0691   -9.38  < 2e-16 ***
## Zero hurdle model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.8808     0.0650  -13.54  < 2e-16 ***
## agec(24,39]   0.1057     0.0804    1.31   0.1887    
## agec(39,59]   0.2393     0.0767    3.12   0.0018 ** 
## agec(59,79]   0.4947     0.0889    5.57  2.6e-08 ***
## agec(79,99]   0.6863     0.1625    4.22  2.4e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta: count = 0.523
## Number of iterations in BFGS optimization: 27 
## Log-likelihood: -1.21e+04 on 18 Df
clx2(fit.hnb, 1, brfss_11$psu)
## $summary
## 
## t test of coefficients:
## 
##                          Estimate Std. Error t value Pr(>|t|)    
## count_(Intercept)          1.5306     0.2869    5.33  9.9e-08 ***
## count_race_grouphispanic  -0.0368     0.1274   -0.29   0.7725    
## count_race_groupNH black  -0.0721     0.1157   -0.62   0.5330    
## count_race_groupNH other   0.0391     0.1467    0.27   0.7896    
## count_educ1somehs          0.0117     0.2190    0.05   0.9575    
## count_educ2hsgrad          0.1712     0.1954    0.88   0.3811    
## count_educ3somecol        -0.1393     0.1955   -0.71   0.4760    
## count_educ4colgrad        -0.5894     0.1921   -3.07   0.0022 ** 
## count_agec(24,39]          0.2764     0.2201    1.26   0.2093    
## count_agec(39,59]          0.6340     0.2029    3.12   0.0018 ** 
## count_agec(59,79]          0.8821     0.2037    4.33  1.5e-05 ***
## count_agec(79,99]          0.8769     0.2231    3.93  8.6e-05 ***
## zero_(Intercept)          -0.8808     0.1589   -5.54  3.1e-08 ***
## zero_agec(24,39]           0.1057     0.1850    0.57   0.5676    
## zero_agec(39,59]           0.2393     0.1722    1.39   0.1647    
## zero_agec(59,79]           0.4947     0.1733    2.86   0.0043 ** 
## zero_agec(79,99]           0.6863     0.2188    3.14   0.0017 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Collect all the model AICs
AICS<-c(AIC(fit.pois),
AIC(fit.nb),
AIC(fit.zip),
AIC(fit.zinb),
AIC(fit.hp),
AIC(fit.hnb))

#Plot them here are are looking for the smallest value
plot(AICS, type="p",pch=24,bg="grey", cex=2, ylab="AIC",axes=T,xaxt="n", xlab="", ylim=c(min(AICS)-5000, max(AICS)+5000))
title(main="AIC Values for Alternative Models")
axis(1, at=1:6,labels=F) #6= number of models
labels<-c("Poisson model", "NB Model","ZI-Pois", "ZI-NebBin","HurdlePois", "HurdleNB" )
text(1:6, par("usr")[3]-.25, srt=45, adj=1, labels=labels, xpd=T)
mtext(side=1, text="Model Specification", line=3)
symbols(x= which.min(AICS), y=AICS[which.min(AICS)], circles=5, fg=2,lwd=3,add=T)

plot of chunk unnamed-chunk-4

AICS[which.min(AICS)]
## [1] 24224
#looks like the negative binomimal fits best  


#get a series of predicted counts of days for different "types" of people for each model
dat<-expand.grid(race_group=levels(brfss_11$race_group), educ=levels(brfss_11$educ), agec=levels(brfss_11$agec))

#generate the fitted values
fitted<-predict(fit.zinb, newdat=dat,type="response")
#add the values to the fake data
dat$fitted.days<-round(fitted, 3)

#Print the fitted number of days
head(dat, n=20)
##    race_group     educ   agec fitted.days
## 1    NH white    0Prim (0,24]       2.069
## 2    hispanic    0Prim (0,24]       2.032
## 3    NH black    0Prim (0,24]       2.023
## 4    NH other    0Prim (0,24]       1.861
## 5    NH white  1somehs (0,24]       2.003
## 6    hispanic  1somehs (0,24]       1.968
## 7    NH black  1somehs (0,24]       1.959
## 8    NH other  1somehs (0,24]       1.802
## 9    NH white  2hsgrad (0,24]       2.249
## 10   hispanic  2hsgrad (0,24]       2.209
## 11   NH black  2hsgrad (0,24]       2.199
## 12   NH other  2hsgrad (0,24]       2.023
## 13   NH white 3somecol (0,24]       1.731
## 14   hispanic 3somecol (0,24]       1.701
## 15   NH black 3somecol (0,24]       1.693
## 16   NH other 3somecol (0,24]       1.557
## 17   NH white 4colgrad (0,24]       1.068
## 18   hispanic 4colgrad (0,24]       1.049
## 19   NH black 4colgrad (0,24]       1.044
## 20   NH other 4colgrad (0,24]       0.961