This example will cover the use of R functions for fitting count data models to complex survey data.
For this example I am using 2011 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART county data. Link
#load brfss
library(car)
library(survey)
## Loading required package: grid
##
## Attaching package: 'survey'
##
## The following object is masked from 'package:graphics':
##
## dotchart
library(MASS)
library(pscl)
## Loading required package: lattice
## Classes and Methods for R developed in the
##
## Political Science Computational Laboratory
##
## Department of Political Science
##
## Stanford University
##
## Simon Jackman
##
## hurdle and zeroinfl functions by Achim Zeileis
load("~/Google Drive/dem7283/data/brfss_11.Rdata")
#The names in the data are very ugly, so I make them less ugly
nams<-names(brfss_11)
head(nams, n=10)
## [1] "_STATE" "precall" "fmonth" "intvid" "seqno" "_PSU"
## [7] "ctelenum" "cellfon" "pvtresid" "genhlth"
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
newnames<-gsub(pattern = "_",replacement = "",x = nams)
names(brfss_11)<-tolower(newnames)
#Outcome is: Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good?
brfss_11$healthydays<-ifelse(brfss_11$physhlth %in%c(77,99), NA,ifelse(brfss_11$physhlth==88,0, brfss_11$physhlth))
#race/ethnicity
brfss_11$black<-recode(brfss_11$racegr2, recodes="2=1; 9=NA; else=0")
brfss_11$white<-recode(brfss_11$racegr2, recodes="1=1; 9=NA; else=0")
brfss_11$other<-recode(brfss_11$racegr2, recodes="3:4=1; 9=NA; else=0")
brfss_11$hispanic<-recode(brfss_11$racegr2, recodes="5=1; 9=NA; else=0")
brfss_11$race_group<-recode(brfss_11$racegr2, recodes="1='NH white'; 2='NH black'; 3:4='NH other';5='hispanic'; else=NA", as.factor.result = T)
brfss_11$race_group<-relevel(brfss_11$race_group, ref = 'NH white')
#insurance
brfss_11$ins<-ifelse(brfss_11$hlthpln1==1,1,0)
#income grouping
brfss_11$inc<-ifelse(brfss_11$incomg==9, NA, brfss_11$incomg)
#education level
brfss_11$educ<-recode(brfss_11$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor.result=T)
#brfss_11$educ<-relevel(brfss_11$educ, ref='0Prim')
#employment
brfss_11$employ<-recode(brfss_11$employ, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor.result=T)
brfss_11$employ<-relevel(brfss_11$employ, ref='Employed')
#marital status
brfss_11$marst<-recode(brfss_11$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor.result=T)
brfss_11$marst<-relevel(brfss_11$marst, ref='married')
#Age cut into intervals
brfss_11$agec<-cut(brfss_11$age, breaks=c(0,24,39,59,79,99))
First, we will subset our data to have complete cases for our variables in our model and make our survey design object
#Here I keep complete cases on my key variables, AND only cases for TX, just for speed (the suvey procedures can run for a long time)
brfss_11<-brfss_11[is.na(brfss_11$healthydays)==F&is.na(brfss_11$cntywt)==F&is.na(brfss_11$black)==F&is.na(brfss_11$educ)==F&brfss_11$state==48,]
#FOR THE BRFSS, R GAVE ME A WARNING AND I NEEDED TO ADD:
options(survey.lonely.psu = "adjust")
#YOU MAY NOT NEED TO DO THIS!!!!
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~psu, strata=~ststr, weights=~cntywt, data = brfss_11[is.na(brfss_11$cntywt)==F,], nest=T )
To fit a Poisson GLM to survey data in R, we use the svyglm fucntion in the survey library.
#First I do some simple descriptives
svyhist(~healthydays, des)
svyby(~healthydays, ~race_group+educ, des, svymean)
## race_group educ healthydays se
## NH white.0Prim NH white 0Prim 2.2098 0.7945
## hispanic.0Prim hispanic 0Prim 4.4616 0.7622
## NH black.0Prim NH black 0Prim 7.9453 3.7086
## NH other.0Prim NH other 0Prim 2.3599 1.8957
## NH white.1somehs NH white 1somehs 4.3014 1.2629
## hispanic.1somehs hispanic 1somehs 2.7066 0.5954
## NH black.1somehs NH black 1somehs 5.5027 1.4486
## NH other.1somehs NH other 1somehs 4.9275 2.7237
## NH white.2hsgrad NH white 2hsgrad 3.8612 0.5174
## hispanic.2hsgrad hispanic 2hsgrad 3.2261 0.5848
## NH black.2hsgrad NH black 2hsgrad 4.3885 0.9617
## NH other.2hsgrad NH other 2hsgrad 6.1952 2.0821
## NH white.3somecol NH white 3somecol 3.6084 0.3534
## hispanic.3somecol hispanic 3somecol 2.9251 0.5488
## NH black.3somecol NH black 3somecol 3.3241 0.5994
## NH other.3somecol NH other 3somecol 2.0023 0.5217
## NH white.4colgrad NH white 4colgrad 2.0947 0.1612
## hispanic.4colgrad hispanic 4colgrad 2.2108 0.4453
## NH black.4colgrad NH black 4colgrad 2.1182 0.3634
## NH other.4colgrad NH other 4colgrad 0.9925 0.2178
#Poisson glm fit to survey data
fit1<-svyglm(healthydays~race_group+educ+agec, design=des, family=poisson)
summary(fit1)
##
## Call:
## svyglm(formula = healthydays ~ race_group + educ + agec, design = des,
## family = poisson)
##
## Survey design:
## svydesign(ids = ~psu, strata = ~ststr, weights = ~cntywt, data = brfss_11[is.na(brfss_11$cntywt) ==
## F, ], nest = T)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.84729 0.25721 3.29 0.00099 ***
## race_grouphispanic -0.00797 0.12196 -0.07 0.94788
## race_groupNH black 0.15012 0.12216 1.23 0.21917
## race_groupNH other -0.10501 0.18773 -0.56 0.57594
## educ1somehs -0.16561 0.20703 -0.80 0.42376
## educ2hsgrad -0.15005 0.18185 -0.83 0.40933
## educ3somecol -0.32257 0.17949 -1.80 0.07235 .
## educ4colgrad -0.84530 0.18183 -4.65 3.4e-06 ***
## agec(24,39] 0.36457 0.23305 1.56 0.11777
## agec(39,59] 0.73850 0.21312 3.47 0.00053 ***
## agec(59,79] 1.07773 0.21148 5.10 3.6e-07 ***
## agec(79,99] 1.19155 0.23540 5.06 4.3e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 15.18)
##
## Number of Fisher Scoring iterations: 7
#here are the poisson model "risk ratios", which just show the change in the mean
round(exp(summary(fit1)$coef[-1,1]), 3)
## race_grouphispanic race_groupNH black race_groupNH other
## 0.992 1.162 0.900
## educ1somehs educ2hsgrad educ3somecol
## 0.847 0.861 0.724
## educ4colgrad agec(24,39] agec(39,59]
## 0.429 1.440 2.093
## agec(59,79] agec(79,99]
## 2.938 3.292
Now, R will not fit the other count data models described in the notes, so, we will fit them using sample weights only, then calculate the robust standard errors. We standardize the weights to equal the sample size, as opposed to the population size by dividing each person’s weight by the mean weight.
#First, I define a function to get the clustered, or robust standard errors. This function effectively controls for the within-strata homogeneity when calculateing the se's for the betas.
#I stole this from: http://drewdimmery.com/robust-ses-in-r/
#and http://people.su.se/~ma/clustering.pdf
#I also added a correction to use this with the hurdle and zero-inflated models
#This is how stata gets robust se's
clx2 <-
function(fm, dfcw, cluster){
# R-codes (www.r-project.org) for computing
# clustered-standard errors. Mahmood Arai, Jan 26, 2008.
# The arguments of the function are:
# fitted model, cluster1
# You need to install libraries `sandwich' and `lmtest'
# reweighting the var-cov matrix for the within model
require(sandwich);require(lmtest)
if(class(fm)=="zeroinfl"|class(fm)=="hurdle") {
M <- length(unique(cluster))
N <- length(cluster)
K <- dim(fm$vcov)[1] #here is the rank from the zero inflated fits
dfc <- (M/(M-1))*((N-1)/(N-K))
uj <- apply(estfun(fm),2, function(x) tapply(x, cluster, sum));
vcovCL <- dfc[1]*sandwich(fm, meat=crossprod(uj)/N)*dfcw #fix a length problem in dfc
list(summary=coeftest(fm, vcovCL))}
else if(class(fm)!="zeroinfl"){
M <- length(unique(cluster))
N <- length(cluster)
K <- fm$rank
dfc <- (M/(M - 1)) * ((N - 1)/(N - K))
uj <- apply(estfun(fm), 2, function(x) tapply(x, cluster, sum));
rcse.cov <- dfc * sandwich(fm, meat = crossprod(uj)/N)
rcse.se <- coeftest(fm, rcse.cov)
return(list( rcse.se))}
}
#Fit poisson, and compare it to the fit from the survey design
#Fit the Poisson GLM
fit.pois<-glm(healthydays~race_group+educ+agec, data=brfss_11, weights=cntywt/mean(cntywt), family=poisson)
summary(fit.pois)
##
## Call:
## glm(formula = healthydays ~ race_group + educ + agec, family = poisson,
## data = brfss_11, weights = cntywt/mean(cntywt))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -9.484 -1.994 -0.958 -0.020 28.967
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.84729 0.03578 23.68 < 2e-16 ***
## race_grouphispanic -0.00797 0.01741 -0.46 0.647
## race_groupNH black 0.15012 0.01978 7.59 3.2e-14 ***
## race_groupNH other -0.10501 0.03204 -3.28 0.001 **
## educ1somehs -0.16561 0.03095 -5.35 8.8e-08 ***
## educ2hsgrad -0.15005 0.02624 -5.72 1.1e-08 ***
## educ3somecol -0.32257 0.02667 -12.09 < 2e-16 ***
## educ4colgrad -0.84530 0.02880 -29.35 < 2e-16 ***
## agec(24,39] 0.36457 0.02664 13.69 < 2e-16 ***
## agec(39,59] 0.73850 0.02493 29.62 < 2e-16 ***
## agec(59,79] 1.07773 0.02661 40.50 < 2e-16 ***
## agec(79,99] 1.19155 0.03887 30.66 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 70053 on 7197 degrees of freedom
## Residual deviance: 65706 on 7186 degrees of freedom
## AIC: 74332
##
## Number of Fisher Scoring iterations: 7
#here are the poisson model "risk ratios"
round(exp(summary(fit.pois)$coef[-1,1]), 3)
## race_grouphispanic race_groupNH black race_groupNH other
## 0.992 1.162 0.900
## educ1somehs educ2hsgrad educ3somecol
## 0.847 0.861 0.724
## educ4colgrad agec(24,39] agec(39,59]
## 0.429 1.440 2.093
## agec(59,79] agec(79,99]
## 2.938 3.292
#I use psu as the clustering variable, this is legal.
#Here is the Poisson model
clx2(fit.pois, 1, brfss_11$psu)
## Loading required package: sandwich
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Warning: the condition has length > 1 and only the first element will be used
## Warning: the condition has length > 1 and only the first element will be used
## [[1]]
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.84729 0.25730 3.29 0.00099 ***
## race_grouphispanic -0.00797 0.12191 -0.07 0.94786
## race_groupNH black 0.15012 0.12225 1.23 0.21947
## race_groupNH other -0.10501 0.18779 -0.56 0.57606
## educ1somehs -0.16561 0.20712 -0.80 0.42393
## educ2hsgrad -0.15005 0.18199 -0.82 0.40966
## educ3somecol -0.32257 0.17950 -1.80 0.07233 .
## educ4colgrad -0.84530 0.18202 -4.64 3.4e-06 ***
## agec(24,39] 0.36457 0.23343 1.56 0.11833
## agec(39,59] 0.73850 0.21314 3.46 0.00053 ***
## agec(59,79] 1.07773 0.21157 5.09 3.5e-07 ***
## agec(79,99] 1.19155 0.23548 5.06 4.2e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#which looks nearly identical to the output using the survey design from above
#Fit the Negative Binomial GLM
fit.nb<-glm.nb(healthydays~race_group+educ+agec, data=brfss_11, weights=cntywt/mean(cntywt))
summary(fit.nb)
##
## Call:
## glm.nb(formula = healthydays ~ race_group + educ + agec, data = brfss_11,
## weights = cntywt/mean(cntywt), init.theta = 0.1415724034,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.442 -0.732 -0.340 -0.010 4.751
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.83253 0.17091 4.87 1.1e-06 ***
## race_grouphispanic 0.00791 0.08218 0.10 0.9233
## race_groupNH black 0.04456 0.10010 0.45 0.6562
## race_groupNH other -0.25798 0.13481 -1.91 0.0557 .
## educ1somehs -0.10227 0.16702 -0.61 0.5403
## educ2hsgrad -0.02241 0.14575 -0.15 0.8778
## educ3somecol -0.23319 0.14590 -1.60 0.1100
## educ4colgrad -0.74118 0.14829 -5.00 5.8e-07 ***
## agec(24,39] 0.28386 0.10316 2.75 0.0059 **
## agec(39,59] 0.66317 0.09964 6.66 2.8e-11 ***
## agec(59,79] 1.03391 0.11767 8.79 < 2e-16 ***
## agec(79,99] 1.14742 0.21767 5.27 1.4e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.1416) family taken to be 1)
##
## Null deviance: 5177.4 on 7197 degrees of freedom
## Residual deviance: 4985.6 on 7186 degrees of freedom
## AIC: 24366
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.14157
## Std. Err.: 0.00362
##
## 2 x log-likelihood: -24339.97700
clx2(fit.nb, 1, brfss_11$psu)
## Warning: the condition has length > 1 and only the first element will be used
## Warning: the condition has length > 1 and only the first element will be used
## [[1]]
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.83253 0.25620 3.25 0.00116 **
## race_grouphispanic 0.00791 0.13839 0.06 0.95440
## race_groupNH black 0.04456 0.13029 0.34 0.73232
## race_groupNH other -0.25798 0.17526 -1.47 0.14103
## educ1somehs -0.10227 0.23849 -0.43 0.66804
## educ2hsgrad -0.02241 0.21376 -0.10 0.91651
## educ3somecol -0.23319 0.21359 -1.09 0.27494
## educ4colgrad -0.74118 0.21178 -3.50 0.00047 ***
## agec(24,39] 0.28386 0.21172 1.34 0.18001
## agec(39,59] 0.66317 0.19239 3.45 0.00057 ***
## agec(59,79] 1.03391 0.19024 5.43 5.5e-08 ***
## agec(79,99] 1.14742 0.21532 5.33 9.9e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Fit the Zero Inflated Poisson Mixture Model
fit.zip<-zeroinfl(healthydays~race_group+educ+agec|agec, data=brfss_11, weights=cntywt/mean(cntywt), dist="poisson")
## Warning: non-integer #successes in a binomial glm!
summary(fit.zip)
##
## Call:
## zeroinfl(formula = healthydays ~ race_group + educ + agec | agec,
## data = brfss_11, weights = cntywt/mean(cntywt), dist = "poisson")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -2.4152 -0.5565 -0.2810 -0.0239 21.8913
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.91423 0.03681 52.00 < 2e-16 ***
## race_grouphispanic -0.03890 0.01733 -2.25 0.02476 *
## race_groupNH black -0.02766 0.01991 -1.39 0.16469
## race_groupNH other 0.13441 0.03222 4.17 3.0e-05 ***
## educ1somehs -0.00338 0.03119 -0.11 0.91363
## educ2hsgrad 0.09429 0.02637 3.58 0.00035 ***
## educ3somecol -0.11535 0.02671 -4.32 1.6e-05 ***
## educ4colgrad -0.48553 0.02880 -16.86 < 2e-16 ***
## agec(24,39] 0.23496 0.02689 8.74 < 2e-16 ***
## agec(39,59] 0.51047 0.02534 20.14 < 2e-16 ***
## agec(59,79] 0.68021 0.02693 25.26 < 2e-16 ***
## agec(79,99] 0.70691 0.03916 18.05 < 2e-16 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.8767 0.0651 13.46 < 2e-16 ***
## agec(24,39] -0.1046 0.0805 -1.30 0.1939
## agec(39,59] -0.2357 0.0768 -3.07 0.0021 **
## agec(59,79] -0.4908 0.0889 -5.52 3.4e-08 ***
## agec(79,99] -0.6823 0.1625 -4.20 2.7e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 25
## Log-likelihood: -1.9e+04 on 17 Df
clx2(fit.zip, 1, brfss_11$psu)
## $summary
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## count_(Intercept) 1.91423 0.21788 8.79 < 2e-16 ***
## count_race_grouphispanic -0.03890 0.09573 -0.41 0.68448
## count_race_groupNH black -0.02766 0.09706 -0.29 0.77564
## count_race_groupNH other 0.13441 0.12932 1.04 0.29865
## count_educ1somehs -0.00338 0.16469 -0.02 0.98361
## count_educ2hsgrad 0.09429 0.14245 0.66 0.50806
## count_educ3somecol -0.11535 0.14127 -0.82 0.41421
## count_educ4colgrad -0.48553 0.14344 -3.38 0.00072 ***
## count_agec(24,39] 0.23496 0.19254 1.22 0.22238
## count_agec(39,59] 0.51047 0.17548 2.91 0.00364 **
## count_agec(59,79] 0.68021 0.17517 3.88 0.00010 ***
## count_agec(79,99] 0.70691 0.19222 3.68 0.00024 ***
## zero_(Intercept) 0.87669 0.15917 5.51 3.8e-08 ***
## zero_agec(24,39] -0.10463 0.18535 -0.56 0.57242
## zero_agec(39,59] -0.23572 0.17246 -1.37 0.17173
## zero_agec(59,79] -0.49080 0.17359 -2.83 0.00471 **
## zero_agec(79,99] -0.68231 0.21904 -3.11 0.00185 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Fit the Zero Inflated Negative Binomial Mixture Model
fit.zinb<-zeroinfl(healthydays~race_group+educ+agec|agec, data=brfss_11, weights=cntywt/mean(cntywt), dist="negbin")
## Warning: non-integer #successes in a binomial glm!
summary(fit.zinb)
##
## Call:
## zeroinfl(formula = healthydays ~ race_group + educ + agec | agec,
## data = brfss_11, weights = cntywt/mean(cntywt), dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -1.5311 -0.3493 -0.1741 -0.0121 14.5765
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.6076 0.1455 11.05 < 2e-16 ***
## race_grouphispanic -0.0180 0.0664 -0.27 0.7861
## race_groupNH black -0.0225 0.0791 -0.28 0.7759
## race_groupNH other -0.1060 0.1210 -0.88 0.3811
## educ1somehs -0.0323 0.1315 -0.25 0.8062
## educ2hsgrad 0.0834 0.1149 0.73 0.4681
## educ3somecol -0.1781 0.1132 -1.57 0.1155
## educ4colgrad -0.6614 0.1162 -5.69 1.3e-08 ***
## agec(24,39] 0.2719 0.0912 2.98 0.0029 **
## agec(39,59] 0.6143 0.0885 6.94 3.9e-12 ***
## agec(59,79] 0.8682 0.1010 8.60 < 2e-16 ***
## agec(79,99] 0.9117 0.1715 5.32 1.1e-07 ***
## Log(theta) -0.5979 0.0680 -8.79 < 2e-16 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.3452 0.0880 3.92 8.7e-05 ***
## agec(24,39] -0.1195 0.1034 -1.16 0.2476
## agec(39,59] -0.1691 0.0986 -1.72 0.0862 .
## agec(59,79] -0.3881 0.1207 -3.22 0.0013 **
## agec(79,99] -0.5821 0.2192 -2.66 0.0079 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 0.55
## Number of iterations in BFGS optimization: 25
## Log-likelihood: -1.21e+04 on 18 Df
clx2(fit.zinb, 1, brfss_11$psu)
## $summary
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## count_(Intercept) 1.6076 0.2703 5.95 2.8e-09 ***
## count_race_grouphispanic -0.0180 0.1223 -0.15 0.88286
## count_race_groupNH black -0.0225 0.1127 -0.20 0.84165
## count_race_groupNH other -0.1060 0.1546 -0.69 0.49297
## count_educ1somehs -0.0323 0.2101 -0.15 0.87796
## count_educ2hsgrad 0.0834 0.1883 0.44 0.65801
## count_educ3somecol -0.1781 0.1864 -0.96 0.33943
## count_educ4colgrad -0.6614 0.1887 -3.50 0.00046 ***
## count_agec(24,39] 0.2719 0.2150 1.26 0.20604
## count_agec(39,59] 0.6143 0.1943 3.16 0.00158 **
## count_agec(59,79] 0.8682 0.1924 4.51 6.5e-06 ***
## count_agec(79,99] 0.9117 0.2153 4.24 2.3e-05 ***
## zero_(Intercept) 0.3452 0.2409 1.43 0.15191
## zero_agec(24,39] -0.1195 0.2415 -0.50 0.62058
## zero_agec(39,59] -0.1691 0.2221 -0.76 0.44649
## zero_agec(59,79] -0.3881 0.2346 -1.65 0.09814 .
## zero_agec(79,99] -0.5821 0.3154 -1.85 0.06499 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Fit the Poisson Hurdle Model
fit.hp<-hurdle(healthydays~race_group+educ+agec|agec, data=brfss_11, weights=cntywt/mean(cntywt), dist="poisson")
summary(fit.hp)
##
## Call:
## hurdle(formula = healthydays ~ race_group + educ + agec | agec,
## data = brfss_11, weights = cntywt/mean(cntywt), dist = "poisson")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -2.4146 -0.5564 -0.2811 -0.0239 21.9415
##
## Count model coefficients (truncated poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.9128 0.0368 51.93 < 2e-16 ***
## race_grouphispanic -0.0390 0.0173 -2.25 0.02461 *
## race_groupNH black -0.0275 0.0199 -1.38 0.16735
## race_groupNH other 0.1351 0.0322 4.20 2.7e-05 ***
## educ1somehs -0.0033 0.0312 -0.11 0.91563
## educ2hsgrad 0.0943 0.0264 3.58 0.00035 ***
## educ3somecol -0.1154 0.0267 -4.32 1.6e-05 ***
## educ4colgrad -0.4859 0.0288 -16.87 < 2e-16 ***
## agec(24,39] 0.2366 0.0269 8.78 < 2e-16 ***
## agec(39,59] 0.5119 0.0254 20.16 < 2e-16 ***
## agec(59,79] 0.6817 0.0270 25.27 < 2e-16 ***
## agec(79,99] 0.7084 0.0392 18.07 < 2e-16 ***
## Zero hurdle model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.8808 0.0650 -13.54 < 2e-16 ***
## agec(24,39] 0.1057 0.0804 1.31 0.1887
## agec(39,59] 0.2393 0.0767 3.12 0.0018 **
## agec(59,79] 0.4947 0.0889 5.57 2.6e-08 ***
## agec(79,99] 0.6863 0.1625 4.22 2.4e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 22
## Log-likelihood: -1.9e+04 on 17 Df
clx2(fit.hp, 1, brfss_11$psu)
## $summary
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## count_(Intercept) 1.9128 0.2183 8.76 < 2e-16 ***
## count_race_grouphispanic -0.0390 0.0958 -0.41 0.68432
## count_race_groupNH black -0.0275 0.0970 -0.28 0.77696
## count_race_groupNH other 0.1351 0.1291 1.05 0.29526
## count_educ1somehs -0.0033 0.1646 -0.02 0.98399
## count_educ2hsgrad 0.0943 0.1424 0.66 0.50783
## count_educ3somecol -0.1154 0.1412 -0.82 0.41402
## count_educ4colgrad -0.4859 0.1434 -3.39 0.00071 ***
## count_agec(24,39] 0.2366 0.1933 1.22 0.22100
## count_agec(39,59] 0.5119 0.1764 2.90 0.00371 **
## count_agec(59,79] 0.6817 0.1760 3.87 0.00011 ***
## count_agec(79,99] 0.7084 0.1930 3.67 0.00024 ***
## zero_(Intercept) -0.8808 0.1589 -5.54 3.1e-08 ***
## zero_agec(24,39] 0.1057 0.1850 0.57 0.56762
## zero_agec(39,59] 0.2392 0.1722 1.39 0.16467
## zero_agec(59,79] 0.4947 0.1733 2.86 0.00431 **
## zero_agec(79,99] 0.6863 0.2188 3.14 0.00172 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Fit the Negative Binomial Hurdle Model
fit.hnb<-hurdle(healthydays~race_group+educ+agec|agec, data=brfss_11, weights=cntywt/mean(cntywt), dist="negbin")
summary(fit.hnb)
##
## Call:
## hurdle(formula = healthydays ~ race_group + educ + agec | agec,
## data = brfss_11, weights = cntywt/mean(cntywt), dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -1.5007 -0.3473 -0.1740 -0.0164 14.4192
##
## Count model coefficients (truncated negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.5306 0.1514 10.11 < 2e-16 ***
## race_grouphispanic -0.0368 0.0700 -0.53 0.599
## race_groupNH black -0.0721 0.0825 -0.87 0.382
## race_groupNH other 0.0391 0.1332 0.29 0.769
## educ1somehs 0.0117 0.1356 0.09 0.932
## educ2hsgrad 0.1712 0.1187 1.44 0.149
## educ3somecol -0.1393 0.1166 -1.19 0.232
## educ4colgrad -0.5894 0.1198 -4.92 8.6e-07 ***
## agec(24,39] 0.2764 0.0931 2.97 0.003 **
## agec(39,59] 0.6340 0.0903 7.02 2.2e-12 ***
## agec(59,79] 0.8821 0.1022 8.63 < 2e-16 ***
## agec(79,99] 0.8769 0.1707 5.14 2.8e-07 ***
## Log(theta) -0.6479 0.0691 -9.38 < 2e-16 ***
## Zero hurdle model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.8808 0.0650 -13.54 < 2e-16 ***
## agec(24,39] 0.1057 0.0804 1.31 0.1887
## agec(39,59] 0.2393 0.0767 3.12 0.0018 **
## agec(59,79] 0.4947 0.0889 5.57 2.6e-08 ***
## agec(79,99] 0.6863 0.1625 4.22 2.4e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta: count = 0.523
## Number of iterations in BFGS optimization: 27
## Log-likelihood: -1.21e+04 on 18 Df
clx2(fit.hnb, 1, brfss_11$psu)
## $summary
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## count_(Intercept) 1.5306 0.2869 5.33 9.9e-08 ***
## count_race_grouphispanic -0.0368 0.1274 -0.29 0.7725
## count_race_groupNH black -0.0721 0.1157 -0.62 0.5330
## count_race_groupNH other 0.0391 0.1467 0.27 0.7896
## count_educ1somehs 0.0117 0.2190 0.05 0.9575
## count_educ2hsgrad 0.1712 0.1954 0.88 0.3811
## count_educ3somecol -0.1393 0.1955 -0.71 0.4760
## count_educ4colgrad -0.5894 0.1921 -3.07 0.0022 **
## count_agec(24,39] 0.2764 0.2201 1.26 0.2093
## count_agec(39,59] 0.6340 0.2029 3.12 0.0018 **
## count_agec(59,79] 0.8821 0.2037 4.33 1.5e-05 ***
## count_agec(79,99] 0.8769 0.2231 3.93 8.6e-05 ***
## zero_(Intercept) -0.8808 0.1589 -5.54 3.1e-08 ***
## zero_agec(24,39] 0.1057 0.1850 0.57 0.5676
## zero_agec(39,59] 0.2393 0.1722 1.39 0.1647
## zero_agec(59,79] 0.4947 0.1733 2.86 0.0043 **
## zero_agec(79,99] 0.6863 0.2188 3.14 0.0017 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Collect all the model AICs
AICS<-c(AIC(fit.pois),
AIC(fit.nb),
AIC(fit.zip),
AIC(fit.zinb),
AIC(fit.hp),
AIC(fit.hnb))
#Plot them here are are looking for the smallest value
plot(AICS, type="p",pch=24,bg="grey", cex=2, ylab="AIC",axes=T,xaxt="n", xlab="", ylim=c(min(AICS)-5000, max(AICS)+5000))
title(main="AIC Values for Alternative Models")
axis(1, at=1:6,labels=F) #6= number of models
labels<-c("Poisson model", "NB Model","ZI-Pois", "ZI-NebBin","HurdlePois", "HurdleNB" )
text(1:6, par("usr")[3]-.25, srt=45, adj=1, labels=labels, xpd=T)
mtext(side=1, text="Model Specification", line=3)
symbols(x= which.min(AICS), y=AICS[which.min(AICS)], circles=5, fg=2,lwd=3,add=T)
AICS[which.min(AICS)]
## [1] 24224
#looks like the negative binomimal fits best
#get a series of predicted counts of days for different "types" of people for each model
dat<-expand.grid(race_group=levels(brfss_11$race_group), educ=levels(brfss_11$educ), agec=levels(brfss_11$agec))
#generate the fitted values
fitted<-predict(fit.zinb, newdat=dat,type="response")
#add the values to the fake data
dat$fitted.days<-round(fitted, 3)
#Print the fitted number of days
head(dat, n=20)
## race_group educ agec fitted.days
## 1 NH white 0Prim (0,24] 2.069
## 2 hispanic 0Prim (0,24] 2.032
## 3 NH black 0Prim (0,24] 2.023
## 4 NH other 0Prim (0,24] 1.861
## 5 NH white 1somehs (0,24] 2.003
## 6 hispanic 1somehs (0,24] 1.968
## 7 NH black 1somehs (0,24] 1.959
## 8 NH other 1somehs (0,24] 1.802
## 9 NH white 2hsgrad (0,24] 2.249
## 10 hispanic 2hsgrad (0,24] 2.209
## 11 NH black 2hsgrad (0,24] 2.199
## 12 NH other 2hsgrad (0,24] 2.023
## 13 NH white 3somecol (0,24] 1.731
## 14 hispanic 3somecol (0,24] 1.701
## 15 NH black 3somecol (0,24] 1.693
## 16 NH other 3somecol (0,24] 1.557
## 17 NH white 4colgrad (0,24] 1.068
## 18 hispanic 4colgrad (0,24] 1.049
## 19 NH black 4colgrad (0,24] 1.044
## 20 NH other 4colgrad (0,24] 0.961