For the following task I am interested in looking at Number days smoked in past 30 days (some day smokers) as a count outcome with regards to Education and Race and Ethnicity.
1)b. An offset term is necessary in my model in order to incorporate unequal risk. That is, people who are exposed to various conditions depending on their status, such good/bad neighborhood, stressful/light work environment, strict/indifferent school rules and regulations, and socioeconomic backgrounds, may be more/less likely to smoke in the past 30 days than someone who is not exposed to such conditions that can induce smoking.
Recoding Variables:
Education, as one of the moderators, will be categorized into three groups: High School or Less, Some College, and College. Race/Ethnicity, as another moderator, will be categorized into four groups: NHWhite, NHBlack, Hispanic, and NHother. Age is recoded to break into the following age groups: 18-24, 24-35, 35-45, 45-65, 65+.
library(readr)
library(car)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.1. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(survival)
library(cmprsk)
library(questionr)
library(haven)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:Matrix':
##
## expand
mydata <- read_dta("C://Users/munta/Google Drive/NHIS/nhis_00005.dta")
names(mydata)
## [1] "year" "serial" "quarter" "strata"
## [5] "psu" "nhishid" "hhweight" "region"
## [9] "livingqtr" "nhispid" "hhx" "fmx"
## [13] "px" "perweight" "sampweight" "fweight"
## [17] "astatflg" "cstatflg" "pernum" "supp2wt"
## [21] "age" "sex" "marstat" "marst"
## [25] "marstcohab" "cohabmarst" "cohabevmar" "birthmo"
## [29] "birthyr" "racea" "hispeth" "racesr"
## [33] "yrsinus" "hispyn" "usborn" "citizen"
## [37] "racenew" "racebr" "raceimpute" "educrec2"
## [41] "educrec1" "higrade2" "higrade1" "educ"
## [45] "empstat" "occ1995" "occ" "ind1995"
## [49] "ind" "hourswrk" "monthwrk" "numemps"
## [53] "secondjob" "paidhour" "paidsick" "usualft"
## [57] "empstatwkyr" "pooryn" "incfam97on2" "earnimp1"
## [61] "earnimp2" "earnimp3" "earnimp4" "earnimp5"
## [65] "health" "weight" "bedayr2" "bmicalc"
## [69] "bmi" "bedayr" "alc1yr" "alclife"
## [73] "alc5upyr" "alcamt" "alcstat1" "alcstat2"
## [77] "alcanyno" "alcanytp" "alcdaysmo" "alcdayswk"
## [81] "alcdaysyr" "alc5upno" "alc5uptp" "smokev"
## [85] "smokagereg" "cigdaymo" "cigsday" "cigsday1"
## [89] "cigsday2" "smokestatus1" "smokestatus2" "smokfreqnow"
## [93] "mortelig" "mortstat" "mortdodq" "mortdody"
## [97] "mortucod" "mortucodld" "mortwt" "mortdiab"
## [101] "morthypr" "mortcms" "mortndi" "mortssa"
## [105] "mortwtsa"
#Year of birth
mydata$yob<-mydata$year-mydata$age
#Restricting the year of birth
mydata<-subset (mydata, yob%in%c(1940:1985) & year%in%c(2001:2006))
#Dividing the cohort into early and late
mydata$cohort<-car::recode(mydata$yob, recodes="1940:1963='early'; 1964:1985='late'; else=NA", as.factor.result = )
#Taking a subset of the total observations
sub<-subset(mydata, mydata$mortelig==1&is.na(mydata$racenew)==F)
samps<-sample(1:length(sub$psu), size = 20000, replace = F)
sub<-sub[samps,] #Using a subset of 20000 samples
#Age
sub$d.age<-ifelse(sub$mortstat==1, sub$mortdody-(sub$year-sub$age) ,
ifelse(sub$mortstat==2, 2006-(sub$year-sub$age), NA))
#Mortality Status
sub$d.event<-ifelse(sub$mortstat==1,1,0)
sub$timetodeath<-ifelse(sub$mortstat ==1, sub$mortdody-sub$year, 2006 - sub$year )
sub$d5yr<-ifelse(sub$timetodeath<=5&sub$mortstat==1, 1,0)
#Mortalist Status: Dead or Alive
sub$mortstat<-recode(sub$mortstat, recodes= "1='Dead'; 2='Alive'; 9=NA", as.factor.result=T)
#Marital Status
sub$married<-recode(sub$marstat, recodes="00=NA; 10:13='married'; 20:40='separated'; 50='nevermarried'; 99=NA", as.factor.result=T )
sub$sep<-ifelse(sub$married=='separated',1,0)
sub$nm<-ifelse(sub$married=='nevermarried',1,0)
#Smoking Status
sub$smoke<-recode(sub$smokestatus2, recodes="00=NA; 10:13='CurrentSmoker'; 20='FormerSmoker'; 40='FormerSmoker'; 30='NeverSmoked'; else=NA", as.factor.result=T)
sub$smoke<-relevel(sub$smoke, ref = "NeverSmoked")
#Drinking Status
sub$alcohol<-recode(sub$alcstat2, recodes= "00=NA; 10=1; 20:23=2; 31:32=3; 35=3; 33=4; 34=5; 97:99=NA", as.factor.result=T)
sub$alcohol<-relevel(sub$alcohol, ref = "5")
sub$male<-ifelse(sub$sex==1,1,0)
sub$mwt<-sub$mortwt/mean(sub$mortwt, na.rm=T)
#Age cut into intervals
sub$agere<-cut(sub$age, breaks=c(15,18,24,35,45,65,85))
#Education
sub$moredu<-recode(sub$educrec2, recodes="00=NA; 10:42='High School or Less'; 50:53='Some College'; 54:60='College'; else=NA", as.factor.result=T)
sub$hs<-ifelse(sub$moredu=='hs or less',1,0)
sub$scol1<-ifelse(sub$moredu=='some coll',1,0)
#Race/Ethnicity
sub$race<-recode(sub$racenew, recodes ="10='White'; 20 ='Black'; 30:61='Other'; 97:99=NA", as.factor.result=T)
sub$black<-ifelse(sub$race=='Black',1,0)
sub$oth<-ifelse(sub$race=='Other',1,0)
sub$hisp<-recode(sub$hispyn, recodes="1=0; 2=1; else=NA")
sub$race_eth[sub$hisp == 0 & sub$race=="White"]<-"NHWhite"
## Warning: Unknown or uninitialised column: 'race_eth'.
sub$race_eth[sub$hisp == 0 & sub$race=="Black"]<-"NHBlack"
sub$race_eth[sub$hisp == 0 & sub$race=="Other"]<-"NHOther"
sub$race_eth[sub$hisp == 1 ]<-"Hispanic"
sub$race_eth[is.na(sub$hisp) ==T | is.na(sub$race)==T]<-NA
#race_eth <-c("NHWhite", "NHBlack", "NHOther", "Hispanic", "NA")
#sub$race_eth<-recode(sub$race_eth, recodes= "NHWhite='1'; NHBlack='2'; NHOther='3'; Hispanic='4', NA=else", as.factor.result=T)
#requency drank alcohol in past year: Days per month (Count Model Outcome Variable)
#sub$alcdays <- recode(as.numeric(sub$alcdaysmo), recodes = "0=0; 95=0; 96:99=NA")
#hist(sub$alcdays)
#Number days smoked in past 30 days (some day smokers)
sub$cigdays <- recode(as.numeric(sub$cigdaymo), recodes = "00=0; 96:99=NA") #single inverted comma treats it as categorical.
hist(sub$cigdays)
#summary(sub$alcdays)
summary(sub$cigdays)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 5.00 15.00 13.31 20.00 30.00 19557
The mean shows that, people (some day smoker) smoked on an average for 15 days a month.
sub$alconsum<-recode(sub$alcohol, recodes= "10=1; 20:23=2; 31:32=3; 35=3; 33=4; 34=5; 97:99=NA", as.factor.result=T)
sub$alconsum<-relevel(sub$alconsum, ref = "5")
sub$alcstat<-car::recode(sub$alcohol,
recodes="10=1; 20:23=2; 31:32=3; 35=3; 33=4; 34=5; 97:99=NA", as.factor.result = F)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Keeping only clean cases
sub2<-sub%>%
select(cigdays, alconsum, alcstat, cohort, mortstat, agere, race_eth, married, moredu, race, black, hisp, oth, smoke, mortwt, strata, psu) %>%
filter( complete.cases(.))
#Survey Design
options(survey.lonely.psu="adjust")
des<-svydesign(ids=~psu, strata=~strata, weights = ~mortwt, data=sub2[sub2$mortwt>0,], nest=T)
svyhist(~cigdays, des)
svyby(~cigdays, ~race_eth+moredu, des, svymean, na.rm=T)
## race_eth moredu cigdays
## Hispanic.College Hispanic College 10.128410
## NHBlack.College NHBlack College 11.237746
## NHOther.College NHOther College 10.211226
## NHWhite.College NHWhite College 11.796459
## Hispanic.High School or Less Hispanic High School or Less 12.334416
## NHBlack.High School or Less NHBlack High School or Less 14.740256
## NHOther.High School or Less NHOther High School or Less 9.458504
## NHWhite.High School or Less NHWhite High School or Less 15.036463
## Hispanic.Some College Hispanic Some College 12.047193
## NHBlack.Some College NHBlack Some College 11.812800
## NHOther.Some College NHOther Some College 16.411776
## NHWhite.Some College NHWhite Some College 14.382866
## se
## Hispanic.College 2.6953998
## NHBlack.College 3.7671581
## NHOther.College 3.8135281
## NHWhite.College 0.9825825
## Hispanic.High School or Less 1.1976894
## NHBlack.High School or Less 1.5273259
## NHOther.High School or Less 3.9745981
## NHWhite.High School or Less 0.9395040
## Hispanic.Some College 1.5128831
## NHBlack.Some College 2.2460443
## NHOther.Some College 3.6385585
## NHWhite.Some College 0.9577137
svyby(~cigdays, ~agere, des, svymean, na.rm=T)
## agere cigdays se
## (15,18] (15,18] 11.18421 3.3372110
## (18,24] (18,24] 13.80263 1.1558116
## (24,35] (24,35] 13.71868 0.7193883
## (35,45] (35,45] 13.84462 0.8286642
## (45,65] (45,65] 12.73291 0.8957113
## (65,85] (65,85] 10.00000 0.0000000
Education seems to show variable effects on the number of days smoked. At every level of education, the NHBlacks show higher number of days smoked compared to others, and Hispanics show the lowest number of days smoked.
Interestingly, the youngest age group (15-18 show highest number of days smoked in a month, 17days on an average. However, the number lowers during the middle ages and increases again during the 45-65 age groups.
fit1<-svyglm(cigdays~factor(race_eth)+factor(moredu)+factor(agere), design=des, family=poisson)
summary(fit1)
##
## Call:
## svyglm(formula = cigdays ~ factor(race_eth) + factor(moredu) +
## factor(agere), design = des, family = poisson)
##
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~mortwt, data = sub2[sub2$mortwt >
## 0, ], nest = T)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.05553 0.24048 8.548 9.79e-12 ***
## factor(race_eth)NHBlack 0.11468 0.12568 0.912 0.3654
## factor(race_eth)NHOther 0.11111 0.20634 0.539 0.5924
## factor(race_eth)NHWhite 0.20251 0.09853 2.055 0.0445 *
## factor(moredu)High School or Less 0.24753 0.09290 2.664 0.0101 *
## factor(moredu)Some College 0.19114 0.09552 2.001 0.0503 .
## factor(agere)(18,24] 0.22640 0.23254 0.974 0.3344
## factor(agere)(24,35] 0.23819 0.22253 1.070 0.2890
## factor(agere)(35,45] 0.24200 0.22385 1.081 0.2843
## factor(agere)(45,65] 0.15283 0.22855 0.669 0.5064
## factor(agere)(65,85] -0.20298 0.22032 -0.921 0.3609
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 5.237111)
##
## Number of Fisher Scoring iterations: 5
The output shows what we have seen above. The Hispanics show lower number of days smoked compared to NHBlacks and NHWhites. The association of education and number of days smoked is not significant.
round(exp(summary(fit1)$coef[-1,1]), 3) #which just show the change in the mean
## factor(race_eth)NHBlack factor(race_eth)NHOther
## 1.122 1.118
## factor(race_eth)NHWhite factor(moredu)High School or Less
## 1.224 1.281
## factor(moredu)Some College factor(agere)(18,24]
## 1.211 1.254
## factor(agere)(24,35] factor(agere)(35,45]
## 1.269 1.274
## factor(agere)(45,65] factor(agere)(65,85]
## 1.165 0.816
The risk ratios show that, the NHBlacks and NHWhites have 35% and 25% higher mean counts of days smoked than the Hispanics. All other age groups show lower percentage of mean count of days smoked than the 15-18 age groups, which is justified from the literature.
fit2<-glm(cigdays~factor(race_eth)+factor(moredu)+factor(agere), data=sub2, family=poisson)
summary(fit2)
##
## Call:
## glm(formula = cigdays ~ factor(race_eth) + factor(moredu) + factor(agere),
## family = poisson, data = sub2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.5553 -2.3953 0.0618 1.6647 4.5694
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.00170 0.22776 8.789 < 2e-16 ***
## factor(race_eth)NHBlack 0.16334 0.04505 3.626 0.000288 ***
## factor(race_eth)NHOther 0.11119 0.07029 1.582 0.113654
## factor(race_eth)NHWhite 0.24882 0.03472 7.166 7.71e-13 ***
## factor(moredu)High School or Less 0.18786 0.03757 5.000 5.72e-07 ***
## factor(moredu)Some College 0.14370 0.03881 3.703 0.000213 ***
## factor(agere)(18,24] 0.31488 0.22657 1.390 0.164591
## factor(agere)(24,35] 0.29782 0.22500 1.324 0.185630
## factor(agere)(35,45] 0.29797 0.22547 1.322 0.186318
## factor(agere)(45,65] 0.24339 0.22536 1.080 0.280146
## factor(agere)(65,85] -0.13580 0.38789 -0.350 0.726271
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2727.4 on 433 degrees of freedom
## Residual deviance: 2655.8 on 423 degrees of freedom
## AIC: 4440.1
##
## Number of Fisher Scoring iterations: 5
scale<-sqrt(fit2$deviance/fit2$df.residual)
scale
## [1] 2.50567
We get 2.5 times as much deviance as degree of freedom residuals. Since the value is more than 1, the model does not fit the data.
1-pchisq(fit2$deviance, df = fit2$df.residual)
## [1] 0
The p-value of 0 also mean that the model does not fit the data.
fit3<-glm(cigdays~factor(race_eth)+factor(moredu)+factor(agere), data=sub2, family=quasipoisson)
summary(fit3)
##
## Call:
## glm(formula = cigdays ~ factor(race_eth) + factor(moredu) + factor(agere),
## family = quasipoisson, data = sub2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.5553 -2.3953 0.0618 1.6647 4.5694
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.00170 0.53136 3.767 0.000189 ***
## factor(race_eth)NHBlack 0.16334 0.10511 1.554 0.120920
## factor(race_eth)NHOther 0.11119 0.16398 0.678 0.498080
## factor(race_eth)NHWhite 0.24882 0.08100 3.072 0.002266 **
## factor(moredu)High School or Less 0.18786 0.08765 2.143 0.032655 *
## factor(moredu)Some College 0.14370 0.09054 1.587 0.113200
## factor(agere)(18,24] 0.31488 0.52857 0.596 0.551681
## factor(agere)(24,35] 0.29782 0.52492 0.567 0.570775
## factor(agere)(35,45] 0.29797 0.52602 0.566 0.571376
## factor(agere)(45,65] 0.24339 0.52577 0.463 0.643655
## factor(agere)(65,85] -0.13580 0.90494 -0.150 0.880787
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 5.442738)
##
## Null deviance: 2727.4 on 433 degrees of freedom
## Residual deviance: 2655.8 on 423 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
We can still see presence of overdispersion. Which means that the poisson model is not a good choice at this point.
#Getting the clustered, or robust standard errors.
clx2 <- function(fm, dfcw, cluster){
# reweighting the var-cov matrix for the within model
require(sandwich);require(lmtest)
if(class(fm)=="zeroinfl"|class(fm)=="hurdle") {
M <- length(unique(cluster))
N <- length(cluster)
K <- dim(fm$vcov)[1] #here is the rank from the zero inflated fits
dfc <- (M/(M-1))*((N-1)/(N-K))
uj <- apply(estfun(fm),2, function(x) tapply(x, cluster, sum, na.rm=T));
vcovCL <- dfc[1]*sandwich(fm, meat=crossprod(uj)/N)*dfcw #fix a length problem in dfc
list(summary=coeftest(fm, vcovCL))}
else if(class(fm)!="zeroinfl"){
M <- length(unique(cluster))
N <- length(cluster)
K <- fm$rank
dfc <- (M/(M - 1)) * ((N - 1)/(N - K))
uj <- apply(estfun(fm), 2, function(x) tapply(x, cluster, sum, na.rm=T));
rcse.cov <- dfc * sandwich(fm, meat = crossprod(uj)/N)
rcse.se <- coeftest(fm, rcse.cov)
return(list( rcse.se))}
}
#scaling the weights
sub2$wts<-sub2$mortwt/mean(sub2$mortwt, na.rm=T)
fit.pois<-glm(cigdays~factor(race_eth)+factor(moredu)+factor(agere),
data=sub2,
weights=wts, family=poisson)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
clx2(fit.pois,cluster =sub2$strata)
## Warning in if (class(fm) == "zeroinfl" | class(fm) == "hurdle") {: the
## condition has length > 1 and only the first element will be used
## Warning in if (class(fm) != "zeroinfl") {: the condition has length > 1 and
## only the first element will be used
## [[1]]
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.055525 0.253238 8.1170 4.779e-16
## factor(race_eth)NHBlack 0.114680 0.121633 0.9428 0.345763
## factor(race_eth)NHOther 0.111112 0.201659 0.5510 0.581641
## factor(race_eth)NHWhite 0.202510 0.092108 2.1986 0.027906
## factor(moredu)High School or Less 0.247530 0.085691 2.8886 0.003869
## factor(moredu)Some College 0.191136 0.092596 2.0642 0.039000
## factor(agere)(18,24] 0.226400 0.234620 0.9650 0.334563
## factor(agere)(24,35] 0.238190 0.223293 1.0667 0.286099
## factor(agere)(35,45] 0.242002 0.226697 1.0675 0.285741
## factor(agere)(45,65] 0.152828 0.230154 0.6640 0.506674
## factor(agere)(65,85] -0.202980 0.219205 -0.9260 0.354456
##
## (Intercept) ***
## factor(race_eth)NHBlack
## factor(race_eth)NHOther
## factor(race_eth)NHWhite *
## factor(moredu)High School or Less **
## factor(moredu)Some College *
## factor(agere)(18,24]
## factor(agere)(24,35]
## factor(agere)(35,45]
## factor(agere)(45,65]
## factor(agere)(65,85]
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The output only show significance for NHBlacks and NHWhites compared to the Hispanics in terms of the number of days smoked in the past 30 days.
coeftest(fit.pois, vcov=vcovHC(fit.pois, type="HC1",cluster="ststr")) #same as survey!
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.055525 0.245564 8.3706 < 2.2e-16
## factor(race_eth)NHBlack 0.114680 0.123707 0.9270 0.353910
## factor(race_eth)NHOther 0.111112 0.205613 0.5404 0.588925
## factor(race_eth)NHWhite 0.202510 0.086788 2.3334 0.019628
## factor(moredu)High School or Less 0.247530 0.088975 2.7820 0.005402
## factor(moredu)Some College 0.191136 0.093772 2.0383 0.041520
## factor(agere)(18,24] 0.226400 0.234694 0.9647 0.334715
## factor(agere)(24,35] 0.238190 0.226747 1.0505 0.293504
## factor(agere)(35,45] 0.242002 0.227042 1.0659 0.286474
## factor(agere)(45,65] 0.152828 0.230809 0.6621 0.507881
## factor(agere)(65,85] -0.202980 0.220929 -0.9188 0.358223
##
## (Intercept) ***
## factor(race_eth)NHBlack
## factor(race_eth)NHOther
## factor(race_eth)NHWhite *
## factor(moredu)High School or Less **
## factor(moredu)Some College *
## factor(agere)(18,24]
## factor(agere)(24,35]
## factor(agere)(35,45]
## factor(agere)(45,65]
## factor(agere)(65,85]
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit1)
##
## Call:
## svyglm(formula = cigdays ~ factor(race_eth) + factor(moredu) +
## factor(agere), design = des, family = poisson)
##
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~mortwt, data = sub2[sub2$mortwt >
## 0, ], nest = T)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.05553 0.24048 8.548 9.79e-12 ***
## factor(race_eth)NHBlack 0.11468 0.12568 0.912 0.3654
## factor(race_eth)NHOther 0.11111 0.20634 0.539 0.5924
## factor(race_eth)NHWhite 0.20251 0.09853 2.055 0.0445 *
## factor(moredu)High School or Less 0.24753 0.09290 2.664 0.0101 *
## factor(moredu)Some College 0.19114 0.09552 2.001 0.0503 .
## factor(agere)(18,24] 0.22640 0.23254 0.974 0.3344
## factor(agere)(24,35] 0.23819 0.22253 1.070 0.2890
## factor(agere)(35,45] 0.24200 0.22385 1.081 0.2843
## factor(agere)(45,65] 0.15283 0.22855 0.669 0.5064
## factor(agere)(65,85] -0.20298 0.22032 -0.921 0.3609
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 5.237111)
##
## Number of Fisher Scoring iterations: 5
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
fit.nb2<-glm.nb(cigdays~factor(race_eth)+factor(moredu)+factor(agere),
data=sub2,
weights=wts)
clx2(fit.nb2,cluster =sub2$strata)
## Warning in if (class(fm) == "zeroinfl" | class(fm) == "hurdle") {: the
## condition has length > 1 and only the first element will be used
## Warning in if (class(fm) != "zeroinfl") {: the condition has length > 1 and
## only the first element will be used
## [[1]]
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.015804 0.273695 7.3651 1.77e-13
## factor(race_eth)NHBlack 0.113642 0.121341 0.9366 0.348990
## factor(race_eth)NHOther 0.115581 0.202821 0.5699 0.568766
## factor(race_eth)NHWhite 0.201721 0.092016 2.1922 0.028362
## factor(moredu)High School or Less 0.253645 0.087185 2.9093 0.003623
## factor(moredu)Some College 0.197107 0.093391 2.1106 0.034811
## factor(agere)(18,24] 0.270800 0.254725 1.0631 0.287734
## factor(agere)(24,35] 0.271840 0.244950 1.1098 0.267095
## factor(agere)(35,45] 0.284109 0.248987 1.1411 0.253845
## factor(agere)(45,65] 0.180431 0.251801 0.7166 0.473645
## factor(agere)(65,85] -0.168585 0.242355 -0.6956 0.486674
##
## (Intercept) ***
## factor(race_eth)NHBlack
## factor(race_eth)NHOther
## factor(race_eth)NHWhite *
## factor(moredu)High School or Less **
## factor(moredu)Some College *
## factor(agere)(18,24]
## factor(agere)(24,35]
## factor(agere)(35,45]
## factor(agere)(45,65]
## factor(agere)(65,85]
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(fit.nb2, vcov=vcovHC(fit.nb2, type="HC1",cluster="strata"))
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.015804 0.265270 7.5991 2.983e-14
## factor(race_eth)NHBlack 0.113642 0.124411 0.9134 0.361010
## factor(race_eth)NHOther 0.115581 0.206726 0.5591 0.576090
## factor(race_eth)NHWhite 0.201721 0.087256 2.3118 0.020787
## factor(moredu)High School or Less 0.253645 0.089808 2.8243 0.004738
## factor(moredu)Some College 0.197107 0.094385 2.0883 0.036769
## factor(agere)(18,24] 0.270800 0.255316 1.0606 0.288850
## factor(agere)(24,35] 0.271840 0.248894 1.0922 0.274750
## factor(agere)(35,45] 0.284109 0.249534 1.1386 0.254887
## factor(agere)(45,65] 0.180431 0.252739 0.7139 0.475287
## factor(agere)(65,85] -0.168585 0.244659 -0.6891 0.490786
##
## (Intercept) ***
## factor(race_eth)NHBlack
## factor(race_eth)NHOther
## factor(race_eth)NHWhite *
## factor(moredu)High School or Less **
## factor(moredu)Some College *
## factor(agere)(18,24]
## factor(agere)(24,35]
## factor(agere)(35,45]
## factor(agere)(45,65]
## factor(agere)(65,85]
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We get similar output like the survey output above.
AIC(fit1)
## eff.p AIC deltabar
## 52.667854 2686.335608 5.266785
AIC(fit.nb2)
## [1] 3087.563
By comparing the AIC output of the Poisson and Negative Binomial model, it seems that the Poisson model is a slightly better fit than the negative binomial model. However, both the models are not a good fit to the data. As a rule of thumb, both AIC values are way higher than the coveted AIC value of 10. Hence the poisson is a preffered count model than the negative binomial to adddress the question of number of days smoked in the past 30days in association with Edcation and Race/Ethnicity.