For the following task I am interested in looking at Number days smoked in past 30 days (some day smokers) as a count outcome with regards to Education and Race and Ethnicity.

  1. Research Questions: I am interested in answering the following questoins:
  1. Does number of days smoking vary with increased educational attainment?
  2. Does number of days smoking vary across racial and ethinic population?

1)b. An offset term is necessary in my model in order to incorporate unequal risk. That is, people who are exposed to various conditions depending on their status, such good/bad neighborhood, stressful/light work environment, strict/indifferent school rules and regulations, and socioeconomic backgrounds, may be more/less likely to smoke in the past 30 days than someone who is not exposed to such conditions that can induce smoking.

Recoding Variables:

Education, as one of the moderators, will be categorized into three groups: High School or Less, Some College, and College. Race/Ethnicity, as another moderator, will be categorized into four groups: NHWhite, NHBlack, Hispanic, and NHother. Age is recoded to break into the following age groups: 18-24, 24-35, 35-45, 45-65, 65+.

Loading Libraries

library(readr)
library(car)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.1. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(survival)
library(cmprsk)
library(questionr)
library(haven)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:Matrix':
## 
##     expand

Reading data

mydata <- read_dta("C://Users/munta/Google Drive/NHIS/nhis_00005.dta")
names(mydata)
##   [1] "year"         "serial"       "quarter"      "strata"      
##   [5] "psu"          "nhishid"      "hhweight"     "region"      
##   [9] "livingqtr"    "nhispid"      "hhx"          "fmx"         
##  [13] "px"           "perweight"    "sampweight"   "fweight"     
##  [17] "astatflg"     "cstatflg"     "pernum"       "supp2wt"     
##  [21] "age"          "sex"          "marstat"      "marst"       
##  [25] "marstcohab"   "cohabmarst"   "cohabevmar"   "birthmo"     
##  [29] "birthyr"      "racea"        "hispeth"      "racesr"      
##  [33] "yrsinus"      "hispyn"       "usborn"       "citizen"     
##  [37] "racenew"      "racebr"       "raceimpute"   "educrec2"    
##  [41] "educrec1"     "higrade2"     "higrade1"     "educ"        
##  [45] "empstat"      "occ1995"      "occ"          "ind1995"     
##  [49] "ind"          "hourswrk"     "monthwrk"     "numemps"     
##  [53] "secondjob"    "paidhour"     "paidsick"     "usualft"     
##  [57] "empstatwkyr"  "pooryn"       "incfam97on2"  "earnimp1"    
##  [61] "earnimp2"     "earnimp3"     "earnimp4"     "earnimp5"    
##  [65] "health"       "weight"       "bedayr2"      "bmicalc"     
##  [69] "bmi"          "bedayr"       "alc1yr"       "alclife"     
##  [73] "alc5upyr"     "alcamt"       "alcstat1"     "alcstat2"    
##  [77] "alcanyno"     "alcanytp"     "alcdaysmo"    "alcdayswk"   
##  [81] "alcdaysyr"    "alc5upno"     "alc5uptp"     "smokev"      
##  [85] "smokagereg"   "cigdaymo"     "cigsday"      "cigsday1"    
##  [89] "cigsday2"     "smokestatus1" "smokestatus2" "smokfreqnow" 
##  [93] "mortelig"     "mortstat"     "mortdodq"     "mortdody"    
##  [97] "mortucod"     "mortucodld"   "mortwt"       "mortdiab"    
## [101] "morthypr"     "mortcms"      "mortndi"      "mortssa"     
## [105] "mortwtsa"

Taking a Subset of the Population and Recoding Variables

#Year of birth
mydata$yob<-mydata$year-mydata$age

#Restricting the year of birth
mydata<-subset (mydata, yob%in%c(1940:1985) & year%in%c(2001:2006))

#Dividing the cohort into early and late
mydata$cohort<-car::recode(mydata$yob, recodes="1940:1963='early'; 1964:1985='late'; else=NA", as.factor.result = )

#Taking a subset of the total observations
sub<-subset(mydata, mydata$mortelig==1&is.na(mydata$racenew)==F)

samps<-sample(1:length(sub$psu), size = 20000, replace = F)
sub<-sub[samps,] #Using a subset of 20000 samples

#Age
sub$d.age<-ifelse(sub$mortstat==1, sub$mortdody-(sub$year-sub$age) ,
                  ifelse(sub$mortstat==2, 2006-(sub$year-sub$age), NA))
#Mortality Status
sub$d.event<-ifelse(sub$mortstat==1,1,0)
sub$timetodeath<-ifelse(sub$mortstat ==1, sub$mortdody-sub$year, 2006 - sub$year )
sub$d5yr<-ifelse(sub$timetodeath<=5&sub$mortstat==1, 1,0)

#Mortalist Status: Dead or Alive
sub$mortstat<-recode(sub$mortstat, recodes= "1='Dead'; 2='Alive'; 9=NA", as.factor.result=T)

#Marital Status
sub$married<-recode(sub$marstat, recodes="00=NA; 10:13='married'; 20:40='separated'; 50='nevermarried'; 99=NA", as.factor.result=T )
sub$sep<-ifelse(sub$married=='separated',1,0)
sub$nm<-ifelse(sub$married=='nevermarried',1,0)

#Smoking Status
sub$smoke<-recode(sub$smokestatus2, recodes="00=NA; 10:13='CurrentSmoker'; 20='FormerSmoker'; 40='FormerSmoker'; 30='NeverSmoked'; else=NA", as.factor.result=T)
sub$smoke<-relevel(sub$smoke, ref = "NeverSmoked")

#Drinking Status
sub$alcohol<-recode(sub$alcstat2, recodes= "00=NA; 10=1; 20:23=2; 31:32=3; 35=3; 33=4; 34=5; 97:99=NA", as.factor.result=T)
sub$alcohol<-relevel(sub$alcohol, ref = "5")

sub$male<-ifelse(sub$sex==1,1,0)

sub$mwt<-sub$mortwt/mean(sub$mortwt, na.rm=T)

#Age cut into intervals
sub$agere<-cut(sub$age, breaks=c(15,18,24,35,45,65,85))

#Education 
sub$moredu<-recode(sub$educrec2, recodes="00=NA; 10:42='High School or Less'; 50:53='Some College'; 54:60='College'; else=NA", as.factor.result=T)

sub$hs<-ifelse(sub$moredu=='hs or less',1,0)
sub$scol1<-ifelse(sub$moredu=='some coll',1,0)

#Race/Ethnicity
sub$race<-recode(sub$racenew, recodes ="10='White'; 20 ='Black'; 30:61='Other'; 97:99=NA", as.factor.result=T)

sub$black<-ifelse(sub$race=='Black',1,0)
sub$oth<-ifelse(sub$race=='Other',1,0)

sub$hisp<-recode(sub$hispyn, recodes="1=0; 2=1; else=NA")

sub$race_eth[sub$hisp == 0 & sub$race=="White"]<-"NHWhite"
## Warning: Unknown or uninitialised column: 'race_eth'.
sub$race_eth[sub$hisp == 0 & sub$race=="Black"]<-"NHBlack"
sub$race_eth[sub$hisp == 0 & sub$race=="Other"]<-"NHOther"
sub$race_eth[sub$hisp == 1 ]<-"Hispanic"
sub$race_eth[is.na(sub$hisp) ==T | is.na(sub$race)==T]<-NA

#race_eth <-c("NHWhite", "NHBlack", "NHOther", "Hispanic", "NA")
#sub$race_eth<-recode(sub$race_eth, recodes= "NHWhite='1'; NHBlack='2'; NHOther='3'; Hispanic='4', NA=else", as.factor.result=T)

#requency drank alcohol in past year: Days per month (Count Model Outcome Variable)
#sub$alcdays <- recode(as.numeric(sub$alcdaysmo), recodes = "0=0; 95=0; 96:99=NA")
#hist(sub$alcdays) 

#Number days smoked in past 30 days (some day smokers)
sub$cigdays <- recode(as.numeric(sub$cigdaymo), recodes = "00=0; 96:99=NA") #single inverted comma treats it as categorical.
hist(sub$cigdays)

#summary(sub$alcdays)
summary(sub$cigdays)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    5.00   15.00   13.31   20.00   30.00   19557

The mean shows that, people (some day smoker) smoked on an average for 15 days a month.

Analysis

sub$alconsum<-recode(sub$alcohol, recodes= "10=1; 20:23=2; 31:32=3; 35=3; 33=4; 34=5; 97:99=NA", as.factor.result=T)

sub$alconsum<-relevel(sub$alconsum, ref = "5")

sub$alcstat<-car::recode(sub$alcohol,
                                recodes="10=1; 20:23=2; 31:32=3; 35=3; 33=4; 34=5; 97:99=NA", as.factor.result = F)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#Keeping only clean cases
sub2<-sub%>%
  select(cigdays, alconsum, alcstat, cohort, mortstat, agere, race_eth, married, moredu, race, black, hisp, oth, smoke, mortwt, strata, psu) %>%
  filter( complete.cases(.))

#Survey Design
options(survey.lonely.psu="adjust")
des<-svydesign(ids=~psu, strata=~strata, weights = ~mortwt, data=sub2[sub2$mortwt>0,], nest=T)

2) Poisson Regression

Simple descriptives

svyhist(~cigdays, des)

svyby(~cigdays, ~race_eth+moredu, des, svymean, na.rm=T)
##                              race_eth              moredu   cigdays
## Hispanic.College             Hispanic             College 10.128410
## NHBlack.College               NHBlack             College 11.237746
## NHOther.College               NHOther             College 10.211226
## NHWhite.College               NHWhite             College 11.796459
## Hispanic.High School or Less Hispanic High School or Less 12.334416
## NHBlack.High School or Less   NHBlack High School or Less 14.740256
## NHOther.High School or Less   NHOther High School or Less  9.458504
## NHWhite.High School or Less   NHWhite High School or Less 15.036463
## Hispanic.Some College        Hispanic        Some College 12.047193
## NHBlack.Some College          NHBlack        Some College 11.812800
## NHOther.Some College          NHOther        Some College 16.411776
## NHWhite.Some College          NHWhite        Some College 14.382866
##                                     se
## Hispanic.College             2.6953998
## NHBlack.College              3.7671581
## NHOther.College              3.8135281
## NHWhite.College              0.9825825
## Hispanic.High School or Less 1.1976894
## NHBlack.High School or Less  1.5273259
## NHOther.High School or Less  3.9745981
## NHWhite.High School or Less  0.9395040
## Hispanic.Some College        1.5128831
## NHBlack.Some College         2.2460443
## NHOther.Some College         3.6385585
## NHWhite.Some College         0.9577137
svyby(~cigdays, ~agere, des, svymean, na.rm=T)
##           agere  cigdays        se
## (15,18] (15,18] 11.18421 3.3372110
## (18,24] (18,24] 13.80263 1.1558116
## (24,35] (24,35] 13.71868 0.7193883
## (35,45] (35,45] 13.84462 0.8286642
## (45,65] (45,65] 12.73291 0.8957113
## (65,85] (65,85] 10.00000 0.0000000

Education seems to show variable effects on the number of days smoked. At every level of education, the NHBlacks show higher number of days smoked compared to others, and Hispanics show the lowest number of days smoked.

Interestingly, the youngest age group (15-18 show highest number of days smoked in a month, 17days on an average. However, the number lowers during the middle ages and increases again during the 45-65 age groups.

Poisson glm fit to survey data

fit1<-svyglm(cigdays~factor(race_eth)+factor(moredu)+factor(agere), design=des, family=poisson)
summary(fit1)
## 
## Call:
## svyglm(formula = cigdays ~ factor(race_eth) + factor(moredu) + 
##     factor(agere), design = des, family = poisson)
## 
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~mortwt, data = sub2[sub2$mortwt > 
##     0, ], nest = T)
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        2.05553    0.24048   8.548 9.79e-12 ***
## factor(race_eth)NHBlack            0.11468    0.12568   0.912   0.3654    
## factor(race_eth)NHOther            0.11111    0.20634   0.539   0.5924    
## factor(race_eth)NHWhite            0.20251    0.09853   2.055   0.0445 *  
## factor(moredu)High School or Less  0.24753    0.09290   2.664   0.0101 *  
## factor(moredu)Some College         0.19114    0.09552   2.001   0.0503 .  
## factor(agere)(18,24]               0.22640    0.23254   0.974   0.3344    
## factor(agere)(24,35]               0.23819    0.22253   1.070   0.2890    
## factor(agere)(35,45]               0.24200    0.22385   1.081   0.2843    
## factor(agere)(45,65]               0.15283    0.22855   0.669   0.5064    
## factor(agere)(65,85]              -0.20298    0.22032  -0.921   0.3609    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 5.237111)
## 
## Number of Fisher Scoring iterations: 5

The output shows what we have seen above. The Hispanics show lower number of days smoked compared to NHBlacks and NHWhites. The association of education and number of days smoked is not significant.

The poisson model “risk ratios”

round(exp(summary(fit1)$coef[-1,1]), 3) #which just show the change in the mean
##           factor(race_eth)NHBlack           factor(race_eth)NHOther 
##                             1.122                             1.118 
##           factor(race_eth)NHWhite factor(moredu)High School or Less 
##                             1.224                             1.281 
##        factor(moredu)Some College              factor(agere)(18,24] 
##                             1.211                             1.254 
##              factor(agere)(24,35]              factor(agere)(35,45] 
##                             1.269                             1.274 
##              factor(agere)(45,65]              factor(agere)(65,85] 
##                             1.165                             0.816

The risk ratios show that, the NHBlacks and NHWhites have 35% and 25% higher mean counts of days smoked than the Hispanics. All other age groups show lower percentage of mean count of days smoked than the 15-18 age groups, which is justified from the literature.

Overdispersion

fit2<-glm(cigdays~factor(race_eth)+factor(moredu)+factor(agere), data=sub2, family=poisson)
summary(fit2)
## 
## Call:
## glm(formula = cigdays ~ factor(race_eth) + factor(moredu) + factor(agere), 
##     family = poisson, data = sub2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.5553  -2.3953   0.0618   1.6647   4.5694  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        2.00170    0.22776   8.789  < 2e-16 ***
## factor(race_eth)NHBlack            0.16334    0.04505   3.626 0.000288 ***
## factor(race_eth)NHOther            0.11119    0.07029   1.582 0.113654    
## factor(race_eth)NHWhite            0.24882    0.03472   7.166 7.71e-13 ***
## factor(moredu)High School or Less  0.18786    0.03757   5.000 5.72e-07 ***
## factor(moredu)Some College         0.14370    0.03881   3.703 0.000213 ***
## factor(agere)(18,24]               0.31488    0.22657   1.390 0.164591    
## factor(agere)(24,35]               0.29782    0.22500   1.324 0.185630    
## factor(agere)(35,45]               0.29797    0.22547   1.322 0.186318    
## factor(agere)(45,65]               0.24339    0.22536   1.080 0.280146    
## factor(agere)(65,85]              -0.13580    0.38789  -0.350 0.726271    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2727.4  on 433  degrees of freedom
## Residual deviance: 2655.8  on 423  degrees of freedom
## AIC: 4440.1
## 
## Number of Fisher Scoring iterations: 5
scale<-sqrt(fit2$deviance/fit2$df.residual)
scale
## [1] 2.50567

We get 2.5 times as much deviance as degree of freedom residuals. Since the value is more than 1, the model does not fit the data.

1-pchisq(fit2$deviance, df = fit2$df.residual)
## [1] 0

The p-value of 0 also mean that the model does not fit the data.

Modeling Overdispersion via a Quasi distribution

fit3<-glm(cigdays~factor(race_eth)+factor(moredu)+factor(agere), data=sub2, family=quasipoisson)
summary(fit3)
## 
## Call:
## glm(formula = cigdays ~ factor(race_eth) + factor(moredu) + factor(agere), 
##     family = quasipoisson, data = sub2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.5553  -2.3953   0.0618   1.6647   4.5694  
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        2.00170    0.53136   3.767 0.000189 ***
## factor(race_eth)NHBlack            0.16334    0.10511   1.554 0.120920    
## factor(race_eth)NHOther            0.11119    0.16398   0.678 0.498080    
## factor(race_eth)NHWhite            0.24882    0.08100   3.072 0.002266 ** 
## factor(moredu)High School or Less  0.18786    0.08765   2.143 0.032655 *  
## factor(moredu)Some College         0.14370    0.09054   1.587 0.113200    
## factor(agere)(18,24]               0.31488    0.52857   0.596 0.551681    
## factor(agere)(24,35]               0.29782    0.52492   0.567 0.570775    
## factor(agere)(35,45]               0.29797    0.52602   0.566 0.571376    
## factor(agere)(45,65]               0.24339    0.52577   0.463 0.643655    
## factor(agere)(65,85]              -0.13580    0.90494  -0.150 0.880787    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 5.442738)
## 
##     Null deviance: 2727.4  on 433  degrees of freedom
## Residual deviance: 2655.8  on 423  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5

We can still see presence of overdispersion. Which means that the poisson model is not a good choice at this point.

3) Negative binomial

#Getting the clustered, or robust standard errors.
clx2 <-   function(fm, dfcw,  cluster){
    
    # reweighting the var-cov matrix for the within model
    require(sandwich);require(lmtest)
    if(class(fm)=="zeroinfl"|class(fm)=="hurdle") {
    M <- length(unique(cluster))   
    N <- length(cluster)           
    K <- dim(fm$vcov)[1]        #here is the rank from the zero inflated fits             
    dfc <- (M/(M-1))*((N-1)/(N-K))  
    uj  <- apply(estfun(fm),2, function(x) tapply(x, cluster, sum, na.rm=T));
    vcovCL <- dfc[1]*sandwich(fm, meat=crossprod(uj)/N)*dfcw #fix a length problem in dfc
    list(summary=coeftest(fm, vcovCL))}
    else if(class(fm)!="zeroinfl"){
    M <- length(unique(cluster))
    N <- length(cluster)
    K <- fm$rank
    dfc <- (M/(M - 1)) * ((N - 1)/(N - K))
    uj <- apply(estfun(fm), 2, function(x) tapply(x, cluster, sum, na.rm=T));
    rcse.cov <- dfc * sandwich(fm, meat = crossprod(uj)/N)
    rcse.se <- coeftest(fm, rcse.cov)
    return(list( rcse.se))}
}

Fitting poisson and comparing it to the fit from the survey design

#scaling the weights
sub2$wts<-sub2$mortwt/mean(sub2$mortwt, na.rm=T) 
fit.pois<-glm(cigdays~factor(race_eth)+factor(moredu)+factor(agere),
              data=sub2,
              weights=wts, family=poisson)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
clx2(fit.pois,cluster =sub2$strata)
## Warning in if (class(fm) == "zeroinfl" | class(fm) == "hurdle") {: the
## condition has length > 1 and only the first element will be used
## Warning in if (class(fm) != "zeroinfl") {: the condition has length > 1 and
## only the first element will be used
## [[1]]
## 
## z test of coefficients:
## 
##                                    Estimate Std. Error z value  Pr(>|z|)
## (Intercept)                        2.055525   0.253238  8.1170 4.779e-16
## factor(race_eth)NHBlack            0.114680   0.121633  0.9428  0.345763
## factor(race_eth)NHOther            0.111112   0.201659  0.5510  0.581641
## factor(race_eth)NHWhite            0.202510   0.092108  2.1986  0.027906
## factor(moredu)High School or Less  0.247530   0.085691  2.8886  0.003869
## factor(moredu)Some College         0.191136   0.092596  2.0642  0.039000
## factor(agere)(18,24]               0.226400   0.234620  0.9650  0.334563
## factor(agere)(24,35]               0.238190   0.223293  1.0667  0.286099
## factor(agere)(35,45]               0.242002   0.226697  1.0675  0.285741
## factor(agere)(45,65]               0.152828   0.230154  0.6640  0.506674
## factor(agere)(65,85]              -0.202980   0.219205 -0.9260  0.354456
##                                      
## (Intercept)                       ***
## factor(race_eth)NHBlack              
## factor(race_eth)NHOther              
## factor(race_eth)NHWhite           *  
## factor(moredu)High School or Less ** 
## factor(moredu)Some College        *  
## factor(agere)(18,24]                 
## factor(agere)(24,35]                 
## factor(agere)(35,45]                 
## factor(agere)(45,65]                 
## factor(agere)(65,85]                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The output only show significance for NHBlacks and NHWhites compared to the Hispanics in terms of the number of days smoked in the past 30 days.

coeftest(fit.pois, vcov=vcovHC(fit.pois, type="HC1",cluster="ststr")) #same as survey!
## 
## z test of coefficients:
## 
##                                    Estimate Std. Error z value  Pr(>|z|)
## (Intercept)                        2.055525   0.245564  8.3706 < 2.2e-16
## factor(race_eth)NHBlack            0.114680   0.123707  0.9270  0.353910
## factor(race_eth)NHOther            0.111112   0.205613  0.5404  0.588925
## factor(race_eth)NHWhite            0.202510   0.086788  2.3334  0.019628
## factor(moredu)High School or Less  0.247530   0.088975  2.7820  0.005402
## factor(moredu)Some College         0.191136   0.093772  2.0383  0.041520
## factor(agere)(18,24]               0.226400   0.234694  0.9647  0.334715
## factor(agere)(24,35]               0.238190   0.226747  1.0505  0.293504
## factor(agere)(35,45]               0.242002   0.227042  1.0659  0.286474
## factor(agere)(45,65]               0.152828   0.230809  0.6621  0.507881
## factor(agere)(65,85]              -0.202980   0.220929 -0.9188  0.358223
##                                      
## (Intercept)                       ***
## factor(race_eth)NHBlack              
## factor(race_eth)NHOther              
## factor(race_eth)NHWhite           *  
## factor(moredu)High School or Less ** 
## factor(moredu)Some College        *  
## factor(agere)(18,24]                 
## factor(agere)(24,35]                 
## factor(agere)(35,45]                 
## factor(agere)(45,65]                 
## factor(agere)(65,85]                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit1)
## 
## Call:
## svyglm(formula = cigdays ~ factor(race_eth) + factor(moredu) + 
##     factor(agere), design = des, family = poisson)
## 
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~mortwt, data = sub2[sub2$mortwt > 
##     0, ], nest = T)
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        2.05553    0.24048   8.548 9.79e-12 ***
## factor(race_eth)NHBlack            0.11468    0.12568   0.912   0.3654    
## factor(race_eth)NHOther            0.11111    0.20634   0.539   0.5924    
## factor(race_eth)NHWhite            0.20251    0.09853   2.055   0.0445 *  
## factor(moredu)High School or Less  0.24753    0.09290   2.664   0.0101 *  
## factor(moredu)Some College         0.19114    0.09552   2.001   0.0503 .  
## factor(agere)(18,24]               0.22640    0.23254   0.974   0.3344    
## factor(agere)(24,35]               0.23819    0.22253   1.070   0.2890    
## factor(agere)(35,45]               0.24200    0.22385   1.081   0.2843    
## factor(agere)(45,65]               0.15283    0.22855   0.669   0.5064    
## factor(agere)(65,85]              -0.20298    0.22032  -0.921   0.3609    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 5.237111)
## 
## Number of Fisher Scoring iterations: 5

Fit the Negative Binomial GLM

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
fit.nb2<-glm.nb(cigdays~factor(race_eth)+factor(moredu)+factor(agere),
              data=sub2,
              weights=wts)
clx2(fit.nb2,cluster =sub2$strata)
## Warning in if (class(fm) == "zeroinfl" | class(fm) == "hurdle") {: the
## condition has length > 1 and only the first element will be used
## Warning in if (class(fm) != "zeroinfl") {: the condition has length > 1 and
## only the first element will be used
## [[1]]
## 
## z test of coefficients:
## 
##                                    Estimate Std. Error z value Pr(>|z|)
## (Intercept)                        2.015804   0.273695  7.3651 1.77e-13
## factor(race_eth)NHBlack            0.113642   0.121341  0.9366 0.348990
## factor(race_eth)NHOther            0.115581   0.202821  0.5699 0.568766
## factor(race_eth)NHWhite            0.201721   0.092016  2.1922 0.028362
## factor(moredu)High School or Less  0.253645   0.087185  2.9093 0.003623
## factor(moredu)Some College         0.197107   0.093391  2.1106 0.034811
## factor(agere)(18,24]               0.270800   0.254725  1.0631 0.287734
## factor(agere)(24,35]               0.271840   0.244950  1.1098 0.267095
## factor(agere)(35,45]               0.284109   0.248987  1.1411 0.253845
## factor(agere)(45,65]               0.180431   0.251801  0.7166 0.473645
## factor(agere)(65,85]              -0.168585   0.242355 -0.6956 0.486674
##                                      
## (Intercept)                       ***
## factor(race_eth)NHBlack              
## factor(race_eth)NHOther              
## factor(race_eth)NHWhite           *  
## factor(moredu)High School or Less ** 
## factor(moredu)Some College        *  
## factor(agere)(18,24]                 
## factor(agere)(24,35]                 
## factor(agere)(35,45]                 
## factor(agere)(45,65]                 
## factor(agere)(65,85]                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(fit.nb2, vcov=vcovHC(fit.nb2, type="HC1",cluster="strata"))
## 
## z test of coefficients:
## 
##                                    Estimate Std. Error z value  Pr(>|z|)
## (Intercept)                        2.015804   0.265270  7.5991 2.983e-14
## factor(race_eth)NHBlack            0.113642   0.124411  0.9134  0.361010
## factor(race_eth)NHOther            0.115581   0.206726  0.5591  0.576090
## factor(race_eth)NHWhite            0.201721   0.087256  2.3118  0.020787
## factor(moredu)High School or Less  0.253645   0.089808  2.8243  0.004738
## factor(moredu)Some College         0.197107   0.094385  2.0883  0.036769
## factor(agere)(18,24]               0.270800   0.255316  1.0606  0.288850
## factor(agere)(24,35]               0.271840   0.248894  1.0922  0.274750
## factor(agere)(35,45]               0.284109   0.249534  1.1386  0.254887
## factor(agere)(45,65]               0.180431   0.252739  0.7139  0.475287
## factor(agere)(65,85]              -0.168585   0.244659 -0.6891  0.490786
##                                      
## (Intercept)                       ***
## factor(race_eth)NHBlack              
## factor(race_eth)NHOther              
## factor(race_eth)NHWhite           *  
## factor(moredu)High School or Less ** 
## factor(moredu)Some College        *  
## factor(agere)(18,24]                 
## factor(agere)(24,35]                 
## factor(agere)(35,45]                 
## factor(agere)(45,65]                 
## factor(agere)(65,85]                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We get similar output like the survey output above.

4) Comparing the model fits of the alternative models using AIC

AIC(fit1)
##       eff.p         AIC    deltabar 
##   52.667854 2686.335608    5.266785
AIC(fit.nb2)
## [1] 3087.563

By comparing the AIC output of the Poisson and Negative Binomial model, it seems that the Poisson model is a slightly better fit than the negative binomial model. However, both the models are not a good fit to the data. As a rule of thumb, both AIC values are way higher than the coveted AIC value of 10. Hence the poisson is a preffered count model than the negative binomial to adddress the question of number of days smoked in the past 30days in association with Edcation and Race/Ethnicity.