DEM 7283 - Homewk 6 -Poisson and Negative Binomial Models

#Load data for analysis#sub-setting and Re-codding variables for analysis purposes

brfss <- readRDS("brfss_177.rds")
# Cleaning the  variable names for space, underscore & Uppercase Characters
renam<-names(brfss)
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  renam))
names(brfss)<-newnames


homewk3 <- brfss %>% 
  dplyr::select(state,llcpwt,dispcode,seqno,psu,bmi5cat,racegr3,educa,ststr,smoke100,exerany2,sex,ageg,menthlth,smokday2,sleptim1) %>%  
  mutate( smokers=ifelse(smoke100==1 & smokday2%in%c(1,2),"Current",
                 ifelse(smoke100==1 & smokday2==3, "Former", 
                ifelse(smoke100==2, "Never", NA)))) %>% 
  
mutate(bmi = car::Recode(bmi5cat,recodes="1='Underweight';2='Normal';3='Overweight'; 4='Obese';else=NA",as.factor=T ),
       educa = car::Recode(educa,recodes="1:2='<HS';3:4='HS';5:6='>HS';else=NA", as.factor=T ),
       racegr3 = car::Recode(racegr3,recodes="1='NH-White';2='NH-Black';3:4='Other';5='Hispanic';else=NA",as.factor=T ), 
       ageg = car::Recode(ageg,recodes="1='18-24';2='25-30';3='35-44';4='45-54';5='55-64';else='65+'" ), 
       Physicala = car::Recode(exerany2,recodes="='Yes';2='No';else=NA",as.factor=T ), sex = car::Recode(sex,recodes="1='Male';2='Female';else=NA",as.factor=T ),
       sleepdurat=car::Recode(sleptim1, recodes="77=NA ;99= NA"),
        mentald=ifelse(menthlth<31, "Yes",
                       ifelse(menthlth==88, "No",NA))) 
 
#Data containing only variables used in the analysis
 Final_data <- homewk3 %>% 
 dplyr:: select(state,llcpwt,dispcode,seqno,psu,smokers,bmi,educa,ageg,Physicala, sleepdurat,sex,racegr3,mentald,ststr) %>% 
 filter(complete.cases(.))

# removing unwanted data from the R environment
rm(brfss,newnames,renam,rg);gc()

## Warning in rm(brfss, newnames, renam, rg): object 'rg' not found

##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  2576943 137.7    5325135  284.4   2995657  160.0
## Vcells 12708632  97.0  156031371 1190.5 194979110 1487.6

#rm(list = ls())

#  Survey Design
options(survey.lonely.psu = "adjust")

des<-svydesign(ids=~1, strata=~ststr, weights=~llcpwt, data = Final_data )

For the purpose of this analysis, the outcome variable is self reported average hours of sleep in a 24 period. The questiuon on survey is: “On average, how many hours of sleep do you have in a day” was re-coded such that 3 will be the lowest (Inadequate) sleep duration and 1 being the Highest (More than adequate) sleep duration any of the respondents reported.

Research Question

what is the relationship between average daily sleep duration and Body Mass Index(BMI) ?

Fitting the Poisson regression model

Since i am not interested in rates but counts of the outcome variable, i will not be using an offset for the poisson model

svyhist(~sleepdurat, des)

svyby(~sleepdurat, ~bmi+educa, des, svymean, na.rm=T)

svyby(~sleepdurat, ~ageg, des, svymean, na.rm=T)

#Poisson glm fit to survey data
fit1<-svyglm(sleepdurat~factor(bmi)+factor(educa)+factor(ageg), design=des, family=poisson)
summary(fit1)

## 
## Call:
## svyglm(formula = sleepdurat ~ factor(bmi) + factor(educa) + factor(ageg), 
##     design = des, family = poisson)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = Final_data)
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             1.983388   0.026176  75.772  < 2e-16 ***
## factor(bmi)Obese       -0.004991   0.009544  -0.523  0.60103    
## factor(bmi)Overweight  -0.005767   0.009420  -0.612  0.54042    
## factor(bmi)Underweight  0.018093   0.025691   0.704  0.48128    
## factor(educa)>HS       -0.037761   0.018476  -2.044  0.04100 *  
## factor(educa)HS        -0.038523   0.018851  -2.044  0.04102 *  
## factor(ageg)25-30      -0.028610   0.020649  -1.386  0.16592    
## factor(ageg)35-44      -0.017171   0.020079  -0.855  0.39249    
## factor(ageg)45-54      -0.044254   0.019932  -2.220  0.02642 *  
## factor(ageg)55-64      -0.006065   0.018283  -0.332  0.74008    
## factor(ageg)65+         0.048138   0.017598   2.735  0.00624 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 0.3740781)
## 
## Number of Fisher Scoring iterations: 4

#here are the poisson model "risk ratios", which just show the change in the mean
round(exp(summary(fit1)$coef[-1,1]), 3)

##       factor(bmi)Obese  factor(bmi)Overweight factor(bmi)Underweight 
##                  0.995                  0.994                  1.018 
##       factor(educa)>HS        factor(educa)HS      factor(ageg)25-30 
##                  0.963                  0.962                  0.972 
##      factor(ageg)35-44      factor(ageg)45-54      factor(ageg)55-64 
##                  0.983                  0.957                  0.994 
##        factor(ageg)65+ 
##                  1.049

Testing for dispersion.

# fitting logit for each changes 
library(AER)

## Loading required package: lmtest

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

dispersiontest(fit1,trafo=1)

## 
##  Overdispersion test
## 
## data:  fit1
## z = -67.147, p-value = 1
## alternative hypothesis: true alpha is greater than 0
## sample estimates:
##      alpha 
## -0.6416407

Since the p-value from the test above is greater than 0 it means that there is no overdispersion. However, the alpha from the test is less than 0 this indiccates underdispersion. The result from the test suggests that Poisson model is not a good choice for the data.

Negative Binomial Model

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

fit2<-glm.nb(sleepdurat~factor(bmi)+factor(racegr3)+factor(educa),
              data=Final_data,
              weights=llcpwt/mean(llcpwt, na.rm=T))

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

coeftest(fit2, vcov=vcovHC(fit2, type="HC1",cluster="ststr"))

## 
## z test of coefficients:
## 
##                           Estimate Std. Error z value  Pr(>|z|)    
## (Intercept)              1.9917218  0.0203111 98.0609 < 2.2e-16 ***
## factor(bmi)Obese        -0.0081711  0.0098870 -0.8265  0.408548    
## factor(bmi)Overweight   -0.0075330  0.0097022 -0.7764  0.437503    
## factor(bmi)Underweight   0.0251463  0.0261481  0.9617  0.336207    
## factor(racegr3)NH-Black -0.0265255  0.0194589 -1.3632  0.172834    
## factor(racegr3)NH-White -0.0123831  0.0126914 -0.9757  0.329211    
## factor(racegr3)Other    -0.0599543  0.0195331 -3.0694  0.002145 ** 
## factor(educa)>HS        -0.0287362  0.0192925 -1.4895  0.136357    
## factor(educa)HS         -0.0330634  0.0195729 -1.6892  0.091172 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Alternative models

fit.nb1<-glm.nb(sleepdurat~factor(bmi)+factor(racegr3)+factor(educa),
              data=Final_data,
              weights=llcpwt/mean(llcpwt, na.rm=T))

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

fit.nb2<-glm.nb(sleepdurat~factor(bmi)+factor(racegr3)+factor(educa)+factor(ageg),
              data=Final_data,
              weights=llcpwt/mean(llcpwt, na.rm=T))

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

tests1<-coeftest(fit.nb1, vcov=vcovHC(fit.nb2, type="HC1",cluster="ststr"))
tests<-coeftest(fit.nb2, vcov=vcovHC(fit.nb2, type="HC1",cluster="ststr"))

stargazer(fit.nb1, fit.nb2,style="demography", type = "text", t.auto=F,p.auto=F,coef=list(tests1[, 1],tests[,1]),  se =list(tests1[, 2], tests[, 2]), p=list(tests1[,4],tests[, 4])   )

## 
## ---------------------------------------------------------------------------
##                                             sleepdurat                     
##                                  Model 1                   Model 2         
## ---------------------------------------------------------------------------
## factor(bmi)Obese                 -0.008                    -0.006          
##                                  (0.010)                   (0.010)         
## factor(bmi)Overweight            -0.008                    -0.008          
##                                  (0.009)                   (0.009)         
## factor(bmi)Underweight            0.025                     0.019          
##                                  (0.026)                   (0.026)         
## factor(racegr3)NH-Black          -0.027                    -0.037          
##                                  (0.019)                   (0.019)         
## factor(racegr3)NH-White          -0.012                   -0.033**         
##                                  (0.013)                   (0.013)         
## factor(racegr3)Other            -0.060**                  -0.067***        
##                                  (0.019)                   (0.019)         
## factor(educa)> HS                -0.029                    -0.020          
##                                  (0.019)                   (0.019)         
## factor(educa)HS                  -0.033                    -0.024          
##                                  (0.020)                   (0.020)         
## factor(ageg)25-30                                          -0.030          
##                                                            (0.021)         
## factor(ageg)35-44                                          -0.017          
##                                                            (0.020)         
## factor(ageg)45-54                                          -0.042*         
##                                                            (0.020)         
## factor(ageg)55-64                                          -0.002          
##                                                            (0.018)         
## factor(ageg)65+                                            0.052**         
##                                                            (0.018)         
## Constant                        1.992***                  1.998***         
##                                  (0.027)                   (0.027)         
## N                                11,432                    11,432          
## Log Likelihood                 -23,761.520               -23,715.180       
## theta                   333,562.800 (471,726.300) 339,828.300 (481,934.700)
## AIC                            47,541.040                47,458.370        
## ---------------------------------------------------------------------------
## *p < .05; **p < .01; ***p < .001

The second model appears to fit the data better. It has the smallest AIC 47,458.370