1.) a. Outcome variable is binge drinking. The survey question asks how many times in the past 30 days have you consumed 5+ drinks for men or 4+ drinks for women on an occassion. b. This means the potential exposure period is equivalent across. There is no need to have an offset term. In addition, I am limiting my analysis to individuals in the labor force.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(haven)
mydata<-haven::read_xpt("~/Documents/1A_DEM Doctorate/DEM 7283/R/LLCP2017.xpt")
#names(mydata)
Clean up names
newnames<-tolower(gsub(pattern = "_", replacement = "", x= names(mydata))) 
names(mydata)<-newnames
 
Recode variables
#Binary variable will be 'unintentslep' yes or no which is a recoded 'slepday1'.  The variable 'slepday1' which is the number of self-reported days the respondent unintentionally fell asleep during the day.
#recode the number of hours of sleep to binary
mydata$shortsleep<-Recode(mydata$sleptim1, recodes ="1:6='shortsleep'; 6:24='nrmsleep'; else=NA")
#recode unintentionally fell asleep during the day
mydata$unintentslep<-Recode(mydata$slepday1, recodes ="1:14=1; 88=0; else=NA")
#recode sex variable to male or female
mydata$male<-Recode(mydata$sex, recodes="1='male'; 2='0female'; else=NA")
#recode 5 level race/ethnicity 
mydata <- mydata %>%
  mutate(racethlvl = ifelse(racegr3 == 1, "0nh white", 
                                ifelse(racegr3== 2, "nh black",
                                ifelse(racegr3 == 3, "nh other/multi",
                                ifelse(racegr3 == 4, "nh other/multi",
                                ifelse(racegr3 == 5, "hispanic", NA))))))
#individual dummy variables
mydata$black<-Recode(mydata$racegr3, recodes="2='nh black';9=NA; else='0non black'")
mydata$white<-Recode(mydata$racegr3, recodes="1='nh white'; 9=NA; else='0non white'")
mydata$other<-Recode(mydata$racegr3, recodes="3:4='nh other/multi'; 9=NA; else='0non other/multi'")
mydata$hispanic<-Recode(mydata$racegr3, recodes="5='hispanic'; 9=NA; else='0non hispanic'")
#recode marital status
mydata <- mydata %>%
  mutate(maritallvl = ifelse(marital == 1, "Married", 
                                ifelse(marital == 2, "PrevMarried",
                                ifelse(marital == 3, "PrevMarried",
                                ifelse(marital == 4, "PrevMarried",
                                ifelse(marital == 5, "0NevMarried", NA))))))
#dummy variable married
mydata$married<-Recode(mydata$marital, recodes="1='married'; 2:5='0no'; else=NA")
#recode education
mydata <- mydata %>%
  mutate(educlvl = ifelse(educag == 1, "lessHS", 
                                ifelse(educag == 2, "HS",
                                ifelse(educag == 3, "Some Col",
                                ifelse(educag == 4, "Col Grad", NA)))))
#dummy variable education
mydata$col <- Recode(mydata$educag, recodes="1:2='0LessCol'; 3:4='Col'; else=NA")
#difficulty concentrating or remembering
mydata$brainf<-Recode(mydata$decide, recodes ="1='brainf'; 2='0no'; else=NA")
#recode poor health (either mental or physcial)
mydata$poorhlthnew<-Recode(mydata$poorhlth, recodes ="1:30='poor'; 88='0good'; else=NA")
#recode last checkup
mydata <- mydata %>%
  mutate(recchecklvl = ifelse(checkup1 == 1, "<1 yr", 
                                ifelse(checkup1 == 2, "<2 yr",
                                ifelse(checkup1 == 3, "<5 yr",
                                ifelse(checkup1 == 4, "5+", NA)))))
#dummy variable recent checkup (within 1 year)
mydata$reccheck<-Recode(mydata$checkup1, recodes ="1='recent'; 2:4='0no'; else=NA")
#recode laborforce etc
mydata$laborforce<-Recode(mydata$employ1, recodes="1:4='labforce'; 5:8='0no'; else=NA")
#recode employment
mydata$employyes<-Recode(mydata$employ1, recodes="1:2='employed'; 3:4='0no';  else=NA")
#outlierReplace(my_data, "brnk3ge5", which(my_data$drnk3ge5 <70), NA)
#recode binge drinking
mydata$bingedrink<-Recode(mydata$drnk3ge5, recodes= "88=0; 77=NA; 99=NA")
mydata$histress<-Recode(mydata$sdhstres, recodes="1='0none'; 2:3='some'; 4:5='histress'; else=NA")
mydata$cancertypes<-Recode(mydata$cncrdiff, recodes="7=NA; 9=NA")
#recode select illness
#mydata$hrdx <- Recode(mydata$crgvprb2, recode="8=1; 1:7=0; 9:76=0; else=NA")
#mydata$subst <- Recode(mydata$crgvprb2, recode="12=1; 1:11=0; 13:76=0; else=NA")
#mydata$injur <- Recode(mydata$crgvprb2, recode="13=1; 1:12=0; 14:76=0; else=NA")
#mydata$mentanx <- Recode(mydata$crgvprb2, recode="10=1; 1:9=0; 11:76=0; else=NA")
 
Subset data
library(dplyr)
mydatahm4<-mydata %>%
   select(unintentslep, male, racethlvl, black, white, other, hispanic, maritallvl, married, educlvl, col, brainf, poorhlthnew, recchecklvl, reccheck, laborforce, employyes, bingedrink, histress, llcpwt) %>% 
   filter(laborforce=='labforce') %>% 
   filter(is.na(llcpwt)==F)
  
#complete.cases(.)
 
Survey design object with weights
library(survey)
options(survey.lonely.psu = "adjust")
desmydata<-svydesign(ids=~1, weights=~llcpwt, data=mydatahm4)
 
What does the variable look like (no survey weights)
hist(mydatahm4$bingedrink)

summary (mydatahm4$bingedrink)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    0.00    0.00    1.32    1.00   76.00  106099
 
Descriptives with survey weights
svyhist(~bingedrink, desmydata)

svyby(~bingedrink, ~unintentslep, desmydata, svymean, na.rm=T)
##   unintentslep bingedrink         se
## 0            0   1.382093 0.05537481
## 1            1   1.584306 0.12599406
svyby(~bingedrink, ~histress, desmydata, svymean, na.rm=T)
##          histress bingedrink         se
## 0none       0none   1.454883 0.10759134
## histress histress   2.224345 0.17563528
## some         some   1.327717 0.05520098
 
 
2.) Based on the ‘intraocular traumatic test’ (C. Sparks ? to present) of the binge drinking variable histogram, Poisson could be a potential model to pursue. First will try with survey design, next without. a. The level of dispersion is best with the Poisson model without survey design weights (~2.2); however it is not 1. b. the Poisson model violates dispersion assumption and may not be the best model to choose.
Poisson model to sub survey data
pfit1<-svyglm(bingedrink~factor(racethlvl)+factor(histress), design=desmydata, family=poisson)
summary(pfit1)
## 
## Call:
## svyglm(formula = bingedrink ~ factor(racethlvl) + factor(histress), 
##     design = desmydata, family = poisson)
## 
## Survey design:
## svydesign(ids = ~1, weights = ~llcpwt, data = mydatahm4)
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      0.44407    0.07889   5.629 1.83e-08 ***
## factor(racethlvl)hispanic       -0.37039    0.10250  -3.614 0.000302 ***
## factor(racethlvl)nh black       -0.34719    0.15242  -2.278 0.022744 *  
## factor(racethlvl)nh other/multi -0.34665    0.15160  -2.287 0.022225 *  
## factor(histress)histress         0.41339    0.10881   3.799 0.000146 ***
## factor(histress)some            -0.07535    0.08532  -0.883 0.377157    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 11.24609)
## 
## Number of Fisher Scoring iterations: 7
round(exp(summary(pfit1)$coef[-1,1]), 3)
##       factor(racethlvl)hispanic       factor(racethlvl)nh black 
##                           0.690                           0.707 
## factor(racethlvl)nh other/multi        factor(histress)histress 
##                           0.707                           1.512 
##            factor(histress)some 
##                           0.927
pfit2<-glm(bingedrink~factor(racethlvl)+factor(histress), data=mydatahm4, family=poisson)
summary(pfit2)
## 
## Call:
## glm(formula = bingedrink ~ factor(racethlvl) + factor(histress), 
##     family = poisson, data = mydatahm4)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2899  -1.6163  -1.5840  -0.2796  21.7787  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      0.226759   0.008996  25.206  < 2e-16 ***
## factor(racethlvl)hispanic        0.059909   0.021126   2.836 0.004572 ** 
## factor(racethlvl)nh black       -0.243087   0.026357  -9.223  < 2e-16 ***
## factor(racethlvl)nh other/multi  0.088355   0.025244   3.500 0.000465 ***
## factor(histress)histress         0.648773   0.014958  43.372  < 2e-16 ***
## factor(histress)some             0.040411   0.011524   3.507 0.000454 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 133010  on 26985  degrees of freedom
## Residual deviance: 130979  on 26980  degrees of freedom
##   (216413 observations deleted due to missingness)
## AIC: 155472
## 
## Number of Fisher Scoring iterations: 7
scale<-sqrt(pfit2$deviance/pfit2$df.residual)
scale
## [1] 2.203335
qpfit3<-glm(bingedrink~factor(racethlvl)+factor(histress), data=mydatahm4, family=quasipoisson)
summary(qpfit3)
## 
## Call:
## glm(formula = bingedrink ~ factor(racethlvl) + factor(histress), 
##     family = quasipoisson, data = mydatahm4)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2899  -1.6163  -1.5840  -0.2796  21.7787  
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      0.22676    0.03116   7.277 3.51e-13 ***
## factor(racethlvl)hispanic        0.05991    0.07318   0.819  0.41299    
## factor(racethlvl)nh black       -0.24309    0.09130  -2.663  0.00776 ** 
## factor(racethlvl)nh other/multi  0.08835    0.08744   1.010  0.31229    
## factor(histress)histress         0.64877    0.05181  12.521  < 2e-16 ***
## factor(histress)some             0.04041    0.03992   1.012  0.31138    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 11.99856)
## 
##     Null deviance: 133010  on 26985  degrees of freedom
## Residual deviance: 130979  on 26980  degrees of freedom
##   (216413 observations deleted due to missingness)
## AIC: NA
## 
## Number of Fisher Scoring iterations: 7
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(lmtest)
library(sandwich)
coeftest(pfit2, vcov=vcovHC(pfit2, type="HC1", cluster="desmydata"))
## 
## z test of coefficients:
## 
##                                  Estimate Std. Error z value  Pr(>|z|)    
## (Intercept)                      0.226759   0.033301  6.8094 9.804e-12 ***
## factor(racethlvl)hispanic        0.059909   0.070301  0.8522  0.394117    
## factor(racethlvl)nh black       -0.243087   0.085468 -2.8442  0.004452 ** 
## factor(racethlvl)nh other/multi  0.088355   0.096925  0.9116  0.361993    
## factor(histress)histress         0.648773   0.056041 11.5767 < 2.2e-16 ***
## factor(histress)some             0.040411   0.040414  0.9999  0.317345    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
 
 
3.) The Poisson could be better, so I will try a negative binomial model to see if it improves.
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
nbfit1<-glm.nb(bingedrink~factor(racethlvl)+factor(histress), data=mydatahm4)
summary(nbfit1)
## 
## Call:
## glm.nb(formula = bingedrink ~ factor(racethlvl) + factor(histress), 
##     data = mydatahm4, init.theta = 0.1713081505, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9785  -0.8590  -0.8523  -0.0986   4.2109  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      0.22783    0.02601   8.759  < 2e-16 ***
## factor(racethlvl)hispanic        0.06307    0.06619   0.953 0.340655    
## factor(racethlvl)nh black       -0.24866    0.07295  -3.409 0.000653 ***
## factor(racethlvl)nh other/multi  0.09052    0.08033   1.127 0.259813    
## factor(histress)histress         0.64880    0.05322  12.190  < 2e-16 ***
## factor(histress)some             0.03816    0.03345   1.141 0.253958    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.1713) family taken to be 1)
## 
##     Null deviance: 18217  on 26985  degrees of freedom
## Residual deviance: 18021  on 26980  degrees of freedom
##   (216413 observations deleted due to missingness)
## AIC: 71843
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.17131 
##           Std. Err.:  0.00252 
## 
##  2 x log-likelihood:  -71828.94000
#negative binomial
exp(coef(nbfit1))
##                     (Intercept)       factor(racethlvl)hispanic 
##                       1.2558657                       1.0650985 
##       factor(racethlvl)nh black factor(racethlvl)nh other/multi 
##                       0.7798478                       1.0947401 
##        factor(histress)histress            factor(histress)some 
##                       1.9132460                       1.0389007
#poisson survey design
exp(coef(pfit1))
##                     (Intercept)       factor(racethlvl)hispanic 
##                       1.5590324                       0.6904625 
##       factor(racethlvl)nh black factor(racethlvl)nh other/multi 
##                       0.7066714                       0.7070503 
##        factor(histress)histress            factor(histress)some 
##                       1.5119312                       0.9274158
#poisson no survey design
exp(coef(pfit2))
##                     (Intercept)       factor(racethlvl)hispanic 
##                       1.2545271                       1.0617397 
##       factor(racethlvl)nh black factor(racethlvl)nh other/multi 
##                       0.7842029                       1.0923757 
##        factor(histress)histress            factor(histress)some 
##                       1.9131915                       1.0412389
Comparing all the model fits, the one with the lowest AIC is the negative binomial model (from AIC: 155472 for the poisson to AIC: 71843 for the negative binomial). In all models, the trends say that respondents with high stress are more likely to report more binge drinking occasions.
My conclusion, is that the selected models are not ideal and perhaps another model could be selected to fit. Or it could be an issue address with the data/variable. Recoding binge drinking could potentially help.