Libraries

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(car)
## Loading required package: carData
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(ggplot2)
library(foreign)

Importing NHANES data

DEMO_J <- read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/DEMO_J.XPT", 
    NULL)
ALQ_J <-  read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/ALQ_J.XPT", 
    NULL)
BMX_J <-  read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/BMX_J.XPT", 
    NULL)
DIQ_J <-  read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/DIQ_J.XPT", 
    NULL)
SMQ_J <-  read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/SMQ_J.XPT", 
    NULL)

Merging data into master file

Master1 <- merge(DEMO_J, ALQ_J, all.x=TRUE)
Master2 <- merge(Master1, BMX_J, all.x=TRUE)
Master3 <- merge(Master2, DIQ_J, all.x=TRUE)
Master4 <- merge(Master3, SMQ_J, all.x=TRUE)

Recoding, filtering, and cleaning up the data

nams<-names(Master4)
head(nams, n=10)
##  [1] "SEQN"     "SDDSRVYR" "RIDSTATR" "RIAGENDR" "RIDAGEYR" "RIDAGEMN"
##  [7] "RIDRETH1" "RIDRETH3" "RIDEXMON" "RIDEXAGM"
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(Master4)<-newnames
#sex
Master4$sex<-as.factor(ifelse(Master4$riagendr==1,
                                "Male",
                                "Female"))
#race/ethnicity
Master4$hispanic <- Recode(Master4$ridreth3, recodes = "1=1; 2=1; else=0")
Master4$black<-Recode(Master4$ridreth3, recodes="4=1; else=0")
Master4$white<-Recode(Master4$ridreth3, recodes="3=1; else=0")
Master4$asian<-Recode(Master4$ridreth3, recodes="6=1; else=0")
Master4$other<-Recode(Master4$ridreth3, recodes="7=1; else=0")

Master4$race_eth<- Recode(Master4$ridreth3, recodes= "3= '1 white'; 1:2='2 hispanic'; 4= '3 black'; 6= '4 asian'; 7= '5 other' ", as.factor=T )

# at or below poverty threshold using income to poverty ratio
Master4$pov<-ifelse(Master4$indfmpir <= 1, "at or below poverty", 
                      ifelse(Master4$indfmpir > 1 & Master4$indfmpir <=2, "1-2 times pov",
                             ifelse(Master4$indfmpir >2 & Master4$indfmpir <=3, "2-3 times pov",
                                    ifelse(Master4$indfmpir >3 & Master4$indfmpir <=4, "3-4 times pov",
                                           ifelse(Master4$indfmpir >4 & Master4$indfmpir <=5, "4-5 times pov",
                                                  ifelse(Master4$indfmpir >5, "5+ times pov",NA))))))


Master4$pov2<- Recode(Master4$pov, recodes= "'at or below poverty'=1; '1-2 times pov'=2; '2-3 times pov'=3; '3-4 times pov'=4; '4-5 times pov'=5; '5+ times pov'=5",  as.factor=T)

# education

Master4$educ <- Recode(Master4$dmdeduc2, recodes="1:2='1 lths'; 3='2 hs'; 4='3 some col'; 5='4 col grad'; 7:9=NA; else=NA", as.factor=T)

# marital status
Master4$marst <- Recode(Master4$dmdmartl, recodes="1='1 married'; 2='2 widowed'; 3='3 divorced'; 4='4 separated'; 5='5 nm'; 6='6 cohab'; else=NA", as.factor=T)

# age
Master4$agec<-cut(Master4$ridageyr,
                   breaks=c(0,17,65,80))
# smoking status
Master4$cur_smk <- Recode(Master4$smq040, recodes="1:2 = 1; else=0", as.factor=T)

# ever used e-cig
Master4$evr_ecig <- Recode(Master4$smq900, recodes="1= 1; 2=0; else=NA", as.factor=T)

# ever consumed alcohol
Master4$evr_alc <- Recode(Master4$alq111, recodes="1= 1; 2=0; else=NA", as.factor=T)

# diabetes
Master4$diabetes <- Recode(Master4$diq010, recodes="1= 1; 2:3=0; else=NA", as.factor=T)

# prediabetes
Master4$pre_dbts <- Recode(Master4$diq160, recodes="1= 1; 2=0; else=NA", as.factor=T)

# health risk for diabetes
Master4$hlth_rsk <- Recode(Master4$diq170, recodes="1= 1; 2=0; else=NA", as.factor=T)

# BMI
Master4$bmicat <- ifelse(Master4$bmxbmi < 18.5, "1 underweight",
                         ifelse(Master4$bmxbmi >=18.5 & Master4$bmxbmi < 25, "2 normal",
                                ifelse(Master4$bmxbmi >=25 & Master4$bmxbmi< 30, "3 overweight",
                                       ifelse(Master4$bmxbmi >= 30, "4 obese", NA))))

Master4$bmicat2 <- Recode (Master4$bmicat, recodes= "'1 underweight'=1; '2 normal'=2; '3 overweight'=3; '4 obese'=4; else=NA ", as.factor=T)

Master4$bmicat2 <- relevel(Master4$bmicat2, ref ="1")

Master4$bminum <- Recode (Master4$bmicat, recodes= "'1 underweight'=1; '2 normal'=2; '3 overweight'=3; '4 obese'=4; else=NA ", as.factor=F)

Master4$obese <- Recode (Master4$bmicat, recodes= "'4 obese'=1; else=0 ", as.factor=T)

Master4$smkct <- Recode (Master4$smd641, recodes="99=NA", as.factor=F)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
sub<-Master4%>%
  select(bmicat, bmicat2, bminum, smkct, hlth_rsk, obese, cur_smk, agec,ridageyr, marst, educ, pov, pov2, race_eth, hispanic, black, white, asian, other, sex, wtint2yr, sdmvpsu, sdmvstra) %>%
  filter

options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~sdmvpsu, strata=~sdmvstra, weights=~wtint2yr, data =sub, nest = TRUE )

1. Define a count outcome for the dataset of your choosing

# The count variable I will use is smkct, the number of days in the past 30 days that participants smoked cigarettes.

hist(Master4$smkct)

a. Research Question

# Race/Ethnicity, sex, and educational attainment will have significant associations with smoking count. Will Blacks and Hispanics have higher mean counts than whites? Do men have higher mean counts than women? Will education attainment have a negative gradient on mean smoking counts?

b. Is an offset necessary?

# No because counts were collected from the same time period of 30 days for each participant.

2. Consider a Poisson regression model for the outcome

#Poisson glm fit to survey data
fit1<-svyglm(smkct~factor(race_eth)+factor(sex)+factor(educ), design=des, family=poisson)
summary(fit1)
## 
## Call:
## svyglm(formula = smkct ~ factor(race_eth) + factor(sex) + factor(educ), 
##     design = des, family = poisson)
## 
## Survey design:
## svydesign(ids = ~sdmvpsu, strata = ~sdmvstra, weights = ~wtint2yr, 
##     data = sub, nest = TRUE)
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 3.313524   0.033156  99.937 2.65e-12 ***
## factor(race_eth)2 hispanic -0.206698   0.034973  -5.910 0.000593 ***
## factor(race_eth)3 black    -0.041686   0.029231  -1.426 0.196873    
## factor(race_eth)4 asian    -0.090768   0.075789  -1.198 0.270032    
## factor(race_eth)5 other     0.028668   0.039532   0.725 0.491868    
## factor(sex)Male             0.018639   0.029460   0.633 0.547057    
## factor(educ)2 hs            0.006774   0.032945   0.206 0.842938    
## factor(educ)3 some col     -0.061498   0.032232  -1.908 0.098052 .  
## factor(educ)4 col grad     -0.273658   0.126594  -2.162 0.067438 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 3.214746)
## 
## Number of Fisher Scoring iterations: 5
#here are the poisson model "risk ratios", which just show the change in the mean
round(exp(summary(fit1)$coef[-1,1]), 3)
## factor(race_eth)2 hispanic    factor(race_eth)3 black 
##                      0.813                      0.959 
##    factor(race_eth)4 asian    factor(race_eth)5 other 
##                      0.913                      1.029 
##            factor(sex)Male           factor(educ)2 hs 
##                      1.019                      1.007 
##     factor(educ)3 some col     factor(educ)4 col grad 
##                      0.940                      0.761

a. Evaluate the level of dispersion in the outcome

fit2<-glm(smkct~factor(race_eth)+factor(sex)+factor(educ), data=Master4, family=poisson)
summary(fit2)
## 
## Call:
## glm(formula = smkct ~ factor(race_eth) + factor(sex) + factor(educ), 
##     family = poisson, data = Master4)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -6.8760   0.3084   0.5733   0.8620   2.3940  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 3.324236   0.016624 199.968  < 2e-16 ***
## factor(race_eth)2 hispanic -0.200775   0.020072 -10.003  < 2e-16 ***
## factor(race_eth)3 black    -0.056789   0.014966  -3.794 0.000148 ***
## factor(race_eth)4 asian    -0.106513   0.029858  -3.567 0.000361 ***
## factor(race_eth)5 other    -0.026423   0.024107  -1.096 0.273048    
## factor(sex)Male             0.012079   0.012861   0.939 0.347629    
## factor(educ)2 hs           -0.005148   0.016764  -0.307 0.758764    
## factor(educ)3 some col     -0.041641   0.016598  -2.509 0.012116 *  
## factor(educ)4 col grad     -0.193650   0.025671  -7.543 4.58e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 3934.1  on 1001  degrees of freedom
## Residual deviance: 3749.8  on  993  degrees of freedom
##   (8252 observations deleted due to missingness)
## AIC: 8718.2
## 
## Number of Fisher Scoring iterations: 5
scale<-sqrt(fit2$deviance/fit2$df.residual)
scale
## [1] 1.943249
1-pchisq(fit2$deviance, df = fit2$df.residual)
## [1] 0

b. Is the Poisson model a good choice?

# No, the ratio between residual deviance and residual degrees of freedom is nearly 2 when it should be 1, and the test of model fit resulted in a p-value of 0, indicating the data do not fit the model.

3. Consider a Negative binomial model

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)

fit.nb1<-glm.nb(smkct~factor(race_eth),
              data=sub,
              weights=wtint2yr/mean(wtint2yr, na.rm=T))

fit.nb2<-glm.nb(smkct~factor(race_eth)+factor(sex)+factor(educ),
              data=sub,
              weights=wtint2yr/mean(wtint2yr, na.rm=T))
#clx2(fit.nb2,cluster =sub$ststr)
tests1<-coeftest(fit.nb1, vcov=vcovHC(fit.nb2, type="HC1",cluster="ststr"))
tests<-coeftest(fit.nb2, vcov=vcovHC(fit.nb2, type="HC1",cluster="ststr"))
library(stargazer)

stargazer(fit.nb1, fit.nb2,style="demography", type = "text", t.auto=F,p.auto=F,coef=list(tests1[, 1],tests[,1]),  se =list(tests1[, 2], tests[, 2]), p=list(tests1[,4],tests[, 4])   )
## 
## ------------------------------------------------------------
##                                          smkct              
##                                Model 1          Model 2     
## ------------------------------------------------------------
## factor(race_eth)2 hispanic    -0.202***        -0.201***    
##                                (0.049)          (0.049)     
## factor(race_eth)3 black         -0.012           -0.038     
##                                (0.027)          (0.027)     
## factor(race_eth)4 asian         -0.137           -0.087     
##                                (0.073)          (0.073)     
## factor(race_eth)5 other         -0.023           0.033      
##                                (0.062)          (0.062)     
## factor(sex)Male                                  0.019      
##                                                 (0.031)     
## factor(educ)2 hs                                 0.009      
##                                                 (0.029)     
## factor(educ)3 some col                           -0.057     
##                                                 (0.034)     
## factor(educ)4 col grad                          -0.270**    
##                                                 (0.090)     
## Constant                       3.249***         3.309***    
##                                (0.032)          (0.032)     
## N                               1,060            1,002      
## Log Likelihood                -4,962.694       -4,579.391   
## theta                      4.183*** (0.226) 7.222*** (0.446)
## AIC                           9,935.389        9,176.781    
## ------------------------------------------------------------
## *p < .05; **p < .01; ***p < .001

4. Compare the model fits of the alternative models using AIC

# The AIC values for the poisson regression model and the negative binomial model were 8718.2 and 9176.7 respectively. Comparing both models along the lines of AIC, the poisson performed better.