Research question: Do racial/ethnic and educational background have any influence on smoking?

#Loading library

library(car)
library(stargazer)
library(survey)
library(questionr)
library(dplyr)
library(haven)
library(ggplot2)

#Loading data from BRFSS 2017

load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
nams<-names(brfss_17)
head(nams, n=10)
##  [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##  [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss_17)<-newnames

###Recoding predictor variables

#sex
brfss_17$male<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))

#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))

#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3, recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA", as.factor = T)

#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)

#education level
brfss_17$educ<-Recode(brfss_17$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')

#employment
brfss_17$employ<-Recode(brfss_17$employ1, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

#marital status
brfss_17$marst<-Recode(brfss_17$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')

#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))

#BMI, in the brfss_17a the bmi variable has 2 implied decimal places, so we must divide by 100 to get real bmi's

brfss_17$bmi<-brfss_17$bmi5/100
brfss_17$obese<-ifelse(brfss_17$bmi>=30, 1, 0)
#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")

Recoding outcome variable

#I have recoded smokday2 variable into binary variable. I put “1” for those who smoked everyday and those said somedays. I coded “0” for those who smoked last month and prior to that. Hence I have created smoker=1 and non-smoker=0

mydat<- brfss_17

mydat$smoker<-Recode(mydat$smokday2,recodes="1:2=1; 3=0; else=NA")

table(mydat$smoker)
## 
##     0     1 
## 61659 29620

Analysis

#Step 1: Prepare analytical dataset and survey design information

library(dplyr)
df<-mydat%>%
  select(smoker,mmsaname, bmi, agec,race_eth, marst, educ,white, black, hispanic, other, smoke, ins, mmsawt, ststr) %>%
  filter( complete.cases(.))

#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data =df )

Logit model

#step2: Run logit model

#Logit model
fit.logit<-svyglm(smoker~race_eth+educ+agec,
                  design= des,
                  family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit)
## 
## Call:
## svyglm(formula = smoker ~ race_eth + educ + agec, design = des, 
##     family = binomial)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = df)
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.55570    0.10136   5.483 4.20e-08 ***
## race_ethnh black      0.70078    0.07909   8.861  < 2e-16 ***
## race_ethnh multirace  0.50998    0.11520   4.427 9.57e-06 ***
## race_ethnh other      0.51588    0.10817   4.769 1.85e-06 ***
## race_ethnhwhite       0.16273    0.06181   2.633 0.008470 ** 
## educ0Prim             0.18353    0.10943   1.677 0.093508 .  
## educ1somehs           0.56823    0.06478   8.772  < 2e-16 ***
## educ3somecol         -0.33601    0.04126  -8.143 3.91e-16 ***
## educ4colgrad         -1.07230    0.04133 -25.946  < 2e-16 ***
## agec(24,39]          -0.32865    0.09330  -3.522 0.000428 ***
## agec(39,59]          -0.80319    0.09140  -8.788  < 2e-16 ***
## agec(59,79]          -1.65692    0.09209 -17.992  < 2e-16 ***
## agec(79,99]          -3.01377    0.14102 -21.370  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.001215)
## 
## Number of Fisher Scoring iterations: 5

##Here we only see coefficent, the direction of relationship, and the significance but we need to compare or how more/less likely the groups compared to reference group. Hence we need to calculate odds ratios and confidence interval.

Get odds ratios and confidence intervals

knitr::kable(data.frame(OR = exp(coef(fit.logit)), ci = exp(confint(fit.logit)))) 
OR ci.2.5.. ci.97.5..
(Intercept) 1.7431547 1.4290939 2.1262341
race_ethnh black 2.0153293 1.7259447 2.3532343
race_ethnh multirace 1.6652575 1.3286930 2.0870755
race_ethnh other 1.6751148 1.3551085 2.0706899
race_ethnhwhite 1.1767224 1.0424649 1.3282707
educ0Prim 1.2014543 0.9695308 1.4888567
educ1somehs 1.7651394 1.5546814 2.0040872
educ3somecol 0.7146194 0.6591012 0.7748142
educ4colgrad 0.3422210 0.3155938 0.3710949
agec(24,39] 0.7198951 0.5995833 0.8643484
agec(39,59] 0.4478969 0.3744396 0.5357649
agec(59,79] 0.1907263 0.1592286 0.2284548
agec(79,99] 0.0491063 0.0372475 0.0647408
### Interpretation

#Here we also see the relationship between smoker and every predictor value is significant as most of them are between confidence interval.

#However, people with less than primary education are 20% more likely to smoke cigarette compare to the people with highschool graduation.

#NHBlacks are two times more likely to smoke cigarette compare to the Hispanics.

Now Compare with probit model to see whether there is any difference or not??

fit.probit <- svyglm(smoker~race_eth+educ+agec,design=des,family=
                     binomial(link="probit"))
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

###Compare two model(Logit and Probit)

myexp<-function(x){exp(x)}

stargazer(fit.logit, fit.probit,
          type = "html",
          style="demography",
          covariate.labels=c("Black","MultRace" ,"Other","NHWhite", "PrimarySchool", "SomeHS","SomeColl", "CollGrad", "Age 24-39","Age 39-59" ,"Age 59-79", "Age 80+"),
          ci=T, column.sep.width = "10pt",out = "C:/Users/sarah/Google Drive/MSc Demography/Spring 2020/Stat II 5283/Homeworks/logitprobit.html" )
smoker
survey-weighted survey-weighted
logistic probit
Model 1 Model 2
Black 0.701*** 0.424***
(0.546, 0.856) (0.330, 0.519)
MultRace 0.510*** 0.311***
(0.284, 0.736) (0.173, 0.448)
Other 0.516*** 0.309***
(0.304, 0.728) (0.180, 0.439)
NHWhite 0.163** 0.097**
(0.042, 0.284) (0.023, 0.170)
PrimarySchool 0.184 0.111
(-0.031, 0.398) (-0.020, 0.242)
SomeHS 0.568*** 0.348***
(0.441, 0.695) (0.271, 0.425)
SomeColl -0.336*** -0.205***
(-0.417, -0.255) (-0.254, -0.155)
CollGrad -1.072*** -0.644***
(-1.153, -0.991) (-0.692, -0.596)
Age 24-39 -0.329*** -0.206***
(-0.512, -0.146) (-0.318, -0.094)
Age 39-59 -0.803*** -0.499***
(-0.982, -0.624) (-0.608, -0.389)
Age 59-79 -1.657*** -1.011***
(-1.837, -1.476) (-1.121, -0.902)
Age 80+ -3.014*** -1.757***
(-3.290, -2.737) (-1.908, -1.606)
Constant 0.556*** 0.346***
(0.357, 0.754) (0.225, 0.466)
N 84,408 84,408
Log Likelihood -47,460.650 -47,468.910
AIC 94,947.310 94,963.820
p < .05; p < .01; p < .001

#There is no difference between those two models. However, we can see if there is any marginal effect of the models on the results.

###Marginal effects

#Logit marginal effects
log.marg<-coef(fit.logit)*mean(dlogis(predict(fit.logit)), na.rm=T)

#for probit now
prob.marg<-coef(fit.probit)*mean(dnorm(predict(fit.probit)), na.rm=T)


effects<-data.frame(effect=c(log.marg, prob.marg),
                    term=rep(names(log.marg),2),
                    model=c(rep("logit", length(log.marg)),rep("probit", length(log.marg)))
                    )

effects%>%
  ggplot(aes(x=term, y=effect))+geom_point(aes(color=model, group=model, shape=model),
  position=position_jitterdodge(jitter.width = 1),
             size=2)+
  ylab("Marginal Effect")+
  xlab("Model Term")+
  geom_abline(intercept = 0, slope=0)+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  ggtitle(label = "Comparison of marginal effects in Logit and Probit models")

#They almost have similar effects. There is no significant marginal effects on the data.

#In the logit model I see that less than highschool education had non-significant relationship to smoke cigarette everyday or someday. But in odds ratio I notice that the relatioship is significant. therefore, I am interested to know their predicted probabilites in the fitted value to compare within race/ethnicity.

###Predicted probabilites for interesting cases

dat<-expand.grid(race_eth=levels(brfss_17$race_eth), educ=levels(brfss_17$educ), agec=levels(brfss_17$agec)[3])
#generate fitted values
fit<-predict(fit.logit, newdat=dat,type="response")
fitp<-predict(fit.probit, newdat=dat,type="response")

dat$fitted.prob.lrm<-round(fit, 3)
dat$fitted.prob.pro<-round(fitp, 3)

#add the values to the fake data
dat$fitted.prob.lrm<-round(fit, 3)
dat$fitted.prob.pro<-round(fitp, 3)

#Print the fitted probabilities for the first 20 cases
knitr::kable(head(dat, n=20))
race_eth educ agec fitted.prob.lrm fitted.prob.pro
hispanic 2hsgrad (39,59] 0.438 0.439
nh black 2hsgrad (39,59] 0.611 0.607
nh multirace 2hsgrad (39,59] 0.565 0.563
nh other 2hsgrad (39,59] 0.567 0.562
nhwhite 2hsgrad (39,59] 0.479 0.478
hispanic 0Prim (39,59] 0.484 0.483
nh black 0Prim (39,59] 0.654 0.649
nh multirace 0Prim (39,59] 0.610 0.606
nh other 0Prim (39,59] 0.611 0.605
nhwhite 0Prim (39,59] 0.525 0.522
hispanic 1somehs (39,59] 0.580 0.577
nh black 1somehs (39,59] 0.735 0.732
nh multirace 1somehs (39,59] 0.697 0.693
nh other 1somehs (39,59] 0.698 0.693
nhwhite 1somehs (39,59] 0.619 0.615
hispanic 3somecol (39,59] 0.358 0.360
nh black 3somecol (39,59] 0.529 0.527
nh multirace 3somecol (39,59] 0.482 0.481
nh other 3somecol (39,59] 0.483 0.481
nhwhite 3somecol (39,59] 0.396 0.397
dat[which(dat$race_eth=="nh black"&dat$agec=="(39,59]"&dat$educ=="0Prim"),]
##   race_eth  educ    agec fitted.prob.lrm fitted.prob.pro
## 7 nh black 0Prim (39,59]           0.654           0.649
dat[which(dat$race_eth=="hispanic"&dat$agec=="(39,59]"&dat$educ=="0Prim"),]
##   race_eth  educ    agec fitted.prob.lrm fitted.prob.pro
## 6 hispanic 0Prim (39,59]           0.484           0.483

##Interpretation: The probablity of smoking everyday for a NHBlack person who has less than primary education is 65%.

##This is where we find inference about the smoker and ask new questions for that particular group.

##According to my research question, there are influence of races and educational background on probability of smoking.