Research question: Do racial/ethnic and educational background have any influence on smoking?

#Loading library

library(car)
library(stargazer)
library(survey)
library(questionr)
library(dplyr)
library(haven)
library(ggplot2)

#Loading data from BRFSS 2017

load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

nams<-names(brfss_17)
head(nams, n=10)

##  [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##  [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"

newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss_17)<-newnames

###Recoding predictor variables

#sex
brfss_17$male<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))

#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))

#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3, recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA", as.factor = T)

#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)

#education level
brfss_17$educ<-Recode(brfss_17$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')

#employment
brfss_17$employ<-Recode(brfss_17$employ1, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

#marital status
brfss_17$marst<-Recode(brfss_17$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')

#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))

#BMI, in the brfss_17a the bmi variable has 2 implied decimal places, so we must divide by 100 to get real bmi's

brfss_17$bmi<-brfss_17$bmi5/100
brfss_17$obese<-ifelse(brfss_17$bmi>=30, 1, 0)
#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")

Recoding outcome variable

#I have recoded smokday2 variable into binary variable. I put “1” for those who smoked everyday and those said somedays. I coded “0” for those who smoked last month and prior to that. Hence I have created smoker=1 and non-smoker=0

mydat<- brfss_17

mydat$smoker<-Recode(mydat$smokday2,recodes="1:2=1; 3=0; else=NA")

table(mydat$smoker)

## 
##     0     1 
## 61659 29620

Analysis

#Step 1: Prepare analytical dataset and survey design information

library(dplyr)
df<-mydat%>%
  select(smoker,mmsaname, bmi, agec,race_eth, marst, educ,white, black, hispanic, other, smoke, ins, mmsawt, ststr) %>%
  filter( complete.cases(.))

#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data =df )

Logit model

#step2: Run logit model

#Logit model
fit.logit<-svyglm(smoker~race_eth+educ+agec,
                  design= des,
                  family=binomial)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(fit.logit)

## 
## Call:
## svyglm(formula = smoker ~ race_eth + educ + agec, design = des, 
##     family = binomial)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = df)
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.55570    0.10136   5.483 4.20e-08 ***
## race_ethnh black      0.70078    0.07909   8.861  < 2e-16 ***
## race_ethnh multirace  0.50998    0.11520   4.427 9.57e-06 ***
## race_ethnh other      0.51588    0.10817   4.769 1.85e-06 ***
## race_ethnhwhite       0.16273    0.06181   2.633 0.008470 ** 
## educ0Prim             0.18353    0.10943   1.677 0.093508 .  
## educ1somehs           0.56823    0.06478   8.772  < 2e-16 ***
## educ3somecol         -0.33601    0.04126  -8.143 3.91e-16 ***
## educ4colgrad         -1.07230    0.04133 -25.946  < 2e-16 ***
## agec(24,39]          -0.32865    0.09330  -3.522 0.000428 ***
## agec(39,59]          -0.80319    0.09140  -8.788  < 2e-16 ***
## agec(59,79]          -1.65692    0.09209 -17.992  < 2e-16 ***
## agec(79,99]          -3.01377    0.14102 -21.370  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.001215)
## 
## Number of Fisher Scoring iterations: 5

##Here we only see coefficent, the direction of relationship, and the significance but we need to compare or how more/less likely the groups compared to reference group. Hence we need to calculate odds ratios and confidence interval.

Get odds ratios and confidence intervals

knitr::kable(data.frame(OR = exp(coef(fit.logit)), ci = exp(confint(fit.logit))))

	OR	ci.2.5..	ci.97.5..
(Intercept)	1.7431547	1.4290939	2.1262341
race_ethnh black	2.0153293	1.7259447	2.3532343
race_ethnh multirace	1.6652575	1.3286930	2.0870755
race_ethnh other	1.6751148	1.3551085	2.0706899
race_ethnhwhite	1.1767224	1.0424649	1.3282707
educ0Prim	1.2014543	0.9695308	1.4888567
educ1somehs	1.7651394	1.5546814	2.0040872
educ3somecol	0.7146194	0.6591012	0.7748142
educ4colgrad	0.3422210	0.3155938	0.3710949
agec(24,39]	0.7198951	0.5995833	0.8643484
agec(39,59]	0.4478969	0.3744396	0.5357649
agec(59,79]	0.1907263	0.1592286	0.2284548
agec(79,99]	0.0491063	0.0372475	0.0647408
### Interpretation

#Here we also see the relationship between smoker and every predictor value is significant as most of them are between confidence interval.

#However, people with less than primary education are 20% more likely to smoke cigarette compare to the people with highschool graduation.

#NHBlacks are two times more likely to smoke cigarette compare to the Hispanics.

Now Compare with probit model to see whether there is any difference or not??

fit.probit <- svyglm(smoker~race_eth+educ+agec,design=des,family=
                     binomial(link="probit"))

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

###Compare two model(Logit and Probit)

myexp<-function(x){exp(x)}

stargazer(fit.logit, fit.probit,
          type = "html",
          style="demography",
          covariate.labels=c("Black","MultRace" ,"Other","NHWhite", "PrimarySchool", "SomeHS","SomeColl", "CollGrad", "Age 24-39","Age 39-59" ,"Age 59-79", "Age 80+"),
          ci=T, column.sep.width = "10pt",out = "C:/Users/sarah/Google Drive/MSc Demography/Spring 2020/Stat II 5283/Homeworks/logitprobit.html" )


	smoker
	survey-weighted	survey-weighted
	logistic	probit
	Model 1	Model 2

Black	0.701^***	0.424^***
	(0.546, 0.856)	(0.330, 0.519)
MultRace	0.510^***	0.311^***
	(0.284, 0.736)	(0.173, 0.448)
Other	0.516^***	0.309^***
	(0.304, 0.728)	(0.180, 0.439)
NHWhite	0.163^**	0.097^**
	(0.042, 0.284)	(0.023, 0.170)
PrimarySchool	0.184	0.111
	(-0.031, 0.398)	(-0.020, 0.242)
SomeHS	0.568^***	0.348^***
	(0.441, 0.695)	(0.271, 0.425)
SomeColl	-0.336^***	-0.205^***
	(-0.417, -0.255)	(-0.254, -0.155)
CollGrad	-1.072^***	-0.644^***
	(-1.153, -0.991)	(-0.692, -0.596)
Age 24-39	-0.329^***	-0.206^***
	(-0.512, -0.146)	(-0.318, -0.094)
Age 39-59	-0.803^***	-0.499^***
	(-0.982, -0.624)	(-0.608, -0.389)
Age 59-79	-1.657^***	-1.011^***
	(-1.837, -1.476)	(-1.121, -0.902)
Age 80+	-3.014^***	-1.757^***
	(-3.290, -2.737)	(-1.908, -1.606)
Constant	0.556^***	0.346^***
	(0.357, 0.754)	(0.225, 0.466)
N	84,408	84,408
Log Likelihood	-47,460.650	-47,468.910
AIC	94,947.310	94,963.820

p < .05; p < .01; p < .001

#There is no difference between those two models. However, we can see if there is any marginal effect of the models on the results.

###Marginal effects

#Logit marginal effects
log.marg<-coef(fit.logit)*mean(dlogis(predict(fit.logit)), na.rm=T)

#for probit now
prob.marg<-coef(fit.probit)*mean(dnorm(predict(fit.probit)), na.rm=T)


effects<-data.frame(effect=c(log.marg, prob.marg),
                    term=rep(names(log.marg),2),
                    model=c(rep("logit", length(log.marg)),rep("probit", length(log.marg)))
                    )

effects%>%
  ggplot(aes(x=term, y=effect))+geom_point(aes(color=model, group=model, shape=model),
  position=position_jitterdodge(jitter.width = 1),
             size=2)+
  ylab("Marginal Effect")+
  xlab("Model Term")+
  geom_abline(intercept = 0, slope=0)+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  ggtitle(label = "Comparison of marginal effects in Logit and Probit models")

#They almost have similar effects. There is no significant marginal effects on the data.

#In the logit model I see that less than highschool education had non-significant relationship to smoke cigarette everyday or someday. But in odds ratio I notice that the relatioship is significant. therefore, I am interested to know their predicted probabilites in the fitted value to compare within race/ethnicity.

###Predicted probabilites for interesting cases

dat<-expand.grid(race_eth=levels(brfss_17$race_eth), educ=levels(brfss_17$educ), agec=levels(brfss_17$agec)[3])
#generate fitted values
fit<-predict(fit.logit, newdat=dat,type="response")
fitp<-predict(fit.probit, newdat=dat,type="response")

dat$fitted.prob.lrm<-round(fit, 3)
dat$fitted.prob.pro<-round(fitp, 3)

#add the values to the fake data
dat$fitted.prob.lrm<-round(fit, 3)
dat$fitted.prob.pro<-round(fitp, 3)

#Print the fitted probabilities for the first 20 cases
knitr::kable(head(dat, n=20))

race_eth	educ	agec	fitted.prob.lrm	fitted.prob.pro
hispanic	2hsgrad	(39,59]	0.438	0.439
nh black	2hsgrad	(39,59]	0.611	0.607
nh multirace	2hsgrad	(39,59]	0.565	0.563
nh other	2hsgrad	(39,59]	0.567	0.562
nhwhite	2hsgrad	(39,59]	0.479	0.478
hispanic	0Prim	(39,59]	0.484	0.483
nh black	0Prim	(39,59]	0.654	0.649
nh multirace	0Prim	(39,59]	0.610	0.606
nh other	0Prim	(39,59]	0.611	0.605
nhwhite	0Prim	(39,59]	0.525	0.522
hispanic	1somehs	(39,59]	0.580	0.577
nh black	1somehs	(39,59]	0.735	0.732
nh multirace	1somehs	(39,59]	0.697	0.693
nh other	1somehs	(39,59]	0.698	0.693
nhwhite	1somehs	(39,59]	0.619	0.615
hispanic	3somecol	(39,59]	0.358	0.360
nh black	3somecol	(39,59]	0.529	0.527
nh multirace	3somecol	(39,59]	0.482	0.481
nh other	3somecol	(39,59]	0.483	0.481
nhwhite	3somecol	(39,59]	0.396	0.397

dat[which(dat$race_eth=="nh black"&dat$agec=="(39,59]"&dat$educ=="0Prim"),]

##   race_eth  educ    agec fitted.prob.lrm fitted.prob.pro
## 7 nh black 0Prim (39,59]           0.654           0.649

dat[which(dat$race_eth=="hispanic"&dat$agec=="(39,59]"&dat$educ=="0Prim"),]

##   race_eth  educ    agec fitted.prob.lrm fitted.prob.pro
## 6 hispanic 0Prim (39,59]           0.484           0.483

##Interpretation: The probablity of smoking everyday for a NHBlack person who has less than primary education is 65%.

##This is where we find inference about the smoker and ask new questions for that particular group.

##According to my research question, there are influence of races and educational background on probability of smoking.

Dem5283_hw2_logit&probit