#Loading library
library(car)
library(stargazer)
library(survey)
library(questionr)
library(dplyr)
library(haven)
library(ggplot2)
#Loading data from BRFSS 2017
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
nams<-names(brfss_17)
head(nams, n=10)
## [1] "dispcode" "statere1" "safetime" "hhadult" "genhlth" "physhlth"
## [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(brfss_17)<-newnames
###Recoding predictor variables
#sex
brfss_17$male<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))
#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")
brfss_17$race_eth<-Recode(brfss_17$racegr3, recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA", as.factor = T)
#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)
#education level
brfss_17$educ<-Recode(brfss_17$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')
#employment
brfss_17$employ<-Recode(brfss_17$employ1, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')
#marital status
brfss_17$marst<-Recode(brfss_17$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')
#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))
#BMI, in the brfss_17a the bmi variable has 2 implied decimal places, so we must divide by 100 to get real bmi's
brfss_17$bmi<-brfss_17$bmi5/100
brfss_17$obese<-ifelse(brfss_17$bmi>=30, 1, 0)
#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")
#I have recoded smokday2 variable into binary variable. I put “1” for those who smoked everyday and those said somedays. I coded “0” for those who smoked last month and prior to that. Hence I have created smoker=1 and non-smoker=0
mydat<- brfss_17
mydat$smoker<-Recode(mydat$smokday2,recodes="1:2=1; 3=0; else=NA")
table(mydat$smoker)
##
## 0 1
## 61659 29620
#Step 1: Prepare analytical dataset and survey design information
library(dplyr)
df<-mydat%>%
select(smoker,mmsaname, bmi, agec,race_eth, marst, educ,white, black, hispanic, other, smoke, ins, mmsawt, ststr) %>%
filter( complete.cases(.))
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data =df )
#step2: Run logit model
#Logit model
fit.logit<-svyglm(smoker~race_eth+educ+agec,
design= des,
family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit)
##
## Call:
## svyglm(formula = smoker ~ race_eth + educ + agec, design = des,
## family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = df)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.55570 0.10136 5.483 4.20e-08 ***
## race_ethnh black 0.70078 0.07909 8.861 < 2e-16 ***
## race_ethnh multirace 0.50998 0.11520 4.427 9.57e-06 ***
## race_ethnh other 0.51588 0.10817 4.769 1.85e-06 ***
## race_ethnhwhite 0.16273 0.06181 2.633 0.008470 **
## educ0Prim 0.18353 0.10943 1.677 0.093508 .
## educ1somehs 0.56823 0.06478 8.772 < 2e-16 ***
## educ3somecol -0.33601 0.04126 -8.143 3.91e-16 ***
## educ4colgrad -1.07230 0.04133 -25.946 < 2e-16 ***
## agec(24,39] -0.32865 0.09330 -3.522 0.000428 ***
## agec(39,59] -0.80319 0.09140 -8.788 < 2e-16 ***
## agec(59,79] -1.65692 0.09209 -17.992 < 2e-16 ***
## agec(79,99] -3.01377 0.14102 -21.370 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.001215)
##
## Number of Fisher Scoring iterations: 5
##Here we only see coefficent, the direction of relationship, and the significance but we need to compare or how more/less likely the groups compared to reference group. Hence we need to calculate odds ratios and confidence interval.
knitr::kable(data.frame(OR = exp(coef(fit.logit)), ci = exp(confint(fit.logit))))
| OR | ci.2.5.. | ci.97.5.. | |
|---|---|---|---|
| (Intercept) | 1.7431547 | 1.4290939 | 2.1262341 |
| race_ethnh black | 2.0153293 | 1.7259447 | 2.3532343 |
| race_ethnh multirace | 1.6652575 | 1.3286930 | 2.0870755 |
| race_ethnh other | 1.6751148 | 1.3551085 | 2.0706899 |
| race_ethnhwhite | 1.1767224 | 1.0424649 | 1.3282707 |
| educ0Prim | 1.2014543 | 0.9695308 | 1.4888567 |
| educ1somehs | 1.7651394 | 1.5546814 | 2.0040872 |
| educ3somecol | 0.7146194 | 0.6591012 | 0.7748142 |
| educ4colgrad | 0.3422210 | 0.3155938 | 0.3710949 |
| agec(24,39] | 0.7198951 | 0.5995833 | 0.8643484 |
| agec(39,59] | 0.4478969 | 0.3744396 | 0.5357649 |
| agec(59,79] | 0.1907263 | 0.1592286 | 0.2284548 |
| agec(79,99] | 0.0491063 | 0.0372475 | 0.0647408 |
| ### Interpretation |
#Here we also see the relationship between smoker and every predictor value is significant as most of them are between confidence interval.
#However, people with less than primary education are 20% more likely to smoke cigarette compare to the people with highschool graduation.
#NHBlacks are two times more likely to smoke cigarette compare to the Hispanics.
fit.probit <- svyglm(smoker~race_eth+educ+agec,design=des,family=
binomial(link="probit"))
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
###Compare two model(Logit and Probit)
myexp<-function(x){exp(x)}
stargazer(fit.logit, fit.probit,
type = "html",
style="demography",
covariate.labels=c("Black","MultRace" ,"Other","NHWhite", "PrimarySchool", "SomeHS","SomeColl", "CollGrad", "Age 24-39","Age 39-59" ,"Age 59-79", "Age 80+"),
ci=T, column.sep.width = "10pt",out = "C:/Users/sarah/Google Drive/MSc Demography/Spring 2020/Stat II 5283/Homeworks/logitprobit.html" )
| smoker | ||
| survey-weighted | survey-weighted | |
| logistic | probit | |
| Model 1 | Model 2 | |
| Black | 0.701*** | 0.424*** |
| (0.546, 0.856) | (0.330, 0.519) | |
| MultRace | 0.510*** | 0.311*** |
| (0.284, 0.736) | (0.173, 0.448) | |
| Other | 0.516*** | 0.309*** |
| (0.304, 0.728) | (0.180, 0.439) | |
| NHWhite | 0.163** | 0.097** |
| (0.042, 0.284) | (0.023, 0.170) | |
| PrimarySchool | 0.184 | 0.111 |
| (-0.031, 0.398) | (-0.020, 0.242) | |
| SomeHS | 0.568*** | 0.348*** |
| (0.441, 0.695) | (0.271, 0.425) | |
| SomeColl | -0.336*** | -0.205*** |
| (-0.417, -0.255) | (-0.254, -0.155) | |
| CollGrad | -1.072*** | -0.644*** |
| (-1.153, -0.991) | (-0.692, -0.596) | |
| Age 24-39 | -0.329*** | -0.206*** |
| (-0.512, -0.146) | (-0.318, -0.094) | |
| Age 39-59 | -0.803*** | -0.499*** |
| (-0.982, -0.624) | (-0.608, -0.389) | |
| Age 59-79 | -1.657*** | -1.011*** |
| (-1.837, -1.476) | (-1.121, -0.902) | |
| Age 80+ | -3.014*** | -1.757*** |
| (-3.290, -2.737) | (-1.908, -1.606) | |
| Constant | 0.556*** | 0.346*** |
| (0.357, 0.754) | (0.225, 0.466) | |
| N | 84,408 | 84,408 |
| Log Likelihood | -47,460.650 | -47,468.910 |
| AIC | 94,947.310 | 94,963.820 |
| p < .05; p < .01; p < .001 | ||
#There is no difference between those two models. However, we can see if there is any marginal effect of the models on the results.
###Marginal effects
#Logit marginal effects
log.marg<-coef(fit.logit)*mean(dlogis(predict(fit.logit)), na.rm=T)
#for probit now
prob.marg<-coef(fit.probit)*mean(dnorm(predict(fit.probit)), na.rm=T)
effects<-data.frame(effect=c(log.marg, prob.marg),
term=rep(names(log.marg),2),
model=c(rep("logit", length(log.marg)),rep("probit", length(log.marg)))
)
effects%>%
ggplot(aes(x=term, y=effect))+geom_point(aes(color=model, group=model, shape=model),
position=position_jitterdodge(jitter.width = 1),
size=2)+
ylab("Marginal Effect")+
xlab("Model Term")+
geom_abline(intercept = 0, slope=0)+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
ggtitle(label = "Comparison of marginal effects in Logit and Probit models")
#They almost have similar effects. There is no significant marginal effects on the data.
#In the logit model I see that less than highschool education had non-significant relationship to smoke cigarette everyday or someday. But in odds ratio I notice that the relatioship is significant. therefore, I am interested to know their predicted probabilites in the fitted value to compare within race/ethnicity.
###Predicted probabilites for interesting cases
dat<-expand.grid(race_eth=levels(brfss_17$race_eth), educ=levels(brfss_17$educ), agec=levels(brfss_17$agec)[3])
#generate fitted values
fit<-predict(fit.logit, newdat=dat,type="response")
fitp<-predict(fit.probit, newdat=dat,type="response")
dat$fitted.prob.lrm<-round(fit, 3)
dat$fitted.prob.pro<-round(fitp, 3)
#add the values to the fake data
dat$fitted.prob.lrm<-round(fit, 3)
dat$fitted.prob.pro<-round(fitp, 3)
#Print the fitted probabilities for the first 20 cases
knitr::kable(head(dat, n=20))
| race_eth | educ | agec | fitted.prob.lrm | fitted.prob.pro |
|---|---|---|---|---|
| hispanic | 2hsgrad | (39,59] | 0.438 | 0.439 |
| nh black | 2hsgrad | (39,59] | 0.611 | 0.607 |
| nh multirace | 2hsgrad | (39,59] | 0.565 | 0.563 |
| nh other | 2hsgrad | (39,59] | 0.567 | 0.562 |
| nhwhite | 2hsgrad | (39,59] | 0.479 | 0.478 |
| hispanic | 0Prim | (39,59] | 0.484 | 0.483 |
| nh black | 0Prim | (39,59] | 0.654 | 0.649 |
| nh multirace | 0Prim | (39,59] | 0.610 | 0.606 |
| nh other | 0Prim | (39,59] | 0.611 | 0.605 |
| nhwhite | 0Prim | (39,59] | 0.525 | 0.522 |
| hispanic | 1somehs | (39,59] | 0.580 | 0.577 |
| nh black | 1somehs | (39,59] | 0.735 | 0.732 |
| nh multirace | 1somehs | (39,59] | 0.697 | 0.693 |
| nh other | 1somehs | (39,59] | 0.698 | 0.693 |
| nhwhite | 1somehs | (39,59] | 0.619 | 0.615 |
| hispanic | 3somecol | (39,59] | 0.358 | 0.360 |
| nh black | 3somecol | (39,59] | 0.529 | 0.527 |
| nh multirace | 3somecol | (39,59] | 0.482 | 0.481 |
| nh other | 3somecol | (39,59] | 0.483 | 0.481 |
| nhwhite | 3somecol | (39,59] | 0.396 | 0.397 |
dat[which(dat$race_eth=="nh black"&dat$agec=="(39,59]"&dat$educ=="0Prim"),]
## race_eth educ agec fitted.prob.lrm fitted.prob.pro
## 7 nh black 0Prim (39,59] 0.654 0.649
dat[which(dat$race_eth=="hispanic"&dat$agec=="(39,59]"&dat$educ=="0Prim"),]
## race_eth educ agec fitted.prob.lrm fitted.prob.pro
## 6 hispanic 0Prim (39,59] 0.484 0.483
##Interpretation: The probablity of smoking everyday for a NHBlack person who has less than primary education is 65%.
##This is where we find inference about the smoker and ask new questions for that particular group.
##According to my research question, there are influence of races and educational background on probability of smoking.