This example will cover the use of R functions for fitting binary logit and probit models to complex survey data.
For this example I am using 2011 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART county data. Link
#load brfss
library(car)
library(stargazer)
##
## Please cite as:
##
## Hlavac, Marek (2014). stargazer: LaTeX code and ASCII text for well-formatted regression and summary statistics tables.
## R package version 5.1. http://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
##
## Attaching package: 'survey'
##
## The following object is masked from 'package:graphics':
##
## dotchart
library(sjPlot)
load("~/Google Drive/dem7283/data/brfss_11.Rdata")
#The names in the data are very ugly, so I make them less ugly
nams<-names(brfss_11)
head(nams, n=10)
## [1] "_STATE" "precall" "fmonth" "intvid" "seqno" "_PSU"
## [7] "ctelenum" "cellfon" "pvtresid" "genhlth"
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
newnames<-gsub(pattern = "_",replacement = "",x = nams)
names(brfss_11)<-tolower(newnames)
#Poor or fair self rated health
#brfss_11$badhealth<-ifelse(brfss_11$genhlth %in% c(4,5),1,0)
brfss_11$badhealth<-recode(brfss_11$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss_11$black<-recode(brfss_11$racegr2, recodes="2=1; 9=NA; else=0")
brfss_11$white<-recode(brfss_11$racegr2, recodes="1=1; 9=NA; else=0")
brfss_11$other<-recode(brfss_11$racegr2, recodes="3:4=1; 9=NA; else=0")
brfss_11$hispanic<-recode(brfss_11$racegr2, recodes="5=1; 9=NA; else=0")
brfss_11$race_group<-recode(brfss_11$racegr2, recodes="1='NH white'; 2='NH black'; 3:4='NH other';5='hispanic'; else=NA", as.factor.result = T)
brfss_11$race_group<-relevel(brfss_11$race_group, ref = 'NH white')
#insurance
brfss_11$ins<-ifelse(brfss_11$hlthpln1==1,1,0)
#income grouping
brfss_11$inc<-ifelse(brfss_11$incomg==9, NA, brfss_11$incomg)
#education level
brfss_11$educ<-recode(brfss_11$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor.result=T)
brfss_11$educ<-relevel(brfss_11$educ, ref='2hsgrad')
#employment
brfss_11$employ<-recode(brfss_11$employ, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor.result=T)
brfss_11$employ<-relevel(brfss_11$employ, ref='Employed')
#marital status
brfss_11$marst<-recode(brfss_11$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor.result=T)
brfss_11$marst<-relevel(brfss_11$marst, ref='married')
#Age cut into intervals
brfss_11$agec<-cut(brfss_11$age, breaks=c(0,24,39,59,79,99))
First, we will do some descriptive analysis, such as means and cross tabulations.
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~cntywt, data = brfss_11[is.na(brfss_11$cntywt)==F,] )
#First, we examine the % of US adults with poor/fair health by education level, and do a survey-corrected chi-square test for independence.
cat<-svyby(formula = ~badhealth, by = ~educ, design = des, FUN = svymean, na.rm=T)
svychisq(~badhealth+educ, design = des)
##
## Pearson's X^2: Rao & Scott adjustment
##
## data: svychisq(~badhealth + educ, design = des)
## F = 700.4969, ndf = 3.625, ddf = 879846.356, p-value < 2.2e-16
qplot(x=cat$educ,y=cat$badhealth, data=cat ,xlab="Education", ylab="% Fair/Poor Health" )+
geom_errorbar(aes(x=educ, ymin=badhealth-se,ymax= badhealth+se), width=.25)+
ggtitle(label = "% of US Adults with Fair/Poor Health by Education")
#calculate race*health cross tabulation, and plot it
dog<-svyby(formula = ~badhealth, by = ~race_group, design = des, FUN = svymean, na.rm=T)
svychisq(~badhealth+race_group, design = des)
##
## Pearson's X^2: Rao & Scott adjustment
##
## data: svychisq(~badhealth + race_group, design = des)
## F = 268.1499, ndf = 2.849, ddf = 691423.424, p-value < 2.2e-16
qplot(x=dog$race_group,y=dog$badhealth, data=dog ,xlab="Race", ylab="% Fair/Poor Health" )+
geom_errorbar(aes(x=race_group, ymin=badhealth-se,ymax= badhealth+se), width=.25)+
ggtitle(label = "% of US Adults with Fair/Poor Health by Race/Ethnicity")
#calculate race*education*health cross tabulation, and plot it
catdog<-svyby(formula = ~badhealth, by = ~race_group+educ, design = des, FUN = svymean, na.rm=T)
#this plot is a little more complicated
catdog$race_rec<-rep(c("White","Hispanic", "Black", "Other" ),5)
p<-ggplot(catdog, aes(educ,badhealth),xlab="Race", ylab="% Bad Health")
p<-p+geom_point(aes(colour=race_rec))
p<-p+geom_line(aes(colour=race_rec,group=race_rec))
p<-p+geom_errorbar(aes(x=educ, ymin=badhealth-se,ymax= badhealth+se,colour=race_rec), width=.25)
p<-p+ylab("% Fair/Poor Health")
p<-p+xlab("Education Level")
p+ggtitle("% of US Adults in 2011 in Bad Health by Race and Education")
Which shows a significant variation in health status by education level and race/ethnicty
There is no trick to fitting logistic regression models usign survey data, just use the svyglm()
function with the apppriate distribution specified via family=binomial
for logistic and family=binomial(link="probit")
for the probit model. You don’t have to specify the link function if you’re just doing the logistic model, as it is the default.
fit.logit<-svyglm(badhealth~race_group+educ+agec,design= des, family=binomial)
## Warning: non-integer #successes in a binomial glm!
fit.probit<-svyglm(badhealth~race_group+educ+agec, design=des, family=binomial(link= "probit"))
## Warning: non-integer #successes in a binomial glm!
stargazer(fit.logit, fit.probit, type="html", style="demography",title="Logit and Probit Fits", column.labels = c("Logit Model", "Probit Model"), covariate.labels=c("Hispanic", "Black", "Other","PrimarySchool", "SomeHS", "SomeColl", "CollGrad", "Age 24-39","Age 39-59" ,"Age 59-79", "Age 80+"), keep.stat="n", model.names=F, align=T)
badhealth | ||
Logit Model | Probit Model | |
Model 1 | Model 2 | |
Hispanic | 0.571*** | 0.320*** |
(0.040) | (0.023) | |
Black | 0.553*** | 0.312*** |
(0.037) | (0.021) | |
Other | 0.355*** | 0.200*** |
(0.055) | (0.030) | |
PrimarySchool | 0.928*** | 0.566*** |
(0.063) | (0.039) | |
SomeHS | 0.580*** | 0.337*** |
(0.047) | (0.028) | |
SomeColl | -0.354*** | -0.198*** |
(0.031) | (0.018) | |
CollGrad | -1.126*** | -0.602*** |
(0.035) | (0.019) | |
Age 24-39 | 0.468*** | 0.242*** |
(0.068) | (0.035) | |
Age 39-59 | 1.077*** | 0.572*** |
(0.062) | (0.032) | |
Age 59-79 | 1.462*** | 0.793*** |
(0.062) | (0.032) | |
Age 80+ | 1.685*** | 0.933*** |
(0.070) | (0.037) | |
Constant | -2.583*** | -1.479*** |
(0.063) | (0.032) | |
N | 238,438 | 238,438 |
p < .05; p < .01; p < .001 |
These are nice plots of the model effects
sjp.glm(fit.logit,title="Odds Ratios for Fair/Poor vs Good Health" , sortOdds=F, showModelSummary=T)
## Warning in logLik.svyglm(fit): svyglm not fitted by maximum likelihood.
## Warning in logLik.svyglm(fit): svyglm not fitted by maximum likelihood.
## Intercept = 0.08
## R2[cs] = 0.089
## R2[n] = 0.151
## Lambda = -382630.28
## Chi2 = 0.00
## AIC = 175473.40
sjp.glm(fit.probit,title="Odds Ratios for Fair/Poor vs Good Health", sortOdds=F, showModelSummary=T)
## Warning in logLik.svyglm(fit): svyglm not fitted by maximum likelihood.
## Warning in logLik.svyglm(fit): svyglm not fitted by maximum likelihood.
## Intercept = 0.23
## R2[cs] = 0.089
## R2[n] = 0.151
## Lambda = -382636.44
## Chi2 = 0.00
## AIC = 175486.44
As I often say, I like to talk about “interesting cases”. In order to do this, you need the fitted mean for a particular case. This is done by getting the fitted values for that case from the model. To do this, I generate a bunch of “fake people” that have variability in the model covariates, and fit the model for each type of person. This is perhaps overkill in this example because I fit every type of person, ideally you would want a few interesting cases to discuss:
#get a series of predicted probabilites for different "types" of people for each model
dat<-expand.grid(race_group=levels(brfss_11$race_group), educ=levels(brfss_11$educ), agec=levels(brfss_11$agec))
#You MAY need to get rid of impossible cases here
#generate the fitted values
fit<-predict(fit.logit, newdat=dat,type="response")
fitp<-predict(fit.probit, newdat=dat,type="response")
#add the values to the fake data
dat$fitted.prob.lrm<-round(fit, 3)
dat$fitted.prob.pro<-round(fitp, 3)
#Print the fitted probabilities for the first 20 cases
head(dat, n=20)
## race_group educ agec fitted.prob.lrm fitted.prob.pro
## 1 NH white 2hsgrad (0,24] 0.070 0.070
## 2 hispanic 2hsgrad (0,24] 0.118 0.123
## 3 NH black 2hsgrad (0,24] 0.116 0.122
## 4 NH other 2hsgrad (0,24] 0.097 0.100
## 5 NH white 0Prim (0,24] 0.160 0.181
## 6 hispanic 0Prim (0,24] 0.253 0.277
## 7 NH black 0Prim (0,24] 0.249 0.274
## 8 NH other 0Prim (0,24] 0.214 0.238
## 9 NH white 1somehs (0,24] 0.119 0.127
## 10 hispanic 1somehs (0,24] 0.193 0.205
## 11 NH black 1somehs (0,24] 0.190 0.203
## 12 NH other 1somehs (0,24] 0.161 0.173
## 13 NH white 3somecol (0,24] 0.050 0.047
## 14 hispanic 3somecol (0,24] 0.086 0.087
## 15 NH black 3somecol (0,24] 0.084 0.086
## 16 NH other 3somecol (0,24] 0.070 0.070
## 17 NH white 4colgrad (0,24] 0.024 0.019
## 18 hispanic 4colgrad (0,24] 0.042 0.039
## 19 NH black 4colgrad (0,24] 0.041 0.038
## 20 NH other 4colgrad (0,24] 0.034 0.030