In the vast majority of situations in your work as demographers, your outcome will be either qualitative in nature or non-normally distributed, especially if you work with individual-level survey data.

When we speak of qualitative outcomes, we are generally concerned with the observation of a discrete status or event, coded 1 if it is observed and 0 otherwise.

Example

This example will cover the use of R functions for fitting binary logit and probit models to complex survey data.

For this example I am using 2014 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART county data.

# load packages and the BRFSS data
library(car)        # recode()
library(stargazer)
library(survey)     # svydesign(), svyglm() and friends
library(sjPlot)     # effect plots
library(ggplot2)
library(pander)     # model tables
library(knitr)      # kable()
load("brfss_14.Rdata")

# The names in the data are very ugly, so I make them less ugly
nams <- names(brfss14)
head(nams, n = 10)
 [1] "dispcode" "genhlth"  "physhlth" "menthlth" "poorhlth" "hlthpln1"
 [7] "persdoc2" "medcost"  "checkup1" "exerany2"
# we see some names are lower case, some are upper and some have a little _ in
# the first position. This is a nightmare.
newnames <- gsub(pattern = "x_", replacement = "", x = nams)
names(brfss14) <- tolower(newnames)

# Poor or fair self-rated health
# (equivalent: brfss14$badhealth <- ifelse(brfss14$genhlth %in% c(4,5), 1, 0))
brfss14$badhealth <- recode(brfss14$genhlth, recodes = "4:5=1; 1:3=0; else=NA")
# race/ethnicity
brfss14$black <- recode(brfss14$racegr3, recodes = "2=1; 9=NA; else=0")
brfss14$white <- recode(brfss14$racegr3, recodes = "1=1; 9=NA; else=0")
brfss14$other <- recode(brfss14$racegr3, recodes = "3:4=1; 9=NA; else=0")
brfss14$hispanic <- recode(brfss14$racegr3, recodes = "5=1; 9=NA; else=0")
brfss14$race_eth <- recode(brfss14$racegr3, recodes = "1='nhwhite'; 2='nh black'; 3='nh other';
                         4='nh multirace'; 5='hispanic'; else=NA", 
    as.factor.result = T)
brfss14$race_eth <- relevel(brfss14$race_eth, ref = "nhwhite")
# insurance
brfss14$ins <- recode(brfss14$hlthpln1, recodes = "1=1; 2=0; else=NA")

# income grouping
brfss14$inc <- ifelse(brfss14$incomg == 9, NA, brfss14$incomg)

# education level
brfss14$educ <- recode(brfss14$educa, recodes = "1:2='0Prim'; 3='1somehs'; 4='2hsgrad';
                     5='3somecol'; 6='4colgrad';9=NA", 
    as.factor.result = T)
# brfss14$educ<-relevel(brfss14$educ, ref='0Prim')

# employment
brfss14$employ <- recode(brfss14$employ, recodes = "1:2='Employed'; 3:6='nilf';
                       7='retired'; 8='unable'; else=NA", 
    as.factor.result = T)
brfss14$employ <- relevel(brfss14$employ, ref = "Employed")

# marital status
brfss14$marst <- recode(brfss14$marital, recodes = "1='married'; 2='divorced'; 3='widowed';
                      4='separated'; 5='nm';6='cohab'; else=NA", 
    as.factor.result = T)
brfss14$marst <- relevel(brfss14$marst, ref = "married")

# Age cut into intervals
brfss14$agec <- cut(brfss14$age80, breaks = c(0, 24, 39, 59, 79, 99), include.lowest = T)
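
Recodes like these are easy to get wrong, so it is worth cross-tabulating each new variable against its source before moving on. A minimal check of the badhealth recode, for example (my addition, not part of the original workflow):

# genhlth codes 4-5 should map to 1, codes 1-3 to 0, and everything else to NA
table(brfss14$genhlth, brfss14$badhealth, useNA = "ifany")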

Analysis

First, we will do some descriptive analysis, such as means and cross tabulations.

# First we tell R our survey design
options(survey.lonely.psu = "adjust")
des <- svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, 
    data = brfss14[!is.na(brfss14$mmsawt), ])
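
As a quick sanity check on the design (this comparison is my addition), the design-based mean of the outcome can be compared to the naive unweighted mean:

# design-based vs. unweighted estimate of % fair/poor health
svymean(~badhealth, design = des, na.rm = T)
mean(brfss14$badhealth, na.rm = T)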

First, we examine the % of US adults with poor/fair health by education level, and do a survey-corrected chi-square test for independence.

cat <- svyby(formula = ~badhealth, by = ~educ, design = des, FUN = svymean, na.rm = T)
svychisq(~badhealth + educ, design = des)

    Pearson's X^2: Rao & Scott adjustment

data:  svychisq(~badhealth + educ, design = des)
F = 998.11, ndf = 3.533, ddf = 856410.000, p-value < 2.2e-16
qplot(x = cat$educ, y = cat$badhealth, data = cat, xlab = "Education", ylab = "%  Fair/Poor Health") + 
    geom_errorbar(aes(x = educ, ymin = badhealth - se, ymax = badhealth + se), width = 0.25) + 
    ggtitle(label = "% of US Adults with Fair/Poor Health by Education")
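
The error bars above span plus or minus one standard error; if you prefer design-based 95% confidence intervals, survey provides a confint() method for svyby objects. A quick sketch, using the cat object from above:

# 95% Wald confidence intervals for the education-specific proportions
round(confint(cat), 3)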

# calculate race*health cross tabulation, and plot it
dog <- svyby(formula = ~badhealth, by = ~race_eth, design = des, FUN = svymean, na.rm = T)
svychisq(~badhealth + race_eth, design = des)

    Pearson's X^2: Rao & Scott adjustment

data:  svychisq(~badhealth + race_eth, design = des)
F = 253.91, ndf = 3.6193e+00, ddf = 8.7735e+05, p-value < 2.2e-16
qplot(x = dog$race_eth, y = dog$badhealth, data = dog, xlab = "Race", ylab = "%  Fair/Poor Health") + 
    geom_errorbar(aes(x = race_eth, ymin = badhealth - se, ymax = badhealth + se), 
        width = 0.25) + ggtitle(label = "% of US Adults with Fair/Poor Health by Race/Ethnicity")

# calculate race*education*health cross tabulation, and plot it
catdog <- svyby(formula = ~badhealth, by = ~race_eth + educ, design = des, FUN = svymean, 
    na.rm = T)
catdog
                          race_eth     educ  badhealth          se
nhwhite.0Prim              nhwhite    0Prim 0.46659801 0.021420370
hispanic.0Prim            hispanic    0Prim 0.51825795 0.014960682
nh black.0Prim            nh black    0Prim 0.39207216 0.042532902
nh multirace.0Prim    nh multirace    0Prim 0.47778750 0.096429344
nh other.0Prim            nh other    0Prim 0.29971925 0.066407769
nhwhite.1somehs            nhwhite  1somehs 0.33804897 0.010565874
hispanic.1somehs          hispanic  1somehs 0.33235610 0.015569293
nh black.1somehs          nh black  1somehs 0.38585274 0.019642872
nh multirace.1somehs  nh multirace  1somehs 0.22024908 0.038654420
nh other.1somehs          nh other  1somehs 0.24960345 0.046307615
nhwhite.2hsgrad            nhwhite  2hsgrad 0.18855849 0.003318257
hispanic.2hsgrad          hispanic  2hsgrad 0.21764786 0.008749008
nh black.2hsgrad          nh black  2hsgrad 0.23964410 0.009053724
nh multirace.2hsgrad  nh multirace  2hsgrad 0.21587613 0.025725335
nh other.2hsgrad          nh other  2hsgrad 0.14316482 0.016727762
nhwhite.3somecol           nhwhite 3somecol 0.13485193 0.002706258
hispanic.3somecol         hispanic 3somecol 0.16437824 0.008594292
nh black.3somecol         nh black 3somecol 0.19150096 0.008342976
nh multirace.3somecol nh multirace 3somecol 0.19958089 0.021875791
nh other.3somecol         nh other 3somecol 0.13182967 0.013536038
nhwhite.4colgrad           nhwhite 4colgrad 0.06340540 0.001419459
hispanic.4colgrad         hispanic 4colgrad 0.10980069 0.006238654
nh black.4colgrad         nh black 4colgrad 0.09221084 0.005473368
nh multirace.4colgrad nh multirace 4colgrad 0.09238272 0.013391668
nh other.4colgrad         nh other 4colgrad 0.05723535 0.006176939
# this plot is a little more complicated
catdog$race_rec <- rep(c("NHWhite", "Hispanic", "NHBlack", "NH Multi", "NH Other"), 
    5)
catdog$educ_rec <- factor(c(rep("Primary Sch", 5), rep("LT HS", 5), rep("HS Grad", 
    5), rep("Some College", 5), rep("College Grad", 5)))

p <- ggplot(catdog, aes(x = educ_rec, y = badhealth))
p <- p + geom_point(aes(colour = race_rec))
p <- p + geom_line(aes(colour = race_rec, group = race_rec))
p <- p + geom_errorbar(aes(x = educ_rec, ymin = badhealth - se, ymax = badhealth + 
    se, colour = race_rec), width = 0.25)
p <- p + ylab("% Fair/Poor Health")
p <- p + xlab("Education Level")
p + ggtitle("% of US Adults in 2014 in Bad Health by Race and Education")
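
One caveat: the rep() calls above assume a particular row ordering in the svyby output. A sketch of a less fragile alternative, which derives the display labels directly from catdog's own educ column rather than from row position:

# relabel from the educ column itself instead of relying on row order
catdog$educ_rec <- factor(catdog$educ, levels = c("0Prim", "1somehs", "2hsgrad", 
    "3somecol", "4colgrad"), labels = c("Primary Sch", "LT HS", "HS Grad", 
    "Some College", "College Grad"))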

This shows significant variation in health status by both education level and race/ethnicity.

Logit and Probit models

If our outcome is dichotomous (1/0), the natural distribution to consider for a GLM is the binomial

\[y \sim \text{Binomial}(n, p)\]

with \(p\) being the mean of the binomial and \(n\) the number of trials. With individual-level data, \(n\) is generally 1, and \(p\) is the probability of observing a 1, conditional on the observed predictors. There are two common techniques for estimating this mean: logistic and probit regression. In the logistic model, the link function is the logit, or log-odds, transformation:

\(\text{logit}(p) = \log \frac{p}{1-p}\)

Inverting this link gives the following conditional mean model:

\[E(y|x) = \frac{1}{1+ \exp\left(-\sum_k \beta_k x_{ik}\right)}\]

which situates the model within the logistic distribution function. Expressing \(p\) as a linear model is done via the log-odds transformation of the probability:

\[\log \frac{p}{1-p} = \sum_k \beta_k x_{ik}\]

For the probit model, the link function is the inverse of the cumulative Normal distribution (the Normal quantile function), so the probability is modeled as:

\[\Phi^{-1}(p) = \sum_k \beta_k x_{ik}, \qquad E(y|x) = \Phi\left(\sum_k \beta_k x_{ik}\right)\]
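
A quick numerical illustration of the two link functions, using base R's distribution functions (this check is my addition, not part of the original example): qlogis()/plogis() are the logit and its inverse, and qnorm()/pnorm() are the probit and its inverse.

p <- 0.25
qlogis(p)          # logit: log(0.25/0.75) = -1.099
plogis(qlogis(p))  # inverse logit recovers p = 0.25
qnorm(p)           # probit: -0.674
pnorm(qnorm(p))    # inverse probit recovers p = 0.25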

In practice, these two models give very similar estimates and will have very similar interpretations, although the logistic model has the more convenient odds-ratio interpretation of its \(\beta\)'s, while the probit model's coefficients are often transformed into marginal effects, which is more of a challenge, and software generally doesn't give you these by default.

Logit/Probit Regression example

There is no trick to fitting logistic regression models using survey data: just use the svyglm() function with the appropriate distribution specified via family = binomial for the logistic model and family = binomial(link = "probit") for the probit model. You don't have to specify the link function if you're fitting the logistic model, as it is the default.

# Logit model
fit.logit <- svyglm(badhealth ~ race_eth + educ + agec, design = des, family = binomial)
# summary(fit.logit)
panderOptions("digits", 2)
pander(fit.logit, covariate.labels = c("Hispanic", "Black", "MultRace", "Other", 
    "SomeHS", "HSGrad", "SomeColl", "CollGrad", "Age 24-39", "Age 39-59", 
    "Age 59-79", "Age 80+"))
Fitting generalized (binomial/logit) linear model: badhealth ~ race_eth + educ + agec
  Estimate Std. Error t value Pr(>|t|)
Hispanic 0.49 0.035 14 9.4e-44
Black 0.44 0.035 13 1.3e-36
MultRace 0.47 0.086 5.5 4e-08
Other 0.025 0.073 0.35 0.73
SomeHS -0.38 0.065 -5.8 6e-09
HSGrad -1.1 0.057 -19 1.2e-76
SomeColl -1.4 0.058 -24 1.3e-125
CollGrad -2.3 0.059 -38 2.4e-322
Age 24-39 0.61 0.062 9.8 1.2e-22
Age 39-59 1.2 0.057 21 6.6e-99
Age 59-79 1.6 0.057 27 4e-162
Age 80+ 1.7 0.066 25 2e-142
(Intercept) -1.6 0.08 -20 1.1e-84
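
Since the logit coefficients are on the log-odds scale, exponentiating them gives odds ratios. A minimal sketch (my addition), using the Wald-based confint() method that survey provides for svyglm objects:

# odds ratios and 95% CIs for the logit model
round(exp(cbind(OR = coef(fit.logit), confint(fit.logit))), 2)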
# probit model
fit.probit <- svyglm(badhealth ~ race_eth + educ + agec, design = des, family = binomial(link = "probit"))
# summary(fit.probit)
pander(fit.probit, covariate.labels = c("Hispanic", "Black", "MultRace", "Other", 
    "SomeHS", "HSGrad", "SomeColl", "CollGrad", "Age 24-39", "Age 39-59", 
    "Age 59-79", "Age 80+"))
Fitting generalized (binomial/probit) linear model: badhealth ~ race_eth + educ + agec
  Estimate Std. Error t value Pr(>|t|)
Hispanic 0.28 0.02 14 2e-43
Black 0.25 0.02 13 7e-37
MultRace 0.26 0.048 5.4 5.2e-08
Other 0.027 0.038 0.72 0.47
SomeHS -0.25 0.04 -6.2 4.4e-10
HSGrad -0.65 0.035 -19 3.7e-78
SomeColl -0.83 0.035 -24 5.6e-124
CollGrad -1.3 0.035 -37 3.4e-297
Age 24-39 0.32 0.031 10 1.8e-24
Age 39-59 0.64 0.029 22 4.9e-111
Age 59-79 0.84 0.029 29 9.3e-188
Age 80+ 0.93 0.035 27 3.9e-159
(Intercept) -0.85 0.044 -19 1.8e-81

Both of these models show the same pattern of effects: Hispanics, blacks, and multi-race individuals have higher chances of reporting poor/fair health than whites (the reference group). Similarly, the education variable shows a negative gradient, with more highly educated respondents having lower chances of reporting poor/fair health than those with a primary school education (the reference group); likewise, as people get older, they are more likely to report poor/fair health, compared to those under age 24 (the reference group).

Next, we present both models' coefficients next to one another.

paramnames <- c("intercept", "Hispanic", "Black", "MultRace", "Other", "SomeHS", 
    "HSGrad", "SomeColl", "CollGrad", "Age 24-39", "Age 39-59", "Age 59-79", "Age 80+")
suml <- summary(fit.logit)$coef
row.names(suml) <- paramnames
sump <- summary(fit.probit)$coef
row.names(sump) <- paramnames

kable(data.frame(suml, sump), digits = 2, col.names = c("logit.est", "logit.se", 
    "logit.t", "logit.p", "probit.est", "probit.se", "probit.t", "probit.p"), caption = "Model Coefficients in Logit and Probit Models")
Model Coefficients in Logit and Probit Models
logit.est logit.se logit.t logit.p probit.est probit.se probit.t probit.p
intercept -1.56 0.08 -19.51 0.00 -0.85 0.04 -19.13 0.00
Hispanic 0.49 0.04 13.87 0.00 0.28 0.02 13.82 0.00
Black 0.44 0.03 12.64 0.00 0.25 0.02 12.69 0.00
MultRace 0.47 0.09 5.49 0.00 0.26 0.05 5.44 0.00
Other 0.03 0.07 0.35 0.73 0.03 0.04 0.72 0.47
SomeHS -0.38 0.06 -5.82 0.00 -0.25 0.04 -6.24 0.00
HSGrad -1.06 0.06 -18.54 0.00 -0.65 0.03 -18.72 0.00
SomeColl -1.38 0.06 -23.85 0.00 -0.83 0.04 -23.70 0.00
CollGrad -2.25 0.06 -38.44 0.00 -1.29 0.03 -36.90 0.00
Age 24-39 0.61 0.06 9.80 0.00 0.32 0.03 10.21 0.00
Age 39-59 1.21 0.06 21.12 0.00 0.64 0.03 22.40 0.00
Age 59-79 1.56 0.06 27.15 0.00 0.84 0.03 29.25 0.00
Age 80+ 1.68 0.07 25.43 0.00 0.93 0.03 26.90 0.00
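
A well-known rule of thumb is that logit coefficients run roughly 1.6 to 1.8 times their probit counterparts, since the standard logistic distribution has standard deviation \(\pi/\sqrt{3} \approx 1.8\). We can check that ratio here (my addition), using the suml and sump objects from above:

# ratio of logit to probit point estimates; most should fall near 1.6-1.8
round(suml[, 1]/sump[, 1], 2)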

Effect Plots

These are nice plots of the model effects, made using the sjPlot library:

sjp.glm(fit.logit, title = "Odds Ratios for Fair/Poor vs Good Health- Logit Model", 
    sortOdds = F, showModelSummary = T)

sjp.glm(fit.probit, title = "Risk ratios for Fair/Poor vs Good Health - Probit Model", 
    sortOdds = F, showModelSummary = T)
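
Note that sjp.glm() comes from older releases of sjPlot; in current versions it has been replaced by plot_model(). A roughly equivalent call under the newer API (argument names here reflect recent sjPlot and are my translation, not the original code):

# newer sjPlot API; logistic models are plotted as odds ratios by default
plot_model(fit.logit, sort.est = FALSE, 
    title = "Odds Ratios for Fair/Poor vs Good Health - Logit Model")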

Marginal effects

In a regression model in general, the \(\beta\)'s are the solution to the differential equation:
\[\frac{\partial y}{\partial x} = \beta\]

which is just the rate of change in \(y\) given \(x\), known as the marginal effect. This is the case for the strictly linear model.

In the logit and probit models, which are nonlinear owing to their assumed model structure, the marginal effect must also account for the derivative of the assumed CDF with respect to the linear predictor, which is the corresponding p.d.f.:

\[\frac{\partial y}{\partial x} = \beta \cdot \frac{\partial \Phi(x'\beta)}{\partial x'\beta} = \beta \cdot \phi(x'\beta)\]

So we have to multiply the estimated \(\beta\) by the p.d.f. of the assumed distribution, evaluated at the linear predictor; averaging this quantity over the sample gives the average marginal effect. In R that's not a big problem:

# Logit marginal effects
log.marg <- coef(fit.logit) * mean(dlogis(predict(fit.logit)), na.rm = T)

# for probit now
prob.marg <- coef(fit.probit) * mean(dnorm(predict(fit.probit)), na.rm = T)

plot(log.marg[-1], ylab = "Marginal Effects", axes = T, xaxt = "n", main = "Marginal Effects from Logit and Probit models", 
    ylim = c(-0.25, 0.2))
axis(side = 1, at = 1:12, labels = F)
text(x = 1:12, y = -0.3, srt = 45, pos = 1, xpd = TRUE, labels = c("Hispanic", "Black", 
    "MultRace", "Other", "SomeHS", "HSGrad", "SomeColl", "CollGrad", "Age 24-39", 
    "Age 39-59", "Age 59-79", "Age 80+"))
points(prob.marg[-1], col = 2)
abline(h = 0, col = 3)
legend("bottomright", legend = c("Logit Model", "Probit Model"), col = c("black", 
    "red"), pch = 1)

This shows that the marginal effects are very similar between the two models. We can coax these into a table like so:

data.frame(m.logit = log.marg, m.probit = prob.marg)
                          m.logit     m.probit
(Intercept)          -0.195678907 -0.191571535
race_ethhispanic      0.061422572  0.062097744
race_ethnh black      0.054739084  0.056013239
race_ethnh multirace  0.059087347  0.059228034
race_ethnh other      0.003182204  0.006176122
educ1somehs          -0.047383692 -0.055747648
educ2hsgrad          -0.132582418 -0.146302106
educ3somecol         -0.173430407 -0.187073202
educ4colgrad         -0.282914325 -0.290695812
agec(24,39]           0.076160569  0.071623981
agec(39,59]           0.151524434  0.145163085
agec(59,79]           0.195263218  0.190513564
agec(79,99]           0.210924906  0.209759452
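
For a report you would probably want this formatted more cleanly; a small sketch using kable(), which is already loaded above:

kable(data.frame(m.logit = log.marg, m.probit = prob.marg), digits = 3, 
    caption = "Average Marginal Effects from Logit and Probit Models")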

Fitted Values

As I often say, I like to talk about "interesting cases". In order to do this, you need the fitted mean for a particular case, which you get from the model's fitted values. To do this, I generate a bunch of "fake people" who vary across the model covariates and get the fitted value for each type of person. This is perhaps overkill in this example, because I fit every possible type of person; ideally you would pick a few interesting cases to discuss.

In order to derive these, we effectively "solve the equation" for the model; put another way, we estimate the conditional mean of \(y\) by specifying the \(x\) values that are meaningful for a particular comparison. For example, the probability of a white, young, college-educated person reporting poor/fair health is just the model estimate evaluated at those particular characteristics:

\[\text{Pr(poor/fair health)} = \frac{1}{1+\exp\left(-\left[\beta_0 + \beta_1 \cdot \text{white} + \beta_2 \cdot \text{young} + \beta_3 \cdot \text{college}\right]\right)}\]

# get a series of predicted probabilites for different 'types' of people for each
# model expand.grid will generate all possible combinations of values you specify
dat <- expand.grid(race_eth = levels(brfss14$race_eth), educ = levels(brfss14$educ), 
    agec = levels(brfss14$agec))

# You MAY need to get rid of impossible cases here

# generate the fitted values
fit <- predict(fit.logit, newdata = dat, type = "response")
fitp <- predict(fit.probit, newdata = dat, type = "response")
# add the values to the fake data
dat$fitted.prob.lrm <- round(fit, 3)
dat$fitted.prob.pro <- round(fitp, 3)

# Print the fitted probabilities for the first 20 cases
head(dat, n = 20)
       race_eth     educ   agec fitted.prob.lrm fitted.prob.pro
1       nhwhite    0Prim [0,24]           0.174           0.198
2      hispanic    0Prim [0,24]           0.255           0.283
3      nh black    0Prim [0,24]           0.245           0.274
4  nh multirace    0Prim [0,24]           0.252           0.279
5      nh other    0Prim [0,24]           0.177           0.205
6       nhwhite  1somehs [0,24]           0.126           0.136
7      hispanic  1somehs [0,24]           0.190           0.206
8      nh black  1somehs [0,24]           0.182           0.198
9  nh multirace  1somehs [0,24]           0.188           0.202
10     nh other  1somehs [0,24]           0.129           0.142
11      nhwhite  2hsgrad [0,24]           0.068           0.067
12     hispanic  2hsgrad [0,24]           0.107           0.111
13     nh black  2hsgrad [0,24]           0.102           0.106
14 nh multirace  2hsgrad [0,24]           0.105           0.108
15     nh other  2hsgrad [0,24]           0.070           0.071
16      nhwhite 3somecol [0,24]           0.050           0.047
17     hispanic 3somecol [0,24]           0.079           0.080
18     nh black 3somecol [0,24]           0.075           0.076
19 nh multirace 3somecol [0,24]           0.078           0.078
20     nh other 3somecol [0,24]           0.051           0.049
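
predict() on a svyglm object also carries design-based standard errors for these fitted values; a minimal sketch of pulling them out with survey's SE() extractor (this step is my addition):

# design-based standard errors for the logit fitted probabilities
pred <- predict(fit.logit, newdata = dat, type = "response")
dat$se.lrm <- SE(pred)
head(dat)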

These show us the estimated probability of reporting poor/fair health for each specified type of "fake person" we generated. For example, let's compare the probability for a non-Hispanic white person, age 39-59 with a college education, to that of a Hispanic person, age 39-59 with a primary school education:

dat[which(dat$race_eth == "nhwhite" & dat$agec == "(39,59]" & dat$educ == "4colgrad"), 
    ]
   race_eth     educ    agec fitted.prob.lrm fitted.prob.pro
71  nhwhite 4colgrad (39,59]           0.069           0.067
dat[which(dat$race_eth == "hispanic" & dat$agec == "(39,59]" & dat$educ == "0Prim"), 
    ]
   race_eth  educ    agec fitted.prob.lrm fitted.prob.pro
52 hispanic 0Prim (39,59]           0.534           0.528

The first case has an estimated probability of reporting poor/fair health of about 7%, while the second case has over a 50% chance. Discussing cases like these is often a more effective way to convey a model's results than walking through all of the regression coefficients.
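
If you want a single number for this contrast, the ratio of the two fitted probabilities is a natural summary. A minimal sketch using the values above (my addition):

# risk ratio: Hispanic/primary school case vs. NH white/college grad case
p.low <- dat$fitted.prob.lrm[dat$race_eth == "nhwhite" & dat$agec == "(39,59]" & 
    dat$educ == "4colgrad"]
p.high <- dat$fitted.prob.lrm[dat$race_eth == "hispanic" & dat$agec == "(39,59]" & 
    dat$educ == "0Prim"]
p.high/p.low  # roughly 7.7; the second case is almost 8 times as likely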