In the vast majority of situations in your work as demographers, your outcome will either be of a qualitative nature or non-normally distributed, especially if you work with individual level survey data.

When we speak of qualitative outcomes, we generally are concerned with the observation of:

Example

This example will cover the use of R functions for fitting binary logit and probit models to complex survey data.

For this example I am using 2014 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART county data. Link

# load brfss
library(car)
library(stargazer)
library(survey)
library(sjPlot)
library(ggplot2)
library(pander)
library(knitr)
load("brfss_14.Rdata")

# The names in the data are very ugly, so I make them less ugly
nams <- names(brfss14)
head(nams, n = 10)
 [1] "dispcode" "genhlth"  "physhlth" "menthlth" "poorhlth" "hlthpln1"
 [7] "persdoc2" "medcost"  "checkup1" "exerany2"
# we see some names are lower case, some are upper and some have a little _ in
# the first position. This is a nightmare.
newnames <- gsub(pattern = "x_", replacement = "", x = nams)
names(brfss14) <- tolower(newnames)

# Poor or fair self rated health brfss14$badhealth<-ifelse(brfss14$genhlth %in%
# c(4,5),1,0)
brfss14$badhealth <- recode(brfss14$genhlth, recodes = "4:5=1; 1:3=0; else=NA")
# race/ethnicity
brfss14$black <- recode(brfss14$racegr3, recodes = "2=1; 9=NA; else=0")
brfss14$white <- recode(brfss14$racegr3, recodes = "1=1; 9=NA; else=0")
brfss14$other <- recode(brfss14$racegr3, recodes = "3:4=1; 9=NA; else=0")
brfss14$hispanic <- recode(brfss14$racegr3, recodes = "5=1; 9=NA; else=0")
brfss14$race_eth <- recode(brfss14$racegr3, recodes = "1='nhwhite'; 2='nh black'; 3='nh other';
                         4='nh multirace'; 5='hispanic'; else=NA", 
    as.factor.result = T)
brfss14$race_eth <- relevel(brfss14$race_eth, ref = "nhwhite")
# insurance
brfss14$ins <- ifelse(brfss14$hlthpln1 == 1, 1, 0)

# income grouping
brfss14$inc <- ifelse(brfss14$incomg == 9, NA, brfss14$incomg)

# education level
brfss14$educ <- recode(brfss14$educa, recodes = "1:2='0Prim'; 3='1somehs'; 4='2hsgrad';
                     5='3somecol'; 6='4colgrad';9=NA", 
    as.factor.result = T)
# brfss14$educ<-relevel(brfss14$educ, ref='0Prim')

# employment
brfss14$employ <- recode(brfss14$employ, recodes = "1:2='Employed'; 2:6='nilf';
                       7='retired'; 8='unable'; else=NA", 
    as.factor.result = T)
brfss14$employ <- relevel(brfss14$employ, ref = "Employed")

# marital status
brfss14$marst <- recode(brfss14$marital, recodes = "1='married'; 2='divorced'; 3='widowed';
                      4='separated'; 5='nm';6='cohab'; else=NA", 
    as.factor.result = T)
brfss14$marst <- relevel(brfss14$marst, ref = "married")

# Age cut into intervals
brfss14$agec <- cut(brfss14$age80, breaks = c(0, 24, 39, 59, 79, 99), include.lowest = T)

Analysis

First, we will do some descriptive analysis, such as means and cross tabulations.

# First we tell R our survey design
options(survey.lonely.psu = "adjust")
des <- svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = brfss14[is.na(brfss14$mmsawt) == 
    F, ])

First, we examine the % of US adults with poor/fair health by education level, and do a survey-corrected chi-square test for independence.

cat <- svyby(formula = ~badhealth, by = ~educ, design = des, FUN = svymean, na.rm = T)
svychisq(~badhealth + educ, design = des)

    Pearson's X^2: Rao & Scott adjustment

data:  svychisq(~badhealth + educ, design = des)
F = 998.11, ndf = 3.533, ddf = 856410.000, p-value < 2.2e-16
qplot(x = cat$educ, y = cat$badhealth, data = cat, xlab = "Education", ylab = "%  Fair/Poor Health") + 
    geom_errorbar(aes(x = educ, ymin = badhealth - se, ymax = badhealth + se), width = 0.25) + 
    ggtitle(label = "% of US Adults with Fair/Poor Health by Education")

# calculate race*health cross tabulation, and plot it
dog <- svyby(formula = ~badhealth, by = ~race_eth, design = des, FUN = svymean, na.rm = T)
svychisq(~badhealth + race_eth, design = des)

    Pearson's X^2: Rao & Scott adjustment

data:  svychisq(~badhealth + race_eth, design = des)
F = 253.91, ndf = 3.6193e+00, ddf = 8.7735e+05, p-value < 2.2e-16
qplot(x = dog$race_eth, y = dog$badhealth, data = dog, xlab = "Race", ylab = "%  Fair/Poor Health") + 
    geom_errorbar(aes(x = race_eth, ymin = badhealth - se, ymax = badhealth + se), 
        width = 0.25) + ggtitle(label = "% of US Adults with Fair/Poor Health by Race/Ethnicity")

# calculate race*education*health cross tabulation, and plot it
catdog <- svyby(formula = ~badhealth, by = ~race_eth + educ, design = des, FUN = svymean, 
    na.rm = T)
catdog
                          race_eth     educ  badhealth          se
nhwhite.0Prim              nhwhite    0Prim 0.46659801 0.021420370
hispanic.0Prim            hispanic    0Prim 0.51825795 0.014960682
nh black.0Prim            nh black    0Prim 0.39207216 0.042532902
nh multirace.0Prim    nh multirace    0Prim 0.47778750 0.096429344
nh other.0Prim            nh other    0Prim 0.29971925 0.066407769
nhwhite.1somehs            nhwhite  1somehs 0.33804897 0.010565874
hispanic.1somehs          hispanic  1somehs 0.33235610 0.015569293
nh black.1somehs          nh black  1somehs 0.38585274 0.019642872
nh multirace.1somehs  nh multirace  1somehs 0.22024908 0.038654420
nh other.1somehs          nh other  1somehs 0.24960345 0.046307615
nhwhite.2hsgrad            nhwhite  2hsgrad 0.18855849 0.003318257
hispanic.2hsgrad          hispanic  2hsgrad 0.21764786 0.008749008
nh black.2hsgrad          nh black  2hsgrad 0.23964410 0.009053724
nh multirace.2hsgrad  nh multirace  2hsgrad 0.21587613 0.025725335
nh other.2hsgrad          nh other  2hsgrad 0.14316482 0.016727762
nhwhite.3somecol           nhwhite 3somecol 0.13485193 0.002706258
hispanic.3somecol         hispanic 3somecol 0.16437824 0.008594292
nh black.3somecol         nh black 3somecol 0.19150096 0.008342976
nh multirace.3somecol nh multirace 3somecol 0.19958089 0.021875791
nh other.3somecol         nh other 3somecol 0.13182967 0.013536038
nhwhite.4colgrad           nhwhite 4colgrad 0.06340540 0.001419459
hispanic.4colgrad         hispanic 4colgrad 0.10980069 0.006238654
nh black.4colgrad         nh black 4colgrad 0.09221084 0.005473368
nh multirace.4colgrad nh multirace 4colgrad 0.09238272 0.013391668
nh other.4colgrad         nh other 4colgrad 0.05723535 0.006176939
# this plot is a little more complicated
catdog$race_rec <- rep(c("NHWhite", "Hispanic", "NHBlack", "NH Multi", "NH Other"), 
    5)
catdog$educ_rec <- factor(c(rep("Primary Sch", 5), rep("LT HS", 5), rep("HS Grad", 
    5), rep("Some College", 5), rep("College Grad", 5)))

p <- ggplot(catdog, aes(educ_rec, badhealth, ), xlab = "Race", ylab = "% Bad Health")
p <- p + geom_point(aes(colour = race_rec))
p <- p + geom_line(aes(colour = race_rec, group = race_rec))
p <- p + geom_errorbar(aes(x = educ_rec, ymin = badhealth - se, ymax = badhealth + 
    se, colour = race_rec), width = 0.25)
p <- p + ylab("% Fair/Poor Health")
p <- p + xlab("Education Level")
p + ggtitle("% of US Adults in 2011 in Bad Health by Race and Education")