# install packages if needed
# install.packages("tidyverse")
# install.packages("tidycensus")
# install.packages("DT")
# install.packages("survey")
# install.packages("epiDisplay")

# load in packages
library(tidyverse)
library(tidycensus) # the package to access US Census data in R

# other packages
library(DT)
library(survey)
# library(epiDisplay)

Overview

  1. Using the tidycensus package to extract 2017-2021 ACS (American Community Survey) estimates of demographics in the US.
  2. Using the survey package to obtain weighted, nationwide estimates.

More resources

  1. Survey Data Analysis with R, https://stats.oarc.ucla.edu/r/seminars/survey-data-analysis-with-r/
  2. Kalton G, Flores-Cervantes I. Weighting methods. Journal of official statistics. 2003 Jun 1;19(2):81, https://www.scb.se/contentassets/ca21efb41fee47d293bbee5bf7be7fb3/weighting-methods.pdf

1. Getting started with tidycensus

To use the tidycensus package, you will need a census API key. You can get an API key at https://api.census.gov/data/key_signup.html.

After you’ve signed up for an API key, be sure to activate the key from the email you receive from the Census Bureau so it works correctly. Declaring install = TRUE when calling census_api_key() will install the key for use in future R sessions, which may be convenient for many users. More information can be found at here.

# census_api_key("YOUR KEY GOES HERE", install = TRUE)

1.1 Looking into which variables exist in the ACS

Once your tidycensus package is loaded, you’re ready to start extracting some Census data. Before we can dive right into the data, we need to know what variables we are looking for. The tidycensus::load_variables() function allows us to examine the unique variable ID for the data we’re interested in.

acs_vars <- tidycensus::load_variables(2021, dataset = "acs5", cache = TRUE)

DT::datatable(head(acs_vars))

For example, B01001_001 will give us an estimate of the total population in the US, B06001_004 will give us an estimate of adults aged 18 to 24 years old, etc.

1.2 Extracting the actual data

For this example, we will be extracting 5-year, 2017-2021 ACS data using the get_acs() function. This centers the data at 2019. We will be looking at the intersection of sex, age, and race across the US.

You will need to specify at least the following arguments: year, geography and variables. I use the following syntax in the variables argument to alter the names of the Census variable IDs into something I find usable:

variables = c("SEX_endAGE_RACE" = "ACS_VAR_ID")

This will allow me to manipulate my assigned variable name (in the format "SEX_endAGE_RACE") in a systematic manner later.

acs_2019 <-
  get_acs(
    year = 2021, # end year of the ACS data (this means 2017-2021 ACS data centered at 2019)
    geography = "us", # can change the geographic unit, such as `state`
    survey = "acs5", # this is the default (5-year ACS)
    
    variables = c( # use this argument to extract variables of interest
      
      # get total count to create "other" classification
      "Male_19_total" = "B01001_007", # would be males, 18-19 years, and all races
      "Male_20_total" = "B01001_008",
      "Male_21_total" = "B01001_009",
      "Male_24_total" = "B01001_010",
      "Male_29_total" = "B01001_011",
      "Male_34_total" = "B01001_012",
      "Male_39_total" = "B01001_013",
      "Male_44_total" = "B01001_014",
      "Male_49_total" = "B01001_015",
      "Male_54_total" = "B01001_016",
      "Male_59_total" = "B01001_017",
      "Male_61_total" = "B01001_018",
      "Male_64_total" = "B01001_019",
      "Male_66_total" = "B01001_020",
      "Male_69_total" = "B01001_021",
      "Male_74_total" = "B01001_022",
      "Male_79_total" = "B01001_023",
      "Male_84_total" = "B01001_024",
      "Male_99_total" = "B01001_025",
      
      "Female_19_total" = "B01001_031",
      "Female_20_total" = "B01001_032",
      "Female_21_total" = "B01001_033",
      "Female_24_total" = "B01001_034",
      "Female_29_total" = "B01001_035",
      "Female_34_total" = "B01001_036",
      "Female_39_total" = "B01001_037",
      "Female_44_total" = "B01001_038",
      "Female_49_total" = "B01001_039",
      "Female_54_total" = "B01001_040",
      "Female_59_total" = "B01001_041",
      "Female_61_total" = "B01001_042",
      "Female_64_total" = "B01001_043",
      "Female_66_total" = "B01001_044",
      "Female_69_total" = "B01001_045",
      "Female_74_total" = "B01001_046",
      "Female_79_total" = "B01001_047",
      "Female_84_total" = "B01001_048",
      "Female_99_total" = "B01001_049",
      
      # white, non-hispanic
      "Male_19_whitenh" = "B01001H_007", # male, 18-19 years, non-hispanic white
      "Male_24_whitenh" = "B01001H_008", # male, 20-24 years, non-hispanic white
      "Male_29_whitenh" = "B01001H_009",
      "Male_34_whitenh" = "B01001H_010",
      "Male_44_whitenh" = "B01001H_011",
      "Male_54_whitenh" = "B01001H_012",
      "Male_64_whitenh" = "B01001H_013",
      "Male_74_whitenh" = "B01001H_014",
      "Male_84_whitenh" = "B01001H_015",
      "Male_99_whitenh" = "B01001H_016",
      
      "Female_19_whitenh" = "B01001H_022",
      "Female_24_whitenh" = "B01001H_023",
      "Female_29_whitenh" = "B01001H_024",
      "Female_34_whitenh" = "B01001H_025",
      "Female_44_whitenh" = "B01001H_026",
      "Female_54_whitenh" = "B01001H_027",
      "Female_64_whitenh" = "B01001H_028",
      "Female_74_whitenh" = "B01001H_029",
      "Female_84_whitenh" = "B01001H_030",
      "Female_99_whitenh" = "B01001H_031",
      
      # white (alone) can include hispanic
      "Male_19_whitealone" = "B01001A_007",
      "Male_24_whitealone" = "B01001A_008",
      "Male_29_whitealone" = "B01001A_009",
      "Male_34_whitealone" = "B01001A_010",
      "Male_44_whitealone" = "B01001A_011",
      "Male_54_whitealone" = "B01001A_012",
      "Male_64_whitealone" = "B01001A_013",
      "Male_74_whitealone" = "B01001A_014",
      "Male_84_whitealone" = "B01001A_015",
      "Male_99_whitealone" = "B01001A_016",
      
      "Female_19_whitealone" = "B01001A_022",
      "Female_24_whitealone" = "B01001A_023",
      "Female_29_whitealone" = "B01001A_024",
      "Female_34_whitealone" = "B01001A_025",
      "Female_44_whitealone" = "B01001A_026",
      "Female_54_whitealone" = "B01001A_027",
      "Female_64_whitealone" = "B01001A_028",
      "Female_74_whitealone" = "B01001A_029",
      "Female_84_whitealone" = "B01001A_030",
      "Female_99_whitealone" = "B01001A_031",
      
      #black (alone), can include hispanic
      "Male_19_black" = "B01001B_007",
      "Male_24_black" = "B01001B_008",
      "Male_29_black" = "B01001B_009",
      "Male_34_black" = "B01001B_010",
      "Male_44_black" = "B01001B_011",
      "Male_54_black" = "B01001B_012",
      "Male_64_black" = "B01001B_013",
      "Male_74_black" = "B01001B_014",
      "Male_84_black" = "B01001B_015",
      "Male_99_black" = "B01001B_016",
      
      "Female_19_black" = "B01001B_022",
      "Female_24_black" = "B01001B_023",
      "Female_29_black" = "B01001B_024",
      "Female_34_black" = "B01001B_025",
      "Female_44_black" = "B01001B_026",
      "Female_54_black" = "B01001B_027",
      "Female_64_black" = "B01001B_028",
      "Female_74_black" = "B01001B_029",
      "Female_84_black" = "B01001B_030",
      "Female_99_black" = "B01001B_031",
      
      #hispanic or latino
      "Male_19_hispanic" = "B01001I_007",
      "Male_24_hispanic" = "B01001I_008",
      "Male_29_hispanic" = "B01001I_009",
      "Male_34_hispanic" = "B01001I_010",
      "Male_44_hispanic" = "B01001I_011",
      "Male_54_hispanic" = "B01001I_012",
      "Male_64_hispanic" = "B01001I_013",
      "Male_74_hispanic" = "B01001I_014",
      "Male_84_hispanic" = "B01001I_015",
      "Male_99_hispanic" = "B01001I_016",
      
      "Female_19_hispanic" = "B01001I_022",
      "Female_24_hispanic" = "B01001I_023",
      "Female_29_hispanic" = "B01001I_024",
      "Female_34_hispanic" = "B01001I_025",
      "Female_44_hispanic" = "B01001I_026",
      "Female_54_hispanic" = "B01001I_027",
      "Female_64_hispanic" = "B01001I_028",
      "Female_74_hispanic" = "B01001I_029",
      "Female_84_hispanic" = "B01001I_030",
      "Female_99_hispanic" = "B01001I_031",
      
      #asian
      "Male_19_asian" = "B01001D_007",
      "Male_24_asian" = "B01001D_008",
      "Male_29_asian" = "B01001D_009",
      "Male_34_asian" = "B01001D_010",
      "Male_44_asian" = "B01001D_011",
      "Male_54_asian" = "B01001D_012",
      "Male_64_asian" = "B01001D_013",
      "Male_74_asian" = "B01001D_014",
      "Male_84_asian" = "B01001D_015",
      "Male_99_asian" = "B01001D_016",
      
      "Female_19_asian" = "B01001D_022",
      "Female_24_asian" = "B01001D_023",
      "Female_29_asian" = "B01001D_024",
      "Female_34_asian" = "B01001D_025",
      "Female_44_asian" = "B01001D_026",
      "Female_54_asian" = "B01001D_027",
      "Female_64_asian" = "B01001D_028",
      "Female_74_asian" = "B01001D_029",
      "Female_84_asian" = "B01001D_030",
      "Female_99_asian" = "B01001D_031"
      
    )
  )

datatable(head(acs_2019))

Above, we see sample output from this function.

  • variable has the new variable names we assigned.
  • estimate is the estiamted population count
  • moe is the margin of error associated with the estimate. Default is 90% confidence level; this can be altered in the get_acs() function argument moe_level. See ?get_acs for more details

1.3 Manipulating the output into something usable

Wow, that’s a lotta code. We can now begin manipulating our dataset to fit our needs. We will first create three separate columns that will identify the groups of age, sex, and race.

age_sex_race_acs <- acs_2019 %>%
  mutate(age = str_extract(variable, "\\d+"), # extract the digits in the var name
         sex = str_remove_all(str_extract(variable, "^[M|F][a-z]+_"), "_"), # extract Male or Female
         race = str_remove(str_extract(variable, "_[a-z]+$"), "_")) # extract race at the end

datatable(head(age_sex_race_acs))

We will now group the age groups into the following categories:

  • 18 to <25 years old
  • 25 to <35 years old
  • 25 to <45 years old
  • 45 to <55 years old
  • 55 to <65 years old
  • 65+ years old
age_sex_race_acs <- age_sex_race_acs %>%
  mutate(age_class = 
           case_when(
             age <25 ~ "[18,25)",
             age >=25 & age<35 ~ "[25,35)",
             age >=35 & age<45 ~ "[35,45)",
             age >=45 & age<55 ~ "[45,55)",
             age >=55 & age<65 ~ "[55,65)",
             age >=65 ~ "[65-100]"
           )) %>%
  #group_by(age, age_class) %>% count
  group_by(GEOID, NAME, age_class, sex, race) %>% # re-estimating pop. count with new age classes
  summarise(estimate = sum(estimate)) %>%
  ungroup()

datatable(age_sex_race_acs)

While the sex group is cleaned for us (just two groups - Male and Female), the race grouping need to be altered (at least for our purposes). We need the following categories:

  • White, Hispanic
  • White, non-Hispanic
  • Black (including Hispanic and non-Hispanic)
  • Asian alone
  • Other/multi-race

Since the ACS data does not give us these exact groups, we have to make the following manipulations to the data:

  1. Estimate the Hispanic White population by looking at the difference between White (alone) and Non-Hispanic White
  2. Estimate the Other/multi-race population by taking the difference of Total (all races) the main four races (referred to here as total_non_other)
age_sex_race_final <- age_sex_race_acs %>% spread(race, estimate) %>%
  mutate(whiteh = whitealone - whitenh, 
         total_non_other = whitenh + whiteh + black + asian,
         other = total - total_non_other) %>%
  select(GEOID, NAME, age_class, sex, whitenh, whiteh, black, asian, other) %>%
  gather("race", "count", -c(GEOID, NAME, age_class, sex))

datatable(age_sex_race_final)

Ta-da! We can now move onto part 2 - using the survey package.

2. Using the survey package

The package survey was developed by Dr. Thomas Lumley of the University of Washington. Useful information can be found at his website, http://r-survey.r-forge.r-project.org/survey/index.html.

The package was developed for complex survey designs, such as multi-stage, stratified, and cluster sampling, although can be used in many ways.

Our examples will focus on how to weight a simple sample design: convenience sampling.

2.1 Weighting

Given that we often do not design our studies with intentional, complex sampling strategies ahead of time (as we are typically interested in sampling as many people as possible), the complexity of using the survey package greatly decreases as we will use simple techniques of weighting.

An overview of different weighting methods can be found in Kalton & Flores-Cervantes, 2003. We will focus on cell weighting.

Cell weighting is useful when you know the population count of your target population (in our case, the US population) for each combination of variables (such as age X sex X race). We figured out these combinations in Section 1 above.

2.2 Creating a mock dataset

We need to figure out what weight we will be assigning each participant from our sample population (e.g., AHS participants) based on their age group, sex group, and race group. Let’s quickly simulate a mock AHS dataset.

set.seed(123)

#simulate mock AHS dataset
ahs_data <- tibble(subject_id = paste0("subject_", 1:200000),
                   age_class = sample(
                     x = c("[18,25)", "[25,35)", "[35,45)", "[45,55)", "[55,65)", "[65-100]"), # age groups
                     size = 200000, # sample 200,000 times
                     replace = T, # can replace the sample from x
                     prob = c(0.3, 0.2, 0.2, 0.1, 0.1, 0.1)), # probability of selection
                   
                   sex = sample(
                     x = c("Female", "Male"),
                     size = 200000,
                     replace = T,
                     prob = c(0.35, 0.65)),
                   
                   race = sample(
                     x = c("whitenh", "whiteh", "black", "asian", "other"),
                     size = 200000,
                     replace = T,
                     prob = c(0.7, 0.05, 0.05, 0.1, 0.1)),
                   
                   education = c(rep("noHSdiploma", 50000), rep("HSdiploma", 150000)),
                   
                   leq_24 = rnorm(n = 200000, 
                                  mean = 70,
                                  sd = 8),
                   
                   hl = c(rep(c(0, 1), 25000), rep(c(0, 0, 0, 0, 1), 30000))
)

datatable(head(ahs_data))

2.2 Creating the weights

A simple weight, \(w\) is to examine the ratio of the target population count (\(t\), US population) to the sample population count (\(n\), mock dataset), such that \(w = t/n\).

We have \(t\) from our age_sex_race_final dataframe. We need to calculate \(n\) for each combination of age X sex X race from our mock ahs_data.

ahs_data_grouped <- ahs_data %>%
  group_by(age_class, sex, race) %>%
  summarise(n = n())

datatable(ahs_data_grouped)

We can now calculate the weight, \(w\), using the equation above.

ahs_weights <- ahs_data_grouped %>%
  left_join(age_sex_race_final %>% rename(t = count) %>% select(-GEOID, -NAME),
            by = c("age_class", "sex", "race")) %>%
  mutate(w = t/n)

datatable(ahs_weights)

Thus, we can interpret \(w\) as the number of US population that any given AHS participant represents. For example, a single 18 to <25 year old, female, Asian adult in the AHS study would represent roughly 393 18 to <25 year old, female, Asian adults across the entire US.

Now we can add the weights to ahs_data.

ahs_data_final <- ahs_data %>%
  left_join(ahs_weights %>% select(-n, -t),
            by = c("age_class", "sex", "race"))

datatable(head(ahs_data_final))

We can begin some analysis!

2.3 survey functions

The usage of the survey package has a simple workflow:

  1. Specify the survey design, svydesign().
  2. Use the survey design to run analyses. All functions will follow the form svy[FUNC], such as svyglm() or svyhist().

2.4 Using the svydesign() function

Given we are using convenience sampling methods with cell weighting, this step is relatively easy. We can specify our survey design (which creates a new dataset) as follows:

ahs_svy <- svydesign(ids = ~1, weights = ~w, data = ahs_data_final)

The svydesign() function has many more arguments that can accommodate complex survey designs.

  • probs can be used for specifying cluster sampling probabilities
  • strata can be used if we conducted stratified sampling
  • and many more arguments

These arguments would be useful for complex survey designs, and are necessary if you were analyzing datasets such as NHANES.

2.5 Analyzing the data

2.5.1 Continuous data

Let’s look at the mean, 24-hr \(L_{EQ}\).

svymean(~leq_24, design = ahs_svy)
##          mean     SE
## leq_24 69.966 0.0234

We can use the confint() to get the confidence interval around this mean, since we are estimating the mean using weights.

confint(svymean(~leq_24, ahs_svy))
##           2.5 %   97.5 %
## leq_24 69.91987 70.01179

We can look at the mean broken down by different groups.

svyby(~leq_24, ~education, ahs_svy, svymean)
##               education   leq_24         se
## HSdiploma     HSdiploma 69.95438 0.02707746
## noHSdiploma noHSdiploma 70.00034 0.04688223

We can even do hypothesis testing! Here’s an example of a two-sample, independent t-test.

svyttest(leq_24~education, ahs_svy)
## 
##  Design-based t-test
## 
## data:  leq_24 ~ education
## t = 0.84891, df = 2e+05, p-value = 0.3959
## alternative hypothesis: true difference in mean is not equal to 0
## 95 percent confidence interval:
##  -0.06015323  0.15207270
## sample estimates:
## difference in mean 
##         0.04595973
2.5.2 Binary data

Let’s look at the weighted number of people who have a hearing loss.

svytable(~hl, design = ahs_svy)
## hl
##         0         1 
## 185316857  70174549

We can specify fractions instead using Ntotal = 1 for proportions or Ntotal = 100 for percentages

svytable(~hl, Ntotal=1, design=ahs_svy)
## hl
##        0        1 
## 0.725335 0.274665

We can examine proportions by different groups as well.

svytable(~sex+hl, Ntotal = 1, design=ahs_svy)
##         hl
## sex              0         1
##   Female 0.3690176 0.1408870
##   Male   0.3563174 0.1337780

This is what the spread of the hearing loss looks like in our original data.

prop.table(table(ahs_data_final$sex, ahs_data_final$hl))
##         
##                 0        1
##   Female 0.253690 0.096595
##   Male   0.471310 0.178405

We can add multiple dimensions to our examination using the interaction() function.

svytable(~interaction(sex, race, hl), design = ahs_svy)
## interaction(sex, race, hl)
##   Female.asian.0     Male.asian.0   Female.black.0     Male.black.0 
##          5795158          5172454         11973118         10782226 
##   Female.other.0     Male.other.0  Female.whiteh.0    Male.whiteh.0 
##         10896204         11002512          6979396          7074255 
## Female.whitenh.0   Male.whitenh.0   Female.asian.1     Male.asian.1 
##         58636940         57004595          2150677          1891871 
##   Female.black.1     Male.black.1   Female.other.1     Male.other.1 
##          4503778          3912622          4059175          4187346 
##  Female.whiteh.1    Male.whiteh.1 Female.whitenh.1   Male.whitenh.1 
##          2941050          2750331         22340739         21436959

We can estimate the percentage of people who have hearing loss by education status.

svytable(~education+hl, design=ahs_svy) %>%
  as_tibble() %>%
  group_by(education) %>%
  mutate(sum = n/sum(n)) %>%
  ungroup()
## # A tibble: 4 × 4
##   education   hl             n   sum
##   <chr>       <chr>      <dbl> <dbl>
## 1 HSdiploma   0     153398511. 0.800
## 2 noHSdiploma 0      31918346. 0.502
## 3 HSdiploma   1      38455042. 0.200
## 4 noHSdiploma 1      31719507. 0.498

Again, we can do hypothesis testing. This time we will do a chi-squared test.

svychisq(~education+hl, ahs_svy, statistic="adjWald")
## 
##  Design-based Wald test of association
## 
## data:  svychisq(~education + hl, ahs_svy, statistic = "adjWald")
## F = 7555.8, ndf = 1e+00, ddf = 2e+05, p-value < 2.2e-16
2.5.3 Regression

The primary function used for regression in the package is the svyglm() function. This will allow you to do linear, logistic, poisson, etc. regressions. Other functions include svyolr() for ordered logistic regression and svykm() for Kaplan-Meier survival regression.

Let’s examine differences in 24-hr \(L_{EQ}\) by different socio-demographics.

# linear regression
svyglm(leq_24 ~ factor(education) + factor(age_class) + factor(sex) + factor(race), 
       ahs_svy) %>%
  summary()
## 
## Call:
## svyglm(formula = leq_24 ~ factor(education) + factor(age_class) + 
##     factor(sex) + factor(race), design = ahs_svy)
## 
## Survey design:
## svydesign(ids = ~1, weights = ~w, data = ahs_data_final)
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  69.88590    0.08117 860.961   <2e-16 ***
## factor(education)noHSdiploma  0.04620    0.05413   0.853   0.3934    
## factor(age_class)[25,35)     -0.01905    0.06176  -0.308   0.7577    
## factor(age_class)[35,45)     -0.05373    0.06048  -0.888   0.3743    
## factor(age_class)[45,55)     -0.15382    0.07701  -1.997   0.0458 *  
## factor(age_class)[55,65)     -0.02628    0.07420  -0.354   0.7232    
## factor(age_class)[65-100]     0.07410    0.07449   0.995   0.3199    
## factor(sex)Male               0.05680    0.04623   1.229   0.2192    
## factor(race)black            -0.04501    0.11538  -0.390   0.6965    
## factor(race)other             0.10884    0.09270   1.174   0.2403    
## factor(race)whiteh            0.04099    0.11352   0.361   0.7180    
## factor(race)whitenh           0.09055    0.07255   1.248   0.2120    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 64.13696)
## 
## Number of Fisher Scoring iterations: 2
#logistic regression
svyglm(hl ~ factor(education) + factor(age_class) + factor(sex) + factor(race), 
       family = binomial, 
       ahs_svy) %>%
  epiDisplay::logistic.display(alpha = 0.05, crude = TRUE, 
                   crude.p.value = FALSE, decimal = 2, simplified = FALSE) 
##  
##                                     OR lower95ci upper95ci    Pr(>|Z|)
## factor(education)noHSdiploma 3.9630551 3.8522480  4.077050 0.000000000
## factor(age_class)[25,35)     0.9851104 0.9507853  1.020675 0.407077586
## factor(age_class)[35,45)     0.9914686 0.9574705  1.026674 0.630316921
## factor(age_class)[45,55)     0.9789226 0.9370674  1.022647 0.339329631
## factor(age_class)[55,65)     1.0022077 0.9602566  1.045992 0.919484369
## factor(age_class)[65-100]    0.9988433 0.9567577  1.042780 0.957974276
## factor(sex)Male              0.9880709 0.9622485  1.014586 0.374432134
## factor(race)black            1.0177117 0.9520978  1.087847 0.605623617
## factor(race)other            1.0244386 0.9714507  1.080317 0.372907484
## factor(race)whiteh           1.0907015 1.0228830  1.163016 0.008032448
## factor(race)whitenh          1.0314966 0.9893501  1.075439 0.145136316

Wow, people without a HS diploma have 3.96 (95% CI: 3.85 to 4.08) times higher odds of having hearing loss compared to those with a HS diploma and controlling for age, sex, and race. Wow! Hispanic adults have 1.09 (95% CI: 1.02 to 1.16) times higher odds of having hearing loss compared to Asian adults, controlling for education, age, and sex.