# install packages if needed
# install.packages("tidyverse")
# install.packages("tidycensus")
# install.packages("DT")
# install.packages("survey")
# install.packages("epiDisplay")
# load in packages
library(tidyverse)
library(tidycensus) # the package to access US Census data in R
# other packages
library(DT)
library(survey)
# library(epiDisplay)
tidycensus package to extract 2017-2021 ACS
(American Community Survey) estimates of demographics in the US.survey package to obtain weighted, nationwide
estimates.tidycensusTo use the tidycensus package, you will need a census
API key. You can get an API key at https://api.census.gov/data/key_signup.html.
After you’ve signed up for an API key, be sure to activate the key
from the email you receive from the Census Bureau so it works correctly.
Declaring install = TRUE when calling
census_api_key() will install the key for use in future R
sessions, which may be convenient for many users. More information can
be found at here.
# census_api_key("YOUR KEY GOES HERE", install = TRUE)
Once your tidycensus package is loaded, you’re ready to
start extracting some Census data. Before we can dive right into the
data, we need to know what variables we are looking for. The
tidycensus::load_variables() function allows us to examine
the unique variable ID for the data we’re interested in.
acs_vars <- tidycensus::load_variables(2021, dataset = "acs5", cache = TRUE)
DT::datatable(head(acs_vars))
For example, B01001_001 will give us an estimate of the
total population in the US, B06001_004 will give us an
estimate of adults aged 18 to 24 years old, etc.
For this example, we will be extracting 5-year, 2017-2021 ACS data
using the get_acs() function. This centers the data at
2019. We will be looking at the intersection of sex, age, and race
across the US.
You will need to specify at least the following arguments:
year, geography and variables. I
use the following syntax in the variables argument to alter
the names of the Census variable IDs into something I find usable:
variables = c("SEX_endAGE_RACE" = "ACS_VAR_ID")
This will allow me to manipulate my assigned variable name (in the
format "SEX_endAGE_RACE") in a systematic manner later.
acs_2019 <-
get_acs(
year = 2021, # end year of the ACS data (this means 2017-2021 ACS data centered at 2019)
geography = "us", # can change the geographic unit, such as `state`
survey = "acs5", # this is the default (5-year ACS)
variables = c( # use this argument to extract variables of interest
# get total count to create "other" classification
"Male_19_total" = "B01001_007", # would be males, 18-19 years, and all races
"Male_20_total" = "B01001_008",
"Male_21_total" = "B01001_009",
"Male_24_total" = "B01001_010",
"Male_29_total" = "B01001_011",
"Male_34_total" = "B01001_012",
"Male_39_total" = "B01001_013",
"Male_44_total" = "B01001_014",
"Male_49_total" = "B01001_015",
"Male_54_total" = "B01001_016",
"Male_59_total" = "B01001_017",
"Male_61_total" = "B01001_018",
"Male_64_total" = "B01001_019",
"Male_66_total" = "B01001_020",
"Male_69_total" = "B01001_021",
"Male_74_total" = "B01001_022",
"Male_79_total" = "B01001_023",
"Male_84_total" = "B01001_024",
"Male_99_total" = "B01001_025",
"Female_19_total" = "B01001_031",
"Female_20_total" = "B01001_032",
"Female_21_total" = "B01001_033",
"Female_24_total" = "B01001_034",
"Female_29_total" = "B01001_035",
"Female_34_total" = "B01001_036",
"Female_39_total" = "B01001_037",
"Female_44_total" = "B01001_038",
"Female_49_total" = "B01001_039",
"Female_54_total" = "B01001_040",
"Female_59_total" = "B01001_041",
"Female_61_total" = "B01001_042",
"Female_64_total" = "B01001_043",
"Female_66_total" = "B01001_044",
"Female_69_total" = "B01001_045",
"Female_74_total" = "B01001_046",
"Female_79_total" = "B01001_047",
"Female_84_total" = "B01001_048",
"Female_99_total" = "B01001_049",
# white, non-hispanic
"Male_19_whitenh" = "B01001H_007", # male, 18-19 years, non-hispanic white
"Male_24_whitenh" = "B01001H_008", # male, 20-24 years, non-hispanic white
"Male_29_whitenh" = "B01001H_009",
"Male_34_whitenh" = "B01001H_010",
"Male_44_whitenh" = "B01001H_011",
"Male_54_whitenh" = "B01001H_012",
"Male_64_whitenh" = "B01001H_013",
"Male_74_whitenh" = "B01001H_014",
"Male_84_whitenh" = "B01001H_015",
"Male_99_whitenh" = "B01001H_016",
"Female_19_whitenh" = "B01001H_022",
"Female_24_whitenh" = "B01001H_023",
"Female_29_whitenh" = "B01001H_024",
"Female_34_whitenh" = "B01001H_025",
"Female_44_whitenh" = "B01001H_026",
"Female_54_whitenh" = "B01001H_027",
"Female_64_whitenh" = "B01001H_028",
"Female_74_whitenh" = "B01001H_029",
"Female_84_whitenh" = "B01001H_030",
"Female_99_whitenh" = "B01001H_031",
# white (alone) can include hispanic
"Male_19_whitealone" = "B01001A_007",
"Male_24_whitealone" = "B01001A_008",
"Male_29_whitealone" = "B01001A_009",
"Male_34_whitealone" = "B01001A_010",
"Male_44_whitealone" = "B01001A_011",
"Male_54_whitealone" = "B01001A_012",
"Male_64_whitealone" = "B01001A_013",
"Male_74_whitealone" = "B01001A_014",
"Male_84_whitealone" = "B01001A_015",
"Male_99_whitealone" = "B01001A_016",
"Female_19_whitealone" = "B01001A_022",
"Female_24_whitealone" = "B01001A_023",
"Female_29_whitealone" = "B01001A_024",
"Female_34_whitealone" = "B01001A_025",
"Female_44_whitealone" = "B01001A_026",
"Female_54_whitealone" = "B01001A_027",
"Female_64_whitealone" = "B01001A_028",
"Female_74_whitealone" = "B01001A_029",
"Female_84_whitealone" = "B01001A_030",
"Female_99_whitealone" = "B01001A_031",
#black (alone), can include hispanic
"Male_19_black" = "B01001B_007",
"Male_24_black" = "B01001B_008",
"Male_29_black" = "B01001B_009",
"Male_34_black" = "B01001B_010",
"Male_44_black" = "B01001B_011",
"Male_54_black" = "B01001B_012",
"Male_64_black" = "B01001B_013",
"Male_74_black" = "B01001B_014",
"Male_84_black" = "B01001B_015",
"Male_99_black" = "B01001B_016",
"Female_19_black" = "B01001B_022",
"Female_24_black" = "B01001B_023",
"Female_29_black" = "B01001B_024",
"Female_34_black" = "B01001B_025",
"Female_44_black" = "B01001B_026",
"Female_54_black" = "B01001B_027",
"Female_64_black" = "B01001B_028",
"Female_74_black" = "B01001B_029",
"Female_84_black" = "B01001B_030",
"Female_99_black" = "B01001B_031",
#hispanic or latino
"Male_19_hispanic" = "B01001I_007",
"Male_24_hispanic" = "B01001I_008",
"Male_29_hispanic" = "B01001I_009",
"Male_34_hispanic" = "B01001I_010",
"Male_44_hispanic" = "B01001I_011",
"Male_54_hispanic" = "B01001I_012",
"Male_64_hispanic" = "B01001I_013",
"Male_74_hispanic" = "B01001I_014",
"Male_84_hispanic" = "B01001I_015",
"Male_99_hispanic" = "B01001I_016",
"Female_19_hispanic" = "B01001I_022",
"Female_24_hispanic" = "B01001I_023",
"Female_29_hispanic" = "B01001I_024",
"Female_34_hispanic" = "B01001I_025",
"Female_44_hispanic" = "B01001I_026",
"Female_54_hispanic" = "B01001I_027",
"Female_64_hispanic" = "B01001I_028",
"Female_74_hispanic" = "B01001I_029",
"Female_84_hispanic" = "B01001I_030",
"Female_99_hispanic" = "B01001I_031",
#asian
"Male_19_asian" = "B01001D_007",
"Male_24_asian" = "B01001D_008",
"Male_29_asian" = "B01001D_009",
"Male_34_asian" = "B01001D_010",
"Male_44_asian" = "B01001D_011",
"Male_54_asian" = "B01001D_012",
"Male_64_asian" = "B01001D_013",
"Male_74_asian" = "B01001D_014",
"Male_84_asian" = "B01001D_015",
"Male_99_asian" = "B01001D_016",
"Female_19_asian" = "B01001D_022",
"Female_24_asian" = "B01001D_023",
"Female_29_asian" = "B01001D_024",
"Female_34_asian" = "B01001D_025",
"Female_44_asian" = "B01001D_026",
"Female_54_asian" = "B01001D_027",
"Female_64_asian" = "B01001D_028",
"Female_74_asian" = "B01001D_029",
"Female_84_asian" = "B01001D_030",
"Female_99_asian" = "B01001D_031"
)
)
datatable(head(acs_2019))
Above, we see sample output from this function.
variable has the new variable names we assigned.estimate is the estiamted population countmoe is the margin of error associated with the
estimate. Default is 90% confidence level; this can be altered in the
get_acs() function argument moe_level. See
?get_acs for more detailsWow, that’s a lotta code. We can now begin manipulating our dataset to fit our needs. We will first create three separate columns that will identify the groups of age, sex, and race.
age_sex_race_acs <- acs_2019 %>%
mutate(age = str_extract(variable, "\\d+"), # extract the digits in the var name
sex = str_remove_all(str_extract(variable, "^[M|F][a-z]+_"), "_"), # extract Male or Female
race = str_remove(str_extract(variable, "_[a-z]+$"), "_")) # extract race at the end
datatable(head(age_sex_race_acs))
We will now group the age groups into the following categories:
age_sex_race_acs <- age_sex_race_acs %>%
mutate(age_class =
case_when(
age <25 ~ "[18,25)",
age >=25 & age<35 ~ "[25,35)",
age >=35 & age<45 ~ "[35,45)",
age >=45 & age<55 ~ "[45,55)",
age >=55 & age<65 ~ "[55,65)",
age >=65 ~ "[65-100]"
)) %>%
#group_by(age, age_class) %>% count
group_by(GEOID, NAME, age_class, sex, race) %>% # re-estimating pop. count with new age classes
summarise(estimate = sum(estimate)) %>%
ungroup()
datatable(age_sex_race_acs)
While the sex group is cleaned for us (just two groups - Male and Female), the race grouping need to be altered (at least for our purposes). We need the following categories:
Since the ACS data does not give us these exact groups, we have to make the following manipulations to the data:
total_non_other)age_sex_race_final <- age_sex_race_acs %>% spread(race, estimate) %>%
mutate(whiteh = whitealone - whitenh,
total_non_other = whitenh + whiteh + black + asian,
other = total - total_non_other) %>%
select(GEOID, NAME, age_class, sex, whitenh, whiteh, black, asian, other) %>%
gather("race", "count", -c(GEOID, NAME, age_class, sex))
datatable(age_sex_race_final)
Ta-da! We can now move onto part 2 - using the survey
package.
survey packageThe package survey was developed by Dr. Thomas Lumley of
the University of Washington. Useful information can be found at his
website, http://r-survey.r-forge.r-project.org/survey/index.html.
The package was developed for complex survey designs, such as multi-stage, stratified, and cluster sampling, although can be used in many ways.
Our examples will focus on how to weight a simple sample design: convenience sampling.
Given that we often do not design our studies with intentional,
complex sampling strategies ahead of time (as we are typically
interested in sampling as many people as possible), the complexity of
using the survey package greatly decreases as we will use
simple techniques of weighting.
An overview of different weighting methods can be found in Kalton & Flores-Cervantes, 2003. We will focus on cell weighting.
Cell weighting is useful when you know the population count of your target population (in our case, the US population) for each combination of variables (such as age X sex X race). We figured out these combinations in Section 1 above.
We need to figure out what weight we will be assigning each participant from our sample population (e.g., AHS participants) based on their age group, sex group, and race group. Let’s quickly simulate a mock AHS dataset.
set.seed(123)
#simulate mock AHS dataset
ahs_data <- tibble(subject_id = paste0("subject_", 1:200000),
age_class = sample(
x = c("[18,25)", "[25,35)", "[35,45)", "[45,55)", "[55,65)", "[65-100]"), # age groups
size = 200000, # sample 200,000 times
replace = T, # can replace the sample from x
prob = c(0.3, 0.2, 0.2, 0.1, 0.1, 0.1)), # probability of selection
sex = sample(
x = c("Female", "Male"),
size = 200000,
replace = T,
prob = c(0.35, 0.65)),
race = sample(
x = c("whitenh", "whiteh", "black", "asian", "other"),
size = 200000,
replace = T,
prob = c(0.7, 0.05, 0.05, 0.1, 0.1)),
education = c(rep("noHSdiploma", 50000), rep("HSdiploma", 150000)),
leq_24 = rnorm(n = 200000,
mean = 70,
sd = 8),
hl = c(rep(c(0, 1), 25000), rep(c(0, 0, 0, 0, 1), 30000))
)
datatable(head(ahs_data))
A simple weight, \(w\) is to examine the ratio of the target population count (\(t\), US population) to the sample population count (\(n\), mock dataset), such that \(w = t/n\).
We have \(t\) from our
age_sex_race_final dataframe. We need to calculate \(n\) for each combination of age X sex X
race from our mock ahs_data.
ahs_data_grouped <- ahs_data %>%
group_by(age_class, sex, race) %>%
summarise(n = n())
datatable(ahs_data_grouped)
We can now calculate the weight, \(w\), using the equation above.
ahs_weights <- ahs_data_grouped %>%
left_join(age_sex_race_final %>% rename(t = count) %>% select(-GEOID, -NAME),
by = c("age_class", "sex", "race")) %>%
mutate(w = t/n)
datatable(ahs_weights)
Thus, we can interpret \(w\) as the number of US population that any given AHS participant represents. For example, a single 18 to <25 year old, female, Asian adult in the AHS study would represent roughly 393 18 to <25 year old, female, Asian adults across the entire US.
Now we can add the weights to ahs_data.
ahs_data_final <- ahs_data %>%
left_join(ahs_weights %>% select(-n, -t),
by = c("age_class", "sex", "race"))
datatable(head(ahs_data_final))
We can begin some analysis!
survey functionsThe usage of the survey package has a simple
workflow:
svydesign().svy[FUNC], such as svyglm() or
svyhist().svydesign() functionGiven we are using convenience sampling methods with cell weighting, this step is relatively easy. We can specify our survey design (which creates a new dataset) as follows:
ahs_svy <- svydesign(ids = ~1, weights = ~w, data = ahs_data_final)
The svydesign() function has many more arguments that
can accommodate complex survey designs.
probs can be used for specifying cluster sampling
probabilitiesstrata can be used if we conducted stratified
samplingThese arguments would be useful for complex survey designs, and are necessary if you were analyzing datasets such as NHANES.
Let’s look at the mean, 24-hr \(L_{EQ}\).
svymean(~leq_24, design = ahs_svy)
## mean SE
## leq_24 69.966 0.0234
We can use the confint() to get the confidence interval
around this mean, since we are estimating the mean using weights.
confint(svymean(~leq_24, ahs_svy))
## 2.5 % 97.5 %
## leq_24 69.91987 70.01179
We can look at the mean broken down by different groups.
svyby(~leq_24, ~education, ahs_svy, svymean)
## education leq_24 se
## HSdiploma HSdiploma 69.95438 0.02707746
## noHSdiploma noHSdiploma 70.00034 0.04688223
We can even do hypothesis testing! Here’s an example of a two-sample, independent t-test.
svyttest(leq_24~education, ahs_svy)
##
## Design-based t-test
##
## data: leq_24 ~ education
## t = 0.84891, df = 2e+05, p-value = 0.3959
## alternative hypothesis: true difference in mean is not equal to 0
## 95 percent confidence interval:
## -0.06015323 0.15207270
## sample estimates:
## difference in mean
## 0.04595973
Let’s look at the weighted number of people who have a hearing loss.
svytable(~hl, design = ahs_svy)
## hl
## 0 1
## 185316857 70174549
We can specify fractions instead using Ntotal = 1 for
proportions or Ntotal = 100 for percentages
svytable(~hl, Ntotal=1, design=ahs_svy)
## hl
## 0 1
## 0.725335 0.274665
We can examine proportions by different groups as well.
svytable(~sex+hl, Ntotal = 1, design=ahs_svy)
## hl
## sex 0 1
## Female 0.3690176 0.1408870
## Male 0.3563174 0.1337780
This is what the spread of the hearing loss looks like in our original data.
prop.table(table(ahs_data_final$sex, ahs_data_final$hl))
##
## 0 1
## Female 0.253690 0.096595
## Male 0.471310 0.178405
We can add multiple dimensions to our examination using the
interaction() function.
svytable(~interaction(sex, race, hl), design = ahs_svy)
## interaction(sex, race, hl)
## Female.asian.0 Male.asian.0 Female.black.0 Male.black.0
## 5795158 5172454 11973118 10782226
## Female.other.0 Male.other.0 Female.whiteh.0 Male.whiteh.0
## 10896204 11002512 6979396 7074255
## Female.whitenh.0 Male.whitenh.0 Female.asian.1 Male.asian.1
## 58636940 57004595 2150677 1891871
## Female.black.1 Male.black.1 Female.other.1 Male.other.1
## 4503778 3912622 4059175 4187346
## Female.whiteh.1 Male.whiteh.1 Female.whitenh.1 Male.whitenh.1
## 2941050 2750331 22340739 21436959
We can estimate the percentage of people who have hearing loss by education status.
svytable(~education+hl, design=ahs_svy) %>%
as_tibble() %>%
group_by(education) %>%
mutate(sum = n/sum(n)) %>%
ungroup()
## # A tibble: 4 × 4
## education hl n sum
## <chr> <chr> <dbl> <dbl>
## 1 HSdiploma 0 153398511. 0.800
## 2 noHSdiploma 0 31918346. 0.502
## 3 HSdiploma 1 38455042. 0.200
## 4 noHSdiploma 1 31719507. 0.498
Again, we can do hypothesis testing. This time we will do a chi-squared test.
svychisq(~education+hl, ahs_svy, statistic="adjWald")
##
## Design-based Wald test of association
##
## data: svychisq(~education + hl, ahs_svy, statistic = "adjWald")
## F = 7555.8, ndf = 1e+00, ddf = 2e+05, p-value < 2.2e-16
The primary function used for regression in the package is the
svyglm() function. This will allow you to do linear,
logistic, poisson, etc. regressions. Other functions include
svyolr() for ordered logistic regression and
svykm() for Kaplan-Meier survival regression.
Let’s examine differences in 24-hr \(L_{EQ}\) by different socio-demographics.
# linear regression
svyglm(leq_24 ~ factor(education) + factor(age_class) + factor(sex) + factor(race),
ahs_svy) %>%
summary()
##
## Call:
## svyglm(formula = leq_24 ~ factor(education) + factor(age_class) +
## factor(sex) + factor(race), design = ahs_svy)
##
## Survey design:
## svydesign(ids = ~1, weights = ~w, data = ahs_data_final)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69.88590 0.08117 860.961 <2e-16 ***
## factor(education)noHSdiploma 0.04620 0.05413 0.853 0.3934
## factor(age_class)[25,35) -0.01905 0.06176 -0.308 0.7577
## factor(age_class)[35,45) -0.05373 0.06048 -0.888 0.3743
## factor(age_class)[45,55) -0.15382 0.07701 -1.997 0.0458 *
## factor(age_class)[55,65) -0.02628 0.07420 -0.354 0.7232
## factor(age_class)[65-100] 0.07410 0.07449 0.995 0.3199
## factor(sex)Male 0.05680 0.04623 1.229 0.2192
## factor(race)black -0.04501 0.11538 -0.390 0.6965
## factor(race)other 0.10884 0.09270 1.174 0.2403
## factor(race)whiteh 0.04099 0.11352 0.361 0.7180
## factor(race)whitenh 0.09055 0.07255 1.248 0.2120
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 64.13696)
##
## Number of Fisher Scoring iterations: 2
#logistic regression
svyglm(hl ~ factor(education) + factor(age_class) + factor(sex) + factor(race),
family = binomial,
ahs_svy) %>%
epiDisplay::logistic.display(alpha = 0.05, crude = TRUE,
crude.p.value = FALSE, decimal = 2, simplified = FALSE)
##
## OR lower95ci upper95ci Pr(>|Z|)
## factor(education)noHSdiploma 3.9630551 3.8522480 4.077050 0.000000000
## factor(age_class)[25,35) 0.9851104 0.9507853 1.020675 0.407077586
## factor(age_class)[35,45) 0.9914686 0.9574705 1.026674 0.630316921
## factor(age_class)[45,55) 0.9789226 0.9370674 1.022647 0.339329631
## factor(age_class)[55,65) 1.0022077 0.9602566 1.045992 0.919484369
## factor(age_class)[65-100] 0.9988433 0.9567577 1.042780 0.957974276
## factor(sex)Male 0.9880709 0.9622485 1.014586 0.374432134
## factor(race)black 1.0177117 0.9520978 1.087847 0.605623617
## factor(race)other 1.0244386 0.9714507 1.080317 0.372907484
## factor(race)whiteh 1.0907015 1.0228830 1.163016 0.008032448
## factor(race)whitenh 1.0314966 0.9893501 1.075439 0.145136316
Wow, people without a HS diploma have 3.96 (95% CI: 3.85 to 4.08) times higher odds of having hearing loss compared to those with a HS diploma and controlling for age, sex, and race. Wow! Hispanic adults have 1.09 (95% CI: 1.02 to 1.16) times higher odds of having hearing loss compared to Asian adults, controlling for education, age, and sex.