Coursera Sampling Assignment

The objective of this script is to select a two-stage sample. This will be done such that 2 BGs are draw from each stratum, with probabilities proportional to the number of persons in each BG.

Load Frame

# load packages
library(sampling)
library(dplyr)
library(tidyr)
library(purrr)
library(survey)
library(readr)
library(knitr)

Next, we read-in the frame of data.

# read-in frame
specification <- cols(
  .default = col_character()
)
frame <- read_csv("~/Downloads/NHISdata.5strata.csv", 
                  col_types = specification)

# add/transform variable for later use
frame <- frame %>%
  mutate(
    age_grp = case_when(
      AGE_P >= 18 & AGE_P < 35 ~ "18‐34",
      AGE_P >= 35 & AGE_P < 65 ~ "35‐64",
      AGE_P >= 65 ~ "65 and older",
      TRUE ~ "Unknown"
    ),
    sex = case_when(
      SEX == 1 ~ "Male",
      SEX == 2 ~ "Female",
      TRUE ~ "Unknown"
    ),
    work = case_when(
      WRKCATA == "1" ~ "Employee of a PRIVATE company for wages",
      WRKCATA == "2" ~ "A FEDERAL government employee",
      WRKCATA == "3" ~ "A STATE government employee",
      WRKCATA == "4" ~ "A LOCAL government employee",
      WRKCATA == "5" ~ "Self-employed in OWN business, professional practice or farm",
      WRKCATA == "6" ~ "Working WITHOUT PAY in a family-owned business or farm",
      WRKCATA == "7" ~ "Refused",
      WRKCATA == "8" ~ "Not ascertained",
      WRKCATA == "9" ~ "Don't know",
      TRUE ~ "Unknown"
    )
  )

The frame consists of people as units (rows), with clusters variable given by the BG variable. Other variables include survey questions on health, disease, and demographics.

Next, we analyse the counts of units in the frame, per stratum and block group.

# count individuals per BG, within stratum
kable(count(frame, STRATUM, BG))

STRATUM	BG	n
1	1	109
1	2	97
1	3	119
1	4	89
1	5	115
1	6	109
1	7	110
2	10	106
2	11	117
2	12	103
2	13	109
2	14	105
2	8	107
2	9	101
3	15	104
3	16	100
3	17	113
3	18	106
3	19	108
3	20	111
3	21	107
4	22	98
4	23	108
4	24	104
4	25	115
4	26	109
4	27	98
4	28	116
5	29	109
5	30	110
5	31	92
5	32	107
5	33	118
5	34	114
5	35	98

In the table above, the column 'n' represents the number of persons (i.e. the population count) in each BG by stratum.

Step 1: Stage 1 Sample

In the first sampling step, we select two BG’s per stratum. These are selected without replacement proportional to their size (number of individuals in each BG), using systematic PPS selection.

# sample two BGs per stratum
n_per_bg <- 2
strat_count <- frame %>%
  nest(-STRATUM, -BG) %>%
  group_by(STRATUM) %>%
  mutate(fp1 = n_distinct(BG),
         Nh = map_int(data, nrow), 
         pi1 = Nh / sum(Nh) * n_per_bg) %>%
  ungroup %>%
  split(.$STRATUM)

# select first stage sample
set.seed(402)
s <- map(strat_count, ~UPsystematic(.x$pi1))
stage1 <- map2_dfr(strat_count, s, ~.x[as.logical(.y), ])

# print data
stage1

## # A tibble: 10 x 6
##    STRATUM BG    data                  fp1    Nh   pi1
##    <chr>   <chr> <list>              <int> <int> <dbl>
##  1 1       4     <tibble [89 × 49]>      7    89 0.238
##  2 1       7     <tibble [110 × 49]>     7   110 0.294
##  3 2       10    <tibble [106 × 49]>     7   106 0.283
##  4 2       13    <tibble [109 × 49]>     7   109 0.291
##  5 3       16    <tibble [100 × 49]>     7   100 0.267
##  6 3       19    <tibble [108 × 49]>     7   108 0.288
##  7 4       25    <tibble [115 × 49]>     7   115 0.307
##  8 4       28    <tibble [116 × 49]>     7   116 0.310
##  9 5       30    <tibble [110 × 49]>     7   110 0.294
## 10 5       34    <tibble [114 × 49]>     7   114 0.305

Step 2: Stage 2 Sample

The following step is to select a simple random sample without replacement (srswor) of 10% of the persons within each sample BG. The list of selected persons (IDs) within each BG along with all other data for the selected persons is displayed.

# select second stage sample
set.seed(429)
pi2 <- 0.1
stage2 <- stage1 %>%
  mutate(s = map(data, sample_frac, 
                 size = pi2, replace = FALSE),
         pi2 = pi2) %>%
  unnest(s)

stage2

## # A tibble: 109 x 55
##    STRATUM BG      fp1    Nh   pi1   pi2 ID    SRVY_YR HHX   REGION SEX  
##    <chr>   <chr> <int> <int> <dbl> <dbl> <chr> <chr>   <chr> <chr>  <chr>
##  1 1       4         7    89 0.238 0.100 387   2015    23177 1      2    
##  2 1       4         7    89 0.238 0.100 331   2015    61340 1      1    
##  3 1       4         7    89 0.238 0.100 375   2015    44589 1      2    
##  4 1       4         7    89 0.238 0.100 409   2015    32467 1      2    
##  5 1       4         7    89 0.238 0.100 333   2015    85235 1      2    
##  6 1       4         7    89 0.238 0.100 410   2015    36684 1      2    
##  7 1       4         7    89 0.238 0.100 407   2015    8584  1      1    
##  8 1       4         7    89 0.238 0.100 384   2015    83089 1      2    
##  9 1       4         7    89 0.238 0.100 335   2015    13724 1      2    
## 10 1       7         7   110 0.294 0.100 672   2015    24612 3      1    
## # ... with 99 more rows, and 44 more variables: HISPAN_I <chr>,
## #   RACERPI2 <chr>, AGE_P <chr>, R_MARITL <chr>, DOINGLWA <chr>,
## #   WRKCATA <chr>, HYPEV <chr>, CHLEV <chr>, CHDEV <chr>, ANGEV <chr>,
## #   MIEV <chr>, STREV <chr>, AASMEV <chr>, CANEV <chr>, DIBEV <chr>,
## #   AHAYFYR <chr>, JNTSYMP <chr>, ARTH1 <chr>, CTSEVER <chr>,
## #   PAINLB <chr>, LBPSEV <chr>, AMIGR <chr>, HRAIDNOW <chr>,
## #   AVISION <chr>, WKDAYR <chr>, AHSTATYR <chr>, AFLHCA17 <chr>,
## #   SMKEV <chr>, SMKNOW <chr>, CIGSDA1 <chr>, CIGSDA2 <chr>,
## #   CIGQTYR <chr>, VIGTP <chr>, ALC12MNO <chr>, AWEIGHTP <chr>,
## #   AHEIGHT <chr>, BMI <chr>, ASISLEEP <chr>, ASISLPFL <chr>,
## #   ASISLPST <chr>, ASISAD <chr>, age_grp <chr>, sex <chr>, work <chr>

Step 3: Calculate Probabilites

Nest, we calculate sampling probabilities and add them to the sample data. Selection probabilities and weights were calculated by expanding the first stage probabilities (pi1) by the second stage probabilities (pi2). The final probability is stored as a variable called pik and its inverse, basewt gives the base weight.

# define sampling probabilities
sam <- stage2 %>%
  mutate(pik = pi1 * pi2,
         basewt = 1 / pik) %>%
  arrange(STRATUM, BG)

The sampling probabilities are as follows:

# show probabilites and pop totals by stratum and BG
sample_n(select(sam, STRATUM, BG, fp1, Nh, pi1, pi2, pik, basewt), 10)

## # A tibble: 10 x 8
##    STRATUM BG      fp1    Nh   pi1   pi2    pik basewt
##    <chr>   <chr> <int> <int> <dbl> <dbl>  <dbl>  <dbl>
##  1 5       30        7   110 0.294 0.100 0.0294   34.0
##  2 1       4         7    89 0.238 0.100 0.0238   42.0
##  3 3       16        7   100 0.267 0.100 0.0267   37.4
##  4 3       19        7   108 0.288 0.100 0.0288   34.7
##  5 1       7         7   110 0.294 0.100 0.0294   34.0
##  6 2       13        7   109 0.291 0.100 0.0291   34.3
##  7 5       30        7   110 0.294 0.100 0.0294   34.0
##  8 3       16        7   100 0.267 0.100 0.0267   37.4
##  9 2       13        7   109 0.291 0.100 0.0291   34.3
## 10 4       28        7   116 0.310 0.100 0.0310   32.2

Quality Control Checks

We can do some quality control checks to verify that our realized sample conforms to basic expectations:

The sum of BG selection probabilities within each stratum

# the sum and mean
sam %>%
  distinct(STRATUM, BG, pi1) %>%
  group_by(STRATUM) %>%
  summarise(sum = sum(pi1),
            mean = mean(pi1))

## # A tibble: 5 x 3
##   STRATUM   sum  mean
##   <chr>   <dbl> <dbl>
## 1 1       0.532 0.266
## 2 2       0.575 0.287
## 3 3       0.555 0.278
## 4 4       0.618 0.309
## 5 5       0.599 0.299

The sum of first stage probabilities is not a very intuitive measure, but should theoretically be close to 4/7 per stratum (this is the case). The mean propbabilites per stratum should be roughly 2/7 each, as each stratum has a fixed BG population size of 7, with 2 sampled. This is also the case.

The sum of the base weights for sample BGs and specification of what that sum represents in the population.

# sample sum of baseweights per BG. 
sam %>%
  group_by(BG) %>%
  summarise(basewt_sum = sum(basewt)) %>%
  arrange(BG)

## # A tibble: 10 x 2
##    BG    basewt_sum
##    <chr>      <dbl>
##  1 10          388.
##  2 13          377.
##  3 16          374.
##  4 19          381.
##  5 25          390.
##  6 28          387.
##  7 30          374.
##  8 34          361.
##  9 4           378.
## 10 7           374.

This represents the number of ultimate elements (respondents) in the sampled BG, plus an inflation factor for those BGs not in the frame. If we take the same of this basewt_sum variable (3786), it is roughly equal to the population total (3741).

The sum of the base weights for sample persons and what it represents in the population.

# sum of base weights, N-hat
sum(sam$basewt)

## [1] 3785.716

# compare with actual pop total, N
nrow(frame)

## [1] 3741

The two figures \(\hat{N}\) and \(N\) are close, so the sample passes this check. The figure represents the total number of individuals in the population.

Final Sample

Satisfied with the result, I list the sampled block groups, sampled persons.

# print BGs in sample
cat(unique(sam$BG), sep = ", ")

## 4, 7, 10, 13, 16, 19, 25, 28, 30, 34

# print persons in sample
cat(sam$ID, sep = ", ")

## 387, 331, 375, 409, 333, 410, 407, 384, 335, 672, 701, 681, 662, 733, 693, 724, 715, 683, 709, 698, 1042, 977, 1023, 1041, 1027, 1021, 1012, 1039, 979, 1035, 986, 1312, 1322, 1321, 1389, 1387, 1360, 1354, 1342, 1358, 1292, 1347, 1630, 1693, 1659, 1625, 1688, 1699, 1673, 1626, 1609, 1623, 2021, 1948, 2026, 1985, 1967, 1969, 1955, 1928, 2008, 1950, 2001, 2628, 2574, 2581, 2613, 2591, 2630, 2590, 2599, 2575, 2610, 2636, 2646, 2891, 2981, 2934, 2894, 2906, 2948, 2936, 2954, 2905, 2972, 2890, 2966, 3206, 3114, 3139, 3164, 3159, 3176, 3147, 3170, 3182, 3155, 3211, 3558, 3599, 3610, 3560, 3618, 3554, 3620, 3608, 3627, 3543, 3589

The sample can now be written to disk:

# write to disk
write.csv(sam, "~/Downloads/mysample.csv")

The csv file contains: a. BG identification code (BG) b. person ID (ID) c. selection probability of the BG containing each person (pi1) d. selection probability of the person within its BG (pi2) e. overall selection probability of the person (pik) f. base weight for each person (basewt) g. all data fields for each person

Marginal Analysis

Here, we estimate the proportion and population counts of persons who were ever told they had cancer (CANEV = Yes).

# population total and proportion
group_by(sam, CANEV == "1") %>% 
  summarise(n = n(),
            total = sum(basewt)) %>%
  mutate(proportion = total / sum(total))

## # A tibble: 2 x 4
##   `CANEV == "1"`     n total proportion
##   <lgl>          <int> <dbl>      <dbl>
## 1 FALSE             97 3376.      0.892
## 2 TRUE              12  409.      0.108

From the sample, we estimate that 409 individuals in the frame, or 10.8%, have been told they have cancer.

Domain Analysis

We estimate same thing by age group, sex, socio-economic status.

# Totals and proportions by age group
sam %>%
  group_by(age_grp, CANEV) %>% 
  summarise(total = sum(basewt)) %>%
  mutate(proportion = total / sum(total))

## # A tibble: 6 x 4
## # Groups:   age_grp [3]
##   age_grp      CANEV  total proportion
##   <chr>        <chr>  <dbl>      <dbl>
## 1 18‐34        1       34.0     0.0335
## 2 18‐34        2      981.      0.967 
## 3 35‐64        1      104.      0.0604
## 4 35‐64        2     1617.      0.940 
## 5 65 and older 1      271.      0.259 
## 6 65 and older 2      778.      0.741

# Totals and proportions by sex
sam %>%
  group_by(sex, CANEV) %>% 
  summarise(total = sum(basewt)) %>%
  mutate(proportion = total / sum(total))

## # A tibble: 4 x 4
## # Groups:   sex [2]
##   sex    CANEV total proportion
##   <chr>  <chr> <dbl>      <dbl>
## 1 Female 1      274.     0.126 
## 2 Female 2     1895.     0.874 
## 3 Male   1      136.     0.0838
## 4 Male   2     1482.     0.916

# Totals and proportions by socioeconomic status
sam %>%
  group_by(work, CANEV) %>% 
  summarise(total = sum(basewt)) %>%
  mutate(proportion = total / sum(total))

## # A tibble: 8 x 4
## # Groups:   work [6]
##   work                                             CANEV  total proportion
##   <chr>                                            <chr>  <dbl>      <dbl>
## 1 A FEDERAL government employee                    2       67.8      1.00 
## 2 A LOCAL government employee                      2      151.       1.00 
## 3 A STATE government employee                      1       34.0      0.166
## 4 A STATE government employee                      2      170.       0.834
## 5 Employee of a PRIVATE company for wages          1      375.       0.123
## 6 Employee of a PRIVATE company for wages          2     2667.       0.877
## 7 Self-employed in OWN business, professional pra… 2      179.       1.00 
## 8 Unknown                                          2      142.       1.00

Levels that do no appear in the summary tables above are estimated unbiasedly at 0.

Note: the assignment rubric asks for domain analysis by ‘education’, however there is no such variable in the data. Instead, I have given a breakdown by socio-economic status (WRKCATA).

STRATUM	BG	n
1	1	109
1	2	97
1	3	119
1	4	89
1	5	115
1	6	109
1	7	110
2	10	106
2	11	117
2	12	103
2	13	109
2	14	105
2	8	107
2	9	101
3	15	104
3	16	100
3	17	113
3	18	106
3	19	108
3	20	111
3	21	107
4	22	98
4	23	108
4	24	104
4	25	115
4	26	109
4	27	98
4	28	116
5	29	109
5	30	110
5	31	92
5	32	107
5	33	118
5	34	114
5	35	98

STRATUM	BG	n
1	1	109
1	2	97
1	3	119
1	4	89
1	5	115
1	6	109
1	7	110
2	10	106
2	11	117
2	12	103
2	13	109
2	14	105
2	8	107
2	9	101
3	15	104
3	16	100
3	17	113
3	18	106
3	19	108
3	20	111
3	21	107
4	22	98
4	23	108
4	24	104
4	25	115
4	26	109
4	27	98
4	28	116
5	29	109
5	30	110
5	31	92
5	32	107
5	33	118
5	34	114
5	35	98