745 Project: Detroit Multi-Stage Sampling (3-Stage PPS Final)

# =================================================================
# FINAL DESIGN — 3-stage with PPS BG selection
#
# Both goals achieved EXACTLY [Lecture Slides 34-35]:
#
# Stage 1: Select m=40 PSUs (tracts) by PPS systematic
#   pi_1 = m * S_i / S+
#   S_i = sum_d f_d * Q_i(d)  [PSU composite MOS, HH residents]
#
# Stage 2: Select 1 BG per PSU by PPS systematic
#   pi_{2|i} = S_ij / S_i
#   S_ij = sum_d f_d * Q_ij(d)  [BG composite MOS, HH residents]
#   Small/zero BGs linked before selection (same feasibility logic)
#
# Stage 3: Person sampling within selected BG
#   pi_{3|ij}(d) = n_bar * f_d / S_ij
#   1 adult per HH (last-birthday); HH rate q_ij = n_bar / TotHH_ij
#
# SELF-WEIGHTING [EXACT, Slide 35]:
#   pi_1 * pi_{2|i} * pi_{3|ij}(d)
#   = (m*S_i/S+) * (S_ij/S_i) * (n_bar*f_d/S_ij)
#   = m*n_bar*f_d / S+ = f_d      [S_i and S_ij both cancel]
#
# EQUAL WORKLOAD [EXACT per selected BG, Slide 34]:
#   sum_d n*_ij(d) = sum_d Q_ij(d) * (n_bar*f_d/S_ij)
#                 = n_bar * S_ij / S_ij = n_bar
#   Since 1 BG selected per PSU, BG workload = PSU workload = n_bar
#
# GQ HANDLING:
#   N_i(d) and N_ij(d) count only HH residents (TotHH>0 BGs).
#   GQ BGs excluded from sampling frame entirely.
# =================================================================

library(readxl); library(readr);  library(dplyr)
library(stringr); library(tidyr); library(sampling); library(tibble)

1. Read Data

xlsx_url <- paste0(
  "https://raw.githubusercontent.com/LinneaLiny/",
  "-Multi-Stage-Sample-for-Detroit-Michigan-U.S.-Project/main/",
  "Detroit%20MI_Updated.xlsx")
fips_url <- paste0(
  "https://raw.githubusercontent.com/LinneaLiny/-Multi-Stage-Sample-",
  "for-Detroit-Michigan-U.S.-Project/main/",
  "Census%20Tract%20FIPS%20Code%20Detroit%20MI_Updated.csv")

tmp_xlsx <- tempfile(fileext = ".xlsx")
tmp_csv  <- tempfile(fileext = ".csv")
download.file(xlsx_url, tmp_xlsx, mode = "wb", quiet = TRUE)
download.file(fips_url, tmp_csv,  mode = "wb", quiet = TRUE)

tract_raw  <- read_excel(tmp_xlsx, sheet = "Tract")
bg_raw     <- read_excel(tmp_xlsx, sheet = "BlockGroup")
tract_fips <- read_csv(tmp_csv, show_col_types = FALSE) %>%
  rename(GEOID = 1) %>% mutate(GEOID = as.character(GEOID))

cat("Detroit tracts:", nrow(tract_fips), "\n")

## Detroit tracts: 276

2. Cleaning

tract <- tract_raw %>%
  mutate(Tract = as.character(Tract), tract_id = Tract) %>%
  filter(tract_id %in% tract_fips$GEOID)

bg <- bg_raw %>%
  mutate(BlockGroup = as.character(BlockGroup),
         tract_id   = str_sub(BlockGroup, 1, 11),
         bg_id      = BlockGroup) %>%
  filter(tract_id %in% tract_fips$GEOID)

3. Age-Group Counts

make_age_groups <- function(df) {
  df %>% mutate(
    age18_34 =
      Male18to19Yrs+Male20Yrs+Male21Yrs+Male22to24Yrs+
      Male25to29Yrs+Male30to34Yrs+Female18to19Yrs+Female20Yrs+
      Female21Yrs+Female22to24Yrs+Female25to29Yrs+Female30to34Yrs,
    age35_49 =
      Male35to39Yrs+Male40to44Yrs+Male45to49Yrs+
      Female35to39Yrs+Female40to44Yrs+Female45to49Yrs,
    age50_64 =
      Male50to54Yrs+Male55to59Yrs+Male60to61Yrs+Male62to64Yrs+
      Female50to54Yrs+Female55to59Yrs+Female60to61Yrs+Female62to64Yrs,
    age65plus =
      Male65to66Yrs+Male67to69Yrs+Male70to74Yrs+Male75to79Yrs+
      Male80to84Yrs+MaleGE85Yrs+Female65to66Yrs+Female67to69Yrs+
      Female70to74Yrs+Female75to79Yrs+Female80to84Yrs+FemaleGE85Yrs,
    age18plus = age18_34+age35_49+age50_64+age65plus)
}
tract <- make_age_groups(tract)
bg    <- make_age_groups(bg)

4. Build Frames — HH Residents Only

# Separate HH-BGs (TotHH>0) from GQ-BGs (TotHH=0, age18plus>0).
# Only HH-BGs enter the sampling frame and contribute to MOS.
# GQ residents cannot be reached via household listing.

bg_frame <- bg %>%
  transmute(bg_id, tract_id, NAME, TotHH, TotPerson,
            age18_34, age35_49, age50_64, age65plus, age18plus) %>%
  mutate(is_gq  = (TotHH == 0 & age18plus > 0),
         has_hh = (TotHH > 0))

cat("Total BGs:", nrow(bg_frame), "\n")

## Total BGs: 627

cat("HH BGs (TotHH>0):", sum(bg_frame$has_hh), "\n")

## HH BGs (TotHH>0): 612

cat("GQ BGs (TotHH=0, age18plus>0):", sum(bg_frame$is_gq), "\n\n")

## GQ BGs (TotHH=0, age18plus>0): 2

# HH-BG frame: the sampling frame for both PSU and BG stages
bg_hh <- bg_frame %>% filter(has_hh)

# Aggregate HH-resident counts to tract level for PSU MOS
tract_hh <- bg_hh %>%
  group_by(tract_id) %>%
  summarise(
    n_bg_hh      = n(),
    TotHH_tract  = sum(TotHH),
    age18_34_hh  = sum(age18_34),
    age35_49_hh  = sum(age35_49),
    age50_64_hh  = sum(age50_64),
    age65plus_hh = sum(age65plus),
    age18plus_hh = sum(age18plus),
    .groups = "drop")

tract_frame <- tract %>%
  transmute(tract_id, NAME, TotHH, TotPerson,
            age18_34, age35_49, age50_64, age65plus, age18plus) %>%
  left_join(tract_hh, by = "tract_id") %>%
  mutate(across(c(n_bg_hh, TotHH_tract,
                  age18_34_hh, age35_49_hh, age50_64_hh,
                  age65plus_hh, age18plus_hh),
                ~replace_na(., 0))) %>%
  arrange(tract_id)   # GEOID sort FIRST, before linkage

cat("=== HH residents per tract (age18plus_hh) ===\n")

## === HH residents per tract (age18plus_hh) ===

summary(tract_frame$age18plus_hh)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    1156    1702    1740    2310    4417

cat("\n=== HH-BGs per tract ===\n")

## 
## === HH-BGs per tract ===

print(table(tract_frame$n_bg_hh))

## 
##   0   1   2   3   4   5 
##   5  44 139  65  20   3

5. Design Parameters

n_psu <- 40

target_resp <- c(age18_34=250, age35_49=250, age50_64=250, age65plus=250)
resp_rate   <- c(age18_34=0.35, age35_49=0.45, age50_64=0.50, age65plus=0.65)

target_table <- tibble(
  age_group = names(target_resp),
  n_resp    = as.numeric(target_resp),
  rr        = as.numeric(resp_rate)) %>%
  mutate(n_selected = ceiling(n_resp / rr))

print(target_table)

## # A tibble: 4 × 4
##   age_group n_resp    rr n_selected
##   <chr>      <dbl> <dbl>      <dbl>
## 1 age18_34     250  0.35        715
## 2 age35_49     250  0.45        556
## 3 age50_64     250  0.5         500
## 4 age65plus    250  0.65        385

total_n <- sum(target_table$n_selected)   # S+ = 2156
m_bar   <- total_n / n_psu                # n_bar = 53.9

cat("\nS+ =", total_n, "| n_bar =", m_bar, "\n")

## 
## S+ = 2156 | n_bar = 53.9

6. Why 1 Person per Household

# Outcome variables are household-level or highly correlated within HH:
#   (1) HH vehicle ownership -> IDENTICAL for all HH members (rho_HH=1)
#   (2) Main transport mode  -> highly correlated within HH
#   (3) Skipped trips        -> correlated within HH
#   (4) Asked family for help-> correlated within HH
# Selecting 2+ per HH: DEFF_HH = 1+(k-1)*rho_HH >> 1 for these vars.
# Decision: 1 eligible adult per HH via Last-Birthday Method.
cat("1 adult per HH. Last-Birthday selection rule.\n")

## 1 adult per HH. Last-Birthday selection rule.

7. Same-Rate Design Discussion

N_d <- c(
  age18_34  = sum(tract_frame$age18_34_hh),
  age35_49  = sum(tract_frame$age35_49_hh),
  age50_64  = sum(tract_frame$age50_64_hh),
  age65plus = sum(tract_frame$age65plus_hh))

f_18_34  <- target_table$n_selected[target_table$age_group=="age18_34"]  / N_d["age18_34"]
f_35_49  <- target_table$n_selected[target_table$age_group=="age35_49"]  / N_d["age35_49"]
f_50_64  <- target_table$n_selected[target_table$age_group=="age50_64"]  / N_d["age50_64"]
f_65plus <- target_table$n_selected[target_table$age_group=="age65plus"] / N_d["age65plus"]
f_vals   <- c(f_18_34, f_35_49, f_50_64, f_65plus)
max_f    <- max(f_vals)
f_unif   <- total_n / sum(N_d)

tibble(
  age_group = names(N_d),
  N_d_hh    = as.integer(N_d),
  n_needed  = target_table$n_selected,
  rr        = as.numeric(resp_rate),
  f_d       = round(f_vals, 6),
  resp_unif = round(f_unif * as.integer(N_d) * as.numeric(resp_rate))) %>%
  print()

## # A tibble: 4 × 6
##   age_group N_d_hh n_needed    rr     f_d resp_unif
##   <chr>      <int>    <dbl> <dbl>   <dbl>     <dbl>
## 1 age18_34  161352      715  0.35 0.00443       254
## 2 age35_49  113271      556  0.45 0.00491       229
## 3 age50_64  117759      500  0.5  0.00425       264
## 4 age65plus  87738      385  0.65 0.00439       256

cat("\nA uniform rate cannot achieve 250 respondents per domain.",
    "Domain-specific rates f_d are required.\n")

## 
## A uniform rate cannot achieve 250 respondents per domain. Domain-specific rates f_d are required.

8. Composite MOS for PSUs and BGs

# PSU MOS: S_i = sum_d f_d * Q_i(d)  using HH-resident counts
# BG  MOS: S_ij = sum_d f_d * Q_ij(d) using HH-resident counts
#
# Feasibility [Slide 34]:
#   PSU: S_i  >= n_bar * max(f_d)
#   BG:  S_ij >= n_bar * max(f_d)  (same threshold, same formula)
# If a BG is too small, it is linked with adjacent BGs before selection.

min_feasible_MOS <- m_bar * max_f
cat("Feasibility threshold:", round(min_feasible_MOS, 6), "\n\n")

## Feasibility threshold: 0.264573

# PSU-level MOS (from HH tract aggregates)
tract_frame <- tract_frame %>%
  mutate(
    MOS_comp = f_18_34*age18_34_hh + f_35_49*age35_49_hh +
               f_50_64*age50_64_hh + f_65plus*age65plus_hh,
    flag_zero_hh  = age18plus_hh == 0,
    flag_lt2_bg   = n_bg_hh < 2,
    flag_infeas   = (MOS_comp < min_feasible_MOS) & !flag_zero_hh)

# BG-level MOS (HH residents only)
bg_hh <- bg_hh %>%
  mutate(MOS_bg = f_18_34*age18_34 + f_35_49*age35_49 +
                  f_50_64*age50_64 + f_65plus*age65plus)

cat("=== PSU composite MOS ===\n")

## === PSU composite MOS ===

summary(tract_frame$MOS_comp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   5.205   7.642   7.812  10.415  19.562

cat("\nSum S_i = S+ (should =", total_n, "):",
    round(sum(tract_frame$MOS_comp[!tract_frame$flag_zero_hh]), 2), "\n")

## 
## Sum S_i = S+ (should = 2156 ): 2156

cat("Tracts with zero HH adults:", sum(tract_frame$flag_zero_hh), "\n")

## Tracts with zero HH adults: 6

cat("Tracts with <2 HH-BGs:", sum(tract_frame$flag_lt2_bg & !tract_frame$flag_zero_hh), "\n")

## Tracts with <2 HH-BGs: 43

cat("Tracts with infeasible MOS:", sum(tract_frame$flag_infeas), "\n\n")

## Tracts with infeasible MOS: 4

cat("=== BG composite MOS ===\n")

## === BG composite MOS ===

summary(bg_hh$MOS_bg)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.417   3.328   3.523   4.361  11.509

cat("BGs with MOS < threshold:",
    sum(bg_hh$MOS_bg < min_feasible_MOS), "\n")

## BGs with MOS < threshold: 14

9. Stage 1: PSU Linkage and Frame

# Sort FIRST (Section 4), then link. Linked PSUs inherit GEOID order
# through sort_key = min(tract_id).

tract_frame <- tract_frame %>%
  mutate(needs_link = !flag_zero_hh & (flag_infeas | flag_lt2_bg))

cat("Tracts needing PSU linkage:", sum(tract_frame$needs_link), "\n\n")

## Tracts needing PSU linkage: 44

build_psu_map <- function(tract_df, min_MOS) {
  regular <- tract_df %>%
    filter(age18plus_hh > 0, !needs_link) %>%
    transmute(tract_id, psu_id = tract_id,
              psu_type = "single_tract", link_group = NA_integer_)

  pool <- tract_df %>%
    filter(age18plus_hh > 0, needs_link) %>%
    arrange(tract_id) %>%
    mutate(grp = ceiling(row_number() / 2))

  if (nrow(pool) == 0) return(regular)

  for (iter in seq_len(30)) {
    grp_mos <- pool %>% group_by(grp) %>%
      summarise(mos = sum(MOS_comp), .groups = "drop")
    bad <- grp_mos$grp[grp_mos$mos < min_MOS]
    if (length(bad) == 0) break
    for (b in bad) {
      all_g   <- sort(unique(pool$grp))
      idx     <- which(all_g == b)
      partner <- if (idx < length(all_g)) all_g[idx+1] else all_g[idx-1]
      pool$grp[pool$grp == partner] <- b
      pool$grp <- as.integer(factor(pool$grp,
                                    levels = sort(unique(pool$grp))))
    }
  }

  linked <- pool %>% group_by(grp) %>%
    mutate(
      psu_id   = paste0("LPSU_", first(tract_id), "_", last(tract_id)),
      psu_type = case_when(n()==1~"singleton_link",
                           n()==2~"paired_link",
                           TRUE ~paste0(n(),"tract_link"))) %>%
    ungroup() %>%
    transmute(tract_id, psu_id, psu_type, link_group = grp)

  bind_rows(regular, linked)
}

psu_map <- build_psu_map(tract_frame, min_feasible_MOS)
cat("=== PSU types ===\n"); print(psu_map %>% count(psu_type))

## === PSU types ===

## # A tibble: 3 × 2
##   psu_type         n
##   <chr>        <int>
## 1 4tract_link      4
## 2 paired_link     40
## 3 single_tract   226

psu_frame <- tract_frame %>%
  filter(age18plus_hh > 0) %>%
  left_join(psu_map, by = "tract_id") %>%
  group_by(psu_id, psu_type) %>%
  summarise(
    sort_key     = min(tract_id),
    n_tracts     = n(),
    n_bg_hh      = sum(n_bg_hh,      na.rm = TRUE),
    TotHH        = sum(TotHH_tract,  na.rm = TRUE),
    age18_34_hh  = sum(age18_34_hh,  na.rm = TRUE),
    age35_49_hh  = sum(age35_49_hh,  na.rm = TRUE),
    age50_64_hh  = sum(age50_64_hh,  na.rm = TRUE),
    age65plus_hh = sum(age65plus_hh, na.rm = TRUE),
    age18plus_hh = sum(age18plus_hh, na.rm = TRUE),
    age18_34     = sum(age18_34,     na.rm = TRUE),
    age35_49     = sum(age35_49,     na.rm = TRUE),
    age50_64     = sum(age50_64,     na.rm = TRUE),
    age65plus    = sum(age65plus,    na.rm = TRUE),
    MOS_psu      = sum(MOS_comp,     na.rm = TRUE),
    .groups = "drop") %>%
  arrange(sort_key)   # geographic order via first constituent GEOID

cat("Total PSUs:", nrow(psu_frame), "\n")

## Total PSUs: 247

cat("Sum MOS_psu = S+ (should =", total_n, "):",
    round(sum(psu_frame$MOS_psu), 2), "\n")

## Sum MOS_psu = S+ (should = 2156 ): 2156

n_bad <- sum(psu_frame$MOS_psu < min_feasible_MOS)
if (n_bad > 0) {
  cat("INFEASIBLE PSUs:\n")
  print(psu_frame %>% filter(MOS_psu < min_feasible_MOS) %>%
          dplyr::select(psu_id, psu_type, n_bg_hh, age18plus_hh, MOS_psu))
  stop("Linkage incomplete.")
}
cat("PSU feasibility check PASSED.\n")

## PSU feasibility check PASSED.

cat("\n=== PSU MOS ===\n"); summary(psu_frame$MOS_psu)

## 
## === PSU MOS ===

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3183  6.4253  8.4826  8.7287 10.8684 19.5623

cat("\n=== PSU types ===\n"); print(table(psu_frame$psu_type))

## 
## === PSU types ===

## 
##  4tract_link  paired_link single_tract 
##            1           20          226

10. Stage 1: PPS Systematic Sampling of PSUs

set.seed(-77)

pik_psu <- inclusionprobabilities(psu_frame$MOS_psu, n = n_psu)
s_psu   <- UPsystematic(pik_psu)

psu_frame <- psu_frame %>%
  mutate(pik_psu = pik_psu, selected_psu = as.logical(s_psu))

psu_sample <- psu_frame %>% filter(selected_psu)

cat("PSUs selected:", nrow(psu_sample), "\n")

## PSUs selected: 40

cat("sum(pik_psu) =", round(sum(pik_psu),4), "(should =", n_psu, ")\n\n")

## sum(pik_psu) = 40 (should = 40 )

cat("=== MOS of selected PSUs ===\n"); summary(psu_sample$MOS_psu)

## === MOS of selected PSUs ===

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.747   7.809  10.140  10.109  12.127  19.562

cat("\n=== pi_1 of selected PSUs ===\n"); summary(psu_sample$pik_psu)

## 
## === pi_1 of selected PSUs ===

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.06951 0.14487 0.18812 0.18756 0.22498 0.36294

cat("\n=== Types ===\n"); print(table(psu_sample$psu_type))

## 
## === Types ===

## 
##  paired_link single_tract 
##            6           34

11. Stage 2: BG Linkage and PPS Systematic Sampling

# Within each selected PSU, select 1 BG by PPS systematic using S_ij.
# Before selection, BGs too small (S_ij < threshold) are linked with
# adjacent BGs within the same PSU (GEOID order within PSU).
#
# pi_{2|i} = S_ij / S_i   =>  pi_{2|i} * S_i = S_ij
#
# This gives the key cancellation:
#   pi_1 * pi_{2|i} * pi_{3|ij}(d)
#   = (m*S_i/S+) * (S_ij/S_i) * (n_bar*f_d/S_ij) = f_d  EXACT

selected_tract_ids <- psu_map %>%
  filter(psu_id %in% psu_sample$psu_id) %>%
  pull(tract_id)

# BG frame for selected PSUs (HH-BGs only, sorted by GEOID within PSU)
bg_in_sample <- bg_hh %>%
  filter(tract_id %in% selected_tract_ids) %>%
  left_join(psu_map %>% dplyr::select(tract_id, psu_id), by = "tract_id") %>%
  left_join(psu_frame %>% dplyr::select(psu_id, MOS_psu), by = "psu_id") %>%
  arrange(psu_id, bg_id)   # GEOID sort within PSU before linkage

cat("HH-BGs in sampled PSUs:", nrow(bg_in_sample), "\n")

## HH-BGs in sampled PSUs: 97

cat("BGs with MOS_bg < threshold:",
    sum(bg_in_sample$MOS_bg < min_feasible_MOS), "\n\n")

## BGs with MOS_bg < threshold: 2

# BG linkage within each PSU (same iterative logic as PSU linkage)
build_bg_map <- function(bg_df, min_MOS) {
  bg_df %>%
    group_by(psu_id) %>%
    group_modify(function(df, key) {
      df <- df %>% arrange(bg_id) %>% mutate(grp = row_number())

      # Mark BGs needing linkage
      df <- df %>% mutate(needs_link_bg = MOS_bg < min_MOS | age18plus == 0)

      # Iterative linkage within PSU
      pool_idx <- which(df$needs_link_bg)
      if (length(pool_idx) == 0) {
        return(df %>% mutate(bg_oper_id = bg_id, bg_oper_type = "single_bg"))
      }

      # Simple approach: pair small BGs with next BG in GEOID order
      assigned <- rep(NA_integer_, nrow(df))
      grp_id   <- 1L
      i <- 1L
      while (i <= nrow(df)) {
        if (df$needs_link_bg[i]) {
          # Pair with next row
          j <- i + 1L
          if (j <= nrow(df)) {
            assigned[i] <- grp_id; assigned[j] <- grp_id
            i <- j + 1L
          } else {
            # Last row, pair with previous
            assigned[i] <- grp_id - 1L
            i <- i + 1L
          }
        } else {
          assigned[i] <- grp_id
          i <- i + 1L
        }
        grp_id <- grp_id + 1L
      }
      assigned[is.na(assigned)] <- max(assigned, na.rm=TRUE) + 1L

      df %>%
        mutate(
          bg_grp = assigned,
          bg_oper_id = ave(bg_id, bg_grp, FUN=function(x)
            if(length(x)==1) x[1] else paste0("LBG_",x[1],"_",x[length(x)])),
          bg_oper_type = ave(bg_id, bg_grp, FUN=function(x)
            if(length(x)==1) "single_bg" else "linked_bg"))
    }) %>%
    ungroup()
}

bg_in_sample_linked <- build_bg_map(bg_in_sample, min_feasible_MOS)

# Aggregate to operational BG level
bg_oper_frame <- bg_in_sample_linked %>%
  group_by(psu_id, MOS_psu, bg_oper_id, bg_oper_type) %>%
  summarise(
    n_bg_combined = n(),
    TotHH         = sum(TotHH),
    age18_34      = sum(age18_34),
    age35_49      = sum(age35_49),
    age50_64      = sum(age50_64),
    age65plus     = sum(age65plus),
    age18plus     = sum(age18plus),
    MOS_bg_oper   = sum(MOS_bg),   # S_ij for this operational BG
    .groups = "drop") %>%
  arrange(psu_id, bg_oper_id)

cat("Operational BGs in sampled PSUs:", nrow(bg_oper_frame), "\n")

## Operational BGs in sampled PSUs: 95

cat("BGs still infeasible after linkage:",
    sum(bg_oper_frame$MOS_bg_oper < min_feasible_MOS), "\n\n")

## BGs still infeasible after linkage: 0

cat("=== BG MOS after linkage ===\n")

## === BG MOS after linkage ===

summary(bg_oper_frame$MOS_bg_oper)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.362   3.200   3.975   4.257   5.109  11.509

# PPS systematic selection: 1 operational BG per PSU
# pi_{2|i} = S_ij / S_i  (inclusion prob proportional to BG MOS)
set.seed(-77 + 1)

bg_sample_list <- bg_oper_frame %>%
  group_by(psu_id) %>%
  group_map(function(df, key) {
    n_here  <- nrow(df)
    S_i_val <- df$MOS_psu[1]
    if (n_here == 1) {
      df %>% mutate(pik_bg = df$MOS_bg_oper / S_i_val,
                    selected_bg = TRUE)
    } else {
      pik <- inclusionprobabilities(df$MOS_bg_oper, n = 1)
      sel <- as.logical(UPsystematic(pik))
      df  %>% mutate(pik_bg = pik, selected_bg = sel)
    }
  }, .keep = TRUE) %>%
  bind_rows()

bg_sample <- bg_sample_list %>% filter(selected_bg)

cat("\nBGs selected:", nrow(bg_sample), "(should be", n_psu, ")\n")

## 
## BGs selected: 40 (should be 40 )

cat("\n=== pi_{2|i} = S_ij/S_i of selected BGs ===\n")

## 
## === pi_{2|i} = S_ij/S_i of selected BGs ===

summary(bg_sample$pik_bg)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1832  0.3710  0.4973  0.4826  0.5741  1.0000

cat("\n=== MOS_bg of selected BGs (S_ij) ===\n")

## 
## === MOS_bg of selected BGs (S_ij) ===

summary(bg_sample$MOS_bg_oper)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.965   3.359   4.436   4.645   5.536  11.509

cat("\n=== TotHH of selected BGs ===\n")

## 
## === TotHH of selected BGs ===

summary(bg_sample$TotHH)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   260.0   452.2   590.0   619.9   701.8  1429.0

12. Stage 3: Person Sampling Rates

# pi_{3|ij}(d) = n_bar * f_d / S_ij
# where S_ij = MOS_bg_oper of the selected BG
#
# Field operation:
#   List all HHs in selected BG.
#   Sample HHs at composite rate q_ij = n_bar / TotHH_selected_BG.
#   Select 1 adult per sampled HH (last-birthday method).

sample_design <- bg_sample %>%
  mutate(
    # Person-stage rates: pi_{3|ij}(d) = n_bar * f_d / S_ij
    p3_18_34  = m_bar * f_18_34  / MOS_bg_oper,
    p3_35_49  = m_bar * f_35_49  / MOS_bg_oper,
    p3_50_64  = m_bar * f_50_64  / MOS_bg_oper,
    p3_65plus = m_bar * f_65plus / MOS_bg_oper,

    # Cap at 1 (take-all for very small BGs)
    p3_18_34_cap  = pmin(p3_18_34,  1),
    p3_35_49_cap  = pmin(p3_35_49,  1),
    p3_50_64_cap  = pmin(p3_50_64,  1),
    p3_65plus_cap = pmin(p3_65plus, 1),

    takeall_18_34  = p3_18_34  > 1,
    takeall_35_49  = p3_35_49  > 1,
    takeall_50_64  = p3_50_64  > 1,
    takeall_65plus = p3_65plus > 1,

    # HH sampling rate for field use: q_ij = n_bar / TotHH_BG
    hh_rate = pmin(m_bar / TotHH, 1),

    # Expected workload per selected BG [n_bar * S_ij / S_ij = n_bar]
    exp_n_18_34  = age18_34  * p3_18_34_cap,
    exp_n_35_49  = age35_49  * p3_35_49_cap,
    exp_n_50_64  = age50_64  * p3_50_64_cap,
    exp_n_65plus = age65plus * p3_65plus_cap,
    exp_n_total  = exp_n_18_34 + exp_n_35_49 + exp_n_50_64 + exp_n_65plus)

cat("=== Take-all check ===\n")

## === Take-all check ===

cat("age18_34 :", sum(sample_design$takeall_18_34),  "BGs\n")

## age18_34 : 0 BGs

cat("age35_49 :", sum(sample_design$takeall_35_49),  "BGs\n")

## age35_49 : 0 BGs

cat("age50_64 :", sum(sample_design$takeall_50_64),  "BGs\n")

## age50_64 : 0 BGs

cat("age65plus:", sum(sample_design$takeall_65plus), "BGs\n")

## age65plus: 0 BGs

cat("\n=== HH sampling rates (q_ij = n_bar/TotHH) ===\n")

## 
## === HH sampling rates (q_ij = n_bar/TotHH) ===

summary(sample_design$hh_rate)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.03772 0.07688 0.09146 0.09883 0.11918 0.20731

cat("NA count:", sum(is.na(sample_design$hh_rate)), "(should be 0)\n")

## NA count: 0 (should be 0)

13. Verify Both Goals

sample_final <- sample_design %>%
  left_join(psu_frame %>% dplyr::select(psu_id, pik_psu),
            by = "psu_id") %>%
  mutate(
    # Overall: pi_1 * pi_{2|i} * pi_{3|ij}(d) = f_d  [EXACT]
    p_overall_18_34  = pik_psu * pik_bg * p3_18_34_cap,
    p_overall_35_49  = pik_psu * pik_bg * p3_35_49_cap,
    p_overall_50_64  = pik_psu * pik_bg * p3_50_64_cap,
    p_overall_65plus = pik_psu * pik_bg * p3_65plus_cap,
    w_base_18_34  = 1 / p_overall_18_34,
    w_base_35_49  = 1 / p_overall_35_49,
    w_base_50_64  = 1 / p_overall_50_64,
    w_base_65plus = 1 / p_overall_65plus)

cat("=== GOAL 1: SELF-WEIGHTING ===\n")

## === GOAL 1: SELF-WEIGHTING ===

cat("pi_1 * pi_{2|i} * pi_{3|ij}(d) should equal f_d exactly\n\n")

## pi_1 * pi_{2|i} * pi_{3|ij}(d) should equal f_d exactly

for (ag in c("18_34","35_49","50_64","65plus")) {
  col <- paste0("p_overall_", ag)
  fd  <- switch(ag,"18_34"=f_18_34,"35_49"=f_35_49,
                   "50_64"=f_50_64,"65plus"=f_65plus)
  v   <- sample_final[[col]]
  ok  <- max(abs(v - fd)/fd, na.rm=TRUE) < 0.001
  cat(sprintf("  age%-6s: [%.6f, %.6f]  f_d=%.6f  %s\n",
              ag, min(v,na.rm=T), max(v,na.rm=T), fd,
              ifelse(ok, "✓ EXACT", "⚠ deviation")))
}

##   age18_34 : [0.004431, 0.004431]  f_d=0.004431  ✓ EXACT
##   age35_49 : [0.004909, 0.004909]  f_d=0.004909  ✓ EXACT
##   age50_64 : [0.004246, 0.004246]  f_d=0.004246  ✓ EXACT
##   age65plus: [0.004388, 0.004388]  f_d=0.004388  ✓ EXACT

cat("\nCV of base weights (0 = perfect self-weighting):\n")

## 
## CV of base weights (0 = perfect self-weighting):

sample_final %>%
  summarise(
    CV_18_34  = sd(w_base_18_34) /mean(w_base_18_34),
    CV_35_49  = sd(w_base_35_49) /mean(w_base_35_49),
    CV_50_64  = sd(w_base_50_64) /mean(w_base_50_64),
    CV_65plus = sd(w_base_65plus)/mean(w_base_65plus)) %>%
  print()

## # A tibble: 1 × 4
##   CV_18_34 CV_35_49 CV_50_64 CV_65plus
##      <dbl>    <dbl>    <dbl>     <dbl>
## 1 1.72e-16 1.20e-16 1.24e-16  1.49e-16

cat("\n=== GOAL 2: EQUAL WORKLOAD PER SELECTED BG/PSU ===\n")

## 
## === GOAL 2: EQUAL WORKLOAD PER SELECTED BG/PSU ===

cat("exp_n_total = sum_d Q_ij(d) * pi_{3|ij}(d) = n_bar * S_ij/S_ij = n_bar\n\n")

## exp_n_total = sum_d Q_ij(d) * pi_{3|ij}(d) = n_bar * S_ij/S_ij = n_bar

summary(sample_final$exp_n_total)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    53.9    53.9    53.9    53.9    53.9    53.9

cat("Target n_bar:", m_bar, "\n")

## Target n_bar: 53.9

cat("Deviation only from take-all capping (p3>1 => workload slightly < n_bar).\n\n")

## Deviation only from take-all capping (p3>1 => workload slightly < n_bar).

cat("exp_n_total for all 40 PSUs:\n")

## exp_n_total for all 40 PSUs:

sample_final %>%
  dplyr::select(psu_id, MOS_psu, MOS_bg_oper, pik_bg, exp_n_total) %>%
  mutate(across(where(is.numeric), ~round(., 4))) %>%
  print(n = Inf)

## # A tibble: 40 × 5
##    psu_id                       MOS_psu MOS_bg_oper pik_bg exp_n_total
##    <chr>                          <dbl>       <dbl>  <dbl>       <dbl>
##  1 26163500200                     9.41        6.67  0.709        53.9
##  2 26163500900                    11.6         5.19  0.448        53.9
##  3 26163501400                    12.1         5.58  0.462        53.9
##  4 26163501900                     7.76        3.37  0.435        53.9
##  5 26163505100                    10.1         4.17  0.413        53.9
##  6 26163506900                    11.5         3.41  0.297        53.9
##  7 26163508000                     3.75        3.75  1            53.9
##  8 26163511400                     7.48        3.76  0.502        53.9
##  9 26163513900                     5.21        2.73  0.524        53.9
## 10 26163515700                    19.6         7.42  0.379        53.9
## 11 26163516700                    11.6         3.11  0.268        53.9
## 12 26163517500                    11.2         5.52  0.492        53.9
## 13 26163520200                    15.8        11.5   0.730        53.9
## 14 26163523200                    10.2         3.21  0.315        53.9
## 15 26163524200                    13.1         4.87  0.372        53.9
## 16 26163525700                    13.2         4.86  0.368        53.9
## 17 26163526400                     4.35        2.06  0.474        53.9
## 18 26163530800                     4.88        2.90  0.593        53.9
## 19 26163531800                     5.05        2.59  0.512        53.9
## 20 26163533600                     4.66        1.97  0.422        53.9
## 21 26163534700                     9.93        2.52  0.254        53.9
## 22 26163535600                    13.0         2.37  0.183        53.9
## 23 26163536500                     7.18        4.08  0.568        53.9
## 24 26163538100                     8.07        4.06  0.503        53.9
## 25 26163538600                    16.2         3.69  0.227        53.9
## 26 26163539500                    10.7         3.48  0.327        53.9
## 27 26163540400                     9.08        4.77  0.526        53.9
## 28 26163540900                    11.8         3.32  0.280        53.9
## 29 26163541500                    13.0         6.92  0.534        53.9
## 30 26163542300                     7.83        4.83  0.617        53.9
## 31 26163543100                     8.58        4.65  0.543        53.9
## 32 26163545500                    11.7         4.22  0.360        53.9
## 33 26163545900                    11.4         5.78  0.507        53.9
## 34 26163546700                     9.38        5.68  0.606        53.9
## 35 LPSU_26163503400_26163503600    7.89        4.86  0.616        53.9
## 36 LPSU_26163506100_26163506200   12.3         6.19  0.505        53.9
## 37 LPSU_26163521800_26163522400    8.01        5.26  0.656        53.9
## 38 LPSU_26163537200_26163538800    7.49        5.46  0.729        53.9
## 39 LPSU_26163539100_26163539700   13.6         6.10  0.449        53.9
## 40 LPSU_26163544000_26163546301   14.9         8.91  0.598        53.9

14. Anticipated Precision

# Anticipated Precision — correct calculation
p0   <- 0.20    # planning proportion: public transit use
rho  <- 0.05    # assumed ICC within tracts (typical urban survey)

# Overall response rate (weighted by selections needed)
r_bar <- sum(target_table$n_resp) / sum(target_table$n_selected)
# = 1000 / 2156 = 0.4638

# Expected respondents per PSU
n_resp_per_psu <- m_bar * r_bar    # 53.9 * 0.4638 ≈ 25.0
n_resp_total   <- sum(target_resp) # 1000

# DEFF using respondents per PSU (correct denominator)
DEFF  <- 1 + (n_resp_per_psu - 1) * rho
n_eff <- n_resp_total / DEFF
SE    <- sqrt(p0 * (1 - p0) / n_eff)
MOE   <- 1.96 * SE

cat("=== Anticipated Precision ===\n")

## === Anticipated Precision ===

cat("Overall response rate (r_bar):", round(r_bar, 4), "\n")

## Overall response rate (r_bar): 0.4638

cat("Expected respondents per PSU:", round(n_resp_per_psu, 2), "\n")

## Expected respondents per PSU: 25

cat("DEFF =", round(DEFF, 3), "\n")

## DEFF = 2.2

cat("n_eff =", round(n_eff, 1), "\n")

## n_eff = 454.5

cat("SE(p_hat) =", round(SE, 4), "\n")

## SE(p_hat) = 0.0188

cat("95% MOE =", round(MOE, 4), "(", round(MOE*100, 2), "%)\n\n")

## 95% MOE = 0.0368 ( 3.68 %)

# Domain-level DEFF (each domain: 250 respondents over 40 PSUs)
n_resp_per_psu_per_domain <- n_resp_per_psu / 4  # = 6.25
DEFF_domain <- 1 + (n_resp_per_psu_per_domain - 1) * rho
cat("Domain-level DEFF (per age group):", round(DEFF_domain, 3), "\n")

## Domain-level DEFF (per age group): 1.262

cat("(For domain-specific estimates, e.g., transit use among 18-34)\n\n")

## (For domain-specific estimates, e.g., transit use among 18-34)

# Sensitivity table
tibble(rho = c(0.01, 0.03, 0.05, 0.10, 0.15)) %>%
  mutate(
    n_resp_psu = n_resp_per_psu,
    DEFF       = 1 + (n_resp_psu - 1) * rho,
    n_eff      = n_resp_total / DEFF,
    SE         = sqrt(p0 * (1 - p0) / n_eff),
    MOE_95pct  = round(1.96 * SE, 4)) %>%
  print()

## # A tibble: 5 × 6
##     rho n_resp_psu  DEFF n_eff     SE MOE_95pct
##   <dbl>      <dbl> <dbl> <dbl>  <dbl>     <dbl>
## 1  0.01         25  1.24  806. 0.0141    0.0276
## 2  0.03         25  1.72  581. 0.0166    0.0325
## 3  0.05         25  2.2   455. 0.0188    0.0368
## 4  0.1          25  3.4   294. 0.0233    0.0457
## 5  0.15         25  4.6   217. 0.0271    0.0532

15. Variance Estimation Guidance

cat("library(survey)\n")

## library(survey)

cat("design <- svydesign(\n")

## design <- svydesign(

cat("  ids     = ~psu_id + bg_oper_id,\n")

##   ids     = ~psu_id + bg_oper_id,

cat("  weights = ~w_base_d,\n")

##   weights = ~w_base_d,

cat("  data    = respondent_data\n")

##   data    = respondent_data

cat(")\n\n")

## )

cat("Both stages used systematic PPS => no unbiased variance estimator.\n")

## Both stages used systematic PPS => no unbiased variance estimator.

cat("Use successive-differences estimator or WR approximation.\n")

## Use successive-differences estimator or WR approximation.

16. Output Files

frame1 <- psu_frame %>%
  dplyr::select(psu_id, psu_type, sort_key, n_tracts, n_bg_hh,
                TotHH, age18plus_hh,
                age18_34_hh, age35_49_hh, age50_64_hh, age65plus_hh,
                MOS_psu, pik_psu, selected_psu)
write.csv(frame1, "/tmp/FRAME1_PSU.csv", row.names = FALSE)

frame2 <- bg_sample_list %>%
  dplyr::select(psu_id, bg_oper_id, n_bg_combined,
                TotHH, age18plus, age18_34, age35_49, age50_64, age65plus,
                MOS_bg_oper, pik_bg, selected_bg)
write.csv(frame2, "/tmp/FRAME2_BG.csv", row.names = FALSE)

# psu_type lives in psu_frame; bg_oper_type lives in bg_sample.
# Join them in before selecting for output.
sample_out <- sample_final %>%
  left_join(psu_frame  %>% dplyr::select(psu_id, psu_type),
            by = "psu_id") %>%
  left_join(bg_sample  %>% dplyr::select(psu_id, bg_oper_id, bg_oper_type),
            by = c("psu_id", "bg_oper_id")) %>%
  dplyr::select(
    psu_id, psu_type, bg_oper_id, 
    TotHH, age18plus, age18_34, age35_49, age50_64, age65plus,
    MOS_psu, pik_psu, MOS_bg_oper, pik_bg,
    p3_18_34, p3_35_49, p3_50_64, p3_65plus,
    p_overall_18_34, p_overall_35_49, p_overall_50_64, p_overall_65plus,
    w_base_18_34, w_base_35_49, w_base_50_64, w_base_65plus,
    hh_rate, exp_n_18_34, exp_n_35_49, exp_n_50_64, exp_n_65plus, exp_n_total,
    takeall_18_34, takeall_35_49, takeall_50_64, takeall_65plus)
write.csv(sample_out, "/tmp/SAMPLE.csv", row.names = FALSE)

cat("FRAME1_PSU.csv:", nrow(frame1), "rows\n")

## FRAME1_PSU.csv: 247 rows

cat("FRAME2_BG.csv :", nrow(frame2), "rows\n")

## FRAME2_BG.csv : 95 rows

cat("SAMPLE.csv    :", nrow(sample_out), "rows\n\n")

## SAMPLE.csv    : 40 rows

cat("=== Sample preview ===\n")

## === Sample preview ===

sample_out %>%
  dplyr::select(psu_id, psu_type, bg_oper_id, 
                TotHH, pik_psu, pik_bg,
                hh_rate, w_base_18_34, exp_n_total) %>%
  mutate(across(where(is.numeric), ~round(., 4))) %>%
  print(n = Inf)

## # A tibble: 40 × 9
##    psu_id          psu_type bg_oper_id TotHH pik_psu pik_bg hh_rate w_base_18_34
##    <chr>           <chr>    <chr>      <dbl>   <dbl>  <dbl>   <dbl>        <dbl>
##  1 26163500200     single_… 261635002…   764  0.175   0.709  0.0705         226.
##  2 26163500900     single_… 261635009…   661  0.215   0.448  0.0815         226.
##  3 26163501400     single_… 261635014…   652  0.224   0.462  0.0827         226.
##  4 26163501900     single_… 261635019…   502  0.144   0.435  0.107          226.
##  5 26163505100     single_… 261635051…   689  0.187   0.413  0.0782         226.
##  6 26163506900     single_… 261635069…   430  0.213   0.297  0.125          226.
##  7 26163508000     single_… LBG_26163…   629  0.0695  1      0.0857         226.
##  8 26163511400     single_… 261635114…   625  0.139   0.502  0.0862         226.
##  9 26163513900     single_… 261635139…   453  0.0966  0.524  0.119          226.
## 10 26163515700     single_… 261635157…  1429  0.363   0.379  0.0377         226.
## 11 26163516700     single_… 261635167…   465  0.216   0.268  0.116          226.
## 12 26163517500     single_… 261635175…   921  0.208   0.492  0.0585         226.
## 13 26163520200     single_… 261635202…   415  0.293   0.730  0.130          226.
## 14 26163523200     single_… 261635232…   347  0.189   0.315  0.155          226.
## 15 26163524200     single_… 261635242…   547  0.243   0.372  0.0985         226.
## 16 26163525700     single_… LBG_26163…   570  0.245   0.368  0.0946         226.
## 17 26163526400     single_… 261635264…   260  0.0808  0.474  0.207          226.
## 18 26163530800     single_… 261635308…   545  0.0906  0.593  0.0989         226.
## 19 26163531800     single_… 261635318…   465  0.0936  0.512  0.116          226.
## 20 26163533600     single_… 261635336…   414  0.0864  0.422  0.130          226.
## 21 26163534700     single_… 261635347…   360  0.184   0.254  0.150          226.
## 22 26163535600     single_… 261635356…   299  0.240   0.183  0.180          226.
## 23 26163536500     single_… 261635365…   644  0.133   0.568  0.0837         226.
## 24 26163538100     single_… 261635381…   522  0.150   0.503  0.103          226.
## 25 26163538600     single_… 261635386…   414  0.301   0.227  0.130          226.
## 26 26163539500     single_… 261635395…   450  0.198   0.327  0.120          226.
## 27 26163540400     single_… 261635404…   610  0.168   0.526  0.0884         226.
## 28 26163540900     single_… 261635409…   378  0.220   0.280  0.143          226.
## 29 26163541500     single_… 261635415…   980  0.240   0.534  0.055          226.
## 30 26163542300     single_… 261635423…   679  0.145   0.617  0.0794         226.
## 31 26163543100     single_… 261635431…   546  0.159   0.543  0.0987         226.
## 32 26163545500     single_… 261635455…   545  0.217   0.360  0.0989         226.
## 33 26163545900     single_… 261635459…   826  0.212   0.507  0.0653         226.
## 34 26163546700     single_… 261635467…   740  0.174   0.606  0.0728         226.
## 35 LPSU_261635034… paired_… 261635036…   643  0.146   0.616  0.0838         226.
## 36 LPSU_261635061… paired_… 261635061…   744  0.228   0.505  0.0724         226.
## 37 LPSU_261635218… paired_… 261635218…   901  0.149   0.656  0.0598         226.
## 38 LPSU_261635372… paired_… 261635388…   678  0.139   0.729  0.0795         226.
## 39 LPSU_261635391… paired_… 261635391…   804  0.252   0.449  0.067          226.
## 40 LPSU_261635440… paired_… 261635440…  1250  0.276   0.598  0.0431         226.
## # ℹ 1 more variable: exp_n_total <dbl>

library(tigris)
library(sf)
library(leaflet)
library(ggplot2)
options(tigris_class    = "sf")
options(tigris_use_cache = TRUE)

# Download Wayne County shapefiles
wayne_tracts <- tracts(state = "MI", county = "Wayne", year = 2020,
                       progress_bar = FALSE)
wayne_bgs    <- block_groups(state = "MI", county = "Wayne", year = 2020,
                             progress_bar = FALSE)

# Filter to Detroit only
detroit_tracts_sf <- wayne_tracts %>%
  right_join(tract_fips, by = "GEOID")

detroit_bgs_sf <- wayne_bgs %>%
  mutate(GEOIDt = substr(GEOID, 1, 11)) %>%
  filter(GEOIDt %in% tract_fips$GEOID)

# Project to WGS84
detroit_tracts_trans <- st_transform(detroit_tracts_sf, 4326)
detroit_bgs_trans    <- st_transform(detroit_bgs_sf,    4326)

cat("Detroit tracts in shapefile:", nrow(detroit_tracts_trans), "\n")

## Detroit tracts in shapefile: 276

cat("Detroit BGs in shapefile:",    nrow(detroit_bgs_trans),    "\n")

## Detroit BGs in shapefile: 627

ggplot() +
  geom_sf(data  = detroit_bgs_trans,
          fill      = NA,
          linetype  = "dotted",
          linewidth = 0.3,
          color     = "grey50") +
  geom_sf(data  = detroit_tracts_trans,
          fill      = NA,
          color     = "black",
          linewidth  = 0.7) +
  labs(title   = "Detroit Tracts (Solid) and Block Groups (Dotted)",
       caption = "Source: 2020 Census TIGER/Line Shapefiles") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 12))

Figure 1: City of Detroit Census Tracts (solid) and Block Groups (dotted)

# ---------------------------------------------------------------
# FIX for 3-stage design:
#
# In 745_final_3stage.Rmd the relevant objects are:
#   psu_map        -> maps tract_id to psu_id
#   psu_sample     -> the 40 selected PSUs (has psu_id)
#   bg_sample      -> the 40 selected BGs (has bg_oper_id, NOT bg_id directly)
#   bg_in_sample_linked -> original bg_id rows with their bg_oper_id
#
# The GEOID column in tigris block_groups() is 12-digit (= original bg_id).
# bg_oper_id may be "LBG_..." for linked BGs, which won't match tigris.
# Solution: trace back from bg_sample$bg_oper_id to original bg_id(s)
#           via bg_in_sample_linked, then match those to tigris GEOID.
# ---------------------------------------------------------------

# Step 1: selected PSU tract GEOIDs
selected_tract_geoids <- psu_map %>%
  filter(psu_id %in% psu_sample$psu_id) %>%
  pull(tract_id)

# Step 2: map selected bg_oper_ids back to original 12-digit bg_ids
#   bg_in_sample_linked has columns: bg_id, bg_oper_id, psu_id
selected_oper_ids <- bg_sample$bg_oper_id

listing_bg_geoids <- bg_in_sample_linked %>%
  filter(psu_id  %in% psu_sample$psu_id,
         bg_oper_id %in% selected_oper_ids) %>%
  pull(bg_id) %>%
  unique()

cat("Selected PSU tracts:", length(selected_tract_geoids), "\n")

## Selected PSU tracts: 46

cat("Listing BG original GEOIDs:", length(listing_bg_geoids), "\n")

## Listing BG original GEOIDs: 42

# Mark flags on spatial data
detroit_tracts_map <- detroit_tracts_trans %>%
  mutate(selected_psu  = GEOID %in% selected_tract_geoids)

detroit_bgs_map <- detroit_bgs_trans %>%
  mutate(is_listing_bg = GEOID %in% listing_bg_geoids)

selected_tracts_sf <- detroit_tracts_map %>% filter(selected_psu)
listing_bgs_sf     <- detroit_bgs_map    %>% filter(is_listing_bg)

cat("Tracts matched as selected:", nrow(selected_tracts_sf),
    "(should be close to", length(selected_tract_geoids), ")\n")

## Tracts matched as selected: 46 (should be close to 46 )

cat("BGs matched as listing:",     nrow(listing_bgs_sf),
    "(may be > 40 if any listing BG spans a linked operational unit)\n")

## BGs matched as listing: 42 (may be > 40 if any listing BG spans a linked operational unit)

leaflet() %>%
  addTiles() %>%

  # Base: all Detroit tracts
  addPolygons(
    data        = detroit_tracts_trans,
    color       = "grey40",
    weight      = 0.8,
    fillOpacity = 0.0,
    popup       = ~NAME,
    group       = "All Tracts") %>%

  # Selected PSUs (blue)
  addPolygons(
    data        = selected_tracts_sf,
    color       = "blue",
    weight      = 2,
    fillColor   = "blue",
    fillOpacity = 0.25,
    popup       = ~paste0("<b>Selected PSU</b><br>GEOID: ", GEOID,
                          "<br>", NAME),
    group       = "Selected PSUs (Blue)") %>%

  # Listing BGs (red)
  addPolygons(
    data        = listing_bgs_sf,
    color       = "darkred",
    weight      = 2,
    fillColor   = "red",
    fillOpacity = 0.70,
    popup       = ~paste0("<b>Listing Block Group</b><br>GEOID: ", GEOID),
    group       = "Listing BGs (Red)") %>%

  addLayersControl(
    overlayGroups = c("All Tracts",
                      "Selected PSUs (Blue)",
                      "Listing BGs (Red)"),
    options = layersControlOptions(collapsed = FALSE)) %>%

  addLegend(
    position = "bottomright",
    colors   = c("blue", "red"),
    labels   = c("Selected PSU (Tract)", "Designated Listing BG"),
    title    = "Sample Units",
    opacity  = 0.8)

Figure 2: Selected Sample PSUs (blue) and Designated Listing Block Groups (red)

745 Project: Detroit Multi-Stage Sampling (3-Stage PPS Final)

XINYU LIN

2026-04-19

1. Read Data

2. Cleaning

3. Age-Group Counts

4. Build Frames — HH Residents Only

5. Design Parameters

6. Why 1 Person per Household

7. Same-Rate Design Discussion

8. Composite MOS for PSUs and BGs

9. Stage 1: PSU Linkage and Frame

10. Stage 1: PPS Systematic Sampling of PSUs

11. Stage 2: BG Linkage and PPS Systematic Sampling

12. Stage 3: Person Sampling Rates

13. Verify Both Goals

14. Anticipated Precision

15. Variance Estimation Guidance

16. Output Files