set.seed(123) # For reproducibility
# Simulate population data
population <- rnorm(10000, mean = 50, sd = 10) # Normal distribution centered at μ = 50 with σ = 10
mean(population)
## [1] 49.97628
sd(population)
## [1] 9.986366
# Draw a random sample
sample_data <- sample(population, 1000) # n = 1000
n <- length(sample_data)
# Descriptive statistics (you should always provide your readers with descriptive statistics)
mean(sample_data) # Mean
## [1] 49.29693
median(sample_data) # Median
## [1] 49.28208
var(sample_data) # Variance: measures how spread out the data points are around their mean
## [1] 99.07115
sd(sample_data) # Standard Deviation: the square root of the variance; measures spread around the mean in the data's original units
## [1] 9.953449
# Mode (most frequent value, rounded)
mode_value <- as.numeric(names(sort(table(round(sample_data)), decreasing = TRUE)[1]))
mode_value
## [1] 49
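A caveat: base \(\textsf{R}\)'s \(\textsf{mode()}\) has nothing to do with the statistical mode; it reports an object's storage mode, which is why we need the frequency-table one-liner above.
mode(sample_data) # Returns "numeric" (the storage mode), not the most frequent value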
# Sampling distribution & Central Limit Theorem (CLT)
sample_means <- replicate(1000, mean(sample(population, 100)))
hist(sample_means, main="CLT: Sampling Distribution of Means", col="lightblue")
mean(sample_means)
## [1] 50.00556
sd(sample_means)
## [1] 0.9851401
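The CLT also predicts the SD of the sampling distribution (the standard error of the mean) to be \(\sigma/\sqrt{n}\), which our simulation matches closely:
10 / sqrt(100) # Theoretical SE of the mean = 1; the empirical SD above is ~0.985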
# Point Estimate & Confidence Interval (CI)
x_bar <- mean(sample_data) # Read x bar
s <- sd(sample_data)
alpha <- 0.05 # 95% CI
t_crit <- qt(1 - alpha/2, df = n - 1) # qt() calculates the quantile for a given probability and degrees of freedom of the Student's t-distribution
margin_error <- t_crit * s / sqrt(n)
CI <- c(x_bar - margin_error, x_bar + margin_error)
CI
## [1] 48.67928 49.91459
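As a quick cross-check, base \(\textsf{R}\)'s \(\textsf{t.test()}\) computes the same t-based interval:
t.test(sample_data)$conf.int # Reproduces the interval computed by hand above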
# Standard Error (SE) of the sample mean
SE <- s / sqrt(n)
SE
## [1] 0.3147557
# Determining sample size (n) for a given CI & margin of error (E)
E <- 1.5 # Desired margin of error
z_crit <- qnorm(0.975) # 95% confidence
sigma <- 10 # Estimated population SD
n_required <- (z_crit * sigma / E)^2
n_required
## [1] 170.7315
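Since \(n\) must be a whole number of respondents, round the result up:
ceiling(n_required) # 171; rounding down would fail to deliver the desired margin of error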
# True population mean vs sample estimate
cat("True Mean:", mean(population), "\nSample Mean:", x_bar, "\n95% CI:", round(CI,2))
## True Mean: 49.97628
## Sample Mean: 49.29693
## 95% CI: 48.68 49.91
# How changes in the confidence level and the desired margin of error affect the required sample size (n)
### Effect of Margin of Error (E) and Confidence Level on Required Sample Size
# install.packages("ggplot2")
library(ggplot2)
sigma <- 10 # Assumed population SD
E_vals <- seq(0.5, 5, by = 0.25) # Range of desired margins of error 0.5 to 5 with an increment of 0.25
confidence_levels <- c(0.90, 0.95, 0.99) # Possible CI thresholds
# Compute sample size for each combination of E and CI
sample_size_df <- expand.grid(E = E_vals, CI = confidence_levels)
sample_size_df$n <- with(sample_size_df, {
z <- qnorm(1 - (1 - CI)/2) # z critical value
(z * sigma / E)^2 # formula: n = (z*sigma/E)^2
})
# Plot
ggplot(sample_size_df, aes(x = E, y = n, color = factor(CI))) +
geom_line(linewidth = 1.2) +
scale_y_log10() + # log scale for clarity
labs(
title = "Sample Size vs. Margin of Error and CI",
x = "Margin of Error (E)",
y = "Required Sample Size (log scale)",
color = "Confidence Level"
) +
theme_minimal(base_size = 13)
# The takeaway: as the margin of error decreases (or the confidence level increases), the required sample size increases.
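Because \(n\) grows with \(1/E^{2}\), halving the margin of error quadruples the required sample size. A quick check at 95% confidence with the same \(\sigma = 10\):
(qnorm(0.975) * 10 / 2)^2 # E = 2 requires n of about 96
(qnorm(0.975) * 10 / 1)^2 # E = 1 requires n of about 384, four times as many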
Let’s generate an example dataset of 25,000 observations.
set.seed(123) # For reproducibility
N <- 25000
# 1. Respondent ID
ID <- 1:N
# 2. Gender
Gender <- sample(c("male", "female"),
size = N,
replace = TRUE,
prob = c(0.495, 0.505))
# 3. Age cohort
Age <- sample(c("20-29", "30-39", "40-49", "50-59", "60-69", ">70"),
size = N,
replace = TRUE,
prob = c(0.15, 0.30, 0.25, 0.15, 0.10, 0.05))
# 4. Education level
Edu <- sample(c("elementary school", "high school", "some college", "grad school"),
size = N,
replace = TRUE,
prob = c(0.05, 0.20, 0.45, 0.30))
# 5. Marital status
Married <- sample(c("Yes", "No"),
size = N,
replace = TRUE,
prob = c(0.48, 0.52))
# 6. Party list preference
Party <- sample(c("KMT", "DPP", "TPP"),
size = N,
replace = TRUE,
prob = c(0.33, 0.36, 0.31))
# 7. Referendum voting choice
Referendum <- sample(c("Yes", "No"),
size = N,
replace = TRUE,
prob = c(0.53, 0.47))
# 8. City
City <- sample(c("Taipei", "New Taipei", "Taichung", "Taoyuan", "Kaohsiung", "Tainan"),
size = N,
replace = TRUE,
prob = c(0.19, 0.27, 0.23, 0.10, 0.18, 0.03))
# 9. Districts (randomly within city)
district_list <- list(
"Taipei" = c("大安區", "松山區", "內湖區", "大同區", "文山區"),
"New Taipei" = c("板橋區", "金山區", "瑞芳區", "平溪區", "永和區", "淡水區", "五股區"),
"Taichung" = c("中區", "西區", "北屯區", "沙鹿區", "大里區"),
"Taoyuan" = c("中壢區", "平鎮區", "楊梅區", "新屋區"),
"Kaohsiung" = c("小港區", "連雅區", "左營區", "三民區"),
"Tainan" = c("永康區", "中西區")
)
District <- sapply(City, function(cty) {
sample(district_list[[cty]], 1)
})
# 10. Source: Web / Cell / Phone (special condition)
Source <- rep(NA, N)
# Start with Web and Cell first
Source <- sample(c("Web", "Cell", "Phone"),
size = N,
replace = TRUE,
prob = c(0.65, 0.25, 0.10))
# Adjust: ensure 78% of "Phone" come from age groups 60-69 and >70
is_phone <- Source == "Phone"
phone_indices <- which(is_phone)
# Identify old-age respondents
old_age_indices <- which(Age %in% c("60-69", ">70"))
young_age_indices <- which(!(Age %in% c("60-69", ">70")))
# Number of phone respondents
n_phone <- length(phone_indices)
target_old_phone <- round(0.78 * n_phone)
# If not enough old respondents, sample all
if(length(old_age_indices) < target_old_phone){
target_old_phone <- length(old_age_indices)
}
# Assign phones preferentially to old-age respondents
old_phone_ids <- sample(old_age_indices, target_old_phone)
remaining_phone_ids <- sample(young_age_indices, n_phone - target_old_phone)
phone_final_ids <- c(old_phone_ids, remaining_phone_ids)
# Reassign "Phone" source properly
Source <- rep(NA, N)
Source[phone_final_ids] <- "Phone"
# Fill remaining with Web/Cell by given proportion (within leftover)
remaining_ids <- setdiff(1:N, phone_final_ids)
Source[remaining_ids] <- sample(c("Web", "Cell"),
size = length(remaining_ids),
replace = TRUE,
prob = c(0.65/(0.65+0.25), 0.25/(0.65+0.25)))
# 11. Household size
Household <- sample(1:5,
size = N,
replace = TRUE,
prob = c(0.11, 0.18, 0.25, 0.40, 0.06))
# 12. Phone number (10 digits)
Phone <- rep(NA, N)
# Define area code map
area_code <- c("Taipei"="02", "New Taipei"="02", # Region code
"Taoyuan"="03", "Taichung"="04",
"Tainan"="06", "Kaohsiung"="07")
# Assign phone numbers only for Source == "Phone"
for (i in which(Source == "Phone")) {
prefix <- area_code[City[i]]
if (is.na(prefix)) prefix <- sample(c("02","03","04","06","07"), 1)
suffix <- sprintf("%08d", sample(0:99999999, 1)) # sprintf() zero-pads a random suffix to exactly 8 digits
Phone[i] <- paste0(prefix, suffix)
}
# Combine into a single data frame
survey_data <- data.frame(
ID, Gender, Age, Edu, Married, Party, Referendum, City, District,
Source, Household, Phone,
stringsAsFactors = FALSE
)
# Inspect distribution summary
str(survey_data) # A list of column names and their variable class
## 'data.frame': 25000 obs. of 12 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : chr "female" "male" "female" "male" ...
## $ Age : chr "40-49" "60-69" "60-69" "50-59" ...
## $ Edu : chr "some college" "some college" "some college" "grad school" ...
## $ Married : chr "No" "No" "Yes" "No" ...
## $ Party : chr "KMT" "DPP" "TPP" "DPP" ...
## $ Referendum: chr "Yes" "Yes" "Yes" "No" ...
## $ City : chr "Taipei" "Taipei" "Taipei" "Kaohsiung" ...
## $ District : chr "內湖區" "內湖區" "文山區" "小港區" ...
## $ Source : chr "Web" "Cell" "Web" "Cell" ...
## $ Household : int 4 3 1 1 1 4 2 4 4 1 ...
## $ Phone : chr NA NA NA NA ...
table(survey_data$Source)
##
## Cell Phone Web
## 6276 2526 16198
table(survey_data$City)
##
## Kaohsiung New Taipei Taichung Tainan Taipei Taoyuan
## 4534 6697 5779 790 4657 2543
table(survey_data$Age)
##
## >70 20-29 30-39 40-49 50-59 60-69
## 1273 3775 7607 6297 3616 2432
Sampling from a population such that every observation has an equal, non-zero probability of being chosen for the sample. Let’s draw a sample of size = 1000.
set.seed(2025) # For reproducibility
# Population size
N <- nrow(survey_data)
# Sample size
n <- 1000
# Draw a random sample of IDs; note that sample() is applied inside the data frame subsetting and that replace = FALSE
srs_sample <- survey_data[sample(survey_data$ID, n, replace = FALSE), ]
# Check sample size
nrow(srs_sample)
## [1] 1000
SRS is the simplest sampling method, but how representative is a sample drawn with this simple method, compared to the true population parameter? A simple yardstick would be the standard error (SE):
\[\begin{equation*} SE(\hat{p}) = \sqrt{\frac{\hat{p}(1 - \hat{p})}{n}}\cdot\sqrt{\frac{N - n}{N - 1}}, \end{equation*}\]
where \(\hat{p}\) is the sample proportion, \(n\) = sample size, \(N\) = population size, and the second square-root term is usually called the finite population correction (FPC).
Let’s use the percentage vote for the Democratic Progressive Party (DPP) as the \(\hat{p}\) and calculate the SE of this sample.
p_hat <- mean(srs_sample$Party== "DPP")
p_hat
## [1] 0.35
SE <- sqrt( (p_hat * (1 - p_hat)) / n ) * sqrt( (N - n) / (N - 1) )
SE
## [1] 0.01477866
And the 95% confidence interval (CI) can be calculated as: \[\begin{equation*} CI = \hat{p} \pm z_{0.975}{\times}SE. \end{equation*}\]
z <- 1.96
CI_lower <- p_hat - z * SE
CI_upper <- p_hat + z * SE
cat("Estimated support for Referendum (Yes):", round(p_hat, 3), "\n")
## Estimated support for Referendum (Yes): 0.35
cat("Standard Error:", round(SE, 4), "\n")
## Standard Error: 0.0148
cat("95% Confidence Interval: [", round(CI_lower, 3), ",", round(CI_upper, 3), "]\n")
## 95% Confidence Interval: [ 0.321 , 0.379 ]
Note that in \(\textsf{survey_data}\) the proportion of the population that voted for the DPP is 0.36, so our sample proportion captures this population parameter well, differing by about 1 percentage point, within one standard error (\(\pm\) 1.48 percentage points).
Selecting a sample from a population by choosing every k-th individual from a list, starting at a random point. The “sampling interval” (k) is determined by dividing the population size (N) by the desired sample size (n). This fixed interval ensures that samples are evenly spread and can be more \(\textit{efficient}\) than SRS or other sampling methods.
Let’s draw a sample of size = 1000 using this method. Given our population size (N = 25,000) and sample size (n = 1000), we will randomly pick a starting ID between 1 and 25, then take every 25th case.
set.seed(2025)
# Population size and desired sample size
N <- nrow(survey_data)
n <- 1000
k <- floor(N / n) # Sampling interval. floor() returns the largest integer that is less than or equal to the value passed to it
# Random start between 1 and k
start <- sample(1:k, 1)
# Selected indices
sys_indices <- seq(from = start, to = N, by = k)
# Ensure we get exactly n observations
sys_indices <- sys_indices[1:n]
# Extract sample
sys_sample <- survey_data[sys_indices, ]
# Check sample size
nrow(sys_sample)
## [1] 1000
head(sys_sample$ID)
## [1] 13 38 63 88 113 138
We then compute the \(\hat{p}\) and SE of this sampling strategy.
p_hat_sys <- mean(sys_sample$Party== "DPP")
p_hat_sys
## [1] 0.35
SE_sys <- sqrt( (p_hat_sys * (1 - p_hat_sys)) / n ) * sqrt( (N - n) / (N - 1) )
SE_sys
## [1] 0.01477866
# Note: this SE uses the same approximation as SRS. Systematic sampling can have a lower or higher SE depending on the population ordering, but with a randomly ordered frame (as in our toy data) it behaves essentially like SRS, so the two SEs coincide.
How does the SE estimated from \(\textsf{sys_sample}\) compare to that of the SRS-generated sample?
true_p <- mean(survey_data$Party == "DPP")
cat("Systematic Sampling Results\n")
## Systematic Sampling Results
cat("Estimated support (p̂):", round(p_hat_sys, 3), "\n")
## Estimated support (p̂): 0.35
cat("Standard Error (approx.):", round(SE_sys, 4), "\n")
## Standard Error (approx.): 0.0148
cat("True population support:", round(true_p, 3), "\n")
## True population support: 0.36
cat("Absolute error:", round(abs(p_hat_sys - true_p), 4), "\n")
## Absolute error: 0.0102
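The caveat about frame ordering is easy to illustrate: deliberately sorting the frame before drawing turns systematic sampling into implicit stratification. A hypothetical re-draw (for illustration only, not part of the design above):
# Hypothetical illustration: sort the frame by Party, then draw systematically
ordered_pop <- survey_data[order(survey_data$Party), ]
sys_sorted <- ordered_pop[seq(from = sample(1:k, 1), to = N, by = k)[1:n], ]
mean(sys_sorted$Party == "DPP") # Evenly spaced picks hit each Party block in proportion
# to its size, so the estimate lands very close to the true 0.36 (implicit stratification)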
Stratified Sampling divides a population into distinct subgroups, or “strata,” based on shared characteristics, and then randomly samples from each stratum to ensure proportional representation in the final sample. This strategy ensures that key characteristics of the population are represented in the sample, leading to more accurate and less biased results compared to SRS.
Below, we will draw a random sample of size = 1000 from our toy data, stratified by Gender, Age, City, and District.
library(dplyr) # We need this package for group_by, mutate, and cur_group_id functions
set.seed(2025)
# Population size
N <- nrow(survey_data)
# Sample size
n <- 1000
# Get stratum sizes: count population size in each stratum
strata_sizes <- survey_data %>%
group_by(Gender, Age, City, District) %>%
summarise(N_h = n(), .groups = "drop") # Count N_h, the population size of each distinct combination of the grouping variables
# Compute proportional allocation
strata_sizes <- strata_sizes %>%
mutate(prop = N_h / N, # Calculate proportion %
n_h = prop * n)
# Round sample sizes and adjust to ensure total = 1000
strata_sizes$n_h <- floor(strata_sizes$n_h)
remainder <- n - sum(strata_sizes$n_h)
# Distribute remaining samples to strata with the largest fractional parts
frac_part <- (strata_sizes$prop * n) - strata_sizes$n_h
if (remainder > 0) {
add_index <- order(frac_part, decreasing = TRUE)[1:remainder]
strata_sizes$n_h[add_index] <- strata_sizes$n_h[add_index] + 1
}
# Double-check total
sum(strata_sizes$n_h)
## [1] 1000
# Draw the stratified sample
set.seed(2025)
stratified_sample <- survey_data %>%
group_by(Gender, Age, City, District) %>%
group_modify(~ {
n_h <- strata_sizes$n_h[
strata_sizes$Gender == .y$Gender &
strata_sizes$Age == .y$Age &
strata_sizes$City == .y$City &
strata_sizes$District == .y$District
]
if (length(n_h) == 0 || n_h == 0) return(tibble())
slice_sample(.x, n = min(n_h, nrow(.x))) # slice_sample() draws n_h rows at random within this stratum
}) %>%
ungroup()
# Verify sample size
nrow(stratified_sample)
## [1] 1000
# Preview
head(stratified_sample)
## # A tibble: 6 × 12
## Gender Age City District ID Edu Married Party Referendum Source
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr> <chr>
## 1 female 20-29 Kaohsiung 三民區 3231 some co… No TPP No Cell
## 2 female 20-29 Kaohsiung 三民區 20425 high sc… Yes TPP Yes Web
## 3 female 20-29 Kaohsiung 三民區 9185 high sc… No DPP No Web
## 4 female 20-29 Kaohsiung 三民區 6964 some co… Yes DPP Yes Cell
## 5 female 20-29 Kaohsiung 小港區 17352 grad sc… Yes TPP No Cell
## 6 female 20-29 Kaohsiung 小港區 4969 grad sc… Yes DPP Yes Cell
## # ℹ 2 more variables: Household <int>, Phone <chr>
How representative is the stratified sample at “stratum” level? Let’s compare the sample’s \(n_h\)s against their corresponding \(N_h\)s at the population level.
# library(dplyr)
### For a given strata
### Compute population proportions (N_h / N)
pop_strata <- survey_data %>%
group_by(Gender, Age, City, District) %>%
summarise(N_h = n(), .groups = "drop") %>%
mutate(pop_prop = N_h / sum(N_h))
### Compute sample proportions (n_h / n)
samp_strata <- stratified_sample %>%
group_by(Gender, Age, City, District) %>%
summarise(n_h = n(), .groups = "drop") %>%
mutate(samp_prop = n_h / sum(n_h))
### Merge and compare
compare_strata <- left_join(
pop_strata,
samp_strata,
by = c("Gender", "Age", "City", "District") # Join by these columns
) %>%
mutate(
samp_prop = ifelse(is.na(samp_prop), 0, samp_prop),
diff = samp_prop - pop_prop # Calculate sample vs. population difference
)
### Display 5 example strata (see the last column)
set.seed(42)
compare_strata %>%
sample_n(5) %>%
select(Gender, Age, City, District, pop_prop, samp_prop, diff)
## # A tibble: 5 × 7
## Gender Age City District pop_prop samp_prop diff
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 female 30-39 Taipei 文山區 0.00572 0.006 0.00028
## 2 male >70 Taoyuan 中壢區 0.00064 0.001 0.00036
## 3 female >70 Tainan 永康區 0.00024 0 -0.00024
## 4 female 40-49 Taipei 大同區 0.005 0.005 0
## 5 male 40-49 Taichung 中區 0.00524 0.005 -0.000240
The computation of the stratified-sample SE is a bit more involved. The sampling error of its estimated mean is:
\[\begin{equation*} SE_{strat} = \sqrt{\sum^{L}_{h=1}\left(\frac{N_h}{N}\right)^{2}\cdot\left(1 - \frac{n_{h}}{N_{h}}\right)\cdot\frac{s^{2}_{h}}{n_{h}}}, \end{equation*}\]
where \(L\) is the number of strata, \(N_h\) and \(n_h\) are the population and sample sizes of stratum \(h\), and \(s^{2}_{h}\) is the within-stratum variance.
### Compute p_hat (overall DPP support in stratified sample)
p_hat <- mean(stratified_sample$Party == "DPP")
cat("Estimated DPP proportion (p̂):", round(p_hat, 4), "\n")
## Estimated DPP proportion (p̂): 0.352
# Compute within-stratum statistics
strata_stats <- survey_data %>%
group_by(Gender, Age, City, District) %>%
summarise(
N_h = n(),
p_h = mean(Party == "DPP"),
s2_h = var(as.numeric(Party == "DPP")),
.groups = "drop"
) %>%
left_join(
strata_sizes %>% select(Gender, Age, City, District, n_h),
by = c("Gender", "Age", "City", "District")
) %>%
filter(!is.na(n_h) & n_h > 0 & N_h > 1) # Remove problematic strata (quota = 0)
# Replace NA variances (e.g., only one observation) with 0
strata_stats$s2_h[is.na(strata_stats$s2_h)] <- 0
# Compute stratified variance
var_strat <- sum(
((strata_stats$N_h / N)^2) *
(1 - strata_stats$n_h / strata_stats$N_h) *
(strata_stats$s2_h / strata_stats$n_h)
)
SE_strat <- sqrt(var_strat)
# Print
cat("Standard Error (SE):", round(SE_strat, 4), "\n")
## Standard Error (SE): 0.0149
# Pretty close to the SRS-based SE (0.0148)
Multi-Stage Cluster Sampling divides the population into hierarchical groups (clusters), and samples are drawn in a series of random steps from these progressively smaller clusters until a final sample of individual units is selected. This sampling strategy is used for large, geographically dispersed populations to make data collection more feasible, cost-effective, and faster by reducing the need to survey every member of the population.
Multi-stage cluster sampling is often paired with Probability Proportional to Size (PPS) sampling, which selects clusters, and then units within those clusters, at multiple stages, using PPS at one or more stages to give larger clusters a greater chance of being selected. The procedure generally starts by randomly selecting some administrative units (such as provinces or cities) according to PPS; then, within each unit, randomly selecting some sub-units (e.g., districts) according to PPS; and finally, within each sub-unit, randomly selecting individuals (participants) to assemble the final sample.
We demonstrate this sampling strategy with an example.
set.seed(2025)
### Set population and Target Sample
N <- nrow(survey_data)
n <- 1000
### First-stage PPS Sampling of Cities
# Compute city population sizes in order to compute PPS
city_sizes <- survey_data %>%
group_by(City) %>%
summarise(N_city = n(), .groups = "drop") %>%
mutate(prob_city = N_city / sum(N_city)) # PPS weights
# Number of cities to select (at least 4, you can do more, but we only have 6)
n_cities <- max(4, round(0.4 * nrow(city_sizes)))
# Select cities with PPS
selected_cities <- city_sizes %>%
slice_sample(n = n_cities, weight_by = prob_city, replace = FALSE)
cat("First-stage PPS - Selected Cities:\n")
## First-stage PPS - Selected Cities:
print(selected_cities)
## # A tibble: 4 × 3
## City N_city prob_city
## <chr> <int> <dbl>
## 1 Kaohsiung 4534 0.181
## 2 Taichung 5779 0.231
## 3 Taipei 4657 0.186
## 4 New Taipei 6697 0.268
### Second-stage PPS Sampling of Districts Within Selected Cities
# Compute district population sizes within selected cities
district_sizes <- survey_data %>%
filter(City %in% selected_cities$City) %>%
group_by(City, District) %>%
summarise(N_district = n(), .groups = "drop") %>%
group_by(City) %>%
mutate(prob_district = N_district / sum(N_district)) %>% # PPS within city
ungroup()
# Select 2 districts per city with PPS (or all districts, if a city has fewer than 2)
districts_per_city <- district_sizes %>%
group_by(City) %>%
summarise(n_districts = min(2, n()), .groups = "drop")
selected_districts <- district_sizes %>%
left_join(districts_per_city, by = "City") %>%
group_by(City) %>%
group_modify(~ slice_sample(.x, n = .x$n_districts[1],
weight_by = prob_district,
replace = FALSE)) %>%
ungroup()
cat("\nSecond-stage PPS - Selected Districts:\n")
##
## Second-stage PPS - Selected Districts:
print(selected_districts)
## # A tibble: 8 × 5
## City District N_district prob_district n_districts
## <chr> <chr> <int> <dbl> <dbl>
## 1 Kaohsiung 小港區 1100 0.243 2
## 2 Kaohsiung 三民區 1143 0.252 2
## 3 New Taipei 平溪區 929 0.139 2
## 4 New Taipei 板橋區 994 0.148 2
## 5 Taichung 西區 1136 0.197 2
## 6 Taichung 大里區 1166 0.202 2
## 7 Taipei 大安區 918 0.197 2
## 8 Taipei 松山區 878 0.189 2
### Third-stage SRS within Selected Districts (not PPS)
district_alloc <- survey_data %>%
semi_join(selected_districts, by = c("City", "District")) %>%
group_by(City, District) %>%
summarise(N_district = n(), .groups = "drop") %>%
mutate(prop = N_district / sum(N_district),
n_draw = round(prop * n))
# Adjust rounding so the district draws sum to exactly n = 1000
remainder <- n - sum(district_alloc$n_draw)
if (remainder > 0) {
add_index <- order(district_alloc$prop, decreasing = TRUE)[1:remainder]
district_alloc$n_draw[add_index] <- district_alloc$n_draw[add_index] + 1
}
multi_stage_sample <- survey_data %>%
semi_join(selected_districts, by = c("City", "District")) %>%
group_by(City, District) %>%
group_modify(~ {
n_draw <- district_alloc$n_draw[
district_alloc$City == .y$City &
district_alloc$District == .y$District
]
slice_sample(.x, n = min(n_draw, nrow(.x)))
}) %>%
ungroup()
cat("\nFinal multi-stage cluster sample size:\n")
##
## Final multi-stage cluster sample size:
print(nrow(multi_stage_sample))
## [1] 1000
To get a reliable SE from a complex two-stage PPS sampling design, the most feasible way is to use a simulation-based method (repeated application of the exact sampling procedure) and take the empirical SD of the resulting estimates. This avoids a messy derivation of inclusion probabilities for PPS-without-replacement at two stages, while still providing an estimated design variance for easy comparison against less sophisticated sampling methods.
### To do this, we first define a function that implements one draw of the multistage PPS design shown above
set.seed(123)
# First, set parameters
target_n <- 1000 # Final sample size
m_cities <- 4 # Number of cities selected in stage 1 (at least 4)
districts_per_city <- 2 # Number of districts selected per city (at least 2)
num_iter <- 500 # Number of repeated samples for simulation-based SE
# Define function
draw_multistage_sample <- function(pop_df,
target_n = 1000,
m_cities = 4,
districts_per_city = 2) {
# pop_df: population sampling frame (i.e., survey_data)
# First-stage: city sizes & PPS selection of cities
city_sizes <- pop_df %>%
group_by(City) %>%
summarise(N_city = n(), .groups = "drop") %>%
mutate(prob_city = N_city / sum(N_city))
# Enforce at least m_cities, but not more than available
m_cities_use <- min(m_cities, nrow(city_sizes))
selected_cities <- city_sizes %>%
slice_sample(n = m_cities_use, weight_by = prob_city, replace = FALSE)
# Second-stage: district sizes (within selected cities) & PPS selection
district_sizes <- pop_df %>%
filter(City %in% selected_cities$City) %>%
group_by(City, District) %>%
summarise(N_district = n(), .groups = "drop") %>%
group_by(City) %>%
mutate(prob_district = N_district / sum(N_district)) %>%
ungroup()
# For each selected city choose up to districts_per_city districts (if fewer exist, take all!)
# Compute how many to choose per city
districts_count <- district_sizes %>%
group_by(City) %>%
summarise(n_avail = n(), .groups = "drop") %>%
mutate(n_choose = pmin(districts_per_city, n_avail))
selected_districts <- district_sizes %>%
left_join(districts_count %>% select(City, n_choose), by = "City") %>%
group_by(City) %>%
group_modify(~ slice_sample(.x,
n = unique(.x$n_choose),
weight_by = .x$prob_district,
replace = FALSE)) %>%
ungroup()
# Third stage: allocate target_n to selected districts proportional to their N_district
district_alloc <- pop_df %>%
semi_join(selected_districts, by = c("City", "District")) %>%
group_by(City, District) %>%
summarise(N_district = n(), .groups = "drop") %>%
mutate(prop = N_district / sum(N_district),
n_draw = floor(prop * target_n))
# Rounding to total target_n
remainder <- target_n - sum(district_alloc$n_draw)
if (remainder > 0) {
idx_add <- order(district_alloc$prop, decreasing = TRUE)[1:remainder]
district_alloc$n_draw[idx_add] <- district_alloc$n_draw[idx_add] + 1
} else if (remainder < 0) {
# If we overshot due to floor/round, reduce largest props
idx_reduce <- order(district_alloc$prop, decreasing = TRUE)[1:(-remainder)]
district_alloc$n_draw[idx_reduce] <- pmax(0, district_alloc$n_draw[idx_reduce] - 1)
}
# Ensure sum is target_n
if (sum(district_alloc$n_draw) != target_n) {
stop("Allocation bug: sum of n_draw != target_n")
}
# Draw SRS within each selected district
sample_df <- pop_df %>%
semi_join(selected_districts, by = c("City", "District")) %>%
group_by(City, District) %>%
group_modify(~ {
n_draw <- district_alloc$n_draw[
district_alloc$City == .y$City &
district_alloc$District == .y$District
]
slice_sample(.x, n = min(n_draw, nrow(.x)))
}) %>%
ungroup()
# return the sample (and also the allocation metadata optionally)
list(sample = sample_df,
selected_cities = selected_cities,
selected_districts = selected_districts,
district_alloc = district_alloc)
}
# Sanity check!
one_draw <- draw_multistage_sample(survey_data,
target_n = target_n,
m_cities = m_cities,
districts_per_city = districts_per_city)
multi_stage_sample <- one_draw$sample
cat("Single-run sample size:", nrow(multi_stage_sample), "\n")
## Single-run sample size: 1000
### Estimate empirical SD
# p_hat from the single realization (using the same indicator, DPP proportion)
p_hat_single <- mean(multi_stage_sample$Party == "DPP")
cat("Single-run p̂ (DPP):", round(p_hat_single, 4), "\n\n")
## Single-run p̂ (DPP): 0.345
# Repeated simulation to get empirical SE
p_hats <- numeric(num_iter)
for (i in seq_len(num_iter)) {
draw <- draw_multistage_sample(survey_data,
target_n = target_n,
m_cities = m_cities,
districts_per_city = districts_per_city)
samp_i <- draw$sample
p_hats[i] <- mean(samp_i$Party == "DPP")
if (i %% 50 == 0) cat("Completed iteration:", i, "\n")
}
## Completed iteration: 50
## Completed iteration: 100
## Completed iteration: 150
## Completed iteration: 200
## Completed iteration: 250
## Completed iteration: 300
## Completed iteration: 350
## Completed iteration: 400
## Completed iteration: 450
## Completed iteration: 500
# Empirical results
mean_p_hat_sim <- mean(p_hats)
sd_p_hat_sim <- sd(p_hats) # Empirical SD
se_normal_ci <- c(mean_p_hat_sim - 1.96 * sd_p_hat_sim, # +-95% CI
mean_p_hat_sim + 1.96 * sd_p_hat_sim)
cat("Simulation results (", num_iter, "iterations ):\n", sep = "")
## Simulation results (500iterations ):
cat("Mean of p̂:", round(mean_p_hat_sim, 4), "\n")
## Mean of p̂: 0.3608
cat("Empirical SE (SD of p̂):", round(sd_p_hat_sim, 4), "\n")
## Empirical SE (SD of p̂): 0.0152
cat("Approx Normal 95% CI for mean p̂:", round(se_normal_ci[1],4), "to", round(se_normal_ci[2],4), "\n\n")
## Approx Normal 95% CI for mean p̂: 0.3311 to 0.3906
# Compare to true population proportion
true_p <- mean(survey_data$Party == "DPP")
cat("True population DPP proportion:", round(true_p, 4), "\n")
## True population DPP proportion: 0.3602
# Pretty damn good!
# Visualizing sampling distribution
library(ggplot2)
p_df <- data.frame(p_hat = p_hats)
ggplot(p_df, aes(x = p_hat)) +
geom_histogram(binwidth = 0.0025, color = "white", boundary = 0) +
geom_vline(xintercept = true_p, color = "red", linewidth = 1) +
labs(title = paste0("Sampling distribution of p̂ (multi-stage PPS, n = ", target_n, ")"),
subtitle = paste0("Empirical SD = ", round(sd_p_hat_sim,4),
"; mean p̂ = ", round(mean_p_hat_sim,4)),
x = "Sample proportion p̂ (DPP)",
y = "Frequency") +
theme_minimal()
With declining landline usage (though some beg to differ), many survey practitioners have proposed mixed-mode contact methods (online, mobile, landline phone) to recruit participants from a more diverse pool of the population. However, there is considerable heterogeneity in the demographic profiles of participants recruited through different venues. For example, online recruitment reaches primarily younger, working-age people, while participants recruited by landline tend to be older. Are these patterns borne out by our toy data?
To find out, let’s first draw a sample of size = 5000 with “Source” composition proportional to each source’s share of the total population. Then, for each “Source,” draw a PPS sample of size = 500 with selection probability proportional to age (using each cohort’s midpoint as the size measure), so that older respondents are more likely to be drawn.
set.seed(2025)
### Proportional allocation by "Source"
# Calculate proportions
source_share <- survey_data %>%
count(Source, name = "N") %>%
mutate(prop = N / sum(N),
n_draw = round(prop * 5000))
print(source_share) # Most of our sample were recruited online
## Source N prop n_draw
## 1 Cell 6276 0.25104 1255
## 2 Phone 2526 0.10104 505
## 3 Web 16198 0.64792 3240
### Draw a sample of size = 5000 with their "Source" proportional to each source's share of total population
# Also, adjust for rounding error
n <- 5000
diff_total <- n - sum(source_share$n_draw)
if (diff_total != 0) {
idx <- order(source_share$prop, decreasing = TRUE)[1:abs(diff_total)]
source_share$n_draw[idx] <- source_share$n_draw[idx] + sign(diff_total)
}
print(source_share)
## Source N prop n_draw
## 1 Cell 6276 0.25104 1255
## 2 Phone 2526 0.10104 505
## 3 Web 16198 0.64792 3240
## Draw proportional samples by "Source"
# Join n_draw before sampling
sample_prop <- survey_data %>%
left_join(source_share, by = "Source") %>%
group_by(Source) %>%
group_modify(~ {
n_draw_val <- unique(.x$n_draw)
if (is.na(n_draw_val) || length(n_draw_val) != 1) {
stop("n_draw missing or not unique for Source = ", unique(.x$Source))
}
slice_sample(.x, n = n_draw_val)
}) %>%
ungroup()
cat("\nProportional sample size:", nrow(sample_prop), "\n")
##
## Proportional sample size: 5000
# Add age weights
age_midpoints <- tibble(
Age = c("20-29", "30-39", "40-49", "50-59", "60-69", ">70"),
age_mid = c(25, 35, 45, 55, 65, 75)
)
survey_data_age <- survey_data %>%
left_join(age_midpoints, by = "Age")
### PPS sampling (of size = 500) within each source, weighted by age_mid
pps_sample <- survey_data_age %>%
group_by(Source) %>%
group_modify(~ slice_sample(.x, n = 500, weight_by = age_mid, replace = FALSE)) %>%
ungroup()
cat("Age-weighted PPS sample size:", nrow(pps_sample), "\n")
## Age-weighted PPS sample size: 1500
### Diagnostics: share of older respondents by Source
diagnostic_table <- pps_sample %>%
mutate(older = Age %in% c("60-69", ">70")) %>%
group_by(Source) %>%
summarise(
n = n(),
older_share = mean(older),
.groups = "drop"
) %>%
arrange(desc(older_share))
print(diagnostic_table)
## # A tibble: 3 × 3
## Source n older_share
## <chr> <int> <dbl>
## 1 Phone 500 0.842
## 2 Cell 500 0.126
## 3 Web 500 0.1
# Phone-based sample indeed consists primarily of older people!
We will use the same simulation-based approach to obtain an empirical SD in lieu of an analytical sample SE.
set.seed(2025)
# Parameters
n <- 5000 # Proportional draw by Source
pps_per_source <- 500 # PPS draws per Source
num_iter <- 500 # Number of simulation iterations (you can have more for better precision)
# Prepare age midpoints (ensure exact matching)
age_midpoints <- tibble(
Age = c("20-29", "30-39", "40-49", "50-59", "60-69", ">70"),
age_mid = c(25, 35, 45, 55, 65, 75)
)
# Append age_mid (and drop rows with missing Age mapping if any)
survey_data_age <- survey_data %>%
left_join(age_midpoints, by = "Age")
if (any(is.na(survey_data_age$age_mid))) {
warning("Some rows had NA age_mid; dropping them for PPS draws.")
survey_data_age <- survey_data_age %>% filter(!is.na(age_mid))
}
# Precompute Source proportions & integer allocations for the proportional draw
source_share <- survey_data %>%
count(Source, name = "N") %>%
mutate(prop = N / sum(N),
n_draw = round(prop * n))
# Rounding so sum(n_draw) == total_prop_n
diff_total <- n - sum(source_share$n_draw)
if (diff_total != 0) {
idx <- order(source_share$prop, decreasing = TRUE)[1:abs(diff_total)]
source_share$n_draw[idx] <- source_share$n_draw[idx] + sign(diff_total)
}
stopifnot(sum(source_share$n_draw) == n)
# Perform estimation on a single combined sample
draw_combined_sample_once <- function(pop_df) {
# 1) First-stage: Proportional draw by Source
prop_part <- pop_df %>%
left_join(source_share %>% select(Source, n_draw), by = "Source") %>%
group_by(Source) %>%
group_modify(~ slice_sample(.x, n = unique(.x$n_draw))) %>%
ungroup()
# 2) Second-stage: PPS draw (age-weighted) within each Source (fixed n = pps_per_source)
# Ensure no NA weights
pps_part <- pop_df %>%
group_by(Source) %>%
group_modify(~ {
this <- .x %>% filter(!is.na(age_mid))
# If available rows fewer than pps_per_source, sample all
n_avail <- nrow(this)
n_take <- min(pps_per_source, n_avail)
slice_sample(this, n = n_take, weight_by = age_mid, replace = FALSE)
}) %>%
ungroup()
combined <- bind_rows(prop_part, pps_part)
# If some Source had fewer than pps_per_source available, combined size may be < total_prop_n + num_sources*pps_per_source
# Compute p_hat for DPP
p_hat <- mean(combined$Party == "DPP")
list(sample = combined, p_hat = p_hat)
}
# Sanity check
one_draw <- draw_combined_sample_once(survey_data_age)
cat("Single-run combined sample size:", nrow(one_draw$sample), "\n")
## Single-run combined sample size: 6500
cat("Single-run p̂ (DPP):", round(one_draw$p_hat, 4), "\n\n")
## Single-run p̂ (DPP): 0.3503
# Repeated simulation to estimate SE
p_hats <- numeric(num_iter)
for (i in seq_len(num_iter)) {
res <- draw_combined_sample_once(survey_data_age)
p_hats[i] <- res$p_hat
if (i %% 50 == 0) cat("Completed iteration:", i, "\n")
}
## Completed iteration: 50
## Completed iteration: 100
## Completed iteration: 150
## Completed iteration: 200
## Completed iteration: 250
## Completed iteration: 300
## Completed iteration: 350
## Completed iteration: 400
## Completed iteration: 450
## Completed iteration: 500
mean_p_hat_sim <- mean(p_hats)
sd_p_hat_sim <- sd(p_hats) # Empirical SD
ci_lower <- mean_p_hat_sim - 1.96 * sd_p_hat_sim
ci_upper <- mean_p_hat_sim + 1.96 * sd_p_hat_sim
cat("Simulation summary (", num_iter, " iterations):\n", sep = "")
## Simulation summary (500 iterations):
cat("Mean of p̂:", round(mean_p_hat_sim, 4), "\n")
## Mean of p̂: 0.3594
cat("Empirical SE (SD of p̂):", round(sd_p_hat_sim, 4), "\n")
## Empirical SE (SD of p̂): 0.0054
cat("Normal-approx 95% CI for mean p̂: [", round(ci_lower,4), ", ", round(ci_upper,4), "]\n\n")
## Normal-approx 95% CI for mean p̂: [ 0.3488 , 0.3699 ]
# Compare to true population proportion
true_p <- mean(survey_data$Party == "DPP")
cat("True population DPP proportion:", round(true_p, 4), "\n")
## True population DPP proportion: 0.3602
# Still very precise
# Plot sampling distribution
# library(ggplot2)
p_df <- data.frame(p_hat = p_hats)
ggplot(p_df, aes(x = p_hat)) +
geom_histogram(binwidth = 0.0025, color = "white", boundary = 0) +
geom_vline(xintercept = true_p, color = "red", linewidth = 1) +
labs(title = paste0("Sampling distribution of p̂ (combined sample, n ≈ ", nrow(one_draw$sample), ")"),
subtitle = paste0("Empirical SD = ", round(sd_p_hat_sim,4), "; mean p̂ = ", round(mean_p_hat_sim,4)),
x = "Sample proportion p̂ (DPP)",
y = "Frequency") +
theme_minimal()
So how “accurate” are the samples produced by these methods, compared to the true population parameter? Let’s use \(\textbf{sampling error}\) and \(\textbf{margin of error}\) (MOE) as our evaluation metrics.
Sampling error is the inevitable difference between a statistic calculated from a sample (\(n\)) and the true parameter value of the population (\(N\)), \(\hat{p} - p\), for the quantity of interest (in our case, the proportion of respondents who voted for the DPP). Sampling error should be distinguished from the SE; the latter is the expected random variability under that design.
The MOE tells us by how many percentage points our sample estimate may differ from the true population parameter: it is the range of values below and above the sample estimate at a given confidence level. The MOE is estimated as:
\[\begin{equation*} MOE = z\cdot\sqrt{\frac{\hat{p}(1 - \hat{p})}{n}}. \end{equation*}\]
# The ground truth: population parameters
true_p <- mean(survey_data$Party == "DPP")
N <- nrow(survey_data)
### Compute p_hat and sampling error
est_sampling_error <- function(sample_df, name) {
if (!exists(deparse(substitute(sample_df))) || is.null(sample_df)) {
return(data.frame(Method = name, n = NA, p_hat = NA, SamplingError = NA))
}
n <- nrow(sample_df)
p_hat <- mean(sample_df$Party == "DPP")
sampling_error <- p_hat - true_p
data.frame(Method = name, n = n, p_hat = p_hat, SamplingError = sampling_error)
}
# Compute sampling error for each sampling method
results_SE <- bind_rows(
est_sampling_error(srs_sample, "Simple Random Sampling"),
est_sampling_error(sys_sample, "Systematic Sampling"),
est_sampling_error(stratified_sample, "Stratified Sampling"),
est_sampling_error(multi_stage_sample, "Multi-Stage PPS Sampling"),
est_sampling_error(pps_sample, "PPS by Source x Age Sampling")
)
print(results_SE)
## Method n p_hat SamplingError
## 1 Simple Random Sampling 1000 0.350 -0.0102
## 2 Systematic Sampling 1000 0.350 -0.0102
## 3 Stratified Sampling 1000 0.352 -0.0082
## 4 Multi-Stage PPS Sampling 1000 0.345 -0.0152
## 5 PPS by Source x Age Sampling 1500 0.352 -0.0082
# All five methods slightly underestimate the true share of DPP voters here (every sampling error is negative), but the differences are quite small.
### Estimate MOE (at 95% CI)
est_MOE <- function(sample_df, name, N, true_p, z = 1.96) {
if (is.null(sample_df)) return(data.frame(Method = name, n = NA, p_hat = NA, SamplingError = NA, SE = NA, MOE = NA))
n <- nrow(sample_df)
p_hat <- mean(sample_df$Party == "DPP")
sampling_error <- p_hat - true_p
# Standard error with finite population correction
SE <- sqrt((p_hat * (1 - p_hat)) / n * ((N - n) / (N - 1)))
# 95% Margin of error
MOE <- z * SE
data.frame(Method = name, n = n, p_hat = p_hat, SamplingError = sampling_error, SE = SE, MOE = MOE) # Also append sampling error and SE into this dataframe
}
# Apply to all five sampling methods
results_MOE <- bind_rows(
est_MOE(srs_sample, "Simple Random Sampling", N, true_p),
est_MOE(sys_sample, "Systematic Sampling", N, true_p),
est_MOE(stratified_sample, "Stratified Sampling", N, true_p),
est_MOE(multi_stage_sample, "Multi-Stage PPS Sampling", N, true_p),
est_MOE(pps_sample, "PPS by Source x Age Sampling", N, true_p)
)
print(results_MOE)
## Method n p_hat SamplingError SE MOE
## 1 Simple Random Sampling 1000 0.350 -0.0102 0.01477866 0.02896617
## 2 Systematic Sampling 1000 0.350 -0.0102 0.01477866 0.02896617
## 3 Stratified Sampling 1000 0.352 -0.0082 0.01479800 0.02900409
## 4 Multi-Stage PPS Sampling 1000 0.345 -0.0152 0.01472904 0.02886892
## 5 PPS by Source x Age Sampling 1500 0.352 -0.0082 0.01195600 0.02343376
# PPS by Source x Age Sampling has the smallest MOE (thanks to its larger n = 1500); the MOEs of the four n = 1000 designs are practically indistinguishable.
Another question to ask is how much more (or less) variance these sampling methods produce relative to simple random sampling of the same size. For this, we turn to a metric called the “Design Effect” (Deff), which compares the variance of a complex sample design to what it would have been under SRS of the same size:
\[\begin{equation*} D_{eff} = \frac{Var(\hat{p}_{design})}{Var(\hat{p}_{SRS})}. \end{equation*}\]
Since \(variance = SE^2\) (which we have already computed), we can equivalently compute:
\[\begin{equation*} D_{eff} = \left(\frac{SE_{design}}{SE_{SRS}}\right)^2. \end{equation*}\]
Hence, if \(D_{eff} > 1\), the complex design is less efficient (larger variance) than SRS of the same size; if \(D_{eff} < 1\), it is more efficient; and if \(D_{eff} = 1\), it performs just like SRS.
Let’s compare the relative performance of these sampling methods.
# Again, using the proportion of DPP voters as our population parameter
true_p <- mean(survey_data$Party == "DPP")
N <- nrow(survey_data)
# Use the results_MOE we obtained earlier
# Compute design effects (Deff) relative to SRS
SE_SRS <- results_MOE %>% filter(Method == "Simple Random Sampling") %>% pull(SE)
results_DE <- results_MOE %>%
mutate(
Deff = (SE / SE_SRS)^2,
`Sampling Error (%)` = round(SamplingError * 100, 2),
`SE` = round(SE * 100, 2),
`MOE 95%CI` = round(MOE * 100, 2),
`Deff` = round(Deff, 3)
) %>%
select(Method, n, `Sampling Error (%)`, `SE`, `MOE 95%CI`, Deff)
print(results_DE)
## Method n Sampling Error (%) SE MOE 95%CI Deff
## 1 Simple Random Sampling 1000 -1.02 1.48 2.90 1.000
## 2 Systematic Sampling 1000 -1.02 1.48 2.90 1.000
## 3 Stratified Sampling 1000 -0.82 1.48 2.90 1.003
## 4 Multi-Stage PPS Sampling 1000 -1.52 1.47 2.89 0.993
## 5 PPS by Source x Age Sampling 1500 -0.82 1.20 2.34 0.654
# Systematic sampling matches SRS (the baseline) exactly, and Stratified Sampling and Multi-Stage PPS Sampling perform nearly identically to it. PPS by Source x Age Sampling has a much smaller Deff, largely because of its larger sample size (n = 1500).
This analysis is related to another measure, called weighting loss (\(L_{w}\)), introduced by Kish (1965) in the context of proportionate stratified sampling:
\[\begin{equation*} L_{w}(\bar{y}) = \frac{\sigma^{2}(w)}{\bar{w}^{2}} = \left(\frac{\sum^{n}_{i=1}w_{i}^{2}}{(\sum^{n}_{i=1}w_{i})^{2}}\cdot n\right) - 1, \end{equation*}\]
which assumes that the weights \(w_{i}\) are unrelated to the survey variable of interest and that there is no clustering.
In short, it approximates the proportional increase in the variance of means and proportions due to unequal weights (hence “weighting loss”) when the above assumptions are met. So if a sampling design gives us a design effect of 1.35, then, by the above formula, the weighting loss (\(L_w\)) is 1.35 - 1 = 0.35. The smaller this statistic, the more efficient (and representative) the sampling design.
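For a concrete check, here is a minimal sketch of the computation on a hypothetical weight vector (the vector \(\textsf{w}\) below is made up for illustration):
# Kish's weighting loss for a hypothetical weight vector
w <- c(rep(1, 800), rep(2.5, 200)) # e.g., 200 cases up-weighted to correct non-response
L_w <- length(w) * sum(w^2) / (sum(w)^2) - 1
L_w # ~0.213: roughly a 21% increase in the variance due to unequal weights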
Random digit dialing (RDD) is a telephone survey method that uses software to generate phone numbers at random, including both listed and unlisted numbers, to create a representative sample of a population for polling. It differs from the use of phone books by selecting from the entire telephone number frame, ensuring that everyone with a phone has an equal chance of being contacted, which is crucial for general population studies.
RDD generates telephone numbers either by randomly creating sequences of digits within established geographic area codes and prefixes, or by using our now-familiar method of systematic sampling: divide the population frame (\(N\)) by the desired sample size (\(K\)) to get a fixed interval \(N/K\), randomly select a starting number within the first interval (say the \(k\)-th phone number on the list), and then pick every subsequent number one interval further down the list (\(k + N/K\), \(k + 2N/K\), and so on).
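Here is a minimal sketch of the first approach; the area-code prefixes and the 8-digit local-number length mirror our toy data and are assumptions for illustration:
# Hypothetical RDD generator: random 8-digit local numbers behind known area codes
rdd_numbers <- paste0(
sample(c("02", "03", "04", "06", "07"), 200, replace = TRUE), # Area codes in our frame
sprintf("%08d", sample.int(1e8, 200) - 1) # Random 8-digit local numbers (0-99,999,999)
)
head(rdd_numbers)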
Let’s experiment with the second method below.
# Filter observations with valid phone numbers in the "Phone" column
valid_phone <- survey_data %>%
filter(!is.na(Phone) & Phone != "")
n_valid <- nrow(valid_phone) # Should be 2523
n_sample <- 200 # Select 200 samples
# Get sampling interval
interval <- n_valid / n_sample # = 12.615
# Randomly choose a start between 1 and interval ---
set.seed(2025)
start <- sample(1:floor(interval), 1)
# Compute exact 200 indices using floor-based sequence (by the same interval)
sample_indices <- floor(start + (0:(n_sample - 1)) * interval)
# Adjust any 0 indices (just in case) and ensure within bounds
sample_indices <- sample_indices[sample_indices > 0 & sample_indices <= n_valid]
# Label the 200-sample dataset
sys_phone_sample <- valid_phone[sample_indices, ]
# Check sample size and inspect
nrow(sys_phone_sample)
## [1] 200
head(sys_phone_sample$Phone)
## [1] "0462067952" "0382510018" "0382688025" "0293218395" "0405531430"
## [6] "0270816571"
Are the 200 selected samples evenly spread across the 2,523 records? Let’s take a look!
summary(sample_indices)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.0 639.8 1268.5 1268.2 1896.2 2525.0
range(sample_indices)
## [1] 12 2525
diff_indices <- diff(sample_indices)
summary(diff_indices) # Average gap should be around 12–13
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.00 12.00 13.00 12.63 13.00 13.00
# Check even spacing visually
plot(sample_indices,
type = "h",
col = "steelblue",
lwd = 2,
main = "Systematic Sampling: 200 Samples from 2,523 Phone Records",
xlab = "Sample Order (1–200)",
ylab = "Index Position in Population Frame")
# Add a line to show expected spacing
abline(a = start, b = interval, col = "red", lty = 2)
# It looks okay.
How does the demographic composition (e.g., Age, City) of these 200 sampled phone records compare against the full population of 2,523 phone records?
library(dplyr)
library(tidyr)
# We first define population and sample frames
phone_pop <- valid_phone # 2,523 valid phone respondents
phone_sample <- sys_phone_sample # The 200 systematically selected respondents
# Select key demographic variables for comparison
var_list <- c("Gender", "Age", "City")
# Compute population proportions
# pivot_longer() reshapes data from a "wide" format to a "long" format
pop_props <- phone_pop %>%
select(all_of(var_list)) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Category") %>%
group_by(Variable, Category) %>%
summarise(Population_Share = n() / nrow(phone_pop), .groups = "drop")
# Compute sample proportions
sample_props <- phone_sample %>%
select(all_of(var_list)) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Category") %>%
group_by(Variable, Category) %>%
summarise(Sample_Share = n() / nrow(phone_sample), .groups = "drop")
# Merge
composition_compare <- full_join(pop_props, sample_props,
by = c("Variable", "Category")) %>%
mutate(
Population_Share = round(Population_Share * 100, 1),
Sample_Share = round(Sample_Share * 100, 1),
Difference = round(Sample_Share - Population_Share, 1)
) %>%
arrange(Variable, desc(Population_Share)) # display the result in descending order
print(composition_compare)
## # A tibble: 14 × 5
## Variable Category Population_Share Sample_Share Difference
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Age 60-69 52 48.5 -3.5
## 2 Age >70 26 26 0
## 3 Age 30-39 7.8 8.5 0.7
## 4 Age 40-49 6.6 7.5 0.9
## 5 Age 20-29 4.2 6.5 2.3
## 6 Age 50-59 3.4 3 -0.4
## 7 City New Taipei 27.4 23 -4.4
## 8 City Taichung 23.4 32 8.6
## 9 City Taipei 18.2 15 -3.2
## 10 City Kaohsiung 17.5 15 -2.5
## 11 City Taoyuan 10 14 4
## 12 City Tainan 3.5 1 -2.5
## 13 Gender female 52.4 53 0.6
## 14 Gender male 47.6 47 -0.6
# The 60-69 and 50-59 cohorts are under-represented, the >70 cohort matches its population share exactly, and the younger cohorts are slightly over-represented. Women (men) are slightly over- (under-)represented.
Even with sophisticated sampling methods, the resulting sample may still not adequately represent the underlying population due to unobserved/unmeasured individual heterogeneity. For example, some people may be reluctant to participate in surveys or decline to be interviewed on sensitive topics. Some segments of the population may be inherently hard to reach. So the polling result can be biased.
To address this problem, we can assign a “weight” to each observation in a sample to correct for imbalances and ensure the sample accurately reflects a known population distribution, compensating for issues like unequal selection probabilities, non-coverage, and non-response. By adjusting weights, researchers can ensure that different groups within a sample contribute appropriately to overall findings, leading to more reliable and representative conclusions about the larger population.
There are two oft-used weighting techniques: the \(\textbf{inverse probability weight}\) proposed by Horvitz and Thompson (1952) and \(\textbf{raking}\) (or iterative proportional fitting, see Kalton and Flores-Cervantes 2003 and Battaglia et al. 2004).
The \(\textbf{inverse probability weight}\) treats each unit’s inclusion probability \(p_i\) as its cluster’s sampling fraction \(n_i/N_i\) (the sampled share of cluster \(i\)). The proper “weight” used to adjust potential sample imbalance is just the inverse of this inclusion probability:
\[\begin{equation*} w_i = \frac{1}{p_i} = \frac{1}{n_i/N_i} = \frac{N_i}{n_i}. \end{equation*}\]
One can then use those weights to adjust the estimate of the quantity of interest, say % DPP voters, for clusters in the sample that may be over- or under-represented due to the sampling design.
### Let's draw a simple random sample of size = 1000
set.seed(2025)
n_sample <- 1000
sample_srs <- survey_data %>%
sample_n(n_sample)
# Get city/county-level population stats (from the full data)
city_pop <- survey_data %>%
count(City, name = "N_city") %>%
mutate(pop_share = N_city / sum(N_city))
# Get city/county-level sample counts (from the sample)
city_sample <- sample_srs %>%
count(City, name = "n_city")
# Merge and compute the Horvitz-Thompson style inverse-probability weights (IPW)
ht_weights <- city_pop %>%
left_join(city_sample, by = "City") %>%
mutate(
n_city = ifelse(is.na(n_city), 0, n_city),
inclusion_prob = n_city / N_city,
weight = ifelse(inclusion_prob > 0, 1 / inclusion_prob, NA)
)
# Append weights to each respondent in the sample
sample_srs <- sample_srs %>%
left_join(ht_weights %>% select(City, weight), by = "City") %>%
mutate(DPP_support = ifelse(Party == "DPP", 1, 0))
# Compute unweighted and weighted estimates
estimates <- sample_srs %>%
summarise(
Unweighted_Prop = mean(DPP_support, na.rm = TRUE),
Weighted_Prop = sum(weight * DPP_support, na.rm = TRUE) / sum(weight, na.rm = TRUE)
)
# Compute the true population proportion as reference
true_prop <- mean(survey_data$Party == "DPP", na.rm = TRUE)
# Display all three estimates together
comparison <- data.frame(
Type = c("True parameter", "Unweighted SRS", "IPW"),
DPP_Proportion = c(true_prop, estimates$Unweighted_Prop, estimates$Weighted_Prop)
)
print(comparison)
## Type DPP_Proportion
## 1 True parameter 0.360200
## 2 Unweighted SRS 0.350000
## 3 IPW 0.349734
# With an SRS (near-equal inclusion probabilities), the IPW estimate is essentially identical to the unweighted estimate; here it is in fact marginally farther from the true parameter.
# The design effect of the IPW-adjusted SRS sample can be calculated as
w <- sample_srs$weight
y <- sample_srs$DPP_support
n <- nrow(sample_srs)
# Remove rows with NA weights (if any)
ok <- !is.na(w)
w <- w[ok]
y <- y[ok]
sum_w <- sum(w)
sum_w2 <- sum(w^2)
# Horvitz-Thompson weighted ratio estimator
p_hat_w <- sum(w * y) / sum_w
# Approximate variance of weighted proportion (common linearization / HT approx)
# Var_hat ≈ sum( w_i^2 * (y_i - p_hat_w)^2 ) / (sum_w)^2
var_hat_w <- sum( w^2 * (y - p_hat_w)^2 ) / (sum_w^2)
SE_w <- sqrt(var_hat_w)
# Kish design effect (the formula is defined in the previous section)
deff_kish <- n * (sum_w2 / (sum_w^2))
# The conventional Variance-based design effect (compare to SRS variance at same n using p_hat_w)
var_srs_at_n <- (p_hat_w * (1 - p_hat_w)) / n
deff_var <- ifelse(var_srs_at_n > 0, var_hat_w / var_srs_at_n, NA_real_) # NA_real_ is R’s internal representation of a missing numeric value of type “double” (real number).
# Results
cat("Weighted estimate p_hat_HT =", round(p_hat_w, 4), "\n")
## Weighted estimate p_hat_HT = 0.3497
cat("Approx SE (weighted) =", round(SE_w, 4), "\n\n")
## Approx SE (weighted) = 0.0151
cat("Kish Deff =", round(deff_kish, 3), " (n * sum(w^2)/(sum(w)^2))\n")
## Kish Deff = 1.002 (n * sum(w^2)/(sum(w)^2))
cat("Variance-based Deff =", round(deff_var, 3), " (Var_weighted / Var_SRS_at_n)\n\n")
## Variance-based Deff = 1.001 (Var_weighted / Var_SRS_at_n)
# Subtracting 1 from deff_kish (or deff_var) gives the weighting loss L_w discussed earlier
Raking (also called iterative proportional fitting or post-stratification by margins) adjusts survey weights so that the marginal distributions of selected variables (whose population distributions are known) in the weighted sample match known population margins. Raking is done iteratively: you adjust the weights to match the margin of attribute \(x_1\), but this can unbalance the marginal distribution of attribute \(x_2\), so you re-weight the sample using \(x_2\), and so on until the margins converge. The good thing is you don’t need the full cross-tabulation (cells); you only need the marginal totals.
The main intuition goes like this: you \(\textit{rake}\) the sample data column by column and row by row, iteratively, until every weighted margin matches its target. For pedagogical purposes, we will use functions from \(\textsf{R}\)’s \(\textsf{anesrake}\) package to rake the \(\textsf{sample_srs}\) data so that this SRS sample matches population margins by Gender, Age, and City/County.
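Before turning to \(\textsf{anesrake}\), here is a minimal by-hand raking sketch on a hypothetical 2 × 2 table (Gender × a two-level age split; all counts and target margins below are made up for illustration):
# Toy iterative proportional fitting on hypothetical cell counts
cells <- matrix(c(200, 300, 150, 350), nrow = 2,
dimnames = list(Gender = c("female", "male"), Age = c("young", "old")))
row_targets <- c(0.51, 0.49) * sum(cells) # Assumed population Gender margins
col_targets <- c(0.40, 0.60) * sum(cells) # Assumed population Age margins
for (i in 1:10) { # Alternate row and column adjustments until the margins converge
cells <- cells * (row_targets / rowSums(cells)) # Rake the Gender margin
cells <- t(t(cells) * (col_targets / colSums(cells))) # Rake the Age margin
}
round(rowSums(cells) / sum(cells), 3) # Now matches c(0.51, 0.49)
round(colSums(cells) / sum(cells), 3) # Now matches c(0.40, 0.60)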
### Require anesrake package
# install.packages("anesrake")
library(anesrake)
# library(dplyr)
# Compute population margins (proportions) for these variables from survey_data (the population)
# Convert raking variables to factor level
survey_data <- survey_data %>%
mutate(
Gender = trimws(as.character(Gender)), # trimws() removes leading and trailing whitespace from character strings
Age = trimws(as.character(Age)),
City = trimws(as.character(City))
)
sample_srs <- sample_srs %>%
mutate(
Gender = trimws(as.character(Gender)),
Age = trimws(as.character(Age)),
City = trimws(as.character(City))
)
# Make sure sample_srs has all categories as factors with same levels as population
sample_srs$Gender <- factor(sample_srs$Gender, levels = unique(survey_data$Gender))
sample_srs$Age <- factor(sample_srs$Age, levels = unique(survey_data$Age))
sample_srs$City <- factor(sample_srs$City, levels = unique(survey_data$City))
# Compute population margins (of raking variables) from survey_data
marg_gender <- survey_data %>%
count(Gender) %>%
mutate(prop = n / sum(n)) %>%
pull(prop, name = Gender) # pull() extracts a single column as a vector
marg_age <- survey_data %>%
count(Age) %>%
mutate(prop = n / sum(n)) %>%
pull(prop, name = Age)
marg_city <- survey_data %>%
count(City) %>%
mutate(prop = n / sum(n)) %>%
pull(prop, name = City)
# Create population margins as a list
margins_list <- list(
Gender = marg_gender,
Age = marg_age,
City = marg_city
)
# Also, create unique case IDs for sample_srs
sample_srs$caseid <- seq_len(nrow(sample_srs))
# Run anesrake
set.seed(2025)
rake_res <- anesrake(
inputter = margins_list,
dataframe = sample_srs,
caseid = sample_srs$caseid,
cap = 5, # Limit weights to 5× base weight for greater flexibility
choosemethod = "total", # Variable choice is determined by the sum of the differences between actual and target values for each prospective weighting variable
maxit = 500, # You can do more if the model is not converged after 500 runs
convcrit = 0.01, # Convergence is achieved at this critical value
force1 = TRUE # Force each target distribution to sum to one
)
## [1] "Raking converged in 3 iterations"
# Append raked weights to SRS sample
sample_srs$rake_weight <- rake_res$weightvec
# Check how well it worked
summary(rake_res)
## $convergence
## [1] "Complete convergence was achieved after 3 iterations"
##
## $base.weights
## [1] "No Base Weights Were Used"
##
## $raking.variables
## [1] "Age"
##
## $weight.summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8933 0.9651 0.9826 1.0000 1.0350 1.1479
##
## $selection.method
## [1] "variable selection conducted using _pctlim_ - discrepancies selected using _total_."
##
## $general.design.effect
## [1] 1.004998
##
## $Gender
## Target Unweighted N Unweighted % Wtd N Wtd % Change in %
## female 0.51088 495 0.495 494.3427 0.4943427 -0.0006572961
## male 0.48912 505 0.505 505.6573 0.5056573 0.0006572961
## Total 1.00000 1000 1.000 1000.0000 1.0000000 0.0013145922
## Resid. Disc. Orig. Disc.
## female 0.01653730 0.01588
## male -0.01653730 -0.01588
## Total 0.03307459 0.03176
##
## $Age
## Target Unweighted N Unweighted % Wtd N Wtd % Change in %
## 40-49 0.25188 261 0.261 251.88 0.25188 -0.00912
## 60-69 0.09728 99 0.099 97.28 0.09728 -0.00172
## 50-59 0.14464 126 0.126 144.64 0.14464 0.01864
## 20-29 0.15100 163 0.163 151.00 0.15100 -0.01200
## 30-39 0.30428 294 0.294 304.28 0.30428 0.01028
## >70 0.05092 57 0.057 50.92 0.05092 -0.00608
## Total 1.00000 1000 1.000 1000.00 1.00000 0.05784
## Resid. Disc. Orig. Disc.
## 40-49 0 -0.00912
## 60-69 0 -0.00172
## 50-59 0 0.01864
## 20-29 0 -0.01200
## 30-39 0 0.01028
## >70 0 -0.00608
## Total 0 0.05784
##
## $City
## Target Unweighted N Unweighted % Wtd N Wtd %
## Taipei 0.18628 187 0.187 188.38980 0.18838980
## Kaohsiung 0.18136 174 0.174 173.71272 0.17371272
## Taoyuan 0.10172 104 0.104 104.52780 0.10452780
## Taichung 0.23116 232 0.232 228.80937 0.22880937
## New Taipei 0.26788 277 0.277 278.45994 0.27845994
## Tainan 0.03160 26 0.026 26.10037 0.02610037
## Total 1.00000 1000 1.000 1000.00000 1.00000000
## Change in % Resid. Disc. Orig. Disc.
## Taipei 0.0013898038 -0.002109804 -0.00072
## Kaohsiung -0.0002872787 0.007647279 0.00736
## Taoyuan 0.0005278039 -0.002807804 -0.00228
## Taichung -0.0031906307 0.002350631 -0.00084
## New Taipei 0.0014599352 -0.010579935 -0.00912
## Tainan 0.0001003665 0.005499633 0.00560
## Total 0.0069558187 0.030995086 0.02592
# Estimate raked and un-raked DPP vote share
# sample_srs <- sample_srs %>% mutate(DPP = ifelse(Party == "DPP", 1, 0))
weighted_est <- sum(sample_srs$rake_weight * sample_srs$DPP_support) / sum(sample_srs$rake_weight)
unweighted_est <- mean(sample_srs$DPP_support)
actual_DPP <- sum(survey_data$Party == "DPP") / nrow(survey_data)
cat("Unweighted estimate:", round(unweighted_est, 4), "\n")
## Unweighted estimate: 0.35
cat("Raked estimate:", round(weighted_est, 4), "\n")
## Raked estimate: 0.3505
cat("Actual DPP vote share:", round(actual_DPP, 4), "\n")
## Actual DPP vote share: 0.3602
# Raked estimate is indeed closer to the true population parameter
We will now compute the Kish design effect and compare it with the conventional variance-based design effect.
# Parameters
n <- nrow(sample_srs)
true_p <- mean(survey_data$Party == "DPP")
# Unweighted (unraked) estimate and SRS SE
p_unw <- mean(sample_srs$DPP_support, na.rm = TRUE)
# SRS SE with finite pop correction
N <- nrow(survey_data)
SE_unw <- sqrt((p_unw * (1 - p_unw)) / n) * sqrt((N - n) / (N - 1))
# Raked (weighted) estimate and approximate variance
w <- sample_srs$rake_weight
# Drop NA weights, if any (none are expected here)
ok <- !is.na(w) & !is.na(sample_srs$DPP_support)
w <- w[ok]; y <- sample_srs$DPP_support[ok]
sum_w <- sum(w)
sum_w2 <- sum(w^2)
# Weighted proportion (ratio estimator)
p_w <- sum(w * y) / sum_w
# Approximate variance for weighted proportion (linearization / HT-style)
# Var_hat ≈ sum( w_i^2 * (y_i - p_w)^2 ) / (sum_w)^2
var_w_hat <- sum( w^2 * (y - p_w)^2 ) / (sum_w^2)
SE_w <- sqrt(var_w_hat)
# Kish design effect (captures weight inequality)
deff_kish <- n * (sum_w2 / (sum_w^2))
# Variance-based design effect relative to SRS variance at same n (using sample unweighted p)
var_srs_at_n_unw <- (p_unw * (1 - p_unw)) / n
deff_var_unw <- ifelse(var_srs_at_n_unw > 0, var_w_hat / var_srs_at_n_unw, NA_real_)
# Design effect calculated using true population p (alternative denominator)
var_srs_at_n_true <- (true_p * (1 - true_p)) / n
deff_var_true <- ifelse(var_srs_at_n_true > 0, var_w_hat / var_srs_at_n_true, NA_real_)
# Summarize and print
res <- data.frame(
Method = c("Unweighted (SRS)", "Raked (weighted)"),
n = c(n, n),
p_hat = c(p_unw, p_w),
SE = c(SE_unw, SE_w),
Kish_deff = c(1, deff_kish),
Deff_vs_SRS_using_unweighted_p = c(1, deff_var_unw),
Deff_vs_SRS_using_true_p = c(1, deff_var_true)
)
# Rounding
res_print <- res %>%
mutate_at(vars(p_hat, SE, Kish_deff, Deff_vs_SRS_using_unweighted_p, Deff_vs_SRS_using_true_p),
~ round(., 4))
print(res_print)
## Method n p_hat SE Kish_deff Deff_vs_SRS_using_unweighted_p
## 1 Unweighted (SRS) 1000 0.3500 0.0148 1.000 1.0000
## 2 Raked (weighted) 1000 0.3505 0.0151 1.005 1.0063
## Deff_vs_SRS_using_true_p
## 1 1.0000
## 2 0.9934
# Raking slightly inflates the variance here (Kish Deff = 1.005), the usual price of unequal weights; relative to the SRS variance evaluated at the true p, the design effect is essentially 1 (0.9934).
cat("\nInterpretation notes:\n")
##
## Interpretation notes:
cat("Kish_deff: effect of unequal weights (n * sum(w^2)/(sum(w)^2)).\n")
## Kish_deff: effect of unequal weights (n * sum(w^2)/(sum(w)^2)).
cat("Deff_vs_SRS_using_unweighted_p: variance-based design effect comparing weighted variance to SRS var computed using the unweighted sample p.\n")
## Deff_vs_SRS_using_unweighted_p: variance-based design effect comparing weighted variance to SRS var computed using the unweighted sample p.
cat("Deff_vs_SRS_using_true_p: alternative variance-based design effect using the true population p in SRS denominator.\n")
## Deff_vs_SRS_using_true_p: alternative variance-based design effect using the true population p in SRS denominator.