set.seed(123) # For reproducibility
# Simulate population data
population <- rnorm(10000, mean = 50, sd = 10) # Normal distribution centered at μ = 50 with σ = 10
mean(population)
## [1] 49.97628
sd(population)
## [1] 9.986366
# Draw a random sample
sample_data <- sample(population, 1000) # n = 1000
n <- length(sample_data)
# Descriptive statistics (you should always provide your readers with descriptive statistics)
mean(sample_data) # Mean
## [1] 49.29693
median(sample_data) # Median
## [1] 49.28208
var(sample_data) # Variance: measures how spread out the data points are around their mean
## [1] 99.07115
sd(sample_data) # Standard Deviation: the square root of the variance; measures spread around the mean in the data's original units
## [1] 9.953449
# Mode (most frequent value, rounded)
mode_value <- as.numeric(names(sort(table(round(sample_data)), decreasing = TRUE)[1]))
mode_value
## [1] 49
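A caveat: base \(\textsf{R}\)'s \(\textsf{mode()}\) has nothing to do with the statistical mode; it reports an object's storage mode, which is why we need the frequency-table one-liner above.
mode(sample_data) # Returns "numeric" (the storage mode), not the most frequent value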
# Sampling distribution & Central Limit Theorem (CLT)
sample_means <- replicate(1000, mean(sample(population, 100)))
hist(sample_means, main="CLT: Sampling Distribution of Means", col="lightblue")
mean(sample_means)
## [1] 50.00556
sd(sample_means)
## [1] 0.9851401
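The CLT also predicts the SD of the sampling distribution (the standard error of the mean) to be \(\sigma/\sqrt{n}\), which our simulation matches closely:
10 / sqrt(100) # Theoretical SE of the mean = 1; the empirical SD above is ~0.985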
# Point Estimate & Confidence Interval (CI)
x_bar <- mean(sample_data) # Read x bar
s <- sd(sample_data)
alpha <- 0.05 # 95% CI
t_crit <- qt(1 - alpha/2, df = n - 1) # qt() calculates the quantile for a given probability and degrees of freedom of the Student's t-distribution
margin_error <- t_crit * s / sqrt(n)
CI <- c(x_bar - margin_error, x_bar + margin_error)
CI
## [1] 48.67928 49.91459
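As a quick cross-check, base \(\textsf{R}\)'s \(\textsf{t.test()}\) computes the same t-based interval:
t.test(sample_data)$conf.int # Reproduces the interval computed by hand above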
# Standard Error (SE) of the sample mean
SE <- s / sqrt(n)
SE
## [1] 0.3147557
# Determining sample size (n) for a given CI & margin of error (E)
E <- 1.5 # Desired margin of error
z_crit <- qnorm(0.975) # 95% confidence
sigma <- 10 # Estimated population SD
n_required <- (z_crit * sigma / E)^2
n_required
## [1] 170.7315
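Since \(n\) must be a whole number of respondents, round the result up:
ceiling(n_required) # 171; rounding down would fail to deliver the desired margin of error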
# True population mean vs sample estimate
cat("True Mean:", mean(population), "\nSample Mean:", x_bar, "\n95% CI:", round(CI,2))
## True Mean: 49.97628
## Sample Mean: 49.29693
## 95% CI: 48.68 49.91
# How changes in the confidence level and the desired margin of error affect the required sample size (n)
### Effect of Margin of Error (E) and Confidence Level on Required Sample Size
# install.packages("ggplot2")
library(ggplot2)
sigma <- 10 # Assumed population SD
E_vals <- seq(0.5, 5, by = 0.25) # Range of desired margins of error 0.5 to 5 with an increment of 0.25
confidence_levels <- c(0.90, 0.95, 0.99) # Possible CI thresholds
# Compute sample size for each combination of E and CI
sample_size_df <- expand.grid(E = E_vals, CI = confidence_levels)
sample_size_df$n <- with(sample_size_df, {
z <- qnorm(1 - (1 - CI)/2) # z critical value
(z * sigma / E)^2 # formula: n = (z*sigma/E)^2
})
# Plot
ggplot(sample_size_df, aes(x = E, y = n, color = factor(CI))) +
geom_line(linewidth = 1.2) +
scale_y_log10() + # log scale for clarity
labs(
title = "Sample Size vs. Margin of Error and CI",
x = "Margin of Error (E)",
y = "Required Sample Size (log scale)",
color = "Confidence Level"
) +
theme_minimal(base_size = 13)
# The takeaway: as the margin of error decreases (or the confidence level increases), the required sample size increases.
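Because \(n\) grows with \(1/E^{2}\), halving the margin of error quadruples the required sample size. A quick check at 95% confidence with the same \(\sigma = 10\):
(qnorm(0.975) * 10 / 2)^2 # E = 2 requires n of about 96
(qnorm(0.975) * 10 / 1)^2 # E = 1 requires n of about 384, four times as many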
Let’s generate an example dataset of 25,000 observations.
set.seed(123) # For reproducibility
N <- 25000
# 1. Respondent ID
ID <- 1:N
# 2. Gender
Gender <- sample(c("male", "female"),
size = N,
replace = TRUE,
prob = c(0.495, 0.505))
# 3. Age cohort
Age <- sample(c("20-29", "30-39", "40-49", "50-59", "60-69", ">70"),
size = N,
replace = TRUE,
prob = c(0.15, 0.30, 0.25, 0.15, 0.10, 0.05))
# 4. Education level
Edu <- sample(c("elementary school", "high school", "some college", "grad school"),
size = N,
replace = TRUE,
prob = c(0.05, 0.20, 0.45, 0.30))
# 5. Marital status
Married <- sample(c("Yes", "No"),
size = N,
replace = TRUE,
prob = c(0.48, 0.52))
# 6. Party list preference
Party <- sample(c("KMT", "DPP", "TPP"),
size = N,
replace = TRUE,
prob = c(0.33, 0.36, 0.31))
# 7. Referendum voting choice
Referendum <- sample(c("Yes", "No"),
size = N,
replace = TRUE,
prob = c(0.53, 0.47))
# 8. City
City <- sample(c("Taipei", "New Taipei", "Taichung", "Taoyuan", "Kaohsiung", "Tainan"),
size = N,
replace = TRUE,
prob = c(0.19, 0.27, 0.23, 0.10, 0.18, 0.03))
# 9. Districts (randomly within city)
district_list <- list(
"Taipei" = c("大安區", "松山區", "內湖區", "大同區", "文山區"),
"New Taipei" = c("板橋區", "金山區", "瑞芳區", "平溪區", "永和區", "淡水區", "五股區"),
"Taichung" = c("中區", "西區", "北屯區", "沙鹿區", "大里區"),
"Taoyuan" = c("中壢區", "平鎮區", "楊梅區", "新屋區"),
"Kaohsiung" = c("小港區", "連雅區", "左營區", "三民區"),
"Tainan" = c("永康區", "中西區")
)
District <- sapply(City, function(cty) {
sample(district_list[[cty]], 1)
})
# 10. Source: Web / Cell / Phone (special condition)
Source <- rep(NA, N)
# Start with Web and Cell first
Source <- sample(c("Web", "Cell", "Phone"),
size = N,
replace = TRUE,
prob = c(0.65, 0.25, 0.10))
# Adjust: ensure 78% of "Phone" come from age groups 60-69 and >70
is_phone <- Source == "Phone"
phone_indices <- which(is_phone)
# Identify old-age respondents
old_age_indices <- which(Age %in% c("60-69", ">70"))
young_age_indices <- which(!(Age %in% c("60-69", ">70")))
# Number of phone respondents
n_phone <- length(phone_indices)
target_old_phone <- round(0.78 * n_phone)
# If not enough old respondents, sample all
if(length(old_age_indices) < target_old_phone){
target_old_phone <- length(old_age_indices)
}
# Assign phones preferentially to old-age respondents
old_phone_ids <- sample(old_age_indices, target_old_phone)
remaining_phone_ids <- sample(young_age_indices, n_phone - target_old_phone)
phone_final_ids <- c(old_phone_ids, remaining_phone_ids)
# Reassign "Phone" source properly
Source <- rep(NA, N)
Source[phone_final_ids] <- "Phone"
# Fill remaining with Web/Cell by given proportion (within leftover)
remaining_ids <- setdiff(1:N, phone_final_ids)
Source[remaining_ids] <- sample(c("Web", "Cell"),
size = length(remaining_ids),
replace = TRUE,
prob = c(0.65/(0.65+0.25), 0.25/(0.65+0.25)))
# 11. Household size
Household <- sample(1:5,
size = N,
replace = TRUE,
prob = c(0.11, 0.18, 0.25, 0.40, 0.06))
# 12. Phone number (10 digits)
Phone <- rep(NA, N)
# Define area code map
area_code <- c("Taipei"="02", "New Taipei"="02", # Region code
"Taoyuan"="03", "Taichung"="04",
"Tainan"="06", "Kaohsiung"="07")
# Assign phone numbers only for Source == "Phone"
for (i in which(Source == "Phone")) {
prefix <- area_code[City[i]]
if (is.na(prefix)) prefix <- sample(c("02","03","04","06","07"), 1)
suffix <- sprintf("%08d", sample(0:99999999, 1)) # sprintf() zero-pads a random suffix to exactly 8 digits
Phone[i] <- paste0(prefix, suffix)
}
# Combine into a single data frame
survey_data <- data.frame(
ID, Gender, Age, Edu, Married, Party, Referendum, City, District,
Source, Household, Phone,
stringsAsFactors = FALSE
)
# Inspect distribution summary
str(survey_data) # A list of column names and their variable class
## 'data.frame': 25000 obs. of 12 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : chr "female" "male" "female" "male" ...
## $ Age : chr "40-49" "60-69" "60-69" "50-59" ...
## $ Edu : chr "some college" "some college" "some college" "grad school" ...
## $ Married : chr "No" "No" "Yes" "No" ...
## $ Party : chr "KMT" "DPP" "TPP" "DPP" ...
## $ Referendum: chr "Yes" "Yes" "Yes" "No" ...
## $ City : chr "Taipei" "Taipei" "Taipei" "Kaohsiung" ...
## $ District : chr "內湖區" "內湖區" "文山區" "小港區" ...
## $ Source : chr "Web" "Cell" "Web" "Cell" ...
## $ Household : int 4 3 1 1 1 4 2 4 4 1 ...
## $ Phone : chr NA NA NA NA ...
table(survey_data$Source)
##
## Cell Phone Web
## 6276 2526 16198
table(survey_data$City)
##
## Kaohsiung New Taipei Taichung Tainan Taipei Taoyuan
## 4534 6697 5779 790 4657 2543
table(survey_data$Age)
##
## >70 20-29 30-39 40-49 50-59 60-69
## 1273 3775 7607 6297 3616 2432
Sampling from a population such that every observation has an equal, non-zero probability of being chosen for the sample. Let’s draw a sample of size = 1000.
set.seed(2025) # For reproducibility
# Population size
N <- nrow(survey_data)
# Sample size
n <- 1000
# Draw a random sample of IDs; note that sample() is applied inside the data frame subsetting and that replace = FALSE
srs_sample <- survey_data[sample(survey_data$ID, n, replace = FALSE), ]
# Check sample size
nrow(srs_sample)
## [1] 1000
SRS is the simplest sampling method, but how representative is a sample drawn with this simple method, compared to the true population parameter? A simple yardstick would be the standard error (SE):
\[\begin{equation*} SE(\hat{p}) = \sqrt{\frac{\hat{p}(1 - \hat{p})}{n}}\cdot\sqrt{\frac{N - n}{N - 1}}, \end{equation*}\]
where \(\hat{p}\) is the sample proportion, \(n\) = sample size, \(N\) = population size, and the second square-root term is usually called the finite population correction (FPC).
Let’s use the percentage vote for the Democratic Progressive Party (DPP) as the \(\hat{p}\) and calculate the SE of this sample.
p_hat <- mean(srs_sample$Party== "DPP")
p_hat
## [1] 0.35
SE <- sqrt( (p_hat * (1 - p_hat)) / n ) * sqrt( (N - n) / (N - 1) )
SE
## [1] 0.01477866
And the 95% confidence interval (CI) can be calculated as: \[\begin{equation*} CI = \hat{p} \pm z_{0.975}{\times}SE. \end{equation*}\]
z <- 1.96
CI_lower <- p_hat - z * SE
CI_upper <- p_hat + z * SE
cat("Estimated support for Referendum (Yes):", round(p_hat, 3), "\n")
## Estimated support for Referendum (Yes): 0.35
cat("Standard Error:", round(SE, 4), "\n")
## Standard Error: 0.0148
cat("95% Confidence Interval: [", round(CI_lower, 3), ",", round(CI_upper, 3), "]\n")
## 95% Confidence Interval: [ 0.321 , 0.379 ]
Note that in \(\textsf{survey_data}\) the proportion of the population that voted for the DPP is 0.36, so our sample proportion captures this population parameter well, differing by about 1 percentage point, within one standard error (\(\pm\) 1.48 percentage points).
Selecting a sample from a population by choosing every k-th individual from a list, starting at a random point. The “sampling interval” (k) is determined by dividing the population size (N) by the desired sample size (n). This fixed interval ensures that samples are evenly spread and can be more \(\textit{efficient}\) than SRS or other sampling methods.
Let’s draw a sample of size = 1000 using this method. Given our population size (N = 25,000) and sample size (n = 1000), we will randomly pick a starting ID between 1 and 25, then take every 25th case.
set.seed(2025)
# Population size and desired sample size
N <- nrow(survey_data)
n <- 1000
k <- floor(N / n) # Sampling interval. floor() returns the largest integer that is less than or equal to the value passed to it
# Random start between 1 and k
start <- sample(1:k, 1)
# Selected indices
sys_indices <- seq(from = start, to = N, by = k)
# Ensure we get exactly n observations
sys_indices <- sys_indices[1:n]
# Extract sample
sys_sample <- survey_data[sys_indices, ]
# Check sample size
nrow(sys_sample)
## [1] 1000
head(sys_sample$ID)
## [1] 13 38 63 88 113 138
We then compute the \(\hat{p}\) and SE of this sampling strategy.
p_hat_sys <- mean(sys_sample$Party== "DPP")
p_hat_sys
## [1] 0.35
SE_sys <- sqrt( (p_hat_sys * (1 - p_hat_sys)) / n ) * sqrt( (N - n) / (N - 1) )
SE_sys
## [1] 0.01477866
# Note: this SE uses the same approximation as SRS. Systematic sampling can have a lower or higher SE depending on the population ordering, but with a randomly ordered frame (as in our toy data) it behaves essentially like SRS, so the two SEs coincide.
How does the SE estimated from \(\textsf{sys_sample}\) compare to that of the SRS-generated sample?
true_p <- mean(survey_data$Party == "DPP")
cat("Systematic Sampling Results\n")
## Systematic Sampling Results
cat("Estimated support (p̂):", round(p_hat_sys, 3), "\n")
## Estimated support (p̂): 0.35
cat("Standard Error (approx.):", round(SE_sys, 4), "\n")
## Standard Error (approx.): 0.0148
cat("True population support:", round(true_p, 3), "\n")
## True population support: 0.36
cat("Absolute error:", round(abs(p_hat_sys - true_p), 4), "\n")
## Absolute error: 0.0102
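The caveat about frame ordering is easy to illustrate: deliberately sorting the frame before drawing turns systematic sampling into implicit stratification. A hypothetical re-draw (for illustration only, not part of the design above):
# Hypothetical illustration: sort the frame by Party, then draw systematically
ordered_pop <- survey_data[order(survey_data$Party), ]
sys_sorted <- ordered_pop[seq(from = sample(1:k, 1), to = N, by = k)[1:n], ]
mean(sys_sorted$Party == "DPP") # Evenly spaced picks hit each Party block in proportion
# to its size, so the estimate lands very close to the true 0.36 (implicit stratification)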
Stratified Sampling divides a population into distinct subgroups, or “strata,” based on shared characteristics, and then randomly samples from each stratum to ensure proportional representation in the final sample. This strategy ensures that key characteristics of the population are represented in the sample, leading to more accurate and less biased results compared to SRS.
Below, we will draw a random sample of size = 1000 from our toy data, stratified by Gender, Age, City, and District.
library(dplyr) # We need this package for group_by, mutate, and cur_group_id functions
set.seed(2025)
# Population size
N <- nrow(survey_data)
# Sample size
n <- 1000
# Get stratum sizes: count population size in each stratum
strata_sizes <- survey_data %>%
group_by(Gender, Age, City, District) %>%
summarise(N_h = n(), .groups = "drop") # Count N_h, the population size of each distinct combination of the grouping variables
# Compute proportional allocation
strata_sizes <- strata_sizes %>%
mutate(prop = N_h / N, # Calculate proportion %
n_h = prop * n)
# Round sample sizes and adjust to ensure total = 1000
strata_sizes$n_h <- floor(strata_sizes$n_h)
remainder <- n - sum(strata_sizes$n_h)
# Distribute remaining samples to strata with the largest fractional parts
frac_part <- (strata_sizes$prop * n) - strata_sizes$n_h
if (remainder > 0) {
add_index <- order(frac_part, decreasing = TRUE)[1:remainder]
strata_sizes$n_h[add_index] <- strata_sizes$n_h[add_index] + 1
}
# Double-check total
sum(strata_sizes$n_h)
## [1] 1000
# Draw the stratified sample
set.seed(2025)
stratified_sample <- survey_data %>%
group_by(Gender, Age, City, District) %>%
group_modify(~ {
n_h <- strata_sizes$n_h[
strata_sizes$Gender == .y$Gender &
strata_sizes$Age == .y$Age &
strata_sizes$City == .y$City &
strata_sizes$District == .y$District
]
if (length(n_h) == 0 || n_h == 0) return(tibble())
slice_sample(.x, n = min(n_h, nrow(.x))) # slice_sample() draws n_h rows at random within this stratum
}) %>%
ungroup()
# Verify sample size
nrow(stratified_sample)
## [1] 1000
# Preview
head(stratified_sample)
## # A tibble: 6 × 12
## Gender Age City District ID Edu Married Party Referendum Source
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr> <chr>
## 1 female 20-29 Kaohsiung 三民區 3231 some co… No TPP No Cell
## 2 female 20-29 Kaohsiung 三民區 20425 high sc… Yes TPP Yes Web
## 3 female 20-29 Kaohsiung 三民區 9185 high sc… No DPP No Web
## 4 female 20-29 Kaohsiung 三民區 6964 some co… Yes DPP Yes Cell
## 5 female 20-29 Kaohsiung 小港區 17352 grad sc… Yes TPP No Cell
## 6 female 20-29 Kaohsiung 小港區 4969 grad sc… Yes DPP Yes Cell
## # ℹ 2 more variables: Household <int>, Phone <chr>
How representative is the stratified sample at “stratum” level? Let’s compare the sample’s \(n_h\)s against their corresponding \(N_h\)s at the population level.
# library(dplyr)
### For a given strata
### Compute population proportions (N_h / N)
pop_strata <- survey_data %>%
group_by(Gender, Age, City, District) %>%
summarise(N_h = n(), .groups = "drop") %>%
mutate(pop_prop = N_h / sum(N_h))
### Compute sample proportions (n_h / n)
samp_strata <- stratified_sample %>%
group_by(Gender, Age, City, District) %>%
summarise(n_h = n(), .groups = "drop") %>%
mutate(samp_prop = n_h / sum(n_h))
### Merge and compare
compare_strata <- left_join(
pop_strata,
samp_strata,
by = c("Gender", "Age", "City", "District") # Join by these columns
) %>%
mutate(
samp_prop = ifelse(is.na(samp_prop), 0, samp_prop),
diff = samp_prop - pop_prop # Calculate sample vs. population difference
)
### Display 5 example strata (see the last column)
set.seed(42)
compare_strata %>%
sample_n(5) %>%
select(Gender, Age, City, District, pop_prop, samp_prop, diff)
## # A tibble: 5 × 7
## Gender Age City District pop_prop samp_prop diff
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 female 30-39 Taipei 文山區 0.00572 0.006 0.00028
## 2 male >70 Taoyuan 中壢區 0.00064 0.001 0.00036
## 3 female >70 Tainan 永康區 0.00024 0 -0.00024
## 4 female 40-49 Taipei 大同區 0.005 0.005 0
## 5 male 40-49 Taichung 中區 0.00524 0.005 -0.000240
The computation of the stratified-sample SE is a bit more involved. The sampling error of its estimated mean is:
\[\begin{equation*} SE_{strat} = \sqrt{\sum^{L}_{h=1}\left(\frac{N_h}{N}\right)^{2}\cdot\left(1 - \frac{n_{h}}{N_{h}}\right)\cdot\frac{s^{2}_{h}}{n_{h}}}, \end{equation*}\]
where \(L\) is the number of strata, \(N_h\) and \(n_h\) are the population and sample sizes of stratum \(h\), and \(s^{2}_{h}\) is the within-stratum variance.
### Compute p_hat (overall DPP support in stratified sample)
p_hat <- mean(stratified_sample$Party == "DPP")
cat("Estimated DPP proportion (p̂):", round(p_hat, 4), "\n")
## Estimated DPP proportion (p̂): 0.352
# Compute within-stratum statistics
strata_stats <- survey_data %>%
group_by(Gender, Age, City, District) %>%
summarise(
N_h = n(),
p_h = mean(Party == "DPP"),
s2_h = var(as.numeric(Party == "DPP")),
.groups = "drop"
) %>%
left_join(
strata_sizes %>% select(Gender, Age, City, District, n_h),
by = c("Gender", "Age", "City", "District")
) %>%
filter(!is.na(n_h) & n_h > 0 & N_h > 1) # Remove problematic strata (quota = 0)
# Replace NA variances (e.g., only one observation) with 0
strata_stats$s2_h[is.na(strata_stats$s2_h)] <- 0
# Compute stratified variance
var_strat <- sum(
((strata_stats$N_h / N)^2) *
(1 - strata_stats$n_h / strata_stats$N_h) *
(strata_stats$s2_h / strata_stats$n_h)
)
SE_strat <- sqrt(var_strat)
# Print
cat("Standard Error (SE):", round(SE_strat, 4), "\n")
## Standard Error (SE): 0.0149
# Pretty close to the SRS-based SE (0.0148)
Multi-Stage Cluster Sampling divides the population into hierarchical groups (clusters), and samples are drawn in a series of random steps from these progressively smaller clusters until a final sample of individual units is selected. This sampling strategy is used for large, geographically dispersed populations to make data collection more feasible, cost-effective, and faster by reducing the need to survey every member of the population.
Multi-stage cluster sampling is often paired with Probability Proportional to Size (PPS) sampling, which selects clusters, and then units within those clusters, at multiple stages, using PPS at one or more stages to give larger clusters a greater chance of being selected. The procedure generally starts by randomly selecting some administrative units (such as provinces or cities) according to PPS; then, within each unit, randomly selecting some sub-units (e.g., districts) according to PPS; and finally, within each sub-unit, randomly selecting individuals (participants) to assemble the final sample.
We demonstrate this sampling strategy with an example.
set.seed(2025)
### Set population and Target Sample
N <- nrow(survey_data)
n <- 1000
### First-stage PPS Sampling of Cities
# Compute city population sizes in order to compute PPS
city_sizes <- survey_data %>%
group_by(City) %>%
summarise(N_city = n(), .groups = "drop") %>%
mutate(prob_city = N_city / sum(N_city)) # PPS weights
# Number of cities to select (at least 4, you can do more, but we only have 6)
n_cities <- max(4, round(0.4 * nrow(city_sizes)))
# Select cities with PPS
selected_cities <- city_sizes %>%
slice_sample(n = n_cities, weight_by = prob_city, replace = FALSE)
cat("First-stage PPS - Selected Cities:\n")
## First-stage PPS - Selected Cities:
print(selected_cities)
## # A tibble: 4 × 3
## City N_city prob_city
## <chr> <int> <dbl>
## 1 Kaohsiung 4534 0.181
## 2 Taichung 5779 0.231
## 3 Taipei 4657 0.186
## 4 New Taipei 6697 0.268
### Second-stage PPS Sampling of Districts Within Selected Cities
# Compute district population sizes within selected cities
district_sizes <- survey_data %>%
filter(City %in% selected_cities$City) %>%
group_by(City, District) %>%
summarise(N_district = n(), .groups = "drop") %>%
group_by(City) %>%
mutate(prob_district = N_district / sum(N_district)) %>% # PPS within city
ungroup()
# Select 2 districts per city with PPS (or all districts, if a city has fewer than 2)
districts_per_city <- district_sizes %>%
group_by(City) %>%
summarise(n_districts = min(2, n()), .groups = "drop")
selected_districts <- district_sizes %>%
left_join(districts_per_city, by = "City") %>%
group_by(City) %>%
group_modify(~ slice_sample(.x, n = .x$n_districts[1],
weight_by = prob_district,
replace = FALSE)) %>%
ungroup()
cat("\nSecond-stage PPS - Selected Districts:\n")
##
## Second-stage PPS - Selected Districts:
print(selected_districts)
## # A tibble: 8 × 5
## City District N_district prob_district n_districts
## <chr> <chr> <int> <dbl> <dbl>
## 1 Kaohsiung 小港區 1100 0.243 2
## 2 Kaohsiung 三民區 1143 0.252 2
## 3 New Taipei 平溪區 929 0.139 2
## 4 New Taipei 板橋區 994 0.148 2
## 5 Taichung 西區 1136 0.197 2
## 6 Taichung 大里區 1166 0.202 2
## 7 Taipei 大安區 918 0.197 2
## 8 Taipei 松山區 878 0.189 2
### Third-stage SRS within Selected Districts (not PPS)
district_alloc <- survey_data %>%
semi_join(selected_districts, by = c("City", "District")) %>%
group_by(City, District) %>%
summarise(N_district = n(), .groups = "drop") %>%
mutate(prop = N_district / sum(N_district),
n_draw = round(prop * n))
# Adjust rounding so the district draws sum to exactly n = 1000
remainder <- n - sum(district_alloc$n_draw)
if (remainder > 0) {
add_index <- order(district_alloc$prop, decreasing = TRUE)[1:remainder]
district_alloc$n_draw[add_index] <- district_alloc$n_draw[add_index] + 1
}
multi_stage_sample <- survey_data %>%
semi_join(selected_districts, by = c("City", "District")) %>%
group_by(City, District) %>%
group_modify(~ {
n_draw <- district_alloc$n_draw[
district_alloc$City == .y$City &
district_alloc$District == .y$District
]
slice_sample(.x, n = min(n_draw, nrow(.x)))
}) %>%
ungroup()
cat("\nFinal multi-stage cluster sample size:\n")
##
## Final multi-stage cluster sample size:
print(nrow(multi_stage_sample))
## [1] 1000
To get a reliable SE from a complex two-stage PPS sampling design, the most feasible way is to use a simulation-based method (repeated application of the exact sampling procedure) and take the empirical SD of the resulting estimates. This avoids a messy derivation of inclusion probabilities for PPS-without-replacement at two stages, while still providing an estimated design variance for easy comparison against less sophisticated sampling methods.
### To do this, we first define a function that implements one draw of the multistage PPS design shown above
set.seed(123)
# First, set parameters
target_n <- 1000 # Final sample size
m_cities <- 4 # Number of cities selected in stage 1 (at least 4)
districts_per_city <- 2 # Number of districts selected per city (at least 2)
num_iter <- 500 # Number of repeated samples for simulation-based SE
# Define function
draw_multistage_sample <- function(pop_df,
target_n = 1000,
m_cities = 4,
districts_per_city = 2) {
# pop_df: population sampling frame (i.e., survey_data)
# First-stage: city sizes & PPS selection of cities
city_sizes <- pop_df %>%
group_by(City) %>%
summarise(N_city = n(), .groups = "drop") %>%
mutate(prob_city = N_city / sum(N_city))
# Enforce at least m_cities, but not more than available
m_cities_use <- min(m_cities, nrow(city_sizes))
selected_cities <- city_sizes %>%
slice_sample(n = m_cities_use, weight_by = prob_city, replace = FALSE)
# Second-stage: district sizes (within selected cities) & PPS selection
district_sizes <- pop_df %>%
filter(City %in% selected_cities$City) %>%
group_by(City, District) %>%
summarise(N_district = n(), .groups = "drop") %>%
group_by(City) %>%
mutate(prob_district = N_district / sum(N_district)) %>%
ungroup()
# For each selected city choose up to districts_per_city districts (if fewer exist, take all!)
# Compute how many to choose per city
districts_count <- district_sizes %>%
group_by(City) %>%
summarise(n_avail = n(), .groups = "drop") %>%
mutate(n_choose = pmin(districts_per_city, n_avail))
selected_districts <- district_sizes %>%
left_join(districts_count %>% select(City, n_choose), by = "City") %>%
group_by(City) %>%
group_modify(~ slice_sample(.x,
n = unique(.x$n_choose),
weight_by = .x$prob_district,
replace = FALSE)) %>%
ungroup()
# Third stage: allocate target_n to selected districts proportional to their N_district
district_alloc <- pop_df %>%
semi_join(selected_districts, by = c("City", "District")) %>%
group_by(City, District) %>%
summarise(N_district = n(), .groups = "drop") %>%
mutate(prop = N_district / sum(N_district),
n_draw = floor(prop * target_n))
# Rounding to total target_n
remainder <- target_n - sum(district_alloc$n_draw)
if (remainder > 0) {
idx_add <- order(district_alloc$prop, decreasing = TRUE)[1:remainder]
district_alloc$n_draw[idx_add] <- district_alloc$n_draw[idx_add] + 1
} else if (remainder < 0) {
# If we overshot due to floor/round, reduce largest props
idx_reduce <- order(district_alloc$prop, decreasing = TRUE)[1:(-remainder)]
district_alloc$n_draw[idx_reduce] <- pmax(0, district_alloc$n_draw[idx_reduce] - 1)
}
# Ensure sum is target_n
if (sum(district_alloc$n_draw) != target_n) {
stop("Allocation bug: sum of n_draw != target_n")
}
# Draw SRS within each selected district
sample_df <- pop_df %>%
semi_join(selected_districts, by = c("City", "District")) %>%
group_by(City, District) %>%
group_modify(~ {
n_draw <- district_alloc$n_draw[
district_alloc$City == .y$City &
district_alloc$District == .y$District
]
slice_sample(.x, n = min(n_draw, nrow(.x)))
}) %>%
ungroup()
# return the sample (and also the allocation metadata optionally)
list(sample = sample_df,
selected_cities = selected_cities,
selected_districts = selected_districts,
district_alloc = district_alloc)
}
# Sanity check!
one_draw <- draw_multistage_sample(survey_data,
target_n = target_n,
m_cities = m_cities,
districts_per_city = districts_per_city)
multi_stage_sample <- one_draw$sample
cat("Single-run sample size:", nrow(multi_stage_sample), "\n")
## Single-run sample size: 1000
### Estimate empirical SD
# p_hat from the single realization (using the same indicator, DPP proportion)
p_hat_single <- mean(multi_stage_sample$Party == "DPP")
cat("Single-run p̂ (DPP):", round(p_hat_single, 4), "\n\n")
## Single-run p̂ (DPP): 0.345
# Repeated simulation to get empirical SE
p_hats <- numeric(num_iter)
for (i in seq_len(num_iter)) {
draw <- draw_multistage_sample(survey_data,
target_n = target_n,
m_cities = m_cities,
districts_per_city = districts_per_city)
samp_i <- draw$sample
p_hats[i] <- mean(samp_i$Party == "DPP")
if (i %% 50 == 0) cat("Completed iteration:", i, "\n")
}
## Completed iteration: 50
## Completed iteration: 100
## Completed iteration: 150
## Completed iteration: 200
## Completed iteration: 250
## Completed iteration: 300
## Completed iteration: 350
## Completed iteration: 400
## Completed iteration: 450
## Completed iteration: 500
# Empirical results
mean_p_hat_sim <- mean(p_hats)
sd_p_hat_sim <- sd(p_hats) # Empirical SD
se_normal_ci <- c(mean_p_hat_sim - 1.96 * sd_p_hat_sim, # +-95% CI
mean_p_hat_sim + 1.96 * sd_p_hat_sim)
cat("Simulation results (", num_iter, "iterations ):\n", sep = "")
## Simulation results (500iterations ):
cat("Mean of p̂:", round(mean_p_hat_sim, 4), "\n")
## Mean of p̂: 0.3608
cat("Empirical SE (SD of p̂):", round(sd_p_hat_sim, 4), "\n")
## Empirical SE (SD of p̂): 0.0152
cat("Approx Normal 95% CI for mean p̂:", round(se_normal_ci[1],4), "to", round(se_normal_ci[2],4), "\n\n")
## Approx Normal 95% CI for mean p̂: 0.3311 to 0.3906
# Compare to true population proportion
true_p <- mean(survey_data$Party == "DPP")
cat("True population DPP proportion:", round(true_p, 4), "\n")
## True population DPP proportion: 0.3602
# Pretty damn good!
# Visualizing sampling distribution
library(ggplot2)
p_df <- data.frame(p_hat = p_hats)
ggplot(p_df, aes(x = p_hat)) +
geom_histogram(binwidth = 0.0025, color = "white", boundary = 0) +
geom_vline(xintercept = true_p, color = "red", linewidth = 1) +
labs(title = paste0("Sampling distribution of p̂ (multi-stage PPS, n = ", target_n, ")"),
subtitle = paste0("Empirical SD = ", round(sd_p_hat_sim,4),
"; mean p̂ = ", round(mean_p_hat_sim,4)),
x = "Sample proportion p̂ (DPP)",
y = "Frequency") +
theme_minimal()
With declining landline usage (though some beg to differ), many survey practitioners have proposed mixed-mode contact methods (online, mobile, landline phone) to recruit participants from a more diverse pool of the population. However, there is considerable heterogeneity in the demographic profiles of participants recruited through different venues. For example, online recruitment reaches primarily younger, working-age people, while participants recruited by landline tend to be older. Are these patterns borne out by our toy data?
To find out, let’s first draw a sample of size = 5000 with “Source” composition proportional to each source’s share of the total population. Then, for each “Source,” draw a PPS sample of size = 500 with selection probability proportional to age (using each cohort’s midpoint as the size measure), so that older respondents are more likely to be drawn.
set.seed(2025)
### Proportional allocation by "Source"
# Calculate proportions
source_share <- survey_data %>%
count(Source, name = "N") %>%
mutate(prop = N / sum(N),
n_draw = round(prop * 5000))
print(source_share) # Most of our sample were recruited online
## Source N prop n_draw
## 1 Cell 6276 0.25104 1255
## 2 Phone 2526 0.10104 505
## 3 Web 16198 0.64792 3240
### Draw a sample of size = 5000 with their "Source" proportional to each source's share of total population
# Also, adjust for rounding error
n <- 5000
diff_total <- n - sum(source_share$n_draw)
if (diff_total != 0) {
idx <- order(source_share$prop, decreasing = TRUE)[1:abs(diff_total)]
source_share$n_draw[idx] <- source_share$n_draw[idx] + sign(diff_total)
}
print(source_share)
## Source N prop n_draw
## 1 Cell 6276 0.25104 1255
## 2 Phone 2526 0.10104 505
## 3 Web 16198 0.64792 3240
## Draw proportional samples by "Source"
# Join n_draw before sampling
sample_prop <- survey_data %>%
left_join(source_share, by = "Source") %>%
group_by(Source) %>%
group_modify(~ {
n_draw_val <- unique(.x$n_draw)
if (is.na(n_draw_val) || length(n_draw_val) != 1) {
stop("n_draw missing or not unique for Source = ", unique(.x$Source))
}
slice_sample(.x, n = n_draw_val)
}) %>%
ungroup()
cat("\nProportional sample size:", nrow(sample_prop), "\n")
##
## Proportional sample size: 5000
# Add age weights
age_midpoints <- tibble(
Age = c("20-29", "30-39", "40-49", "50-59", "60-69", ">70"),
age_mid = c(25, 35, 45, 55, 65, 75)
)
survey_data_age <- survey_data %>%
left_join(age_midpoints, by = "Age")
### PPS sampling (of size = 500) within each source, weighted by age_mid
pps_sample <- survey_data_age %>%
group_by(Source) %>%
group_modify(~ slice_sample(.x, n = 500, weight_by = age_mid, replace = FALSE)) %>%
ungroup()
cat("Age-weighted PPS sample size:", nrow(pps_sample), "\n")
## Age-weighted PPS sample size: 1500
### Diagnostics: share of older respondents by Source
diagnostic_table <- pps_sample %>%
mutate(older = Age %in% c("60-69", ">70")) %>%
group_by(Source) %>%
summarise(
n = n(),
older_share = mean(older),
.groups = "drop"
) %>%
arrange(desc(older_share))
print(diagnostic_table)
## # A tibble: 3 × 3
## Source n older_share
## <chr> <int> <dbl>
## 1 Phone 500 0.842
## 2 Cell 500 0.126
## 3 Web 500 0.1
# Phone-based sample indeed consists primarily of older people!
We will use the same simulation-based approach to obtain an empirical SD in lieu of an analytical sample SE.
set.seed(2025)
# Parameters
n <- 5000 # Proportional draw by Source
pps_per_source <- 500 # PPS draws per Source
num_iter <- 500 # Number of simulation iterations (you can have more for better precision)
# Prepare age midpoints (ensure exact matching)
age_midpoints <- tibble(
Age = c("20-29", "30-39", "40-49", "50-59", "60-69", ">70"),
age_mid = c(25, 35, 45, 55, 65, 75)
)
# Append age_mid (and drop rows with missing Age mapping if any)
survey_data_age <- survey_data %>%
left_join(age_midpoints, by = "Age")
if (any(is.na(survey_data_age$age_mid))) {
warning("Some rows had NA age_mid; dropping them for PPS draws.")
survey_data_age <- survey_data_age %>% filter(!is.na(age_mid))
}
# Precompute Source proportions & integer allocations for the proportional draw
source_share <- survey_data %>%
count(Source, name = "N") %>%
mutate(prop = N / sum(N),
n_draw = round(prop * n))
# Rounding so sum(n_draw) == total_prop_n
diff_total <- n - sum(source_share$n_draw)
if (diff_total != 0) {
idx <- order(source_share$prop, decreasing = TRUE)[1:abs(diff_total)]
source_share$n_draw[idx] <- source_share$n_draw[idx] + sign(diff_total)
}
stopifnot(sum(source_share$n_draw) == n)
# Perform estimation on a single combined sample
draw_combined_sample_once <- function(pop_df) {
# 1) First-stage: Proportional draw by Source
prop_part <- pop_df %>%
left_join(source_share %>% select(Source, n_draw), by = "Source") %>%
group_by(Source) %>%
group_modify(~ slice_sample(.x, n = unique(.x$n_draw))) %>%
ungroup()
# 2) Second-stage: PPS draw (age-weighted) within each Source (fixed n = pps_per_source)
# Ensure no NA weights
pps_part <- pop_df %>%
group_by(Source) %>%
group_modify(~ {
this <- .x %>% filter(!is.na(age_mid))
# If available rows fewer than pps_per_source, sample all
n_avail <- nrow(this)
n_take <- min(pps_per_source, n_avail)
slice_sample(this, n = n_take, weight_by = age_mid, replace = FALSE)
}) %>%
ungroup()
combined <- bind_rows(prop_part, pps_part)
# If some Source had fewer than pps_per_source available, combined size may be < total_prop_n + num_sources*pps_per_source
# Compute p_hat for DPP
p_hat <- mean(combined$Party == "DPP")
list(sample = combined, p_hat = p_hat)
}
# Sanity check
one_draw <- draw_combined_sample_once(survey_data_age)
cat("Single-run combined sample size:", nrow(one_draw$sample), "\n")
## Single-run combined sample size: 6500
cat("Single-run p̂ (DPP):", round(one_draw$p_hat, 4), "\n\n")
## Single-run p̂ (DPP): 0.3503
# Repeated simulation to estimate SE
p_hats <- numeric(num_iter)
for (i in seq_len(num_iter)) {
res <- draw_combined_sample_once(survey_data_age)
p_hats[i] <- res$p_hat
if (i %% 50 == 0) cat("Completed iteration:", i, "\n")
}
## Completed iteration: 50
## Completed iteration: 100
## Completed iteration: 150
## Completed iteration: 200
## Completed iteration: 250
## Completed iteration: 300
## Completed iteration: 350
## Completed iteration: 400
## Completed iteration: 450
## Completed iteration: 500
mean_p_hat_sim <- mean(p_hats)
sd_p_hat_sim <- sd(p_hats) # Empirical SD
ci_lower <- mean_p_hat_sim - 1.96 * sd_p_hat_sim
ci_upper <- mean_p_hat_sim + 1.96 * sd_p_hat_sim
cat("Simulation summary (", num_iter, " iterations):\n", sep = "")
## Simulation summary (500 iterations):
cat("Mean of p̂:", round(mean_p_hat_sim, 4), "\n")
## Mean of p̂: 0.3594
cat("Empirical SE (SD of p̂):", round(sd_p_hat_sim, 4), "\n")
## Empirical SE (SD of p̂): 0.0054
cat("Normal-approx 95% CI for mean p̂: [", round(ci_lower,4), ", ", round(ci_upper,4), "]\n\n")
## Normal-approx 95% CI for mean p̂: [ 0.3488 , 0.3699 ]
# Compare to true population proportion
true_p <- mean(survey_data$Party == "DPP")
cat("True population DPP proportion:", round(true_p, 4), "\n")
## True population DPP proportion: 0.3602
# Still very precise
# Plot sampling distribution
# library(ggplot2)
p_df <- data.frame(p_hat = p_hats)
ggplot(p_df, aes(x = p_hat)) +
geom_histogram(binwidth = 0.0025, color = "white", boundary = 0) +
geom_vline(xintercept = true_p, color = "red", linewidth = 1) +
labs(title = paste0("Sampling distribution of p̂ (combined sample, n ≈ ", nrow(one_draw$sample), ")"),
subtitle = paste0("Empirical SD = ", round(sd_p_hat_sim,4), "; mean p̂ = ", round(mean_p_hat_sim,4)),
x = "Sample proportion p̂ (DPP)",
y = "Frequency") +
theme_minimal()
So how “accurate” are the samples produced by these methods, compared to the true population parameter? Let’s use \(\textbf{sampling error}\) and \(\textbf{margin of error}\) (MOE) as our evaluation metrics.
Sampling error is the inevitable difference between a statistic calculated from a sample (\(n\)) and the true parameter value of the population (\(N\)), \(\hat{p} - p\), for the quantity of interest (in our case, the proportion of respondents who voted for the DPP). Sampling error should be distinguished from the SE; the latter is the expected random variability under that design.
The MOE tells us by how many percentage points our sample estimate may differ from the true population parameter: it is the range of values below and above the sample estimate at a given confidence level. The MOE is estimated as:
\[\begin{equation*} MOE = z\cdot\sqrt{\frac{\hat{p}(1 - \hat{p})}{n}}. \end{equation*}\]
# The ground truth: population parameters
true_p <- mean(survey_data$Party == "DPP")
N <- nrow(survey_data)
### Compute p_hat and sampling error
est_sampling_error <- function(sample_df, name) {
if (!exists(deparse(substitute(sample_df))) || is.null(sample_df)) {
return(data.frame(Method = name, n = NA, p_hat = NA, SamplingError = NA))
}
n <- nrow(sample_df)
p_hat <- mean(sample_df$Party == "DPP")
sampling_error <- p_hat - true_p
data.frame(Method = name, n = n, p_hat = p_hat, SamplingError = sampling_error)
}
# Compute sampling error for each sampling method
results_SE <- bind_rows(
est_sampling_error(srs_sample, "Simple Random Sampling"),
est_sampling_error(sys_sample, "Systematic Sampling"),
est_sampling_error(stratified_sample, "Stratified Sampling"),
est_sampling_error(multi_stage_sample, "Multi-Stage PPS Sampling"),
est_sampling_error(pps_sample, "PPS by Source x Age Sampling")
)
print(results_SE)
## Method n p_hat SamplingError
## 1 Simple Random Sampling 1000 0.350 -0.0102
## 2 Systematic Sampling 1000 0.350 -0.0102
## 3 Stratified Sampling 1000 0.352 -0.0082
## 4 Multi-Stage PPS Sampling 1000 0.345 -0.0152
## 5 PPS by Source x Age Sampling 1500 0.352 -0.0082
# All five methods slightly underestimate the true share of DPP voters here (every sampling error is negative), but the differences are quite small.
### Estimate MOE (at 95% CI)
est_MOE <- function(sample_df, name, N, true_p, z = 1.96) {
if (is.null(sample_df)) return(data.frame(Method = name, n = NA, p_hat = NA, SamplingError = NA, SE = NA, MOE = NA))
n <- nrow(sample_df)
p_hat <- mean(sample_df$Party == "DPP")
sampling_error <- p_hat - true_p
# Standard error with finite population correction
SE <- sqrt((p_hat * (1 - p_hat)) / n * ((N - n) / (N - 1)))
# 95% Margin of error
MOE <- z * SE
data.frame(Method = name, n = n, p_hat = p_hat, SamplingError = sampling_error, SE = SE, MOE = MOE) # Also append sampling error and SE into this dataframe
}
# Apply to all five sampling methods
results_MOE <- bind_rows(
est_MOE(srs_sample, "Simple Random Sampling", N, true_p),
est_MOE(sys_sample, "Systematic Sampling", N, true_p),
est_MOE(stratified_sample, "Stratified Sampling", N, true_p),
est_MOE(multi_stage_sample, "Multi-Stage PPS Sampling", N, true_p),
est_MOE(pps_sample, "PPS by Source x Age Sampling", N, true_p)
)
print(results_MOE)
## Method n p_hat SamplingError SE MOE
## 1 Simple Random Sampling 1000 0.350 -0.0102 0.01477866 0.02896617
## 2 Systematic Sampling 1000 0.350 -0.0102 0.01477866 0.02896617
## 3 Stratified Sampling 1000 0.352 -0.0082 0.01479800 0.02900409
## 4 Multi-Stage PPS Sampling 1000 0.345 -0.0152 0.01472904 0.02886892
## 5 PPS by Source x Age Sampling 1500 0.352 -0.0082 0.01195600 0.02343376
# PPS by Source x Age Sampling has the smallest MOE (thanks to its larger n = 1500); the MOEs of the four n = 1000 designs are practically indistinguishable.
Another question to ask is how much more (or less) variance these sampling methods produce relative to simple random sampling of the same size. For this, we turn to a metric called the “Design Effect” (Deff), which compares the variance of a complex sample design to what it would have been under SRS of the same size:
\[\begin{equation*} D_{eff} = \frac{Var(\hat{p}_{design})}{Var(\hat{p}_{SRS})}. \end{equation*}\]
Since \(variance = SE^2\) (which we have already computed), we can equivalently compute:
\[\begin{equation*} D_{eff} = \left(\frac{SE_{design}}{SE_{SRS}}\right)^2. \end{equation*}\]
Hence, if \(D_{eff} > 1\), the complex design is less efficient (larger variance) than SRS of the same size; if \(D_{eff} < 1\), it is more efficient; and if \(D_{eff} = 1\), it performs just like SRS.
Let’s compare the relative performance of these sampling methods.
# Again, using the proportion of DPP voters as our population parameter
true_p <- mean(survey_data$Party == "DPP")
N <- nrow(survey_data)
# Use the results_MOE we obtained earlier
# Compute design effects (Deff) relative to SRS
SE_SRS <- results_MOE %>% filter(Method == "Simple Random Sampling") %>% pull(SE)
results_DE <- results_MOE %>%
mutate(
Deff = (SE / SE_SRS)^2,
`Sampling Error (%)` = round(SamplingError * 100, 2),
`SE` = round(SE * 100, 2),
`MOE 95%CI` = round(MOE * 100, 2),
`Deff` = round(Deff, 3)
) %>%
select(Method, n, `Sampling Error (%)`, `SE`, `MOE 95%CI`, Deff)
print(results_DE)
## Method n Sampling Error (%) SE MOE 95%CI Deff
## 1 Simple Random Sampling 1000 -1.02 1.48 2.90 1.000
## 2 Systematic Sampling 1000 -1.02 1.48 2.90 1.000
## 3 Stratified Sampling 1000 -0.82 1.48 2.90 1.003
## 4 Multi-Stage PPS Sampling 1000 -1.52 1.47 2.89 0.993
## 5 PPS by Source x Age Sampling 1500 -0.82 1.20 2.34 0.654
# Systematic sampling matches SRS (the baseline) exactly, and Stratified Sampling and Multi-Stage PPS Sampling perform nearly identically to it. PPS by Source x Age Sampling has a much smaller Deff, largely because of its larger sample size (n = 1500).
This analysis is related to another measure, called weighting loss (\(L_{w}\)), introduced by Kish (1965) in the context of proportionate stratified sampling:
\[\begin{equation*} L_{w}(\bar{y}) = \frac{\sigma^{2}(w)}{\bar{w}^{2}} = \left(\frac{\sum^{n}_{i=1}w_{i}^{2}}{(\sum^{n}_{i=1}w_{i})^{2}}\cdot n\right) - 1, \end{equation*}\]
which assumes that the weights \(w_{i}\) are unrelated to the survey variable of interest and that there is no clustering.
In short, it approximates the proportional increase in the variance of means and proportions due to unequal weights (hence “weighting loss”) when the above assumptions are met. So if a sampling design gives us a design effect of 1.35, then, by the above formula, the weighting loss (\(L_w\)) is 1.35 - 1 = 0.35. The smaller this statistic, the more efficient (and representative) the sampling design.
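For a concrete check, here is a minimal sketch of the computation on a hypothetical weight vector (the vector \(\textsf{w}\) below is made up for illustration):
# Kish's weighting loss for a hypothetical weight vector
w <- c(rep(1, 800), rep(2.5, 200)) # e.g., 200 cases up-weighted to correct non-response
L_w <- length(w) * sum(w^2) / (sum(w)^2) - 1
L_w # ~0.213: roughly a 21% increase in the variance due to unequal weights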
Random digit dialing (RDD) is a telephone survey method that uses software to generate phone numbers at random, including both listed and unlisted numbers, to create a representative sample of a population for polling. It differs from the use of phone books by selecting from the entire telephone number frame, ensuring that everyone with a phone has an equal chance of being contacted, which is crucial for general population studies.
RDD generates telephone numbers either by randomly creating sequences of digits within established geographic area codes and prefixes, or by using our now-familiar method of systematic sampling: divide the population frame (\(N\)) by the desired sample size (\(K\)) to get a fixed interval \(N/K\), randomly select a starting number within the first interval (say the \(k\)-th phone number on the list), and then pick every subsequent number one interval further down the list (\(k + N/K\), \(k + 2N/K\), and so on).
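Here is a minimal sketch of the first approach; the area-code prefixes and the 8-digit local-number length mirror our toy data and are assumptions for illustration:
# Hypothetical RDD generator: random 8-digit local numbers behind known area codes
rdd_numbers <- paste0(
sample(c("02", "03", "04", "06", "07"), 200, replace = TRUE), # Area codes in our frame
sprintf("%08d", sample.int(1e8, 200) - 1) # Random 8-digit local numbers (0-99,999,999)
)
head(rdd_numbers)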
Let’s experiment with the second method below.
# Filter observations with valid phone numbers in the "Phone" column
valid_phone <- survey_data %>%
filter(!is.na(Phone) & Phone != "")
n_valid <- nrow(valid_phone) # Should be 2523
n_sample <- 200 # Select 200 samples
# Get sampling interval
interval <- n_valid / n_sample # = 12.615
# Randomly choose a start between 1 and interval ---
set.seed(2025)
start <- sample(1:floor(interval), 1)
# Compute exact 200 indices using floor-based sequence (by the same interval)
sample_indices <- floor(start + (0:(n_sample - 1)) * interval)
# Adjust any 0 indices (just in case) and ensure within bounds
sample_indices <- sample_indices[sample_indices > 0 & sample_indices <= n_valid]
# Label the 200-sample dataset
sys_phone_sample <- valid_phone[sample_indices, ]
# Check sample size and inspect
nrow(sys_phone_sample)
## [1] 200
head(sys_phone_sample$Phone)
## [1] "0462067952" "0382510018" "0382688025" "0293218395" "0405531430"
## [6] "0270816571"
Are the 200 selected samples evenly spread across the 2,523 records? Let’s take a look!
summary(sample_indices)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.0 639.8 1268.5 1268.2 1896.2 2525.0
range(sample_indices)
## [1] 12 2525
diff_indices <- diff(sample_indices)
summary(diff_indices) # Average gap should be around 12–13
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.00 12.00 13.00 12.63 13.00 13.00
# Check even spacing visually
plot(sample_indices,
type = "h",
col = "steelblue",
lwd = 2,
main = "Systematic Sampling: 200 Samples from 2,523 Phone Records",
xlab = "Sample Order (1–200)",
ylab = "Index Position in Population Frame")
# Add a line to show expected spacing
abline(a = start, b = interval, col = "red", lty = 2)
# It looks okay.
How does the demographic composition (e.g., Age, City) of these 200 sampled phone records compare against the full population of 2,523 phone records?
library(dplyr)
library(tidyr)
# We first define population and sample frames
phone_pop <- valid_phone # 2,523 valid phone respondents
phone_sample <- sys_phone_sample # The 200 systematically selected respondents
# Select key demographic variables for comparison
var_list <- c("Gender", "Age", "City")
# Compute population proportions
# pivot_longer() reshapes data from a "wide" format to a "long" format
pop_props <- phone_pop %>%
select(all_of(var_list)) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Category") %>%
group_by(Variable, Category) %>%
summarise(Population_Share = n() / nrow(phone_pop), .groups = "drop")
# Compute sample proportions
sample_props <- phone_sample %>%
select(all_of(var_list)) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Category") %>%
group_by(Variable, Category) %>%
summarise(Sample_Share = n() / nrow(phone_sample), .groups = "drop")
# Merge
composition_compare <- full_join(pop_props, sample_props,
by = c("Variable", "Category")) %>%
mutate(
Population_Share = round(Population_Share * 100, 1),
Sample_Share = round(Sample_Share * 100, 1),
Difference = round(Sample_Share - Population_Share, 1)
) %>%
arrange(Variable, desc(Population_Share)) # display the result in descending order
print(composition_compare)
## # A tibble: 14 × 5
## Variable Category Population_Share Sample_Share Difference
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Age 60-69 52 48.5 -3.5
## 2 Age >70 26 26 0
## 3 Age 30-39 7.8 8.5 0.7
## 4 Age 40-49 6.6 7.5 0.9
## 5 Age 20-29 4.2 6.5 2.3
## 6 Age 50-59 3.4 3 -0.4
## 7 City New Taipei 27.4 23 -4.4
## 8 City Taichung 23.4 32 8.6
## 9 City Taipei 18.2 15 -3.2
## 10 City Kaohsiung 17.5 15 -2.5
## 11 City Taoyuan 10 14 4
## 12 City Tainan 3.5 1 -2.5
## 13 Gender female 52.4 53 0.6
## 14 Gender male 47.6 47 -0.6
# The 60-69 and 50-59 cohorts are under-represented, the >70 cohort matches its population share exactly, and the younger cohorts are slightly over-represented. Women (men) are slightly over- (under-)represented.
Even with sophisticated sampling methods, the resulting sample may still not adequately represent the underlying population due to unobserved/unmeasured individual heterogeneity. For example, some people may be reluctant to participate in surveys or decline to be interviewed on sensitive topics. Some segments of the population may be inherently hard to reach. So the polling result can be biased.
To address this problem, we can assign a “weight” to each observation in a sample to correct for imbalances and ensure the sample accurately reflects a known population distribution, compensating for issues like unequal selection probabilities, non-coverage, and non-response. By adjusting weights, researchers can ensure that different groups within a sample contribute appropriately to overall findings, leading to more reliable and representative conclusions about the larger population.
There are two oft-used weighting techniques: the \(\textbf{inverse probability weight}\) proposed by Horvitz and Thompson (1952) and \(\textbf{raking}\) (or iterative proportional fitting, see Kalton and Flores-Cervantes 2003 and Battaglia et al. 2004).
The \(\textbf{inverse probability weight}\) treats each unit’s inclusion probability \(p_i\) as its cluster’s sampling fraction \(n_i/N_i\) (the sampled share of cluster \(i\)). The proper “weight” used to adjust potential sample imbalance is just the inverse of this inclusion probability:
\[\begin{equation*} w_i = \frac{1}{p_i} = \frac{1}{n_i/N_i} = \frac{N_i}{n_i}. \end{equation*}\]
One can then use those weights to adjust the estimate of the quantity of interest, say % DPP voters, for clusters in the sample that may be over- or under-represented due to the sampling design.
### Let's draw a simple random sample of size = 1000
set.seed(2025)
n_sample <- 1000
sample_srs <- survey_data %>%
sample_n(n_sample)
# Get city/county-level population stats (from the full data)
city_pop <- survey_data %>%
count(City, name = "N_city") %>%
mutate(pop_share = N_city / sum(N_city))
# Get city/county-level sample counts (from the sample)
city_sample <- sample_srs %>%
count(City, name = "n_city")
# Merge and compute the Horvitz-Thompson style inverse-probability weights (IPW)
ht_weights <- city_pop %>%
left_join(city_sample, by = "City") %>%
mutate(
n_city = ifelse(is.na(n_city), 0, n_city),
inclusion_prob = n_city / N_city,
weight = ifelse(inclusion_prob > 0, 1 / inclusion_prob, NA)
)
# Append weights to each respondent in the sample
sample_srs <- sample_srs %>%
left_join(ht_weights %>% select(City, weight), by = "City") %>%
mutate(DPP_support = ifelse(Party == "DPP", 1, 0))
# Compute unweighted and weighted estimates
estimates <- sample_srs %>%
summarise(
Unweighted_Prop = mean(DPP_support, na.rm = TRUE),
Weighted_Prop = sum(weight * DPP_support, na.rm = TRUE) / sum(weight, na.rm = TRUE)
)
# Compute the true population proportion as reference
true_prop <- mean(survey_data$Party == "DPP", na.rm = TRUE)
# Display all three estimates together
comparison <- data.frame(
Type = c("True parameter", "Unweighted SRS", "IPW"),
DPP_Proportion = c(true_prop, estimates$Unweighted_Prop, estimates$Weighted_Prop)
)
print(comparison)
## Type DPP_Proportion
## 1 True parameter 0.360200
## 2 Unweighted SRS 0.350000
## 3 IPW 0.349734
# With an SRS (near-equal inclusion probabilities), the IPW estimate is essentially identical to the unweighted estimate; here it is in fact marginally farther from the true parameter.
# The design effect of the IPW-adjusted SRS sample can be calculated as
w <- sample_srs$weight
y <- sample_srs$DPP_support
n <- nrow(sample_srs)
# Remove rows with NA weights (if any)
ok <- !is.na(w)
w <- w[ok]
y <- y[ok]
sum_w <- sum(w)
sum_w2 <- sum(w^2)
# Horvitz-Thompson weighted ratio estimator
p_hat_w <- sum(w * y) / sum_w
# Approximate variance of weighted proportion (common linearization / HT approx)
# Var_hat ≈ sum( w_i^2 * (y_i - p_hat_w)^2 ) / (sum_w)^2
var_hat_w <- sum( w^2 * (y - p_hat_w)^2 ) / (sum_w^2)
SE_w <- sqrt(var_hat_w)
# Kish design effect (the formula is defined in the previous section)
deff_kish <- n * (sum_w2 / (sum_w^2))
# The conventional Variance-based design effect (compare to SRS variance at same n using p_hat_w)
var_srs_at_n <- (p_hat_w * (1 - p_hat_w)) / n
deff_var <- ifelse(var_srs_at_n > 0, var_hat_w / var_srs_at_n, NA_real_) # NA_real_ is R’s internal representation of a missing numeric value of type “double” (real number).
# Results
cat("Weighted estimate p_hat_HT =", round(p_hat_w, 4), "\n")
## Weighted estimate p_hat_HT = 0.3497
cat("Approx SE (weighted) =", round(SE_w, 4), "\n\n")
## Approx SE (weighted) = 0.0151
cat("Kish Deff =", round(deff_kish, 3), " (n * sum(w^2)/(sum(w)^2))\n")
## Kish Deff = 1.002 (n * sum(w^2)/(sum(w)^2))
cat("Variance-based Deff =", round(deff_var, 3), " (Var_weighted / Var_SRS_at_n)\n\n")
## Variance-based Deff = 1.001 (Var_weighted / Var_SRS_at_n)
# Subtracting 1 from deff_kish (or deff_var) gives the weighting loss L_w discussed earlier
Raking (also called iterative proportional fitting or post-stratification by margins) adjusts survey weights so that the marginal distributions of selected variables (whose population distributions are known) in the weighted sample match known population margins. Raking is done iteratively: you adjust the weights to match the margin of attribute \(x_1\), but this can unbalance the marginal distribution of attribute \(x_2\), so you re-weight the sample using \(x_2\), and so on until the margins converge. The good thing is you don’t need the full cross-tabulation (cells); you only need the marginal totals.
The main intuition goes like this: you \(\textit{rake}\) the sample data column by column and row by row, iteratively, until every weighted margin matches its target. For pedagogical purposes, we will use functions from \(\textsf{R}\)’s \(\textsf{anesrake}\) package to rake the \(\textsf{sample_srs}\) data so that this SRS sample matches population margins by Gender, Age, and City/County.
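Before turning to \(\textsf{anesrake}\), here is a minimal by-hand raking sketch on a hypothetical 2 × 2 table (Gender × a two-level age split; all counts and target margins below are made up for illustration):
# Toy iterative proportional fitting on hypothetical cell counts
cells <- matrix(c(200, 300, 150, 350), nrow = 2,
dimnames = list(Gender = c("female", "male"), Age = c("young", "old")))
row_targets <- c(0.51, 0.49) * sum(cells) # Assumed population Gender margins
col_targets <- c(0.40, 0.60) * sum(cells) # Assumed population Age margins
for (i in 1:10) { # Alternate row and column adjustments until the margins converge
cells <- cells * (row_targets / rowSums(cells)) # Rake the Gender margin
cells <- t(t(cells) * (col_targets / colSums(cells))) # Rake the Age margin
}
round(rowSums(cells) / sum(cells), 3) # Now matches c(0.51, 0.49)
round(colSums(cells) / sum(cells), 3) # Now matches c(0.40, 0.60)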
### Require anesrake package
# install.packages("anesrake")
library(anesrake)
# library(dplyr)
# Compute population margins (proportions) for these variables from survey_data (the population)
# Convert raking variables to factor level
survey_data <- survey_data %>%
mutate(
Gender = trimws(as.character(Gender)), # trimws() removes leading and trailing whitespace from character strings
Age = trimws(as.character(Age)),
City = trimws(as.character(City))
)
sample_srs <- sample_srs %>%
mutate(
Gender = trimws(as.character(Gender)),
Age = trimws(as.character(Age)),
City = trimws(as.character(City))
)
# Make sure sample_srs has all categories as factors with same levels as population
sample_srs$Gender <- factor(sample_srs$Gender, levels = unique(survey_data$Gender))
sample_srs$Age <- factor(sample_srs$Age, levels = unique(survey_data$Age))
sample_srs$City <- factor(sample_srs$City, levels = unique(survey_data$City))
# Compute population margins (of raking variables) from survey_data
marg_gender <- survey_data %>%
count(Gender) %>%
mutate(prop = n / sum(n)) %>%
pull(prop, name = Gender) # pull() extracts a single column as a vector
marg_age <- survey_data %>%
count(Age) %>%
mutate(prop = n / sum(n)) %>%
pull(prop, name = Age)
marg_city <- survey_data %>%
count(City) %>%
mutate(prop = n / sum(n)) %>%
pull(prop, name = City)
# Create population margins as a list
margins_list <- list(
Gender = marg_gender,
Age = marg_age,
City = marg_city
)
# Also, create unique case IDs for sample_srs
sample_srs$caseid <- seq_len(nrow(sample_srs))
# Run anesrake
set.seed(2025)
rake_res <- anesrake(
inputter = margins_list,
dataframe = sample_srs,
caseid = sample_srs$caseid,
cap = 5, # Limit weights to 5× base weight for greater flexibility
choosemethod = "total", # Variable choice is determined by the sum of the differences between actual and target values for each prospective weighting variable
maxit = 500, # You can do more if the model is not converged after 500 runs
convcrit = 0.01, # Convergence is achieved at this critical value
force1 = TRUE # Force each target distribution to sum to one
)
## [1] "Raking converged in 3 iterations"
# Append raked weights to SRS sample
sample_srs$rake_weight <- rake_res$weightvec
# Check how well it worked
summary(rake_res)
## $convergence
## [1] "Complete convergence was achieved after 3 iterations"
##
## $base.weights
## [1] "No Base Weights Were Used"
##
## $raking.variables
## [1] "Age"
##
## $weight.summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8933 0.9651 0.9826 1.0000 1.0350 1.1479
##
## $selection.method
## [1] "variable selection conducted using _pctlim_ - discrepancies selected using _total_."
##
## $general.design.effect
## [1] 1.004998
##
## $Gender
## Target Unweighted N Unweighted % Wtd N Wtd % Change in %
## female 0.51088 495 0.495 494.3427 0.4943427 -0.0006572961
## male 0.48912 505 0.505 505.6573 0.5056573 0.0006572961
## Total 1.00000 1000 1.000 1000.0000 1.0000000 0.0013145922
## Resid. Disc. Orig. Disc.
## female 0.01653730 0.01588
## male -0.01653730 -0.01588
## Total 0.03307459 0.03176
##
## $Age
## Target Unweighted N Unweighted % Wtd N Wtd % Change in %
## 40-49 0.25188 261 0.261 251.88 0.25188 -0.00912
## 60-69 0.09728 99 0.099 97.28 0.09728 -0.00172
## 50-59 0.14464 126 0.126 144.64 0.14464 0.01864
## 20-29 0.15100 163 0.163 151.00 0.15100 -0.01200
## 30-39 0.30428 294 0.294 304.28 0.30428 0.01028
## >70 0.05092 57 0.057 50.92 0.05092 -0.00608
## Total 1.00000 1000 1.000 1000.00 1.00000 0.05784
## Resid. Disc. Orig. Disc.
## 40-49 0 -0.00912
## 60-69 0 -0.00172
## 50-59 0 0.01864
## 20-29 0 -0.01200
## 30-39 0 0.01028
## >70 0 -0.00608
## Total 0 0.05784
##
## $City
## Target Unweighted N Unweighted % Wtd N Wtd %
## Taipei 0.18628 187 0.187 188.38980 0.18838980
## Kaohsiung 0.18136 174 0.174 173.71272 0.17371272
## Taoyuan 0.10172 104 0.104 104.52780 0.10452780
## Taichung 0.23116 232 0.232 228.80937 0.22880937
## New Taipei 0.26788 277 0.277 278.45994 0.27845994
## Tainan 0.03160 26 0.026 26.10037 0.02610037
## Total 1.00000 1000 1.000 1000.00000 1.00000000
## Change in % Resid. Disc. Orig. Disc.
## Taipei 0.0013898038 -0.002109804 -0.00072
## Kaohsiung -0.0002872787 0.007647279 0.00736
## Taoyuan 0.0005278039 -0.002807804 -0.00228
## Taichung -0.0031906307 0.002350631 -0.00084
## New Taipei 0.0014599352 -0.010579935 -0.00912
## Tainan 0.0001003665 0.005499633 0.00560
## Total 0.0069558187 0.030995086 0.02592
# Estimate raked and un-raked DPP vote share
# sample_srs <- sample_srs %>% mutate(DPP = ifelse(Party == "DPP", 1, 0))
weighted_est <- sum(sample_srs$rake_weight * sample_srs$DPP_support) / sum(sample_srs$rake_weight)
unweighted_est <- mean(sample_srs$DPP_support)
actual_DPP <- sum(survey_data$Party == "DPP") / nrow(survey_data)
cat("Unweighted estimate:", round(unweighted_est, 4), "\n")
## Unweighted estimate: 0.35
cat("Raked estimate:", round(weighted_est, 4), "\n")
## Raked estimate: 0.3505
cat("Actual DPP vote share:", round(actual_DPP, 4), "\n")
## Actual DPP vote share: 0.3602
# Raked estimate is indeed closer to the true population parameter
We will now compute the Kish design effect and compare it with the conventional variance-based design effect.
# Parameters
n <- nrow(sample_srs)
true_p <- mean(survey_data$Party == "DPP")
# Unweighted (unraked) estimate and SRS SE
p_unw <- mean(sample_srs$DPP_support, na.rm = TRUE)
# SRS SE with finite pop correction
N <- nrow(survey_data)
SE_unw <- sqrt((p_unw * (1 - p_unw)) / n) * sqrt((N - n) / (N - 1))
# Raked (weighted) estimate and approximate variance
w <- sample_srs$rake_weight
# Drop NA weights, if any (none are expected here)
ok <- !is.na(w) & !is.na(sample_srs$DPP_support)
w <- w[ok]; y <- sample_srs$DPP_support[ok]
sum_w <- sum(w)
sum_w2 <- sum(w^2)
# Weighted proportion (ratio estimator)
p_w <- sum(w * y) / sum_w
# Approximate variance for weighted proportion (linearization / HT-style)
# Var_hat ≈ sum( w_i^2 * (y_i - p_w)^2 ) / (sum_w)^2
var_w_hat <- sum( w^2 * (y - p_w)^2 ) / (sum_w^2)
SE_w <- sqrt(var_w_hat)
# Kish design effect (captures weight inequality)
deff_kish <- n * (sum_w2 / (sum_w^2))
# Variance-based design effect relative to SRS variance at same n (using sample unweighted p)
var_srs_at_n_unw <- (p_unw * (1 - p_unw)) / n
deff_var_unw <- ifelse(var_srs_at_n_unw > 0, var_w_hat / var_srs_at_n_unw, NA_real_)
# Design effect calculated using true population p (alternative denominator)
var_srs_at_n_true <- (true_p * (1 - true_p)) / n
deff_var_true <- ifelse(var_srs_at_n_true > 0, var_w_hat / var_srs_at_n_true, NA_real_)
# Summarize and print
res <- data.frame(
Method = c("Unweighted (SRS)", "Raked (weighted)"),
n = c(n, n),
p_hat = c(p_unw, p_w),
SE = c(SE_unw, SE_w),
Kish_deff = c(1, deff_kish),
Deff_vs_SRS_using_unweighted_p = c(1, deff_var_unw),
Deff_vs_SRS_using_true_p = c(1, deff_var_true)
)
# Rounding
res_print <- res %>%
mutate_at(vars(p_hat, SE, Kish_deff, Deff_vs_SRS_using_unweighted_p, Deff_vs_SRS_using_true_p),
~ round(., 4))
print(res_print)
## Method n p_hat SE Kish_deff Deff_vs_SRS_using_unweighted_p
## 1 Unweighted (SRS) 1000 0.3500 0.0148 1.000 1.0000
## 2 Raked (weighted) 1000 0.3505 0.0151 1.005 1.0063
## Deff_vs_SRS_using_true_p
## 1 1.0000
## 2 0.9934
# Raking slightly inflates the variance here (Kish Deff = 1.005), the usual price of unequal weights; relative to the SRS variance evaluated at the true p, the design effect is essentially 1 (0.9934).
cat("\nInterpretation notes:\n")
##
## Interpretation notes:
cat("Kish_deff: effect of unequal weights (n * sum(w^2)/(sum(w)^2)).\n")
## Kish_deff: effect of unequal weights (n * sum(w^2)/(sum(w)^2)).
cat("Deff_vs_SRS_using_unweighted_p: variance-based design effect comparing weighted variance to SRS var computed using the unweighted sample p.\n")
## Deff_vs_SRS_using_unweighted_p: variance-based design effect comparing weighted variance to SRS var computed using the unweighted sample p.
cat("Deff_vs_SRS_using_true_p: alternative variance-based design effect using the true population p in SRS denominator.\n")
## Deff_vs_SRS_using_true_p: alternative variance-based design effect using the true population p in SRS denominator.