Topics :

  • Cluster SRS Sample

  • Systematic Sample

  • SRS sample

Research question :

  • What proportion of adult females in the United States are living alone?
library(haven)
setwd("/Users/isaiahmireles/Downloads")
df <- read_dta("cohabiting.dta")

glance :

library(tidyverse)
df <- df |> select(geography, county, statefips, adultwomen, femalesingle)

head(df)
colnames(df)
## [1] "geography"    "county"       "statefips"    "adultwomen"   "femalesingle"

God-view calculation :

\[ p = \frac{\text{femalesingle}}{\text{femalepop}} \]

p <- sum(df$femalesingle) / sum(df$adultwomen)
p # true pct
## [1] 0.1428876

Samp. Methods

library(tidyr)
# generate 2 col & count counties per state + 
df <- df |> 
  separate(geography, into = c("county", "state"), sep = ", ") |>
  group_by(statefips) |> # group by state
  mutate(mi = n()) |> # count counties 
  ungroup() 

SRS w/o replacement

SRS Procedure

set.seed(100) 
N_SRS <- nrow(df)

# SRS
SRS_idx <- sample(1:nrow(df), 50, replace = F) 
SRS_samp <- df[SRS_idx,]
SRS_samp

SRS Calculation

p_SRS <- sum(SRS_samp$femalesingle)/sum(SRS_samp$adultwomen)
pc <- 1 - p_SRS
n_SRS <- nrow(SRS_samp)
FPC_SRS <- (N_SRS - n_SRS)/(N_SRS-1)

# variance est : 
V_p <- ((p_SRS*pc)/(n_SRS- 1))*FPC_SRS
ci_SRS <- p_SRS+c(-1,1)*(2*(sqrt(V_p))) # 0.0485717 0.2508547

Cluster Samp.

\[ \text{Let } a_i \text{ denote the total number of elements in cluster i that possess the characteristic of interest} \]

Cluster Sampling Procedure

set.seed(123)
df
# used for cluster samp. : 
N_cluster <- length(unique(df$state)) # 50 states + 1 District of Columbia
n_cluster <- 5
states_idx <- sample(unique(df$statefips), n_cluster, replace = F)
states_idx
## [1] 34 18 17  4 46
cluster_samp <- df |> filter(statefips %in% states_idx)
cluster_samp

Cluster Calculations

FPC_cluster <- (N_cluster - n_cluster)/(N_cluster-1)
cluster_samp |> distinct(statefips, state)
state_totals <- 
  cluster_samp |>
  group_by(statefips) |>
  summarise(
    Ai = sum(femalesingle),  # adult women living alone (ct)
    Mi = sum(adultwomen),   # adult women (ct)
    .groups = "drop"
  )

#plt
plot(state_totals$Mi, state_totals$Ai)

# pt est
p_hat_cluster <- sum(state_totals$Ai) / sum(state_totals$Mi)
p_hat_cluster
## [1] 0.1426604
# variance est
e_i  <- state_totals$Ai - p_hat_cluster * state_totals$Mi
s_p2 <- sum(e_i^2) / (n_cluster - 1)
s_p2
## [1] 400864041
Mbar <- mean(state_totals$Mi)
V_cluster <- FPC_cluster * s_p2 / (n_cluster * Mbar^2)
se_phat  <- sqrt(V_cluster)

se_phat
## [1] 0.003411777
ci_cluster <- p_hat_cluster + c(-1,1)*se_phat*2
ci_cluster
## [1] 0.1358368 0.1494839
grep("^ci", ls(), value = T)
## [1] "ci_cluster" "ci_SRS"
data.frame(ci_SRS, ci_cluster) |> t()
##                  [,1]      [,2]
## ci_SRS     0.04228769 0.2395527
## ci_cluster 0.13583683 0.1494839
cat(paste("Truth : ",p))
## Truth :  0.14288760249612

Systematic Sample

So we are told : “Draw a sample of 50 counties” – we must therefore briefly calculate \(k\) to use a \(\text{1 in k Systematic Sample}\)

Systematic Sampling Procedure

N_SRS <- nrow(df)
k <- floor(N_SRS / n_SRS)

# random start should be between 1 and k (not 1 and N)
random_start <- sample(1:k, 1)

sys_idx <- ((random_start - 1) + (0:(n_SRS - 1)) * k) %% N_SRS + 1

sys_samp <- df[sys_idx, ]

Systematic Sampling Calculation

n_sys <- nrow(sys_samp)
N <- N_SRS
p_sys <- sum(sys_samp$femalesingle) / sum(sys_samp$adultwomen)
q_sys <- 1-p_sys
FPC_sys <- FPC_SRS
V_sys <- FPC_sys*((p_sys*q_sys)/(n_sys-1))
ci_sys <- p_sys+c(-1,1)*2*sqrt(V_sys)

Comparison

data.frame(ci_cluster, ci_SRS, ci_sys) |> t()
##                  [,1]      [,2]
## ci_cluster 0.13583683 0.1494839
## ci_SRS     0.04228769 0.2395527
## ci_sys     0.04541401 0.2452193
grep("V_", ls(), value = T)
## [1] "V_cluster" "V_p"       "V_sys"