Cluster SRS Sample
Systematic Sample
SRS sample
library(haven)
setwd("/Users/isaiahmireles/Downloads")
df <- read_dta("cohabiting.dta")
library(tidyverse)
df <- df |> select(geography, county, statefips, adultwomen, femalesingle)
head(df)
colnames(df)
## [1] "geography" "county" "statefips" "adultwomen" "femalesingle"
\[ p = \frac{\text{femalesingle}}{\text{femalepop}} \]
p <- sum(df$femalesingle) / sum(df$adultwomen)
p # true pct
## [1] 0.1428876
library(tidyr)
# generate 2 col & count counties per state +
df <- df |>
separate(geography, into = c("county", "state"), sep = ", ") |>
group_by(statefips) |> # group by state
mutate(mi = n()) |> # count counties
ungroup()
set.seed(100)
N_SRS <- nrow(df)
# SRS
SRS_idx <- sample(1:nrow(df), 50, replace = F)
SRS_samp <- df[SRS_idx,]
SRS_samp
p_SRS <- sum(SRS_samp$femalesingle)/sum(SRS_samp$adultwomen)
pc <- 1 - p_SRS
n_SRS <- nrow(SRS_samp)
FPC_SRS <- (N_SRS - n_SRS)/(N_SRS-1)
# variance est :
V_p <- ((p_SRS*pc)/(n_SRS- 1))*FPC_SRS
ci_SRS <- p_SRS+c(-1,1)*(2*(sqrt(V_p))) # 0.0485717 0.2508547
\[ \text{Let } a_i \text{ denote the total number of elements in cluster i that possess the characteristic of interest} \]
set.seed(123)
df
# used for cluster samp. :
N_cluster <- length(unique(df$state)) # 50 states + 1 District of Columbia
n_cluster <- 5
states_idx <- sample(unique(df$statefips), n_cluster, replace = F)
states_idx
## [1] 34 18 17 4 46
cluster_samp <- df |> filter(statefips %in% states_idx)
cluster_samp
FPC_cluster <- (N_cluster - n_cluster)/(N_cluster-1)
cluster_samp |> distinct(statefips, state)
state_totals <-
cluster_samp |>
group_by(statefips) |>
summarise(
Ai = sum(femalesingle), # adult women living alone (ct)
Mi = sum(adultwomen), # adult women (ct)
.groups = "drop"
)
#plt
plot(state_totals$Mi, state_totals$Ai)
# pt est
p_hat_cluster <- sum(state_totals$Ai) / sum(state_totals$Mi)
p_hat_cluster
## [1] 0.1426604
# variance est
e_i <- state_totals$Ai - p_hat_cluster * state_totals$Mi
s_p2 <- sum(e_i^2) / (n_cluster - 1)
s_p2
## [1] 400864041
Mbar <- mean(state_totals$Mi)
V_cluster <- FPC_cluster * s_p2 / (n_cluster * Mbar^2)
se_phat <- sqrt(V_cluster)
se_phat
## [1] 0.003411777
ci_cluster <- p_hat_cluster + c(-1,1)*se_phat*2
ci_cluster
## [1] 0.1358368 0.1494839
grep("^ci", ls(), value = T)
## [1] "ci_cluster" "ci_SRS"
data.frame(ci_SRS, ci_cluster) |> t()
## [,1] [,2]
## ci_SRS 0.04228769 0.2395527
## ci_cluster 0.13583683 0.1494839
cat(paste("Truth : ",p))
## Truth : 0.14288760249612
So we are told : “Draw a sample of 50 counties” – we must therefore briefly calculate \(k\) to use a \(\text{1 in k Systematic Sample}\)
N_SRS <- nrow(df)
k <- floor(N_SRS / n_SRS)
# random start should be between 1 and k (not 1 and N)
random_start <- sample(1:k, 1)
sys_idx <- ((random_start - 1) + (0:(n_SRS - 1)) * k) %% N_SRS + 1
sys_samp <- df[sys_idx, ]
n_sys <- nrow(sys_samp)
N <- N_SRS
p_sys <- sum(sys_samp$femalesingle) / sum(sys_samp$adultwomen)
q_sys <- 1-p_sys
FPC_sys <- FPC_SRS
V_sys <- FPC_sys*((p_sys*q_sys)/(n_sys-1))
ci_sys <- p_sys+c(-1,1)*2*sqrt(V_sys)
data.frame(ci_cluster, ci_SRS, ci_sys) |> t()
## [,1] [,2]
## ci_cluster 0.13583683 0.1494839
## ci_SRS 0.04228769 0.2395527
## ci_sys 0.04541401 0.2452193
grep("V_", ls(), value = T)
## [1] "V_cluster" "V_p" "V_sys"