library(haven)
setwd("/Users/isaiahmireles/Desktop")
dat <- read_dta("SamplingPrac/cohabiting.dta")
library(tidyverse)
dat <-
dat |> select(unmarriedpartner, married, malesingle, malepop, county, totalpop) |>
mutate(prop_men_alone = malesingle/malepop)
Notice that there are multiple rows per county; this is because the counties occur in clusters.
N <- dat |> nrow()
dat |>
count(county) |>
arrange(county) |>
mutate(pct = n/N) |>
arrange(desc(pct))
As the instructions state, we need a “sample of 100 county level elements” and, “Technically, counties are not individual elements, but clusters. For this exercise, however, we are going to treat them as individual elements” – so I believe we are meant to sample unique county values.
Clearly, some counties are more likely to be chosen than others if we sample dat$county itself, since it contains a varying number of repeated rows per county. By definition, a Simple Random Sample is one in which every element of the population has an equal probability of being selected (and every possible sample of size \(n\) is equally likely).
Meaning, by sampling the unique counties, we give each county an “equal probability of selection”. Sampling rows instead of counties would violate the SRS definition because counties would not have equal selection probabilities.
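To see this concretely, here is a quick simulation (a hypothetical check, not part of the assignment; the helper objects B, row_hits, incl_freq, and rows_per_county are just for this illustration): if we repeatedly draw 100 rows instead of 100 counties, a county's empirical inclusion frequency tracks how many rows it has rather than being the constant \(n/N\) an SRS of counties would give.
# Hypothetical check: repeatedly sample 100 ROWS and record which counties appear.
set.seed(1)
B <- 2000
row_hits <- table(unlist(replicate(
  B,
  unique(dat$county[sample(nrow(dat), 100)]),
  simplify = FALSE
)))
rows_per_county <- table(dat$county)
incl_freq <- as.numeric(row_hits[names(rows_per_county)]) / B
incl_freq[is.na(incl_freq)] <- 0       # counties never selected
cor(incl_freq, as.numeric(rows_per_county)) # strongly positive: more rows, more likely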
set.seed(123)
samp <- sample(unique(dat$county), 100, replace = FALSE)
# length(unique(dat$county))
Above, I have sampled 100 counties without replacement; there are 324 unique counties in total.
sample_dat <- dat |> filter(county %in% samp)
dim(sample_dat)
## [1] 982 7
length(unique(sample_dat$county)) == 100
## [1] TRUE
Next, we will generate \(N\), which is our population size:
N <- length(unique(dat$county)); paste0("Population Size (Unique Counties) : ", N)
## [1] "Population Size (Unique Counties) : 324"
n <- 100; paste0("Sample Size : ", n)
## [1] "Sample Size : 100"
Let's take a brief step back to understand what's going on:
Elements = counties
Population = all U.S. counties + District of Columbia
Sampling unit = counties
Observation = A row in the dataset
Sampling Frame = List of all counties
\[ \pi_i=\frac{n}{N} \]
and,
\[ w_i = \frac{1}{\pi_i} = \frac{N}{n} \]
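With \(N = 324\) counties and \(n = 100\), these come out to:
\[ \pi_i = \frac{100}{324} \approx 0.309, \qquad w_i = \frac{324}{100} = 3.24 \]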
sample_dat <- sample_dat |> mutate(swgt = N/n)
Generate the FPC:
\[ \text{FPC} =\frac{N-n}{N-1} \]
FPC <- (N - n)/(N - 1) # ≈ 0.69; this multiplies the variance, and is only square-rooted when we take the S.E.
Notice that right now our data isn't a summary but rather the observations within each county, so let's summarize per county:
dim(sample_dat)
## [1] 982 8
county_lvl_samp <-
sample_dat |>
group_by(county) |>
summarise(
unmarriedpartner = sum(unmarriedpartner),
married = sum(married),
malesingle = sum(malesingle),
malepop = sum(malepop),
totalpop = sum(totalpop),
prop_men_alone = sum(malesingle) / sum(malepop),
.groups = "drop" #ungroup after grouping by county
) |> mutate(swgt = N/n)
Notice now that we have 100 rows (100 sampled counties)
dim(county_lvl_samp)
## [1] 100 8
head(county_lvl_samp)
Okay, before calculating, let's take a look at the questions we are asking and figure out what we are estimating. Respectively:
\[ \tau_{\text{households in the United States where two adults were living in a cohabiting relationship}} \\ p_{\text{males in the United States were living alone}} \\ \mu_{\text{number of households per county in the United States that were occupied by a married couple}} \\ \text{Parameters} = \{ \tau, p, \mu\} \]
We estimate these parameters with the following estimators:
\[ \text{Estimators} = \{ t, \hat{p}, \bar{x}\} \]
\[ t= \frac{N}{n} \sum_{i \in s} y_i \]
t <- round(sum(county_lvl_samp$unmarriedpartner) * (N/n), 0); t # round to nearest whole value
## [1] 5960025
True_Total <- 5475768
abs_err <- abs(t-True_Total); abs_err
## [1] 484257
round(abs_err/True_Total, 2)
## [1] 0.09
So about 9% relative error: not bad, not great.
Notice that the standard error of \(\hat{\tau}\) is not the same as the standard error of \(\bar{y}\). First, recall:
\[ \hat{\tau} = \frac{N}{n}\sum y_i = N\left(\frac{1}{n}\sum y_i\right) = N\bar{y} \]
Therefore:
\[ \text{S.E.}(\hat{\tau}) = \text{S.E.}(N\bar{y}) = \sqrt{\text{Var}(N\bar{y})} \]
and,
\[ \text{Var}(N\bar{y}) = N^2\,\text{Var}(\bar{y}) \]
and, as we know from our variance estimate of \(\bar{y}\),
\[ \text{Var}(\bar{y}) := \frac{\sigma^2}{n} \approx \frac{s^2}{n} \]
So, applying the FPC as well:
\[ \text{S.E.}(\hat{\tau}) = \sqrt{N^2\,\frac{s^2}{n}\cdot\frac{N-n}{N-1}} \]
Therefore, we can first calculate \(s^2 \approx \sigma^2\):
hist(county_lvl_samp$unmarriedpartner)
s2 <- var(county_lvl_samp$unmarriedpartner); s2
## [1] 1206766322
FPC <- (N-n)/(N-1)
SE_tot <- sqrt((s2/n)*N^2*FPC)
CI_tot <- t + c(-1,1)*2*SE_tot # approximating 1.96 with 2
CI_tot
## [1] 4085422 7834628
plot(
x = t,
y = 1,
xlim = range(c(CI_tot, True_Total)),
pch = 19,
xlab = "Total cohabiting households",
ylab = "",
yaxt = "n",
main = "SRS Estimate with 95% CI and True Value"
)
# Confidence interval
segments(CI_tot[1], 1, CI_tot[2], 1, lwd = 3)
# True value
abline(v = True_Total, col = "red", lwd = 2, lty = 2)
legend(
"topright",
legend = c("Estimate", "95% CI", "True value"),
pch = c(19, NA, NA),
lty = c(NA, 1, 2),
col = c("black", "black", "red"),
lwd = c(NA, 3, 2)
)
\[ \hat{p}= \frac{\sum_{i \in s} y_i}{\sum_{i \in s} x_i} \]
p_hat <- sum(county_lvl_samp$malesingle) /
sum(county_lvl_samp$malepop)
p_hat
## [1] 0.0865937
p_true <- .1172
abs_err <- abs(p_hat - p_true); abs_err
## [1] 0.0306063
round(abs_err/p_true, 2)
## [1] 0.26
We are off by about 26% in relative terms: quite bad.
Notice we need the variance of \(\hat{p}\). Treat
\[ \hat{p}=\frac{\sum y_i}{n} \]
for
\[ y_i =\begin{cases}1, & \text{if male } i \text{ lives alone} \\0, & \text{otherwise}\end{cases} \]
So,
\[ \text{Var}(\hat{p}) =\text{Var}\!\left(\frac{\sum y_i}{n}\right) =\frac{1}{n^2}\text{Var}\!\left(\sum y_i\right) \]
where we assume \(y_i\) is independent of \(y_j\), so the variance simplifies:
\[ =\frac{1}{n^2}\sum\text{Var}(y_i) =\frac{1}{n^2}\,n\,p\,p^c =\frac{1}{n}\,p\,p^c, \qquad p^c = 1-p \]
pc <- 1-p_hat
SE <- sqrt((1/n)*p_hat*pc)
CI_p_hat <- p_hat + c(-1,1) * SE * 2
CI_p_hat
## [1] 0.03034595 0.14284144
\[ \bar{x}= \frac{1}{n} \sum_{i \in s} x_i \]
xbar <- mean(county_lvl_samp$married); xbar
## [1] 180179.2
s2_married <- var(county_lvl_samp$married) # use the variance of married here, not the earlier s2 (unmarriedpartner)
SE <- sqrt((s2_married/n)*FPC)
CI_xbar <- xbar + c(-1,1) * SE * 2
CI_xbar
## [1] 174393.4 185965.1
dat <- read_dta("SamplingPrac/cohabiting.dta")
dat <-
dat |> select(statefips,region, unmarriedpartner, married, malesingle, malepop, county, totalpop) |>
mutate(prop_men_alone = malesingle/malepop)
Suppose we stratify by region:
region_county_frame <- dat |>
group_by(region, county) |>
summarise(
unmarriedpartner = mean(unmarriedpartner),
married = mean(married),
malesingle = mean(malesingle),
malepop = mean(malepop),
totalpop = mean(totalpop),
prop_men_alone = mean(malesingle / malepop),
.groups = "drop"
)
Notice that the number of counties in each region varies:
table(region_county_frame$region)
##
## 1 2 3 4
## 67 118 296 93
set.seed(123)
samp_strat <- region_county_frame |>
group_by(region) |>
slice_sample(n = 25, replace = FALSE) |>
ungroup()
Okay, we take an equal number of counties (25) from each region:
table(samp_strat$region)
##
## 1 2 3 4
## 25 25 25 25
county_frame <- dat |>
group_by(county, region) |>
summarise(
unmarriedpartner = sum(unmarriedpartner),
married = sum(married),
malesingle = sum(malesingle),
malepop = sum(malepop),
totalpop = sum(totalpop),
prop_men_alone = sum(malesingle) / sum(malepop),
.groups = "drop"
)
Nh <- table(county_frame$region)
samp_strat <- samp_strat |>
mutate(
Nh = Nh[as.character(region)],
nh = 25,
swgt = Nh / nh
)
Recall that each stratum has its own FPC (the variance formulas below use the \(1 - n_h/N_h\) form):
samp_strat <- samp_strat |>
mutate(
fpc = (Nh - nh) / (Nh - 1)
)
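For reference, the stratified (expansion) estimator of the total and its estimated variance, which the code below implements with swgt \(= N_h/n_h\), are:
\[ \hat{\tau}_{st} = \sum_h \frac{N_h}{n_h} \sum_{i \in s_h} y_{hi}, \qquad \widehat{\text{Var}}(\hat{\tau}_{st}) = \sum_h N_h^2 \left(1 - \frac{n_h}{N_h}\right) \frac{s_h^2}{n_h} \]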
tau_hat_strat <-
sum(samp_strat$unmarriedpartner * samp_strat$swgt)
# s^2 per group :
library(dplyr)
var_by_region <- samp_strat |>
group_by(region) |>
summarise(
Nh = first(Nh),
nh = first(nh),
s2h = var(unmarriedpartner),
.groups = "drop"
)
Var_tau_strat <- sum(
var_by_region$Nh^2 *
(1 - var_by_region$nh / var_by_region$Nh) *
var_by_region$s2h / var_by_region$nh
)
SE_tau_strat <- sqrt(Var_tau_strat)
CI_tau_strat <- c(
tau_hat_strat - 1.96 * SE_tau_strat,
tau_hat_strat + 1.96 * SE_tau_strat
)
CI_tau_strat
## [1] 824826 1190355
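For the proportion, the code below uses a combined ratio estimator with a linearized (residual-based) variance, where \(y\) = malesingle, \(x\) = malepop, \(\hat{X}\) is the weighted denominator total, and \(e_i = y_i - \hat{p}\,x_i\):
\[ \hat{p}_{st} = \frac{\sum_h \frac{N_h}{n_h}\sum_{i \in s_h} y_{hi}}{\sum_h \frac{N_h}{n_h}\sum_{i \in s_h} x_{hi}}, \qquad \widehat{\text{Var}}(\hat{p}_{st}) \approx \frac{1}{\hat{X}^2}\sum_h N_h^2 \left(1 - \frac{n_h}{N_h}\right)\frac{s_{e,h}^2}{n_h} \]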
p_hat_strat <-
sum(samp_strat$malesingle * samp_strat$swgt) /
sum(samp_strat$malepop * samp_strat$swgt)
# total weighted denominator
X_hat <- sum(samp_strat$malepop * samp_strat$swgt)
# residuals for ratio estimator
samp_strat <- samp_strat |>
mutate(e = malesingle - p_hat_strat * malepop)
var_by_region_p <- samp_strat |>
group_by(region) |>
summarise(
Nh = first(Nh),
nh = first(nh),
s2e = var(e),
.groups = "drop"
)
Var_p_hat_strat <- sum(
var_by_region_p$Nh^2 *
(1 - var_by_region_p$nh / var_by_region_p$Nh) *
var_by_region_p$s2e / var_by_region_p$nh
) / (X_hat^2)
SE_p_hat_strat <- sqrt(Var_p_hat_strat)
CI_p_hat_strat <- c(
p_hat_strat - 1.96 * SE_p_hat_strat,
p_hat_strat + 1.96 * SE_p_hat_strat
)
CI_p_hat_strat
## [1] 0.07852242 0.09619817
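Finally, the stratified mean and its variance estimate used below are:
\[ \bar{x}_{st} = \sum_h \frac{N_h}{N}\,\bar{x}_h, \qquad \widehat{\text{Var}}(\bar{x}_{st}) = \sum_h \left(\frac{N_h}{N}\right)^2 \left(1 - \frac{n_h}{N_h}\right)\frac{s_h^2}{n_h} \]
(The code computes \(\bar{x}_{st}\) as a weighted mean, \(\sum w_i x_i / \sum w_i\); since \(\sum w_i = \sum_h N_h = N\), the two forms agree.)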
mu_hat_strat <-
sum(samp_strat$married * samp_strat$swgt) /
sum(samp_strat$swgt)
var_by_region_mu <- samp_strat |>
group_by(region) |>
summarise(
Nh = first(Nh),
nh = first(nh),
s2h = var(married),
.groups = "drop"
)
N <- sum(var_by_region_mu$Nh)
Var_mu_hat_strat <- sum(
(var_by_region_mu$Nh / N)^2 *
(1 - var_by_region_mu$nh / var_by_region_mu$Nh) *
var_by_region_mu$s2h / var_by_region_mu$nh
)
SE_mu_hat_strat <- sqrt(Var_mu_hat_strat)
CI_mu_hat_strat <- c(
mu_hat_strat - 1.96 * SE_mu_hat_strat,
mu_hat_strat + 1.96 * SE_mu_hat_strat
)
CI_mu_hat_strat
## [1] 14417.64 20259.84
Clearly something has gone wrong: my stratified estimates of the total and the mean are dramatically lower than the SRS estimates. The most likely culprit is that region_county_frame was built with mean() within each (region, county) group, so the values carried into the stratified sample are per-row averages rather than county totals, whereas the SRS frame (county_lvl_samp) used sum(). Per-row averages are roughly an order of magnitude smaller than county totals, which is why the stratified total and mean are deflated while the ratio \(\hat{p}\) (a ratio of two deflated quantities) is largely unaffected. A secondary issue is that the county identifier evidently repeats across states (574 region-county combinations versus 324 unique county values), so grouping without statefips merges distinct counties.
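A minimal sketch of the likely fix, assuming (as in the SRS section) that each county's value should be the sum over its rows; re-drawing samp_strat from this frame and re-running the estimates above should bring them back in line with the SRS results:
# Rebuild the stratification frame from county TOTALS rather than per-row means,
# mirroring county_lvl_samp from the SRS section. Including statefips in the
# grouping avoids merging same-named counties from different states
# (assumption: the county identifier is not unique across states).
region_county_frame <- dat |>
  group_by(region, statefips, county) |>
  summarise(
    unmarriedpartner = sum(unmarriedpartner),
    married          = sum(married),
    malesingle       = sum(malesingle),
    malepop          = sum(malepop),
    totalpop         = sum(totalpop),
    prop_men_alone   = sum(malesingle) / sum(malepop),
    .groups = "drop"
  )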