library(haven)
setwd("/Users/isaiahmireles/Downloads")
df <- read_dta("UC_Admissions.dta")
df
colnames(df)
##  [1] "personid"   "countyid"   "county"     "hsid"       "highschool"
##  [6] "public"     "city"       "apply"      "admit"      "enroll"
lapply(as.list(df), class)
## $personid
## [1] "numeric"
## 
## $countyid
## [1] "numeric"
## 
## $county
## [1] "character"
## 
## $hsid
## [1] "numeric"
## 
## $highschool
## [1] "character"
## 
## $public
## [1] "haven_labelled" "vctrs_vctr"     "double"        
## 
## $city
## [1] "character"
## 
## $apply
## [1] "haven_labelled" "vctrs_vctr"     "double"        
## 
## $admit
## [1] "haven_labelled" "vctrs_vctr"     "double"        
## 
## $enroll
## [1] "haven_labelled" "vctrs_vctr"     "double"

Objective :

  • Estimate the total number of California high school students who applied, were admitted and enrolled at a UC campus in 2022

Variables :

library(dplyr)
# Number of accepted applicants who enroll
sum(df$enroll)
## [1] 39707
# Total number of applicants in CA
nrow(df)
## [1] 130981
# Total number of high schools with applicants to the UC in CA
length(unique(df$hsid))
## [1] 1639

God View :

\[ y_i =\begin{cases}1 & \text{if student } i \text{ applied, was admitted, and enrolled} \\0 & \text{otherwise}\end{cases} \]

So :

\[ \text{Applied} \cap \text{Admitted} \cap \text{Enrolled} \]

library(dplyr)
df <- df |> select(highschool, hsid, apply, admit, enroll) # using hsid cuz assumes " had at least 1 applicant"
df$hsid <- df$hsid |> as.character(df$hsid)

length(unique(df$hsid)) < nrow(df) # therefore, we have repeat HS observation
## [1] TRUE
# df |> select(apply, admit, enroll) |> as.list() |> lapply(unique)

god_view <- df |> filter(apply==1 & admit==1 & enroll==1) |> nrow()
all_of_em <- df |> nrow()
cat(paste("As we can see, ",god_view," ppl from Cali HS 2022 were admited and went to a UC campus"))
## As we can see,  39707  ppl from Cali HS 2022 were admited and went to a UC campus

PPS Cluster Sample :

Draw a sample of 20 high schools proportionate to number of applications

Verify Assumptions :

v <- df |> group_by(hsid) |> summarize(tot_enroll = sum(enroll)) |> select(tot_enroll) |> unlist()
v2 <- df |> group_by(hsid) |> summarize(tot_apply = sum(apply)) |> select(tot_apply) |> unlist()

plot(v,v2) # appears linear

cor(v, v2)
## [1] 0.9512413

Draw PPS Cluster Samp.

set.seed(14)

n <- 20

auxilary <- df |> 
  group_by(hsid) |> 
  summarize(mi = sum(apply))

M <- sum(auxilary$mi)

sigmai <- auxilary$mi/M
names(sigmai) <- auxilary$hsid

clusters <- sample(auxilary$hsid, size = n, replace = TRUE, prob = sigmai) # replace = TRUE!!

est. \(\hat{\tau}_{\text{pps}}\) and genrate CI-95%

# cluster totals
cluster_totals <- df |>
  group_by(hsid) |>
  summarize(yi = sum(enroll))

y_draw <- cluster_totals$yi[match(clusters, cluster_totals$hsid)]
m_draw <- auxilary$mi[match(clusters, auxilary$hsid)]

yi_bar <- y_draw/m_draw

T_pps <- (M/n)*sum(yi_bar)

mu_pps <- mean(yi_bar)

V_pps <- (M^2)/(n*(n-1))*sum((yi_bar-mu_pps)^2)

SE_pps <- sqrt(V_pps)
SE_pps
## [1] 2106.221
ci_pps <- T_pps+SE_pps*c(-1,1)*2 # est. 1.96 with "2"
ci_pps
## [1] 34104.06 42528.94

2 Stage Sample :

Draw a sample of at maximum 1000 applicants from 100 high schools from California in 2 stages

Draw 2 Stage Sample :

set.seed(14)
clusters <- sample(unique(df$hsid), replace = F, 100)
n <- length(clusters)
cluster_samp <- df[df$hsid %in% clusters, ]
SRS_samp <- cluster_samp |> group_by(hsid) |> slice_sample(n=10) |> ungroup()

est. \(\hat{\tau}_{\text{ts}}\) and genrerate CI-95%

N <- length(unique(df$hsid))
n <- 100 #HS
Mi <- df |> group_by(hsid) |> count(hsid) |> rename(Mi = n)
Mi[Mi$hsid %in% clusters,]
# numb elements in cluster i
mi <- SRS_samp |> count(hsid) |> rename(mi = n)
mi
mi |>
  left_join(Mi, by = "hsid")
M <- sum(Mi$Mi)


yi <- SRS_samp |>
  group_by(hsid) |>
  summarize(
    mi = n(),
    yi_bar = mean(enroll),
    .groups = "drop"
  )

Mi_clusters <- Mi[Mi$hsid %in% clusters,]
mu_hat <- (N/M)*((sum((Mi_clusters$Mi*yi$yi_bar)))/n)
T_ts <- mu_hat*M

FPC <- (1-n/N)
M_bar <- M/N
sb2 <- sum((Mi_clusters$Mi*yi$yi_bar - M_bar*mu_hat)^2)/(n-1)
betweem <- sb2 * FPC * (N^2)/n

si2 <- SRS_samp |>
  group_by(hsid) |>
  summarize(
    mi = n(),
    ybar_i = mean(enroll),
    si2 = var(enroll),   # automatically divides by (mi - 1)
    .groups = "drop"
  )

cluster_stats <- SRS_samp |>
  group_by(hsid) |>
  summarize(
    mi = n(),
    ybar_i = mean(enroll),
    si2 = var(enroll),
    .groups = "drop"
  ) |>
  left_join(
    df |> count(hsid, name = "Mi"),
    by = "hsid"
  )

within <- (N / n) *
  sum(
    cluster_stats$Mi^2 *
    (1 - cluster_stats$mi / cluster_stats$Mi) *
    (cluster_stats$si2 / cluster_stats$mi)
  )

V_hat <- betweem + within
SE_ts <- sqrt(V_hat)

ci_ts <- T_ts + c(-1,1)*2*SE_ts

Comparison :

results <- tibble::tibble(
  method = c("Two-Stage SRS", "PPS"),
  estimate = c(T_ts, T_pps),
  lower = c(ci_ts[1], ci_pps[1]),
  upper = c(ci_ts[2], ci_pps[2])
)
library(ggplot2)

ggplot(results, aes(x = method, y = estimate)) +
  geom_point(size = 3) +
  geom_errorbar(
    aes(ymin = lower, ymax = upper),
    width = 0.15
  ) +
  geom_hline(
    yintercept = god_view,
    linetype = "dashed",
    color = "red"
  ) +
  labs(
    title = "Comparison of Two-Stage SRS and PPS Estimates",
    y = "Estimated Total Enrollment",
    x = ""
  ) +
  theme_minimal()