library(haven)
setwd("/Users/isaiahmireles/Downloads")
df <- read_dta("UC_Admissions.dta")
df
colnames(df)
## [1] "personid" "countyid" "county" "hsid" "highschool"
## [6] "public" "city" "apply" "admit" "enroll"
lapply(as.list(df), class)
## $personid
## [1] "numeric"
##
## $countyid
## [1] "numeric"
##
## $county
## [1] "character"
##
## $hsid
## [1] "numeric"
##
## $highschool
## [1] "character"
##
## $public
## [1] "haven_labelled" "vctrs_vctr" "double"
##
## $city
## [1] "character"
##
## $apply
## [1] "haven_labelled" "vctrs_vctr" "double"
##
## $admit
## [1] "haven_labelled" "vctrs_vctr" "double"
##
## $enroll
## [1] "haven_labelled" "vctrs_vctr" "double"
library(dplyr)
# Number of accepted applicants who enroll
sum(df$enroll)
## [1] 39707
# Total number of applicants in CA
nrow(df)
## [1] 130981
# Total number of high schools with applicants to the UC in CA
length(unique(df$hsid))
## [1] 1639
\[ y_i =\begin{cases}1 & \text{if student } i \text{ applied, was admitted, and enrolled} \\0 & \text{otherwise}\end{cases} \]
So :
\[ \text{Applied} \cap \text{Admitted} \cap \text{Enrolled} \]
library(dplyr)
df <- df |> select(highschool, hsid, apply, admit, enroll) # using hsid cuz assumes " had at least 1 applicant"
df$hsid <- df$hsid |> as.character(df$hsid)
length(unique(df$hsid)) < nrow(df) # therefore, we have repeat HS observation
## [1] TRUE
# df |> select(apply, admit, enroll) |> as.list() |> lapply(unique)
god_view <- df |> filter(apply==1 & admit==1 & enroll==1) |> nrow()
all_of_em <- df |> nrow()
cat(paste("As we can see, ",god_view," ppl from Cali HS 2022 were admited and went to a UC campus"))
## As we can see, 39707 ppl from Cali HS 2022 were admited and went to a UC campus
Draw a sample of 20 high schools proportionate to number of applications
v <- df |> group_by(hsid) |> summarize(tot_enroll = sum(enroll)) |> select(tot_enroll) |> unlist()
v2 <- df |> group_by(hsid) |> summarize(tot_apply = sum(apply)) |> select(tot_apply) |> unlist()
plot(v,v2) # appears linear
cor(v, v2)
## [1] 0.9512413
set.seed(14)
n <- 20
auxilary <- df |>
group_by(hsid) |>
summarize(mi = sum(apply))
M <- sum(auxilary$mi)
sigmai <- auxilary$mi/M
names(sigmai) <- auxilary$hsid
clusters <- sample(auxilary$hsid, size = n, replace = TRUE, prob = sigmai) # replace = TRUE!!
# cluster totals
cluster_totals <- df |>
group_by(hsid) |>
summarize(yi = sum(enroll))
y_draw <- cluster_totals$yi[match(clusters, cluster_totals$hsid)]
m_draw <- auxilary$mi[match(clusters, auxilary$hsid)]
yi_bar <- y_draw/m_draw
T_pps <- (M/n)*sum(yi_bar)
mu_pps <- mean(yi_bar)
V_pps <- (M^2)/(n*(n-1))*sum((yi_bar-mu_pps)^2)
SE_pps <- sqrt(V_pps)
SE_pps
## [1] 2106.221
ci_pps <- T_pps+SE_pps*c(-1,1)*2 # est. 1.96 with "2"
ci_pps
## [1] 34104.06 42528.94
Draw a sample of at maximum 1000 applicants from 100 high schools from California in 2 stages
set.seed(14)
clusters <- sample(unique(df$hsid), replace = F, 100)
n <- length(clusters)
cluster_samp <- df[df$hsid %in% clusters, ]
SRS_samp <- cluster_samp |> group_by(hsid) |> slice_sample(n=10) |> ungroup()
N <- length(unique(df$hsid))
n <- 100 #HS
Mi <- df |> group_by(hsid) |> count(hsid) |> rename(Mi = n)
Mi[Mi$hsid %in% clusters,]
# numb elements in cluster i
mi <- SRS_samp |> count(hsid) |> rename(mi = n)
mi
mi |>
left_join(Mi, by = "hsid")
M <- sum(Mi$Mi)
yi <- SRS_samp |>
group_by(hsid) |>
summarize(
mi = n(),
yi_bar = mean(enroll),
.groups = "drop"
)
Mi_clusters <- Mi[Mi$hsid %in% clusters,]
mu_hat <- (N/M)*((sum((Mi_clusters$Mi*yi$yi_bar)))/n)
T_ts <- mu_hat*M
FPC <- (1-n/N)
M_bar <- M/N
sb2 <- sum((Mi_clusters$Mi*yi$yi_bar - M_bar*mu_hat)^2)/(n-1)
betweem <- sb2 * FPC * (N^2)/n
si2 <- SRS_samp |>
group_by(hsid) |>
summarize(
mi = n(),
ybar_i = mean(enroll),
si2 = var(enroll), # automatically divides by (mi - 1)
.groups = "drop"
)
cluster_stats <- SRS_samp |>
group_by(hsid) |>
summarize(
mi = n(),
ybar_i = mean(enroll),
si2 = var(enroll),
.groups = "drop"
) |>
left_join(
df |> count(hsid, name = "Mi"),
by = "hsid"
)
within <- (N / n) *
sum(
cluster_stats$Mi^2 *
(1 - cluster_stats$mi / cluster_stats$Mi) *
(cluster_stats$si2 / cluster_stats$mi)
)
V_hat <- betweem + within
SE_ts <- sqrt(V_hat)
ci_ts <- T_ts + c(-1,1)*2*SE_ts
results <- tibble::tibble(
method = c("Two-Stage SRS", "PPS"),
estimate = c(T_ts, T_pps),
lower = c(ci_ts[1], ci_pps[1]),
upper = c(ci_ts[2], ci_pps[2])
)
library(ggplot2)
ggplot(results, aes(x = method, y = estimate)) +
geom_point(size = 3) +
geom_errorbar(
aes(ymin = lower, ymax = upper),
width = 0.15
) +
geom_hline(
yintercept = god_view,
linetype = "dashed",
color = "red"
) +
labs(
title = "Comparison of Two-Stage SRS and PPS Estimates",
y = "Estimated Total Enrollment",
x = ""
) +
theme_minimal()