Lab 6: Inference for Categorical Data
(Ultra‑Light)
Sachi Kapoor
October 19, 2025
Getting Started
no_helmet <- yrbss %>% filter(helmet_12m == "never")
nrow(no_helmet); invisible(gc())
## [1] 6977
Exercise 1 — Counts
yrbss %>%
count(text_while_driving_30d, name = "count") %>%
arrange(as.numeric(as.character(text_while_driving_30d)))
## # A tibble: 9 × 2
## text_while_driving_30d count
## <chr> <int>
## 1 0 4792
## 2 30 827
## 3 1-2 925
## 4 10-19 373
## 5 20-29 298
## 6 3-5 493
## 7 6-9 311
## 8 did not drive 4646
## 9 <NA> 918
Exercise 2 — Proportion + CI
no_helmet <- no_helmet %>%
mutate(text_ind = ifelse(text_while_driving_30d == "30", "yes", "no"))
prop_everyday_nohelmet <- no_helmet %>%
summarise(p_hat = mean(text_ind == "yes", na.rm = TRUE),
n = sum(!is.na(text_ind)))
prop_everyday_nohelmet
## # A tibble: 1 × 2
## p_hat n
## <dbl> <int>
## 1 0.0712 6503
no_helmet_bin <- no_helmet %>%
mutate(text_ind = factor(text_ind, levels = c("no","yes"))) %>%
filter(!is.na(text_ind)) %>%
transmute(text_ind_num = as.integer(text_ind == "yes"))
ci_ex2 <- no_helmet_bin %>%
specify(response = text_ind_num) %>%
generate(reps = 400, type = "bootstrap") %>%
calculate(stat = "mean") %>%
get_ci(level = 0.95)
ci_ex2; rm(no_helmet_bin); invisible(gc())
## # A tibble: 1 × 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.0655 0.0777
Exercise 3 — ME (normal approx)
z <- 1.96
p_hat <- prop_everyday_nohelmet$p_hat
n_hat <- prop_everyday_nohelmet$n
ME_ex3 <- z * sqrt(p_hat * (1 - p_hat) / n_hat)
data.frame(p_hat = p_hat, n = n_hat, ME_approx_95 = ME_ex3)
## p_hat n ME_approx_95
## 1 0.07119791 6503 0.006250207
Exercise 4 — Two more CIs
yrbss <- yrbss %>%
mutate(active7_ind = ifelse(physically_active_7d == 7, "yes", "no"))
tv3plus_ind <- str_detect(yrbss$hours_tv_per_school_day, "^[3-9]") |
str_detect(yrbss$hours_tv_per_school_day, "or more")
yrbss <- yrbss %>% mutate(tv3plus_ind = ifelse(tv3plus_ind, "yes", "no"))
yrbss_active <- yrbss %>%
mutate(active7_ind = factor(active7_ind, levels = c("no","yes"))) %>%
filter(!is.na(active7_ind)) %>%
transmute(active7_num = as.integer(active7_ind == "yes"))
ci_active7 <- yrbss_active %>%
specify(response = active7_num) %>%
generate(reps = 400, type = "bootstrap") %>%
calculate(stat = "mean") %>%
get_ci(level = 0.95)
phat_active7 <- mean(yrbss_active$active7_num)
n_active7 <- nrow(yrbss_active)
ME_active7 <- 1.96 * sqrt(phat_active7 * (1 - phat_active7) / n_active7)
list(ci = ci_active7, phat = phat_active7, n = n_active7, ME_95 = ME_active7)
## $ci
## # A tibble: 1 × 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.265 0.280
##
## $phat
## [1] 0.2721262
##
## $n
## [1] 13310
##
## $ME_95
## [1] 0.007561018
rm(yrbss_active); invisible(gc())
yrbss_tv <- yrbss %>%
mutate(tv3plus_ind = factor(tv3plus_ind, levels = c("no","yes"))) %>%
filter(!is.na(tv3plus_ind)) %>%
transmute(tv3_num = as.integer(tv3plus_ind == "yes"))
ci_tv3 <- yrbss_tv %>%
specify(response = tv3_num) %>%
generate(reps = 400, type = "bootstrap") %>%
calculate(stat = "mean") %>%
get_ci(level = 0.95)
phat_tv3 <- mean(yrbss_tv$tv3_num)
n_tv3 <- nrow(yrbss_tv)
ME_tv3 <- 1.96 * sqrt(phat_tv3 * (1 - phat_tv3) / n_tv3)
list(ci = ci_tv3, phat = phat_tv3, n = n_tv3, ME_95 = ME_tv3)
## $ci
## # A tibble: 1 × 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.353 0.369
##
## $phat
## [1] 0.3610419
##
## $n
## [1] 13245
##
## $ME_95
## [1] 0.008179845
rm(yrbss_tv); invisible(gc())
Exercise 5 — ME vs p (n = 1000)
n <- 1000
p <- seq(0, 1, by = 0.01)
me <- 2 * sqrt(p * (1 - p) / n)
ggplot(data.frame(p, me), aes(p, me)) +
geom_line() +
labs(x = "Population Proportion (p)", y = "Approx. 95% ME")

Exercises 6–8 — Conceptual
- For fixed n, center at p; SD $\sqrt{p(1-p)/n}$; widest at p=0.5.
- Larger n → smaller SD.
- Variance largest near p=0.5.
Exercise 9 — Sleep 10+ vs daily strength
# Exercise 9 — Sleep 10+ vs daily strength: one-sided test ("greater") + 95% CI
# Build comparison data (binary factors, no NAs)
yrbss_cmp <- yrbss %>%
dplyr::mutate(
sleep10plus = ifelse(stringr::str_detect(school_night_hours_sleep, "^10"), "yes", "no"),
strength_daily = ifelse(strength_training_7d == 7, "yes", "no"),
sleep10plus = factor(sleep10plus, levels = c("no","yes")),
strength_daily = factor(strength_daily, levels = c("no","yes"))
) %>%
dplyr::filter(!is.na(sleep10plus), !is.na(strength_daily))
# Quick 2x2 table
tab_ex9 <- yrbss_cmp %>% dplyr::count(sleep10plus, strength_daily)
tab_ex9
## # A tibble: 4 × 3
## sleep10plus strength_daily n
## <fct> <fct> <int>
## 1 no no 9949
## 2 no yes 1958
## 3 yes no 228
## 4 yes yes 84
# Observed difference in proportions: p_yes(sleep10plus) - p_yes(not)
obs_diff <- yrbss_cmp %>%
infer::specify(response = strength_daily, explanatory = sleep10plus, success = "yes") %>%
infer::calculate(stat = "diff in props", order = c("yes","no"))
set.seed(606)
null_dist <- yrbss_cmp %>%
infer::specify(response = strength_daily, explanatory = sleep10plus, success = "yes") %>%
infer::hypothesize(null = "independence") %>%
infer::generate(reps = 800, type = "permute") %>% # light on memory
infer::calculate(stat = "diff in props", order = c("yes","no"))
# P-value (use the API that your infer version supports)
# Preferred:
p_val <- infer::get_p_value(null_dist, obs_stat = obs_diff, direction = "greater")
# Fallback (works on all versions):
p_val_manual <- mean(null_dist$stat >= obs_diff$stat)
list(
observed_diff = obs_diff,
p_value_one_sided_greater = p_val,
p_value_manual = p_val_manual
)
## $observed_diff
## Response: strength_daily (factor)
## Explanatory: sleep10plus (factor)
## # A tibble: 1 × 1
## stat
## <dbl>
## 1 0.105
##
## $p_value_one_sided_greater
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0
##
## $p_value_manual
## [1] 0
boot_ci <- yrbss_cmp %>%
specify(response = strength_daily, explanatory = sleep10plus, success = "yes") %>%
generate(reps = 400, type = "bootstrap") %>%
calculate(stat = "diff in props", order = c("yes","no")) %>%
get_ci(level = 0.95)
boot_ci
## # A tibble: 1 × 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.0557 0.157
rm(yrbss_cmp); invisible(gc())
Exercise 10 — Type I error
"At alpha = 0.05, the Type I error rate is 0.05."
## [1] "At alpha = 0.05, the Type I error rate is 0.05."
Exercise 11 — n for ME ≤ 1% (95%)
n_required <- (1.96^2 * 0.25) / (0.01^2)
ceiling(n_required)
## [1] 9604
Session Info (hidden)