Lab 6: Inference for Categorical Data (Ultra‑Light)

Sachi Kapoor

October 19, 2025

Getting Started

no_helmet <- yrbss %>% filter(helmet_12m == "never")
nrow(no_helmet); invisible(gc())
## [1] 6977

Exercise 1 — Counts

yrbss %>%
  count(text_while_driving_30d, name = "count") %>%
  arrange(as.numeric(as.character(text_while_driving_30d)))
## # A tibble: 9 × 2
##   text_while_driving_30d count
##   <chr>                  <int>
## 1 0                       4792
## 2 30                       827
## 3 1-2                      925
## 4 10-19                    373
## 5 20-29                    298
## 6 3-5                      493
## 7 6-9                      311
## 8 did not drive           4646
## 9 <NA>                     918

Exercise 2 — Proportion + CI

no_helmet <- no_helmet %>%
  mutate(text_ind = ifelse(text_while_driving_30d == "30", "yes", "no"))
prop_everyday_nohelmet <- no_helmet %>%
  summarise(p_hat = mean(text_ind == "yes", na.rm = TRUE),
            n = sum(!is.na(text_ind)))
prop_everyday_nohelmet
## # A tibble: 1 × 2
##    p_hat     n
##    <dbl> <int>
## 1 0.0712  6503
no_helmet_bin <- no_helmet %>%
  mutate(text_ind = factor(text_ind, levels = c("no","yes"))) %>%
  filter(!is.na(text_ind)) %>%
  transmute(text_ind_num = as.integer(text_ind == "yes"))

ci_ex2 <- no_helmet_bin %>%
  specify(response = text_ind_num) %>%
  generate(reps = 400, type = "bootstrap") %>%
  calculate(stat = "mean") %>%
  get_ci(level = 0.95)
ci_ex2; rm(no_helmet_bin); invisible(gc())
## # A tibble: 1 × 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1   0.0655   0.0777

Exercise 3 — ME (normal approx)

z <- 1.96
p_hat <- prop_everyday_nohelmet$p_hat
n_hat <- prop_everyday_nohelmet$n
ME_ex3 <- z * sqrt(p_hat * (1 - p_hat) / n_hat)
data.frame(p_hat = p_hat, n = n_hat, ME_approx_95 = ME_ex3)
##        p_hat    n ME_approx_95
## 1 0.07119791 6503  0.006250207

Exercise 4 — Two more CIs

yrbss <- yrbss %>%
  mutate(active7_ind = ifelse(physically_active_7d == 7, "yes", "no"))
tv3plus_ind <- str_detect(yrbss$hours_tv_per_school_day, "^[3-9]") |
               str_detect(yrbss$hours_tv_per_school_day, "or more")
yrbss <- yrbss %>% mutate(tv3plus_ind = ifelse(tv3plus_ind, "yes", "no"))
yrbss_active <- yrbss %>%
  mutate(active7_ind = factor(active7_ind, levels = c("no","yes"))) %>%
  filter(!is.na(active7_ind)) %>%
  transmute(active7_num = as.integer(active7_ind == "yes"))

ci_active7 <- yrbss_active %>%
  specify(response = active7_num) %>%
  generate(reps = 400, type = "bootstrap") %>%
  calculate(stat = "mean") %>%
  get_ci(level = 0.95)

phat_active7 <- mean(yrbss_active$active7_num)
n_active7 <- nrow(yrbss_active)
ME_active7 <- 1.96 * sqrt(phat_active7 * (1 - phat_active7) / n_active7)

list(ci = ci_active7, phat = phat_active7, n = n_active7, ME_95 = ME_active7)
## $ci
## # A tibble: 1 × 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    0.265    0.280
## 
## $phat
## [1] 0.2721262
## 
## $n
## [1] 13310
## 
## $ME_95
## [1] 0.007561018
rm(yrbss_active); invisible(gc())
yrbss_tv <- yrbss %>%
  mutate(tv3plus_ind = factor(tv3plus_ind, levels = c("no","yes"))) %>%
  filter(!is.na(tv3plus_ind)) %>%
  transmute(tv3_num = as.integer(tv3plus_ind == "yes"))

ci_tv3 <- yrbss_tv %>%
  specify(response = tv3_num) %>%
  generate(reps = 400, type = "bootstrap") %>%
  calculate(stat = "mean") %>%
  get_ci(level = 0.95)

phat_tv3 <- mean(yrbss_tv$tv3_num)
n_tv3 <- nrow(yrbss_tv)
ME_tv3 <- 1.96 * sqrt(phat_tv3 * (1 - phat_tv3) / n_tv3)

list(ci = ci_tv3, phat = phat_tv3, n = n_tv3, ME_95 = ME_tv3)
## $ci
## # A tibble: 1 × 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    0.353    0.369
## 
## $phat
## [1] 0.3610419
## 
## $n
## [1] 13245
## 
## $ME_95
## [1] 0.008179845
rm(yrbss_tv); invisible(gc())

Exercise 5 — ME vs p (n = 1000)

n <- 1000
p <- seq(0, 1, by = 0.01)
me <- 2 * sqrt(p * (1 - p) / n)
ggplot(data.frame(p, me), aes(p, me)) +
  geom_line() +
  labs(x = "Population Proportion (p)", y = "Approx. 95% ME")

Exercises 6–8 — Conceptual

Exercise 9 — Sleep 10+ vs daily strength

# Exercise 9 — Sleep 10+ vs daily strength: one-sided test ("greater") + 95% CI

# Build comparison data (binary factors, no NAs)
yrbss_cmp <- yrbss %>%
  dplyr::mutate(
    sleep10plus    = ifelse(stringr::str_detect(school_night_hours_sleep, "^10"), "yes", "no"),
    strength_daily = ifelse(strength_training_7d == 7, "yes", "no"),
    sleep10plus    = factor(sleep10plus,    levels = c("no","yes")),
    strength_daily = factor(strength_daily, levels = c("no","yes"))
  ) %>%
  dplyr::filter(!is.na(sleep10plus), !is.na(strength_daily))

# Quick 2x2 table
tab_ex9 <- yrbss_cmp %>% dplyr::count(sleep10plus, strength_daily)
tab_ex9
## # A tibble: 4 × 3
##   sleep10plus strength_daily     n
##   <fct>       <fct>          <int>
## 1 no          no              9949
## 2 no          yes             1958
## 3 yes         no               228
## 4 yes         yes               84
# Observed difference in proportions: p_yes(sleep10plus) - p_yes(not)
obs_diff <- yrbss_cmp %>%
  infer::specify(response = strength_daily, explanatory = sleep10plus, success = "yes") %>%
  infer::calculate(stat = "diff in props", order = c("yes","no"))

set.seed(606)
null_dist <- yrbss_cmp %>%
  infer::specify(response = strength_daily, explanatory = sleep10plus, success = "yes") %>%
  infer::hypothesize(null = "independence") %>%
  infer::generate(reps = 800, type = "permute") %>%   # light on memory
  infer::calculate(stat = "diff in props", order = c("yes","no"))

# P-value (use the API that your infer version supports)
# Preferred:
p_val <- infer::get_p_value(null_dist, obs_stat = obs_diff, direction = "greater")
# Fallback (works on all versions):
p_val_manual <- mean(null_dist$stat >= obs_diff$stat)

list(
  observed_diff = obs_diff,
  p_value_one_sided_greater = p_val,
  p_value_manual = p_val_manual
)
## $observed_diff
## Response: strength_daily (factor)
## Explanatory: sleep10plus (factor)
## # A tibble: 1 × 1
##    stat
##   <dbl>
## 1 0.105
## 
## $p_value_one_sided_greater
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0
## 
## $p_value_manual
## [1] 0
boot_ci <- yrbss_cmp %>%
  specify(response = strength_daily, explanatory = sleep10plus, success = "yes") %>%
  generate(reps = 400, type = "bootstrap") %>%
  calculate(stat = "diff in props", order = c("yes","no")) %>%
  get_ci(level = 0.95)
boot_ci
## # A tibble: 1 × 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1   0.0557    0.157
rm(yrbss_cmp); invisible(gc())

Exercise 10 — Type I error

"At alpha = 0.05, the Type I error rate is 0.05."
## [1] "At alpha = 0.05, the Type I error rate is 0.05."

Exercise 11 — n for ME ≤ 1% (95%)

n_required <- (1.96^2 * 0.25) / (0.01^2)
ceiling(n_required)
## [1] 9604

Session Info (hidden)