We’ll create a convenience data frame of students who never wore a helmet in last 12 months.
no_helmet <- yrbss %>% filter(helmet_12m == "never")
nrow(no_helmet)
## [1] 6977
# memory trim
invisible(gc())
Counts for days texted while driving (past 30 days).
yrbss %>%
count(text_while_driving_30d, name = "count") %>%
arrange(as.numeric(as.character(text_while_driving_30d)))
## # A tibble: 9 × 2
## text_while_driving_30d count
## <chr> <int>
## 1 0 4792
## 2 30 827
## 3 1-2 925
## 4 10-19 373
## 5 20-29 298
## 6 3-5 493
## 7 6-9 311
## 8 did not drive 4646
## 9 <NA> 918
Interpretation. The table shows frequencies of self-reported texting days out of the last 30.
Proportion who text every day (30/30) among students who never wear helmets.
no_helmet <- no_helmet %>%
mutate(text_ind = ifelse(text_while_driving_30d == "30", "yes", "no"))
prop_everyday_nohelmet <- no_helmet %>%
summarise(p_hat = mean(text_ind == "yes", na.rm = TRUE),
n = sum(!is.na(text_ind)))
prop_everyday_nohelmet
## # A tibble: 1 × 2
## p_hat n
## <dbl> <int>
## 1 0.0712 6503
95% bootstrap CI (numeric indicator; 1,000 reps to save memory):
no_helmet_bin <- no_helmet %>%
mutate(text_ind = factor(text_ind, levels = c("no","yes"))) %>%
filter(!is.na(text_ind)) %>%
mutate(text_ind_num = as.integer(text_ind == "yes"))
ci_ex2 <- no_helmet_bin %>%
specify(response = text_ind_num) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "mean") %>%
get_ci(level = 0.95)
ci_ex2
## # A tibble: 1 × 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.0650 0.0775
Margin of error for the estimate in Ex 2 (normal
approx, 95%):
\(\text{ME} = 1.96 \sqrt{\hat p(1-\hat
p)/n}\)
z <- 1.96
p_hat <- prop_everyday_nohelmet$p_hat
n_hat <- prop_everyday_nohelmet$n
ME_ex3 <- z * sqrt(p_hat * (1 - p_hat) / n_hat)
data.frame(p_hat = p_hat, n = n_hat, ME_approx_95 = ME_ex3)
## p_hat n ME_approx_95
## 1 0.07119791 6503 0.006250207
Bootstrap half-width check:
ME_boot <- (ci_ex2$upper_ci - ci_ex2$lower_ci) / 2
data.frame(ME_bootstrap_halfwidth = ME_boot)
## ME_bootstrap_halfwidth
## 1 0.006229817
Two other categorical variables + CIs and MEs.
We use: 1) Physically active all 7 days
(physically_active_7d == 7)
2) TV 3+ hours on school nights (from
hours_tv_per_school_day)
# (1) Active all 7 days
yrbss <- yrbss %>%
mutate(active7_ind = ifelse(physically_active_7d == 7, "yes", "no"))
# (2) TV 3+ hours (robust string match)
tv3plus_ind <- str_detect(yrbss$hours_tv_per_school_day, "^[3-9]") |
str_detect(yrbss$hours_tv_per_school_day, "or more")
yrbss <- yrbss %>%
mutate(tv3plus_ind = ifelse(tv3plus_ind, "yes", "no"))
(a) Active all 7 days
yrbss_active <- yrbss %>%
mutate(active7_ind = factor(active7_ind, levels = c("no","yes"))) %>%
filter(!is.na(active7_ind)) %>%
mutate(active7_num = as.integer(active7_ind == "yes"))
ci_active7 <- yrbss_active %>%
specify(response = active7_num) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "mean") %>%
get_ci(level = 0.95)
phat_active7 <- mean(yrbss_active$active7_num)
n_active7 <- nrow(yrbss_active)
ME_active7 <- 1.96 * sqrt(phat_active7 * (1 - phat_active7) / n_active7)
list(ci = ci_active7, phat = phat_active7, n = n_active7, ME_95 = ME_active7)
## $ci
## # A tibble: 1 × 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.264 0.279
##
## $phat
## [1] 0.2721262
##
## $n
## [1] 13310
##
## $ME_95
## [1] 0.007561018
(b) TV 3+ hours on a school night
yrbss_tv <- yrbss %>%
mutate(tv3plus_ind = factor(tv3plus_ind, levels = c("no","yes"))) %>%
filter(!is.na(tv3plus_ind)) %>%
mutate(tv3_num = as.integer(tv3plus_ind == "yes"))
ci_tv3 <- yrbss_tv %>%
specify(response = tv3_num) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "mean") %>%
get_ci(level = 0.95)
phat_tv3 <- mean(yrbss_tv$tv3_num)
n_tv3 <- nrow(yrbss_tv)
ME_tv3 <- 1.96 * sqrt(phat_tv3 * (1 - phat_tv3) / n_tv3)
list(ci = ci_tv3, phat = phat_tv3, n = n_tv3, ME_95 = ME_tv3)
## $ci
## # A tibble: 1 × 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.353 0.369
##
## $phat
## [1] 0.3610419
##
## $n
## [1] 13245
##
## $ME_95
## [1] 0.008179845
ME vs \(p\) (n = 1000). Max at \(p=0.5\).
n <- 1000
p <- seq(0, 1, by = 0.01)
me <- 2 * sqrt(p * (1 - p) / n)
data.frame(p, me) %>%
ggplot(aes(p, me)) + geom_line() +
labs(x = "Population Proportion (p)", y = "Approx. 95% ME")
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 20.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3; LAPACK version 3.9.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] usdata_0.3.1 cherryblossom_0.1.0 airports_0.1.0
## [4] infer_1.0.9 stringr_1.5.2 ggplot2_4.0.0
## [7] dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_2.0.0 compiler_4.5.1 tidyselect_1.2.1
## [5] jquerylib_0.1.4 scales_1.4.0 yaml_2.3.10 fastmap_1.2.0
## [9] readr_2.1.5 R6_2.6.1 labeling_0.4.3 generics_0.1.4
## [13] knitr_1.50 tibble_3.3.0 tzdb_0.5.0 bslib_0.9.0
## [17] pillar_1.11.1 RColorBrewer_1.1-3 rlang_1.1.6 utf8_1.2.6
## [21] cachem_1.1.0 stringi_1.8.7 xfun_0.53 sass_0.4.10
## [25] S7_0.2.0 cli_3.6.5 withr_3.0.2 magrittr_2.0.4
## [29] digest_0.6.37 grid_4.5.1 hms_1.1.3 lifecycle_1.0.4
## [33] vctrs_0.6.5 evaluate_1.0.5 glue_1.8.0 farver_2.1.2
## [37] rmarkdown_2.30 purrr_1.1.0 tools_4.5.1 pkgconfig_2.0.3
## [41] htmltools_0.5.8.1