Getting Started

We’ll create a convenience data frame of students who never wore a helmet in last 12 months.

no_helmet <- yrbss %>% filter(helmet_12m == "never")
nrow(no_helmet)

## [1] 6977

# memory trim
invisible(gc())

Exercise 1

Counts for days texted while driving (past 30 days).

yrbss %>%
  count(text_while_driving_30d, name = "count") %>%
  arrange(as.numeric(as.character(text_while_driving_30d)))

## # A tibble: 9 × 2
##   text_while_driving_30d count
##   <chr>                  <int>
## 1 0                       4792
## 2 30                       827
## 3 1-2                      925
## 4 10-19                    373
## 5 20-29                    298
## 6 3-5                      493
## 7 6-9                      311
## 8 did not drive           4646
## 9 <NA>                     918

Interpretation. The table shows frequencies of self-reported texting days out of the last 30.

Exercise 2

Proportion who text every day (30/30) among students who never wear helmets.

no_helmet <- no_helmet %>%
  mutate(text_ind = ifelse(text_while_driving_30d == "30", "yes", "no"))

prop_everyday_nohelmet <- no_helmet %>%
  summarise(p_hat = mean(text_ind == "yes", na.rm = TRUE),
            n = sum(!is.na(text_ind)))
prop_everyday_nohelmet

## # A tibble: 1 × 2
##    p_hat     n
##    <dbl> <int>
## 1 0.0712  6503

95% bootstrap CI (numeric indicator; 1,000 reps to save memory):

no_helmet_bin <- no_helmet %>%
  mutate(text_ind = factor(text_ind, levels = c("no","yes"))) %>%
  filter(!is.na(text_ind)) %>%
  mutate(text_ind_num = as.integer(text_ind == "yes"))

ci_ex2 <- no_helmet_bin %>%
  specify(response = text_ind_num) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean") %>%
  get_ci(level = 0.95)

ci_ex2

## # A tibble: 1 × 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1   0.0650   0.0775

Exercise 3

Margin of error for the estimate in Ex 2 (normal approx, 95%):
\(\text{ME} = 1.96 \sqrt{\hat p(1-\hat p)/n}\)

z <- 1.96
p_hat <- prop_everyday_nohelmet$p_hat
n_hat <- prop_everyday_nohelmet$n
ME_ex3 <- z * sqrt(p_hat * (1 - p_hat) / n_hat)
data.frame(p_hat = p_hat, n = n_hat, ME_approx_95 = ME_ex3)

##        p_hat    n ME_approx_95
## 1 0.07119791 6503  0.006250207

Bootstrap half-width check:

ME_boot <- (ci_ex2$upper_ci - ci_ex2$lower_ci) / 2
data.frame(ME_bootstrap_halfwidth = ME_boot)

##   ME_bootstrap_halfwidth
## 1            0.006229817

Exercise 4

Two other categorical variables + CIs and MEs.

We use: 1) Physically active all 7 days (physically_active_7d == 7)
2) TV 3+ hours on school nights (from hours_tv_per_school_day)

# (1) Active all 7 days
yrbss <- yrbss %>%
  mutate(active7_ind = ifelse(physically_active_7d == 7, "yes", "no"))

# (2) TV 3+ hours (robust string match)
tv3plus_ind <- str_detect(yrbss$hours_tv_per_school_day, "^[3-9]") |
               str_detect(yrbss$hours_tv_per_school_day, "or more")
yrbss <- yrbss %>%
  mutate(tv3plus_ind = ifelse(tv3plus_ind, "yes", "no"))

(a) Active all 7 days

yrbss_active <- yrbss %>%
  mutate(active7_ind = factor(active7_ind, levels = c("no","yes"))) %>%
  filter(!is.na(active7_ind)) %>%
  mutate(active7_num = as.integer(active7_ind == "yes"))

ci_active7 <- yrbss_active %>%
  specify(response = active7_num) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean") %>%
  get_ci(level = 0.95)

phat_active7 <- mean(yrbss_active$active7_num)
n_active7 <- nrow(yrbss_active)
ME_active7 <- 1.96 * sqrt(phat_active7 * (1 - phat_active7) / n_active7)

list(ci = ci_active7, phat = phat_active7, n = n_active7, ME_95 = ME_active7)

## $ci
## # A tibble: 1 × 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    0.264    0.279
## 
## $phat
## [1] 0.2721262
## 
## $n
## [1] 13310
## 
## $ME_95
## [1] 0.007561018

(b) TV 3+ hours on a school night

yrbss_tv <- yrbss %>%
  mutate(tv3plus_ind = factor(tv3plus_ind, levels = c("no","yes"))) %>%
  filter(!is.na(tv3plus_ind)) %>%
  mutate(tv3_num = as.integer(tv3plus_ind == "yes"))

ci_tv3 <- yrbss_tv %>%
  specify(response = tv3_num) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean") %>%
  get_ci(level = 0.95)

phat_tv3 <- mean(yrbss_tv$tv3_num)
n_tv3 <- nrow(yrbss_tv)
ME_tv3 <- 1.96 * sqrt(phat_tv3 * (1 - phat_tv3) / n_tv3)

list(ci = ci_tv3, phat = phat_tv3, n = n_tv3, ME_95 = ME_tv3)

## $ci
## # A tibble: 1 × 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1    0.353    0.369
## 
## $phat
## [1] 0.3610419
## 
## $n
## [1] 13245
## 
## $ME_95
## [1] 0.008179845

Exercise 5

ME vs \(p\) (n = 1000). Max at \(p=0.5\).

n <- 1000
p <- seq(0, 1, by = 0.01)
me <- 2 * sqrt(p * (1 - p) / n)
data.frame(p, me) %>%
  ggplot(aes(p, me)) + geom_line() +
  labs(x = "Population Proportion (p)", y = "Approx. 95% ME")

Exercises 6–8 (Conceptual)

6. For fixed \(n\), the sampling distribution of \(\hat p\) is centered at \(p\) with SD \(\sqrt{p(1-p)/n}\); it’s widest near \(p=0.5\).
7. Increasing \(n\) shrinks the SD; the distribution concentrates around \(p\).
8. For fixed \(n\), variance is largest near \(p=0.5\) and smaller toward 0 or 1.

sessionInfo()

## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3;  LAPACK version 3.9.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] usdata_0.3.1        cherryblossom_0.1.0 airports_0.1.0     
## [4] infer_1.0.9         stringr_1.5.2       ggplot2_4.0.0      
## [7] dplyr_1.1.4        
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     compiler_4.5.1     tidyselect_1.2.1  
##  [5] jquerylib_0.1.4    scales_1.4.0       yaml_2.3.10        fastmap_1.2.0     
##  [9] readr_2.1.5        R6_2.6.1           labeling_0.4.3     generics_0.1.4    
## [13] knitr_1.50         tibble_3.3.0       tzdb_0.5.0         bslib_0.9.0       
## [17] pillar_1.11.1      RColorBrewer_1.1-3 rlang_1.1.6        utf8_1.2.6        
## [21] cachem_1.1.0       stringi_1.8.7      xfun_0.53          sass_0.4.10       
## [25] S7_0.2.0           cli_3.6.5          withr_3.0.2        magrittr_2.0.4    
## [29] digest_0.6.37      grid_4.5.1         hms_1.1.3          lifecycle_1.0.4   
## [33] vctrs_0.6.5        evaluate_1.0.5     glue_1.8.0         farver_2.1.2      
## [37] rmarkdown_2.30     purrr_1.1.0        tools_4.5.1        pkgconfig_2.0.3   
## [41] htmltools_0.5.8.1

Lab 6: Inference for Categorical Data

Sachi Kapoor

October 19, 2025