# install.packages(c("tidyverse","infer","openintro")) # <-- run once if needed
library(tidyverse)
library(infer)
suppressPackageStartupMessages(library(openintro))
# Load 'nc' from openintro or tiny RData fallback
if (!exists("nc")) {
if ("package:openintro" %in% search() && "nc" %in% data(package = "openintro")$results[, "Item"]) {
data(nc, package = "openintro")
} else {
tmp <- "nc.RData"
if (!file.exists(tmp)) {
download.file("https://www.openintro.org/stat/data/nc.RData", destfile = tmp, mode = "wb")
}
load(tmp) # loads nc
}
}
# If dataset has 'mage' but not 'age', create age
if (!"age" %in% names(nc) && "mage" %in% names(nc)) {
nc <- nc %>% mutate(age = mage)
}
# Keep only columns we actually need (use any_of so it's safe)
nc <- nc %>%
select(any_of(c("weight","habit","age","weeks","parity","gained","premie"))) %>%
drop_na(weight, habit)
glimpse(nc)
## Rows: 999
## Columns: 6
## $ weight <dbl> 7.63, 7.88, 6.63, 8.00, 6.38, 5.38, 8.44, 4.69, 8.81, 6.94, 7.4…
## $ habit <fct> nonsmoker, nonsmoker, nonsmoker, nonsmoker, nonsmoker, nonsmoke…
## $ age <int> 13, 14, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16,…
## $ weeks <int> 39, 42, 37, 41, 39, 38, 37, 35, 38, 37, 45, 42, 40, 38, 38, 40,…
## $ gained <int> 38, 20, 38, 34, 27, 22, 76, 15, NA, 52, 28, 34, 12, 30, 75, 35,…
## $ premie <fct> full term, full term, full term, full term, full term, full ter…
Answer:
- Response: weight (numeric).
- Explanatory: habit
(smoker/nonsmoker, categorical).
Do babies of mothers who smoke have a different average birth weight than those of non-smokers?
group_summ <- nc %>%
group_by(habit) %>%
summarise(n = n(),
mean_weight = mean(weight),
sd_weight = sd(weight),
.groups = "drop")
group_summ
Auto-filled: Smokers 126, mean 6.83; Non-smokers 873, mean 7.14.
ggplot(nc, aes(x = weight)) + geom_histogram(bins = 30) +
labs(title="Distribution of Birth Weight", x="Birth weight (lbs)", y="Count")
ggplot(nc, aes(x = habit, y = weight)) + geom_boxplot() +
labs(title="Birth Weight by Smoking Status", x="Smoking habit", y="Birth weight (lbs)")
obs_diff <- nc %>%
specify(response = weight, explanatory = habit) %>%
calculate(stat = "diff in means", order = c("smoker","nonsmoker"))
obs_diff
Observed difference (smoker − nonsmoker): -0.316 lbs.
null_dist <- nc %>%
specify(response = weight, explanatory = habit) %>%
hypothesize(null = "independence") %>%
generate(reps = 5000, type = "permute") %>%
calculate(stat = "diff in means", order = c("smoker","nonsmoker"))
p_val <- null_dist %>% get_p_value(obs_stat = obs_diff, direction = "two_sided")
p_val
visualize(null_dist) +
shade_p_value(obs_stat = obs_diff, direction = "two_sided") +
labs(title="Null Distribution (Permutation Test)")
Two-sided p-value = 0.0336.
boot_dist <- nc %>%
specify(response = weight, explanatory = habit) %>%
generate(reps = 3000, type = "bootstrap") %>%
calculate(stat = "diff in means", order = c("smoker","nonsmoker"))
boot_ci <- boot_dist %>% get_confidence_interval(level = 0.95, type = "percentile")
boot_ci
tt <- t.test(weight ~ habit, data = nc, var.equal = FALSE)
tt$conf.int
## [1] 0.05151165 0.57957328
## attr(,"conf.level")
## [1] 0.95
Bootstrap 95% CI: [-0.586, -0.0625];
Welch t 95% CI: [0.0515, 0.58].
rm(list = setdiff(ls(), c("obs_diff","p_val","boot_ci","tt","group_summ")))
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1601901 85.6 2547645 136.1 2547645 136.1
## Vcells 2835043 21.7 23664832 180.6 29580656 225.7
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 20.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3; LAPACK version 3.9.0
##
## Random number generation:
## RNG: Mersenne-Twister
## Normal: Inversion
## Sample: Rounding
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] openintro_2.5.0 usdata_0.3.1 cherryblossom_0.1.0
## [4] airports_0.1.0 infer_1.0.9 lubridate_1.9.4
## [7] forcats_1.0.1 stringr_1.5.2 dplyr_1.1.4
## [10] purrr_1.1.0 readr_2.1.5 tidyr_1.3.1
## [13] tibble_3.3.0 ggplot2_4.0.0 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_2.0.0 compiler_4.5.1 tidyselect_1.2.1
## [5] jquerylib_0.1.4 scales_1.4.0 yaml_2.3.10 fastmap_1.2.0
## [9] R6_2.6.1 labeling_0.4.3 generics_0.1.4 knitr_1.50
## [13] bslib_0.9.0 pillar_1.11.1 RColorBrewer_1.1-3 tzdb_0.5.0
## [17] rlang_1.1.6 stringi_1.8.7 cachem_1.1.0 xfun_0.53
## [21] sass_0.4.10 S7_0.2.0 timechange_0.3.0 cli_3.6.5
## [25] withr_3.0.2 magrittr_2.0.4 digest_0.6.37 grid_4.5.1
## [29] rstudioapi_0.17.1 hms_1.1.4 lifecycle_1.0.4 vctrs_0.6.5
## [33] evaluate_1.0.5 glue_1.8.0 farver_2.1.2 rmarkdown_2.30
## [37] tools_4.5.1 pkgconfig_2.0.3 htmltools_0.5.8.1