1 Load packages

# install.packages(c("tidyverse","infer","openintro")) # <-- run once if needed
library(tidyverse)
library(infer)
suppressPackageStartupMessages(library(openintro))

2 Load data (robust, low-memory, with column-name fixes)

# Load 'nc' from openintro or tiny RData fallback
if (!exists("nc")) {
  if ("package:openintro" %in% search() && "nc" %in% data(package = "openintro")$results[, "Item"]) {
    data(nc, package = "openintro")
  } else {
    tmp <- "nc.RData"
    if (!file.exists(tmp)) {
      download.file("https://www.openintro.org/stat/data/nc.RData", destfile = tmp, mode = "wb")
    }
    load(tmp)  # loads nc
  }
}

# If dataset has 'mage' but not 'age', create age
if (!"age" %in% names(nc) && "mage" %in% names(nc)) {
  nc <- nc %>% mutate(age = mage)
}

# Keep only columns we actually need (use any_of so it's safe)
nc <- nc %>%
  select(any_of(c("weight","habit","age","weeks","parity","gained","premie"))) %>%
  drop_na(weight, habit)

glimpse(nc)
## Rows: 999
## Columns: 6
## $ weight <dbl> 7.63, 7.88, 6.63, 8.00, 6.38, 5.38, 8.44, 4.69, 8.81, 6.94, 7.4…
## $ habit  <fct> nonsmoker, nonsmoker, nonsmoker, nonsmoker, nonsmoker, nonsmoke…
## $ age    <int> 13, 14, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16,…
## $ weeks  <int> 39, 42, 37, 41, 39, 38, 37, 35, 38, 37, 45, 42, 40, 38, 38, 40,…
## $ gained <int> 38, 20, 38, 34, 27, 22, 76, 15, NA, 52, 28, 34, 12, 30, 75, 35,…
## $ premie <fct> full term, full term, full term, full term, full term, full ter…

3 Q1. Identify variables and types

Answer:
- Response: weight (numeric).
- Explanatory: habit (smoker/nonsmoker, categorical).

4 Q2. Research question

Do babies of mothers who smoke have a different average birth weight than those of non-smokers?

5 Q3. EDA

5.1 3a) Group summaries

group_summ <- nc %>%
  group_by(habit) %>%
  summarise(n = n(),
            mean_weight = mean(weight),
            sd_weight = sd(weight),
            .groups = "drop")
group_summ

Auto-filled: Smokers 126, mean 6.83; Non-smokers 873, mean 7.14.

5.2 3b) Plots

ggplot(nc, aes(x = weight)) + geom_histogram(bins = 30) +
  labs(title="Distribution of Birth Weight", x="Birth weight (lbs)", y="Count")

ggplot(nc, aes(x = habit, y = weight)) + geom_boxplot() +
  labs(title="Birth Weight by Smoking Status", x="Smoking habit", y="Birth weight (lbs)")

6 Q4. Hypotheses

  • H0: \\(\mu_S - \mu_N = 0 \\\)
  • HA: \\(\mu_S - \mu_N \neq 0 \\\)

7 Q5. Observed statistic

obs_diff <- nc %>%
  specify(response = weight, explanatory = habit) %>%
  calculate(stat = "diff in means", order = c("smoker","nonsmoker"))
obs_diff

Observed difference (smoker − nonsmoker): -0.316 lbs.

8 Q6. Conditions

  • Independence within/between groups reasonable.
  • Group sizes: nonsmoker 873, smoker 126 (both > 30) ⇒ CLT applies.

9 Q7. Permutation test

null_dist <- nc %>%
  specify(response = weight, explanatory = habit) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 5000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("smoker","nonsmoker"))

p_val <- null_dist %>% get_p_value(obs_stat = obs_diff, direction = "two_sided")
p_val
visualize(null_dist) +
  shade_p_value(obs_stat = obs_diff, direction = "two_sided") +
  labs(title="Null Distribution (Permutation Test)")

Two-sided p-value = 0.0336.

10 Q8. 95% CIs

boot_dist <- nc %>%
  specify(response = weight, explanatory = habit) %>%
  generate(reps = 3000, type = "bootstrap") %>%
  calculate(stat = "diff in means", order = c("smoker","nonsmoker"))
boot_ci <- boot_dist %>% get_confidence_interval(level = 0.95, type = "percentile")
boot_ci
tt <- t.test(weight ~ habit, data = nc, var.equal = FALSE)
tt$conf.int
## [1] 0.05151165 0.57957328
## attr(,"conf.level")
## [1] 0.95

Bootstrap 95% CI: [-0.586, -0.0625];
Welch t 95% CI: [0.0515, 0.58].

11 Q9. Interpretation

  • p-value 0.0336 ⇒ probability of seeing a difference ≥ -0.316 if H0 true.
  • 95% CI [-0.586, -0.0625] does not include 0 ⇒ evidence of a true difference. -Because the estimated difference (smoker − non-smoker) is negative, babies of smokers weigh less on average than babies of non-smokers.

12 Cleanup

rm(list = setdiff(ls(), c("obs_diff","p_val","boot_ci","tt","group_summ")))
gc()
##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1601901 85.6    2547645 136.1  2547645 136.1
## Vcells 2835043 21.7   23664832 180.6 29580656 225.7
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3;  LAPACK version 3.9.0
## 
## Random number generation:
##  RNG:     Mersenne-Twister 
##  Normal:  Inversion 
##  Sample:  Rounding 
##  
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] openintro_2.5.0     usdata_0.3.1        cherryblossom_0.1.0
##  [4] airports_0.1.0      infer_1.0.9         lubridate_1.9.4    
##  [7] forcats_1.0.1       stringr_1.5.2       dplyr_1.1.4        
## [10] purrr_1.1.0         readr_2.1.5         tidyr_1.3.1        
## [13] tibble_3.3.0        ggplot2_4.0.0       tidyverse_2.0.0    
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     compiler_4.5.1     tidyselect_1.2.1  
##  [5] jquerylib_0.1.4    scales_1.4.0       yaml_2.3.10        fastmap_1.2.0     
##  [9] R6_2.6.1           labeling_0.4.3     generics_0.1.4     knitr_1.50        
## [13] bslib_0.9.0        pillar_1.11.1      RColorBrewer_1.1-3 tzdb_0.5.0        
## [17] rlang_1.1.6        stringi_1.8.7      cachem_1.1.0       xfun_0.53         
## [21] sass_0.4.10        S7_0.2.0           timechange_0.3.0   cli_3.6.5         
## [25] withr_3.0.2        magrittr_2.0.4     digest_0.6.37      grid_4.5.1        
## [29] rstudioapi_0.17.1  hms_1.1.4          lifecycle_1.0.4    vctrs_0.6.5       
## [33] evaluate_1.0.5     glue_1.8.0         farver_2.1.2       rmarkdown_2.30    
## [37] tools_4.5.1        pkgconfig_2.0.3    htmltools_0.5.8.1