Setup

library(infer)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

load("cdc.Rdata")
cdc_small = cdc %>% 
  sample_n(500)

Chi-Square Test for Independence (Categorical Data)

Suppose you want to test if the number of gears (gear) is independent of the transmission type (am) in mtcars:

# Convert variables to factors if not already
mtcars <- mtcars %>%
  mutate(
    am = factor(am, labels = c("Automatic", "Manual")),
    gear = factor(gear)
  )

# Calculate the observed chi-square statistic
obs_stat <- mtcars %>%
  specify(am ~ gear) %>%
  calculate(stat = "Chisq")

# Generate the null distribution
null_dist <- mtcars %>%
  specify(am ~ gear) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 10000, type = "permute") %>%
  calculate(stat = "Chisq")

# Get the p-value (using 'greater' since the chi-square is one-sided)
p_value <- null_dist %>%
  get_p_value(obs_stat = obs_stat, direction = "greater")

## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.

print(p_value)

## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

The p-value is clearly below .05. Therefore we reject the null hypothesis.

Exercise

Using the code above, test the hypothesis that gender and genhlth in the dataframe cdc_small are independent.

Be sure to state your conclusion using proper language.

Solution

# Calculate the observed chi-square statistic
obs_stat <- cdc_small %>%
  specify(gender ~ genhlth) %>%
  calculate(stat = "Chisq")

# Generate the null distribution
null_dist <- cdc_small %>%
  specify(gender ~ genhlth) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "Chisq")

# Get the p-value (using 'greater' since the chi-square is one-sided)
p_value <- null_dist %>%
  get_p_value(obs_stat = obs_stat, direction = "greater")

print(p_value)

## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1   0.011

We do not have enough evidence to reject the null hypothesis. The p-value is greater than .05.

Correlation

Test the hypothesis that there is no correlation between mpg and hp in the mtcars dataframe.

# Calculate the observed correlation
obs_cor <- mtcars %>%
  specify(response = mpg, explanatory = hp) %>%
  calculate(stat = "correlation")

# Generate the null distribution by permuting the explanatory variable
null_dist <- mtcars %>%
  specify(response = mpg, explanatory = hp) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "correlation")

# Calculate the p-value
p_value <- null_dist %>%
  get_p_value(obs_stat = obs_cor, direction = "two-sided")

## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.

print(p_value)

## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

Since the p-value is below .05, we reject the null hypothesis.

Exercise

Test the hypothesis that there is no correlation betweeh height and wtdesire in the cdc_small dataframe.

# Calculate the observed correlation
obs_cor <- cdc_small %>%
  specify(response = wtdesire, explanatory = height) %>%
  calculate(stat = "correlation")

# Generate the null distribution by permuting the explanatory variable
null_dist <- cdc_small %>%
  specify(response = wtdesire, explanatory = height) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "correlation")

# Calculate the p-value
p_value <- null_dist %>%
  get_p_value(obs_stat = obs_cor, direction = "two-sided")

## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.

print(p_value)

## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

Since the p-value is below .05, we reject the null hypothesis.

Permutation Test for Difference in Means Between Two Groups

Use mtcars to see if there is a significant diiference between mpg values for different transmission types.

# Prepare data: convert 'am' to a factor with meaningful labels
mtcars <- mtcars %>%
  mutate(am = factor(am, labels = c("Automatic", "Manual")))

# Generate the null distribution by permuting group labels
mpg_perm <- mtcars %>%
  specify(response = mpg, explanatory = am) %>%      # Specify response and group
  hypothesize(null = "independence") %>%              # Null: no association between mpg and am
  generate(reps = 1000, type = "permute") %>%         # Permute group labels
  calculate(stat = "diff in means", order = c("Manual", "Automatic"))  # Difference in means

# Calculate the observed difference in means
obs_diff <- mtcars %>%
  specify(response = mpg, explanatory = am) %>%
  calculate(stat = "diff in means", order = c("Manual", "Automatic"))

# Compute the p-value from the null distribution
p_value <- mpg_perm %>%
  get_p_value(obs_stat = obs_diff, direction = "two-sided")

print(p_value)

## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1   0.002

Since the p-value is below .05, we reject the null hypothesis.

Exercise

Using the code above, test the hypothesis that that the mean value of weight is the same for men and women.

Solution

# Generate the null distribution by permuting group labels
weight_perm <- cdc_small %>%
  specify(response = weight, explanatory = gender) %>%      # Specify response and group
  hypothesize(null = "independence") %>%              # Null: no association between mpg and am
  generate(reps = 1000, type = "permute") %>%         # Permute group labels
  calculate(stat = "diff in means", order = c("m", "f"))  # Difference in means

# Calculate the observed difference in means
obs_diff <- cdc_small %>%
  specify(response = weight, explanatory = gender) %>%
  calculate(stat = "diff in means", order = c("m", "f"))

# Compute the p-value from the null distribution
p_value <- weight_perm %>%
  get_p_value(obs_stat = obs_diff, direction = "two-sided")

## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.

print(p_value)

## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

Bootstrapping a One-Sample Statistic (Confidence Interval for the Mean)

Imagine you want a 95% confidence interval for the mean of miles per gallon (mpg) from the built-in mtcars dataset. You can use bootstrapping as follows:

# Bootstrap resampling for the mean of mpg
mpg_bootstrap <- mtcars %>%
  specify(response = mpg) %>%           # Specify the variable of interest
  generate(reps = 1000, type = "bootstrap") %>%  # Create bootstrap samples
  calculate(stat = "mean")              # Calculate the mean for each resample

# Derive a 95% confidence interval using the percentile method
ci <- mpg_bootstrap %>%
  get_confidence_interval(level = 0.95, type = "percentile")

print(ci)

## # A tibble: 1 × 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1     18.1     22.3

Exercise

Use the code above to create a 95% confidence interval for the mean value of weight in the cdc_small dataframe.

Solution

# Bootstrap resampling for the mean of weight

weight_bootstrap <- cdc_small %>%
  specify(response = weight) %>%           # Specify the variable of interest
  generate(reps = 1000, type = "bootstrap") %>%  # Create bootstrap samples
  calculate(stat = "mean")              # Calculate the mean for each resample

# Derive a 95% confidence interval using the percentile method
ci <- weight_bootstrap %>%
  get_confidence_interval(level = 0.95, type = "percentile")

print(ci)

## # A tibble: 1 × 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1     168.     175.

Look at the estimates

lower = quantile(weight_bootstrap$stat,.025)
upper = quantile(weight_bootstrap$stat,.975)
me = (upper - lower)/2
names(me) = "ME"
print(me)

##      ME 
## 3.43275

Question

Using the central limit theorem to estimate confidence intervals we observed the inverse square root law. Increasing the sample size by a factor of n would reduce the margin of error by a factor of \(\sqrt{n}\). Does that hold true for the bootstrap method?

To answer this question, use the diamonds dataframe from ggplot2 and samples of 50 and 5,000 for the mean of carat.

Sample of 50

set.seed(123)
d_50 = diamonds %>% 
  sample_n(50)

d_50_bs = d_50 %>% 
  specify(response = carat) %>% 
  # Specify the variable of interest
  generate(reps = 1000, type = "bootstrap") %>%  # Create bootstrap samples
  calculate(stat = "mean")              # Calculate the mean for each resample

upper = quantile(d_50_bs$stat,.975)
lower = quantile(d_50_bs$stat, .025)
me_50 = (upper - lower)/2
names(me_50) = "ME_50"
print(me_50)

##     ME_50 
## 0.1256475

Sample of 5000

set.seed(456)
d_5000 = diamonds %>% 
  sample_n(5000)

d_5000_bs = d_5000 %>% 
  specify(response = carat) %>% 
  # Specify the variable of interest
  generate(reps = 1000, type = "bootstrap") %>%  # Create bootstrap samples
  calculate(stat = "mean")              # Calculate the mean for each resample

upper = quantile(d_5000_bs$stat,.975)
lower = quantile(d_5000_bs$stat, .025)
me_5000 = (upper - lower)/2
names(me_5000) = "ME_5000"
print(me_5000)

##    ME_5000 
## 0.01319748

The Ratio

ratio = me_50/me_5000
print(ratio)

##    ME_50 
## 9.520571

The Infer Package

Introduction

Setup

Chi-Square Test for Independence (Categorical Data)

Exercise

Solution

Correlation

Exercise

Permutation Test for Difference in Means Between Two Groups

Exercise

Solution

Bootstrapping a One-Sample Statistic (Confidence Interval for the Mean)

Exercise

Solution

Look at the estimates

Question

Sample of 50

Sample of 5000

The Ratio