Permutation

Harold Nelson

2024-04-12

Setup

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(infer)
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(gmodels)
load("cdc.Rdata")

Examine gender/smoke100 Relationship

Solution

CrossTable(cdc$gender,cdc$smoke100)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  20000 
## 
##  
##              | cdc$smoke100 
##   cdc$gender |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            m |      4547 |      5022 |      9569 | 
##              |    50.471 |    56.448 |           | 
##              |     0.475 |     0.525 |     0.478 | 
##              |     0.431 |     0.532 |           | 
##              |     0.227 |     0.251 |           | 
## -------------|-----------|-----------|-----------|
##            f |      6012 |      4419 |     10431 | 
##              |    46.300 |    51.783 |           | 
##              |     0.576 |     0.424 |     0.522 | 
##              |     0.569 |     0.468 |           | 
##              |     0.301 |     0.221 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |     10559 |      9441 |     20000 | 
##              |     0.528 |     0.472 |           | 
## -------------|-----------|-----------|-----------|
## 
## 

Make smoke100 a f]Factor

Use yes/no.

cdc = cdc %>% 
  
mutate(smoke100 = factor(smoke100,
                  labels=c("No","Yes")))

Permutation Test

Run a permutation test on the independence of these two variables. Ask ChatGPT how to do it using only base R.

https://chat.openai.com/share/25508b28-0cbc-4178-bdd1-f3f7ba45e13a

# Calculate the observed contingency table
observed_table <- table(cdc$gender, cdc$smoke100)

# Calculate the observed Chi-squared statistic
observed_chi_squared <- sum((observed_table - outer(rowSums(observed_table), colSums(observed_table), FUN = "*") / sum(observed_table))^2 / outer(rowSums(observed_table), colSums(observed_table), FUN = "*"))

# Initialize a vector to store the permutation chi-squared statistics
perm_chi_squared <- numeric(1000)

# Permute the data and compute the chi-squared statistic for each permutation
for (i in 1:1000) {
  # Permute the labels of 'smoke100'
  permuted_smoke100 <- sample(cdc$smoke100)

  # Create a new table with permuted labels
  perm_table <- table(cdc$gender, permuted_smoke100)

  # Calculate the chi-squared statistic for the permuted table
  perm_chi_squared[i] <- sum((perm_table - outer(rowSums(perm_table), colSums(perm_table), FUN = "*") / sum(perm_table))^2 / outer(rowSums(perm_table), colSums(perm_table), FUN = "*"))
}

# Calculate the p-value
p_value <- mean(perm_chi_squared >= observed_chi_squared)

# Output the result
cat("The p-value of the permutation test is:", p_value, "\n")
## The p-value of the permutation test is: 0

Use infer

Ask ChatGPT to do the same thing with infer.

Change reps to match the capabilities of your hardware. 1,000 is reasonable for my Mac.

# Install and load the infer package
if (!requireNamespace("infer", quietly = TRUE)) {
  install.packages("infer")
}
library(infer)



# Specify the response and explanatory variables and the test statistic
observed_statistic <- cdc %>%
  specify(response = smoke100, explanatory = gender,success = "Yes") %>%
  calculate(stat = "Chisq")
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "m" - "f", or divided in the order "m" / "f" for ratio-based statistics.
## To specify this order yourself, supply `order = c("m", "f")` to the calculate()
## function.
# Permutation test
p_value <- cdc %>%
  specify(response = smoke100, explanatory = gender,success = "Yes") %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "Chisq") %>%
  get_p_value(obs_stat = observed_statistic$stat, direction = "greater")
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "m" - "f", or divided in the order "m" / "f" for ratio-based statistics.
## To specify this order yourself, supply `order = c("m", "f")` to the calculate()
## function.
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
# Output the result
cat("The p-value of the permutation test is:", p_value$p_value, "\n")
## The p-value of the permutation test is: 0