Shuffle and Difference

Harold Nelson

2024-07-04

Setup

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(infer)

Soda

# Soda data: 60 rows, one per respondent, with the drink chosen and the
# respondent's location. Cell counts, in row order:
#   Cola/East 28, Cola/West 19, Orange/East 6, Orange/West 7.
soda <- data.frame(
  drink = rep(c("Cola", "Orange"), times = c(47, 13)),
  location = rep(
    c("East", "West", "East", "West"),
    times = c(28, 19, 6, 7)
  )
)
# One shuffle of the drink labels, reduced to the gap in the Cola proportion
# between locations, shown next to the same gap for the original labels.
soda |>
  mutate(drink_perm = sample(drink)) |>
  group_by(location) |>
  summarise(
    prop_cola_perm = mean(drink_perm == "Cola"),
    prop_cola = mean(drink == "Cola")
  ) |>
  # diff() over the two location rows is second minus first: West - East
  summarise(
    diff_perm = diff(prop_cola_perm),
    diff_orig = diff(prop_cola)
  )
## # A tibble: 1 × 2
##   diff_perm diff_orig
##       <dbl>     <dbl>
## 1    -0.161   -0.0928
# First six rows of soda
head(soda, n = 6L)
##   drink location
## 1  Cola     East
## 2  Cola     East
## 3  Cola     East
## 4  Cola     East
## 5  Cola     East
## 6  Cola     East
# Last six rows of soda
tail(soda, n = 6L)
##     drink location
## 55 Orange     West
## 56 Orange     West
## 57 Orange     West
## 58 Orange     West
## 59 Orange     West
## 60 Orange     West

Step 1

# Step 1: add drink_perm, a random reordering of the drink column. The
# original drink and location columns are kept unchanged alongside it.
step1 <- soda |>
  mutate(drink_perm = sample(drink))
step1
##     drink location drink_perm
## 1    Cola     East     Orange
## 2    Cola     East       Cola
## 3    Cola     East     Orange
## 4    Cola     East       Cola
## 5    Cola     East       Cola
## 6    Cola     East       Cola
## 7    Cola     East       Cola
## 8    Cola     East     Orange
## 9    Cola     East       Cola
## 10   Cola     East     Orange
## 11   Cola     East       Cola
## 12   Cola     East       Cola
## 13   Cola     East     Orange
## 14   Cola     East     Orange
## 15   Cola     East       Cola
## 16   Cola     East       Cola
## 17   Cola     East       Cola
## 18   Cola     East       Cola
## 19   Cola     East       Cola
## 20   Cola     East     Orange
## 21   Cola     East       Cola
## 22   Cola     East       Cola
## 23   Cola     East       Cola
## 24   Cola     East       Cola
## 25   Cola     East       Cola
## 26   Cola     East       Cola
## 27   Cola     East       Cola
## 28   Cola     East       Cola
## 29   Cola     West       Cola
## 30   Cola     West       Cola
## 31   Cola     West       Cola
## 32   Cola     West       Cola
## 33   Cola     West       Cola
## 34   Cola     West       Cola
## 35   Cola     West       Cola
## 36   Cola     West       Cola
## 37   Cola     West       Cola
## 38   Cola     West       Cola
## 39   Cola     West       Cola
## 40   Cola     West     Orange
## 41   Cola     West       Cola
## 42   Cola     West       Cola
## 43   Cola     West     Orange
## 44   Cola     West       Cola
## 45   Cola     West       Cola
## 46   Cola     West       Cola
## 47   Cola     West     Orange
## 48 Orange     East       Cola
## 49 Orange     East       Cola
## 50 Orange     East       Cola
## 51 Orange     East     Orange
## 52 Orange     East       Cola
## 53 Orange     East       Cola
## 54 Orange     West       Cola
## 55 Orange     West     Orange
## 56 Orange     West       Cola
## 57 Orange     West     Orange
## 58 Orange     West       Cola
## 59 Orange     West       Cola
## 60 Orange     West       Cola
# Cross-tabulate the original drink (rows) against the shuffled drink (columns)
table(step1[["drink"]], step1[["drink_perm"]])
##         
##          Cola Orange
##   Cola     37     10
##   Orange   10      3

sample()?

What does sample() do?

# sample(x) with no other arguments returns the elements of x in a random
# order, so repeated calls generally give different orderings.
x <- c(1, 2, 3, 4, 5)
sample(x)
## [1] 5 1 3 4 2
sample(x)
## [1] 1 5 3 2 4
sample(x)
## [1] 1 2 3 4 5

Step 2

# Step 2: within each location, compute the proportion of Cola for the
# shuffled labels (prop_cola_perm) and for the original labels (prop_cola).
step2 <- step1 |>
  group_by(location) |>
  summarize(prop_cola_perm = mean(drink_perm == "Cola"),
            prop_cola = mean(drink == "Cola"))
step2
## # A tibble: 2 × 3
##   location prop_cola_perm prop_cola
##   <chr>             <dbl>     <dbl>
## 1 East              0.765     0.824
## 2 West              0.808     0.731

Step 3

# Step 3: collapse the two location rows into a single row holding the
# difference in Cola proportion for the shuffled and the original labels.
step3 <- step2 |>
  summarize(diff_perm = diff(prop_cola_perm),
            diff_orig = diff(prop_cola))  # West - East

step3
## # A tibble: 1 × 2
##   diff_perm diff_orig
##       <dbl>     <dbl>
## 1    0.0430   -0.0928

diff()?

What does diff() do?

# diff(x) returns the differences between consecutive elements,
# x[i + 1] - x[i], so the result is one element shorter than x.
x <- c(1, 2, 3, 5, 8)
diff(x)
## [1] 1 1 2 3