This R notebook illustrates some robust alternatives to commonly used effect sizes.

Init

options(digits = 3)
library(pacman)
p_load(kirkegaard, sn)

#settings
group_n = 100000
group_2 = rep(LETTERS[1:2], each = group_n)
theme_set(theme_bw())

Cohen’s d vs. robust version

Simulate data with roughly constant gap size, but varying variances. Then apply normal Cohen’s d (based on means and standard deviations) vs. robust Cohen’s d (based on medians and median absolute deviations).

The four distributions each with about 15 point gap between groups: 1. Normal distribution 2. Normal distribution mixed with another normal distribution with larger variance 3. t distribution with low degree of freedom (naturally thick tails) 4. Skewed normal distribution

#large
normal_large = tibble(
  group = group_2,
  
  #normal
  norm = c(rnorm(group_n, mean = 100, sd = 15),
        rnorm(group_n, mean = 115, sd = 15)),
  
  #mixed normal
  norm_mixed = c(rnorm(group_n*.9, mean = 100, sd = 15), rnorm(group_n*.1, mean = 100, sd = 50),
                 rnorm(group_n*.9, mean = 115, sd = 15), rnorm(group_n*.1, mean = 115, sd = 50)),
  
  #t dist
  t = c(rt(group_n, df = 2) * 15 + 100,
        rt(group_n, df = 2) * 15 + 115),
  
  #skewed normal
  snorm = c(rsn(group_n, alpha = 5) %>% {.*30 + 80},
            rsn(group_n, alpha = 5) %>% {.*30 + 95})
) %>% mutate(i = 1:n())

#plot by facet
normal_large %>% 
  gather(key = key, value = value, norm:snorm) %>% 
  ggplot(aes(value, fill = group)) +
  geom_density(alpha = .3, n = 2^15) +
  coord_cartesian(xlim = c(0, 200)) +
  facet_wrap(vars(key))

#numerics
describeBy(normal_large %>% select(norm:snorm),
           group = normal_large$group, mat = T) %>% 
  rownames_to_column() %>% 
  select(-min, -max, -range, -se) %>% 
  arrange(group1, vars)
#cohen d vs. robust version
describeBy(normal_large %>% select(norm:snorm),
           group = normal_large$group) %>% 
  {
    tibble(
      dist = rownames(.$A),
      mean_d = (.$B$mean - .$A$mean),
      median_d = (.$B$median - .$A$median),
      cohen_d = (.$B$mean - .$A$mean) / rowMeans(cbind(.$B$sd, .$A$sd)),
      cohen_d_robust = (.$B$median - .$A$median) / rowMeans(cbind(.$B$mad, .$A$mad))
    )
  }