Abstract
We run benchmark speed tests of alternative algorithms for calculating the Area Under the Curve (AUC), also called the ‘common language effect size’ or ‘probability of superiority’. We want to estimate the probability that a randomly selected item from one group has a higher (or lower) value on some variable than a randomly selected item from another group. The best performer is an algorithm that relies on tabulate().

This R Markdown Notebook is prompted by a recent discussion on Stack Overflow, where I asked:
I want to estimate the likelihood that a randomly selected item from one group will have a higher score than a randomly selected item from a different group. That is, the Probability of Superiority, sometimes called the Common-language Effect Size. See for example: https://rpsychologist.com/d3/cohend/. This can be resolved algebraically if we accept that the data are normally distributed (McGraw and Wong, 1992, Psychological Bulletin, 111), but I know that my data are not normally distributed, making such an estimate unreliable.
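For reference, the algebraic (normal-theory) estimate mentioned in the question can be written directly in base R. This is a minimal sketch, not part of the benchmark; the function name cles_normal is my own, and the formula is only valid under the normality assumption that the rest of this notebook sets aside.
# Normal-theory common-language effect size (McGraw & Wong, 1992):
# P(X > Y) for independent X ~ N(m1, s1^2) and Y ~ N(m2, s2^2),
# estimated by plugging in the sample means and variances
cles_normal <- function(x, y) {
  pnorm((mean(x) - mean(y)) / sqrt(var(x) + var(y)))
}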
The resulting discussion produced several alternatives, most of which were a great deal better than my first efforts.
This file runs benchmarking tests to compare the relative speed of each algorithm at different sample sizes.
# for analysis
library(data.table)
library(bigstatsr)
# for plotting
library(ggplot2) # pretty plots
library(scales) # commas in large numbers
# for recording & comparing runtimes
library(microbenchmark)
# for output to PDF & HTML
library(tinytex)
options(width = 90)
Eight alternative functions are benchmarked against each other, each run for 100 iterations at a prespecified sample size. Summary tables are presented for absolute and relative times, and the results are then visualised with violin plots.
MyForIfElse.function <- function(){
  # My first attempt: nested loops over every (alpha, beta) pair
  bigger <- 0
  equal <- 0.0
  smaller <- 0
  for (i in alpha) {
    for (j in beta) {
      if (i > j) {
        bigger <- bigger + 1
      } else if (i == j) {
        equal <- equal + 0.5
      }
    }
  }
  PaGTb <- (bigger + equal) / nPairs
  PaGTb
}
Myexpandgrid.function <- function(){
  # My second attempt
  c <- expand.grid(a = alpha, b = beta)
  aGTb <- length(which(c$a > c$b))
  aEQb <- length(which(c$a == c$b))
  aGTb <- aGTb + (0.5 * aEQb)
  PaGTb <- aGTb / nPairs
  PaGTb
}
MyCJ.function <- function(){
  data.table::CJ(a = alpha, b = beta, sorted = FALSE)[, {
    aGTb = sum(a > b)
    aEQb = sum(a == b)
    aGTb = aGTb + 0.5 * aEQb
    PaGTb = aGTb / nPairs  # data.table returns last value in {}
  } ]
}
MyAUC.function <- function() {
  MyAUC <-
    bigstatsr::AUC(c(alpha, beta), rep(1:0, c(length(alpha), length(beta))))
  MyAUC
}
MyDT.function <- function() {
  # Frequency tables of each sample, then two joins:
  # the non-equi join (b < a) counts pairs where alpha beats beta,
  # weighted by the frequencies N * i.N; the equi join (b = a) counts ties
  DTa <- data.table(a = alpha)[, .N, keyby = a]
  DTb <- data.table(b = beta)[, .N, keyby = b]
  MyDT <- (DTb[DTa, on = .(b < a), allow.cartesian = TRUE,
               nomatch = 0L, sum(N * i.N)] +
           0.5 * DTb[DTa, on = .(b = a), allow.cartesian = TRUE,
                     nomatch = 0L, sum(N * i.N)]) / nPairs
  MyDT
}
MyWilcox.function <- function() {
  MyWilcox <- wilcox.test(alpha, beta)$statistic / nPairs
  MyWilcox
}
wilcox_AUC <- function(){
  # Mann-Whitney U from rank sums, without the overhead of wilcox.test():
  # U = (sum of alpha's ranks in the pooled sample) - n.x * (n.x + 1) / 2,
  # and AUC = U / (n.x * n.y); midranks give ties half credit automatically
  r <- rank(c(alpha, beta))
  n.x <- as.integer(length(alpha))
  n.y <- as.integer(length(beta))
  {sum(r[seq_along(alpha)]) - n.x * (n.x + 1) / 2} / n.x / n.y
}
AUC2 <- function() {
  # tabulate() ignores values <= 0 (and truncates to integer bins),
  # so shift both samples so the smallest value falls in bin 1
  m <- min(c(alpha, beta))
  A_Freq <- tabulate(alpha - m + 1)
  B_Freq <- tabulate(beta - m + 1)
  # the outer product of the frequency tables counts the pairs in each
  # combination of bins: the lower triangle holds pairs with alpha > beta,
  # the diagonal holds the ties
  out_matrix <- outer(A_Freq, B_Freq)
  bigger <- sum(out_matrix[lower.tri(out_matrix)])
  equal <- 0.5 * sum(diag(out_matrix))
  (bigger + equal) / length(alpha) / length(beta)
}
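To see why the tabulate() approach works, here is a small worked sketch on two made-up integer samples (the objects A, B, A_Freq, B_Freq and pairs below are illustrative only and are not used elsewhere): the outer product of the two frequency tables counts how many (A, B) pairs fall in each combination of values, the lower triangle collects the pairs where A > B, and the diagonal collects the ties.
A <- c(1, 2, 2, 3)
B <- c(1, 1, 3)
A_Freq <- tabulate(A)                       # counts of 1, 2, 3 in A: 1, 2, 1
B_Freq <- tabulate(B)                       # counts of 1, 2, 3 in B: 2, 0, 1
pairs <- outer(A_Freq, B_Freq)              # pairs[i, j] = # of pairs with A == i, B == j
bigger <- sum(pairs[lower.tri(pairs)])      # pairs with A > B: 6
equal <- 0.5 * sum(diag(pairs))             # half credit for the 3 ties: 1.5
(bigger + equal) / (length(A) * length(B))  # 7.5 / 12 = 0.625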
MyBenchmark.function <- function(x) {
  mbm <<- microbenchmark(
    "ForIfElse"   = {MyForIfElse.function()},
    "expandgrid"  = {Myexpandgrid.function()},
    "CrossJoin"   = {MyCJ.function()},
    "CartesianDT" = {MyDT.function()},
    "Wilcox"      = {MyWilcox.function()},
    "BaseWilcox"  = {wilcox_AUC()},
    "BigStatsAUC" = {MyAUC.function()},
    "BaseAUC"     = {AUC2()},
    times = x,
    unit = "ns"
  )
}
MyDistPlot <- function(){
  a <- data.frame(val = alpha, id = "alpha")
  b <- data.frame(val = beta, id = "beta")
  c <- rbind(a, b)
  p2 <- ggplot(data = c, aes(x = val, group = id, fill = id)) +
    geom_density(adjust = 1.5, alpha = .2) +
    theme_classic() +
    labs(title = NULL,
         subtitle = paste("Sample sizes = ", MyRows),
         caption = paste("AUC = Alpha:", round(PaGTb, 2) * 100,
                         ", Beta:", round(1 - PaGTb, 2) * 100,
                         "\nNote truncated x-axis: Values can range ± 30")
    ) +
    theme(legend.position = c(0.9, 0.9)) +
    xlim(-8, 8)
  p2
}
MyPlot.function <- function(){
  pl <- ggplot(mbm, aes(expr, time)) +
    geom_violin(scale = "width", trim = TRUE,
                fill = "blue", alpha = 0.1,
                colour = "#999999") +
    geom_jitter(height = 0, width = 0.2,
                alpha = 0.1,
                colour = "navy") +
    theme_classic() +
    scale_y_log10() +
    labs(title = "Speed-test: alternative Probability of Superiority calculations",
         subtitle = paste(comma(MyRows), " x ",
                          comma(MyRows), " = ",
                          comma(nPairs), "comparisons"),
         y = "Nanoseconds (log10)", x = NULL) +
    coord_flip()
  pl
}
In this example, two non-normal data sets are compared, so it’s unclear which should have the higher probability of superiority. Group Alpha is symmetric but heavy-tailed, with many large negative and positive values, while group Beta is a shifted \(\chi^2\) distribution (df = 1), so highly skewed.
# for reproducibility
# set.seed(42)
# standard normal distribution, cubed (symmetric, heavy-tailed)
MakeAlpha <- function(){rnorm(MyRows, mean = 0, sd = 1)**3}
# standard normal squared (chi-squared, df = 1), shifted down by 0.6
MakeBeta <- function(){rnorm(MyRows, mean = 0, sd = 1)**2 - 0.6}
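As a quick, optional check on those shapes, the sketch below draws fresh samples with the same recipes and computes simple moment-based skewness and excess kurtosis in base R (MySkew and MyKurt are illustrative helper names of my own, not from any package). Alpha should come out roughly symmetric but very heavy-tailed, and Beta strongly right-skewed.
MySkew <- function(v) mean((v - mean(v))^3) / sd(v)^3
MyKurt <- function(v) mean((v - mean(v))^4) / sd(v)^4 - 3   # excess kurtosis
x <- rnorm(1e5)^3        # same recipe as MakeAlpha()
y <- rnorm(1e5)^2 - 0.6  # same recipe as MakeBeta()
round(c(skew_alpha = MySkew(x), kurt_alpha = MyKurt(x),
        skew_beta = MySkew(y), kurt_beta = MyKurt(y)), 1)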
Note: a large sample size can take a long time to run.
MyRows <- 25
The complete simulation is run 100 times. (Reduce this number for a faster result.)
alpha <- MakeAlpha()
beta <- MakeBeta()
nPairs <- length(alpha) * length(beta)
# For interest, we calculate the sample Probability of Superiority
# (Area Under the Curve)
PaGTb <- AUC2()
MyDistPlot()
## Warning: Removed 5 rows containing non-finite values (stat_density).
MyBenchmark.function(100)
# print comparison speed test table
print(mbm, unit = "ms")
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## ForIfElse 0.022102 0.0242185 0.08820782 0.0249340 0.0257555 6.062722 100 a
## expandgrid 0.119878 0.1392545 0.30590234 0.1600375 0.1754530 5.046713 100 a
## CrossJoin 0.367652 0.4070780 1.00455212 0.4314830 0.5977610 38.268336 100 a
## CartesianDT 2.105333 2.2551925 4.99424879 2.5348100 2.9842695 204.489821 100 b
## Wilcox 0.063169 0.0763720 0.13229076 0.0996325 0.1116375 1.645324 100 a
## BaseWilcox 0.011595 0.0179795 0.08012278 0.0216845 0.0257735 5.125787 100 a
## BigStatsAUC 0.105285 0.1281850 0.18496230 0.1530405 0.1719600 1.936180 100 a
## BaseAUC 0.023367 0.0334565 0.04433223 0.0411515 0.0480130 0.157854 100 a
print(mbm, unit = "relative")
## Unit: relative
##        expr        min         lq       mean     median          uq         max neval cld
##   ForIfElse   1.906166   1.347006  1.1009081   1.149854   0.9993016  1.18278852   100   a
##  expandgrid  10.338767   7.745182  3.8179197   7.380272   6.8074961  0.98457330   100   a
##   CrossJoin  31.707805  22.641230 12.5376593  19.898222  23.1928531  7.46584593   100   a
## CartesianDT 181.572488 125.431325 62.3324452 116.895017 115.7882903 39.89432667   100   b
##      Wilcox   5.447952   4.247727  1.6511005   4.594641   4.3314839  0.32098954   100   a
##  BaseWilcox   1.000000   1.000000  1.0000000   1.000000   1.0000000  1.00000000   100   a
## BigStatsAUC   9.080207   7.129509  2.3084858   7.057599   6.6719693  0.37773321   100   a
##     BaseAUC   2.015265   1.860814  0.5533037   1.897738   1.8628824  0.03079605   100   a
# Visualise the comparison
MyPlot.function()
At this small sample size, the original ForIfElse algorithm works about as well as, or better than, most of the other options.
The Modified Wilcox (BaseWilcox) and BaseR_AUC (BaseAUC) algorithms perform best most often.
MyRows <- 500
# set.seed(42)
alpha <- MakeAlpha()
beta <- MakeBeta()
nPairs <- length(alpha) * length(beta)
PaGTb <- MyAUC.function()
MyDistPlot()
## Warning: Removed 24 rows containing non-finite values (stat_density).
MyBenchmark.function(100)
# print comparison benchmark tables
print(mbm, unit = "ms")
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## ForIfElse 7.679252 7.8980865 8.33344224 8.0379090 8.2410470 19.678600 100 cd
## expandgrid 5.339306 5.6215980 8.21365743 6.0491555 8.9943415 100.318393 100 cd
## CrossJoin 2.092173 2.2393230 5.07015497 2.7431545 3.8688980 130.642755 100 bc
## CartesianDT 3.858280 4.5416540 10.63300470 5.5445895 7.8997405 199.951686 100 d
## Wilcox 1.395751 1.5280865 1.66299851 1.5914405 1.6656055 5.508405 100 ab
## BaseWilcox 0.098033 0.1085540 0.16704753 0.1194020 0.1341605 4.331057 100 a
## BigStatsAUC 0.172668 0.2261765 0.26158034 0.2557905 0.2801315 0.533215 100 a
## BaseAUC 0.044649 0.0707030 0.08500532 0.0790260 0.0903785 0.315768 100 a
print(mbm, unit = "relative")
## Unit: relative
## expr min lq mean median uq max neval cld
## ForIfElse 171.991579 111.707940 98.034361 101.712209 91.183711 62.319804 100 cd
## expandgrid 119.584000 79.510035 96.625216 76.546396 99.518597 317.696515 100 cd
## CrossJoin 46.858228 31.672249 59.645149 34.712050 42.807725 413.730191 100 bc
## CartesianDT 86.413581 64.235662 125.086344 70.161586 87.407298 633.223398 100 d
## Wilcox 31.260521 21.612753 19.563464 20.138189 18.429223 17.444469 100 ab
## BaseWilcox 2.195637 1.535352 1.965142 1.510920 1.484429 13.715947 100 a
## BigStatsAUC 3.867231 3.198966 3.077223 3.236789 3.099537 1.688629 100 a
## BaseAUC 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 100 a
MyPlot.function()
The Modified Wilcox, BigStatsR AUC and BaseR AUC algorithms perform best most often.
The ForIfElse algorithm is now among the poorer performers.
MyRows <- 5000
# set.seed(42)
alpha <- MakeAlpha()
beta <- MakeBeta()
nPairs <- length(alpha) * length(beta)
PaGTb <- MyAUC.function()
MyDistPlot()
## Warning: Removed 239 rows containing non-finite values (stat_density).
MyBenchmark.function(100)
# print comparison benchmark tables
print(mbm, unit = "ms")
## Unit: milliseconds
##        expr        min         lq        mean     median          uq         max neval cld
##   ForIfElse 776.873969 816.858854 898.3932144 884.090684 980.0347880 1220.914776   100   e
##  expandgrid 548.302321 670.039900 767.7625184 755.727513 829.0553620 1448.790001   100   d
##   CrossJoin 203.739032 304.264470 349.6972554 334.971122 404.6782315  683.965457   100   c
## CartesianDT 166.358794 182.056548 249.5755077 204.771629 316.9810685  555.044648   100   b
##      Wilcox  12.708419  13.486905  16.5528643  14.054885  15.1856945  146.152472   100   a
##  BaseWilcox   1.208642   1.244107   1.4087997   1.295043   1.3457735    7.271682   100   a
## BigStatsAUC   0.682308   0.762148   0.8244686   0.797541   0.8505025    1.425798   100   a
##     BaseAUC   0.139901   0.173552   0.1869021   0.181387   0.1956810    0.372841   100   a
print(mbm, unit = "relative")
## Unit: relative
##        expr         min          lq        mean      median          uq         max neval cld
##   ForIfElse 5553.026562 4706.709540 4806.756982 4874.057592 5008.328800 3274.625849   100   e
##  expandgrid 3919.216596 3860.744327 4107.831389 4166.381896 4236.769855 3885.811917   100   d
##   CrossJoin 1456.308618 1753.160260 1871.017832 1846.720669 2068.050713 1834.469538   100   c
## CartesianDT 1189.117976 1049.002884 1335.327109 1128.921196 1619.886798 1488.689946   100   b
##      Wilcox   90.838657   77.711035   88.564333   77.485625   77.604338  391.996781   100   a
##  BaseWilcox    8.639266    7.168494    7.537632    7.139671    6.877385   19.503440   100   a
## BigStatsAUC    4.877077    4.391468    4.411231    4.396903    4.346372    3.824145   100   a
##     BaseAUC    1.000000    1.000000    1.000000    1.000000    1.000000    1.000000   100   a
MyPlot.function()
The BaseR_AUC (BaseAUC) algorithm, followed by Modified Wilcox (BaseWilcox) and BigStatsR AUC, performs best most often.
BaseR_AUC runs at about 5000 times the speed of the ForIfElse algorithm.
The Area Under the Curve (AUC) function from the bigstatsr package (Statistical Tools for Filebacked Big Matrices) by Florian Privé (Grenoble, France) was extraordinarily efficient.
The two close winners were created by Cole, who prefers to remain anonymous, with significant input from chinsoon12 (Singapore): the rank-based wilcox_AUC() and the tabulate()-based AUC2() functions.
Both algorithms are attractive because they rely only on Base R.