Benchmarking Area Under the Curve

Background

This R Markdown Notebook is prompted by a recent discussion on StackOverflow, where I asked:

I want to estimate the likelihood that a randomly-selected item from one group will have a higher score than a randomly-selected item from a different group. That is, the Probability of Superiority, sometimes called the Common-language Effect Size. See for example: https://rpsychologist.com/d3/cohend/. This can be resolved algebraically if we accept that the data are normally distributed (McGraw and Wong, 1992, Psychological Bulletin, 111), but I know that my data are not normally distributed, making such an estimate unreliable.
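For reference, the algebraic normal-theory estimate mentioned above can be written in a few lines of R. This is a minimal sketch (not part of the benchmark), assuming two numeric vectors alpha and beta; it applies the McGraw and Wong (1992) formula and is only trustworthy when both groups are approximately normal.

# Normal-theory Probability of Superiority (Common-language Effect Size)
# A sketch: exact only under the normality assumption
cles_normal <- function(alpha, beta) {
  z <- (mean(alpha) - mean(beta)) / sqrt(var(alpha) + var(beta))
  pnorm(z)  # P(random alpha > random beta) if both groups are normal
}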

The resulting discussion produced several alternatives, most of which were a great deal better than my first efforts.

This file runs benchmarking tests to see the relative speed at which each algorithm works with different sample sizes.

Setup

Load required libraries

# for analysis 
library(data.table)
library(bigstatsr)

# for plotting
library(ggplot2) # pretty plots
library(scales)  # commas in large numbers

# for recording & comparing runtimes
library(microbenchmark)

# for output to PDF & HTML
library(tinytex)

options(width = 90)

Functions to test

Eight alternative functions are benchmarked against each other, each run for 100 iterations at a prespecified sample size. Summary tables are presented for absolute and relative times, and the results are then shown as violin plots.

Functions are:

  1. My original For-If-Else loop
  2. Bring the data together with expand.grid
  3. CJ (Cross Join): a data.table formed from the cross join of the two vectors
  4. data.table: Cartesian (non-equi) join of frequency tables
  5. Wilcoxon test from base R
  6. Modified Wilcoxon, computed directly from ranks in base R
  7. Base R AUC, taking advantage of properties of the matrix outer product
  8. AUC from bigstatsr

My Original For-If-Else function

MyForIfElse.function <- function(){
  bigger  <- 0
  equal   <- 0.0
  smaller <- 0

  for (i in alpha) {
    for (j in beta) {
      if (i > j) {bigger <- bigger + 1}
      else
        if (i == j) {equal <- equal + 0.5}
    }
  }
  PaGTb <- (bigger + equal) / nPairs
  PaGTb
}

Expand.grid function

Myexpandgrid.function <- function(){
  # My second attempt
  c <- expand.grid(a = alpha, b = beta)
  aGTb <- length(which(c$a > c$b))
  aEQb <- length(which(c$a == c$b))
  aGTb <- aGTb + (0.5 * aEQb)
  PaGTb <- aGTb / nPairs
  PaGTb
}

Cross Join function

MyCJ.function <- function(){
  # CJ() builds a data.table of every (a, b) pair;
  # the last value evaluated inside {} is returned
  data.table::CJ(a = alpha, b = beta, sorted = FALSE)[, {
    aGTb <- sum(a > b)
    aEQb <- sum(a == b)
    aGTb <- aGTb + 0.5 * aEQb
    PaGTb <- aGTb / nPairs  # data.table returns the last value in {}
  }]
}

bigstatsr AUC function

MyAUC.function <- function() {
  # bigstatsr::AUC(pred, target): the scores are c(alpha, beta),
  # the labels are 1 for alpha and 0 for beta
  MyAUC <-
    bigstatsr::AUC(c(alpha, beta), rep(1:0, c(length(alpha), length(beta))))
  MyAUC
}

DataTable function

MyDT.function <- function() {
  # Frequency tables of the distinct values in each group, keyed for joining
  DTa <- data.table(a = alpha)[, .N, keyby = a]
  DTb <- data.table(b = beta)[, .N, keyby = b]
  # The non-equi join counts pairs with b < a; the equi join counts ties (weight 0.5)
  MyDT <- (DTb[DTa, on = .(b < a), allow.cartesian = TRUE, 
               nomatch = 0L, sum(N*i.N)] +
      0.5 * DTb[DTa, on = .(b = a), allow.cartesian = TRUE, 
                nomatch = 0L, sum(N*i.N)]) / nPairs
  MyDT
}

Wilcoxon test function

MyWilcox.function <- function() {
  # The Mann-Whitney W statistic counts the (alpha > beta) pairs plus half
  # the ties, so W / nPairs is the probability of superiority
  MyWilcox <- wilcox.test(alpha, beta)$statistic / nPairs
  MyWilcox
}

Modified Wilcox

wilcox_AUC <- function(){
  # Rank all values together; the rank sum of alpha, minus its minimum
  # possible value, is the Mann-Whitney statistic, and dividing by the
  # number of pairs gives the AUC
  r <- rank(c(alpha, beta))
  
  n.x <- as.integer(length(alpha))
  n.y <- as.integer(length(beta))
  
  {sum(r[seq_along(alpha)]) - n.x * (n.x + 1) / 2} / n.x / n.y
}

Base AUC

AUC2 <- function() {
  # tabulate() only counts positive integer bins, so shift the data so that
  # the minimum value falls in bin 1 (non-integer values are truncated to
  # bins, making this exact for integer data and approximate otherwise)
  m <- min(c(alpha, beta))

  A_Freq <- tabulate(alpha - m + 1)
  B_Freq <- tabulate(beta - m + 1)

  # Outer product of the frequency vectors: cell [i, j] counts the pairs
  # with alpha in bin i and beta in bin j
  out_matrix <- outer(A_Freq, B_Freq)
  
  # Below the diagonal alpha's bin exceeds beta's; on the diagonal they tie
  bigger <- sum(out_matrix[lower.tri(out_matrix)])
  equal  <- 0.5 * sum(diag(out_matrix))
  (bigger + equal) / length(alpha) / length(beta)
}
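Before benchmarking, it is worth confirming that the implementations agree. Here is a minimal sketch of an agreement check (not part of the benchmark); it assumes small integer-valued test vectors so that every method, including the bin-based AUC2, returns exactly the same value. The vectors are reassigned by the simulation code further below.

# Quick agreement check on small integer data (a sketch, not benchmarked)
set.seed(1)
alpha  <- sample(1:20, 50, replace = TRUE)
beta   <- sample(1:20, 50, replace = TRUE)
nPairs <- length(alpha) * length(beta)

c(ForIfElse  = MyForIfElse.function(),
  expandgrid = Myexpandgrid.function(),
  BaseWilcox = wilcox_AUC(),
  BigStatsR  = MyAUC.function(),
  BaseAUC    = AUC2())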

Benchmark all of the Functions

MyBenchmark.function <- function(x) {
  # Results are assigned to the global `mbm` for the print and plot calls below
  mbm <<- microbenchmark(
    "ForIfElse"   = {MyForIfElse.function()},
    "expandgrid"  = {Myexpandgrid.function()},
    "CrossJoin"   = {MyCJ.function()},
    "CartesianDT" = {MyDT.function()},
    "Wilcox"      = {MyWilcox.function()},
    "BaseWilcox"  = {wilcox_AUC()},
    "BigStatsAUC" = {MyAUC.function()},
    "BaseAUC"     = {AUC2()},
    times = x,
    unit = "ns"
  )
}

Charting functions

Function to visualise the data distributions

MyDistPlot <- function(){
  a <- data.frame(val = alpha, id = "alpha")
  b <- data.frame(val = beta, id = "beta")
  c <- rbind(a,b)
  
  p2 <- ggplot(data = c, aes(x = val, group = id, fill = id)) +
    geom_density(adjust = 1.5, alpha = .2) +  
    theme_classic() +
    labs(title = NULL , 
         subtitle = paste("Sample sizes = ", MyRows),
         caption = paste("AUC = Alpha:", round(PaGTb, 2) * 100, 
                         ", Beta:", round(1 - PaGTb, 2) * 100,
                         "\nNote truncated x-axis: values can range ± 30")
    ) +
    theme(legend.position = c(0.9, 0.9)) +
    xlim(-8, 8)
  p2
}

Function to plot results of simulation

MyPlot.function <- function(){
  pl <- ggplot(mbm, aes(expr, time)) +
    geom_violin(scale = "width", trim = TRUE, 
                fill = "blue", alpha = 0.1, 
                colour = "#999999") +
    geom_jitter(height = 0, width = 0.2, 
                alpha = 0.1, 
                colour = "navy") +
    theme_classic() +
    scale_y_log10() +
    labs(title = "Speed-test: alternative Probability of Superiority calculations",
         subtitle = paste(comma(MyRows), " x ", 
                          comma(MyRows), " = ", 
                          comma(nPairs), "comparisons"),
         y = "Nanoseconds (log10)" , x = NULL ) +
    coord_flip()
  pl
}

Generate some artificial data

Generating data

In this example, two non-normal data sets are compared, so it is not obvious in advance which group should have the higher probability of superiority. Group Alpha is heavy-tailed (spread out), with many large negative and positive values, while group Beta is a shifted \(\chi^2\) distribution (df = 1), which is highly skewed.

# for reproducibility
# set.seed(42)

# standard normal distribution, cubed
MakeAlpha <- function(){rnorm(MyRows, mean = 0, sd = 1)**3}
# normal distribution squared and shifted
MakeBeta <- function(){rnorm(MyRows, mean = 0, sd = 1)**2 - 0.6}

Run the simulation

Change the value of MyRows to use a different sample size.

Note: a large sample size can take a long time to run.

MyRows <- 25

The complete benchmark runs each function 100 times.

(Reduce this number for a faster result.)

alpha <- MakeAlpha()
beta <- MakeBeta()
nPairs <- length(alpha) * length(beta)

# For interest, we calculate the sample Probability of Superiority 
# (Area Under the Curve)
PaGTb <- AUC2()
MyDistPlot() 
## Warning: Removed 5 rows containing non-finite values (stat_density).

Rerun with a larger sample size

MyRows <- 500

# set.seed(42)

alpha <- MakeAlpha()
beta <- MakeBeta()
nPairs <- length(alpha) * length(beta)

PaGTb <- MyAUC.function()
MyDistPlot() 
## Warning: Removed 24 rows containing non-finite values (stat_density).

MyBenchmark.function(100)

# print comparison benchmark tables
print(mbm, unit = "ms")
## Unit: milliseconds
##         expr      min        lq        mean    median        uq        max neval  cld
##    ForIfElse 7.679252 7.8980865  8.33344224 8.0379090 8.2410470  19.678600   100   cd
##   expandgrid 5.339306 5.6215980  8.21365743 6.0491555 8.9943415 100.318393   100   cd
##    CrossJoin 2.092173 2.2393230  5.07015497 2.7431545 3.8688980 130.642755   100  bc 
##  CartesianDT 3.858280 4.5416540 10.63300470 5.5445895 7.8997405 199.951686   100    d
##       Wilcox 1.395751 1.5280865  1.66299851 1.5914405 1.6656055   5.508405   100 ab  
##   BaseWilcox 0.098033 0.1085540  0.16704753 0.1194020 0.1341605   4.331057   100 a   
##  BigStatsAUC 0.172668 0.2261765  0.26158034 0.2557905 0.2801315   0.533215   100 a   
##      BaseAUC 0.044649 0.0707030  0.08500532 0.0790260 0.0903785   0.315768   100 a
print(mbm, unit = "relative")
## Unit: relative
##         expr        min         lq       mean     median        uq        max neval  cld
##    ForIfElse 171.991579 111.707940  98.034361 101.712209 91.183711  62.319804   100   cd
##   expandgrid 119.584000  79.510035  96.625216  76.546396 99.518597 317.696515   100   cd
##    CrossJoin  46.858228  31.672249  59.645149  34.712050 42.807725 413.730191   100  bc 
##  CartesianDT  86.413581  64.235662 125.086344  70.161586 87.407298 633.223398   100    d
##       Wilcox  31.260521  21.612753  19.563464  20.138189 18.429223  17.444469   100 ab  
##   BaseWilcox   2.195637   1.535352   1.965142   1.510920  1.484429  13.715947   100 a   
##  BigStatsAUC   3.867231   3.198966   3.077223   3.236789  3.099537   1.688629   100 a   
##      BaseAUC   1.000000   1.000000   1.000000   1.000000  1.000000   1.000000   100 a
MyPlot.function() 

Interpretation of the larger-sample experiment

The Modified Wilcox, bigstatsr AUC and Base R AUC algorithms seem to perform best most often.

The ForIfElse algorithm is now among the poorest performers.


Rerun with a much larger sample size

MyRows <- 5000

# set.seed(42)

alpha <- MakeAlpha()
beta <- MakeBeta()
nPairs <- length(alpha) * length(beta)

PaGTb <- MyAUC.function()
MyDistPlot() 
## Warning: Removed 239 rows containing non-finite values (stat_density).

MyBenchmark.function(100)

# print comparison benchmark tables
print(mbm, unit = "ms")
## Unit: milliseconds
##         expr        min         lq        mean     median          uq         max neval
##    ForIfElse 776.873969 816.858854 898.3932144 884.090684 980.0347880 1220.914776   100
##   expandgrid 548.302321 670.039900 767.7625184 755.727513 829.0553620 1448.790001   100
##    CrossJoin 203.739032 304.264470 349.6972554 334.971122 404.6782315  683.965457   100
##  CartesianDT 166.358794 182.056548 249.5755077 204.771629 316.9810685  555.044648   100
##       Wilcox  12.708419  13.486905  16.5528643  14.054885  15.1856945  146.152472   100
##   BaseWilcox   1.208642   1.244107   1.4087997   1.295043   1.3457735    7.271682   100
##  BigStatsAUC   0.682308   0.762148   0.8244686   0.797541   0.8505025    1.425798   100
##      BaseAUC   0.139901   0.173552   0.1869021   0.181387   0.1956810    0.372841   100
##    cld
##      e
##     d 
##    c  
##   b   
##  a    
##  a    
##  a    
##  a
print(mbm, unit = "relative")
## Unit: relative
##         expr         min          lq        mean      median          uq         max
##    ForIfElse 5553.026562 4706.709540 4806.756982 4874.057592 5008.328800 3274.625849
##   expandgrid 3919.216596 3860.744327 4107.831389 4166.381896 4236.769855 3885.811917
##    CrossJoin 1456.308618 1753.160260 1871.017832 1846.720669 2068.050713 1834.469538
##  CartesianDT 1189.117976 1049.002884 1335.327109 1128.921196 1619.886798 1488.689946
##       Wilcox   90.838657   77.711035   88.564333   77.485625   77.604338  391.996781
##   BaseWilcox    8.639266    7.168494    7.537632    7.139671    6.877385   19.503440
##  BigStatsAUC    4.877077    4.391468    4.411231    4.396903    4.346372    3.824145
##      BaseAUC    1.000000    1.000000    1.000000    1.000000    1.000000    1.000000
##  neval   cld
##    100     e
##    100    d 
##    100   c  
##    100  b   
##    100 a    
##    100 a    
##    100 a    
##    100 a
MyPlot.function() 

Interpretation of the much larger-sample experiment

The Base R AUC, followed by the Modified Wilcox and the bigstatsr AUC, performs best most often.

The Base R AUC runs roughly 5,000 times faster than the ForIfElse algorithm.


Conclusions

The Area Under the Curve (AUC) function from the bigstatsr package (Statistical Tools for Filebacked Big Matrices), by Florian Privé (Grenoble, France), was extraordinarily efficient.

The two close winners were created by Cole, who prefers to remain anonymous, with significant input from chinsoon12 (Singapore). These were the Modified Wilcox (wilcox_AUC) and the Base R AUC (AUC2) functions.

Both algorithms are attractive because they rely only on base R.