pROC::roc.test DeLong method possible improvement

References

Possible improvement

The DeLong method of AUC comparison test is conducted using pROC:::delong.placements, which conducts all-possible pairwise comparison of case score and non-case score via pROC:::MW.kernel. This is done in double sapply loops, but it is slow when the dataset is of moderate size (order of 10,000) because of a large numbers of possible pairs.

Here, I propose an improvement in delong.placements() using outer() that does not depend on one-by-one pairwise comparison with pROC:::MW.kernel.

Provide medium-sized data

## Load pROC
library(pROC)

## Load aSAH dataset
data(aSAH)

## Create a medium-sized dataset
aSAH.medium <- aSAH[rep(seq_len(nrow(aSAH)), 20),]

## By s100b predictor
roc.s100b <- roc((outcome == "Poor") ~ s100b, data = aSAH.medium)
roc.s100b

Call:
roc.formula(formula = (outcome == "Poor") ~ s100b, data = aSAH.medium)

Data: s100b in 1440 controls ((outcome == "Poor") FALSE) < 820 cases ((outcome == "Poor") TRUE).
Area under the curve: 0.731

## By ndka predictor
roc.ndka <- roc((outcome == "Poor") ~ ndka, data = aSAH.medium)
roc.ndka

Call:
roc.formula(formula = (outcome == "Poor") ~ ndka, data = aSAH.medium)

Data: ndka in 1440 controls ((outcome == "Poor") FALSE) < 820 cases ((outcome == "Poor") TRUE).
Area under the curve: 0.612

Original and alternative delong.placements functions

## The alternative without compilation
delong.placements.alternative <- function (roc) {
    V <- list()
    Y <- roc$controls
    X <- roc$cases
    n <- length(Y)
    m <- length(X)

### Alternative
    ## outer() alternative for double sapply loop
    ## Equal: Give 0.5
    equal   <- outer(X, Y, "==") * 0.5
    ## Greater: Give 1.0
    greater <- outer(X, Y, ">") * 1.0
    ## Add these two matrices
    MW <- equal + greater
### Alternative ends here

    V$theta <- sum(MW)/(m * n)
    V$X <- sapply(1:m, function(i) {
        sum(MW[i, ])
    })/n
    V$Y <- sapply(1:n, function(j) {
        sum(MW[, j])
    })/m
    return(V)
}

## The original without compilation
delong.placements.original <- function (roc) {
    V <- list()
    Y <- roc$controls
    X <- roc$cases
    n <- length(Y)
    m <- length(X)

### Original
    ## Double sapply loop
    MW <- sapply(1:n, function(j)
                 sapply(1:m, function(i, j)
                        pROC:::MW.kernel(X[i], Y[j]),
                        j = j))
### Original ends here

    V$theta <- sum(MW)/(m * n)
    V$X <- sapply(1:m, function(i) {
        sum(MW[i, ])
    })/n
    V$Y <- sapply(1:n, function(j) {
        sum(MW[, j])
    })/m
    return(V)
}


## Test run both
system.time(res.alternative <- delong.placements.alternative(roc.s100b))
   user  system elapsed 
  0.323   0.028   0.352 
res.alternative$theta
[1] 0.7314
system.time(res.original    <- delong.placements.original(roc.s100b))
   user  system elapsed 
 13.380   0.028  13.414 
res.original$theta
[1] 0.7314
## Test the results for identity
identical(res.alternative, res.original)
[1] TRUE

delong.paired.test functions with the original and alternative delong.placements functions

## Alternative without compilation
delong.paired.test.alternative <- function (roc1, roc2) {
    n <- length(roc1$controls)
    m <- length(roc1$cases)

### Using alternative
    VR <- delong.placements.alternative(roc1)
    VS <- delong.placements.alternative(roc2)

    SX <- matrix(NA, ncol = 2, nrow = 2)
    SX[1, 1] <- sum((VR$X - VR$theta) * (VR$X - VR$theta))/(m - 1)
    SX[1, 2] <- sum((VR$X - VR$theta) * (VS$X - VS$theta))/(m - 1)
    SX[2, 1] <- sum((VS$X - VS$theta) * (VR$X - VR$theta))/(m - 1)
    SX[2, 2] <- sum((VS$X - VS$theta) * (VS$X - VS$theta))/(m - 1)
    SY <- matrix(NA, ncol = 2, nrow = 2)
    SY[1, 1] <- sum((VR$Y - VR$theta) * (VR$Y - VR$theta))/(n - 1)
    SY[1, 2] <- sum((VR$Y - VR$theta) * (VS$Y - VS$theta))/(n - 1)
    SY[2, 1] <- sum((VS$Y - VS$theta) * (VR$Y - VR$theta))/(n - 1)
    SY[2, 2] <- sum((VS$Y - VS$theta) * (VS$Y - VS$theta))/(n - 1)
    S <- SX/m + SY/n
    L <- c(1, -1)
    sig <- sqrt(L %*% S %*% L)
    zscore <- (VR$theta - VS$theta)/sig[1]
    if (is.nan(zscore) && VR$theta == VR$theta && sig[1] == 0)
        zscore <- 0
    return(zscore)
}


## Original without compilation
delong.paired.test.original <- function (roc1, roc2) {
    n <- length(roc1$controls)
    m <- length(roc1$cases)

### Using original
    VR <- delong.placements.original(roc1)
    VS <- delong.placements.original(roc2)

    SX <- matrix(NA, ncol = 2, nrow = 2)
    SX[1, 1] <- sum((VR$X - VR$theta) * (VR$X - VR$theta))/(m - 1)
    SX[1, 2] <- sum((VR$X - VR$theta) * (VS$X - VS$theta))/(m - 1)
    SX[2, 1] <- sum((VS$X - VS$theta) * (VR$X - VR$theta))/(m - 1)
    SX[2, 2] <- sum((VS$X - VS$theta) * (VS$X - VS$theta))/(m - 1)
    SY <- matrix(NA, ncol = 2, nrow = 2)
    SY[1, 1] <- sum((VR$Y - VR$theta) * (VR$Y - VR$theta))/(n - 1)
    SY[1, 2] <- sum((VR$Y - VR$theta) * (VS$Y - VS$theta))/(n - 1)
    SY[2, 1] <- sum((VS$Y - VS$theta) * (VR$Y - VR$theta))/(n - 1)
    SY[2, 2] <- sum((VS$Y - VS$theta) * (VS$Y - VS$theta))/(n - 1)
    S <- SX/m + SY/n
    L <- c(1, -1)
    sig <- sqrt(L %*% S %*% L)
    zscore <- (VR$theta - VS$theta)/sig[1]
    if (is.nan(zscore) && VR$theta == VR$theta && sig[1] == 0)
        zscore <- 0
    return(zscore)
}

## Test run both
system.time(res.alternative.test <- delong.paired.test.alternative(roc.s100b, roc.ndka))
   user  system elapsed 
  0.368   0.033   0.401 
res.alternative.test
[1] 6.286
system.time(res.original.test <- delong.paired.test.original(roc.s100b, roc.ndka))
   user  system elapsed 
 26.454   0.068  26.525 
res.original.test
[1] 6.286
## Test the results for identity
identical(res.alternative.test, res.original.test)
[1] TRUE