The DeLong method of AUC comparison test is conducted using pROC:::delong.placements, which conducts all-possible pairwise comparison of case score and non-case score via pROC:::MW.kernel. This is done in double sapply loops, but it is slow when the dataset is of moderate size (order of 10,000) because of a large numbers of possible pairs.
Here, I propose an improvement in delong.placements() using outer() that does not depend on one-by-one pairwise comparison with pROC:::MW.kernel.
## Load pROC
library(pROC)
## Load aSAH dataset
data(aSAH)
## Create a medium-sized dataset
aSAH.medium <- aSAH[rep(seq_len(nrow(aSAH)), 20),]
## By s100b predictor
roc.s100b <- roc((outcome == "Poor") ~ s100b, data = aSAH.medium)
roc.s100b
Call:
roc.formula(formula = (outcome == "Poor") ~ s100b, data = aSAH.medium)
Data: s100b in 1440 controls ((outcome == "Poor") FALSE) < 820 cases ((outcome == "Poor") TRUE).
Area under the curve: 0.731
## By ndka predictor
roc.ndka <- roc((outcome == "Poor") ~ ndka, data = aSAH.medium)
roc.ndka
Call:
roc.formula(formula = (outcome == "Poor") ~ ndka, data = aSAH.medium)
Data: ndka in 1440 controls ((outcome == "Poor") FALSE) < 820 cases ((outcome == "Poor") TRUE).
Area under the curve: 0.612
## The alternative without compilation
delong.placements.alternative <- function (roc) {
V <- list()
Y <- roc$controls
X <- roc$cases
n <- length(Y)
m <- length(X)
### Alternative
## outer() alternative for double sapply loop
## Equal: Give 0.5
equal <- outer(X, Y, "==") * 0.5
## Greater: Give 1.0
greater <- outer(X, Y, ">") * 1.0
## Add these two matrices
MW <- equal + greater
### Alternative ends here
V$theta <- sum(MW)/(m * n)
V$X <- sapply(1:m, function(i) {
sum(MW[i, ])
})/n
V$Y <- sapply(1:n, function(j) {
sum(MW[, j])
})/m
return(V)
}
## The original without compilation
delong.placements.original <- function (roc) {
V <- list()
Y <- roc$controls
X <- roc$cases
n <- length(Y)
m <- length(X)
### Original
## Double sapply loop
MW <- sapply(1:n, function(j)
sapply(1:m, function(i, j)
pROC:::MW.kernel(X[i], Y[j]),
j = j))
### Original ends here
V$theta <- sum(MW)/(m * n)
V$X <- sapply(1:m, function(i) {
sum(MW[i, ])
})/n
V$Y <- sapply(1:n, function(j) {
sum(MW[, j])
})/m
return(V)
}
## Test run both
system.time(res.alternative <- delong.placements.alternative(roc.s100b))
user system elapsed
0.323 0.028 0.352
res.alternative$theta
[1] 0.7314
system.time(res.original <- delong.placements.original(roc.s100b))
user system elapsed
13.380 0.028 13.414
res.original$theta
[1] 0.7314
## Test the results for identity
identical(res.alternative, res.original)
[1] TRUE
## Alternative without compilation
delong.paired.test.alternative <- function (roc1, roc2) {
n <- length(roc1$controls)
m <- length(roc1$cases)
### Using alternative
VR <- delong.placements.alternative(roc1)
VS <- delong.placements.alternative(roc2)
SX <- matrix(NA, ncol = 2, nrow = 2)
SX[1, 1] <- sum((VR$X - VR$theta) * (VR$X - VR$theta))/(m - 1)
SX[1, 2] <- sum((VR$X - VR$theta) * (VS$X - VS$theta))/(m - 1)
SX[2, 1] <- sum((VS$X - VS$theta) * (VR$X - VR$theta))/(m - 1)
SX[2, 2] <- sum((VS$X - VS$theta) * (VS$X - VS$theta))/(m - 1)
SY <- matrix(NA, ncol = 2, nrow = 2)
SY[1, 1] <- sum((VR$Y - VR$theta) * (VR$Y - VR$theta))/(n - 1)
SY[1, 2] <- sum((VR$Y - VR$theta) * (VS$Y - VS$theta))/(n - 1)
SY[2, 1] <- sum((VS$Y - VS$theta) * (VR$Y - VR$theta))/(n - 1)
SY[2, 2] <- sum((VS$Y - VS$theta) * (VS$Y - VS$theta))/(n - 1)
S <- SX/m + SY/n
L <- c(1, -1)
sig <- sqrt(L %*% S %*% L)
zscore <- (VR$theta - VS$theta)/sig[1]
if (is.nan(zscore) && VR$theta == VR$theta && sig[1] == 0)
zscore <- 0
return(zscore)
}
## Original without compilation
delong.paired.test.original <- function (roc1, roc2) {
n <- length(roc1$controls)
m <- length(roc1$cases)
### Using original
VR <- delong.placements.original(roc1)
VS <- delong.placements.original(roc2)
SX <- matrix(NA, ncol = 2, nrow = 2)
SX[1, 1] <- sum((VR$X - VR$theta) * (VR$X - VR$theta))/(m - 1)
SX[1, 2] <- sum((VR$X - VR$theta) * (VS$X - VS$theta))/(m - 1)
SX[2, 1] <- sum((VS$X - VS$theta) * (VR$X - VR$theta))/(m - 1)
SX[2, 2] <- sum((VS$X - VS$theta) * (VS$X - VS$theta))/(m - 1)
SY <- matrix(NA, ncol = 2, nrow = 2)
SY[1, 1] <- sum((VR$Y - VR$theta) * (VR$Y - VR$theta))/(n - 1)
SY[1, 2] <- sum((VR$Y - VR$theta) * (VS$Y - VS$theta))/(n - 1)
SY[2, 1] <- sum((VS$Y - VS$theta) * (VR$Y - VR$theta))/(n - 1)
SY[2, 2] <- sum((VS$Y - VS$theta) * (VS$Y - VS$theta))/(n - 1)
S <- SX/m + SY/n
L <- c(1, -1)
sig <- sqrt(L %*% S %*% L)
zscore <- (VR$theta - VS$theta)/sig[1]
if (is.nan(zscore) && VR$theta == VR$theta && sig[1] == 0)
zscore <- 0
return(zscore)
}
## Test run both
system.time(res.alternative.test <- delong.paired.test.alternative(roc.s100b, roc.ndka))
user system elapsed
0.368 0.033 0.401
res.alternative.test
[1] 6.286
system.time(res.original.test <- delong.paired.test.original(roc.s100b, roc.ndka))
user system elapsed
26.454 0.068 26.525
res.original.test
[1] 6.286
## Test the results for identity
identical(res.alternative.test, res.original.test)
[1] TRUE