pROC and larger datasets

Create dataset

## Load pROC
library(pROC)

## Load data
data(aSAH)

## Check number of rows
nrow.aSAH <- nrow(aSAH)
nrow.aSAH
[1] 113

## Create a larger dataset 50000 cases
aSAH.large <- aSAH[rep(seq_len(nrow.aSAH), 450),]

## Check number of rows
nrow(aSAH.large)
[1] 50850

## Check first rows
head(aSAH.large)
   gos6 outcome gender age wfns s100b  ndka
29    5    Good Female  42    1  0.13  3.01
30    5    Good Female  37    1  0.14  8.54
31    5    Good Female  42    1  0.10  8.09
32    5    Good Female  27    1  0.04 10.42
33    1    Poor Female  42    3  0.13 17.40
34    1    Poor   Male  48    2  0.10 12.75

Create ROCs

## By s100b predictor
roc.s100b <- roc((outcome == "Poor") ~ s100b, data = aSAH.large)
roc.s100b

Call:
roc.formula(formula = (outcome == "Poor") ~ s100b, data = aSAH.large)

Data: s100b in 32400 controls ((outcome == "Poor") FALSE) < 18450 cases ((outcome == "Poor") TRUE).
Area under the curve: 0.731

## By ndka predictor
roc.ndka <- roc((outcome == "Poor") ~ ndka, data = aSAH.large)
roc.ndka

Call:
roc.formula(formula = (outcome == "Poor") ~ ndka, data = aSAH.large)

Data: ndka in 32400 controls ((outcome == "Poor") FALSE) < 18450 cases ((outcome == "Poor") TRUE).
Area under the curve: 0.612

Testing AUC difference

Summary: The De Long method is impossibly slow, so use bootstrap.

The bootstrap method takes several minutes but doable.

## Test these two ROCs (bootstrap)
system.time(res.roc.test <- roc.test(roc.s100b, roc.ndka, method = "bootstrap"))
   user  system elapsed 
 421.50   58.89  480.49 
res.roc.test

    Bootstrap test for two correlated ROC curves

data:  roc.s100b and roc.ndka 
D = 30.21, boot.n = 2000, boot.stratified = 1, p-value < 2.2e-16
alternative hypothesis: true difference in AUC is not equal to 0 
sample estimates:
AUC of roc1 AUC of roc2 
     0.7314      0.6120 

The DeLong method involves all possible pairwise comparison, and it does not scale well in larger data.

## Test these two ROCs (DeLong)
system.time(res.roc.test2 <- roc.test(roc.s100b, roc.ndka, method = "delong"))
res.roc.test2

This call will use these functions in sequence. The last function, pROC:::MW.kernel is a function that gives scores 1, 0.5, or 0, according to a two value comparison procedure. This function is applied to all possible pairs of cases and non-cases, taking a lot of time in larger datasets.

roc.test
function (...) 
{
    UseMethod("roc.test")
}
<environment: namespace:pROC>
## roc.test.roc # Suppressed. Too long.
pROC:::delong.paired.test 
function (roc1, roc2) 
{
    n <- length(roc1$controls)
    m <- length(roc1$cases)
    VR <- delong.placements(roc1)
    VS <- delong.placements(roc2)
    SX <- matrix(NA, ncol = 2, nrow = 2)
    SX[1, 1] <- sum((VR$X - VR$theta) * (VR$X - VR$theta))/(m - 
        1)
    SX[1, 2] <- sum((VR$X - VR$theta) * (VS$X - VS$theta))/(m - 
        1)
    SX[2, 1] <- sum((VS$X - VS$theta) * (VR$X - VR$theta))/(m - 
        1)
    SX[2, 2] <- sum((VS$X - VS$theta) * (VS$X - VS$theta))/(m - 
        1)
    SY <- matrix(NA, ncol = 2, nrow = 2)
    SY[1, 1] <- sum((VR$Y - VR$theta) * (VR$Y - VR$theta))/(n - 
        1)
    SY[1, 2] <- sum((VR$Y - VR$theta) * (VS$Y - VS$theta))/(n - 
        1)
    SY[2, 1] <- sum((VS$Y - VS$theta) * (VR$Y - VR$theta))/(n - 
        1)
    SY[2, 2] <- sum((VS$Y - VS$theta) * (VS$Y - VS$theta))/(n - 
        1)
    S <- SX/m + SY/n
    L <- c(1, -1)
    sig <- sqrt(L %*% S %*% L)
    zscore <- (VR$theta - VS$theta)/sig[1]
    if (is.nan(zscore) && VR$theta == VR$theta && sig[1] == 0) 
        zscore <- 0
    return(zscore)
}
<environment: namespace:pROC>
pROC:::delong.placements
function (roc) 
{
    V <- list()
    Y <- roc$controls
    X <- roc$cases
    n <- length(Y)
    m <- length(X)
    MW <- sapply(1:n, function(j) sapply(1:m, function(i, j) MW.kernel(X[i], 
        Y[j]), j = j))
    V$theta <- sum(MW)/(m * n)
    V$X <- sapply(1:m, function(i) {
        sum(MW[i, ])
    })/n
    V$Y <- sapply(1:n, function(j) {
        sum(MW[, j])
    })/m
    return(V)
}
<environment: namespace:pROC>
pROC:::MW.kernel
function (x, y) 
{
    if (y < x) 
        return(1)
    if (y == x) 
        return(0.5)
    if (y > x) 
        return(0)
}
<environment: namespace:pROC>