survival::survSplit problem with Surv object

Issue

A Surv object in a data frame is not a vector, but it is a list of two vectors (one vector for time, one vector for censoring status). survSplit() does not handle this well and returns a data frame with duplicated rows.

There is no problem if there is no Surv object in the data frame.

## Load survival package
library(survival)

## Load lung data
data(lung)

## Create an event indicator
lung$event <- lung$status == 2

## Convert to the counting process format at given times
lung.split1 <- survSplit(data    = lung,
                         cut     = c(200,400,600), # vector of timepoints to cut at
                         end     = "time",  # character string with name of event time variable
                         event   = "event", # character string with name of censoring indicator
                         start   = "start", # character string with name of start time variable (created)
                         id      = "id",    # character string with name of new id variable to create
                         zero    = 0,       # If start doesn't already exist, used as start
                         episode = NULL     # character string with name of new episode variable (optional)
                         )

## id as numeric
lung.split1$id <- as.numeric(lung.split1$id)

## Reorder
library(doBy)
lung.split1 <- orderBy( ~ id + start, lung.split1)

## This is good
head(lung.split1)
    inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss event start id
1      3  200      2  74   1       1       90       100     1175      NA     0     0  1
229    3  306      2  74   1       1       90       100     1175      NA     1   200  1
2      3  200      2  68   1       0       90        90     1225      15     0     0  2
230    3  400      2  68   1       0       90        90     1225      15     0   200  2
458    3  455      2  68   1       0       90        90     1225      15     1   400  2
3      3  200      1  56   1       0       90        90       NA      15     0     0  3

## 453 rows
nrow(lung.split1)
[1] 453

It breaks down if there is a Surv object in the data frame.

## Create a Surv object
lung$SurvObj <- with(lung, Surv(time, event))

## Check the Surv object
head(lung)
  inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss event SurvObj
1    3  306      2  74   1       1       90       100     1175      NA  TRUE    306 
2    3  455      2  68   1       0       90        90     1225      15  TRUE    455 
3    3 1010      1  56   1       0       90        90       NA      15 FALSE   1010+
4    5  210      2  57   1       1       90        60     1150      11  TRUE    210 
5    1  883      2  60   1       0      100        90       NA       0  TRUE    883 
6   12 1022      1  74   1       1       50        80      513       0 FALSE   1022+

## Surv class
class(lung$SurvObj)
[1] "Surv"

## It is not a vector. It is a list (one vector for time, one vector for censoring status)
str(lung$SurvObj)
 Surv [1:228, 1:2]  306   455  1010+  210   883  1022+  310   361   218   166  ...
 - attr(*, "dimnames")=List of 2
  ..$ : NULL
  ..$ : chr [1:2] "time" "status"
 - attr(*, "type")= chr "right"

## Convert to the counting process format at given times
lung.split2 <- survSplit(data    = lung,
                         cut     = c(200,400,600), # vector of timepoints to cut at
                         end     = "time",  # character string with name of event time variable
                         event   = "event", # character string with name of censoring indicator
                         start   = "start", # character string with name of start time variable (created)
                         id      = "id",    # character string with name of new id variable to create
                         zero    = 0,       # If start doesn't already exist, used as start
                         episode = NULL     # character string with name of new episode variable (optional)
                         )

## id as numeric for ordering
lung.split2$id <- as.numeric(lung.split2$id)

## Reorder by id, then start time
library(doBy)
lung.split2 <- orderBy( ~ id + start, lung.split2)

## DUPLICATED ROWS!!
head(lung.split2, 20)
     inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss event SurvObj start id
1       3  200      2  74   1       1       90       100     1175      NA     0     306     0  1
913     3  200      2  74   1       1       90       100     1175      NA     0     306     0  1
229     3  306      2  74   1       1       90       100     1175      NA     1       1   200  1
1141    3  306      2  74   1       1       90       100     1175      NA     1       1   200  1
2       3  200      2  68   1       0       90        90     1225      15     0     455     0  2
914     3  200      2  68   1       0       90        90     1225      15     0     455     0  2
230     3  400      2  68   1       0       90        90     1225      15     0       1   200  2
1142    3  400      2  68   1       0       90        90     1225      15     0       1   200  2
458     3  455      2  68   1       0       90        90     1225      15     1     455   400  2
1370    3  455      2  68   1       0       90        90     1225      15     1     455   400  2
3       3  200      1  56   1       0       90        90       NA      15     0    1010     0  3
915     3  200      1  56   1       0       90        90       NA      15     0    1010     0  3
231     3  400      1  56   1       0       90        90       NA      15     0       0   200  3
1143    3  400      1  56   1       0       90        90       NA      15     0       0   200  3
459     3  600      1  56   1       0       90        90       NA      15     0    1010   400  3
1371    3  600      1  56   1       0       90        90       NA      15     0    1010   400  3
687     3 1010      1  56   1       0       90        90       NA      15     0       0   600  3
1599    3 1010      1  56   1       0       90        90       NA      15     0       0   600  3
4       5  200      2  57   1       1       90        60     1150      11     0     210     0  4
916     5  200      2  57   1       1       90        60     1150      11     0     210     0  4

## 906 rows!! (Twice compared to what is expected)
nrow(lung.split2)
[1] 906

## If duplications are removed 453 rows!
nrow(unique(lung.split2))
[1] 453

Solutions 1: Remove Surv object(s) if it exists in the data frame

This just removes any columns that have a class of “Surv” before running anything else.

survSplit.removeSurv <- function (data, cut, end, event, start, id = NULL, zero = 0,
    episode = NULL)
{

    ## Remove Surv object if it exists ##
    NotSurvColumns <- sapply(data, class) !=  "Surv"
    data <- data[NotSurvColumns]
    ## Ends here ##

    ## No change below
    cut <- sort(cut)
    ntimes <- length(cut)
    n <- nrow(data)
    p <- ncol(data)
    newdata <- lapply(data, rep, ntimes + 1)
    endtime <- rep(c(cut, Inf), each = n)
    eventtime <- newdata[[end]]
    if (start %in% names(data))
        starttime <- data[[start]]
    else starttime <- rep(zero, length.out = n)
    starttime <- c(starttime, pmax(starttime, rep(cut, each = n)))
    epi <- rep(0:ntimes, each = n)
    status <- ifelse(eventtime <= endtime & eventtime > starttime,
        newdata[[event]], 0)
    endtime <- pmin(endtime, eventtime)
    drop <- starttime >= endtime
    newdata <- do.call("data.frame", newdata)
    newdata[, start] <- starttime
    newdata[, end] <- endtime
    newdata[, event] <- status
    if (!is.null(id))
        newdata[, id] <- rep(rownames(data), ntimes + 1)
    if (!is.null(episode))
        newdata[, episode] <- epi
    newdata <- newdata[!drop, ]
    newdata
}

## Run
lung.split3 <-
    survSplit.removeSurv(data    = lung,
                         cut     = c(200,400,600), # vector of timepoints to cut at
                         end     = "time",  # character string with name of event time variable
                         event   = "event", # character string with name of censoring indicator
                         start   = "start", # character string with name of start time variable (created)
                         id      = "id",    # character string with name of new id variable to create
                         zero    = 0,       # If start doesn't already exist, used as start
                         episode = NULL     # character string with name of new episode variable (optional)
                         )

## id as numeric for ordering
lung.split3$id <- as.numeric(lung.split3$id)

## Reorder by id, then start time
lung.split3 <- orderBy( ~ id + start, lung.split3)

## No duplications!
head(lung.split3, 20)
    inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss event start id
1      3  200      2  74   1       1       90       100     1175      NA     0     0  1
229    3  306      2  74   1       1       90       100     1175      NA     1   200  1
2      3  200      2  68   1       0       90        90     1225      15     0     0  2
230    3  400      2  68   1       0       90        90     1225      15     0   200  2
458    3  455      2  68   1       0       90        90     1225      15     1   400  2
3      3  200      1  56   1       0       90        90       NA      15     0     0  3
231    3  400      1  56   1       0       90        90       NA      15     0   200  3
459    3  600      1  56   1       0       90        90       NA      15     0   400  3
687    3 1010      1  56   1       0       90        90       NA      15     0   600  3
4      5  200      2  57   1       1       90        60     1150      11     0     0  4
232    5  210      2  57   1       1       90        60     1150      11     1   200  4
5      1  200      2  60   1       0      100        90       NA       0     0     0  5
233    1  400      2  60   1       0      100        90       NA       0     0   200  5
461    1  600      2  60   1       0      100        90       NA       0     0   400  5
689    1  883      2  60   1       0      100        90       NA       0     1   600  5
6     12  200      1  74   1       1       50        80      513       0     0     0  6
234   12  400      1  74   1       1       50        80      513       0     0   200  6
462   12  600      1  74   1       1       50        80      513       0     0   400  6
690   12 1022      1  74   1       1       50        80      513       0     0   600  6
7      7  200      2  68   2       2       70        60      384      10     0     0  7

## 453 rows: No duplications!
nrow(lung.split3)
[1] 453

Solution 2: Function that preserves Surv objects

survSplit.preserveDF <- function (data, cut, end, event, start, id = NULL, zero = 0,
    episode = NULL)
{
    cut <- sort(cut)
    ntimes <- length(cut)
    n <- nrow(data)
    p <- ncol(data)

    ## Maintain a data frame instead of creating a list
    ## newdata <- lapply(data, rep, ntimes + 1)
    newdata <- data[rep(seq_len(n), time = ntimes + 1), ]

    endtime <- rep(c(cut, Inf), each = n)
    eventtime <- newdata[[end]]
    if (start %in% names(data))
        starttime <- data[[start]]
    else starttime <- rep(zero, length.out = n)
    starttime <- c(starttime, pmax(starttime, rep(cut, each = n)))
    epi <- rep(0:ntimes, each = n)
    status <- ifelse(eventtime <= endtime & eventtime > starttime,
        newdata[[event]], 0)
    endtime <- pmin(endtime, eventtime)
    drop <- starttime >= endtime

    ## This line is not necessary.
    ## newdata <- do.call("data.frame", newdata)

    newdata[, start] <- starttime
    newdata[, end] <- endtime
    newdata[, event] <- status
    if (!is.null(id))
        newdata[, id] <- rep(rownames(data), ntimes + 1)
    if (!is.null(episode))
        newdata[, episode] <- epi
    newdata <- newdata[!drop, ]
    newdata
}

lung.split4 <-
    survSplit.preserveDF(data    = lung,
                         cut     = c(200,400,600), # vector of timepoints to cut at
                         end     = "time",  # character string with name of event time variable
                         event   = "event", # character string with name of censoring indicator
                         start   = "start", # character string with name of start time variable (created)
                         id      = "id",    # character string with name of new id variable to create
                         zero    = 0,       # If start doesn't already exist, used as start
                         episode = NULL     # character string with name of new episode variable (optional)
                         )

## id as numeric for ordering
lung.split4$id <- as.numeric(lung.split4$id)

## Reorder by id, then start time
lung.split4 <- orderBy( ~ id + start, lung.split4)

## No duplications!
head(lung.split4, 20)
    inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss event SurvObj start id
1      3  200      2  74   1       1       90       100     1175      NA     0    306      0  1
1.1    3  306      2  74   1       1       90       100     1175      NA     1    306    200  1
2      3  200      2  68   1       0       90        90     1225      15     0    455      0  2
2.1    3  400      2  68   1       0       90        90     1225      15     0    455    200  2
2.2    3  455      2  68   1       0       90        90     1225      15     1    455    400  2
3      3  200      1  56   1       0       90        90       NA      15     0   1010+     0  3
3.1    3  400      1  56   1       0       90        90       NA      15     0   1010+   200  3
3.2    3  600      1  56   1       0       90        90       NA      15     0   1010+   400  3
3.3    3 1010      1  56   1       0       90        90       NA      15     0   1010+   600  3
4      5  200      2  57   1       1       90        60     1150      11     0    210      0  4
4.1    5  210      2  57   1       1       90        60     1150      11     1    210    200  4
5      1  200      2  60   1       0      100        90       NA       0     0    883      0  5
5.1    1  400      2  60   1       0      100        90       NA       0     0    883    200  5
5.2    1  600      2  60   1       0      100        90       NA       0     0    883    400  5
5.3    1  883      2  60   1       0      100        90       NA       0     1    883    600  5
6     12  200      1  74   1       1       50        80      513       0     0   1022+     0  6
6.1   12  400      1  74   1       1       50        80      513       0     0   1022+   200  6
6.2   12  600      1  74   1       1       50        80      513       0     0   1022+   400  6
6.3   12 1022      1  74   1       1       50        80      513       0     0   1022+   600  6
7      7  200      2  68   2       2       70        60      384      10     0    310      0  7

## 453 rows: No duplications!
nrow(lung.split4)
[1] 453

Test code

## Create a testing function
## If it gives a count for FALSE, there is mismatch between survSplit and survSplit.preserveDF
survSplit.tester <- function(RETURN.DATA = FALSE,
                             data, cut, end, event, start, id = NULL, zero = 0, episode = NULL) {

    ## Mark columns not containing "Surv"
    NonSurvColumns <- sapply(data, class) != "Surv"

    ### Test: a dataset with a Surv object (data)
    ## Run the original survSplit on the dataset without Surv objects
    df.original <- survSplit(data, cut, end, event, start, id, zero, episode)
    ## Need to remove duplicated rows before comparing
    df.original <- unique(df.original)
    ## Need to drop the Surv object before comparing
    df.original <- df.original[, NonSurvColumns]

    ## Run the new survSplit on the dataset without Surv objects
    df.new <- survSplit.preserveDF(data, cut, end, event, start, id, zero, episode)
    ## Need to drop the Surv object before comparing
    df.new <- df.new[, NonSurvColumns]

    ## Check element-wise equality (NAs are NAs)
    mat.equality2 <- df.original == df.new
    ## Summarize result
    res.data.SurvPlus <- summary.factor(mat.equality2)


    ### Return test results
    if (RETURN.DATA == TRUE) {

        list(df.original = df.original,
             df.new      = df.new)

    } else {

        list(res.data.SurvPlus = res.data.SurvPlus
             )
    }
}


## Test case where there is no start time in the data frame
## Initialize data
data(lung)
## Create an event indicator
lung$event <- lung$status == 2
## Create a Surv object
lung$SurvObj <- with(lung, Surv(time, event))

survSplit.tester(data    = lung,
                 cut     = c(200,400,600), # vector of timepoints to cut at
                 end     = "time",  # character string with name of event time variable
                 event   = "event", # character string with name of censoring indicator
                 start   = "start", # character string with name of start time variable (created)
                 id      = "id",    # character string with name of new id variable to create
                 zero    = 0,       # If start doesn't already exist, used as start
                 episode = NULL     # character string with name of new episode variable (optional)
                 )
$res.data.SurvPlus
TRUE NA's 
5769  120 

## Test a case where there is a start time in the data frame
lung$start <- ifelse(lung$time > 100, lung$time - 100, lung$time - 10)

survSplit.tester(data    = lung,
                 cut     = c(200,400,600), # vector of timepoints to cut at
                 end     = "time",  # character string with name of event time variable
                 event   = "event", # character string with name of censoring indicator
                 start   = "start", # character string with name of start time variable (created)
                 id      = "id",    # character string with name of new id variable to create
                 zero    = 0,       # If start doesn't already exist, used as start
                 episode = NULL     # character string with name of new episode variable (optional)
                 )
$res.data.SurvPlus
TRUE NA's 
3872   80