A Surv object in a data frame is not a vector, but it is a list of two vectors (one vector for time, one vector for censoring status). survSplit() does not handle this well and returns a data frame with duplicated rows.
## Load survival package
library(survival)
## Load lung data
data(lung)
## Create an event indicator
lung$event <- lung$status == 2
## Convert to the counting process format at given times
lung.split1 <- survSplit(data = lung,
cut = c(200,400,600), # vector of timepoints to cut at
end = "time", # character string with name of event time variable
event = "event", # character string with name of censoring indicator
start = "start", # character string with name of start time variable (created)
id = "id", # character string with name of new id variable to create
zero = 0, # If start doesn't already exist, used as start
episode = NULL # character string with name of new episode variable (optional)
)
## id as numeric
lung.split1$id <- as.numeric(lung.split1$id)
## Reorder
library(doBy)
lung.split1 <- orderBy( ~ id + start, lung.split1)
## This is good
head(lung.split1)
inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss event start id
1 3 200 2 74 1 1 90 100 1175 NA 0 0 1
229 3 306 2 74 1 1 90 100 1175 NA 1 200 1
2 3 200 2 68 1 0 90 90 1225 15 0 0 2
230 3 400 2 68 1 0 90 90 1225 15 0 200 2
458 3 455 2 68 1 0 90 90 1225 15 1 400 2
3 3 200 1 56 1 0 90 90 NA 15 0 0 3
## 453 rows
nrow(lung.split1)
[1] 453
## Create a Surv object
lung$SurvObj <- with(lung, Surv(time, event))
## Check the Surv object
head(lung)
inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss event SurvObj
1 3 306 2 74 1 1 90 100 1175 NA TRUE 306
2 3 455 2 68 1 0 90 90 1225 15 TRUE 455
3 3 1010 1 56 1 0 90 90 NA 15 FALSE 1010+
4 5 210 2 57 1 1 90 60 1150 11 TRUE 210
5 1 883 2 60 1 0 100 90 NA 0 TRUE 883
6 12 1022 1 74 1 1 50 80 513 0 FALSE 1022+
## Surv class
class(lung$SurvObj)
[1] "Surv"
## It is not a vector. It is a list (one vector for time, one vector for censoring status)
str(lung$SurvObj)
Surv [1:228, 1:2] 306 455 1010+ 210 883 1022+ 310 361 218 166 ...
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:2] "time" "status"
- attr(*, "type")= chr "right"
## Convert to the counting process format at given times
lung.split2 <- survSplit(data = lung,
cut = c(200,400,600), # vector of timepoints to cut at
end = "time", # character string with name of event time variable
event = "event", # character string with name of censoring indicator
start = "start", # character string with name of start time variable (created)
id = "id", # character string with name of new id variable to create
zero = 0, # If start doesn't already exist, used as start
episode = NULL # character string with name of new episode variable (optional)
)
## id as numeric for ordering
lung.split2$id <- as.numeric(lung.split2$id)
## Reorder by id, then start time
library(doBy)
lung.split2 <- orderBy( ~ id + start, lung.split2)
## DUPLICATED ROWS!!
head(lung.split2, 20)
inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss event SurvObj start id
1 3 200 2 74 1 1 90 100 1175 NA 0 306 0 1
913 3 200 2 74 1 1 90 100 1175 NA 0 306 0 1
229 3 306 2 74 1 1 90 100 1175 NA 1 1 200 1
1141 3 306 2 74 1 1 90 100 1175 NA 1 1 200 1
2 3 200 2 68 1 0 90 90 1225 15 0 455 0 2
914 3 200 2 68 1 0 90 90 1225 15 0 455 0 2
230 3 400 2 68 1 0 90 90 1225 15 0 1 200 2
1142 3 400 2 68 1 0 90 90 1225 15 0 1 200 2
458 3 455 2 68 1 0 90 90 1225 15 1 455 400 2
1370 3 455 2 68 1 0 90 90 1225 15 1 455 400 2
3 3 200 1 56 1 0 90 90 NA 15 0 1010 0 3
915 3 200 1 56 1 0 90 90 NA 15 0 1010 0 3
231 3 400 1 56 1 0 90 90 NA 15 0 0 200 3
1143 3 400 1 56 1 0 90 90 NA 15 0 0 200 3
459 3 600 1 56 1 0 90 90 NA 15 0 1010 400 3
1371 3 600 1 56 1 0 90 90 NA 15 0 1010 400 3
687 3 1010 1 56 1 0 90 90 NA 15 0 0 600 3
1599 3 1010 1 56 1 0 90 90 NA 15 0 0 600 3
4 5 200 2 57 1 1 90 60 1150 11 0 210 0 4
916 5 200 2 57 1 1 90 60 1150 11 0 210 0 4
## 906 rows!! (Twice compared to what is expected)
nrow(lung.split2)
[1] 906
## If duplications are removed 453 rows!
nrow(unique(lung.split2))
[1] 453
This just removes any columns that have a class of “Surv” before running anything else.
survSplit.removeSurv <- function (data, cut, end, event, start, id = NULL, zero = 0,
episode = NULL)
{
## Remove Surv object if it exists ##
NotSurvColumns <- sapply(data, class) != "Surv"
data <- data[NotSurvColumns]
## Ends here ##
## No change below
cut <- sort(cut)
ntimes <- length(cut)
n <- nrow(data)
p <- ncol(data)
newdata <- lapply(data, rep, ntimes + 1)
endtime <- rep(c(cut, Inf), each = n)
eventtime <- newdata[[end]]
if (start %in% names(data))
starttime <- data[[start]]
else starttime <- rep(zero, length.out = n)
starttime <- c(starttime, pmax(starttime, rep(cut, each = n)))
epi <- rep(0:ntimes, each = n)
status <- ifelse(eventtime <= endtime & eventtime > starttime,
newdata[[event]], 0)
endtime <- pmin(endtime, eventtime)
drop <- starttime >= endtime
newdata <- do.call("data.frame", newdata)
newdata[, start] <- starttime
newdata[, end] <- endtime
newdata[, event] <- status
if (!is.null(id))
newdata[, id] <- rep(rownames(data), ntimes + 1)
if (!is.null(episode))
newdata[, episode] <- epi
newdata <- newdata[!drop, ]
newdata
}
## Run
lung.split3 <-
survSplit.removeSurv(data = lung,
cut = c(200,400,600), # vector of timepoints to cut at
end = "time", # character string with name of event time variable
event = "event", # character string with name of censoring indicator
start = "start", # character string with name of start time variable (created)
id = "id", # character string with name of new id variable to create
zero = 0, # If start doesn't already exist, used as start
episode = NULL # character string with name of new episode variable (optional)
)
## id as numeric for ordering
lung.split3$id <- as.numeric(lung.split3$id)
## Reorder by id, then start time
lung.split3 <- orderBy( ~ id + start, lung.split3)
## No duplications!
head(lung.split3, 20)
inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss event start id
1 3 200 2 74 1 1 90 100 1175 NA 0 0 1
229 3 306 2 74 1 1 90 100 1175 NA 1 200 1
2 3 200 2 68 1 0 90 90 1225 15 0 0 2
230 3 400 2 68 1 0 90 90 1225 15 0 200 2
458 3 455 2 68 1 0 90 90 1225 15 1 400 2
3 3 200 1 56 1 0 90 90 NA 15 0 0 3
231 3 400 1 56 1 0 90 90 NA 15 0 200 3
459 3 600 1 56 1 0 90 90 NA 15 0 400 3
687 3 1010 1 56 1 0 90 90 NA 15 0 600 3
4 5 200 2 57 1 1 90 60 1150 11 0 0 4
232 5 210 2 57 1 1 90 60 1150 11 1 200 4
5 1 200 2 60 1 0 100 90 NA 0 0 0 5
233 1 400 2 60 1 0 100 90 NA 0 0 200 5
461 1 600 2 60 1 0 100 90 NA 0 0 400 5
689 1 883 2 60 1 0 100 90 NA 0 1 600 5
6 12 200 1 74 1 1 50 80 513 0 0 0 6
234 12 400 1 74 1 1 50 80 513 0 0 200 6
462 12 600 1 74 1 1 50 80 513 0 0 400 6
690 12 1022 1 74 1 1 50 80 513 0 0 600 6
7 7 200 2 68 2 2 70 60 384 10 0 0 7
## 453 rows: No duplications!
nrow(lung.split3)
[1] 453
survSplit.preserveDF <- function (data, cut, end, event, start, id = NULL, zero = 0,
episode = NULL)
{
cut <- sort(cut)
ntimes <- length(cut)
n <- nrow(data)
p <- ncol(data)
## Maintain a data frame instead of creating a list
## newdata <- lapply(data, rep, ntimes + 1)
newdata <- data[rep(seq_len(n), time = ntimes + 1), ]
endtime <- rep(c(cut, Inf), each = n)
eventtime <- newdata[[end]]
if (start %in% names(data))
starttime <- data[[start]]
else starttime <- rep(zero, length.out = n)
starttime <- c(starttime, pmax(starttime, rep(cut, each = n)))
epi <- rep(0:ntimes, each = n)
status <- ifelse(eventtime <= endtime & eventtime > starttime,
newdata[[event]], 0)
endtime <- pmin(endtime, eventtime)
drop <- starttime >= endtime
## This line is not necessary.
## newdata <- do.call("data.frame", newdata)
newdata[, start] <- starttime
newdata[, end] <- endtime
newdata[, event] <- status
if (!is.null(id))
newdata[, id] <- rep(rownames(data), ntimes + 1)
if (!is.null(episode))
newdata[, episode] <- epi
newdata <- newdata[!drop, ]
newdata
}
lung.split4 <-
survSplit.preserveDF(data = lung,
cut = c(200,400,600), # vector of timepoints to cut at
end = "time", # character string with name of event time variable
event = "event", # character string with name of censoring indicator
start = "start", # character string with name of start time variable (created)
id = "id", # character string with name of new id variable to create
zero = 0, # If start doesn't already exist, used as start
episode = NULL # character string with name of new episode variable (optional)
)
## id as numeric for ordering
lung.split4$id <- as.numeric(lung.split4$id)
## Reorder by id, then start time
lung.split4 <- orderBy( ~ id + start, lung.split4)
## No duplications!
head(lung.split4, 20)
inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss event SurvObj start id
1 3 200 2 74 1 1 90 100 1175 NA 0 306 0 1
1.1 3 306 2 74 1 1 90 100 1175 NA 1 306 200 1
2 3 200 2 68 1 0 90 90 1225 15 0 455 0 2
2.1 3 400 2 68 1 0 90 90 1225 15 0 455 200 2
2.2 3 455 2 68 1 0 90 90 1225 15 1 455 400 2
3 3 200 1 56 1 0 90 90 NA 15 0 1010+ 0 3
3.1 3 400 1 56 1 0 90 90 NA 15 0 1010+ 200 3
3.2 3 600 1 56 1 0 90 90 NA 15 0 1010+ 400 3
3.3 3 1010 1 56 1 0 90 90 NA 15 0 1010+ 600 3
4 5 200 2 57 1 1 90 60 1150 11 0 210 0 4
4.1 5 210 2 57 1 1 90 60 1150 11 1 210 200 4
5 1 200 2 60 1 0 100 90 NA 0 0 883 0 5
5.1 1 400 2 60 1 0 100 90 NA 0 0 883 200 5
5.2 1 600 2 60 1 0 100 90 NA 0 0 883 400 5
5.3 1 883 2 60 1 0 100 90 NA 0 1 883 600 5
6 12 200 1 74 1 1 50 80 513 0 0 1022+ 0 6
6.1 12 400 1 74 1 1 50 80 513 0 0 1022+ 200 6
6.2 12 600 1 74 1 1 50 80 513 0 0 1022+ 400 6
6.3 12 1022 1 74 1 1 50 80 513 0 0 1022+ 600 6
7 7 200 2 68 2 2 70 60 384 10 0 310 0 7
## 453 rows: No duplications!
nrow(lung.split4)
[1] 453
## Create a testing function
## If it gives a count for FALSE, there is mismatch between survSplit and survSplit.preserveDF
survSplit.tester <- function(RETURN.DATA = FALSE,
data, cut, end, event, start, id = NULL, zero = 0, episode = NULL) {
## Mark columns not containing "Surv"
NonSurvColumns <- sapply(data, class) != "Surv"
### Test: a dataset with a Surv object (data)
## Run the original survSplit on the dataset without Surv objects
df.original <- survSplit(data, cut, end, event, start, id, zero, episode)
## Need to remove duplicated rows before comparing
df.original <- unique(df.original)
## Need to drop the Surv object before comparing
df.original <- df.original[, NonSurvColumns]
## Run the new survSplit on the dataset without Surv objects
df.new <- survSplit.preserveDF(data, cut, end, event, start, id, zero, episode)
## Need to drop the Surv object before comparing
df.new <- df.new[, NonSurvColumns]
## Check element-wise equality (NAs are NAs)
mat.equality2 <- df.original == df.new
## Summarize result
res.data.SurvPlus <- summary.factor(mat.equality2)
### Return test results
if (RETURN.DATA == TRUE) {
list(df.original = df.original,
df.new = df.new)
} else {
list(res.data.SurvPlus = res.data.SurvPlus
)
}
}
## Test case where there is no start time in the data frame
## Initialize data
data(lung)
## Create an event indicator
lung$event <- lung$status == 2
## Create a Surv object
lung$SurvObj <- with(lung, Surv(time, event))
survSplit.tester(data = lung,
cut = c(200,400,600), # vector of timepoints to cut at
end = "time", # character string with name of event time variable
event = "event", # character string with name of censoring indicator
start = "start", # character string with name of start time variable (created)
id = "id", # character string with name of new id variable to create
zero = 0, # If start doesn't already exist, used as start
episode = NULL # character string with name of new episode variable (optional)
)
$res.data.SurvPlus
TRUE NA's
5769 120
## Test a case where there is a start time in the data frame
lung$start <- ifelse(lung$time > 100, lung$time - 100, lung$time - 10)
survSplit.tester(data = lung,
cut = c(200,400,600), # vector of timepoints to cut at
end = "time", # character string with name of event time variable
event = "event", # character string with name of censoring indicator
start = "start", # character string with name of start time variable (created)
id = "id", # character string with name of new id variable to create
zero = 0, # If start doesn't already exist, used as start
episode = NULL # character string with name of new episode variable (optional)
)
$res.data.SurvPlus
TRUE NA's
3872 80