Row-by-row calculations can be a bit slow in R…
# Example data: column V3 packs one or more values into a single string,
# separated by ";". Row names are the character labels "1" to "4".
mydf <- data.frame(
  V1 = c("a", "c", "d", "f"),
  V2 = c("ha", "hb", "hc", "hd"),
  V3 = c("1;2;3", "4", "5;6", "7"),
  V4 = c("A", "B", "C", "D"),
  row.names = c("1", "2", "3", "4"),
  stringsAsFactors = FALSE
)
# Second name for the same data frame.
inDF <- mydf
mydf
## V1 V2 V3 V4
## 1 a ha 1;2;3 A
## 2 c hb 4 B
## 3 d hc 5;6 C
## 4 f hd 7 D
Our benchmarking functions:
# Expand `mydf` to one row per ";"-separated value in column V3.
#
# Takes no arguments; reads the data frame `mydf` from the enclosing
# environment. Returns a copy in which each row is repeated once per token
# of its V3 string and V3 holds the individual tokens.
AM_manual <- function() {
  tokens <- strsplit(mydf$V3, ";", fixed = TRUE)
  # lengths() always returns an integer vector, whereas sapply(tokens, length)
  # returns an empty list when `tokens` is empty.
  expanded <- mydf[rep(seq_along(tokens), lengths(tokens)), ]
  # unlist() of an empty list is NULL, and assigning NULL would delete the V3
  # column; as.character() turns that case into character(0) instead.
  expanded$V3 <- as.character(unlist(tokens, use.names = FALSE))
  expanded
}
library(splitstackshape)
## Loading required package: data.table
# Split column V3 of `mydf` into long form with splitstackshape.
#
# Takes no arguments; reads `mydf` from the enclosing environment. Rows that
# contain any NA after the split are dropped, and the result is ordered by V1.
AM_csm <- function() {
  long <- concat.split.multiple(
    mydf,
    split.cols = "V3",
    seps = ";",
    direction = "long"
  )
  # Optional, perhaps: keep only rows without missing values.
  keep <- complete.cases(long)
  long <- long[keep, ]
  long[order(long$V1), ]
}
# Expand `inDF` to one row per ";"-separated value in V3 by rebuilding a
# small data frame for every input row and row-binding the pieces.
#
# Takes no arguments; reads the data frame `inDF` (columns V1, V2, V3, V4)
# from the enclosing environment. Returns the combined data frame; a
# zero-row input yields a zero-row data frame with the same four columns.
J_df <- function() {
  cols <- c("V1", "V2", "V3", "V4")
  # 1:nrow(inDF) would be c(1, 0) for an empty frame, so handle that case
  # up front and use seq_len() for the grouping index.
  if (nrow(inDF) == 0L) {
    return(inDF[0L, cols, drop = FALSE])
  }
  per_row <- split(inDF, seq_len(nrow(inDF)))
  pieces <- lapply(per_row, function(x) {
    # The scalar V1/V2/V4 values are recycled to the number of V3 tokens.
    data.frame(
      V1 = x$V1,
      V2 = x$V2,
      V3 = strsplit(x$V3, ";", fixed = TRUE)[[1]],
      V4 = x$V4
    )
  })
  do.call(rbind, pieces)
}
# Expand `inDF` to one row per ";"-separated value in V3 by cbind-ing the
# untouched columns of each input row to its V3 tokens, then row-binding.
#
# Takes no arguments; reads the data frame `inDF` (columns V1, V2, V3, V4)
# from the enclosing environment. Returns a data frame with columns in the
# order V1, V2, V4, V3; a zero-row input yields a zero-row data frame with
# those columns.
J_cbind <- function() {
  # 1:nrow(inDF) would be c(1, 0) for an empty frame, so handle that case
  # up front and use seq_len() for the grouping index.
  if (nrow(inDF) == 0L) {
    return(inDF[0L, c("V1", "V2", "V4", "V3"), drop = FALSE])
  }
  per_row <- split(inDF, seq_len(nrow(inDF)))
  pieces <- lapply(per_row, function(x) {
    cbind(
      x[, c("V1", "V2", "V4")],
      V3 = strsplit(x[, "V3"], ";", fixed = TRUE)[[1]],
      # Discard the original row name so the recycled row does not produce
      # duplicate row names.
      row.names = NULL
    )
  })
  do.call(rbind, pieces)
}
The benchmarking:
library(microbenchmark)
# Time all four implementations on the 4-row example data. No `times`
# argument is given; the recorded output below reports neval = 100.
microbenchmark(AM_manual(), AM_csm(), J_df(), J_cbind())
## Unit: microseconds
## expr min lq median uq max neval
## AM_manual() 278 305.1 319.1 338.9 3413 100
## AM_csm() 11804 11947.2 12040.0 13434.9 26721 100
## J_df() 9484 9625.2 9708.9 9835.8 12432 100
## J_cbind() 4822 4930.5 4973.4 5048.0 6796 100
`concat.split.multiple` appears to be the slowest, but the timings above are in microseconds (roughly 12 milliseconds per call at worst), so I don't think most people would bat an eye. Now, let's make things a little more interesting and see how well the approaches fare on a larger data frame:
# Keep a copy of the 4-row example before `mydf` is overwritten below.
backup <- mydf
## 4,000 rows
# Stack 1,000 copies of the example. Both names are rebound because
# AM_manual()/AM_csm() read `mydf` while J_df()/J_cbind() read `inDF`.
mydf <- inDF <- do.call(rbind, replicate(1000, mydf, simplify = FALSE))
# One timed run each for the two split-and-rbind implementations.
system.time(J_df())
## user system elapsed
## 10.936 0.016 11.064
system.time(J_cbind())
## user system elapsed
## 5.631 0.016 5.823
# Repeated timings (20 evaluations each) for the other two implementations.
microbenchmark(AM_manual(), AM_csm(), times = 20)
## Unit: milliseconds
## expr min lq median uq max neval
## AM_manual() 11.14 11.57 14.38 15.69 70.89 20
## AM_csm() 241.29 248.04 258.63 292.78 392.48 20