Source for this document at https://gist.github.com/wch/9233873
What’s the fastest way to collect strings together in R and put them into a single output string? Probably the fastest way is to simply use paste0('string1', 'string2'), and so on – but this assumes that you have all the strings collected and ready at one time. In many cases, this isn’t possible, and you need to collect the strings together as you go.
This document contains benchmarks for different ways of collecting strings together. Some highlights:
- textConnection is super slow.
- file(open = "w+") is much faster.

Some setup code for the benchmarks:
# How many strings each benchmark appends
count <- 20000
# The chunk appended on every iteration: 100 copies of "a", no separator
txt <- paste0(rep.int("a", 100L), collapse = "")
# Reference result every benchmark must reproduce: `count` copies of `txt`
# joined with no separator
expected <- paste0(rep.int(txt, count), collapse = "")
# Stop with "Assertion failed" unless `val` is exactly a single TRUE.
#
# val: the condition to check (the benchmarks pass the result of identical()).
# Returns NULL invisibly when the check passes.
assert <- function(val) {
  # isTRUE() also rejects NA, zero-length and multi-element input, which a
  # bare `!val` would hand to `if` as an invalid or ambiguous condition
  # instead of reporting a failed assertion.
  if (!isTRUE(val)) stop("Assertion failed")
  invisible(NULL)
}
This grows a character vector as it goes along.
system.time({
  # Start from an empty vector; each assignment past the end extends it.
  res <- character()
  # seq_len() yields no iterations when count is 0, whereas 1:count would
  # run over c(1, 0).
  for (i in seq_len(count)) res[i] <- txt
  out <- paste(res, collapse = "")
  assert(identical(out, expected))
})
## user system elapsed
## 1.402 0.108 1.530
Preallocating the character vector to its final length avoids growing it inside the loop. The drawback to this method is that you can’t always know the total number of strings ahead of time.
system.time({
  # Allocate all `count` slots up front so the loop only overwrites elements.
  res <- character(count)
  # seq_len() yields no iterations when count is 0, whereas 1:count would
  # run over c(1, 0).
  for (i in seq_len(count)) res[i] <- txt
  out <- paste(res, collapse = "")
  assert(identical(out, expected))
})
## user system elapsed
## 0.044 0.001 0.045
# textConnection and cat
system.time({
  htmlResult <- NULL
  # Output text connection whose contents end up in the variable `htmlResult`.
  conn <- textConnection("htmlResult", "w", local = TRUE)
  # seq_len() yields no iterations when count is 0, whereas 1:count would
  # run over c(1, 0).
  for (i in seq_len(count)) cat(txt, file = conn)
  # NOTE(review): htmlResult is read only after close(); per the
  # textConnection documentation a final line without a newline is stored
  # when the connection is closed -- confirm before reordering.
  close(conn)
  # htmlResult holds one element per line, so rejoin with newlines.
  out <- paste(htmlResult, collapse = "\n")
  assert(identical(out, expected))
})
## user system elapsed
## 15.810 3.937 19.883
# file and cat
system.time({
  # file() with no description, opened for both writing and reading.
  conn <- file(open = "w+")
  # seq_len() yields no iterations when count is 0, whereas 1:count would
  # run over c(1, 0).
  for (i in seq_len(count)) cat(txt, file = conn)
  # Push buffered output to the file before reading it back.
  flush(conn)
  # readLines() returns one element per line (and none when nothing was
  # written); rejoin so `out` is always a single string.
  out <- paste(readLines(conn, warn = FALSE), collapse = "\n")
  close(conn)
  assert(identical(out, expected))
})
## user system elapsed
## 0.295 0.043 0.338
# file and writeChar
system.time({
  # file() with no description, opened in binary mode for writing and reading.
  conn <- file(open = "w+b")
  # seq_len() yields no iterations when count is 0, whereas 1:count would
  # run over c(1, 0).
  # eos = NULL asks writeChar() not to append a terminator after each string.
  for (i in seq_len(count)) writeChar(txt, conn, eos = NULL)
  # Push buffered output to the file before reading it back.
  flush(conn)
  # readLines() returns one element per line (and none when nothing was
  # written); rejoin so `out` is always a single string.
  out <- paste(readLines(conn, warn = FALSE), collapse = "\n")
  close(conn)
  assert(identical(out, expected))
})
## user system elapsed
## 0.267 0.006 0.274
textVector uses a character vector that doubles in length whenever a new item is added that makes it exceed its current length.
# textVector implemented with char vector
#
# Creates an append-only text buffer backed by a character vector whose
# capacity doubles whenever it fills up.
#
# n: initial capacity (number of slots allocated up front).
# Returns a list of two closures that share the buffer:
#   add(text)  - store `text` in the next free slot
#   extract()  - everything added so far, pasted together with no separator
textVector <- function(n = 1e2) {
  output <- vector("character", n)
  i <- 0
  add <- function(text) {
    i <<- i + 1
    if (i > n) {
      # Double the capacity, but never to less than one slot: with n = 0,
      # 2 * n stays 0 and the buffer would be extended one element at a
      # time on every call.
      n <<- max(2 * n, 1)
      length(output) <<- n
    }
    output[i] <<- text
  }
  extract <- function() {
    # Only the first i slots hold data; the rest is unused capacity.
    paste(output[seq_len(i)], collapse = "")
  }
  list(add = add, extract = extract)
}
system.time({
  tv <- textVector()
  # Take the closure out of the list once instead of on every iteration.
  add <- tv$add
  # seq_len() yields no iterations when count is 0, whereas 1:count would
  # run over c(1, 0).
  for (i in seq_len(count)) add(txt)
  out <- tv$extract()
  assert(identical(out, expected))
})
## user system elapsed
## 0.085 0.001 0.087
This version of textVector2 uses a list that doubles in length whenever a new item is added that makes it exceed its current length.
# textVector implemented with lists
#
# Creates an append-only text buffer backed by a list whose capacity doubles
# whenever it fills up.
#
# n: initial capacity (number of slots allocated up front).
# Returns a list of two closures that share the buffer:
#   add(text)  - store `text` in the next free slot
#   extract()  - everything added so far, pasted together with no separator
textVector2 <- function(n = 1e2) {
  output <- vector("list", n)
  i <- 0
  add <- function(text) {
    i <<- i + 1
    if (i > n) {
      # Double the capacity, but never to less than one slot: with n = 0,
      # 2 * n stays 0 and the buffer would be extended one element at a
      # time on every call.
      n <<- max(2 * n, 1)
      length(output) <<- n
    }
    output[[i]] <<- text
  }
  extract <- function() {
    # Only the first i slots hold data; the rest is unused capacity.
    paste(output[seq_len(i)], collapse = "")
  }
  list(add = add, extract = extract)
}
system.time({
  tv <- textVector2()
  # Take the closure out of the list once instead of on every iteration.
  add <- tv$add
  # seq_len() yields no iterations when count is 0, whereas 1:count would
  # run over c(1, 0).
  for (i in seq_len(count)) add(txt)
  out <- tv$extract()
  assert(identical(out, expected))
})
## user system elapsed
## 0.083 0.000 0.084
# Print the R version, platform, locale and loaded packages for this session,
# so the timings can be tied to a specific environment.
sessionInfo()
## R version 3.0.2 (2013-09-25)
## Platform: x86_64-apple-darwin10.8.0 (64-bit)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] evaluate_0.5.1 formatR_0.10 knitr_1.5 rmarkdown_0.1.2
## [5] stringr_0.6.2 tools_3.0.2