An examination of the relative performance of base hashed enviornments, listenv::listenv(), and hash::hash() inspired by and code modified and extended from original articles by Jeffery Horner:
Caveat, for this examination I compare only those cases where the performance of the given operation was best or the only approach possible (to the best of my knowledge). Therefore, because it is not possible to the best of my knowledge, no comparison is made for numerically indexed environments or hashes. As performance for string indexed listenv poorly compared against hash and base environment, I skip those results.
library(plyr)
library(ggplot2)
library(hash)
## hash-2.2.6 provided by Decision Patterns
library(listenv)
library(microbenchmark)
library(uuid)
library(assertthat)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following object is masked from 'package:hash':
##
## copy
generateUUIDs <- function(n) {
assert_that(is.numeric(n))
n <- trunc(n)
if (n <= 7000*12/getOption("mc.cores",1) | .Platform$OS.type != "unix") {
return(replicate(n,UUIDgenerate()))
} else {
return(mcsapply(1:n,function(x) {UUIDgenerate()}))
}
}
# Create n unique strings. We use upper and lower case letters only. Create
# length 1 strings first, and if we haven't satisfied the number of strings
# requested, we'll create length 2 strings, and so on until we've created n
# strings.
#
# Returns them in random order
#
unique_strings <- function(n){
string_i <- 1
string_len <- 1
ans <- character(n)
chars <- c(letters,LETTERS)
new_strings <- function(len,pfx){
for(i in 1:length(chars)){
if (len == 1){
ans[string_i] <<- paste(pfx,chars[i],sep='')
string_i <<- string_i + 1
} else {
new_strings(len-1,pfx=paste(pfx,chars[i],sep=''))
}
if (string_i > n) return ()
}
}
while(string_i <= n){
new_strings(string_len,'')
string_len <- string_len + 1
}
sample(ans)
}
timingsEnv <- adply(2^(10:15),.mar=1,.fun=function(i){
strings <- unique_strings(i)
ht1 <- new.env(hash=TRUE)
lapply(strings, function(s){ ht1[[s]] <<- 0L})
data.frame(
size=c(i,i),
seconds=c(
system.time(for (j in 1:i) ht1[[strings[j]]]==0L)[3]),
type = c('1_hashedEnv')
)
})
## Warning in data.frame(size = c(i, i), seconds = c(system.time(for (j in
## 1:i) ht1[[strings[j]]] == : row names were found from a short variable and
## have been discarded
## Warning in data.frame(size = c(i, i), seconds = c(system.time(for (j in
## 1:i) ht1[[strings[j]]] == : row names were found from a short variable and
## have been discarded
## Warning in data.frame(size = c(i, i), seconds = c(system.time(for (j in
## 1:i) ht1[[strings[j]]] == : row names were found from a short variable and
## have been discarded
## Warning in data.frame(size = c(i, i), seconds = c(system.time(for (j in
## 1:i) ht1[[strings[j]]] == : row names were found from a short variable and
## have been discarded
## Warning in data.frame(size = c(i, i), seconds = c(system.time(for (j in
## 1:i) ht1[[strings[j]]] == : row names were found from a short variable and
## have been discarded
## Warning in data.frame(size = c(i, i), seconds = c(system.time(for (j in
## 1:i) ht1[[strings[j]]] == : row names were found from a short variable and
## have been discarded
timingsListEnv <- adply(2^(10:15),.mar=1,.fun=function(i){
strings <- unique_strings(i)
le <- listenv()
lapply(strings, function(s) le[[s]] <<- 0L)
data.frame(
size=c(i,i),
seconds=c(
system.time(for (k in 1:i) le[[k]]==0L)[3]),
type = c('2_numericListEnv')
)
})
## Warning in data.frame(size = c(i, i), seconds = c(system.time(for (k in
## 1:i) le[[k]] == : row names were found from a short variable and have been
## discarded
## Warning in data.frame(size = c(i, i), seconds = c(system.time(for (k in
## 1:i) le[[k]] == : row names were found from a short variable and have been
## discarded
## Warning in data.frame(size = c(i, i), seconds = c(system.time(for (k in
## 1:i) le[[k]] == : row names were found from a short variable and have been
## discarded
## Warning in data.frame(size = c(i, i), seconds = c(system.time(for (k in
## 1:i) le[[k]] == : row names were found from a short variable and have been
## discarded
## Warning in data.frame(size = c(i, i), seconds = c(system.time(for (k in
## 1:i) le[[k]] == : row names were found from a short variable and have been
## discarded
## Warning in data.frame(size = c(i, i), seconds = c(system.time(for (k in
## 1:i) le[[k]] == : row names were found from a short variable and have been
## discarded
timingsHash <- adply(2^(10:15),.mar=1,.fun=function(i){
strings <- unique_strings(i)
ht <- hash()
lapply(strings, function(s) ht[[s]] <<- 0L)
data.frame(
size=c(i,i),
seconds=c(
system.time(for (j in 1:i) ht[[strings[j]]]==0L)[3]),
type = c('3_stringHash')
)
})
## Warning in data.frame(size = c(i, i), seconds = c(system.time(for (j in
## 1:i) ht[[strings[j]]] == : row names were found from a short variable and
## have been discarded
## Warning in data.frame(size = c(i, i), seconds = c(system.time(for (j in
## 1:i) ht[[strings[j]]] == : row names were found from a short variable and
## have been discarded
## Warning in data.frame(size = c(i, i), seconds = c(system.time(for (j in
## 1:i) ht[[strings[j]]] == : row names were found from a short variable and
## have been discarded
## Warning in data.frame(size = c(i, i), seconds = c(system.time(for (j in
## 1:i) ht[[strings[j]]] == : row names were found from a short variable and
## have been discarded
## Warning in data.frame(size = c(i, i), seconds = c(system.time(for (j in
## 1:i) ht[[strings[j]]] == : row names were found from a short variable and
## have been discarded
## Warning in data.frame(size = c(i, i), seconds = c(system.time(for (j in
## 1:i) ht[[strings[j]]] == : row names were found from a short variable and
## have been discarded
plotDat <- rbind(timingsHash,timingsEnv,timingsListEnv)
p2 <- ggplot(plotDat,aes(x=size,y=seconds,group=type)) +
geom_point(aes(color=type)) + geom_line(aes(color=type)) +
scale_x_log10(
breaks=2^(10:15),
labels=c(expression(2^10), expression(2^11),expression(2^12),
expression(2^13), expression(2^14), expression(2^15))
) +
theme(
axis.text = element_text(colour = "black")
) +
ggtitle('Search Time for hash')
p2
The string insert time for listenv is tested here because it is thought to be less efficient than the numeric insert method and is consistent with how the other methods are being tested.
INSERT_subset <- function(){
ht <- new.env()
ht[["foo"]] <- 1
}
INSERT_subsetHash <- function(){
ht <- hash()
ht[["foo"]] <- 1
}
INSERT_subsetListenv <- function(){
ht <- listenv()
ht[["foo"]] <- 1
}
microbenchmark(INSERT_subset(), INSERT_subsetHash(), INSERT_subsetListenv, times=10L^5)
## Unit: nanoseconds
## expr min lq mean median uq max neval
## INSERT_subset() 1422 2129 2698.87354 2668 3015 1637237 1e+05
## INSERT_subsetHash() 35346 40255 43972.91450 41131 42181 57796710 1e+05
## INSERT_subsetListenv 21 25 75.21278 60 117 456 1e+05
## cld
## b
## c
## a
Perhaps the initial creation of the data structure is expensive. Let’s try again holding that operation out from the benchmark.
hte <- new.env()
hth <- hash()
le <- listenv()
INSERT_subset <- function(){
hte[["foo"]] <- 1
}
INSERT_subsetHash <- function(){
hth[["foo"]] <- 1
}
INSERT_subsetListenv <- function(){
ht[["foo"]] <- 1
}
microbenchmark(INSERT_subset(), INSERT_subsetHash(), INSERT_subsetListenv, times=10L^5)
## Unit: nanoseconds
## expr min lq mean median uq max neval cld
## INSERT_subset() 558 758 910.68384 877 1033 15066 1e+05 b
## INSERT_subsetHash() 4520 5127 5767.00191 5398 5722 3349055 1e+05 c
## INSERT_subsetListenv 21 25 47.45736 51 59 11117 1e+05 a
Does anything differ if the data structures being inserted into are larger?
longList <- as.list(1:(2^(15)))
longListenv <- as.listenv(longList)
longNamedList <- longList
uuids <- generateUUIDs(2^15)
names(longNamedList) <- uuids
longHash <- hash(longNamedList)
longEnv <- as.environment(longNamedList)
INSERT_subset <- function(){
longEnv[[uuids[1]]] <- 1
}
INSERT_subsetHash <- function(){
longHash[[uuids[1]]] <- 1
}
INSERT_subsetListenv <- function(){
longListenv[[1]] <- 1
}
microbenchmark(INSERT_subset(), INSERT_subsetHash(), INSERT_subsetListenv, times=10L^5)
## Unit: nanoseconds
## expr min lq mean median uq max neval cld
## INSERT_subset() 801 1082 1555.73665 1242.0 1411 6142302 1e+05 b
## INSERT_subsetHash() 5245 5856 6883.52504 6164.5 6650 5668481 1e+05 c
## INSERT_subsetListenv 21 26 48.06831 51.0 60 11250 1e+05 a
Can we convert between these types of data structures and how long does it take?
stripNames <- function(x) {
names(x) <- c()
return(x)
}
hashToEnv <- as.environment(hash(longNamedList))
envToHash <- hash(as.list(hashToEnv))
# Slow benchmarking skipped
microbenchmark(
hashToEnv <- as.environment(longHash),
envToHash <- hash(as.list(longEnv)),
EnvToListenv <- as.listenv(stripNames(hashToEnv)),
HashToListenv <- as.listenv(stripNames(envToHash))
, times=10L^1)
## Unit: microseconds
## expr min
## hashToEnv <- as.environment(longHash) 3.193
## envToHash <- hash(as.list(longEnv)) 179671.855
## EnvToListenv <- as.listenv(stripNames(hashToEnv)) 4676460.137
## HashToListenv <- as.listenv(stripNames(envToHash)) 3865550.984
## lq mean median uq max neval cld
## 4.298 22.1455 26.676 32.977 33.737 10 a
## 187206.150 187254.7866 188616.735 190104.653 191443.987 10 a
## 4784577.229 4832374.8561 4851655.965 4904423.029 4912713.973 10 b
## 4825682.087 4716356.6845 4866896.576 4882920.285 4914787.039 10 b
system.time({
names(longListenv) <- uuids
listenvToEnv <- as.environment(longListenv)
})
## user system elapsed
## 0 0 0
Adding names to listenv is fast, but pulling the names off a hash or environment to build it into a numerically indexed listenv is slow.
Relative to less optimized methods, e.g. vectors and string indexed lists, each of the three methods examined performs quite well. However, as the size of the data structure increases, the advantage for R environments over hashes and listenv increases. listenv is less efficient than hash. Insert performance differs by approximately an order of magnitude between examined methods. listenv is the fastest, followed by environment, followed by hash. Everyone’s use case may differ, but for my milliseconds and typical use cases, I’ve reached the conclusion that hashed environments are generally the way to go for me. The only exception would be a segmented workflow in which I find myself doing any insert operations and then many retrieval operations. In those cases, it may be worthwhile to do the inserts into a listenv and then coerce them out to an environment for retrieval but only if the speed of that operation is of little to no concern.