library(microbenchmark)
suppressMessages(devtools::load_all())

Create different backends (locally)

# Load the example FCS file as a cytoframe; this object is the reference that
# the on-disk copies below are written from.
cf.mem <- load_cytoframe_from_fcs(
  "~/rglab/workspace/flowWorkspaceData/inst/extdata/CytoTrol_CytoTrol_1.fcs"
)
dim(cf.mem)
##     events parameters 
##     119531         12

# Write the same cytoframe out twice, once per storage format, each to a fresh
# temporary path.
h5_file <- tempfile(fileext = ".h5")
cf_write_h5(cf.mem, h5_file)

tile_dir <- tempfile(fileext = ".tile")
cf_write_tile(cf.mem, tile_dir)

Compare size

# Report the on-disk footprint of each backend in human-readable units.
#
# The h5 file's byte count is tagged as an "object_size" so the exported
# format() generic does the unit conversion; this avoids reaching into the
# utils namespace with `:::` for a non-exported method.
format(structure(file.size(h5_file), class = "object_size"), units = "auto")
## [1] "5.5 Mb"
# The tile store is measured with `du -sh` (summarised total for the path).
# The path is shell-quoted so a temp directory containing spaces or shell
# metacharacters cannot break (or alter) the command. `du` prints
# "<size>\t<path>", so the first tab-separated field is the size.
strsplit(
  system(paste("du -sh", shQuote(tile_dir)), intern = TRUE),
  "\t"
)[[1]][1]
## [1] "5.6M"

Initial load (only reads metadata)

# Time opening each on-disk backend. The assignments inside the benchmarked
# expressions also leave `cf.h5` and `cf.tile` defined for the steps below.
microbenchmark(
  cf.h5 <- load_cytoframe(h5_file),
  cf.tile <- load_cytoframe(tile_dir),
  times = 5
)
## Unit: milliseconds
##                                 expr      min       lq     mean   median
##     cf.h5 <- load_cytoframe(h5_file) 3.731425 3.753496 4.656725 4.642060
##  cf.tile <- load_cytoframe(tile_dir) 5.799948 5.922622 7.429375 5.980265
##        uq       max neval
##  4.975144  6.181498     5
##  7.313724 12.130318     5

# Print the storage URI behind each cytoframe; the one loaded straight from
# FCS reports an empty string.
cf_get_uri(cf.mem)
## [1] ""
cf_get_uri(cf.h5)
## [1] "/tmp/RtmpVBaTYZ/file16fa1b544846.h5"
cf_get_uri(cf.tile)
## [1] "/tmp/RtmpVBaTYZ/file16fa35e10fc7.tile"

Benchmark the random slicing

# Seed the RNG so the sampled row indices are reproducible across runs.
set.seed(1)
# Column positions used for every channel subset below.
cidx <- c(1, 5, 8)
# 1000 row indices drawn without replacement from 1..nrow(cf.h5).
ridx <- sample(nrow(cf.h5), size = 1e3)

subset by channels

# Time extracting the expression matrix for the selected channels from each
# local backend. The assignments keep the last result of each expression in
# `a`, `b` and `c` for the equality check that follows.
microbenchmark(
  a <- exprs(cf.mem[, cidx]),
  b <- exprs(cf.h5[, cidx]),
  c <- exprs(cf.tile[, cidx]),
  times = 5
)
## Unit: milliseconds
##                         expr      min       lq      mean   median        uq
##   a <- exprs(cf.mem[, cidx]) 4.013820 4.109259  4.570120 4.228480  4.705207
##    b <- exprs(cf.h5[, cidx]) 4.976243 5.425548  6.077632 5.925931  6.399990
##  c <- exprs(cf.tile[, cidx]) 7.032783 8.472395 11.789017 9.055889 10.559067
##        max neval
##   5.793835     5
##   7.660447     5
##  23.824953     5
# all.equal() only compares its first two arguments; a third positional
# argument is matched to `tolerance`, so `all.equal(a, b, c)` never looked at
# the tile result. Compare each on-disk backend against the in-memory
# reference instead. Attributes are ignored for the tile comparison, mirroring
# the h5-vs-tile checks in the remote-storage part of this file.
isTRUE(all.equal(a, b)) && isTRUE(all.equal(a, c, check.attributes = FALSE))
## [1] TRUE

subset by cols & rows

# Time extracting the expression matrix for the sampled rows and selected
# channels from each local backend. The assignments keep the last result of
# each expression in `a`, `b` and `c` for the equality check that follows.
microbenchmark(
  a <- exprs(cf.mem[ridx, cidx]),
  b <- exprs(cf.h5[ridx, cidx]),
  c <- exprs(cf.tile[ridx, cidx]),
  times = 5
)
## Unit: milliseconds
##                             expr       min        lq      mean    median
##   a <- exprs(cf.mem[ridx, cidx])  2.490350  2.775796  3.160115  2.894229
##    b <- exprs(cf.h5[ridx, cidx])  4.100077  4.548739  5.087715  5.106019
##  c <- exprs(cf.tile[ridx, cidx]) 12.394133 13.538592 14.282130 13.745210
##         uq       max neval
##   2.918879  4.721321     5
##   5.335178  6.348561     5
##  14.811257 16.921458     5
# all.equal() only compares its first two arguments; a third positional
# argument is matched to `tolerance`, so `all.equal(a, b, c)` never looked at
# the tile result. Compare each on-disk backend against the in-memory
# reference instead. Attributes are ignored for the tile comparison, mirroring
# the h5-vs-tile checks in the remote-storage part of this file.
isTRUE(all.equal(a, b)) && isTRUE(all.equal(a, c, check.attributes = FALSE))
## [1] TRUE

Compare remote storage

# S3 bucket name; used below to build the s3:// URIs for both backends.
bucket <- "mike-h5"

Manually upload the h5 file to S3 using the aws.s3 package, since H5 doesn't support S3 writes yet

library(aws.s3)

# Fetch the AWS credentials (region, access key id, secret key) used for the
# upload.
cred <- check_credential(NULL)

# Target object URI: the bucket plus the local h5 file's base name.
uri.h5 <- paste0("s3://", bucket, "/", basename(h5_file))

# Upload the local h5 file to that URI.
put_object(
  h5_file,
  uri.h5,
  region = cred$AWS_REGION,
  key = cred$AWS_ACCESS_KEY_ID,
  secret = cred$AWS_SECRET_ACCESS_KEY
)
## [1] TRUE

TileDB can write directly to S3

# Target URI in the bucket, named after the local tile store's base name.
uri.tile <- paste0(
  "s3://", bucket,
  "/", basename(tile_dir)
)
# Write the cytoframe straight to the S3 URI (no local copy + upload step).
cf_write_tile(cf.mem, uri.tile)

load cf remotely

# Time opening each backend from its S3 URI. The assignments rebind `cf.h5`
# and `cf.tile` to the remote-backed cytoframes used in the steps below.
microbenchmark(
  cf.h5 <- load_cytoframe(uri.h5),
  cf.tile <- load_cytoframe(uri.tile),
  times = 3
)
## Unit: seconds
##                                 expr      min       lq     mean   median
##      cf.h5 <- load_cytoframe(uri.h5) 1.208530 1.273865 1.324207 1.339200
##  cf.tile <- load_cytoframe(uri.tile) 2.076306 2.095645 2.120654 2.114984
##        uq      max neval
##  1.382045 1.424890     3
##  2.142828 2.170672     3

# Print the storage URI now backing each cytoframe.
cf_get_uri(cf.h5)
## [1] "https://mike-h5.s3.amazonaws.com/file16fa1b544846.h5"
cf_get_uri(cf.tile)
## [1] "s3://mike-h5/file16fa35e10fc7.tile"

subset by channels

# Time extracting the expression matrix for the selected channels from each
# remote backend; `b` and `c` keep the last result of each expression.
microbenchmark(
  b <- exprs(cf.h5[, cidx]),
  c <- exprs(cf.tile[, cidx]),
  times = 3
)
## Unit: milliseconds
##                         expr       min        lq      mean    median       uq
##    b <- exprs(cf.h5[, cidx]) 1216.7192 1238.4951 1280.3548 1260.2710 1312.173
##  c <- exprs(cf.tile[, cidx])  439.6784  449.5028  942.0454  459.3271 1193.229
##       max neval
##  1364.074     3
##  1927.131     3
# Compare values only, ignoring attributes. `FALSE` is spelled out because
# `F` is an ordinary variable that can be reassigned.
all.equal(b, c, check.attributes = FALSE)
## [1] TRUE

subset by cols & rows

# Time extracting the expression matrix for the sampled rows and selected
# channels from each remote backend; `b` and `c` keep the last result of each
# expression.
microbenchmark(
  b <- exprs(cf.h5[ridx, cidx]),
  c <- exprs(cf.tile[ridx, cidx]),
  times = 3
)
## Unit: milliseconds
##                             expr       min        lq      mean    median
##    b <- exprs(cf.h5[ridx, cidx]) 1231.2594 1231.6869 1237.1049 1232.1143
##  c <- exprs(cf.tile[ridx, cidx])  480.5955  495.3953  552.6094  510.1951
##         uq       max neval
##  1240.0277 1247.9411     3
##   588.6164  667.0377     3
# Compare values only, ignoring attributes. `FALSE` is spelled out because
# `F` is an ordinary variable that can be reassigned.
all.equal(b, c, check.attributes = FALSE)
## [1] TRUE

clean up

# Remove the storage behind each remote-backed cytoframe; per the recorded
# output, each call reports the URI it deleted.
cf_cleanup(cf.h5)
## https://mike-h5.s3.amazonaws.com/file16fa1b544846.h5 is deleted!
cf_cleanup(cf.tile)
## s3://mike-h5/file16fa35e10fc7.tile is deleted!