library(microbenchmark)
suppressMessages(devtools::load_all())

Create different backends (locally)

# Locate the CytoTrol example FCS file. Prefer the copy shipped with an
# installed flowWorkspaceData package so the script is not tied to one
# machine's directory layout; system.file() returns "" when the package (or
# the file) is not found, in which case fall back to the local checkout path.
fcs_file <- system.file("extdata", "CytoTrol_CytoTrol_1.fcs",
                        package = "flowWorkspaceData")
if (!nzchar(fcs_file)) {
  fcs_file <- "~/rglab/workspace/flowWorkspaceData/inst/extdata/CytoTrol_CytoTrol_1.fcs"
}

# Load the FCS file as a cytoframe and report its dimensions.
cf.mem <- load_cytoframe_from_fcs(fcs_file)
dim(cf.mem)
##     events parameters 
##     119531         12

# Write the same cytoframe out twice under tempdir(): once through the h5
# writer and once through the tile writer.
h5_file <- tempfile(fileext = ".h5")
cf_write_h5(cf.mem, h5_file)
tile_dir <- tempfile(fileext = ".tile")
cf_write_tile(cf.mem, tile_dir)

Compare size

# Human-readable size of the h5 file. The byte count is tagged as an
# "object_size" so the public format() generic does the unit conversion,
# instead of reaching into utils with `:::`.
format(structure(file.size(h5_file), class = "object_size"), units = "auto")
## [1] "5.5 Mb"
# Ask `du -sh` for the total size of the tile path and keep the first
# tab-separated field (the size). The path is shell-quoted so spaces or
# metacharacters in tempdir() cannot break the command.
strsplit(system(paste("du -sh", shQuote(tile_dir)), intern = TRUE),
         "\t")[[1]][1]
## [1] "5.9M"

Initial load (only reads metadata)

# Time opening each on-disk copy, 5 repetitions per expression. The
# assignments inside the call leave cf.h5 and cf.tile bound for later use.
microbenchmark(
  cf.h5 <- load_cytoframe(h5_file),
  cf.tile <- load_cytoframe(tile_dir),
  times = 5
)
## Unit: milliseconds
##                                 expr      min       lq     mean   median
##     cf.h5 <- load_cytoframe(h5_file) 3.624251 3.660252 4.031525 3.781626
##  cf.tile <- load_cytoframe(tile_dir) 5.647998 5.729530 7.291502 5.954164
##        uq       max neval
##  4.114318  4.977177     5
##  7.685320 11.440498     5

# Show the URI each cytoframe reports (empty for cf.mem in the recorded run).
cf_get_uri(cf.mem)
## [1] ""
cf_get_uri(cf.h5)
## [1] "/tmp/RtmpHuQ2e1/file37767b41a56e.h5"
cf_get_uri(cf.tile)
## [1] "/tmp/RtmpHuQ2e1/file37765ef9a67a.tile"

Benchmark the random slicing

# Seed the RNG so the sampled row indices are reproducible between runs.
set.seed(1)
# Column positions used by every subsetting benchmark.
cidx <- c(1, 5, 8)
# 1000 distinct row positions drawn from 1..nrow(cf.h5) (sampling without
# replacement is sample()'s default).
ridx <- sample(nrow(cf.h5), size = 1000)

subset by channels

# Time extracting the selected columns from each local cytoframe
# (5 repetitions per expression). The assignments keep the last result of
# each expression in a, b and c for the equality checks below.
microbenchmark(
  a <- exprs(cf.mem[, cidx]),
  b <- exprs(cf.h5[, cidx]),
  c <- exprs(cf.tile[, cidx]),
  times = 5
)
## Unit: milliseconds
##                         expr       min        lq      mean    median        uq
##   a <- exprs(cf.mem[, cidx])  4.744973  4.820243  5.245412  4.836304  5.450215
##    b <- exprs(cf.h5[, cidx])  4.353024  4.947002  5.053556  4.979597  5.228883
##  c <- exprs(cf.tile[, cidx]) 33.996457 38.511042 42.004904 38.980041 39.539437
##        max neval
##   6.375325     5
##   5.759276     5
##  58.997543     5

# all.equal() compares only `target` and `current`; a third positional
# argument is matched to `tolerance`, so all.equal(a, b, c) never looked at
# the values in `c`. Compare each on-disk result against the in-memory one
# separately instead. Attributes are ignored for the tile result, mirroring
# the h5-vs-tile comparisons in the remote section of this file.
# NOTE(review): the previously recorded "[1] TRUE" came from the three-argument
# call and has been dropped; re-run to record the output of these two checks.
all.equal(a, b)
all.equal(a, c, check.attributes = FALSE)

subset by cols & rows

# Time extracting the sampled rows and selected columns from each local
# cytoframe (5 repetitions per expression). The assignments keep the last
# result of each expression in a, b and c for the equality checks below.
microbenchmark(
  a <- exprs(cf.mem[ridx, cidx]),
  b <- exprs(cf.h5[ridx, cidx]),
  c <- exprs(cf.tile[ridx, cidx]),
  times = 5
)
## Unit: milliseconds
##                             expr       min        lq      mean    median
##   a <- exprs(cf.mem[ridx, cidx])  2.369003  2.487385  2.849076  2.750922
##    b <- exprs(cf.h5[ridx, cidx])  4.036930  4.067461  4.539437  4.101277
##  c <- exprs(cf.tile[ridx, cidx]) 28.382903 28.535450 29.708429 29.338413
##         uq       max neval
##   2.792729  3.845342     5
##   4.117894  6.373624     5
##  31.013319 31.272060     5

# all.equal() compares only `target` and `current`; a third positional
# argument is matched to `tolerance`, so all.equal(a, b, c) never looked at
# the values in `c`. Compare each on-disk result against the in-memory one
# separately instead. Attributes are ignored for the tile result, mirroring
# the h5-vs-tile comparisons in the remote section of this file.
# NOTE(review): the previously recorded "[1] TRUE" came from the three-argument
# call and has been dropped; re-run to record the output of these two checks.
all.equal(a, b)
all.equal(a, c, check.attributes = FALSE)

Compare remote storage

# Name of the S3 bucket; it is pasted into the s3:// URIs built for the
# remote h5 and tile copies further down.
bucket <- "mike-h5"

Manually upload the h5 file to S3 using the aws.s3 package, since H5 doesn't support S3 writes yet

library(aws.s3)
# Fetch the AWS settings; the region, access key id and secret key fields
# are forwarded to put_object() below.
cred <- check_credential(NULL)
# Target object URI: the bucket plus the local h5 file's base name.
uri.h5 <- sprintf("s3://%s/%s", bucket, basename(h5_file))
# Upload the local h5 file to that URI.
put_object(
  h5_file, uri.h5,
  region = cred$AWS_REGION,
  key = cred$AWS_ACCESS_KEY_ID,
  secret = cred$AWS_SECRET_ACCESS_KEY
)
## [1] TRUE

TileDB can write directly to S3

# Target URI for the tile copy: the bucket plus the local tile path's base
# name. The cytoframe is written straight to that s3:// URI.
uri.tile <- sprintf("s3://%s/%s", bucket, basename(tile_dir))
cf_write_tile(cf.mem, uri.tile)

load cf remotely

# Time opening each remote copy by URI, 3 repetitions per expression.
# cf.h5 and cf.tile are rebound here, so from this point on they refer to
# the S3-backed cytoframes rather than the local ones.
microbenchmark(
  cf.h5 <- load_cytoframe(uri.h5),
  cf.tile <- load_cytoframe(uri.tile),
  times = 3
)
## Unit: seconds
##                                 expr      min       lq     mean   median
##      cf.h5 <- load_cytoframe(uri.h5) 1.245577 1.246506 1.247689 1.247435
##  cf.tile <- load_cytoframe(uri.tile) 2.164970 2.216001 2.240629 2.267033
##        uq      max neval
##  1.248744 1.250054     3
##  2.278459 2.289885     3

# Show the URI each remote cytoframe reports.
cf_get_uri(cf.h5)
## [1] "https://mike-h5.s3.amazonaws.com/file37767b41a56e.h5"
cf_get_uri(cf.tile)
## [1] "s3://mike-h5/file37765ef9a67a.tile"

subset by channels

# Time extracting the selected columns from each remote cytoframe
# (3 repetitions per expression); b and c keep the last results.
microbenchmark(
  b <- exprs(cf.h5[, cidx]),
  c <- exprs(cf.tile[, cidx]),
  times = 3
)
## Unit: milliseconds
##                         expr       min        lq      mean    median       uq
##    b <- exprs(cf.h5[, cidx]) 1213.1904 1215.0520 1225.7256 1216.9136 1231.993
##  c <- exprs(cf.tile[, cidx])  453.5959  480.1759  938.9981  506.7559 1181.699
##       max neval
##  1247.073     3
##  1856.642     3

# Compare the two results by value only. FALSE is spelled out because `F` is
# an ordinary variable that can be reassigned.
all.equal(b, c, check.attributes = FALSE)
## [1] TRUE

subset by cols & rows

# Time extracting the sampled rows and selected columns from each remote
# cytoframe (3 repetitions per expression); b and c keep the last results.
microbenchmark(
  b <- exprs(cf.h5[ridx, cidx]),
  c <- exprs(cf.tile[ridx, cidx]),
  times = 3
)
## Unit: milliseconds
##                             expr       min       lq      mean    median
##    b <- exprs(cf.h5[ridx, cidx]) 1166.2893 1169.746 1175.4951 1173.2026
##  c <- exprs(cf.tile[ridx, cidx])  499.1655  511.917  570.8657  524.6686
##         uq       max neval
##  1180.0980 1186.9934     3
##   606.7158  688.7631     3

# Compare the two results by value only. FALSE is spelled out because `F` is
# an ordinary variable that can be reassigned.
all.equal(b, c, check.attributes = FALSE)
## [1] TRUE

clean up

# Remove the storage behind each cytoframe. At this point cf.h5 and cf.tile
# are the remote (S3) copies, as the recorded messages below show.
# NOTE(review): the local temp copies (h5_file, tile_dir) are not passed to
# cf_cleanup here — presumably left for tempdir() cleanup at session end.
cf_cleanup(cf.h5)
## https://mike-h5.s3.amazonaws.com/file37767b41a56e.h5 is deleted!
cf_cleanup(cf.tile)
## s3://mike-h5/file37765ef9a67a.tile is deleted!