suppressPackageStartupMessages(
  { library(flowCore)
    library(flowWorkspace)
    library(microbenchmark)
    library(utils)
    library(parallel)
    })
## Warning: replacing previous import 'ncdfFlow::filter' by 'dplyr::filter' when
## loading 'flowWorkspace'

load a FAUST-generated GatingSet (gs)

# Directory of the previously saved GatingSet archive; this is the
# "in-mem idx" baseline in the comparisons below.
tmp_mem <- "~/Downloads/gs_faust/"
gs <- load_gs(tmp_mem)
# Index URI of the first sample; the recorded result for this archive is "".
gh_idx_get_uri(gs[[1]])
## [1] ""

dataset size

length(gs) # number of samples in the GatingSet
## [1] 82
length(gs_get_pop_paths(gs)) # number of population paths (nodes)
## [1] 695

convert to on-disk idx

# Time the conversion of the index data to on-disk storage.
# NOTE(review): the result is not assigned, so `gs` is presumably modified
# in place -- confirm against the flowWorkspace documentation.
system.time(gs_convert_idx_to_ondisk(gs))
##    user  system elapsed 
##  44.834   1.476  39.822
# Fresh temporary path that receives the converted archive.
tmp_ondisk <- tempfile()
# Time writing the GatingSet to that path.
system.time(save_gs(gs, tmp_ondisk))
##    user  system elapsed 
##   0.533   2.230   5.245

Benchmark

loading time

# Compare load times of the two archives (2 runs each). The timed expressions
# also (re)bind `gs` and `gs2`, which the following chunks use.
microbenchmark(gs <- load_gs(tmp_mem), gs2 <- load_gs(tmp_ondisk), times = 2)
## Unit: milliseconds
##                        expr       min        lq      mean    median        uq
##      gs <- load_gs(tmp_mem) 2596.1655 2596.1655 2631.3459 2631.3459 2666.5263
##  gs2 <- load_gs(tmp_ondisk)  485.0436  485.0436  501.1024  501.1024  517.1612
##        max neval
##  2666.5263     2
##   517.1612     2
# Index URI of the first sample in each GatingSet: the recorded results are ""
# for `gs` and an .idx path under the temporary archive directory for `gs2`.
gh_idx_get_uri(gs[[1]])
## [1] ""
gh_idx_get_uri(gs2[[1]])
## [1] "/tmp/RtmpOM7Uv4/file2f5226c1dfcb/Flu Vaccine-K 11-09-10_NCH-VK 21 TFH Panel.fcs.idx"

disk layout

#' ### total pb file size for in-mem idx gs
# `pattern` is a regular expression, not a glob, so the extension is matched
# with an escaped dot and an end anchor. The byte total is formatted through
# the public format() generic by tagging it with the "object_size" class,
# rather than reaching into the utils namespace with `:::`.
format(
  structure(
    sum(file.size(list.files(tmp_mem, pattern = "\\.pb$", full.names = TRUE))),
    class = "object_size"
  ),
  units = "auto"
)
## [1] "177.8 Mb"
#' ### total pb file size for on-disk idx gs
# `pattern` is a regular expression, not a glob, so the extension is matched
# with an escaped dot and an end anchor. The byte total is formatted through
# the public format() generic by tagging it with the "object_size" class,
# rather than reaching into the utils namespace with `:::`.
format(
  structure(
    sum(file.size(list.files(tmp_ondisk, pattern = "\\.pb$", full.names = TRUE))),
    class = "object_size"
  ),
  units = "auto"
)
## [1] "4.5 Mb"
#' ### total idx file size for on-disk idx gs
# Show the last lines of `du` output (including the grand total) for the
# *.idx entries of the on-disk archive. The directory is shell-quoted so a
# temp path containing spaces or shell metacharacters cannot break the
# command; the glob is kept outside the quotes so the shell still expands it.
# Options are placed before the operands, which every `du` accepts.
cat(
  tail(
    system(paste0("du -ch ", shQuote(tmp_ondisk), "/*.idx"), intern = TRUE)
  ),
  sep = "\n"
)
## 1.1M /tmp/RtmpOM7Uv4/file2f5226c1dfcb/Flu Vaccine-Kids 05-17-11 6M_NCH-VK 37 TFH Panel.fcs.idx
## 1.4M /tmp/RtmpOM7Uv4/file2f5226c1dfcb/Flu_NCH_VK_38_TFH_Panel.fcs.idx/__meta
## 1.4M /tmp/RtmpOM7Uv4/file2f5226c1dfcb/Flu_NCH_VK_38_TFH_Panel.fcs.idx
## 1.3M /tmp/RtmpOM7Uv4/file2f5226c1dfcb/Flu_NCH_VK_40_TFH_Panel.fcs.idx/__meta
## 1.3M /tmp/RtmpOM7Uv4/file2f5226c1dfcb/Flu_NCH_VK_40_TFH_Panel.fcs.idx
## 120M total

idx loading

slower initial idx reading (but in real use cases, traversing the idx of all nodes is rarely needed)

# Request the population indices of every population path of `gh`, one path
# at a time. Each result is discarded as soon as the next one is fetched; the
# function exists only so that the index access can be timed.
# Returns NULL invisibly.
traverse_idx <- function(gh) {
  for (node_path in gh_get_pop_paths(gh)) {
    node_idx <- gh_pop_get_indices(gh, node_path)
  }
  invisible(NULL)
}
# First sample of each GatingSet: `gh` comes from the archive loaded from
# `tmp_mem`, `gh2` from the one loaded from `tmp_ondisk`.
gh <- gs[[1]]
gh2 <- gs2[[1]]
# One timed pass over all nodes per hierarchy (first access after loading).
microbenchmark(traverse_idx(gh), traverse_idx(gh2), times = 1)
## Unit: milliseconds
##               expr       min        lq      mean    median        uq       max
##   traverse_idx(gh)  474.0916  474.0916  474.0916  474.0916  474.0916  474.0916
##  traverse_idx(gh2) 1739.9197 1739.9197 1739.9197 1739.9197 1739.9197 1739.9197
##  neval
##      1
##      1

comparable subsequent reading (due to the cache)

# Repeat the identical pass; the recorded timings of the two are now close.
microbenchmark(traverse_idx(gh), traverse_idx(gh2), times = 1)
## Unit: milliseconds
##               expr      min       lq     mean   median       uq      max neval
##   traverse_idx(gh) 454.0599 454.0599 454.0599 454.0599 454.0599 454.0599     1
##  traverse_idx(gh2) 456.1760 456.1760 456.1760 456.1760 456.1760 456.1760     1

cluster application

# Start a 4-worker cluster; outfile = "" leaves the workers' stdout/stderr
# unredirected (see parallel::makeCluster).
# NOTE(review): no stopCluster(cl) is visible in this section -- confirm the
# workers are shut down later.
cl <- makeCluster(4, outfile = "")

load both archives as gscluster objects

# Load samples 1-8 of the archive at `tmp_mem` with load_gs_cluster() and
# print the resulting object.
gsc <- load_gs_cluster(tmp_mem, select= 1:8)
gsc
## A GatingSet with 8 samples
## archive dir:  ~/Downloads/gs_faust/
# Same selection from the archive at `tmp_ondisk`.
gsc2 <- load_gs_cluster(tmp_ondisk, select = 1:8)
gsc2
## A GatingSet with 8 samples
## archive dir:  /tmp/RtmpOM7Uv4/file2f5226c1dfcb

Immutable operations

access tree structures (stats)

parallel

# Time gs_clusterApply() on cluster `cl` for `gsc` and `gsc2` (2 runs each).
# The applied function returns the head of gh_pop_get_stats(gh, path = "auto").
# NOTE(review): `mutable = F` would normally be spelled FALSE; the code is left
# untouched because the recorded output below quotes the expressions verbatim.
microbenchmark(
              gs_clusterApply(cl, gsc, function(gh){
              head(gh_pop_get_stats(gh, path = "auto"))
            }, mutable = F)
          , gs_clusterApply(cl, gsc2, function(gh){
            head(gh_pop_get_stats(gh, path = "auto"))
          }, mutable = F)
          , times = 2
        )
## Unit: seconds
##                                                                                                    expr
##   gs_clusterApply(cl, gsc, function(gh) {     head(gh_pop_get_stats(gh, path = "auto")) }, mutable = F)
##  gs_clusterApply(cl, gsc2, function(gh) {     head(gh_pop_get_stats(gh, path = "auto")) }, mutable = F)
##       min       lq     mean   median       uq      max neval
##  13.57489 13.57489 17.65694 17.65694 21.73900 21.73900     2
##  13.48155 13.48155 13.58869 13.58869 13.69583 13.69583     2

serial

# Serial counterpart of the benchmark above: the same per-sample function is
# applied with lapply() to `gsc` and `gsc2` (2 runs each), without using `cl`.
microbenchmark(
  lapply(gsc, function(gh){
    head(gh_pop_get_stats(gh, path = "auto"))
  })
  , lapply(gsc2, function(gh){
    head(gh_pop_get_stats(gh, path = "auto"))
  })
  , times = 2
)
## Unit: seconds
##                                                                          expr
##   lapply(gsc, function(gh) {     head(gh_pop_get_stats(gh, path = "auto")) })
##  lapply(gsc2, function(gh) {     head(gh_pop_get_stats(gh, path = "auto")) })
##       min       lq     mean   median      uq     max neval
##  37.17781 37.17781 37.31555 37.31555 37.4533 37.4533     2
##  37.55303 37.55303 37.71377 37.71377 37.8745 37.8745     2