suppressPackageStartupMessages(
  { library(flowCore)
    library(flowWorkspace)
    library(CytoML)
    library(microbenchmark)
    library(ggcyto)
  })
## Warning: replacing previous import 'ncdfFlow::filter' by 'dplyr::filter' when
## loading 'flowWorkspace'

The on-disk index (idx) is turned off by default.

# Called with no argument, this appears to report the current on-disk index
# setting (the recorded output is FALSE) — confirm against the package docs.
use_on_disk_idx()
## [1] FALSE

parse flowjo workspace

# Locate the example data directory shipped with the flowWorkspaceData package.
dataDir <- system.file("extdata", package = "flowWorkspaceData")
# Find the workspace file(s) whose name matches "manual.xml". The argument is
# spelled out as `full.names` instead of relying on partial matching of `full`.
wsfile <- list.files(dataDir, pattern = "manual.xml", full.names = TRUE)
# Open the FlowJo workspace so it can be parsed below.
ws <- open_flowjo_xml(wsfile)

parse it with in-mem idx

# Optional: uncomment to change the log level before parsing (presumably
# enables gate-level logging — confirm against the package docs).
# set_log_level("Gate")
# Time the conversion of the opened workspace into `gs`. `name = 4` and
# `subset = 1` restrict what is parsed (presumably the 4th sample group and
# its first sample — TODO confirm).
system.time(gs <- flowjo_to_gatingset(ws, path = dataDir, name = 4, subset = 1))
##    user  system elapsed 
##   0.308   0.032   0.339
# Take the first element of `gs` and query its index URI; the recorded output
# for this run is an empty string.
gh <- gs[[1]]
gh_idx_get_uri(gh)
## [1] ""

parse it with on-disk idx

Parsing now uses a cached in-memory index, and thus performs as fast as before.

# Switch the on-disk index option on. The literal TRUE is used because `T` is
# an ordinary variable that can be reassigned, so it is not a safe stand-in.
use_on_disk_idx(TRUE)
## [1] TRUE
# Repeat the same parse with the option enabled and time it.
system.time(gs2 <- flowjo_to_gatingset(ws, path = dataDir, name = 4, subset = 1))
##    user  system elapsed 
##   0.297   0.012   0.323
# Take the first element of `gs2` and query its index URI; the recorded output
# is now a path to an .idx file.
gh2 <- gs2[[1]]
gh_idx_get_uri(gh2)
## [1] "/tmp/5fe56ce6-7773-4f50-8e87-70e7720f53c5.idx"
# Sanity check: the population statistics of both runs should agree.
all.equal(gh_pop_get_stats(gh), gh_pop_get_stats(gh2))
## [1] TRUE

save gs

During saving, the in-memory index cache is flushed to the on-disk index.

# Save each gating set to its own fresh temporary location and time the save.
tmp_mem <- tempfile()
tmp_ondisk <- tempfile()
system.time(save_gs(gs, tmp_mem))
##    user  system elapsed 
##   0.013   0.004   0.028
system.time(save_gs(gs2, tmp_ondisk))
##    user  system elapsed 
##   0.023   0.000   0.027
# Report the disk usage of the files written under each location.
# Options are placed before the operands so the command also works with `du`
# implementations that do not permute arguments, and the paths are shell-quoted
# in case the temporary directory contains spaces or shell metacharacters.
system(paste0("du -ch ", shQuote(tmp_mem), "/*"))
system(paste0("du -ch ", shQuote(tmp_ondisk), "/*"))

load gs

The saved pb file is smaller, and thus loads faster.

# Benchmark loading each saved gating set (two runs each). The benchmarked
# expressions also assign to gs / gs2, which are indexed right below —
# NOTE(review): confirm microbenchmark evaluates these assignments in the
# calling scope so that gs / gs2 really refer to the reloaded objects.
microbenchmark(gs <- load_gs(tmp_mem), gs2 <- load_gs(tmp_ondisk), times = 2)
## Unit: milliseconds
##                        expr      min       lq     mean   median        uq
##      gs <- load_gs(tmp_mem) 6.416442 6.416442 9.259159 9.259159 12.101877
##  gs2 <- load_gs(tmp_ondisk) 2.569334 2.569334 2.576967 2.576967  2.584599
##        max neval
##  12.101877     2
##   2.584599     2
# Re-derive the first-element handles from gs and gs2 for the steps that follow.
gh <- gs[[1]]
gh2 <- gs2[[1]]

retrieving idx

# Request the indices of every population path in `gh`, one path at a time.
# The retrieved indices are discarded: the function exists only to trigger the
# lookups, and (like a bare `for` loop) it returns NULL invisibly.
traverse_idx <- function(gh) {
  pop_paths <- gh_get_pop_paths(gh)
  for (pop_path in pop_paths) {
    gh_pop_get_indices(gh, pop_path)
  }
  invisible(NULL)
}

The on-disk index is loaded only on demand, so the initial read is a little slower.

# Time one full traversal per handle. With times = 1 each figure is a single
# measurement, so min/mean/max in the recorded output are all the same value.
microbenchmark(traverse_idx(gh), traverse_idx(gh2), times = 1)
## Unit: milliseconds
##               expr      min       lq     mean   median       uq      max neval
##   traverse_idx(gh)  8.94083  8.94083  8.94083  8.94083  8.94083  8.94083     1
##  traverse_idx(gh2) 21.52728 21.52728 21.52728 21.52728 21.52728 21.52728     1

However, this does not appear to add significant overhead to the overall performance of an actual application (e.g. plotting).

# Load the on-disk gating set afresh so that nothing cached by the earlier
# traversal can influence the timing below.
gs2 <- load_gs(tmp_ondisk) # reload gs to ensure no cache effect
gh2 <- gs2[[1]]
# Single-shot timing of autoplot() on each handle (times = 1).
microbenchmark(autoplot(gh), autoplot(gh2), times = 1)
## Unit: seconds
##           expr      min       lq     mean   median       uq      max neval
##   autoplot(gh) 2.052758 2.052758 2.052758 2.052758 2.052758 2.052758     1
##  autoplot(gh2) 2.262579 2.262579 2.262579 2.262579 2.262579 2.262579     1