suppressPackageStartupMessages(
{ library(flowCore)
library(flowWorkspace)
library(microbenchmark)
library(utils)
library(parallel)
})
## Warning: replacing previous import 'ncdfFlow::filter' by 'dplyr::filter' when
## loading 'flowWorkspace'
Load a FAUST-generated GatingSet (gs)
tmp_mem <- "~/Downloads/gs_faust/"
gs <- load_gs(tmp_mem)
gh_idx_get_uri(gs[[1]])
## [1] ""
dataset size
length(gs) #nSamples
## [1] 82
length(gs_get_pop_paths(gs)) #nNodes
## [1] 695
Convert to on-disk idx
system.time(gs_convert_idx_to_ondisk(gs))
## user system elapsed
## 44.834 1.476 39.822
tmp_ondisk <- tempfile()
system.time(save_gs(gs, tmp_ondisk))
## user system elapsed
## 0.533 2.230 5.245
Benchmark
loading time
microbenchmark(gs <- load_gs(tmp_mem), gs2 <- load_gs(tmp_ondisk), times = 2)
## Unit: milliseconds
## expr min lq mean median uq
## gs <- load_gs(tmp_mem) 2596.1655 2596.1655 2631.3459 2631.3459 2666.5263
## gs2 <- load_gs(tmp_ondisk) 485.0436 485.0436 501.1024 501.1024 517.1612
## max neval
## 2666.5263 2
## 517.1612 2
gh_idx_get_uri(gs[[1]])
## [1] ""
gh_idx_get_uri(gs2[[1]])
## [1] "/tmp/RtmpOM7Uv4/file2f5226c1dfcb/Flu Vaccine-K 11-09-10_NCH-VK 21 TFH Panel.fcs.idx"
disk layout
#' ### total pb file size for in-mem idx gs
# `pattern` is a regular expression, not a glob, so anchor on the literal
# ".pb" suffix. The byte total is tagged as an object_size so the exported
# format() generic picks the human-readable formatter, avoiding `utils:::`.
format(
  structure(
    sum(file.size(list.files(tmp_mem, pattern = "\\.pb$", full.names = TRUE))),
    class = "object_size"
  ),
  units = "auto"
)
## [1] "177.8 Mb"
#' ### total pb file size for ondisk idx gs
# `pattern` is a regular expression, not a glob, so anchor on the literal
# ".pb" suffix. The byte total is tagged as an object_size so the exported
# format() generic picks the human-readable formatter, avoiding `utils:::`.
format(
  structure(
    sum(file.size(list.files(tmp_ondisk, pattern = "\\.pb$", full.names = TRUE))),
    class = "object_size"
  ),
  units = "auto"
)
## [1] "4.5 Mb"
#' ### total idx file size for on-disk idx gs
# Quote the directory so a path with spaces or shell metacharacters reaches
# `du` intact; the "/*.idx" glob stays outside the quotes so the shell still
# expands it. Options are placed before the operands for portability.
# Only the last few lines of the `du` listing (ending in the total) are shown.
cat(
  tail(system(paste0("du -ch ", shQuote(tmp_ondisk), "/*.idx"), intern = TRUE)),
  sep = "\n"
)
## 1.1M /tmp/RtmpOM7Uv4/file2f5226c1dfcb/Flu Vaccine-Kids 05-17-11 6M_NCH-VK 37 TFH Panel.fcs.idx
## 1.4M /tmp/RtmpOM7Uv4/file2f5226c1dfcb/Flu_NCH_VK_38_TFH_Panel.fcs.idx/__meta
## 1.4M /tmp/RtmpOM7Uv4/file2f5226c1dfcb/Flu_NCH_VK_38_TFH_Panel.fcs.idx
## 1.3M /tmp/RtmpOM7Uv4/file2f5226c1dfcb/Flu_NCH_VK_40_TFH_Panel.fcs.idx/__meta
## 1.3M /tmp/RtmpOM7Uv4/file2f5226c1dfcb/Flu_NCH_VK_40_TFH_Panel.fcs.idx
## 120M total
idx loading
Slower initial idx reading (but in real use cases, traversing the idx of every node is rarely needed)
# Fetch the event indices of every population in a single sample.
#
# gh: the per-sample object to walk (here an element taken from a GatingSet
#     with `[[`).
#
# The indices are requested purely for the side effect of reading them, so
# each result is discarded. Returns NULL invisibly (the value of the loop).
traverse_idx <- function(gh) {
  for (node_path in gh_get_pop_paths(gh)) {
    gh_pop_get_indices(gh, node_path)
  }
}
gh <- gs[[1]]
gh2 <- gs2[[1]]
microbenchmark(traverse_idx(gh), traverse_idx(gh2), times = 1)
## Unit: milliseconds
## expr min lq mean median uq max
## traverse_idx(gh) 474.0916 474.0916 474.0916 474.0916 474.0916 474.0916
## traverse_idx(gh2) 1739.9197 1739.9197 1739.9197 1739.9197 1739.9197 1739.9197
## neval
## 1
## 1
comparable subsequent reading (due to the cache)
microbenchmark(traverse_idx(gh), traverse_idx(gh2), times = 1)
## Unit: milliseconds
## expr min lq mean median uq max neval
## traverse_idx(gh) 454.0599 454.0599 454.0599 454.0599 454.0599 454.0599 1
## traverse_idx(gh2) 456.1760 456.1760 456.1760 456.1760 456.1760 456.1760 1
cluster application
cl <- makeCluster(4, outfile = "")
load them as gscluster
gsc <- load_gs_cluster(tmp_mem, select= 1:8)
gsc
## A GatingSet with 8 samples
## archive dir: ~/Downloads/gs_faust/
gsc2 <- load_gs_cluster(tmp_ondisk, select = 1:8)
gsc2
## A GatingSet with 8 samples
## archive dir: /tmp/RtmpOM7Uv4/file2f5226c1dfcb
Immutable operations
access tree structures (stats)
parallel
microbenchmark(
gs_clusterApply(cl, gsc, function(gh){
head(gh_pop_get_stats(gh, path = "auto"))
}, mutable = F)
, gs_clusterApply(cl, gsc2, function(gh){
head(gh_pop_get_stats(gh, path = "auto"))
}, mutable = F)
, times = 2
)
## Unit: seconds
## expr
## gs_clusterApply(cl, gsc, function(gh) { head(gh_pop_get_stats(gh, path = "auto")) }, mutable = F)
## gs_clusterApply(cl, gsc2, function(gh) { head(gh_pop_get_stats(gh, path = "auto")) }, mutable = F)
## min lq mean median uq max neval
## 13.57489 13.57489 17.65694 17.65694 21.73900 21.73900 2
## 13.48155 13.48155 13.58869 13.58869 13.69583 13.69583 2
serial
microbenchmark(
lapply(gsc, function(gh){
head(gh_pop_get_stats(gh, path = "auto"))
})
, lapply(gsc2, function(gh){
head(gh_pop_get_stats(gh, path = "auto"))
})
, times = 2
)
## Unit: seconds
## expr
## lapply(gsc, function(gh) { head(gh_pop_get_stats(gh, path = "auto")) })
## lapply(gsc2, function(gh) { head(gh_pop_get_stats(gh, path = "auto")) })
## min lq mean median uq max neval
## 37.17781 37.17781 37.31555 37.31555 37.4533 37.4533 2
## 37.55303 37.55303 37.71377 37.71377 37.8745 37.8745 2