Create different backends (locally)
# Locate the example FCS file shipped with the flowWorkspaceData package
# rather than depending on a developer-specific checkout under the home
# directory; mustWork = TRUE fails early if the package/file is missing.
fcs_file <- system.file(
  "extdata", "CytoTrol_CytoTrol_1.fcs",
  package = "flowWorkspaceData", mustWork = TRUE
)
# Read the FCS file into a cytoframe; cf.mem is the reference object that the
# other backends are written from and compared against below.
cf.mem <- load_cytoframe_from_fcs(fcs_file)
# Number of events (rows) and parameters (columns)
dim(cf.mem)
## events parameters
## 119531 12
# Write the same data to a temporary HDF5 file ...
h5_file <- tempfile(fileext = ".h5")
cf_write_h5(cf.mem, h5_file)
# ... and to a temporary TileDB array directory.
tile_dir <- tempfile(fileext = ".tile")
cf_write_tile(cf.mem, tile_dir)
# Open the local copies so that cf.h5 and cf.tile exist for the local
# benchmarks; without this they are first assigned only in the remote section.
cf.h5 <- load_cytoframe(h5_file)
cf.tile <- load_cytoframe(tile_dir)
Compare size
# Human-readable size of the HDF5 file. Tagging the byte count as an
# "object_size" lets the public format() generic pick the unit, instead of
# calling the method through utils::: (private namespace access).
format(structure(file.size(h5_file), class = "object_size"), units = "auto")
## [1] "5.5 Mb"
# Total on-disk size of the TileDB directory as reported by `du -sh`.
# The path is shell-quoted so unusual characters cannot break the command;
# du prints "<size>\t<path>", so the first tab-separated field is the size.
du_out <- system(paste("du -sh", shQuote(tile_dir)), intern = TRUE)
strsplit(du_out, "\t", fixed = TRUE)[[1]][1]
## [1] "5.9M"
Benchmark the random slicing
# Fix the RNG seed so the sampled row indices are reproducible
set.seed(1)
# Column (channel) indices used for slicing
cidx <- c(1, 5, 8)
# 1000 random row (event) indices. The row count is taken from cf.mem, which
# is already loaded at this point; the other backends are written from it.
ridx <- sample(nrow(cf.mem), 1e3)
subset by channels
# Time column-only slicing on each backend (5 runs per expression).
# The extracted matrices are kept in a, b and c for the equality check.
microbenchmark(
  a <- exprs(cf.mem[, cidx]),
  b <- exprs(cf.h5[, cidx]),
  c <- exprs(cf.tile[, cidx]),
  times = 5
)
## Unit: milliseconds
## expr min lq mean median uq
## a <- exprs(cf.mem[, cidx]) 4.744973 4.820243 5.245412 4.836304 5.450215
## b <- exprs(cf.h5[, cidx]) 4.353024 4.947002 5.053556 4.979597 5.228883
## c <- exprs(cf.tile[, cidx]) 33.996457 38.511042 42.004904 38.980041 39.539437
## max neval
## 6.375325 5
## 5.759276 5
## 58.997543 5
# all.equal() compares only `target` and `current`; a third positional
# argument is matched to `tolerance`, so all.equal(a, b, c) never checks c.
# Compare pairwise instead. Attributes are ignored for the tile result, as in
# the remote comparison of b and c (NOTE(review): confirm whether the tile
# backend's attributes are expected to differ).
isTRUE(all.equal(a, b)) && isTRUE(all.equal(a, c, check.attributes = FALSE))
## [1] TRUE
subset by cols & rows
# Time combined row + column slicing on each backend (5 runs per expression).
# The extracted matrices are kept in a, b and c for the equality check.
microbenchmark(
  a <- exprs(cf.mem[ridx, cidx]),
  b <- exprs(cf.h5[ridx, cidx]),
  c <- exprs(cf.tile[ridx, cidx]),
  times = 5
)
## Unit: milliseconds
## expr min lq mean median
## a <- exprs(cf.mem[ridx, cidx]) 2.369003 2.487385 2.849076 2.750922
## b <- exprs(cf.h5[ridx, cidx]) 4.036930 4.067461 4.539437 4.101277
## c <- exprs(cf.tile[ridx, cidx]) 28.382903 28.535450 29.708429 29.338413
## uq max neval
## 2.792729 3.845342 5
## 4.117894 6.373624 5
## 31.013319 31.272060 5
# all.equal() compares only `target` and `current`; a third positional
# argument is matched to `tolerance`, so all.equal(a, b, c) never checks c.
# Compare pairwise instead. Attributes are ignored for the tile result, as in
# the remote comparison of b and c (NOTE(review): confirm whether the tile
# backend's attributes are expected to differ).
isTRUE(all.equal(a, b)) && isTRUE(all.equal(a, c, check.attributes = FALSE))
## [1] TRUE
Compare remote storage
# Name of the S3 bucket used to build the s3:// URIs for the remote copies
bucket <- "mike-h5"
Manually upload the h5 file to S3 using the aws.s3 package, since H5 doesn't support writing to S3 yet
library(aws.s3)
# Credentials used for the upload; presumably resolved from the environment
# when called with NULL -- confirm against check_credential().
cred <- check_credential(NULL)
# Destination URI: s3://<bucket>/<file name of the local h5 file>
uri.h5 <- paste0("s3://", bucket, "/", basename(h5_file))
# Upload the local HDF5 file to that URI
put_object(
  h5_file,
  uri.h5,
  region = cred$AWS_REGION,
  key = cred$AWS_ACCESS_KEY_ID,
  secret = cred$AWS_SECRET_ACCESS_KEY
)
## [1] TRUE
tiledb can directly write to s3
# Destination URI: s3://<bucket>/<directory name of the local tile array>
uri.tile <- paste0("s3://", bucket, "/", basename(tile_dir))
# Write the cytoframe to the S3 URI (no separate upload step, unlike the h5 file)
cf_write_tile(cf.mem, uri.tile)
load cf remotely
# Time opening each remote copy (3 runs per expression). From here on cf.h5
# and cf.tile refer to the S3-backed cytoframes.
microbenchmark(
  cf.h5 <- load_cytoframe(uri.h5),
  cf.tile <- load_cytoframe(uri.tile),
  times = 3
)
## Unit: seconds
## expr min lq mean median
## cf.h5 <- load_cytoframe(uri.h5) 1.245577 1.246506 1.247689 1.247435
## cf.tile <- load_cytoframe(uri.tile) 2.164970 2.216001 2.240629 2.267033
## uq max neval
## 1.248744 1.250054 3
## 2.278459 2.289885 3
# Print the URI backing the remote h5 cytoframe
cf_get_uri(cf.h5)
## [1] "https://mike-h5.s3.amazonaws.com/file37767b41a56e.h5"
# Print the URI backing the remote tile cytoframe
cf_get_uri(cf.tile)
## [1] "s3://mike-h5/file37765ef9a67a.tile"
subset by channels
# Time column-only slicing on the two remote backends (3 runs per expression).
# The extracted matrices are kept in b and c for the equality check.
microbenchmark(
  b <- exprs(cf.h5[, cidx]),
  c <- exprs(cf.tile[, cidx]),
  times = 3
)
## Unit: milliseconds
## expr min lq mean median uq
## b <- exprs(cf.h5[, cidx]) 1213.1904 1215.0520 1225.7256 1216.9136 1231.993
## c <- exprs(cf.tile[, cidx]) 453.5959 480.1759 938.9981 506.7559 1181.699
## max neval
## 1247.073 3
## 1856.642 3
# Compare the values only, ignoring attributes. FALSE is spelled out because
# F is an ordinary variable that can be reassigned.
all.equal(b, c, check.attributes = FALSE)
## [1] TRUE
subset by cols & rows
# Time combined row + column slicing on the two remote backends (3 runs per
# expression). The extracted matrices are kept in b and c for the equality check.
microbenchmark(
  b <- exprs(cf.h5[ridx, cidx]),
  c <- exprs(cf.tile[ridx, cidx]),
  times = 3
)
## Unit: milliseconds
## expr min lq mean median
## b <- exprs(cf.h5[ridx, cidx]) 1166.2893 1169.746 1175.4951 1173.2026
## c <- exprs(cf.tile[ridx, cidx]) 499.1655 511.917 570.8657 524.6686
## uq max neval
## 1180.0980 1186.9934 3
## 606.7158 688.7631 3
# Compare the values only, ignoring attributes. FALSE is spelled out because
# F is an ordinary variable that can be reassigned.
all.equal(b, c, check.attributes = FALSE)
## [1] TRUE
clean up
# Delete the remote h5 file backing cf.h5
cf_cleanup(cf.h5)
## https://mike-h5.s3.amazonaws.com/file37767b41a56e.h5 is deleted!
# Delete the remote tile array backing cf.tile
cf_cleanup(cf.tile)
## s3://mike-h5/file37765ef9a67a.tile is deleted!