devtools::load_all()
## Loading singleCell
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colMeans, colnames, colSums, dirname, do.call, duplicated,
## eval, evalq, Filter, Find, get, grep, grepl, intersect,
## is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
## paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
## Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Loading required package: DelayedArray
## Loading required package: stats4
## Loading required package: matrixStats
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: BiocParallel
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
##
## aperm, apply
# library(pryr)
library(HDF5Array)
## Loading required package: rhdf5
library(microbenchmark)
path <- "/loc/no-backup/mike/shared"
path <- file.path(path, "1M_neurons")
h5gz_gene <- file.path(path, "gz_chunk_by_gene_sub.h5")
# library(bigmemory)
ind <- 1:100
block.size <- 10
h5seed <- HDF5ArraySeed(h5gz_gene, name = "data")
h5array <- HDF5Array(h5seed)
dim(h5array)
## [1] 10000 27998
tiledb_dir <- file.path(path, "tiledb_dense_by_col_sub")
tiledb_sparse_dir <- file.path(path, "tiledb_sparse_by_col_sub")
#create tiledb
# rsize <- 1e4
# csize <- 27998
# idx <- list(1:rsize, 1:csize)
# a <- h5read.chunked(h5gz_gene, "data", idx, block.size = block.size, fast = F)
# a <- extract_array(h5seed, idx)
# a <- extract_array(h5seed, list(1:1000, 1:1000))
# object_size(a)
# if (dir.exists(tiledb_dir)) {
# unlink(tiledb_dir, recursive = TRUE)
# }
# if (dir.exists(tiledb_sparse_dir)) {
# unlink(tiledb_sparse_dir, recursive = TRUE)
# }
#
# # write_tiledb_dense(h5array, tiledb_dir, "count")
# write_tiledb_dense(h5seed, tiledb_dir, "count")
#
# write_tiledb_sparse(h5seed, tiledb_sparse_dir, "count")
system(paste("du -sh ", h5gz_gene))
system(paste("du -sh ", tiledb_dir))
system(paste("du -sh ", tiledb_sparse_dir))
cfg <- tiledb:::Config()
cfg["vfs.num_threads"] <- 1
cfg["vfs.file.max_parallel_ops"] <- 1
ctx <- tiledb::Ctx(cfg)
size <- 1e3
idx <- list(1:size, 1:size)
native API calls (continous block IO)
microbenchmark(
# a <- h5read.chunked(h5gz_gene, "data", idx, block.size = block.size, fast = F),
# a <- chunked.read(h5seed, idx, block.size = block.size),
b <- extract_array(h5seed, idx),
c <- region_selection_tiledb(tiledb_dir, "count", c(1,size), c(1,size), ctx@ptr),
d <- region_selection_tiledb_sparse(tiledb_sparse_dir, "count", c(1,size), c(1,size), ctx@ptr)
, times = 5)
## Unit: milliseconds
## expr
## b <- extract_array(h5seed, idx)
## c <- region_selection_tiledb(tiledb_dir, "count", c(1, size), c(1, size), ctx@ptr)
## d <- region_selection_tiledb_sparse(tiledb_sparse_dir, "count", c(1, size), c(1, size), ctx@ptr)
## min lq mean median uq max neval cld
## 116.17203 119.99277 188.47272 122.06109 122.48332 461.65438 5 b
## 72.15694 73.78794 86.94317 75.76083 84.19794 128.81220 5 ab
## 27.30223 34.16882 36.53781 35.06515 40.96487 45.18797 5 a
all.equal(b,c,d)
## [1] TRUE
generic API chunked.read (capable of both continuous and non-continuous indexing)
tileseed <- tiledbArraySeed(tiledb_dir, "count")
tilesparseseed <- tiledbArraySeed(tiledb_sparse_dir, "count")
dim(tileseed)
## [1] 10000 27998
size <- 1e3
idx <- list(1:size, 1:size)
continuous indexing through chunked.read
microbenchmark(
# a <- h5read.chunked(h5gz_gene, "data", idx, block.size = block.size, fast = F),
a <- chunked.read(h5seed, idx, block.size = block.size),
# b <- extract_array(h5seed, idx),
c <- chunked.read(tileseed, idx, block.size = block.size),
d <- chunked.read(tilesparseseed, idx, block.size = block.size)
, times = 5)
## Unit: milliseconds
## expr min
## a <- chunked.read(h5seed, idx, block.size = block.size) 850.0993
## c <- chunked.read(tileseed, idx, block.size = block.size) 1200.5477
## d <- chunked.read(tilesparseseed, idx, block.size = block.size) 1500.0736
## lq mean median uq max neval cld
## 882.1724 1079.340 980.2892 1284.652 1399.488 5 a
## 1287.2148 1406.890 1291.5710 1473.342 1781.773 5 ab
## 1556.3292 1846.191 1726.4033 1735.404 2712.746 5 b
all.equal(a,c)
## [1] TRUE
random slicing through chunked.read
set.seed(1)
size <- 1e2
nrow <- nrow(h5seed)
ncol <- ncol(h5seed)
idx <- list(sample(nrow, size), sample(ncol, size))
microbenchmark(
# a <- h5read.chunked(h5gz_gene, "data", idx, block.size = block.size, fast = F),
a <- chunked.read(h5seed, idx, block.size = block.size),
# b <- extract_array(h5seed, idx),
c <- chunked.read(tileseed, idx, block.size = block.size),
d <- chunked.read(tilesparseseed, idx, block.size = block.size)
, times = 5)
## Unit: milliseconds
## expr min
## a <- chunked.read(h5seed, idx, block.size = block.size) 172.8336
## c <- chunked.read(tileseed, idx, block.size = block.size) 102.8718
## d <- chunked.read(tilesparseseed, idx, block.size = block.size) 169.5305
## lq mean median uq max neval cld
## 173.0510 183.1340 175.1874 186.2812 208.3169 5 b
## 104.1302 117.2689 120.5127 126.8546 131.9750 5 a
## 173.5052 177.6188 174.5476 178.1867 192.3239 5 b
all.equal(a,c)
## [1] TRUE
# library(profvis)
# profvis(d <- chunked.read(tileseed, idx, block.size = block.size))
common DelayedArray operation (colSums and rowSums)
tilearray <- tiledbArray(tileseed)
tilesparsearray <- tiledbArray(tilesparseseed)
cidx <- sample(ncol, 50)
microbenchmark(
a <- colSums(h5array[, cidx])
, b <- colSums(tilearray[, cidx])
, c <- colSums(tilesparsearray[, cidx])
, times = 5)
## Unit: milliseconds
## expr min lq mean
## a <- colSums(h5array[, cidx]) 84.74995 90.53320 103.96965
## b <- colSums(tilearray[, cidx]) 72.73470 73.05259 77.87966
## c <- colSums(tilesparsearray[, cidx]) 106.45034 118.01673 190.49465
## median uq max neval cld
## 102.42648 109.67487 132.46374 5 a
## 75.25193 78.41252 89.94655 5 a
## 161.92420 165.99662 400.08535 5 a
all.equal(a,b,c)
## [1] TRUE
ridx <- sample(nrow, 1e2)
microbenchmark(
a <- rowSums(h5array[ridx, cidx])
, b <- rowSums(tilearray[ridx, cidx])
, c <- rowSums(tilesparsearray[ridx, cidx])
, times = 5)
## Unit: milliseconds
## expr min lq mean
## a <- rowSums(h5array[ridx, cidx]) 52.47276 52.61682 53.56550
## b <- rowSums(tilearray[ridx, cidx]) 65.25182 65.31699 68.77582
## c <- rowSums(tilesparsearray[ridx, cidx]) 105.84373 108.25280 110.05615
## median uq max neval cld
## 53.08468 53.24056 56.41268 5 a
## 66.45574 70.15793 76.69663 5 b
## 109.36996 112.80267 114.01157 5 c
all.equal(a,b,c)
## [1] TRUE