suppressPackageStartupMessages({
    library(knitr)
    library(rhdf5)
    # library(HDF5Array)
    library(microbenchmark)
})
# options(mc.cores = parallel::detectCores() - 1) # if you have multiple cores to spin
options(mc.cores = 1)
knitr::opts_chunk$set(message = FALSE, error = FALSE, warning = FALSE, cache = TRUE,
                      fig.width = 8, fig.height = 6, autodep = TRUE)

Loading data

data(maits, package = 'MAST')
mat <- t(maits$expressionmat)  # transpose so genes are rows and cells are columns
dim(mat)
r <- nrow(mat)
c <- ncol(mat)
set.seed(4)
cind <- sample(c, 40)   # random column (cell) indices
rind <- sample(r, 1e3)  # random row (gene) indices

write to H5 (chunking by row)

path <- "/loc/no-backup/mike/shared"
path <- file.path(path, "MAST")
h5 <- file.path(path, "mat.h5")
h5createFile(h5)
# one chunk per row: row slices stay cheap, column slices will be expensive
h5createDataset(h5, "data", c(r, c), storage.mode = "double", chunk = c(1, c), level = 7)
h5write(mat, h5, "data")
H5close()
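
The benchmarks below hinge entirely on this chunk layout, so it is worth inspecting what was written. A minimal sketch using rhdf5's h5ls(); the "data_bycol" dataset is a hypothetical column-chunked twin added here only for comparison and is not used in the benchmarks:

h5ls(h5, all = TRUE)  # reports dims, chunk size (1 x c) and deflate level
# hypothetical twin chunked by column instead of by row
h5createDataset(h5, "data_bycol", c(r, c), storage.mode = "double",
                chunk = c(r, 1), level = 7)
h5write(mat, h5, "data_bycol")
H5close()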

row slicing (random)

microbenchmark(v1 <- mat[rind,], v2 <- h5read(h5, "/data", list(rind, NULL)), times = 3)
## Unit: microseconds
##                                         expr        min         lq       mean     median         uq        max neval cld
##                            v1 <- mat[rind, ]    533.381   1152.203   1469.696   1771.025   1937.854   2104.683     3  a 
##  v2 <- h5read(h5, "/data", list(rind, NULL)) 168062.401 184190.274 202481.824 200318.147 219691.535 239064.924     3   b
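
A quick sanity check that the two slices agree (same pattern as the commented-out all.equal calls further down; h5read drops the dimnames, hence check.attributes = FALSE):

stopifnot(isTRUE(all.equal(v1, v2, check.attributes = FALSE)))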

col slicing (random)

microbenchmark(v1 <- mat[,cind], v2 <- h5read(h5, "/data", list(NULL, cind)), times = 3)
## Unit: milliseconds
##                                         expr        min         lq       mean     median         uq        max neval cld
##                            v1 <- mat[, cind]   6.487641   7.402412   8.023952   8.317182   8.792108   9.267033     3  a 
##  v2 <- h5read(h5, "/data", list(NULL, cind)) 602.492785 607.602092 610.904860 612.711399 615.110897 617.510396     3   b
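
A column slice cuts across every row chunk, so essentially the whole file gets decompressed per read. If the hypothetical "data_bycol" twin from the sketch above was written, the same slice should be much faster there, since it touches only 40 chunks:

microbenchmark(v3 <- h5read(h5, "/data_bycol", list(NULL, cind)), times = 3)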

enable chunked reads on HDF5

Selecting on both the row and column index at once is the slow path in h5read (see the combined benchmark below), so instead read whole rows in blocks, which is cheap given the row-wise chunking, and subset the columns in memory:

library(data.table)
chunked.read <- function(h5, index){
    rind <- index[[1]]
    cind <- index[[2]]

    # read the requested rows in blocks of 20 (the original split(rind, 20)
    # put everything in a single block), then subset the columns in memory
    res <- lapply(split(rind, ceiling(seq_along(rind) / 20)), function(i){
      sub <- h5read(h5, "/data", list(i, NULL))
      data.table(sub[, cind])
    })
    rbindlist(res)
}
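
Quick usage check; note the result comes back as a data.table, so the matrix dimnames are gone and rows follow the order of rind:

v3 <- chunked.read(h5, list(rind, cind))
dim(v3)  # length(rind) rows x length(cind) columns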

try sqlite format

library(RSQLite)
db <- dbConnect(SQLite(), dbname = file.path(path, "Test.sqlite"))
df <- data.frame(mat, check.names = FALSE)
# row.names = TRUE stores the rownames in a row_names column,
# which the WHERE clause below filters on
dbWriteTable(db, "data", df, row.names = TRUE)
coln <- colnames(mat)
rown <- rownames(mat)
rows <- rown[rind]
cols <- coln[cind]

sql <- paste0("select \"", paste(cols, collapse = "\",\""), "\" from data where row_names IN ('", paste(rows, collapse = "','"), "')")
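
The generated statement has this shape (names abbreviated here for illustration):

select "COL1","COL2",... from data where row_names IN ('ROW1','ROW2',...)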

both row & col slicing

microbenchmark(v1 <- mat[rind, cind],
               v2 <- h5read(h5, "/data", list(rind, cind)),
               v3 <- chunked.read(h5, list(rind, cind)),
               v4 <- dbGetQuery(db, sql), times = 2, unit = "ms")
## Unit: milliseconds
##                                         expr         min          lq        mean      median          uq         max neval cld
##                        v1 <- mat[rind, cind]    0.906507    0.906507    1.060264    1.060264    1.214022    1.214022     2  a 
##  v2 <- h5read(h5, "/data", list(rind, cind)) 5071.992337 5071.992337 5190.381699 5190.381699 5308.771062 5308.771062     2   b
##     v3 <- chunked.read(h5, list(rind, cind))  265.223683  265.223683  283.479407  283.479407  301.735132  301.735132     2  a 
##                    v4 <- dbGetQuery(db, sql)   73.862391   73.862391   87.323083   87.323083  100.783776  100.783776     2  a
# all.equal(v1,v2, check.attributes = F)
# all.equal(v1,as.matrix(v4[rows, cols]), check.attributes = F)
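
The HDF5Array package, commented out in the setup chunk, offers another route worth trying; a minimal sketch, assuming the package is installed (timings not benchmarked here):

library(HDF5Array)
hm <- HDF5Array(h5, "data")       # delayed representation, nothing read yet
v5 <- as.matrix(hm[rind, cind])   # realizes only the requested block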

find the bottleneck

Rprof(interval = 2e-2)                    # sample the call stack every 20 ms
v3 <- chunked.read(h5, list(rind, cind))
Rprof(NULL)                               # stop profiling
head(summaryRprof()$by.self)              # top of the self-time ranking