suppressPackageStartupMessages({
    library(knitr)
    library(rhdf5)
    # library(HDF5Array)
    library(microbenchmark)
})
# options(mc.cores = parallel::detectCores() - 1) # if you have multiple cores to spin
options(mc.cores = 1)
knitr::opts_chunk$set(message = FALSE, error = FALSE, warning = FALSE, cache = TRUE,
                      fig.width = 8, fig.height = 6, autodep = TRUE)

Loading data

data(maits, package = 'MAST')
mat <- t(maits$expressionmat)  # transpose so genes are rows and cells are columns
dim(mat)
r <- nrow(mat)
c <- ncol(mat)
set.seed(4)
cind <- sample(c, 40)   # random column (cell) indices
rind <- sample(r, 1e3)  # random row (gene) indices

write to H5 (chunking by row)

path <- "/loc/no-backup/mike/shared"
path <- file.path(path, "MAST")
h5 <- file.path(path, "mat.h5")
h5createFile(h5)
# one chunk per row: row slices stay cheap, column slices will be expensive
h5createDataset(h5, "data", c(r, c), storage.mode = "double", chunk = c(1, c), level = 7)
h5write(mat, h5, "data")
H5close()
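
The benchmarks below hinge entirely on this chunk layout, so it is worth inspecting what was written. A minimal sketch using rhdf5's h5ls(); the "data_bycol" dataset is a hypothetical column-chunked twin added here only for comparison and is not used in the benchmarks:

h5ls(h5, all = TRUE)  # reports dims, chunk size (1 x c) and deflate level
# hypothetical twin chunked by column instead of by row
h5createDataset(h5, "data_bycol", c(r, c), storage.mode = "double",
                chunk = c(r, 1), level = 7)
h5write(mat, h5, "data_bycol")
H5close()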

row slicing (random)

microbenchmark(v1 <- mat[rind,], v2 <- h5read(h5, "/data", list(rind, NULL)), times = 3)
## Unit: microseconds
##                                         expr        min         lq       mean     median         uq        max neval cld
##                            v1 <- mat[rind, ]    533.381   1152.203   1469.696   1771.025   1937.854   2104.683     3  a 
##  v2 <- h5read(h5, "/data", list(rind, NULL)) 168062.401 184190.274 202481.824 200318.147 219691.535 239064.924     3   b
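
A quick sanity check that the two slices agree (same pattern as the commented-out all.equal calls further down; h5read drops the dimnames, hence check.attributes = FALSE):

stopifnot(isTRUE(all.equal(v1, v2, check.attributes = FALSE)))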

col slicing (random)

microbenchmark(v1 <- mat[,cind], v2 <- h5read(h5, "/data", list(NULL, cind)), times = 3)
## Unit: milliseconds
##                                         expr        min         lq       mean     median         uq        max neval cld
##                            v1 <- mat[, cind]   6.487641   7.402412   8.023952   8.317182   8.792108   9.267033     3  a 
##  v2 <- h5read(h5, "/data", list(NULL, cind)) 602.492785 607.602092 610.904860 612.711399 615.110897 617.510396     3   b
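
A column slice cuts across every row chunk, so essentially the whole file gets decompressed per read. If the hypothetical "data_bycol" twin from the sketch above was written, the same slice should be much faster there, since it touches only 40 chunks:

microbenchmark(v3 <- h5read(h5, "/data_bycol", list(NULL, cind)), times = 3)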

enable chunked reads on HDF5

Selecting on both the row and column index at once is the slow path in h5read (see the combined benchmark below), so instead read whole rows in blocks, which is cheap given the row-wise chunking, and subset the columns in memory:

library(data.table)
chunked.read <- function(h5, index){
    rind <- index[[1]]
    cind <- index[[2]]

    # read the requested rows in blocks of 20 (the original split(rind, 20)
    # put everything in a single block), then subset the columns in memory
    res <- lapply(split(rind, ceiling(seq_along(rind) / 20)), function(i){
      sub <- h5read(h5, "/data", list(i, NULL))
      data.table(sub[, cind])
    })
    rbindlist(res)
}
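
Quick usage check; note the result comes back as a data.table, so the matrix dimnames are gone and rows follow the order of rind:

v3 <- chunked.read(h5, list(rind, cind))
dim(v3)  # length(rind) rows x length(cind) columns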

try sqlite format

library(RSQLite)
db <- dbConnect(SQLite(), dbname = file.path(path, "Test.sqlite"))
df <- data.frame(mat, check.names = FALSE)
# row.names = TRUE stores the rownames in a row_names column,
# which the WHERE clause below filters on
dbWriteTable(db, "data", df, row.names = TRUE)
coln <- colnames(mat)
rown <- rownames(mat)
rows <- rown[rind]
cols <- coln[cind]

sql <- paste0("select \"", paste(cols, collapse = "\",\""), "\" from data where row_names IN ('", paste(rows, collapse = "','"), "')")
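
The generated statement has this shape (names abbreviated here for illustration):

select "COL1","COL2",... from data where row_names IN ('ROW1','ROW2',...)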

both row & col slicing

microbenchmark(v1 <- mat[rind, cind],
               v2 <- h5read(h5, "/data", list(rind, cind)),
               v3 <- chunked.read(h5, list(rind, cind)),
               v4 <- dbGetQuery(db, sql), times = 2, unit = "ms")
## Unit: milliseconds
##                                         expr         min          lq        mean      median          uq         max neval cld
##                        v1 <- mat[rind, cind]    0.906507    0.906507    1.060264    1.060264    1.214022    1.214022     2  a 
##  v2 <- h5read(h5, "/data", list(rind, cind)) 5071.992337 5071.992337 5190.381699 5190.381699 5308.771062 5308.771062     2   b
##     v3 <- chunked.read(h5, list(rind, cind))  265.223683  265.223683  283.479407  283.479407  301.735132  301.735132     2  a 
##                    v4 <- dbGetQuery(db, sql)   73.862391   73.862391   87.323083   87.323083  100.783776  100.783776     2  a
# all.equal(v1,v2, check.attributes = F)
# all.equal(v1,as.matrix(v4[rows, cols]), check.attributes = F)
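
The HDF5Array package, commented out in the setup chunk, offers another route worth trying; a minimal sketch, assuming the package is installed (timings not benchmarked here):

library(HDF5Array)
hm <- HDF5Array(h5, "data")       # delayed representation, nothing read yet
v5 <- as.matrix(hm[rind, cind])   # realizes only the requested block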

find the bottleneck

Rprof(interval = 2e-2)                    # sample the call stack every 20 ms
v3 <- chunked.read(h5, list(rind, cind))
Rprof(NULL)                               # stop profiling
head(summaryRprof()$by.self)              # top of the self-time ranking