suppressPackageStartupMessages({
library(knitr)
library(rhdf5)
# library(HDF5Array)
library(microbenchmark)
})
#options(mc.cores = detectCores() - 1) #if you have multiple cores to spin
options(mc.cores = 1)
knitr::opts_chunk$set(message = FALSE,error = FALSE,warning = FALSE,cache = TRUE,fig.width=8,fig.height=6, auto.dep=TRUE)
Loading data
data(maits, package='MAST')
mat <- t(maits$expressionmat)
dim(mat)
r <- nrow(mat)
c <- ncol(mat)
set.seed(4)
cind <- sample(c,40)
rind <- sample(r, 1e3)
write to H5 (chunking by row)
path <- "/loc/no-backup/mike/shared"
path <- file.path(path, "MAST")
h5 <- file.path(path, "mat.h5")
h5createFile(h5)
h5createDataset(h5, "data", c(r,c), storage.mode = "double", chunk=c(1,c), level=7)
h5write(mat, h5, "data")
H5close()
row slicing (random)
microbenchmark(v1 <- mat[rind,], v2 <- h5read(h5, "/data", list(rind, NULL)), times = 3)
## Unit: microseconds
## expr min lq
## v1 <- mat[rind, ] 533.381 1152.203
## v2 <- h5read(h5, "/data", list(rind, NULL)) 168062.401 184190.274
## mean median uq max neval cld
## 1469.696 1771.025 1937.854 2104.683 3 a
## 202481.824 200318.147 219691.535 239064.924 3 b
col slicing (random)
microbenchmark(v1 <- mat[,cind], v2 <- h5read(h5, "/data", list(NULL, cind)), times = 3)
## Unit: milliseconds
## expr min lq
## v1 <- mat[, cind] 6.487641 7.402412
## v2 <- h5read(h5, "/data", list(NULL, cind)) 602.492785 607.602092
## mean median uq max neval cld
## 8.023952 8.317182 8.792108 9.267033 3 a
## 610.904860 612.711399 615.110897 617.510396 3 b
enable chunked read on HDF
library(data.table)
chunked.read <- function(h5, index){
rind <- index[[1]]
cind <- index[[2]]
res <- lapply(split(rind, 20), function(i){
sub <- h5read(h5, "/data", list(i, NULL))
data.table(sub[,cind])
})
rbindlist(res)
}
both row & col slicing
microbenchmark(v1 <- mat[rind,cind]
, v2 <- h5read(h5, "/data", list(rind, cind))
, v3 <- chunked.read(h5, list(rind, cind))
,v4 <- dbGetQuery(db, sql), times = 2, unit = "ms")
## Unit: milliseconds
## expr min lq
## v1 <- mat[rind, cind] 0.906507 0.906507
## v2 <- h5read(h5, "/data", list(rind, cind)) 5071.992337 5071.992337
## v3 <- chunked.read(h5, list(rind, cind)) 265.223683 265.223683
## v4 <- dbGetQuery(db, sql) 73.862391 73.862391
## mean median uq max neval cld
## 1.060264 1.060264 1.214022 1.214022 2 a
## 5190.381699 5190.381699 5308.771062 5308.771062 2 b
## 283.479407 283.479407 301.735132 301.735132 2 a
## 87.323083 87.323083 100.783776 100.783776 2 a
# all.equal(v1,v2, check.attributes = F)
# all.equal(v1,as.matrix(v4[rows, cols]), check.attributes = F)
find out the bottleneck
# Rprof(interval = 2e-2)
# v3 <- chunked.read(h5, list(rind, cind))
# Rprof(NULL)
# summaryRprof()