library(rhdf5)
library(TENxGenomics)
library(microbenchmark)
path <- "/loc/no-backup/mike/shared"
path <- file.path(path, "h5test")
orig <- file.path(path, "1M_neurons_neuron20k.h5")
tenx <- TENxGenomics(orig)
tenx
## class: TENxGenomics
## h5path: /loc/no-backup/mike/shared/h5test/1M_neurons_neuron20k.h5
## dim(): 27998 x 20000
r <- nrow(tenx)
c <- ncol(tenx)
compressed <- file.path(path, "compressed.h5")
h5createFile(compressed)
h5createGroup(compressed, "mm10")
uncompressed <- file.path(path, "uncompressed.h5")
h5createFile(uncompressed)
h5createGroup(uncompressed, "mm10")
##write the data
data <- as.matrix(tenx)
h5createDataset(compressed, "/mm10/data", c(r,c), storage.mode = "integer", chunk=c(r,1), level=7)
h5write(data, compressed, "/mm10/data")
h5createDataset(uncompressed, "/mm10/data", c(r,c), storage.mode = "integer", chunk=c(r,1), level=0)
h5write(data, uncompressed, "/mm10/data")
library(RSQLite)
dbfile <- file.path(path, "Test.sqlite")
db = dbConnect(SQLite(), dbname=dbfile)
kv <- TENxGenomics:::.values_and_indices(tenx)
df <- data.frame(kv, check.names = FALSE)
dbWriteTable(db, "data", df, overwrite = T)
# sql <- "select from data where cidx in (1,2) and ridx in (1,3)"
# v4 <- dbGetQuery(db, sql)
utils:::format.object_size(file.size(orig), unit= "MB")
## [1] "58.8 Mb"
utils:::format.object_size(file.size(compressed), unit= "MB")
## [1] "72.6 Mb"
utils:::format.object_size(file.size(uncompressed), unit= "MB")
## [1] "2136.4 Mb"
utils:::format.object_size(file.size(dbfile), unit= "MB")
## [1] "2067.8 Mb"
set.seed(4)
rindx <- sample(r, 10)
cindx <- sample(c, 10)
microbenchmark(sub1 <- as.matrix(tenx[,cindx], withDimnames = F)
,sub2 <- h5read(compressed, "/mm10/data", index=list(NULL,cindx))
,sub3 <- h5read(uncompressed, "/mm10/data", index=list(NULL,cindx))
, sub4 <- readDB(db, NULL, cindx)
, times = 2)
## Unit: milliseconds
## expr
## sub1 <- as.matrix(tenx[, cindx], withDimnames = F)
## sub2 <- h5read(compressed, "/mm10/data", index = list(NULL, cindx))
## sub3 <- h5read(uncompressed, "/mm10/data", index = list(NULL, cindx))
## sub4 <- readDB(db, NULL, cindx)
## min lq mean median uq max neval cld
## 61.77157 61.77157 75.81512 75.81512 89.85867 89.85867 2 a
## 43.18017 43.18017 49.30964 49.30964 55.43911 55.43911 2 a
## 22.49010 22.49010 61.62949 61.62949 100.76888 100.76888 2 a
## 72.76032 72.76032 98.65715 98.65715 124.55399 124.55399 2 a
all.equal(sub1, sub4, check.attributes = FALSE)
## [1] TRUE
microbenchmark(sub1 <- as.matrix(tenx[rindx,], withDimnames = F)
,sub2 <- h5read(compressed, "/mm10/data", index=list(rindx, NULL))
,sub3 <- h5read(uncompressed, "/mm10/data", index=list(rindx,NULL))
,sub4 <- readDB(db, rindx, NULL)
, times = 2)
## Unit: milliseconds
## expr
## sub1 <- as.matrix(tenx[rindx, ], withDimnames = F)
## sub2 <- h5read(compressed, "/mm10/data", index = list(rindx, NULL))
## sub3 <- h5read(uncompressed, "/mm10/data", index = list(rindx, NULL))
## sub4 <- readDB(db, rindx, NULL)
## min lq mean median uq max neval
## 63697.5890 63697.5890 65754.0878 65754.0878 67810.5867 67810.5867 2
## 3725.1717 3725.1717 3729.0925 3729.0925 3733.0133 3733.0133 2
## 421.1347 421.1347 448.8358 448.8358 476.5369 476.5369 2
## 154.4599 154.4599 185.1890 185.1890 215.9181 215.9181 2
## cld
## b
## a
## a
## a
all.equal(sub1, sub4, check.attributes = FALSE)
## [1] TRUE
microbenchmark(sub1 <- as.matrix(tenx[rindx, cindx])
,sub2 <- h5read(compressed, "/mm10/data", index=list(rindx, cindx))
,sub3 <- h5read(uncompressed, "/mm10/data", index=list(rindx, cindx))
, sub4 <- readDB(db, rindx, cindx)
, times = 2)
## Unit: milliseconds
## expr
## sub1 <- as.matrix(tenx[rindx, cindx])
## sub2 <- h5read(compressed, "/mm10/data", index = list(rindx, cindx))
## sub3 <- h5read(uncompressed, "/mm10/data", index = list(rindx, cindx))
## sub4 <- readDB(db, rindx, cindx)
## min lq mean median uq max neval cld
## 80.603914 80.603914 82.990128 82.990128 85.376342 85.376342 2 c
## 12.391225 12.391225 12.725957 12.725957 13.060688 13.060688 2 b
## 8.694919 8.694919 9.060401 9.060401 9.425883 9.425883 2 ab
## 2.066368 2.066368 3.615582 3.615582 5.164796 5.164796 2 a
all.equal(sub1, sub4, check.attributes = FALSE)
## [1] TRUE
H5close()
Rprof()
sub1 <- as.matrix(tenx[rindx, cindx])
Rprof(NULL)
summaryRprof()
## $by.self
## self.time self.pct total.time total.pct
## "h5checktype" 0.02 33.33 0.06 100.00
## "h5const2Factor" 0.02 33.33 0.02 33.33
## "stopifnot" 0.02 33.33 0.02 33.33
##
## $by.total
## total.time total.pct self.time self.pct
## "h5checktype" 0.06 100.00 0.02 33.33
## "as.matrix.TENxGenomics" 0.06 100.00 0.00 0.00
## "as.matrix" 0.06 100.00 0.00 0.00
## "block_exec" 0.06 100.00 0.00 0.00
## "call_block" 0.06 100.00 0.00 0.00
## "doTryCatch" 0.06 100.00 0.00 0.00
## "eval" 0.06 100.00 0.00 0.00
## "evaluate_call" 0.06 100.00 0.00 0.00
## "evaluate" 0.06 100.00 0.00 0.00
## "h5read" 0.06 100.00 0.00 0.00
## "handle" 0.06 100.00 0.00 0.00
## "in_dir" 0.06 100.00 0.00 0.00
## "knitr::knit" 0.06 100.00 0.00 0.00
## "process_file" 0.06 100.00 0.00 0.00
## "process_group.block" 0.06 100.00 0.00 0.00
## "process_group" 0.06 100.00 0.00 0.00
## "rmarkdown::render" 0.06 100.00 0.00 0.00
## "standardGeneric" 0.06 100.00 0.00 0.00
## "timing_fn" 0.06 100.00 0.00 0.00
## "try" 0.06 100.00 0.00 0.00
## "tryCatch" 0.06 100.00 0.00 0.00
## "tryCatchList" 0.06 100.00 0.00 0.00
## "tryCatchOne" 0.06 100.00 0.00 0.00
## "withCallingHandlers" 0.06 100.00 0.00 0.00
## "withVisible" 0.06 100.00 0.00 0.00
## ".values_and_indices" 0.04 66.67 0.00 0.00
## "H5Dread" 0.04 66.67 0.00 0.00
## "h5readDataset" 0.04 66.67 0.00 0.00
## "h5const2Factor" 0.02 33.33 0.02 33.33
## "stopifnot" 0.02 33.33 0.02 33.33
## "dimnames" 0.02 33.33 0.00 0.00
## "H5Aexists" 0.02 33.33 0.00 0.00
## "H5Dopen" 0.02 33.33 0.00 0.00
## "H5Iget_type" 0.02 33.33 0.00 0.00
## "H5Iis_valid" 0.02 33.33 0.00 0.00
## "matrix" 0.02 33.33 0.00 0.00
##
## $sample.interval
## [1] 0.02
##
## $sampling.time
## [1] 0.06