Load data from 10x file that was stored as CSC

library(rhdf5)
library(TENxGenomics)
library(microbenchmark)
path <- "/loc/no-backup/mike/shared"
path <- file.path(path, "h5test")
orig <- file.path(path, "1M_neurons_neuron20k.h5")
tenx <- TENxGenomics(orig)
tenx
## class: TENxGenomics 
## h5path: /loc/no-backup/mike/shared/h5test/1M_neurons_neuron20k.h5 
## dim(): 27998 x 20000
r <- nrow(tenx)
c <- ncol(tenx)

convert to 2d matrix format

compressed

compressed <- file.path(path, "compressed.h5")
h5createFile(compressed)
h5createGroup(compressed, "mm10")

uncompressed

uncompressed <- file.path(path, "uncompressed.h5")
h5createFile(uncompressed)
h5createGroup(uncompressed, "mm10")

##write the data

data <- as.matrix(tenx)
h5createDataset(compressed, "/mm10/data", c(r,c), storage.mode = "integer", chunk=c(r,1), level=7)
h5write(data, compressed, "/mm10/data")
h5createDataset(uncompressed, "/mm10/data", c(r,c), storage.mode = "integer", chunk=c(r,1), level=0)
h5write(data, uncompressed, "/mm10/data")

store it in sqlite db as Coordinate list (COO) structure

library(RSQLite)
dbfile <- file.path(path, "Test.sqlite")
db = dbConnect(SQLite(), dbname=dbfile)
kv <- TENxGenomics:::.values_and_indices(tenx)
df <- data.frame(kv, check.names = FALSE)
dbWriteTable(db, "data", df, overwrite = T)

# sql <- "select  from data where cidx in (1,2) and ridx in (1,3)"
# v4 <- dbGetQuery(db, sql)

compare the file size (MB)

utils:::format.object_size(file.size(orig), unit= "MB")
## [1] "58.8 Mb"
utils:::format.object_size(file.size(compressed), unit= "MB")
## [1] "72.6 Mb"
utils:::format.object_size(file.size(uncompressed), unit= "MB")
## [1] "2136.4 Mb"
utils:::format.object_size(file.size(dbfile), unit= "MB")
## [1] "2067.8 Mb"

compare matrix subsetting speed

set.seed(4)
rindx <- sample(r, 10)
cindx <- sample(c, 10)

colum-wise

microbenchmark(sub1 <- as.matrix(tenx[,cindx], withDimnames = F)
                ,sub2 <- h5read(compressed, "/mm10/data", index=list(NULL,cindx))
                ,sub3 <- h5read(uncompressed, "/mm10/data", index=list(NULL,cindx))
                , sub4 <- readDB(db, NULL, cindx)
               , times = 2)
## Unit: milliseconds
##                                                                        expr
##                          sub1 <- as.matrix(tenx[, cindx], withDimnames = F)
##         sub2 <- h5read(compressed, "/mm10/data", index = list(NULL, cindx))
##  sub3 <- h5read(uncompressed, "/mm10/data", index = list(NULL,      cindx))
##                                             sub4 <- readDB(db, NULL, cindx)
##       min       lq     mean   median        uq       max neval cld
##  61.77157 61.77157 75.81512 75.81512  89.85867  89.85867     2   a
##  43.18017 43.18017 49.30964 49.30964  55.43911  55.43911     2   a
##  22.49010 22.49010 61.62949 61.62949 100.76888 100.76888     2   a
##  72.76032 72.76032 98.65715 98.65715 124.55399 124.55399     2   a
all.equal(sub1, sub4, check.attributes = FALSE)
## [1] TRUE

row-wise

microbenchmark(sub1 <- as.matrix(tenx[rindx,], withDimnames = F)
               ,sub2 <- h5read(compressed, "/mm10/data", index=list(rindx, NULL))
               ,sub3 <- h5read(uncompressed, "/mm10/data", index=list(rindx,NULL))
               ,sub4 <- readDB(db, rindx, NULL)
               , times = 2)
## Unit: milliseconds
##                                                                        expr
##                          sub1 <- as.matrix(tenx[rindx, ], withDimnames = F)
##    sub2 <- h5read(compressed, "/mm10/data", index = list(rindx,      NULL))
##  sub3 <- h5read(uncompressed, "/mm10/data", index = list(rindx,      NULL))
##                                             sub4 <- readDB(db, rindx, NULL)
##         min         lq       mean     median         uq        max neval
##  63697.5890 63697.5890 65754.0878 65754.0878 67810.5867 67810.5867     2
##   3725.1717  3725.1717  3729.0925  3729.0925  3733.0133  3733.0133     2
##    421.1347   421.1347   448.8358   448.8358   476.5369   476.5369     2
##    154.4599   154.4599   185.1890   185.1890   215.9181   215.9181     2
##  cld
##    b
##   a 
##   a 
##   a
all.equal(sub1, sub4, check.attributes = FALSE)
## [1] TRUE

row & col

microbenchmark(sub1 <- as.matrix(tenx[rindx, cindx])
               ,sub2 <- h5read(compressed, "/mm10/data", index=list(rindx, cindx))
               ,sub3 <- h5read(uncompressed, "/mm10/data", index=list(rindx, cindx))
               , sub4 <- readDB(db, rindx, cindx)
                 , times = 2)
## Unit: milliseconds
##                                                                         expr
##                                        sub1 <- as.matrix(tenx[rindx, cindx])
##    sub2 <- h5read(compressed, "/mm10/data", index = list(rindx,      cindx))
##  sub3 <- h5read(uncompressed, "/mm10/data", index = list(rindx,      cindx))
##                                             sub4 <- readDB(db, rindx, cindx)
##        min        lq      mean    median        uq       max neval cld
##  80.603914 80.603914 82.990128 82.990128 85.376342 85.376342     2   c
##  12.391225 12.391225 12.725957 12.725957 13.060688 13.060688     2  b 
##   8.694919  8.694919  9.060401  9.060401  9.425883  9.425883     2 ab 
##   2.066368  2.066368  3.615582  3.615582  5.164796  5.164796     2 a
all.equal(sub1, sub4, check.attributes = FALSE)
## [1] TRUE
H5close()



Rprof()
sub1 <- as.matrix(tenx[rindx, cindx])
Rprof(NULL)
summaryRprof()
## $by.self
##                  self.time self.pct total.time total.pct
## "h5checktype"         0.02    33.33       0.06    100.00
## "h5const2Factor"      0.02    33.33       0.02     33.33
## "stopifnot"           0.02    33.33       0.02     33.33
## 
## $by.total
##                          total.time total.pct self.time self.pct
## "h5checktype"                  0.06    100.00      0.02    33.33
## "as.matrix.TENxGenomics"       0.06    100.00      0.00     0.00
## "as.matrix"                    0.06    100.00      0.00     0.00
## "block_exec"                   0.06    100.00      0.00     0.00
## "call_block"                   0.06    100.00      0.00     0.00
## "doTryCatch"                   0.06    100.00      0.00     0.00
## "eval"                         0.06    100.00      0.00     0.00
## "evaluate_call"                0.06    100.00      0.00     0.00
## "evaluate"                     0.06    100.00      0.00     0.00
## "h5read"                       0.06    100.00      0.00     0.00
## "handle"                       0.06    100.00      0.00     0.00
## "in_dir"                       0.06    100.00      0.00     0.00
## "knitr::knit"                  0.06    100.00      0.00     0.00
## "process_file"                 0.06    100.00      0.00     0.00
## "process_group.block"          0.06    100.00      0.00     0.00
## "process_group"                0.06    100.00      0.00     0.00
## "rmarkdown::render"            0.06    100.00      0.00     0.00
## "standardGeneric"              0.06    100.00      0.00     0.00
## "timing_fn"                    0.06    100.00      0.00     0.00
## "try"                          0.06    100.00      0.00     0.00
## "tryCatch"                     0.06    100.00      0.00     0.00
## "tryCatchList"                 0.06    100.00      0.00     0.00
## "tryCatchOne"                  0.06    100.00      0.00     0.00
## "withCallingHandlers"          0.06    100.00      0.00     0.00
## "withVisible"                  0.06    100.00      0.00     0.00
## ".values_and_indices"          0.04     66.67      0.00     0.00
## "H5Dread"                      0.04     66.67      0.00     0.00
## "h5readDataset"                0.04     66.67      0.00     0.00
## "h5const2Factor"               0.02     33.33      0.02    33.33
## "stopifnot"                    0.02     33.33      0.02    33.33
## "dimnames"                     0.02     33.33      0.00     0.00
## "H5Aexists"                    0.02     33.33      0.00     0.00
## "H5Dopen"                      0.02     33.33      0.00     0.00
## "H5Iget_type"                  0.02     33.33      0.00     0.00
## "H5Iis_valid"                  0.02     33.33      0.00     0.00
## "matrix"                       0.02     33.33      0.00     0.00
## 
## $sample.interval
## [1] 0.02
## 
## $sampling.time
## [1] 0.06