library(rhdf5)
library(TENxGenomics)
library(microbenchmark)

Load data from 10x file that was stored as RLE

orig <- "/loc/no-backup/mike/1M_neurons_neuron20k.h5"
tenx <- TENxGenomics(orig)
tenx
## class: TENxGenomics 
## h5path: /loc/no-backup/mike/1M_neurons_neuron20k.h5 
## dim(): 27998 x 20000
data <- as.matrix(tenx)
r <- nrow(data)
c <- ncol(data)

convert to 2d sparse matrix format (compressed & uncompressed)

compressed <- "/loc/no-backup/mike/compressed.h5"
h5createFile(compressed)
h5createGroup(compressed, "mm10")
uncompressed <- "/loc/no-backup/mike/uncompressed.h5"
h5createFile(uncompressed)
h5createGroup(uncompressed, "mm10")

##write the data

h5createDataset(compressed, "/mm10/data", c(r,c), storage.mode = "integer", chunk=c(r,1), level=7)
h5write(data, compressed, "/mm10/data")
h5createDataset(uncompressed, "/mm10/data", c(r,c), storage.mode = "integer", chunk=c(r,1), level=0)
h5write(data, uncompressed, "/mm10/data")

write gene and barcodes vectors(unchanged) back to h5

barcodes <- h5read(orig, "/mm10/barcodes") 
h5write(barcodes, compressed, "/mm10/barcodes", level = 1)
h5write(barcodes, uncompressed, "/mm10/barcodes", level = 1)

genes <- h5read(orig, "/mm10/genes") 
h5write(genes, compressed, "/mm10/genes", level = 1)
h5write(genes, uncompressed, "/mm10/genes", level = 1)

gene_names <- h5read(orig, "/mm10/gene_names") 
h5write(barcodes, compressed, "/mm10/gene_names", level = 1)
h5write(barcodes, uncompressed, "/mm10/gene_names", level = 1)

compare the h5 size (MB)

file.size(orig)/2^20
## [1] 58.83689
file.size(compressed)/2^20
## [1] 72.57142
file.size(uncompressed)/2^20
## [1] 2136.436

compare matrix retrieve speed

continous block

microbenchmark(sub <- as.matrix(tenx[, 1:100])
               ,sub1 <- h5read(compressed, "/mm10/data", index=list(NULL,1:100))
               ,sub2 <- h5read(uncompressed, "/mm10/data", index=list(NULL,1:100))
               , times = 10)
## Unit: milliseconds
##                                                                        expr
##                                             sub <- as.matrix(tenx[, 1:100])
##         sub1 <- h5read(compressed, "/mm10/data", index = list(NULL, 1:100))
##  sub2 <- h5read(uncompressed, "/mm10/data", index = list(NULL,      1:100))
##        min        lq      mean    median        uq       max neval
##  168.37174 174.80739 189.26857 186.95237 195.01344 233.33392    10
##   48.52326  49.07636  56.72171  56.61076  59.46967  73.10114    10
##   33.59638  40.24370  48.15406  45.05596  49.69418  76.76322    10
all.equal(sub, sub1, check.attributes = FALSE)
## [1] TRUE
all.equal(sub, sub2, check.attributes = FALSE)
## [1] TRUE

random slices

set.seed(4)
rindx <- sample(r, 10)
cindx <- sample(c, 100)
microbenchmark(randomM <- as.matrix(tenx[rindx, cindx])
               ,randomM1 <- h5read(compressed, "/mm10/data", index=list(rindx, cindx))
               ,randomM2 <- h5read(uncompressed, "/mm10/data", index=list(rindx, cindx))
                 , times = 10)
## Unit: milliseconds
##                                                                             expr
##                                         randomM <- as.matrix(tenx[rindx, cindx])
##    randomM1 <- h5read(compressed, "/mm10/data", index = list(rindx,      cindx))
##  randomM2 <- h5read(uncompressed, "/mm10/data", index = list(rindx,      cindx))
##        min        lq      mean    median        uq       max neval
##  194.10781 194.47665 201.73349 197.33004 200.44720 225.11127    10
##   30.63320  31.10219  32.53921  31.49208  33.47150  38.84446    10
##   10.25918  10.75929  16.74423  11.24669  11.94802  66.75384    10
all.equal(randomM, randomM1, check.attributes = FALSE)
## [1] TRUE
all.equal(randomM, randomM2, check.attributes = FALSE)
## [1] TRUE
H5close()