library(rhdf5)
library(TENxGenomics)
## Loading required package: Matrix
## Loading required package: SummarizedExperiment
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:Matrix':
## 
##     colMeans, colSums, rowMeans, rowSums, which
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colMeans,
##     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply,
##     lengths, Map, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
##     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:Matrix':
## 
##     expand
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:httr':
## 
##     content
## Loading required package: DelayedArray
## Loading required package: matrixStats
## matrixStats v0.51.0 (2016-10-08) successfully loaded. See ?matrixStats for help.
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:base':
## 
##     apply
library(microbenchmark)
path <- "/loc/no-backup/mike/shared"
path <- file.path(path, "h5test")

Load data from 10x file that was stored as CSC

orig <- file.path(path, "1M_neurons_neuron20k.h5")
tenx <- TENxGenomics(orig)
tenx
## class: TENxGenomics 
## h5path: /loc/no-backup/mike/shared/h5test/1M_neurons_neuron20k.h5 
## dim(): 27998 x 20000
data <- as.matrix(tenx)

get thesize of data

r <- nrow(tenx)
c <- ncol(tenx)

convert to 2d matrix format

compressed

compressed <- file.path(path, "compressed.h5")
h5createFile(compressed)
h5createGroup(compressed, "mm10")

uncompressed

uncompressed <- file.path(path, "uncompressed.h5")
h5createFile(uncompressed)
h5createGroup(uncompressed, "mm10")

##write the data

h5createDataset(compressed, "/mm10/data", c(r,c), storage.mode = "integer", chunk=c(r,1), level=7)
h5write(data, compressed, "/mm10/data")
h5createDataset(uncompressed, "/mm10/data", c(r,c), storage.mode = "integer", chunk=c(r,1), level=0)
h5write(data, uncompressed, "/mm10/data")

write gene and barcodes vectors(unchanged) back to h5

barcodes <- h5read(orig, "/mm10/barcodes") 
h5write(barcodes, compressed, "/mm10/barcodes", level = 1)
h5write(barcodes, uncompressed, "/mm10/barcodes", level = 1)
genes <- h5read(orig, "/mm10/genes") 
h5write(genes, compressed, "/mm10/genes", level = 1)
h5write(genes, uncompressed, "/mm10/genes", level = 1)
gene_names <- h5read(orig, "/mm10/gene_names") 
h5write(barcodes, compressed, "/mm10/gene_names", level = 1)
h5write(barcodes, uncompressed, "/mm10/gene_names", level = 1)

compare the h5 size (MB)

utils:::format.object_size(file.size(orig), unit= "MB")
## [1] "58.8 Mb"
utils:::format.object_size(file.size(compressed), unit= "MB")
## [1] "72.6 Mb"
utils:::format.object_size(file.size(uncompressed), unit= "MB")
## [1] "2136.4 Mb"

compare matrix subsetting speed

continous block (colum-wise)

ind <- 1:100
microbenchmark(sub1 <- as.matrix(tenx[,ind], withDimnames = F)
                ,sub2 <- h5read(compressed, "/mm10/data", index=list(NULL,ind))
                ,sub3 <- h5read(uncompressed, "/mm10/data", index=list(NULL,ind))
               , times = 10)
## Unit: milliseconds
##                                                                      expr
##                          sub1 <- as.matrix(tenx[, ind], withDimnames = F)
##         sub2 <- h5read(compressed, "/mm10/data", index = list(NULL, ind))
##  sub3 <- h5read(uncompressed, "/mm10/data", index = list(NULL,      ind))
##        min        lq      mean    median        uq      max neval
##  164.03254 169.81935 324.77722 327.98678 454.53002 541.3881    10
##   64.90005  65.33857  94.36689  66.54846  67.64308 345.3854    10
##   46.99974  48.24214 165.70198  49.18509 333.80274 366.2625    10
all.equal(sub1, sub2, check.attributes = FALSE)
## [1] TRUE
all.equal(sub1, sub3, check.attributes = FALSE)
## [1] TRUE

continous block (row-wise)

ind <- 1:2
microbenchmark(sub1 <- as.matrix(tenx[ind,], withDimnames = F)
               ,sub2 <- h5read(compressed, "/mm10/data", index=list(ind, NULL))
               ,sub3 <- h5read(uncompressed, "/mm10/data", index=list(ind,NULL))
               , times = 5)
## Unit: milliseconds
##                                                                      expr
##                          sub1 <- as.matrix(tenx[ind, ], withDimnames = F)
##         sub2 <- h5read(compressed, "/mm10/data", index = list(ind, NULL))
##  sub3 <- h5read(uncompressed, "/mm10/data", index = list(ind,      NULL))
##         min         lq       mean     median        uq        max neval
##  29791.1177 36275.5221 36559.7265 37871.0860 39358.578 39502.3288     5
##   3534.8459  3566.7625  3607.9368  3586.0279  3638.774  3713.2738     5
##    246.5934   249.0551   250.7899   252.0176   252.869   253.4145     5
all.equal(sub1, sub2, check.attributes = FALSE)
## [1] TRUE
all.equal(sub1, sub3, check.attributes = FALSE)
## [1] TRUE

random slices

set.seed(4)
rindx <- sample(r, 100)
cindx <- sample(c, 100)
microbenchmark(randomM1 <- as.matrix(tenx[rindx, cindx])
               ,randomM2 <- as.matrix(tenx[rindx, cindx], cpp = T)
               ,randomM3 <- h5read(compressed, "/mm10/data", index=list(rindx, cindx))
               ,randomM4 <- h5read(uncompressed, "/mm10/data", index=list(rindx, cindx))
                 , times = 10)
## Unit: milliseconds
##                                                                             expr
##                                        randomM1 <- as.matrix(tenx[rindx, cindx])
##                               randomM2 <- as.matrix(tenx[rindx, cindx], cpp = T)
##    randomM3 <- h5read(compressed, "/mm10/data", index = list(rindx,      cindx))
##  randomM4 <- h5read(uncompressed, "/mm10/data", index = list(rindx,      cindx))
##        min        lq      mean    median        uq       max neval
##  286.18396 294.12492 301.13529 299.42046 309.88381 319.65699    10
##  149.09306 150.30422 151.19445 151.23214 151.78208 153.63409    10
##   91.24773  92.55357  95.24850  94.93860  98.28324 100.32897    10
##   63.10031  63.49637  65.59568  65.02022  68.02447  69.05132    10
all.equal(randomM1, randomM3, check.attributes = FALSE)
## [1] TRUE
all.equal(randomM1, randomM4, check.attributes = FALSE)
## [1] TRUE
H5close()

some more benchmark for in-memory version of slicing mat vs sparsed mat(CSV)

library(Matrix)
mat <- as.matrix(tenx[1:1000,1:1000], withDimnames = F)
smat <- Matrix(mat, sparse = T)
format(object.size(mat), units = "MB")
## [1] "3.8 Mb"
format(object.size(smat), units = "MB")
## [1] "0.8 Mb"
set.seed(3)
r <- sample(1e3, 100)
c <- sample(1e3, 100)
microbenchmark(
  v1 <- colSums(mat)
  ,v2 <- Matrix::colSums(smat)
  ,v3 <- rowSums(mat)
  ,v4 <- Matrix::rowSums(smat)
  ,v5 <- mat[r,c]
  ,v6 <- smat[r,c]
  , times = 10
)
## Unit: microseconds
##                         expr      min       lq      mean    median
##           v1 <- colSums(mat)  918.706  949.550 1020.4976 1006.9150
##  v2 <- Matrix::colSums(smat)  311.717  321.235  339.1373  341.9290
##           v3 <- rowSums(mat) 3221.065 3270.296 3353.0933 3286.3870
##  v4 <- Matrix::rowSums(smat)  791.918  804.828  987.3554  828.0165
##              v5 <- mat[r, c]   89.513   91.258   92.6637   92.1795
##             v6 <- smat[r, c]  205.655  213.107 2306.1442  225.3290
##        uq       max neval
##  1025.217  1361.106    10
##   347.333   378.643    10
##  3380.442  3845.425    10
##  1028.226  1881.332    10
##    93.600    98.102    10
##   238.732 21045.595    10
all.equal(v1, v2)
## [1] TRUE
all.equal(v3, v4)
## [1] TRUE
# Rprof()
# sub <- as.matrix(tenx[1:1000, 1:1000], cpp = T)
# Rprof(NULL)
# summaryRprof()