library(rhdf5)
library(TENxGenomics)
## Loading required package: Matrix
## Loading required package: SummarizedExperiment
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:Matrix':
##
## colMeans, colSums, rowMeans, rowSums, which
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colMeans,
## colnames, colSums, do.call, duplicated, eval, evalq, Filter,
## Find, get, grep, grepl, intersect, is.unsorted, lapply,
## lengths, Map, mapply, match, mget, order, paste, pmax,
## pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
## rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
## tapply, union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:Matrix':
##
## expand
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:httr':
##
## content
## Loading required package: DelayedArray
## Loading required package: matrixStats
## matrixStats v0.51.0 (2016-10-08) successfully loaded. See ?matrixStats for help.
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:base':
##
## apply
library(microbenchmark)
path <- "/loc/no-backup/mike/shared"
path <- file.path(path, "h5test")
orig <- file.path(path, "1M_neurons_neuron20k.h5")
tenx <- TENxGenomics(orig)
tenx
## class: TENxGenomics
## h5path: /loc/no-backup/mike/shared/h5test/1M_neurons_neuron20k.h5
## dim(): 27998 x 20000
data <- as.matrix(tenx)
r <- nrow(tenx)
c <- ncol(tenx)
compressed <- file.path(path, "compressed.h5")
h5createFile(compressed)
h5createGroup(compressed, "mm10")
uncompressed <- file.path(path, "uncompressed.h5")
h5createFile(uncompressed)
h5createGroup(uncompressed, "mm10")
##write the data
h5createDataset(compressed, "/mm10/data", c(r,c), storage.mode = "integer", chunk=c(r,1), level=7)
h5write(data, compressed, "/mm10/data")
h5createDataset(uncompressed, "/mm10/data", c(r,c), storage.mode = "integer", chunk=c(r,1), level=0)
h5write(data, uncompressed, "/mm10/data")
barcodes <- h5read(orig, "/mm10/barcodes")
h5write(barcodes, compressed, "/mm10/barcodes", level = 1)
h5write(barcodes, uncompressed, "/mm10/barcodes", level = 1)
genes <- h5read(orig, "/mm10/genes")
h5write(genes, compressed, "/mm10/genes", level = 1)
h5write(genes, uncompressed, "/mm10/genes", level = 1)
gene_names <- h5read(orig, "/mm10/gene_names")
h5write(barcodes, compressed, "/mm10/gene_names", level = 1)
h5write(barcodes, uncompressed, "/mm10/gene_names", level = 1)
utils:::format.object_size(file.size(orig), unit= "MB")
## [1] "58.8 Mb"
utils:::format.object_size(file.size(compressed), unit= "MB")
## [1] "72.6 Mb"
utils:::format.object_size(file.size(uncompressed), unit= "MB")
## [1] "2136.4 Mb"
ind <- 1:100
microbenchmark(sub1 <- as.matrix(tenx[,ind], withDimnames = F)
,sub2 <- h5read(compressed, "/mm10/data", index=list(NULL,ind))
,sub3 <- h5read(uncompressed, "/mm10/data", index=list(NULL,ind))
, times = 10)
## Unit: milliseconds
## expr
## sub1 <- as.matrix(tenx[, ind], withDimnames = F)
## sub2 <- h5read(compressed, "/mm10/data", index = list(NULL, ind))
## sub3 <- h5read(uncompressed, "/mm10/data", index = list(NULL, ind))
## min lq mean median uq max neval
## 164.03254 169.81935 324.77722 327.98678 454.53002 541.3881 10
## 64.90005 65.33857 94.36689 66.54846 67.64308 345.3854 10
## 46.99974 48.24214 165.70198 49.18509 333.80274 366.2625 10
all.equal(sub1, sub2, check.attributes = FALSE)
## [1] TRUE
all.equal(sub1, sub3, check.attributes = FALSE)
## [1] TRUE
ind <- 1:2
microbenchmark(sub1 <- as.matrix(tenx[ind,], withDimnames = F)
,sub2 <- h5read(compressed, "/mm10/data", index=list(ind, NULL))
,sub3 <- h5read(uncompressed, "/mm10/data", index=list(ind,NULL))
, times = 5)
## Unit: milliseconds
## expr
## sub1 <- as.matrix(tenx[ind, ], withDimnames = F)
## sub2 <- h5read(compressed, "/mm10/data", index = list(ind, NULL))
## sub3 <- h5read(uncompressed, "/mm10/data", index = list(ind, NULL))
## min lq mean median uq max neval
## 29791.1177 36275.5221 36559.7265 37871.0860 39358.578 39502.3288 5
## 3534.8459 3566.7625 3607.9368 3586.0279 3638.774 3713.2738 5
## 246.5934 249.0551 250.7899 252.0176 252.869 253.4145 5
all.equal(sub1, sub2, check.attributes = FALSE)
## [1] TRUE
all.equal(sub1, sub3, check.attributes = FALSE)
## [1] TRUE
set.seed(4)
rindx <- sample(r, 100)
cindx <- sample(c, 100)
microbenchmark(randomM1 <- as.matrix(tenx[rindx, cindx])
,randomM2 <- as.matrix(tenx[rindx, cindx], cpp = T)
,randomM3 <- h5read(compressed, "/mm10/data", index=list(rindx, cindx))
,randomM4 <- h5read(uncompressed, "/mm10/data", index=list(rindx, cindx))
, times = 10)
## Unit: milliseconds
## expr
## randomM1 <- as.matrix(tenx[rindx, cindx])
## randomM2 <- as.matrix(tenx[rindx, cindx], cpp = T)
## randomM3 <- h5read(compressed, "/mm10/data", index = list(rindx, cindx))
## randomM4 <- h5read(uncompressed, "/mm10/data", index = list(rindx, cindx))
## min lq mean median uq max neval
## 286.18396 294.12492 301.13529 299.42046 309.88381 319.65699 10
## 149.09306 150.30422 151.19445 151.23214 151.78208 153.63409 10
## 91.24773 92.55357 95.24850 94.93860 98.28324 100.32897 10
## 63.10031 63.49637 65.59568 65.02022 68.02447 69.05132 10
all.equal(randomM1, randomM3, check.attributes = FALSE)
## [1] TRUE
all.equal(randomM1, randomM4, check.attributes = FALSE)
## [1] TRUE
H5close()
library(Matrix)
mat <- as.matrix(tenx[1:1000,1:1000], withDimnames = F)
smat <- Matrix(mat, sparse = T)
format(object.size(mat), units = "MB")
## [1] "3.8 Mb"
format(object.size(smat), units = "MB")
## [1] "0.8 Mb"
set.seed(3)
r <- sample(1e3, 100)
c <- sample(1e3, 100)
microbenchmark(
v1 <- colSums(mat)
,v2 <- Matrix::colSums(smat)
,v3 <- rowSums(mat)
,v4 <- Matrix::rowSums(smat)
,v5 <- mat[r,c]
,v6 <- smat[r,c]
, times = 10
)
## Unit: microseconds
## expr min lq mean median
## v1 <- colSums(mat) 918.706 949.550 1020.4976 1006.9150
## v2 <- Matrix::colSums(smat) 311.717 321.235 339.1373 341.9290
## v3 <- rowSums(mat) 3221.065 3270.296 3353.0933 3286.3870
## v4 <- Matrix::rowSums(smat) 791.918 804.828 987.3554 828.0165
## v5 <- mat[r, c] 89.513 91.258 92.6637 92.1795
## v6 <- smat[r, c] 205.655 213.107 2306.1442 225.3290
## uq max neval
## 1025.217 1361.106 10
## 347.333 378.643 10
## 3380.442 3845.425 10
## 1028.226 1881.332 10
## 93.600 98.102 10
## 238.732 21045.595 10
all.equal(v1, v2)
## [1] TRUE
all.equal(v3, v4)
## [1] TRUE
# Rprof()
# sub <- as.matrix(tenx[1:1000, 1:1000], cpp = T)
# Rprof(NULL)
# summaryRprof()