library(microbenchmark)
library(Matrix)
library(data.table)
# library(RSQLite)
library(rhdf5)
path <- "/loc/no-backup/mike/shared"
path <- file.path(path, "benchmark")
nGenes <- 4e3
simulate_sparse_mat <- function(alpha, beta, nCells){
pi_g <- rbeta(n = nGenes, alpha, beta)
#set 30% rows to all 0s
all_zero_row_id <- sample(nGenes, 0.3 * nGenes)
m <- Matrix(0,nrow = nGenes, ncol = nCells, sparse = T)
for(i in seq_len(nGenes))
{
if(!(i %in% all_zero_row_id))
{
row <- sample(x = c(0, 10), size = nCells, prob = c(1-pi_g[i], pi_g[i]),replace = T)
colidx <- which(row !=0)
nLen <- length(colidx)
row[colidx]=runif(nLen,min=0.1,max=1)
m[i,] <- row
}
}
m
}
# more efficient version
simulate_sparse_mat_fast <- function(alpha, beta, nCells){
pi_g <- rbeta(n = nGenes, alpha, beta)
#set 30% rows to all 0s
all_zero_row_id <- sample(nGenes, 0.3 * nGenes)
#construct mat from csr format
p <- vector(mode = "integer", length = nGenes+1)#row ptr
nLen <- 0
ptr <- 0
x <- integer()#non zero vals
j <- integer()#col idx
for(i in seq_len(nGenes))
{
p[i] <- ptr
if(!(i %in% all_zero_row_id))
{
row <- sample(x = c(0, 10), size = nCells, prob = c(1-pi_g[i], pi_g[i]),replace = T)
colidx <- which(row !=0)
nLen <- length(colidx)
row[colidx]=runif(nLen,min=0.1,max=1)
if(nLen > 0){
j <- c(j, colidx - 1)#add col idx (zero-based)
x <- c(x, row[colidx]) #add non zero vals
ptr <- ptr + nLen#mv ptr for the next row
}
}else
nLen <- 0
}
# browser()
#add the end idx for last row
p[i+1] <- length(x)
# browser()
sparseMatrix(j = j, p = p, x = x, index1 = F, dims = c(nGenes, nCells))
}
# system.time(dd <- simulate_sparse_mat_fast(0.1, 0.8, 10))
# set.seed(4)
# system.time(dd <- simulate_sparse_mat(0.1, 0.8, 10))
# all.equal(d,dd)
# set.seed(4)
set.seed(4)
nCellsVec <- c(1e3, 4e3, 7e3, 2e4, 5e4,1e5,5e5)
beta_values <- data.frame(alpha = c(0.1, 0.2, 0.3, 0.4), beta = c(0.8, 0.9, 0.7, 0.6))
ridx <- sample(nGenes, 1e3)
ntime <- 10
# Rprof()
res <- apply(beta_values, 1, function(beta_value){
alpha <- beta_value[1]
beta <- beta_value[2]
res <- lapply(nCellsVec, function(nCells){
suffix <- paste(alpha, beta, nGenes, nCells, sep = "_")
cidx <- sample(nCells, 1e3)
#in-memory mat
matfile <- file.path(path, paste0("mat_", suffix,".rds"))
if(!file.exists(matfile))
{
mat <- simulate_sparse_mat_fast(alpha, beta, nCells)
saveRDS(mat, file = matfile)
}else
mat <- readRDS(matfile)
sparsity <- 1- nnzero(mat)/(nCells * nGenes)
size <- object.size(mat)/1e9
timing <- microbenchmark(mat[ridx,], mat[, cidx], sub1 <- mat[ridx, cidx], times = ntime, unit = "ms")
timing <- summary(timing)[["mean"]]
dt1 <- data.table(nCells = nCells
, format = "in-memory sparsed 2d mat"
, size = size
, slicing = c("row", "col", "row&col")
, time = timing)
#sqlite
# dbfile <- file.path(path, paste0("Test_", suffix,".sqlite"))
#
# if(!file.exists(dbfile))
# {
# db_sqlite = dbConnect(SQLite(), dbname=dbfile)
# #covert CSC to COO
# startidx <- mat@p[1:nCells]
# endidx <- mat@p[-1]-1
# colidx <- Map(seq, startidx, endidx, MoreArgs=list(by = 1))
# lens <- lengths(colidx)
# colidx <- rep(seq_along(colidx)-1, lens)
# df <- data.frame(values = mat@x, ridx = mat@i, cidx = colidx, check.names = FALSE)
# dbWriteTable(db_sqlite, "data", df, overwrite = T)
# dbSendStatement(db_sqlite, "CREATE INDEX index_row ON data (ridx);")
# dbSendStatement(db_sqlite, "CREATE INDEX index_col ON data (cidx);")
# dbSendStatement(db_sqlite, "CREATE INDEX index_col_row ON data (ridx,cidx);")
#
# }else
# db_sqlite = dbConnect(SQLite(), dbname=dbfile)
# # browser()
# size <- file.size(dbfile) /1e9
# timing <- microbenchmark(readDB(db_sqlite, ridx, NULL, nCells)
# , readDB(db_sqlite, NULL, cidx, nCells)
# , sub2 <- readDB(db_sqlite, ridx, cidx, nCells)
# , times = 3, unit = "ms")
# timing <- summary(timing)[["mean"]]
# dt2 <- data.table(nCells = nCells
# , format = "sqlite(COO)"
# , size = size
# , slicing = c("row", "col", "row&col")
# , time = timing)
#h5
compressed <- file.path(path, paste0("compressed_", suffix, ".h5"))
nBlockSize <- min(1000, nCells)
nBlocks <- nCells/nBlockSize
if(nBlocks > 1)
blocks <- cut(seq_len(nCells), nBlocks)
else
blocks <- 1
col.grps <- split(seq_len(nCells), blocks)
if(!file.exists(compressed))
{
h5createFile(compressed)
h5createDataset(compressed, "data", c(nGenes, nCells), storage.mode = "double", chunk=c(nGenes,1), level=7)
#write subset of columns (avoid coerce entire mat into memory)
for(i in col.grps)
h5write(as.matrix(mat[, i, drop = F]), compressed, "data", index = list(NULL, i))
}
size <- file.size(compressed) /1e9
timing <- microbenchmark(h5read(compressed, "data", list(ridx, NULL))
, h5read(compressed, "data", list(NULL, cidx))
, sub3 <- h5read(compressed, "data", list(ridx, cidx))
, times = ntime, unit = "ms")
# browser()
timing <- summary(timing)[["mean"]]
dt3 <- data.table(nCells = nCells
, format = "H5(2d mat)"
, size = size
, slicing = c("row", "col", "row&col")
, time = timing)
#chunked.read.h5
#The API only reads data from H5 by chunks (entire columns)
h5read.chunked <- function(h5, name, index){
rind <- index[[1]]
cind <- index[[2]]
if(is.null(cind))
cind <- seq_len(nCells)
nTotal <- length(cind)
nBlockSize <- min(1000, nTotal)
nBlocks <- nTotal/nBlockSize
if(nBlocks > 1)
blocks <- cut(cind, nBlocks)
else
blocks <- 1
col.grps <- split(cind, blocks)
res <- lapply(col.grps, function(i){
sub <- h5read(h5, name, list(NULL,i))
# browser()
if(!is.null(rind))
sub <- sub[rind,]
sub
})
do.call(cbind, res)
}
# browser()
timing <- microbenchmark(h5read.chunked(compressed, "data", list(ridx, NULL))
, h5read.chunked(compressed, "data", list(NULL, cidx))
, sub4 <- h5read.chunked(compressed, "data", list(ridx, cidx))
, times = ntime, unit = "ms")
H5close()
timing <- summary(timing)[["mean"]]
dt4 <- data.table(nCells = nCells
, format = "H5(chunked.read)"
, size = size
, slicing = c("row", "col", "row&col")
, time = timing)
sub1 <- as.matrix(sub1)
# browser()
#TODO: check all.equal(sub1, sub2, check.attributes = F)
# if(!all.equal(sub2, sub3, check.attributes = F)||!all.equal(sub2, sub4, check.attributes = F))
# stop("wrong reading!")
thisRes <- rbindlist(list(dt1, dt3, dt4))
thisRes[, sparsity:=sparsity]
thisRes[, beta_value := paste0("alpha = ", alpha, " beta = ", beta)]
})
rbindlist(res)
})
# Rprof(NULL)
# summaryRprof()
dt <- rbindlist(res)
dt[, unique(sparsity), by = list(nCells, beta_value)]
## nCells beta_value V1
## 1: 1e+03 alpha = 0.1 beta = 0.8 0.9219535
## 2: 4e+03 alpha = 0.1 beta = 0.8 0.9264733
## 3: 7e+03 alpha = 0.1 beta = 0.8 0.9214560
## 4: 2e+04 alpha = 0.1 beta = 0.8 0.9233983
## 5: 5e+04 alpha = 0.1 beta = 0.8 0.9247434
## 6: 1e+05 alpha = 0.1 beta = 0.8 0.9198862
## 7: 5e+05 alpha = 0.1 beta = 0.8 0.9172703
## 8: 1e+03 alpha = 0.2 beta = 0.9 0.8729290
## 9: 4e+03 alpha = 0.2 beta = 0.9 0.8752556
## 10: 7e+03 alpha = 0.2 beta = 0.9 0.8660386
## 11: 2e+04 alpha = 0.2 beta = 0.9 0.8675775
## 12: 5e+04 alpha = 0.2 beta = 0.9 0.8759838
## 13: 1e+05 alpha = 0.2 beta = 0.9 0.8699478
## 14: 5e+05 alpha = 0.2 beta = 0.9 0.8744355
## 15: 1e+03 alpha = 0.3 beta = 0.7 0.7881493
## 16: 4e+03 alpha = 0.3 beta = 0.7 0.7921320
## 17: 7e+03 alpha = 0.3 beta = 0.7 0.7847506
## 18: 2e+04 alpha = 0.3 beta = 0.7 0.7865228
## 19: 5e+04 alpha = 0.3 beta = 0.7 0.7903635
## 20: 1e+05 alpha = 0.3 beta = 0.7 0.7926098
## 21: 5e+05 alpha = 0.3 beta = 0.7 0.7883725
## 22: 1e+03 alpha = 0.4 beta = 0.6 0.7153415
## 23: 4e+03 alpha = 0.4 beta = 0.6 0.7195698
## 24: 7e+03 alpha = 0.4 beta = 0.6 0.7151178
## 25: 2e+04 alpha = 0.4 beta = 0.6 0.7265743
## 26: 5e+04 alpha = 0.4 beta = 0.6 0.7163692
## 27: 1e+05 alpha = 0.4 beta = 0.6 0.7190813
## 28: 5e+05 alpha = 0.4 beta = 0.6 0.7240613
## nCells beta_value V1
library(ggplot2)
ggplot(dt, aes(y = time, x = nCells, color = format)) + geom_line() + geom_point() + facet_grid(slicing~beta_value, scales ="free") + ylab("time (ms)") + scale_x_continuous(breaks = nCellsVec) + ggtitle("time")
plot of chunk unnamed-chunk-6
ggplot(dt[format != "H5(chunked.read)"], aes(x = nCells, y = size, color = format)) + geom_line() + geom_point() + facet_wrap(~beta_value) + ylab("size (GB)") + scale_x_continuous(breaks = nCellsVec) + ggtitle("space")
plot of chunk unnamed-chunk-6