library(microbenchmark)
library(Matrix)
library(data.table)
library(RSQLite)
library(rhdf5)
path <- "/loc/no-backup/mike/shared"
path <- file.path(path, "benchmark")
nGenes <- 3e4
# naive version: assigning one row of a sparse matrix at a time forces the sparse
# structure to be rebuilt on every iteration, which is slow
simulate_sparse_mat <- function(alpha, beta, nCells){
  pi_g <- rbeta(n = nGenes, alpha, beta)
  # set 30% of the rows to all zeros
  all_zero_row_id <- sample(nGenes, 0.3 * nGenes)
  m <- Matrix(0, nrow = nGenes, ncol = nCells, sparse = TRUE)
  for(i in seq_len(nGenes))
  {
    if(!(i %in% all_zero_row_id))
    {
      # mark non-zero positions with probability pi_g[i], then fill them with uniform values
      row <- sample(x = c(0, 10), size = nCells, prob = c(1 - pi_g[i], pi_g[i]), replace = TRUE)
      colidx <- which(row != 0)
      nLen <- length(colidx)
      row[colidx] <- runif(nLen, min = 0.1, max = 1)
      m[i, ] <- row
    }
  }
  m
}
# more efficient version: build the matrix directly in compressed, row-oriented (CSR-like) form
simulate_sparse_mat_fast <- function(alpha, beta, nCells){
  pi_g <- rbeta(n = nGenes, alpha, beta)
  # set 30% of the rows to all zeros
  all_zero_row_id <- sample(nGenes, 0.3 * nGenes)
  p <- vector(mode = "integer", length = nGenes + 1) # row pointers
  ptr <- 0
  x <- numeric() # non-zero values
  j <- integer() # column indices (zero-based)
  for(i in seq_len(nGenes))
  {
    p[i] <- ptr
    if(!(i %in% all_zero_row_id))
    {
      # mark non-zero positions with probability pi_g[i], then fill them with uniform values
      row <- sample(x = c(0, 10), size = nCells, prob = c(1 - pi_g[i], pi_g[i]), replace = TRUE)
      colidx <- which(row != 0)
      nLen <- length(colidx)
      row[colidx] <- runif(nLen, min = 0.1, max = 1)
      if(nLen > 0){
        j <- c(j, colidx - 1)  # append zero-based column indices
        x <- c(x, row[colidx]) # append non-zero values
        ptr <- ptr + nLen      # move the pointer to the start of the next row
      }
    }
  }
  # close the pointer vector with the total number of non-zeros
  p[nGenes + 1] <- length(x)
  sparseMatrix(j = j, p = p, x = x, index1 = FALSE, dims = c(nGenes, nCells))
}
# sanity check: both simulators should produce an identical matrix for the same seed
# set.seed(4); system.time(d <- simulate_sparse_mat(0.1, 0.8, 10))
# set.seed(4); system.time(dd <- simulate_sparse_mat_fast(0.1, 0.8, 10))
# all.equal(d, dd)
set.seed(4)
nCellsVec <- c(1e3, 4e3, 7e3, 2e4, 5e4, 1e5)[1:4] # only the first four sizes are benchmarked here
beta_values <- data.frame(alpha = c(0.1, 0.2, 0.3, 0.4), beta = c(0.8, 0.9, 0.7, 0.6))
ridx <- sample(nGenes, 1e3)
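# NOTE: `readDB` (the helper that queries the COO table) is not shown in this snippet.
# The sketch below is only a hypothetical fallback so the chunk can run on its own: it
# assumes 1-based `ridx`/`cidx` arguments (NULL meaning "all"), reads the zero-based COO
# columns of the `data` table created below, and returns a dense submatrix; the helper
# actually used for the timings may differ.
if(!exists("readDB")){
  readDB <- function(db, ridx, cidx, nCells){
    if(is.null(ridx)) ridx <- seq_len(nGenes)
    if(is.null(cidx)) cidx <- seq_len(nCells)
    sql <- paste0("SELECT * FROM data WHERE ridx IN (", paste(ridx - 1, collapse = ","),
                  ") AND cidx IN (", paste(cidx - 1, collapse = ","), ")")
    df <- dbGetQuery(db, sql)
    # map the zero-based COO indices back to positions within the requested slice
    res <- matrix(0, nrow = length(ridx), ncol = length(cidx))
    res[cbind(match(df$ridx, ridx - 1), match(df$cidx, cidx - 1))] <- df$values
    res
  }
}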
# Rprof()
res <- apply(beta_values, 1, function(beta_value){
alpha <- beta_value[1]
beta <- beta_value[2]
res <- lapply(nCellsVec, function(nCells){
suffix <- paste(alpha, beta, nGenes, nCells, sep = "_")
cidx <- sample(nCells, 1e3)
#in-memory mat
matfile <- file.path(path, paste0("mat_", suffix,".rds"))
if(!file.exists(matfile))
{
mat <- simulate_sparse_mat_fast(alpha, beta, nCells)
saveRDS(mat, file = matfile)
}else
mat <- readRDS(matfile)
sparsity <- 1- nnzero(mat)/(nCells * nGenes)
size <- object.size(mat)/1e9
timing <- microbenchmark(mat[ridx,], mat[, cidx], sub1 <- mat[ridx, cidx], times = 3, unit = "ms")
timing <- summary(timing)[["mean"]]
dt1 <- data.table(nCells = nCells
, format = "in-memory sparse 2d mat"
, size = size
, slicing = c("row", "col", "row&col")
, time = timing)
#sqlite
dbfile <- file.path(path, paste0("Test_", suffix,".sqlite"))
if(!file.exists(dbfile))
{
db_sqlite = dbConnect(SQLite(), dbname=dbfile)
#convert CSC to COO: one zero-based column index per non-zero value
#(diff() of the column pointers gives the number of non-zeros in each column)
colidx <- rep(seq_len(nCells) - 1L, diff(mat@p))
df <- data.frame(values = mat@x, ridx = mat@i, cidx = colidx, check.names = FALSE)
dbWriteTable(db_sqlite, "data", df, overwrite = T)
#dbExecute() runs the DDL and releases the result set (dbSendStatement would leave it open)
dbExecute(db_sqlite, "CREATE INDEX index_row ON data (ridx);")
dbExecute(db_sqlite, "CREATE INDEX index_col ON data (cidx);")
dbExecute(db_sqlite, "CREATE INDEX index_col_row ON data (ridx,cidx);")
}else
db_sqlite = dbConnect(SQLite(), dbname=dbfile)
size <- file.size(dbfile) /1e9
timing <- microbenchmark(readDB(db_sqlite, ridx, NULL, nCells)
, readDB(db_sqlite, NULL, cidx, nCells)
, sub2 <- readDB(db_sqlite, ridx, cidx, nCells)
, times = 3, unit = "ms")
timing <- summary(timing)[["mean"]]
dt2 <- data.table(nCells = nCells
, format = "sqlite(COO)"
, size = size
, slicing = c("row", "col", "row&col")
, time = timing)
#h5
compressed <- file.path(path, paste0("compressed_", suffix, ".h5"))
nBlockSize <- min(1000, nCells)
nBlocks <- nCells/nBlockSize
if(nBlocks > 1)
blocks <- cut(seq_len(nCells), nBlocks)
else
blocks <- 1
col.grps <- split(seq_len(nCells), blocks)
if(!file.exists(compressed))
{
h5createFile(compressed)
#chunk = c(nGenes, 1) stores one full column per chunk, so column slicing reads few chunks
#while row slicing has to touch every chunk
h5createDataset(compressed, "data", c(nGenes, nCells), storage.mode = "double", chunk=c(nGenes,1), level=7)
#write a subset of columns at a time (avoids coercing the entire mat into memory)
for(i in col.grps)
h5write(as.matrix(mat[, i, drop = F]), compressed, "data", index = list(NULL, i))
}
size <- file.size(compressed) /1e9
timing <- microbenchmark(h5read(compressed, "data", list(ridx, NULL))
, h5read(compressed, "data", list(NULL, cidx))
, sub3 <- h5read(compressed, "data", list(ridx, cidx))
, times = 3, unit = "ms")
timing <- summary(timing)[["mean"]]
dt3 <- data.table(nCells = nCells
, format = "H5(2d mat)"
, size = size
, slicing = c("row", "col", "row&col")
, time = timing)
#chunked.read.h5
#helper that reads the H5 dataset chunk-wise (entire columns at a time) and then subsets rows in memory
h5read.chunked <- function(h5, name, index){
rind <- index[[1]]
cind <- index[[2]]
if(is.null(cind))
cind <- seq_len(nCells)
nTotal <- length(cind)
nBlockSize <- min(1000, nTotal)
nBlocks <- nTotal/nBlockSize
if(nBlocks > 1)
blocks <- cut(cind, nBlocks)
else
blocks <- 1
col.grps <- split(cind, blocks)
res <- lapply(col.grps, function(i){
sub <- h5read(h5, name, list(NULL,i))
if(!is.null(rind))
sub <- sub[rind,]
sub
})
do.call(cbind, res)
}
timing <- microbenchmark(h5read.chunked(compressed, "data", list(ridx, NULL))
, h5read.chunked(compressed, "data", list(NULL, cidx))
, sub4 <- h5read.chunked(compressed, "data", list(ridx, cidx))
, times = 3, unit = "ms")
H5close()
timing <- summary(timing)[["mean"]]
dt4 <- data.table(nCells = nCells
, format = "H5(chunked.read)"
, size = size
, slicing = c("row", "col", "row&col")
, time = timing)
sub1 <- as.matrix(sub1)
#TODO: check all.equal(sub1, sub2, check.attributes = FALSE)
#all.equal() returns a character vector on mismatch, so wrap it in isTRUE()
if(!isTRUE(all.equal(sub2, sub3, check.attributes = FALSE)) || !isTRUE(all.equal(sub2, sub4, check.attributes = FALSE)))
stop("wrong reading!")
thisRes <- rbindlist(list(dt1, dt2, dt3, dt4))
thisRes[, sparsity:=sparsity]
thisRes[, beta_value := paste0("alpha = ", alpha, " beta = ", beta)]
})
rbindlist(res)
})
# Rprof(NULL)
# summaryRprof()
dt <- rbindlist(res)
dt[, unique(sparsity), by = list(nCells, beta_value)]
##     nCells             beta_value        V1
##  1:   1000 alpha = 0.1 beta = 0.8 0.9225260
##  2:   4000 alpha = 0.1 beta = 0.8 0.9219361
##  3:   7000 alpha = 0.1 beta = 0.8 0.9203185
##  4:  20000 alpha = 0.1 beta = 0.8 0.9228833
##  5:   1000 alpha = 0.2 beta = 0.9 0.8721571
##  6:   4000 alpha = 0.2 beta = 0.9 0.8722341
##  7:   7000 alpha = 0.2 beta = 0.9 0.8740477
##  8:  20000 alpha = 0.2 beta = 0.9 0.8743154
##  9:   1000 alpha = 0.3 beta = 0.7 0.7895697
## 10:   4000 alpha = 0.3 beta = 0.7 0.7871520
## 11:   7000 alpha = 0.3 beta = 0.7 0.7897489
## 12:  20000 alpha = 0.3 beta = 0.7 0.7881764
## 13:   1000 alpha = 0.4 beta = 0.6 0.7190587
## 14:   4000 alpha = 0.4 beta = 0.6 0.7195323
## 15:   7000 alpha = 0.4 beta = 0.6 0.7173377
## 16:  20000 alpha = 0.4 beta = 0.6 0.7201783
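# Sanity check on the simulation: 30% of the rows are forced to all zeros and the remaining
# entries are non-zero with probability pi_g ~ Beta(alpha, beta), so the expected sparsity
# is roughly 1 - 0.7 * alpha / (alpha + beta), i.e. about 0.922, 0.873, 0.790 and 0.720 for
# the four parameter pairs, in good agreement with the observed values above.
with(beta_values, 1 - 0.7 * alpha / (alpha + beta))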
library(ggplot2)
ggplot(dt, aes(y = time, x = nCells, color = format)) + geom_line() + geom_point() + facet_grid(slicing~beta_value, scales ="free") + ylab("time (ms)") + scale_x_continuous(breaks = nCellsVec) + ggtitle("time")
[figure: slicing time (ms) vs. nCells for each storage format, faceted by slicing type and beta parameters]
ggplot(dt[format != "H5(chunked.read)"], aes(x = nCells, y = size, color = format)) + geom_line() + geom_point() + facet_wrap(~beta_value) + ylab("size (GB)") + scale_x_continuous(breaks = nCellsVec) + ggtitle("space")
[figure: storage size (GB) vs. nCells for each format, faceted by beta parameters]