library(microbenchmark)
library(Matrix)
library(data.table)
nGenes <- 3e4
sim <- function(alpha, beta, nCells){
m <- matrix(nrow = nGenes, ncol = nCells)
pi_g <- rbeta(n = nGenes, alpha, beta)
for(i in seq_len(nGenes))
m[i, ] <- sample(x = c(0, 10), size = nCells, prob = c(1-pi_g[i], pi_g[i]),replace = T)
#set 30% rows to all 0s
m[sample(nGenes, 0.3 * nGenes), ] <- 0
m
}
set.seed(4)
ridx <- sample(nGenes, 1e3)
res <- lapply(c(1e3, 4e3, 6e3,2e4), function(nCells){
cidx <- sample(nCells, 1e3)
#2d in-memory mat
mat <- sim(0.5, 0.5, nCells)
size <- object.size(mat)/1e9
timing <- microbenchmark(mat[ridx,], mat[, cidx], mat[ridx, cidx], times = 3, unit = "ms")
timing <- summary(timing)[["mean"]]
dt1 <- data.table(nCells = nCells, format = "in-memory 2d mat", size = size, slicing = c("row", "col", "row&col"), time = timing)
#2d in-memory sparse mat
mat <- Matrix(mat, sparse = T)
sparsity <- 1- nnzero(mat)/(nCells * nGenes)
size <- object.size(mat)/1e9
timing <- microbenchmark(mat[ridx,], mat[, cidx], mat[ridx, cidx], times = 3, unit = "ms")
timing <- summary(timing)[["mean"]]
dt2 <- data.table(nCells = nCells, format = "in-memory sparsed 2d mat", size = size, slicing = c("row", "col", "row&col"), time = timing)
thisRes <- rbindlist(list(dt1, dt2))
thisRes[, sparsity:=sparsity]
})
dt <- rbindlist(res)
dt[, unique(sparsity), by = nCells]
## nCells V1
## 1: 1000 0.6495754
## 2: 4000 0.6472751
## 3: 6000 0.6495084
## 4: 20000 0.6530742
library(ggplot2)
ggplot(dt, aes(y = time, x = nCells, color = format)) + geom_line() + geom_point() + facet_wrap(~slicing) + ylab("time (ms)")
ggplot(dt, aes(x = nCells, y = size, color = format)) + geom_line() + geom_point() + ylab("size (GB)")