library(microbenchmark)
library(Matrix)
library(data.table)
library(RSQLite)
library(rhdf5)
path <- "/loc/no-backup/mike/shared"
path <- file.path(path, "benchmark")

data Simulation function

nGenes <- 3e4
simulate_sparse_mat <- function(alpha, beta, nCells){
  
  pi_g <- rbeta(n = nGenes, alpha, beta)
  
  #set 30% rows to all 0s
  all_zero_row_id <- sample(nGenes, 0.3 * nGenes)
  
  m <- Matrix(0,nrow = nGenes, ncol = nCells, sparse = T)
  for(i in seq_len(nGenes))
  {
    if(!(i %in% all_zero_row_id))
    {
      row <- sample(x = c(0, 10), size = nCells,  prob = c(1-pi_g[i], pi_g[i]),replace = T)
      colidx <- which(row !=0)
      nLen <- length(colidx)
      row[colidx]=runif(nLen,min=0.1,max=1)

      m[i,] <- row
      
    }
      
  }
  m
}

# more efficient version
simulate_sparse_mat_fast <- function(alpha, beta, nCells){
  
  pi_g <- rbeta(n = nGenes, alpha, beta)
  
  #set 30% rows to all 0s
  all_zero_row_id <- sample(nGenes, 0.3 * nGenes)
  
  #construct mat from csr format
  p <- vector(mode = "integer", length = nGenes+1)#row ptr
  nLen <- 0
  ptr <- 0
  x <- integer()#non zero vals
  j <- integer()#col idx
  for(i in seq_len(nGenes))
  {
  
    p[i] <- ptr
    if(!(i %in% all_zero_row_id))
    {
      row <- sample(x = c(0, 10), size = nCells,  prob = c(1-pi_g[i], pi_g[i]),replace = T)
      colidx <- which(row !=0)
      nLen <- length(colidx)
      row[colidx]=runif(nLen,min=0.1,max=1)
      
      if(nLen > 0){
        j <- c(j, colidx - 1)#add col idx (zero-based)
        x <- c(x, row[colidx]) #add non zero vals
        ptr <- ptr + nLen#mv ptr for the next row
        
      }
        
      
    }else
      nLen <- 0
      
      
  }
    # browser()
  #add the end idx for last row
  p[i+1] <- length(x)
  # browser()
  sparseMatrix(j = j, p = p, x = x, index1 = F, dims = c(nGenes, nCells))
}

# system.time(dd <- simulate_sparse_mat_fast(0.1, 0.8, 10))
# set.seed(4)
# system.time(dd <- simulate_sparse_mat(0.1, 0.8, 10))
# all.equal(d,dd)
# set.seed(4)

benchmark different formats

set.seed(4)
nCellsVec <- c(1e3, 4e3, 7e3, 2e4, 5e4,1e5)[1:4]
beta_values <- data.frame(alpha = c(0.1, 0.2, 0.3, 0.4), beta = c(0.8, 0.9, 0.7, 0.6))
ridx <- sample(nGenes, 1e3)
# Rprof()
res <- apply(beta_values, 1, function(beta_value){
  
            alpha <- beta_value[1]
            beta <- beta_value[2]
            res <- lapply(nCellsVec, function(nCells){
              suffix <- paste(alpha, beta, nGenes, nCells, sep = "_")
              cidx <- sample(nCells, 1e3)
              
              #in-memory mat
              matfile <- file.path(path, paste0("mat_", suffix,".rds"))
              if(!file.exists(matfile))
              {
                mat <- simulate_sparse_mat_fast(alpha, beta, nCells)  
                saveRDS(mat, file = matfile)
              }else
                mat <- readRDS(matfile)
                
              sparsity <- 1- nnzero(mat)/(nCells * nGenes)
              size <- object.size(mat)/1e9
              timing <- microbenchmark(mat[ridx,], mat[, cidx], sub1 <- mat[ridx, cidx], times = 3, unit = "ms")
              timing <- summary(timing)[["mean"]]
              dt1 <- data.table(nCells = nCells
                                , format = "in-memory sparsed 2d mat"
                                , size = size
                                , slicing = c("row", "col", "row&col")
                                , time = timing)
              
              
              #sqlite
              dbfile <- file.path(path, paste0("Test_", suffix,".sqlite"))
              
              if(!file.exists(dbfile))
              {
                db_sqlite = dbConnect(SQLite(), dbname=dbfile)  
                #covert CSC to COO
                startidx <- mat@p[1:nCells]
                endidx <- mat@p[-1]-1
                colidx <- Map(seq, startidx, endidx, MoreArgs=list(by = 1))
                lens <- lengths(colidx)
                colidx <- rep(seq_along(colidx)-1, lens)
                df <- data.frame(values = mat@x, ridx = mat@i, cidx = colidx, check.names = FALSE)
                dbWriteTable(db_sqlite, "data", df, overwrite = T)
                dbSendStatement(db_sqlite, "CREATE INDEX index_row ON data (ridx);")
                dbSendStatement(db_sqlite, "CREATE INDEX index_col ON data (cidx);")
                dbSendStatement(db_sqlite, "CREATE INDEX index_col_row ON data (ridx,cidx);")
  
              }else
                db_sqlite = dbConnect(SQLite(), dbname=dbfile)  
              # browser()
              size <- file.size(dbfile) /1e9
              timing <- microbenchmark(readDB(db_sqlite, ridx, NULL, nCells)
                                       , readDB(db_sqlite, NULL, cidx, nCells)
                                       , sub2 <- readDB(db_sqlite, ridx, cidx, nCells)
                                       , times = 3, unit = "ms")
              timing <- summary(timing)[["mean"]]
              dt2 <- data.table(nCells = nCells
                                , format = "sqlite(COO)"
                                , size = size
                                , slicing = c("row", "col", "row&col")
                                , time = timing)
              
              
              #h5 
              compressed <- file.path(path, paste0("compressed_", suffix, ".h5"))
              nBlockSize <- min(1000, nCells)
              nBlocks <- nCells/nBlockSize
              if(nBlocks > 1)
                blocks <- cut(seq_len(nCells), nBlocks)
              else
                blocks <- 1
              
              col.grps <- split(seq_len(nCells), blocks)
              if(!file.exists(compressed))
              {
                h5createFile(compressed)
                h5createDataset(compressed, "data", c(nGenes, nCells), storage.mode = "double", chunk=c(nGenes,1), level=7)
                #write subset of columns (avoid coerce entire mat into memory)
                for(i in col.grps)
                  h5write(as.matrix(mat[, i, drop = F]), compressed, "data", index = list(NULL, i))
              }
              size <- file.size(compressed) /1e9
              timing <- microbenchmark(h5read(compressed, "data", list(ridx, NULL))
                                       , h5read(compressed, "data", list(NULL, cidx))
                                       , sub3 <- h5read(compressed, "data", list(ridx, cidx))
                                       , times = 3, unit = "ms")
              
              # browser()
              
              timing <- summary(timing)[["mean"]]
              dt3 <- data.table(nCells = nCells
                                , format = "H5(2d mat)"
                                , size = size
                                , slicing = c("row", "col", "row&col")
                                , time = timing)
              
              #chunked.read.h5
              #The API only reads data from H5 by chunks (entire columns)
              h5read.chunked <- function(h5, name, index){
                  rind <- index[[1]]
                  cind <- index[[2]]
                  if(is.null(cind))
                    cind <- seq_len(nCells)
                  nTotal <- length(cind)
                  nBlockSize <- min(1000, nTotal)
                  nBlocks <- nTotal/nBlockSize
                  if(nBlocks > 1)
                    blocks <- cut(cind, nBlocks)
                  else
                    blocks <- 1
                  col.grps <- split(cind, blocks)
                  res <- lapply(col.grps, function(i){
                    sub <- h5read(h5, name, list(NULL,i))
                    # browser()
                    if(!is.null(rind))
                      sub <- sub[rind,]
                    sub
                  })
                  do.call(cbind, res)
              }

              # browser()
              timing <- microbenchmark(h5read.chunked(compressed, "data", list(ridx, NULL))
                                       , h5read.chunked(compressed, "data", list(NULL, cidx))
                                       , sub4 <- h5read.chunked(compressed, "data", list(ridx, cidx))
                                       , times = 3, unit = "ms")
              
              
              H5close()
              timing <- summary(timing)[["mean"]]
              dt4 <- data.table(nCells = nCells
                                , format = "H5(chunked.read)"
                                , size = size
                                , slicing = c("row", "col", "row&col")
                                , time = timing)
              
              sub1 <- as.matrix(sub1)
              # browser()
              #TODO: check all.equal(sub1, sub2, check.attributes = F)
              if(!all.equal(sub2, sub3, check.attributes = F)||!all.equal(sub2, sub4, check.attributes = F))
                stop("wrong reading!")
              thisRes <- rbindlist(list(dt1, dt2, dt3, dt4))
              thisRes[, sparsity:=sparsity]
              thisRes[, beta_value := paste0("alpha = ", alpha, " beta = ", beta)]
            })
            rbindlist(res)
          })
# Rprof(NULL)
# summaryRprof()
dt <- rbindlist(res)

sparsity (zero-value rate)

dt[, unique(sparsity), by = list(nCells, beta_value)]
##     nCells             beta_value        V1
##  1:   1000 alpha = 0.1 beta = 0.8 0.9225260
##  2:   4000 alpha = 0.1 beta = 0.8 0.9219361
##  3:   7000 alpha = 0.1 beta = 0.8 0.9203185
##  4:  20000 alpha = 0.1 beta = 0.8 0.9228833
##  5:   1000 alpha = 0.2 beta = 0.9 0.8721571
##  6:   4000 alpha = 0.2 beta = 0.9 0.8722341
##  7:   7000 alpha = 0.2 beta = 0.9 0.8740477
##  8:  20000 alpha = 0.2 beta = 0.9 0.8743154
##  9:   1000 alpha = 0.3 beta = 0.7 0.7895697
## 10:   4000 alpha = 0.3 beta = 0.7 0.7871520
## 11:   7000 alpha = 0.3 beta = 0.7 0.7897489
## 12:  20000 alpha = 0.3 beta = 0.7 0.7881764
## 13:   1000 alpha = 0.4 beta = 0.6 0.7190587
## 14:   4000 alpha = 0.4 beta = 0.6 0.7195323
## 15:   7000 alpha = 0.4 beta = 0.6 0.7173377
## 16:  20000 alpha = 0.4 beta = 0.6 0.7201783

plot the result

library(ggplot2)
ggplot(dt, aes(y = time, x = nCells, color = format)) + geom_line() + geom_point() + facet_grid(slicing~beta_value, scales ="free") + ylab("time (ms)") + scale_x_continuous(breaks = nCellsVec) + ggtitle("time")
plot of chunk unnamed-chunk-6

plot of chunk unnamed-chunk-6

ggplot(dt[format != "H5(chunked.read)"], aes(x = nCells, y = size, color = format)) + geom_line() + geom_point() + facet_wrap(~beta_value) + ylab("size (GB)") + scale_x_continuous(breaks = nCellsVec) + ggtitle("space")
plot of chunk unnamed-chunk-6

plot of chunk unnamed-chunk-6