suppressPackageStartupMessages({
     library(GSEABase)
    library(limma)
    library(reshape2)
    library(data.table)
    library(knitr)
    library(TxDb.Hsapiens.UCSC.hg19.knownGene)
    library(stringr)
    library(NMF)
    library(rsvd)
    library(MAST)
  library(microbenchmark)
})
#options(mc.cores = detectCores() - 1) #if you have multiple cores to spin
options(mc.cores = 1)
knitr::opts_chunk$set(message = FALSE,error = FALSE,warning = FALSE,cache = TRUE,fig.width=8,fig.height=6, auto.dep=TRUE)
freq_expressed <- 0.2
FCTHRESHOLD <- log2(1.5)

Loading data

data(maits, package='MAST')
dat <- t(maits$expressionmat)
dim(dat)

create in-memory version of SingleCellAssay container

scaRaw <- FromMatrix(dat, maits$cdat, maits$fdat)
r <- nrow(dat)
c <- ncol(dat)

create hdf version of SingleCellAssay container (using HDFArray)

#write to H5
library(rhdf5)
path <- "/loc/no-backup/mike/shared"
path <- file.path(path, "MAST")
maits_h5 <- file.path(path, "maits.h5")
# saveHDF5SummarizedExperiment()
h5createFile(maits_h5)
## [1] TRUE
h5createDataset(maits_h5, "data", c(r,c), storage.mode = "double", chunk=c(1,c), level=0) #chunking by row
## [1] TRUE
h5write(dat, maits_h5, "data")
H5close()

library(HDF5Array)
hda_array <- HDF5Array(maits_h5, "data")
scaRaw1 <- SummarizedExperiment(assays=list(hda_array), colData=as(maits$cdat, 'DataFrame'))
mcols(scaRaw1) <- as(maits$fdat,'DataFrame')
scaRaw1 <- as(scaRaw1, "SingleCellAssay")
scaRaw1
## class: SingleCellAssay 
## dim: 16302 96 
## metadata(0):
## assays(1): ''
## rownames: NULL
## rowData names(3): primerid entrez symbolid
## colnames(96): 1-MAIT-Stim-C05_S91 1-MAIT-Stim-C07_S70 ...
##   1-MAIT-Unstim-C93_S33 1-MAIT-Unstim-C96_S60
## colData names(21): wellKey condition ... bc ourfilter
assay(scaRaw1)
## DelayedMatrix object of 16302 x 96 doubles:
##            1-MAIT-Stim-C05_S91   1-MAIT-Stim-C07_S70   1-MAIT-Stim-C08_S68
##     [1,]            0.00000000            0.00000000            0.00000000
##     [2,]            1.29865832            0.00000000            0.00000000
##     [3,]            0.08406426            0.12432814            1.79077204
##     [4,]            3.71589337            0.00000000            0.00000000
##     [5,]            6.82921532            0.00000000            0.00000000
##      ...                     .                     .                     .
## [16298,]             0.0000000             0.0000000             0.0000000
## [16299,]             0.0000000             0.0000000             0.0000000
## [16300,]             0.0000000             0.7136958             1.6644828
## [16301,]             0.0000000             0.0000000             0.0000000
## [16302,]             0.0000000             0.2868811             0.0000000
##                              . 1-MAIT-Unstim-C93_S33 1-MAIT-Unstim-C96_S60
##     [1,]                     .             0.0000000             0.0000000
##     [2,]                     .             0.0000000             0.0000000
##     [3,]                     .             0.2387869             2.4382929
##     [4,]                     .             0.0000000             0.0000000
##     [5,]                     .             0.0000000             0.0000000
##      ...                     .                     .                     .
## [16298,]                     .                     0                     0
## [16299,]                     .                     0                     0
## [16300,]                     .                     0                     0
## [16301,]                     .                     0                     0
## [16302,]                     .                     0                     0

Exploratory Data accessing

rowSums

microbenchmark(v1 <- rowSums(assay(scaRaw)), v2 <- rowSums(assay(scaRaw1)), times = 10)
## Unit: milliseconds
##                           expr       min        lq       mean   median
##   v1 <- rowSums(assay(scaRaw))  7.069379  7.191648   7.703484  7.20468
##  v2 <- rowSums(assay(scaRaw1)) 88.088676 90.440108 151.211717 93.28237
##          uq       max neval cld
##    7.629381  10.84785    10  a 
##  115.723775 562.87094    10   b
all.equal(v1, v2, check.attributes = F)
## [1] TRUE

colSums

microbenchmark(v1 <- colSums(assay(scaRaw)), v2 <- colSums(assay(scaRaw1)), times = 10)
## Unit: milliseconds
##                           expr       min        lq      mean    median
##   v1 <- colSums(assay(scaRaw))  4.315464  4.537219  5.323695  4.706701
##  v2 <- colSums(assay(scaRaw1)) 82.283570 84.859894 87.284253 86.159740
##         uq       max neval cld
##   5.946092  8.309307    10  a 
##  88.581971 98.900142    10   b
all.equal(v1, v2, check.attributes = F)
## [1] TRUE

row slicing (continous)

ind <- 1:1e3
microbenchmark(v1 <- as.array(assay(scaRaw[ind,])), v2 <- as.array(assay(scaRaw1[ind,])), times = 10)
## Unit: milliseconds
##                                   expr       min       lq      mean
##   v1 <- as.array(assay(scaRaw[ind, ]))  2.561213  2.60599  4.581689
##  v2 <- as.array(assay(scaRaw1[ind, ])) 10.294773 10.48888 15.837991
##     median        uq      max neval cld
##   3.435395  3.522695 17.23660    10  a 
##  10.598378 11.209945 62.03065    10   b
all.equal(v1,v2, check.attributes = F)
## [1] TRUE

row slicing (random)

set.seed(4)
ind <- sample(r,1e3)
microbenchmark(v1 <- as.array(assay(scaRaw[ind,])), v2 <- as.array(assay(scaRaw1[ind,])), times = 10)
## Unit: milliseconds
##                                   expr       min        lq      mean
##   v1 <- as.array(assay(scaRaw[ind, ]))  3.016406  3.432583  4.364121
##  v2 <- as.array(assay(scaRaw1[ind, ])) 63.041670 63.527232 64.294125
##     median        uq       max neval cld
##   4.920953  5.057959  5.426127    10  a 
##  63.700083 64.329327 68.101570    10   b
all.equal(v1,v2, check.attributes = F)
## [1] TRUE

col slicing (continous)

ind <- 1:90
microbenchmark(v1 <- as.array(assay(scaRaw[,ind])), v2 <- as.array(assay(scaRaw1[,ind])), times = 10)
## Unit: milliseconds
##                                   expr      min       lq      mean
##   v1 <- as.array(assay(scaRaw[, ind])) 36.30926 37.73019  86.06515
##  v2 <- as.array(assay(scaRaw1[, ind])) 47.21058 50.94359 148.83481
##    median       uq      max neval cld
##  39.06719 39.96509 512.1124    10   a
##  57.88306 60.85949 525.5616    10   a
all.equal(v1,v2, check.attributes = F)
## [1] TRUE

col slicing (random)

set.seed(4)
ind <- sample(c,30)
microbenchmark(v1 <- as.array(assay(scaRaw[,ind])), v2 <- as.array(assay(scaRaw1[,ind])), times = 10)
## Unit: milliseconds
##                                   expr      min       lq     mean   median
##   v1 <- as.array(assay(scaRaw[, ind])) 12.31081 14.64752 15.49230 15.61511
##  v2 <- as.array(assay(scaRaw1[, ind])) 25.29523 26.72577 27.58741 27.58896
##        uq      max neval cld
##  16.41621 17.65630    10  a 
##  29.05301 29.21735    10   b
all.equal(v1,v2, check.attributes = F)
## [1] TRUE

do PCA

runPCA <- function(x){
  set.seed(123)
  rpca(t(assay(x)), retx=TRUE, k=4)$x

}
microbenchmark(v1 <- runPCA(scaRaw), v2 <- runPCA(scaRaw1), times = 5)
## Unit: milliseconds
##                   expr      min       lq     mean   median       uq
##   v1 <- runPCA(scaRaw) 739.7310 1303.967 1392.779 1373.621 1653.984
##  v2 <- runPCA(scaRaw1) 752.2333 1262.987 1198.465 1280.809 1342.735
##       max neval cld
##  1892.590     5   a
##  1353.563     5   a
all.equal(v1,v2)
## [1] TRUE

freq (col-wise stats)

microbenchmark(f1 <- freq(scaRaw), f2 <- freq(scaRaw1), times = 5)
## Unit: milliseconds
##                 expr      min       lq     mean   median       uq      max
##   f1 <- freq(scaRaw) 317.6191 320.7211 340.9995 322.6214 324.4261 419.6097
##  f2 <- freq(scaRaw1) 379.7953 422.0343 453.1739 455.8169 502.6885 505.5345
##  neval cld
##      5  a 
##      5   b
all.equal(f1,f2, check.attributes = F)
## [1] TRUE

Filtering by col and row

filterFunc <- function(scaRaw)
{
  filterCrit <- with(colData(scaRaw), pastFastqc=="PASS"& exonRate >0.3 & PercentToHuman>0.6 & nGeneOn> 4000)
  sca <- subset(scaRaw,filterCrit)
  eid <- select(TxDb.Hsapiens.UCSC.hg19.knownGene,keys = mcols(sca)$entrez,keytype ="GENEID",columns = c("GENEID","TXNAME"))
  ueid <- unique(na.omit(eid)$GENEID)
  sca <- sca[mcols(sca)$entrez %in% ueid,]
  
}
sca <- filterFunc(scaRaw)
sca1 <- filterFunc(scaRaw1)

freq (on the subsetted data)

microbenchmark(f1 <- freq(sca), f2 <- freq(sca1), times = 1)
## Unit: milliseconds
##              expr       min        lq      mean    median        uq
##   f1 <- freq(sca)  236.2341  236.2341  236.2341  236.2341  236.2341
##  f2 <- freq(sca1) 6884.6609 6884.6609 6884.6609 6884.6609 6884.6609
##        max neval
##   236.2341     1
##  6884.6609     1
all.equal(f1,f2, check.attributes = F)
## [1] TRUE

Remove invariant genes

set.seed(4)
  ind <- sample(which(f1>0), 6000)
  sca <- sca[ind,]
  sca1 <- sca1[ind,]

Recalculating the cellular detection rate (ngeneson)

microbenchmark(cdr1 <- colSums(assay(sca)>0)
              , cdr2 <- colSums(assay(sca1)>0), times = 1)
## Unit: milliseconds
##                              expr          min           lq         mean
##   cdr1 <- colSums(assay(sca) > 0)     4.954953     4.954953     4.954953
##  cdr2 <- colSums(assay(sca1) > 0) 30914.832293 30914.832293 30914.832293
##        median           uq          max neval
##      4.954953     4.954953     4.954953     1
##  30914.832293 30914.832293 30914.832293     1
all.equal(cdr1,cdr2, check.attributes = F)
## [1] TRUE

PCA on filtered cells

microbenchmark(v1 <- runPCA(scaRaw), v2 <- runPCA(scaRaw1), times = 1)
## Unit: milliseconds
##                   expr       min        lq      mean    median        uq
##   v1 <- runPCA(scaRaw) 1371.2300 1371.2300 1371.2300 1371.2300 1371.2300
##  v2 <- runPCA(scaRaw1)  814.5999  814.5999  814.5999  814.5999  814.5999
##        max neval
##  1371.2300     1
##   814.5999     1
all.equal(v1,v2)
## [1] TRUE

find out the bottleneck

Rprof()
f2 <- freq(sca1)
Rprof(NULL)
summaryRprof()
## $by.self
##                   self.time self.pct total.time total.pct
## ".Call"               32.50    99.57      32.50     99.57
## "apply"                0.04     0.12      32.62     99.94
## "FUN"                  0.04     0.12       0.06      0.18
## "standardGeneric"      0.02     0.06      32.58     99.82
## "dev.list"             0.02     0.06       0.02      0.06
## "do.call"              0.02     0.06       0.02      0.06
## 
## $by.total
##                                total.time total.pct self.time self.pct
## "block_exec"                        32.64    100.00      0.00     0.00
## "call_block"                        32.64    100.00      0.00     0.00
## "evaluate_call"                     32.64    100.00      0.00     0.00
## "evaluate"                          32.64    100.00      0.00     0.00
## "in_dir"                            32.64    100.00      0.00     0.00
## "knitr::knit"                       32.64    100.00      0.00     0.00
## "process_file"                      32.64    100.00      0.00     0.00
## "process_group.block"               32.64    100.00      0.00     0.00
## "process_group"                     32.64    100.00      0.00     0.00
## "rmarkdown::render"                 32.64    100.00      0.00     0.00
## "withCallingHandlers"               32.64    100.00      0.00     0.00
## "apply"                             32.62     99.94      0.04     0.12
## "eval"                              32.62     99.94      0.00     0.00
## "freq"                              32.62     99.94      0.00     0.00
## "handle"                            32.62     99.94      0.00     0.00
## "timing_fn"                         32.62     99.94      0.00     0.00
## "withVisible"                       32.62     99.94      0.00     0.00
## "standardGeneric"                   32.58     99.82      0.02     0.06
## ".from_DelayedArray_to_matrix"      32.52     99.63      0.00     0.00
## ".local"                            32.52     99.63      0.00     0.00
## "as.array"                          32.52     99.63      0.00     0.00
## "as.matrix.DelayedArray"            32.52     99.63      0.00     0.00
## "as.matrix"                         32.52     99.63      0.00     0.00
## "h5read"                            32.52     99.63      0.00     0.00
## "h5read2"                           32.52     99.63      0.00     0.00
## "h5readDataset"                     32.52     99.63      0.00     0.00
## "subset_seed_as_array"              32.52     99.63      0.00     0.00
## "suppressWarnings"                  32.52     99.63      0.00     0.00
## ".Call"                             32.50     99.57     32.50    99.57
## "doTryCatch"                        32.50     99.57      0.00     0.00
## "try"                               32.50     99.57      0.00     0.00
## "tryCatch"                          32.50     99.57      0.00     0.00
## "tryCatchList"                      32.50     99.57      0.00     0.00
## "tryCatchOne"                       32.50     99.57      0.00     0.00
## "H5Sselect_index"                   32.48     99.51      0.00     0.00
## "FUN"                                0.06      0.18      0.04     0.12
## "dev.list"                           0.02      0.06      0.02     0.06
## "do.call"                            0.02      0.06      0.02     0.06
## "H5Dread"                            0.02      0.06      0.00     0.00
## "handle_output"                      0.02      0.06      0.00     0.00
## "plot_snapshot"                      0.02      0.06      0.00     0.00
## "w$get_new"                          0.02      0.06      0.00     0.00
## 
## $sample.interval
## [1] 0.02
## 
## $sampling.time
## [1] 32.64