create hdf version of SingleCellAssay container (using HDFArray)
#write to H5
library(rhdf5)
path <- "/loc/no-backup/mike/shared"
path <- file.path(path, "MAST")
maits_h5 <- file.path(path, "maits.h5")
# saveHDF5SummarizedExperiment()
h5createFile(maits_h5)
## [1] TRUE
h5createDataset(maits_h5, "data", c(r,c), storage.mode = "double", chunk=c(1,c), level=0) #chunking by row
## [1] TRUE
h5write(dat, maits_h5, "data")
H5close()
library(HDF5Array)
hda_array <- HDF5Array(maits_h5, "data")
scaRaw1 <- SummarizedExperiment(assays=list(hda_array), colData=as(maits$cdat, 'DataFrame'))
mcols(scaRaw1) <- as(maits$fdat,'DataFrame')
scaRaw1 <- as(scaRaw1, "SingleCellAssay")
scaRaw1
## class: SingleCellAssay
## dim: 16302 96
## metadata(0):
## assays(1): ''
## rownames: NULL
## rowData names(3): primerid entrez symbolid
## colnames(96): 1-MAIT-Stim-C05_S91 1-MAIT-Stim-C07_S70 ...
## 1-MAIT-Unstim-C93_S33 1-MAIT-Unstim-C96_S60
## colData names(21): wellKey condition ... bc ourfilter
assay(scaRaw1)
## DelayedMatrix object of 16302 x 96 doubles:
## 1-MAIT-Stim-C05_S91 1-MAIT-Stim-C07_S70 1-MAIT-Stim-C08_S68
## [1,] 0.00000000 0.00000000 0.00000000
## [2,] 1.29865832 0.00000000 0.00000000
## [3,] 0.08406426 0.12432814 1.79077204
## [4,] 3.71589337 0.00000000 0.00000000
## [5,] 6.82921532 0.00000000 0.00000000
## ... . . .
## [16298,] 0.0000000 0.0000000 0.0000000
## [16299,] 0.0000000 0.0000000 0.0000000
## [16300,] 0.0000000 0.7136958 1.6644828
## [16301,] 0.0000000 0.0000000 0.0000000
## [16302,] 0.0000000 0.2868811 0.0000000
## . 1-MAIT-Unstim-C93_S33 1-MAIT-Unstim-C96_S60
## [1,] . 0.0000000 0.0000000
## [2,] . 0.0000000 0.0000000
## [3,] . 0.2387869 2.4382929
## [4,] . 0.0000000 0.0000000
## [5,] . 0.0000000 0.0000000
## ... . . .
## [16298,] . 0 0
## [16299,] . 0 0
## [16300,] . 0 0
## [16301,] . 0 0
## [16302,] . 0 0
Exploratory Data accessing
rowSums
microbenchmark(v1 <- rowSums(assay(scaRaw)), v2 <- rowSums(assay(scaRaw1)), times = 10)
## Unit: milliseconds
## expr min lq mean median
## v1 <- rowSums(assay(scaRaw)) 7.069379 7.191648 7.703484 7.20468
## v2 <- rowSums(assay(scaRaw1)) 88.088676 90.440108 151.211717 93.28237
## uq max neval cld
## 7.629381 10.84785 10 a
## 115.723775 562.87094 10 b
all.equal(v1, v2, check.attributes = F)
## [1] TRUE
colSums
microbenchmark(v1 <- colSums(assay(scaRaw)), v2 <- colSums(assay(scaRaw1)), times = 10)
## Unit: milliseconds
## expr min lq mean median
## v1 <- colSums(assay(scaRaw)) 4.315464 4.537219 5.323695 4.706701
## v2 <- colSums(assay(scaRaw1)) 82.283570 84.859894 87.284253 86.159740
## uq max neval cld
## 5.946092 8.309307 10 a
## 88.581971 98.900142 10 b
all.equal(v1, v2, check.attributes = F)
## [1] TRUE
row slicing (continous)
ind <- 1:1e3
microbenchmark(v1 <- as.array(assay(scaRaw[ind,])), v2 <- as.array(assay(scaRaw1[ind,])), times = 10)
## Unit: milliseconds
## expr min lq mean
## v1 <- as.array(assay(scaRaw[ind, ])) 2.561213 2.60599 4.581689
## v2 <- as.array(assay(scaRaw1[ind, ])) 10.294773 10.48888 15.837991
## median uq max neval cld
## 3.435395 3.522695 17.23660 10 a
## 10.598378 11.209945 62.03065 10 b
all.equal(v1,v2, check.attributes = F)
## [1] TRUE
row slicing (random)
set.seed(4)
ind <- sample(r,1e3)
microbenchmark(v1 <- as.array(assay(scaRaw[ind,])), v2 <- as.array(assay(scaRaw1[ind,])), times = 10)
## Unit: milliseconds
## expr min lq mean
## v1 <- as.array(assay(scaRaw[ind, ])) 3.016406 3.432583 4.364121
## v2 <- as.array(assay(scaRaw1[ind, ])) 63.041670 63.527232 64.294125
## median uq max neval cld
## 4.920953 5.057959 5.426127 10 a
## 63.700083 64.329327 68.101570 10 b
all.equal(v1,v2, check.attributes = F)
## [1] TRUE
col slicing (continous)
ind <- 1:90
microbenchmark(v1 <- as.array(assay(scaRaw[,ind])), v2 <- as.array(assay(scaRaw1[,ind])), times = 10)
## Unit: milliseconds
## expr min lq mean
## v1 <- as.array(assay(scaRaw[, ind])) 36.30926 37.73019 86.06515
## v2 <- as.array(assay(scaRaw1[, ind])) 47.21058 50.94359 148.83481
## median uq max neval cld
## 39.06719 39.96509 512.1124 10 a
## 57.88306 60.85949 525.5616 10 a
all.equal(v1,v2, check.attributes = F)
## [1] TRUE
col slicing (random)
set.seed(4)
ind <- sample(c,30)
microbenchmark(v1 <- as.array(assay(scaRaw[,ind])), v2 <- as.array(assay(scaRaw1[,ind])), times = 10)
## Unit: milliseconds
## expr min lq mean median
## v1 <- as.array(assay(scaRaw[, ind])) 12.31081 14.64752 15.49230 15.61511
## v2 <- as.array(assay(scaRaw1[, ind])) 25.29523 26.72577 27.58741 27.58896
## uq max neval cld
## 16.41621 17.65630 10 a
## 29.05301 29.21735 10 b
all.equal(v1,v2, check.attributes = F)
## [1] TRUE
do PCA
runPCA <- function(x){
set.seed(123)
rpca(t(assay(x)), retx=TRUE, k=4)$x
}
microbenchmark(v1 <- runPCA(scaRaw), v2 <- runPCA(scaRaw1), times = 5)
## Unit: milliseconds
## expr min lq mean median uq
## v1 <- runPCA(scaRaw) 739.7310 1303.967 1392.779 1373.621 1653.984
## v2 <- runPCA(scaRaw1) 752.2333 1262.987 1198.465 1280.809 1342.735
## max neval cld
## 1892.590 5 a
## 1353.563 5 a
all.equal(v1,v2)
## [1] TRUE
freq (col-wise stats)
microbenchmark(f1 <- freq(scaRaw), f2 <- freq(scaRaw1), times = 5)
## Unit: milliseconds
## expr min lq mean median uq max
## f1 <- freq(scaRaw) 317.6191 320.7211 340.9995 322.6214 324.4261 419.6097
## f2 <- freq(scaRaw1) 379.7953 422.0343 453.1739 455.8169 502.6885 505.5345
## neval cld
## 5 a
## 5 b
all.equal(f1,f2, check.attributes = F)
## [1] TRUE
Filtering by col and row
filterFunc <- function(scaRaw)
{
filterCrit <- with(colData(scaRaw), pastFastqc=="PASS"& exonRate >0.3 & PercentToHuman>0.6 & nGeneOn> 4000)
sca <- subset(scaRaw,filterCrit)
eid <- select(TxDb.Hsapiens.UCSC.hg19.knownGene,keys = mcols(sca)$entrez,keytype ="GENEID",columns = c("GENEID","TXNAME"))
ueid <- unique(na.omit(eid)$GENEID)
sca <- sca[mcols(sca)$entrez %in% ueid,]
}
sca <- filterFunc(scaRaw)
sca1 <- filterFunc(scaRaw1)
freq (on the subsetted data)
microbenchmark(f1 <- freq(sca), f2 <- freq(sca1), times = 1)
## Unit: milliseconds
## expr min lq mean median uq
## f1 <- freq(sca) 236.2341 236.2341 236.2341 236.2341 236.2341
## f2 <- freq(sca1) 6884.6609 6884.6609 6884.6609 6884.6609 6884.6609
## max neval
## 236.2341 1
## 6884.6609 1
all.equal(f1,f2, check.attributes = F)
## [1] TRUE
Remove invariant genes
set.seed(4)
ind <- sample(which(f1>0), 6000)
sca <- sca[ind,]
sca1 <- sca1[ind,]
Recalculating the cellular detection rate (ngeneson)
microbenchmark(cdr1 <- colSums(assay(sca)>0)
, cdr2 <- colSums(assay(sca1)>0), times = 1)
## Unit: milliseconds
## expr min lq mean
## cdr1 <- colSums(assay(sca) > 0) 4.954953 4.954953 4.954953
## cdr2 <- colSums(assay(sca1) > 0) 30914.832293 30914.832293 30914.832293
## median uq max neval
## 4.954953 4.954953 4.954953 1
## 30914.832293 30914.832293 30914.832293 1
all.equal(cdr1,cdr2, check.attributes = F)
## [1] TRUE
PCA on filtered cells
microbenchmark(v1 <- runPCA(scaRaw), v2 <- runPCA(scaRaw1), times = 1)
## Unit: milliseconds
## expr min lq mean median uq
## v1 <- runPCA(scaRaw) 1371.2300 1371.2300 1371.2300 1371.2300 1371.2300
## v2 <- runPCA(scaRaw1) 814.5999 814.5999 814.5999 814.5999 814.5999
## max neval
## 1371.2300 1
## 814.5999 1
all.equal(v1,v2)
## [1] TRUE
find out the bottleneck
Rprof()
f2 <- freq(sca1)
Rprof(NULL)
summaryRprof()
## $by.self
## self.time self.pct total.time total.pct
## ".Call" 32.50 99.57 32.50 99.57
## "apply" 0.04 0.12 32.62 99.94
## "FUN" 0.04 0.12 0.06 0.18
## "standardGeneric" 0.02 0.06 32.58 99.82
## "dev.list" 0.02 0.06 0.02 0.06
## "do.call" 0.02 0.06 0.02 0.06
##
## $by.total
## total.time total.pct self.time self.pct
## "block_exec" 32.64 100.00 0.00 0.00
## "call_block" 32.64 100.00 0.00 0.00
## "evaluate_call" 32.64 100.00 0.00 0.00
## "evaluate" 32.64 100.00 0.00 0.00
## "in_dir" 32.64 100.00 0.00 0.00
## "knitr::knit" 32.64 100.00 0.00 0.00
## "process_file" 32.64 100.00 0.00 0.00
## "process_group.block" 32.64 100.00 0.00 0.00
## "process_group" 32.64 100.00 0.00 0.00
## "rmarkdown::render" 32.64 100.00 0.00 0.00
## "withCallingHandlers" 32.64 100.00 0.00 0.00
## "apply" 32.62 99.94 0.04 0.12
## "eval" 32.62 99.94 0.00 0.00
## "freq" 32.62 99.94 0.00 0.00
## "handle" 32.62 99.94 0.00 0.00
## "timing_fn" 32.62 99.94 0.00 0.00
## "withVisible" 32.62 99.94 0.00 0.00
## "standardGeneric" 32.58 99.82 0.02 0.06
## ".from_DelayedArray_to_matrix" 32.52 99.63 0.00 0.00
## ".local" 32.52 99.63 0.00 0.00
## "as.array" 32.52 99.63 0.00 0.00
## "as.matrix.DelayedArray" 32.52 99.63 0.00 0.00
## "as.matrix" 32.52 99.63 0.00 0.00
## "h5read" 32.52 99.63 0.00 0.00
## "h5read2" 32.52 99.63 0.00 0.00
## "h5readDataset" 32.52 99.63 0.00 0.00
## "subset_seed_as_array" 32.52 99.63 0.00 0.00
## "suppressWarnings" 32.52 99.63 0.00 0.00
## ".Call" 32.50 99.57 32.50 99.57
## "doTryCatch" 32.50 99.57 0.00 0.00
## "try" 32.50 99.57 0.00 0.00
## "tryCatch" 32.50 99.57 0.00 0.00
## "tryCatchList" 32.50 99.57 0.00 0.00
## "tryCatchOne" 32.50 99.57 0.00 0.00
## "H5Sselect_index" 32.48 99.51 0.00 0.00
## "FUN" 0.06 0.18 0.04 0.12
## "dev.list" 0.02 0.06 0.02 0.06
## "do.call" 0.02 0.06 0.02 0.06
## "H5Dread" 0.02 0.06 0.00 0.00
## "handle_output" 0.02 0.06 0.00 0.00
## "plot_snapshot" 0.02 0.06 0.00 0.00
## "w$get_new" 0.02 0.06 0.00 0.00
##
## $sample.interval
## [1] 0.02
##
## $sampling.time
## [1] 32.64