created: 20.04.2022
updated: 21.04.2022
scRNA data analysis of peripheral blood mononuclear cell (PBMC) dataset available from the 10x Genomics website (https://support.10xgenomics.com/single-cell-vdj/datasets/3.0.0/vdj_v1_mm_c57bl6_pbmc_5gex) was performed with the tutorial available from Bioconductor (http://bioconductor.org/books/3.13/OSCA.workflows/human-pbmc-with-surface-proteins-10x-genomics.html).
PBMC dataset consisting of filtered gene/barcode count matrices for gene expression and cell surface proteins was loaded in. Packages used for data loading included BiocFileCache and DropletUtils.
#BiocManager::install("BiocFileCache")
library(BiocFileCache)
#bring in data set from the website
bfc<-BiocFileCache(ask=FALSE)
exprs.data <- bfcrpath(bfc, file.path(
"http://cf.10xgenomics.com/samples/cell-vdj/3.1.0",
"vdj_v1_hs_pbmc3",
"vdj_v1_hs_pbmc3_filtered_feature_bc_matrix.tar.gz"))
untar(exprs.data, exdir=tempdir())
#BiocManager::install("DropletUtils")
library(DropletUtils)
sce.pbmc <- read10xCounts(file.path(tempdir(), "filtered_feature_bc_matrix"))
sce.pbmc <- splitAltExps(sce.pbmc, rowData(sce.pbmc)$Type)
Quality control was performed to discard cells with high mitochondrial proportions and low detectable antibody-derived tags (ADT). Package used for quality control included scater.
#quality control
unfiltered <- sce.pbmc
#BiocManager::install("scater")
library(scater)
#grep filter
is.mito <- grep("^MT-", rowData(sce.pbmc)$Symbol)
stats <- perCellQCMetrics(sce.pbmc, subsets=list(Mito=is.mito))
high.mito <- isOutlier(stats$subsets_Mito_percent, type="higher")
low.adt <- stats$`altexps_Antibody Capture_detected` < nrow(altExp(sce.pbmc))/2
#discard for all that are TRUE
discard <- high.mito | low.adt
#save back into the dataset, remove by columns
sce.pbmc <- sce.pbmc[!discard, ]
Summary of high mitochondrial proportions and low detectable ADT counts. Most change was observed in high mitochondrial analysis.
summary(high.mito)
## Mode FALSE TRUE
## logical 6660 571
summary(low.adt)
## Mode FALSE
## logical 7231
summary(discard)
## Mode FALSE TRUE
## logical 6660 571
#Thus, looks like high mito was most affected
Package used for aalysis included gridExtra.
#ctrl alt i - create new chunk shortcut
#examine distribution of each qc metric
colData(unfiltered) <- cbind(colData(unfiltered), stats)
unfiltered$discard <- discard
#install.packages("gridExtra")
#We examine the distribution of each QC metric
gridExtra::grid.arrange(
plotColData(unfiltered, y="sum", colour_by="discard") +
scale_y_log10() + ggtitle("Total count"),
plotColData(unfiltered, y="detected", colour_by="discard") +
scale_y_log10() + ggtitle("Detected features"),
plotColData(unfiltered, y="subsets_Mito_percent",
colour_by="discard") + ggtitle("Mito percent"),
plotColData(unfiltered, y="altexps_Antibody Capture_detected",
colour_by="discard") + ggtitle("ADT detected"),
ncol=2)
Figure 1. The distribution of each QC metric in the 10X Genomics PBMC dataset. Each point represents a cell and the colors align with True and False defining whether the cell was discarded (True) or retained (False) by the outlier-based QC approach.
plotColData(unfiltered, x="sum", y="subsets_Mito_percent",
colour_by="discard") + scale_x_log10()
#BiocManager::install("scran")
library(scran)
Figure 2. The percentage of unique molecular identifiers (UMIs) mapped to mitochondrial genes against the total count (sum) of each cell.
Data normalization was performed to compute size factors for the gene expression and ADT counts. Set seed encourages data reproducibility. Package used included scran.
set.seed(1000)
clusters <- quickCluster(sce.pbmc)
sce.pbmc <- computeSumFactors(sce.pbmc, cluster=clusters)
altExp(sce.pbmc) <- computeMedianFactors(altExp(sce.pbmc))
sce.pbmc <- applySCE(sce.pbmc, logNormCounts)
Summary statistics for both sets of size factors were generated for observation.
summary(sizeFactors(sce.pbmc))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.002861 0.702281 0.920273 1.000000 1.150700 9.472657
summary(sizeFactors(altExp(sce.pbmc)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.09012 0.64831 0.75982 1.00000 0.96099 236.42976
par(mfrow=c(1,2))
plot(librarySizeFactors(sce.pbmc), sizeFactors(sce.pbmc), pch=16,
xlab="Library size factors", ylab="Deconvolution factors",
main="Gene expression", log="xy")
plot(librarySizeFactors(altExp(sce.pbmc)), sizeFactors(altExp(sce.pbmc)), pch=16,
xlab="Library size factors", ylab="Median-based factors",
main="Antibody capture", log="xy")
Figure 3. Plot of the deconvolution size factors for the gene expression values (left) or the median-based size factors for the ADT expression values (right) compared to the library size-derived factors for the corresponding set of features. Each point represents a cell.
#dimensionality reduction off
par(mfrow=c(1,1))
The low-dimensionality rendered ommitting of the PCA step for the ADT expression matrix thus, the next steps included t-SNE and UMAP visualizations. Package used for dimensionality reduction UMAP visualisation included uwot.
set.seed(100000)
altExp(sce.pbmc) <- runTSNE(altExp(sce.pbmc))
#install.packages("uwot")
set.seed(1000000)
altExp(sce.pbmc) <- runUMAP(altExp(sce.pbmc))
Graph-based clustering was performed on the ADT data using the assignments as the column labels of the alternative Experiment. Package used for clustring included iGraph.
g.adt <- buildSNNGraph(altExp(sce.pbmc), k=10, d=NA)
clust.adt <- igraph::cluster_walktrap(g.adt)$membership
colLabels(altExp(sce.pbmc)) <- factor(clust.adt)
Packages used for analysis and observations included bluster and pheatmap.
#Table
table(colLabels(altExp(sce.pbmc)))
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 204 505 669 938 680 51 1410 33 78 1131 148 154 74 627 15 48
## 17 18 19 20 21 22 23 24
## 144 26 45 87 32 65 50 17
#BiocManager::install("bluster")
library(bluster)
mod <- pairwiseModularity(g.adt, clust.adt, as.ratio=TRUE)
#install.packages("pheatmap")
library(pheatmap)
pheatmap::pheatmap(log10(mod + 10), cluster_row=FALSE, cluster_col=FALSE,
color=colorRampPalette(c("white", "blue"))(101))
Figure 4. Heatmap of the pairwise cluster modularity scores in the PBMC dataset, computed based on the shared nearest neighbor graph derived from the ADT expression values.
scater::plotTSNE(altExp(sce.pbmc), colour_by="label", text_by="label", text_col="red")
Figure 5. The obligatory t-SNE plot of PBMC dataset based on its own ADT expression values, where each point is a cell and colored by the cluster of origin. Cluster labels are further overlaid at the median coordinates across all cells in the cluster. Image extends a flattened view for observation and a 3-dimensional observation approach should be considered when interpreting the plot.
Packages used included scran quickSubCluster.
#BiocManager::install("quickSubCluster")
set.seed(1010010)
subclusters <- scran::quickSubCluster(sce.pbmc, clust.adt,
prepFUN=function(x) {
dec <- scran::modelGeneVarByPoisson(x)
top <- scran::getTopHVGs(dec, prop=0.1)
scran::denoisePCA(x, dec, subset.row=top)
},
clusterFUN=function(x) {
g.gene <- scran::buildSNNGraph(x, k=10, use.dimred = 'PCA')
igraph::cluster_walktrap(g.gene)$membership
}
)
data.frame(
Cluster=names(subclusters),
Ncells=vapply(subclusters, ncol, 0L),
Nsub=vapply(subclusters, function(x) length(unique(x$subcluster)), 0L)
)
## Cluster Ncells Nsub
## 1 1 204 3
## 2 2 505 5
## 3 3 669 7
## 4 4 938 5
## 5 5 680 9
## 6 6 51 3
## 7 7 1410 8
## 8 8 33 1
## 9 9 78 2
## 10 10 1131 10
## 11 11 148 4
## 12 12 154 6
## 13 13 74 2
## 14 14 627 7
## 15 15 15 1
## 16 16 48 1
## 17 17 144 3
## 18 18 26 1
## 19 19 45 1
## 20 20 87 2
## 21 21 32 1
## 22 22 65 3
## 23 23 50 2
## 24 24 17 1
Amezquita, R., Lun, A., Hicks, S., & Gottardo, R. (2021). Single-cell analysis workflows with bioconductor. Chapter 4 Human PBMC with surface proteins (10X Genomics). https://github.com/LTLA/OSCA.workflows
Zheng, G. X. Y., Terry, J. M., Belgrader, P., Ryvkin, P., Bent, Z. W., Wilson, R., Ziraldo, S. B., Wheeler, T. D., McDermott, G. P., Zhu, J., Gregory, M. T., Shuga, J., Montesclaros, L., Underwood, J. G., Masquelier, D. A., Nishimura, S. Y., Schnall-Levin, M., Wyatt, P. W., Hindson, C. M., . . . Bielas, J. H. (2017). Massively parallel digital transcriptional profiling of single cells. Nature Communications, 8(1), 14049. https://doi.org/10.1038/ncomms14049
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_Australia.1252 LC_CTYPE=English_Australia.1252
## [3] LC_MONETARY=English_Australia.1252 LC_NUMERIC=C
## [5] LC_TIME=English_Australia.1252
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] pheatmap_1.0.12 bluster_1.2.1
## [3] scran_1.20.1 scater_1.20.1
## [5] ggplot2_3.3.5 scuttle_1.2.1
## [7] DropletUtils_1.12.3 SingleCellExperiment_1.14.1
## [9] SummarizedExperiment_1.22.0 Biobase_2.52.0
## [11] GenomicRanges_1.44.0 GenomeInfoDb_1.28.4
## [13] IRanges_2.26.0 S4Vectors_0.30.2
## [15] BiocGenerics_0.38.0 MatrixGenerics_1.4.3
## [17] matrixStats_0.61.0 BiocFileCache_2.0.0
## [19] dbplyr_2.1.1
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.15 ggbeeswarm_0.6.0
## [3] colorspace_2.0-3 ellipsis_0.3.2
## [5] XVector_0.32.0 BiocNeighbors_1.10.0
## [7] rstudioapi_0.13 farver_2.1.0
## [9] bit64_4.0.5 RSpectra_0.16-0
## [11] fansi_1.0.3 codetools_0.2-18
## [13] R.methodsS3_1.8.1 sparseMatrixStats_1.4.2
## [15] cachem_1.0.6 knitr_1.38
## [17] jsonlite_1.8.0 cluster_2.1.2
## [19] R.oo_1.24.0 uwot_0.1.11
## [21] HDF5Array_1.20.0 compiler_4.1.2
## [23] httr_1.4.2 dqrng_0.3.0
## [25] assertthat_0.2.1 Matrix_1.3-4
## [27] fastmap_1.1.0 limma_3.48.3
## [29] cli_3.2.0 BiocSingular_1.8.1
## [31] htmltools_0.5.2 tools_4.1.2
## [33] rsvd_1.0.5 igraph_1.3.0
## [35] gtable_0.3.0 glue_1.6.2
## [37] GenomeInfoDbData_1.2.6 dplyr_1.0.8
## [39] rappdirs_0.3.3 Rcpp_1.0.8.3
## [41] jquerylib_0.1.4 vctrs_0.4.1
## [43] rhdf5filters_1.4.0 DelayedMatrixStats_1.14.3
## [45] xfun_0.30 stringr_1.4.0
## [47] beachmat_2.8.1 lifecycle_1.0.1
## [49] irlba_2.3.5 statmod_1.4.36
## [51] edgeR_3.34.1 zlibbioc_1.38.0
## [53] scales_1.2.0 rhdf5_2.36.0
## [55] RColorBrewer_1.1-3 yaml_2.3.5
## [57] curl_4.3.2 memoise_2.0.1
## [59] gridExtra_2.3 sass_0.4.1
## [61] stringi_1.7.6 RSQLite_2.2.12
## [63] highr_0.9 ScaledMatrix_1.0.0
## [65] filelock_1.0.2 BiocParallel_1.26.2
## [67] rlang_1.0.2 pkgconfig_2.0.3
## [69] bitops_1.0-7 evaluate_0.15
## [71] lattice_0.20-45 purrr_0.3.4
## [73] Rhdf5lib_1.14.2 labeling_0.4.2
## [75] cowplot_1.1.1 bit_4.0.4
## [77] tidyselect_1.1.2 RcppAnnoy_0.0.19
## [79] magrittr_2.0.3 R6_2.5.1
## [81] generics_0.1.2 metapod_1.0.0
## [83] DelayedArray_0.18.0 DBI_1.1.2
## [85] pillar_1.7.0 withr_2.5.0
## [87] RCurl_1.98-1.6 tibble_3.1.6
## [89] crayon_1.5.1 utf8_1.2.2
## [91] rmarkdown_2.13 viridis_0.6.2
## [93] locfit_1.5-9.5 grid_4.1.2
## [95] blob_1.2.3 digest_0.6.29
## [97] R.utils_2.11.0 munsell_0.5.0
## [99] beeswarm_0.4.0 viridisLite_0.4.0
## [101] vipor_0.4.5 bslib_0.3.1