Analysis
Load data/packages
# List libraries
libs <- c("Seurat", "ggplot2", "patchwork","SeuratDisk", "reshape2",
"tidyverse", "SingleCellExperiment", "viridis", "gplots", "scales",
"ggrepel", "gridExtra", "scCustomize", "matrixStats", "scran",
"scuttle", "scater", "DropletUtils", "scDblFinder", "scuttle", "BiocParallel") # list libraries here
# Require all of them
lapply(libs, require, character.only = T)
rm(libs)
dir <- "jensen/data/raw/single_cell/patterson/seurat/"
files <- paste0(dir, list.files(dir))
# Function to load data
.loadData <- function(file_path) {
sn <- LoadH5Seurat(file_path)
sn <- subset(sn, nCount_RNA > 0)
sn$UMI <- colnames(sn)
return(sn)
}
sn <- lapply(files, function(x){.loadData(x)})
Remove excess of empty droplets
I didn’t want to include all of the >700K droplets which had <5
counts, but I didn’t want to completely exclude them. I compromised
selecting for droplets with > 300 counts and then sampling 10K of the
rest. This left me with ~29K and ~37K droplets in each sample going
forward.
# Function to subset empty drops
.subsetEmptyDrops <- function(sn) {
empty.drops <- sn %>%
subset(nCount_RNA <= 300) # too many empty droplets to process
empty.subset <- sample(colnames(empty.drops), 10000)
keep.drops <- sn %>%
subset(nCount_RNA > 300 | UMI %in% empty.subset) # add in a fraction of the empty droplets
return(keep.drops)
}
sn <- lapply(sn, function(x){.subsetEmptyDrops(x)})
Find empty droplets
# make sce object for later functions
sce <- lapply(sn, function(x){as.SingleCellExperiment(x)})
# rank each cell by the # counts
bcrank <- lapply(sce, function(x){barcodeRanks(counts(x))})
# Function to plot each cell by the number of reads
# Looks for an inflection point
# We can assume that below the inflection point are empty droplets
# Weirdly there are two, so I went with the higher
.plotRank <- function(ranks){
uniq <- !duplicated(ranks$rank)
o <- order(ranks$rank)
ranks.funct <- data.frame(rank =ranks$rank[uniq],
total =ranks$total[uniq])
# plot the ranks with the counts, look for the knee and inflection points
plot <- ggplot(ranks.funct, aes(x = rank, y = total, color = total)) +
geom_point(size = 2) +
scale_color_viridis(option = "viridis", trans = "log") +
geom_hline(yintercept = 104, color = "darkgreen", linetype = 2) +
geom_hline(yintercept = 1010, color = "dodgerblue", linetype = 2) +
geom_hline(yintercept = 500, color = "darkorange", linetype = 2) +
labs(x = "Rank", y = "Total UMI count") +
theme_minimal() +
theme(text = element_text(size = 12),
legend.position = "none") +
scale_y_log10() +
annotate("text",
x = max(ranks.funct$rank)*0.8,
y = c(104, 1010, 500)*0.9,
label = c("Inflection", "Knee", "Limit"),
hjust = 0, vjust = 1, size = 5,
color = c("darkgreen", "dodgerblue", "darkorange"))
return(plot)
}
bcrank.plots <- lapply(bcrank, function(x){.plotRank(x)})
grid.arrange(bcrank.plots[[1]],bcrank.plots[[2]], ncol = 2)
# Test for empty droplets, assume those below the limit are empty
e.out <- lapply(sce, function(x){emptyDrops(x, test.ambient = T, lower = 500)})
Standard scRNAseq QC pipeline
The ambient RNA detection relies on clustering the empty droplets. It
works best with a proper processing pipeline, so that’s what we’re doing
here.
# limit to droplets lower than the false discovery threshold
.qualityControl <- function(sce.func, e.out.funct) {
sce.func <- sce.func[,which(e.out.funct$FDR <= 0.01)]
is.mito <- grep("^mt-", rownames(sce.func))
qc <- perCellQCMetrics(sce.func, subsets=list(MT=is.mito))
discard.mito <- isOutlier(qc$subsets_MT_percent, type="higher")
colData(sce.func) <- cbind(colData(sce.func), qc)
sce.func <- sce.func[,!discard.mito]
return(sce.func)
}
sce.1 <- mapply(.qualityControl, sce, e.out)
.qcClusterVariance <- function(sce.1.func) {
clusters <- quickCluster(sce.1.func)
sce.1.func <- computeSumFactors(sce.1.func, cluster=clusters)
sce.1.func <- logNormCounts(sce.1.func)
sce.dec <- modelGeneVarByPoisson(sce.1.func)
sce.top <- getTopHVGs(sce.dec, prop=0.1)
return(list(sce.1.func, sce.dec, sce.top))
}
sec.vars <- lapply(sce.1, function(x){.qcClusterVariance(x)})
# Function for QC dimensional reduction
.qcDimensionalReduction <- function(sce.func) {
set.seed(10000)
sce.func[[1]] <- denoisePCA(sce.func[[1]], subset.row=sce.func[[3]], technical=sce.func[[2]])
set.seed(100000)
sce.func[[1]] <- runTSNE(sce.func[[1]], dimred="PCA")
set.seed(1000000)
sce.func[[1]] <- runUMAP(sce.func[[1]], dimred="PCA")
return(sce.func[[1]])
}
sce.2 <- lapply(sec.vars, function(x){.qcDimensionalReduction(x)})
.qcClustering <- function(sce.func) {
g <- buildSNNGraph(sce.func, k=30)
clust <- igraph::cluster_walktrap(g)$membership
colLabels(sce.func) <- factor(clust)
return(sce.func)
}
sce.2 <- lapply(sce.2, function(x){.qcClustering(x)})
TSNE clusters of each dataset, still including ambient RNA
tnse.1 <- plotTSNE(sce.2[[1]], colour_by="label") +
ggtitle("B6_1")
tnse.2 <- plotTSNE(sce.2[[2]], colour_by="label") +
ggtitle("B6_2")
grid.arrange(tnse.1, tnse.2, ncol = 2)
Remove ambient RNA
.removeAmbience <- function(sce.func, e.out.func) {
amb <- metadata(e.out.func)$ambient[,1]
stripped <- sce.func[names(amb),]
out <- removeAmbience(counts(stripped), ambient=amb, groups=colLabels(stripped))
counts(stripped, withDimnames=FALSE) <- out
stripped <- logNormCounts(stripped)
return(stripped)
}
stripped <- mapply(.removeAmbience, sce.2, e.out)
Marker expression by cluster before and after ambient RNA
removal
I’ve included D830005E20Rik as it was that was the strongest
cardiomyocyte marker from our last pass at the deconvolution
analysis.
I think it’s an interesting candidate for RNAscope, as there is some
literature to support CM nuclear specificity. Cardiomyocyte-Specific
Long Noncoding RNA Regulates Alternative Splicing of the Triadin Gene in
the Heart by Zhao et al is from last year and is worth checking out.
They define it as Trdn-as and saw it highly and specifically
expressed in cardiomyocyte nuclei via smFISH.I’ll add a relevant figure
below
# Mesh them all together in one plot
plots.all <- grid.arrange(plots.1, plots.2, ncol = 2)
Doublet detection
# Find Doublets
.removeDoublets <- function(stripped.func, sec.top.func) {
dbl.dens <- computeDoubletDensity(stripped.func, d=ncol(reducedDim(stripped.func)), subset.row=sec.top.func)
stripped.func$DoubletScore <- dbl.dens
cut_off <- quantile(stripped.func$DoubletScore,0.95)
stripped.func$isDoublet <- c("no","yes")[factor(as.integer(stripped.func$DoubletScore >= cut_off), levels=c(0,1))]
return(stripped.func)}
# get top variable gene info
sce.top <- c(sec.vars[[1]][[3]], sec.vars[[2]][[3]])
# iterate throuigh datasets and gene sets in doublet detection function
stripped <- lapply(1:length(stripped), function(x){.removeDoublets(stripped[[x]], sec.vars[[x]][[3]])})
Doublet visuals
With the doublets, we can see that they localize together in
clusters. For now, they’ll stay in the dataset. I’ll include that data
in a downstream QC.
gridExtra::grid.arrange(doub.both.1, doub.both.2, ncol = 2)
# Convert to seurat
sn.clean.1 <- as.Seurat(stripped[[1]])
sn.clean.2 <- as.Seurat(stripped[[2]])
# Save as seurat
SaveH5Seurat(sn.clean.1, "jensen/results/doublets_ambient_rna/08152023/B6_1")
## Error: H5Seurat file at jensen/results/doublets_ambient_rna/08152023/B6_1.h5seurat already exists
SaveH5Seurat(sn.clean.2, "jensen/results/doublets_ambient_rna/08152023/B6_2")
## Error: H5Seurat file at jensen/results/doublets_ambient_rna/08152023/B6_2.h5seurat already exists
SaveH5Seurat(sn.clean.1, "jensen/data/processed/single_cell/no_doublets/08162023/rau_B6_1")
## Error: H5Seurat file at jensen/data/processed/single_cell/no_doublets/08162023/rau_B6_1.h5seurat already exists
SaveH5Seurat(sn.clean.2, "jensen/data/processed/single_cell/no_doublets/08162023/rau_B6_2")
## Error: H5Seurat file at jensen/data/processed/single_cell/no_doublets/08162023/rau_B6_2.h5seurat already exists
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux 8.7 (Ootpa)
##
## Matrix products: default
## BLAS/LAPACK: /nas/longleaf/rhel8/apps/r/4.2.1/lib/libopenblas_haswellp-r0.3.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] BiocParallel_1.32.6 scDblFinder_1.12.0
## [3] DropletUtils_1.18.1 scater_1.26.1
## [5] scran_1.24.1 scuttle_1.8.3
## [7] scCustomize_1.1.1 gridExtra_2.3
## [9] ggrepel_0.9.2 scales_1.2.1
## [11] gplots_3.1.3 viridis_0.6.2
## [13] viridisLite_0.4.2 SingleCellExperiment_1.20.1
## [15] SummarizedExperiment_1.28.0 Biobase_2.58.0
## [17] GenomicRanges_1.50.2 GenomeInfoDb_1.34.9
## [19] IRanges_2.32.0 S4Vectors_0.36.2
## [21] BiocGenerics_0.44.0 MatrixGenerics_1.10.0
## [23] matrixStats_1.0.0 lubridate_1.9.2
## [25] forcats_0.5.2 stringr_1.5.0
## [27] dplyr_1.1.2 purrr_1.0.1
## [29] readr_2.1.3 tidyr_1.3.0
## [31] tibble_3.2.1 tidyverse_2.0.0
## [33] reshape2_1.4.4 SeuratDisk_0.0.0.9020
## [35] patchwork_1.1.2 ggplot2_3.4.0
## [37] SeuratObject_4.1.3 Seurat_4.3.0
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.3 R.utils_2.12.2
## [3] spatstat.explore_3.0-5 reticulate_1.27
## [5] tidyselect_1.2.0 htmlwidgets_1.6.1
## [7] grid_4.2.1 Rtsne_0.16
## [9] ScaledMatrix_1.6.0 munsell_0.5.0
## [11] codetools_0.2-19 ica_1.0-3
## [13] xgboost_1.7.3.1 statmod_1.5.0
## [15] future_1.32.0 miniUI_0.1.1.1
## [17] withr_2.5.0 spatstat.random_3.0-1
## [19] colorspace_2.1-0 progressr_0.13.0
## [21] highr_0.10 knitr_1.42
## [23] rstudioapi_0.14 ROCR_1.0-11
## [25] tensor_1.5 listenv_0.9.0
## [27] labeling_0.4.2 GenomeInfoDbData_1.2.9
## [29] polyclip_1.10-4 farver_2.1.1
## [31] bit64_4.0.5 rhdf5_2.42.1
## [33] parallelly_1.36.0 vctrs_0.6.2
## [35] generics_0.1.3 xfun_0.39
## [37] timechange_0.2.0 R6_2.5.1
## [39] ggbeeswarm_0.7.1 rsvd_1.0.5
## [41] locfit_1.5-9.7 hdf5r_1.3.8
## [43] rhdf5filters_1.10.1 bitops_1.0-7
## [45] spatstat.utils_3.0-3 cachem_1.0.8
## [47] DelayedArray_0.24.0 BiocIO_1.8.0
## [49] promises_1.2.0.1 beeswarm_0.4.0
## [51] gtable_0.3.3 beachmat_2.14.2
## [53] globals_0.16.2 goftest_1.2-3
## [55] rlang_1.1.1 GlobalOptions_0.1.2
## [57] splines_4.2.1 rtracklayer_1.58.0
## [59] lazyeval_0.2.2 spatstat.geom_3.0-3
## [61] yaml_2.3.7 abind_1.4-5
## [63] httpuv_1.6.7 tools_4.2.1
## [65] ellipsis_0.3.2 jquerylib_0.1.4
## [67] RColorBrewer_1.1-3 ggridges_0.5.4
## [69] Rcpp_1.0.10 plyr_1.8.8
## [71] sparseMatrixStats_1.10.0 zlibbioc_1.44.0
## [73] RCurl_1.98-1.12 deldir_1.0-9
## [75] pbapply_1.7-0 cowplot_1.1.1
## [77] zoo_1.8-12 cluster_2.1.4
## [79] magrittr_2.0.3 data.table_1.14.6
## [81] scattermore_1.1 circlize_0.4.15
## [83] lmtest_0.9-40 RANN_2.6.1
## [85] fitdistrplus_1.1-11 hms_1.1.2
## [87] mime_0.12 evaluate_0.21
## [89] xtable_1.8-4 XML_3.99-0.14
## [91] shape_1.4.6 compiler_4.2.1
## [93] KernSmooth_2.23-20 crayon_1.5.2
## [95] R.oo_1.25.0 htmltools_0.5.5
## [97] later_1.3.1 tzdb_0.4.0
## [99] ggprism_1.0.4 DBI_1.1.3
## [101] MASS_7.3-60 Matrix_1.5-3
## [103] cli_3.5.0 R.methodsS3_1.8.2
## [105] metapod_1.6.0 parallel_4.2.1
## [107] igraph_1.4.2 pkgconfig_2.0.3
## [109] GenomicAlignments_1.34.0 sp_1.6-0
## [111] plotly_4.10.2 spatstat.sparse_3.0-1
## [113] paletteer_1.5.0 vipor_0.4.5
## [115] bslib_0.4.2 dqrng_0.3.0
## [117] XVector_0.38.0 snakecase_0.11.0
## [119] digest_0.6.31 sctransform_0.3.5
## [121] RcppAnnoy_0.0.20 janitor_2.2.0
## [123] Biostrings_2.66.0 spatstat.data_3.0-1
## [125] rmarkdown_2.22 leiden_0.4.3
## [127] edgeR_3.40.2 uwot_0.1.14
## [129] DelayedMatrixStats_1.20.0 restfulr_0.0.15
## [131] Rsamtools_2.14.0 shiny_1.7.4
## [133] gtools_3.9.4 rjson_0.2.21
## [135] lifecycle_1.0.3 nlme_3.1-161
## [137] jsonlite_1.8.4 Rhdf5lib_1.20.0
## [139] BiocNeighbors_1.16.0 limma_3.54.2
## [141] fansi_1.0.4 pillar_1.9.0
## [143] lattice_0.20-45 ggrastr_1.0.1
## [145] fastmap_1.1.1 httr_1.4.6
## [147] survival_3.5-5 glue_1.6.2
## [149] png_0.1-8 bluster_1.6.0
## [151] bit_4.0.5 HDF5Array_1.26.0
## [153] stringi_1.7.12 sass_0.4.6
## [155] rematch2_2.1.2 BiocSingular_1.14.0
## [157] caTools_1.18.2 irlba_2.3.5.1
## [159] future.apply_1.10.0