The purpose of this script is to identify doublets in single-cell RNA-seq data. Doublets are a type of artifact that arises when two or more cells are captured by a single reaction volume and sequenced as a single cell. This can complicate interpretation of downstream analyses to determine cell identity and heterogeneity.
This script has been adapted from the DoubletFinder documentation (https://github.com/chris-mcginnis-ucsf/DoubletFinder) and publicly available tutorials (https://rpubs.com/kenneditodd/doublet_finder_example, https://biostatsquid.com/doubletfinder-tutorial/).
library(Seurat)
library(tidyverse)
library(DoubletFinder) # remotes::install_github('chris-mcginnis-ucsf/DoubletFinder')
library(ggplot2)
library(patchwork)
setwd("C:/Users/edlarsen/Documents/240828_scRNAseq")
Input: A Seurat object that has already been filtered to remove low quality cells.
filtered_ln <- readRDS(file = "C:/Users/edlarsen/Documents/240828_scRNAseq/Integration/seurat_merged_LN_filtered.RData")
filtered_thym <- readRDS(file = "C:/Users/edlarsen/Documents/240828_scRNAseq/Integration/seurat_merged_THYM_filtered.RData")
DoubletFinder should not be applied to aggregated scRNA-seq representing multiple distinct samples (e.g., multiple 10X lanes), so the Seurat object will be split based on the sample ID.
# returns a list of n Suerat objects, 1 per sample.
ln_split <- SplitObject(filtered_ln, split.by = "orig.ident")
thym_split <- SplitObject(filtered_thym, split.by = "orig.ident")
# Functions ===================================================================
#----------------------------------------------------------#
# run_doubletfinder_custom
#----------------------------------------------------------#
# run_doubletfinder_custom runs Doublet_Finder() and returns a dataframe with the cell IDs and a column with either 'Singlet' or 'Doublet'
run_doubletfinder_custom <- function(seu_sample_subset, multiplet_rate = NULL){
# for debug
#seu_sample_subset <- samp_split[[1]]
# Print sample number
print(paste0("Sample ", unique(seu_sample_subset[['orig.ident']]), '...........'))
# DoubletFinder needs the multiplet rate of your sample (i.e., the expected proportion of doublets). If you don't know the multiplet rate for your experiment, 10X published a list of expected multiplet rates for different loaded and recovered cells. If a multiplet rate is not provided, this function will automatically determine how many doublets to expect for a given number of cells in a sample.
if(is.null(multiplet_rate)){
print('multiplet_rate not provided....... estimating multiplet rate from cells in dataset')
# 10X multiplet rates table
#https://rpubs.com/kenneditodd/doublet_finder_example
multiplet_rates_10x <- data.frame('Multiplet_rate'= c(0.004, 0.008, 0.0160, 0.023, 0.031, 0.039, 0.046, 0.054, 0.061, 0.069, 0.076),
'Loaded_cells' = c(800, 1600, 3200, 4800, 6400, 8000, 9600, 11200, 12800, 14400, 16000),
'Recovered_cells' = c(500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000))
print(multiplet_rates_10x)
multiplet_rate <- multiplet_rates_10x %>% dplyr::filter(Recovered_cells < nrow(seu_sample_subset@meta.data)) %>%
dplyr::slice(which.max(Recovered_cells)) %>% # select the min threshold depending on your number of samples
dplyr::select(Multiplet_rate) %>% as.numeric(as.character()) # get the expected multiplet rate for that number of recovered cells
print(paste('Setting multiplet rate to', multiplet_rate))
}
# Pre-process seurat object with standard seurat workflow ---
sample <- NormalizeData(seu_sample_subset)
sample <- FindVariableFeatures(sample)
sample <- ScaleData(sample)
sample <- RunPCA(sample, nfeatures.print = 10)
# Find significant PCs
stdv <- sample[["pca"]]@stdev
percent_stdv <- (stdv/sum(stdv)) * 100
cumulative <- cumsum(percent_stdv)
co1 <- which(cumulative > 90 & percent_stdv < 5)[1]
co2 <- sort(which((percent_stdv[1:length(percent_stdv) - 1] -
percent_stdv[2:length(percent_stdv)]) > 0.1),
decreasing = T)[1] + 1
min_pc <- min(co1, co2)
# Finish pre-processing with min_pc
sample <- RunUMAP(sample, dims = 1:min_pc)
sample <- FindNeighbors(object = sample, dims = 1:min_pc)
sample <- FindClusters(object = sample, resolution = 0.1)
# pK identification (no ground-truth)
#introduces artificial doublets in varying props, merges with real data set and
# preprocesses the data + calculates the prop of artficial neighrest neighbours,
# provides a list of the proportion of artificial nearest neighbours for varying
# combinations of the pN and pK
sweep_list <- paramSweep(sample, PCs = 1:min_pc, sct = FALSE)
sweep_stats <- summarizeSweep(sweep_list)
bcmvn <- find.pK(sweep_stats) # computes a metric to find the optimal pK value (max mean variance normalised by modality coefficient)
# Optimal pK is the max of the bimodality coefficient (BCmvn) distribution
optimal.pk <- bcmvn %>%
dplyr::filter(BCmetric == max(BCmetric)) %>%
dplyr::select(pK)
optimal.pk <- as.numeric(as.character(optimal.pk[[1]]))
## Homotypic doublet proportion estimate
annotations <- sample@meta.data$seurat_clusters # use the clusters as the user-defined cell types
homotypic.prop <- modelHomotypic(annotations) # get proportions of homotypic doublets
nExp.poi <- round(multiplet_rate * nrow(sample@meta.data)) # multiply by number of cells to get the number of expected multiplets
nExp.poi.adj <- round(nExp.poi * (1 - homotypic.prop)) # expected number of doublets
# run DoubletFinder
sample <- doubletFinder(seu = sample,
PCs = 1:min_pc,
pK = optimal.pk, # the neighborhood size used to compute the number of artificial nearest neighbours
nExp = nExp.poi.adj) # number of expected real doublets
# change name of metadata column with Singlet/Doublet information
colnames(sample@meta.data)[grepl('DF.classifications.*', colnames(sample@meta.data))] <- "doublet_finder"
# Subset and save
# head(sample@meta.data['doublet_finder'])
# singlets <- subset(sample, doublet_finder == "Singlet") # extract only singlets
# singlets$ident
double_finder_res <- sample@meta.data['doublet_finder'] # get the metadata column with singlet, doublet info
double_finder_res <- rownames_to_column(double_finder_res, "row_names") # add the cell IDs as new column to be able to merge correctly
return(double_finder_res)
}
ln_split <- lapply(ln_split, run_doubletfinder_custom)
thym_split <- lapply(thym_split, run_doubletfinder_custom)
# Lymph node
filtered_ln <- NormalizeData(filtered_ln, normalization.method = "LogNormalize", scale.factor = 10000)
filtered_ln <- FindVariableFeatures(filtered_ln)
filtered_ln <- ScaleData(filtered_ln)
filtered_ln <- RunPCA(filtered_ln)
filtered_ln <- RunUMAP(filtered_ln, dims = 1:10)
sglt_dblt_metadata_LN <- data.frame(bind_rows(ln_split)) # merge to a single dataframe
rownames(sglt_dblt_metadata_LN) <- sglt_dblt_metadata_LN$row_names # assign cell IDs to row names to ensure match
sglt_dblt_metadata_LN$row_names <- NULL
filtered_ln <- AddMetaData(filtered_ln, sglt_dblt_metadata_LN, col.name = "doublet_finder")
# save (optional)
## the DoubletFinder function takes a long time to run, so it may be helpful to save this object to avoid having to repeat these steps in the future
#saveRDS(filtered_ln, file="merged_filtered_LN_withDoubletFinderMetadata.RData")
# Thymus
filtered_thym <- NormalizeData(filtered_thym, normalization.method = "LogNormalize", scale.factor = 10000)
filtered_thym <- FindVariableFeatures(filtered_thym)
filtered_thym <- ScaleData(filtered_thym)
filtered_thym <- RunPCA(filtered_thym)
filtered_thym <- RunUMAP(filtered_thym, dims = 1:10)
sglt_dblt_metadata_THYM <- data.frame(bind_rows(thym_split)) # merge to a single dataframe
rownames(sglt_dblt_metadata_THYM) <- sglt_dblt_metadata_THYM$row_names # assign cell IDs to row names to ensure match
sglt_dblt_metadata_THYM$row_names <- NULL
filtered_thym <- AddMetaData(filtered_thym, sglt_dblt_metadata_THYM, col.name = "doublet_finder")
# save (optional)
#saveRDS(filtered_ln, file="merged_filtered_THYM_withDoubletFinderMetadata.RData")
# Check how doublets singlets differ in QC measures per sample.
VlnPlot(filtered_ln, group.by = 'orig.ident', split.by = "doublet_finder",
features = c("nFeature_RNA", "nCount_RNA"),
ncol = 3, pt.size = 0) + theme(legend.position = 'right')
VlnPlot(filtered_thym,
group.by = "orig.ident",
split.by = "doublet_finder",
features = c("nFeature_RNA", "nCount_RNA", "percent_mt"),
ncol = 3,
pt.size = 0 +
theme(legend.position = 'right', plot.title = "Thymus")
)
DimPlot(filtered_ln,
group.by = "doublet_finder",
reduction = "umap",
pt.size=0.5) + ggtitle("DoubletFinder: Canine Lymph Node")
DimPlot(filtered_thym,
group.by = "doublet_finder",
reduction = "umap",
pt.size=0.5)
Ensure that predicted doublets are not cycling cells.
Each cell will be assigned a score based on its expression of G2/M and S phase markers. The CellCycleStoring() function stores S and G2/M scores in object metadata, along with the predicted classification of each cell in either G2/M, S, or G1 phase.
# segregate G2/M phase and S phase markers from Seurat's built-in list of cell cycle markers
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes
# assign each cell a score based on expression of G2/M and S phase markers
filtered_ln <- CellCycleScoring(filtered_ln,
s.features = s.genes,
g2m.features = g2m.genes,
set.ident = TRUE)
filtered_thym <- CellCycleScoring(filtered_thym,
s.features = s.genes,
g2m.features = g2m.genes,
set.ident = TRUE)
filtered_ln <- RunPCA(filtered_ln, features = c(s.genes, g2m.genes))
DimPlot(filtered_ln,
group.by = "Phase",
reduction = "umap",
pt.size=0.5) +
ggtitle("Cell Cycle Scoring: Canine Lymph Node")
filtered_thym <- RunPCA(filtered_thym, features = c(s.genes, g2m.genes))
DimPlot(filtered_thym,
group.by = "Phase",
reduction = "umap",
pt.size=0.5) +
ggtitle("Cell Cycle Scoring: Canine Thymus")
If the doublets were not limited to cycling cells and you wish to remove doublets from your Seurat object before downstream analysis, set this code chunk to EVAL=TRUE.
filtered_ln <- subset(filtered_ln, doublet_finder == "Singlet")
filtered_thym <- subset(filtered_thym, doublet_finder == "Singlet")
saveRDS(filtered_ln, file="merged_filtered_singlet_LN.RData")
saveRDS(filtered_ln, file="merged_filtered_singlet_THYM.RData")
Next step: Sample integration.
sessionInfo()
## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/Denver
## tzcode source: internal
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ROCR_1.0-11 KernSmooth_2.23-22 fields_16.3
## [4] viridisLite_0.4.2 spam_2.11-0 patchwork_1.3.0
## [7] DoubletFinder_2.0.4 lubridate_1.9.4 forcats_1.0.0
## [10] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2
## [13] readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
## [16] ggplot2_3.5.1 tidyverse_2.0.0 Seurat_5.1.0
## [19] SeuratObject_5.0.2 sp_2.1-4
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.17.1 jsonlite_1.8.9
## [4] magrittr_2.0.3 ggbeeswarm_0.7.2 spatstat.utils_3.1-1
## [7] farver_2.1.2 rmarkdown_2.29 vctrs_0.6.5
## [10] spatstat.explore_3.3-3 htmltools_0.5.8.1 sass_0.4.9
## [13] sctransform_0.4.1 parallelly_1.40.1 bslib_0.8.0
## [16] htmlwidgets_1.6.4 ica_1.0-3 plyr_1.8.9
## [19] plotly_4.10.4 zoo_1.8-12 cachem_1.1.0
## [22] igraph_2.1.2 mime_0.12 lifecycle_1.0.4
## [25] pkgconfig_2.0.3 Matrix_1.7-0 R6_2.5.1
## [28] fastmap_1.2.0 fitdistrplus_1.2-2 future_1.34.0
## [31] shiny_1.10.0 digest_0.6.35 colorspace_2.1-1
## [34] tensor_1.5 RSpectra_0.16-2 irlba_2.3.5.1
## [37] labeling_0.4.3 progressr_0.15.1 spatstat.sparse_3.1-0
## [40] timechange_0.3.0 httr_1.4.7 polyclip_1.10-7
## [43] abind_1.4-8 compiler_4.4.0 withr_3.0.2
## [46] fastDummies_1.7.4 maps_3.4.2.1 MASS_7.3-60.2
## [49] tools_4.4.0 vipor_0.4.7 lmtest_0.9-40
## [52] beeswarm_0.4.0 httpuv_1.6.15 future.apply_1.11.3
## [55] goftest_1.2-3 glue_1.7.0 nlme_3.1-164
## [58] promises_1.3.2 grid_4.4.0 Rtsne_0.17
## [61] cluster_2.1.6 reshape2_1.4.4 generics_0.1.3
## [64] gtable_0.3.6 spatstat.data_3.1-4 tzdb_0.4.0
## [67] data.table_1.16.4 hms_1.1.3 spatstat.geom_3.3-4
## [70] RcppAnnoy_0.0.22 ggrepel_0.9.6 RANN_2.6.2
## [73] pillar_1.10.1 RcppHNSW_0.6.0 later_1.3.2
## [76] splines_4.4.0 lattice_0.22-6 survival_3.5-8
## [79] deldir_2.0-4 tidyselect_1.2.1 miniUI_0.1.1.1
## [82] pbapply_1.7-2 knitr_1.49 gridExtra_2.3
## [85] scattermore_1.2 xfun_0.49 matrixStats_1.4.1
## [88] stringi_1.8.4 lazyeval_0.2.2 yaml_2.3.10
## [91] evaluate_1.0.3 codetools_0.2-20 cli_3.6.2
## [94] uwot_0.2.2 xtable_1.8-4 reticulate_1.40.0
## [97] munsell_0.5.1 jquerylib_0.1.4 Rcpp_1.0.13
## [100] globals_0.16.3 spatstat.random_3.3-2 png_0.1-8
## [103] ggrastr_1.0.2 spatstat.univar_3.1-1 dotCall64_1.2
## [106] listenv_0.9.1 scales_1.3.0 ggridges_0.5.6
## [109] leiden_0.4.3.1 rlang_1.1.3 cowplot_1.1.3
citation()
## To cite R in publications use:
##
## R Core Team (2024). _R: A Language and Environment for Statistical
## Computing_. R Foundation for Statistical Computing, Vienna, Austria.
## <https://www.R-project.org/>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {R: A Language and Environment for Statistical Computing},
## author = {{R Core Team}},
## organization = {R Foundation for Statistical Computing},
## address = {Vienna, Austria},
## year = {2024},
## url = {https://www.R-project.org/},
## }
##
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.