Introduction

The purpose of this script is to identify doublets in single-cell RNA-seq data. Doublets are a type of artifact that arises when two or more cells are captured by a single reaction volume and sequenced as a single cell. This can complicate interpretation of downstream analyses to determine cell identity and heterogeneity.

This script has been adapted from the DoubletFinder documentation (https://github.com/chris-mcginnis-ucsf/DoubletFinder) and publicly available tutorials (https://rpubs.com/kenneditodd/doublet_finder_example, https://biostatsquid.com/doubletfinder-tutorial/).

Data and package loading

Load necessary software packages

library(Seurat)
library(tidyverse)
library(DoubletFinder) # remotes::install_github('chris-mcginnis-ucsf/DoubletFinder')
library(ggplot2)
library(patchwork)

Set working directory

setwd("C:/Users/edlarsen/Documents/240828_scRNAseq")

Load single-cell RNA-seq count data

Input: A Seurat object that has already been filtered to remove low quality cells.

filtered_ln <- readRDS(file = "C:/Users/edlarsen/Documents/240828_scRNAseq/Integration/seurat_merged_LN_filtered.RData")
filtered_thym <- readRDS(file = "C:/Users/edlarsen/Documents/240828_scRNAseq/Integration/seurat_merged_THYM_filtered.RData")

DoubletFinder should not be applied to aggregated scRNA-seq representing multiple distinct samples (e.g., multiple 10X lanes), so the Seurat object will be split based on the sample ID.

# returns a list of n Suerat objects, 1 per sample.
ln_split <- SplitObject(filtered_ln, split.by = "orig.ident")
thym_split <- SplitObject(filtered_thym, split.by = "orig.ident")

DoubletFinder function

# Functions ===================================================================
#----------------------------------------------------------#
# run_doubletfinder_custom
#----------------------------------------------------------#
# run_doubletfinder_custom runs Doublet_Finder() and returns a dataframe with the cell IDs and a column with either 'Singlet' or 'Doublet'
run_doubletfinder_custom <- function(seu_sample_subset, multiplet_rate = NULL){
  # for debug
  #seu_sample_subset <- samp_split[[1]]
  # Print sample number
  print(paste0("Sample ", unique(seu_sample_subset[['orig.ident']]), '...........')) 
  
  # DoubletFinder needs the multiplet rate of your sample (i.e., the expected  proportion of doublets). If you don't know the multiplet rate for your experiment, 10X published a list of expected multiplet rates for different loaded and recovered cells. If a multiplet rate is not provided, this function will automatically determine how many doublets to expect for a given number of cells in a sample.
  
  if(is.null(multiplet_rate)){
    print('multiplet_rate not provided....... estimating multiplet rate from cells in dataset')
    
    # 10X multiplet rates table
    #https://rpubs.com/kenneditodd/doublet_finder_example
    multiplet_rates_10x <- data.frame('Multiplet_rate'= c(0.004, 0.008, 0.0160, 0.023, 0.031, 0.039, 0.046, 0.054, 0.061, 0.069, 0.076),
                                      'Loaded_cells' = c(800, 1600, 3200, 4800, 6400, 8000, 9600, 11200, 12800, 14400, 16000),
                                      'Recovered_cells' = c(500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000))
    
    print(multiplet_rates_10x)
    
    multiplet_rate <- multiplet_rates_10x %>% dplyr::filter(Recovered_cells < nrow(seu_sample_subset@meta.data)) %>% 
      dplyr::slice(which.max(Recovered_cells)) %>% # select the min threshold depending on your number of samples
      dplyr::select(Multiplet_rate) %>% as.numeric(as.character()) # get the expected multiplet rate for that number of recovered cells
    
    print(paste('Setting multiplet rate to', multiplet_rate))
  }
  
  # Pre-process seurat object with standard seurat workflow --- 
  sample <- NormalizeData(seu_sample_subset)
  sample <- FindVariableFeatures(sample)
  sample <- ScaleData(sample)
  sample <- RunPCA(sample, nfeatures.print = 10)
  
  # Find significant PCs
  stdv <- sample[["pca"]]@stdev
  percent_stdv <- (stdv/sum(stdv)) * 100
  cumulative <- cumsum(percent_stdv)
  co1 <- which(cumulative > 90 & percent_stdv < 5)[1] 
  co2 <- sort(which((percent_stdv[1:length(percent_stdv) - 1] - 
                       percent_stdv[2:length(percent_stdv)]) > 0.1), 
              decreasing = T)[1] + 1
  min_pc <- min(co1, co2)
  
  # Finish pre-processing with min_pc
  sample <- RunUMAP(sample, dims = 1:min_pc)
  sample <- FindNeighbors(object = sample, dims = 1:min_pc)              
  sample <- FindClusters(object = sample, resolution = 0.1)
  
  # pK identification (no ground-truth) 
  #introduces artificial doublets in varying props, merges with real data set and 
  # preprocesses the data + calculates the prop of artficial neighrest neighbours, 
  # provides a list of the proportion of artificial nearest neighbours for varying
  # combinations of the pN and pK
  sweep_list <- paramSweep(sample, PCs = 1:min_pc, sct = FALSE)   
  sweep_stats <- summarizeSweep(sweep_list)
  bcmvn <- find.pK(sweep_stats) # computes a metric to find the optimal pK value (max mean variance normalised by modality coefficient)
  # Optimal pK is the max of the bimodality coefficient (BCmvn) distribution
  optimal.pk <- bcmvn %>% 
    dplyr::filter(BCmetric == max(BCmetric)) %>%
    dplyr::select(pK)
  optimal.pk <- as.numeric(as.character(optimal.pk[[1]]))
  
  ## Homotypic doublet proportion estimate
  annotations <- sample@meta.data$seurat_clusters # use the clusters as the user-defined cell types
  homotypic.prop <- modelHomotypic(annotations) # get proportions of homotypic doublets
  
  nExp.poi <- round(multiplet_rate * nrow(sample@meta.data)) # multiply by number of cells to get the number of expected multiplets
  nExp.poi.adj <- round(nExp.poi * (1 - homotypic.prop)) # expected number of doublets
  
  # run DoubletFinder
  sample <- doubletFinder(seu = sample, 
                          PCs = 1:min_pc, 
                          pK = optimal.pk, # the neighborhood size used to compute the number of artificial nearest neighbours
                          nExp = nExp.poi.adj) # number of expected real doublets
  # change name of metadata column with Singlet/Doublet information
  colnames(sample@meta.data)[grepl('DF.classifications.*', colnames(sample@meta.data))] <- "doublet_finder"
  
  # Subset and save
  # head(sample@meta.data['doublet_finder'])
  # singlets <- subset(sample, doublet_finder == "Singlet") # extract only singlets
  # singlets$ident
  double_finder_res <- sample@meta.data['doublet_finder'] # get the metadata column with singlet, doublet info
  double_finder_res <- rownames_to_column(double_finder_res, "row_names") # add the cell IDs as new column to be able to merge correctly
  return(double_finder_res)
}

Running the function

ln_split <- lapply(ln_split, run_doubletfinder_custom)

thym_split <- lapply(thym_split, run_doubletfinder_custom)

Add DoubletFinder results to Seurat object

# Lymph node
filtered_ln <- NormalizeData(filtered_ln, normalization.method = "LogNormalize", scale.factor = 10000)
filtered_ln <- FindVariableFeatures(filtered_ln)
filtered_ln <- ScaleData(filtered_ln)
filtered_ln <- RunPCA(filtered_ln)
filtered_ln <- RunUMAP(filtered_ln, dims = 1:10)

sglt_dblt_metadata_LN <- data.frame(bind_rows(ln_split)) # merge to a single dataframe
rownames(sglt_dblt_metadata_LN) <- sglt_dblt_metadata_LN$row_names # assign cell IDs to row names to ensure match
sglt_dblt_metadata_LN$row_names <- NULL
filtered_ln <- AddMetaData(filtered_ln, sglt_dblt_metadata_LN, col.name = "doublet_finder")

# save (optional)
## the DoubletFinder function takes a long time to run, so it may be helpful to save this object to avoid having to repeat these steps in the future
#saveRDS(filtered_ln, file="merged_filtered_LN_withDoubletFinderMetadata.RData")
# Thymus
filtered_thym <- NormalizeData(filtered_thym, normalization.method = "LogNormalize", scale.factor = 10000)
filtered_thym <- FindVariableFeatures(filtered_thym)
filtered_thym <- ScaleData(filtered_thym)
filtered_thym <- RunPCA(filtered_thym)
filtered_thym <- RunUMAP(filtered_thym, dims = 1:10)

sglt_dblt_metadata_THYM <- data.frame(bind_rows(thym_split)) # merge to a single dataframe
rownames(sglt_dblt_metadata_THYM) <- sglt_dblt_metadata_THYM$row_names # assign cell IDs to row names to ensure match
sglt_dblt_metadata_THYM$row_names <- NULL
filtered_thym <- AddMetaData(filtered_thym, sglt_dblt_metadata_THYM, col.name = "doublet_finder")

# save (optional)
#saveRDS(filtered_ln, file="merged_filtered_THYM_withDoubletFinderMetadata.RData")

Summary of doublet detection results

# Check how doublets singlets differ in QC measures per sample.
VlnPlot(filtered_ln, group.by = 'orig.ident', split.by = "doublet_finder",
        features = c("nFeature_RNA", "nCount_RNA"), 
        ncol = 3, pt.size = 0) + theme(legend.position = 'right')

VlnPlot(filtered_thym,
        group.by = "orig.ident",
        split.by = "doublet_finder",
        features = c("nFeature_RNA", "nCount_RNA", "percent_mt"),
        ncol = 3,
        pt.size = 0 +
          theme(legend.position = 'right', plot.title = "Thymus")
          )

DimPlot(filtered_ln,
        group.by = "doublet_finder",
        reduction = "umap",
        pt.size=0.5) + ggtitle("DoubletFinder: Canine Lymph Node")

DimPlot(filtered_thym,
        group.by = "doublet_finder",
        reduction = "umap",
        pt.size=0.5)

Cell-Cycle Scoring

Ensure that predicted doublets are not cycling cells.

Assign Cell-Cycle Scores

Each cell will be assigned a score based on its expression of G2/M and S phase markers. The CellCycleStoring() function stores S and G2/M scores in object metadata, along with the predicted classification of each cell in either G2/M, S, or G1 phase.

# segregate G2/M phase and S phase markers from Seurat's built-in list of cell cycle markers
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes

# assign each cell a score based on expression of G2/M and S phase markers
filtered_ln <- CellCycleScoring(filtered_ln, 
                                s.features = s.genes, 
                                g2m.features = g2m.genes, 
                                set.ident = TRUE)

filtered_thym <- CellCycleScoring(filtered_thym, 
                                s.features = s.genes, 
                                g2m.features = g2m.genes, 
                                set.ident = TRUE)

Compare location of cycling cells to predicted doublets on UMAP

filtered_ln <- RunPCA(filtered_ln, features = c(s.genes, g2m.genes))
DimPlot(filtered_ln,
        group.by = "Phase",
        reduction = "umap",
        pt.size=0.5) + 
  ggtitle("Cell Cycle Scoring: Canine Lymph Node")

filtered_thym <- RunPCA(filtered_thym, features = c(s.genes, g2m.genes))
DimPlot(filtered_thym,
        group.by = "Phase",
        reduction = "umap",
        pt.size=0.5) + 
  ggtitle("Cell Cycle Scoring: Canine Thymus")

Remove doublets and save

If the doublets were not limited to cycling cells and you wish to remove doublets from your Seurat object before downstream analysis, set this code chunk to EVAL=TRUE.

filtered_ln <- subset(filtered_ln, doublet_finder == "Singlet")
filtered_thym <- subset(filtered_thym, doublet_finder == "Singlet")

saveRDS(filtered_ln, file="merged_filtered_singlet_LN.RData")
saveRDS(filtered_ln, file="merged_filtered_singlet_THYM.RData")

Next step: Sample integration.

Citations

sessionInfo()
## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ROCR_1.0-11         KernSmooth_2.23-22  fields_16.3        
##  [4] viridisLite_0.4.2   spam_2.11-0         patchwork_1.3.0    
##  [7] DoubletFinder_2.0.4 lubridate_1.9.4     forcats_1.0.0      
## [10] stringr_1.5.1       dplyr_1.1.4         purrr_1.0.2        
## [13] readr_2.1.5         tidyr_1.3.1         tibble_3.2.1       
## [16] ggplot2_3.5.1       tidyverse_2.0.0     Seurat_5.1.0       
## [19] SeuratObject_5.0.2  sp_2.1-4           
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3     rstudioapi_0.17.1      jsonlite_1.8.9        
##   [4] magrittr_2.0.3         ggbeeswarm_0.7.2       spatstat.utils_3.1-1  
##   [7] farver_2.1.2           rmarkdown_2.29         vctrs_0.6.5           
##  [10] spatstat.explore_3.3-3 htmltools_0.5.8.1      sass_0.4.9            
##  [13] sctransform_0.4.1      parallelly_1.40.1      bslib_0.8.0           
##  [16] htmlwidgets_1.6.4      ica_1.0-3              plyr_1.8.9            
##  [19] plotly_4.10.4          zoo_1.8-12             cachem_1.1.0          
##  [22] igraph_2.1.2           mime_0.12              lifecycle_1.0.4       
##  [25] pkgconfig_2.0.3        Matrix_1.7-0           R6_2.5.1              
##  [28] fastmap_1.2.0          fitdistrplus_1.2-2     future_1.34.0         
##  [31] shiny_1.10.0           digest_0.6.35          colorspace_2.1-1      
##  [34] tensor_1.5             RSpectra_0.16-2        irlba_2.3.5.1         
##  [37] labeling_0.4.3         progressr_0.15.1       spatstat.sparse_3.1-0 
##  [40] timechange_0.3.0       httr_1.4.7             polyclip_1.10-7       
##  [43] abind_1.4-8            compiler_4.4.0         withr_3.0.2           
##  [46] fastDummies_1.7.4      maps_3.4.2.1           MASS_7.3-60.2         
##  [49] tools_4.4.0            vipor_0.4.7            lmtest_0.9-40         
##  [52] beeswarm_0.4.0         httpuv_1.6.15          future.apply_1.11.3   
##  [55] goftest_1.2-3          glue_1.7.0             nlme_3.1-164          
##  [58] promises_1.3.2         grid_4.4.0             Rtsne_0.17            
##  [61] cluster_2.1.6          reshape2_1.4.4         generics_0.1.3        
##  [64] gtable_0.3.6           spatstat.data_3.1-4    tzdb_0.4.0            
##  [67] data.table_1.16.4      hms_1.1.3              spatstat.geom_3.3-4   
##  [70] RcppAnnoy_0.0.22       ggrepel_0.9.6          RANN_2.6.2            
##  [73] pillar_1.10.1          RcppHNSW_0.6.0         later_1.3.2           
##  [76] splines_4.4.0          lattice_0.22-6         survival_3.5-8        
##  [79] deldir_2.0-4           tidyselect_1.2.1       miniUI_0.1.1.1        
##  [82] pbapply_1.7-2          knitr_1.49             gridExtra_2.3         
##  [85] scattermore_1.2        xfun_0.49              matrixStats_1.4.1     
##  [88] stringi_1.8.4          lazyeval_0.2.2         yaml_2.3.10           
##  [91] evaluate_1.0.3         codetools_0.2-20       cli_3.6.2             
##  [94] uwot_0.2.2             xtable_1.8-4           reticulate_1.40.0     
##  [97] munsell_0.5.1          jquerylib_0.1.4        Rcpp_1.0.13           
## [100] globals_0.16.3         spatstat.random_3.3-2  png_0.1-8             
## [103] ggrastr_1.0.2          spatstat.univar_3.1-1  dotCall64_1.2         
## [106] listenv_0.9.1          scales_1.3.0           ggridges_0.5.6        
## [109] leiden_0.4.3.1         rlang_1.1.3            cowplot_1.1.3
citation()
## To cite R in publications use:
## 
##   R Core Team (2024). _R: A Language and Environment for Statistical
##   Computing_. R Foundation for Statistical Computing, Vienna, Austria.
##   <https://www.R-project.org/>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {R: A Language and Environment for Statistical Computing},
##     author = {{R Core Team}},
##     organization = {R Foundation for Statistical Computing},
##     address = {Vienna, Austria},
##     year = {2024},
##     url = {https://www.R-project.org/},
##   }
## 
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.