Overview

Setup

# load libraries
library(parallel) # detectCores()
library(DoubletFinder) # paramSweep_v3()
library(Seurat) # SplitObject()

# set seed
set.seed(8)

# work in parallel
options(mc.cores = detectCores() - 1)

# load filtered seurat object
mouse.filtered <- readRDS("/research/labs/neurology/fryer/m214960/Da_Mesquita/rObjects/mouse_cellbender_before_doublet_finder.rds")
mouse.filtered
## An object of class Seurat 
## 21097 features across 37408 samples within 1 assay 
## Active assay: RNA (21097 features, 0 variable features)

Run DoubletFinder

# split aggregated data by sample
table(mouse.filtered$sample)
## 
##  E3.2M.M  E3.2M.F  E4.2M.M  E4.2M.F E3.14M.M E3.14M.F E4.14M.M E4.14M.F 
##     4503     4715     3512     4209     4961     4023     4516     6969
mouse.split <- SplitObject(mouse.filtered, split.by = "sample") 

# loop through samples to find doublets
for (i in 1:length(mouse.split)) {
  # print the sample we are on
  print(paste0("Sample ",i))
  
  # Pre-process seurat object with standard seurat workflow
  mouse.sample <- NormalizeData(mouse.split[[i]])
  mouse.sample <- FindVariableFeatures(mouse.sample)
  mouse.sample <- ScaleData(mouse.sample)
  mouse.sample <- RunPCA(mouse.sample, nfeatures.print = 10)
  
  # Find significant PCs
  stdv <- mouse.sample[["pca"]]@stdev
  sum.stdv <- sum(mouse.sample[["pca"]]@stdev)
  percent.stdv <- (stdv / sum.stdv) * 100
  cumulative <- cumsum(percent.stdv)
  co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
  co2 <- sort(which((percent.stdv[1:length(percent.stdv) - 1] - 
                       percent.stdv[2:length(percent.stdv)]) > 0.1), 
              decreasing = T)[1] + 1
  min.pc <- min(co1, co2)
  min.pc
  
  # finish pre-processing
  mouse.sample <- RunUMAP(mouse.sample, dims = 1:min.pc)
  mouse.sample <- FindNeighbors(object = mouse.sample, dims = 1:min.pc)              
  mouse.sample <- FindClusters(object = mouse.sample, resolution = 0.1)
  
  # pK identification (no ground-truth)
  sweep.list <- paramSweep_v3(mouse.sample, PCs = 1:min.pc, num.cores = detectCores() - 1)
  sweep.stats <- summarizeSweep(sweep.list)
  bcmvn <- find.pK(sweep.stats)
  
  # Optimal pK is the max of the bomodality coefficent (BCmvn) distribution
  bcmvn.max <- bcmvn[which.max(bcmvn$BCmetric),]
  optimal.pk <- bcmvn.max$pK
  optimal.pk <- as.numeric(levels(optimal.pk))[optimal.pk]
  
  ## Homotypic doublet proportion estimate
  annotations <- mouse.sample@meta.data$seurat_clusters
  homotypic.prop <- modelHomotypic(annotations) 
  nExp.poi <- round(optimal.pk * nrow(mouse.sample@meta.data)) ## Assuming 7.5% doublet formation rate - tailor for your dataset
  nExp.poi.adj <- round(nExp.poi * (1 - homotypic.prop))
  
  # run DoubletFinder
  mouse.sample <- doubletFinder_v3(seu = mouse.sample, 
                                   PCs = 1:min.pc, 
                                   pK = optimal.pk,
                                   nExp = nExp.poi.adj)
  metadata <- mouse.sample@meta.data
  colnames(metadata)[15] <- "doublet_finder"
  mouse.sample@meta.data <- metadata 
  
  # subset and save
  mouse.singlets <- subset(mouse.sample, doublet_finder == "Singlet")
  mouse.split[[i]] <- mouse.singlets
  remove(mouse.singlets)
}
## [1] "Sample 1"
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 4715
## Number of edges: 142535
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9724
## Number of communities: 11
## Elapsed time: 0 seconds

## NULL
## [1] "Creating 1572 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Sample 2"
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 4503
## Number of edges: 139353
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9655
## Number of communities: 9
## Elapsed time: 0 seconds

## NULL
## [1] "Creating 1501 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Sample 3"
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 4023
## Number of edges: 122114
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9661
## Number of communities: 9
## Elapsed time: 0 seconds

## NULL
## [1] "Creating 1341 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Sample 4"
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 4961
## Number of edges: 148073
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9715
## Number of communities: 8
## Elapsed time: 0 seconds

## NULL
## [1] "Creating 1654 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Sample 5"
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 4209
## Number of edges: 123279
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9711
## Number of communities: 10
## Elapsed time: 0 seconds

## NULL
## [1] "Creating 1403 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Sample 6"
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 3512
## Number of edges: 103211
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9708
## Number of communities: 8
## Elapsed time: 0 seconds

## NULL
## [1] "Creating 1171 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Sample 7"
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6969
## Number of edges: 217179
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9720
## Number of communities: 10
## Elapsed time: 0 seconds

## NULL
## [1] "Creating 2323 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] "Sample 8"
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 4516
## Number of edges: 131695
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9688
## Number of communities: 10
## Elapsed time: 0 seconds

## NULL
## [1] "Creating 1505 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
# converge mouse.split
mouse.singlets <- merge(x = mouse.split[[1]],
                       y = c(mouse.split[[2]], mouse.split[[3]], mouse.split[[4]],
                             mouse.split[[5]], mouse.split[[6]], mouse.split[[7]],
                             mouse.split[[8]]),
                       project = "Mouse scRNAseq")
mouse.singlets
## An object of class Seurat 
## 21097 features across 35214 samples within 1 assay 
## Active assay: RNA (21097 features, 0 variable features)
# compare before and after doublet finder
mouse.filtered
## An object of class Seurat 
## 21097 features across 37408 samples within 1 assay 
## Active assay: RNA (21097 features, 0 variable features)
mouse.singlets
## An object of class Seurat 
## 21097 features across 35214 samples within 1 assay 
## Active assay: RNA (21097 features, 0 variable features)