1 Overview

Preprocessing and QC for a single-cell multiome dataset. Single cell data is obtained from first-trimester placental cells.

2 Loading libararies and functions

library(here)
library(readr)
library(purrr)
library(ggplot2)
library(plotly)
library(viridis)
library(Seurat)
library(Signac)
library(scDblFinder)
library(EnsDb.Hsapiens.v86)
library(BSgenome.Hsapiens.UCSC.hg38)
library(ggpointdensity)
library(ggExtra)
library(biovizBase)
library(tidyr)
library(dplyr)
library(stringr)
library(reshape2)
library(ggridges)
# library(magick)
# library(CHOIR)
# library(ggalluvial)
# library(aricode)

set.seed(1234)

3 Set up

3.1 Set sample

# sample <- params$sample
sample <- "s21b3"
sample

## [1] "s21b3"

3.2 Knitr options

3.3 Paths

library(here)

global_out_dir <- readLines(here("code/OUT_DIR.txt"))
data_dir <- readLines(here("code/DATA_DIR.txt"))
base_dir <- "/oak/stanford/groups/akundaje/projects/placenta_v2/output/singlecell/00/cellranger-arc"


out_dir <- file.path(global_out_dir, "singlecell/01/preprocessing", sample)

# load sample sheet
sample_meta <- read_tsv(file.path(data_dir, "scMultiome/metadata/20250526-multiome_metadata.tsv"))

## Rows: 10 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Source, ID, Batch, Liquid or Solid, Trisomy or Control, Embryo sex,...
## dbl (3): MIP_sample, Sequencing date, Gestational weeks
## lgl (1): Termination or CVS
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# construct path to sample's cellranger output
path_cellranger <- sample_meta %>% 
  dplyr::filter(ID == sample) %>% 
  pull(Path_cellranger)

# load config params
config_tsv <- data.table::fread(here("code", "singlecell", "01-preprocessing", "01-preprocessing-config.tsv"), data.table = FALSE)

# coerce to a list of param:value pairs, and make sure numeric parameters are numeric type
config <- as.list(tibble::deframe(config_tsv[, c(1, 2)]))
config$genes <- unlist(stringr::str_split(config$genes, pattern = ","))
config <- lapply(config, type.convert, as.is = TRUE)

4 Initialize Seurat object

Load the output from cellranger, which contains the gene and peak counts:

# load genome annotations for hg38
# here, annotations loaded using code adapted from 
# Kundaje Lab (2023). "PsychENCODE Multiome Analysis Pipeline."
# Github repository: https://github.com/kundajelab/psychencode/blob/main/exps/20231009_multiome_analysis_v2/02-preprocessing/qc.template.Rmd

suppressWarnings({
  
  annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
  
  seqlevelsStyle(annotation) <- "UCSC"

  })

# load cellranger-produced metadata with per-cell stats
cr_metadata <- read.csv(file.path(base_dir, sample, "per_barcode_metrics.csv"),
                        header = TRUE,
                        row.names = 1)

# load filtered matrices - contains both RNA and ATAC matrices
counts <- Read10X_h5(file.path(base_dir, sample, "filtered_feature_bc_matrix.h5"))

## Genome matrix has multiple modalities, returning a list of matrices for this genome

# list the data modalities for which counts are provided
names(counts)

## [1] "Gene Expression" "Peaks"

Initialize the Seurat object, with the scRNAseq data in the “RNA” assay and the scATACseq data in the “ATAC” assay. This applies one filter: only genes & peaks which are detected in at least config$min_cells are retained. No further filtering on genes or peaks is performed. We also load in the TF analysis (which estimates a total signal for each TF in each cell) from Cellranger, in case we want to use it downstream.

# initialize Seurat seurat_prefilt object with RNA
seurat_prefilt <- CreateSeuratObject(
    counts = counts$`Gene Expression`,
    assay = "RNA",
    meta.data = cr_metadata,
    project = sample,
    min.cells = config$min_cells
)


# create the ATACseq assay
seurat_prefilt[["ATAC"]] <- CreateChromatinAssay(
    counts = counts$Peaks, 
    sep = c(":", "-"),
    genome = "hg38", # reference genome
    fragments = file.path(base_dir, sample, "atac_fragments.tsv.gz"),
    min.cells = config$min_cells
)

## Computing hash

# set gene annotations
Annotation(seurat_prefilt[["ATAC"]]) <- annotation # obtained from hg38 reference genome 

# load filtered tf matrix
tf_counts <- Read10X_h5(file.path(base_dir, sample, "analysis/tf_analysis/filtered_tf_bc_matrix.h5"))

# create the TF assay
# filters out motifs detected in fewer than config$min_cells cells
assay_TF <- CreateAssayObject(
    counts = tf_counts,
    min.cells = config$min_cells
)

seurat_prefilt[['TF']] <- assay_TF

4.1 Initialize warnings

warnings <- list("LOW_N_CELLS"        = FALSE,
                 "HIGH_MITO"          = FALSE,
                 "HIGH_PROP_FILTERED" = FALSE,
                 "LOW_AVG_UMI"        = FALSE,
                 "CC_ASSIGNMENT"      = FALSE,
                 "SMALL_CLUSTERS"     = FALSE)

5 Compute QC metrics and filter cells

In this section, we compute several QC metrics for scRNAseq/scATACseq, filter cells based on the distributions of these metrics, and then assess the distributions of these metrics after filtering.

For some definitions related to the different metrics computed from Multiome data, see the 10X glossary at: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/glossary

In particular, note that for RNA, counts represent the number of UMIs, while for ATAC, counts represent the number of cut/transposition sites. That is, RNA reflects transcriptional activity while ATAC reflects regulatory potential.

Count Matrix: A matrix of counts representing the number of unique observations of each Feature within each Cell Barcode. Each feature (gene or peak) is a row in the matrix, while each barcode is a column. For a gene feature the count represents the number of UMIs observed while for a peak the count represents the number of transposition sites within the peak region.

5.1 scRNA metrics: mitochondrial and ribosomal content

For quality control, we assess the mitochondrial content and ribosomal content at the single-cell level, using the proportion of reads which map to mitochondrial genes, or ribosomal protein genes, respectively.

DefaultAssay(seurat_prefilt) <- "RNA"

# identify mitochondrial genes, which depends on the species, due to gene name differences
if (config$species == "h_sapiens") {
    mito_genes <- grep("^MT-", rownames(GetAssayData(object = seurat_prefilt)), value = TRUE)
} else if (config$species == "m_musculus") {
    mito_genes <- grep("^mt-", rownames(GetAssayData(object = seurat_prefilt)), value = TRUE)
}

# compute, for each cell, the proportion of reads in mitochondrial genes, and add to the metadata
percent_mito <- Matrix::colSums(GetAssayData(object = seurat_prefilt)[mito_genes, ]) /
    Matrix::colSums(GetAssayData(object = seurat_prefilt)) * 100
seurat_prefilt <- AddMetaData(seurat_prefilt, percent_mito, "percent.mito")

# identify ribosomal genes, which depends on the species, due to gene name differences
if (config$species == "h_sapiens") {
    ribo_genes <- grepl("^RPS|^RPL|^MRPS|^MRPL", rownames(GetAssayData(object = seurat_prefilt)))
} else if (config$species == "m_musculus") {
    ribo_genes <- grepl("^Rps|^Rpl|^Mrps|^Mrpl", rownames(GetAssayData(object = seurat_prefilt)))
}

# compute, for each cell, the proportion of reads in ribsomal genes, and add to the metadata
percent_ribo <- Matrix::colSums(GetAssayData(object = seurat_prefilt)[ribo_genes, ]) /
    Matrix::colSums(GetAssayData(object = seurat_prefilt)) * 100
seurat_prefilt <- AddMetaData(seurat_prefilt, percent_ribo, "percent.ribo")

5.2 scATAC metrics: nucleosome signal and TSS enrichment

Here we calculate Nucleosome signal and TSS enrichment for the cells in the sample, which are described below with the resulting plots.

DefaultAssay(seurat_prefilt) <- "ATAC"

# compute stats using Signac functions
seurat_prefilt <- NucleosomeSignal(object = seurat_prefilt)
seurat_prefilt <- TSSEnrichment(seurat_prefilt, fast = FALSE)

## Extracting TSS positions

## Finding + strand cut sites

## Finding - strand cut sites

## Computing mean insertion frequency in flanking regions

## Normalizing TSS score

Because fragments correspond to cutting at different integer multiples of nucleosomes, the distribution of fragment lengths should reveal a specific nucleosome banding pattern, with a fragment length periodicity of ~200bp. We separate cells based on the nucleosome signal, and it should be clear that one population has the typical periodic nucleosome banding pattern.

p1 <- VlnPlot(seurat_prefilt, "nucleosome_signal", pt.size = -1) + NoLegend()

sum(is.infinite(seurat_prefilt@meta.data$nucleosome_signal))

## [1] 0

# deal with inf values
seurat_prefilt@meta.data$nucleosome_signal[is.infinite(seurat_prefilt@meta.data$nucleosome_signal)] <- 0

# set threshold for nucleosome_signal as 1.5 
seurat_prefilt$nucleosome_group <- ifelse(seurat_prefilt$nucleosome_signal > 1.5, 'NS > 1.5', 'NS < 1.5')

p2 <- FragmentHistogram(object = seurat_prefilt, group.by = 'nucleosome_group', region = 'chr1-1-10000000') +
    scale_fill_manual(values = c("forestgreen", "gray80"))


# cowplot::plot_grid(p1, p2, rel_widths = c(0.3, 0.7), align = "hv", axis = "tb")
p1

p2

We also quantify chromatin accessibility at transcription start sites (TSS). Successful scATACseq experiments should have a strong enrichment of accessibility near TSS, and we can again use this metric to stratify cells:

p1 <- VlnPlot(seurat_prefilt, "TSS.enrichment", pt.size = -1) + NoLegend()

seurat_prefilt$high.tss <- ifelse(seurat_prefilt$TSS.enrichment > 2, 'High', 'Low')
p2 <- TSSPlot(seurat_prefilt, group.by = 'high.tss') + NoLegend() +
    scale_color_manual(values = c("forestgreen", "gray80"))

# cowplot::plot_grid(p1, p2, rel_widths = c(0.3, 0.7), align = "hv", axis = "tb")
p1

p2

5.3 Generate thresholds

The thresholds used are a combination of hard cutoffs and cutoffs computed based on the distribution of each metric within the sample.

thresholds <- data.frame(
    # the minimum number of features will be the greater of:
    # 400, or 2 standard deviations below the mean
    min_features = max(400, round(mean(seurat_prefilt@meta.data$nFeature_RNA) -
                                      2*sd(seurat_prefilt@meta.data$nFeature_RNA))),
    max_features = round(mean(seurat_prefilt@meta.data$nFeature_RNA) +
                             2*sd(seurat_prefilt@meta.data$nFeature_RNA)),
    
    # the max mitochondrial content will be the maximum of:
    # 2%, or 2 standard deviations above the mean
    # the parameter config$max_mito allows to set a hard upper threshold,
    # which takes precedence
    max_mito     = ifelse(!is.na(config$max_mito),
                          config$max_mito,
                          
                          max(2, round(mean(seurat_prefilt@meta.data$percent.mito) +
                                           2*sd(seurat_prefilt@meta.data$percent.mito))
                          )
    ),
    # set a max of 0 in case the value 2 standard deviations below the mean
    # is negative
    min_umi      = max(0, round(mean(seurat_prefilt@meta.data$nCount_RNA) -
                                    2*sd(seurat_prefilt@meta.data$nCount_RNA))),
    
    
    max_umi      = round(mean(seurat_prefilt@meta.data$nCount_RNA) +
                             2*sd(seurat_prefilt@meta.data$nCount_RNA)),
    
    
    min_peak_region_fragments  = max(config$min_peak_region_fragments,
                                     round(mean(seurat_prefilt@meta.data$atac_peak_region_fragments) - 2*sd(seurat_prefilt@meta.data$atac_peak_region_fragments))),
    
    max_peak_region_fragments = round(mean(seurat_prefilt@meta.data$atac_peak_region_fragments) + 2*sd(seurat_prefilt@meta.data$atac_peak_region_fragments)),
    
    
    max_nucleosome_signal     = ifelse(!is.null(config$max_nucleosome_signal), config$max_nucleosome_signal,
                                       round(mean(seurat_prefilt@meta.data$nucleosome_signal) + 2*sd(seurat_prefilt@meta.data$nucleosome_signal))),
    
    max_TSS.enrichment        = round(mean(seurat_prefilt@meta.data$TSS.enrichment) + 2*sd(seurat_prefilt@meta.data$TSS.enrichment)),
    
    min_TSS.enrichment        = round(mean(seurat_prefilt@meta.data$TSS.enrichment) - 2*sd(seurat_prefilt@meta.data$TSS.enrichment))
    
)

thresholds

5.4 Scatterplots

5.5 QC metrics before filtering

vln_custom <- function(feature, y_int = NULL, pt_size = -1, filt = "pre") {
    
    object <- switch(filt, "pre" = seurat_prefilt, "post" = seurat)
    
    p1 <- VlnPlot(object = object, features = feature, pt.size = pt_size, cols = "gray80") +
        NoLegend() + ylab(NULL) + xlab(NULL) +
        theme(title = element_text(size = 8, face = "plain"),
              axis.text.x = element_blank())
    
    if (!is.null(y_int)) p1 + geom_hline(yintercept = y_int, color = "red") 
    else p1
    
}


cowplot::plot_grid(
    vln_custom("nFeature_RNA", c(thresholds$min_features, thresholds$max_features)),
    vln_custom("nCount_RNA", c(thresholds$min_umi, thresholds$max_umi)),
    vln_custom("percent.mito", c( thresholds$max_mito)),
    vln_custom("nCount_ATAC"),
    vln_custom("nFeature_ATAC"),
    vln_custom("atac_peak_region_fragments", c(thresholds$min_peak_region_fragments, thresholds$max_peak_region_fragments)),
    vln_custom("TSS.enrichment", c(thresholds$min_TSS.enrichment, thresholds$max_TSS.enrichment)),
    vln_custom("nucleosome_signal", c( thresholds$max_nucleosome_signal)),
    ncol = 8,
    align = "h", axis = "tb"
)

cowplot::plot_grid(
    vln_custom("nFeature_RNA", c(thresholds$min_features, thresholds$max_features), pt_size = 0.1),
    vln_custom("nCount_RNA", c(thresholds$min_umi, thresholds$max_umi), pt_size = 0.1),
    vln_custom("percent.mito", c(thresholds$min_mito, thresholds$max_mito), pt_size = 0.1),
    vln_custom("nCount_ATAC", pt_size = 0.1),
    vln_custom("nFeature_ATAC", pt_size = 0.1),
    vln_custom("atac_peak_region_fragments", c(thresholds$min_peak_region_fragments, thresholds$max_peak_region_fragments), pt_size = 0.1),
    vln_custom("TSS.enrichment", c(thresholds$min_TSS.enrichment, thresholds$max_TSS.enrichment), pt_size = 0.1),
    vln_custom("nucleosome_signal", c( thresholds$max_nucleosome_signal), pt_size = 0.1),
    ncol = 8,
    align = "h", axis = "tb"
)

5.6 Filtering and QC metrics after filtering

# filter cells
seurat <- subset(
    x = seurat_prefilt,
    subset =
        nFeature_RNA > thresholds$min_features &
        nFeature_RNA < thresholds$max_features &
        nCount_RNA > thresholds$min_umi &
        nCount_RNA < thresholds$max_umi &
        percent.mito < thresholds$max_mito &
        atac_peak_region_fragments > thresholds$min_peak_region_fragments &
        atac_peak_region_fragments < thresholds$max_peak_region_fragments &
        nucleosome_signal     < thresholds$max_nucleosome_signal &
        TSS.enrichment        > thresholds$min_TSS.enrichment &
        TSS.enrichment        < thresholds$max_TSS.enrichment
)

cowplot::plot_grid(
    vln_custom("nFeature_RNA", filt = "post"),
    vln_custom("nCount_RNA", filt = "post"),
    vln_custom("percent.mito", filt = "post"),
    vln_custom("nCount_ATAC", filt = "post"),
    vln_custom("nFeature_ATAC", filt = "post"),
    vln_custom("atac_peak_region_fragments", filt = "post"),
    vln_custom("TSS.enrichment", filt = "post"),
    vln_custom("nucleosome_signal", filt = "post"),
    ncol = 8,
    align = "h", axis = "tb"
)

cowplot::plot_grid(
    vln_custom("nFeature_RNA", filt = "post", pt_size = 0.1),
    vln_custom("nCount_RNA", filt = "post", pt_size = 0.1),
    vln_custom("percent.mito", filt = "post", pt_size = 0.1),
    vln_custom("nCount_ATAC", filt = "post", pt_size = 0.1),
    vln_custom("nFeature_ATAC", filt = "post", pt_size = 0.1),
    vln_custom("atac_peak_region_fragments", filt = "post", pt_size = 0.1),
    vln_custom("TSS.enrichment", filt = "post", pt_size = 0.1),
    vln_custom("nucleosome_signal", filt = "post", pt_size = 0.1),
    ncol = 8,
    align = "h", axis = "tb"
)

5.7 Output summary stats and filtering metrics

For each of the metrics we filtered on, we save the min, max and mean before and after filtering, as well as the lower and upper thresholds used. We also record the number of cells that remain after filtering.

filtering_criteria <- c(
    "nFeature_RNA", "nCount_RNA",  "percent.mito",
    "atac_peak_region_fragments", "nucleosome_signal", "TSS.enrichment")

# compute summary stats for each metric
filtering_metrics <- sapply(filtering_criteria, function(criterion) {
    
    min_pre   <- round(min(seurat_prefilt@meta.data  %>% pull(criterion)), 2)
    mean_pre  <- mean(seurat_prefilt@meta.data %>% pull(criterion))
    max_pre   <- max(seurat_prefilt@meta.data  %>% pull(criterion))
    sd_pre    <- sd(seurat_prefilt@meta.data   %>% pull(criterion))
    
    min_post  <- min(seurat@meta.data  %>% pull(criterion))
    mean_post <- mean(seurat@meta.data %>% pull(criterion))
    max_post  <- max(seurat@meta.data  %>% pull(criterion))
    sd_post   <- sd(seurat@meta.data   %>% pull(criterion))
    
    return(c("min.preQC"   = min_pre,
             "mean.preQC"  = mean_pre,
             "max.preQC"   = max_pre,
             "sd.preQC"    = sd_pre,
             "min.postQC"  = min_post,
             "mean.postQC" = mean_post,
             "max.postQC"  = max_post,
             "sd.postQC"   = sd_post))
    
})

# round to 2 decimal places
filtering_metrics <- apply(filtering_metrics, 2, round, 3)

# transform into a dataframe
filtering_metrics <- filtering_metrics %>%
    t() %>%
    as.data.frame() %>%
    tibble::rownames_to_column(var = "criterion") %>%
    dplyr::select(criterion, min.preQC, min.postQC, max.preQC, max.postQC, mean.preQC, mean.postQC, sd.preQC, sd.postQC)

# add thresholds
filtering_metrics$min.threshold <- c(
    thresholds$min_features,
    thresholds$min_umi,
    NA,
    thresholds$min_peak_region_fragments,
    NA,
    thresholds$min_TSS.enrichment)

filtering_metrics$max.threshold <- c(
    thresholds$max_features,
    thresholds$max_umi,
    thresholds$max_mito,
    thresholds$max_peak_region_fragments,
    thresholds$max_nucleosome_signal,
    thresholds$max_TSS.enrichment)

filtering_metrics

# compute number of cells before and after filtering
N_cells_metrics <- data.frame(
    "N_cells_before" = dim(seurat_prefilt@meta.data)[1],
    "N_cells_after"  = dim(seurat@meta.data)[1]) %>%
    mutate(Prop_kept = round(N_cells_after / N_cells_before, 2))

N_cells_metrics

This report will register warnings if:

there are few cells after filtering (<1000)
more than 40% of cells were filtered out
the max mitochondrial content after filtering is > 5% (indicating a higher threshold needed to be used)
the average number of UMIs after filtering is < 2000

The warnings will be output at the end of the report, and saved as a TSV if any warning flags are TRUE.

if (N_cells_metrics$N_cells_after < 1000) warnings$LOW_N_CELLS <- TRUE
if (N_cells_metrics$Prop_kept < 0.6) warnings$HIGH_PROP_FILTERED <- TRUE
if (filtering_metrics[filtering_metrics$criterion ==
                      "nCount_RNA", ]$mean.postQC < 2000) warnings$LOW_AVG_UMI <- TRUE
if (filtering_metrics[filtering_metrics$criterion ==
                      "percent.mito", ]$max.postQC > 5) warnings$HIGH_MITO <- TRUE

6 Normalization and finding highly variable genes

6.1 scRNA

Normalization of gene expression data is performed using Log Normalization. The sequence of NormalizeData, FindVariableFeatures, and ScaleData scale counts to 10,000 UMIs per cell, log2-transform counts, and then regress out unwanted sources of variation. The log-normalized values are set as the default assay in the Seurat object.

DefaultAssay(seurat) <- "RNA"

# normalization: scale counts to 10000 UMIs per cell, and log2-transform the counts
# set the default assay to RNA to run the log-normalization & scaling
seurat <- seurat %>% 
  NormalizeData(normalization.method = "LogNormalize",
                scale.factor         = 10000) %>% 
  # identify variable genes
  FindVariableFeatures(mean.function       = ExpMean,
                       dispersion.function = LogVMR) %>%
  # regress out variables which are sources of unwanted variation, and z-score data
  ScaleData(vars.to.regress = unlist(str_split(config$var_regress, ",")))

6.2 scATAC

scATACseq processing is done using latent semantic indexing (LSI), and uses a different method for normalization due to increased data sparsity compared to scRNAseq. We perform normalization with the TF-IDF method, which takes into account both sequencing depth and frequency/rarity of peaks. Rather than feature selection based on variability (difficult due to data sparsity), we keep the top n “most informative” features.

DefaultAssay(seurat) <- "ATAC"

seurat <- RunTFIDF(seurat)

## Performing TF-IDF normalization

seurat <- FindTopFeatures(seurat, min.cutoff = 'q5', verbose = FALSE)
seurat[["ATAC"]]

## ChromatinAssay data with 124074 features for 6374 cells
## Variable features: 117937 
## Genome: hg38 
## Annotation present: TRUE 
## Motifs present: FALSE 
## Fragment files: 1

7 Dimensionality reduction

7.1 scRNA

For scRNA data, we perform PCA after scaling.

DefaultAssay(seurat) <- "RNA"

seurat <- RunPCA(seurat,
                 pc.genes        = VariableFeatures(.),
                 npcs            = 40,
                 ndims.print     = 1:5,
                 nfeatures.print = 5)

## PC_ 1 
## Positive:  PHLDB2, ADAMTS6, CHODL, PLCL2, ADGRG6 
## Negative:  MEG3, SOX5, LAMA2, PDZRN3, SLIT2 
## PC_ 2 
## Positive:  ARL15, UTRN, KANK1, CHN2, LGR5 
## Negative:  CSGALNACT1, CHODL, PAPPA2, GTF2I-AS1, KMO 
## PC_ 3 
## Positive:  FGF13, SLC9A9, F13A1, FGL1, SCN9A 
## Negative:  TENM3, KANK1, SEMA6D, PBX1, ADAMTS19 
## PC_ 4 
## Positive:  FCGR2C, DACH1, MEOX2, MECOM, ZNF521 
## Negative:  SLIT2, CDH11, CNTN5, BMP5, PDZRN4 
## PC_ 5 
## Positive:  CNTN5, ENSG00000232855, LINC01697, LINC01695, ITGBL1 
## Negative:  AGTR1, AFF2, CCDC102B, GUCY1A2, DOK6

# Elbow Plot and Jackstraw Plot are used to calibrate the number of PCs for the PCA. 
# ElbowPlot(seurat, ndims = 40)
# seurat <- JackStraw(seurat, num.replicate = 100, dims = 50)
# seurat <- ScoreJackStraw(seurat, dims = 1:50)
# JackStrawPlot(seurat, dims = 1:50)

7.2 scATAC

Analogous to PCA, we perform a linear dimensionality reduction with SVD.

DefaultAssay(seurat) <- "ATAC"
seurat <- RunSVD(object = seurat)

## Running SVD

## Scaling cell embeddings

Often, the first component is highly correlated with technical features, and should not be included in downstream analyses; this is assessed using the correlation plot with depth. Here, we check if the first LSI component is highly correlated with depth:

DepthCor(seurat, reduction = "lsi", n=40)

8 Clustering

We compute the graph from each data type individually. These graphs will be the input to non-linear dimensionality reduction using UMAP, followed by clustering.

8.1 scRNA

DefaultAssay(seurat) <- "RNA"

seurat <- FindNeighbors(seurat, reduction = "pca", dims = 1:40, verbose = FALSE)

seurat <- RunUMAP(
object = seurat,
   reduction = "pca",
   dims = 1:40,
   reduction.name = "umap.rna",
   reduction.key = "rnaUMAP_"
 )

## 10:02:08 UMAP embedding parameters a = 0.9922 b = 1.112

## 10:02:08 Read 6374 rows and found 40 numeric columns

## 10:02:08 Using Annoy for neighbor search, n_neighbors = 30

## 10:02:08 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 10:02:09 Writing NN index file to temp file /tmp/RtmptqEcjz/file654934809956
## 10:02:09 Searching Annoy index using 1 thread, search_k = 3000
## 10:02:11 Annoy recall = 100%
## 10:02:14 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 10:02:17 Initializing from normalized Laplacian + noise (using irlba)
## 10:02:17 Commencing optimization for 500 epochs, with 290622 positive edges
## 10:02:17 Using rng type: pcg
## 10:02:37 Optimization finished

# parameters for clustering will be held constant at these values. we name the graph RNA_nn.

clustering_fn <- purrr::partial(FindClusters,
                                 object = seurat,
                                 algorithm = 3,
                                 verbose = FALSE,
                                 n.start = 10,
                                 random.seed = config$seed,
                                graph.name = "RNA_nn")

# note: default clustering resolution is 1
seurat <- clustering_fn(resolution = config$clustering_resolution)
p3 <- DimPlot(seurat, reduction = "umap.rna", label.size = 6, label = TRUE) + ggtitle("Regular pipeline: res 1")

8.2 scATAC

For ATAC data, we exclude LSI 1 and use dimensions 2:40.

DefaultAssay(seurat) <- "ATAC"

seurat <- FindNeighbors(seurat, reduction = "lsi", dims = 2:40, verbose = TRUE)

## Computing nearest neighbor graph

## Computing SNN

seurat <- RunUMAP(
  object = seurat,
  reduction = "lsi",
  dims = 2:40,
  reduction.name = "umap.atac",
  reduction.key = "atacUMAP_"
)

## 10:02:41 UMAP embedding parameters a = 0.9922 b = 1.112

## 10:02:41 Read 6374 rows and found 39 numeric columns

## 10:02:41 Using Annoy for neighbor search, n_neighbors = 30

## 10:02:41 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 10:02:42 Writing NN index file to temp file /tmp/RtmptqEcjz/file6549232962f4
## 10:02:42 Searching Annoy index using 1 thread, search_k = 3000
## 10:02:44 Annoy recall = 100%
## 10:02:46 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 10:02:49 Initializing from normalized Laplacian + noise (using irlba)
## 10:02:50 Commencing optimization for 500 epochs, with 273982 positive edges
## 10:02:50 Using rng type: pcg
## 10:03:09 Optimization finished

clustering_fn <- purrr::partial(FindClusters,
                                 object = seurat,
                                 algorithm = 3,
                                 verbose = FALSE,
                                 n.start = 10,
                                 random.seed = config$seed,
                                graph.name = "ATAC_nn")

# note: the default clustering resolution is 1.
seurat <- clustering_fn(resolution = config$clustering_resolution)
p5 <- DimPlot(seurat, reduction = "umap.atac") + ggtitle("res 1")

seurat$barcodes <- colnames(seurat)

# Plot UMAPs separately
p1_bef <- DimPlot(seurat, reduction = "umap.rna", label = TRUE, group.by = "RNA_nn_res.1") + ggtitle("RNA UMAP") 

p2_bef <- DimPlot(seurat, reduction = "umap.atac", label = TRUE, group.by = "ATAC_nn_res.1") + ggtitle("ATAC UMAP")

# Combine side by side
p1_bef + p2_bef

9 Call doublets

9.1 Expected Doublets

Before calling scDblFinder, we add a reference for the number of doublets we expect to see given the number of cells. For example, if the number of cells recovered is under 750, we expect 0.4% of the cells to be doublets, etc.

expected_doublets <- function(cells_recovered) {
    if (cells_recovered < 750) {
        return(0.004)
    } else if (cells_recovered < 1500) {
        return(0.008)
    } else if (cells_recovered < 2500) {
        return(0.016)
    } else if (cells_recovered < 3500) {
        return(0.024)
    } else if (cells_recovered < 4500) {
        return(0.032)
    } else if (cells_recovered < 5500) {
        return(0.04)
    } else if (cells_recovered < 6500) {
        return(0.048)
    } else if (cells_recovered < 7500) {
        return(0.056)
    } else if (cells_recovered < 8500) {
        return(0.064)
    } else if (cells_recovered < 9500) {
        return(0.072)
    } else {
        return(0.08)
    }
}

9.2 Call scdblfinder

Here, we call scdblFinder seperately on scATAC and scRNA data, to identify the number of cells that are identified as an atac_doublet, rna_doublet or both.

9.2.1 scRNA doublets

# Source: https://biostatsquid.com/scdblfinder-tutorial/

# Convert the seurat object to a SingleCellExperiment (sce) object
DefaultAssay(seurat) <- "RNA"
sce <- as.SingleCellExperiment(seurat)
sce <- scDblFinder(sce, dbr = expected_doublets(ncol(seurat)), verbose = FALSE)

table(sce$scDblFinder.class)

## 
## singlet doublet 
##    5744     630

sce@colData@listData %>% as.data.frame() %>% head()

# Explore results and add to seurat object
meta_scdblfinder <- sce@colData@listData %>% as.data.frame() %>% 
  dplyr::select(starts_with('scDblFinder')) 

# Rename columns to avoid collisions and label as RNA doublets
colnames(meta_scdblfinder) <- gsub("^scDblFinder", "rna_doublets", colnames(meta_scdblfinder))
rownames(meta_scdblfinder) <- colnames(sce)

#head(meta_scdblfinder)
rownames(meta_scdblfinder) <- sce@colData@rownames
head(meta_scdblfinder)

seurat <- AddMetaData(object = seurat, metadata = meta_scdblfinder)

# remove the temporary variables
rm(list = c('meta_scdblfinder', 'sce'))

9.2.2 scATAC doublets

DefaultAssay(seurat) <- "ATAC"
sce <- as.SingleCellExperiment(seurat)
sce <- scDblFinder(sce, dbr = expected_doublets(ncol(seurat)))

## Creating ~5100 artificial doublets...

## Dimensional reduction

## Evaluating kNN...

## Training model...

## iter=0, 1483 cells excluded from training.

## iter=1, 1630 cells excluded from training.

## iter=2, 1519 cells excluded from training.

## Threshold found:0.765

## 565 (8.9%) doublets called

table(sce$scDblFinder.class)

## 
## singlet doublet 
##    5809     565

sce@colData@listData %>% as.data.frame() %>% head()

# Explore results and add to seurat object
meta_scdblfinder <- sce@colData@listData %>% as.data.frame() %>% 
  dplyr::select(starts_with('scDblFinder')) 

# Rename columns to avoid collisions and label as RNA doublets
colnames(meta_scdblfinder) <- gsub("^scDblFinder", "atac_doublets", colnames(meta_scdblfinder))
rownames(meta_scdblfinder) <- colnames(sce)

#head(meta_scdblfinder)
rownames(meta_scdblfinder) <- sce@colData@rownames
head(meta_scdblfinder)

seurat <- AddMetaData(object = seurat, metadata = meta_scdblfinder)

# remove the temporary variables
rm(list = c('meta_scdblfinder', 'sce'))

Now, we identify which category each cell falls into (is it consistently idenfied as a doublet)?

# Identify cells called doublets by each method
rna_doublets <- rownames(seurat@meta.data)[seurat@meta.data$rna_doublets.class == "doublet"]
atac_doublets <- rownames(seurat@meta.data)[seurat@meta.data$atac_doublets.class == "doublet"]

# Define both and all doublets
both_doublets <- intersect(rna_doublets, atac_doublets)
all_doublets <- union(rna_doublets, atac_doublets)

# Initialize metadata vector (default to singlet)
all_doublets.metadata <- rep("singlet", ncol(seurat))
names(all_doublets.metadata) <- colnames(seurat)

# Assign doublet labels
all_doublets.metadata[rna_doublets] <- "rna_doublet"
all_doublets.metadata[atac_doublets] <- "atac_doublet"
all_doublets.metadata[both_doublets] <- "both_doublet"

# Add to Seurat object
seurat <- AddMetaData(seurat, metadata = all_doublets.metadata, col.name = "all_doublets")

# Coloured by singlets and doublets 
DefaultAssay(seurat) <- "RNA"
p1 <- DimPlot(
  object = seurat,
  group.by = "all_doublets",   # colour by this column
  cols     = c("singlet" = "grey",
               "rna_doublet" = "coral1",
               "atac_doublet" = "purple",
               "both_doublet" = "darkolivegreen"),
  pt.size  = 0.3)

DefaultAssay(seurat) <- "ATAC"
p2 <- DimPlot(
  seurat,
  group.by = "all_doublets",
  pt.size  = 0.3,
  cols     = c("singlet" = "grey",
               "rna_doublet" = "coral1",
               "atac_doublet" = "purple",
               "both_doublet" = "darkolivegreen"),
)

p1 + p2

# Coloured by scDblFinder.score: RNA assay
DefaultAssay(seurat) <- "RNA"
seurat$rna_log.score <- log1p(seurat$rna_doublets.score)
p3 <- FeaturePlot(seurat, 
                  features = "rna_log.score", 
                  label = TRUE,
                  label.size = 10,
                  pt.size = 0.3) + scale_colour_gradient(low = "grey", high = "coral3")

## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

# Coloured by scDblFinder.score: ATAC assay
DefaultAssay(seurat) <- "ATAC"
seurat$atac_log.score <- log1p(seurat$atac_doublets.score)
p4 <- FeaturePlot(seurat, 
                  features = "atac_log.score", 
                  label = TRUE,
                  label.size = 10,
                  pt.size = 0.3) + scale_colour_gradient(low = "grey", high = "purple4")

## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

cowplot::plot_grid(p3, p4, ncol = 2)

# barplot per cluster with the number of singlets and doublets
## first, we create a summary dataframe of cells per cluster
cluster_counts <- seurat@meta.data %>%
  group_by(seurat_clusters, all_doublets) %>%
  summarise(n = n()) %>%
  ungroup()

## `summarise()` has grouped output by 'seurat_clusters'. You can override using
## the `.groups` argument.

## then, we plot with ggplot2
ggplot(cluster_counts, aes(x = seurat_clusters, y = n, fill = all_doublets), labels = TRUE, labels.size = 20, repel = TRUE) +
  geom_col(position = "stack") +
  labs(x = "Cluster No.", y = "Count") +
  theme_minimal() +
  scale_fill_manual(values = c("singlet" = "grey",
               "rna_doublet" = "coral1",
               "atac_doublet" = "purple",
               "both_doublet" = "darkolivegreen")) +
   geom_text(aes(label = n), 
            position = position_stack(vjust = 0.5),  # center labels vertically in each segment
            color = "black", size = 8) +
  theme(
    axis.text.x = element_text(size = 20),  # X-axis text size
    axis.text.y = element_text(size = 20),  # Y-axis text size
    axis.title.x = element_text(size = 16),     # X-axis title
    axis.title.y = element_text(size = 16),     # Y-axis title
    legend.text = element_text(size = 13),      # Legend item labels
    legend.title = element_text(size = 15),     # Legend title
  )

# singlet/doublet plot split by cluster
plot_doublet_vln <- function(seurat, title, y_axis, what_cluster) {
    
    FetchData(seurat, c(what_cluster, "all_doublets")) %>% 
        ggplot(aes(x = seurat_clusters, y = y_axis)) +
        geom_violin(scale = "width", fill = "gray90") +
        geom_jitter(aes(color = all_doublets), size = 1, width = 0.2, alpha = 0.5) +
        scale_color_manual(values = c("singlet" = "grey",
               "rna_doublet" = "coral1",
               "atac_doublet" = "purple",
               "both_doublet" = "darkolivegreen")) +
        ggtitle(title) + xlab("Cluster")
    
}


# Box plots of singlet/doublet split by numfeatures and numcounts

df_long <- melt(
  seurat@meta.data,
  id.vars = c("barcodes", "all_doublets"),
  measure.vars = c("nFeature_RNA", "nCount_RNA", "nFeature_ATAC", "nCount_ATAC")
)


ggplot(df_long, aes(x = all_doublets, y = value, fill = all_doublets)) +
  geom_boxplot(outlier.size = 0.5) +
  facet_wrap(~variable, scales = "free_y", ncol = 4) +
  theme_classic() +
  theme(legend.position = "right") +
  xlab("") + ylab("Value")

9.3 Remove doublets

# Remove the droplets marked as doublets
seurat <- subset(seurat, subset = all_doublets == 'singlet')

We visualise both modalities again, without rna_doublets, atac_doublets and both_doublets. Top row: before filtering out doublets. Bottom row: after filtering out doublets.

# Plot UMAPs separately
p1_aft <- DimPlot(seurat, reduction = "umap.rna", label = TRUE, group.by = "RNA_nn_res.1") + ggtitle("RNA UMAP") 

p2_aft <- DimPlot(seurat, reduction = "umap.atac", label = TRUE) + ggtitle("ATAC UMAP")

# Combine side by side
# Plot clusters almost look "cleaned up" -- less overlap between different clusters
(p1_bef | p2_bef) / (p1_aft | p2_aft)

9.4 Save no doublet data

We save the filtered seurat object.

filepath <- paste0(out_dir, "/no_doublet_objects/", sample, ".no_doublets.rds")

# creates directory in case it doesn't exist
dir.create(dirname(filepath), recursive = TRUE, showWarnings = FALSE)

saveRDS(seurat, file = filepath)

10 Cluster-level QC

Here, we check the number of cells in each cluster after doublets have been filtered out. We register a warning if more than a third of clusters have less than 100 cells.

# 1. rearrange the clusters to be in ascending order in the seurat object (allows them to be plotted in order)

cluster_col <- "RNA_nn_res.1" 

seurat[[cluster_col]] <- factor(seurat[[cluster_col]][[1]], levels = as.character(c(0:9, 10:15)))


# number of clusters with < 100 cells, divided by the total number of clusters
# 2. check the clusters from RNA

rna <- table(seurat@meta.data$RNA_nn_res.1)

if ((sum(rna < 100) / length(rna)) > 0.33) {
  warnings$SMALL_CLUSTERS <- TRUE
}

# 3. check the clusters from ATAC
atac <- table(seurat@meta.data$ATAC_nn_res.1)

if ((sum(atac < 100) / length(atac)) > 0.33) {
  warnings$SMALL_CLUSTERS <- TRUE
}

We then generate violin plots of the distribution of each QC metric in each cluster, with the number of cells in each cluster indicated above each violin.

10.1 scRNA QC check

# make a dataframe of the number of cells per cluster (rna)
clust_df <- data.frame(rna)

clust_df[1, 2] <- paste0("N=", clust_df[1, 2])
colnames(clust_df) <- c("ident", "N")

vln_fun <- function(seurat_obj = seurat, criterion, cluster_col = "RNA_nn_res.1", colours = NULL) {
  
  # Extract cluster identities as factor
  cluster_ids <- as.factor(seurat_obj@meta.data[[cluster_col]])
  
  # Make table of number of cells per cluster, forcing all levels to appear
  cluster_counts <- table(factor(cluster_ids, levels = levels(cluster_ids)))
  
  # Make clust_df with cluster identity and cell count
  clust_df <- data.frame(
    ident = names(cluster_counts),
    N = paste0("N=", as.numeric(cluster_counts))
  )
  
  # Add y position for label, 95% of max value of criterion
  clust_df$y <- max(seurat_obj[[criterion]][,1]) * 0.95
  
  # Create violin plot
  Seurat::VlnPlot(seurat_obj, features = criterion, group.by = cluster_col, cols = colours, pt.size = 0.2) +
    theme(legend.position = "none") +
    geom_text(data = clust_df, aes(label = N, x = ident, y = y)) +
    xlab("Cluster")
}

vln_fun(criterion = "nCount_RNA")

vln_fun(criterion = "nFeature_RNA")

vln_fun(criterion = "percent.mito")

vln_fun(criterion = "nCount_ATAC")

vln_fun(criterion = "nFeature_ATAC")

vln_fun(criterion = "atac_peak_region_fragments")

vln_fun(criterion = "TSS.enrichment")

vln_fun(criterion = "nucleosome_signal")

10.2 scATAC QC check

# make a dataframe of the number of cells per cluster (atac)
clust_df <- data.frame(atac)
clust_df[1, 2] <- paste0("N=", clust_df[1, 2])
colnames(clust_df) <- c("ident", "N")

vln_fun <- function(criterion) {
    
    # plot the labels at a value slightly below the max, to ensure they're shown
    # within plot limits
    clust_df$y <- max(seurat[[criterion]]) * 0.95
    Seurat::VlnPlot(seurat, criterion, cols = seurat@misc$colours, pt.size = 0.2) +
        theme(legend.position = "none") +
        geom_text(data = clust_df, aes(label = N, x = ident, y = y)) +
        xlab("Cluster")
    
}

vln_fun("nCount_RNA")

vln_fun("nFeature_RNA")

vln_fun("percent.mito")

vln_fun("nCount_ATAC")

vln_fun("nFeature_ATAC")

vln_fun("atac_peak_region_fragments")

vln_fun("TSS.enrichment")

vln_fun("nucleosome_signal")

10.3 Save clusters

We save the clusters in the seurat object after this QC check.

filepath <- paste0(out_dir, "/clustered_objects/", sample, ".clustered.rds")

# creates directory in case it doesn't exist
dir.create(dirname(filepath), recursive = TRUE, showWarnings = FALSE)

saveRDS(seurat, file = filepath)

11 Finding cluster markers

We then identify cluster markers for each cluster for the specified resolution (config$clustering_resolution) using a Wilcoxon Rank Sum Test. We also display heatmaps of the top 10 genes/peaks that were expressed in each cell cluster. This allows us to identify differentially expressed features across clusters.

11.1 scRNA

# First, for RNA markers
DefaultAssay(seurat) <- 'RNA'
Idents(seurat) <- seurat[["RNA_nn_res.1"]]
cluster_markers_rna <- FindAllMarkers(object = seurat, verbose = FALSE)

# display the top 30 per cluster
cluster_markers_rna %>%
  dplyr::group_by(cluster) %>%
  top_n(n = 30, wt = avg_log2FC) %>%
  dplyr::select(cluster, gene, everything()) %>%
  DT::datatable(cluster_markers_rna, filter = "top")

write_tsv(cluster_markers_rna, file.path(out_dir, "cluster_markers_rna.tsv"))

DefaultAssay(seurat) <- 'RNA'
Idents(seurat) <- "RNA_nn_res.1"
top10_rna <- cluster_markers_rna %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
DoHeatmap(seurat, features = top10_rna$gene, group.colors = seurat@misc$colours) +
  NoLegend() +
  scale_fill_gradientn(colors = c("#2166AC", "#E5E0DC", "#B2182B")) +
  theme(
    axis.text.y = element_text(size = 4),  # smaller y-axis text (genes)
  )

11.2 scATAC

Note that the process of finding cluster markers for scATAC data is slow, due to sparse data. An alternative approach is Presto (https://immunogenomics.github.io/presto/articles/getting-started.html)

cluster_markers_atac <- FindAllMarkers(object = seurat,
  assay = "ATAC",
  verbose = FALSE)

# display the top 30 per cluster
cluster_markers_atac %>%
  dplyr::group_by(cluster) %>%
  top_n(n = 30, wt = avg_log2FC) %>%
  dplyr::select(cluster, gene, everything()) %>%
  DT::datatable(cluster_markers_atac, filter = "top")

write_tsv(cluster_markers_atac, file.path(out_dir, "cluster_markers_atac.tsv"))

# Displays heatmap of the top 10 peaks in each cluster
DefaultAssay(seurat) <- "ATAC"
seurat <- ScaleData(seurat)
Idents(seurat) <- "ATAC_nn_res.1"
top10_atac <- cluster_markers_atac %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
DoHeatmap(seurat,
          features = top10_atac$gene,
          group.by = "ATAC_nn_res.1",
          group.colors = seurat@misc$colours) +
  NoLegend() +
  scale_fill_gradientn(colors = c("#2166AC", "#E5E0DC", "#B2182B"))

We can also use the reference genome to map peaks back to the genes that have the most chromatin accessibility.

# find gene annotations for each of these peaks
peaks_of_interest <- StringToGRanges(top10_atac$gene, sep = c("-", "-"))
annotation <- Annotation(seurat)
overlaps <- findOverlaps(peaks_of_interest, annotation)
# for each peak, get the overlapping gene(s)
peak_idx <- queryHits(overlaps)
gene_idx <- subjectHits(overlaps)

# create a dataframe linking peak -> gene
peak_gene_df <- data.frame(
  peak = top10_atac$gene[peak_idx],
  gene = annotation$gene_name[gene_idx]
)
peak_gene_df_unique <- peak_gene_df %>%
  group_by(peak) %>%
  summarize(gene = paste(unique(gene), collapse = ";"))
peak_to_gene <- setNames(peak_gene_df_unique$gene, peak_gene_df_unique$peak)

DoHeatmap(seurat, features = top10_atac$gene, group.colors = seurat@misc$colours) +
  NoLegend() +
  scale_fill_gradientn(colors = c("#2166AC", "#E5E0DC", "#B2182B")) +
  scale_y_discrete(labels = function(x) peak_to_gene[x])+ theme(
    axis.text.y = element_text(size = 3),  # smaller y-axis text (genes)
  )

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

12 Save QC stats

We save the seurat object with all the quality control metrics observed so far.

# save exact metrics used for filtering and pre/post QC stats, as well as number 
# of cells before/after filtering

# Define a common output directory path
qc_dir <- file.path(out_dir, "qc_metrics")

# Create the directory if it doesn't exist
dir.create(qc_dir, recursive = TRUE, showWarnings = FALSE)

# Define filepaths
filtering_metrics_file <- file.path(qc_dir, paste0(sample, ".filtering_metrics.tsv"))
n_cells_metrics_file   <- file.path(qc_dir, paste0(sample, ".N_cells_metrics.tsv"))

# Write the files
write_tsv(filtering_metrics, filtering_metrics_file)
write_tsv(N_cells_metrics,   n_cells_metrics_file)

13 Cell cycle scoring

To score cells for their cell cycle activity, we compute the scores using the method implemented in Seurat:

DefaultAssay(seurat) <- "RNA"
# DimHeatmap(seurat, dims = c(10, 30))

# a list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat;
# segregate this list into markers of G2/M phase and markers of S phase
s_genes   <- cc.genes$s.genes
g2m_genes <- cc.genes$g2m.genes

seurat <- CellCycleScoring(seurat, s.features = s_genes, g2m.features = g2m_genes, set.ident = TRUE)

# Plots for the cell state in RNA and ATAC reductions 
head(seurat[[]])

p1 <- DimPlot(seurat) + ggtitle("RNA reduction")
p2 <- DimPlot(seurat, reduction="umap.atac") + ggtitle("ATAC reduction")

cc.markers <- switch(config$species,
                     "h_sapiens" = c("PCNA", "TOP2A", "MCM6", "MKI67"),
                     "m_musculus" = c("Pcna", "Top2a", "Mcm6", "Mki67"))

# ensure all the genes that we're checking for are in the dataset
genes_to_check <- c("PCNA", "TOP2A", "MCM6", "MKI67")
genes_to_check %in% rownames(seurat[["RNA"]]@data)

## [1] TRUE TRUE TRUE TRUE

# note: if nothing is visible in these ridge plots, it means the peaks are too small to be seen.
# RidgePlot(seurat,
#           group.by = "Phase",
#           features = cc.markers,
#           ncol = 2,
#           assay = "RNA",
#           slot = "data",
#           same.y.lims=T)

UMAP plot coloured by phase:

DefaultAssay(seurat) <- "RNA"
phase_palette <- c("G1" = "#F8766D", "G2M" = "#00BA38", "S" = "#619CFF")
DimPlot(seurat, reduction = "umap.rna", group.by = "Phase", dims = c(1, 2), cols = phase_palette)

We register a warning if more than half of the cells labelled G2M have low TOP2A expression. However, if upon further inspection of the proportion of cells expressing each set of genes, we see the highest proportions of cells expressing genes from their associated gene set, it is deemed acceptable.

top2a_gene <- switch(config$species,
                     "h_sapiens" = "TOP2A",
                     "m_musculus" = "Top2a")

if (top2a_gene %in% rownames(seurat)) {

  # subset to cells assigned as G2M phase
  top2a_expr <- as.matrix(subset(seurat,
                                 subset = Phase == "G2M")[["RNA"]][top2a_gene, ])
  top2a_expr <- t(top2a_expr)

  # if more than 50% of cells assigned as G2M phase have low TOP2A expression,
  # flag it
  # note: this warning is arbitrary
  (prop_low_top2a <- (sum(top2a_expr < 1) / length(top2a_expr)))
  if (prop_low_top2a > 0.5) {
    print("low TOP2A expression")
    warnings$CC_ASSIGNMENT <- TRUE}

} else {

  # Top2a is not detected
  warnings$CC_ASSIGNMENT <- TRUE
  print("Top2a not detected")

}

## [1] "low TOP2A expression"

# plot the expression of each gene set in cells classified as G2/M, S and G1
prop_df <- FetchData(seurat, c("ident", "barcodes", g2m_genes, s_genes)) %>%
  group_by(ident) %>%
  mutate(n_total = n()) %>%
  pivot_longer(cols = 3:(ncol(.) - 1), names_to = "gene") %>%
  mutate(
    gene_set = case_when(
      gene %in% g2m_genes ~ "geneset:g2/m", # index for the genes
      gene %in% s_genes ~ "geneset:s"
    )
  ) %>%
  filter(value>0) %>%
  group_by(ident, gene, gene_set, n_total) %>%
  count() %>%
  mutate(
    proportion = n/n_total
  )

prop_df %>%
  ggplot(aes(x = gene, y = proportion)) +
  geom_bar(stat = "identity") +
  # scale_fill_manual(values = c("highlight" = "red", "normal" = "steelblue")) +
  labs(title = "Proportion of Cells Expressing Each Gene",
       x = "Gene", y = "Proportion of Cells") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 0.5, size = 5),
        legend.position = "none") +
facet_wrap(~gene_set, scales = "free_x")

prop_df %>%
  ggplot(aes(x = gene, y = proportion, fill = ident)) +
  geom_bar(stat = "identity") +
  # scale_fill_manual(values = c("highlight" = "red", "normal" = "steelblue")) +
  labs(title = "Proportion of Cells Expressing Each Gene",
       x = "Gene", y = "Proportion of Cells") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 0.5, size = 5)) +
facet_grid(ident~gene_set, scales = "free_x")

14 Save object

All parameters and filtering metrics are saved both within the seurat object in the seurat@misc slot, as well as in the output directory.

# parameters are saved with the Seurat object
seurat@misc$params$config                <- config
seurat@misc$filtering_metrics            <- filtering_metrics
seurat@misc$n_cells                      <- N_cells_metrics

# write metrics/thresholds to file
filtering_metrics_out <- filtering_metrics %>%
    gather("metrics_name", "value", -criterion) %>%
    unite("metrics", criterion:metrics_name, sep = "_") %>%
    spread("metrics", "value") %>% 
    bind_cols(N_cells_metrics)

write_tsv(filtering_metrics_out, path = file.path(out_dir, "seurat_metrics.tsv"))

Save the Seurat object for downstream analysis:

# Ensure RNA is the default assay
DefaultAssay(seurat) <- "RNA"
save(seurat, file = file.path(out_dir, "seurat.Rda"))

15 Reproducibility

This document was last rendered on:

## 2025-08-08 10:51:44

The R session info:

## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /share/software/user/open/openblas/0.3.10/lib/libopenblas_haswellp-r0.3.10.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] future_1.58.0                     ggridges_0.5.6                   
##  [3] reshape2_1.4.4                    stringr_1.5.1                    
##  [5] dplyr_1.1.4                       tidyr_1.3.1                      
##  [7] biovizBase_1.46.0                 ggExtra_0.10.1                   
##  [9] ggpointdensity_0.2.0              BSgenome.Hsapiens.UCSC.hg38_1.4.5
## [11] BSgenome_1.66.3                   rtracklayer_1.58.0               
## [13] Biostrings_2.66.0                 XVector_0.38.0                   
## [15] EnsDb.Hsapiens.v86_2.99.0         ensembldb_2.22.0                 
## [17] AnnotationFilter_1.22.0           GenomicFeatures_1.50.4           
## [19] AnnotationDbi_1.60.2              scDblFinder_1.12.0               
## [21] SingleCellExperiment_1.20.1       SummarizedExperiment_1.28.0      
## [23] Biobase_2.58.0                    GenomicRanges_1.50.2             
## [25] GenomeInfoDb_1.34.9               IRanges_2.32.0                   
## [27] S4Vectors_0.36.2                  BiocGenerics_0.44.0              
## [29] MatrixGenerics_1.10.0             matrixStats_1.5.0                
## [31] Signac_1.9.0                      SeuratObject_4.1.4               
## [33] Seurat_4.3.0                      viridis_0.6.5                    
## [35] viridisLite_0.4.2                 plotly_4.11.0                    
## [37] ggplot2_3.5.2                     purrr_1.0.4                      
## [39] readr_2.1.5                       here_1.0.1                       
## 
## loaded via a namespace (and not attached):
##   [1] rappdirs_0.3.3            scattermore_1.2          
##   [3] bit64_4.6.0-1             knitr_1.50               
##   [5] irlba_2.3.5.1             DelayedArray_0.24.0      
##   [7] rpart_4.1.16              data.table_1.17.6        
##   [9] KEGGREST_1.38.0           RCurl_1.98-1.17          
##  [11] generics_0.1.4            ScaledMatrix_1.6.0       
##  [13] cowplot_1.1.3             RSQLite_2.4.1            
##  [15] RANN_2.6.2                bit_4.6.0                
##  [17] tzdb_0.5.0                spatstat.data_3.1-6      
##  [19] xml2_1.3.3                httpuv_1.6.16            
##  [21] xfun_0.52                 hms_1.1.3                
##  [23] jquerylib_0.1.4           evaluate_1.0.4           
##  [25] promises_1.3.3            restfulr_0.0.15          
##  [27] progress_1.2.3            dbplyr_2.5.0             
##  [29] igraph_2.1.4              DBI_1.2.3                
##  [31] htmlwidgets_1.6.4         spatstat.geom_3.4-1      
##  [33] crosstalk_1.2.1           backports_1.5.0          
##  [35] biomaRt_2.54.1            deldir_2.0-4             
##  [37] sparseMatrixStats_1.10.0  vctrs_0.6.5              
##  [39] ROCR_1.0-11               abind_1.4-8              
##  [41] cachem_1.1.0              withr_3.0.2              
##  [43] progressr_0.15.1          vroom_1.6.5              
##  [45] checkmate_2.3.2           sctransform_0.4.2        
##  [47] GenomicAlignments_1.34.1  prettyunits_1.2.0        
##  [49] scran_1.26.2              goftest_1.2-3            
##  [51] cluster_2.1.8.1           lazyeval_0.2.2           
##  [53] crayon_1.5.1              hdf5r_1.3.12             
##  [55] spatstat.explore_3.4-3    labeling_0.4.3           
##  [57] edgeR_3.40.2              pkgconfig_2.0.3          
##  [59] nlme_3.1-168              vipor_0.4.7              
##  [61] ProtGenerics_1.30.0       nnet_7.3-17              
##  [63] rlang_1.1.6               globals_0.18.0           
##  [65] lifecycle_1.0.4           miniUI_0.1.2             
##  [67] filelock_1.0.3            BiocFileCache_2.6.1      
##  [69] rsvd_1.0.5                dichromat_2.0-0.1        
##  [71] ggrastr_1.0.2             rprojroot_2.0.4          
##  [73] polyclip_1.10-7           lmtest_0.9-40            
##  [75] Matrix_1.6-1.1            zoo_1.8-14               
##  [77] base64enc_0.1-3           beeswarm_0.4.0           
##  [79] png_0.1-8                 rjson_0.2.23             
##  [81] bitops_1.0-9              KernSmooth_2.23-26       
##  [83] blob_1.2.4                DelayedMatrixStats_1.20.0
##  [85] spatstat.univar_3.1-3     parallelly_1.45.0        
##  [87] spatstat.random_3.4-1     beachmat_2.14.2          
##  [89] scales_1.4.0              memoise_2.0.1            
##  [91] magrittr_2.0.3            plyr_1.8.9               
##  [93] ica_1.0-3                 zlibbioc_1.44.0          
##  [95] compiler_4.2.0            dqrng_0.4.1              
##  [97] BiocIO_1.8.0              RColorBrewer_1.1-3       
##  [99] fitdistrplus_1.2-2        Rsamtools_2.14.0         
## [101] cli_3.6.5                 listenv_0.9.1            
## [103] patchwork_1.3.1           pbapply_1.7-2            
## [105] htmlTable_2.4.3           Formula_1.2-5            
## [107] MASS_7.3-9                tidyselect_1.2.1         
## [109] stringi_1.8.7             yaml_2.3.10              
## [111] BiocSingular_1.14.0       locfit_1.5-9.12          
## [113] ggrepel_0.9.6             grid_4.2.0               
## [115] VariantAnnotation_1.44.1  sass_0.4.10              
## [117] fastmatch_1.1-6           tools_4.2.0              
## [119] future.apply_1.20.0       parallel_4.2.0           
## [121] rstudioapi_0.17.1         foreign_0.8-82           
## [123] bluster_1.8.0             metapod_1.6.0            
## [125] gridExtra_2.3             farver_2.1.2             
## [127] Rtsne_0.17                digest_0.6.37            
## [129] shiny_1.11.0              Rcpp_1.0.14              
## [131] scuttle_1.8.4             later_1.4.2              
## [133] RcppAnnoy_0.0.22          httr_1.4.7               
## [135] colorspace_2.1-1          XML_3.99-0.18            
## [137] tensor_1.5.1              reticulate_1.42.0        
## [139] splines_4.2.0             uwot_0.2.3               
## [141] RcppRoll_0.3.1            statmod_1.5.0            
## [143] spatstat.utils_3.1-4      scater_1.26.1            
## [145] sp_2.2-0                  xgboost_1.7.11.1         
## [147] xtable_1.8-4              jsonlite_2.0.0           
## [149] R6_2.6.1                  Hmisc_5.2-3              
## [151] pillar_1.10.2             htmltools_0.5.8.1        
## [153] mime_0.13                 DT_0.33                  
## [155] glue_1.8.0                fastmap_1.2.0            
## [157] BiocParallel_1.32.6       BiocNeighbors_1.16.0     
## [159] codetools_0.2-20          lattice_0.22-7           
## [161] bslib_0.9.0               spatstat.sparse_3.1-0    
## [163] tibble_3.3.0              curl_6.4.0               
## [165] ggbeeswarm_0.7.2          leiden_0.4.3.1           
## [167] survival_3.8-3            limma_3.54.2             
## [169] rmarkdown_2.29            GenomeInfoDbData_1.2.9   
## [171] gtable_0.3.6

Preprocess scMultiome data

08 August, 2025