Preprocessing and QC for a single-cell multiome dataset. Single cell data is obtained from first-trimester placental cells.
library(here)
library(readr)
library(purrr)
library(ggplot2)
library(plotly)
library(viridis)
library(Seurat)
library(Signac)
library(scDblFinder)
library(EnsDb.Hsapiens.v86)
library(BSgenome.Hsapiens.UCSC.hg38)
library(ggpointdensity)
library(ggExtra)
library(biovizBase)
library(tidyr)
library(dplyr)
library(stringr)
library(reshape2)
library(ggridges)
# library(magick)
# library(CHOIR)
# library(ggalluvial)
# library(aricode)
set.seed(1234)
# sample <- params$sample
sample <- "s21b3"
sample
## [1] "s21b3"
library(here)
global_out_dir <- readLines(here("code/OUT_DIR.txt"))
data_dir <- readLines(here("code/DATA_DIR.txt"))
base_dir <- "/oak/stanford/groups/akundaje/projects/placenta_v2/output/singlecell/00/cellranger-arc"
out_dir <- file.path(global_out_dir, "singlecell/01/preprocessing", sample)
# load sample sheet
sample_meta <- read_tsv(file.path(data_dir, "scMultiome/metadata/20250526-multiome_metadata.tsv"))
## Rows: 10 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Source, ID, Batch, Liquid or Solid, Trisomy or Control, Embryo sex,...
## dbl (3): MIP_sample, Sequencing date, Gestational weeks
## lgl (1): Termination or CVS
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# construct path to sample's cellranger output
path_cellranger <- sample_meta %>%
dplyr::filter(ID == sample) %>%
pull(Path_cellranger)
# load config params
config_tsv <- data.table::fread(here("code", "singlecell", "01-preprocessing", "01-preprocessing-config.tsv"), data.table = FALSE)
# coerce to a list of param:value pairs, and make sure numeric parameters are numeric type
config <- as.list(tibble::deframe(config_tsv[, c(1, 2)]))
config$genes <- unlist(stringr::str_split(config$genes, pattern = ","))
config <- lapply(config, type.convert, as.is = TRUE)
Load the output from cellranger, which contains the gene and peak counts:
# load genome annotations for hg38
# here, annotations loaded using code adapted from
# Kundaje Lab (2023). "PsychENCODE Multiome Analysis Pipeline."
# Github repository: https://github.com/kundajelab/psychencode/blob/main/exps/20231009_multiome_analysis_v2/02-preprocessing/qc.template.Rmd
suppressWarnings({
annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
seqlevelsStyle(annotation) <- "UCSC"
})
# load cellranger-produced metadata with per-cell stats
cr_metadata <- read.csv(file.path(base_dir, sample, "per_barcode_metrics.csv"),
header = TRUE,
row.names = 1)
# load filtered matrices - contains both RNA and ATAC matrices
counts <- Read10X_h5(file.path(base_dir, sample, "filtered_feature_bc_matrix.h5"))
## Genome matrix has multiple modalities, returning a list of matrices for this genome
# list the data modalities for which counts are provided
names(counts)
## [1] "Gene Expression" "Peaks"
Initialize the Seurat object, with the scRNAseq data in the “RNA”
assay and the scATACseq data in the “ATAC” assay. This applies one
filter: only genes & peaks which are detected in at least
config$min_cells are retained. No further filtering on
genes or peaks is performed. We also load in the TF analysis (which
estimates a total signal for each TF in each cell) from Cellranger, in
case we want to use it downstream.
# initialize Seurat seurat_prefilt object with RNA
seurat_prefilt <- CreateSeuratObject(
counts = counts$`Gene Expression`,
assay = "RNA",
meta.data = cr_metadata,
project = sample,
min.cells = config$min_cells
)
# create the ATACseq assay
seurat_prefilt[["ATAC"]] <- CreateChromatinAssay(
counts = counts$Peaks,
sep = c(":", "-"),
genome = "hg38", # reference genome
fragments = file.path(base_dir, sample, "atac_fragments.tsv.gz"),
min.cells = config$min_cells
)
## Computing hash
# set gene annotations
Annotation(seurat_prefilt[["ATAC"]]) <- annotation # obtained from hg38 reference genome
# load filtered tf matrix
tf_counts <- Read10X_h5(file.path(base_dir, sample, "analysis/tf_analysis/filtered_tf_bc_matrix.h5"))
# create the TF assay
# filters out motifs detected in fewer than config$min_cells cells
assay_TF <- CreateAssayObject(
counts = tf_counts,
min.cells = config$min_cells
)
seurat_prefilt[['TF']] <- assay_TF
warnings <- list("LOW_N_CELLS" = FALSE,
"HIGH_MITO" = FALSE,
"HIGH_PROP_FILTERED" = FALSE,
"LOW_AVG_UMI" = FALSE,
"CC_ASSIGNMENT" = FALSE,
"SMALL_CLUSTERS" = FALSE)
In this section, we compute several QC metrics for scRNAseq/scATACseq, filter cells based on the distributions of these metrics, and then assess the distributions of these metrics after filtering.
For some definitions related to the different metrics computed from Multiome data, see the 10X glossary at: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/glossary
In particular, note that for RNA, counts represent the number of UMIs, while for ATAC, counts represent the number of cut/transposition sites. That is, RNA reflects transcriptional activity while ATAC reflects regulatory potential.
Count Matrix: A matrix of counts representing the number of unique observations of each Feature within each Cell Barcode. Each feature (gene or peak) is a row in the matrix, while each barcode is a column. For a gene feature the count represents the number of UMIs observed while for a peak the count represents the number of transposition sites within the peak region.
For quality control, we assess the mitochondrial content and ribosomal content at the single-cell level, using the proportion of reads which map to mitochondrial genes, or ribosomal protein genes, respectively.
DefaultAssay(seurat_prefilt) <- "RNA"
# identify mitochondrial genes, which depends on the species, due to gene name differences
if (config$species == "h_sapiens") {
mito_genes <- grep("^MT-", rownames(GetAssayData(object = seurat_prefilt)), value = TRUE)
} else if (config$species == "m_musculus") {
mito_genes <- grep("^mt-", rownames(GetAssayData(object = seurat_prefilt)), value = TRUE)
}
# compute, for each cell, the proportion of reads in mitochondrial genes, and add to the metadata
percent_mito <- Matrix::colSums(GetAssayData(object = seurat_prefilt)[mito_genes, ]) /
Matrix::colSums(GetAssayData(object = seurat_prefilt)) * 100
seurat_prefilt <- AddMetaData(seurat_prefilt, percent_mito, "percent.mito")
# identify ribosomal genes, which depends on the species, due to gene name differences
if (config$species == "h_sapiens") {
ribo_genes <- grepl("^RPS|^RPL|^MRPS|^MRPL", rownames(GetAssayData(object = seurat_prefilt)))
} else if (config$species == "m_musculus") {
ribo_genes <- grepl("^Rps|^Rpl|^Mrps|^Mrpl", rownames(GetAssayData(object = seurat_prefilt)))
}
# compute, for each cell, the proportion of reads in ribsomal genes, and add to the metadata
percent_ribo <- Matrix::colSums(GetAssayData(object = seurat_prefilt)[ribo_genes, ]) /
Matrix::colSums(GetAssayData(object = seurat_prefilt)) * 100
seurat_prefilt <- AddMetaData(seurat_prefilt, percent_ribo, "percent.ribo")
Here we calculate Nucleosome signal and TSS enrichment for the cells in the sample, which are described below with the resulting plots.
DefaultAssay(seurat_prefilt) <- "ATAC"
# compute stats using Signac functions
seurat_prefilt <- NucleosomeSignal(object = seurat_prefilt)
seurat_prefilt <- TSSEnrichment(seurat_prefilt, fast = FALSE)
## Extracting TSS positions
## Finding + strand cut sites
## Finding - strand cut sites
## Computing mean insertion frequency in flanking regions
## Normalizing TSS score
Because fragments correspond to cutting at different integer multiples of nucleosomes, the distribution of fragment lengths should reveal a specific nucleosome banding pattern, with a fragment length periodicity of ~200bp. We separate cells based on the nucleosome signal, and it should be clear that one population has the typical periodic nucleosome banding pattern.
p1 <- VlnPlot(seurat_prefilt, "nucleosome_signal", pt.size = -1) + NoLegend()
sum(is.infinite(seurat_prefilt@meta.data$nucleosome_signal))
## [1] 0
# deal with inf values
seurat_prefilt@meta.data$nucleosome_signal[is.infinite(seurat_prefilt@meta.data$nucleosome_signal)] <- 0
# set threshold for nucleosome_signal as 1.5
seurat_prefilt$nucleosome_group <- ifelse(seurat_prefilt$nucleosome_signal > 1.5, 'NS > 1.5', 'NS < 1.5')
p2 <- FragmentHistogram(object = seurat_prefilt, group.by = 'nucleosome_group', region = 'chr1-1-10000000') +
scale_fill_manual(values = c("forestgreen", "gray80"))
# cowplot::plot_grid(p1, p2, rel_widths = c(0.3, 0.7), align = "hv", axis = "tb")
p1
p2
We also quantify chromatin accessibility at transcription start sites (TSS). Successful scATACseq experiments should have a strong enrichment of accessibility near TSS, and we can again use this metric to stratify cells:
p1 <- VlnPlot(seurat_prefilt, "TSS.enrichment", pt.size = -1) + NoLegend()
seurat_prefilt$high.tss <- ifelse(seurat_prefilt$TSS.enrichment > 2, 'High', 'Low')
p2 <- TSSPlot(seurat_prefilt, group.by = 'high.tss') + NoLegend() +
scale_color_manual(values = c("forestgreen", "gray80"))
# cowplot::plot_grid(p1, p2, rel_widths = c(0.3, 0.7), align = "hv", axis = "tb")
p1
p2
The thresholds used are a combination of hard cutoffs and cutoffs computed based on the distribution of each metric within the sample.
thresholds <- data.frame(
# the minimum number of features will be the greater of:
# 400, or 2 standard deviations below the mean
min_features = max(400, round(mean(seurat_prefilt@meta.data$nFeature_RNA) -
2*sd(seurat_prefilt@meta.data$nFeature_RNA))),
max_features = round(mean(seurat_prefilt@meta.data$nFeature_RNA) +
2*sd(seurat_prefilt@meta.data$nFeature_RNA)),
# the max mitochondrial content will be the maximum of:
# 2%, or 2 standard deviations above the mean
# the parameter config$max_mito allows to set a hard upper threshold,
# which takes precedence
max_mito = ifelse(!is.na(config$max_mito),
config$max_mito,
max(2, round(mean(seurat_prefilt@meta.data$percent.mito) +
2*sd(seurat_prefilt@meta.data$percent.mito))
)
),
# set a max of 0 in case the value 2 standard deviations below the mean
# is negative
min_umi = max(0, round(mean(seurat_prefilt@meta.data$nCount_RNA) -
2*sd(seurat_prefilt@meta.data$nCount_RNA))),
max_umi = round(mean(seurat_prefilt@meta.data$nCount_RNA) +
2*sd(seurat_prefilt@meta.data$nCount_RNA)),
min_peak_region_fragments = max(config$min_peak_region_fragments,
round(mean(seurat_prefilt@meta.data$atac_peak_region_fragments) - 2*sd(seurat_prefilt@meta.data$atac_peak_region_fragments))),
max_peak_region_fragments = round(mean(seurat_prefilt@meta.data$atac_peak_region_fragments) + 2*sd(seurat_prefilt@meta.data$atac_peak_region_fragments)),
max_nucleosome_signal = ifelse(!is.null(config$max_nucleosome_signal), config$max_nucleosome_signal,
round(mean(seurat_prefilt@meta.data$nucleosome_signal) + 2*sd(seurat_prefilt@meta.data$nucleosome_signal))),
max_TSS.enrichment = round(mean(seurat_prefilt@meta.data$TSS.enrichment) + 2*sd(seurat_prefilt@meta.data$TSS.enrichment)),
min_TSS.enrichment = round(mean(seurat_prefilt@meta.data$TSS.enrichment) - 2*sd(seurat_prefilt@meta.data$TSS.enrichment))
)
thresholds
vln_custom <- function(feature, y_int = NULL, pt_size = -1, filt = "pre") {
object <- switch(filt, "pre" = seurat_prefilt, "post" = seurat)
p1 <- VlnPlot(object = object, features = feature, pt.size = pt_size, cols = "gray80") +
NoLegend() + ylab(NULL) + xlab(NULL) +
theme(title = element_text(size = 8, face = "plain"),
axis.text.x = element_blank())
if (!is.null(y_int)) p1 + geom_hline(yintercept = y_int, color = "red")
else p1
}
cowplot::plot_grid(
vln_custom("nFeature_RNA", c(thresholds$min_features, thresholds$max_features)),
vln_custom("nCount_RNA", c(thresholds$min_umi, thresholds$max_umi)),
vln_custom("percent.mito", c( thresholds$max_mito)),
vln_custom("nCount_ATAC"),
vln_custom("nFeature_ATAC"),
vln_custom("atac_peak_region_fragments", c(thresholds$min_peak_region_fragments, thresholds$max_peak_region_fragments)),
vln_custom("TSS.enrichment", c(thresholds$min_TSS.enrichment, thresholds$max_TSS.enrichment)),
vln_custom("nucleosome_signal", c( thresholds$max_nucleosome_signal)),
ncol = 8,
align = "h", axis = "tb"
)
cowplot::plot_grid(
vln_custom("nFeature_RNA", c(thresholds$min_features, thresholds$max_features), pt_size = 0.1),
vln_custom("nCount_RNA", c(thresholds$min_umi, thresholds$max_umi), pt_size = 0.1),
vln_custom("percent.mito", c(thresholds$min_mito, thresholds$max_mito), pt_size = 0.1),
vln_custom("nCount_ATAC", pt_size = 0.1),
vln_custom("nFeature_ATAC", pt_size = 0.1),
vln_custom("atac_peak_region_fragments", c(thresholds$min_peak_region_fragments, thresholds$max_peak_region_fragments), pt_size = 0.1),
vln_custom("TSS.enrichment", c(thresholds$min_TSS.enrichment, thresholds$max_TSS.enrichment), pt_size = 0.1),
vln_custom("nucleosome_signal", c( thresholds$max_nucleosome_signal), pt_size = 0.1),
ncol = 8,
align = "h", axis = "tb"
)
# filter cells
seurat <- subset(
x = seurat_prefilt,
subset =
nFeature_RNA > thresholds$min_features &
nFeature_RNA < thresholds$max_features &
nCount_RNA > thresholds$min_umi &
nCount_RNA < thresholds$max_umi &
percent.mito < thresholds$max_mito &
atac_peak_region_fragments > thresholds$min_peak_region_fragments &
atac_peak_region_fragments < thresholds$max_peak_region_fragments &
nucleosome_signal < thresholds$max_nucleosome_signal &
TSS.enrichment > thresholds$min_TSS.enrichment &
TSS.enrichment < thresholds$max_TSS.enrichment
)
cowplot::plot_grid(
vln_custom("nFeature_RNA", filt = "post"),
vln_custom("nCount_RNA", filt = "post"),
vln_custom("percent.mito", filt = "post"),
vln_custom("nCount_ATAC", filt = "post"),
vln_custom("nFeature_ATAC", filt = "post"),
vln_custom("atac_peak_region_fragments", filt = "post"),
vln_custom("TSS.enrichment", filt = "post"),
vln_custom("nucleosome_signal", filt = "post"),
ncol = 8,
align = "h", axis = "tb"
)
cowplot::plot_grid(
vln_custom("nFeature_RNA", filt = "post", pt_size = 0.1),
vln_custom("nCount_RNA", filt = "post", pt_size = 0.1),
vln_custom("percent.mito", filt = "post", pt_size = 0.1),
vln_custom("nCount_ATAC", filt = "post", pt_size = 0.1),
vln_custom("nFeature_ATAC", filt = "post", pt_size = 0.1),
vln_custom("atac_peak_region_fragments", filt = "post", pt_size = 0.1),
vln_custom("TSS.enrichment", filt = "post", pt_size = 0.1),
vln_custom("nucleosome_signal", filt = "post", pt_size = 0.1),
ncol = 8,
align = "h", axis = "tb"
)
For each of the metrics we filtered on, we save the min, max and mean before and after filtering, as well as the lower and upper thresholds used. We also record the number of cells that remain after filtering.
filtering_criteria <- c(
"nFeature_RNA", "nCount_RNA", "percent.mito",
"atac_peak_region_fragments", "nucleosome_signal", "TSS.enrichment")
# compute summary stats for each metric
filtering_metrics <- sapply(filtering_criteria, function(criterion) {
min_pre <- round(min(seurat_prefilt@meta.data %>% pull(criterion)), 2)
mean_pre <- mean(seurat_prefilt@meta.data %>% pull(criterion))
max_pre <- max(seurat_prefilt@meta.data %>% pull(criterion))
sd_pre <- sd(seurat_prefilt@meta.data %>% pull(criterion))
min_post <- min(seurat@meta.data %>% pull(criterion))
mean_post <- mean(seurat@meta.data %>% pull(criterion))
max_post <- max(seurat@meta.data %>% pull(criterion))
sd_post <- sd(seurat@meta.data %>% pull(criterion))
return(c("min.preQC" = min_pre,
"mean.preQC" = mean_pre,
"max.preQC" = max_pre,
"sd.preQC" = sd_pre,
"min.postQC" = min_post,
"mean.postQC" = mean_post,
"max.postQC" = max_post,
"sd.postQC" = sd_post))
})
# round to 2 decimal places
filtering_metrics <- apply(filtering_metrics, 2, round, 3)
# transform into a dataframe
filtering_metrics <- filtering_metrics %>%
t() %>%
as.data.frame() %>%
tibble::rownames_to_column(var = "criterion") %>%
dplyr::select(criterion, min.preQC, min.postQC, max.preQC, max.postQC, mean.preQC, mean.postQC, sd.preQC, sd.postQC)
# add thresholds
filtering_metrics$min.threshold <- c(
thresholds$min_features,
thresholds$min_umi,
NA,
thresholds$min_peak_region_fragments,
NA,
thresholds$min_TSS.enrichment)
filtering_metrics$max.threshold <- c(
thresholds$max_features,
thresholds$max_umi,
thresholds$max_mito,
thresholds$max_peak_region_fragments,
thresholds$max_nucleosome_signal,
thresholds$max_TSS.enrichment)
filtering_metrics
# compute number of cells before and after filtering
N_cells_metrics <- data.frame(
"N_cells_before" = dim(seurat_prefilt@meta.data)[1],
"N_cells_after" = dim(seurat@meta.data)[1]) %>%
mutate(Prop_kept = round(N_cells_after / N_cells_before, 2))
N_cells_metrics
This report will register warnings if:
The warnings will be output at the end of the report, and saved as a
TSV if any warning flags are TRUE.
if (N_cells_metrics$N_cells_after < 1000) warnings$LOW_N_CELLS <- TRUE
if (N_cells_metrics$Prop_kept < 0.6) warnings$HIGH_PROP_FILTERED <- TRUE
if (filtering_metrics[filtering_metrics$criterion ==
"nCount_RNA", ]$mean.postQC < 2000) warnings$LOW_AVG_UMI <- TRUE
if (filtering_metrics[filtering_metrics$criterion ==
"percent.mito", ]$max.postQC > 5) warnings$HIGH_MITO <- TRUE
Normalization of gene expression data is performed using Log
Normalization. The sequence of NormalizeData,
FindVariableFeatures, and ScaleData scale
counts to 10,000 UMIs per cell, log2-transform counts, and then regress
out unwanted sources of variation. The log-normalized values are set as
the default assay in the Seurat object.
DefaultAssay(seurat) <- "RNA"
# normalization: scale counts to 10000 UMIs per cell, and log2-transform the counts
# set the default assay to RNA to run the log-normalization & scaling
seurat <- seurat %>%
NormalizeData(normalization.method = "LogNormalize",
scale.factor = 10000) %>%
# identify variable genes
FindVariableFeatures(mean.function = ExpMean,
dispersion.function = LogVMR) %>%
# regress out variables which are sources of unwanted variation, and z-score data
ScaleData(vars.to.regress = unlist(str_split(config$var_regress, ",")))
scATACseq processing is done using latent semantic indexing (LSI), and uses a different method for normalization due to increased data sparsity compared to scRNAseq. We perform normalization with the TF-IDF method, which takes into account both sequencing depth and frequency/rarity of peaks. Rather than feature selection based on variability (difficult due to data sparsity), we keep the top n “most informative” features.
DefaultAssay(seurat) <- "ATAC"
seurat <- RunTFIDF(seurat)
## Performing TF-IDF normalization
seurat <- FindTopFeatures(seurat, min.cutoff = 'q5', verbose = FALSE)
seurat[["ATAC"]]
## ChromatinAssay data with 124074 features for 6374 cells
## Variable features: 117937
## Genome: hg38
## Annotation present: TRUE
## Motifs present: FALSE
## Fragment files: 1
For scRNA data, we perform PCA after scaling.
DefaultAssay(seurat) <- "RNA"
seurat <- RunPCA(seurat,
pc.genes = VariableFeatures(.),
npcs = 40,
ndims.print = 1:5,
nfeatures.print = 5)
## PC_ 1
## Positive: PHLDB2, ADAMTS6, CHODL, PLCL2, ADGRG6
## Negative: MEG3, SOX5, LAMA2, PDZRN3, SLIT2
## PC_ 2
## Positive: ARL15, UTRN, KANK1, CHN2, LGR5
## Negative: CSGALNACT1, CHODL, PAPPA2, GTF2I-AS1, KMO
## PC_ 3
## Positive: FGF13, SLC9A9, F13A1, FGL1, SCN9A
## Negative: TENM3, KANK1, SEMA6D, PBX1, ADAMTS19
## PC_ 4
## Positive: FCGR2C, DACH1, MEOX2, MECOM, ZNF521
## Negative: SLIT2, CDH11, CNTN5, BMP5, PDZRN4
## PC_ 5
## Positive: CNTN5, ENSG00000232855, LINC01697, LINC01695, ITGBL1
## Negative: AGTR1, AFF2, CCDC102B, GUCY1A2, DOK6
# Elbow Plot and Jackstraw Plot are used to calibrate the number of PCs for the PCA.
# ElbowPlot(seurat, ndims = 40)
# seurat <- JackStraw(seurat, num.replicate = 100, dims = 50)
# seurat <- ScoreJackStraw(seurat, dims = 1:50)
# JackStrawPlot(seurat, dims = 1:50)
Analogous to PCA, we perform a linear dimensionality reduction with SVD.
DefaultAssay(seurat) <- "ATAC"
seurat <- RunSVD(object = seurat)
## Running SVD
## Scaling cell embeddings
Often, the first component is highly correlated with technical features, and should not be included in downstream analyses; this is assessed using the correlation plot with depth. Here, we check if the first LSI component is highly correlated with depth:
DepthCor(seurat, reduction = "lsi", n=40)
We compute the graph from each data type individually. These graphs will be the input to non-linear dimensionality reduction using UMAP, followed by clustering.
DefaultAssay(seurat) <- "RNA"
seurat <- FindNeighbors(seurat, reduction = "pca", dims = 1:40, verbose = FALSE)
seurat <- RunUMAP(
object = seurat,
reduction = "pca",
dims = 1:40,
reduction.name = "umap.rna",
reduction.key = "rnaUMAP_"
)
## 10:02:08 UMAP embedding parameters a = 0.9922 b = 1.112
## 10:02:08 Read 6374 rows and found 40 numeric columns
## 10:02:08 Using Annoy for neighbor search, n_neighbors = 30
## 10:02:08 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:02:09 Writing NN index file to temp file /tmp/RtmptqEcjz/file654934809956
## 10:02:09 Searching Annoy index using 1 thread, search_k = 3000
## 10:02:11 Annoy recall = 100%
## 10:02:14 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 10:02:17 Initializing from normalized Laplacian + noise (using irlba)
## 10:02:17 Commencing optimization for 500 epochs, with 290622 positive edges
## 10:02:17 Using rng type: pcg
## 10:02:37 Optimization finished
# parameters for clustering will be held constant at these values. we name the graph RNA_nn.
clustering_fn <- purrr::partial(FindClusters,
object = seurat,
algorithm = 3,
verbose = FALSE,
n.start = 10,
random.seed = config$seed,
graph.name = "RNA_nn")
# note: default clustering resolution is 1
seurat <- clustering_fn(resolution = config$clustering_resolution)
p3 <- DimPlot(seurat, reduction = "umap.rna", label.size = 6, label = TRUE) + ggtitle("Regular pipeline: res 1")
For ATAC data, we exclude LSI 1 and use dimensions 2:40.
DefaultAssay(seurat) <- "ATAC"
seurat <- FindNeighbors(seurat, reduction = "lsi", dims = 2:40, verbose = TRUE)
## Computing nearest neighbor graph
## Computing SNN
seurat <- RunUMAP(
object = seurat,
reduction = "lsi",
dims = 2:40,
reduction.name = "umap.atac",
reduction.key = "atacUMAP_"
)
## 10:02:41 UMAP embedding parameters a = 0.9922 b = 1.112
## 10:02:41 Read 6374 rows and found 39 numeric columns
## 10:02:41 Using Annoy for neighbor search, n_neighbors = 30
## 10:02:41 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:02:42 Writing NN index file to temp file /tmp/RtmptqEcjz/file6549232962f4
## 10:02:42 Searching Annoy index using 1 thread, search_k = 3000
## 10:02:44 Annoy recall = 100%
## 10:02:46 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 10:02:49 Initializing from normalized Laplacian + noise (using irlba)
## 10:02:50 Commencing optimization for 500 epochs, with 273982 positive edges
## 10:02:50 Using rng type: pcg
## 10:03:09 Optimization finished
clustering_fn <- purrr::partial(FindClusters,
object = seurat,
algorithm = 3,
verbose = FALSE,
n.start = 10,
random.seed = config$seed,
graph.name = "ATAC_nn")
# note: the default clustering resolution is 1.
seurat <- clustering_fn(resolution = config$clustering_resolution)
p5 <- DimPlot(seurat, reduction = "umap.atac") + ggtitle("res 1")
seurat$barcodes <- colnames(seurat)
# Plot UMAPs separately
p1_bef <- DimPlot(seurat, reduction = "umap.rna", label = TRUE, group.by = "RNA_nn_res.1") + ggtitle("RNA UMAP")
p2_bef <- DimPlot(seurat, reduction = "umap.atac", label = TRUE, group.by = "ATAC_nn_res.1") + ggtitle("ATAC UMAP")
# Combine side by side
p1_bef + p2_bef
Before calling scDblFinder, we add a reference for the number of doublets we expect to see given the number of cells. For example, if the number of cells recovered is under 750, we expect 0.4% of the cells to be doublets, etc.
expected_doublets <- function(cells_recovered) {
if (cells_recovered < 750) {
return(0.004)
} else if (cells_recovered < 1500) {
return(0.008)
} else if (cells_recovered < 2500) {
return(0.016)
} else if (cells_recovered < 3500) {
return(0.024)
} else if (cells_recovered < 4500) {
return(0.032)
} else if (cells_recovered < 5500) {
return(0.04)
} else if (cells_recovered < 6500) {
return(0.048)
} else if (cells_recovered < 7500) {
return(0.056)
} else if (cells_recovered < 8500) {
return(0.064)
} else if (cells_recovered < 9500) {
return(0.072)
} else {
return(0.08)
}
}
Here, we call scdblFinder seperately on scATAC and scRNA data, to
identify the number of cells that are identified as an
atac_doublet, rna_doublet or both.
# Source: https://biostatsquid.com/scdblfinder-tutorial/
# Convert the seurat object to a SingleCellExperiment (sce) object
DefaultAssay(seurat) <- "RNA"
sce <- as.SingleCellExperiment(seurat)
sce <- scDblFinder(sce, dbr = expected_doublets(ncol(seurat)), verbose = FALSE)
table(sce$scDblFinder.class)
##
## singlet doublet
## 5744 630
sce@colData@listData %>% as.data.frame() %>% head()
# Explore results and add to seurat object
meta_scdblfinder <- sce@colData@listData %>% as.data.frame() %>%
dplyr::select(starts_with('scDblFinder'))
# Rename columns to avoid collisions and label as RNA doublets
colnames(meta_scdblfinder) <- gsub("^scDblFinder", "rna_doublets", colnames(meta_scdblfinder))
rownames(meta_scdblfinder) <- colnames(sce)
#head(meta_scdblfinder)
rownames(meta_scdblfinder) <- sce@colData@rownames
head(meta_scdblfinder)
seurat <- AddMetaData(object = seurat, metadata = meta_scdblfinder)
# remove the temporary variables
rm(list = c('meta_scdblfinder', 'sce'))
DefaultAssay(seurat) <- "ATAC"
sce <- as.SingleCellExperiment(seurat)
sce <- scDblFinder(sce, dbr = expected_doublets(ncol(seurat)))
## Creating ~5100 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1483 cells excluded from training.
## iter=1, 1630 cells excluded from training.
## iter=2, 1519 cells excluded from training.
## Threshold found:0.765
## 565 (8.9%) doublets called
table(sce$scDblFinder.class)
##
## singlet doublet
## 5809 565
sce@colData@listData %>% as.data.frame() %>% head()
# Explore results and add to seurat object
meta_scdblfinder <- sce@colData@listData %>% as.data.frame() %>%
dplyr::select(starts_with('scDblFinder'))
# Rename columns to avoid collisions and label as RNA doublets
colnames(meta_scdblfinder) <- gsub("^scDblFinder", "atac_doublets", colnames(meta_scdblfinder))
rownames(meta_scdblfinder) <- colnames(sce)
#head(meta_scdblfinder)
rownames(meta_scdblfinder) <- sce@colData@rownames
head(meta_scdblfinder)
seurat <- AddMetaData(object = seurat, metadata = meta_scdblfinder)
# remove the temporary variables
rm(list = c('meta_scdblfinder', 'sce'))
Now, we identify which category each cell falls into (is it consistently idenfied as a doublet)?
# Identify cells called doublets by each method
rna_doublets <- rownames(seurat@meta.data)[seurat@meta.data$rna_doublets.class == "doublet"]
atac_doublets <- rownames(seurat@meta.data)[seurat@meta.data$atac_doublets.class == "doublet"]
# Define both and all doublets
both_doublets <- intersect(rna_doublets, atac_doublets)
all_doublets <- union(rna_doublets, atac_doublets)
# Initialize metadata vector (default to singlet)
all_doublets.metadata <- rep("singlet", ncol(seurat))
names(all_doublets.metadata) <- colnames(seurat)
# Assign doublet labels
all_doublets.metadata[rna_doublets] <- "rna_doublet"
all_doublets.metadata[atac_doublets] <- "atac_doublet"
all_doublets.metadata[both_doublets] <- "both_doublet"
# Add to Seurat object
seurat <- AddMetaData(seurat, metadata = all_doublets.metadata, col.name = "all_doublets")
# Coloured by singlets and doublets
DefaultAssay(seurat) <- "RNA"
p1 <- DimPlot(
object = seurat,
group.by = "all_doublets", # colour by this column
cols = c("singlet" = "grey",
"rna_doublet" = "coral1",
"atac_doublet" = "purple",
"both_doublet" = "darkolivegreen"),
pt.size = 0.3)
DefaultAssay(seurat) <- "ATAC"
p2 <- DimPlot(
seurat,
group.by = "all_doublets",
pt.size = 0.3,
cols = c("singlet" = "grey",
"rna_doublet" = "coral1",
"atac_doublet" = "purple",
"both_doublet" = "darkolivegreen"),
)
p1 + p2
# Coloured by scDblFinder.score: RNA assay
DefaultAssay(seurat) <- "RNA"
seurat$rna_log.score <- log1p(seurat$rna_doublets.score)
p3 <- FeaturePlot(seurat,
features = "rna_log.score",
label = TRUE,
label.size = 10,
pt.size = 0.3) + scale_colour_gradient(low = "grey", high = "coral3")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
# Coloured by scDblFinder.score: ATAC assay
DefaultAssay(seurat) <- "ATAC"
seurat$atac_log.score <- log1p(seurat$atac_doublets.score)
p4 <- FeaturePlot(seurat,
features = "atac_log.score",
label = TRUE,
label.size = 10,
pt.size = 0.3) + scale_colour_gradient(low = "grey", high = "purple4")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
cowplot::plot_grid(p3, p4, ncol = 2)
# barplot per cluster with the number of singlets and doublets
## first, we create a summary dataframe of cells per cluster
cluster_counts <- seurat@meta.data %>%
group_by(seurat_clusters, all_doublets) %>%
summarise(n = n()) %>%
ungroup()
## `summarise()` has grouped output by 'seurat_clusters'. You can override using
## the `.groups` argument.
## then, we plot with ggplot2
ggplot(cluster_counts, aes(x = seurat_clusters, y = n, fill = all_doublets), labels = TRUE, labels.size = 20, repel = TRUE) +
geom_col(position = "stack") +
labs(x = "Cluster No.", y = "Count") +
theme_minimal() +
scale_fill_manual(values = c("singlet" = "grey",
"rna_doublet" = "coral1",
"atac_doublet" = "purple",
"both_doublet" = "darkolivegreen")) +
geom_text(aes(label = n),
position = position_stack(vjust = 0.5), # center labels vertically in each segment
color = "black", size = 8) +
theme(
axis.text.x = element_text(size = 20), # X-axis text size
axis.text.y = element_text(size = 20), # Y-axis text size
axis.title.x = element_text(size = 16), # X-axis title
axis.title.y = element_text(size = 16), # Y-axis title
legend.text = element_text(size = 13), # Legend item labels
legend.title = element_text(size = 15), # Legend title
)
# singlet/doublet plot split by cluster
plot_doublet_vln <- function(seurat, title, y_axis, what_cluster) {
FetchData(seurat, c(what_cluster, "all_doublets")) %>%
ggplot(aes(x = seurat_clusters, y = y_axis)) +
geom_violin(scale = "width", fill = "gray90") +
geom_jitter(aes(color = all_doublets), size = 1, width = 0.2, alpha = 0.5) +
scale_color_manual(values = c("singlet" = "grey",
"rna_doublet" = "coral1",
"atac_doublet" = "purple",
"both_doublet" = "darkolivegreen")) +
ggtitle(title) + xlab("Cluster")
}
# Box plots of singlet/doublet split by numfeatures and numcounts
df_long <- melt(
seurat@meta.data,
id.vars = c("barcodes", "all_doublets"),
measure.vars = c("nFeature_RNA", "nCount_RNA", "nFeature_ATAC", "nCount_ATAC")
)
ggplot(df_long, aes(x = all_doublets, y = value, fill = all_doublets)) +
geom_boxplot(outlier.size = 0.5) +
facet_wrap(~variable, scales = "free_y", ncol = 4) +
theme_classic() +
theme(legend.position = "right") +
xlab("") + ylab("Value")
# Remove the droplets marked as doublets
seurat <- subset(seurat, subset = all_doublets == 'singlet')
We visualise both modalities again, without
rna_doublets, atac_doublets and
both_doublets. Top row: before filtering out doublets.
Bottom row: after filtering out doublets.
# Plot UMAPs separately
p1_aft <- DimPlot(seurat, reduction = "umap.rna", label = TRUE, group.by = "RNA_nn_res.1") + ggtitle("RNA UMAP")
p2_aft <- DimPlot(seurat, reduction = "umap.atac", label = TRUE) + ggtitle("ATAC UMAP")
# Combine side by side
# Plot clusters almost look "cleaned up" -- less overlap between different clusters
(p1_bef | p2_bef) / (p1_aft | p2_aft)
We save the filtered seurat object.
filepath <- paste0(out_dir, "/no_doublet_objects/", sample, ".no_doublets.rds")
# creates directory in case it doesn't exist
dir.create(dirname(filepath), recursive = TRUE, showWarnings = FALSE)
saveRDS(seurat, file = filepath)
Here, we check the number of cells in each cluster after doublets have been filtered out. We register a warning if more than a third of clusters have less than 100 cells.
# 1. rearrange the clusters to be in ascending order in the seurat object (allows them to be plotted in order)
cluster_col <- "RNA_nn_res.1"
seurat[[cluster_col]] <- factor(seurat[[cluster_col]][[1]], levels = as.character(c(0:9, 10:15)))
# number of clusters with < 100 cells, divided by the total number of clusters
# 2. check the clusters from RNA
rna <- table(seurat@meta.data$RNA_nn_res.1)
if ((sum(rna < 100) / length(rna)) > 0.33) {
warnings$SMALL_CLUSTERS <- TRUE
}
# 3. check the clusters from ATAC
atac <- table(seurat@meta.data$ATAC_nn_res.1)
if ((sum(atac < 100) / length(atac)) > 0.33) {
warnings$SMALL_CLUSTERS <- TRUE
}
We then generate violin plots of the distribution of each QC metric in each cluster, with the number of cells in each cluster indicated above each violin.
# make a dataframe of the number of cells per cluster (rna)
clust_df <- data.frame(rna)
clust_df[1, 2] <- paste0("N=", clust_df[1, 2])
colnames(clust_df) <- c("ident", "N")
vln_fun <- function(seurat_obj = seurat, criterion, cluster_col = "RNA_nn_res.1", colours = NULL) {
# Extract cluster identities as factor
cluster_ids <- as.factor(seurat_obj@meta.data[[cluster_col]])
# Make table of number of cells per cluster, forcing all levels to appear
cluster_counts <- table(factor(cluster_ids, levels = levels(cluster_ids)))
# Make clust_df with cluster identity and cell count
clust_df <- data.frame(
ident = names(cluster_counts),
N = paste0("N=", as.numeric(cluster_counts))
)
# Add y position for label, 95% of max value of criterion
clust_df$y <- max(seurat_obj[[criterion]][,1]) * 0.95
# Create violin plot
Seurat::VlnPlot(seurat_obj, features = criterion, group.by = cluster_col, cols = colours, pt.size = 0.2) +
theme(legend.position = "none") +
geom_text(data = clust_df, aes(label = N, x = ident, y = y)) +
xlab("Cluster")
}
vln_fun(criterion = "nCount_RNA")
vln_fun(criterion = "nFeature_RNA")
vln_fun(criterion = "percent.mito")
vln_fun(criterion = "nCount_ATAC")
vln_fun(criterion = "nFeature_ATAC")
vln_fun(criterion = "atac_peak_region_fragments")
vln_fun(criterion = "TSS.enrichment")
vln_fun(criterion = "nucleosome_signal")
# make a dataframe of the number of cells per cluster (atac)
clust_df <- data.frame(atac)
clust_df[1, 2] <- paste0("N=", clust_df[1, 2])
colnames(clust_df) <- c("ident", "N")
vln_fun <- function(criterion) {
# plot the labels at a value slightly below the max, to ensure they're shown
# within plot limits
clust_df$y <- max(seurat[[criterion]]) * 0.95
Seurat::VlnPlot(seurat, criterion, cols = seurat@misc$colours, pt.size = 0.2) +
theme(legend.position = "none") +
geom_text(data = clust_df, aes(label = N, x = ident, y = y)) +
xlab("Cluster")
}
vln_fun("nCount_RNA")
vln_fun("nFeature_RNA")
vln_fun("percent.mito")
vln_fun("nCount_ATAC")
vln_fun("nFeature_ATAC")
vln_fun("atac_peak_region_fragments")
vln_fun("TSS.enrichment")
vln_fun("nucleosome_signal")
We save the clusters in the seurat object after this QC check.
filepath <- paste0(out_dir, "/clustered_objects/", sample, ".clustered.rds")
# creates directory in case it doesn't exist
dir.create(dirname(filepath), recursive = TRUE, showWarnings = FALSE)
saveRDS(seurat, file = filepath)
We then identify cluster markers for each cluster for the specified
resolution (config$clustering_resolution) using a Wilcoxon
Rank Sum Test. We also display heatmaps of the top 10 genes/peaks that
were expressed in each cell cluster. This allows us to identify
differentially expressed features across clusters.
# First, for RNA markers
DefaultAssay(seurat) <- 'RNA'
Idents(seurat) <- seurat[["RNA_nn_res.1"]]
cluster_markers_rna <- FindAllMarkers(object = seurat, verbose = FALSE)
# display the top 30 per cluster
cluster_markers_rna %>%
dplyr::group_by(cluster) %>%
top_n(n = 30, wt = avg_log2FC) %>%
dplyr::select(cluster, gene, everything()) %>%
DT::datatable(cluster_markers_rna, filter = "top")
write_tsv(cluster_markers_rna, file.path(out_dir, "cluster_markers_rna.tsv"))
DefaultAssay(seurat) <- 'RNA'
Idents(seurat) <- "RNA_nn_res.1"
top10_rna <- cluster_markers_rna %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
DoHeatmap(seurat, features = top10_rna$gene, group.colors = seurat@misc$colours) +
NoLegend() +
scale_fill_gradientn(colors = c("#2166AC", "#E5E0DC", "#B2182B")) +
theme(
axis.text.y = element_text(size = 4), # smaller y-axis text (genes)
)
Note that the process of finding cluster markers for scATAC data is slow, due to sparse data. An alternative approach is Presto (https://immunogenomics.github.io/presto/articles/getting-started.html)
cluster_markers_atac <- FindAllMarkers(object = seurat,
assay = "ATAC",
verbose = FALSE)
# display the top 30 per cluster
cluster_markers_atac %>%
dplyr::group_by(cluster) %>%
top_n(n = 30, wt = avg_log2FC) %>%
dplyr::select(cluster, gene, everything()) %>%
DT::datatable(cluster_markers_atac, filter = "top")
write_tsv(cluster_markers_atac, file.path(out_dir, "cluster_markers_atac.tsv"))
# Displays heatmap of the top 10 peaks in each cluster
DefaultAssay(seurat) <- "ATAC"
seurat <- ScaleData(seurat)
Idents(seurat) <- "ATAC_nn_res.1"
top10_atac <- cluster_markers_atac %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
DoHeatmap(seurat,
features = top10_atac$gene,
group.by = "ATAC_nn_res.1",
group.colors = seurat@misc$colours) +
NoLegend() +
scale_fill_gradientn(colors = c("#2166AC", "#E5E0DC", "#B2182B"))
We can also use the reference genome to map peaks back to the genes that have the most chromatin accessibility.
# find gene annotations for each of these peaks
peaks_of_interest <- StringToGRanges(top10_atac$gene, sep = c("-", "-"))
annotation <- Annotation(seurat)
overlaps <- findOverlaps(peaks_of_interest, annotation)
# for each peak, get the overlapping gene(s)
peak_idx <- queryHits(overlaps)
gene_idx <- subjectHits(overlaps)
# create a dataframe linking peak -> gene
peak_gene_df <- data.frame(
peak = top10_atac$gene[peak_idx],
gene = annotation$gene_name[gene_idx]
)
peak_gene_df_unique <- peak_gene_df %>%
group_by(peak) %>%
summarize(gene = paste(unique(gene), collapse = ";"))
peak_to_gene <- setNames(peak_gene_df_unique$gene, peak_gene_df_unique$peak)
DoHeatmap(seurat, features = top10_atac$gene, group.colors = seurat@misc$colours) +
NoLegend() +
scale_fill_gradientn(colors = c("#2166AC", "#E5E0DC", "#B2182B")) +
scale_y_discrete(labels = function(x) peak_to_gene[x])+ theme(
axis.text.y = element_text(size = 3), # smaller y-axis text (genes)
)
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
We save the seurat object with all the quality control metrics observed so far.
# save exact metrics used for filtering and pre/post QC stats, as well as number
# of cells before/after filtering
# Define a common output directory path
qc_dir <- file.path(out_dir, "qc_metrics")
# Create the directory if it doesn't exist
dir.create(qc_dir, recursive = TRUE, showWarnings = FALSE)
# Define filepaths
filtering_metrics_file <- file.path(qc_dir, paste0(sample, ".filtering_metrics.tsv"))
n_cells_metrics_file <- file.path(qc_dir, paste0(sample, ".N_cells_metrics.tsv"))
# Write the files
write_tsv(filtering_metrics, filtering_metrics_file)
write_tsv(N_cells_metrics, n_cells_metrics_file)
To score cells for their cell cycle activity, we compute the scores using the method implemented in Seurat:
DefaultAssay(seurat) <- "RNA"
# DimHeatmap(seurat, dims = c(10, 30))
# a list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat;
# segregate this list into markers of G2/M phase and markers of S phase
s_genes <- cc.genes$s.genes
g2m_genes <- cc.genes$g2m.genes
seurat <- CellCycleScoring(seurat, s.features = s_genes, g2m.features = g2m_genes, set.ident = TRUE)
# Plots for the cell state in RNA and ATAC reductions
head(seurat[[]])
p1 <- DimPlot(seurat) + ggtitle("RNA reduction")
p2 <- DimPlot(seurat, reduction="umap.atac") + ggtitle("ATAC reduction")
cc.markers <- switch(config$species,
"h_sapiens" = c("PCNA", "TOP2A", "MCM6", "MKI67"),
"m_musculus" = c("Pcna", "Top2a", "Mcm6", "Mki67"))
# ensure all the genes that we're checking for are in the dataset
genes_to_check <- c("PCNA", "TOP2A", "MCM6", "MKI67")
genes_to_check %in% rownames(seurat[["RNA"]]@data)
## [1] TRUE TRUE TRUE TRUE
# note: if nothing is visible in these ridge plots, it means the peaks are too small to be seen.
# RidgePlot(seurat,
# group.by = "Phase",
# features = cc.markers,
# ncol = 2,
# assay = "RNA",
# slot = "data",
# same.y.lims=T)
UMAP plot coloured by phase:
DefaultAssay(seurat) <- "RNA"
phase_palette <- c("G1" = "#F8766D", "G2M" = "#00BA38", "S" = "#619CFF")
DimPlot(seurat, reduction = "umap.rna", group.by = "Phase", dims = c(1, 2), cols = phase_palette)
We register a warning if more than half of the cells labelled G2M have low TOP2A expression. However, if upon further inspection of the proportion of cells expressing each set of genes, we see the highest proportions of cells expressing genes from their associated gene set, it is deemed acceptable.
top2a_gene <- switch(config$species,
"h_sapiens" = "TOP2A",
"m_musculus" = "Top2a")
if (top2a_gene %in% rownames(seurat)) {
# subset to cells assigned as G2M phase
top2a_expr <- as.matrix(subset(seurat,
subset = Phase == "G2M")[["RNA"]][top2a_gene, ])
top2a_expr <- t(top2a_expr)
# if more than 50% of cells assigned as G2M phase have low TOP2A expression,
# flag it
# note: this warning is arbitrary
(prop_low_top2a <- (sum(top2a_expr < 1) / length(top2a_expr)))
if (prop_low_top2a > 0.5) {
print("low TOP2A expression")
warnings$CC_ASSIGNMENT <- TRUE}
} else {
# Top2a is not detected
warnings$CC_ASSIGNMENT <- TRUE
print("Top2a not detected")
}
## [1] "low TOP2A expression"
# plot the expression of each gene set in cells classified as G2/M, S and G1
prop_df <- FetchData(seurat, c("ident", "barcodes", g2m_genes, s_genes)) %>%
group_by(ident) %>%
mutate(n_total = n()) %>%
pivot_longer(cols = 3:(ncol(.) - 1), names_to = "gene") %>%
mutate(
gene_set = case_when(
gene %in% g2m_genes ~ "geneset:g2/m", # index for the genes
gene %in% s_genes ~ "geneset:s"
)
) %>%
filter(value>0) %>%
group_by(ident, gene, gene_set, n_total) %>%
count() %>%
mutate(
proportion = n/n_total
)
prop_df %>%
ggplot(aes(x = gene, y = proportion)) +
geom_bar(stat = "identity") +
# scale_fill_manual(values = c("highlight" = "red", "normal" = "steelblue")) +
labs(title = "Proportion of Cells Expressing Each Gene",
x = "Gene", y = "Proportion of Cells") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 0.5, size = 5),
legend.position = "none") +
facet_wrap(~gene_set, scales = "free_x")
prop_df %>%
ggplot(aes(x = gene, y = proportion, fill = ident)) +
geom_bar(stat = "identity") +
# scale_fill_manual(values = c("highlight" = "red", "normal" = "steelblue")) +
labs(title = "Proportion of Cells Expressing Each Gene",
x = "Gene", y = "Proportion of Cells") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 0.5, size = 5)) +
facet_grid(ident~gene_set, scales = "free_x")
All parameters and filtering metrics are saved both within the seurat
object in the seurat@misc slot, as well as in the output
directory.
# parameters are saved with the Seurat object
seurat@misc$params$config <- config
seurat@misc$filtering_metrics <- filtering_metrics
seurat@misc$n_cells <- N_cells_metrics
# write metrics/thresholds to file
filtering_metrics_out <- filtering_metrics %>%
gather("metrics_name", "value", -criterion) %>%
unite("metrics", criterion:metrics_name, sep = "_") %>%
spread("metrics", "value") %>%
bind_cols(N_cells_metrics)
write_tsv(filtering_metrics_out, path = file.path(out_dir, "seurat_metrics.tsv"))
Save the Seurat object for downstream analysis:
# Ensure RNA is the default assay
DefaultAssay(seurat) <- "RNA"
save(seurat, file = file.path(out_dir, "seurat.Rda"))
This document was last rendered on:
## 2025-08-08 10:51:44
The R session info:
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /share/software/user/open/openblas/0.3.10/lib/libopenblas_haswellp-r0.3.10.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] future_1.58.0 ggridges_0.5.6
## [3] reshape2_1.4.4 stringr_1.5.1
## [5] dplyr_1.1.4 tidyr_1.3.1
## [7] biovizBase_1.46.0 ggExtra_0.10.1
## [9] ggpointdensity_0.2.0 BSgenome.Hsapiens.UCSC.hg38_1.4.5
## [11] BSgenome_1.66.3 rtracklayer_1.58.0
## [13] Biostrings_2.66.0 XVector_0.38.0
## [15] EnsDb.Hsapiens.v86_2.99.0 ensembldb_2.22.0
## [17] AnnotationFilter_1.22.0 GenomicFeatures_1.50.4
## [19] AnnotationDbi_1.60.2 scDblFinder_1.12.0
## [21] SingleCellExperiment_1.20.1 SummarizedExperiment_1.28.0
## [23] Biobase_2.58.0 GenomicRanges_1.50.2
## [25] GenomeInfoDb_1.34.9 IRanges_2.32.0
## [27] S4Vectors_0.36.2 BiocGenerics_0.44.0
## [29] MatrixGenerics_1.10.0 matrixStats_1.5.0
## [31] Signac_1.9.0 SeuratObject_4.1.4
## [33] Seurat_4.3.0 viridis_0.6.5
## [35] viridisLite_0.4.2 plotly_4.11.0
## [37] ggplot2_3.5.2 purrr_1.0.4
## [39] readr_2.1.5 here_1.0.1
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.3 scattermore_1.2
## [3] bit64_4.6.0-1 knitr_1.50
## [5] irlba_2.3.5.1 DelayedArray_0.24.0
## [7] rpart_4.1.16 data.table_1.17.6
## [9] KEGGREST_1.38.0 RCurl_1.98-1.17
## [11] generics_0.1.4 ScaledMatrix_1.6.0
## [13] cowplot_1.1.3 RSQLite_2.4.1
## [15] RANN_2.6.2 bit_4.6.0
## [17] tzdb_0.5.0 spatstat.data_3.1-6
## [19] xml2_1.3.3 httpuv_1.6.16
## [21] xfun_0.52 hms_1.1.3
## [23] jquerylib_0.1.4 evaluate_1.0.4
## [25] promises_1.3.3 restfulr_0.0.15
## [27] progress_1.2.3 dbplyr_2.5.0
## [29] igraph_2.1.4 DBI_1.2.3
## [31] htmlwidgets_1.6.4 spatstat.geom_3.4-1
## [33] crosstalk_1.2.1 backports_1.5.0
## [35] biomaRt_2.54.1 deldir_2.0-4
## [37] sparseMatrixStats_1.10.0 vctrs_0.6.5
## [39] ROCR_1.0-11 abind_1.4-8
## [41] cachem_1.1.0 withr_3.0.2
## [43] progressr_0.15.1 vroom_1.6.5
## [45] checkmate_2.3.2 sctransform_0.4.2
## [47] GenomicAlignments_1.34.1 prettyunits_1.2.0
## [49] scran_1.26.2 goftest_1.2-3
## [51] cluster_2.1.8.1 lazyeval_0.2.2
## [53] crayon_1.5.1 hdf5r_1.3.12
## [55] spatstat.explore_3.4-3 labeling_0.4.3
## [57] edgeR_3.40.2 pkgconfig_2.0.3
## [59] nlme_3.1-168 vipor_0.4.7
## [61] ProtGenerics_1.30.0 nnet_7.3-17
## [63] rlang_1.1.6 globals_0.18.0
## [65] lifecycle_1.0.4 miniUI_0.1.2
## [67] filelock_1.0.3 BiocFileCache_2.6.1
## [69] rsvd_1.0.5 dichromat_2.0-0.1
## [71] ggrastr_1.0.2 rprojroot_2.0.4
## [73] polyclip_1.10-7 lmtest_0.9-40
## [75] Matrix_1.6-1.1 zoo_1.8-14
## [77] base64enc_0.1-3 beeswarm_0.4.0
## [79] png_0.1-8 rjson_0.2.23
## [81] bitops_1.0-9 KernSmooth_2.23-26
## [83] blob_1.2.4 DelayedMatrixStats_1.20.0
## [85] spatstat.univar_3.1-3 parallelly_1.45.0
## [87] spatstat.random_3.4-1 beachmat_2.14.2
## [89] scales_1.4.0 memoise_2.0.1
## [91] magrittr_2.0.3 plyr_1.8.9
## [93] ica_1.0-3 zlibbioc_1.44.0
## [95] compiler_4.2.0 dqrng_0.4.1
## [97] BiocIO_1.8.0 RColorBrewer_1.1-3
## [99] fitdistrplus_1.2-2 Rsamtools_2.14.0
## [101] cli_3.6.5 listenv_0.9.1
## [103] patchwork_1.3.1 pbapply_1.7-2
## [105] htmlTable_2.4.3 Formula_1.2-5
## [107] MASS_7.3-9 tidyselect_1.2.1
## [109] stringi_1.8.7 yaml_2.3.10
## [111] BiocSingular_1.14.0 locfit_1.5-9.12
## [113] ggrepel_0.9.6 grid_4.2.0
## [115] VariantAnnotation_1.44.1 sass_0.4.10
## [117] fastmatch_1.1-6 tools_4.2.0
## [119] future.apply_1.20.0 parallel_4.2.0
## [121] rstudioapi_0.17.1 foreign_0.8-82
## [123] bluster_1.8.0 metapod_1.6.0
## [125] gridExtra_2.3 farver_2.1.2
## [127] Rtsne_0.17 digest_0.6.37
## [129] shiny_1.11.0 Rcpp_1.0.14
## [131] scuttle_1.8.4 later_1.4.2
## [133] RcppAnnoy_0.0.22 httr_1.4.7
## [135] colorspace_2.1-1 XML_3.99-0.18
## [137] tensor_1.5.1 reticulate_1.42.0
## [139] splines_4.2.0 uwot_0.2.3
## [141] RcppRoll_0.3.1 statmod_1.5.0
## [143] spatstat.utils_3.1-4 scater_1.26.1
## [145] sp_2.2-0 xgboost_1.7.11.1
## [147] xtable_1.8-4 jsonlite_2.0.0
## [149] R6_2.6.1 Hmisc_5.2-3
## [151] pillar_1.10.2 htmltools_0.5.8.1
## [153] mime_0.13 DT_0.33
## [155] glue_1.8.0 fastmap_1.2.0
## [157] BiocParallel_1.32.6 BiocNeighbors_1.16.0
## [159] codetools_0.2-20 lattice_0.22-7
## [161] bslib_0.9.0 spatstat.sparse_3.1-0
## [163] tibble_3.3.0 curl_6.4.0
## [165] ggbeeswarm_0.7.2 leiden_0.4.3.1
## [167] survival_3.8-3 limma_3.54.2
## [169] rmarkdown_2.29 GenomeInfoDbData_1.2.9
## [171] gtable_0.3.6