Project 1- scRNAseq
- Week 1
- 1. Loading the Data
- 2. Create the sample sheet
- 3. Add meta-data
- Week 2
- 4. Preprocessing
- 5. Dimensionality Reduction
- Week 3
- 6. Cell Type Annotation
- 7. Differential Analysis
- 8. Pathway Analysis
# Set working directory
setwd("~/scbi_ds1/notebook")
# Set a global seed for reproducibility
set.seed(1234)
# TODO: Fix tidyverse installation
# install.packages("tidyverse")
# library("tidyverse")
# Check all libraries are installed
suppressPackageStartupMessages({
library(dplyr)
library(spatstat.core)
library(Seurat)
library(patchwork)
library(DoubletFinder)
library(SingleR)
library(enrichR)
library(CellChat)
library(SingleCellExperiment)
library(SeuratWrappers)
# library(tidyverse)
library(monocle3)
library(celldex)
library(knitr)
})
Week 1
1. Loading the Data
We will load the expression matrices from the dataset and construct a Seurat object. We will need to load two files: one containing data on Bone Marrow Mononuclear Cells (BMMC) and the other on CD34+ Enriched Bone Marrow Cells (CD34).
# Load the .rds files
# Bone Marrow Mononuclear Cells (BMMC)
<- readRDS("../data/GSM4138872_scRNA_BMMC_D1T1.rds")
bmmc_d1t1 <- readRDS("../data/GSM4138873_scRNA_BMMC_D1T2.rds")
bmmc_d1t2
# CD34+ Enriched Bone Marrow Cells (CD34)
<- readRDS("../data/GSM4138874_scRNA_CD34_D2T1.rds")
cd34_d2t1 <- readRDS("../data/GSM4138875_scRNA_CD34_D3T1.rds") cd34_d3t1
2. Create the sample sheet
We will label each sample with the corresponding metadata from the table 1 provided in the assignment sheet.
# Define the sample metadata based on Table 1
<- data.frame(
metadata Sample = c("BMMC_D1T1", "BMMC_D1T2", "CD34_D2T1", "CD34_D3T1"),
Donor = c("D1", "D1", "D2", "D3"),
Replicate = c("T1", "T2", "T1", "T1"),
Sex = c("F", "F", "M", "F"),
row.names = c("BMMC_D1T1", "BMMC_D1T2", "CD34_D2T1", "CD34_D3T1")
)
3. Add meta-data
# Create Seurat objects with metadata added
<- CreateSeuratObject(counts = bmmc_d1t1, project = "BMMC_D1T1")
bmmc_d1t1_seurat @meta.data$Sample <- "BMMC_D1T1"
bmmc_d1t1_seurat@meta.data$Donor <- "D1"
bmmc_d1t1_seurat@meta.data$Replicate <- "T1"
bmmc_d1t1_seurat@meta.data$Sex <- "F"
bmmc_d1t1_seurat
<- CreateSeuratObject(counts = bmmc_d1t2, project = "BMMC_D1T2")
bmmc_d1t2_seurat @meta.data$Sample <- "BMMC_D1T2"
bmmc_d1t2_seurat@meta.data$Donor <- "D1"
bmmc_d1t2_seurat@meta.data$Replicate <- "T2"
bmmc_d1t2_seurat@meta.data$Sex <- "F"
bmmc_d1t2_seurat
<- CreateSeuratObject(counts = cd34_d2t1, project = "CD34_D2T1")
cd34_d2t1_seurat @meta.data$Sample <- "CD34_D2T1"
cd34_d2t1_seurat@meta.data$Donor <- "D2"
cd34_d2t1_seurat@meta.data$Replicate <- "T1"
cd34_d2t1_seurat@meta.data$Sex <- "M"
cd34_d2t1_seurat
<- CreateSeuratObject(counts = cd34_d3t1, project = "CD34_D3T1")
cd34_d3t1_seurat @meta.data$Sample <- "CD34_D3T1"
cd34_d3t1_seurat@meta.data$Donor <- "D3"
cd34_d3t1_seurat@meta.data$Replicate <- "T1"
cd34_d3t1_seurat@meta.data$Sex <- "F"
cd34_d3t1_seurat
# Check that the metadata has been added correctly
# print(bmmc_d1t1_seurat@meta.data)
# print(bmmc_d1t2_seurat@meta.data)
# print(cd34_d2t1_seurat@meta.data)
# print(cd34_d3t1_seurat@meta.data)
For each sample, we report the following information:
3.1. How many cells are in each sample?
3.2. . How many genes are in the expression matrices?
# Function to extract and print required information
<- function(seurat_object, sample_name) {
report_sample_info cat(sample_name, ":\n")
# 1. Report the number of cells
<- ncol(seurat_object)
num_cells cat("Number of cells in the sample:", num_cells, "\n")
# 2. Report the number of genes
<- nrow(seurat_object)
num_genes cat("Number of genes in the sample:", num_genes, "\n")
}
# Report information for each sample
report_sample_info(bmmc_d1t1_seurat, "BMMC_D1T1")
## BMMC_D1T1 :
## Number of cells in the sample: 6270
## Number of genes in the sample: 20287
report_sample_info(bmmc_d1t2_seurat, "BMMC_D1T2")
## BMMC_D1T2 :
## Number of cells in the sample: 6332
## Number of genes in the sample: 20287
report_sample_info(cd34_d2t1_seurat, "CD34_D2T1")
## CD34_D2T1 :
## Number of cells in the sample: 2424
## Number of genes in the sample: 20287
report_sample_info(cd34_d3t1_seurat, "CD34_D3T1")
## CD34_D3T1 :
## Number of cells in the sample: 5752
## Number of genes in the sample: 20287
# Save each processed Seurat object
# saveRDS(bmmc_d1t1_seurat, file = "../results/seurat/bmmc_d1t1_seurat.rds")
# saveRDS(bmmc_d1t2_seurat, file = "../results/seurat/bmmc_d1t2_seurat.rds")
# saveRDS(cd34_d2t1_seurat, file = "../results/seurat/cd34_d2t1_seurat.rds")
# saveRDS(cd34_d3t1_seurat, file = "../results/seurat/cd34_d3t1_seurat.rds")
3.3. What information is now part of the meta-data of the objects?
Column name present in the metadata of all samples:
orig.ident: this often contains the sample identity if known, in our case, we know the identity of the samples.
nCount_RNA: Number of UMIs per cell.
nFeature_RNA: number of genes detected per cell.
Sample: <orig.ident>_
Donor: Identity of the human donor from whom the cell has been collected (assumed).
Replicate: TODO
Sex: Biological sexual identity of the origin human.
Week 2
4. Preprocessing
4.1 **Preprocessing
4.1.1. Correct order of preprocessing steps:
Filtering is the initial step to remove low-quality cells and genes from each dataset. This ensures that only high-quality data is used for downstream analyses.
Normalization adjusts for technical variations such as differences in sequencing depth between cells, making the gene expression levels comparable across all cells.
Feature Selection identifies the most variable genes across the dataset, which are most informative for downstream analyses like dimensionality reduction and clustering.
Doublet Removal is performed after normalization and feature selection because tools like DoubletFinder require normalized data and principal component analysis (PCA) results to accurately detect doublets. (as documented in its Github docuementation)
Re-normalization
Re-feature Selection
4.1.2 Steps Before and After Merging (Task 4.2):
All steps are performed after merging the dataset for the non-batch corrected procedure.
4.1.3. Performing preprocessing on the data
4.1.3.1. Filtering
Before filtering, lets get some QC Metrics.
4.1.3.1.1. QC Metrics
# Merging the dataset to get the QC metrics.
<- merge(
merged_seurat x = bmmc_d1t1_seurat,
y = list(bmmc_d1t2_seurat, cd34_d2t1_seurat, cd34_d3t1_seurat),
add.cell.ids = c("d1t1", "d1t2", "d2t1", "d3t1"),
project = "Filtering"
)
# log10 transform to compare the the samples
# Number of genes per UMI for each cells
$log10GenesPerUMI <-log10(merged_seurat$nFeature_RNA) / log10(merged_seurat$nCount_RNA)
merged_seurat
# Mitochondrial Gene percentage
# merged_seurat[["percent.mt"]] <- PercentageFeatureSet(object = merged_seurat, pattern = "^MT-")
# calculate the mitoRatio as a proportion (0-1 scale) if you need it
# merged_seurat$mitoRatio <- merged_seurat$percent.mt / 100
# Create orig.ident_mod column based on cell names (for visualization purpose)
$orig.ident_mod <- sapply(strsplit(rownames(merged_seurat@meta.data), "_"), `[`, 1)
merged_seurat
# Apply mapping to convert abbreviated IDs to full names
# merged_seurat$orig.ident_mod <- sample_mapping[merged_seurat$orig.ident_mod]
# Verify the new column
# table(merged_seurat$orig.ident_mod)
4.1.3.1.2. Cell Counts
# View the distribution of cell counts
table(merged_seurat$orig.ident_mod) %>%
barplot(main="Cell Counts per Sample",
xlab="Sample", ylab="Cell Count", col="steelblue", las=2)
4.1.3.1.2. UMI Counts per Cell
# Visualize the distribution of UMI counts per cell using orig.ident_mod
VlnPlot(merged_seurat, features = "nCount_RNA", group.by = "orig.ident_mod", pt.size = 0.1) +
ggtitle("UMI Counts per Cell") +
ylab("UMI Count") +
theme_minimal()
4.1.3.1.3. Genes Detected per Cell
# Visualize the distribution of genes detected per cell using orig.ident_mod
VlnPlot(merged_seurat, features = "nFeature_RNA", group.by = "orig.ident_mod", pt.size = 0.1) +
ggtitle("Genes Detected per Cell") +
ylab("Gene Count") +
theme_minimal()
4.1.3.1.4. UMIs vs. Genes Detected
# Scatter plot of UMIs vs. genes detected per cell using orig.ident_mod
FeatureScatter(merged_seurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "orig.ident_mod") +
ggtitle("UMIs vs. Genes Detected") +
xlab("UMI Count") +
ylab("Gene Count") +
theme_minimal()
4.1.3.1.5. Novelty
# Calculate novelty as UMI-to-gene ratio if not already present
$novelty <- merged_seurat$nCount_RNA / merged_seurat$nFeature_RNA
merged_seurat
# Visualize novelty using orig.ident_mod
VlnPlot(merged_seurat, features = "novelty", group.by = "orig.ident_mod", pt.size = 0.1) +
ggtitle("Novelty (UMI-to-Gene Ratio)") +
ylab("Novelty (UMI/Gene)") +
theme_minimal()
4.1.3.1.6.Plots
# For each Seurat object
VlnPlot(merged_seurat, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2)
4.1.4. Choosing Cut-off parameters
** Name the parameters that have been used for filtering.**
We can define thresholds (cut-off parameter) based on multiples of MAD.
MAD: Median Absolute Deviation is a measurement of the spread of data. It’s calculated by taking the median of the absolute deviations from the median. For nFeature_RNA for example, MAD represents the typical deviation from the median.
Using Median and MAD:
Lower Threshold (nFeature_RNA):
\[ \text{Lower Threshold} = \text{Median}_{\text{nFeature\_RNA}} - k \times \text{MAD}_{\text{nFeature\_RNA}} \]
Upper Threshold (nFeature_RNA):
\[ \text{Upper Threshold} = \text{Median}_{\text{nFeature\_RNA}} + k \times \text{MAD}_{\text{nFeature\_RNA}} \]
4.1.4.1. Cut-off for genes
<- merged_seurat@meta.data$nFeature_RNA
nFeature_RNA_values <- merged_seurat@meta.data$nCount_RNA
nCount_RNA_values
# For nFeature_RNA
# Median and MAD
<- median(nFeature_RNA_values)
nFeature_median <- mad(nFeature_RNA_values)
nFeature_mad
# Mean and Standard Deviation
<- mean(nFeature_RNA_values)
nFeature_mean <- sd(nFeature_RNA_values) nFeature_sd
Choosing k=6 would include most typical data points while excluding outliers, covering a large portion of data in a normal distribution. It should minimizes the impact of extreme values.
It also matches the visual min-max threshold of the plots that we previously visualised.
# Determining min/max threshold
<- 6
k
# For nFeature_RNA using median and MAD
<- nFeature_median - k * nFeature_mad
nFeature_lower
# Ensure lower thresholds are not negative
<- max(nFeature_lower, min(nFeature_RNA_values))
nFeature_lower
cat("lower threshold for Genes Detected:", nFeature_lower, "\n")
## lower threshold for Genes Detected: 403
4.1.4.2. Cut-off for UMI
# Calculate Median and MAD for nCount_RNA
<- median(nCount_RNA_values)
nCount_median <- mad(nCount_RNA_values)
nCount_mad
# Calculate Mean and Standard Deviation for nCount_RNA
<- mean(nCount_RNA_values)
nCount_mean <- sd(nCount_RNA_values)
nCount_sd
# Choosing k = 3 for thresholding
<- 6
k
# Determine min/max threshold for nCount_RNA using median and MAD
<- nCount_median - k * nCount_mad
nCount_lower <- nCount_median + k * nCount_mad
nCount_upper
# Ensure lower threshold is not negative
<- max(nCount_lower, min(nCount_RNA_values))
nCount_lower
cat("lower threshold for UMI:", nCount_lower, "\n")
## lower threshold for UMI: 1002
cat("upper threshold for UMI:", nCount_upper, "\n")
## upper threshold for UMI: 10850.22
# # Mean and SD deviation. Not going to use them.
# # For nFeature_RNA
# nFeature_lower <- nFeature_mean - k * nFeature_sd
# nFeature_upper <- nFeature_mean + k * nFeature_sd
#
# # Ensure lower thresholds are not negative
# nFeature_lower <- max(nFeature_lower, min(nFeature_RNA_values))
4.1.5. Apply Filtering
# Apply filtering
<- subset(merged_seurat,
merged_seurat_filtered subset = nFeature_RNA > nFeature_lower &
> nCount_lower &
nCount_RNA < nCount_upper) nCount_RNA
We filtered cells where genes are lower than a certain threshold (no upper threshold selected). On the other hand, for UMI, both lower and upper threshold is selected.
No upper threshold for genes is selected as the original analyses have not selected an upper threshold as well.
TODO: better explanation
4.1.5.1. Compare before and after filtration
# Compare
# Before filtering
VlnPlot(merged_seurat, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2)
# After filtering
VlnPlot(merged_seurat_filtered, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2)
4.1.6. Doublet removal with DoubletFinder
4.1.6.1. Explanation of why we are performing doublet removal
Purpose: Remove potential doublets to ensure that each observation represents a single cell.
4.1.6.2. Normalization
# Normalization
<- NormalizeData(merged_seurat_filtered) merged_seurat_filtered
4.1.6.3. Feature Selection
# Identify Highly Variable Features
<- FindVariableFeatures(merged_seurat_filtered, selection.method = "vst", nfeatures = 3000) merged_seurat_filtered
TODO: why 3000 features? For initial data exploration 2000 to 3000 HVF is recommended/conventional. But it might be interesting to check out UMAP results for different values.
4.1.6.4. Scale
# Scale the data
<- ScaleData(merged_seurat_filtered, features = VariableFeatures(object = merged_seurat_filtered)) merged_seurat_filtered
## Centering and scaling data matrix
4.1.6.5. Run PCA
# Run PCA
<- RunPCA(merged_seurat_filtered,
merged_seurat_filtered npcs = 50 # PC calculated
)
## PC_ 1
## Positive: ACTB, S100A9, FCN1, S100A8, VCAN, S100A11, S100A10, S100A12, CD14, JUND
## TYROBP, COTL1, SERPINA1, SLC2A3, JUNB, CYBB, MNDA, ITGB2, TYMP, FCER1G
## FGR, FGL2, MS4A6A, CFP, PLBD1, CTSD, CD68, CTSS, CLEC7A, MPEG1
## Negative: STMN1, NPM1, HNRNPA1, NUCB2, EEF1B2, CDK6, HMGB1, HSP90AB1, EBPL, SNRPE
## PRSS57, NAP1L1, TUBA1B, FABP5, C6orf48, TYMS, HSPD1, DUT, NUCKS1, HMGA1
## TUBB, UBB, HIST1H4C, NCL, IGFBP7, RAN, SPINK2, SNRPD1, IMPDH2, PARP1
## PC_ 2
## Positive: CD3E, LTB, EVL, CD3D, LCK, IL32, PTPRCAP, KLF2, IL7R, CD7
## CD3G, LAT, TCF7, CD27, CD69, PIK3IP1, SPOCK2, CD247, ISG20, CD2
## ETS1, IFITM1, LEF1, CD6, JUNB, SEPT1, ARL4C, RBM38, IL2RG, BCL11B
## Negative: CST3, LYZ, MNDA, S100A9, S100A8, GRN, APLP2, FCN1, VCAN, LGALS1
## TYROBP, CSTA, AIF1, GSTP1, MS4A6A, SPI1, CFD, FCER1G, TNFSF13B, CEBPD
## LST1, PLAUR, S100A12, RNF130, CD14, TYMP, CYBB, SERPINA1, H2AFY, HLA-DRA
## PC_ 3
## Positive: SPIB, FAM129C, CD79A, VPREB3, CD79B, TCL1A, IGLL5, IRF4, MZB1, FCRLA
## CD72, MME, CD74, VPREB1, BLNK, PLD4, CD19, POU2AF1, AFF3, CD9
## PAX5, BLK, HLA-DRA, EBF1, TCF4, HLA-DQA1, IRF8, ABHD15, CCDC50, CXCR4
## Negative: CYTL1, SERPINB1, FTL, ANXA1, PRSS57, EBPL, SLC40A1, S100A4, S100A6, HSP90AB1
## MLLT3, AIF1, GATA2, CRHBP, ANKRD28, SNHG8, ALDH1A1, SRGN, FAM117A, ITM2A
## LAPTM4B, EEF1B2, AVP, MGST1, IL1B, NPR3, PPA1, CREG1, SPINK2, HOPX
## PC_ 4
## Positive: TOP2A, UBE2C, MKI67, NUSAP1, CDK1, RRM2, CCNA2, BIRC5, AHSP, CA1
## TPX2, HBB, ZWINT, CDKN3, DLGAP5, AURKB, CENPF, ASPM, HMMR, CENPM
## TYMS, TK1, UBE2T, CCNB2, PRC1, HMGB2, PCNA, NUF2, HBD, S100A6
## Negative: CD74, HLA-DRA, HLA-DPB1, HLA-DPA1, SPINK2, CRHBP, AVP, HLA-DRB5, HLA-DRB1, MZB1
## ITM2C, SOX4, BCL11A, HLA-DMA, HLA-DQB1, TCF4, SNHG7, EGFL7, HOPX, CD34
## HLA-DQA1, ZFAS1, MSI2, NRIP1, DDAH2, MEF2C, PCDH9, GAPT, CXXC5, MDK
## PC_ 5
## Positive: HBD, CA1, KLF1, AHSP, APOC1, HBB, CNRIP1, ANK1, RHAG, GATA1
## KCNH2, CD79A, SMIM1, MYL4, HBQ1, CA2, TFR2, FHL2, IGLL5, VPREB3
## TCL1A, CD79B, BLVRB, SPTA1, ALDH1A1, EPCAM, APOE, SELENBP1, TPM1, NFIA
## Negative: MPO, VIM, PLAC8, CST7, NKG7, SRGN, CTSG, AZU1, C12orf75, ACTG1
## ELANE, HCST, PRTN3, CYBA, GSTP1, SPNS3, C1QTNF4, CALR, HSP90B1, MS4A3
## RNASE2, ITM2C, RAB32, SSR4, FLT3, GZMB, CLIC3, LGALS1, IRF8, ENO1
4.1.6.6. Elbow Point
The Elbow Plot is commonly used to help decide the number of PCs to retain. By plotting the variance explained by each PC, we look for an “elbow point” where the added value of each subsequent PC diminishes significantly.
ElbowPlot(merged_seurat_filtered)
### 4.1.6.7. Choosing elbow point
# Get the standard deviation of each PC and calculate the variance explained
<- merged_seurat_filtered[["pca"]]@stdev
stdev <- (stdev^2) / sum(stdev^2) * 100
variance_explained
# Create a data frame to store the variance explained for each PC
<- data.frame(PC = 1:length(variance_explained), VarianceExplained = variance_explained)
pca_variance_df
# Find the elbow point by locating the largest drop-off in variance explained
# We calculate the second derivative and find the point where the rate of decline decreases
<- diff(diff(variance_explained))
second_derivative <- which.max(second_derivative) + 1 # +1 to account for indexing difference
elbow_point
# Manually set elbow_point
<- 20
elbow_point
# Visualize the Elbow Plot with the selected elbow point
library(ggplot2)
ggplot(pca_variance_df, aes(x = PC, y = VarianceExplained)) +
geom_line(color = "blue") +
geom_point() +
geom_vline(xintercept = elbow_point, color = "red", linetype = "dashed") +
annotate("text", x = elbow_point, y = variance_explained[elbow_point],
label = paste("Elbow at PC", elbow_point), color = "red", vjust = -1) +
labs(title = "Elbow Plot with Selected PC", x = "Principal Component", y = "Variance Explained (%)") +
theme_minimal()
4.1.6.8. Estimate optimal pK value
# Estimate optimal pK value
# Perform parameter sweep
<- paramSweep_v3(merged_seurat_filtered, PCs = 1:elbow_point, sct = FALSE) sweep.res.list
## Loading required package: fields
## Loading required package: spam
## Spam version 2.11-0 (2024-10-03) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
## The following object is masked from 'package:stats4':
##
## mle
## The following objects are masked from 'package:base':
##
## backsolve, forwardsolve
## Loading required package: viridisLite
##
## Try help(fields) to get started.
## [1] "Creating artificial doublets for pN = 5%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## Centering and scaling data matrix
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 5e-04..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 10%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## Centering and scaling data matrix
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 5e-04..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 15%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## Centering and scaling data matrix
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 5e-04..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 20%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## Centering and scaling data matrix
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 5e-04..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 25%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## Centering and scaling data matrix
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 5e-04..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 30%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## Centering and scaling data matrix
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 5e-04..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
<- summarizeSweep(sweep.res.list, GT = FALSE) sweep.stats
## Loading required package: KernSmooth
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## Loading required package: ROCR
<- find.pK(sweep.stats) bcmvn
## NULL
# Selecting the pK value corresponding to the maximum BCmetric
<- as.numeric(as.character(bcmvn$pK[which.max(bcmvn$BCmetric)]))
optimal_pK
ggplot(bcmvn, aes(x = as.numeric(as.character(pK)), y = BCmetric)) +
geom_point() +
geom_line() +
geom_vline(xintercept = optimal_pK, color = "red", linetype = "dashed") +
annotate("text", x = optimal_pK, y = max(bcmvn$BCmetric),
label = paste("Optimal pK:", optimal_pK), hjust = -0.1, vjust = -0.5) +
labs(title = "Optimal pK Value Identification", x = "pK", y = "BCmetric")
cat("Optimal pK value:", optimal_pK, "\n")
## Optimal pK value: 0.29
4.1.6.9. Estimate the Expected Number of Doublets
# Assuming a typical doublet rate (e.g., 5-10%)
<- 0.075 # 7.5%
doublet_rate
# Calculate the expected number of doublets
<- round(doublet_rate * ncol(merged_seurat_filtered))
nExp cat("Expected number of doublets:", nExp, "\n")
## Expected number of doublets: 1558
4.1.6.10. Run Doublet Finder
# Run doublet finder
<- doubletFinder_v3(merged_seurat_filtered,
merged_seurat_filtered PCs = 1:elbow_point,
pN = 0.25,
pK = optimal_pK,
nExp = nExp,
reuse.pANN = FALSE,
sct = FALSE)
## [1] "Creating 6925 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## Centering and scaling data matrix
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
<- "DF.classifications_0.25_0.29_1558"
classification_col
# Verify the contents of the classification column
table(merged_seurat_filtered@meta.data[[classification_col]])
##
## Doublet Singlet
## 1558 19216
# Filter out doublets, keeping only cells classified as "Singlet"
<- subset(merged_seurat_filtered, subset = DF.classifications_0.25_0.29_1558 == "Singlet")
merged_seurat_filtered
# Confirm the number of cells after filtering
cat("Number of cells after doublet removal:", ncol(merged_seurat_filtered), "\n")
## Number of cells after doublet removal: 19216
4.1.7. Re-normalize the data
While we have normalized the data before removing doublet, re-normalizing may ensure data remains constant after running doubletfinder.
- Which Normalization method is used by the Seurat Normalization function by default?
– Answer: LogNormalize
<- NormalizeData(merged_seurat_filtered, normalization.method = "LogNormalize", scale.factor = 10000) merged_seurat_filtered
4.1.8. Re-Feature Selection
4.1.8.1. Purpose
The purpose of Feature Selection is to identify a subset of genes (features) that exhibit the most significant variability across cells. This variability is often indicative of biological differences, such as distinct cell types, cell states, or responses to different conditions. Selecting these features improves the efficiency and effectiveness of downstream analyses, such as clustering, dimensionality reduction, and identifying cell types.
4.1.8.2. How Are Features Selected?
In Seurat, features are typically selected based on high variability
across cells using the FindVariableFeatures()
function,
which implements several methods for feature selection, such as
vst
, mean.var.plot
, and
dispersion
.
1. Calculate Mean and Variance of Each Gene
Seurat calculates the average expression (mean) and variability (variance) of each gene across all cells.
2. Standardize Variability
To ensure comparability across genes with different mean expression levels, Seurat uses a method such as Variance Stabilizing Transformation (VST) to adjust the variance based on the mean expression. This transformation helps in selecting genes with true biological variability rather than variability caused by technical noise.
3. Rank Genes by Variability
After transformation, Seurat ranks genes by their standardized variability, identifying those that deviate the most from expected variance at a given mean expression level.
4. Select Top Variable Genes
Based on the user-defined nfeatures
parameter (commonly
set to 2000), Seurat selects the top genes with the highest variance as
“highly variable genes.” This subset is then used for scaling,
dimensionality reduction (PCA and UMAP), and clustering.
# Identify highly variable features (genes)
<- FindVariableFeatures(
merged_seurat_filtered object = merged_seurat_filtered,
selection.method = "vst",
nfeatures = 3000,
verbose = FALSE
)
4.2 Batch Correction
The batch correction process in Seurat allows the integration of multiple datasets, each potentially generated from different experimental batches or conditions. Batch correction is essential when we want to combine data from separate sources while minimizing batch effects, which can obscure the biological signal of interest. The batch correction process in Seurat aligns multiple datasets from different experimental batches to minimize technical differences. It begins with normalization and identification of highly variable features in each dataset. Shared variable features across datasets are selected as common points for alignment. Using these features, Seurat identifies “anchors,” which are pairs of biologically similar cells across batches. Anchors serve as reference points, allowing Seurat to map datasets onto a shared space. The integration step then adjusts gene expression values across datasets, reducing batch-specific effects while retaining biological signals. Seurat generates an “integrated” assay with batch-corrected data, which replaces the batch-biased individual datasets.
# Split the merged object by batch (e.g., orig.ident_mod)
<- SplitObject(merged_seurat_filtered, split.by = "orig.ident_mod")
seurat_list
# Normalize and find variable features for each dataset
for (i in 1:length(seurat_list)) {
<- NormalizeData(seurat_list[[i]], verbose = FALSE)
seurat_list[[i]] <- FindVariableFeatures(seurat_list[[i]], selection.method = "vst", nfeatures = 3000, verbose = FALSE)
seurat_list[[i]]
}
# Select features for integration
<- SelectIntegrationFeatures(object.list = seurat_list, nfeatures = 3000)
features
# Find integration anchors
<- FindIntegrationAnchors(object.list = seurat_list, anchor.features = features) anchors
## Scaling features for provided objects
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 14546 anchors
## Filtering anchors
## Retained 8687 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 6709 anchors
## Filtering anchors
## Retained 2825 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 6730 anchors
## Filtering anchors
## Retained 3053 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 11005 anchors
## Filtering anchors
## Retained 3140 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 10892 anchors
## Filtering anchors
## Retained 3283 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 8194 anchors
## Filtering anchors
## Retained 4060 anchors
# Integrate data
<- IntegrateData(anchorset = anchors) integrated_seurat
## Merging dataset 3 into 4
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Merging dataset 1 into 2
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Merging dataset 4 3 into 2 1
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
# Set the default assay to integrated
DefaultAssay(integrated_seurat) <- "integrated"
5. Dimensionality Reduction
5.1. Dimensionality Reduction
# Scale data
<- rownames(merged_seurat_filtered)
all_genes <- ScaleData(merged_seurat_filtered, features = all_genes) merged_seurat_filtered
## Centering and scaling data matrix
5.1.1. PCA
# Run PCA
<- RunPCA(merged_seurat_filtered, features = VariableFeatures(object = merged_seurat_filtered)) merged_seurat_filtered
## PC_ 1
## Positive: JUND, JUNB, S100A10, S100A9, FCN1, S100A8, S100A11, SLC2A3, VCAN, COTL1
## ITGB2, S100A12, CD14, KLF2, CD52, HCST, NCF1, SERPINA1, TYROBP, CYBB
## FLNA, FGR, MNDA, MS4A6A, TYMP, FCER1G, FGL2, CTSD, PLBD1, CFP
## Negative: STMN1, NUCB2, HNRNPA1, NPM1, SNRPE, PRSS57, EBPL, CDK6, HSP90AB1, NAP1L1
## FABP5, TUBA1B, HMGB1, C6orf48, EEF1B2, HSPD1, TYMS, IGFBP7, TUBB, DUT
## HMGA1, NUCKS1, TXN, SPINK2, HIST1H4C, ANKRD28, CYTL1, SNRPD1, H2AFZ, NCL
## PC_ 2
## Positive: CD3E, EVL, LTB, CD3D, PTPRCAP, LCK, IL32, IL7R, KLF2, CD7
## CD3G, LAT, TCF7, CD69, CD27, PIK3IP1, SPOCK2, ISG20, CD247, CD2
## CXCR4, ETS1, LEF1, SEPT1, ARL4C, IL2RG, RBM38, IFITM1, CD6, BCL11B
## Negative: CST3, LYZ, S100A9, FCN1, S100A8, MNDA, VCAN, S100A12, TYROBP, CD14
## GRN, MS4A6A, SERPINA1, PLAUR, CSTA, APLP2, FCER1G, CYBB, LGALS1, SPI1
## CFD, TYMP, CEBPD, LST1, MPEG1, CD36, CD68, PLBD1, AIF1, FGL2
## PC_ 3
## Positive: CD74, HLA-DRA, CD79A, SPIB, CD79B, FAM129C, MZB1, HLA-DPB1, HLA-DPA1, VPREB3
## TCL1A, IGLL5, HLA-DQA1, TCF4, HLA-DQB1, HLA-DRB5, HLA-DRB1, FCRLA, HLA-DMA, IRF4
## MME, AFF3, HLA-DMB, BLNK, CD72, BLK, PLD4, CD9, CD19, ITM2C
## Negative: S100A6, CD3E, S100A4, CD3D, IL32, CD7, LCK, LAT, CD3G, HBD
## CA1, KLF1, APOC1, SRGN, JUNB, IL7R, BLVRB, ANXA1, HBB, CD247
## AHSP, MT2A, CNRIP1, TNFAIP3, KCNH2, CD2, SPOCK2, S100A10, GZMM, CD27
## PC_ 4
## Positive: AVP, CRHBP, SPINK2, HOPX, EGFL7, AIF1, SERPINB1, CD34, ZFAS1, ANKRD28
## MDK, SNHG8, NPR3, LAPTM4B, NRIP1, ANGPT1, MSRB3, MSI2, PROM1, BAALC
## CRYGD, CSF3R, NPDC1, IL18, C1QTNF4, IDS, BEX2, RBPMS, TAOK3, HTR1F
## Negative: TOP2A, MKI67, AHSP, CA1, NUSAP1, UBE2C, HBB, CDK1, CCNA2, HMMR
## BIRC5, RRM2, DLGAP5, CDKN3, CENPF, ASPM, HBD, RHAG, CCNB2, ZWINT
## AURKB, TPX2, CA2, CENPA, NUF2, UBE2T, MYBL2, ANK1, MYL4, PRC1
## PC_ 5
## Positive: MPO, C12orf75, PLAC8, CST7, SRGN, NKG7, HSP90B1, CTSG, IRF8, ITM2C
## GZMB, AZU1, LGALS1, GSTP1, CLIC3, ACTG1, C1QTNF4, HCST, ELANE, PRTN3
## SPNS3, LILRA4, CALR, RNASE2, SCT, RAB32, RUNX2, FABP5, IL3RA, SSR4
## Negative: HBD, KLF1, CA1, CNRIP1, APOC1, AHSP, ANK1, HBB, GATA1, SMIM1
## RHAG, KCNH2, TFR2, FHL2, CA2, ALDH1A1, CD79A, BLVRB, MYL4, HBQ1
## TPM1, NFIA, ITGA2B, MINPP1, TCL1A, PRKAR2B, SPTA1, IGLL5, APOE, PKIG
5.1.2. Visualise PCA results
# Visualize PCA results
# dims 1:2 because PC1 and PC2 explains maximum variance
#VizDimLoadings(merged_seurat_filtered, dims = 1:elbow_point, reduction = "pca")
DimPlot(merged_seurat_filtered, reduction = "pca")
5.1.3. UMAP
# Run UMAP
<- RunUMAP(merged_seurat_filtered, dims = 1:elbow_point) merged_seurat_filtered
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 23:48:34 UMAP embedding parameters a = 0.9922 b = 1.112
## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
## Also defined by 'spam'
## 23:48:34 Read 19216 rows and found 20 numeric columns
## 23:48:34 Using Annoy for neighbor search, n_neighbors = 30
## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
## Also defined by 'spam'
## 23:48:34 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 23:48:36 Writing NN index file to temp file /tmp/RtmpaUQ6Ar/file1a810a47ceb20a
## 23:48:36 Searching Annoy index using 1 thread, search_k = 3000
## 23:48:45 Annoy recall = 100%
## 23:48:46 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 23:48:52 Initializing from normalized Laplacian + noise (using RSpectra)
## 23:48:54 Commencing optimization for 200 epochs, with 788140 positive edges
## 23:49:11 Optimization finished
# Visualize UMAP
DimPlot(merged_seurat_filtered, reduction = "umap", group.by = "orig.ident")
DimPlot(merged_seurat_filtered, reduction = "umap", label = TRUE)
# Find Neighbors
<- FindNeighbors(merged_seurat_filtered, dims = 1:elbow_point) merged_seurat_filtered
## Computing nearest neighbor graph
## Computing SNN
# Find Clusters
<- FindClusters(merged_seurat_filtered, resolution = 0.25) merged_seurat_filtered
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 19216
## Number of edges: 667366
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9569
## Number of communities: 16
## Elapsed time: 5 seconds
# Visualize Clusters on UMAP
DimPlot(merged_seurat_filtered, reduction = "umap", label = FALSE, pt.size = 0.25)
4.2.1. Comparison of Batch corrected vs Non-corrected datasets
# Set the default assay to "integrated"
DefaultAssay(integrated_seurat) <- "integrated"
# Scale the integrated data
<- ScaleData(integrated_seurat, verbose = FALSE)
integrated_seurat
# Run PCA on the integrated data
<- RunPCA(integrated_seurat, npcs = 50, verbose = FALSE)
integrated_seurat
# Determine the elbow point
# Get the standard deviation of each PC and calculate the variance explained
<- integrated_seurat[["pca"]]@stdev
stdev <- (stdev^2) / sum(stdev^2) * 100
variance_explained
# Create a data frame to store the variance explained for each PC
<- data.frame(PC = 1:length(variance_explained), VarianceExplained = variance_explained)
pca_variance_df
# Find the elbow point by locating the largest drop-off in variance explained
<- diff(diff(variance_explained))
second_derivative <- which.max(second_derivative) + 1 # +1 to adjust indexing
elbow_point
# Manually Set elbow_point
<- 20
elbow_point
# Visualize the Elbow Plot with the selected elbow point
ggplot(pca_variance_df, aes(x = PC, y = VarianceExplained)) +
geom_line(color = "blue") +
geom_point() +
geom_vline(xintercept = elbow_point, color = "red", linetype = "dashed") +
annotate("text", x = elbow_point, y = variance_explained[elbow_point],
label = paste("Elbow at PC", elbow_point), color = "red", vjust = -1) +
labs(title = "Elbow Plot with Selected PC", x = "Principal Component", y = "Variance Explained (%)") +
theme_minimal()
# Run UMAP using the PCs up to the elbow point
<- RunUMAP(integrated_seurat, dims = 1:elbow_point, seed.use = 1234) integrated_seurat
## 23:50:20 UMAP embedding parameters a = 0.9922 b = 1.112
## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
## Also defined by 'spam'
## 23:50:20 Read 19216 rows and found 20 numeric columns
## 23:50:20 Using Annoy for neighbor search, n_neighbors = 30
## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
## Also defined by 'spam'
## 23:50:20 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 23:50:22 Writing NN index file to temp file /tmp/RtmpaUQ6Ar/file1a810a37068e3
## 23:50:22 Searching Annoy index using 1 thread, search_k = 3000
## 23:50:29 Annoy recall = 100%
## 23:50:30 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 23:50:33 Initializing from normalized Laplacian + noise (using RSpectra)
## 23:50:35 Commencing optimization for 200 epochs, with 827712 positive edges
## 23:50:51 Optimization finished
# Find Neighbors
<- FindNeighbors(integrated_seurat, dims = 1:elbow_point) integrated_seurat
## Computing nearest neighbor graph
## Computing SNN
# Find Clusters
<- FindClusters(integrated_seurat, resolution = 0.3) integrated_seurat
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 19216
## Number of edges: 719924
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9472
## Number of communities: 16
## Elapsed time: 4 seconds
# Visualize UMAP colored by sample origin
DimPlot(integrated_seurat, reduction = "umap", group.by = "orig.ident_mod") +
ggtitle("UMAP Colored by Sample Origin")
# Visualize UMAP with cluster labels
DimPlot(integrated_seurat, reduction = "umap", label = TRUE, pt.size = 0.25) +
ggtitle("UMAP with Cluster Labels")
# Compare PCA with or without batch correction
<- DimPlot(merged_seurat_filtered, reduction = "pca", group.by = "orig.ident") + ggtitle("Without Batch Correction")
p1 <- DimPlot(integrated_seurat, reduction = "pca", group.by = "orig.ident") + ggtitle("With Batch Correction")
p2
# Combine plots for comparison
+ p2 p1
4.2.2. Observations from the PCA Plots
In the left plot (Without Batch Correction), we see a clear separation between the “BMMC” (red) and “CD34” (teal) clusters, similar to what we observed in the UMAP plots. This separation along the principal components is likely due to batch effects rather than true biological variation, as the cells of the same type (CD34) should ideally overlap with minimal distinction between batches if no technical artifacts were present. In the right plot (With Batch Correction), the “BMMC” and “CD34” cells are more intermixed along PC1 and PC2, indicating that batch effects have been reduced. The cells now cluster based on intrinsic biological characteristics rather than technical artifacts.
# Compare UMAP with or without batch correction
<- DimPlot(merged_seurat_filtered, reduction = "umap", group.by = "orig.ident") + ggtitle("Without Batch Correction")
p1 <- DimPlot(integrated_seurat, reduction = "umap", group.by = "orig.ident") + ggtitle("With Batch Correction")
p2
print(p1 + p2)
4.2.3. Observations from the UMAP Plots
The left plot (Without Batch Correction) shows distinct clusters that are clearly separated by the sample types, labeled as “BMMC” (red) and “CD34” (teal). This separation likely reflects batch effects rather than true biological differences, as the same cell types from different batches should ideally overlap or be closely intermixed if they share similar characteristics.
The right plot (With Batch Correction) shows a more integrated distribution of “BMMC” and “CD34” cells, suggesting that the batch effects have been minimized, allowing the biological differences to emerge more clearly.
4.2.4. Batch effects in single-cell data can arise from various sources, including:
Sequencing Depth: Different batches may have variations in the number of reads per cell.
Library Preparation: Variations in reagent quality or handling can cause batch effects.
Cell Preparation: Differences in cell handling (e.g., sorting, staining) across batches can influence the data.
Instrument Settings: Variability in instrument calibration across runs can introduce batch effects.
4.2.5. Batch correction is crucial because without it:
The results could be biased by technical artifacts, misrepresenting true biological variability.
Downstream analyses, such as clustering and differential expression, may be inaccurate if technical differences are mistaken for biological differences.
To demonstrate the effect of each parameter, we could create additional visualizations for each suspected source of batch effect (e.g., comparing UMAPs colored by sequencing depth or library preparation batch). However, the primary goal is to achieve a plot where cells cluster by type rather than batch, as seen in the “With Batch Correction” UMAP plot on the right.
Week 3
6. Cell Type Annotation
6.1. Automatic Cell Type Annotation with SingleR
In this section, we use SingleR, a tool for automatic cell type annotation, to assign cell type labels to our integrated Seurat object. We utilize the built-in reference dataset HumanPrimaryCellAtlasData from the celldex package. Finally, we visualize the annotation results on a UMAP plot.
# Load the Human Primary Cell Atlas reference data
<- readRDS("../data/celldex_annot.RDS")
reference
# Extract the normalized expression data from the integrated assay
# SingleR requires a normalized expression matrix
# We use the "integrated" assay's data slot
<- GetAssayData(integrated_seurat, assay = "integrated", slot = "data")
expression_data
# Run SingleR to annotate cell types
# 'labels' parameter uses the main cell type annotations from the reference
<- SingleR(test = expression_data,
singleR_results ref = reference,
labels = reference$label.main,
assay.type.test = "logcounts",
assay.type.ref = "logcounts")
# Add SingleR labels to the Seurat object's metadata
$SingleR.labels <- singleR_results$labels
integrated_seurat
# Verify the annotation results
table(integrated_seurat$SingleR.labels)
##
## B_cell BM BM & Prog. CMP
## 1637 18 32 834
## DC Erythroblast GMP HSC_-G-CSF
## 4 481 698 73
## MEP Monocyte Myelocyte Neutrophils
## 498 3796 49 3
## NK_cell Pre-B_cell_CD34- Pro-B_cell_CD34+ Pro-Myelocyte
## 665 758 809 459
## T_cells Tissue_stem_cells
## 8401 1
# Plot UMAP with SingleR cell type labels
DimPlot(integrated_seurat,
reduction = "umap",
group.by = "SingleR.labels",
label = TRUE,
repel = TRUE) +
ggtitle("UMAP Plot with SingleR Cell Type Annotations") +
theme_minimal()
6.2. Manual Annotation
6.2.1. Perform Differential Expression Analysis for Cell-Type Annotation
# Set the default assay
DefaultAssay(integrated_seurat) <- "integrated"
# Perform differential expression analysis
<- FindAllMarkers(
all_markers object = integrated_seurat,
assay = "integrated",
only.pos = FALSE,
min.pct = 0.25,
logfc.threshold = 0.25
)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
## Calculating cluster 15
# View the top markers for each cluster
head(all_markers)
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## CD7 0 2.191120 0.900 0.679 0 0 CD7
## CD3D 0 2.150795 0.928 0.677 0 0 CD3D
## KLF2 0 2.013932 0.987 0.850 0 0 KLF2
## LTB 0 1.994837 0.986 0.781 0 0 LTB
## LEF1 0 1.956435 0.765 0.622 0 0 LEF1
## TCF7 0 1.945835 0.789 0.644 0 0 TCF7
6.2.2. Use Markers from Table 2 to Identify Cell Types and Assign Names to Clusters
6.2.2.1. Define Cell Type Markers
# Define a list of marker genes for each cell type
<- list(
cell_type_markers HSC = c("CD34", "CD38", "Sca1", "Kit"),
LMPP = c("CD38", "CD52", "CSF3R", "ca1", "Kit", "CD34", "Flk2"),
CLP = c("IL7R"),
GMP_Neutrophils = c("ELANE"),
CMP = c("IL3", "GM-CSF", "M-CSF"),
B = c("CD19"),
Pre_B = c("CD19", "CD34"),
Plasma = c("SDC1", "IGHA1", "IGLC1", "MZB1", "JCHAIN"),
CD8_T = c("CD3D", "CD3E", "CD8A", "CD8B"),
CD4_T = c("CD3D", "CD3E", "CD4"),
NK = c("FCGR3A", "NCAM1", "NKG7", "KLRB1"),
Erythrocytes = c("GATA1", "HBB", "HBA1", "HBA2"),
pDC = c("IRF8", "IRF4", "IRF7"),
cDC = c("CD1C", "CD207", "ITGAM", "NOTCH2", "SIRPA"),
CD14_Monocytes = c("CD14", "CCL3", "CCL4", "IL1B"),
CD16_Monocytes = c("FCGR3A", "CD68", "S100A12"),
Basophils = c("GATA2")
)
6.2.2.2. Create a Function to Match Clusters to Cell Types
# Function to find matching cell type for each cluster
<- function(cluster_markers, cell_type_markers) {
match_cluster_to_cell_type <- sapply(names(cell_type_markers), function(cell_type) {
cell_type_scores <- cell_type_markers[[cell_type]]
markers sum(cluster_markers %in% markers)
})<- names(cell_type_scores)[which.max(cell_type_scores)]
best_match return(best_match)
}
6.2.2.3. Create a Function to Match Clusters to Cell Types
# Get list of clusters
<- levels(integrated_seurat)
clusters
# Initialize a vector to store cell type annotations
<- vector("character", length = length(clusters))
cluster_annotations names(cluster_annotations) <- clusters
# Annotate clusters
for (cluster in clusters) {
# Get top markers for this cluster
<- all_markers %>%
cluster_markers filter(cluster == !!cluster) %>%
arrange(p_val_adj) %>%
pull(gene) %>%
unique()
# Find best matching cell type
<- match_cluster_to_cell_type(cluster_markers, cell_type_markers)
cell_type
# Assign cell type abbreviation (in brackets)
<- paste0(cell_type, " (", cluster, ")")
cluster_annotations[cluster]
}
# Display cluster annotations
cluster_annotations
## 0 1 2
## "LMPP (0)" "LMPP (1)" "LMPP (2)"
## 3 4 5
## "Erythrocytes (3)" "Erythrocytes (4)" "LMPP (5)"
## 6 7 8
## "LMPP (6)" "LMPP (7)" "Erythrocytes (8)"
## 9 10 11
## "Erythrocytes (9)" "LMPP (10)" "CD14_Monocytes (11)"
## 12 13 14
## "LMPP (12)" "LMPP (13)" "NK (14)"
## 15
## "Erythrocytes (15)"
6.2.2.4. Missing Markers in the datasets
<- setdiff(unique(unlist(cell_type_markers)), rownames(merged_seurat))
missing_genes_merged_seurat cat("Missing gene markers in unfiltered merged seurat object:", missing_genes_merged_seurat)
## Missing gene markers in unfiltered merged seurat object: Sca1 Kit ca1 Flk2 GM-CSF M-CSF IGHA1 IGLC1 JCHAIN
<- setdiff(unique(unlist(cell_type_markers)), rownames(merged_seurat_filtered))
missing_genes_merged_seurat_filtered cat("Missing gene markers in filtered merged seurat object (non-batch corrected):", missing_genes_merged_seurat)
## Missing gene markers in filtered merged seurat object (non-batch corrected): Sca1 Kit ca1 Flk2 GM-CSF M-CSF IGHA1 IGLC1 JCHAIN
<- setdiff(unique(unlist(cell_type_markers)), rownames(integrated_seurat[["integrated"]]))
missing_genes_integrated_seurat cat("Missing gene markers in filtered merged seurat object (batch-corrected):", missing_genes_integrated_seurat)
## Missing gene markers in filtered merged seurat object (batch-corrected): Sca1 Kit ca1 Flk2 IL3 GM-CSF M-CSF IGHA1 IGLC1 JCHAIN CD3E CD207
It looks like there are some marker genes that were lost in the batch correction procedure.
<- setdiff(missing_genes_merged_seurat_filtered, missing_genes_integrated_seurat)
unique_missing_in_merged <- setdiff(missing_genes_integrated_seurat, missing_genes_merged_seurat_filtered)
unique_missing_in_integrated
cat("Number of genes uniquely missing in integrated_seurat:", length(unique_missing_in_integrated), "\n")
## Number of genes uniquely missing in integrated_seurat: 3
print(unique_missing_in_integrated)
## [1] "IL3" "CD3E" "CD207"
6.2.2.5. Update Seurat Object with Manual Annotations
# Update cluster identities
<- RenameIdents(integrated_seurat, cluster_annotations)
integrated_seurat
# Alternatively, create a new metadata column for manual annotations
$manual_annotations <- Idents(integrated_seurat) integrated_seurat
6.2.2.6. Plot UMAP with Manual Annotations and Compare with Automatic Annotations
6.2.2.6.1. Manual Annotations UMAP Plot
# Plot UMAP with manual annotations
png("UMAP_Plot_with_Manual_Annotations.png", width = 800, height = 600)
DimPlot(integrated_seurat,
reduction = "umap",
group.by = "manual_annotations",
label = FALSE,
repel = TRUE) +
ggtitle("UMAP Plot with Manual Cell Type Annotations") +
theme_minimal()
dev.off()
## png
## 2
6.2.6.2. Comparison with Automatic Annotations
# Plot automatic annotations
<- DimPlot(integrated_seurat,
p1 reduction = "umap",
group.by = "SingleR.labels",
label = FALSE,
repel = TRUE) +
ggtitle("Automatic Annotations (SingleR)") +
theme_minimal()
# Plot manual annotations
<- DimPlot(integrated_seurat,
p2 reduction = "umap",
group.by = "manual_annotations",
label = FALSE,
repel = TRUE) +
ggtitle("Manual Annotations") +
theme_minimal()
# Combine plots
+ p2 p1
6.2.6 Visualize Gene Expression of Marker Genes
Let’s choose CD3D (T cell marker), CD19 (B cell marker), and NKG7 (NK cell marker).
6.2.2.7. Violin Plots
# Violin plots for selected genes
VlnPlot(integrated_seurat,
features = c("CD3D", "CD19", "NKG7"),
group.by = "manual_annotations",
pt.size = 0) +
theme_minimal()
6.2.2.8. Feature Plots (UMAP)
# Feature plots for selected genes
FeaturePlot(integrated_seurat,
features = c("CD3D", "CD19", "NKG7"),
reduction = "umap",
cols = c("lightgrey", "blue")) +
theme_minimal()
6.3. Cell-Type Proportions (Bonus)
6.3.1. Compute Cell-Type Proportions for Each Sample
# Compute cell-type proportions
<- table(integrated_seurat$manual_annotations, integrated_seurat$Sample)
cell_type_counts <- prop.table(cell_type_counts, margin = 2) * 100
cell_type_proportions
# Convert to data frame for plotting
<- as.data.frame(cell_type_proportions)
cell_type_proportions_df colnames(cell_type_proportions_df) <- c("CellType", "Sample", "Proportion")
6.3.2. Plot Cell-Type Proportions
ggplot(cell_type_proportions_df, aes(x = Sample, y = Proportion, fill = CellType)) +
geom_bar(stat = "identity", position = "fill") +
ylab("Proportion (%)") +
ggtitle("Cell-Type Proportions per Sample") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_viridis_d(option = "plasma") # Options: "magma", "inferno", "plasma", "viridis", "cividis"
6.3.3. Explain Sample Variability in Cell-Type Proportions
The cell-type proportions across the four samples show a similar overall composition dominated by LMPP and Erythrocyte clusters. The BMMC samples (D1T1 and D1T2) have a slightly higher proportion of certain LMPP clusters in the top sections compared to CD34 samples (D2T1 and D3T1). Erythrocyte clusters appear more prominent in the CD34 samples, suggesting an enrichment of these cells in those samples. The lower sections contain minor cell types like CD14_Monocytes and NK cells, which are slightly more prevalent in BMMC samples. In a nutshell, BMMC samples exhibit a bit more diversity in minor cell types, while CD34 samples are more concentrated in certain clusters.
7. Differential Analysis
7.1. Differential Expression Analysis on cell-types
We will use the automatic annotations stored under the singleR.labels for the differential expression analysis.
cat("Unique cell types present:", unique(integrated_seurat$SingleR.labels))
## Unique cell types present: T_cells Monocyte B_cell Pro-B_cell_CD34+ GMP Pre-B_cell_CD34- CMP Erythroblast NK_cell MEP Pro-Myelocyte Myelocyte BM & Prog. HSC_-G-CSF BM Tissue_stem_cells Neutrophils DC
# B cells vs T cells
<- FindMarkers(integrated_seurat, ident.1 = "B_cell", ident.2 = "T_cells", group.by = "SingleR.labels")
b_vs_t
# T cells vs Monocytes
<- FindMarkers(integrated_seurat, ident.1 = "T_cells", ident.2 = "Monocyte", group.by = "SingleR.labels") t_vs_mono
Volcano Plot
# Prepare Data for volcano plot
# Add -log10 adjusted p-values for each data frame
$logP <- -log10(b_vs_t$p_val_adj + 1e-10) # Small value to avoid log(0)
b_vs_t$logP <- -log10(t_vs_mono$p_val_adj + 1e-10)
t_vs_mono
# Volcano plot for B cells vs T cells
ggplot(b_vs_t, aes(x = avg_log2FC, y = logP)) +
geom_point(aes(color = avg_log2FC > 0), alpha = 0.6) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") + # Significance threshold
xlab("Log2 Fold Change (B cells vs T cells)") +
ylab("-Log10 Adjusted P-Value") +
ggtitle("Differential Expression: B cells vs T cells") +
theme_minimal() +
scale_color_manual(values = c("blue", "red")) # Blue for downregulated, Red for upregulated
# Volcano plot for T cells vs Monocytes
ggplot(t_vs_mono, aes(x = avg_log2FC, y = logP)) +
geom_point(aes(color = avg_log2FC > 0), alpha = 0.6) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") + # Significance threshold
xlab("Log2 Fold Change (T cells vs Monocytes)") +
ylab("-Log10 Adjusted P-Value") +
ggtitle("Differential Expression: T cells vs Monocytes") +
theme_minimal() +
scale_color_manual(values = c("blue", "red"))
7.1.1. Memory Formation on Cells
Naive T cells are mature T cells that have not yet encountered their specific antigen, whereas memory T cells are antigen-experienced cells that persist long-term and respond more rapidly upon re-exposure to the same antigen. Memory T cells arise from naive T cells following antigen exposure, undergoing differentiation processes that enable them to provide quicker and more specific immune responses during subsequent encounters with the same pathogen.
8. Pathway Analysis
8.1. Differential Expression Analysis on groups
8.1.1. Differential Expression Analysis for BMMC vs CD34 (All Cell Types)
# Differential Expression Analysis for BMMC vs CD34 (All Cell Types)
<- FindMarkers(integrated_seurat, ident.1 = "BMMC", ident.2 = "CD34", group.by = "orig.ident") bmmc_vs_cd34
8.1.2. Extract Top 5 DEGs
$gene <- rownames(bmmc_vs_cd34)
bmmc_vs_cd34
<- bmmc_vs_cd34 %>%
top5_bmmc_vs_cd34 arrange(p_val_adj) %>%
head(5) %>%
select(gene, avg_log2FC, p_val_adj)
::kable(top5_bmmc_vs_cd34, caption = "Top 5 Differentially Expressed Genes (BMMC vs CD34)") knitr
gene | avg_log2FC | p_val_adj | |
---|---|---|---|
AHSP | AHSP | -1.2424964 | 0 |
HBM | HBM | 0.2587761 | 0 |
HBA1 | HBA1 | -0.4004577 | 0 |
GNLY | GNLY | 2.2900926 | 0 |
CLC | CLC | -1.4500950 | 0 |
8.2. Pathway analysis on groups
Idents(integrated_seurat) <- "orig.ident"
<- DEenrichRPlot(
top_pathways object = integrated_seurat,
ident.1 = "BMMC",
ident.2 = "CD34",
balanced = TRUE, # Show both upregulated and downregulated terms
logfc.threshold = 0.25, # Only consider genes with a minimum log fold change
test.use = "wilcox", # Use Wilcoxon Rank Sum test (default)
p.val.cutoff = 0.05, # Only include significant DE genes (adjusted p < 0.05)
enrich.database = "GO_Biological_Process_2021", # Choose GO Biological Process terms
num.pathway = 10, # Display top 10 enriched pathways
max.genes = 500
)
## Uploading data to Enrichr... Done.
## Querying GO_Biological_Process_2021... Done.
## Parsing results... Done.
## Uploading data to Enrichr... Done.
## Querying GO_Biological_Process_2021... Done.
## Parsing results... Done.
8.3. Biological interpretation
# Filter the results to include only significant genes
<- bmmc_vs_cd34 %>%
significant_genes filter(p_val_adj < 0.05) %>%
arrange(p_val_adj) %>%
pull(gene)
# Use EnrichR to perform enrichment analysis on significant genes
<- enrichr(
enrichr_results
significant_genes,databases = "GO_Biological_Process_2021"
)
## Uploading data to Enrichr... Done.
## Querying GO_Biological_Process_2021... Done.
## Parsing results... Done.
# Extract the GO Biological Process results and find the top pathway
<- enrichr_results[["GO_Biological_Process_2021"]]
go_results
# Arrange the results by adjusted p-value and select the top pathway
<- go_results %>%
top_pathway arrange(Adjusted.P.value) %>%
head(1)
kable(top_pathway,
format = "html",
col.names = c("Term", "Overlap", "P.value", "Adjusted P-value", "Old P-value", "Old Adjusted P-value", "Odds Ratio", "Combined Score", "Genes"),
caption = "Pathway with the lowest P-value")
Term | Overlap | P.value | Adjusted P-value | Old P-value | Old Adjusted P-value | Odds Ratio | Combined Score | Genes |
---|---|---|---|---|---|---|---|---|
neutrophil degranulation (GO:0043312) | 112/481 | 0 | 0 | 0 | 0 | 5.88068 | 552.6518 | FCN1;CDA;MPO;LGALS3;ANPEP;COTL1;SIRPA;CD93;SLC11A1;CYBB;CYBA;RNASE3;MIF;ILF2;OSCAR;RAB31;TYROBP;RAB37;CRISPLD2;ADAM8;PRTN3;S100A9;SLC27A2;PPIA;KPNB1;S100A8;FTL;CFD;HVCN1;STXBP2;C5AR1;FPR1;MGST1;IQGAP1;CFP;IQGAP2;GNS;SYNGR1;PRDX4;S100A12;MLEC;CD14;ELANE;S100A11;ATP8B4;JUP;GGH;AZU1;PA2G4;NIT2;SERPINA1;HSP90AB1;ITGAM;B4GALT1;MS4A3;ITGB2;CTSZ;HBB;HMGB1;ITGAL;SLC2A5;CTSS;SIRPB1;HK3;TIMP2;CTSH;CTSG;CD36;CEP290;CTSD;CTSC;CCT2;SERPINB1;HSP90AA1;CR1;FCER1G;ANXA2;RNASET2;TUBB;PLAUR;DYNLL1;TNFRSF1B;CKAP4;SERPINB6;VAMP8;FGR;CAT;GRN;CLEC12A;GSTP1;PTAFR;FGL2;RETN;CST3;ALOX5;STOM;CD59;CCT8;LAIR1;GSN;GCA;LILRB2;TUBB4B;LILRB3;TSPAN14;FABP5;IMPDH2;P2RX1;QPCT;MNDA;FOLR3;CD68 |
Neutrophil degranulation (GO:0043312) is the controlled release of secretory granules from neutrophils, which contain pre-stored substances like proteases, lipases, and inflammatory mediators. QuickGo
8.1.3. Differential Expression Analysis for Monocytes in BMMC vs CD34
# Subset for Monocyte cells only
<- subset(integrated_seurat, subset = SingleR.labels == "Monocyte") monocytes
# Differential Expression Analysis for Monocytes in BMMC vs CD34
<- FindMarkers(monocytes, ident.1 = "BMMC", ident.2 = "CD34", group.by = "orig.ident") monocyte_bmmc_vs_cd34
$gene <- rownames(monocyte_bmmc_vs_cd34)
monocyte_bmmc_vs_cd34
# Extract Top 5 DEGs by p-value for Monocytes
<- monocyte_bmmc_vs_cd34 %>%
top5_monocyte_bmmc_vs_cd34 arrange(p_val_adj) %>%
head(5) %>%
select(gene, avg_log2FC, p_val_adj)
::kable(top5_monocyte_bmmc_vs_cd34, caption = "Top 5 Differentially Expressed Genes in Monocytes (BMMC vs CD34)") knitr
gene | avg_log2FC | p_val_adj | |
---|---|---|---|
IFI44L | IFI44L | 0.2763581 | 0 |
FCN1 | FCN1 | 1.2554231 | 0 |
SCT | SCT | -0.4626762 | 0 |
HBA1 | HBA1 | 0.7003820 | 0 |
CLDN10 | CLDN10 | -0.2761770 | 0 |
writeLines(capture.output(sessionInfo()), "sessionInfo.txt")