Project 1- scRNAseq

true, true

Week 1
1. Loading the Data
2. Create the sample sheet
3. Add meta-data
Week 2
4. Preprocessing
5. Dimensionality Reduction
- 5.1. Dimensionality Reduction
- 4.2.1. Comparison of Batch corrected vs Non-corrected datasets
Week 3
6. Cell Type Annotation
7. Differential Analysis
- 7.1. Differential Expression Analysis on cell-types
8. Pathway Analysis

# Set working directory
setwd("~/scbi_ds1/notebook")

# Set a global seed for reproducibility
set.seed(1234)

# TODO: Fix tidyverse installation
# install.packages("tidyverse")
# library("tidyverse")

# Check all libraries are installed
suppressPackageStartupMessages({
library(dplyr)
library(spatstat.core)
library(Seurat)
library(patchwork)
library(DoubletFinder)
library(SingleR)
library(enrichR)
library(CellChat)
library(SingleCellExperiment)
library(SeuratWrappers)
# library(tidyverse)
library(monocle3)
library(celldex)
library(knitr)
})

Week 1

1. Loading the Data

We will load the expression matrices from the dataset and construct a Seurat object. We will need to load two files: one containing data on Bone Marrow Mononuclear Cells (BMMC) and the other on CD34+ Enriched Bone Marrow Cells (CD34).

# Load the .rds files

# Bone Marrow Mononuclear Cells (BMMC)
bmmc_d1t1 <- readRDS("../data/GSM4138872_scRNA_BMMC_D1T1.rds")
bmmc_d1t2 <- readRDS("../data/GSM4138873_scRNA_BMMC_D1T2.rds")

# CD34+ Enriched Bone Marrow Cells (CD34)
cd34_d2t1 <- readRDS("../data/GSM4138874_scRNA_CD34_D2T1.rds")
cd34_d3t1 <- readRDS("../data/GSM4138875_scRNA_CD34_D3T1.rds")

2. Create the sample sheet

We will label each sample with the corresponding metadata from the table 1 provided in the assignment sheet.

# Define the sample metadata based on Table 1
metadata <- data.frame(
  Sample = c("BMMC_D1T1", "BMMC_D1T2", "CD34_D2T1", "CD34_D3T1"),
  Donor = c("D1", "D1", "D2", "D3"),
  Replicate = c("T1", "T2", "T1", "T1"),
  Sex = c("F", "F", "M", "F"),
  row.names = c("BMMC_D1T1", "BMMC_D1T2", "CD34_D2T1", "CD34_D3T1")
)

3. Add meta-data

# Create Seurat objects with metadata added
bmmc_d1t1_seurat <- CreateSeuratObject(counts = bmmc_d1t1, project = "BMMC_D1T1")
bmmc_d1t1_seurat@meta.data$Sample <- "BMMC_D1T1"
bmmc_d1t1_seurat@meta.data$Donor <- "D1"
bmmc_d1t1_seurat@meta.data$Replicate <- "T1"
bmmc_d1t1_seurat@meta.data$Sex <- "F"

bmmc_d1t2_seurat <- CreateSeuratObject(counts = bmmc_d1t2, project = "BMMC_D1T2")
bmmc_d1t2_seurat@meta.data$Sample <- "BMMC_D1T2"
bmmc_d1t2_seurat@meta.data$Donor <- "D1"
bmmc_d1t2_seurat@meta.data$Replicate <- "T2"
bmmc_d1t2_seurat@meta.data$Sex <- "F"

cd34_d2t1_seurat <- CreateSeuratObject(counts = cd34_d2t1, project = "CD34_D2T1")
cd34_d2t1_seurat@meta.data$Sample <- "CD34_D2T1"
cd34_d2t1_seurat@meta.data$Donor <- "D2"
cd34_d2t1_seurat@meta.data$Replicate <- "T1"
cd34_d2t1_seurat@meta.data$Sex <- "M"

cd34_d3t1_seurat <- CreateSeuratObject(counts = cd34_d3t1, project = "CD34_D3T1")
cd34_d3t1_seurat@meta.data$Sample <- "CD34_D3T1"
cd34_d3t1_seurat@meta.data$Donor <- "D3"
cd34_d3t1_seurat@meta.data$Replicate <- "T1"
cd34_d3t1_seurat@meta.data$Sex <- "F"

# Check that the metadata has been added correctly
# print(bmmc_d1t1_seurat@meta.data)
# print(bmmc_d1t2_seurat@meta.data)
# print(cd34_d2t1_seurat@meta.data)
# print(cd34_d3t1_seurat@meta.data)

For each sample, we report the following information:

3.1. How many cells are in each sample?

3.2. . How many genes are in the expression matrices?

# Function to extract and print required information 
report_sample_info <- function(seurat_object, sample_name) {
  cat(sample_name, ":\n")
  
  # 1. Report the number of cells
  num_cells <- ncol(seurat_object)
  cat("Number of cells in the sample:", num_cells, "\n")
  
  # 2. Report the number of genes
  num_genes <- nrow(seurat_object)
  cat("Number of genes in the sample:", num_genes, "\n")
  
}

# Report information for each sample
report_sample_info(bmmc_d1t1_seurat, "BMMC_D1T1")

## BMMC_D1T1 :
## Number of cells in the sample: 6270 
## Number of genes in the sample: 20287

report_sample_info(bmmc_d1t2_seurat, "BMMC_D1T2")

## BMMC_D1T2 :
## Number of cells in the sample: 6332 
## Number of genes in the sample: 20287

report_sample_info(cd34_d2t1_seurat, "CD34_D2T1")

## CD34_D2T1 :
## Number of cells in the sample: 2424 
## Number of genes in the sample: 20287

report_sample_info(cd34_d3t1_seurat, "CD34_D3T1")

## CD34_D3T1 :
## Number of cells in the sample: 5752 
## Number of genes in the sample: 20287

# Save each processed Seurat object 
# saveRDS(bmmc_d1t1_seurat, file = "../results/seurat/bmmc_d1t1_seurat.rds")
# saveRDS(bmmc_d1t2_seurat, file = "../results/seurat/bmmc_d1t2_seurat.rds")
# saveRDS(cd34_d2t1_seurat, file = "../results/seurat/cd34_d2t1_seurat.rds")
# saveRDS(cd34_d3t1_seurat, file = "../results/seurat/cd34_d3t1_seurat.rds")

3.3. What information is now part of the meta-data of the objects?

Column name present in the metadata of all samples:

orig.ident: this often contains the sample identity if known, in our case, we know the identity of the samples.

nCount_RNA: Number of UMIs per cell.

nFeature_RNA: number of genes detected per cell.

Sample: <orig.ident>_

Donor: Identity of the human donor from whom the cell has been collected (assumed).

Replicate: TODO

Sex: Biological sexual identity of the origin human.

Week 2

4. Preprocessing

4.1 **Preprocessing

4.1.1. Correct order of preprocessing steps:

Filtering is the initial step to remove low-quality cells and genes from each dataset. This ensures that only high-quality data is used for downstream analyses.
Normalization adjusts for technical variations such as differences in sequencing depth between cells, making the gene expression levels comparable across all cells.
Feature Selection identifies the most variable genes across the dataset, which are most informative for downstream analyses like dimensionality reduction and clustering.
Doublet Removal is performed after normalization and feature selection because tools like DoubletFinder require normalized data and principal component analysis (PCA) results to accurately detect doublets. (as documented in its Github docuementation)
Re-normalization
Re-feature Selection

4.1.2 Steps Before and After Merging (Task 4.2):

All steps are performed after merging the dataset for the non-batch corrected procedure.

4.1.3. Performing preprocessing on the data

4.1.3.1. Filtering

Before filtering, lets get some QC Metrics.

4.1.3.1.1. QC Metrics

# Merging the dataset to get the QC metrics.

merged_seurat <- merge(
  x = bmmc_d1t1_seurat,
  y = list(bmmc_d1t2_seurat, cd34_d2t1_seurat, cd34_d3t1_seurat),
  add.cell.ids = c("d1t1", "d1t2", "d2t1", "d3t1"),
  project = "Filtering"
)

# log10 transform to compare the the samples

# Number of genes per UMI for each cells
merged_seurat$log10GenesPerUMI <-log10(merged_seurat$nFeature_RNA) / log10(merged_seurat$nCount_RNA)


# Mitochondrial Gene percentage
# merged_seurat[["percent.mt"]] <- PercentageFeatureSet(object = merged_seurat, pattern = "^MT-")
# calculate the mitoRatio as a proportion (0-1 scale) if you need it
# merged_seurat$mitoRatio <- merged_seurat$percent.mt / 100

# Create orig.ident_mod column based on cell names (for visualization purpose)
merged_seurat$orig.ident_mod <- sapply(strsplit(rownames(merged_seurat@meta.data), "_"), `[`, 1)

# Apply mapping to convert abbreviated IDs to full names
# merged_seurat$orig.ident_mod <- sample_mapping[merged_seurat$orig.ident_mod]

# Verify the new column
# table(merged_seurat$orig.ident_mod)

4.1.3.1.2. Cell Counts

# View the distribution of cell counts
table(merged_seurat$orig.ident_mod) %>%
    barplot(main="Cell Counts per Sample",
            xlab="Sample", ylab="Cell Count", col="steelblue", las=2)

4.1.3.1.2. UMI Counts per Cell

# Visualize the distribution of UMI counts per cell using orig.ident_mod
VlnPlot(merged_seurat, features = "nCount_RNA", group.by = "orig.ident_mod", pt.size = 0.1) +
    ggtitle("UMI Counts per Cell") +
    ylab("UMI Count") +
    theme_minimal()

4.1.3.1.3. Genes Detected per Cell

# Visualize the distribution of genes detected per cell using orig.ident_mod
VlnPlot(merged_seurat, features = "nFeature_RNA", group.by = "orig.ident_mod", pt.size = 0.1) +
    ggtitle("Genes Detected per Cell") +
    ylab("Gene Count") +
    theme_minimal()

4.1.3.1.4. UMIs vs. Genes Detected

# Scatter plot of UMIs vs. genes detected per cell using orig.ident_mod
FeatureScatter(merged_seurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "orig.ident_mod") +
    ggtitle("UMIs vs. Genes Detected") +
    xlab("UMI Count") +
    ylab("Gene Count") +
    theme_minimal()

4.1.3.1.5. Novelty

# Calculate novelty as UMI-to-gene ratio if not already present
merged_seurat$novelty <- merged_seurat$nCount_RNA / merged_seurat$nFeature_RNA

# Visualize novelty using orig.ident_mod
VlnPlot(merged_seurat, features = "novelty", group.by = "orig.ident_mod", pt.size = 0.1) +
    ggtitle("Novelty (UMI-to-Gene Ratio)") +
    ylab("Novelty (UMI/Gene)") +
    theme_minimal()

4.1.3.1.6.Plots

# For each Seurat object
VlnPlot(merged_seurat, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2)

4.1.4. Choosing Cut-off parameters

** Name the parameters that have been used for filtering.**

We can define thresholds (cut-off parameter) based on multiples of MAD.

MAD: Median Absolute Deviation is a measurement of the spread of data. It’s calculated by taking the median of the absolute deviations from the median. For nFeature_RNA for example, MAD represents the typical deviation from the median.

Using Median and MAD:

Lower Threshold (nFeature_RNA):

\[ \text{Lower Threshold} = \text{Median}_{\text{nFeature\_RNA}} - k \times \text{MAD}_{\text{nFeature\_RNA}} \]
Upper Threshold (nFeature_RNA):

\[ \text{Upper Threshold} = \text{Median}_{\text{nFeature\_RNA}} + k \times \text{MAD}_{\text{nFeature\_RNA}} \]

4.1.4.1. Cut-off for genes

nFeature_RNA_values <- merged_seurat@meta.data$nFeature_RNA
nCount_RNA_values <- merged_seurat@meta.data$nCount_RNA

# For nFeature_RNA

# Median and MAD
nFeature_median <- median(nFeature_RNA_values)
nFeature_mad <- mad(nFeature_RNA_values)

# Mean and Standard Deviation
nFeature_mean <- mean(nFeature_RNA_values)
nFeature_sd <- sd(nFeature_RNA_values)

Choosing k=6 would include most typical data points while excluding outliers, covering a large portion of data in a normal distribution. It should minimizes the impact of extreme values.

It also matches the visual min-max threshold of the plots that we previously visualised.

# Determining min/max threshold

k <- 6

# For nFeature_RNA using median and MAD
nFeature_lower <- nFeature_median - k * nFeature_mad

# Ensure lower thresholds are not negative
nFeature_lower <- max(nFeature_lower, min(nFeature_RNA_values))

cat("lower threshold for Genes Detected:", nFeature_lower, "\n")

## lower threshold for Genes Detected: 403

4.1.4.2. Cut-off for UMI

# Calculate Median and MAD for nCount_RNA
nCount_median <- median(nCount_RNA_values)
nCount_mad <- mad(nCount_RNA_values)

# Calculate Mean and Standard Deviation for nCount_RNA
nCount_mean <- mean(nCount_RNA_values)
nCount_sd <- sd(nCount_RNA_values)

# Choosing k = 3 for thresholding
k <- 6

# Determine min/max threshold for nCount_RNA using median and MAD
nCount_lower <- nCount_median - k * nCount_mad
nCount_upper <- nCount_median + k * nCount_mad

# Ensure lower threshold is not negative
nCount_lower <- max(nCount_lower, min(nCount_RNA_values))

cat("lower threshold for UMI:", nCount_lower, "\n")

## lower threshold for UMI: 1002

cat("upper threshold for UMI:", nCount_upper, "\n")

## upper threshold for UMI: 10850.22

# # Mean and SD deviation. Not going to use them.
# # For nFeature_RNA
# nFeature_lower <- nFeature_mean - k * nFeature_sd
# nFeature_upper <- nFeature_mean + k * nFeature_sd
# 
# # Ensure lower thresholds are not negative
# nFeature_lower <- max(nFeature_lower, min(nFeature_RNA_values))

4.1.5. Apply Filtering

# Apply filtering
merged_seurat_filtered <- subset(merged_seurat, 
                                 subset = nFeature_RNA > nFeature_lower & 
                                          nCount_RNA > nCount_lower & 
                                          nCount_RNA < nCount_upper)

We filtered cells where genes are lower than a certain threshold (no upper threshold selected). On the other hand, for UMI, both lower and upper threshold is selected.

No upper threshold for genes is selected as the original analyses have not selected an upper threshold as well.

TODO: better explanation

4.1.5.1. Compare before and after filtration

# Compare

# Before filtering
VlnPlot(merged_seurat, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2)

# After filtering
VlnPlot(merged_seurat_filtered, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2)

4.1.6. Doublet removal with DoubletFinder

4.1.6.1. Explanation of why we are performing doublet removal

Purpose: Remove potential doublets to ensure that each observation represents a single cell.

4.1.6.2. Normalization

# Normalization
merged_seurat_filtered <- NormalizeData(merged_seurat_filtered)

4.1.6.3. Feature Selection

# Identify Highly Variable Features
merged_seurat_filtered <- FindVariableFeatures(merged_seurat_filtered, selection.method = "vst", nfeatures = 3000)

TODO: why 3000 features? For initial data exploration 2000 to 3000 HVF is recommended/conventional. But it might be interesting to check out UMAP results for different values.

4.1.6.4. Scale

# Scale the data
merged_seurat_filtered <- ScaleData(merged_seurat_filtered, features = VariableFeatures(object = merged_seurat_filtered))

## Centering and scaling data matrix

4.1.6.5. Run PCA

# Run PCA
merged_seurat_filtered <- RunPCA(merged_seurat_filtered,
                                 npcs = 50 #  PC calculated
                                 )

## PC_ 1 
## Positive:  ACTB, S100A9, FCN1, S100A8, VCAN, S100A11, S100A10, S100A12, CD14, JUND 
##     TYROBP, COTL1, SERPINA1, SLC2A3, JUNB, CYBB, MNDA, ITGB2, TYMP, FCER1G 
##     FGR, FGL2, MS4A6A, CFP, PLBD1, CTSD, CD68, CTSS, CLEC7A, MPEG1 
## Negative:  STMN1, NPM1, HNRNPA1, NUCB2, EEF1B2, CDK6, HMGB1, HSP90AB1, EBPL, SNRPE 
##     PRSS57, NAP1L1, TUBA1B, FABP5, C6orf48, TYMS, HSPD1, DUT, NUCKS1, HMGA1 
##     TUBB, UBB, HIST1H4C, NCL, IGFBP7, RAN, SPINK2, SNRPD1, IMPDH2, PARP1 
## PC_ 2 
## Positive:  CD3E, LTB, EVL, CD3D, LCK, IL32, PTPRCAP, KLF2, IL7R, CD7 
##     CD3G, LAT, TCF7, CD27, CD69, PIK3IP1, SPOCK2, CD247, ISG20, CD2 
##     ETS1, IFITM1, LEF1, CD6, JUNB, SEPT1, ARL4C, RBM38, IL2RG, BCL11B 
## Negative:  CST3, LYZ, MNDA, S100A9, S100A8, GRN, APLP2, FCN1, VCAN, LGALS1 
##     TYROBP, CSTA, AIF1, GSTP1, MS4A6A, SPI1, CFD, FCER1G, TNFSF13B, CEBPD 
##     LST1, PLAUR, S100A12, RNF130, CD14, TYMP, CYBB, SERPINA1, H2AFY, HLA-DRA 
## PC_ 3 
## Positive:  SPIB, FAM129C, CD79A, VPREB3, CD79B, TCL1A, IGLL5, IRF4, MZB1, FCRLA 
##     CD72, MME, CD74, VPREB1, BLNK, PLD4, CD19, POU2AF1, AFF3, CD9 
##     PAX5, BLK, HLA-DRA, EBF1, TCF4, HLA-DQA1, IRF8, ABHD15, CCDC50, CXCR4 
## Negative:  CYTL1, SERPINB1, FTL, ANXA1, PRSS57, EBPL, SLC40A1, S100A4, S100A6, HSP90AB1 
##     MLLT3, AIF1, GATA2, CRHBP, ANKRD28, SNHG8, ALDH1A1, SRGN, FAM117A, ITM2A 
##     LAPTM4B, EEF1B2, AVP, MGST1, IL1B, NPR3, PPA1, CREG1, SPINK2, HOPX 
## PC_ 4 
## Positive:  TOP2A, UBE2C, MKI67, NUSAP1, CDK1, RRM2, CCNA2, BIRC5, AHSP, CA1 
##     TPX2, HBB, ZWINT, CDKN3, DLGAP5, AURKB, CENPF, ASPM, HMMR, CENPM 
##     TYMS, TK1, UBE2T, CCNB2, PRC1, HMGB2, PCNA, NUF2, HBD, S100A6 
## Negative:  CD74, HLA-DRA, HLA-DPB1, HLA-DPA1, SPINK2, CRHBP, AVP, HLA-DRB5, HLA-DRB1, MZB1 
##     ITM2C, SOX4, BCL11A, HLA-DMA, HLA-DQB1, TCF4, SNHG7, EGFL7, HOPX, CD34 
##     HLA-DQA1, ZFAS1, MSI2, NRIP1, DDAH2, MEF2C, PCDH9, GAPT, CXXC5, MDK 
## PC_ 5 
## Positive:  HBD, CA1, KLF1, AHSP, APOC1, HBB, CNRIP1, ANK1, RHAG, GATA1 
##     KCNH2, CD79A, SMIM1, MYL4, HBQ1, CA2, TFR2, FHL2, IGLL5, VPREB3 
##     TCL1A, CD79B, BLVRB, SPTA1, ALDH1A1, EPCAM, APOE, SELENBP1, TPM1, NFIA 
## Negative:  MPO, VIM, PLAC8, CST7, NKG7, SRGN, CTSG, AZU1, C12orf75, ACTG1 
##     ELANE, HCST, PRTN3, CYBA, GSTP1, SPNS3, C1QTNF4, CALR, HSP90B1, MS4A3 
##     RNASE2, ITM2C, RAB32, SSR4, FLT3, GZMB, CLIC3, LGALS1, IRF8, ENO1

4.1.6.6. Elbow Point

The Elbow Plot is commonly used to help decide the number of PCs to retain. By plotting the variance explained by each PC, we look for an “elbow point” where the added value of each subsequent PC diminishes significantly.

ElbowPlot(merged_seurat_filtered)

### 4.1.6.7. Choosing elbow point

# Get the standard deviation of each PC and calculate the variance explained
stdev <- merged_seurat_filtered[["pca"]]@stdev
variance_explained <- (stdev^2) / sum(stdev^2) * 100

# Create a data frame to store the variance explained for each PC
pca_variance_df <- data.frame(PC = 1:length(variance_explained), VarianceExplained = variance_explained)

# Find the elbow point by locating the largest drop-off in variance explained
# We calculate the second derivative and find the point where the rate of decline decreases
second_derivative <- diff(diff(variance_explained))
elbow_point <- which.max(second_derivative) + 1  # +1 to account for indexing difference

# Manually set elbow_point
elbow_point <- 20

# Visualize the Elbow Plot with the selected elbow point
library(ggplot2)
ggplot(pca_variance_df, aes(x = PC, y = VarianceExplained)) +
  geom_line(color = "blue") +
  geom_point() +
  geom_vline(xintercept = elbow_point, color = "red", linetype = "dashed") +
  annotate("text", x = elbow_point, y = variance_explained[elbow_point], 
           label = paste("Elbow at PC", elbow_point), color = "red", vjust = -1) +
  labs(title = "Elbow Plot with Selected PC", x = "Principal Component", y = "Variance Explained (%)") +
  theme_minimal()

4.1.6.8. Estimate optimal pK value

# Estimate optimal pK value
# Perform parameter sweep
sweep.res.list <- paramSweep_v3(merged_seurat_filtered, PCs = 1:elbow_point, sct = FALSE)

## Loading required package: fields

## Loading required package: spam

## Spam version 2.11-0 (2024-10-03) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.

## 
## Attaching package: 'spam'

## The following object is masked from 'package:stats4':
## 
##     mle

## The following objects are masked from 'package:base':
## 
##     backsolve, forwardsolve

## Loading required package: viridisLite

## 
## Try help(fields) to get started.

## [1] "Creating artificial doublets for pN = 5%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."

## Centering and scaling data matrix

## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 5e-04..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 10%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."

## Centering and scaling data matrix

## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 5e-04..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 15%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."

## Centering and scaling data matrix

## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 5e-04..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 20%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."

## Centering and scaling data matrix

## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 5e-04..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 25%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."

## Centering and scaling data matrix

## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 5e-04..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 30%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."

## Centering and scaling data matrix

## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 5e-04..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."

sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)

## Loading required package: KernSmooth

## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009

## Loading required package: ROCR

bcmvn <- find.pK(sweep.stats)

## NULL

# Selecting the pK value corresponding to the maximum BCmetric
optimal_pK <- as.numeric(as.character(bcmvn$pK[which.max(bcmvn$BCmetric)]))

ggplot(bcmvn, aes(x = as.numeric(as.character(pK)), y = BCmetric)) +
  geom_point() +
  geom_line() +
  geom_vline(xintercept = optimal_pK, color = "red", linetype = "dashed") +
  annotate("text", x = optimal_pK, y = max(bcmvn$BCmetric), 
           label = paste("Optimal pK:", optimal_pK), hjust = -0.1, vjust = -0.5) +
  labs(title = "Optimal pK Value Identification", x = "pK", y = "BCmetric")

cat("Optimal pK value:", optimal_pK, "\n")

## Optimal pK value: 0.29

4.1.6.9. Estimate the Expected Number of Doublets

# Assuming a typical doublet rate (e.g., 5-10%)
doublet_rate <- 0.075  # 7.5%

# Calculate the expected number of doublets
nExp <- round(doublet_rate * ncol(merged_seurat_filtered))
cat("Expected number of doublets:", nExp, "\n")

## Expected number of doublets: 1558

4.1.6.10. Run Doublet Finder

# Run doublet finder

merged_seurat_filtered <- doubletFinder_v3(merged_seurat_filtered, 
                                           PCs = 1:elbow_point, 
                                           pN = 0.25, 
                                           pK = optimal_pK, 
                                           nExp = nExp, 
                                           reuse.pANN = FALSE, 
                                           sct = FALSE)

## [1] "Creating 6925 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."

## Centering and scaling data matrix

## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."

classification_col <- "DF.classifications_0.25_0.29_1558"

# Verify the contents of the classification column
table(merged_seurat_filtered@meta.data[[classification_col]])

## 
## Doublet Singlet 
##    1558   19216

# Filter out doublets, keeping only cells classified as "Singlet"
merged_seurat_filtered <- subset(merged_seurat_filtered, subset = DF.classifications_0.25_0.29_1558 == "Singlet")

# Confirm the number of cells after filtering
cat("Number of cells after doublet removal:", ncol(merged_seurat_filtered), "\n")

## Number of cells after doublet removal: 19216

4.1.7. Re-normalize the data

While we have normalized the data before removing doublet, re-normalizing may ensure data remains constant after running doubletfinder.

Which Normalization method is used by the Seurat Normalization function by default?

– Answer: LogNormalize

merged_seurat_filtered <- NormalizeData(merged_seurat_filtered, normalization.method = "LogNormalize", scale.factor = 10000)

4.1.8. Re-Feature Selection

4.1.8.1. Purpose

The purpose of Feature Selection is to identify a subset of genes (features) that exhibit the most significant variability across cells. This variability is often indicative of biological differences, such as distinct cell types, cell states, or responses to different conditions. Selecting these features improves the efficiency and effectiveness of downstream analyses, such as clustering, dimensionality reduction, and identifying cell types.

4.1.8.2. How Are Features Selected?

In Seurat, features are typically selected based on high variability across cells using the FindVariableFeatures() function, which implements several methods for feature selection, such as vst, mean.var.plot, and dispersion.

1. Calculate Mean and Variance of Each Gene

Seurat calculates the average expression (mean) and variability (variance) of each gene across all cells.

2. Standardize Variability

To ensure comparability across genes with different mean expression levels, Seurat uses a method such as Variance Stabilizing Transformation (VST) to adjust the variance based on the mean expression. This transformation helps in selecting genes with true biological variability rather than variability caused by technical noise.

3. Rank Genes by Variability

After transformation, Seurat ranks genes by their standardized variability, identifying those that deviate the most from expected variance at a given mean expression level.

4. Select Top Variable Genes

Based on the user-defined nfeatures parameter (commonly set to 2000), Seurat selects the top genes with the highest variance as “highly variable genes.” This subset is then used for scaling, dimensionality reduction (PCA and UMAP), and clustering.

# Identify highly variable features (genes)
merged_seurat_filtered <- FindVariableFeatures(
  object = merged_seurat_filtered, 
  selection.method = "vst",   
  nfeatures = 3000,              
  verbose = FALSE                 
)

4.2 Batch Correction

The batch correction process in Seurat allows the integration of multiple datasets, each potentially generated from different experimental batches or conditions. Batch correction is essential when we want to combine data from separate sources while minimizing batch effects, which can obscure the biological signal of interest. The batch correction process in Seurat aligns multiple datasets from different experimental batches to minimize technical differences. It begins with normalization and identification of highly variable features in each dataset. Shared variable features across datasets are selected as common points for alignment. Using these features, Seurat identifies “anchors,” which are pairs of biologically similar cells across batches. Anchors serve as reference points, allowing Seurat to map datasets onto a shared space. The integration step then adjusts gene expression values across datasets, reducing batch-specific effects while retaining biological signals. Seurat generates an “integrated” assay with batch-corrected data, which replaces the batch-biased individual datasets.

# Split the merged object by batch (e.g., orig.ident_mod)
seurat_list <- SplitObject(merged_seurat_filtered, split.by = "orig.ident_mod")

# Normalize and find variable features for each dataset
for (i in 1:length(seurat_list)) {
  seurat_list[[i]] <- NormalizeData(seurat_list[[i]], verbose = FALSE)
  seurat_list[[i]] <- FindVariableFeatures(seurat_list[[i]], selection.method = "vst", nfeatures = 3000, verbose = FALSE)
}

# Select features for integration
features <- SelectIntegrationFeatures(object.list = seurat_list, nfeatures = 3000)

# Find integration anchors
anchors <- FindIntegrationAnchors(object.list = seurat_list, anchor.features = features)

## Scaling features for provided objects

## Finding all pairwise anchors

## Running CCA

## Merging objects

## Finding neighborhoods

## Finding anchors

##  Found 14546 anchors

## Filtering anchors

##  Retained 8687 anchors

## Running CCA

## Merging objects

## Finding neighborhoods

## Finding anchors

##  Found 6709 anchors

## Filtering anchors

##  Retained 2825 anchors

## Running CCA

## Merging objects

## Finding neighborhoods

## Finding anchors

##  Found 6730 anchors

## Filtering anchors

##  Retained 3053 anchors

## Running CCA

## Merging objects

## Finding neighborhoods

## Finding anchors

##  Found 11005 anchors

## Filtering anchors

##  Retained 3140 anchors

## Running CCA

## Merging objects

## Finding neighborhoods

## Finding anchors

##  Found 10892 anchors

## Filtering anchors

##  Retained 3283 anchors

## Running CCA

## Merging objects

## Finding neighborhoods

## Finding anchors

##  Found 8194 anchors

## Filtering anchors

##  Retained 4060 anchors

# Integrate data
integrated_seurat <- IntegrateData(anchorset = anchors)

## Merging dataset 3 into 4

## Extracting anchors for merged samples

## Finding integration vectors

## Finding integration vector weights

## Integrating data

## Merging dataset 1 into 2

## Extracting anchors for merged samples

## Finding integration vectors

## Finding integration vector weights

## Integrating data

## Merging dataset 4 3 into 2 1

## Extracting anchors for merged samples

## Finding integration vectors

## Finding integration vector weights

## Integrating data

# Set the default assay to integrated
DefaultAssay(integrated_seurat) <- "integrated"

5. Dimensionality Reduction

5.1. Dimensionality Reduction

# Scale data
all_genes <- rownames(merged_seurat_filtered)
merged_seurat_filtered <- ScaleData(merged_seurat_filtered, features = all_genes)

## Centering and scaling data matrix

5.1.1. PCA

# Run PCA
merged_seurat_filtered <- RunPCA(merged_seurat_filtered, features = VariableFeatures(object = merged_seurat_filtered))

## PC_ 1 
## Positive:  JUND, JUNB, S100A10, S100A9, FCN1, S100A8, S100A11, SLC2A3, VCAN, COTL1 
##     ITGB2, S100A12, CD14, KLF2, CD52, HCST, NCF1, SERPINA1, TYROBP, CYBB 
##     FLNA, FGR, MNDA, MS4A6A, TYMP, FCER1G, FGL2, CTSD, PLBD1, CFP 
## Negative:  STMN1, NUCB2, HNRNPA1, NPM1, SNRPE, PRSS57, EBPL, CDK6, HSP90AB1, NAP1L1 
##     FABP5, TUBA1B, HMGB1, C6orf48, EEF1B2, HSPD1, TYMS, IGFBP7, TUBB, DUT 
##     HMGA1, NUCKS1, TXN, SPINK2, HIST1H4C, ANKRD28, CYTL1, SNRPD1, H2AFZ, NCL 
## PC_ 2 
## Positive:  CD3E, EVL, LTB, CD3D, PTPRCAP, LCK, IL32, IL7R, KLF2, CD7 
##     CD3G, LAT, TCF7, CD69, CD27, PIK3IP1, SPOCK2, ISG20, CD247, CD2 
##     CXCR4, ETS1, LEF1, SEPT1, ARL4C, IL2RG, RBM38, IFITM1, CD6, BCL11B 
## Negative:  CST3, LYZ, S100A9, FCN1, S100A8, MNDA, VCAN, S100A12, TYROBP, CD14 
##     GRN, MS4A6A, SERPINA1, PLAUR, CSTA, APLP2, FCER1G, CYBB, LGALS1, SPI1 
##     CFD, TYMP, CEBPD, LST1, MPEG1, CD36, CD68, PLBD1, AIF1, FGL2 
## PC_ 3 
## Positive:  CD74, HLA-DRA, CD79A, SPIB, CD79B, FAM129C, MZB1, HLA-DPB1, HLA-DPA1, VPREB3 
##     TCL1A, IGLL5, HLA-DQA1, TCF4, HLA-DQB1, HLA-DRB5, HLA-DRB1, FCRLA, HLA-DMA, IRF4 
##     MME, AFF3, HLA-DMB, BLNK, CD72, BLK, PLD4, CD9, CD19, ITM2C 
## Negative:  S100A6, CD3E, S100A4, CD3D, IL32, CD7, LCK, LAT, CD3G, HBD 
##     CA1, KLF1, APOC1, SRGN, JUNB, IL7R, BLVRB, ANXA1, HBB, CD247 
##     AHSP, MT2A, CNRIP1, TNFAIP3, KCNH2, CD2, SPOCK2, S100A10, GZMM, CD27 
## PC_ 4 
## Positive:  AVP, CRHBP, SPINK2, HOPX, EGFL7, AIF1, SERPINB1, CD34, ZFAS1, ANKRD28 
##     MDK, SNHG8, NPR3, LAPTM4B, NRIP1, ANGPT1, MSRB3, MSI2, PROM1, BAALC 
##     CRYGD, CSF3R, NPDC1, IL18, C1QTNF4, IDS, BEX2, RBPMS, TAOK3, HTR1F 
## Negative:  TOP2A, MKI67, AHSP, CA1, NUSAP1, UBE2C, HBB, CDK1, CCNA2, HMMR 
##     BIRC5, RRM2, DLGAP5, CDKN3, CENPF, ASPM, HBD, RHAG, CCNB2, ZWINT 
##     AURKB, TPX2, CA2, CENPA, NUF2, UBE2T, MYBL2, ANK1, MYL4, PRC1 
## PC_ 5 
## Positive:  MPO, C12orf75, PLAC8, CST7, SRGN, NKG7, HSP90B1, CTSG, IRF8, ITM2C 
##     GZMB, AZU1, LGALS1, GSTP1, CLIC3, ACTG1, C1QTNF4, HCST, ELANE, PRTN3 
##     SPNS3, LILRA4, CALR, RNASE2, SCT, RAB32, RUNX2, FABP5, IL3RA, SSR4 
## Negative:  HBD, KLF1, CA1, CNRIP1, APOC1, AHSP, ANK1, HBB, GATA1, SMIM1 
##     RHAG, KCNH2, TFR2, FHL2, CA2, ALDH1A1, CD79A, BLVRB, MYL4, HBQ1 
##     TPM1, NFIA, ITGA2B, MINPP1, TCL1A, PRKAR2B, SPTA1, IGLL5, APOE, PKIG

5.1.2. Visualise PCA results

# Visualize PCA results
# dims 1:2 because PC1 and PC2 explains maximum variance
#VizDimLoadings(merged_seurat_filtered, dims = 1:elbow_point, reduction = "pca")

DimPlot(merged_seurat_filtered, reduction = "pca")

5.1.3. UMAP

# Run UMAP
merged_seurat_filtered <- RunUMAP(merged_seurat_filtered, dims = 1:elbow_point)

## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session

## 23:48:34 UMAP embedding parameters a = 0.9922 b = 1.112

## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'

## Also defined by 'spam'

## 23:48:34 Read 19216 rows and found 20 numeric columns

## 23:48:34 Using Annoy for neighbor search, n_neighbors = 30

## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'

## Also defined by 'spam'

## 23:48:34 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 23:48:36 Writing NN index file to temp file /tmp/RtmpaUQ6Ar/file1a810a47ceb20a
## 23:48:36 Searching Annoy index using 1 thread, search_k = 3000
## 23:48:45 Annoy recall = 100%
## 23:48:46 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 23:48:52 Initializing from normalized Laplacian + noise (using RSpectra)
## 23:48:54 Commencing optimization for 200 epochs, with 788140 positive edges
## 23:49:11 Optimization finished

# Visualize UMAP
DimPlot(merged_seurat_filtered, reduction = "umap", group.by = "orig.ident")

DimPlot(merged_seurat_filtered, reduction = "umap", label = TRUE)

# Find Neighbors
merged_seurat_filtered <- FindNeighbors(merged_seurat_filtered, dims = 1:elbow_point)

## Computing nearest neighbor graph

## Computing SNN

# Find Clusters
merged_seurat_filtered <- FindClusters(merged_seurat_filtered, resolution = 0.25)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 19216
## Number of edges: 667366
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9569
## Number of communities: 16
## Elapsed time: 5 seconds

# Visualize Clusters on UMAP
DimPlot(merged_seurat_filtered, reduction = "umap", label = FALSE, pt.size = 0.25)

4.2.1. Comparison of Batch corrected vs Non-corrected datasets

# Set the default assay to "integrated"
DefaultAssay(integrated_seurat) <- "integrated"

# Scale the integrated data
integrated_seurat <- ScaleData(integrated_seurat, verbose = FALSE)

# Run PCA on the integrated data
integrated_seurat <- RunPCA(integrated_seurat, npcs = 50, verbose = FALSE)

# Determine the elbow point
# Get the standard deviation of each PC and calculate the variance explained
stdev <- integrated_seurat[["pca"]]@stdev
variance_explained <- (stdev^2) / sum(stdev^2) * 100

# Create a data frame to store the variance explained for each PC
pca_variance_df <- data.frame(PC = 1:length(variance_explained), VarianceExplained = variance_explained)

# Find the elbow point by locating the largest drop-off in variance explained
second_derivative <- diff(diff(variance_explained))
elbow_point <- which.max(second_derivative) + 1  # +1 to adjust indexing

# Manually Set elbow_point
elbow_point <- 20

# Visualize the Elbow Plot with the selected elbow point
ggplot(pca_variance_df, aes(x = PC, y = VarianceExplained)) +
  geom_line(color = "blue") +
  geom_point() +
  geom_vline(xintercept = elbow_point, color = "red", linetype = "dashed") +
  annotate("text", x = elbow_point, y = variance_explained[elbow_point],
           label = paste("Elbow at PC", elbow_point), color = "red", vjust = -1) +
  labs(title = "Elbow Plot with Selected PC", x = "Principal Component", y = "Variance Explained (%)") +
  theme_minimal()

# Run UMAP using the PCs up to the elbow point
integrated_seurat <- RunUMAP(integrated_seurat, dims = 1:elbow_point, seed.use = 1234)

## 23:50:20 UMAP embedding parameters a = 0.9922 b = 1.112

## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'

## Also defined by 'spam'

## 23:50:20 Read 19216 rows and found 20 numeric columns

## 23:50:20 Using Annoy for neighbor search, n_neighbors = 30

## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'

## Also defined by 'spam'

## 23:50:20 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 23:50:22 Writing NN index file to temp file /tmp/RtmpaUQ6Ar/file1a810a37068e3
## 23:50:22 Searching Annoy index using 1 thread, search_k = 3000
## 23:50:29 Annoy recall = 100%
## 23:50:30 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 23:50:33 Initializing from normalized Laplacian + noise (using RSpectra)
## 23:50:35 Commencing optimization for 200 epochs, with 827712 positive edges
## 23:50:51 Optimization finished

# Find Neighbors
integrated_seurat <- FindNeighbors(integrated_seurat, dims = 1:elbow_point)

## Computing nearest neighbor graph
## Computing SNN

# Find Clusters
integrated_seurat <- FindClusters(integrated_seurat, resolution = 0.3)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 19216
## Number of edges: 719924
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9472
## Number of communities: 16
## Elapsed time: 4 seconds

# Visualize UMAP colored by sample origin
DimPlot(integrated_seurat, reduction = "umap", group.by = "orig.ident_mod") +
  ggtitle("UMAP Colored by Sample Origin")

# Visualize UMAP with cluster labels
DimPlot(integrated_seurat, reduction = "umap", label = TRUE, pt.size = 0.25) +
  ggtitle("UMAP with Cluster Labels")

# Compare PCA with or without batch correction

p1 <- DimPlot(merged_seurat_filtered, reduction = "pca", group.by = "orig.ident") + ggtitle("Without Batch Correction")
p2 <- DimPlot(integrated_seurat, reduction = "pca", group.by = "orig.ident") + ggtitle("With Batch Correction")

# Combine plots for comparison
p1 + p2

4.2.2. Observations from the PCA Plots

In the left plot (Without Batch Correction), we see a clear separation between the “BMMC” (red) and “CD34” (teal) clusters, similar to what we observed in the UMAP plots. This separation along the principal components is likely due to batch effects rather than true biological variation, as the cells of the same type (CD34) should ideally overlap with minimal distinction between batches if no technical artifacts were present. In the right plot (With Batch Correction), the “BMMC” and “CD34” cells are more intermixed along PC1 and PC2, indicating that batch effects have been reduced. The cells now cluster based on intrinsic biological characteristics rather than technical artifacts.

# Compare UMAP with or without batch correction

p1 <- DimPlot(merged_seurat_filtered, reduction = "umap", group.by = "orig.ident") + ggtitle("Without Batch Correction")
p2 <- DimPlot(integrated_seurat, reduction = "umap", group.by = "orig.ident") + ggtitle("With Batch Correction")

print(p1 + p2)

4.2.3. Observations from the UMAP Plots

The left plot (Without Batch Correction) shows distinct clusters that are clearly separated by the sample types, labeled as “BMMC” (red) and “CD34” (teal). This separation likely reflects batch effects rather than true biological differences, as the same cell types from different batches should ideally overlap or be closely intermixed if they share similar characteristics.

The right plot (With Batch Correction) shows a more integrated distribution of “BMMC” and “CD34” cells, suggesting that the batch effects have been minimized, allowing the biological differences to emerge more clearly.

4.2.4. Batch effects in single-cell data can arise from various sources, including:

Sequencing Depth: Different batches may have variations in the number of reads per cell.
Library Preparation: Variations in reagent quality or handling can cause batch effects.
Cell Preparation: Differences in cell handling (e.g., sorting, staining) across batches can influence the data.
Instrument Settings: Variability in instrument calibration across runs can introduce batch effects.

4.2.5. Batch correction is crucial because without it:

The results could be biased by technical artifacts, misrepresenting true biological variability.

Downstream analyses, such as clustering and differential expression, may be inaccurate if technical differences are mistaken for biological differences.

To demonstrate the effect of each parameter, we could create additional visualizations for each suspected source of batch effect (e.g., comparing UMAPs colored by sequencing depth or library preparation batch). However, the primary goal is to achieve a plot where cells cluster by type rather than batch, as seen in the “With Batch Correction” UMAP plot on the right.

Week 3

6. Cell Type Annotation

6.1. Automatic Cell Type Annotation with SingleR

In this section, we use SingleR, a tool for automatic cell type annotation, to assign cell type labels to our integrated Seurat object. We utilize the built-in reference dataset HumanPrimaryCellAtlasData from the celldex package. Finally, we visualize the annotation results on a UMAP plot.

# Load the Human Primary Cell Atlas reference data
reference <- readRDS("../data/celldex_annot.RDS")

# Extract the normalized expression data from the integrated assay
# SingleR requires a normalized expression matrix
# We use the "integrated" assay's data slot
expression_data <- GetAssayData(integrated_seurat, assay = "integrated", slot = "data")

# Run SingleR to annotate cell types
# 'labels' parameter uses the main cell type annotations from the reference
singleR_results <- SingleR(test = expression_data, 
                           ref = reference, 
                           labels = reference$label.main, 
                           assay.type.test = "logcounts", 
                           assay.type.ref = "logcounts")

# Add SingleR labels to the Seurat object's metadata
integrated_seurat$SingleR.labels <- singleR_results$labels

# Verify the annotation results
table(integrated_seurat$SingleR.labels)

## 
##            B_cell                BM        BM & Prog.               CMP 
##              1637                18                32               834 
##                DC      Erythroblast               GMP        HSC_-G-CSF 
##                 4               481               698                73 
##               MEP          Monocyte         Myelocyte       Neutrophils 
##               498              3796                49                 3 
##           NK_cell  Pre-B_cell_CD34-  Pro-B_cell_CD34+     Pro-Myelocyte 
##               665               758               809               459 
##           T_cells Tissue_stem_cells 
##              8401                 1

# Plot UMAP with SingleR cell type labels
DimPlot(integrated_seurat, 
        reduction = "umap", 
        group.by = "SingleR.labels", 
        label = TRUE, 
        repel = TRUE) +
  ggtitle("UMAP Plot with SingleR Cell Type Annotations") +
  theme_minimal()

6.2. Manual Annotation

6.2.1. Perform Differential Expression Analysis for Cell-Type Annotation

# Set the default assay
DefaultAssay(integrated_seurat) <- "integrated"

# Perform differential expression analysis
all_markers <- FindAllMarkers(
  object = integrated_seurat,
  assay = "integrated",
  only.pos = FALSE,
  min.pct = 0.25,
  logfc.threshold = 0.25
)

## Calculating cluster 0

## Calculating cluster 1

## Calculating cluster 2

## Calculating cluster 3

## Calculating cluster 4

## Calculating cluster 5

## Calculating cluster 6

## Calculating cluster 7

## Calculating cluster 8

## Calculating cluster 9

## Calculating cluster 10

## Calculating cluster 11

## Calculating cluster 12

## Calculating cluster 13

## Calculating cluster 14

## Calculating cluster 15

# View the top markers for each cluster
head(all_markers)

##      p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## CD7      0   2.191120 0.900 0.679         0       0  CD7
## CD3D     0   2.150795 0.928 0.677         0       0 CD3D
## KLF2     0   2.013932 0.987 0.850         0       0 KLF2
## LTB      0   1.994837 0.986 0.781         0       0  LTB
## LEF1     0   1.956435 0.765 0.622         0       0 LEF1
## TCF7     0   1.945835 0.789 0.644         0       0 TCF7

6.2.2. Use Markers from Table 2 to Identify Cell Types and Assign Names to Clusters

6.2.2.1. Define Cell Type Markers

# Define a list of marker genes for each cell type
cell_type_markers <- list(
  HSC = c("CD34", "CD38", "Sca1", "Kit"),
  LMPP = c("CD38", "CD52", "CSF3R", "ca1", "Kit", "CD34", "Flk2"),
  CLP = c("IL7R"),
  GMP_Neutrophils = c("ELANE"),
  CMP = c("IL3", "GM-CSF", "M-CSF"),
  B = c("CD19"),
  Pre_B = c("CD19", "CD34"),
  Plasma = c("SDC1", "IGHA1", "IGLC1", "MZB1", "JCHAIN"),
  CD8_T = c("CD3D", "CD3E", "CD8A", "CD8B"),
  CD4_T = c("CD3D", "CD3E", "CD4"),
  NK = c("FCGR3A", "NCAM1", "NKG7", "KLRB1"),
  Erythrocytes = c("GATA1", "HBB", "HBA1", "HBA2"),
  pDC = c("IRF8", "IRF4", "IRF7"),
  cDC = c("CD1C", "CD207", "ITGAM", "NOTCH2", "SIRPA"),
  CD14_Monocytes = c("CD14", "CCL3", "CCL4", "IL1B"),
  CD16_Monocytes = c("FCGR3A", "CD68", "S100A12"),
  Basophils = c("GATA2")
)

6.2.2.2. Create a Function to Match Clusters to Cell Types

# Function to find matching cell type for each cluster
match_cluster_to_cell_type <- function(cluster_markers, cell_type_markers) {
  cell_type_scores <- sapply(names(cell_type_markers), function(cell_type) {
    markers <- cell_type_markers[[cell_type]]
    sum(cluster_markers %in% markers)
  })
  best_match <- names(cell_type_scores)[which.max(cell_type_scores)]
  return(best_match)
}

6.2.2.3. Create a Function to Match Clusters to Cell Types

# Get list of clusters
clusters <- levels(integrated_seurat)

# Initialize a vector to store cell type annotations
cluster_annotations <- vector("character", length = length(clusters))
names(cluster_annotations) <- clusters

# Annotate clusters
for (cluster in clusters) {
  # Get top markers for this cluster
  cluster_markers <- all_markers %>%
    filter(cluster == !!cluster) %>%
    arrange(p_val_adj) %>%
    pull(gene) %>%
    unique()
  
  # Find best matching cell type
  cell_type <- match_cluster_to_cell_type(cluster_markers, cell_type_markers)
  
  # Assign cell type abbreviation (in brackets)
  cluster_annotations[cluster] <- paste0(cell_type, " (", cluster, ")")
}

# Display cluster annotations
cluster_annotations

##                     0                     1                     2 
##            "LMPP (0)"            "LMPP (1)"            "LMPP (2)" 
##                     3                     4                     5 
##    "Erythrocytes (3)"    "Erythrocytes (4)"            "LMPP (5)" 
##                     6                     7                     8 
##            "LMPP (6)"            "LMPP (7)"    "Erythrocytes (8)" 
##                     9                    10                    11 
##    "Erythrocytes (9)"           "LMPP (10)" "CD14_Monocytes (11)" 
##                    12                    13                    14 
##           "LMPP (12)"           "LMPP (13)"             "NK (14)" 
##                    15 
##   "Erythrocytes (15)"

6.2.2.4. Missing Markers in the datasets

missing_genes_merged_seurat <- setdiff(unique(unlist(cell_type_markers)), rownames(merged_seurat))
cat("Missing gene markers in unfiltered merged seurat object:", missing_genes_merged_seurat)

## Missing gene markers in unfiltered merged seurat object: Sca1 Kit ca1 Flk2 GM-CSF M-CSF IGHA1 IGLC1 JCHAIN

missing_genes_merged_seurat_filtered <- setdiff(unique(unlist(cell_type_markers)), rownames(merged_seurat_filtered))
cat("Missing gene markers in filtered merged seurat object (non-batch corrected):", missing_genes_merged_seurat)

## Missing gene markers in filtered merged seurat object (non-batch corrected): Sca1 Kit ca1 Flk2 GM-CSF M-CSF IGHA1 IGLC1 JCHAIN

missing_genes_integrated_seurat <- setdiff(unique(unlist(cell_type_markers)), rownames(integrated_seurat[["integrated"]]))
cat("Missing gene markers in filtered merged seurat object (batch-corrected):", missing_genes_integrated_seurat)

## Missing gene markers in filtered merged seurat object (batch-corrected): Sca1 Kit ca1 Flk2 IL3 GM-CSF M-CSF IGHA1 IGLC1 JCHAIN CD3E CD207

It looks like there are some marker genes that were lost in the batch correction procedure.

unique_missing_in_merged <- setdiff(missing_genes_merged_seurat_filtered, missing_genes_integrated_seurat)
unique_missing_in_integrated <- setdiff(missing_genes_integrated_seurat, missing_genes_merged_seurat_filtered)


cat("Number of genes uniquely missing in integrated_seurat:", length(unique_missing_in_integrated), "\n")

## Number of genes uniquely missing in integrated_seurat: 3

print(unique_missing_in_integrated)

## [1] "IL3"   "CD3E"  "CD207"

6.2.2.5. Update Seurat Object with Manual Annotations

# Update cluster identities
integrated_seurat <- RenameIdents(integrated_seurat, cluster_annotations)

# Alternatively, create a new metadata column for manual annotations
integrated_seurat$manual_annotations <- Idents(integrated_seurat)

6.2.2.6. Plot UMAP with Manual Annotations and Compare with Automatic Annotations

6.2.2.6.1. Manual Annotations UMAP Plot

# Plot UMAP with manual annotations
png("UMAP_Plot_with_Manual_Annotations.png", width = 800, height = 600)
DimPlot(integrated_seurat, 
        reduction = "umap", 
        group.by = "manual_annotations", 
        label = FALSE, 
        repel = TRUE) +
  ggtitle("UMAP Plot with Manual Cell Type Annotations") +
  theme_minimal()
dev.off()

## png 
##   2

6.2.6.2. Comparison with Automatic Annotations

# Plot automatic annotations
p1 <- DimPlot(integrated_seurat, 
              reduction = "umap", 
              group.by = "SingleR.labels", 
              label = FALSE, 
              repel = TRUE) +
  ggtitle("Automatic Annotations (SingleR)") +
  theme_minimal()

# Plot manual annotations
p2 <- DimPlot(integrated_seurat, 
              reduction = "umap", 
              group.by = "manual_annotations", 
              label = FALSE, 
              repel = TRUE) +
  ggtitle("Manual Annotations") +
  theme_minimal()

# Combine plots
p1 + p2

6.2.6 Visualize Gene Expression of Marker Genes

Let’s choose CD3D (T cell marker), CD19 (B cell marker), and NKG7 (NK cell marker).

6.2.2.7. Violin Plots

# Violin plots for selected genes
VlnPlot(integrated_seurat, 
        features = c("CD3D", "CD19", "NKG7"), 
        group.by = "manual_annotations", 
        pt.size = 0) +
  theme_minimal()

6.2.2.8. Feature Plots (UMAP)

# Feature plots for selected genes
FeaturePlot(integrated_seurat, 
            features = c("CD3D", "CD19", "NKG7"), 
            reduction = "umap", 
            cols = c("lightgrey", "blue")) +
  theme_minimal()

6.3. Cell-Type Proportions (Bonus)

6.3.1. Compute Cell-Type Proportions for Each Sample

# Compute cell-type proportions
cell_type_counts <- table(integrated_seurat$manual_annotations, integrated_seurat$Sample)
cell_type_proportions <- prop.table(cell_type_counts, margin = 2) * 100

# Convert to data frame for plotting
cell_type_proportions_df <- as.data.frame(cell_type_proportions)
colnames(cell_type_proportions_df) <- c("CellType", "Sample", "Proportion")

6.3.2. Plot Cell-Type Proportions

ggplot(cell_type_proportions_df, aes(x = Sample, y = Proportion, fill = CellType)) +
  geom_bar(stat = "identity", position = "fill") +
  ylab("Proportion (%)") +
  ggtitle("Cell-Type Proportions per Sample") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_viridis_d(option = "plasma")  # Options: "magma", "inferno", "plasma", "viridis", "cividis"

6.3.3. Explain Sample Variability in Cell-Type Proportions

The cell-type proportions across the four samples show a similar overall composition dominated by LMPP and Erythrocyte clusters. The BMMC samples (D1T1 and D1T2) have a slightly higher proportion of certain LMPP clusters in the top sections compared to CD34 samples (D2T1 and D3T1). Erythrocyte clusters appear more prominent in the CD34 samples, suggesting an enrichment of these cells in those samples. The lower sections contain minor cell types like CD14_Monocytes and NK cells, which are slightly more prevalent in BMMC samples. In a nutshell, BMMC samples exhibit a bit more diversity in minor cell types, while CD34 samples are more concentrated in certain clusters.

7. Differential Analysis

7.1. Differential Expression Analysis on cell-types

We will use the automatic annotations stored under the singleR.labels for the differential expression analysis.

cat("Unique cell types present:", unique(integrated_seurat$SingleR.labels))

## Unique cell types present: T_cells Monocyte B_cell Pro-B_cell_CD34+ GMP Pre-B_cell_CD34- CMP Erythroblast NK_cell MEP Pro-Myelocyte Myelocyte BM & Prog. HSC_-G-CSF BM Tissue_stem_cells Neutrophils DC

# B cells vs T cells
b_vs_t <- FindMarkers(integrated_seurat, ident.1 = "B_cell", ident.2 = "T_cells", group.by = "SingleR.labels")

# T cells vs Monocytes
t_vs_mono <- FindMarkers(integrated_seurat, ident.1 = "T_cells", ident.2 = "Monocyte", group.by = "SingleR.labels")

Volcano Plot

# Prepare Data for volcano plot
# Add -log10 adjusted p-values for each data frame
b_vs_t$logP <- -log10(b_vs_t$p_val_adj + 1e-10)  # Small value to avoid log(0)
t_vs_mono$logP <- -log10(t_vs_mono$p_val_adj + 1e-10)


# Volcano plot for B cells vs T cells
ggplot(b_vs_t, aes(x = avg_log2FC, y = logP)) +
  geom_point(aes(color = avg_log2FC > 0), alpha = 0.6) +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") +  # Significance threshold
  xlab("Log2 Fold Change (B cells vs T cells)") + 
  ylab("-Log10 Adjusted P-Value") +
  ggtitle("Differential Expression: B cells vs T cells") +
  theme_minimal() +
  scale_color_manual(values = c("blue", "red"))  # Blue for downregulated, Red for upregulated

# Volcano plot for T cells vs Monocytes
ggplot(t_vs_mono, aes(x = avg_log2FC, y = logP)) +
  geom_point(aes(color = avg_log2FC > 0), alpha = 0.6) +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") +  # Significance threshold
  xlab("Log2 Fold Change (T cells vs Monocytes)") + 
  ylab("-Log10 Adjusted P-Value") +
  ggtitle("Differential Expression: T cells vs Monocytes") +
  theme_minimal() +
  scale_color_manual(values = c("blue", "red"))

7.1.1. Memory Formation on Cells

Naive T cells are mature T cells that have not yet encountered their specific antigen, whereas memory T cells are antigen-experienced cells that persist long-term and respond more rapidly upon re-exposure to the same antigen. Memory T cells arise from naive T cells following antigen exposure, undergoing differentiation processes that enable them to provide quicker and more specific immune responses during subsequent encounters with the same pathogen.

8. Pathway Analysis

8.1. Differential Expression Analysis on groups

8.1.1. Differential Expression Analysis for BMMC vs CD34 (All Cell Types)

# Differential Expression Analysis for BMMC vs CD34 (All Cell Types)
bmmc_vs_cd34 <- FindMarkers(integrated_seurat, ident.1 = "BMMC", ident.2 = "CD34", group.by = "orig.ident")

8.1.2. Extract Top 5 DEGs

bmmc_vs_cd34$gene <- rownames(bmmc_vs_cd34)

top5_bmmc_vs_cd34 <- bmmc_vs_cd34 %>%
  arrange(p_val_adj) %>%
  head(5) %>%
  select(gene, avg_log2FC, p_val_adj)

knitr::kable(top5_bmmc_vs_cd34, caption = "Top 5 Differentially Expressed Genes (BMMC vs CD34)")

Top 5 Differentially Expressed Genes (BMMC vs CD34)
	gene	avg_log2FC
AHSP	AHSP	-1.2424964
HBM	HBM	0.2587761
HBA1	HBA1	-0.4004577
GNLY	GNLY	2.2900926
CLC	CLC	-1.4500950

8.2. Pathway analysis on groups

Idents(integrated_seurat) <- "orig.ident"

top_pathways <- DEenrichRPlot(
  object = integrated_seurat,
  ident.1 = "BMMC",
  ident.2 = "CD34",
  balanced = TRUE,            # Show both upregulated and downregulated terms
  logfc.threshold = 0.25,     # Only consider genes with a minimum log fold change
  test.use = "wilcox",        # Use Wilcoxon Rank Sum test (default)
  p.val.cutoff = 0.05,        # Only include significant DE genes (adjusted p < 0.05)
  enrich.database = "GO_Biological_Process_2021",  # Choose GO Biological Process terms
  num.pathway = 10,            # Display top 10 enriched pathways
  max.genes = 500
)

## Uploading data to Enrichr... Done.
##   Querying GO_Biological_Process_2021... Done.
## Parsing results... Done.
## Uploading data to Enrichr... Done.
##   Querying GO_Biological_Process_2021... Done.
## Parsing results... Done.

8.3. Biological interpretation

# Filter the results to include only significant genes
significant_genes <- bmmc_vs_cd34 %>%
  filter(p_val_adj < 0.05) %>%
  arrange(p_val_adj) %>%
  pull(gene)

# Use EnrichR to perform enrichment analysis on significant genes
enrichr_results <- enrichr(
  significant_genes,
  databases = "GO_Biological_Process_2021"
)

## Uploading data to Enrichr... Done.
##   Querying GO_Biological_Process_2021... Done.
## Parsing results... Done.

# Extract the GO Biological Process results and find the top pathway
go_results <- enrichr_results[["GO_Biological_Process_2021"]]

# Arrange the results by adjusted p-value and select the top pathway
top_pathway <- go_results %>%
  arrange(Adjusted.P.value) %>%
  head(1)

kable(top_pathway, 
      format = "html", 
      col.names = c("Term", "Overlap", "P.value", "Adjusted P-value", "Old P-value", "Old Adjusted P-value", "Odds Ratio", "Combined Score", "Genes"), 
      caption = "Pathway with the lowest P-value")

Pathway with the lowest P-value
Term	Overlap	P.value	Adjusted P-value	Old P-value	Old Adjusted P-value	Odds Ratio	Combined Score	Genes
neutrophil degranulation (GO:0043312)	112/481	0	0	0	0	5.88068	552.6518	FCN1;CDA;MPO;LGALS3;ANPEP;COTL1;SIRPA;CD93;SLC11A1;CYBB;CYBA;RNASE3;MIF;ILF2;OSCAR;RAB31;TYROBP;RAB37;CRISPLD2;ADAM8;PRTN3;S100A9;SLC27A2;PPIA;KPNB1;S100A8;FTL;CFD;HVCN1;STXBP2;C5AR1;FPR1;MGST1;IQGAP1;CFP;IQGAP2;GNS;SYNGR1;PRDX4;S100A12;MLEC;CD14;ELANE;S100A11;ATP8B4;JUP;GGH;AZU1;PA2G4;NIT2;SERPINA1;HSP90AB1;ITGAM;B4GALT1;MS4A3;ITGB2;CTSZ;HBB;HMGB1;ITGAL;SLC2A5;CTSS;SIRPB1;HK3;TIMP2;CTSH;CTSG;CD36;CEP290;CTSD;CTSC;CCT2;SERPINB1;HSP90AA1;CR1;FCER1G;ANXA2;RNASET2;TUBB;PLAUR;DYNLL1;TNFRSF1B;CKAP4;SERPINB6;VAMP8;FGR;CAT;GRN;CLEC12A;GSTP1;PTAFR;FGL2;RETN;CST3;ALOX5;STOM;CD59;CCT8;LAIR1;GSN;GCA;LILRB2;TUBB4B;LILRB3;TSPAN14;FABP5;IMPDH2;P2RX1;QPCT;MNDA;FOLR3;CD68

Neutrophil degranulation (GO:0043312) is the controlled release of secretory granules from neutrophils, which contain pre-stored substances like proteases, lipases, and inflammatory mediators. QuickGo

8.1.3. Differential Expression Analysis for Monocytes in BMMC vs CD34

# Subset for Monocyte cells only
monocytes <- subset(integrated_seurat, subset = SingleR.labels == "Monocyte")

# Differential Expression Analysis for Monocytes in BMMC vs CD34
monocyte_bmmc_vs_cd34 <- FindMarkers(monocytes, ident.1 = "BMMC", ident.2 = "CD34", group.by = "orig.ident")

monocyte_bmmc_vs_cd34$gene <- rownames(monocyte_bmmc_vs_cd34)
                                       
# Extract Top 5 DEGs by p-value for Monocytes
top5_monocyte_bmmc_vs_cd34 <- monocyte_bmmc_vs_cd34 %>%
  arrange(p_val_adj) %>%
  head(5) %>%
  select(gene, avg_log2FC, p_val_adj)

knitr::kable(top5_monocyte_bmmc_vs_cd34, caption = "Top 5 Differentially Expressed Genes in Monocytes (BMMC vs CD34)")

Top 5 Differentially Expressed Genes in Monocytes (BMMC vs CD34)
	gene	avg_log2FC
IFI44L	IFI44L	0.2763581
FCN1	FCN1	1.2554231
SCT	SCT	-0.4626762
HBA1	HBA1	0.7003820
CLDN10	CLDN10	-0.2761770

writeLines(capture.output(sessionInfo()), "sessionInfo.txt")