library(Seurat)
## Warning: package 'Seurat' was built under R version 4.3.3
## Loading required package: SeuratObject
## Warning: package 'SeuratObject' was built under R version 4.3.2
## Loading required package: sp
## Warning: package 'sp' was built under R version 4.3.3
##
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
##
## intersect, t
library(SeuratDisk)
## Registered S3 method overwritten by 'SeuratDisk':
## method from
## as.sparse.H5Group Seurat
library(SeuratWrappers)
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.3.3
library(harmony)
## Warning: package 'harmony' was built under R version 4.3.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.3.3
library(reshape2)
library(RColorBrewer)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
base_path <- "/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data/"
output_path <- "/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data/"
# Dataset folders
folders <- c("DS1", "DS2", "DS3", "DS4")
full_paths <- file.path(base_path, folders)
full_paths
## [1] "/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data//DS1"
## [2] "/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data//DS2"
## [3] "/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data//DS3"
## [4] "/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data//DS4"
#Load dataset DS1
DS1 <- Read10X("/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data/DS1")
Seurat.DS1 <- CreateSeuratObject(DS1, project = "DS1")
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
Seurat.DS1
## An object of class Seurat
## 33694 features across 4672 samples within 1 assay
## Active assay: RNA (33694 features, 0 variable features)
## 1 layer present: counts
#Load dataset DS3
DS3 <- Read10X("/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data/DS3")
Seurat.DS3 <- CreateSeuratObject(DS3, project = "DS3")
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
Seurat.DS3
## An object of class Seurat
## 33694 features across 4933 samples within 1 assay
## Active assay: RNA (33694 features, 0 variable features)
## 1 layer present: counts
#Add Mitochondrial QC Info
Seurat.DS1[["percent.mt"]] <- PercentageFeatureSet(Seurat.DS1, pattern = "^MT-")
head(Seurat.DS1@meta.data)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAATGTTG-1 DS1 16752 3287 1.098376
## AAACCTGAGAGCCTAG-1 DS1 13533 3399 1.950787
## AAACCTGAGTAATCCC-1 DS1 3098 1558 2.453196
## AAACCTGCACACTGCG-1 DS1 5158 2015 3.761148
## AAACCTGCATCGGAAG-1 DS1 6966 2322 2.182027
## AAACCTGGTGGTAACG-1 DS1 5177 2247 5.176743
Seurat.DS3[["percent.mt"]] <- PercentageFeatureSet(Seurat.DS3, pattern = "^MT-")
head(Seurat.DS3@meta.data)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGACTCGGA-1 DS3 10261 3146 3.216061
## AAACCTGAGAGTGAGA-1 DS3 7065 2415 2.165605
## AAACCTGCACAAGCCC-1 DS3 4007 1780 2.096331
## AAACCTGCAGATTGCT-1 DS3 4497 1793 2.112519
## AAACCTGCAGGATTGG-1 DS3 4685 1641 2.902882
## AAACCTGGTTAGTGGG-1 DS3 10039 2998 2.510210
#Visualize QC Metrics
VlnPlot(Seurat.DS1, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"))
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

VlnPlot(Seurat.DS3, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"))
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

library(Seurat)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(patchwork)
# Load and prepare both datasets
ds_paths <- list(
DS1 = "/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data/DS1",
DS3 = "/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data/DS3"
)
seurat_list <- lapply(names(ds_paths), function(name) {
data <- Read10X(ds_paths[[name]])
rownames(data) <- make.unique(rownames(data)) # Ensure no duplicate gene names
obj <- CreateSeuratObject(counts = data, project = name, min.cells = 3, min.features = 200)
obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^MT-")
# QC filtering
obj <- subset(obj, subset = nFeature_RNA > 200 & nFeature_RNA < 6000 & percent.mt < 10)
# Normalize and find variable genes
obj <- NormalizeData(obj)
obj <- FindVariableFeatures(obj)
return(obj)
})
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Normalizing layer: counts
## Finding variable features for layer counts
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Normalizing layer: counts
## Finding variable features for layer counts
names(seurat_list) <- names(ds_paths)
# Integrate
anchors <- FindIntegrationAnchors(object.list = seurat_list, dims = 1:30)
## Warning in CheckDuplicateCellNames(object.list = object.list): Some cell names
## are duplicated across objects provided. Renaming to enforce unique cell names.
## Computing 2000 integration features
## Scaling features for provided objects
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 11013 anchors
## Filtering anchors
## Retained 5563 anchors
integrated <- IntegrateData(anchorset = anchors, dims = 1:30)
## Merging dataset 1 into 2
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Warning: Layer counts isn't present in the assay object; returning NULL
# Downstream steps
DefaultAssay(integrated) <- "integrated"
integrated <- ScaleData(integrated)
## Centering and scaling data matrix
integrated <- RunPCA(integrated)
## PC_ 1
## Positive: SFRP1, VIM, SOX2, FABP7, HES1, PTN, TTYH1, ID4, RCN1, HMGN2
## SOX3, HMGB2, TUBA1B, HSPB1, GNG5, DOK5, MDK, PHGDH, SMC4, CLU
## CD99, RPS27L, H2AFZ, GAPDH, DEK, ECI1, NUSAP1, NES, TYMS, KIAA0101
## Negative: STMN2, NEUROD2, RTN1, GAP43, SYT1, HMP19, SOX4, NEUROD6, GPM6A, CNTNAP2
## TMSB10, MAPT, MLLT11, RAB3A, DCX, CD24, NSG1, PCSK1N, RBFOX2, TBR1
## SEZ6L2, CRMP1, SLA, STMN4, SHTN1, SCG3, THRA, TTC9B, CAMK2N1, TUBA1A
## PC_ 2
## Positive: RPS19, FTL, C1orf61, RPL12, RPS6, RPS27L, RPS16, RPL41, FABP7, NHLH1
## BTG1, TMSB4X, TPT1, PRDX1, EZR, NAP1L1, NKAIN4, EEF1D, CCND1, TKTL1
## DOK5, ENO1, TAC3, RPLP1, IFITM3, FAM60A, GAPDH, HES1, CALD1, LINC01021
## Negative: UBE2C, TOP2A, MKI67, GTSE1, TPX2, CCNA2, CENPF, FAM64A, ASPM, AURKB
## NUF2, SGOL2, CCNB1, CDCA3, CDK1, NUSAP1, PBK, PLK1, CDC20, BIRC5
## CDKN3, PRC1, AURKA, NEK2, KIF23, KIF2C, DLGAP5, NDC80, CENPE, SGOL1
## PC_ 3
## Positive: TTYH1, HOPX, HES4, PEA15, IGFBP5, CDH7, PTN, VIM, FGFBP3, ITM2C
## LMO3, FAM181B, MEF2C, DACT1, DOK6, IFITM3, SPARC, DUT, FAM107A, EEF1B2
## MCM3, GINS2, TMEM161B-AS1, PCNA, VCAN, LRRC3B, TMEM106C, VSNL1, PLAT, LRRC1
## Negative: NHLH1, EOMES, HES6, SOX11, ARL4D, PRDX1, RCOR2, SSTR2, ELAVL2, INSM1
## TAGLN3, ELAVL4, ENC1, PPP1R17, DUOX1, EZR, HN1, TAC3, RP4-665J23.1, BHLHE22
## SLCO5A1, NEUROD1, GADD45G, NXPH4, DLL3, NPPA, NEUROD4, PLK1, TMSB4X, PTMS
## PC_ 4
## Positive: PLK1, CDC20, CCNB1, CCNA1, CENPA, NEK2, CCNB2, PIF1, TROAP, NMU
## VIM, AURKA, BUB1, GAS2L3, ARL6IP1, DLGAP5, PSRC1, BNIP3, CENPE, CDC25B
## TTYH1, SAPCD2, ASPM, CEP70, HMMR, LINC01551, BORA, TUBA1C, KNSTRN, NUDCD2
## Negative: MYBL2, KIAA0101, CLSPN, DHFR, TYMS, RRM2, CDC6, TK1, CCNE2, PCNA
## TUBB, CDC45, FEN1, MCM10, E2F1, HELLS, ATAD2, CHAF1A, RAD51AP1, CENPU
## GINS2, ESCO2, MCM7, CENPK, RNASEH2A, DNAJC9, ORC6, DTL, PKMYT1, ZWINT
## PC_ 5
## Positive: NEUROG1, HES5, NEUROD4, HES6, EOMES, RPS16, RPS19, VSNL1, CDH13, MFNG
## CDH7, HSPA12A, EEF1B2, RPLP1, MYCL, NEGR1, LMO3, MEF2C, DACT1, RGS16
## RBPJ, ELAVL2, SORCS1, ADCY1, CLSTN2, PPP1R1B, CPNE6, TRIL, STXBP6, DLL1
## Negative: TUBB2B, TUBB2A, PTPRZ1, TUBA1A, BHLHE22, FRMD4B, C1orf61, CSRP2, PPP2R2B, NEUROD6
## MSMO1, ACAT2, CTTNBP2, SLA, PEA15, HMGCS1, TUBB, HOPX, NXPH4, CEMIP
## FAM107A, FGFBP3, C4orf48, GPM6B, SH3BP5, SHB, SYT4, CD82, CALM2, ID2
integrated <- RunUMAP(integrated, dims = 1:30)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 12:35:13 UMAP embedding parameters a = 0.9922 b = 1.112
## 12:35:13 Read 9605 rows and found 30 numeric columns
## 12:35:13 Using Annoy for neighbor search, n_neighbors = 30
## 12:35:13 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 12:35:14 Writing NN index file to temp file /var/folders/fj/btmxy73n74zgn38kbjwfv7zh0000gn/T//RtmpRimLsm/file1495e6778017d
## 12:35:14 Searching Annoy index using 1 thread, search_k = 3000
## 12:35:16 Annoy recall = 100%
## 12:35:16 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 12:35:16 Initializing from normalized Laplacian + noise (using RSpectra)
## 12:35:17 Commencing optimization for 500 epochs, with 417692 positive edges
## 12:35:17 Using rng type: pcg
## 12:35:29 Optimization finished
# Plot UMAP
DimPlot(integrated, group.by = "orig.ident", label = TRUE) + ggtitle("UMAP: DS1 + DS3")

saveRDS(integrated, file = 'integrated.DS1.DS3.rds')
# Display the list of Seurat objects
seurat_list
## $DS1
## An object of class Seurat
## 19315 features across 4672 samples within 1 assay
## Active assay: RNA (19315 features, 2000 variable features)
## 2 layers present: counts, data
##
## $DS3
## An object of class Seurat
## 17829 features across 4933 samples within 1 assay
## Active assay: RNA (17829 features, 2000 variable features)
## 2 layers present: counts, data
# Pre-integration: Merge DS1 and DS3
merged_raw <- merge(seurat_list[[1]], y = seurat_list[[2]], add.cell.ids = names(seurat_list))
merged_raw <- NormalizeData(merged_raw)
## Normalizing layer: counts.DS1
## Normalizing layer: counts.DS3
merged_raw <- FindVariableFeatures(merged_raw)
## Finding variable features for layer counts.DS1
## Finding variable features for layer counts.DS3
merged_raw <- ScaleData(merged_raw)
## Centering and scaling data matrix
merged_raw <- RunPCA(merged_raw)
## PC_ 1
## Positive: VIM, HMGB2, SOX2, HES1, SMC4, TTYH1, GNG5, NUSAP1, SFRP1, FABP7
## HMGN2, ZFP36L1, PTN, SOX3, CKS2, KIAA0101, PHGDH, TUBA1B, TYMS, RCN1
## CDK1, CKS1B, TOP2A, RPS27L, MKI67, PTTG1, CENPF, HSPB1, FGFBP3, BTG3
## Negative: STMN2, RTN1, HMP19, GAP43, MAPT, GPM6A, MLLT11, DCX, RAB3A, SOX4
## SYT1, PCSK1N, TMSB10, RBFOX2, NSG1, SEZ6L2, CRMP1, CD24, SCG3, TTC9B
## VAMP2, NEUROD2, CAMK2N1, APLP1, STMN4, NRXN1, GRIA2, INA, TUBA1A, BEX1
## PC_ 2
## Positive: RPS19, RPL41, RPS6, RPS16, RPL12, C1orf61, RPS27L, FABP7, FTL, GAPDH
## EEF1D, DOK5, MDK, RPLP1, IFITM3, TPT1, NAP1L1, CCND1, ENO1, HES1
## CLU, SLC1A3, TKTL1, FGFBP3, RCN1, HSPB1, SFRP1, PTN, MYO10, NKAIN4
## Negative: UBE2C, CCNA2, PLK1, TOP2A, GTSE1, CCNB1, ASPM, SGOL2, CDCA8, NUF2
## MKI67, CENPF, CDC20, TPX2, CENPE, AURKB, NEK2, DLGAP5, AURKA, FAM64A
## CDCA3, KIF2C, CDKN3, CKAP2L, PIF1, PBK, CENPA, HJURP, KIF23, PRC1
## PC_ 3
## Positive: NFIB, NFIA, NEUROD6, NEUROD2, LINC01551, CSRP2, SLA, TBR1, C1orf61, SOX5
## BCL11B, ZBTB18, CNTNAP2, STK17A, BCL11A, PTPRZ1, ABRACL, CKB, SSTR2, FEZF2
## TMSB4X, SYNE2, RPS6, CTTNBP2, FOXG1, NXPH4, SLC17A7, PDE1A, EMX1, CLMP
## Negative: NR2F1, BTG1, NR2F2, MEST, PBX3, CXXC4, SCG5, LY6H, TCEAL7, LRRC4C
## NEFL, NAP1L3, PPP1R1A, ZFHX3, AP1S2, SLC32A1, CCDC144NL-AS1, RIC3, NTRK2, KCNQ1OT1
## TSHZ2, GAD1, CRNDE, ZNF503, ZFHX4, PAK3, TXNIP, FOS, MT-ND5, GAD2
## PC_ 4
## Positive: NHLH1, SOX11, HES6, EOMES, INSM1, ELAVL2, TAGLN3, GADD45G, RCOR2, TAC3
## ARL4D, ENC1, PTMS, SOX4, ELAVL4, CD24, DLL3, NEUROD1, DUOX1, PCBP4
## SSTR2, BTG1, PLK1, PRDX1, PPP1R17, CHD7, NEUROD4, C21orf59, GADD45A, MLLT11
## Negative: TTYH1, PEA15, PTN, HES4, IGFBP5, DUT, GINS2, EEF1B2, VIM, KIAA0101
## FGFBP3, SPARC, CLU, MCM3, HOPX, TYMS, PCNA, TMEM106C, VSTM2L, CDH7
## MEF2C, FAM181B, DHFR, VSNL1, HELLS, GGCT, FEN1, PRKCB, MCM4, ZFP36L1
## PC_ 5
## Positive: MYBL2, KIAA0101, CLSPN, PCNA, DHFR, TYMS, GINS2, HELLS, MCM7, TUBB
## E2F1, FEN1, CENPU, RRM2, CCNE2, TK1, CDC6, ATAD2, CHAF1A, CDC45
## DNAJC9, MCM10, RAD51AP1, ZWINT, MCM3, RNASEH2A, ORC6, CENPK, DTL, ESCO2
## Negative: NTRK2, EDNRB, IGFBP5, PLP1, CTGF, VIM, SPARC, ID3, TTYH1, SFRP2
## ENPP2, ID2, CCNA1, P4HA1, PLK1, NEK2, IFI6, MGST1, WLS, ZFP36L1
## MASP1, PIF1, CDC20, CCNB1, CXCR4, PDE1A, SAT1, DDIT4, CENPA, RFX4
merged_raw <- RunUMAP(merged_raw, dims = 1:20)
## 12:36:25 UMAP embedding parameters a = 0.9922 b = 1.112
## 12:36:25 Read 9605 rows and found 20 numeric columns
## 12:36:25 Using Annoy for neighbor search, n_neighbors = 30
## 12:36:25 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 12:36:25 Writing NN index file to temp file /var/folders/fj/btmxy73n74zgn38kbjwfv7zh0000gn/T//RtmpRimLsm/file1495e5863b5b1
## 12:36:25 Searching Annoy index using 1 thread, search_k = 3000
## 12:36:27 Annoy recall = 100%
## 12:36:27 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 12:36:28 Initializing from normalized Laplacian + noise (using RSpectra)
## 12:36:28 Commencing optimization for 500 epochs, with 393456 positive edges
## 12:36:28 Using rng type: pcg
## 12:36:40 Optimization finished
# Batch effect visualization: Before vs After Integration
p1 <- DimPlot(merged_raw, group.by = "orig.ident") + ggtitle("Before Integration")
p2 <- DimPlot(integrated, group.by = "orig.ident") + ggtitle("After Integration")
batch_plot <- p1 + p2
batch_plot

# Save the batch comparison plot
ggsave(filename = file.path(output_path, "Batch_Effect_Comparison.png"), plot = batch_plot, width = 12)
## Saving 12 x 5 in image
# Clustering
integrated <- FindNeighbors(integrated, dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
integrated <- FindClusters(integrated, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 9605
## Number of edges: 394628
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8896
## Number of communities: 15
## Elapsed time: 0 seconds
# UMAP colored by cluster
cluster_plot <- DimPlot(integrated, reduction = "umap", group.by = "seurat_clusters", label = TRUE) +
ggtitle("Clusters After Integration")
cluster_plot

# Save UMAP plot
ggsave(filename = file.path(output_path, "Clusters_UMAP.png"),
plot = cluster_plot, width = 8, height = 6)
# Marker Gene Detection
markers <- FindAllMarkers(integrated, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.2, drop = FALSE]): NaNs produced
## Calculating cluster 1
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 2
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 3
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 4
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 5
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 6
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 7
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 8
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 9
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 10
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 11
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 12
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 13
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 14
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
# Get top 10 marker genes per cluster
top2 <- markers %>%
group_by(cluster) %>%
slice_max(order_by = avg_log2FC, n = 2)
# Heatmap of top markers
heatmap <- DoHeatmap(integrated, features = top2$gene) + NoLegend()
heatmap

# Save heatmap
ggsave(filename = file.path(output_path, "Top2_Markers_Heatmap.png"),
plot = heatmap, width = 12, height = 8)
# Save all marker genes to a CSV file
write.csv(markers,
file = file.path(output_path, "All_Marker_Genes.csv"),
row.names = FALSE)
# check metadata in integrated data
head(integrated@meta.data, 5)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAATGTTG-1_1 DS1 16752 3287 1.098376
## AAACCTGAGAGCCTAG-1_1 DS1 13533 3399 1.950787
## AAACCTGAGTAATCCC-1_1 DS1 3097 1557 2.453988
## AAACCTGCACACTGCG-1_1 DS1 5158 2015 3.761148
## AAACCTGCATCGGAAG-1_1 DS1 6964 2320 2.182654
## integrated_snn_res.0.5 seurat_clusters
## AAACCTGAGAATGTTG-1_1 8 8
## AAACCTGAGAGCCTAG-1_1 8 8
## AAACCTGAGTAATCCC-1_1 1 1
## AAACCTGCACACTGCG-1_1 4 4
## AAACCTGCATCGGAAG-1_1 1 1
table(integrated@meta.data$orig.ident)
##
## DS1 DS3
## 4672 4933
# Required packages
library(dplyr)
library(ggplot2)
# Convert metadata to a true data frame (not a tibble or environment object)
meta_df <- as.data.frame(integrated@meta.data)
# Confirm column names (double check!)
colnames(meta_df)
## [1] "orig.ident" "nCount_RNA" "nFeature_RNA"
## [4] "percent.mt" "integrated_snn_res.0.5" "seurat_clusters"
library(dplyr)
# Count cells per cluster per sample
cluster_counts <- meta_df %>%
dplyr::count(seurat_clusters, orig.ident)
# Rename count column
colnames(cluster_counts)[3] <- "cell_count"
# View result
head(cluster_counts)
## seurat_clusters orig.ident cell_count
## 1 0 DS1 817
## 2 0 DS3 794
## 3 1 DS1 487
## 4 1 DS3 698
## 5 2 DS1 296
## 6 2 DS3 830
# Compute fraction of cells per cluster in each sample
cluster_fractions <- cluster_counts %>%
group_by(orig.ident) %>%
mutate(fraction = cell_count / sum(cell_count)) %>%
ungroup()
# Preview the result
head(cluster_fractions)
## # A tibble: 6 × 4
## seurat_clusters orig.ident cell_count fraction
## <fct> <chr> <int> <dbl>
## 1 0 DS1 817 0.175
## 2 0 DS3 794 0.161
## 3 1 DS1 487 0.104
## 4 1 DS3 698 0.141
## 5 2 DS1 296 0.0634
## 6 2 DS3 830 0.168
# in percentages
cluster_fractions.percentages <- cluster_fractions %>%
mutate(percent = round(fraction * 100, 1))
cluster_fractions.percentages
## # A tibble: 30 × 5
## seurat_clusters orig.ident cell_count fraction percent
## <fct> <chr> <int> <dbl> <dbl>
## 1 0 DS1 817 0.175 17.5
## 2 0 DS3 794 0.161 16.1
## 3 1 DS1 487 0.104 10.4
## 4 1 DS3 698 0.141 14.1
## 5 2 DS1 296 0.0634 6.3
## 6 2 DS3 830 0.168 16.8
## 7 3 DS1 365 0.0781 7.8
## 8 3 DS3 489 0.0991 9.9
## 9 4 DS1 687 0.147 14.7
## 10 4 DS3 102 0.0207 2.1
## # ℹ 20 more rows
####
#Plot Cell Fractions (Stacked Barplot)
ggplot(cluster_fractions, aes(x = orig.ident, y = fraction, fill = seurat_clusters)) +
geom_bar(stat = "identity", position = "fill") +
labs(title = "Cell Fraction per Cluster per Sample",
y = "Fraction", x = "Sample", fill = "Cluster") +
theme_minimal()

#Save as CSV
write.csv(cluster_fractions, file = file.path(output_path, "Cell_Fractions_Per_Cluster_Per_Sample.csv"), row.names = FALSE)
ggplot(cluster_fractions, aes(x = factor(seurat_clusters), y = fraction, fill = orig.ident)) +
geom_bar(stat = "identity", position = "fill") +
labs(
title = "Cell Fractions per Sample per Cluster",
x = "Cluster", y = "Fraction of Cells", fill = "Sample"
) +
theme_minimal()

ggplot(cluster_fractions, aes(x = factor(seurat_clusters), y = fraction, fill = orig.ident)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Cell Fractions per Sample per Cluster (Side-by-Side)",
x = "Cluster", y = "Fraction of Cells", fill = "Sample"
) +
theme_minimal()

saveRDS(integrated, file = file.path(output_path, "integrated_seurat_object.rds"))
library(harmony)
seurat <- RunHarmony(integrated, group.by.vars = "orig.ident", dims.use = 1:20, max.iter.harmony = 50)
## Warning: Warning: The parameter max.iter.harmony is replaced with parameter max_iter. It will be ignored for this function call and please use parameter max_iter in future function calls.
## This warning is displayed once per session.
## Transposing data matrix
## Initializing state using k-means centroids initialization
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony converged after 3 iterations
seurat <- RunUMAP(seurat, reduction = "harmony", dims = 1:20)
## 12:37:37 UMAP embedding parameters a = 0.9922 b = 1.112
## 12:37:37 Read 9605 rows and found 20 numeric columns
## 12:37:37 Using Annoy for neighbor search, n_neighbors = 30
## 12:37:37 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 12:37:38 Writing NN index file to temp file /var/folders/fj/btmxy73n74zgn38kbjwfv7zh0000gn/T//RtmpRimLsm/file1495e4d244eb4
## 12:37:38 Searching Annoy index using 1 thread, search_k = 3000
## 12:37:40 Annoy recall = 100%
## 12:37:40 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 12:37:41 Initializing from normalized Laplacian + noise (using RSpectra)
## 12:37:41 Commencing optimization for 500 epochs, with 407406 positive edges
## 12:37:41 Using rng type: pcg
## 12:37:53 Optimization finished
seurat <- FindNeighbors(seurat, reduction = "harmony", dims = 1:20) %>% FindClusters(resolution = 0.6)
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 9605
## Number of edges: 362544
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8799
## Number of communities: 16
## Elapsed time: 0 seconds
# You may also want to save the object
saveRDS(seurat, file="integrated_harmony.rds")
#UMAP plot for features after batch effect
plot1 <- UMAPPlot(seurat, group.by="orig.ident")
plot2 <- UMAPPlot(seurat, label = T)
plot3 <- FeaturePlot(seurat, c("FOXG1","EMX1","DLX2","LHX9"), ncol=2, pt.size = 0.1)
((plot1 / plot2) | plot3) + plot_layout(width = c(1,2))

new_ident <- setNames(c("Dorsal telen. NPC",
"Midbrain-hindbrain boundary neuron",
"Dorsal telen. neuron",
"Dien. and midbrain excitatory neuron",
"MGE-like neuron","G2M dorsal telen. NPC",
"Dorsal telen. IP","Dien. and midbrain NPC",
"Dien. and midbrain IP and excitatory early neuron",
"G2M Dien. and midbrain NPC",
"G2M dorsal telen. NPC",
"Dien. and midbrain inhibitory neuron",
"Dien. and midbrain IP and early inhibitory neuron",
"Ventral telen. neuron",
"Unknown 1",
"Unknown 2"),
levels(seurat))
seurat <- RenameIdents(seurat, new_ident)
DimPlot(seurat, reduction = "umap", label = TRUE) + NoLegend()

DimPlot(seurat, reduction = "umap", label = TRUE, split.by = 'orig.ident', ncol = 1)

head(seurat@meta.data)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAATGTTG-1_1 DS1 16752 3287 1.098376
## AAACCTGAGAGCCTAG-1_1 DS1 13533 3399 1.950787
## AAACCTGAGTAATCCC-1_1 DS1 3097 1557 2.453988
## AAACCTGCACACTGCG-1_1 DS1 5158 2015 3.761148
## AAACCTGCATCGGAAG-1_1 DS1 6964 2320 2.182654
## AAACCTGGTGGTAACG-1_1 DS1 5176 2246 5.177743
## integrated_snn_res.0.5 seurat_clusters
## AAACCTGAGAATGTTG-1_1 8 10
## AAACCTGAGAGCCTAG-1_1 8 11
## AAACCTGAGTAATCCC-1_1 1 1
## AAACCTGCACACTGCG-1_1 4 1
## AAACCTGCATCGGAAG-1_1 1 3
## AAACCTGGTGGTAACG-1_1 9 9
## integrated_snn_res.0.6
## AAACCTGAGAATGTTG-1_1 10
## AAACCTGAGAGCCTAG-1_1 11
## AAACCTGAGTAATCCC-1_1 1
## AAACCTGCACACTGCG-1_1 1
## AAACCTGCATCGGAAG-1_1 3
## AAACCTGGTGGTAACG-1_1 9
saveRDS(seurat, file = 'seurat.after.celltype.annotaion.rds')
# Define your cluster IDs (0–15) and cell type names
annotations <- c(
"Dorsal telen. NPC",
"Midbrain-hindbrain boundary neuron",
"Dorsal telen. neuron",
"Dien. and midbrain excitatory neuron",
"MGE-like neuron",
"G2M dorsal telen. NPC",
"Dorsal telen. IP",
"Dien. and midbrain NPC",
"Dien. and midbrain IP and excitatory early neuron",
"G2M Dien. and midbrain NPC",
"G2M dorsal telen. NPC",
"Dien. and midbrain inhibitory neuron",
"Dien. and midbrain IP and early inhibitory neuron",
"Ventral telen. neuron",
"Unknown 1",
"Unknown 2")
# Combine cluster ID with annotation
cluster_ids <- 0:14
combined_labels <- paste0(cluster_ids, ": ", annotations)
names(combined_labels) <- as.character(cluster_ids) # Cluster IDs must be character
# Apply the combined labels to Seurat clusters
integrated.annotated <- RenameIdents(integrated, combined_labels)
## Warning: Cannot find identity NA
head(Idents(integrated.annotated))
## AAACCTGAGAATGTTG-1_1
## 8: Dien. and midbrain IP and excitatory early neuron
## AAACCTGAGAGCCTAG-1_1
## 8: Dien. and midbrain IP and excitatory early neuron
## AAACCTGAGTAATCCC-1_1
## 1: Midbrain-hindbrain boundary neuron
## AAACCTGCACACTGCG-1_1
## 4: MGE-like neuron
## AAACCTGCATCGGAAG-1_1
## 1: Midbrain-hindbrain boundary neuron
## AAACCTGGTGGTAACG-1_1
## 9: G2M Dien. and midbrain NPC
## 15 Levels: 0: Dorsal telen. NPC ... 14: Unknown 1
#save Idents as Celltype in metadata of integrated.annotated object
integrated.annotated@meta.data$Celltype <- Idents(integrated.annotated)
table(integrated.annotated@meta.data$seurat_clusters, integrated.annotated@meta.data$Celltype )
##
## 0: Dorsal telen. NPC 1: Midbrain-hindbrain boundary neuron
## 0 1611 0
## 1 0 1185
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## 7 0 0
## 8 0 0
## 9 0 0
## 10 0 0
## 11 0 0
## 12 0 0
## 13 0 0
## 14 0 0
##
## 2: Dorsal telen. neuron 3: Dien. and midbrain excitatory neuron
## 0 0 0
## 1 0 0
## 2 1126 0
## 3 0 854
## 4 0 0
## 5 0 0
## 6 0 0
## 7 0 0
## 8 0 0
## 9 0 0
## 10 0 0
## 11 0 0
## 12 0 0
## 13 0 0
## 14 0 0
##
## 4: MGE-like neuron 5: G2M dorsal telen. NPC 6: Dorsal telen. IP
## 0 0 0 0
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 789 0 0
## 5 0 743 0
## 6 0 0 672
## 7 0 0 0
## 8 0 0 0
## 9 0 0 0
## 10 0 0 0
## 11 0 0 0
## 12 0 0 0
## 13 0 0 0
## 14 0 0 0
##
## 7: Dien. and midbrain NPC
## 0 0
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 637
## 8 0
## 9 0
## 10 0
## 11 0
## 12 0
## 13 0
## 14 0
##
## 8: Dien. and midbrain IP and excitatory early neuron
## 0 0
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
## 8 556
## 9 0
## 10 0
## 11 0
## 12 0
## 13 0
## 14 0
##
## 9: G2M Dien. and midbrain NPC 10: G2M dorsal telen. NPC
## 0 0 0
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## 7 0 0
## 8 0 0
## 9 444 0
## 10 0 417
## 11 0 0
## 12 0 0
## 13 0 0
## 14 0 0
##
## 11: Dien. and midbrain inhibitory neuron
## 0 0
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
## 8 0
## 9 0
## 10 0
## 11 244
## 12 0
## 13 0
## 14 0
##
## 12: Dien. and midbrain IP and early inhibitory neuron
## 0 0
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
## 8 0
## 9 0
## 10 0
## 11 0
## 12 151
## 13 0
## 14 0
##
## 13: Ventral telen. neuron 14: Unknown 1
## 0 0 0
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## 7 0 0
## 8 0 0
## 9 0 0
## 10 0 0
## 11 0 0
## 12 0 0
## 13 107 0
## 14 0 69
#save table Cells per Celltypes
Table <- table(integrated.annotated@meta.data$seurat_clusters, integrated.annotated@meta.data$Celltype )
write.csv(Table, file = 'Table.cells.per.celltypes.csv', row.names = FALSE)