Srivastava.Session.PART-2 scRNA-seq Dataset Integration

library(Seurat)

## Warning: package 'Seurat' was built under R version 4.3.3

## Loading required package: SeuratObject

## Warning: package 'SeuratObject' was built under R version 4.3.2

## Loading required package: sp

## Warning: package 'sp' was built under R version 4.3.3

## 
## Attaching package: 'SeuratObject'

## The following objects are masked from 'package:base':
## 
##     intersect, t

library(SeuratDisk)

## Registered S3 method overwritten by 'SeuratDisk':
##   method            from  
##   as.sparse.H5Group Seurat

library(SeuratWrappers)
library(patchwork)

## Warning: package 'patchwork' was built under R version 4.3.3

library(harmony)

## Warning: package 'harmony' was built under R version 4.3.3

## Loading required package: Rcpp

## Warning: package 'Rcpp' was built under R version 4.3.3

library(reshape2)
library(RColorBrewer)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

base_path <- "/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data/"
output_path <- "/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data/"
# Dataset folders
folders <- c("DS1", "DS2", "DS3", "DS4")
full_paths <- file.path(base_path, folders)
full_paths

## [1] "/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data//DS1"
## [2] "/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data//DS2"
## [3] "/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data//DS3"
## [4] "/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data//DS4"

#Load dataset DS1
DS1 <- Read10X("/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data/DS1")
Seurat.DS1   <- CreateSeuratObject(DS1, project = "DS1")

## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')

Seurat.DS1

## An object of class Seurat 
## 33694 features across 4672 samples within 1 assay 
## Active assay: RNA (33694 features, 0 variable features)
##  1 layer present: counts

#Load dataset DS3
DS3 <- Read10X("/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data/DS3")
Seurat.DS3   <- CreateSeuratObject(DS3, project = "DS3")

## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')

Seurat.DS3

## An object of class Seurat 
## 33694 features across 4933 samples within 1 assay 
## Active assay: RNA (33694 features, 0 variable features)
##  1 layer present: counts

#Add Mitochondrial QC Info
Seurat.DS1[["percent.mt"]] <- PercentageFeatureSet(Seurat.DS1, pattern = "^MT-")
head(Seurat.DS1@meta.data)

##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAATGTTG-1        DS1      16752         3287   1.098376
## AAACCTGAGAGCCTAG-1        DS1      13533         3399   1.950787
## AAACCTGAGTAATCCC-1        DS1       3098         1558   2.453196
## AAACCTGCACACTGCG-1        DS1       5158         2015   3.761148
## AAACCTGCATCGGAAG-1        DS1       6966         2322   2.182027
## AAACCTGGTGGTAACG-1        DS1       5177         2247   5.176743

Seurat.DS3[["percent.mt"]] <- PercentageFeatureSet(Seurat.DS3, pattern = "^MT-")
head(Seurat.DS3@meta.data)

##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGACTCGGA-1        DS3      10261         3146   3.216061
## AAACCTGAGAGTGAGA-1        DS3       7065         2415   2.165605
## AAACCTGCACAAGCCC-1        DS3       4007         1780   2.096331
## AAACCTGCAGATTGCT-1        DS3       4497         1793   2.112519
## AAACCTGCAGGATTGG-1        DS3       4685         1641   2.902882
## AAACCTGGTTAGTGGG-1        DS3      10039         2998   2.510210

#Visualize QC Metrics
VlnPlot(Seurat.DS1, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"))

## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

VlnPlot(Seurat.DS3, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"))

## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

library(Seurat)
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.2

library(patchwork)

# Load and prepare both datasets
ds_paths <- list(
  DS1 = "/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data/DS1",
  DS3 = "/Users/usri/Desktop/scRNAseq_analysis_april5.2025/data/DS3"
)

seurat_list <- lapply(names(ds_paths), function(name) {
  data <- Read10X(ds_paths[[name]])
  rownames(data) <- make.unique(rownames(data))  # Ensure no duplicate gene names
  obj <- CreateSeuratObject(counts = data, project = name, min.cells = 3, min.features = 200)
  obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^MT-")

  # QC filtering
  obj <- subset(obj, subset = nFeature_RNA > 200 & nFeature_RNA < 6000 & percent.mt < 10)

  # Normalize and find variable genes
  obj <- NormalizeData(obj)
  obj <- FindVariableFeatures(obj)

  return(obj)
})

## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')

## Normalizing layer: counts

## Finding variable features for layer counts

## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')

## Normalizing layer: counts
## Finding variable features for layer counts

names(seurat_list) <- names(ds_paths)

# Integrate
anchors <- FindIntegrationAnchors(object.list = seurat_list, dims = 1:30)

## Warning in CheckDuplicateCellNames(object.list = object.list): Some cell names
## are duplicated across objects provided. Renaming to enforce unique cell names.

## Computing 2000 integration features

## Scaling features for provided objects

## Finding all pairwise anchors

## Running CCA

## Merging objects

## Finding neighborhoods

## Finding anchors

##  Found 11013 anchors

## Filtering anchors

##  Retained 5563 anchors

integrated <- IntegrateData(anchorset = anchors, dims = 1:30)

## Merging dataset 1 into 2

## Extracting anchors for merged samples

## Finding integration vectors

## Finding integration vector weights

## Integrating data

## Warning: Layer counts isn't present in the assay object; returning NULL

# Downstream steps
DefaultAssay(integrated) <- "integrated"
integrated <- ScaleData(integrated)

## Centering and scaling data matrix

integrated <- RunPCA(integrated)

## PC_ 1 
## Positive:  SFRP1, VIM, SOX2, FABP7, HES1, PTN, TTYH1, ID4, RCN1, HMGN2 
##     SOX3, HMGB2, TUBA1B, HSPB1, GNG5, DOK5, MDK, PHGDH, SMC4, CLU 
##     CD99, RPS27L, H2AFZ, GAPDH, DEK, ECI1, NUSAP1, NES, TYMS, KIAA0101 
## Negative:  STMN2, NEUROD2, RTN1, GAP43, SYT1, HMP19, SOX4, NEUROD6, GPM6A, CNTNAP2 
##     TMSB10, MAPT, MLLT11, RAB3A, DCX, CD24, NSG1, PCSK1N, RBFOX2, TBR1 
##     SEZ6L2, CRMP1, SLA, STMN4, SHTN1, SCG3, THRA, TTC9B, CAMK2N1, TUBA1A 
## PC_ 2 
## Positive:  RPS19, FTL, C1orf61, RPL12, RPS6, RPS27L, RPS16, RPL41, FABP7, NHLH1 
##     BTG1, TMSB4X, TPT1, PRDX1, EZR, NAP1L1, NKAIN4, EEF1D, CCND1, TKTL1 
##     DOK5, ENO1, TAC3, RPLP1, IFITM3, FAM60A, GAPDH, HES1, CALD1, LINC01021 
## Negative:  UBE2C, TOP2A, MKI67, GTSE1, TPX2, CCNA2, CENPF, FAM64A, ASPM, AURKB 
##     NUF2, SGOL2, CCNB1, CDCA3, CDK1, NUSAP1, PBK, PLK1, CDC20, BIRC5 
##     CDKN3, PRC1, AURKA, NEK2, KIF23, KIF2C, DLGAP5, NDC80, CENPE, SGOL1 
## PC_ 3 
## Positive:  TTYH1, HOPX, HES4, PEA15, IGFBP5, CDH7, PTN, VIM, FGFBP3, ITM2C 
##     LMO3, FAM181B, MEF2C, DACT1, DOK6, IFITM3, SPARC, DUT, FAM107A, EEF1B2 
##     MCM3, GINS2, TMEM161B-AS1, PCNA, VCAN, LRRC3B, TMEM106C, VSNL1, PLAT, LRRC1 
## Negative:  NHLH1, EOMES, HES6, SOX11, ARL4D, PRDX1, RCOR2, SSTR2, ELAVL2, INSM1 
##     TAGLN3, ELAVL4, ENC1, PPP1R17, DUOX1, EZR, HN1, TAC3, RP4-665J23.1, BHLHE22 
##     SLCO5A1, NEUROD1, GADD45G, NXPH4, DLL3, NPPA, NEUROD4, PLK1, TMSB4X, PTMS 
## PC_ 4 
## Positive:  PLK1, CDC20, CCNB1, CCNA1, CENPA, NEK2, CCNB2, PIF1, TROAP, NMU 
##     VIM, AURKA, BUB1, GAS2L3, ARL6IP1, DLGAP5, PSRC1, BNIP3, CENPE, CDC25B 
##     TTYH1, SAPCD2, ASPM, CEP70, HMMR, LINC01551, BORA, TUBA1C, KNSTRN, NUDCD2 
## Negative:  MYBL2, KIAA0101, CLSPN, DHFR, TYMS, RRM2, CDC6, TK1, CCNE2, PCNA 
##     TUBB, CDC45, FEN1, MCM10, E2F1, HELLS, ATAD2, CHAF1A, RAD51AP1, CENPU 
##     GINS2, ESCO2, MCM7, CENPK, RNASEH2A, DNAJC9, ORC6, DTL, PKMYT1, ZWINT 
## PC_ 5 
## Positive:  NEUROG1, HES5, NEUROD4, HES6, EOMES, RPS16, RPS19, VSNL1, CDH13, MFNG 
##     CDH7, HSPA12A, EEF1B2, RPLP1, MYCL, NEGR1, LMO3, MEF2C, DACT1, RGS16 
##     RBPJ, ELAVL2, SORCS1, ADCY1, CLSTN2, PPP1R1B, CPNE6, TRIL, STXBP6, DLL1 
## Negative:  TUBB2B, TUBB2A, PTPRZ1, TUBA1A, BHLHE22, FRMD4B, C1orf61, CSRP2, PPP2R2B, NEUROD6 
##     MSMO1, ACAT2, CTTNBP2, SLA, PEA15, HMGCS1, TUBB, HOPX, NXPH4, CEMIP 
##     FAM107A, FGFBP3, C4orf48, GPM6B, SH3BP5, SHB, SYT4, CD82, CALM2, ID2

integrated <- RunUMAP(integrated, dims = 1:30)

## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session

## 12:35:13 UMAP embedding parameters a = 0.9922 b = 1.112

## 12:35:13 Read 9605 rows and found 30 numeric columns

## 12:35:13 Using Annoy for neighbor search, n_neighbors = 30

## 12:35:13 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 12:35:14 Writing NN index file to temp file /var/folders/fj/btmxy73n74zgn38kbjwfv7zh0000gn/T//RtmpRimLsm/file1495e6778017d
## 12:35:14 Searching Annoy index using 1 thread, search_k = 3000
## 12:35:16 Annoy recall = 100%
## 12:35:16 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 12:35:16 Initializing from normalized Laplacian + noise (using RSpectra)
## 12:35:17 Commencing optimization for 500 epochs, with 417692 positive edges
## 12:35:17 Using rng type: pcg
## 12:35:29 Optimization finished

# Plot UMAP
DimPlot(integrated, group.by = "orig.ident", label = TRUE) + ggtitle("UMAP: DS1 + DS3")

saveRDS(integrated, file = 'integrated.DS1.DS3.rds')

# Display the list of Seurat objects
seurat_list

## $DS1
## An object of class Seurat 
## 19315 features across 4672 samples within 1 assay 
## Active assay: RNA (19315 features, 2000 variable features)
##  2 layers present: counts, data
## 
## $DS3
## An object of class Seurat 
## 17829 features across 4933 samples within 1 assay 
## Active assay: RNA (17829 features, 2000 variable features)
##  2 layers present: counts, data

# Pre-integration: Merge DS1 and DS3
merged_raw <- merge(seurat_list[[1]], y = seurat_list[[2]], add.cell.ids = names(seurat_list))
merged_raw <- NormalizeData(merged_raw)

## Normalizing layer: counts.DS1

## Normalizing layer: counts.DS3

merged_raw <- FindVariableFeatures(merged_raw)

## Finding variable features for layer counts.DS1

## Finding variable features for layer counts.DS3

merged_raw <- ScaleData(merged_raw)

## Centering and scaling data matrix

merged_raw <- RunPCA(merged_raw)

## PC_ 1 
## Positive:  VIM, HMGB2, SOX2, HES1, SMC4, TTYH1, GNG5, NUSAP1, SFRP1, FABP7 
##     HMGN2, ZFP36L1, PTN, SOX3, CKS2, KIAA0101, PHGDH, TUBA1B, TYMS, RCN1 
##     CDK1, CKS1B, TOP2A, RPS27L, MKI67, PTTG1, CENPF, HSPB1, FGFBP3, BTG3 
## Negative:  STMN2, RTN1, HMP19, GAP43, MAPT, GPM6A, MLLT11, DCX, RAB3A, SOX4 
##     SYT1, PCSK1N, TMSB10, RBFOX2, NSG1, SEZ6L2, CRMP1, CD24, SCG3, TTC9B 
##     VAMP2, NEUROD2, CAMK2N1, APLP1, STMN4, NRXN1, GRIA2, INA, TUBA1A, BEX1 
## PC_ 2 
## Positive:  RPS19, RPL41, RPS6, RPS16, RPL12, C1orf61, RPS27L, FABP7, FTL, GAPDH 
##     EEF1D, DOK5, MDK, RPLP1, IFITM3, TPT1, NAP1L1, CCND1, ENO1, HES1 
##     CLU, SLC1A3, TKTL1, FGFBP3, RCN1, HSPB1, SFRP1, PTN, MYO10, NKAIN4 
## Negative:  UBE2C, CCNA2, PLK1, TOP2A, GTSE1, CCNB1, ASPM, SGOL2, CDCA8, NUF2 
##     MKI67, CENPF, CDC20, TPX2, CENPE, AURKB, NEK2, DLGAP5, AURKA, FAM64A 
##     CDCA3, KIF2C, CDKN3, CKAP2L, PIF1, PBK, CENPA, HJURP, KIF23, PRC1 
## PC_ 3 
## Positive:  NFIB, NFIA, NEUROD6, NEUROD2, LINC01551, CSRP2, SLA, TBR1, C1orf61, SOX5 
##     BCL11B, ZBTB18, CNTNAP2, STK17A, BCL11A, PTPRZ1, ABRACL, CKB, SSTR2, FEZF2 
##     TMSB4X, SYNE2, RPS6, CTTNBP2, FOXG1, NXPH4, SLC17A7, PDE1A, EMX1, CLMP 
## Negative:  NR2F1, BTG1, NR2F2, MEST, PBX3, CXXC4, SCG5, LY6H, TCEAL7, LRRC4C 
##     NEFL, NAP1L3, PPP1R1A, ZFHX3, AP1S2, SLC32A1, CCDC144NL-AS1, RIC3, NTRK2, KCNQ1OT1 
##     TSHZ2, GAD1, CRNDE, ZNF503, ZFHX4, PAK3, TXNIP, FOS, MT-ND5, GAD2 
## PC_ 4 
## Positive:  NHLH1, SOX11, HES6, EOMES, INSM1, ELAVL2, TAGLN3, GADD45G, RCOR2, TAC3 
##     ARL4D, ENC1, PTMS, SOX4, ELAVL4, CD24, DLL3, NEUROD1, DUOX1, PCBP4 
##     SSTR2, BTG1, PLK1, PRDX1, PPP1R17, CHD7, NEUROD4, C21orf59, GADD45A, MLLT11 
## Negative:  TTYH1, PEA15, PTN, HES4, IGFBP5, DUT, GINS2, EEF1B2, VIM, KIAA0101 
##     FGFBP3, SPARC, CLU, MCM3, HOPX, TYMS, PCNA, TMEM106C, VSTM2L, CDH7 
##     MEF2C, FAM181B, DHFR, VSNL1, HELLS, GGCT, FEN1, PRKCB, MCM4, ZFP36L1 
## PC_ 5 
## Positive:  MYBL2, KIAA0101, CLSPN, PCNA, DHFR, TYMS, GINS2, HELLS, MCM7, TUBB 
##     E2F1, FEN1, CENPU, RRM2, CCNE2, TK1, CDC6, ATAD2, CHAF1A, CDC45 
##     DNAJC9, MCM10, RAD51AP1, ZWINT, MCM3, RNASEH2A, ORC6, CENPK, DTL, ESCO2 
## Negative:  NTRK2, EDNRB, IGFBP5, PLP1, CTGF, VIM, SPARC, ID3, TTYH1, SFRP2 
##     ENPP2, ID2, CCNA1, P4HA1, PLK1, NEK2, IFI6, MGST1, WLS, ZFP36L1 
##     MASP1, PIF1, CDC20, CCNB1, CXCR4, PDE1A, SAT1, DDIT4, CENPA, RFX4

merged_raw <- RunUMAP(merged_raw, dims = 1:20)

## 12:36:25 UMAP embedding parameters a = 0.9922 b = 1.112

## 12:36:25 Read 9605 rows and found 20 numeric columns

## 12:36:25 Using Annoy for neighbor search, n_neighbors = 30

## 12:36:25 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 12:36:25 Writing NN index file to temp file /var/folders/fj/btmxy73n74zgn38kbjwfv7zh0000gn/T//RtmpRimLsm/file1495e5863b5b1
## 12:36:25 Searching Annoy index using 1 thread, search_k = 3000
## 12:36:27 Annoy recall = 100%
## 12:36:27 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 12:36:28 Initializing from normalized Laplacian + noise (using RSpectra)
## 12:36:28 Commencing optimization for 500 epochs, with 393456 positive edges
## 12:36:28 Using rng type: pcg
## 12:36:40 Optimization finished

# Batch effect visualization: Before vs After Integration
p1 <- DimPlot(merged_raw, group.by = "orig.ident") + ggtitle("Before Integration")
p2 <- DimPlot(integrated, group.by = "orig.ident") + ggtitle("After Integration")
batch_plot <- p1 + p2
batch_plot

# Save the batch comparison plot
ggsave(filename = file.path(output_path, "Batch_Effect_Comparison.png"), plot = batch_plot, width = 12)

## Saving 12 x 5 in image

# Clustering
integrated <- FindNeighbors(integrated, dims = 1:30)

## Computing nearest neighbor graph

## Computing SNN

integrated <- FindClusters(integrated, resolution = 0.5)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 9605
## Number of edges: 394628
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8896
## Number of communities: 15
## Elapsed time: 0 seconds

# UMAP colored by cluster
cluster_plot <- DimPlot(integrated, reduction = "umap", group.by = "seurat_clusters", label = TRUE) +
  ggtitle("Clusters After Integration")
cluster_plot

# Save UMAP plot
ggsave(filename = file.path(output_path, "Clusters_UMAP.png"),
       plot = cluster_plot, width = 8, height = 6)

# Marker Gene Detection
markers <- FindAllMarkers(integrated, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

## Calculating cluster 0

## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced

## Warning in mean.fxn(object[features, cells.2, drop = FALSE]): NaNs produced

## Calculating cluster 1

## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced

## Calculating cluster 2

## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced

## Calculating cluster 3

## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced

## Calculating cluster 4

## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced

## Calculating cluster 5

## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced

## Calculating cluster 6

## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced

## Calculating cluster 7

## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced

## Calculating cluster 8

## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced

## Calculating cluster 9

## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced

## Calculating cluster 10

## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced

## Calculating cluster 11

## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced

## Calculating cluster 12

## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced

## Calculating cluster 13

## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced

## Calculating cluster 14

## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced

# Get top 10 marker genes per cluster
top2 <- markers %>%
  group_by(cluster) %>%
  slice_max(order_by = avg_log2FC, n = 2)

# Heatmap of top markers
heatmap <- DoHeatmap(integrated, features = top2$gene) + NoLegend()
heatmap

# Save heatmap
ggsave(filename = file.path(output_path, "Top2_Markers_Heatmap.png"),
       plot = heatmap, width = 12, height = 8)

# Save all marker genes to a CSV file
write.csv(markers,
          file = file.path(output_path, "All_Marker_Genes.csv"),
          row.names = FALSE)

# check metadata in integrated data
head(integrated@meta.data, 5)

##                      orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAATGTTG-1_1        DS1      16752         3287   1.098376
## AAACCTGAGAGCCTAG-1_1        DS1      13533         3399   1.950787
## AAACCTGAGTAATCCC-1_1        DS1       3097         1557   2.453988
## AAACCTGCACACTGCG-1_1        DS1       5158         2015   3.761148
## AAACCTGCATCGGAAG-1_1        DS1       6964         2320   2.182654
##                      integrated_snn_res.0.5 seurat_clusters
## AAACCTGAGAATGTTG-1_1                      8               8
## AAACCTGAGAGCCTAG-1_1                      8               8
## AAACCTGAGTAATCCC-1_1                      1               1
## AAACCTGCACACTGCG-1_1                      4               4
## AAACCTGCATCGGAAG-1_1                      1               1

table(integrated@meta.data$orig.ident)

## 
##  DS1  DS3 
## 4672 4933

# Required packages
library(dplyr)
library(ggplot2)

# Convert metadata to a true data frame (not a tibble or environment object)
meta_df <- as.data.frame(integrated@meta.data)

# Confirm column names (double check!)
colnames(meta_df)

## [1] "orig.ident"             "nCount_RNA"             "nFeature_RNA"          
## [4] "percent.mt"             "integrated_snn_res.0.5" "seurat_clusters"

library(dplyr)

# Count cells per cluster per sample
cluster_counts <- meta_df %>%
  dplyr::count(seurat_clusters, orig.ident)

# Rename count column
colnames(cluster_counts)[3] <- "cell_count"

# View result
head(cluster_counts)

##   seurat_clusters orig.ident cell_count
## 1               0        DS1        817
## 2               0        DS3        794
## 3               1        DS1        487
## 4               1        DS3        698
## 5               2        DS1        296
## 6               2        DS3        830

# Compute fraction of cells per cluster in each sample
cluster_fractions <- cluster_counts %>%
  group_by(orig.ident) %>%
  mutate(fraction = cell_count / sum(cell_count)) %>%
  ungroup()


# Preview the result

head(cluster_fractions)

## # A tibble: 6 × 4
##   seurat_clusters orig.ident cell_count fraction
##   <fct>           <chr>           <int>    <dbl>
## 1 0               DS1               817   0.175 
## 2 0               DS3               794   0.161 
## 3 1               DS1               487   0.104 
## 4 1               DS3               698   0.141 
## 5 2               DS1               296   0.0634
## 6 2               DS3               830   0.168

# in percentages
cluster_fractions.percentages <- cluster_fractions %>%
  mutate(percent = round(fraction * 100, 1))

cluster_fractions.percentages

## # A tibble: 30 × 5
##    seurat_clusters orig.ident cell_count fraction percent
##    <fct>           <chr>           <int>    <dbl>   <dbl>
##  1 0               DS1               817   0.175     17.5
##  2 0               DS3               794   0.161     16.1
##  3 1               DS1               487   0.104     10.4
##  4 1               DS3               698   0.141     14.1
##  5 2               DS1               296   0.0634     6.3
##  6 2               DS3               830   0.168     16.8
##  7 3               DS1               365   0.0781     7.8
##  8 3               DS3               489   0.0991     9.9
##  9 4               DS1               687   0.147     14.7
## 10 4               DS3               102   0.0207     2.1
## # ℹ 20 more rows

####
#Plot Cell Fractions (Stacked Barplot)
ggplot(cluster_fractions, aes(x = orig.ident, y = fraction, fill = seurat_clusters)) +
  geom_bar(stat = "identity", position = "fill") +
  labs(title = "Cell Fraction per Cluster per Sample",
       y = "Fraction", x = "Sample", fill = "Cluster") +
  theme_minimal()

#Save as CSV
write.csv(cluster_fractions, file = file.path(output_path, "Cell_Fractions_Per_Cluster_Per_Sample.csv"), row.names = FALSE)

ggplot(cluster_fractions, aes(x = factor(seurat_clusters), y = fraction, fill = orig.ident)) +
  geom_bar(stat = "identity", position = "fill") +
  labs(
    title = "Cell Fractions per Sample per Cluster",
    x = "Cluster", y = "Fraction of Cells", fill = "Sample"
  ) +
  theme_minimal()

ggplot(cluster_fractions, aes(x = factor(seurat_clusters), y = fraction, fill = orig.ident)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Cell Fractions per Sample per Cluster (Side-by-Side)",
    x = "Cluster", y = "Fraction of Cells", fill = "Sample"
  ) +
  theme_minimal()

saveRDS(integrated, file = file.path(output_path, "integrated_seurat_object.rds"))

library(harmony)
seurat <- RunHarmony(integrated, group.by.vars = "orig.ident", dims.use = 1:20, max.iter.harmony = 50)

## Warning: Warning: The parameter max.iter.harmony is replaced with parameter max_iter. It will be ignored for this function call and please use parameter max_iter in future function calls.
## This warning is displayed once per session.

## Transposing data matrix

## Initializing state using k-means centroids initialization

## Harmony 1/10

## Harmony 2/10

## Harmony 3/10

## Harmony converged after 3 iterations

seurat <- RunUMAP(seurat, reduction = "harmony", dims = 1:20)

## 12:37:37 UMAP embedding parameters a = 0.9922 b = 1.112

## 12:37:37 Read 9605 rows and found 20 numeric columns

## 12:37:37 Using Annoy for neighbor search, n_neighbors = 30

## 12:37:37 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 12:37:38 Writing NN index file to temp file /var/folders/fj/btmxy73n74zgn38kbjwfv7zh0000gn/T//RtmpRimLsm/file1495e4d244eb4
## 12:37:38 Searching Annoy index using 1 thread, search_k = 3000
## 12:37:40 Annoy recall = 100%
## 12:37:40 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 12:37:41 Initializing from normalized Laplacian + noise (using RSpectra)
## 12:37:41 Commencing optimization for 500 epochs, with 407406 positive edges
## 12:37:41 Using rng type: pcg
## 12:37:53 Optimization finished

seurat <- FindNeighbors(seurat, reduction = "harmony", dims = 1:20) %>% FindClusters(resolution = 0.6)

## Computing nearest neighbor graph
## Computing SNN

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 9605
## Number of edges: 362544
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8799
## Number of communities: 16
## Elapsed time: 0 seconds

# You may also want to save the object
saveRDS(seurat, file="integrated_harmony.rds")

#UMAP plot for features after batch effect
plot1 <- UMAPPlot(seurat, group.by="orig.ident")
plot2 <- UMAPPlot(seurat, label = T)
plot3 <- FeaturePlot(seurat, c("FOXG1","EMX1","DLX2","LHX9"), ncol=2, pt.size = 0.1)
((plot1 / plot2) | plot3) + plot_layout(width = c(1,2))

new_ident <- setNames(c("Dorsal telen. NPC",
                        "Midbrain-hindbrain boundary neuron",
                        "Dorsal telen. neuron",
                        "Dien. and midbrain excitatory neuron",
                        "MGE-like neuron","G2M dorsal telen. NPC",
                        "Dorsal telen. IP","Dien. and midbrain NPC",
                        "Dien. and midbrain IP and excitatory early neuron",
                        "G2M Dien. and midbrain NPC",
                        "G2M dorsal telen. NPC",
                        "Dien. and midbrain inhibitory neuron",
                        "Dien. and midbrain IP and early inhibitory neuron",
                        "Ventral telen. neuron",
                        "Unknown 1",
                        "Unknown 2"),
                      levels(seurat))
seurat <- RenameIdents(seurat, new_ident)
DimPlot(seurat, reduction = "umap", label = TRUE) + NoLegend()

DimPlot(seurat, reduction = "umap", label = TRUE, split.by = 'orig.ident', ncol = 1)

head(seurat@meta.data)

##                      orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAATGTTG-1_1        DS1      16752         3287   1.098376
## AAACCTGAGAGCCTAG-1_1        DS1      13533         3399   1.950787
## AAACCTGAGTAATCCC-1_1        DS1       3097         1557   2.453988
## AAACCTGCACACTGCG-1_1        DS1       5158         2015   3.761148
## AAACCTGCATCGGAAG-1_1        DS1       6964         2320   2.182654
## AAACCTGGTGGTAACG-1_1        DS1       5176         2246   5.177743
##                      integrated_snn_res.0.5 seurat_clusters
## AAACCTGAGAATGTTG-1_1                      8              10
## AAACCTGAGAGCCTAG-1_1                      8              11
## AAACCTGAGTAATCCC-1_1                      1               1
## AAACCTGCACACTGCG-1_1                      4               1
## AAACCTGCATCGGAAG-1_1                      1               3
## AAACCTGGTGGTAACG-1_1                      9               9
##                      integrated_snn_res.0.6
## AAACCTGAGAATGTTG-1_1                     10
## AAACCTGAGAGCCTAG-1_1                     11
## AAACCTGAGTAATCCC-1_1                      1
## AAACCTGCACACTGCG-1_1                      1
## AAACCTGCATCGGAAG-1_1                      3
## AAACCTGGTGGTAACG-1_1                      9

saveRDS(seurat, file = 'seurat.after.celltype.annotaion.rds')

# Define your cluster IDs (0–15) and cell type names
annotations <- c(
  "Dorsal telen. NPC",
  "Midbrain-hindbrain boundary neuron",
  "Dorsal telen. neuron",
  "Dien. and midbrain excitatory neuron",
  "MGE-like neuron",
  "G2M dorsal telen. NPC",
  "Dorsal telen. IP",
  "Dien. and midbrain NPC",
  "Dien. and midbrain IP and excitatory early neuron",
  "G2M Dien. and midbrain NPC",
  "G2M dorsal telen. NPC",
  "Dien. and midbrain inhibitory neuron",
  "Dien. and midbrain IP and early inhibitory neuron",
  "Ventral telen. neuron",
  "Unknown 1",
  "Unknown 2")

# Combine cluster ID with annotation
cluster_ids <- 0:14
combined_labels <- paste0(cluster_ids, ": ", annotations)
names(combined_labels) <- as.character(cluster_ids)  # Cluster IDs must be character

# Apply the combined labels to Seurat clusters
integrated.annotated <- RenameIdents(integrated, combined_labels)

## Warning: Cannot find identity NA

head(Idents(integrated.annotated))

##                                 AAACCTGAGAATGTTG-1_1 
## 8: Dien. and midbrain IP and excitatory early neuron 
##                                 AAACCTGAGAGCCTAG-1_1 
## 8: Dien. and midbrain IP and excitatory early neuron 
##                                 AAACCTGAGTAATCCC-1_1 
##                1: Midbrain-hindbrain boundary neuron 
##                                 AAACCTGCACACTGCG-1_1 
##                                   4: MGE-like neuron 
##                                 AAACCTGCATCGGAAG-1_1 
##                1: Midbrain-hindbrain boundary neuron 
##                                 AAACCTGGTGGTAACG-1_1 
##                        9: G2M Dien. and midbrain NPC 
## 15 Levels: 0: Dorsal telen. NPC ... 14: Unknown 1

#save Idents as Celltype in metadata of integrated.annotated object
integrated.annotated@meta.data$Celltype <- Idents(integrated.annotated)
table(integrated.annotated@meta.data$seurat_clusters, integrated.annotated@meta.data$Celltype )

##     
##      0: Dorsal telen. NPC 1: Midbrain-hindbrain boundary neuron
##   0                  1611                                     0
##   1                     0                                  1185
##   2                     0                                     0
##   3                     0                                     0
##   4                     0                                     0
##   5                     0                                     0
##   6                     0                                     0
##   7                     0                                     0
##   8                     0                                     0
##   9                     0                                     0
##   10                    0                                     0
##   11                    0                                     0
##   12                    0                                     0
##   13                    0                                     0
##   14                    0                                     0
##     
##      2: Dorsal telen. neuron 3: Dien. and midbrain excitatory neuron
##   0                        0                                       0
##   1                        0                                       0
##   2                     1126                                       0
##   3                        0                                     854
##   4                        0                                       0
##   5                        0                                       0
##   6                        0                                       0
##   7                        0                                       0
##   8                        0                                       0
##   9                        0                                       0
##   10                       0                                       0
##   11                       0                                       0
##   12                       0                                       0
##   13                       0                                       0
##   14                       0                                       0
##     
##      4: MGE-like neuron 5: G2M dorsal telen. NPC 6: Dorsal telen. IP
##   0                   0                        0                   0
##   1                   0                        0                   0
##   2                   0                        0                   0
##   3                   0                        0                   0
##   4                 789                        0                   0
##   5                   0                      743                   0
##   6                   0                        0                 672
##   7                   0                        0                   0
##   8                   0                        0                   0
##   9                   0                        0                   0
##   10                  0                        0                   0
##   11                  0                        0                   0
##   12                  0                        0                   0
##   13                  0                        0                   0
##   14                  0                        0                   0
##     
##      7: Dien. and midbrain NPC
##   0                          0
##   1                          0
##   2                          0
##   3                          0
##   4                          0
##   5                          0
##   6                          0
##   7                        637
##   8                          0
##   9                          0
##   10                         0
##   11                         0
##   12                         0
##   13                         0
##   14                         0
##     
##      8: Dien. and midbrain IP and excitatory early neuron
##   0                                                     0
##   1                                                     0
##   2                                                     0
##   3                                                     0
##   4                                                     0
##   5                                                     0
##   6                                                     0
##   7                                                     0
##   8                                                   556
##   9                                                     0
##   10                                                    0
##   11                                                    0
##   12                                                    0
##   13                                                    0
##   14                                                    0
##     
##      9: G2M Dien. and midbrain NPC 10: G2M dorsal telen. NPC
##   0                              0                         0
##   1                              0                         0
##   2                              0                         0
##   3                              0                         0
##   4                              0                         0
##   5                              0                         0
##   6                              0                         0
##   7                              0                         0
##   8                              0                         0
##   9                            444                         0
##   10                             0                       417
##   11                             0                         0
##   12                             0                         0
##   13                             0                         0
##   14                             0                         0
##     
##      11: Dien. and midbrain inhibitory neuron
##   0                                         0
##   1                                         0
##   2                                         0
##   3                                         0
##   4                                         0
##   5                                         0
##   6                                         0
##   7                                         0
##   8                                         0
##   9                                         0
##   10                                        0
##   11                                      244
##   12                                        0
##   13                                        0
##   14                                        0
##     
##      12: Dien. and midbrain IP and early inhibitory neuron
##   0                                                      0
##   1                                                      0
##   2                                                      0
##   3                                                      0
##   4                                                      0
##   5                                                      0
##   6                                                      0
##   7                                                      0
##   8                                                      0
##   9                                                      0
##   10                                                     0
##   11                                                     0
##   12                                                   151
##   13                                                     0
##   14                                                     0
##     
##      13: Ventral telen. neuron 14: Unknown 1
##   0                          0             0
##   1                          0             0
##   2                          0             0
##   3                          0             0
##   4                          0             0
##   5                          0             0
##   6                          0             0
##   7                          0             0
##   8                          0             0
##   9                          0             0
##   10                         0             0
##   11                         0             0
##   12                         0             0
##   13                       107             0
##   14                         0            69

#save table Cells per Celltypes

Table <- table(integrated.annotated@meta.data$seurat_clusters, integrated.annotated@meta.data$Celltype )
write.csv(Table, file = 'Table.cells.per.celltypes.csv', row.names = FALSE)

Srivastava.Session.PART-2 scRNA-seq Dataset Integration

Dr UPASNA

2025-04-08