1. load libraries

2. Load Seurat Object


All_samples_Merged <- readRDS("../../0-Seurat_RDS_OBJECT_FINAL/Seurat_object_Final_changes/All_samples_Merged_with_STCAT_Annotation_final-5-09-2025.rds")

DefaultAssay(All_samples_Merged) <- "RNA"

All_samples_Merged <- NormalizeData(
    All_samples_Merged,
    normalization.method = "LogNormalize",
    scale.factor = 1e4,
    verbose = TRUE
  )
Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|

3.find Top markers

Idents(All_samples_Merged) <- "seurat_clusters"

# ---------------------------------------------------------
# 2️⃣ Find marker genes per cluster
SS_markers <- FindAllMarkers(
  All_samples_Merged,
  only.pos = TRUE,
  min.pct = 0.4,
  logfc.threshold = 1,
  min.pct.diff = 0.3
  
)

library(dplyr)

# Precise blacklist for uninformative genes
blacklist_patterns <- c(
  "^TRAV", "^TRBV", "^TRGV", "^TRDV", "^TRBC", "^TRAC", "^TRDC", "^TRGC", # TCR
  "^IGH", "^IGK", "^IGL", "^IGJ",                                         # Ig genes
  "^RPL", "^RPS",                                                         # ribosomal
  "^MT-",                                                                 # mitochondria
  "^HBA", "^HBB", "^HB[ABZ]",                                             # hemoglobins
  "^NEAT1$", "^MALAT1$",                                                  # optional lncRNAs
  "^XIST$"                              )

blacklist_regex <- paste(blacklist_patterns, collapse = "|")

# Preview which markers will be removed
to_remove <- SS_markers %>%
  filter(grepl(blacklist_regex, gene, ignore.case = TRUE))
message("Rows to remove: ", nrow(to_remove))
head(to_remove$gene)
[1] "TRAV17"      "TRAV9-2"     "RPL22L1"     "TRAV38-2DV8" "TRGV2"       "TRGV4"      
# Filter markers (keep important metabolic/proliferation genes)
SS_markers_filtered <- SS_markers %>%
  filter(!grepl(blacklist_regex, gene, ignore.case = TRUE))

4. Top5 Markers


library(dplyr)

# ---------------------------------------------------------
# Save filtered markers
#write.csv(SS_markers_filtered, file = "SS_markers_filtered.csv", row.names = FALSE)

# ---------------------------------------------------------
# Extract top 25 markers per cluster
top25_markers <- SS_markers_filtered %>%
  filter(p_val_adj < 0.05) %>%  # ensure statistical significance
  group_by(cluster) %>%
  slice_max(order_by = avg_log2FC, n = 25) %>%
  ungroup()

#write.csv(top25_markers, file = "SS_markers_top25.csv", row.names = FALSE)

# ---------------------------------------------------------
# Extract top 5 markers per cluster
top5_markers <- SS_markers_filtered %>%
  filter(p_val_adj < 0.05) %>%  # ensure statistical significance
  group_by(cluster) %>%
  slice_max(order_by = avg_log2FC, n = 5) %>%
  ungroup()

#write.csv(top5_markers, file = "SS_markers_top5.csv", row.names = FALSE)

#message("Filtered markers, top25, and top5 markers saved successfully.")

nbiSweden Approach-Top25

DefaultAssay(All_samples_Merged) <- "SCT"

par(mfrow = c(2, 5), mar = c(4, 6, 3, 1))
for (i in unique(top25_markers$cluster)) {
    barplot(sort(setNames(top25_markers$avg_log2FC, top25_markers$gene)[top25_markers$cluster == i], F),
        horiz = T, las = 1, main = paste0(i, " vs. rest"), border = "white", yaxs = "i"
    )
    abline(v = c(0, 0.25), lty = c(1, 2))
}

NA

nbiSweden Approach-Top5


par(mfrow = c(2, 5), mar = c(4, 6, 3, 1))
for (i in unique(top5_markers$cluster)) {
    barplot(sort(setNames(top5_markers$avg_log2FC, top5_markers$gene)[top5_markers$cluster == i], F),
        horiz = T, las = 1, main = paste0(i, " vs. rest"), border = "white", yaxs = "i"
    )
    abline(v = c(0, 0.25), lty = c(1, 2))
}

NA

Chatomics approach

library(dplyr)
library(Seurat)
library(ComplexHeatmap)
library(magick)


DefaultAssay(All_samples_Merged) <- "SCT"
data_mat <- GetAssayData(All_samples_Merged, assay = "SCT", slot = "data")

# Convert sparse matrix to dense matrix (beware memory use)
mat <- as.matrix(data_mat)

# Clean top5_markers: unlist and keep only valid gene names present in data_mat
top5_markers_clean <- unlist(top5_markers)
top5_markers_clean <- top5_markers_clean[top5_markers_clean %in% rownames(mat)]

# Subset the matrix using only valid markers
mat_subset <- mat[top5_markers_clean, , drop = FALSE]

# Scale rows
mat_scaled <- t(scale(t(mat_subset)))

# Now proceed with heatmap plotting etc.



cluster_anno<- All_samples_Merged@meta.data$seurat_clusters


# what's the value range in the matrix
quantile(mat, c(0.1, 0.95))
     10%      95% 
0.000000 1.098612 
Seurat::PurpleAndYellow()
 [1] "#FF00FF" "#F400F4" "#EA00EA" "#DF00DF" "#D500D5" "#CA00CA" "#BF00BF" "#B500B5" "#AA00AA" "#9F009F" "#950095"
[12] "#8A008A" "#800080" "#750075" "#6A006A" "#600060" "#550055" "#4A004A" "#400040" "#350035" "#2B002B" "#200020"
[23] "#150015" "#0B000B" "#000000" "#000000" "#0B0B00" "#151500" "#202000" "#2B2B00" "#353500" "#404000" "#4A4A00"
[34] "#555500" "#606000" "#6A6A00" "#757500" "#808000" "#8A8A00" "#959500" "#9F9F00" "#AAAA00" "#B5B500" "#BFBF00"
[45] "#CACA00" "#D4D400" "#DFDF00" "#EAEA00" "#F4F400" "#FFFF00"
## make the black color map to 0. the yellow map to highest and the purle map to the lowest
col_fun = circlize::colorRamp2(c(-1, 0, 3), c("#FF00FF", "black", "#FFFF00"))

Heatmap(mat_scaled, name = "Expression",  
        column_split = factor(cluster_anno),
        cluster_columns = TRUE,
        show_column_dend = FALSE,
        cluster_column_slices = TRUE,
        column_title_gp = gpar(fontsize = 8),
        column_gap = unit(0.5, "mm"),
        cluster_rows = TRUE,
        show_row_dend = FALSE,
        col = col_fun,
        row_names_gp = gpar(fontsize = 8),
        column_title_rot = 90,
        top_annotation = HeatmapAnnotation(foo = anno_block(gp = gpar(fill = scales::hue_pal()(9)))),
        show_column_names = FALSE,
        use_raster = TRUE,
        raster_quality = 8)

nbiSweden Approach


DotPlot(All_samples_Merged, features = rev(as.character(unique(top5_markers$gene))), group.by = "seurat_clusters", assay = "RNA") + coord_flip()

scCustomize Dotplot

library(scCustomize)
Clustered_DotPlot(seurat_object = All_samples_Merged, features = top5_markers$gene, k = 6)
[[1]]

[[2]]

scCustomize::Clustered_DotPlot(All_samples_Merged, features = top5_markers$gene,
                               plot_km_elbow = FALSE)

---
title: "Identify Top5 Markers and do Cluster Annotation"
author: "Nasir Mahmood Abbasi"
date: "`r Sys.Date()`"
output:
  html_notebook:
    toc: yes
    toc_float: yes
    toc_collapsed: yes
  word_document:
    toc: yes
  html_document:
    toc: yes
    df_print: paged
  pdf_document:
    toc: yes
---


# 1. load libraries
```{r setup, include=FALSE}
# Load below libraries
library(Seurat)
library(ggplot2)
library(plotly)
library(tidyverse)
library(cowplot)


library(SCpubr)
library(dplyr)
```


# 2. Load Seurat Object 
```{r}

All_samples_Merged <- readRDS("../../0-Seurat_RDS_OBJECT_FINAL/Seurat_object_Final_changes/All_samples_Merged_with_STCAT_Annotation_final-5-09-2025.rds")

DefaultAssay(All_samples_Merged) <- "RNA"

All_samples_Merged <- NormalizeData(
    All_samples_Merged,
    normalization.method = "LogNormalize",
    scale.factor = 1e4,
    verbose = TRUE
  )


```

# 3.find Top markers
```{r}
Idents(All_samples_Merged) <- "seurat_clusters"

# ---------------------------------------------------------
# 2️⃣ Find marker genes per cluster
SS_markers <- FindAllMarkers(
  All_samples_Merged,
  only.pos = TRUE,
  min.pct = 0.4,
  logfc.threshold = 1,
  min.pct.diff = 0.3
  
)

library(dplyr)

# Precise blacklist for uninformative genes
blacklist_patterns <- c(
  "^TRAV", "^TRBV", "^TRGV", "^TRDV", "^TRBC", "^TRAC", "^TRDC", "^TRGC", # TCR
  "^IGH", "^IGK", "^IGL", "^IGJ",                                         # Ig genes
  "^RPL", "^RPS",                                                         # ribosomal
  "^MT-",                                                                 # mitochondria
  "^HBA", "^HBB", "^HB[ABZ]",                                             # hemoglobins
  "^NEAT1$", "^MALAT1$",                                                  # optional lncRNAs
  "^XIST$"                              )

blacklist_regex <- paste(blacklist_patterns, collapse = "|")

# Preview which markers will be removed
to_remove <- SS_markers %>%
  filter(grepl(blacklist_regex, gene, ignore.case = TRUE))
message("Rows to remove: ", nrow(to_remove))
head(to_remove$gene)

# Filter markers (keep important metabolic/proliferation genes)
SS_markers_filtered <- SS_markers %>%
  filter(!grepl(blacklist_regex, gene, ignore.case = TRUE))


```


# 4. Top5 Markers
```{r, fig.width=12, fig.height=6}

library(dplyr)

# ---------------------------------------------------------
# Save filtered markers
#write.csv(SS_markers_filtered, file = "SS_markers_filtered.csv", row.names = FALSE)

# ---------------------------------------------------------
# Extract top 25 markers per cluster
top25_markers <- SS_markers_filtered %>%
  filter(p_val_adj < 0.05) %>%  # ensure statistical significance
  group_by(cluster) %>%
  slice_max(order_by = avg_log2FC, n = 25) %>%
  ungroup()

#write.csv(top25_markers, file = "SS_markers_top25.csv", row.names = FALSE)

# ---------------------------------------------------------
# Extract top 5 markers per cluster
top5_markers <- SS_markers_filtered %>%
  filter(p_val_adj < 0.05) %>%  # ensure statistical significance
  group_by(cluster) %>%
  slice_max(order_by = avg_log2FC, n = 5) %>%
  ungroup()

#write.csv(top5_markers, file = "SS_markers_top5.csv", row.names = FALSE)

#message("Filtered markers, top25, and top5 markers saved successfully.")

```


## nbiSweden Approach-Top25
```{r, fig.width=12, fig.height=6}
DefaultAssay(All_samples_Merged) <- "SCT"

par(mfrow = c(2, 5), mar = c(4, 6, 3, 1))
for (i in unique(top25_markers$cluster)) {
    barplot(sort(setNames(top25_markers$avg_log2FC, top25_markers$gene)[top25_markers$cluster == i], F),
        horiz = T, las = 1, main = paste0(i, " vs. rest"), border = "white", yaxs = "i"
    )
    abline(v = c(0, 0.25), lty = c(1, 2))
}

```

## nbiSweden Approach-Top5
```{r, fig.width=12, fig.height=6}

par(mfrow = c(2, 5), mar = c(4, 6, 3, 1))
for (i in unique(top5_markers$cluster)) {
    barplot(sort(setNames(top5_markers$avg_log2FC, top5_markers$gene)[top5_markers$cluster == i], F),
        horiz = T, las = 1, main = paste0(i, " vs. rest"), border = "white", yaxs = "i"
    )
    abline(v = c(0, 0.25), lty = c(1, 2))
}

```

## Chatomics approach 
```{r, fig.width=10, fig.height=8}
library(dplyr)
library(Seurat)
library(ComplexHeatmap)
library(magick)


DefaultAssay(All_samples_Merged) <- "SCT"
data_mat <- GetAssayData(All_samples_Merged, assay = "SCT", slot = "data")

# Convert sparse matrix to dense matrix (beware memory use)
mat <- as.matrix(data_mat)

# Clean top5_markers: unlist and keep only valid gene names present in data_mat
top5_markers_clean <- unlist(top5_markers)
top5_markers_clean <- top5_markers_clean[top5_markers_clean %in% rownames(mat)]

# Subset the matrix using only valid markers
mat_subset <- mat[top5_markers_clean, , drop = FALSE]

# Scale rows
mat_scaled <- t(scale(t(mat_subset)))

# Now proceed with heatmap plotting etc.



cluster_anno<- All_samples_Merged@meta.data$seurat_clusters


# what's the value range in the matrix
quantile(mat, c(0.1, 0.95))

Seurat::PurpleAndYellow()

## make the black color map to 0. the yellow map to highest and the purle map to the lowest
col_fun = circlize::colorRamp2(c(-1, 0, 3), c("#FF00FF", "black", "#FFFF00"))

Heatmap(mat_scaled, name = "Expression",  
        column_split = factor(cluster_anno),
        cluster_columns = TRUE,
        show_column_dend = FALSE,
        cluster_column_slices = TRUE,
        column_title_gp = gpar(fontsize = 8),
        column_gap = unit(0.5, "mm"),
        cluster_rows = TRUE,
        show_row_dend = FALSE,
        col = col_fun,
        row_names_gp = gpar(fontsize = 8),
        column_title_rot = 90,
        top_annotation = HeatmapAnnotation(foo = anno_block(gp = gpar(fill = scales::hue_pal()(9)))),
        show_column_names = FALSE,
        use_raster = TRUE,
        raster_quality = 8)
```

## nbiSweden Approach
```{r, fig.width=10, fig.height=12}

DotPlot(All_samples_Merged, features = rev(as.character(unique(top5_markers$gene))), group.by = "seurat_clusters", assay = "RNA") + coord_flip()

```

## scCustomize Dotplot
```{r, fig.width=10, fig.height=12}
library(scCustomize)
Clustered_DotPlot(seurat_object = All_samples_Merged, features = top5_markers$gene, k = 6)


scCustomize::Clustered_DotPlot(All_samples_Merged, features = top5_markers$gene,
                               plot_km_elbow = FALSE)
```






