4. Use FindAllMarkers() for DE on RNA assay
# Load necessary libraries
library(Seurat)
DefaultAssay(All_samples_Merged) <- "RNA"
All_samples_Merged <- NormalizeData(
All_samples_Merged,
assay = "RNA",
normalization.method = "LogNormalize",
scale.factor = 10000
)
Normalizing layer: counts
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
# Subset only malignant Sézary syndrome cell lines
malignant_lines <- c("L1", "L2", "L3", "L4", "L5", "L6", "L7")
Malignant_obj <- subset(All_samples_Merged, subset = cell_line %in% malignant_lines)
# Set cell_line as the active identity
Idents(Malignant_obj) <- "cell_line"
# Ensure RNA slot is normalized (if not already done)
if (!"RNA" %in% names(Malignant_obj@assays)) {
Malignant_obj <- NormalizeData(Malignant_obj)
}
# Run FindAllMarkers to compare each cell_line against all other malignant cell lines
all_markers<- FindAllMarkers(Malignant_obj,
assay = "RNA",
only.pos = FALSE,
logfc.threshold = 0,
min.pct = 0,
test.use = "wilcox") # can also use "MAST", "t", etc.
Calculating cluster L1
Avis : The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
Please use the `layer` argument instead.Avis : `PackageCheck()` was deprecated in SeuratObject 5.0.0.
Please use `rlang::check_installed()` instead.Calculating cluster L2
Calculating cluster L3
Calculating cluster L4
Calculating cluster L5
Calculating cluster L6
Calculating cluster L7
# Save all marker results
write.csv(all_markers, "AllMarkers_CellLines_MalignantOnly.csv", row.names = FALSE)
4. Run GSEA per Cluster using DE Files
# 3. Prepare output list for storing FGSEA results
gsea_all <- list()
# 4. Get unique clusters from all_markers
clusters <- unique(all_markers$cluster)
# 5. Loop over each cluster and run FGSEA
for (cluster_id in clusters) {
cat("Processing cluster:", cluster_id, "\n")
# Subset DE genes for cluster vs rest
de_df <- all_markers %>%
filter(cluster == cluster_id) %>%
filter(!is.na(avg_log2FC))
# Create named vector ranked by avg_log2FC (descending)
ranked_genes <- deframe(de_df %>% select(gene, avg_log2FC))
ranked_genes <- sort(ranked_genes, decreasing = TRUE)
# Run FGSEA on each MSigDB collection
for (db_name in names(msigdb_sets)) {
fgsea_res <- fgsea(
pathways = msigdb_sets[[db_name]],
stats = ranked_genes,
minSize = 10,
maxSize = 300,
nperm = 10000
)
if (nrow(fgsea_res) > 0) {
fgsea_res$cluster <- cluster_id
fgsea_res$database <- db_name
# Save in list with unique name
gsea_all[[paste0("Cluster_", cluster_id, "_", db_name)]] <- fgsea_res
}
}
}
Processing cluster: L1
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There were 1 pathways for which P-values were not calculated properly due to unbalanced gene-level statistic valuesAvis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There were 1 pathways for which P-values were not calculated properly due to unbalanced gene-level statistic values
Processing cluster: L2
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.
Processing cluster: L3
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.
Processing cluster: L4
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.
Processing cluster: L5
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.
Processing cluster: L6
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.11% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.11% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.11% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.11% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.11% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Processing cluster: L7
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.
Combine and Save All GSEA Results
# 6. Combine all GSEA results into one dataframe
gsea_df <- bind_rows(gsea_all)
# 7. Flatten leadingEdge list column into a semicolon-separated string
gsea_df <- gsea_df %>%
mutate(leadingEdge = map_chr(leadingEdge, ~ paste(.x, collapse = ";")))
# 8. Save combined results (optional)
write.csv(gsea_df, "GSEA_MSigDB_Results_Clusterwise_findAllMarkers_used.csv", row.names = FALSE)
# 9. Create folders and save cluster-wise results separately
gsea_by_cluster <- split(gsea_df, gsea_df$cluster)
for (cluster_id in names(gsea_by_cluster)) {
folder_name <- paste0("Cluster_", cluster_id)
if (!dir.exists(folder_name)) {
dir.create(folder_name)
}
file_path <- file.path(folder_name, paste0("GSEA_Cluster_", cluster_id, ".csv"))
write.csv(gsea_by_cluster[[cluster_id]], file = file_path, row.names = FALSE)
}
Combine and Save All GSEA Results
# 10. Plot top GSEA pathways per cluster & collection (top 3 per cluster/db, padj<0.05)
top_terms <- gsea_df %>%
filter(padj < 0.05) %>%
group_by(cluster, database) %>%
slice_max(NES, n = 10, with_ties = FALSE) %>%
ungroup()
# Clean pathway names
top_terms$pathway <- gsub("GO_|REACTOME_|KEGG_", "", top_terms$pathway)
top_terms$pathway <- gsub("_", " ", top_terms$pathway)
top_terms$pathway <- str_to_title(top_terms$pathway)
top_terms$pathway <- str_trunc(top_terms$pathway, width = 60)
# Plot using ggplot2
p1 <- ggplot(top_terms, aes(x = fct_reorder(pathway, NES), y = NES, fill = database)) +
geom_col(width = 0.9) +
coord_flip() +
facet_grid(database ~ cluster, scales = "free_y", space = "free") +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
labs(
x = "Pathway",
y = "Normalized Enrichment Score (NES)",
title = "Top GSEA Pathways per Cluster",
fill = "Database"
) +
scale_fill_brewer(palette = "Set2") +
theme_minimal(base_size = 16) +
theme(
strip.text = element_text(face = "bold", size = 16),
strip.background = element_rect(fill = "gray90", color = "black"),
strip.placement = "outside",
legend.position = "bottom",
axis.text.y = element_text(face = "bold", size = 13),
axis.text.x = element_text(size = 14),
panel.spacing = unit(1, "lines"),
panel.border = element_rect(color = "black", fill = NA, size = 1.5)
)
ggsave("Top_GSEA_Pathways_Per_Cluster1.pdf", p1, width = 30, height = 58, limitsize = FALSE)
p1

Combine and Save All GSEA Results
library(ggplot2)
library(ggforce)
library(forcats)
library(dplyr)
library(stringr)
# Force cluster order
top_terms$cluster <- factor(top_terms$cluster, levels = as.character(0:13))
# Clean and shorten pathway names
top_terms <- top_terms %>%
mutate(
pathway = gsub("GO_|REACTOME_|KEGG_", "", pathway),
pathway = gsub("_", " ", pathway),
pathway = str_to_title(pathway),
pathway = str_trunc(pathway, width = 60)
)
# Add direction column
top_terms <- top_terms %>%
mutate(direction = ifelse(NES > 0, "Upregulated", "Downregulated"))
# Use direction and database together for color
top_terms$direction_db <- interaction(top_terms$direction, top_terms$database, sep = "_")
# Define custom colors for each direction+db combo (optional: customize as you like)
n_colors <- length(unique(top_terms$direction_db))
palette <- scales::hue_pal()(n_colors)
names(palette) <- unique(top_terms$direction_db)
# Plot
p2 <- ggplot(top_terms, aes(x = fct_reorder(pathway, NES), y = NES, fill = direction_db)) +
geom_col(width = 0.9) +
coord_flip() +
facet_grid(database ~ cluster, scales = "free_y", space = "free") +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
labs(
x = "Pathway",
y = "Normalized Enrichment Score (NES)",
title = "Top GSEA Pathways per Cluster",
fill = "Direction + DB"
) +
scale_fill_manual(values = palette) +
theme_minimal(base_size = 16) +
theme(
strip.text = element_text(face = "bold", size = 16),
strip.background = element_rect(fill = "gray90", color = "black"),
strip.placement = "outside",
legend.position = "bottom",
axis.text.y = element_text(face = "bold", size = 13),
axis.text.x = element_text(size = 14),
panel.spacing = unit(1, "lines"),
panel.border = element_rect(color = "black", fill = NA, size = 1.5)
)
# Save with correct size override
ggsave("Top_GSEA_Pathways_Per_Cluster2.pdf", p2, width = 30, height = 58, limitsize = FALSE)
p2

Combine and Save All GSEA Results

Combine and Save All GSEA Results
# Required packages
library(msigdbr)
library(fgsea)
library(dplyr)
library(tibble)
library(forcats)
library(ggplot2)
library(pheatmap)
library(stringr)
# 1. Curated Sézary syndrome–relevant MSigDB pathways
curated_pathways <- c(
# Hallmark
"HALLMARK_JAK_STAT3_SIGNALING", "HALLMARK_TNFA_SIGNALING_VIA_NFKB", "HALLMARK_APOPTOSIS",
# KEGG
"KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY", "KEGG_JAK_STAT_SIGNALING_PATHWAY",
"KEGG_PI3K_AKT_SIGNALING_PATHWAY", "KEGG_MAPK_SIGNALING_PATHWAY",
"KEGG_TGF_BETA_SIGNALING_PATHWAY", "KEGG_CELL_CYCLE", "KEGG_APOPTOSIS",
# Reactome
"REACTOME_SIGNALING_BY_TCR", "REACTOME_SIGNALING_BY_JAK_STAT",
"REACTOME_PI3K_AKT_ACTIVATION", "REACTOME_NFKB_ACTIVATION",
# GO (Biological Process)
"GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY", "GOBP_T_CELL_ACTIVATION",
"GOBP_CELL_CYCLE_PROCESS", "GOBP_APOPTOTIC_PROCESS",
"GOBP_MAPK_CASCADE", "GOBP_NF_KAPPA_B_TRANSCRIPTION_FACTOR_ACTIVITY",
"GOBP_JAK_STAT_CASCADE", "GOBP_EPIGENETIC_REGULATION"
)
# 2. Load MSigDB genes
msig_df <- msigdbr(species = "Homo sapiens") %>%
filter(gs_name %in% curated_pathways)
pathways_list <- split(msig_df$gene_symbol, msig_df$gs_name)
# 3. Split FindAllMarkers data frame by cluster
cluster_stats <- split(all_markers, all_markers$cluster)
# 4. Run fgsea using avg_log2FC as ranking metric
fgsea_results <- lapply(cluster_stats, function(stats_df) {
# Ensure avg_log2FC exists and gene names are present
stats_df <- stats_df %>%
filter(!is.na(avg_log2FC)) %>%
distinct(gene, .keep_all = TRUE)
# Check if gene column exists
if (!"gene" %in% colnames(stats_df)) {
warning("No 'gene' column found in stats_df")
return(NULL)
}
# Create rank vector
ranks <- stats_df$avg_log2FC
names(ranks) <- stats_df$gene
ranks <- sort(ranks, decreasing = TRUE)
# Optional: skip small rank vectors
if (length(ranks) < 100) return(NULL)
fgsea(
pathways = pathways_list,
stats = sort(ranks, decreasing = TRUE),
minSize = 10,
maxSize = 300,
nperm = 10000
)
})
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.11% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.
# 5. Combine and annotate results
gsea_df <- bind_rows(lapply(names(fgsea_results), function(cluster) {
res <- fgsea_results[[cluster]]
res$cluster <- cluster
res
}))
# 6. Ensure all pathways appear across clusters
all_combos <- expand.grid(
cluster = names(cluster_stats),
pathway = curated_pathways,
stringsAsFactors = FALSE
)
gsea_complete <- all_combos %>%
left_join(gsea_df, by = c("cluster", "pathway")) %>%
mutate(
NES = ifelse(is.na(NES), 0, NES),
padj = ifelse(is.na(padj), 1, padj),
database = case_when(
str_detect(pathway, "^HALLMARK") ~ "Hallmark",
str_detect(pathway, "^KEGG") ~ "KEGG",
str_detect(pathway, "^REACTOME") ~ "Reactome",
str_detect(pathway, "^GOBP") ~ "GO",
TRUE ~ "Other"
),
direction_db = ifelse(NES > 0, paste0("+ ", database), paste0("- ", database)),
sig = ifelse(padj < 0.05, "*", "")
)
# 7. Color palette
palette <- c(
`+ Hallmark` = "#d62728", `- Hallmark` = "#fcbba1",
`+ KEGG` = "#2ca02c", `- KEGG` = "#b9f6ca",
`+ Reactome` = "#1f77b4", `- Reactome` = "#aec7e8",
`+ GO` = "#9467bd", `- GO` = "#d5bde3"
)
# 8. Plot
p4 <- ggplot(gsea_complete, aes(x = fct_reorder(pathway, NES), y = NES, fill = direction_db)) +
geom_col(width = 0.9) +
geom_text(aes(label = sig), hjust = -0.3, size = 5, fontface = "bold", color = "black") +
coord_flip() +
facet_grid(database ~ cluster, scales = "free_y", space = "free") +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
labs(
x = "Pathway",
y = "Normalized Enrichment Score (NES)",
title = "GSEA (avg_log2FC only) — Curated Pathways per Cluster",
fill = "Enrichment"
) +
scale_fill_manual(values = palette, na.value = "gray80") +
theme_minimal(base_size = 16) +
theme(
strip.text = element_text(face = "bold", size = 16),
strip.background = element_rect(fill = "gray90", color = "black"),
legend.position = "bottom",
axis.text.y = element_text(face = "bold", size = 13),
axis.text.x = element_text(size = 14),
panel.spacing = unit(1, "lines"),
panel.border = element_rect(color = "black", fill = NA, size = 1.5)
)
# 9. Save to PDF
ggsave("Curated_GSEA_avgLogFC_Only.pdf", p4, width = 30, height = 58, limitsize = FALSE)
p4

---
title: "findAllMarkers-NewUMAP fgsea Analysis on malignant cell lines"
author: "Nasir Mahmood Abbasi"
date: "`r Sys.Date()`"
output:
  html_notebook:
    toc: yes
    toc_float: yes
    toc_collapsed: yes
  pdf_document:
    toc: yes
---

# 1. Load Libraries
```{r setup, include=FALSE}
library(Seurat)
library(harmony)
library(dplyr)
library(EnhancedVolcano)
library(ggplot2)
library(pheatmap)
library(tibble)
```


# 2. load seurat object
```{r load_seurat}
#Load Seurat Object
All_samples_Merged <- readRDS("/home/nabbasi/isilon/PHD_3rd_YEAR_Analysis/0-Seurat_RDS_OBJECT_FINAL/All_samples_Merged_Harmony_integrated_Cell_line_renamed_03-07-2025.rds")


All_samples_Merged
```

# 3. Set Up Identifiers for Clustering
```{r}
# Assign cluster identities to the Seurat object
Idents(All_samples_Merged) <- "seurat_clusters"

DimPlot(All_samples_Merged, reduction = "umap", group.by = "cell_line",label = T, label.box = T)
DimPlot(All_samples_Merged, reduction = "umap", group.by = "seurat_clusters",label = T, label.box = T)

```

# 4. Use FindAllMarkers() for DE on RNA assay
```{r}
# Load necessary libraries
library(Seurat)

DefaultAssay(All_samples_Merged) <- "RNA"


All_samples_Merged <- NormalizeData(
  All_samples_Merged, 
  assay = "RNA", 
  normalization.method = "LogNormalize", 
  scale.factor = 10000
)


# Subset only malignant Sézary syndrome cell lines
malignant_lines <- c("L1", "L2", "L3", "L4", "L5", "L6", "L7")
Malignant_obj <- subset(All_samples_Merged, subset = cell_line %in% malignant_lines)

# Set cell_line as the active identity
Idents(Malignant_obj) <- "cell_line"

# Ensure RNA slot is normalized (if not already done)
if (!"RNA" %in% names(Malignant_obj@assays)) {
  Malignant_obj <- NormalizeData(Malignant_obj)
}

# Run FindAllMarkers to compare each cell_line against all other malignant cell lines
all_markers<- FindAllMarkers(Malignant_obj,
                               assay = "RNA",
                               only.pos = FALSE,
                               logfc.threshold = 0,
                               min.pct = 0,
                               test.use = "wilcox")  # can also use "MAST", "t", etc.

# Save all marker results
write.csv(all_markers, "AllMarkers_CellLines_MalignantOnly.csv", row.names = FALSE)

```

# 5.   Load Gene Sets from MSigDB
```{r , fig.height=8, fig.width=12}
library(Seurat)
library(dplyr)
library(purrr)
library(fgsea)
library(msigdbr)
library(ggplot2)
library(stringr)
library(forcats)

# 2. Load Gene Sets from MSigDB (your exact code)

# Hallmark (no subcollection needed)
hallmark_sets <- msigdbr(species = "Homo sapiens", category = "H") %>%
  split(x = .$gene_symbol, f = .$gs_name)

# KEGG legacy pathways
kegg_sets <- msigdbr(
    species      = "Homo sapiens",
    category     = "C2",
    subcollection = "CP:KEGG_LEGACY"
  ) %>%
  split(x = .$gene_symbol, f = .$gs_name)

# Reactome pathways
reactome_sets <- msigdbr(
    species      = "Homo sapiens",
    category     = "C2",
    subcollection = "CP:REACTOME"
  ) %>%
  split(x = .$gene_symbol, f = .$gs_name)

# Immunologic signatures
c7_sets <- msigdbr(species = "Homo sapiens", category = "C7") %>%
  split(x = .$gene_symbol, f = .$gs_name)

# GO Biological Process, CC, MF
c5_bp <- msigdbr(species = "Homo sapiens", category = "C5", subcollection = "BP") %>%
  split(x = .$gene_symbol, f = .$gs_name)


# Combine all sets
msigdb_sets <- list(
  Hallmark = hallmark_sets,
  KEGG     = kegg_sets,
  Reactome = reactome_sets,
  C7       = c7_sets,
  GO_BP    = c5_bp
)

```



# 4.  Run GSEA per Cluster using DE Files
```{r , fig.height=8, fig.width=12}

# 3. Prepare output list for storing FGSEA results
gsea_all <- list()

# 4. Get unique clusters from all_markers
clusters <- unique(all_markers$cluster)

# 5. Loop over each cluster and run FGSEA
for (cluster_id in clusters) {
  
  cat("Processing cluster:", cluster_id, "\n")
  
  # Subset DE genes for cluster vs rest
  de_df <- all_markers %>%
    filter(cluster == cluster_id) %>%
    filter(!is.na(avg_log2FC))
  
  # Create named vector ranked by avg_log2FC (descending)
  ranked_genes <- deframe(de_df %>% select(gene, avg_log2FC))
  ranked_genes <- sort(ranked_genes, decreasing = TRUE)
  
  # Run FGSEA on each MSigDB collection
  for (db_name in names(msigdb_sets)) {
    fgsea_res <- fgsea(
      pathways = msigdb_sets[[db_name]],
      stats = ranked_genes,
      minSize = 10,
      maxSize = 300,
      nperm = 10000
    )
    
    if (nrow(fgsea_res) > 0) {
      fgsea_res$cluster <- cluster_id
      fgsea_res$database <- db_name
      
      # Save in list with unique name
      gsea_all[[paste0("Cluster_", cluster_id, "_", db_name)]] <- fgsea_res
    }
  }
}


```

## Combine and Save All GSEA Results
```{r , fig.height=8, fig.width=12}
# 6. Combine all GSEA results into one dataframe
gsea_df <- bind_rows(gsea_all)

# 7. Flatten leadingEdge list column into a semicolon-separated string
gsea_df <- gsea_df %>%
  mutate(leadingEdge = map_chr(leadingEdge, ~ paste(.x, collapse = ";")))

# 8. Save combined results (optional)
write.csv(gsea_df, "GSEA_MSigDB_Results_Clusterwise_findAllMarkers_used.csv", row.names = FALSE)

# 9. Create folders and save cluster-wise results separately
gsea_by_cluster <- split(gsea_df, gsea_df$cluster)

for (cluster_id in names(gsea_by_cluster)) {
  folder_name <- paste0("Cluster_", cluster_id)
  
  if (!dir.exists(folder_name)) {
    dir.create(folder_name)
  }
  
  file_path <- file.path(folder_name, paste0("GSEA_Cluster_", cluster_id, ".csv"))
  write.csv(gsea_by_cluster[[cluster_id]], file = file_path, row.names = FALSE)
}

```


## Combine and Save All GSEA Results
```{r , fig.height=40, fig.width=58}
# 10. Plot top GSEA pathways per cluster & collection (top 3 per cluster/db, padj<0.05)
top_terms <- gsea_df %>%
  filter(padj < 0.05) %>%
  group_by(cluster, database) %>%
  slice_max(NES, n = 10, with_ties = FALSE) %>%
  ungroup()

# Clean pathway names
top_terms$pathway <- gsub("GO_|REACTOME_|KEGG_", "", top_terms$pathway)
top_terms$pathway <- gsub("_", " ", top_terms$pathway)
top_terms$pathway <- str_to_title(top_terms$pathway)
top_terms$pathway <- str_trunc(top_terms$pathway, width = 60)

# Plot using ggplot2
p1 <- ggplot(top_terms, aes(x = fct_reorder(pathway, NES), y = NES, fill = database)) +
  geom_col(width = 0.9) +
  coord_flip() +
  facet_grid(database ~ cluster, scales = "free_y", space = "free") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  labs(
    x = "Pathway",
    y = "Normalized Enrichment Score (NES)",
    title = "Top GSEA Pathways per Cluster",
    fill = "Database"
  ) +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal(base_size = 16) +
  theme(
    strip.text = element_text(face = "bold", size = 16),
    strip.background = element_rect(fill = "gray90", color = "black"),
    strip.placement = "outside",
    legend.position = "bottom",
    axis.text.y = element_text(face = "bold", size = 13),
    axis.text.x = element_text(size = 14),
    panel.spacing = unit(1, "lines"),
    panel.border = element_rect(color = "black", fill = NA, size = 1.5)
  )

 ggsave("Top_GSEA_Pathways_Per_Cluster1.pdf", p1, width = 30, height = 58, limitsize = FALSE)

p1

```


## Combine and Save All GSEA Results
```{r , fig.height=30, fig.width=58}
library(ggplot2)
library(ggforce)
library(forcats)
library(dplyr)
library(stringr)

# Force cluster order
top_terms$cluster <- factor(top_terms$cluster, levels = as.character(0:13))

# Clean and shorten pathway names
top_terms <- top_terms %>%
  mutate(
    pathway = gsub("GO_|REACTOME_|KEGG_", "", pathway),
    pathway = gsub("_", " ", pathway),
    pathway = str_to_title(pathway),
    pathway = str_trunc(pathway, width = 60)
  )

# Add direction column
top_terms <- top_terms %>%
  mutate(direction = ifelse(NES > 0, "Upregulated", "Downregulated"))

# Use direction and database together for color
top_terms$direction_db <- interaction(top_terms$direction, top_terms$database, sep = "_")

# Define custom colors for each direction+db combo (optional: customize as you like)
n_colors <- length(unique(top_terms$direction_db))
palette <- scales::hue_pal()(n_colors)
names(palette) <- unique(top_terms$direction_db)

# Plot
p2 <- ggplot(top_terms, aes(x = fct_reorder(pathway, NES), y = NES, fill = direction_db)) +
  geom_col(width = 0.9) +
  coord_flip() +
  facet_grid(database ~ cluster, scales = "free_y", space = "free") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  labs(
    x = "Pathway",
    y = "Normalized Enrichment Score (NES)",
    title = "Top GSEA Pathways per Cluster",
    fill = "Direction + DB"
  ) +
  scale_fill_manual(values = palette) +
  theme_minimal(base_size = 16) +
  theme(
    strip.text = element_text(face = "bold", size = 16),
    strip.background = element_rect(fill = "gray90", color = "black"),
    strip.placement = "outside",
    legend.position = "bottom",
    axis.text.y = element_text(face = "bold", size = 13),
    axis.text.x = element_text(size = 14),
    panel.spacing = unit(1, "lines"),
    panel.border = element_rect(color = "black", fill = NA, size = 1.5)
  )

# Save with correct size override
 ggsave("Top_GSEA_Pathways_Per_Cluster2.pdf", p2, width = 30, height = 58, limitsize = FALSE)
p2
```

## Combine and Save All GSEA Results
```{r , fig.height=18, fig.width=20}
# Required packages
library(msigdbr)
library(fgsea)
library(dplyr)
library(tibble)
library(forcats)
library(ggplot2)
library(pheatmap)
library(stringr)

# Split the allmarkers (from FindAllMarkers) into a named list per cluster
cluster_stats <- split(all_markers, all_markers$cluster)


# 1. Curated Sézary syndrome–relevant pathways (use their exact MSigDB names)
curated_pathways <- c(
  # Hallmark
  "HALLMARK_JAK_STAT3_SIGNALING", "HALLMARK_TNFA_SIGNALING_VIA_NFKB", "HALLMARK_APOPTOSIS",
  # KEGG
  "KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY", "KEGG_JAK_STAT_SIGNALING_PATHWAY",
  "KEGG_PI3K_AKT_SIGNALING_PATHWAY", "KEGG_MAPK_SIGNALING_PATHWAY",
  "KEGG_TGF_BETA_SIGNALING_PATHWAY", "KEGG_CELL_CYCLE", "KEGG_APOPTOSIS",
  # Reactome
  "REACTOME_SIGNALING_BY_TCR", "REACTOME_SIGNALING_BY_JAK_STAT",
  "REACTOME_PI3K_AKT_ACTIVATION", "REACTOME_NFKB_ACTIVATION",
  # GO (Biological Process)
  "GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY", "GOBP_T_CELL_ACTIVATION",
  "GOBP_CELL_CYCLE_PROCESS", "GOBP_APOPTOTIC_PROCESS",
  "GOBP_MAPK_CASCADE", "GOBP_NF_KAPPA_B_TRANSCRIPTION_FACTOR_ACTIVITY",
  "GOBP_JAK_STAT_CASCADE", "GOBP_EPIGENETIC_REGULATION"
)

# 2. Load pathways from msigdbr
msig_df <- msigdbr(species = "Homo sapiens") %>%
  filter(gs_name %in% curated_pathways)

pathways_list <- split(msig_df$gene_symbol, msig_df$gs_name)

# 3. Your input: a list of ranked gene stats (e.g., from Seurat::FindMarkers) for each cluster
# Example: cluster_stats is a named list of data.frames, each containing gene, avg_log2FC, p_val_adj
# Example of making rank vector: stats$avg_log2FC * -log10(stats$p_val_adj + 1e-300)

# Replace with your real cluster-level stats list
# cluster_stats <- list(clusterA = df_A, clusterB = df_B, ...)

# 4. Run fgsea for each cluster
fgsea_results <- lapply(cluster_stats, function(stats_df) {
  stats_df <- stats_df %>% filter(!is.na(p_val_adj), !is.na(avg_log2FC))
  ranks <- stats_df$avg_log2FC * -log10(stats_df$p_val_adj + 1e-300)
  names(ranks) <- rownames(stats_df)
  fgsea(pathways = pathways_list, stats = sort(ranks, decreasing = TRUE), nperm = 5000)
})

# 5. Combine and annotate
gsea_df <- bind_rows(lapply(names(fgsea_results), function(cluster) {
  res <- fgsea_results[[cluster]]
  res$cluster <- cluster
  res
}))

# Filter to curated pathways (already done but good to double check)
gsea_df <- gsea_df %>% filter(pathway %in% curated_pathways)

# Add pathway source label
gsea_df <- gsea_df %>%
  mutate(database = case_when(
    str_detect(pathway, "^HALLMARK") ~ "Hallmark",
    str_detect(pathway, "^KEGG") ~ "KEGG",
    str_detect(pathway, "^REACTOME") ~ "Reactome",
    str_detect(pathway, "^GOBP") ~ "GO",
    TRUE ~ "Other"
  )) %>%
  mutate(direction_db = ifelse(NES > 0, paste0("+ ", database), paste0("- ", database)))

# 6. Optionally filter to top N pathways per cluster
top_terms <- gsea_df %>%
  group_by(cluster, database) %>%
  top_n(5, wt = abs(NES)) %>%
  ungroup()

# 7. Set fill palette for up/down-regulation
palette <- c(
  "Hallmark" = "#d62728", "KEGG" = "#2ca02c",
  "Reactome" = "#1f77b4", "GO" = "#9467bd"
)
palette <- c(
  `+ Hallmark` = "#d62728", `- Hallmark` = "#fcbba1",
  `+ KEGG` = "#2ca02c", `- KEGG` = "#b9f6ca",
  `+ Reactome` = "#1f77b4", `- Reactome` = "#aec7e8",
  `+ GO` = "#9467bd", `- GO` = "#d5bde3"
)

# 8. Plot
p3 <- ggplot(top_terms, aes(x = fct_reorder(pathway, NES), y = NES, fill = direction_db)) +
  geom_col(width = 0.9) +
  coord_flip() +
  facet_grid(database ~ cluster, scales = "free_y", space = "free") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  labs(
    x = "Pathway",
    y = "Normalized Enrichment Score (NES)",
    title = "Top GSEA Pathways per Cluster",
    fill = "Direction + DB"
  ) +
  scale_fill_manual(values = palette) +
  theme_minimal(base_size = 16) +
  theme(
    strip.text = element_text(face = "bold", size = 16),
    strip.background = element_rect(fill = "gray90", color = "black"),
    strip.placement = "outside",
    legend.position = "bottom",
    axis.text.y = element_text(face = "bold", size = 13),
    axis.text.x = element_text(size = 14),
    panel.spacing = unit(1, "lines"),
    panel.border = element_rect(color = "black", fill = NA, size = 1.5)
  )

# 9. Save the plot
ggsave("Top_GSEA_Pathways_Per_Cluster3.pdf", p3, width = 30, height = 58, limitsize = FALSE)

p3
```


## Combine and Save All GSEA Results
```{r , fig.height=18, fig.width=20}
# Required packages
library(msigdbr)
library(fgsea)
library(dplyr)
library(tibble)
library(forcats)
library(ggplot2)
library(pheatmap)
library(stringr)

# 1. Curated Sézary syndrome–relevant MSigDB pathways
curated_pathways <- c(
  # Hallmark
  "HALLMARK_JAK_STAT3_SIGNALING", "HALLMARK_TNFA_SIGNALING_VIA_NFKB", "HALLMARK_APOPTOSIS",
  # KEGG
  "KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY", "KEGG_JAK_STAT_SIGNALING_PATHWAY",
  "KEGG_PI3K_AKT_SIGNALING_PATHWAY", "KEGG_MAPK_SIGNALING_PATHWAY",
  "KEGG_TGF_BETA_SIGNALING_PATHWAY", "KEGG_CELL_CYCLE", "KEGG_APOPTOSIS",
  # Reactome
  "REACTOME_SIGNALING_BY_TCR", "REACTOME_SIGNALING_BY_JAK_STAT",
  "REACTOME_PI3K_AKT_ACTIVATION", "REACTOME_NFKB_ACTIVATION",
  # GO (Biological Process)
  "GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY", "GOBP_T_CELL_ACTIVATION",
  "GOBP_CELL_CYCLE_PROCESS", "GOBP_APOPTOTIC_PROCESS",
  "GOBP_MAPK_CASCADE", "GOBP_NF_KAPPA_B_TRANSCRIPTION_FACTOR_ACTIVITY",
  "GOBP_JAK_STAT_CASCADE", "GOBP_EPIGENETIC_REGULATION"
)

# 2. Load MSigDB genes
msig_df <- msigdbr(species = "Homo sapiens") %>%
  filter(gs_name %in% curated_pathways)

pathways_list <- split(msig_df$gene_symbol, msig_df$gs_name)

# 3. Split FindAllMarkers data frame by cluster
cluster_stats <- split(all_markers, all_markers$cluster)

# 4. Run fgsea using avg_log2FC as ranking metric
fgsea_results <- lapply(cluster_stats, function(stats_df) {
  # Ensure avg_log2FC exists and gene names are present
  stats_df <- stats_df %>%
    filter(!is.na(avg_log2FC)) %>%
    distinct(gene, .keep_all = TRUE)

  # Check if gene column exists
  if (!"gene" %in% colnames(stats_df)) {
    warning("No 'gene' column found in stats_df")
    return(NULL)
  }

  # Create rank vector
  ranks <- stats_df$avg_log2FC
  names(ranks) <- stats_df$gene
  ranks <- sort(ranks, decreasing = TRUE)

  # Optional: skip small rank vectors
  if (length(ranks) < 100) return(NULL)

   fgsea(
    pathways = pathways_list,
    stats = sort(ranks, decreasing = TRUE),
    minSize = 10,
    maxSize = 300,
    nperm = 10000
  )
})


# 5. Combine and annotate results
gsea_df <- bind_rows(lapply(names(fgsea_results), function(cluster) {
  res <- fgsea_results[[cluster]]
  res$cluster <- cluster
  res
}))

# 6. Ensure all pathways appear across clusters
all_combos <- expand.grid(
  cluster = names(cluster_stats),
  pathway = curated_pathways,
  stringsAsFactors = FALSE
)

gsea_complete <- all_combos %>%
  left_join(gsea_df, by = c("cluster", "pathway")) %>%
  mutate(
    NES = ifelse(is.na(NES), 0, NES),
    padj = ifelse(is.na(padj), 1, padj),
    database = case_when(
      str_detect(pathway, "^HALLMARK") ~ "Hallmark",
      str_detect(pathway, "^KEGG") ~ "KEGG",
      str_detect(pathway, "^REACTOME") ~ "Reactome",
      str_detect(pathway, "^GOBP") ~ "GO",
      TRUE ~ "Other"
    ),
    direction_db = ifelse(NES > 0, paste0("+ ", database), paste0("- ", database)),
    sig = ifelse(padj < 0.05, "*", "")
  )

# 7. Color palette
palette <- c(
  `+ Hallmark` = "#d62728", `- Hallmark` = "#fcbba1",
  `+ KEGG` = "#2ca02c", `- KEGG` = "#b9f6ca",
  `+ Reactome` = "#1f77b4", `- Reactome` = "#aec7e8",
  `+ GO` = "#9467bd", `- GO` = "#d5bde3"
)

# 8. Plot
p4 <- ggplot(gsea_complete, aes(x = fct_reorder(pathway, NES), y = NES, fill = direction_db)) +
  geom_col(width = 0.9) +
  geom_text(aes(label = sig), hjust = -0.3, size = 5, fontface = "bold", color = "black") +
  coord_flip() +
  facet_grid(database ~ cluster, scales = "free_y", space = "free") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  labs(
    x = "Pathway",
    y = "Normalized Enrichment Score (NES)",
    title = "GSEA (avg_log2FC only) — Curated Pathways per Cluster",
    fill = "Enrichment"
  ) +
  scale_fill_manual(values = palette, na.value = "gray80") +
  theme_minimal(base_size = 16) +
  theme(
    strip.text = element_text(face = "bold", size = 16),
    strip.background = element_rect(fill = "gray90", color = "black"),
    legend.position = "bottom",
    axis.text.y = element_text(face = "bold", size = 13),
    axis.text.x = element_text(size = 14),
    panel.spacing = unit(1, "lines"),
    panel.border = element_rect(color = "black", fill = NA, size = 1.5)
  )

# 9. Save to PDF
ggsave("Curated_GSEA_avgLogFC_Only.pdf", p4, width = 30, height = 58, limitsize = FALSE)

p4
```


