1. Load Libraries

2. load seurat object

#Load Seurat Object
All_samples_Merged <- readRDS("/home/nabbasi/isilon/PHD_3rd_YEAR_Analysis/0-Seurat_RDS_OBJECT_FINAL/All_samples_Merged_Harmony_integrated_Cell_line_renamed_03-07-2025.rds")


All_samples_Merged
An object of class Seurat 
62900 features across 49305 samples within 6 assays 
Active assay: SCT (26176 features, 3000 variable features)
 3 layers present: counts, data, scale.data
 5 other assays present: RNA, ADT, prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3
 5 dimensional reductions calculated: integrated_dr, ref.umap, pca, umap, harmony

3. Set Up Identifiers for Clustering

# Assign cluster identities to the Seurat object
Idents(All_samples_Merged) <- "seurat_clusters"

DimPlot(All_samples_Merged, reduction = "umap", group.by = "cell_line",label = T, label.box = T)

DimPlot(All_samples_Merged, reduction = "umap", group.by = "seurat_clusters",label = T, label.box = T)

4. Use FindAllMarkers() for DE on RNA assay

# Load necessary libraries
library(Seurat)

DefaultAssay(All_samples_Merged) <- "RNA"

All_samples_Merged <- NormalizeData(
  All_samples_Merged, 
  assay = "RNA", 
  normalization.method = "LogNormalize", 
  scale.factor = 10000
)

Idents(All_samples_Merged) <- "seurat_clusters"

all_markers <- FindAllMarkers(
  object = All_samples_Merged,
  assay = "RNA",
  logfc.threshold = 0,
  min.pct = 0,
  test.use = "wilcox",
  only.pos = FALSE,
  return.thresh = 1 # Keep all genes, don't filter by p-value
)

5. Load Gene Sets from MSigDB

library(Seurat)
library(dplyr)
library(purrr)
library(fgsea)
library(msigdbr)
library(ggplot2)
library(stringr)
library(forcats)

# 2. Load Gene Sets from MSigDB (your exact code)

# Hallmark (no subcollection needed)
hallmark_sets <- msigdbr(species = "Homo sapiens", category = "H") %>%
  split(x = .$gene_symbol, f = .$gs_name)
Avis : The `category` argument of `msigdbr()` is deprecated as of msigdbr 10.0.0.
Please use the `collection` argument instead.
# KEGG legacy pathways
kegg_sets <- msigdbr(
    species      = "Homo sapiens",
    category     = "C2",
    subcollection = "CP:KEGG_LEGACY"
  ) %>%
  split(x = .$gene_symbol, f = .$gs_name)

# Reactome pathways
reactome_sets <- msigdbr(
    species      = "Homo sapiens",
    category     = "C2",
    subcollection = "CP:REACTOME"
  ) %>%
  split(x = .$gene_symbol, f = .$gs_name)

# Immunologic signatures
c7_sets <- msigdbr(species = "Homo sapiens", category = "C7") %>%
  split(x = .$gene_symbol, f = .$gs_name)

# GO Biological Process, CC, MF
c5_bp <- msigdbr(species = "Homo sapiens", category = "C5", subcollection = "BP") %>%
  split(x = .$gene_symbol, f = .$gs_name)


# Combine all sets
msigdb_sets <- list(
  Hallmark = hallmark_sets,
  KEGG     = kegg_sets,
  Reactome = reactome_sets,
  C7       = c7_sets,
  GO_BP    = c5_bp
)

4. Run GSEA per Cluster using DE Files


# 3. Prepare output list for storing FGSEA results
gsea_all <- list()

# 4. Get unique clusters from all_markers
clusters <- unique(all_markers$cluster)

# 5. Loop over each cluster and run FGSEA
for (cluster_id in clusters) {
  
  cat("Processing cluster:", cluster_id, "\n")
  
  # Subset DE genes for cluster vs rest
  de_df <- all_markers %>%
    filter(cluster == cluster_id) %>%
    filter(!is.na(avg_log2FC))
  
  # Create named vector ranked by avg_log2FC (descending)
  ranked_genes <- deframe(de_df %>% select(gene, avg_log2FC))
  ranked_genes <- sort(ranked_genes, decreasing = TRUE)
  
  # Run FGSEA on each MSigDB collection
  for (db_name in names(msigdb_sets)) {
    fgsea_res <- fgsea(
      pathways = msigdb_sets[[db_name]],
      stats = ranked_genes,
      minSize = 10,
      maxSize = 300,
      nperm = 10000
    )
    
    if (nrow(fgsea_res) > 0) {
      fgsea_res$cluster <- cluster_id
      fgsea_res$database <- db_name
      
      # Save in list with unique name
      gsea_all[[paste0("Cluster_", cluster_id, "_", db_name)]] <- fgsea_res
    }
  }
}
Processing cluster: 0 
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.37% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.37% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.37% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.37% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.37% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Processing cluster: 1 
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.39% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.39% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.39% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.39% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.39% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Processing cluster: 2 
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Processing cluster: 3 
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.42% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.42% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.42% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.42% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.42% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Processing cluster: 4 
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.4% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.4% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.4% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.4% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.4% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Processing cluster: 5 
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : There were 3 pathways for which P-values were not calculated properly due to unbalanced gene-level statistic valuesAvis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Processing cluster: 6 
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.4% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.4% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.4% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.4% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.4% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Processing cluster: 7 
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Processing cluster: 8 
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Processing cluster: 9 
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Processing cluster: 10 
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.42% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.42% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.42% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.42% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.42% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Processing cluster: 11 
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.43% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : There were 15 pathways for which P-values were not calculated properly due to unbalanced gene-level statistic valuesAvis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.43% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.43% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : There were 18 pathways for which P-values were not calculated properly due to unbalanced gene-level statistic valuesAvis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.43% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : There were 15 pathways for which P-values were not calculated properly due to unbalanced gene-level statistic valuesAvis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.43% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : There were 98 pathways for which P-values were not calculated properly due to unbalanced gene-level statistic values
Processing cluster: 12 
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : There were 14 pathways for which P-values were not calculated properly due to unbalanced gene-level statistic valuesAvis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : There were 6 pathways for which P-values were not calculated properly due to unbalanced gene-level statistic valuesAvis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : There were 129 pathways for which P-values were not calculated properly due to unbalanced gene-level statistic valuesAvis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : There were 2517 pathways for which P-values were not calculated properly due to unbalanced gene-level statistic valuesAvis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : There were 198 pathways for which P-values were not calculated properly due to unbalanced gene-level statistic values
Processing cluster: 13 
Avis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : There were 18 pathways for which P-values were not calculated properly due to unbalanced gene-level statistic valuesAvis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : There were 16 pathways for which P-values were not calculated properly due to unbalanced gene-level statistic valuesAvis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : There were 110 pathways for which P-values were not calculated properly due to unbalanced gene-level statistic valuesAvis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : There were 3342 pathways for which P-values were not calculated properly due to unbalanced gene-level statistic valuesAvis : You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument in the fgsea function call.Avis : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : There were 368 pathways for which P-values were not calculated properly due to unbalanced gene-level statistic values
setwd("/isilon/homes/nabbasi/PHD_3rd_YEAR_Analysis/5-Clusters_vs_Control_fgsea/fgsea_on_findAllMarkers")

Combine and Save All GSEA Results

# 6. Combine all GSEA results into one dataframe
gsea_df <- bind_rows(gsea_all)

# 7. Flatten leadingEdge list column into a semicolon-separated string
gsea_df <- gsea_df %>%
  mutate(leadingEdge = map_chr(leadingEdge, ~ paste(.x, collapse = ";")))

# 8. Save combined results (optional)
write.csv(gsea_df, "GSEA_MSigDB_Results_Clusterwise_findAllMarkers_used.csv", row.names = FALSE)

# 9. Create folders and save cluster-wise results separately
gsea_by_cluster <- split(gsea_df, gsea_df$cluster)

for (cluster_id in names(gsea_by_cluster)) {
  folder_name <- paste0("Cluster_", cluster_id)
  
  if (!dir.exists(folder_name)) {
    dir.create(folder_name)
  }
  
  file_path <- file.path(folder_name, paste0("GSEA_Cluster_", cluster_id, ".csv"))
  write.csv(gsea_by_cluster[[cluster_id]], file = file_path, row.names = FALSE)
}

Combine and Save All GSEA Results

# 10. Plot top GSEA pathways per cluster & collection (top 3 per cluster/db, padj<0.05)
top_terms <- gsea_df %>%
  filter(padj < 0.05) %>%
  group_by(cluster, database) %>%
  slice_max(NES, n = 10, with_ties = FALSE) %>%
  ungroup()

# Clean pathway names
top_terms$pathway <- gsub("GO_|REACTOME_|KEGG_", "", top_terms$pathway)
top_terms$pathway <- gsub("_", " ", top_terms$pathway)
top_terms$pathway <- str_to_title(top_terms$pathway)
top_terms$pathway <- str_trunc(top_terms$pathway, width = 60)

# Plot using ggplot2
p1 <- ggplot(top_terms, aes(x = fct_reorder(pathway, NES), y = NES, fill = database)) +
  geom_col(width = 0.9) +
  coord_flip() +
  facet_grid(database ~ cluster, scales = "free_y", space = "free") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  labs(
    x = "Pathway",
    y = "Normalized Enrichment Score (NES)",
    title = "Top GSEA Pathways per Cluster",
    fill = "Database"
  ) +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal(base_size = 16) +
  theme(
    strip.text = element_text(face = "bold", size = 16),
    strip.background = element_rect(fill = "gray90", color = "black"),
    strip.placement = "outside",
    legend.position = "bottom",
    axis.text.y = element_text(face = "bold", size = 13),
    axis.text.x = element_text(size = 14),
    panel.spacing = unit(1, "lines"),
    panel.border = element_rect(color = "black", fill = NA, size = 1.5)
  )

 ggsave("Top_GSEA_Pathways_Per_Cluster1.pdf", p, width = 30, height = 58, limitsize = FALSE)

p1

Combine and Save All GSEA Results

library(ggplot2)
library(ggforce)
library(forcats)
library(dplyr)
library(stringr)

# Force cluster order
top_terms$cluster <- factor(top_terms$cluster, levels = as.character(0:13))

# Clean and shorten pathway names
top_terms <- top_terms %>%
  mutate(
    pathway = gsub("GO_|REACTOME_|KEGG_", "", pathway),
    pathway = gsub("_", " ", pathway),
    pathway = str_to_title(pathway),
    pathway = str_trunc(pathway, width = 60)
  )

# Add direction column
top_terms <- top_terms %>%
  mutate(direction = ifelse(NES > 0, "Upregulated", "Downregulated"))

# Use direction and database together for color
top_terms$direction_db <- interaction(top_terms$direction, top_terms$database, sep = "_")

# Define custom colors for each direction+db combo (optional: customize as you like)
n_colors <- length(unique(top_terms$direction_db))
palette <- scales::hue_pal()(n_colors)
names(palette) <- unique(top_terms$direction_db)

# Plot
p2 <- ggplot(top_terms, aes(x = fct_reorder(pathway, NES), y = NES, fill = direction_db)) +
  geom_col(width = 0.9) +
  coord_flip() +
  facet_grid(database ~ cluster, scales = "free_y", space = "free") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  labs(
    x = "Pathway",
    y = "Normalized Enrichment Score (NES)",
    title = "Top GSEA Pathways per Cluster",
    fill = "Direction + DB"
  ) +
  scale_fill_manual(values = palette) +
  theme_minimal(base_size = 16) +
  theme(
    strip.text = element_text(face = "bold", size = 16),
    strip.background = element_rect(fill = "gray90", color = "black"),
    strip.placement = "outside",
    legend.position = "bottom",
    axis.text.y = element_text(face = "bold", size = 13),
    axis.text.x = element_text(size = 14),
    panel.spacing = unit(1, "lines"),
    panel.border = element_rect(color = "black", fill = NA, size = 1.5)
  )

# Save with correct size override
 ggsave("Top_GSEA_Pathways_Per_Cluster2.pdf", p, width = 30, height = 58, limitsize = FALSE)
p2

Combine and Save All GSEA Results

Combine and Save All GSEA Results

# Required packages
library(msigdbr)
library(fgsea)
library(dplyr)
library(tibble)
library(forcats)
library(ggplot2)
library(pheatmap)
library(stringr)

# 1. Curated Sézary syndrome–relevant MSigDB pathways
curated_pathways <- c(
  # Hallmark
  "HALLMARK_JAK_STAT3_SIGNALING", "HALLMARK_TNFA_SIGNALING_VIA_NFKB", "HALLMARK_APOPTOSIS",
  # KEGG
  "KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY", "KEGG_JAK_STAT_SIGNALING_PATHWAY",
  "KEGG_PI3K_AKT_SIGNALING_PATHWAY", "KEGG_MAPK_SIGNALING_PATHWAY",
  "KEGG_TGF_BETA_SIGNALING_PATHWAY", "KEGG_CELL_CYCLE", "KEGG_APOPTOSIS",
  # Reactome
  "REACTOME_SIGNALING_BY_TCR", "REACTOME_SIGNALING_BY_JAK_STAT",
  "REACTOME_PI3K_AKT_ACTIVATION", "REACTOME_NFKB_ACTIVATION",
  # GO (Biological Process)
  "GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY", "GOBP_T_CELL_ACTIVATION",
  "GOBP_CELL_CYCLE_PROCESS", "GOBP_APOPTOTIC_PROCESS",
  "GOBP_MAPK_CASCADE", "GOBP_NF_KAPPA_B_TRANSCRIPTION_FACTOR_ACTIVITY",
  "GOBP_JAK_STAT_CASCADE", "GOBP_EPIGENETIC_REGULATION"
)

# 2. Load MSigDB genes
msig_df <- msigdbr(species = "Homo sapiens") %>%
  filter(gs_name %in% curated_pathways)

pathways_list <- split(msig_df$gene_symbol, msig_df$gs_name)

# 3. Split FindAllMarkers data frame by cluster
cluster_stats <- split(all_markers, all_markers$cluster)

# 4. Run fgsea using avg_log2FC as ranking metric
fgsea_results <- lapply(cluster_stats, function(stats_df) {
  # Ensure avg_log2FC exists and gene names are present
  stats_df <- stats_df %>%
    filter(!is.na(avg_log2FC)) %>%
    distinct(gene, .keep_all = TRUE)

  # Check if gene column exists
  if (!"gene" %in% colnames(stats_df)) {
    warning("No 'gene' column found in stats_df")
    return(NULL)
  }

  # Create rank vector
  ranks <- stats_df$avg_log2FC
  names(ranks) <- stats_df$gene
  ranks <- sort(ranks, decreasing = TRUE)

  # Optional: skip small rank vectors
  if (length(ranks) < 100) return(NULL)

   fgsea(
    pathways = pathways_list,
    stats = sort(ranks, decreasing = TRUE),
    minSize = 10,
    maxSize = 300,
    nperm = 10000
  )
})


# 5. Combine and annotate results
gsea_df <- bind_rows(lapply(names(fgsea_results), function(cluster) {
  res <- fgsea_results[[cluster]]
  res$cluster <- cluster
  res
}))

# 6. Ensure all pathways appear across clusters
all_combos <- expand.grid(
  cluster = names(cluster_stats),
  pathway = curated_pathways,
  stringsAsFactors = FALSE
)

gsea_complete <- all_combos %>%
  left_join(gsea_df, by = c("cluster", "pathway")) %>%
  mutate(
    NES = ifelse(is.na(NES), 0, NES),
    padj = ifelse(is.na(padj), 1, padj),
    database = case_when(
      str_detect(pathway, "^HALLMARK") ~ "Hallmark",
      str_detect(pathway, "^KEGG") ~ "KEGG",
      str_detect(pathway, "^REACTOME") ~ "Reactome",
      str_detect(pathway, "^GOBP") ~ "GO",
      TRUE ~ "Other"
    ),
    direction_db = ifelse(NES > 0, paste0("+ ", database), paste0("- ", database)),
    sig = ifelse(padj < 0.05, "*", "")
  )

# 7. Color palette
palette <- c(
  `+ Hallmark` = "#d62728", `- Hallmark` = "#fcbba1",
  `+ KEGG` = "#2ca02c", `- KEGG` = "#b9f6ca",
  `+ Reactome` = "#1f77b4", `- Reactome` = "#aec7e8",
  `+ GO` = "#9467bd", `- GO` = "#d5bde3"
)

# 8. Plot
p4 <- ggplot(gsea_complete, aes(x = fct_reorder(pathway, NES), y = NES, fill = direction_db)) +
  geom_col(width = 0.9) +
  geom_text(aes(label = sig), hjust = -0.3, size = 5, fontface = "bold", color = "black") +
  coord_flip() +
  facet_grid(database ~ cluster, scales = "free_y", space = "free") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  labs(
    x = "Pathway",
    y = "Normalized Enrichment Score (NES)",
    title = "GSEA (avg_log2FC only) — Curated Pathways per Cluster",
    fill = "Enrichment"
  ) +
  scale_fill_manual(values = palette, na.value = "gray80") +
  theme_minimal(base_size = 16) +
  theme(
    strip.text = element_text(face = "bold", size = 16),
    strip.background = element_rect(fill = "gray90", color = "black"),
    legend.position = "bottom",
    axis.text.y = element_text(face = "bold", size = 13),
    axis.text.x = element_text(size = 14),
    panel.spacing = unit(1, "lines"),
    panel.border = element_rect(color = "black", fill = NA, size = 1.5)
  )

# 9. Save to PDF
ggsave("Curated_GSEA_avgLogFC_Only.pdf", p4, width = 30, height = 58, limitsize = FALSE)

p4
---
title: "findAllMarkers-NewUMAP fgsea Analysis"
author: "Nasir Mahmood Abbasi"
date: "`r Sys.Date()`"
output:
  html_notebook:
    toc: yes
    toc_float: yes
    toc_collapsed: yes
  pdf_document:
    toc: yes
---

# 1. Load Libraries
```{r setup, include=FALSE}
library(Seurat)
library(harmony)
library(dplyr)
library(EnhancedVolcano)
library(ggplot2)
library(pheatmap)
library(tibble)
```


# 2. load seurat object
```{r load_seurat}
#Load Seurat Object
All_samples_Merged <- readRDS("/home/nabbasi/isilon/PHD_3rd_YEAR_Analysis/0-Seurat_RDS_OBJECT_FINAL/All_samples_Merged_Harmony_integrated_Cell_line_renamed_03-07-2025.rds")


All_samples_Merged
```

# 3. Set Up Identifiers for Clustering
```{r}
# Assign cluster identities to the Seurat object
Idents(All_samples_Merged) <- "seurat_clusters"

DimPlot(All_samples_Merged, reduction = "umap", group.by = "cell_line",label = T, label.box = T)
DimPlot(All_samples_Merged, reduction = "umap", group.by = "seurat_clusters",label = T, label.box = T)

```

# 4. Use FindAllMarkers() for DE on RNA assay
```{r}
# Load necessary libraries
library(Seurat)

DefaultAssay(All_samples_Merged) <- "RNA"

All_samples_Merged <- NormalizeData(
  All_samples_Merged, 
  assay = "RNA", 
  normalization.method = "LogNormalize", 
  scale.factor = 10000
)

Idents(All_samples_Merged) <- "seurat_clusters"

all_markers <- FindAllMarkers(
  object = All_samples_Merged,
  assay = "RNA",
  logfc.threshold = 0,
  min.pct = 0,
  test.use = "wilcox",
  only.pos = FALSE,
  return.thresh = 1 # Keep all genes, don't filter by p-value
)

```

# 5.   Load Gene Sets from MSigDB
```{r , fig.height=8, fig.width=12}
library(Seurat)
library(dplyr)
library(purrr)
library(fgsea)
library(msigdbr)
library(ggplot2)
library(stringr)
library(forcats)

# 2. Load Gene Sets from MSigDB (your exact code)

# Hallmark (no subcollection needed)
hallmark_sets <- msigdbr(species = "Homo sapiens", category = "H") %>%
  split(x = .$gene_symbol, f = .$gs_name)

# KEGG legacy pathways
kegg_sets <- msigdbr(
    species      = "Homo sapiens",
    category     = "C2",
    subcollection = "CP:KEGG_LEGACY"
  ) %>%
  split(x = .$gene_symbol, f = .$gs_name)

# Reactome pathways
reactome_sets <- msigdbr(
    species      = "Homo sapiens",
    category     = "C2",
    subcollection = "CP:REACTOME"
  ) %>%
  split(x = .$gene_symbol, f = .$gs_name)

# Immunologic signatures
c7_sets <- msigdbr(species = "Homo sapiens", category = "C7") %>%
  split(x = .$gene_symbol, f = .$gs_name)

# GO Biological Process, CC, MF
c5_bp <- msigdbr(species = "Homo sapiens", category = "C5", subcollection = "BP") %>%
  split(x = .$gene_symbol, f = .$gs_name)


# Combine all sets
msigdb_sets <- list(
  Hallmark = hallmark_sets,
  KEGG     = kegg_sets,
  Reactome = reactome_sets,
  C7       = c7_sets,
  GO_BP    = c5_bp
)

```



# 4.  Run GSEA per Cluster using DE Files
```{r , fig.height=8, fig.width=12}

# 3. Prepare output list for storing FGSEA results
gsea_all <- list()

# 4. Get unique clusters from all_markers
clusters <- unique(all_markers$cluster)

# 5. Loop over each cluster and run FGSEA
for (cluster_id in clusters) {
  
  cat("Processing cluster:", cluster_id, "\n")
  
  # Subset DE genes for cluster vs rest
  de_df <- all_markers %>%
    filter(cluster == cluster_id) %>%
    filter(!is.na(avg_log2FC))
  
  # Create named vector ranked by avg_log2FC (descending)
  ranked_genes <- deframe(de_df %>% select(gene, avg_log2FC))
  ranked_genes <- sort(ranked_genes, decreasing = TRUE)
  
  # Run FGSEA on each MSigDB collection
  for (db_name in names(msigdb_sets)) {
    fgsea_res <- fgsea(
      pathways = msigdb_sets[[db_name]],
      stats = ranked_genes,
      minSize = 10,
      maxSize = 300,
      nperm = 10000
    )
    
    if (nrow(fgsea_res) > 0) {
      fgsea_res$cluster <- cluster_id
      fgsea_res$database <- db_name
      
      # Save in list with unique name
      gsea_all[[paste0("Cluster_", cluster_id, "_", db_name)]] <- fgsea_res
    }
  }
}


```

## Combine and Save All GSEA Results
```{r , fig.height=8, fig.width=12}
# 6. Combine all GSEA results into one dataframe
gsea_df <- bind_rows(gsea_all)

# 7. Flatten leadingEdge list column into a semicolon-separated string
gsea_df <- gsea_df %>%
  mutate(leadingEdge = map_chr(leadingEdge, ~ paste(.x, collapse = ";")))

# 8. Save combined results (optional)
write.csv(gsea_df, "GSEA_MSigDB_Results_Clusterwise_findAllMarkers_used.csv", row.names = FALSE)

# 9. Create folders and save cluster-wise results separately
gsea_by_cluster <- split(gsea_df, gsea_df$cluster)

for (cluster_id in names(gsea_by_cluster)) {
  folder_name <- paste0("Cluster_", cluster_id)
  
  if (!dir.exists(folder_name)) {
    dir.create(folder_name)
  }
  
  file_path <- file.path(folder_name, paste0("GSEA_Cluster_", cluster_id, ".csv"))
  write.csv(gsea_by_cluster[[cluster_id]], file = file_path, row.names = FALSE)
}




```


## Combine and Save All GSEA Results
```{r , fig.height=30, fig.width=58}
# 10. Plot top GSEA pathways per cluster & collection (top 3 per cluster/db, padj<0.05)
top_terms <- gsea_df %>%
  filter(padj < 0.05) %>%
  group_by(cluster, database) %>%
  slice_max(NES, n = 10, with_ties = FALSE) %>%
  ungroup()

# Clean pathway names
top_terms$pathway <- gsub("GO_|REACTOME_|KEGG_", "", top_terms$pathway)
top_terms$pathway <- gsub("_", " ", top_terms$pathway)
top_terms$pathway <- str_to_title(top_terms$pathway)
top_terms$pathway <- str_trunc(top_terms$pathway, width = 60)

# Plot using ggplot2
p1 <- ggplot(top_terms, aes(x = fct_reorder(pathway, NES), y = NES, fill = database)) +
  geom_col(width = 0.9) +
  coord_flip() +
  facet_grid(database ~ cluster, scales = "free_y", space = "free") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  labs(
    x = "Pathway",
    y = "Normalized Enrichment Score (NES)",
    title = "Top GSEA Pathways per Cluster",
    fill = "Database"
  ) +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal(base_size = 16) +
  theme(
    strip.text = element_text(face = "bold", size = 16),
    strip.background = element_rect(fill = "gray90", color = "black"),
    strip.placement = "outside",
    legend.position = "bottom",
    axis.text.y = element_text(face = "bold", size = 13),
    axis.text.x = element_text(size = 14),
    panel.spacing = unit(1, "lines"),
    panel.border = element_rect(color = "black", fill = NA, size = 1.5)
  )

 ggsave("Top_GSEA_Pathways_Per_Cluster1.pdf", p, width = 30, height = 58, limitsize = FALSE)

p1

```


## Combine and Save All GSEA Results
```{r , fig.height=30, fig.width=58}
library(ggplot2)
library(ggforce)
library(forcats)
library(dplyr)
library(stringr)

# Force cluster order
top_terms$cluster <- factor(top_terms$cluster, levels = as.character(0:13))

# Clean and shorten pathway names
top_terms <- top_terms %>%
  mutate(
    pathway = gsub("GO_|REACTOME_|KEGG_", "", pathway),
    pathway = gsub("_", " ", pathway),
    pathway = str_to_title(pathway),
    pathway = str_trunc(pathway, width = 60)
  )

# Add direction column
top_terms <- top_terms %>%
  mutate(direction = ifelse(NES > 0, "Upregulated", "Downregulated"))

# Use direction and database together for color
top_terms$direction_db <- interaction(top_terms$direction, top_terms$database, sep = "_")

# Define custom colors for each direction+db combo (optional: customize as you like)
n_colors <- length(unique(top_terms$direction_db))
palette <- scales::hue_pal()(n_colors)
names(palette) <- unique(top_terms$direction_db)

# Plot
p2 <- ggplot(top_terms, aes(x = fct_reorder(pathway, NES), y = NES, fill = direction_db)) +
  geom_col(width = 0.9) +
  coord_flip() +
  facet_grid(database ~ cluster, scales = "free_y", space = "free") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  labs(
    x = "Pathway",
    y = "Normalized Enrichment Score (NES)",
    title = "Top GSEA Pathways per Cluster",
    fill = "Direction + DB"
  ) +
  scale_fill_manual(values = palette) +
  theme_minimal(base_size = 16) +
  theme(
    strip.text = element_text(face = "bold", size = 16),
    strip.background = element_rect(fill = "gray90", color = "black"),
    strip.placement = "outside",
    legend.position = "bottom",
    axis.text.y = element_text(face = "bold", size = 13),
    axis.text.x = element_text(size = 14),
    panel.spacing = unit(1, "lines"),
    panel.border = element_rect(color = "black", fill = NA, size = 1.5)
  )

# Save with correct size override
 ggsave("Top_GSEA_Pathways_Per_Cluster2.pdf", p, width = 30, height = 58, limitsize = FALSE)
p2
```

## Combine and Save All GSEA Results
```{r , fig.height=18, fig.width=20}
# Required packages
library(msigdbr)
library(fgsea)
library(dplyr)
library(tibble)
library(forcats)
library(ggplot2)
library(pheatmap)
library(stringr)

# Split the allmarkers (from FindAllMarkers) into a named list per cluster
cluster_stats <- split(all_markers, all_markers$cluster)


# 1. Curated Sézary syndrome–relevant pathways (use their exact MSigDB names)
curated_pathways <- c(
  # Hallmark
  "HALLMARK_JAK_STAT3_SIGNALING", "HALLMARK_TNFA_SIGNALING_VIA_NFKB", "HALLMARK_APOPTOSIS",
  # KEGG
  "KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY", "KEGG_JAK_STAT_SIGNALING_PATHWAY",
  "KEGG_PI3K_AKT_SIGNALING_PATHWAY", "KEGG_MAPK_SIGNALING_PATHWAY",
  "KEGG_TGF_BETA_SIGNALING_PATHWAY", "KEGG_CELL_CYCLE", "KEGG_APOPTOSIS",
  # Reactome
  "REACTOME_SIGNALING_BY_TCR", "REACTOME_SIGNALING_BY_JAK_STAT",
  "REACTOME_PI3K_AKT_ACTIVATION", "REACTOME_NFKB_ACTIVATION",
  # GO (Biological Process)
  "GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY", "GOBP_T_CELL_ACTIVATION",
  "GOBP_CELL_CYCLE_PROCESS", "GOBP_APOPTOTIC_PROCESS",
  "GOBP_MAPK_CASCADE", "GOBP_NF_KAPPA_B_TRANSCRIPTION_FACTOR_ACTIVITY",
  "GOBP_JAK_STAT_CASCADE", "GOBP_EPIGENETIC_REGULATION"
)

# 2. Load pathways from msigdbr
msig_df <- msigdbr(species = "Homo sapiens") %>%
  filter(gs_name %in% curated_pathways)

pathways_list <- split(msig_df$gene_symbol, msig_df$gs_name)

# 3. Your input: a list of ranked gene stats (e.g., from Seurat::FindMarkers) for each cluster
# Example: cluster_stats is a named list of data.frames, each containing gene, avg_log2FC, p_val_adj
# Example of making rank vector: stats$avg_log2FC * -log10(stats$p_val_adj + 1e-300)

# Replace with your real cluster-level stats list
# cluster_stats <- list(clusterA = df_A, clusterB = df_B, ...)

# 4. Run fgsea for each cluster
fgsea_results <- lapply(cluster_stats, function(stats_df) {
  stats_df <- stats_df %>% filter(!is.na(p_val_adj), !is.na(avg_log2FC))
  ranks <- stats_df$avg_log2FC * -log10(stats_df$p_val_adj + 1e-300)
  names(ranks) <- rownames(stats_df)
  fgsea(pathways = pathways_list, stats = sort(ranks, decreasing = TRUE), nperm = 5000)
})

# 5. Combine and annotate
gsea_df <- bind_rows(lapply(names(fgsea_results), function(cluster) {
  res <- fgsea_results[[cluster]]
  res$cluster <- cluster
  res
}))

# Filter to curated pathways (already done but good to double check)
gsea_df <- gsea_df %>% filter(pathway %in% curated_pathways)

# Add pathway source label
gsea_df <- gsea_df %>%
  mutate(database = case_when(
    str_detect(pathway, "^HALLMARK") ~ "Hallmark",
    str_detect(pathway, "^KEGG") ~ "KEGG",
    str_detect(pathway, "^REACTOME") ~ "Reactome",
    str_detect(pathway, "^GOBP") ~ "GO",
    TRUE ~ "Other"
  )) %>%
  mutate(direction_db = ifelse(NES > 0, paste0("+ ", database), paste0("- ", database)))

# 6. Optionally filter to top N pathways per cluster
top_terms <- gsea_df %>%
  group_by(cluster, database) %>%
  top_n(5, wt = abs(NES)) %>%
  ungroup()

# 7. Set fill palette for up/down-regulation
palette <- c(
  "Hallmark" = "#d62728", "KEGG" = "#2ca02c",
  "Reactome" = "#1f77b4", "GO" = "#9467bd"
)
palette <- c(
  `+ Hallmark` = "#d62728", `- Hallmark` = "#fcbba1",
  `+ KEGG` = "#2ca02c", `- KEGG` = "#b9f6ca",
  `+ Reactome` = "#1f77b4", `- Reactome` = "#aec7e8",
  `+ GO` = "#9467bd", `- GO` = "#d5bde3"
)

# 8. Plot
p3 <- ggplot(top_terms, aes(x = fct_reorder(pathway, NES), y = NES, fill = direction_db)) +
  geom_col(width = 0.9) +
  coord_flip() +
  facet_grid(database ~ cluster, scales = "free_y", space = "free") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  labs(
    x = "Pathway",
    y = "Normalized Enrichment Score (NES)",
    title = "Top GSEA Pathways per Cluster",
    fill = "Direction + DB"
  ) +
  scale_fill_manual(values = palette) +
  theme_minimal(base_size = 16) +
  theme(
    strip.text = element_text(face = "bold", size = 16),
    strip.background = element_rect(fill = "gray90", color = "black"),
    strip.placement = "outside",
    legend.position = "bottom",
    axis.text.y = element_text(face = "bold", size = 13),
    axis.text.x = element_text(size = 14),
    panel.spacing = unit(1, "lines"),
    panel.border = element_rect(color = "black", fill = NA, size = 1.5)
  )

# 9. Save the plot
ggsave("Top_GSEA_Pathways_Per_Cluster3.pdf", p3, width = 30, height = 58, limitsize = FALSE)

p3
```


## Combine and Save All GSEA Results
```{r , fig.height=18, fig.width=20}
# Required packages
library(msigdbr)
library(fgsea)
library(dplyr)
library(tibble)
library(forcats)
library(ggplot2)
library(pheatmap)
library(stringr)

# 1. Curated Sézary syndrome–relevant MSigDB pathways
curated_pathways <- c(
  # Hallmark
  "HALLMARK_JAK_STAT3_SIGNALING", "HALLMARK_TNFA_SIGNALING_VIA_NFKB", "HALLMARK_APOPTOSIS",
  # KEGG
  "KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY", "KEGG_JAK_STAT_SIGNALING_PATHWAY",
  "KEGG_PI3K_AKT_SIGNALING_PATHWAY", "KEGG_MAPK_SIGNALING_PATHWAY",
  "KEGG_TGF_BETA_SIGNALING_PATHWAY", "KEGG_CELL_CYCLE", "KEGG_APOPTOSIS",
  # Reactome
  "REACTOME_SIGNALING_BY_TCR", "REACTOME_SIGNALING_BY_JAK_STAT",
  "REACTOME_PI3K_AKT_ACTIVATION", "REACTOME_NFKB_ACTIVATION",
  # GO (Biological Process)
  "GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY", "GOBP_T_CELL_ACTIVATION",
  "GOBP_CELL_CYCLE_PROCESS", "GOBP_APOPTOTIC_PROCESS",
  "GOBP_MAPK_CASCADE", "GOBP_NF_KAPPA_B_TRANSCRIPTION_FACTOR_ACTIVITY",
  "GOBP_JAK_STAT_CASCADE", "GOBP_EPIGENETIC_REGULATION"
)

# 2. Load MSigDB genes
msig_df <- msigdbr(species = "Homo sapiens") %>%
  filter(gs_name %in% curated_pathways)

pathways_list <- split(msig_df$gene_symbol, msig_df$gs_name)

# 3. Split FindAllMarkers data frame by cluster
cluster_stats <- split(all_markers, all_markers$cluster)

# 4. Run fgsea using avg_log2FC as ranking metric
fgsea_results <- lapply(cluster_stats, function(stats_df) {
  # Ensure avg_log2FC exists and gene names are present
  stats_df <- stats_df %>%
    filter(!is.na(avg_log2FC)) %>%
    distinct(gene, .keep_all = TRUE)

  # Check if gene column exists
  if (!"gene" %in% colnames(stats_df)) {
    warning("No 'gene' column found in stats_df")
    return(NULL)
  }

  # Create rank vector
  ranks <- stats_df$avg_log2FC
  names(ranks) <- stats_df$gene
  ranks <- sort(ranks, decreasing = TRUE)

  # Optional: skip small rank vectors
  if (length(ranks) < 100) return(NULL)

   fgsea(
    pathways = pathways_list,
    stats = sort(ranks, decreasing = TRUE),
    minSize = 10,
    maxSize = 300,
    nperm = 10000
  )
})


# 5. Combine and annotate results
gsea_df <- bind_rows(lapply(names(fgsea_results), function(cluster) {
  res <- fgsea_results[[cluster]]
  res$cluster <- cluster
  res
}))

# 6. Ensure all pathways appear across clusters
all_combos <- expand.grid(
  cluster = names(cluster_stats),
  pathway = curated_pathways,
  stringsAsFactors = FALSE
)

gsea_complete <- all_combos %>%
  left_join(gsea_df, by = c("cluster", "pathway")) %>%
  mutate(
    NES = ifelse(is.na(NES), 0, NES),
    padj = ifelse(is.na(padj), 1, padj),
    database = case_when(
      str_detect(pathway, "^HALLMARK") ~ "Hallmark",
      str_detect(pathway, "^KEGG") ~ "KEGG",
      str_detect(pathway, "^REACTOME") ~ "Reactome",
      str_detect(pathway, "^GOBP") ~ "GO",
      TRUE ~ "Other"
    ),
    direction_db = ifelse(NES > 0, paste0("+ ", database), paste0("- ", database)),
    sig = ifelse(padj < 0.05, "*", "")
  )

# 7. Color palette
palette <- c(
  `+ Hallmark` = "#d62728", `- Hallmark` = "#fcbba1",
  `+ KEGG` = "#2ca02c", `- KEGG` = "#b9f6ca",
  `+ Reactome` = "#1f77b4", `- Reactome` = "#aec7e8",
  `+ GO` = "#9467bd", `- GO` = "#d5bde3"
)

# 8. Plot
p4 <- ggplot(gsea_complete, aes(x = fct_reorder(pathway, NES), y = NES, fill = direction_db)) +
  geom_col(width = 0.9) +
  geom_text(aes(label = sig), hjust = -0.3, size = 5, fontface = "bold", color = "black") +
  coord_flip() +
  facet_grid(database ~ cluster, scales = "free_y", space = "free") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  labs(
    x = "Pathway",
    y = "Normalized Enrichment Score (NES)",
    title = "GSEA (avg_log2FC only) — Curated Pathways per Cluster",
    fill = "Enrichment"
  ) +
  scale_fill_manual(values = palette, na.value = "gray80") +
  theme_minimal(base_size = 16) +
  theme(
    strip.text = element_text(face = "bold", size = 16),
    strip.background = element_rect(fill = "gray90", color = "black"),
    legend.position = "bottom",
    axis.text.y = element_text(face = "bold", size = 13),
    axis.text.x = element_text(size = 14),
    panel.spacing = unit(1, "lines"),
    panel.border = element_rect(color = "black", fill = NA, size = 1.5)
  )

# 9. Save to PDF
ggsave("Curated_GSEA_avgLogFC_Only.pdf", p4, width = 30, height = 58, limitsize = FALSE)

p4
```


