1. load libraries
2. Read DE Files for Each Cluster
cluster_files <- list(
"0" = "../Updated_DE_Cluster_0_with_MeanExpr.csv",
"1" = "../Updated_DE_Cluster_1_with_MeanExpr.csv",
"2" = "../Updated_DE_Cluster_2_with_MeanExpr.csv",
"4" = "../Updated_DE_Cluster_4_with_MeanExpr.csv",
"5" = "../Updated_DE_Cluster_5_with_MeanExpr.csv",
"6" = "../Updated_DE_Cluster_6_with_MeanExpr.csv",
"7" = "../Updated_DE_Cluster_7_with_MeanExpr.csv",
"8" = "../Updated_DE_Cluster_8_with_MeanExpr.csv",
"9" = "../Updated_DE_Cluster_9_with_MeanExpr.csv",
"11" = "../Updated_DE_Cluster_11_with_MeanExpr.csv",
"12" = "../Updated_DE_Cluster_12_with_MeanExpr.csv",
"13" = "../Updated_DE_Cluster_13_with_MeanExpr.csv"
)
4. Load Gene Sets from MSigDB
library(msigdbr)
# Hallmark (no subcollection needed)
hallmark_sets <- msigdbr(species = "Homo sapiens", category = "H") %>%
split(x = .$gene_symbol, f = .$gs_name)
Avis : The `category` argument of `msigdbr()` is deprecated as of msigdbr 10.0.0.
Please use the `collection` argument instead.
# KEGG legacy pathways
kegg_sets <- msigdbr(
species = "Homo sapiens",
category = "C2",
subcollection = "CP:KEGG_LEGACY"
) %>%
split(x = .$gene_symbol, f = .$gs_name)
# Reactome pathways
reactome_sets <- msigdbr(
species = "Homo sapiens",
category = "C2",
subcollection = "CP:REACTOME"
) %>%
split(x = .$gene_symbol, f = .$gs_name)
# Immunologic signatures
c7_sets <- msigdbr(species = "Homo sapiens", category = "C7") %>%
split(x = .$gene_symbol, f = .$gs_name)
# GO Biological Process, CC, MF
c5_bp <- msigdbr(species = "Homo sapiens", category = "C5", subcollection = "BP") %>%
split(x = .$gene_symbol, f = .$gs_name)
# Combine all sets
msigdb_sets <- list(
Hallmark = hallmark_sets,
KEGG = kegg_sets,
Reactome = reactome_sets,
C7 = c7_sets,
GO_BP = c5_bp
)
4. Run GSEA per Cluster using DE Files
gsea_all <- list()
for (cluster_id in names(cluster_files)) {
# Load DE result for the cluster
de_df <- read.csv(cluster_files[[cluster_id]])
# Optional: filter genes with NA or very low avg_log2FC (if desired)
de_df <- de_df %>% filter(!is.na(avg_log2FC))
# Use only avg_log2FC as ranking metric
ranked_genes <- de_df$avg_log2FC
names(ranked_genes) <- de_df$gene
ranked_genes <- sort(ranked_genes, decreasing = TRUE)
# Run GSEA for each MSigDB collection
for (db_name in names(msigdb_sets)) {
gsea_res <- fgsea(pathways = msigdb_sets[[db_name]],
stats = ranked_genes,
minSize = 10,
maxSize = 300,
nperm = 10000)
if (nrow(gsea_res) > 0) {
gsea_res$cluster <- cluster_id
gsea_res$database <- db_name
gsea_all[[paste0("Cluster_", cluster_id, "_", db_name)]] <- gsea_res
}
}
}
Combine and Save All GSEA Results
library(purrr)
library(dplyr)
# Combine all GSEA results from list into a single dataframe
gsea_df <- bind_rows(gsea_all)
# Flatten the leadingEdge column (list to character)
gsea_df_flat <- gsea_df %>%
mutate(leadingEdge = map_chr(leadingEdge, ~ paste(.x, collapse = ";")))
Plot Top GSEA Pathways per Cluster and Collection
top_terms <- gsea_df %>%
filter(padj < 0.05) %>%
group_by(cluster, database) %>%
top_n(3, wt = NES) %>%
ungroup()
top_terms$pathway <- gsub("GO_|REACTOME_|KEGG_", "", top_terms$pathway)
top_terms$pathway <- gsub("_", " ", top_terms$pathway)
library(stringr)
top_terms$pathway <- str_to_title(top_terms$pathway)
ggplot(top_terms, aes(x = NES, y = fct_reorder(pathway, NES), fill = padj)) +
geom_col(show.legend = FALSE) +
facet_grid(cluster ~ database, scales = "free_y", space = "free_y") +
scale_fill_viridis_c(option = "D", direction = -1) +
labs(title = "Top GSEA Pathways per Cluster",
x = "Normalized Enrichment Score (NES)",
y = "Pathway") +
theme_minimal(base_size = 11) +
theme(strip.text = element_text(size = 10, face = "bold"))

NA
NA
NA
NA
NA
Plot Top GSEA Pathways per Cluster and Collection2
library(ggplot2)
library(ggforce)
library(forcats)
library(dplyr)
library(stringr)
# Prepare data (reuse your `top_terms` and enhance labels)
top_terms <- gsea_df %>%
filter(padj < 0.05) %>%
group_by(cluster, database) %>%
top_n(3, wt = NES) %>%
ungroup()
# Clean pathway names
top_terms$pathway <- gsub("GO_|REACTOME_|KEGG_", "", top_terms$pathway)
top_terms$pathway <- gsub("_", " ", top_terms$pathway)
top_terms$pathway <- str_to_title(top_terms$pathway)
# Optional: shorten long pathway names for plot clarity
top_terms$pathway <- str_trunc(top_terms$pathway, width = 60)
# Plot
ggplot(top_terms, aes(x = fct_reorder(pathway, NES), y = NES, fill = database)) +
geom_col(width = 0.9) +
coord_flip() +
facet_grid(database ~ cluster, scales = "free_y", space = "free") +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
labs(
x = "Pathway",
y = "Normalized Enrichment Score (NES)",
title = "Top GSEA Pathways per Cluster",
fill = "Database"
) +
scale_fill_brewer(palette = "Set2") +
theme_minimal(base_size = 16) +
theme(
strip.text = element_text(face = "bold", size = 16),
strip.background = element_rect(fill = "gray90", color = "black"),
strip.placement = "outside",
legend.position = "bottom",
axis.text.y = element_text(face = "bold", size = 13),
axis.text.x = element_text(size = 14),
panel.spacing = unit(1, "lines"),
panel.border = element_rect(color = "black", fill = NA, size = 1.5)
)

NA
NA
NA
Combine and Save All GSEA Results
library(dplyr)
library(purrr)
gsea_df <- bind_rows(gsea_all)
# Flatten leadingEdge column (list of gene names per pathway)
gsea_df <- gsea_df %>%
mutate(leadingEdge = map_chr(leadingEdge, ~ paste(.x, collapse = ";")))
# Now write full combined result
write.csv(gsea_df, "GSEA_MSigDB_Results_Clusterwise.csv", row.names = FALSE)
Updated Code: Create folder per cluster and save CSV inside
# Combine into single df and flatten leadingEdge
library(dplyr)
library(purrr)
gsea_df <- bind_rows(gsea_all) %>%
mutate(leadingEdge = map_chr(leadingEdge, ~ paste(.x, collapse = ";")))
# Split by cluster
gsea_by_cluster <- split(gsea_df, gsea_df$cluster)
# Write one file per cluster inside its own folder
for (cluster_id in names(gsea_by_cluster)) {
folder_name <- paste0("Cluster_", cluster_id)
# Create folder if it doesn't exist
if (!dir.exists(folder_name)) {
dir.create(folder_name)
}
# Write CSV inside the folder
file_path <- file.path(folder_name, paste0("GSEA_Cluster_", cluster_id, ".csv"))
write.csv(gsea_by_cluster[[cluster_id]], file = file_path, row.names = FALSE)
}
---
title: "fgsea Cluster vs Normal Control-DE-NewUMAP"
author: "Nasir Mahmood Abbasi"
date: "`r Sys.Date()`"
output:
  html_notebook:
    toc: yes
    toc_float: yes
    toc_collapsed: yes
  pdf_document:
    toc: yes
---


# 1. load libraries
```{r setup, include=FALSE}
library(Seurat)
library(dplyr)
library(tibble)
library(tidyr)
library(ggplot2)
library(fgsea)
library(msigdbr)
library(forcats)
library(purrr)

  
```

# 2. Read DE Files for Each Cluster
```{r data1, fig.height=8, fig.width=12}

cluster_files <- list(
  "0" = "../Updated_DE_Cluster_0_with_MeanExpr.csv",
  "1" = "../Updated_DE_Cluster_1_with_MeanExpr.csv",
  "2" = "../Updated_DE_Cluster_2_with_MeanExpr.csv",
  "4" = "../Updated_DE_Cluster_4_with_MeanExpr.csv",
  "5" = "../Updated_DE_Cluster_5_with_MeanExpr.csv",
  "6" = "../Updated_DE_Cluster_6_with_MeanExpr.csv",
  "7" = "../Updated_DE_Cluster_7_with_MeanExpr.csv",
  "8" = "../Updated_DE_Cluster_8_with_MeanExpr.csv",
  "9" = "../Updated_DE_Cluster_9_with_MeanExpr.csv",
  "11" = "../Updated_DE_Cluster_11_with_MeanExpr.csv",
  "12" = "../Updated_DE_Cluster_12_with_MeanExpr.csv",
  "13" = "../Updated_DE_Cluster_13_with_MeanExpr.csv"
)


```




# 4.   Load Gene Sets from MSigDB
```{r , fig.height=8, fig.width=12}
library(msigdbr)

# Hallmark (no subcollection needed)
hallmark_sets <- msigdbr(species = "Homo sapiens", category = "H") %>%
  split(x = .$gene_symbol, f = .$gs_name)

# KEGG legacy pathways
kegg_sets <- msigdbr(
    species      = "Homo sapiens",
    category     = "C2",
    subcollection = "CP:KEGG_LEGACY"
  ) %>%
  split(x = .$gene_symbol, f = .$gs_name)

# Reactome pathways
reactome_sets <- msigdbr(
    species      = "Homo sapiens",
    category     = "C2",
    subcollection = "CP:REACTOME"
  ) %>%
  split(x = .$gene_symbol, f = .$gs_name)

# Immunologic signatures
c7_sets <- msigdbr(species = "Homo sapiens", category = "C7") %>%
  split(x = .$gene_symbol, f = .$gs_name)

# GO Biological Process, CC, MF
c5_bp <- msigdbr(species = "Homo sapiens", category = "C5", subcollection = "BP") %>%
  split(x = .$gene_symbol, f = .$gs_name)


# Combine all sets
msigdb_sets <- list(
  Hallmark = hallmark_sets,
  KEGG     = kegg_sets,
  Reactome = reactome_sets,
  C7       = c7_sets,
  GO_BP    = c5_bp
)

```



# 4.  Run GSEA per Cluster using DE Files
```{r , fig.height=8, fig.width=12}


gsea_all <- list()

for (cluster_id in names(cluster_files)) {
  
  # Load DE result for the cluster
  de_df <- read.csv(cluster_files[[cluster_id]])
  
  # Optional: filter genes with NA or very low avg_log2FC (if desired)
  de_df <- de_df %>% filter(!is.na(avg_log2FC))
  
  # Use only avg_log2FC as ranking metric
  ranked_genes <- de_df$avg_log2FC
  names(ranked_genes) <- de_df$gene
  ranked_genes <- sort(ranked_genes, decreasing = TRUE)
  
  # Run GSEA for each MSigDB collection
  for (db_name in names(msigdb_sets)) {
    gsea_res <- fgsea(pathways = msigdb_sets[[db_name]],
                      stats = ranked_genes,
                      minSize = 10,
                      maxSize = 300,
                      nperm = 10000)
    
    if (nrow(gsea_res) > 0) {
      gsea_res$cluster <- cluster_id
      gsea_res$database <- db_name
      gsea_all[[paste0("Cluster_", cluster_id, "_", db_name)]] <- gsea_res
    }
  }
}



```



## Combine and Save All GSEA Results
```{r , fig.height=8, fig.width=12}
library(purrr)
library(dplyr)

# Combine all GSEA results from list into a single dataframe
gsea_df <- bind_rows(gsea_all)

# Flatten the leadingEdge column (list to character)
gsea_df_flat <- gsea_df %>% 
  mutate(leadingEdge = map_chr(leadingEdge, ~ paste(.x, collapse = ";")))




```



##  Plot Top GSEA Pathways per Cluster and Collection
```{r , fig.height=28, fig.width=20}
top_terms <- gsea_df %>%
  filter(padj < 0.05) %>%
  group_by(cluster, database) %>%
  top_n(3, wt = NES) %>%
  ungroup()

top_terms$pathway <- gsub("GO_|REACTOME_|KEGG_", "", top_terms$pathway)
top_terms$pathway <- gsub("_", " ", top_terms$pathway)
library(stringr)

top_terms$pathway <- str_to_title(top_terms$pathway)


ggplot(top_terms, aes(x = NES, y = fct_reorder(pathway, NES), fill = padj)) +
  geom_col(show.legend = FALSE) +
  facet_grid(cluster ~ database, scales = "free_y", space = "free_y") +
  scale_fill_viridis_c(option = "D", direction = -1) +
  labs(title = "Top GSEA Pathways per Cluster",
       x = "Normalized Enrichment Score (NES)",
       y = "Pathway") +
  theme_minimal(base_size = 11) +
  theme(strip.text = element_text(size = 10, face = "bold"))





```

##  Plot Top GSEA Pathways per Cluster and Collection2
```{r , fig.height=28, fig.width=20}
library(ggplot2)
library(ggforce)
library(forcats)
library(dplyr)
library(stringr)

# Prepare data (reuse your `top_terms` and enhance labels)
top_terms <- gsea_df %>%
  filter(padj < 0.05) %>%
  group_by(cluster, database) %>%
  top_n(3, wt = NES) %>%
  ungroup()

# Clean pathway names
top_terms$pathway <- gsub("GO_|REACTOME_|KEGG_", "", top_terms$pathway)
top_terms$pathway <- gsub("_", " ", top_terms$pathway)
top_terms$pathway <- str_to_title(top_terms$pathway)

# Optional: shorten long pathway names for plot clarity
top_terms$pathway <- str_trunc(top_terms$pathway, width = 60)

# Plot
ggplot(top_terms, aes(x = fct_reorder(pathway, NES), y = NES, fill = database)) +
  geom_col(width = 0.9) +
  coord_flip() +
  facet_grid(database ~ cluster, scales = "free_y", space = "free") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  labs(
    x = "Pathway",
    y = "Normalized Enrichment Score (NES)",
    title = "Top GSEA Pathways per Cluster",
    fill = "Database"
  ) +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal(base_size = 16) +
  theme(
    strip.text = element_text(face = "bold", size = 16),
    strip.background = element_rect(fill = "gray90", color = "black"),
    strip.placement = "outside",
    legend.position = "bottom",
    axis.text.y = element_text(face = "bold", size = 13),
    axis.text.x = element_text(size = 14),
    panel.spacing = unit(1, "lines"),
    panel.border = element_rect(color = "black", fill = NA, size = 1.5)
  )



```
## Combine and Save All GSEA Results
```{r , fig.height=8, fig.width=12}
library(dplyr)
library(purrr)

gsea_df <- bind_rows(gsea_all)

# Flatten leadingEdge column (list of gene names per pathway)
gsea_df <- gsea_df %>%
  mutate(leadingEdge = map_chr(leadingEdge, ~ paste(.x, collapse = ";")))

# Now write full combined result
write.csv(gsea_df, "GSEA_MSigDB_Results_Clusterwise.csv", row.names = FALSE)



```

## Updated Code: Create folder per cluster and save CSV inside
```{r , fig.height=8, fig.width=12}
# Combine into single df and flatten leadingEdge
library(dplyr)
library(purrr)

gsea_df <- bind_rows(gsea_all) %>%
  mutate(leadingEdge = map_chr(leadingEdge, ~ paste(.x, collapse = ";")))

# Split by cluster
gsea_by_cluster <- split(gsea_df, gsea_df$cluster)

# Write one file per cluster inside its own folder
for (cluster_id in names(gsea_by_cluster)) {
  folder_name <- paste0("Cluster_", cluster_id)
  
  # Create folder if it doesn't exist
  if (!dir.exists(folder_name)) {
    dir.create(folder_name)
  }
  
  # Write CSV inside the folder
  file_path <- file.path(folder_name, paste0("GSEA_Cluster_", cluster_id, ".csv"))
  write.csv(gsea_by_cluster[[cluster_id]], file = file_path, row.names = FALSE)
}







```
