load libraries
Load DE results
de_file <- "1-LIBRA_Pseudobulk_Deseq2_LRT_filtered_on_mean.csv"
de <- read.csv(de_file, stringsAsFactors = FALSE)
head(de)
sample <- readRDS("../../0-Seurat_RDS_Final_OBJECT/All_samples_Merged_with_Renamed_Clusters_Cell_state-03-12-2025.rds")
Volcano plot
Malignant CD4 T cells(cell lines) vs normal CD4 T cells
library(dplyr)
library(EnhancedVolcano)
# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Filter genes based on lowest p-values but include all genes
filtered_genes <- de %>%
arrange(p_val_adj, desc(abs(avg_logFC)))
# Create the EnhancedVolcano plot with the filtered data
EnhancedVolcano(
filtered_genes,
lab = ifelse(filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
x = "avg_logFC",
y = "p_val_adj",
title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
pCutoff = 0.05,
FCcutoff = 1.0,
legendPosition = 'right',
labCol = 'black',
labFace = 'bold',
boxedLabels = FALSE, # Set to FALSE to remove boxed labels
pointSize = 5.0,
labSize = 5.0,
col = c('grey70', 'black', 'grey', 'red'),
selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0] # Only label significant genes
)

Prepare preranked list
for GSEA (fgsea)
# A1. IMPORTANT: Re-calculate ranking statistic using P-value (Not P-adj)
# This improves granularity and prevents ties
de <- de %>% mutate(signed_stat = avg_logFC * (-log10(p_val + 1e-300)))
# B. Create Rank Vector
ranks <- de %>% arrange(desc(signed_stat)) %>% distinct(gene, .keep_all=TRUE)
ranks_vec <- ranks$signed_stat; names(ranks_vec) <- ranks$gene
ranks_vec <- ranks_vec[!is.na(names(ranks_vec)) & !duplicated(names(ranks_vec))]
# Load MSigDB gene sets
# Hallmark
msig_h <- msigdbr(species = "Homo sapiens", category = "H") %>%
dplyr::select(gs_name, gene_symbol)
pathways_h <- split(msig_h$gene_symbol, msig_h$gs_name)
# C2 canonical pathways: KEGG + Reactome
msig_c2 <- msigdbr(species = "Homo sapiens", category = "C2") %>%
dplyr::select(gs_name, gs_subcollection, gene_symbol)
# KEGG - choose Medicus or Legacy
msig_kegg <- msigdbr(
species = "Homo sapiens",
category = "C2",
subcategory = "CP:KEGG_LEGACY" # or "CP:KEGG_LEGACY"
) %>%
select(gs_name, gene_symbol)
pathways_kegg <- split(msig_kegg$gene_symbol, msig_kegg$gs_name)
# Reactome
msig_react <- msig_c2 %>% filter(gs_subcollection == "CP:REACTOME") %>% select(gs_name, gene_symbol)
pathways_react <- split(msig_react$gene_symbol, msig_react$gs_name)
# GO Biological Process
msig_bp <- msigdbr(species = "Homo sapiens", category = "C5", subcategory = "BP") %>%
dplyr::select(gs_name, gene_symbol)
pathways_bp <- split(msig_bp$gene_symbol, msig_bp$gs_name)
# fgsea function
run_fgsea <- function(pathways, ranks_vec, label){
set.seed(42)
fg <- fgseaMultilevel(pathways = pathways, stats = ranks_vec)
fg <- as_tibble(fg) %>%
arrange(padj) %>%
mutate(
dataset = label,
leadingEdgeCount = lengths(leadingEdge),
leadingEdge = sapply(leadingEdge, function(x) paste(x, collapse = ";"))
) %>%
dplyr::select(dataset, pathway, NES, padj, pval, size, leadingEdge, leadingEdgeCount)
#write.csv(fg, paste0("fgsea_", label, "_results.csv"), row.names = FALSE)
return(fg)
}
# Run GSEA for each dataset
fg_h <- run_fgsea(pathways_h, ranks_vec, "hallmark")
fg_kegg <- run_fgsea(pathways_kegg, ranks_vec, "kegg")
fg_react <- run_fgsea(pathways_react, ranks_vec, "reactome")
fg_bp <- run_fgsea(pathways_bp, ranks_vec, "go_bp")
Define Exclusion List
(Strict)
# Expanded Proliferation Keywords
# STRICT EXCLUSION LIST (Updated)
prolif_terms <- c(
"CELL_CYCLE", "MITOTIC", "G2M", "E2F", "SPINDLE",
"CHROMOSOME", "DNA_REPLICATION", "NUCLEAR_DIVISION",
"ORGANELLE_FISSION", "KINETOCHORE", "CENTROSOME",
"REPLICATION", "SEGREGATION", "DIVISION", "M_PHASE",
"KINESINS", "MEIOSIS", "OOCYTE",
"MICROTUBULE", "CYTOSKELETON", "TRAFFIC", "GOLGI", "CYCLIN",
"RECOMBINATION", "REPAIR", "REPLICATIVE", "POLO_LIKE"
)
DATA PREPARATION
FUNCTION
prepare_data <- function(fg_tbl, topN = 5, exclude_prolif = FALSE) {
if (exclude_prolif) {
fg_tbl <- fg_tbl %>% filter(!grepl(paste(prolif_terms, collapse = "|"), pathway, ignore.case = TRUE))
}
# Get top N up and down per database
fg_plot <- bind_rows(
fg_tbl %>% filter(dataset == "hallmark") %>% { bind_rows(slice_max(., NES, n = topN), slice_min(., NES, n = topN)) },
fg_tbl %>% filter(dataset == "kegg") %>% { bind_rows(slice_max(., NES, n = topN), slice_min(., NES, n = topN)) },
fg_tbl %>% filter(dataset == "reactome") %>% { bind_rows(slice_max(., NES, n = topN), slice_min(., NES, n = topN)) },
fg_tbl %>% filter(dataset == "go_bp") %>% { bind_rows(slice_max(., NES, n = topN), slice_min(., NES, n = topN)) }
)
# Format Labels
fg_plot %>%
mutate(
db_prefix = case_when(
dataset == "hallmark" ~ "HALLMARK",
dataset == "kegg" ~ "KEGG",
dataset == "reactome" ~ "REACTOME",
dataset == "go_bp" ~ "GOBP"
),
clean_pathway = gsub("^HALLMARK_|^KEGG_|^REACTOME_|^GOBP_", "", pathway),
plot_label = paste0(db_prefix, "_", clean_pathway)
) %>%
arrange(NES) %>%
mutate(plot_label = factor(plot_label, levels = unique(plot_label)))
}
PLOTTING FUNCTION
create_plot <- function(data, color_var, color_label, title_text) {
ggplot(data, aes(x = NES, y = plot_label)) +
geom_point(aes(shape = dataset, size = leadingEdgeCount, color = !!sym(color_var)), alpha = 0.9) +
geom_vline(xintercept = 0, linetype = "solid", color = "gray80", linewidth = 0.5) +
scale_color_gradientn(
colors = c("red", "orange", "blue"),
trans = "log10",
name = color_label,
guide = guide_colorbar(reverse = TRUE)
) +
scale_shape_manual(
values = c("hallmark" = 17, "kegg" = 15, "reactome" = 3, "go_bp" = 16),
guide = "none"
) +
scale_size_continuous(range = c(3, 8), name = "Leading edge genes") +
theme_minimal() +
labs(x = "Normalized Enrichment Score (NES)", y = NULL, title = title_text) +
theme(
axis.text.y = element_text(size = 14, face = "bold", color = "black"),
# X-axis labels (Numbers) - BOLD & BIGGER
axis.text.x = element_text(size = 10, face = "bold", color = "black"),
# X-axis TITLE (The Text "Normalized Enrichment Score...") - BOLD
axis.title.x = element_text(size = 14, face = "bold", color = "black", margin = margin(t = 10)),
# Plot Title
plot.title = element_text(face = "bold", size = 13, hjust = 0.5),
# Legend
legend.position = "right",
legend.box = "vertical",
legend.title = element_text(face = "bold", size = 10),
panel.grid.major.y = element_line(color = "gray95")
)
}
GENERATE 4 PLOTS
# A) All Pathways (padj)
df_all <- prepare_data(fg_all, exclude_prolif = FALSE)
p1 <- create_plot(df_all, "padj", "FDR (padj)", "Global Pathway Alterations (Malignant vs. Normal CD4+)")
ggsave("Fig1_All_padj.png", p1, width = 16, height = 8, dpi = 300)
ggsave("Fig1_All_padj.pdf", p1, width = 16, height = 8)
# B) All Pathways (p-value)
p2 <- create_plot(df_all, "pval", "P-value", "Nominally Significant Pathway Alterations")
ggsave("Fig2_All_pval.png", p2, width = 16, height = 8, dpi = 300)
ggsave("Fig2_All_pval.pdf", p2, width = 16, height = 8)
# C) Non-Proliferation (padj)
df_no_prolif <- prepare_data(fg_all, exclude_prolif = TRUE)
p3 <- create_plot(df_no_prolif, "padj", "FDR (padj)", "Functional & Immune Signatures (Non-Proliferative)")
ggsave("Fig3_NonProlif_padj.png", p3, width = 16, height = 8, dpi = 300)
ggsave("Fig3_NonProlif_padj.pdf", p3, width = 16, height = 8)
# D) Non-Proliferation (p-value)
p4 <- create_plot(df_no_prolif, "pval", "P-value", "Exploratory Functional Signatures (Non-Proliferative)")
ggsave("Fig4_NonProlif_pval.png", p4, width = 16, height = 8, dpi = 300)
ggsave("Fig4_NonProlif_pval.pdf", p4, width = 16, height = 8)
print("Created 4 figures: Fig1..Fig4 (.png and .pdf)")
[1] "Created 4 figures: Fig1..Fig4 (.png and .pdf)"
p1

p2

p3

p4

Enrichment map_cnetplot
for leading-edge genes
#### Enrichment map / cnetplot for leading-edge genes ####
library(clusterProfiler)
library(enrichplot)
library(tidyverse)
# ---- Function ----
make_cnet_emap <- function(fgsea_res,
pathways,
dataset_name,
padj_cutoff = 0.25,
topN_fallback = 10,
showCategory = 15,
fold_change_vec = NULL,
save_plots = TRUE) {
message("Processing dataset: ", dataset_name)
# 1) Try to filter by padj
leading_list <- fgsea_res %>%
filter(padj < padj_cutoff) %>%
select(pathway, leadingEdge) %>%
mutate(leadingEdge = map(leadingEdge, as.character)) %>%
deframe()
# 2) Fallback: if too few, take topN pathways by NES
if (length(leading_list) < 2) {
message("⚠ No pathways pass padj <", padj_cutoff,
" for ", dataset_name, ". Using top ", topN_fallback, " pathways by NES.")
leading_list <- fgsea_res %>%
arrange(padj, desc(abs(NES))) %>%
slice_head(n = topN_fallback) %>%
select(pathway, leadingEdge) %>%
mutate(leadingEdge = map(leadingEdge, as.character)) %>%
deframe()
}
if (length(leading_list) < 2) {
message("❌ Still too few pathways for ", dataset_name, " — skipping.")
return(NULL)
}
# TERM2GENE conversion
lead_term2gene <- enframe(leading_list, name = "TERM", value = "GENE") %>%
unnest(cols = c(GENE))
# Union of leading-edge genes
union_leading_genes <- unique(unlist(leading_list))
if (length(union_leading_genes) < 2) {
message("❌ Too few leading-edge genes for ", dataset_name, " — skipping.")
return(NULL)
}
# Run enrichment (very relaxed min/max sizes)
enr <- enricher(
union_leading_genes,
TERM2GENE = lead_term2gene,
minGSSize = 1,
maxGSSize = 5000
)
if (is.null(enr) || nrow(enr) == 0) {
message("❌ No enrichment result for ", dataset_name)
return(NULL)
}
# ---- cnetplot ----
cnet <- cnetplot(
enr,
foldChange = fold_change_vec,
showCategory = showCategory,
circular = FALSE,
colorEdge = TRUE
) + ggtitle(paste("Cnetplot —", dataset_name))
if (save_plots) {
ggsave(
paste0("cnetplot_leadingEdge_", dataset_name, ".png"),
cnet,
width = 12, height = 8, dpi = 300
)
}
# ---- emapplot ----
emap <- pairwise_termsim(enr)
emap_plot <- emapplot(emap, showCategory = showCategory) +
ggtitle(paste("Enrichment Map —", dataset_name))
if (save_plots) {
ggsave(
paste0("emapplot_leadingEdge_", dataset_name, ".png"),
emap_plot,
width = 12, height = 8, dpi = 300
)
}
return(list(enrich_res = enr, cnet = cnet, emap = emap_plot))
}
# ---- Run for all datasets ----
# Optional: fold-change vector from DE results
fold_change_vec <- setNames(de$avg_logFC, de$gene)
res_h <- make_cnet_emap(fg_h, pathways_h, "Hallmark", fold_change_vec = fold_change_vec)
res_kegg <- make_cnet_emap(fg_kegg, pathways_kegg, "KEGG", fold_change_vec = fold_change_vec)
res_react <- make_cnet_emap(fg_react, pathways_react, "Reactome", fold_change_vec = fold_change_vec)
res_bp <- make_cnet_emap(fg_bp, pathways_bp, "GO_BP", fold_change_vec = fold_change_vec)
message("✅ Finished Cnet/Enrichment map plots for all datasets")
Enrichment map_cnetplot
for leading-edge genes
#### Enrichment map / cnetplot for leading-edge genes ####
library(clusterProfiler)
library(enrichplot)
library(tidyverse)
# ---- Function ----
make_cnet_emap <- function(fgsea_res,
pathways,
dataset_name,
padj_cutoff = 0.25,
topN_fallback = 10,
showCategory = 15,
fold_change_vec = NULL,
save_plots = TRUE) {
message("Processing dataset: ", dataset_name)
# 1) Try to filter by padj
leading_list <- fgsea_res %>%
filter(padj < padj_cutoff) %>%
select(pathway, leadingEdge) %>%
mutate(leadingEdge = map(leadingEdge, as.character)) %>%
deframe()
# 2) Fallback: if too few, take topN pathways by NES
if (length(leading_list) < 2) {
message("⚠ No pathways pass padj <", padj_cutoff,
" for ", dataset_name, ". Using top ", topN_fallback, " pathways by NES.")
leading_list <- fgsea_res %>%
arrange(padj, desc(abs(NES))) %>%
slice_head(n = topN_fallback) %>%
select(pathway, leadingEdge) %>%
mutate(leadingEdge = map(leadingEdge, as.character)) %>%
deframe()
}
if (length(leading_list) < 2) {
message("❌ Still too few pathways for ", dataset_name, " — skipping.")
return(NULL)
}
# TERM2GENE conversion
lead_term2gene <- enframe(leading_list, name = "TERM", value = "GENE") %>%
unnest(cols = c(GENE))
# Union of leading-edge genes
union_leading_genes <- unique(unlist(leading_list))
if (length(union_leading_genes) < 2) {
message("❌ Too few leading-edge genes for ", dataset_name, " — skipping.")
return(NULL)
}
# Run enrichment (very relaxed min/max sizes)
enr <- enricher(
union_leading_genes,
TERM2GENE = lead_term2gene,
minGSSize = 1,
maxGSSize = 5000
)
if (is.null(enr) || nrow(enr) == 0) {
message("❌ No enrichment result for ", dataset_name)
return(NULL)
}
# ---- cnetplot ----
cnet <- cnetplot(
enr,
foldChange = fold_change_vec,
showCategory = showCategory,
circular = FALSE,
colorEdge = TRUE
) + ggtitle(paste("Cnetplot —", dataset_name))
if (save_plots) {
ggsave(
paste0("cnetplot_leadingEdge_", dataset_name, ".png"),
cnet,
width = 12, height = 8, dpi = 300
)
}
# ---- emapplot ----
emap <- pairwise_termsim(enr)
emap_plot <- emapplot(emap, showCategory = showCategory) +
ggtitle(paste("Enrichment Map —", dataset_name))
if (save_plots) {
ggsave(
paste0("emapplot_leadingEdge_", dataset_name, ".png"),
emap_plot,
width = 12, height = 8, dpi = 300
)
}
return(list(enrich_res = enr, cnet = cnet, emap = emap_plot))
}
# ---- Run for all datasets ----
# Optional: fold-change vector from DE results
fold_change_vec <- setNames(de$avg_logFC, de$gene)
res_h <- make_cnet_emap(fg_h, pathways_h, "Hallmark", fold_change_vec = fold_change_vec)
res_kegg <- make_cnet_emap(fg_kegg, pathways_kegg, "KEGG", fold_change_vec = fold_change_vec)
res_react <- make_cnet_emap(fg_react, pathways_react, "Reactome", fold_change_vec = fold_change_vec)
res_bp <- make_cnet_emap(fg_bp, pathways_bp, "GO_BP", fold_change_vec = fold_change_vec)
message("✅ Finished Cnet/Enrichment map plots for all datasets")
Pathways Expression on
UMAP
library(Seurat)
library(cowplot)
library(SeuratObject)
topPathways <- fg_h[["pathway"]] %>% head(10)
titles <- sub("HALLMARK_", "", topPathways)
# Make sure topPathways are in the Hallmark list
topPathways <- intersect(topPathways, names(pathways_h))
# Titles for plots
titles <- sub("HALLMARK_", "", topPathways)
obj <- sample
# Use SCT assay explicitly
DefaultAssay(obj) <- "SCT"
# Run plotCoregulationProfileReduction
ps <- plotCoregulationProfileReduction(
pathways_h[topPathways],
obj,
assay = "SCT", # explicitly use SCT assay
title = titles,
reduction = "umap"
)
# Combine plots
cowplot::plot_grid(plotlist = ps[1:length(ps)], ncol = 2)

NA
NA
NA
Pathways Expression
on UMAP
library(dplyr)
library(Seurat)
library(cowplot)
library(SeuratObject)
# 1️⃣ Sort the Hallmark FGSEA table by NES
fg_h_sorted <- fg_h %>% arrange(desc(NES)) # descending NES
# 2️⃣ Get top 10 up and top 10 down pathways
top_up <- fg_h_sorted$pathway[1:10] # top 10 up by NES
top_down <- fg_h_sorted %>% arrange(NES) %>% slice(1:10) %>% pull(pathway) # top 10 down
# 3️⃣ Combine up and down
topPathways <- c(top_up, top_down)
# Keep only pathways that exist in the Hallmark gene sets
topPathways <- intersect(topPathways, names(pathways_h))
# Titles for plotting
titles <- sub("HALLMARK_", "", topPathways)
# Use SCT assay explicitly
DefaultAssay(obj) <- "SCT"
# 4️⃣ Run co-regulation UMAP
ps <- plotCoregulationProfileReduction(
pathways_h[topPathways],
obj,
assay = "SCT",
title = titles,
reduction = "umap"
)
# 5️⃣ Combine plots in a grid
cowplot::plot_grid(plotlist = ps[1:length(ps)], ncol = 4)

NA
NA
Pathways Expression
on UMAP
library(Seurat)
library(cowplot)
library(SeuratObject)
topPathways <- fg_kegg[["pathway"]] %>% head(10)
titles <- sub("KEGG_", "", topPathways)
# Make sure topPathways are in the Hallmark list
topPathways <- intersect(topPathways, names(pathways_kegg))
# Titles for plots
titles <- sub("KEGG_", "", topPathways)
# Use SCT assay explicitly
DefaultAssay(obj) <- "SCT"
# Run plotCoregulationProfileReduction
ps <- plotCoregulationProfileReduction(
pathways_kegg[topPathways],
obj,
assay = "SCT", # explicitly use SCT assay
title = titles,
reduction = "umap"
)
# Combine plots
cowplot::plot_grid(plotlist = ps[1:length(ps)], ncol = 2)

NA
NA
NA
Pathways Expression
on UMAP
library(Seurat)
library(cowplot)
library(SeuratObject)
topPathways <- fg_react[["pathway"]] %>% head(10)
titles <- sub("REACTOME_", "", topPathways)
# Make sure topPathways are in the Hallmark list
topPathways <- intersect(topPathways, names(pathways_react))
# Titles for plots
titles <- sub("REACTOME_", "", topPathways)
# Use SCT assay explicitly
DefaultAssay(obj) <- "SCT"
# Run plotCoregulationProfileReduction
ps <- plotCoregulationProfileReduction(
pathways_react[topPathways],
obj,
assay = "SCT", # explicitly use SCT assay
title = titles,
reduction = "umap"
)
# Combine plots
cowplot::plot_grid(plotlist = ps[1:length(ps)], ncol = 2)

NA
NA
NA
Pathways Expression
on UMAP
# Sort Reactome FGSEA table by NES
fg_react_sorted <- fg_react %>% arrange(desc(NES))
# Top 10 up and top 10 down
top_up <- fg_react_sorted$pathway[1:10]
top_down <- fg_react_sorted %>% arrange(NES) %>% slice(1:10) %>% pull(pathway)
# Combine and keep only pathways present in Reactome gene sets
topPathways <- intersect(c(top_up, top_down), names(pathways_react))
# Titles
titles <- sub("REACTOME_", "", topPathways)
# SCT assay
DefaultAssay(obj) <- "SCT"
# Run co-regulation UMAP
ps <- plotCoregulationProfileReduction(
pathways_react[topPathways],
obj,
assay = "SCT",
title = titles,
reduction = "umap"
)
# Combine plots
cowplot::plot_grid(plotlist = ps[1:length(ps)], ncol = 2)

NA
NA
NA
Pathways Expression
on UMAP
library(Seurat)
library(cowplot)
library(SeuratObject)
topPathways <- fg_bp[["pathway"]] %>% head(10)
titles <- sub("GO_", "", topPathways)
# Make sure topPathways are in the Hallmark list
topPathways <- intersect(topPathways, names(pathways_bp))
# Titles for plots
titles <- sub("GO_", "", topPathways)
# Use SCT assay explicitly
DefaultAssay(obj) <- "SCT"
# Run plotCoregulationProfileReduction
ps <- plotCoregulationProfileReduction(
pathways_bp[topPathways],
obj,
assay = "SCT", # explicitly use SCT assay
title = titles,
reduction = "umap"
)
# Combine plots
cowplot::plot_grid(plotlist = ps[1:length(ps)], ncol = 2)

NA
NA
NA
Pathways Expression
on UMAP
# Sort GO:BP FGSEA table by NES
fg_bp_sorted <- fg_bp %>% arrange(desc(NES))
# Top 10 up and top 10 down
top_up <- fg_bp_sorted$pathway[1:10]
top_down <- fg_bp_sorted %>% arrange(NES) %>% slice(1:10) %>% pull(pathway)
# Combine and keep only pathways present in GO:BP gene sets
topPathways <- intersect(c(top_up, top_down), names(pathways_bp))
# Titles
titles <- sub("GO_BP_", "", topPathways)
# SCT assay
DefaultAssay(obj) <- "SCT"
# Run co-regulation UMAP
ps <- plotCoregulationProfileReduction(
pathways_bp[topPathways],
obj,
assay = "SCT",
title = titles,
reduction = "umap"
)
# Combine plots
cowplot::plot_grid(plotlist = ps[1:length(ps)], ncol = 2)

NA
NA
---
title: "fgsea Complete Analysis for ManuScript_Feb2026-p_val_used_ranking"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  html_notebook:
    number_sections: true
    toc: true
    toc_float:
      collapsed: true
    theme: journal
---


# load libraries
```{r setup, include=FALSE}

# 1. Load libraries
library(tidyverse)
library(fgsea)
library(msigdbr)
library(enrichplot)
library(clusterProfiler)
library(ggrepel)
library(Seurat)
library(SeuratObject)
library(cowplot)
```

# Load DE results
```{r }

de_file <- "1-LIBRA_Pseudobulk_Deseq2_LRT_filtered_on_mean.csv"
de <- read.csv(de_file, stringsAsFactors = FALSE)
head(de)


sample <- readRDS("../../0-Seurat_RDS_Final_OBJECT/All_samples_Merged_with_Renamed_Clusters_Cell_state-03-12-2025.rds")

```
## Volcano plot Malignant CD4 T cells(cell lines) vs normal CD4 T cells
```{r, fig.height= 12, fig.width= 14}
library(dplyr)
library(EnhancedVolcano)

# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Filter genes based on lowest p-values but include all genes
filtered_genes <- de %>%
  arrange(p_val_adj, desc(abs(avg_logFC)))

# Create the EnhancedVolcano plot with the filtered data
EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_logFC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 0.05,
  FCcutoff = 1.0,
  legendPosition = 'right', 
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Set to FALSE to remove boxed labels
  pointSize = 5.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'grey', 'red'), 
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]  # Only label significant genes
)
```


# Prepare preranked list for GSEA (fgsea)
```{r, fig.height=12, fig.width= 16}

# A1. IMPORTANT: Re-calculate ranking statistic using P-value (Not P-adj)
# This improves granularity and prevents ties
de <- de %>% mutate(signed_stat = avg_logFC * (-log10(p_val + 1e-300)))

# B. Create Rank Vector
ranks <- de %>% arrange(desc(signed_stat)) %>% distinct(gene, .keep_all=TRUE)
ranks_vec <- ranks$signed_stat; names(ranks_vec) <- ranks$gene
ranks_vec <- ranks_vec[!is.na(names(ranks_vec)) & !duplicated(names(ranks_vec))]

# Load MSigDB gene sets
# Hallmark
msig_h <- msigdbr(species = "Homo sapiens", category = "H") %>%
  dplyr::select(gs_name, gene_symbol)
pathways_h <- split(msig_h$gene_symbol, msig_h$gs_name)

# C2 canonical pathways: KEGG + Reactome
msig_c2 <- msigdbr(species = "Homo sapiens", category = "C2") %>%
  dplyr::select(gs_name, gs_subcollection, gene_symbol)

# KEGG - choose Medicus or Legacy
msig_kegg <- msigdbr(
  species = "Homo sapiens",
  category = "C2",
  subcategory = "CP:KEGG_LEGACY"   # or "CP:KEGG_LEGACY"
) %>%
  select(gs_name, gene_symbol)

pathways_kegg <- split(msig_kegg$gene_symbol, msig_kegg$gs_name)

# Reactome
msig_react <- msig_c2 %>% filter(gs_subcollection == "CP:REACTOME") %>% select(gs_name, gene_symbol)
pathways_react <- split(msig_react$gene_symbol, msig_react$gs_name)

# GO Biological Process
msig_bp <- msigdbr(species = "Homo sapiens", category = "C5", subcategory = "BP") %>%
  dplyr::select(gs_name, gene_symbol)
pathways_bp <- split(msig_bp$gene_symbol, msig_bp$gs_name)

# fgsea function
run_fgsea <- function(pathways, ranks_vec, label){
  set.seed(42)
  fg <- fgseaMultilevel(pathways = pathways, stats = ranks_vec)
  fg <- as_tibble(fg) %>%
    arrange(padj) %>%
    mutate(
      dataset = label,
      leadingEdgeCount = lengths(leadingEdge),
      leadingEdge = sapply(leadingEdge, function(x) paste(x, collapse = ";"))
    ) %>%
    dplyr::select(dataset, pathway, NES, padj, pval, size, leadingEdge, leadingEdgeCount)
  #write.csv(fg, paste0("fgsea_", label, "_results.csv"), row.names = FALSE)
  return(fg)
}

# Run GSEA for each dataset
fg_h     <- run_fgsea(pathways_h, ranks_vec, "hallmark")
fg_kegg  <- run_fgsea(pathways_kegg, ranks_vec, "kegg")
fg_react <- run_fgsea(pathways_react, ranks_vec, "reactome")
fg_bp    <- run_fgsea(pathways_bp, ranks_vec, "go_bp")

```

# Define Exclusion List (Strict)
```{r }

# Expanded Proliferation Keywords
# STRICT EXCLUSION LIST (Updated)
prolif_terms <- c(
  "CELL_CYCLE", "MITOTIC", "G2M", "E2F", "SPINDLE", 
  "CHROMOSOME", "DNA_REPLICATION", "NUCLEAR_DIVISION",
  "ORGANELLE_FISSION", "KINETOCHORE", "CENTROSOME",
  "REPLICATION", "SEGREGATION", "DIVISION", "M_PHASE", 
  "KINESINS", "MEIOSIS", "OOCYTE", 
  "MICROTUBULE", "CYTOSKELETON", "TRAFFIC", "GOLGI", "CYCLIN",
  "RECOMBINATION", "REPAIR", "REPLICATIVE", "POLO_LIKE"
)
```


# DATA PREPARATION FUNCTION
```{r, fig.height= 6, fig.width= 10}
prepare_data <- function(fg_tbl, topN = 5, exclude_prolif = FALSE) {
  if (exclude_prolif) {
    fg_tbl <- fg_tbl %>% filter(!grepl(paste(prolif_terms, collapse = "|"), pathway, ignore.case = TRUE))
  }

  # Get top N up and down per database
  fg_plot <- bind_rows(
    fg_tbl %>% filter(dataset == "hallmark") %>% { bind_rows(slice_max(., NES, n = topN), slice_min(., NES, n = topN)) },
    fg_tbl %>% filter(dataset == "kegg") %>% { bind_rows(slice_max(., NES, n = topN), slice_min(., NES, n = topN)) },
    fg_tbl %>% filter(dataset == "reactome") %>% { bind_rows(slice_max(., NES, n = topN), slice_min(., NES, n = topN)) },
    fg_tbl %>% filter(dataset == "go_bp") %>% { bind_rows(slice_max(., NES, n = topN), slice_min(., NES, n = topN)) }
  )

  # Format Labels
  fg_plot %>%
    mutate(
      db_prefix = case_when(
        dataset == "hallmark" ~ "HALLMARK",
        dataset == "kegg" ~ "KEGG",
        dataset == "reactome" ~ "REACTOME",
        dataset == "go_bp" ~ "GOBP"
      ),
      clean_pathway = gsub("^HALLMARK_|^KEGG_|^REACTOME_|^GOBP_", "", pathway),
      plot_label = paste0(db_prefix, "_", clean_pathway)
    ) %>%
    arrange(NES) %>%
    mutate(plot_label = factor(plot_label, levels = unique(plot_label)))
}

```

# PLOTTING FUNCTION 
```{r, fig.height= 6, fig.width= 10}
create_plot <- function(data, color_var, color_label, title_text) {
  ggplot(data, aes(x = NES, y = plot_label)) +
    geom_point(aes(shape = dataset, size = leadingEdgeCount, color = !!sym(color_var)), alpha = 0.9) +
    geom_vline(xintercept = 0, linetype = "solid", color = "gray80", linewidth = 0.5) +

    scale_color_gradientn(
      colors = c("red", "orange", "blue"),
      trans = "log10",
      name = color_label,
      guide = guide_colorbar(reverse = TRUE)
    ) +

    scale_shape_manual(
      values = c("hallmark" = 17, "kegg" = 15, "reactome" = 3, "go_bp" = 16),
      guide = "none"
    ) +

    scale_size_continuous(range = c(3, 8), name = "Leading edge genes") +

    theme_minimal() +
    labs(x = "Normalized Enrichment Score (NES)", y = NULL, title = title_text) +
    theme(
      axis.text.y = element_text(size = 14, face = "bold", color = "black"),
      # X-axis labels (Numbers) - BOLD & BIGGER
      axis.text.x = element_text(size = 10, face = "bold", color = "black"),
      # X-axis TITLE (The Text "Normalized Enrichment Score...") - BOLD
      axis.title.x = element_text(size = 14, face = "bold", color = "black", margin = margin(t = 10)),
      
      # Plot Title
      plot.title = element_text(face = "bold", size = 13, hjust = 0.5),
      
      # Legend
      legend.position = "right",
      legend.box = "vertical",
      legend.title = element_text(face = "bold", size = 10),

      panel.grid.major.y = element_line(color = "gray95")
    )
}

```


# GENERATE 4 PLOTS 
```{r, fig.height= 12, fig.width= 16}
# A) All Pathways (padj)
df_all <- prepare_data(fg_all, exclude_prolif = FALSE)
p1 <- create_plot(df_all, "padj", "FDR (padj)", "Global Pathway Alterations (Malignant vs. Normal CD4+)")
ggsave("Fig1_All_padj.png", p1, width = 16, height = 8, dpi = 300)
ggsave("Fig1_All_padj.pdf", p1, width = 16, height = 8)

# B) All Pathways (p-value)
p2 <- create_plot(df_all, "pval", "P-value", "Nominally Significant Pathway Alterations")
ggsave("Fig2_All_pval.png", p2, width = 16, height = 8, dpi = 300)
ggsave("Fig2_All_pval.pdf", p2, width = 16, height = 8)

# C) Non-Proliferation (padj)
df_no_prolif <- prepare_data(fg_all, exclude_prolif = TRUE)
p3 <- create_plot(df_no_prolif, "padj", "FDR (padj)", "Functional & Immune Signatures (Non-Proliferative)")
ggsave("Fig3_NonProlif_padj.png", p3, width = 16, height = 8, dpi = 300)
ggsave("Fig3_NonProlif_padj.pdf", p3, width = 16, height = 8)

# D) Non-Proliferation (p-value)
p4 <- create_plot(df_no_prolif, "pval", "P-value", "Exploratory Functional Signatures (Non-Proliferative)")
ggsave("Fig4_NonProlif_pval.png", p4, width = 16, height = 8, dpi = 300)
ggsave("Fig4_NonProlif_pval.pdf", p4, width = 16, height = 8)

print("Created 4 figures: Fig1..Fig4 (.png and .pdf)")

p1
p2
p3
p4
```
# Enrichment map_cnetplot for leading-edge genes
```{r, fig.height=12, fig.width= 16}

#### Enrichment map / cnetplot for leading-edge genes ####
library(clusterProfiler)
library(enrichplot)
library(tidyverse)

# ---- Function ----
make_cnet_emap <- function(fgsea_res,
                           pathways,
                           dataset_name,
                           padj_cutoff = 0.25,
                           topN_fallback = 10,
                           showCategory = 15,
                           fold_change_vec = NULL,
                           save_plots = TRUE) {
  
  message("Processing dataset: ", dataset_name)
  
  # 1) Try to filter by padj
  leading_list <- fgsea_res %>%
    filter(padj < padj_cutoff) %>%
    select(pathway, leadingEdge) %>%
    mutate(leadingEdge = map(leadingEdge, as.character)) %>%
    deframe()
  
  # 2) Fallback: if too few, take topN pathways by NES
  if (length(leading_list) < 2) {
    message("⚠ No pathways pass padj <", padj_cutoff, 
            " for ", dataset_name, ". Using top ", topN_fallback, " pathways by NES.")
    
    leading_list <- fgsea_res %>%
      arrange(padj, desc(abs(NES))) %>%
      slice_head(n = topN_fallback) %>%
      select(pathway, leadingEdge) %>%
      mutate(leadingEdge = map(leadingEdge, as.character)) %>%
      deframe()
  }
  
  if (length(leading_list) < 2) {
    message("❌ Still too few pathways for ", dataset_name, " — skipping.")
    return(NULL)
  }
  
  # TERM2GENE conversion
  lead_term2gene <- enframe(leading_list, name = "TERM", value = "GENE") %>%
    unnest(cols = c(GENE))
  
  # Union of leading-edge genes
  union_leading_genes <- unique(unlist(leading_list))
  
  if (length(union_leading_genes) < 2) {
    message("❌ Too few leading-edge genes for ", dataset_name, " — skipping.")
    return(NULL)
  }
  
  # Run enrichment (very relaxed min/max sizes)
  enr <- enricher(
    union_leading_genes,
    TERM2GENE = lead_term2gene,
    minGSSize = 1,
    maxGSSize = 5000
  )
  
  if (is.null(enr) || nrow(enr) == 0) {
    message("❌ No enrichment result for ", dataset_name)
    return(NULL)
  }
  
  # ---- cnetplot ----
  cnet <- cnetplot(
    enr,
    foldChange = fold_change_vec,
    showCategory = showCategory,
    circular = FALSE,
    colorEdge = TRUE
  ) + ggtitle(paste("Cnetplot —", dataset_name))
  
  if (save_plots) {
    ggsave(
      paste0("cnetplot_leadingEdge_", dataset_name, ".png"),
      cnet,
      width = 12, height = 8, dpi = 300
    )
  }
  
  # ---- emapplot ----
  emap <- pairwise_termsim(enr)
  emap_plot <- emapplot(emap, showCategory = showCategory) +
    ggtitle(paste("Enrichment Map —", dataset_name))
  
  if (save_plots) {
    ggsave(
      paste0("emapplot_leadingEdge_", dataset_name, ".png"),
      emap_plot,
      width = 12, height = 8, dpi = 300
    )
  }
  
  return(list(enrich_res = enr, cnet = cnet, emap = emap_plot))
}

# ---- Run for all datasets ----
# Optional: fold-change vector from DE results
fold_change_vec <- setNames(de$avg_logFC, de$gene)

res_h     <- make_cnet_emap(fg_h,     pathways_h,     "Hallmark", fold_change_vec = fold_change_vec)
res_kegg  <- make_cnet_emap(fg_kegg,  pathways_kegg,  "KEGG",     fold_change_vec = fold_change_vec)
res_react <- make_cnet_emap(fg_react, pathways_react, "Reactome", fold_change_vec = fold_change_vec)
res_bp    <- make_cnet_emap(fg_bp,    pathways_bp,    "GO_BP",    fold_change_vec = fold_change_vec)

message("✅ Finished Cnet/Enrichment map plots for all datasets")

```





# Enrichment map_cnetplot for leading-edge genes
```{r, fig.height=12, fig.width= 16}

#### Enrichment map / cnetplot for leading-edge genes ####
library(clusterProfiler)
library(enrichplot)
library(tidyverse)

# ---- Function ----
make_cnet_emap <- function(fgsea_res,
                           pathways,
                           dataset_name,
                           padj_cutoff = 0.25,
                           topN_fallback = 10,
                           showCategory = 15,
                           fold_change_vec = NULL,
                           save_plots = TRUE) {
  
  message("Processing dataset: ", dataset_name)
  
  # 1) Try to filter by padj
  leading_list <- fgsea_res %>%
    filter(padj < padj_cutoff) %>%
    select(pathway, leadingEdge) %>%
    mutate(leadingEdge = map(leadingEdge, as.character)) %>%
    deframe()
  
  # 2) Fallback: if too few, take topN pathways by NES
  if (length(leading_list) < 2) {
    message("⚠ No pathways pass padj <", padj_cutoff, 
            " for ", dataset_name, ". Using top ", topN_fallback, " pathways by NES.")
    
    leading_list <- fgsea_res %>%
      arrange(padj, desc(abs(NES))) %>%
      slice_head(n = topN_fallback) %>%
      select(pathway, leadingEdge) %>%
      mutate(leadingEdge = map(leadingEdge, as.character)) %>%
      deframe()
  }
  
  if (length(leading_list) < 2) {
    message("❌ Still too few pathways for ", dataset_name, " — skipping.")
    return(NULL)
  }
  
  # TERM2GENE conversion
  lead_term2gene <- enframe(leading_list, name = "TERM", value = "GENE") %>%
    unnest(cols = c(GENE))
  
  # Union of leading-edge genes
  union_leading_genes <- unique(unlist(leading_list))
  
  if (length(union_leading_genes) < 2) {
    message("❌ Too few leading-edge genes for ", dataset_name, " — skipping.")
    return(NULL)
  }
  
  # Run enrichment (very relaxed min/max sizes)
  enr <- enricher(
    union_leading_genes,
    TERM2GENE = lead_term2gene,
    minGSSize = 1,
    maxGSSize = 5000
  )
  
  if (is.null(enr) || nrow(enr) == 0) {
    message("❌ No enrichment result for ", dataset_name)
    return(NULL)
  }
  
  # ---- cnetplot ----
  cnet <- cnetplot(
    enr,
    foldChange = fold_change_vec,
    showCategory = showCategory,
    circular = FALSE,
    colorEdge = TRUE
  ) + ggtitle(paste("Cnetplot —", dataset_name))
  
  if (save_plots) {
    ggsave(
      paste0("cnetplot_leadingEdge_", dataset_name, ".png"),
      cnet,
      width = 12, height = 8, dpi = 300
    )
  }
  
  # ---- emapplot ----
  emap <- pairwise_termsim(enr)
  emap_plot <- emapplot(emap, showCategory = showCategory) +
    ggtitle(paste("Enrichment Map —", dataset_name))
  
  if (save_plots) {
    ggsave(
      paste0("emapplot_leadingEdge_", dataset_name, ".png"),
      emap_plot,
      width = 12, height = 8, dpi = 300
    )
  }
  
  return(list(enrich_res = enr, cnet = cnet, emap = emap_plot))
}

# ---- Run for all datasets ----
# Optional: fold-change vector from DE results
fold_change_vec <- setNames(de$avg_logFC, de$gene)

res_h     <- make_cnet_emap(fg_h,     pathways_h,     "Hallmark", fold_change_vec = fold_change_vec)
res_kegg  <- make_cnet_emap(fg_kegg,  pathways_kegg,  "KEGG",     fold_change_vec = fold_change_vec)
res_react <- make_cnet_emap(fg_react, pathways_react, "Reactome", fold_change_vec = fold_change_vec)
res_bp    <- make_cnet_emap(fg_bp,    pathways_bp,    "GO_BP",    fold_change_vec = fold_change_vec)

message("✅ Finished Cnet/Enrichment map plots for all datasets")

```

# Pathways Expression on UMAP
```{r,  fig.height=22, fig.width= 26}

library(Seurat)
library(cowplot)
library(SeuratObject)


topPathways <- fg_h[["pathway"]] %>% head(10)
titles <- sub("HALLMARK_", "", topPathways)


# Make sure topPathways are in the Hallmark list
topPathways <- intersect(topPathways, names(pathways_h))

# Titles for plots
titles <- sub("HALLMARK_", "", topPathways)

obj <- sample
# Use SCT assay explicitly
DefaultAssay(obj) <- "SCT"

# Run plotCoregulationProfileReduction
ps <- plotCoregulationProfileReduction(
  pathways_h[topPathways],
  obj,
  assay = "SCT",         # explicitly use SCT assay
  title = titles,
  reduction = "umap"
)

# Combine plots
cowplot::plot_grid(plotlist = ps[1:length(ps)], ncol = 2)



```

## Pathways Expression on UMAP
```{r,  fig.height=22, fig.width= 26}

library(dplyr)
library(Seurat)
library(cowplot)
library(SeuratObject)

# 1️⃣ Sort the Hallmark FGSEA table by NES
fg_h_sorted <- fg_h %>% arrange(desc(NES))  # descending NES

# 2️⃣ Get top 10 up and top 10 down pathways
top_up   <- fg_h_sorted$pathway[1:10]       # top 10 up by NES
top_down <- fg_h_sorted %>% arrange(NES) %>% slice(1:10) %>% pull(pathway)  # top 10 down

# 3️⃣ Combine up and down
topPathways <- c(top_up, top_down)

# Keep only pathways that exist in the Hallmark gene sets
topPathways <- intersect(topPathways, names(pathways_h))

# Titles for plotting
titles <- sub("HALLMARK_", "", topPathways)

# Use SCT assay explicitly
DefaultAssay(obj) <- "SCT"

# 4️⃣ Run co-regulation UMAP
ps <- plotCoregulationProfileReduction(
  pathways_h[topPathways],
  obj,
  assay = "SCT",
  title = titles,
  reduction = "umap"
)

# 5️⃣ Combine plots in a grid
cowplot::plot_grid(plotlist = ps[1:length(ps)], ncol = 4)


```
## Pathways Expression on UMAP
```{r,  fig.height=22, fig.width= 26}

library(Seurat)
library(cowplot)
library(SeuratObject)


topPathways <- fg_kegg[["pathway"]] %>% head(10)
titles <- sub("KEGG_", "", topPathways)


# Make sure topPathways are in the Hallmark list
topPathways <- intersect(topPathways, names(pathways_kegg))

# Titles for plots
titles <- sub("KEGG_", "", topPathways)

# Use SCT assay explicitly
DefaultAssay(obj) <- "SCT"

# Run plotCoregulationProfileReduction
ps <- plotCoregulationProfileReduction(
  pathways_kegg[topPathways],
  obj,
  assay = "SCT",         # explicitly use SCT assay
  title = titles,
  reduction = "umap"
)

# Combine plots
cowplot::plot_grid(plotlist = ps[1:length(ps)], ncol = 2)



```


## Pathways Expression on UMAP
```{r,  fig.height=22, fig.width= 26}

library(Seurat)
library(cowplot)
library(SeuratObject)


topPathways <- fg_react[["pathway"]] %>% head(10)
titles <- sub("REACTOME_", "", topPathways)


# Make sure topPathways are in the Hallmark list
topPathways <- intersect(topPathways, names(pathways_react))

# Titles for plots
titles <- sub("REACTOME_", "", topPathways)

# Use SCT assay explicitly
DefaultAssay(obj) <- "SCT"

# Run plotCoregulationProfileReduction
ps <- plotCoregulationProfileReduction(
  pathways_react[topPathways],
  obj,
  assay = "SCT",         # explicitly use SCT assay
  title = titles,
  reduction = "umap"
)

# Combine plots
cowplot::plot_grid(plotlist = ps[1:length(ps)], ncol = 2)



```
## Pathways Expression on UMAP
```{r,  fig.height=30, fig.width= 20}

# Sort Reactome FGSEA table by NES
fg_react_sorted <- fg_react %>% arrange(desc(NES))

# Top 10 up and top 10 down
top_up   <- fg_react_sorted$pathway[1:10]
top_down <- fg_react_sorted %>% arrange(NES) %>% slice(1:10) %>% pull(pathway)

# Combine and keep only pathways present in Reactome gene sets
topPathways <- intersect(c(top_up, top_down), names(pathways_react))

# Titles
titles <- sub("REACTOME_", "", topPathways)

# SCT assay
DefaultAssay(obj) <- "SCT"

# Run co-regulation UMAP
ps <- plotCoregulationProfileReduction(
  pathways_react[topPathways],
  obj,
  assay = "SCT",
  title = titles,
  reduction = "umap"
)

# Combine plots
cowplot::plot_grid(plotlist = ps[1:length(ps)], ncol = 2)



```


## Pathways Expression on UMAP
```{r,  fig.height=22, fig.width= 26}

library(Seurat)
library(cowplot)
library(SeuratObject)


topPathways <- fg_bp[["pathway"]] %>% head(10)
titles <- sub("GO_", "", topPathways)


# Make sure topPathways are in the Hallmark list
topPathways <- intersect(topPathways, names(pathways_bp))

# Titles for plots
titles <- sub("GO_", "", topPathways)

# Use SCT assay explicitly
DefaultAssay(obj) <- "SCT"

# Run plotCoregulationProfileReduction
ps <- plotCoregulationProfileReduction(
  pathways_bp[topPathways],
  obj,
  assay = "SCT",         # explicitly use SCT assay
  title = titles,
  reduction = "umap"
)

# Combine plots
cowplot::plot_grid(plotlist = ps[1:length(ps)], ncol = 2)



```
## Pathways Expression on UMAP
```{r, fig.height=30, fig.width= 20}

# Sort GO:BP FGSEA table by NES
fg_bp_sorted <- fg_bp %>% arrange(desc(NES))

# Top 10 up and top 10 down
top_up   <- fg_bp_sorted$pathway[1:10]
top_down <- fg_bp_sorted %>% arrange(NES) %>% slice(1:10) %>% pull(pathway)

# Combine and keep only pathways present in GO:BP gene sets
topPathways <- intersect(c(top_up, top_down), names(pathways_bp))

# Titles
titles <- sub("GO_BP_", "", topPathways)

# SCT assay
DefaultAssay(obj) <- "SCT"

# Run co-regulation UMAP
ps <- plotCoregulationProfileReduction(
  pathways_bp[topPathways],
  obj,
  assay = "SCT",
  title = titles,
  reduction = "umap"
)

# Combine plots
cowplot::plot_grid(plotlist = ps[1:length(ps)], ncol = 2)


```
