1. load libraries
3. Create the EnhancedVolcano plot
library(dplyr)
library(EnhancedVolcano)
# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Filter genes based on lowest p-values but include all genes
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
arrange(p_val_adj, desc(abs(avg_logFC)))
# Create the EnhancedVolcano plot with the filtered data
EnhancedVolcano(
filtered_genes,
lab = ifelse(filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
x = "avg_logFC",
y = "p_val_adj",
title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
pCutoff = 1e-10,
FCcutoff = 1.5,
legendPosition = 'right',
labCol = 'black',
labFace = 'bold',
boxedLabels = FALSE, # Set to FALSE to remove boxed labels
pointSize = 3.0,
labSize = 5.0,
col = c('grey70', 'black', 'blue', 'red'), # Customize point colors
selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0] # Only label significant genes
)

EnhancedVolcano(
filtered_genes,
lab = ifelse(filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
x = "avg_logFC",
y = "p_val_adj",
title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
subtitle = "Highlighting differentially expressed genes",
pCutoff = 1e-10,
FCcutoff = 1.5,
legendPosition = 'right',
colAlpha = 0.8, # Slight transparency for non-significant points
col = c('grey70', 'black', 'blue', 'red'), # Custom color scheme
gridlines.major = TRUE,
gridlines.minor = FALSE,
selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]
)

NA
NA
7. Save Hallmark and kegg to CSV
# Assuming you have the results stored in fgsea_result_hallmark and fgsea_result_kegg
# Flatten the list columns into character strings for Hallmark results
fgsea_result_hallmark_flattened <- fgsea_result %>%
mutate(across(where(is.list), ~ sapply(., toString)))
# Write the flattened Hallmark results to a CSV file
write.csv(fgsea_result_hallmark_flattened, "fgsea_results_hallmark.csv", row.names = FALSE)
# Flatten the list columns into character strings for KEGG results
fgsea_result_kegg_flattened <- fgsea_result_kegg %>%
mutate(across(where(is.list), ~ sapply(., toString)))
# Write the flattened KEGG results to a CSV file
write.csv(fgsea_result_kegg_flattened, "fgsea_results_kegg.csv", row.names = FALSE)
ggplot(data.frame(gene_symbol = names(gene_list)[1:50], ranks = gene_list[1:50]), aes(gene_symbol, ranks)) +
geom_point() +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

8. Hallmark genes found in multiple pathways
# Load necessary libraries
library(dplyr)
library(tidyr)
library(ggplot2)
# Assuming 'fgsea_results' is the output of your fgsea analysis
# Step 1: Extract the leading edge genes for each significant pathway
# 'leadingEdge' contains the gene names that contribute to the enrichment of the pathway
significant_gene_sets <- fgsea_result %>%
filter(padj < 0.05) %>%
select(pathway, leadingEdge)
# Step 2: Unnest the leadingEdge column (convert list to rows)
significant_gene_sets <- significant_gene_sets %>%
unnest(cols = leadingEdge)
# Step 3: Count how many times each gene appears across pathways
gene_count <- significant_gene_sets %>%
group_by(leadingEdge) %>%
summarise(count = n()) %>%
arrange(desc(count)) # Arrange genes by the number of pathways they appear in
# Step 4: Visualize the top regulator genes (genes that appear in multiple pathways)
top_regulator_genes <- gene_count %>%
filter(count > 1) # Genes involved in more than one pathway
# Plot top regulator genes involved in multiple pathways
ggplot(top_regulator_genes, aes(x = reorder(leadingEdge, -count), y = count)) +
geom_bar(stat = "identity", fill = "salmon") +
coord_flip() +
labs(title = "Top Regulator Genes Involved in Multiple Pathways",
x = "Gene",
y = "Number of Pathways") +
theme_minimal()

# Step 5: Output the gene counts to a CSV file for further inspection
write.csv(gene_count, "gene_count_in_multiple_pathways.csv")
8. Hallmark genes found in multiple pathways
# Load necessary libraries
library(dplyr)
library(tidyr)
library(ggplot2)
# Assuming 'fgsea_results' is the output of your fgsea analysis
# Step 1: Extract the leading edge genes for each significant pathway
# 'leadingEdge' contains the gene names that contribute to the enrichment of the pathway
significant_gene_sets <- fgsea_result_kegg %>%
filter(padj < 0.05) %>%
select(pathway, leadingEdge)
# Step 2: Unnest the leadingEdge column (convert list to rows)
significant_gene_sets <- significant_gene_sets %>%
unnest(cols = leadingEdge)
# Step 3: Count how many times each gene appears across pathways
gene_count <- significant_gene_sets %>%
group_by(leadingEdge) %>%
summarise(count = n()) %>%
arrange(desc(count)) # Arrange genes by the number of pathways they appear in
# Step 4: Visualize the top regulator genes (genes that appear in multiple pathways)
top_regulator_genes <- gene_count %>%
filter(count > 1) # Genes involved in more than one pathway
# Plot top regulator genes involved in multiple pathways
ggplot(top_regulator_genes, aes(x = reorder(leadingEdge, -count), y = count)) +
geom_bar(stat = "identity", fill = "salmon") +
coord_flip() +
labs(title = "Top Regulator Genes Involved in Multiple Pathways",
x = "Gene",
y = "Number of Pathways") +
theme_minimal()

# Step 5: Output the gene counts to a CSV file for further inspection
write.csv(gene_count, "gene_count_in_multiple_pathways.csv")
---
title: "FGSEA- of Malignant CD4Tcells vs Control(Normal CD4 Tcells)"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  #rmdformats::readthedown
  html_notebook:
    toc: true
    toc_float: true
    toc_collapsed: true
---

# 1. load libraries
```{r setup, include=FALSE}
suppressPackageStartupMessages({
library(Seurat)
library(SeuratObject)
library(SeuratData)
library(patchwork)
library(harmony)
library(ggplot2)
library(cowplot)
library(reticulate)
library(Azimuth)
library(dplyr)
library(Rtsne)
library(harmony)
library(gridExtra)
library(EnhancedVolcano)
})
  
```

# 2. Perform DE analysis using Malignant_CD4Tcells_vs_Normal_CD4Tcells genes
```{r data1, fig.height=8, fig.width=12}

Malignant_CD4Tcells_vs_Normal_CD4Tcells <- read.csv("1-Pseudobulk_DEseq2_LRT_DE_with_libra.csv", header = T)
```

# 3. Create the EnhancedVolcano plot
```{r enhancedV, fig.height=12, fig.width=16}

library(dplyr)
library(EnhancedVolcano)

# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Filter genes based on lowest p-values but include all genes
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  arrange(p_val_adj, desc(abs(avg_logFC)))

# Create the EnhancedVolcano plot with the filtered data
EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_logFC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 1e-10,
  FCcutoff = 1.5,
  legendPosition = 'right', 
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Set to FALSE to remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),  # Customize point colors
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]  # Only label significant genes
)



EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_logFC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
  subtitle = "Highlighting differentially expressed genes",
  pCutoff = 1e-10,
  FCcutoff = 1.5,
  legendPosition = 'right',
  colAlpha = 0.8,  # Slight transparency for non-significant points
  col = c('grey70', 'black', 'blue', 'red'),  # Custom color scheme
  gridlines.major = TRUE,
  gridlines.minor = FALSE,
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]
) 


```

# 4.  Perform Fast GSEA using Hallmark Gene Sets
```{r , fig.height=8, fig.width=12}
# Load necessary libraries
library(fgsea)
library(msigdbr)
library(dplyr)

# Obtain Hallmark gene sets from msigdbr
hallmark_genes <- msigdbr(species = "Homo sapiens", category = "H")

# Convert the gene sets to a list format for fgsea
hallmark_list <- hallmark_genes %>%
  split(x = .$gene_symbol, f = .$gs_name)

# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Create a ranked list with SIGNED metric
Malignant_CD4Tcells_vs_Normal_CD4Tcells <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  mutate(rank_metric = avg_logFC * -log10(p_val_adj))


# Ensure no NA values in rank_metric
Malignant_CD4Tcells_vs_Normal_CD4Tcells <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  filter(!is.na(rank_metric))

# Create a named vector for ranking
gene_list <- Malignant_CD4Tcells_vs_Normal_CD4Tcells$rank_metric
names(gene_list) <- Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene

# Remove infinite values and sort in decreasing order
gene_list <- gene_list[is.finite(gene_list)]
gene_list <- sort(gene_list, decreasing = TRUE)

# Perform fast GSEA using fgseaMultilevel
fgsea_result <- fgsea(pathways = hallmark_list, 
                                stats = gene_list,
                                eps = 0.0,
                                minSize = 15,
                                maxSize = 500)  # No need for nperm

# View the fgsea results
head(fgsea_result[order(pval), ])


```



# 4.  Perform Visualization of fgseq using Hallmark Gene Sets
```{r , fig.height=8, fig.width=12}
library(data.table)
library(fgsea)
library(ggplot2)

# Plot the top pathway
top_pathway <- fgsea_result[order(fgsea_result$padj), ][1, ]
plotEnrichment(hallmark_list[[top_pathway$pathway]], gene_list) +
  labs(title = top_pathway$pathway)

topPathwaysUp <- fgsea_result[ES > 0][head(order(padj), n=10), pathway]
topPathwaysDown <- fgsea_result[ES < 0][head(order(padj), n=10), pathway]
topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
plotGseaTable(hallmark_list[topPathways], gene_list, fgsea_result, 
              gseaParam=0.5)


```

## . Visualization-Hallmark
```{r , fig.height=8, fig.width=12}
fgseaResTidy <- fgsea_result %>%
  as_tibble() %>%
  arrange(desc(NES))

# Show in a nice table, excluding any columns that do not exist
fgseaResTidy %>% 
  dplyr::select(-leadingEdge, -ES) %>% 
  arrange(padj) %>% 
  DT::datatable()

ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill=padj<0.05)) +
  coord_flip() +
  labs(x="Pathway", y="Normalized Enrichment Score",
       title="Hallmark pathways NES from fgsea") + 
  theme_minimal() + 
  scale_fill_manual(values = c("TRUE" = "red", "FALSE" = "grey")) +
  theme(axis.text.y = element_text(size = 8))  # Adjust axis text size for readability



```

# 5. Obtain KEGG Gene Sets and Perform fgsea Using KEGG Pathways
```{r , fig.height=8, fig.width=12}
library(fgsea)
library(msigdbr)
library(dplyr)
library(pheatmap)

# Obtain KEGG Legacy gene sets from msigdbr
kegg_genes <- msigdbr(species = "Homo sapiens", category = "C2", subcollection = "CP:KEGG_LEGACY")

# Convert the gene sets to a list format for fgsea
kegg_list <- kegg_genes %>%
  split(x = .$gene_symbol, f = .$gs_name)

# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Create a ranked list based on avg_logFC and p_val_adj
Malignant_CD4Tcells_vs_Normal_CD4Tcells <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  mutate(rank_metric = avg_logFC * -log10(p_val_adj))

# Ensure no NA values in rank_metric
Malignant_CD4Tcells_vs_Normal_CD4Tcells <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  filter(!is.na(rank_metric))

# Create a named vector for ranking
gene_list <- Malignant_CD4Tcells_vs_Normal_CD4Tcells$rank_metric
names(gene_list) <- Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene

# Sort the named vector in decreasing order
gene_list <- sort(gene_list, decreasing = TRUE)

gene_list <- gene_list[is.finite(gene_list)]

# Perform fast GSEA using KEGG pathways
fgsea_result_kegg <- fgsea(pathways = kegg_list, 
                           stats = gene_list,
                           eps=0.0,
                           minSize = 10,
                           maxSize = 500)  


# View the fgsea results
head(fgsea_result_kegg[order(pval), ])


```

# 6.  Perform Visualization of fgseq using KEGG Gene Sets
```{r , fig.height=8, fig.width=12}
library(data.table)
library(fgsea)
library(ggplot2)

# Plot the top pathway
top_pathway <- fgsea_result_kegg[order(fgsea_result_kegg$padj), ][1, ]
plotEnrichment(kegg_list[[top_pathway$pathway]], gene_list) +
  labs(title = top_pathway$pathway)

topPathwaysUp <- fgsea_result_kegg[ES > 0][head(order(padj), n=10), pathway]
topPathwaysDown <- fgsea_result_kegg[ES < 0][head(order(padj), n=10), pathway]
topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
plotGseaTable(kegg_list[topPathways], gene_list, fgsea_result_kegg, 
              gseaParam=0.5)


```

## . Visualization-Kegg1
```{r , fig.height=8, fig.width=12}
fgseaResTidy <- fgsea_result_kegg %>%
  as_tibble() %>%
  arrange(desc(NES))

# Show in a nice table, excluding any columns that do not exist
fgseaResTidy %>% 
  dplyr::select(-leadingEdge, -ES) %>% 
  arrange(padj) %>% 
  DT::datatable()

ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill=padj<0.05)) +
  coord_flip() +
  labs(x="Pathway", y="Normalized Enrichment Score",
       title="Hallmark pathways NES from fgsea") + 
  theme_minimal() + 
  scale_fill_manual(values = c("TRUE" = "red", "FALSE" = "grey")) +
  theme(axis.text.y = element_text(size = 8))  # Adjust axis text size for readability



```

## . Visualization-Kegg2
```{r , fig.height=8, fig.width=12}
# Arrange by NES and select top 20 up and down pathways
topUp <- fgseaResTidy %>%
  dplyr::filter(NES > 0) %>%
  dplyr::arrange(desc(NES)) %>%
  dplyr::slice_head(n = 20)

topDown <- fgseaResTidy %>%
  dplyr::filter(NES < 0) %>%
  dplyr::arrange(NES) %>%
  dplyr::slice_head(n = 20)

# Combine the top up and down pathways
topPathways <- dplyr::bind_rows(topUp, topDown)


ggplot(topPathways, aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill = padj < 0.05)) +
  coord_flip() +
  labs(x = "Pathway", y = "Normalized Enrichment Score",
       title = "Top 20 Up and Down KEGG Pathways NES from GSEA") +
  theme_minimal() +
  scale_fill_manual(values = c("TRUE" = "red", "FALSE" = "grey"))




```

# 7. Save Hallmark and kegg to CSV
```{r , fig.height=8, fig.width=12}

# Assuming you have the results stored in fgsea_result_hallmark and fgsea_result_kegg

# Flatten the list columns into character strings for Hallmark results
fgsea_result_hallmark_flattened <- fgsea_result %>%
  mutate(across(where(is.list), ~ sapply(., toString)))

# Write the flattened Hallmark results to a CSV file
write.csv(fgsea_result_hallmark_flattened, "fgsea_results_hallmark.csv", row.names = FALSE)

# Flatten the list columns into character strings for KEGG results
fgsea_result_kegg_flattened <- fgsea_result_kegg %>%
  mutate(across(where(is.list), ~ sapply(., toString)))

# Write the flattened KEGG results to a CSV file
write.csv(fgsea_result_kegg_flattened, "fgsea_results_kegg.csv", row.names = FALSE)


ggplot(data.frame(gene_symbol = names(gene_list)[1:50], ranks = gene_list[1:50]), aes(gene_symbol, ranks)) + 
	geom_point() +
	theme_classic() + 
	theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

```
# 8.  Hallmark genes found in multiple pathways
```{r , fig.height=8, fig.width=12}

# Load necessary libraries
library(dplyr)
library(tidyr)
library(ggplot2)

# Assuming 'fgsea_results' is the output of your fgsea analysis

# Step 1: Extract the leading edge genes for each significant pathway
# 'leadingEdge' contains the gene names that contribute to the enrichment of the pathway
significant_gene_sets <- fgsea_result %>%
  filter(padj < 0.05) %>%
  select(pathway, leadingEdge)

# Step 2: Unnest the leadingEdge column (convert list to rows)
significant_gene_sets <- significant_gene_sets %>%
  unnest(cols = leadingEdge)

# Step 3: Count how many times each gene appears across pathways
gene_count <- significant_gene_sets %>%
  group_by(leadingEdge) %>%
  summarise(count = n()) %>%
  arrange(desc(count))  # Arrange genes by the number of pathways they appear in

# Step 4: Visualize the top regulator genes (genes that appear in multiple pathways)
top_regulator_genes <- gene_count %>%
  filter(count > 1)  # Genes involved in more than one pathway

# Plot top regulator genes involved in multiple pathways
ggplot(top_regulator_genes, aes(x = reorder(leadingEdge, -count), y = count)) +
  geom_bar(stat = "identity", fill = "salmon") +
  coord_flip() +
  labs(title = "Top Regulator Genes Involved in Multiple Pathways",
       x = "Gene",
       y = "Number of Pathways") +
  theme_minimal()

# Step 5: Output the gene counts to a CSV file for further inspection
write.csv(gene_count, "gene_count_in_multiple_pathways.csv")


```


# 8.  Hallmark genes found in multiple pathways
```{r , fig.height=8, fig.width=12}

# Load necessary libraries
library(dplyr)
library(tidyr)
library(ggplot2)

# Assuming 'fgsea_results' is the output of your fgsea analysis

# Step 1: Extract the leading edge genes for each significant pathway
# 'leadingEdge' contains the gene names that contribute to the enrichment of the pathway
significant_gene_sets <- fgsea_result_kegg %>%
  filter(padj < 0.05) %>%
  select(pathway, leadingEdge)

# Step 2: Unnest the leadingEdge column (convert list to rows)
significant_gene_sets <- significant_gene_sets %>%
  unnest(cols = leadingEdge)

# Step 3: Count how many times each gene appears across pathways
gene_count <- significant_gene_sets %>%
  group_by(leadingEdge) %>%
  summarise(count = n()) %>%
  arrange(desc(count))  # Arrange genes by the number of pathways they appear in

# Step 4: Visualize the top regulator genes (genes that appear in multiple pathways)
top_regulator_genes <- gene_count %>%
  filter(count > 1)  # Genes involved in more than one pathway

# Plot top regulator genes involved in multiple pathways
ggplot(top_regulator_genes, aes(x = reorder(leadingEdge, -count), y = count)) +
  geom_bar(stat = "identity", fill = "salmon") +
  coord_flip() +
  labs(title = "Top Regulator Genes Involved in Multiple Pathways",
       x = "Gene",
       y = "Number of Pathways") +
  theme_minimal()

# Step 5: Output the gene counts to a CSV file for further inspection
write.csv(gene_count, "gene_count_in_multiple_pathways.csv")


```