1. load libraries
3. Create the EnhancedVolcano plot
EnhancedVolcano(Malignant_CD4Tcells_vs_Normal_CD4Tcells,
lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
x = "avg_log2FC",
y = "p_val_adj",
title = "MAST with Batch Correction (All Genes)",
pCutoff = 0.05,
FCcutoff = 1.0)

EnhancedVolcano(Malignant_CD4Tcells_vs_Normal_CD4Tcells,
lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
x = "avg_log2FC",
y = "p_val_adj",
selectLab = c('EPCAM', 'BCAT1', 'KIR3DL2', 'FOXM1', 'TWIST1', 'TNFSF9',
'CD80', 'IL1B', 'RPS4Y1',
'IL7R', 'TCF7', 'MKI67', 'CD70',
'IL2RA','TRBV6-2', 'TRBV10-3', 'TRBV4-2', 'TRBV9', 'TRBV7-9',
'TRAV12-1', 'CD8B', 'FCGR3A', 'GNLY', 'FOXP3', 'SELL',
'GIMAP1', 'RIPOR2', 'LEF1', 'HOXC9', 'SP5',
'CCL17', 'ETV4', 'THY1', 'FOXA2', 'ITGAD', 'S100P', 'TBX4',
'ID1', 'XCL1', 'SOX2', 'CD27', 'CD28','PLS3','CD70','RAB25' , 'TRBV27', 'TRBV2'),
title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
xlab = bquote(~Log[2]~ 'fold change'),
pCutoff = 0.05,
FCcutoff = 1.5,
pointSize = 3.0,
labSize = 5.0,
boxedLabels = TRUE,
colAlpha = 0.5,
legendPosition = 'right',
legendLabSize = 10,
legendIconSize = 4.0,
drawConnectors = TRUE,
widthConnectors = 0.5,
colConnectors = 'grey50',
arrowheads = FALSE,
max.overlaps = 30)

library(dplyr)
library(EnhancedVolcano)
# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Filter genes based on lowest p-values but include all genes
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
arrange(p_val_adj, desc(abs(avg_log2FC)))
# Create the EnhancedVolcano plot with the filtered data
EnhancedVolcano(
filtered_genes,
lab = ifelse(filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0, filtered_genes$gene, NA),
x = "avg_log2FC",
y = "p_val_adj",
title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
pCutoff = 0.05,
FCcutoff = 1.0,
legendPosition = 'right',
labCol = 'black',
labFace = 'bold',
boxedLabels = FALSE, # Set to FALSE to remove boxed labels
pointSize = 3.0,
labSize = 5.0,
col = c('grey70', 'black', 'blue', 'red'), # Customize point colors
selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0] # Only label significant genes
)

EnhancedVolcano(
filtered_genes,
lab = ifelse(filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0, filtered_genes$gene, NA),
x = "avg_log2FC",
y = "p_val_adj",
title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
subtitle = "Highlighting differentially expressed genes",
pCutoff = 0.05,
FCcutoff = 1.0,
legendPosition = 'right',
colAlpha = 0.8, # Slight transparency for non-significant points
col = c('grey70', 'black', 'blue', 'red'), # Custom color scheme
gridlines.major = TRUE,
gridlines.minor = FALSE,
selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]
)

NA
NA
5. Create the Heatmap of fgsea results
library(pheatmap)
# Select the top 50 pathways
top_pathways <- fgsea_result %>%
arrange(padj) %>%
head(50)
# Create a matrix for the heatmap with pathways as rows and NES as the values
heatmap_data <- matrix(top_pathways$NES, nrow = length(top_pathways$pathway), ncol = 1)
rownames(heatmap_data) <- top_pathways$pathway
colnames(heatmap_data) <- c("NES")
# Plot the combined heatmap for the top 50 pathways
pheatmap(heatmap_data,
cluster_rows = TRUE,
cluster_cols = FALSE,
show_rownames = TRUE,
show_colnames = TRUE,
main = "Hallmark Pathways: Malignant CD4 T Cells compared to normal CD4 T cells",
color = colorRampPalette(c("blue", "white", "red"))(50))

7. Save Hallmark and kegg to CSV
# Assuming you have the results stored in fgsea_result_hallmark and fgsea_result_kegg
# Flatten the list columns into character strings for Hallmark results
fgsea_result_hallmark_flattened <- fgsea_result %>%
mutate(across(where(is.list), ~ sapply(., toString)))
# Write the flattened Hallmark results to a CSV file
write.csv(fgsea_result_hallmark_flattened, "fgsea_results_hallmark.csv", row.names = FALSE)
# Flatten the list columns into character strings for KEGG results
fgsea_result_kegg_flattened <- fgsea_result_kegg %>%
mutate(across(where(is.list), ~ sapply(., toString)))
# Write the flattened KEGG results to a CSV file
write.csv(fgsea_result_kegg_flattened, "fgsea_results_kegg.csv", row.names = FALSE)
---
title: "Differential Expression Analysis of Malignant CD4Tcells vs Control(Normal CD4 Tcells)"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  #rmdformats::readthedown
  html_notebook:
    toc: true
    toc_float: true
    toc_collapsed: true
---

# 1. load libraries
```{r setup, include=FALSE}
library(Seurat)
library(SeuratObject)
library(SeuratData)
library(patchwork)
library(harmony)
library(ggplot2)
library(cowplot)
library(reticulate)
library(Azimuth)
library(dplyr)
library(Rtsne)
library(harmony)
library(gridExtra)
library(EnhancedVolcano)
```

# 2. Perform DE analysis using Malignant_CD4Tcells_vs_Normal_CD4Tcells genes
```{r data1, fig.height=8, fig.width=12}

Malignant_CD4Tcells_vs_Normal_CD4Tcells <- read.csv("0-imp_Robj/1-MAST_with_batch_as_Covariate_with_meanExpression.csv", header = T)
```

# 3. Create the EnhancedVolcano plot
```{r enhancedV, fig.height=12, fig.width=16}

EnhancedVolcano(Malignant_CD4Tcells_vs_Normal_CD4Tcells,
                lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
                x = "avg_log2FC",
                y = "p_val_adj",
                title = "MAST with Batch Correction (All Genes)",
                pCutoff = 0.05,
                FCcutoff = 1.0)


EnhancedVolcano(Malignant_CD4Tcells_vs_Normal_CD4Tcells, 
                lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
                x = "avg_log2FC", 
                y = "p_val_adj",
                selectLab = c('EPCAM', 'BCAT1', 'KIR3DL2', 'FOXM1', 'TWIST1', 'TNFSF9', 
                              'CD80',  'IL1B', 'RPS4Y1', 
                              'IL7R', 'TCF7',  'MKI67', 'CD70', 
                              'IL2RA','TRBV6-2', 'TRBV10-3', 'TRBV4-2', 'TRBV9', 'TRBV7-9', 
                              'TRAV12-1', 'CD8B', 'FCGR3A', 'GNLY', 'FOXP3', 'SELL', 
                              'GIMAP1', 'RIPOR2', 'LEF1', 'HOXC9', 'SP5',
                              'CCL17', 'ETV4', 'THY1', 'FOXA2', 'ITGAD', 'S100P', 'TBX4', 
                              'ID1', 'XCL1', 'SOX2', 'CD27', 'CD28','PLS3','CD70','RAB25' , 'TRBV27', 'TRBV2'),
                title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
                xlab = bquote(~Log[2]~ 'fold change'),
                pCutoff = 0.05,
                FCcutoff = 1.5, 
                pointSize = 3.0,
                labSize = 5.0,
                boxedLabels = TRUE,
                colAlpha = 0.5,
                legendPosition = 'right',
                legendLabSize = 10,
                legendIconSize = 4.0,
                drawConnectors = TRUE,
                widthConnectors = 0.5,
                colConnectors = 'grey50',
                arrowheads = FALSE,
                max.overlaps = 30)


library(dplyr)
library(EnhancedVolcano)

# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Filter genes based on lowest p-values but include all genes
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  arrange(p_val_adj, desc(abs(avg_log2FC)))

# Create the EnhancedVolcano plot with the filtered data
EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_log2FC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 0.05,
  FCcutoff = 1.0,
  legendPosition = 'right', 
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Set to FALSE to remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),  # Customize point colors
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]  # Only label significant genes
)



EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_log2FC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
  subtitle = "Highlighting differentially expressed genes",
  pCutoff = 0.05,
  FCcutoff = 1.0,
  legendPosition = 'right',
  colAlpha = 0.8,  # Slight transparency for non-significant points
  col = c('grey70', 'black', 'blue', 'red'),  # Custom color scheme
  gridlines.major = TRUE,
  gridlines.minor = FALSE,
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]
) 


```

# 4.  Perform Fast GSEA using Hallmark Gene Sets
```{r data2, fig.height=8, fig.width=12}

library(fgsea)
library(msigdbr)
library(dplyr)

# Obtain Hallmark gene sets from msigdbr
hallmark_genes <- msigdbr(species = "Homo sapiens", category = "H")

# Convert the gene sets to a list format for fgsea
hallmark_list <- hallmark_genes %>%
  split(x = .$gene_symbol, f = .$gs_name)

# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Create a ranked list based on avg_log2FC and p_val_adj
Malignant_CD4Tcells_vs_Normal_CD4Tcells <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  mutate(rank_metric = avg_log2FC * -log10(p_val_adj))

# Ensure no NA values in rank_metric
Malignant_CD4Tcells_vs_Normal_CD4Tcells <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  filter(!is.na(rank_metric))

# Create a named vector for ranking
gene_list <- Malignant_CD4Tcells_vs_Normal_CD4Tcells$rank_metric
names(gene_list) <- Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene

# Sort the named vector in decreasing order
gene_list <- sort(gene_list, decreasing = TRUE)

# Perform fast GSEA
fgsea_result <- fgsea(pathways = hallmark_list, 
                      stats = gene_list, 
                      nperm = 1000)  # Number of permutations

# View the fgsea results
head(fgsea_result)

# Plot the top pathway
top_pathway <- fgsea_result[order(fgsea_result$padj), ][1, ]
plotEnrichment(hallmark_list[[top_pathway$pathway]], gene_list) +
  labs(title = top_pathway$pathway)

```

# 5. Create the Heatmap of fgsea results
```{r data3, fig.height=8, fig.width=12}
library(pheatmap)

# Select the top 50 pathways
top_pathways <- fgsea_result %>%
  arrange(padj) %>%
  head(50)

# Create a matrix for the heatmap with pathways as rows and NES as the values
heatmap_data <- matrix(top_pathways$NES, nrow = length(top_pathways$pathway), ncol = 1)
rownames(heatmap_data) <- top_pathways$pathway
colnames(heatmap_data) <- c("NES")

# Plot the combined heatmap for the top 50 pathways
pheatmap(heatmap_data, 
         cluster_rows = TRUE, 
         cluster_cols = FALSE, 
         show_rownames = TRUE, 
         show_colnames = TRUE,
         main = "Hallmark Pathways: Malignant CD4 T Cells compared to normal CD4 T cells",
         color = colorRampPalette(c("blue", "white", "red"))(50))

```

# 6. Obtain KEGG Gene Sets and Perform Fast GSEA Using KEGG Pathways
```{r data4, fig.height=8, fig.width=12}
library(fgsea)
library(msigdbr)
library(dplyr)
library(pheatmap)

# Obtain KEGG gene sets from msigdbr
kegg_genes <- msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:KEGG")

# Convert the gene sets to a list format for fgsea
kegg_list <- kegg_genes %>%
  split(x = .$gene_symbol, f = .$gs_name)

# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Create a ranked list based on avg_log2FC and p_val_adj
Malignant_CD4Tcells_vs_Normal_CD4Tcells <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  mutate(rank_metric = avg_log2FC * -log10(p_val_adj))

# Ensure no NA values in rank_metric
Malignant_CD4Tcells_vs_Normal_CD4Tcells <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  filter(!is.na(rank_metric))

# Create a named vector for ranking
gene_list <- Malignant_CD4Tcells_vs_Normal_CD4Tcells$rank_metric
names(gene_list) <- Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene

# Sort the named vector in decreasing order
gene_list <- sort(gene_list, decreasing = TRUE)

# Perform fast GSEA using KEGG pathways
fgsea_result_kegg <- fgsea(pathways = kegg_list, 
                           stats = gene_list, 
                           nperm = 1000)  # Number of permutations

# View the fgsea results
head(fgsea_result_kegg)

# Separate upregulated (positive NES) and downregulated (negative NES) pathways
upregulated_pathways <- fgsea_result_kegg %>% filter(NES > 0) %>% arrange(padj) %>% head(20)
downregulated_pathways <- fgsea_result_kegg %>% filter(NES < 0) %>% arrange(padj) %>% head(20)

# Combine the top 20 upregulated and top 20 downregulated pathways
combined_pathways <- bind_rows(upregulated_pathways, downregulated_pathways)

# Create a matrix for the heatmap with pathways as rows and NES as the values
heatmap_data_combined <- matrix(combined_pathways$NES, nrow = length(combined_pathways$pathway), ncol = 1)
rownames(heatmap_data_combined) <- combined_pathways$pathway
colnames(heatmap_data_combined) <- c("NES")

# Plot the combined heatmap for the top 20 upregulated and top 20 downregulated pathways
pheatmap(heatmap_data_combined, 
         cluster_rows = TRUE, 
         cluster_cols = FALSE, 
         show_rownames = TRUE, 
         show_colnames = TRUE,
         main = "KEGG Pathways: Malignant CD4 T Cells compared to normal CD4 T Cells",
         color = colorRampPalette(c("blue", "white", "red"))(50))
```

# 6. Obtain KEGG Gene Sets and Perform Fast GSEA Using KEGG Pathways
```{r data5, fig.height=8, fig.width=12}
library(fgsea)
library(msigdbr)
library(dplyr)
library(pheatmap)

# Obtain KEGG gene sets from msigdbr
kegg_genes <- msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:KEGG")

# Convert the gene sets to a list format for fgsea
kegg_list <- kegg_genes %>%
  split(x = .$gene_symbol, f = .$gs_name)

# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Create a ranked list based on avg_log2FC and p_val_adj
Malignant_CD4Tcells_vs_Normal_CD4Tcells <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  mutate(rank_metric = avg_log2FC * -log10(p_val_adj))

# Ensure no NA values in rank_metric
Malignant_CD4Tcells_vs_Normal_CD4Tcells <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  filter(!is.na(rank_metric))

# Create a named vector for ranking
gene_list <- Malignant_CD4Tcells_vs_Normal_CD4Tcells$rank_metric
names(gene_list) <- Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene

# Sort the named vector in decreasing order
gene_list <- sort(gene_list, decreasing = TRUE)

# Perform fast GSEA using KEGG pathways
fgsea_result_kegg <- fgsea(pathways = kegg_list, 
                           stats = gene_list, 
                           nperm = 1000)  # Number of permutations

# View the fgsea results
head(fgsea_result_kegg)

# Separate upregulated (positive NES) and downregulated (negative NES) pathways
upregulated_pathways <- fgsea_result_kegg %>% filter(NES > 0) %>% arrange(padj) %>% head(5)
downregulated_pathways <- fgsea_result_kegg %>% filter(NES < 0) %>% arrange(padj) %>% head(5)

# Combine the top 5 upregulated and top 5 downregulated pathways
combined_pathways <- bind_rows(upregulated_pathways, downregulated_pathways)

# Create a matrix for the heatmap with pathways as rows and NES as the values
heatmap_data_combined <- matrix(combined_pathways$NES, nrow = length(combined_pathways$pathway), ncol = 1)
rownames(heatmap_data_combined) <- combined_pathways$pathway
colnames(heatmap_data_combined) <- c("NES")

# Plot the combined heatmap for the top 20 upregulated and top 20 downregulated pathways
pheatmap(heatmap_data_combined, 
         cluster_rows = TRUE, 
         cluster_cols = FALSE, 
         show_rownames = TRUE, 
         show_colnames = TRUE,
         main = "KEGG Pathways: Malignant CD4 T Cells compared to normal CD4 T Cells",
         color = colorRampPalette(c("blue", "white", "red"))(50))
```
# 7. Save Hallmark and kegg to CSV
```{r data6, fig.height=8, fig.width=12}

# Assuming you have the results stored in fgsea_result_hallmark and fgsea_result_kegg

# Flatten the list columns into character strings for Hallmark results
fgsea_result_hallmark_flattened <- fgsea_result %>%
  mutate(across(where(is.list), ~ sapply(., toString)))

# Write the flattened Hallmark results to a CSV file
write.csv(fgsea_result_hallmark_flattened, "fgsea_results_hallmark.csv", row.names = FALSE)

# Flatten the list columns into character strings for KEGG results
fgsea_result_kegg_flattened <- fgsea_result_kegg %>%
  mutate(across(where(is.list), ~ sapply(., toString)))

# Write the flattened KEGG results to a CSV file
write.csv(fgsea_result_kegg_flattened, "fgsea_results_kegg.csv", row.names = FALSE)



```
