# 1. Load libraries

# 2. Load the differential expression (DE) results for malignant vs. normal CD4 T cells



# Read the DE results table (CSV with a header row). The resulting data frame
# is the input for the volcano plots and the GSEA ranking further down.
Malignant_CD4Tcells_vs_Normal_CD4Tcells <- read.csv(
  "0-imp_Robj/1-MAST_with_batch_as_Covariate_with_meanExpression.csv",
  header = TRUE  # spelled out: `T` is an ordinary variable that can be reassigned
)

# 3. Create the EnhancedVolcano plots


# EnhancedVolcano() is called here for the first time, but the package is only
# attached further down in this file. Attach it now so the script runs from top
# to bottom in a fresh R session.
library(EnhancedVolcano)

# Overview volcano plot of the full DE table: log2 fold change on the x axis,
# adjusted p-value on the y axis, with the gene column supplying the labels.
EnhancedVolcano(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells,
  lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
  x = "avg_log2FC",
  y = "p_val_adj",
  title = "MAST with Batch Correction (All Genes)",
  pCutoff = 0.05,   # adjusted p-value threshold
  FCcutoff = 1.0    # absolute log2 fold-change threshold
)



# Volcano plot that labels only a hand-picked set of genes.
# The list is kept in a named vector so it can be reviewed and edited in one
# place; the duplicated "CD70" entry from the earlier inline list was removed
# (a gene only needs to be listed once to be labelled).
genes_of_interest <- c(
  "EPCAM", "BCAT1", "KIR3DL2", "FOXM1", "TWIST1", "TNFSF9",
  "CD80", "IL1B", "RPS4Y1",
  "IL7R", "TCF7", "MKI67", "CD70",
  "IL2RA", "TRBV6-2", "TRBV10-3", "TRBV4-2", "TRBV9", "TRBV7-9",
  "TRAV12-1", "CD8B", "FCGR3A", "GNLY", "FOXP3", "SELL",
  "GIMAP1", "RIPOR2", "LEF1", "HOXC9", "SP5",
  "CCL17", "ETV4", "THY1", "FOXA2", "ITGAD", "S100P", "TBX4",
  "ID1", "XCL1", "SOX2", "CD27", "CD28", "PLS3", "RAB25",
  "TRBV27", "TRBV2"
)

EnhancedVolcano(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells,
  lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
  x = "avg_log2FC",
  y = "p_val_adj",
  selectLab = genes_of_interest,
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  xlab = bquote(~Log[2]~ "fold change"),
  pCutoff = 0.05,
  FCcutoff = 1.5,  # note: stricter than the 1.0 used by the other plots in this file
  pointSize = 3.0,
  labSize = 5.0,
  boxedLabels = TRUE,
  colAlpha = 0.5,
  legendPosition = "right",
  legendLabSize = 10,
  legendIconSize = 4.0,
  # Connector lines from each label to its point
  drawConnectors = TRUE,
  widthConnectors = 0.5,
  colConnectors = "grey50",
  arrowheads = FALSE,
  max.overlaps = 30
)



library(dplyr)
library(EnhancedVolcano)

# Re-order the DE table: ascending adjusted p-value first, then descending
# absolute log2 fold change to break ties.
# No rows are dropped here -- despite its name, `filtered_genes` contains every
# gene of Malignant_CD4Tcells_vs_Normal_CD4Tcells, only sorted.
filtered_genes <- arrange(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells,
  p_val_adj,
  desc(abs(avg_log2FC))
)

# Volcano plot that labels every gene passing both significance thresholds.
# The thresholds are defined once and reused for the label selection and for
# the cutoff lines drawn by EnhancedVolcano, so the two cannot drift apart.
p_cutoff <- 0.05   # adjusted p-value threshold
fc_cutoff <- 1.0   # absolute log2 fold-change threshold

# Row indices of significant genes; which() drops rows where the comparison
# is NA (e.g. a missing adjusted p-value).
sig_idx <- which(
  filtered_genes$p_val_adj <= p_cutoff &
    abs(filtered_genes$avg_log2FC) >= fc_cutoff
)

# Label vector: gene name for significant rows, NA everywhere else.
# as.character() keeps real names even if the gene column is a factor.
sig_labels <- rep(NA_character_, nrow(filtered_genes))
sig_labels[sig_idx] <- as.character(filtered_genes$gene[sig_idx])

EnhancedVolcano(
  filtered_genes,
  lab = sig_labels,
  x = "avg_log2FC",
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = p_cutoff,
  FCcutoff = fc_cutoff,
  legendPosition = "right",
  labCol = "black",
  labFace = "bold",
  boxedLabels = FALSE,  # plain text labels, no boxes
  pointSize = 3.0,
  labSize = 5.0,
  col = c("grey70", "black", "blue", "red"),  # custom point colours
  selectLab = sig_labels[sig_idx]  # only label significant genes
)




# Variant of the significant-gene volcano plot with a subtitle, less
# transparent points and only major gridlines.
# The thresholds are defined once and reused for the label selection and for
# the cutoff lines drawn by EnhancedVolcano, so the two cannot drift apart.
p_cutoff <- 0.05   # adjusted p-value threshold
fc_cutoff <- 1.0   # absolute log2 fold-change threshold

# Row indices of significant genes; which() drops rows where the comparison
# is NA (e.g. a missing adjusted p-value).
sig_idx <- which(
  filtered_genes$p_val_adj <= p_cutoff &
    abs(filtered_genes$avg_log2FC) >= fc_cutoff
)

# Label vector: gene name for significant rows, NA everywhere else.
# as.character() keeps real names even if the gene column is a factor.
sig_labels <- rep(NA_character_, nrow(filtered_genes))
sig_labels[sig_idx] <- as.character(filtered_genes$gene[sig_idx])

EnhancedVolcano(
  filtered_genes,
  lab = sig_labels,
  x = "avg_log2FC",
  y = "p_val_adj",
  title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
  subtitle = "Highlighting differentially expressed genes",
  pCutoff = p_cutoff,
  FCcutoff = fc_cutoff,
  legendPosition = "right",
  colAlpha = 0.8,  # point opacity
  col = c("grey70", "black", "blue", "red"),  # custom colour scheme
  gridlines.major = TRUE,
  gridlines.minor = FALSE,
  selectLab = sig_labels[sig_idx]  # only label significant genes
)

NA
NA

# 4. Perform fast GSEA (fgsea) using the MSigDB Hallmark gene sets


library(fgsea)
library(msigdbr)
library(dplyr)

# Obtain the human Hallmark ("H") gene sets from msigdbr.
# NOTE(review): recent msigdbr releases appear to rename `category` to
# `collection` -- confirm against the installed version before changing it.
hallmark_genes <- msigdbr(species = "Homo sapiens", category = "H")

# Convert the long table into the named list fgsea expects: one character
# vector of gene symbols per gene-set name.
# split() is called directly rather than through `%>%`: piping would also
# pass the whole data frame as an extra positional argument, which binds to
# split()'s `drop` parameter.
hallmark_list <- split(
  x = hallmark_genes$gene_symbol,
  f = hallmark_genes$gs_name
)

# Build the per-gene ranking statistic for GSEA from the DE table:
#   rank_metric = avg_log2FC * -log10(p_val_adj)
# i.e. the fold-change sign gives the direction and the adjusted p-value the
# weight.
#
# Adjusted p-values of exactly 0 would give -log10(0) = Inf and therefore an
# infinite rank_metric, which fgsea rejects (all stats must be finite). They are
# clamped to the smallest positive double so those genes stay in the ranking.
# NOTE: this overwrites Malignant_CD4Tcells_vs_Normal_CD4Tcells with a version
# that has the extra `rank_metric` column and without unrankable rows.
Malignant_CD4Tcells_vs_Normal_CD4Tcells <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  mutate(
    rank_metric = avg_log2FC * -log10(pmax(p_val_adj, .Machine$double.xmin))
  ) %>%
  # Drop rows whose metric is NA, NaN or +/-Inf (e.g. missing p-value or fold change)
  filter(is.finite(rank_metric))

# Named numeric vector (names = genes), sorted from highest to lowest score
gene_list <- setNames(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$rank_metric,
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene
)
gene_list <- sort(gene_list, decreasing = TRUE)

# Perform fast GSEA on the ranked gene list against the Hallmark gene sets.
#
# `nperm` is deliberately not passed: supplying it selects the legacy
# fgseaSimple algorithm, and fgsea itself warns that fgseaMultilevel is
# recommended and is run when `nperm` is removed from the call.
#
# The enrichment p-values are estimated by random sampling, so the seed is
# fixed to make repeated runs comparable.
# NOTE(review): with a parallel back-end, exact reproducibility may also depend
# on the BiocParallel configuration -- confirm if bit-identical output matters.
set.seed(42)
fgsea_result <- fgsea(pathways = hallmark_list,
                      stats = gene_list,
                      minSize = 15,   # Minimum size of a gene set to test
                      maxSize = 500)  # Maximum size of a gene set to test

# View the fgsea results
head(fgsea_result)

# Plot the enrichment curve of the pathway with the smallest adjusted p-value.
# order() places NA padj values last, so row 1 is the most significant pathway.
top_pathway <- fgsea_result[order(fgsea_result$padj), ][1, ]

# labs() comes from ggplot2, which this file never attaches explicitly; the
# namespace-qualified call works regardless of which packages are attached.
plotEnrichment(hallmark_list[[top_pathway$pathway]], gene_list) +
  ggplot2::labs(title = top_pathway$pathway)

# 5. Create a heatmap of the fgsea results

library(pheatmap)

# Keep (at most) the 50 pathways with the smallest adjusted p-value
top_pathways <- head(arrange(fgsea_result, padj), 50)

# One-column matrix for the heatmap: one row per pathway (row names are the
# pathway names), a single "NES" column holding the normalized enrichment score
heatmap_data <- matrix(
  top_pathways$NES,
  nrow = length(top_pathways$pathway),
  ncol = 1,
  dimnames = list(top_pathways$pathway, "NES")
)

# Plot the NES heatmap for the selected pathways.
#
# The blue-white-red palette is diverging, so white should mean NES = 0.
# pheatmap maps colours onto the observed data range unless `breaks` is given,
# so explicit breaks symmetric around zero are supplied (one more break than
# colours, as pheatmap requires).
nes_palette <- colorRampPalette(c("blue", "white", "red"))(50)
nes_limit <- max(abs(heatmap_data), na.rm = TRUE)
if (!is.finite(nes_limit) || nes_limit == 0) {
  nes_limit <- 1  # fallback so the breaks stay strictly increasing
}
nes_breaks <- seq(-nes_limit, nes_limit, length.out = length(nes_palette) + 1)

pheatmap(heatmap_data,
         cluster_rows = TRUE,   # group pathways with similar NES
         cluster_cols = FALSE,  # only one column, nothing to cluster
         show_rownames = TRUE,
         show_colnames = TRUE,
         main = "Hallmark Pathways: Malignant CD4 T Cells compared to normal CD4 T cells",
         color = nes_palette,
         breaks = nes_breaks)

