1. load libraries

2. Perform DE analysis using Malignant_CD4Tcells_vs_Normal_CD4Tcells genes


Malignant_CD4Tcells_vs_Normal_CD4Tcells <- read.csv("../../2-DE_on_Harmony_Integration/Filtered_DE_Results_L7_with_MeanExpr.csv", header = T)

3. Create the EnhancedVolcano plot


EnhancedVolcano(Malignant_CD4Tcells_vs_Normal_CD4Tcells,
                lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
                x = "avg_log2FC",
                y = "p_val_adj",
                title = "MAST with Batch Correction (All Genes)",
                pCutoff = 0.05,
                FCcutoff = 1.0)
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...

EnhancedVolcano(Malignant_CD4Tcells_vs_Normal_CD4Tcells, 
                lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
                x = "avg_log2FC", 
                y = "p_val_adj",
                selectLab = c('EPCAM', 'BCAT1', 'KIR3DL2', 'FOXM1', 'TWIST1', 'TNFSF9', 
                              'CD80',  'IL1B', 'RPS4Y1', 
                              'IL7R', 'TCF7',  'MKI67', 'CD70', 
                              'IL2RA','TRBV6-2', 'TRBV10-3', 'TRBV4-2', 'TRBV9', 'TRBV7-9', 
                              'TRAV12-1', 'CD8B', 'FCGR3A', 'GNLY', 'FOXP3', 'SELL', 
                              'GIMAP1', 'RIPOR2', 'LEF1', 'HOXC9', 'SP5',
                              'CCL17', 'ETV4', 'THY1', 'FOXA2', 'ITGAD', 'S100P', 'TBX4', 
                              'ID1', 'XCL1', 'SOX2', 'CD27', 'CD28','PLS3','CD70','RAB25' , 'TRBV27', 'TRBV2'),
                title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
                xlab = bquote(~Log[2]~ 'fold change'),
                pCutoff = 0.05,
                FCcutoff = 1.5, 
                pointSize = 3.0,
                labSize = 5.0,
                boxedLabels = TRUE,
                colAlpha = 0.5,
                legendPosition = 'right',
                legendLabSize = 10,
                legendIconSize = 4.0,
                drawConnectors = TRUE,
                widthConnectors = 0.5,
                colConnectors = 'grey50',
                arrowheads = FALSE,
                max.overlaps = 30)
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...

library(dplyr)
library(EnhancedVolcano)

# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Filter genes based on lowest p-values but include all genes
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  arrange(p_val_adj, desc(abs(avg_log2FC)))

# Create the EnhancedVolcano plot with the filtered data
EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_log2FC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 0.05,
  FCcutoff = 1.0,
  legendPosition = 'right', 
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Set to FALSE to remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),  # Customize point colors
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]  # Only label significant genes
)
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...

EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_log2FC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
  subtitle = "Highlighting differentially expressed genes",
  pCutoff = 0.05,
  FCcutoff = 1.0,
  legendPosition = 'right',
  colAlpha = 0.8,  # Slight transparency for non-significant points
  col = c('grey70', 'black', 'blue', 'red'),  # Custom color scheme
  gridlines.major = TRUE,
  gridlines.minor = FALSE,
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]
) 
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...

4. Enrichment Analysis-1


# Step-by-Step Guide for Gene Set Enrichment Analysis (GSEA) or Over-Representation Analysis (ORA)

# Load the necessary libraries
library(clusterProfiler)
Registered S3 methods overwritten by 'treeio':
  method              from    
  MRCA.phylo          tidytree
  MRCA.treedata       tidytree
  Nnode.treedata      tidytree
  Ntip.treedata       tidytree
  ancestor.phylo      tidytree
  ancestor.treedata   tidytree
  child.phylo         tidytree
  child.treedata      tidytree
  full_join.phylo     tidytree
  full_join.treedata  tidytree
  groupClade.phylo    tidytree
  groupClade.treedata tidytree
  groupOTU.phylo      tidytree
  groupOTU.treedata   tidytree
  is.rooted.treedata  tidytree
  nodeid.phylo        tidytree
  nodeid.treedata     tidytree
  nodelab.phylo       tidytree
  nodelab.treedata    tidytree
  offspring.phylo     tidytree
  offspring.treedata  tidytree
  parent.phylo        tidytree
  parent.treedata     tidytree
  root.treedata       tidytree
  rootnode.phylo      tidytree
  sibling.phylo       tidytree
clusterProfiler v4.6.2  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/

If you use clusterProfiler in published research, please cite:
T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141

Attachement du package : 'clusterProfiler'

L'objet suivant est masqué depuis 'package:stats':

    filter
library(org.Hs.eg.db)
Le chargement a nécessité le package : AnnotationDbi
Le chargement a nécessité le package : stats4
Le chargement a nécessité le package : BiocGenerics

Attachement du package : 'BiocGenerics'

L'objet suivant est masqué depuis 'package:gridExtra':

    combine

Les objets suivants sont masqués depuis 'package:dplyr':

    combine, intersect, setdiff, union

L'objet suivant est masqué depuis 'package:SeuratObject':

    intersect

Les objets suivants sont masqués depuis 'package:stats':

    IQR, mad, sd, var, xtabs

Les objets suivants sont masqués depuis 'package:base':

    anyDuplicated, aperm, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget, order,
    paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
    rownames, sapply, setdiff, sort, table, tapply, union, unique, unsplit,
    which.max, which.min

Le chargement a nécessité le package : Biobase
Welcome to Bioconductor

    Vignettes contain introductory material; view with 'browseVignettes()'.
    To cite Bioconductor, see 'citation("Biobase")', and for packages
    'citation("pkgname")'.

Le chargement a nécessité le package : IRanges
Le chargement a nécessité le package : S4Vectors

Attachement du package : 'S4Vectors'

L'objet suivant est masqué depuis 'package:clusterProfiler':

    rename

Les objets suivants sont masqués depuis 'package:dplyr':

    first, rename

Les objets suivants sont masqués depuis 'package:base':

    expand.grid, I, unname


Attachement du package : 'IRanges'

L'objet suivant est masqué depuis 'package:clusterProfiler':

    slice

Les objets suivants sont masqués depuis 'package:dplyr':

    collapse, desc, slice

L'objet suivant est masqué depuis 'package:sp':

    %over%


Attachement du package : 'AnnotationDbi'

L'objet suivant est masqué depuis 'package:clusterProfiler':

    select

L'objet suivant est masqué depuis 'package:dplyr':

    select
library(enrichplot)
library(ReactomePA)
ReactomePA v1.42.0  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/

If you use ReactomePA in published research, please cite:
Guangchuang Yu, Qing-Yu He. ReactomePA: an R/Bioconductor package for reactome pathway analysis and visualization. Molecular BioSystems 2016, 12(2):477-479
# Get upregulated genes based on log2FC and p-value thresholds
upregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC > 0.5 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.05, ]

# Get downregulated genes based on log2FC and p-value thresholds
downregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC < -0.5 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.05, ]

# Gene Ontology (GO) Enrichment Analysis
# GO enrichment for upregulated genes
go_up <- enrichGO(gene = upregulated_genes$gene, 
                  OrgDb = org.Hs.eg.db, 
                  keyType = "SYMBOL", 
                  ont = "BP",   # Biological Process (BP), Molecular Function (MF), Cellular Component (CC)
                  pAdjustMethod = "BH", 
                  pvalueCutoff = 0.05)

# GO enrichment for downregulated genes
go_down <- enrichGO(gene = downregulated_genes$gene, 
                    OrgDb = org.Hs.eg.db, 
                    keyType = "SYMBOL", 
                    ont = "BP", 
                    pAdjustMethod = "BH", 
                    pvalueCutoff = 0.05)

# Visualize the top enriched GO terms
dotplot(go_up, showCategory = 10, title = "GO Enrichment for Upregulated Genes")

dotplot(go_down, showCategory = 10, title = "GO Enrichment for Downregulated Genes")


# KEGG Pathway Enrichment
# Convert gene symbols to Entrez IDs for KEGG analysis
upregulated_entrez <- bitr(upregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID
'select()' returned 1:1 mapping between keys and columns
Avis : 2.74% of input gene IDs are fail to map...
downregulated_entrez <- bitr(downregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID
'select()' returned 1:1 mapping between keys and columns
Avis : 3.93% of input gene IDs are fail to map...
# KEGG pathway enrichment for upregulated genes
kegg_up <- enrichKEGG(gene = upregulated_entrez, 
                      organism = "hsa", 
                      pvalueCutoff = 0.05)
Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...
Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...
# KEGG pathway enrichment for downregulated genes
kegg_down <- enrichKEGG(gene = downregulated_entrez, 
                        organism = "hsa", 
                        pvalueCutoff = 0.05)

# Visualize KEGG pathway results
dotplot(kegg_up, showCategory = 10, title = "KEGG Pathway Enrichment for Upregulated Genes")

dotplot(kegg_down, showCategory = 6, title = "KEGG Pathway Enrichment for Downregulated Genes")


# Reactome Pathway Enrichment
# Reactome pathway enrichment for upregulated genes
reactome_up <- enrichPathway(gene = upregulated_entrez, 
                             organism = "human", 
                             pvalueCutoff = 0.05)

# Reactome pathway enrichment for downregulated genes
reactome_down <- enrichPathway(gene = downregulated_entrez, 
                               organism = "human", 
                               pvalueCutoff = 0.05)

# Visualize Reactome pathways
dotplot(reactome_up, showCategory = 10, title = "Reactome Pathway Enrichment for Upregulated Genes")

dotplot(reactome_down, showCategory = 10, title = "Reactome Pathway Enrichment for Downregulated Genes")


# Gene Set Enrichment Analysis (GSEA)
# Create a ranked list of genes (log2FC as ranking metric)
gene_list <- Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC
names(gene_list) <- Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene  # Use the $gene column for gene symbols
gene_list <- sort(gene_list, decreasing = TRUE)

# Convert gene symbols to Entrez IDs for GSEA
gene_df <- bitr(names(gene_list), fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
'select()' returned 1:1 mapping between keys and columns
Avis : 2.78% of input gene IDs are fail to map...
# Ensure the gene list matches the Entrez IDs
gene_list <- gene_list[names(gene_list) %in% gene_df$SYMBOL]

# Replace gene symbols with Entrez IDs
names(gene_list) <- gene_df$ENTREZID[match(names(gene_list), gene_df$SYMBOL)]

# Run GSEA using KEGG pathways
gsea_kegg <- gseKEGG(geneList = gene_list, 
                     organism = "hsa", 
                     pvalueCutoff = 0.05)
preparing geneSet collections...
GSEA analysis...
Avis : There are ties in the preranked stats (0.18% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : There were 2 pathways for which P-values were not calculated properly due to unbalanced (positive and negative) gene-level statistic values. For such pathways pval, padj, NES, log2err are set to NA. You can try to increase the value of the argument nPermSimple (for example set it nPermSimple = 10000)Avis : For some of the pathways the P-values were likely overestimated. For such pathways log2err is set to NA.Avis : For some pathways, in reality P-values are less than 1e-10. You can set the `eps` argument to zero for better estimation.leading edge analysis...
done...
# Plot the GSEA results
gseaplot(gsea_kegg, geneSetID = 1, title = "Top KEGG Pathway")


# Extract the name of the top KEGG pathway
top_pathway <- gsea_kegg@result[1, "Description"]

# Plot GSEA with the top pathway's name as the title
gseaplot(gsea_kegg, geneSetID = 1, title = top_pathway)

NA
NA

4.2. Enrichment Analysis-2


# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(msigdbr)
library(enrichplot)

# Load Hallmark gene sets from msigdbr
hallmark_sets <- msigdbr(species = "Homo sapiens", category = "H")  # "H" is for Hallmark gene sets

# Get upregulated and downregulated genes based on log2 fold change and adjusted p-value
upregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC > 0.5 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.05, ]
downregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC < -0.5 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.05, ]

# Convert gene symbols to uppercase for consistency
upregulated_genes$gene <- toupper(upregulated_genes$gene)
downregulated_genes$gene <- toupper(downregulated_genes$gene)

# Check for overlap between your upregulated/downregulated genes and Hallmark gene sets
upregulated_in_hallmark <- intersect(upregulated_genes$gene, hallmark_sets$gene_symbol)
downregulated_in_hallmark <- intersect(downregulated_genes$gene, hallmark_sets$gene_symbol)

# Print the number of overlapping genes for both upregulated and downregulated genes
cat("Number of upregulated genes in Hallmark gene sets:", length(upregulated_in_hallmark), "\n")
Number of upregulated genes in Hallmark gene sets: 1534 
cat("Number of downregulated genes in Hallmark gene sets:", length(downregulated_in_hallmark), "\n")
Number of downregulated genes in Hallmark gene sets: 61 
# If there are genes to analyze, proceed with enrichment analysis
if (length(upregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for upregulated genes using Hallmark gene sets
  hallmark_up <- enricher(gene = upregulated_in_hallmark, 
                          TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                          pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_up) && nrow(hallmark_up) > 0) {
    # Visualize results if available
    dotplot(hallmark_up, showCategory = 20, title = "Hallmark Pathway Enrichment for Upregulated Genes")
  } else {
    cat("No significant enrichment found for upregulated genes.\n")
  }
} else {
  cat("No upregulated genes overlap with Hallmark gene sets.\n")
}


if (length(downregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for downregulated genes using Hallmark gene sets
  hallmark_down <- enricher(gene = downregulated_in_hallmark, 
                            TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                            pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_down) && nrow(hallmark_down) > 0) {
    # Visualize results if available
    dotplot(hallmark_down, showCategory = 20, title = "Hallmark Pathway Enrichment for Downregulated Genes")
  } else {
    cat("No significant enrichment found for downregulated genes.\n")
  }
} else {
  cat("No downregulated genes overlap with Hallmark gene sets.\n")
}

NA
NA
NA
NA
NA

4.3. Hallmark-GSEA

# Gene Set Enrichment Analysis (GSEA) for Hallmark Pathways
# Create a ranked list of genes (log2FC as ranking metric)
gene_list <- Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC
names(gene_list) <- Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene
gene_list <- sort(gene_list, decreasing = TRUE)

# Convert gene symbols to Entrez IDs for GSEA
gene_df <- bitr(names(gene_list), 
                fromType = "SYMBOL", 
                toType = "ENTREZID", 
                OrgDb = org.Hs.eg.db)
'select()' returned 1:1 mapping between keys and columns
Avis : 2.78% of input gene IDs are fail to map...
# Filter out genes without Entrez ID mappings
gene_list <- gene_list[names(gene_list) %in% gene_df$SYMBOL]

# Replace gene symbols with Entrez IDs in the gene list
names(gene_list) <- gene_df$ENTREZID[match(names(gene_list), gene_df$SYMBOL)]

# Run GSEA using Hallmark pathways
gsea_hallmark <- GSEA(geneList = gene_list, 
                      TERM2GENE = hallmark_sets[, c("gs_name", "entrez_gene")], 
                      pvalueCutoff = 0.05)
preparing geneSet collections...
GSEA analysis...
Avis : There are ties in the preranked stats (0.18% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.Avis : For some pathways, in reality P-values are less than 1e-10. You can set the `eps` argument to zero for better estimation.leading edge analysis...
done...
# Check and visualize GSEA results
if (!is.null(gsea_hallmark) && nrow(gsea_hallmark) > 0) {
  # Visualize top GSEA results for Hallmark pathways
  dotplot(gsea_hallmark, showCategory = 20, title = "GSEA for Hallmark Pathways")
  
  # Plot enrichment score for the top pathway
  gseaplot(gsea_hallmark, geneSetID = 1, title = "Top Hallmark Pathway")
  
  # Extract the name of the top Hallmark pathway
  top_hallmark <- gsea_hallmark@result[1, "Description"]
  
  # Plot GSEA with the top pathway's name as the title
  gseaplot(gsea_hallmark, geneSetID = 1, title = top_hallmark)
} else {
  cat("No significant GSEA results for Hallmark pathways.\n")
}

NA
NA
NA

5. ggplot2 for Volcano

library(ggplot2)
library(ggrepel)

# Identify top and bottom genes
top_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.05 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC > 0.5, ]
bottom_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.05 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC < -0.5, ]

# Create a new column for color based on significance
Malignant_CD4Tcells_vs_Normal_CD4Tcells$color <- ifelse(Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC > 0.5, "Upregulated genes",
                                                   ifelse(Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC < -0.5, "Downregulated genes", "Nonsignificant"))

# Create a volcano plot
ggplot(Malignant_CD4Tcells_vs_Normal_CD4Tcells, aes(x = avg_log2FC, y = -log10(p_val_adj))) +
  geom_point(aes(color = color), alpha = 0.7, size = 2) +
  
  # Add labels for top and bottom genes
  geom_text_repel(data = top_genes, aes(label = gene), color = "black", vjust = 1, fontface = "bold") +
  geom_text_repel(data = bottom_genes, aes(label = gene), color = "black", vjust = -1, fontface = "bold") +
  
  # Customize labels and title
  labs(title = "Volcano Plot",
       x = "log2 Fold Change",
       y = "-log10(p-value)") +
  
  # # Add significance threshold lines
   geom_hline(yintercept = -log10(0.00001), linetype = "dashed", color = "black") +
   geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", color = "black") +
  
  # Set colors for top and bottom genes
  scale_color_manual(values = c("Upregulated genes" = "red", "Downregulated genes" = "blue", "Nonsignificant" = "darkgrey")) +
  
  # Customize theme if needed
  theme_minimal()

NA
NA
NA
NA
NA

5. ggplot3 for Volcano

# Load necessary libraries
library(ggplot2)
library(ggrepel)

# Identify top and bottom genes
top_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.00001 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC > 4, ]
bottom_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.00001 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC < -4, ]

# Create a new column for color based on significance
Malignant_CD4Tcells_vs_Normal_CD4Tcells$color <- ifelse(Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC > 0.5, 
                                                        "Upregulated genes",
                                                        ifelse(Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC < -0.5, 
                                                               "Downregulated genes", 
                                                               "Nonsignificant"))

# Create the volcano plot
ggplot(Malignant_CD4Tcells_vs_Normal_CD4Tcells, aes(x = avg_log2FC, y = -log10(p_val_adj))) +
  geom_point(aes(color = color), alpha = 0.7, size = 2) +
  
  # Add labels next to the dots without repel lines
  geom_text(data = top_genes, aes(label = gene), hjust = -0.2, vjust = 0, size = 3, color = "black", fontface = "bold") +
  geom_text(data = bottom_genes, aes(label = gene), hjust = 1.2, vjust = 0, size = 3, color = "black", fontface = "bold") +
  
  # Customize labels and title
  labs(title = "Volcano Plot",
       x = "log2 Fold Change",
       y = "-log10(p-value)") +
  
  # Add significance threshold lines
  geom_hline(yintercept = -log10(0.00001), linetype = "dashed", color = "black") +
  geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", color = "black") +
  
  # Set colors for top and bottom genes
  scale_color_manual(values = c("Upregulated genes" = "red", 
                                "Downregulated genes" = "blue", 
                                "Nonsignificant" = "darkgrey")) +
  
  # Customize theme
  theme_minimal()

NA
NA
---
title: "Differential Expression Analysis of L7 vs Control(Normal CD4 Tcells)-GSEA"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  #rmdformats::readthedown
  html_notebook:
    toc: true
    toc_float: true
    toc_collapsed: true
---

# 1. load libraries
```{r setup, include=FALSE}

library(Seurat)
library(SeuratObject)
library(SeuratData)
library(patchwork)
library(harmony)
library(ggplot2)
library(cowplot)
library(reticulate)
library(Azimuth)
library(dplyr)
library(Rtsne)
library(harmony)
library(gridExtra)
library(EnhancedVolcano)
  

```

# 2. Perform DE analysis using Malignant_CD4Tcells_vs_Normal_CD4Tcells genes
```{r data1, fig.height=8, fig.width=12}

Malignant_CD4Tcells_vs_Normal_CD4Tcells <- read.csv("../../2-DE_on_Harmony_Integration/Filtered_DE_Results_L7_with_MeanExpr.csv", header = T)
```

# 3. Create the EnhancedVolcano plot
```{r enhancedV, fig.height=12, fig.width=16}

EnhancedVolcano(Malignant_CD4Tcells_vs_Normal_CD4Tcells,
                lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
                x = "avg_log2FC",
                y = "p_val_adj",
                title = "MAST with Batch Correction (All Genes)",
                pCutoff = 0.05,
                FCcutoff = 1.0)


EnhancedVolcano(Malignant_CD4Tcells_vs_Normal_CD4Tcells, 
                lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
                x = "avg_log2FC", 
                y = "p_val_adj",
                selectLab = c('EPCAM', 'BCAT1', 'KIR3DL2', 'FOXM1', 'TWIST1', 'TNFSF9', 
                              'CD80',  'IL1B', 'RPS4Y1', 
                              'IL7R', 'TCF7',  'MKI67', 'CD70', 
                              'IL2RA','TRBV6-2', 'TRBV10-3', 'TRBV4-2', 'TRBV9', 'TRBV7-9', 
                              'TRAV12-1', 'CD8B', 'FCGR3A', 'GNLY', 'FOXP3', 'SELL', 
                              'GIMAP1', 'RIPOR2', 'LEF1', 'HOXC9', 'SP5',
                              'CCL17', 'ETV4', 'THY1', 'FOXA2', 'ITGAD', 'S100P', 'TBX4', 
                              'ID1', 'XCL1', 'SOX2', 'CD27', 'CD28','PLS3','CD70','RAB25' , 'TRBV27', 'TRBV2'),
                title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
                xlab = bquote(~Log[2]~ 'fold change'),
                pCutoff = 0.05,
                FCcutoff = 1.5, 
                pointSize = 3.0,
                labSize = 5.0,
                boxedLabels = TRUE,
                colAlpha = 0.5,
                legendPosition = 'right',
                legendLabSize = 10,
                legendIconSize = 4.0,
                drawConnectors = TRUE,
                widthConnectors = 0.5,
                colConnectors = 'grey50',
                arrowheads = FALSE,
                max.overlaps = 30)


library(dplyr)
library(EnhancedVolcano)

# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Filter genes based on lowest p-values but include all genes
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  arrange(p_val_adj, desc(abs(avg_log2FC)))

# Create the EnhancedVolcano plot with the filtered data
EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_log2FC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 0.05,
  FCcutoff = 1.0,
  legendPosition = 'right', 
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Set to FALSE to remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),  # Customize point colors
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]  # Only label significant genes
)



EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_log2FC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
  subtitle = "Highlighting differentially expressed genes",
  pCutoff = 0.05,
  FCcutoff = 1.0,
  legendPosition = 'right',
  colAlpha = 0.8,  # Slight transparency for non-significant points
  col = c('grey70', 'black', 'blue', 'red'),  # Custom color scheme
  gridlines.major = TRUE,
  gridlines.minor = FALSE,
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]
) 


```


# 4. Enrichment Analysis-1
```{r , fig.height=6, fig.width=10}

# Step-by-Step Guide for Gene Set Enrichment Analysis (GSEA) or Over-Representation Analysis (ORA)

# Load the necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(ReactomePA)

# Get upregulated genes based on log2FC and p-value thresholds
upregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC > 0.5 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.05, ]

# Get downregulated genes based on log2FC and p-value thresholds
downregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC < -0.5 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.05, ]

# Gene Ontology (GO) Enrichment Analysis
# GO enrichment for upregulated genes
go_up <- enrichGO(gene = upregulated_genes$gene, 
                  OrgDb = org.Hs.eg.db, 
                  keyType = "SYMBOL", 
                  ont = "BP",   # Biological Process (BP), Molecular Function (MF), Cellular Component (CC)
                  pAdjustMethod = "BH", 
                  pvalueCutoff = 0.05)

# GO enrichment for downregulated genes
go_down <- enrichGO(gene = downregulated_genes$gene, 
                    OrgDb = org.Hs.eg.db, 
                    keyType = "SYMBOL", 
                    ont = "BP", 
                    pAdjustMethod = "BH", 
                    pvalueCutoff = 0.05)

# Visualize the top enriched GO terms
dotplot(go_up, showCategory = 10, title = "GO Enrichment for Upregulated Genes")
dotplot(go_down, showCategory = 10, title = "GO Enrichment for Downregulated Genes")

# KEGG Pathway Enrichment
# Convert gene symbols to Entrez IDs for KEGG analysis
upregulated_entrez <- bitr(upregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID
downregulated_entrez <- bitr(downregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID

# KEGG pathway enrichment for upregulated genes
kegg_up <- enrichKEGG(gene = upregulated_entrez, 
                      organism = "hsa", 
                      pvalueCutoff = 0.05)

# KEGG pathway enrichment for downregulated genes
kegg_down <- enrichKEGG(gene = downregulated_entrez, 
                        organism = "hsa", 
                        pvalueCutoff = 0.05)

# Visualize KEGG pathway results
dotplot(kegg_up, showCategory = 10, title = "KEGG Pathway Enrichment for Upregulated Genes")
dotplot(kegg_down, showCategory = 6, title = "KEGG Pathway Enrichment for Downregulated Genes")

# Reactome Pathway Enrichment
# Reactome pathway enrichment for upregulated genes
reactome_up <- enrichPathway(gene = upregulated_entrez, 
                             organism = "human", 
                             pvalueCutoff = 0.05)

# Reactome pathway enrichment for downregulated genes
reactome_down <- enrichPathway(gene = downregulated_entrez, 
                               organism = "human", 
                               pvalueCutoff = 0.05)

# Visualize Reactome pathways
dotplot(reactome_up, showCategory = 10, title = "Reactome Pathway Enrichment for Upregulated Genes")
dotplot(reactome_down, showCategory = 10, title = "Reactome Pathway Enrichment for Downregulated Genes")

# Gene Set Enrichment Analysis (GSEA)
# Create a ranked list of genes (log2FC as ranking metric)
gene_list <- Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC
names(gene_list) <- Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene  # Use the $gene column for gene symbols
gene_list <- sort(gene_list, decreasing = TRUE)

# Convert gene symbols to Entrez IDs for GSEA
gene_df <- bitr(names(gene_list), fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)

# Ensure the gene list matches the Entrez IDs
gene_list <- gene_list[names(gene_list) %in% gene_df$SYMBOL]

# Replace gene symbols with Entrez IDs
names(gene_list) <- gene_df$ENTREZID[match(names(gene_list), gene_df$SYMBOL)]

# Run GSEA using KEGG pathways
gsea_kegg <- gseKEGG(geneList = gene_list, 
                     organism = "hsa", 
                     pvalueCutoff = 0.05)

# Plot the GSEA results
gseaplot(gsea_kegg, geneSetID = 1, title = "Top KEGG Pathway")

# Extract the name of the top KEGG pathway
top_pathway <- gsea_kegg@result[1, "Description"]

# Plot GSEA with the top pathway's name as the title
gseaplot(gsea_kegg, geneSetID = 1, title = top_pathway)


```




# 4.2. Enrichment Analysis-2
```{r , fig.height=6, fig.width=10}

# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(msigdbr)
library(enrichplot)

# Load Hallmark gene sets from msigdbr
hallmark_sets <- msigdbr(species = "Homo sapiens", category = "H")  # "H" is for Hallmark gene sets

# Get upregulated and downregulated genes based on log2 fold change and adjusted p-value
upregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC > 0.5 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.05, ]
downregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC < -0.5 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.05, ]

# Convert gene symbols to uppercase for consistency
upregulated_genes$gene <- toupper(upregulated_genes$gene)
downregulated_genes$gene <- toupper(downregulated_genes$gene)

# Check for overlap between your upregulated/downregulated genes and Hallmark gene sets
upregulated_in_hallmark <- intersect(upregulated_genes$gene, hallmark_sets$gene_symbol)
downregulated_in_hallmark <- intersect(downregulated_genes$gene, hallmark_sets$gene_symbol)

# Print the number of overlapping genes for both upregulated and downregulated genes
cat("Number of upregulated genes in Hallmark gene sets:", length(upregulated_in_hallmark), "\n")
cat("Number of downregulated genes in Hallmark gene sets:", length(downregulated_in_hallmark), "\n")

# If there are genes to analyze, proceed with enrichment analysis
if (length(upregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for upregulated genes using Hallmark gene sets
  hallmark_up <- enricher(gene = upregulated_in_hallmark, 
                          TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                          pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_up) && nrow(hallmark_up) > 0) {
    # Visualize results if available
    dotplot(hallmark_up, showCategory = 20, title = "Hallmark Pathway Enrichment for Upregulated Genes")
  } else {
    cat("No significant enrichment found for upregulated genes.\n")
  }
} else {
  cat("No upregulated genes overlap with Hallmark gene sets.\n")
}

if (length(downregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for downregulated genes using Hallmark gene sets
  hallmark_down <- enricher(gene = downregulated_in_hallmark, 
                            TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                            pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_down) && nrow(hallmark_down) > 0) {
    # Visualize results if available
    dotplot(hallmark_down, showCategory = 20, title = "Hallmark Pathway Enrichment for Downregulated Genes")
  } else {
    cat("No significant enrichment found for downregulated genes.\n")
  }
} else {
  cat("No downregulated genes overlap with Hallmark gene sets.\n")
}





```
# 4.3. Hallmark-GSEA
```{r , fig.height=8, fig.width=12}
# Gene Set Enrichment Analysis (GSEA) for Hallmark Pathways
# Create a ranked list of genes (log2FC as ranking metric)
gene_list <- Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC
names(gene_list) <- Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene
gene_list <- sort(gene_list, decreasing = TRUE)

# Convert gene symbols to Entrez IDs for GSEA
gene_df <- bitr(names(gene_list), 
                fromType = "SYMBOL", 
                toType = "ENTREZID", 
                OrgDb = org.Hs.eg.db)

# Filter out genes without Entrez ID mappings
gene_list <- gene_list[names(gene_list) %in% gene_df$SYMBOL]

# Replace gene symbols with Entrez IDs in the gene list
names(gene_list) <- gene_df$ENTREZID[match(names(gene_list), gene_df$SYMBOL)]

# Run GSEA using Hallmark pathways
gsea_hallmark <- GSEA(geneList = gene_list, 
                      TERM2GENE = hallmark_sets[, c("gs_name", "entrez_gene")], 
                      pvalueCutoff = 0.05)

# Check and visualize GSEA results
if (!is.null(gsea_hallmark) && nrow(gsea_hallmark) > 0) {
  # Visualize top GSEA results for Hallmark pathways
  dotplot(gsea_hallmark, showCategory = 20, title = "GSEA for Hallmark Pathways")
  
  # Plot enrichment score for the top pathway
  gseaplot(gsea_hallmark, geneSetID = 1, title = "Top Hallmark Pathway")
  
  # Extract the name of the top Hallmark pathway
  top_hallmark <- gsea_hallmark@result[1, "Description"]
  
  # Plot GSEA with the top pathway's name as the title
  gseaplot(gsea_hallmark, geneSetID = 1, title = top_hallmark)
} else {
  cat("No significant GSEA results for Hallmark pathways.\n")
}



```



# 5. ggplot2 for Volcano
```{r , fig.height=8, fig.width=12}
library(ggplot2)
library(ggrepel)

# Identify top and bottom genes
top_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.05 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC > 0.5, ]
bottom_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.05 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC < -0.5, ]

# Create a new column for color based on significance
Malignant_CD4Tcells_vs_Normal_CD4Tcells$color <- ifelse(Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC > 0.5, "Upregulated genes",
                                                   ifelse(Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC < -0.5, "Downregulated genes", "Nonsignificant"))

# Create a volcano plot
ggplot(Malignant_CD4Tcells_vs_Normal_CD4Tcells, aes(x = avg_log2FC, y = -log10(p_val_adj))) +
  geom_point(aes(color = color), alpha = 0.7, size = 2) +
  
  # Add labels for top and bottom genes
  geom_text_repel(data = top_genes, aes(label = gene), color = "black", vjust = 1, fontface = "bold") +
  geom_text_repel(data = bottom_genes, aes(label = gene), color = "black", vjust = -1, fontface = "bold") +
  
  # Customize labels and title
  labs(title = "Volcano Plot",
       x = "log2 Fold Change",
       y = "-log10(p-value)") +
  
  # # Add significance threshold lines
   geom_hline(yintercept = -log10(0.00001), linetype = "dashed", color = "black") +
   geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", color = "black") +
  
  # Set colors for top and bottom genes
  scale_color_manual(values = c("Upregulated genes" = "red", "Downregulated genes" = "blue", "Nonsignificant" = "darkgrey")) +
  
  # Customize theme if needed
  theme_minimal()





```


# 5. ggplot3 for Volcano
```{r , fig.height=8, fig.width=12}
# Load necessary libraries
library(ggplot2)
library(ggrepel)

# Identify top and bottom genes
top_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.00001 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC > 4, ]
bottom_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.00001 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC < -4, ]

# Create a new column for color based on significance
Malignant_CD4Tcells_vs_Normal_CD4Tcells$color <- ifelse(Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC > 0.5, 
                                                        "Upregulated genes",
                                                        ifelse(Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_log2FC < -0.5, 
                                                               "Downregulated genes", 
                                                               "Nonsignificant"))

# Create the volcano plot
ggplot(Malignant_CD4Tcells_vs_Normal_CD4Tcells, aes(x = avg_log2FC, y = -log10(p_val_adj))) +
  geom_point(aes(color = color), alpha = 0.7, size = 2) +
  
  # Add labels next to the dots without repel lines
  geom_text(data = top_genes, aes(label = gene), hjust = -0.2, vjust = 0, size = 3, color = "black", fontface = "bold") +
  geom_text(data = bottom_genes, aes(label = gene), hjust = 1.2, vjust = 0, size = 3, color = "black", fontface = "bold") +
  
  # Customize labels and title
  labs(title = "Volcano Plot",
       x = "log2 Fold Change",
       y = "-log10(p-value)") +
  
  # Add significance threshold lines
  geom_hline(yintercept = -log10(0.00001), linetype = "dashed", color = "black") +
  geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", color = "black") +
  
  # Set colors for top and bottom genes
  scale_color_manual(values = c("Upregulated genes" = "red", 
                                "Downregulated genes" = "blue", 
                                "Nonsignificant" = "darkgrey")) +
  
  # Customize theme
  theme_minimal()


```

