1. load libraries

2. Load the filtered list on mean expression


# Load the DE results from CSV
df <- read.csv("../comparison_L5_vs_L6_with_mean_expression_filtered.csv", stringsAsFactors = FALSE)


DE_results_df <- df

3. Summarize Markers

markers <- DE_results_df

summarize_markers <- function(markers) {
  num_pval0 <- sum(markers$p_val_adj == 0)
  num_pval1 <- sum(markers$p_val_adj == 1)
  num_upregulated <- sum(markers$avg_log2FC > 1.5)
  num_downregulated <- sum(markers$avg_log2FC < -1)
  num_significant <- sum(markers$p_val_adj < 0.05)
  
  
  cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
  cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
  cat("Number of upregulated genes (avg_log2FC > 1.5):", num_upregulated, "\n")
  cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
  cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
}

cat("Markers Summary at 0.05:\n")
Markers Summary at 0.05:
summarize_markers(markers)
Number of genes with p_val_adj = 0: 365 
Number of genes with p_val_adj = 1: 467 
Number of upregulated genes (avg_log2FC > 1.5): 128 
Number of downregulated genes (avg_log2FC < -1): 338 
Number of significant genes (p_val_adj < 0.05): 3671 
markers2 <- DE_results_df
summarize_markers <- function(markers) {
  num_pval0 <- sum(markers$p_val_adj == 0)
  num_pval1 <- sum(markers$p_val_adj == 1)
  num_upregulated <- sum(markers$avg_log2FC > 1.5)
  num_downregulated <- sum(markers$avg_log2FC < -1)
  num_significant <- sum(markers$p_val_adj < 1e-4)
  
  cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
  cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
  cat("Number of upregulated genes (avg_log2FC > 1.5):", num_upregulated, "\n")
  cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
  cat("Number of significant genes (p_val_adj < 1e-4):", num_significant, "\n")
}

cat("Markers Summary at 1e-4:\n")
Markers Summary at 1e-4:
summarize_markers(markers2)
Number of genes with p_val_adj = 0: 365 
Number of genes with p_val_adj = 1: 467 
Number of upregulated genes (avg_log2FC > 1.5): 128 
Number of downregulated genes (avg_log2FC < -1): 338 
Number of significant genes (p_val_adj < 1e-4): 3468 
markers3 <- DE_results_df
summarize_markers <- function(markers) {
  num_pval0 <- sum(markers$p_val_adj == 0)
  num_pval1 <- sum(markers$p_val_adj == 1)
  num_upregulated <- sum(markers$avg_log2FC > 1.5)
  num_downregulated <- sum(markers$avg_log2FC < -1)
  num_significant <- sum(markers$p_val_adj < 1e-6)
  
  cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
  cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
  cat("Number of upregulated genes (avg_log2FC > 1.5):", num_upregulated, "\n")
  cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
  cat("Number of significant genes (p_val_adj < 1e-6):", num_significant, "\n")
}

cat("Markers Summary at 1e-6:\n")
Markers Summary at 1e-6:
summarize_markers(markers3)
Number of genes with p_val_adj = 0: 365 
Number of genes with p_val_adj = 1: 467 
Number of upregulated genes (avg_log2FC > 1.5): 128 
Number of downregulated genes (avg_log2FC < -1): 338 
Number of significant genes (p_val_adj < 1e-6): 3325 
markers4 <- DE_results_df
summarize_markers <- function(markers) {
  num_pval0 <- sum(markers$p_val_adj == 0)
  num_pval1 <- sum(markers$p_val_adj == 1)
  num_upregulated <- sum(markers$avg_log2FC > 1.5)
  num_downregulated <- sum(markers$avg_log2FC < -1)
  num_significant <- sum(markers$p_val_adj < 1e-10)
  
  cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
  cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
  cat("Number of upregulated genes (avg_log2FC > 1.5):", num_upregulated, "\n")
  cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
  cat("Number of significant genes (p_val_adj < 1e-10):", num_significant, "\n")
  }

cat("Markers Summary at 1e-10:\n")
Markers Summary at 1e-10:
summarize_markers(markers4)
Number of genes with p_val_adj = 0: 365 
Number of genes with p_val_adj = 1: 467 
Number of upregulated genes (avg_log2FC > 1.5): 128 
Number of downregulated genes (avg_log2FC < -1): 338 
Number of significant genes (p_val_adj < 1e-10): 3064 
markers5 <- DE_results_df
summarize_markers <- function(markers) {
  num_pval0 <- sum(markers$p_val_adj == 0)
  num_pval1 <- sum(markers$p_val_adj == 1)
  num_upregulated <- sum(markers$avg_log2FC > 1.5)
  num_downregulated <- sum(markers$avg_log2FC < -1)
  num_significant <- sum(markers$p_val_adj < 1e-15)
  
  cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
  cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
  cat("Number of upregulated genes (avg_log2FC > 1.5):", num_upregulated, "\n")
  cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
  cat("Number of significant genes (p_val_adj < 1e-15):", num_significant, "\n")
}

cat("Markers Summary at 1e-15:\n")
Markers Summary at 1e-15:
summarize_markers(markers5)
Number of genes with p_val_adj = 0: 365 
Number of genes with p_val_adj = 1: 467 
Number of upregulated genes (avg_log2FC > 1.5): 128 
Number of downregulated genes (avg_log2FC < -1): 338 
Number of significant genes (p_val_adj < 1e-15): 2798 

4. Volcano Plots

library(ggplot2)
library(dplyr)

Attachement du package : ‘dplyr’

L'objet suivant est masqué depuis ‘package:Biobase’:

    combine

Les objets suivants sont masqués depuis ‘package:GenomicRanges’:

    intersect, setdiff, union

L'objet suivant est masqué depuis ‘package:GenomeInfoDb’:

    intersect

Les objets suivants sont masqués depuis ‘package:IRanges’:

    collapse, desc, intersect, setdiff, slice, union

Les objets suivants sont masqués depuis ‘package:S4Vectors’:

    first, intersect, rename, setdiff, setequal, union

Les objets suivants sont masqués depuis ‘package:BiocGenerics’:

    combine, intersect, setdiff, union

L'objet suivant est masqué depuis ‘package:matrixStats’:

    count

Les objets suivants sont masqués depuis ‘package:stats’:

    filter, lag

Les objets suivants sont masqués depuis ‘package:base’:

    intersect, setdiff, setequal, union
library(ggrepel)


# Ensure correct column names
colnames(DE_results_df)
[1] "gene"             "p_val"            "avg_log2FC"       "pct.1"            "pct.2"            "p_val_adj"       
[7] "mean_expr_ident1" "mean_expr_ident2"
# Define significance categories
volcano_data <- DE_results_df %>%
  mutate(
    significance = case_when(
      p_val_adj < 1e-20 & avg_log2FC > 2 ~ "Most Upregulated",
      p_val_adj < 1e-20 & avg_log2FC < -2 ~ "Most Downregulated",
      p_val_adj < 0.05 & avg_log2FC > 2 ~ "Upregulated",
      p_val_adj < 0.05 & avg_log2FC < -2 ~ "Downregulated",
      TRUE ~ "Not Significant"
    )
  )

# Select only very significant genes for labeling
top_genes <- volcano_data %>%
  filter(p_val_adj < 0.05 & (avg_log2FC > 2 | avg_log2FC < -2))

ggplot(volcano_data, aes(x = avg_log2FC, y = -log10(p_val_adj), color = significance)) +
  
  # Main points
  geom_point(alpha = 0.7, size = 2.5) +
  
  # Highlight highly significant genes with larger points
  geom_point(data = top_genes, aes(x = avg_log2FC, y = -log10(p_val_adj)), 
             color = "black", size = 3, shape = 21, fill = "black") +

  # Custom color scheme
  scale_color_manual(values = c(
    "Most Upregulated" = "darkred",
    "Most Downregulated" = "darkblue",
    "Upregulated" = "red",
    "Downregulated" = "blue",
    "Not Significant" = "grey"
  )) +

  # Add gene labels (only for highly significant genes)
  geom_text_repel(data = top_genes, aes(label = gene),  
                  size = 4, box.padding = 0.5, max.overlaps = 10, segment.color = NA) +
  
  # Add threshold lines
  geom_vline(xintercept = c(-2, 2), linetype = "dashed", color = "black") +  
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +  

  # Improve theme
  theme_minimal(base_size = 14) +
  labs(title = "Volcano Plot: Pseudobulk DESeq2 Analysis",
       x = "Log2 Fold Change",
       y = "-Log10 Adjusted P-Value",
       color = "Significance") +

  ylim(0, 50)  # Avoid extreme scaling issues

NA
NA

EnhancedVolcano plot


library(dplyr)
library(EnhancedVolcano)

# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Filter genes based on lowest p-values but include all genes
filtered_genes <- markers %>%
  arrange(p_val_adj, desc(abs(avg_log2FC)))

# Create the EnhancedVolcano plot with the filtered data
EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 1e-6 & abs(filtered_genes$avg_log2FC) >= 1.5, filtered_genes$gene, NA),
  x = "avg_log2FC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 1e-6,
  FCcutoff = 1.0,
  legendPosition = 'right', 
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Set to FALSE to remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),  # Customize point colors
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]  # Only label significant genes
)
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...

EnhancedVolcano plot


library(ggplot2)
library(EnhancedVolcano)
library(dplyr)

# Define the output directory
output_dir <- "Volcano_Plot_Malignant_vs_Control"
dir.create(output_dir, showWarnings = FALSE)

 Malignant_CD4Tcells_vs_Normal_CD4Tcells <- filtered_genes

# First Volcano Plot
p1 <- EnhancedVolcano(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells,
  lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
  x = "avg_log2FC",
  y = "p_val_adj",
  title = "Malignant_CD4Tcells_vs_Normal_CD4Tcells",
  pCutoff = 1e-4,
  FCcutoff = 1.0
)
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
print(p1)  # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot1.png"), plot = p1, width = 14, height = 10, dpi = 300)

# Second Volcano Plot with selected genes
p2 <- EnhancedVolcano(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells, 
  lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
  x = "avg_log2FC", 
  y = "p_val_adj",
  selectLab = c('EPCAM', 'BCAT1', 'KIR3DL2', 'FOXM1', 'TWIST1', 'TNFSF9', 
                              'CD80',  'IL1B', 'RPS4Y1', "TOX", "CD52", "TWIST1", "CCR4", "CCR7","PDCD1",
                              'IL7R', 'TCF7',  'MKI67', 'CD70', "DPP4",
                              'IL2RA','TRBV6-2', 'TRBV10-3', 'TRBV4-2', 'TRBV9', 'TRBV7-9', 
                              'TRAV12-1', 'CD8B', 'FCGR3A', 'GNLY', 'FOXP3', 'SELL', 
                              'GIMAP1', 'RIPOR2', 'LEF1', 'HOXC9', 'SP5',
                              'CCL17', 'ETV4', 'THY1', 'FOXA2', 'ITGAD', 'S100P', 'TBX4', 
                              'ID1', 'XCL1', 'SOX2', 'CD27', 'CD28','PLS3','CD70','RAB25' , 'TRBV27', 'TRBV2'),
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  xlab = bquote(~Log[2]~ 'fold change'),
  pCutoff = 0.05,
  FCcutoff = 1.5, 
  pointSize = 3.0,
  labSize = 5.0,
  boxedLabels = TRUE,
  colAlpha = 0.5,
  legendPosition = 'right',
  legendLabSize = 10,
  legendIconSize = 4.0,
  drawConnectors = TRUE,
  widthConnectors = 0.5,
  colConnectors = 'grey50',
  arrowheads = FALSE,
  max.overlaps = 30
)
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
print(p2)  # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot2.png"), plot = p2, width = 14, height = 10, dpi = 300)

# Filtering genes
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  arrange(p_val_adj, desc(abs(avg_log2FC)))

# Third Volcano Plot - Filtering by p-value and logFC
p3 <- EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 1e-4 & abs(filtered_genes$avg_log2FC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_log2FC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 1e-4,
  FCcutoff = 1.0,
  legendPosition = 'right', 
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),  # Customize point colors
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]
)
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
print(p3)  # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot3.png"), plot = p3, width = 14, height = 10, dpi = 300)

# Fourth Volcano Plot - More refined filtering
p4 <- EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 1e-4 & abs(filtered_genes$avg_log2FC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_log2FC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
  subtitle = "Highlighting differentially expressed genes",
  pCutoff = 1e-4,
  FCcutoff = 1.0,
  legendPosition = 'right',
  colAlpha = 0.8,  # Slight transparency for non-significant points
  col = c('grey70', 'black', 'blue', 'red'),  # Custom color scheme
  gridlines.major = TRUE,
  gridlines.minor = FALSE,
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]
)
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
print(p4)  # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot4.png"), plot = p4, width = 14, height = 10, dpi = 300)

message("All volcano plots have been displayed and saved successfully in the 'Malignant_vs_Control' folder.")
All volcano plots have been displayed and saved successfully in the 'Malignant_vs_Control' folder.

5. Enrichment Analysis-All_Pathways

# Load necessary libraries
library(clusterProfiler)

Registered S3 methods overwritten by 'treeio':
  method              from    
  MRCA.phylo          tidytree
  MRCA.treedata       tidytree
  Nnode.treedata      tidytree
  Ntip.treedata       tidytree
  ancestor.phylo      tidytree
  ancestor.treedata   tidytree
  child.phylo         tidytree
  child.treedata      tidytree
  full_join.phylo     tidytree
  full_join.treedata  tidytree
  groupClade.phylo    tidytree
  groupClade.treedata tidytree
  groupOTU.phylo      tidytree
  groupOTU.treedata   tidytree
  is.rooted.treedata  tidytree
  nodeid.phylo        tidytree
  nodeid.treedata     tidytree
  nodelab.phylo       tidytree
  nodelab.treedata    tidytree
  offspring.phylo     tidytree
  offspring.treedata  tidytree
  parent.phylo        tidytree
  parent.treedata     tidytree
  root.treedata       tidytree
  rootnode.phylo      tidytree
  sibling.phylo       tidytree
clusterProfiler v4.6.2  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/

If you use clusterProfiler in published research, please cite:
T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141

Attachement du package : ‘clusterProfiler’

L'objet suivant est masqué depuis ‘package:IRanges’:

    slice

L'objet suivant est masqué depuis ‘package:S4Vectors’:

    rename

L'objet suivant est masqué depuis ‘package:stats’:

    filter
library(org.Hs.eg.db)
Le chargement a nécessité le package : AnnotationDbi

Attachement du package : ‘AnnotationDbi’

L'objet suivant est masqué depuis ‘package:clusterProfiler’:

    select

L'objet suivant est masqué depuis ‘package:dplyr’:

    select
library(enrichplot)
library(ReactomePA)
ReactomePA v1.42.0  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/

If you use ReactomePA in published research, please cite:
Guangchuang Yu, Qing-Yu He. ReactomePA: an R/Bioconductor package for reactome pathway analysis and visualization. Molecular BioSystems 2016, 12(2):477-479
library(DOSE) # For GSEA analysis
DOSE v3.24.2  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/

If you use DOSE in published research, please cite:
Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609
library(ggplot2) # Ensure ggplot2 is available for plotting
library(dplyr)

# Define the output folder where the results will be saved
output_folder <- "L5_vs_L6/"

# Create the output folder if it doesn't exist
if (!dir.exists(output_folder)) {
  dir.create(output_folder)
}

# Define the number of upregulated and downregulated genes to select
UP_genes <- 250
Down_genes <- 250

# Define threshold for differential expression selection (modified thresholds)
logFC_up_threshold <- 1          # Upregulated logFC threshold
logFC_down_threshold <- -1         # Downregulated logFC threshold

# Load your differential expression results (modify based on actual data structure)
# Malignant_CD4Tcells_vs_Normal_CD4Tcells <- read.csv("Your_DE_Results_File.csv")

# Filter the genes based on avg_log2FC and arrange by p_val_adj
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  filter(avg_log2FC > logFC_up_threshold | avg_log2FC < logFC_down_threshold) %>%
  arrange(p_val_adj)

# Separate upregulated and downregulated genes
upregulated_genes <- filtered_genes %>%
  filter(avg_log2FC > logFC_up_threshold)

downregulated_genes <- filtered_genes %>%
  filter(avg_log2FC < logFC_down_threshold)

# Check if there are fewer than the specified number of upregulated genes
if (nrow(upregulated_genes) < UP_genes) {
  top_upregulated_genes <- upregulated_genes
  cat("Number of upregulated genes selected:", nrow(top_upregulated_genes), "\n")
  cat("p_val_adj value for the last selected upregulated gene:", tail(top_upregulated_genes$p_val_adj, 1), "\n")
} else {
  # Select the specified number of upregulated genes
  top_upregulated_genes <- upregulated_genes %>%
    head(UP_genes)
  cat("Number of upregulated genes selected:", nrow(top_upregulated_genes), "\n")
  cat("p_val_adj value for the last selected upregulated gene:", tail(top_upregulated_genes$p_val_adj, 1), "\n")
}
Number of upregulated genes selected: 250 
p_val_adj value for the last selected upregulated gene: 1.976288e-126 
# Check if there are fewer than the specified number of downregulated genes
if (nrow(downregulated_genes) < Down_genes) {
  top_downregulated_genes <- downregulated_genes
  cat("Number of downregulated genes selected:", nrow(top_downregulated_genes), "\n")
  cat("p_val_adj value for the last selected downregulated gene:", tail(top_downregulated_genes$p_val_adj, 1), "\n")
} else {
  # Select the specified number of downregulated genes
  top_downregulated_genes <- downregulated_genes %>%
    head(Down_genes)
  cat("Number of downregulated genes selected:", nrow(top_downregulated_genes), "\n")
  cat("p_val_adj value for the last selected downregulated gene:", tail(top_downregulated_genes$p_val_adj, 1), "\n")
}
Number of downregulated genes selected: 250 
p_val_adj value for the last selected downregulated gene: 1.091314e-134 
# Combine the top upregulated and downregulated genes
top_genes <- bind_rows(top_upregulated_genes, top_downregulated_genes)

# Check for missing genes (NAs) in the gene column and remove them
top_genes <- na.omit(top_genes)

# Save upregulated and downregulated gene results to CSV
write.csv(top_upregulated_genes, paste0(output_folder, "upregulated_genes.csv"), row.names = FALSE)
write.csv(top_downregulated_genes, paste0(output_folder, "downregulated_genes.csv"), row.names = FALSE)

# Convert gene symbols to Entrez IDs for enrichment analysis, with checks for missing values
upregulated_entrez <- bitr(top_upregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
'select()' returned 1:1 mapping between keys and columns
Avis : 3.6% of input gene IDs are fail to map...
downregulated_entrez <- bitr(top_downregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
'select()' returned 1:1 mapping between keys and columns
Avis : 3.6% of input gene IDs are fail to map...
# Check for missing Entrez IDs and retain gene names
missing_upregulated <- top_upregulated_genes$gene[!top_upregulated_genes$gene %in% upregulated_entrez$SYMBOL]
missing_downregulated <- top_downregulated_genes$gene[!top_downregulated_genes$gene %in% downregulated_entrez$SYMBOL]

# Print out the missing gene symbols for debugging
cat("Missing upregulated genes:\n", missing_upregulated, "\n")
Missing upregulated genes:
 AC097518.2 AC068672.2 AC022613.1 AC092939.1 KIAA1211 AC004160.1 HIST1H1B MT-ATP8 CCDC58 
cat("Missing downregulated genes:\n", missing_downregulated, "\n")
Missing downregulated genes:
 AP001783.1 AC114977.1 AL606807.1 PALM2-AKAP2 AC104365.1 AC008875.3 AC099552.1 ARNTL C5orf56 
# Merge the Entrez IDs back with the original data frames to retain gene names
top_upregulated_genes <- merge(top_upregulated_genes, upregulated_entrez, by.x = "gene", by.y = "SYMBOL", all.x = TRUE)
top_downregulated_genes <- merge(top_downregulated_genes, downregulated_entrez, by.x = "gene", by.y = "SYMBOL", all.x = TRUE)

# Remove genes that couldn't be mapped to Entrez IDs
top_upregulated_genes <- top_upregulated_genes[!is.na(top_upregulated_genes$ENTREZID), ]
top_downregulated_genes <- top_downregulated_genes[!is.na(top_downregulated_genes$ENTREZID), ]

# Extract Entrez IDs for enrichment analysis
upregulated_entrez <- top_upregulated_genes$ENTREZID
downregulated_entrez <- top_downregulated_genes$ENTREZID

# Define a function to safely run enrichment, plot results, and save them
safe_enrichGO <- function(gene_list, title, filename) {
  if (length(gene_list) > 0) {
    result <- enrichGO(gene = gene_list, OrgDb = org.Hs.eg.db, keyType = "SYMBOL",
                       ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.05, readable = TRUE)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)  
      ggsave(paste0(output_folder, gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0(output_folder, filename), row.names = FALSE)
    } else {
      message(paste("No significant enrichment found for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

safe_enrichKEGG <- function(entrez_list, title, filename) {
  if (length(entrez_list) > 0) {
    result <- enrichKEGG(gene = entrez_list, organism = "hsa", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      result <- setReadable(result, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)
      ggsave(paste0(output_folder, gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0(output_folder, filename), row.names = FALSE)
    } else {
      message(paste("No significant KEGG pathways found for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

safe_enrichReactome <- function(entrez_list, title, filename) {
  if (length(entrez_list) > 0) {
    result <- enrichPathway(gene = entrez_list, organism = "human", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      result <- setReadable(result, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)
      ggsave(paste0(output_folder, gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0(output_folder, filename), row.names = FALSE)
    } else {
      message(paste("No significant Reactome pathways found for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

# Perform enrichment analyses, generate plots, and save results
safe_enrichGO(top_upregulated_genes$gene, "GO Enrichment for Upregulated Genes", "upregulated_GO_results.csv")

safe_enrichGO(top_downregulated_genes$gene, "GO Enrichment for Downregulated Genes", "downregulated_GO_results.csv")


safe_enrichKEGG(upregulated_entrez, "KEGG Pathway Enrichment for Upregulated Genes", "upregulated_KEGG_results.csv")
Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...
Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...

safe_enrichKEGG(downregulated_entrez, "KEGG Pathway Enrichment for Downregulated Genes", "downregulated_KEGG_results.csv")


safe_enrichReactome(upregulated_entrez, "Reactome Pathway Enrichment for Upregulated Genes", "upregulated_Reactome_results.csv")
No significant Reactome pathways found for: Reactome Pathway Enrichment for Upregulated Genes
safe_enrichReactome(downregulated_entrez, "Reactome Pathway Enrichment for Downregulated Genes", "downregulated_Reactome_results.csv")

Enrichment Analysis_Hallmark


# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(msigdbr)
library(enrichplot)
library(ggplot2)
library(dplyr)

# Define the output folder where the results will be saved
output_folder <- "L5_vs_L6/"

# Create the output folder if it doesn't exist
if (!dir.exists(output_folder)) {
  dir.create(output_folder)
}

# Load Hallmark gene sets from msigdbr
hallmark_sets <- msigdbr(species = "Homo sapiens", collection = "H")  # "H" is for Hallmark gene sets

# Convert gene symbols to uppercase for consistency
top_upregulated_genes$gene <- toupper(top_upregulated_genes$gene)
top_downregulated_genes$gene <- toupper(top_downregulated_genes$gene)

# Check for overlap between your upregulated/downregulated genes and Hallmark gene sets
upregulated_in_hallmark <- intersect(top_upregulated_genes$gene, hallmark_sets$gene_symbol)
downregulated_in_hallmark <- intersect(top_downregulated_genes$gene, hallmark_sets$gene_symbol)

# Print the number of overlapping genes for both upregulated and downregulated genes
cat("Number of upregulated genes in Hallmark gene sets:", length(upregulated_in_hallmark), "\n")
Number of upregulated genes in Hallmark gene sets: 103 
cat("Number of downregulated genes in Hallmark gene sets:", length(downregulated_in_hallmark), "\n")
Number of downregulated genes in Hallmark gene sets: 102 
# If there are genes to analyze, proceed with enrichment analysis
if (length(upregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for upregulated genes using Hallmark gene sets
  hallmark_up <- enricher(gene = upregulated_in_hallmark, 
                          TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                          pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_up) && nrow(hallmark_up) > 0) {
    # Visualize results if available
    up_dotplot <- dotplot(hallmark_up, showCategory = 20, title = "Hallmark Pathway Enrichment for Upregulated Genes")
    
    # Display the plot in the notebook
    print(up_dotplot)
    
    # Save the dotplot to a PNG file
    ggsave(paste0(output_folder, "hallmark_upregulated_dotplot.png"), plot = up_dotplot, width = 10, height = 8)
    
    # Optionally, save the results as CSV
    write.csv(as.data.frame(hallmark_up), file = paste0(output_folder, "hallmark_upregulated_enrichment.csv"), row.names = FALSE)
  } else {
    cat("No significant enrichment found for upregulated genes.\n")
  }
} else {
  cat("No upregulated genes overlap with Hallmark gene sets.\n")
}


if (length(downregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for downregulated genes using Hallmark gene sets
  hallmark_down <- enricher(gene = downregulated_in_hallmark, 
                            TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                            pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_down) && nrow(hallmark_down) > 0) {
    # Visualize results if available
    down_dotplot <- dotplot(hallmark_down, showCategory = 20, title = "Hallmark Pathway Enrichment for Downregulated Genes")
    
    # Display the plot in the notebook
    print(down_dotplot)
    
    # Save the dotplot to a PNG file
    ggsave(paste0(output_folder, "hallmark_downregulated_dotplot.png"), plot = down_dotplot, width = 10, height = 8)
    
    # Optionally, save the results as CSV
    write.csv(as.data.frame(hallmark_down), file = paste0(output_folder, "hallmark_downregulated_enrichment.csv"), row.names = FALSE)
  } else {
    cat("No significant enrichment found for downregulated genes.\n")
  }
} else {
  cat("No downregulated genes overlap with Hallmark gene sets.\n")
}

NA
NA
---
title: "Enrichment Analysis of L5L6 derived from P3_filtred_on_mean"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  # pdf_document: default
  # word_document: default
  # html_document: default
  #rmdformats::readthedown
  html_notebook:
    toc: true
    toc_float: true
    toc_collapsed: true
---

# 1. load libraries
```{r setup, include=FALSE}

# Load libraries
library(Seurat)
library(Matrix)
library(SingleCellExperiment)
library(DESeq2)
library(Libra)

```

# 2. Load the filtered list on mean expression
```{r , fig.height=8, fig.width=10}

# Load the DE results from CSV
df <- read.csv("../comparison_L5_vs_L6_with_mean_expression_filtered.csv", stringsAsFactors = FALSE)


DE_results_df <- df

```

# 3. Summarize Markers
```{r , fig.height=12, fig.width=14}
markers <- DE_results_df

summarize_markers <- function(markers) {
  num_pval0 <- sum(markers$p_val_adj == 0)
  num_pval1 <- sum(markers$p_val_adj == 1)
  num_upregulated <- sum(markers$avg_log2FC > 1.5)
  num_downregulated <- sum(markers$avg_log2FC < -1)
  num_significant <- sum(markers$p_val_adj < 0.05)
  
  
  cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
  cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
  cat("Number of upregulated genes (avg_log2FC > 1.5):", num_upregulated, "\n")
  cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
  cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
}

cat("Markers Summary at 0.05:\n")

summarize_markers(markers)

markers2 <- DE_results_df
summarize_markers <- function(markers) {
  num_pval0 <- sum(markers$p_val_adj == 0)
  num_pval1 <- sum(markers$p_val_adj == 1)
  num_upregulated <- sum(markers$avg_log2FC > 1.5)
  num_downregulated <- sum(markers$avg_log2FC < -1)
  num_significant <- sum(markers$p_val_adj < 1e-4)
  
  cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
  cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
  cat("Number of upregulated genes (avg_log2FC > 1.5):", num_upregulated, "\n")
  cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
  cat("Number of significant genes (p_val_adj < 1e-4):", num_significant, "\n")
}

cat("Markers Summary at 1e-4:\n")

summarize_markers(markers2)

markers3 <- DE_results_df
summarize_markers <- function(markers) {
  num_pval0 <- sum(markers$p_val_adj == 0)
  num_pval1 <- sum(markers$p_val_adj == 1)
  num_upregulated <- sum(markers$avg_log2FC > 1.5)
  num_downregulated <- sum(markers$avg_log2FC < -1)
  num_significant <- sum(markers$p_val_adj < 1e-6)
  
  cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
  cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
  cat("Number of upregulated genes (avg_log2FC > 1.5):", num_upregulated, "\n")
  cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
  cat("Number of significant genes (p_val_adj < 1e-6):", num_significant, "\n")
}

cat("Markers Summary at 1e-6:\n")
summarize_markers(markers3)

markers4 <- DE_results_df
summarize_markers <- function(markers) {
  num_pval0 <- sum(markers$p_val_adj == 0)
  num_pval1 <- sum(markers$p_val_adj == 1)
  num_upregulated <- sum(markers$avg_log2FC > 1.5)
  num_downregulated <- sum(markers$avg_log2FC < -1)
  num_significant <- sum(markers$p_val_adj < 1e-10)
  
  cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
  cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
  cat("Number of upregulated genes (avg_log2FC > 1.5):", num_upregulated, "\n")
  cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
  cat("Number of significant genes (p_val_adj < 1e-10):", num_significant, "\n")
  }

cat("Markers Summary at 1e-10:\n")

summarize_markers(markers4)

markers5 <- DE_results_df
summarize_markers <- function(markers) {
  num_pval0 <- sum(markers$p_val_adj == 0)
  num_pval1 <- sum(markers$p_val_adj == 1)
  num_upregulated <- sum(markers$avg_log2FC > 1.5)
  num_downregulated <- sum(markers$avg_log2FC < -1)
  num_significant <- sum(markers$p_val_adj < 1e-15)
  
  cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
  cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
  cat("Number of upregulated genes (avg_log2FC > 1.5):", num_upregulated, "\n")
  cat("Number of downregulated genes (avg_log2FC < -1):", num_downregulated, "\n")
  cat("Number of significant genes (p_val_adj < 1e-15):", num_significant, "\n")
}

cat("Markers Summary at 1e-15:\n")

summarize_markers(markers5)



```



# 4. Volcano Plots
```{r , fig.height=14, fig.width=18}
library(ggplot2)
library(dplyr)
library(ggrepel)


# Ensure correct column names
colnames(DE_results_df)

# Define significance categories
volcano_data <- DE_results_df %>%
  mutate(
    significance = case_when(
      p_val_adj < 1e-20 & avg_log2FC > 2 ~ "Most Upregulated",
      p_val_adj < 1e-20 & avg_log2FC < -2 ~ "Most Downregulated",
      p_val_adj < 0.05 & avg_log2FC > 2 ~ "Upregulated",
      p_val_adj < 0.05 & avg_log2FC < -2 ~ "Downregulated",
      TRUE ~ "Not Significant"
    )
  )

# Select only very significant genes for labeling
top_genes <- volcano_data %>%
  filter(p_val_adj < 0.05 & (avg_log2FC > 2 | avg_log2FC < -2))

ggplot(volcano_data, aes(x = avg_log2FC, y = -log10(p_val_adj), color = significance)) +
  
  # Main points
  geom_point(alpha = 0.7, size = 2.5) +
  
  # Highlight highly significant genes with larger points
  geom_point(data = top_genes, aes(x = avg_log2FC, y = -log10(p_val_adj)), 
             color = "black", size = 3, shape = 21, fill = "black") +

  # Custom color scheme
  scale_color_manual(values = c(
    "Most Upregulated" = "darkred",
    "Most Downregulated" = "darkblue",
    "Upregulated" = "red",
    "Downregulated" = "blue",
    "Not Significant" = "grey"
  )) +

  # Add gene labels (only for highly significant genes)
  geom_text_repel(data = top_genes, aes(label = gene),  
                  size = 4, box.padding = 0.5, max.overlaps = 10, segment.color = NA) +
  
  # Add threshold lines
  geom_vline(xintercept = c(-2, 2), linetype = "dashed", color = "black") +  
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +  

  # Improve theme
  theme_minimal(base_size = 14) +
  labs(title = "Volcano Plot: Pseudobulk DESeq2 Analysis",
       x = "Log2 Fold Change",
       y = "-Log10 Adjusted P-Value",
       color = "Significance") +

  ylim(0, 50)  # Avoid extreme scaling issues


```


## EnhancedVolcano plot
```{r , fig.height=12, fig.width=16}

library(dplyr)
library(EnhancedVolcano)

# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Filter genes based on lowest p-values but include all genes
filtered_genes <- markers %>%
  arrange(p_val_adj, desc(abs(avg_log2FC)))

# Create the EnhancedVolcano plot with the filtered data
EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 1e-6 & abs(filtered_genes$avg_log2FC) >= 1.5, filtered_genes$gene, NA),
  x = "avg_log2FC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 1e-6,
  FCcutoff = 1.0,
  legendPosition = 'right', 
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Set to FALSE to remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),  # Customize point colors
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]  # Only label significant genes
)



```


## EnhancedVolcano plot
```{r , fig.height=12, fig.width=16}

library(ggplot2)
library(EnhancedVolcano)
library(dplyr)

# Define the output directory
output_dir <- "Volcano_Plot_Malignant_vs_Control"
dir.create(output_dir, showWarnings = FALSE)

 Malignant_CD4Tcells_vs_Normal_CD4Tcells <- filtered_genes

# First Volcano Plot
p1 <- EnhancedVolcano(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells,
  lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
  x = "avg_log2FC",
  y = "p_val_adj",
  title = "Malignant_CD4Tcells_vs_Normal_CD4Tcells",
  pCutoff = 1e-4,
  FCcutoff = 1.0
)
print(p1)  # Display in notebook
ggsave(filename = file.path(output_dir, "VolcanoPlot1.png"), plot = p1, width = 14, height = 10, dpi = 300)

# Second Volcano Plot with selected genes
p2 <- EnhancedVolcano(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells, 
  lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
  x = "avg_log2FC", 
  y = "p_val_adj",
  selectLab = c('EPCAM', 'BCAT1', 'KIR3DL2', 'FOXM1', 'TWIST1', 'TNFSF9', 
                              'CD80',  'IL1B', 'RPS4Y1', "TOX", "CD52", "TWIST1", "CCR4", "CCR7","PDCD1",
                              'IL7R', 'TCF7',  'MKI67', 'CD70', "DPP4",
                              'IL2RA','TRBV6-2', 'TRBV10-3', 'TRBV4-2', 'TRBV9', 'TRBV7-9', 
                              'TRAV12-1', 'CD8B', 'FCGR3A', 'GNLY', 'FOXP3', 'SELL', 
                              'GIMAP1', 'RIPOR2', 'LEF1', 'HOXC9', 'SP5',
                              'CCL17', 'ETV4', 'THY1', 'FOXA2', 'ITGAD', 'S100P', 'TBX4', 
                              'ID1', 'XCL1', 'SOX2', 'CD27', 'CD28','PLS3','CD70','RAB25' , 'TRBV27', 'TRBV2'),
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  xlab = bquote(~Log[2]~ 'fold change'),
  pCutoff = 0.05,
  FCcutoff = 1.5, 
  pointSize = 3.0,
  labSize = 5.0,
  boxedLabels = TRUE,
  colAlpha = 0.5,
  legendPosition = 'right',
  legendLabSize = 10,
  legendIconSize = 4.0,
  drawConnectors = TRUE,
  widthConnectors = 0.5,
  colConnectors = 'grey50',
  arrowheads = FALSE,
  max.overlaps = 30
)
print(p2)  # Display in notebook
ggsave(filename = file.path(output_dir, "VolcanoPlot2.png"), plot = p2, width = 14, height = 10, dpi = 300)

# Filtering genes
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  arrange(p_val_adj, desc(abs(avg_log2FC)))

# Third Volcano Plot - Filtering by p-value and logFC
p3 <- EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 1e-4 & abs(filtered_genes$avg_log2FC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_log2FC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 1e-4,
  FCcutoff = 1.0,
  legendPosition = 'right', 
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),  # Customize point colors
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]
)
print(p3)  # Display in notebook
ggsave(filename = file.path(output_dir, "VolcanoPlot3.png"), plot = p3, width = 14, height = 10, dpi = 300)

# Fourth Volcano Plot - More refined filtering
p4 <- EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 1e-4 & abs(filtered_genes$avg_log2FC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_log2FC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
  subtitle = "Highlighting differentially expressed genes",
  pCutoff = 1e-4,
  FCcutoff = 1.0,
  legendPosition = 'right',
  colAlpha = 0.8,  # Slight transparency for non-significant points
  col = c('grey70', 'black', 'blue', 'red'),  # Custom color scheme
  gridlines.major = TRUE,
  gridlines.minor = FALSE,
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_log2FC) >= 1.0]
)
print(p4)  # Display in notebook
ggsave(filename = file.path(output_dir, "VolcanoPlot4.png"), plot = p4, width = 14, height = 10, dpi = 300)

message("All volcano plots have been displayed and saved successfully in the 'Malignant_vs_Control' folder.")



```
# 5. Enrichment Analysis-All_Pathways
```{r , fig.height=6, fig.width=8}
# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(ReactomePA)
library(DOSE) # For GSEA analysis
library(ggplot2) # Ensure ggplot2 is available for plotting
library(dplyr)

# Define the output folder where the results will be saved
output_folder <- "L5_vs_L6/"

# Create the output folder if it doesn't exist
if (!dir.exists(output_folder)) {
  dir.create(output_folder)
}

# Define the number of upregulated and downregulated genes to select
UP_genes <- 250
Down_genes <- 250

# Define threshold for differential expression selection (modified thresholds)
logFC_up_threshold <- 1          # Upregulated logFC threshold
logFC_down_threshold <- -1         # Downregulated logFC threshold

# Load your differential expression results (modify based on actual data structure)
# Malignant_CD4Tcells_vs_Normal_CD4Tcells <- read.csv("Your_DE_Results_File.csv")

# Filter the genes based on avg_log2FC and arrange by p_val_adj
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  filter(avg_log2FC > logFC_up_threshold | avg_log2FC < logFC_down_threshold) %>%
  arrange(p_val_adj)

# Separate upregulated and downregulated genes
upregulated_genes <- filtered_genes %>%
  filter(avg_log2FC > logFC_up_threshold)

downregulated_genes <- filtered_genes %>%
  filter(avg_log2FC < logFC_down_threshold)

# Check if there are fewer than the specified number of upregulated genes
if (nrow(upregulated_genes) < UP_genes) {
  top_upregulated_genes <- upregulated_genes
  cat("Number of upregulated genes selected:", nrow(top_upregulated_genes), "\n")
  cat("p_val_adj value for the last selected upregulated gene:", tail(top_upregulated_genes$p_val_adj, 1), "\n")
} else {
  # Select the specified number of upregulated genes
  top_upregulated_genes <- upregulated_genes %>%
    head(UP_genes)
  cat("Number of upregulated genes selected:", nrow(top_upregulated_genes), "\n")
  cat("p_val_adj value for the last selected upregulated gene:", tail(top_upregulated_genes$p_val_adj, 1), "\n")
}

# Check if there are fewer than the specified number of downregulated genes
if (nrow(downregulated_genes) < Down_genes) {
  top_downregulated_genes <- downregulated_genes
  cat("Number of downregulated genes selected:", nrow(top_downregulated_genes), "\n")
  cat("p_val_adj value for the last selected downregulated gene:", tail(top_downregulated_genes$p_val_adj, 1), "\n")
} else {
  # Select the specified number of downregulated genes
  top_downregulated_genes <- downregulated_genes %>%
    head(Down_genes)
  cat("Number of downregulated genes selected:", nrow(top_downregulated_genes), "\n")
  cat("p_val_adj value for the last selected downregulated gene:", tail(top_downregulated_genes$p_val_adj, 1), "\n")
}

# Combine the top upregulated and downregulated genes
top_genes <- bind_rows(top_upregulated_genes, top_downregulated_genes)

# Check for missing genes (NAs) in the gene column and remove them
top_genes <- na.omit(top_genes)

# Save upregulated and downregulated gene results to CSV
write.csv(top_upregulated_genes, paste0(output_folder, "upregulated_genes.csv"), row.names = FALSE)
write.csv(top_downregulated_genes, paste0(output_folder, "downregulated_genes.csv"), row.names = FALSE)

# Convert gene symbols to Entrez IDs for enrichment analysis, with checks for missing values
upregulated_entrez <- bitr(top_upregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
downregulated_entrez <- bitr(top_downregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)

# Check for missing Entrez IDs and retain gene names
missing_upregulated <- top_upregulated_genes$gene[!top_upregulated_genes$gene %in% upregulated_entrez$SYMBOL]
missing_downregulated <- top_downregulated_genes$gene[!top_downregulated_genes$gene %in% downregulated_entrez$SYMBOL]

# Print out the missing gene symbols for debugging
cat("Missing upregulated genes:\n", missing_upregulated, "\n")
cat("Missing downregulated genes:\n", missing_downregulated, "\n")

# Merge the Entrez IDs back with the original data frames to retain gene names
top_upregulated_genes <- merge(top_upregulated_genes, upregulated_entrez, by.x = "gene", by.y = "SYMBOL", all.x = TRUE)
top_downregulated_genes <- merge(top_downregulated_genes, downregulated_entrez, by.x = "gene", by.y = "SYMBOL", all.x = TRUE)

# Remove genes that couldn't be mapped to Entrez IDs
top_upregulated_genes <- top_upregulated_genes[!is.na(top_upregulated_genes$ENTREZID), ]
top_downregulated_genes <- top_downregulated_genes[!is.na(top_downregulated_genes$ENTREZID), ]

# Extract Entrez IDs for enrichment analysis
upregulated_entrez <- top_upregulated_genes$ENTREZID
downregulated_entrez <- top_downregulated_genes$ENTREZID

# Define a function to safely run enrichment, plot results, and save them
safe_enrichGO <- function(gene_list, title, filename) {
  if (length(gene_list) > 0) {
    result <- enrichGO(gene = gene_list, OrgDb = org.Hs.eg.db, keyType = "SYMBOL",
                       ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.05, readable = TRUE)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)  
      ggsave(paste0(output_folder, gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0(output_folder, filename), row.names = FALSE)
    } else {
      message(paste("No significant enrichment found for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

safe_enrichKEGG <- function(entrez_list, title, filename) {
  if (length(entrez_list) > 0) {
    result <- enrichKEGG(gene = entrez_list, organism = "hsa", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      result <- setReadable(result, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)
      ggsave(paste0(output_folder, gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0(output_folder, filename), row.names = FALSE)
    } else {
      message(paste("No significant KEGG pathways found for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

safe_enrichReactome <- function(entrez_list, title, filename) {
  if (length(entrez_list) > 0) {
    result <- enrichPathway(gene = entrez_list, organism = "human", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      result <- setReadable(result, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)
      ggsave(paste0(output_folder, gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0(output_folder, filename), row.names = FALSE)
    } else {
      message(paste("No significant Reactome pathways found for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

# Perform enrichment analyses, generate plots, and save results
safe_enrichGO(top_upregulated_genes$gene, "GO Enrichment for Upregulated Genes", "upregulated_GO_results.csv")
safe_enrichGO(top_downregulated_genes$gene, "GO Enrichment for Downregulated Genes", "downregulated_GO_results.csv")

safe_enrichKEGG(upregulated_entrez, "KEGG Pathway Enrichment for Upregulated Genes", "upregulated_KEGG_results.csv")
safe_enrichKEGG(downregulated_entrez, "KEGG Pathway Enrichment for Downregulated Genes", "downregulated_KEGG_results.csv")

safe_enrichReactome(upregulated_entrez, "Reactome Pathway Enrichment for Upregulated Genes", "upregulated_Reactome_results.csv")
safe_enrichReactome(downregulated_entrez, "Reactome Pathway Enrichment for Downregulated Genes", "downregulated_Reactome_results.csv")

```

## Enrichment Analysis_Hallmark
```{r , fig.height=6, fig.width=8}

# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(msigdbr)
library(enrichplot)
library(ggplot2)
library(dplyr)

# Define the output folder where the results will be saved
output_folder <- "L5_vs_L6/"

# Create the output folder if it doesn't exist
if (!dir.exists(output_folder)) {
  dir.create(output_folder)
}

# Load Hallmark gene sets from msigdbr
hallmark_sets <- msigdbr(species = "Homo sapiens", collection = "H")  # "H" is for Hallmark gene sets

# Convert gene symbols to uppercase for consistency
top_upregulated_genes$gene <- toupper(top_upregulated_genes$gene)
top_downregulated_genes$gene <- toupper(top_downregulated_genes$gene)

# Check for overlap between your upregulated/downregulated genes and Hallmark gene sets
upregulated_in_hallmark <- intersect(top_upregulated_genes$gene, hallmark_sets$gene_symbol)
downregulated_in_hallmark <- intersect(top_downregulated_genes$gene, hallmark_sets$gene_symbol)

# Print the number of overlapping genes for both upregulated and downregulated genes
cat("Number of upregulated genes in Hallmark gene sets:", length(upregulated_in_hallmark), "\n")
cat("Number of downregulated genes in Hallmark gene sets:", length(downregulated_in_hallmark), "\n")

# If there are genes to analyze, proceed with enrichment analysis
if (length(upregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for upregulated genes using Hallmark gene sets
  hallmark_up <- enricher(gene = upregulated_in_hallmark, 
                          TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                          pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_up) && nrow(hallmark_up) > 0) {
    # Visualize results if available
    up_dotplot <- dotplot(hallmark_up, showCategory = 20, title = "Hallmark Pathway Enrichment for Upregulated Genes")
    
    # Display the plot in the notebook
    print(up_dotplot)
    
    # Save the dotplot to a PNG file
    ggsave(paste0(output_folder, "hallmark_upregulated_dotplot.png"), plot = up_dotplot, width = 10, height = 8)
    
    # Optionally, save the results as CSV
    write.csv(as.data.frame(hallmark_up), file = paste0(output_folder, "hallmark_upregulated_enrichment.csv"), row.names = FALSE)
  } else {
    cat("No significant enrichment found for upregulated genes.\n")
  }
} else {
  cat("No upregulated genes overlap with Hallmark gene sets.\n")
}

if (length(downregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for downregulated genes using Hallmark gene sets
  hallmark_down <- enricher(gene = downregulated_in_hallmark, 
                            TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                            pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_down) && nrow(hallmark_down) > 0) {
    # Visualize results if available
    down_dotplot <- dotplot(hallmark_down, showCategory = 20, title = "Hallmark Pathway Enrichment for Downregulated Genes")
    
    # Display the plot in the notebook
    print(down_dotplot)
    
    # Save the dotplot to a PNG file
    ggsave(paste0(output_folder, "hallmark_downregulated_dotplot.png"), plot = down_dotplot, width = 10, height = 8)
    
    # Optionally, save the results as CSV
    write.csv(as.data.frame(hallmark_down), file = paste0(output_folder, "hallmark_downregulated_enrichment.csv"), row.names = FALSE)
  } else {
    cat("No significant enrichment found for downregulated genes.\n")
  }
} else {
  cat("No downregulated genes overlap with Hallmark gene sets.\n")
}


```



