1. load libraries

2. load seurat object

# #Load Seurat Object 
# load("/home/nabbasi/isilon/To_Transfer_between_computers/23-Harmony_Integration/0-robj/5-Harmony_Integrated_All_samples_Merged_CD4Tcells_final_Resolution_Selected_0.8_ADT_Normalized_cleaned_mt.robj")
# 
# pseudobulk_seurat <- All_samples_Merged
# 
# # Set the label column with "P1", "P2", "P3", and "Control"
# pseudobulk_seurat$label <- factor(
#   ifelse(pseudobulk_seurat$orig.ident %in% c("L1", "L2"), "P1", 
#          ifelse(pseudobulk_seurat$orig.ident %in% c("L3", "L4"), "P2",
#                 ifelse(pseudobulk_seurat$orig.ident %in% c("L5", "L6", "L7"), "P3", NA))),
#   levels = c("P1", "P2", "P3")
# )
# 
# 
# # Double-check the reference level
# print(levels(pseudobulk_seurat$label))  # Should print "Control" first
# 
# # Verify factor levels
# print(levels(pseudobulk_seurat$label))  # Should show Control first
# 
# # Ensure 'replicate' is a factor
# pseudobulk_seurat$replicate <- as.factor(pseudobulk_seurat$cell_line)
# 
# # Rename the cell type column
# pseudobulk_seurat$cell_type <- "CD4T"

3. DE using LIBRA

library(Libra)



# # Create subsets for each pair of patients
# p1_p2 <- subset(pseudobulk_seurat, label %in% c("P1", "P2"))
# p1_p3 <- subset(pseudobulk_seurat, label %in% c("P1", "P3"))
# p2_p3 <- subset(pseudobulk_seurat, label %in% c("P2", "P3"))
# 
# # Run Libra for P1 vs P2
# libra_p1_p2 <- run_de(
#   input = p1_p2,
#   label_col = "label",
#   cell_type_col = "cell_type",
#   replicate_col = "replicate",
#   de_family = "pseudobulk",
#   de_method = "DESeq2",
#   de_type = "LRT"
# )
# 
# p1_p2 <- as.data.frame(df)
# 
# write.csv(p1_p2, "../18March_Patient_comparison_Pseudobulk/P1_vs_P2/Psedobulk_Deseq2_P1_vs_P2.csv", row.names = FALSE)
# 
# # Run Libra for P1 vs P3
# libra_p1_p3 <- run_de(
#   input = p1_p3,
#   label_col = "label",
#   cell_type_col = "cell_type",
#   replicate_col = "replicate",
#   de_family = "pseudobulk",
#   de_method = "DESeq2",
#   de_type = "LRT"
# )
# 
# p1_p3 <- as.data.frame(df)
# 
# write.csv(p1_p3, "../18March_Patient_comparison_Pseudobulk/P1_vs_P3/Psedobulk_Deseq2_P1_vs_P3.csv", row.names = FALSE)
# 
# # Run Libra for P2 vs P3
# libra_p2_p3 <- run_de(
#   input = p2_p3,
#   label_col = "label",
#   cell_type_col = "cell_type",
#   replicate_col = "replicate",
#   de_family = "pseudobulk",
#   de_method = "DESeq2",
#   de_type = "LRT"
# )
# 
# p2_p3 <- as.data.frame(df)
# 
# write.csv(p2_p3, "../18March_Patient_comparison_Pseudobulk/P2_vs_P3/Psedobulk_Deseq2_P2_vs_P3.csv", row.names = FALSE)
# 
# 
# 
# df <- libra_p1_p2[!(libra_p1_p2$P1.exp < 0.20 & libra_p1_p2$P2.exp < 0.20), ]
# 
# DE_results_df <- as.data.frame(df)
# 
# write.csv(DE_results_df, "../P1_vs_P2/Psedobulk_Deseq2_filtered_on_mean_P1_vs_P2.csv", row.names = FALSE)

Volcano Plot

library(ggplot2)
library(dplyr)
library(ggrepel)


df <- libra_p1_p2[!(libra_p1_p2$P1.exp < 0.20 & libra_p1_p2$P2.exp < 0.20), ]

DE_results_df <- as.data.frame(df)

write.csv(DE_results_df, "../P1_vs_P2/Psedobulk_Deseq2_filtered_on_mean_P1_vs_P2.csv", row.names = FALSE)


# Ensure correct column names
colnames(DE_results_df)
 [1] "cell_type" "gene"      "avg_logFC" "P1.pct"    "P2.pct"    "P1.exp"    "P2.exp"    "p_val"     "p_val_adj" "de_family" "de_method" "de_type"  
# Define significance categories
volcano_data <- DE_results_df %>%
  mutate(
    significance = case_when(
      p_val_adj < 0.05 & avg_logFC > 2 ~ "Upregulated",
      p_val_adj < 0.05 & avg_logFC < -2 ~ "Downregulated",
      TRUE ~ "Not Significant"
    )
  )

# Select genes to label: p_val_adj < 1e-50 OR logFC > 2 OR logFC < -2
top_genes <- volcano_data %>%
  filter(p_val_adj < 0.05 | avg_logFC > 2 | avg_logFC < -2)

ggplot(volcano_data, aes(x = avg_logFC, y = -log10(p_val_adj), color = significance)) +
  geom_point(alpha = 0.6, size = 2) +  # Main points
  scale_color_manual(values = c("Upregulated" = "red", "Downregulated" = "blue", "Not Significant" = "grey")) +
  theme_minimal() +
  labs(title = "Volcano Plot: Pseudobulk DESeq2 Analysis",
       x = "Log2 Fold Change",
       y = "-Log10 Adjusted P-Value",
       color = "Significance") +

  # Add gene labels WITHOUT any lines connecting them
  geom_text_repel(data = top_genes, 
                  aes(label = gene),  
                  size = 5, box.padding = 0.3, max.overlaps = 15, segment.color = NA) +  

  # Add threshold lines
  geom_vline(xintercept = c(-2, 2), linetype = "dashed", color = "black") +  # logFC thresholds
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +  # p-value threshold

  ylim(0, 70)  # Set max y-axis limit to avoid extreme values

NA
NA

Volcano Plot

library(ggplot2)
library(dplyr)
library(ggrepel)


# Ensure correct column names
colnames(DE_results_df)
 [1] "cell_type" "gene"      "avg_logFC" "P1.pct"    "P2.pct"    "P1.exp"    "P2.exp"    "p_val"     "p_val_adj" "de_family" "de_method" "de_type"  
# Define significance categories
volcano_data <- DE_results_df %>%
  mutate(
    significance = case_when(
      p_val_adj < 1e-20 & avg_logFC > 2 ~ "Most Upregulated",
      p_val_adj < 1e-20 & avg_logFC < -2 ~ "Most Downregulated",
      p_val_adj < 0.05 & avg_logFC > 2 ~ "Upregulated",
      p_val_adj < 0.05 & avg_logFC < -2 ~ "Downregulated",
      TRUE ~ "Not Significant"
    )
  )

# Select only very significant genes for labeling
top_genes <- volcano_data %>%
  filter(p_val_adj < 0.05 & (avg_logFC > 2 | avg_logFC < -2))

ggplot(volcano_data, aes(x = avg_logFC, y = -log10(p_val_adj), color = significance)) +
  
  # Main points
  geom_point(alpha = 0.7, size = 2.5) +
  
  # Highlight highly significant genes with larger points
  geom_point(data = top_genes, aes(x = avg_logFC, y = -log10(p_val_adj)), 
             color = "black", size = 3, shape = 21, fill = "black") +

  # Custom color scheme
  scale_color_manual(values = c(
    "Most Upregulated" = "darkred",
    "Most Downregulated" = "darkblue",
    "Upregulated" = "red",
    "Downregulated" = "blue",
    "Not Significant" = "grey"
  )) +

  # Add gene labels (only for highly significant genes)
  geom_text_repel(data = top_genes, aes(label = gene),  
                  size = 4, box.padding = 0.5, max.overlaps = 10, segment.color = NA) +
  
  # Add threshold lines
  geom_vline(xintercept = c(-2, 2), linetype = "dashed", color = "black") +  
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +  

  # Improve theme
  theme_minimal(base_size = 14) +
  labs(title = "Volcano Plot: Pseudobulk DESeq2 Analysis",
       x = "Log2 Fold Change",
       y = "-Log10 Adjusted P-Value",
       color = "Significance") +

  ylim(0, 50)  # Avoid extreme scaling issues

NA
NA

4. Summarize Markers

markers <- DE_results_df

summarize_markers <- function(markers) {
  num_pval0 <- sum(markers$p_val_adj == 0)
  num_pval1 <- sum(markers$p_val_adj == 1)
  num_significant <- sum(markers$p_val_adj < 0.05)
  num_upregulated <- sum(markers$avg_logFC > 1)
  num_downregulated <- sum(markers$avg_logFC < -1)
  
  cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
  cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
  cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
  cat("Number of upregulated genes (avg_logFC > 1):", num_upregulated, "\n")
  cat("Number of downregulated genes (avg_logFC < 1):", num_downregulated, "\n")
}

cat("Markers1 Summary at 0.05:\n")
Markers1 Summary at 0.05:
summarize_markers(markers)
Number of genes with p_val_adj = 0: 0 
Number of genes with p_val_adj = 1: 12 
Number of significant genes (p_val_adj < 0.05): 1517 
Number of upregulated genes (avg_logFC > 1): 572 
Number of downregulated genes (avg_logFC < 1): 865 
markers2 <- DE_results_df
summarize_markers <- function(markers) {
  num_pval0 <- sum(markers$p_val_adj == 0)
  num_pval1 <- sum(markers$p_val_adj == 1)
  num_significant <- sum(markers$p_val_adj < 1e-4)
  num_upregulated <- sum(markers$avg_logFC > 1)
  num_downregulated <- sum(markers$avg_logFC < -1)
  
  cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
  cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
  cat("Number of significant genes (p_val_adj < 1e-4):", num_significant, "\n")
  cat("Number of upregulated genes (avg_logFC > 1):", num_upregulated, "\n")
  cat("Number of downregulated genes (avg_logFC < 1):", num_downregulated, "\n")
}

cat("Markers Summary at 1e-4:\n")
Markers Summary at 1e-4:
summarize_markers(markers2)
Number of genes with p_val_adj = 0: 0 
Number of genes with p_val_adj = 1: 12 
Number of significant genes (p_val_adj < 1e-4): 520 
Number of upregulated genes (avg_logFC > 1): 572 
Number of downregulated genes (avg_logFC < 1): 865 

EnhancedVolcano plot


library(dplyr)
library(EnhancedVolcano)

# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Filter genes based on lowest p-values but include all genes
filtered_genes <- markers %>%
  arrange(p_val_adj, desc(abs(avg_logFC)))

# Create the EnhancedVolcano plot with the filtered data
EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 0.0000905 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_logFC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 1e-4,
  FCcutoff = 1.0,
  legendPosition = 'right', 
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Set to FALSE to remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),  # Customize point colors
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]  # Only label significant genes
)

NA
NA
NA

Create the EnhancedVolcano plot


library(ggplot2)
library(EnhancedVolcano)
library(dplyr)

# Define the output directory
output_dir <- "Malignant_vs_Control"
dir.create(output_dir, showWarnings = FALSE)

 Malignant_CD4Tcells_vs_Normal_CD4Tcells <- filtered_genes

# First Volcano Plot
p1 <- EnhancedVolcano(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells,
  lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
  x = "avg_logFC",
  y = "p_val_adj",
  title = "Malignant_CD4Tcells_vs_Normal_CD4Tcells",
  pCutoff = 1e-4,
  FCcutoff = 1.0
)
print(p1)  # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot1.png"), plot = p1, width = 14, height = 10, dpi = 300)

# Second Volcano Plot with selected genes
p2 <- EnhancedVolcano(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells, 
  lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
  x = "avg_logFC", 
  y = "p_val_adj",
  selectLab = c('EPCAM', 'BCAT1', 'KIR3DL2', 'FOXM1', 'TWIST1', 'TNFSF9', 
                              'CD80',  'IL1B', 'RPS4Y1', "TOX", "CD52", "TWIST1", "CCR4", "CCR7","PDCD1",
                              'IL7R', 'TCF7',  'MKI67', 'CD70', "DPP4",
                              'IL2RA','TRBV6-2', 'TRBV10-3', 'TRBV4-2', 'TRBV9', 'TRBV7-9', 
                              'TRAV12-1', 'CD8B', 'FCGR3A', 'GNLY', 'FOXP3', 'SELL', 
                              'GIMAP1', 'RIPOR2', 'LEF1', 'HOXC9', 'SP5',
                              'CCL17', 'ETV4', 'THY1', 'FOXA2', 'ITGAD', 'S100P', 'TBX4', 
                              'ID1', 'XCL1', 'SOX2', 'CD27', 'CD28','PLS3','CD70','RAB25' , 'TRBV27', 'TRBV2'),
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  xlab = bquote(~Log[2]~ 'fold change'),
  pCutoff = 0.05,
  FCcutoff = 1.5, 
  pointSize = 3.0,
  labSize = 5.0,
  boxedLabels = TRUE,
  colAlpha = 0.5,
  legendPosition = 'right',
  legendLabSize = 10,
  legendIconSize = 4.0,
  drawConnectors = TRUE,
  widthConnectors = 0.5,
  colConnectors = 'grey50',
  arrowheads = FALSE,
  max.overlaps = 30
)
print(p2)  # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot2.png"), plot = p2, width = 14, height = 10, dpi = 300)

# Filtering genes
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  arrange(p_val_adj, desc(abs(avg_logFC)))

# Third Volcano Plot - Filtering by p-value and logFC
p3 <- EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 1e-4 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_logFC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 1e-4,
  FCcutoff = 1.0,
  legendPosition = 'right', 
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),  # Customize point colors
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]
)
print(p3)  # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot3.png"), plot = p3, width = 14, height = 10, dpi = 300)

# Fourth Volcano Plot - More refined filtering
p4 <- EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 1e-4 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_logFC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
  subtitle = "Highlighting differentially expressed genes",
  pCutoff = 1e-4,
  FCcutoff = 1.0,
  legendPosition = 'right',
  colAlpha = 0.8,  # Slight transparency for non-significant points
  col = c('grey70', 'black', 'blue', 'red'),  # Custom color scheme
  gridlines.major = TRUE,
  gridlines.minor = FALSE,
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]
)
print(p4)  # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot4.png"), plot = p4, width = 14, height = 10, dpi = 300)

message("All volcano plots have been displayed and saved successfully in the 'L1_vs_Control' folder.")
All volcano plots have been displayed and saved successfully in the 'L1_vs_Control' folder.

5. Enrichment Analysis-All_Pathways

# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(ReactomePA)
library(DOSE) # For GSEA analysis
library(ggplot2) # Ensure ggplot2 is available for plotting

# Define threshold for differential expression selection (modified thresholds)
logFC_up_threshold <- 1          # Upregulated logFC threshold
logFC_down_threshold <- -1       # Downregulated logFC threshold
pval_threshold <- 0.05           # p-value threshold as specified

# Load your differential expression results (modify based on actual data structure)
# Malignant_CD4Tcells_vs_Normal_CD4Tcells <- read.csv("Your_DE_Results_File.csv")

# Select upregulated and downregulated genes
upregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_logFC > logFC_up_threshold & 
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < pval_threshold, ]

downregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_logFC < logFC_down_threshold & 
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < pval_threshold, ]

# Check for missing genes (NAs) in the gene column and remove them
upregulated_genes <- na.omit(upregulated_genes)
downregulated_genes <- na.omit(downregulated_genes)

# Save upregulated and downregulated gene results to CSV
write.csv(upregulated_genes, "Malignant_vs_Control/upregulated_genes.csv", row.names = FALSE)
write.csv(downregulated_genes, "Malignant_vs_Control/downregulated_genes.csv", row.names = FALSE)

# Convert gene symbols to Entrez IDs for enrichment analysis, with checks for missing values
upregulated_entrez <- bitr(upregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
'select()' returned 1:1 mapping between keys and columns
Avis : 3.51% of input gene IDs are fail to map...
downregulated_entrez <- bitr(downregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
'select()' returned 1:1 mapping between keys and columns
Avis : 6.91% of input gene IDs are fail to map...
# Check for missing Entrez IDs
missing_upregulated <- upregulated_genes$gene[is.na(upregulated_entrez$ENTREZID)]
missing_downregulated <- downregulated_genes$gene[is.na(downregulated_entrez$ENTREZID)]

# Print out the missing gene symbols for debugging
cat("Missing upregulated genes:\n", missing_upregulated, "\n")
Missing upregulated genes:
  
cat("Missing downregulated genes:\n", missing_downregulated, "\n")
Missing downregulated genes:
  
# Remove genes that couldn't be mapped to Entrez IDs
upregulated_entrez <- upregulated_entrez$ENTREZID[!is.na(upregulated_entrez$ENTREZID)]
downregulated_entrez <- downregulated_entrez$ENTREZID[!is.na(downregulated_entrez$ENTREZID)]

# Define a function to safely run enrichment, plot results, and save them
safe_enrichGO <- function(gene_list, title, filename) {
  if (length(gene_list) > 0) {
    result <- enrichGO(gene = gene_list, OrgDb = org.Hs.eg.db, keyType = "SYMBOL",
                       ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)  
      ggsave(paste0("Malignant_vs_Control/", gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0("Malignant_vs_Control/", filename), row.names = FALSE)
    } else {
      message(paste("No significant enrichment found for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

safe_enrichKEGG <- function(entrez_list, title, filename) {
  if (length(entrez_list) > 0) {
    result <- enrichKEGG(gene = entrez_list, organism = "hsa", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)
      ggsave(paste0("Malignant_vs_Control/", gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0("Malignant_vs_Control/", filename), row.names = FALSE)
    } else {
      message(paste("No significant KEGG pathways found for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

safe_enrichReactome <- function(entrez_list, title, filename) {
  if (length(entrez_list) > 0) {
    result <- enrichPathway(gene = entrez_list, organism = "human", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)
      ggsave(paste0("Malignant_vs_Control/", gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0("Malignant_vs_Control/", filename), row.names = FALSE)
    } else {
      message(paste("No significant Reactome pathways found for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

# Perform enrichment analyses, generate plots, and save results
safe_enrichGO(upregulated_genes$gene, "GO Enrichment for Upregulated Genes", "upregulated_GO_results.csv")

safe_enrichGO(downregulated_genes$gene, "GO Enrichment for Downregulated Genes", "downregulated_GO_results.csv")


safe_enrichKEGG(upregulated_entrez, "KEGG Pathway Enrichment for Upregulated Genes", "upregulated_KEGG_results.csv")

safe_enrichKEGG(downregulated_entrez, "KEGG Pathway Enrichment for Downregulated Genes", "downregulated_KEGG_results.csv")


safe_enrichReactome(upregulated_entrez, "Reactome Pathway Enrichment for Upregulated Genes", "upregulated_Reactome_results.csv")
No significant Reactome pathways found for: Reactome Pathway Enrichment for Upregulated Genes
safe_enrichReactome(downregulated_entrez, "Reactome Pathway Enrichment for Downregulated Genes", "downregulated_Reactome_results.csv")

NA
NA

Enrichment Analysis_Hallmark


# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(msigdbr)
library(enrichplot)

# Load Hallmark gene sets from msigdbr
hallmark_sets <- msigdbr(species = "Homo sapiens", category = "H")  # "H" is for Hallmark gene sets

# Convert gene symbols to uppercase for consistency
upregulated_genes$gene <- toupper(upregulated_genes$gene)
downregulated_genes$gene <- toupper(downregulated_genes$gene)

# Check for overlap between your upregulated/downregulated genes and Hallmark gene sets
upregulated_in_hallmark <- intersect(upregulated_genes$gene, hallmark_sets$gene_symbol)
downregulated_in_hallmark <- intersect(downregulated_genes$gene, hallmark_sets$gene_symbol)

# Print the number of overlapping genes for both upregulated and downregulated genes
cat("Number of upregulated genes in Hallmark gene sets:", length(upregulated_in_hallmark), "\n")
Number of upregulated genes in Hallmark gene sets: 128 
cat("Number of downregulated genes in Hallmark gene sets:", length(downregulated_in_hallmark), "\n")
Number of downregulated genes in Hallmark gene sets: 205 
# Define the output folder where the results will be saved
output_folder <- "Malignant_vs_Control/"

# If there are genes to analyze, proceed with enrichment analysis
if (length(upregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for upregulated genes using Hallmark gene sets
  hallmark_up <- enricher(gene = upregulated_in_hallmark, 
                          TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                          pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_up) && nrow(hallmark_up) > 0) {
    # Visualize results if available
    up_dotplot <- dotplot(hallmark_up, showCategory = 20, title = "Hallmark Pathway Enrichment for Upregulated Genes")
    
    # Display the plot in the notebook
    print(up_dotplot)
    
    # Save the dotplot to a PNG file
    ggsave(paste0(output_folder, "hallmark_upregulated_dotplot.png"), plot = up_dotplot, width = 10, height = 8)
    
    # Optionally, save the results as CSV
    write.csv(as.data.frame(hallmark_up), file = paste0(output_folder, "hallmark_upregulated_enrichment.csv"), row.names = FALSE)
  } else {
    cat("No significant enrichment found for upregulated genes.\n")
  }
} else {
  cat("No upregulated genes overlap with Hallmark gene sets.\n")
}


if (length(downregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for downregulated genes using Hallmark gene sets
  hallmark_down <- enricher(gene = downregulated_in_hallmark, 
                            TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                            pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_down) && nrow(hallmark_down) > 0) {
    # Visualize results if available
    down_dotplot <- dotplot(hallmark_down, showCategory = 20, title = "Hallmark Pathway Enrichment for Downregulated Genes")
    
    # Display the plot in the notebook
    print(down_dotplot)
    
    # Save the dotplot to a PNG file
    ggsave(paste0(output_folder, "hallmark_downregulated_dotplot.png"), plot = down_dotplot, width = 10, height = 8)
    
    # Optionally, save the results as CSV
    write.csv(as.data.frame(hallmark_down), file = paste0(output_folder, "hallmark_downregulated_enrichment.csv"), row.names = FALSE)
  } else {
    cat("No significant enrichment found for downregulated genes.\n")
  }
} else {
  cat("No downregulated genes overlap with Hallmark gene sets.\n")
}

NA
NA
---
title: "PseudoBulk Analysis using Libra RNA assay-Deseq2-LRT_on_list_filtred_on_mean_P1_vs_P2"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  # pdf_document: default
  # word_document: default
  # html_document: default
  #rmdformats::readthedown
  html_notebook:
    toc: true
    toc_float: true
    toc_collapsed: true
---

# 1. load libraries
```{r setup, include=FALSE}

# Load libraries
library(Seurat)
library(Matrix)
library(SingleCellExperiment)
library(DESeq2)
library(Libra)

```


# 2. load seurat object
```{r load_seurat}
# #Load Seurat Object 
# load("/home/nabbasi/isilon/To_Transfer_between_computers/23-Harmony_Integration/0-robj/5-Harmony_Integrated_All_samples_Merged_CD4Tcells_final_Resolution_Selected_0.8_ADT_Normalized_cleaned_mt.robj")
# 
# pseudobulk_seurat <- All_samples_Merged
# 
# # Set the label column with "P1", "P2", "P3", and "Control"
# pseudobulk_seurat$label <- factor(
#   ifelse(pseudobulk_seurat$orig.ident %in% c("L1", "L2"), "P1", 
#          ifelse(pseudobulk_seurat$orig.ident %in% c("L3", "L4"), "P2",
#                 ifelse(pseudobulk_seurat$orig.ident %in% c("L5", "L6", "L7"), "P3", NA))),
#   levels = c("P1", "P2", "P3")
# )
# 
# 
# # Double-check the reference level
# print(levels(pseudobulk_seurat$label))  # Should print "Control" first
# 
# # Verify factor levels
# print(levels(pseudobulk_seurat$label))  # Should show Control first
# 
# # Ensure 'replicate' is a factor
# pseudobulk_seurat$replicate <- as.factor(pseudobulk_seurat$cell_line)
# 
# # Rename the cell type column
# pseudobulk_seurat$cell_type <- "CD4T"
```

# 3. DE using LIBRA
```{r , fig.height=14, fig.width=18}
library(Libra)



# # Create subsets for each pair of patients
# p1_p2 <- subset(pseudobulk_seurat, label %in% c("P1", "P2"))
# p1_p3 <- subset(pseudobulk_seurat, label %in% c("P1", "P3"))
# p2_p3 <- subset(pseudobulk_seurat, label %in% c("P2", "P3"))
# 
# # Run Libra for P1 vs P2
# libra_p1_p2 <- run_de(
#   input = p1_p2,
#   label_col = "label",
#   cell_type_col = "cell_type",
#   replicate_col = "replicate",
#   de_family = "pseudobulk",
#   de_method = "DESeq2",
#   de_type = "LRT"
# )
# 
# p1_p2 <- as.data.frame(df)
# 
# write.csv(p1_p2, "../18March_Patient_comparison_Pseudobulk/P1_vs_P2/Psedobulk_Deseq2_P1_vs_P2.csv", row.names = FALSE)
# 
# # Run Libra for P1 vs P3
# libra_p1_p3 <- run_de(
#   input = p1_p3,
#   label_col = "label",
#   cell_type_col = "cell_type",
#   replicate_col = "replicate",
#   de_family = "pseudobulk",
#   de_method = "DESeq2",
#   de_type = "LRT"
# )
# 
# p1_p3 <- as.data.frame(df)
# 
# write.csv(p1_p3, "../18March_Patient_comparison_Pseudobulk/P1_vs_P3/Psedobulk_Deseq2_P1_vs_P3.csv", row.names = FALSE)
# 
# # Run Libra for P2 vs P3
# libra_p2_p3 <- run_de(
#   input = p2_p3,
#   label_col = "label",
#   cell_type_col = "cell_type",
#   replicate_col = "replicate",
#   de_family = "pseudobulk",
#   de_method = "DESeq2",
#   de_type = "LRT"
# )
# 
# p2_p3 <- as.data.frame(df)
# 
# write.csv(p2_p3, "../18March_Patient_comparison_Pseudobulk/P2_vs_P3/Psedobulk_Deseq2_P2_vs_P3.csv", row.names = FALSE)
# 
# 
# 
# df <- libra_p1_p2[!(libra_p1_p2$P1.exp < 0.20 & libra_p1_p2$P2.exp < 0.20), ]
# 
# DE_results_df <- as.data.frame(df)
# 
# write.csv(DE_results_df, "../P1_vs_P2/Psedobulk_Deseq2_filtered_on_mean_P1_vs_P2.csv", row.names = FALSE)
```




## Volcano Plot
```{r , fig.height=14, fig.width=18}
library(ggplot2)
library(dplyr)
library(ggrepel)


df <- libra_p1_p2[!(libra_p1_p2$P1.exp < 0.20 & libra_p1_p2$P2.exp < 0.20), ]

DE_results_df <- as.data.frame(df)

write.csv(DE_results_df, "../P1_vs_P2/Psedobulk_Deseq2_filtered_on_mean_P1_vs_P2.csv", row.names = FALSE)


# Ensure correct column names
colnames(DE_results_df)

# Define significance categories
volcano_data <- DE_results_df %>%
  mutate(
    significance = case_when(
      p_val_adj < 0.05 & avg_logFC > 2 ~ "Upregulated",
      p_val_adj < 0.05 & avg_logFC < -2 ~ "Downregulated",
      TRUE ~ "Not Significant"
    )
  )

# Select genes to label: p_val_adj < 1e-50 OR logFC > 2 OR logFC < -2
top_genes <- volcano_data %>%
  filter(p_val_adj < 0.05 | avg_logFC > 2 | avg_logFC < -2)

ggplot(volcano_data, aes(x = avg_logFC, y = -log10(p_val_adj), color = significance)) +
  geom_point(alpha = 0.6, size = 2) +  # Main points
  scale_color_manual(values = c("Upregulated" = "red", "Downregulated" = "blue", "Not Significant" = "grey")) +
  theme_minimal() +
  labs(title = "Volcano Plot: Pseudobulk DESeq2 Analysis",
       x = "Log2 Fold Change",
       y = "-Log10 Adjusted P-Value",
       color = "Significance") +

  # Add gene labels WITHOUT any lines connecting them
  geom_text_repel(data = top_genes, 
                  aes(label = gene),  
                  size = 5, box.padding = 0.3, max.overlaps = 15, segment.color = NA) +  

  # Add threshold lines
  geom_vline(xintercept = c(-2, 2), linetype = "dashed", color = "black") +  # logFC thresholds
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +  # p-value threshold

  ylim(0, 70)  # Set max y-axis limit to avoid extreme values


```
## Volcano Plot
```{r , fig.height=14, fig.width=18}
library(ggplot2)
library(dplyr)
library(ggrepel)


# Ensure correct column names
colnames(DE_results_df)

# Define significance categories
volcano_data <- DE_results_df %>%
  mutate(
    significance = case_when(
      p_val_adj < 1e-20 & avg_logFC > 2 ~ "Most Upregulated",
      p_val_adj < 1e-20 & avg_logFC < -2 ~ "Most Downregulated",
      p_val_adj < 0.05 & avg_logFC > 2 ~ "Upregulated",
      p_val_adj < 0.05 & avg_logFC < -2 ~ "Downregulated",
      TRUE ~ "Not Significant"
    )
  )

# Select only very significant genes for labeling
top_genes <- volcano_data %>%
  filter(p_val_adj < 0.05 & (avg_logFC > 2 | avg_logFC < -2))

ggplot(volcano_data, aes(x = avg_logFC, y = -log10(p_val_adj), color = significance)) +
  
  # Main points
  geom_point(alpha = 0.7, size = 2.5) +
  
  # Highlight highly significant genes with larger points
  geom_point(data = top_genes, aes(x = avg_logFC, y = -log10(p_val_adj)), 
             color = "black", size = 3, shape = 21, fill = "black") +

  # Custom color scheme
  scale_color_manual(values = c(
    "Most Upregulated" = "darkred",
    "Most Downregulated" = "darkblue",
    "Upregulated" = "red",
    "Downregulated" = "blue",
    "Not Significant" = "grey"
  )) +

  # Add gene labels (only for highly significant genes)
  geom_text_repel(data = top_genes, aes(label = gene),  
                  size = 4, box.padding = 0.5, max.overlaps = 10, segment.color = NA) +
  
  # Add threshold lines
  geom_vline(xintercept = c(-2, 2), linetype = "dashed", color = "black") +  
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +  

  # Improve theme
  theme_minimal(base_size = 14) +
  labs(title = "Volcano Plot: Pseudobulk DESeq2 Analysis",
       x = "Log2 Fold Change",
       y = "-Log10 Adjusted P-Value",
       color = "Significance") +

  ylim(0, 50)  # Avoid extreme scaling issues


```

# 4. Summarize Markers
```{r , fig.height=12, fig.width=14}
markers <- DE_results_df

summarize_markers <- function(markers) {
  num_pval0 <- sum(markers$p_val_adj == 0)
  num_pval1 <- sum(markers$p_val_adj == 1)
  num_significant <- sum(markers$p_val_adj < 0.05)
  num_upregulated <- sum(markers$avg_logFC > 1)
  num_downregulated <- sum(markers$avg_logFC < -1)
  
  cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
  cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
  cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
  cat("Number of upregulated genes (avg_logFC > 1):", num_upregulated, "\n")
  cat("Number of downregulated genes (avg_logFC < 1):", num_downregulated, "\n")
}

cat("Markers1 Summary at 0.05:\n")

summarize_markers(markers)

markers2 <- DE_results_df
summarize_markers <- function(markers) {
  num_pval0 <- sum(markers$p_val_adj == 0)
  num_pval1 <- sum(markers$p_val_adj == 1)
  num_significant <- sum(markers$p_val_adj < 1e-4)
  num_upregulated <- sum(markers$avg_logFC > 1)
  num_downregulated <- sum(markers$avg_logFC < -1)
  
  cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
  cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
  cat("Number of significant genes (p_val_adj < 1e-4):", num_significant, "\n")
  cat("Number of upregulated genes (avg_logFC > 1):", num_upregulated, "\n")
  cat("Number of downregulated genes (avg_logFC < 1):", num_downregulated, "\n")
}

cat("Markers Summary at 1e-4:\n")

summarize_markers(markers2)




```



## EnhancedVolcano plot
```{r , fig.height=12, fig.width=16}

library(dplyr)
library(EnhancedVolcano)

# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Filter genes based on lowest p-values but include all genes
filtered_genes <- markers %>%
  arrange(p_val_adj, desc(abs(avg_logFC)))

# Create the EnhancedVolcano plot with the filtered data
EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 0.0000905 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_logFC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 1e-4,
  FCcutoff = 1.0,
  legendPosition = 'right', 
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Set to FALSE to remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),  # Customize point colors
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]  # Only label significant genes
)



```


## Create the EnhancedVolcano plot
```{r , fig.height=12, fig.width=16}

library(ggplot2)
library(EnhancedVolcano)
library(dplyr)

# Define the output directory
output_dir <- "Malignant_vs_Control"
dir.create(output_dir, showWarnings = FALSE)

 Malignant_CD4Tcells_vs_Normal_CD4Tcells <- filtered_genes

# First Volcano Plot
p1 <- EnhancedVolcano(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells,
  lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
  x = "avg_logFC",
  y = "p_val_adj",
  title = "Malignant_CD4Tcells_vs_Normal_CD4Tcells",
  pCutoff = 1e-4,
  FCcutoff = 1.0
)
print(p1)  # Display in notebook
ggsave(filename = file.path(output_dir, "VolcanoPlot1.png"), plot = p1, width = 14, height = 10, dpi = 300)

# Second Volcano Plot with selected genes
p2 <- EnhancedVolcano(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells, 
  lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
  x = "avg_logFC", 
  y = "p_val_adj",
  selectLab = c('EPCAM', 'BCAT1', 'KIR3DL2', 'FOXM1', 'TWIST1', 'TNFSF9', 
                              'CD80',  'IL1B', 'RPS4Y1', "TOX", "CD52", "TWIST1", "CCR4", "CCR7","PDCD1",
                              'IL7R', 'TCF7',  'MKI67', 'CD70', "DPP4",
                              'IL2RA','TRBV6-2', 'TRBV10-3', 'TRBV4-2', 'TRBV9', 'TRBV7-9', 
                              'TRAV12-1', 'CD8B', 'FCGR3A', 'GNLY', 'FOXP3', 'SELL', 
                              'GIMAP1', 'RIPOR2', 'LEF1', 'HOXC9', 'SP5',
                              'CCL17', 'ETV4', 'THY1', 'FOXA2', 'ITGAD', 'S100P', 'TBX4', 
                              'ID1', 'XCL1', 'SOX2', 'CD27', 'CD28','PLS3','CD70','RAB25' , 'TRBV27', 'TRBV2'),
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  xlab = bquote(~Log[2]~ 'fold change'),
  pCutoff = 0.05,
  FCcutoff = 1.5, 
  pointSize = 3.0,
  labSize = 5.0,
  boxedLabels = TRUE,
  colAlpha = 0.5,
  legendPosition = 'right',
  legendLabSize = 10,
  legendIconSize = 4.0,
  drawConnectors = TRUE,
  widthConnectors = 0.5,
  colConnectors = 'grey50',
  arrowheads = FALSE,
  max.overlaps = 30
)
print(p2)  # Display in notebook
ggsave(filename = file.path(output_dir, "VolcanoPlot2.png"), plot = p2, width = 14, height = 10, dpi = 300)

# Filtering genes
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  arrange(p_val_adj, desc(abs(avg_logFC)))

# Third Volcano Plot - Filtering by p-value and logFC
p3 <- EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 1e-4 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_logFC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 1e-4,
  FCcutoff = 1.0,
  legendPosition = 'right', 
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),  # Customize point colors
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]
)
print(p3)  # Display in notebook
ggsave(filename = file.path(output_dir, "VolcanoPlot3.png"), plot = p3, width = 14, height = 10, dpi = 300)

# Fourth Volcano Plot - More refined filtering
p4 <- EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 1e-4 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_logFC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
  subtitle = "Highlighting differentially expressed genes",
  pCutoff = 1e-4,
  FCcutoff = 1.0,
  legendPosition = 'right',
  colAlpha = 0.8,  # Slight transparency for non-significant points
  col = c('grey70', 'black', 'blue', 'red'),  # Custom color scheme
  gridlines.major = TRUE,
  gridlines.minor = FALSE,
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]
)
print(p4)  # Display in notebook
ggsave(filename = file.path(output_dir, "VolcanoPlot4.png"), plot = p4, width = 14, height = 10, dpi = 300)

message("All volcano plots have been displayed and saved successfully in the 'L1_vs_Control' folder.")



```


# 5. Enrichment Analysis-All_Pathways
```{r , fig.height=12, fig.width=16}
# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(ReactomePA)
library(DOSE) # For GSEA analysis
library(ggplot2) # Ensure ggplot2 is available for plotting

# Define threshold for differential expression selection (modified thresholds)
logFC_up_threshold <- 1          # Upregulated logFC threshold
logFC_down_threshold <- -1       # Downregulated logFC threshold
pval_threshold <- 0.05           # p-value threshold as specified

# Load your differential expression results (modify based on actual data structure)
# Malignant_CD4Tcells_vs_Normal_CD4Tcells <- read.csv("Your_DE_Results_File.csv")

# Select upregulated and downregulated genes
upregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_logFC > logFC_up_threshold & 
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < pval_threshold, ]

downregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_logFC < logFC_down_threshold & 
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < pval_threshold, ]

# Check for missing genes (NAs) in the gene column and remove them
upregulated_genes <- na.omit(upregulated_genes)
downregulated_genes <- na.omit(downregulated_genes)

# Save upregulated and downregulated gene results to CSV
write.csv(upregulated_genes, "Malignant_vs_Control/upregulated_genes.csv", row.names = FALSE)
write.csv(downregulated_genes, "Malignant_vs_Control/downregulated_genes.csv", row.names = FALSE)

# Convert gene symbols to Entrez IDs for enrichment analysis, with checks for missing values
upregulated_entrez <- bitr(upregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
downregulated_entrez <- bitr(downregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)

# Check for missing Entrez IDs
missing_upregulated <- upregulated_genes$gene[is.na(upregulated_entrez$ENTREZID)]
missing_downregulated <- downregulated_genes$gene[is.na(downregulated_entrez$ENTREZID)]

# Print out the missing gene symbols for debugging
cat("Missing upregulated genes:\n", missing_upregulated, "\n")
cat("Missing downregulated genes:\n", missing_downregulated, "\n")

# Remove genes that couldn't be mapped to Entrez IDs
upregulated_entrez <- upregulated_entrez$ENTREZID[!is.na(upregulated_entrez$ENTREZID)]
downregulated_entrez <- downregulated_entrez$ENTREZID[!is.na(downregulated_entrez$ENTREZID)]

# Define a function to safely run enrichment, plot results, and save them
safe_enrichGO <- function(gene_list, title, filename) {
  if (length(gene_list) > 0) {
    result <- enrichGO(gene = gene_list, OrgDb = org.Hs.eg.db, keyType = "SYMBOL",
                       ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)  
      ggsave(paste0("Malignant_vs_Control/", gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0("Malignant_vs_Control/", filename), row.names = FALSE)
    } else {
      message(paste("No significant enrichment found for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

safe_enrichKEGG <- function(entrez_list, title, filename) {
  if (length(entrez_list) > 0) {
    result <- enrichKEGG(gene = entrez_list, organism = "hsa", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)
      ggsave(paste0("Malignant_vs_Control/", gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0("Malignant_vs_Control/", filename), row.names = FALSE)
    } else {
      message(paste("No significant KEGG pathways found for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

safe_enrichReactome <- function(entrez_list, title, filename) {
  if (length(entrez_list) > 0) {
    result <- enrichPathway(gene = entrez_list, organism = "human", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)
      ggsave(paste0("Malignant_vs_Control/", gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0("Malignant_vs_Control/", filename), row.names = FALSE)
    } else {
      message(paste("No significant Reactome pathways found for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

# Perform enrichment analyses, generate plots, and save results
safe_enrichGO(upregulated_genes$gene, "GO Enrichment for Upregulated Genes", "upregulated_GO_results.csv")
safe_enrichGO(downregulated_genes$gene, "GO Enrichment for Downregulated Genes", "downregulated_GO_results.csv")

safe_enrichKEGG(upregulated_entrez, "KEGG Pathway Enrichment for Upregulated Genes", "upregulated_KEGG_results.csv")
safe_enrichKEGG(downregulated_entrez, "KEGG Pathway Enrichment for Downregulated Genes", "downregulated_KEGG_results.csv")

safe_enrichReactome(upregulated_entrez, "Reactome Pathway Enrichment for Upregulated Genes", "upregulated_Reactome_results.csv")
safe_enrichReactome(downregulated_entrez, "Reactome Pathway Enrichment for Downregulated Genes", "downregulated_Reactome_results.csv")


```




## Enrichment Analysis_Hallmark
```{r , fig.height=8, fig.width=12}

# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(msigdbr)
library(enrichplot)

# Load Hallmark gene sets from msigdbr
hallmark_sets <- msigdbr(species = "Homo sapiens", category = "H")  # "H" is for Hallmark gene sets

# Convert gene symbols to uppercase for consistency
upregulated_genes$gene <- toupper(upregulated_genes$gene)
downregulated_genes$gene <- toupper(downregulated_genes$gene)

# Check for overlap between your upregulated/downregulated genes and Hallmark gene sets
upregulated_in_hallmark <- intersect(upregulated_genes$gene, hallmark_sets$gene_symbol)
downregulated_in_hallmark <- intersect(downregulated_genes$gene, hallmark_sets$gene_symbol)

# Print the number of overlapping genes for both upregulated and downregulated genes
cat("Number of upregulated genes in Hallmark gene sets:", length(upregulated_in_hallmark), "\n")
cat("Number of downregulated genes in Hallmark gene sets:", length(downregulated_in_hallmark), "\n")

# Define the output folder where the results will be saved
output_folder <- "Malignant_vs_Control/"

# If there are genes to analyze, proceed with enrichment analysis
if (length(upregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for upregulated genes using Hallmark gene sets
  hallmark_up <- enricher(gene = upregulated_in_hallmark, 
                          TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                          pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_up) && nrow(hallmark_up) > 0) {
    # Visualize results if available
    up_dotplot <- dotplot(hallmark_up, showCategory = 20, title = "Hallmark Pathway Enrichment for Upregulated Genes")
    
    # Display the plot in the notebook
    print(up_dotplot)
    
    # Save the dotplot to a PNG file
    ggsave(paste0(output_folder, "hallmark_upregulated_dotplot.png"), plot = up_dotplot, width = 10, height = 8)
    
    # Optionally, save the results as CSV
    write.csv(as.data.frame(hallmark_up), file = paste0(output_folder, "hallmark_upregulated_enrichment.csv"), row.names = FALSE)
  } else {
    cat("No significant enrichment found for upregulated genes.\n")
  }
} else {
  cat("No upregulated genes overlap with Hallmark gene sets.\n")
}

if (length(downregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for downregulated genes using Hallmark gene sets
  hallmark_down <- enricher(gene = downregulated_in_hallmark, 
                            TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                            pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_down) && nrow(hallmark_down) > 0) {
    # Visualize results if available
    down_dotplot <- dotplot(hallmark_down, showCategory = 20, title = "Hallmark Pathway Enrichment for Downregulated Genes")
    
    # Display the plot in the notebook
    print(down_dotplot)
    
    # Save the dotplot to a PNG file
    ggsave(paste0(output_folder, "hallmark_downregulated_dotplot.png"), plot = down_dotplot, width = 10, height = 8)
    
    # Optionally, save the results as CSV
    write.csv(as.data.frame(hallmark_down), file = paste0(output_folder, "hallmark_downregulated_enrichment.csv"), row.names = FALSE)
  } else {
    cat("No significant enrichment found for downregulated genes.\n")
  }
} else {
  cat("No downregulated genes overlap with Hallmark gene sets.\n")
}


```







