1. load libraries

2. load seurat object

# Work on a separate handle to the merged object for the pseudobulk analysis.
pseudobulk_seurat <- All_samples_Merged

# Label every cell: orig.ident "PBMC" or "PBMC10x" -> "Control", anything else
# -> "Malignant".
# NOTE(review): the levels below put "Malignant" first, so "Malignant" (not
# "Control") is the first factor level; the DESeq2 message in the run output
# points this out. Confirm against Libra's sign convention which direction a
# positive avg_logFC represents before reordering the levels.
pseudobulk_seurat$label <- factor(
  ifelse(pseudobulk_seurat$orig.ident %in% c("PBMC", "PBMC10x"), 
         "Control", 
         "Malignant"),
  levels = c("Malignant", "Control")  
)


# Show the factor level order actually in use
print(levels(pseudobulk_seurat$label))  # Prints "Malignant" first, then "Control"
[1] "Malignant" "Control"  
# Repeat of the check above (same call, same result)
print(levels(pseudobulk_seurat$label))  # Prints "Malignant" first, then "Control"
[1] "Malignant" "Control"  
# Use the cell_line metadata as the replicate identifier, stored as a factor
pseudobulk_seurat$replicate <- as.factor(pseudobulk_seurat$cell_line)

# Give every cell the same cell-type label, "CD4T"
pseudobulk_seurat$cell_type <- "CD4T"

3. DE using LIBRA

# Extra handle to the labelled object; not referenced again in this section.
libra_test <- pseudobulk_seurat

# Pseudobulk differential expression with Libra (DESeq2, likelihood-ratio
# test). The metadata columns created above supply the cell type, replicate
# and label. The argument list ends without a trailing comma so that no empty
# argument is passed to run_de().
DE_results <- run_de(
  pseudobulk_seurat,
  cell_type_col = "cell_type",
  replicate_col = "replicate",
  label_col = "label",
  de_family = "pseudobulk",
  de_method = "DESeq2",
  de_type = "LRT"
)
[1] "CD4T"
converting counts to integer mode
  it appears that the last variable in the design formula, 'group',
  has a factor level, 'Control', which is not the reference level. we recommend
  to use factor(...,levels=...) or relevel() to set this as the reference level
  before proceeding. for more information, please see the 'Note on factor levels'
  in vignette('DESeq2').
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 1197 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing

4. Volcano Plot

library(ggplot2)
library(dplyr)

Attachement du package : ‘dplyr’

L'objet suivant est masqué depuis ‘package:Biobase’:

    combine

Les objets suivants sont masqués depuis ‘package:GenomicRanges’:

    intersect, setdiff, union

L'objet suivant est masqué depuis ‘package:GenomeInfoDb’:

    intersect

Les objets suivants sont masqués depuis ‘package:IRanges’:

    collapse, desc, intersect, setdiff, slice, union

Les objets suivants sont masqués depuis ‘package:S4Vectors’:

    first, intersect, rename, setdiff, setequal, union

Les objets suivants sont masqués depuis ‘package:BiocGenerics’:

    combine, intersect, setdiff, union

L'objet suivant est masqué depuis ‘package:matrixStats’:

    count

Les objets suivants sont masqués depuis ‘package:stats’:

    filter, lag

Les objets suivants sont masqués depuis ‘package:base’:

    intersect, setdiff, setequal, union
# Turn the Libra output into a plain data frame
DE_results_df <- as.data.frame(DE_results)

# Persist the full result table (no row-name column)
write.csv(
  DE_results_df,
  file = "1-Pseudobulk_DEseq2_LRT_DE_with_libra.csv",
  row.names = FALSE
)

# List the columns so the logFC / p-value field names can be checked
names(DE_results_df)
 [1] "cell_type"     "gene"          "avg_logFC"     "Malignant.pct" "Control.pct"   "Malignant.exp" "Control.exp"   "p_val"        
 [9] "p_val_adj"     "de_family"     "de_method"     "de_type"      
# Classify every gene: adjusted p < 0.05 together with log2FC > 2 (up) or
# log2FC < -2 (down); everything else, including rows with missing values,
# is "Not Significant".
volcano_data <- mutate(
  DE_results_df,
  significance = case_when(
    avg_logFC > 2 & p_val_adj < 0.05 ~ "Upregulated",
    avg_logFC < -2 & p_val_adj < 0.05 ~ "Downregulated",
    TRUE ~ "Not Significant"
  )
)

# Volcano plot with dashed guide lines at the same cut-offs
ggplot(
  volcano_data,
  aes(avg_logFC, -log10(p_val_adj), colour = significance)
) +
  geom_point(size = 2, alpha = 0.6) +
  geom_hline(yintercept = -log10(0.05), colour = "black", linetype = "dashed") +
  geom_vline(xintercept = c(-2, 2), colour = "black", linetype = "dashed") +
  scale_colour_manual(
    values = c(
      "Upregulated" = "red",
      "Downregulated" = "blue",
      "Not Significant" = "grey"
    )
  ) +
  labs(
    title = "Volcano Plot: Pseudobulk DESeq2 Analysis",
    x = "Log2 Fold Change",
    y = "-Log10 Adjusted P-Value",
    colour = "Significance"
  ) +
  theme_minimal()

NA
NA

Volcano Plot

library(EnhancedVolcano)
Le chargement a nécessité le package : ggrepel
# EnhancedVolcano view of the Libra results: every gene is a label candidate,
# cut-offs are adjusted p < 0.05 and |log2FC| > 2.
EnhancedVolcano(
  DE_results,
  lab = DE_results$gene,       # Labels for genes
  x = 'avg_logFC',             # Log2 fold change column
  y = 'p_val_adj',             # Adjusted p-value column
  title = 'Volcano Plot: Pseudobulk DESeq2 Analysis',
  subtitle = 'Malignant vs Normal CD4T Cells',
  pCutoff = 0.05,              # Adjusted p-value threshold
  FCcutoff = 2,                # Fold change threshold
  pointSize = 2.0,             # Point size
  labSize = 4.0,               # Label size
  col = c("black", "grey", "blue", "red"), # Colours for NS / Log2FC / P-value / Both
  colAlpha = 0.6,              # Transparency for points
  legendLabels = c("NS", "Log2FC", "P-value", "Both"),
  legendPosition = "right",
  legendLabSize = 10,
  legendIconSize = 4.0,
  drawConnectors = TRUE,       # Connect gene labels to their points
  widthConnectors = 0.5,
  vline = c(-2, 2),            # Vertical lines at logFC +/- 2
  # hline is given on the p-value scale: EnhancedVolcano applies -log10()
  # itself, so passing -log10(0.05) would put the line below zero.
  hline = 0.05,
  border = "full"
)

Volcano Plot

library(ggplot2)
library(dplyr)
library(ggrepel)

# Re-extract the Libra results as a plain data frame (same step as in the
# first volcano section)
DE_results_df <- as.data.frame(DE_results)

# Print the column names the plot below relies on
colnames(DE_results_df)
 [1] "cell_type"     "gene"          "avg_logFC"     "Malignant.pct" "Control.pct"   "Malignant.exp" "Control.exp"   "p_val"        
 [9] "p_val_adj"     "de_family"     "de_method"     "de_type"      
# Classify genes with the same rule as the plain volcano plot:
# adjusted p < 0.05 plus log2FC > 2 (up) or < -2 (down).
volcano_data <- mutate(
  DE_results_df,
  significance = case_when(
    avg_logFC > 2 & p_val_adj < 0.05 ~ "Upregulated",
    avg_logFC < -2 & p_val_adj < 0.05 ~ "Downregulated",
    TRUE ~ "Not Significant"
  )
)

# Label candidates: adjusted p < 0.05 OR |log2FC| > 2 (either condition is
# enough, so this set is much larger than the coloured classes).
top_genes <- filter(volcano_data, p_val_adj < 0.05 | abs(avg_logFC) > 2)

ggplot(
  volcano_data,
  aes(avg_logFC, -log10(p_val_adj), colour = significance)
) +
  # Main points
  geom_point(size = 2, alpha = 0.6) +
  # Gene names, drawn without connector segments
  geom_text_repel(
    data = top_genes,
    mapping = aes(label = gene),
    size = 3,
    box.padding = 0.3,
    max.overlaps = 15,
    segment.color = NA
  ) +
  # Dashed guides at the logFC and adjusted p-value cut-offs
  geom_vline(xintercept = c(-2, 2), colour = "black", linetype = "dashed") +
  geom_hline(yintercept = -log10(0.05), colour = "black", linetype = "dashed") +
  scale_colour_manual(
    values = c(
      "Upregulated" = "red",
      "Downregulated" = "blue",
      "Not Significant" = "grey"
    )
  ) +
  labs(
    title = "Volcano Plot: Pseudobulk DESeq2 Analysis",
    x = "Log2 Fold Change",
    y = "-Log10 Adjusted P-Value",
    colour = "Significance"
  ) +
  theme_minimal() +
  # Restrict the y axis to 0-70
  ylim(0, 70)

NA
NA

Volcano Plot

library(ggplot2)
library(dplyr)
library(ggrepel)

# Re-extract the Libra results as a plain data frame (same step as in the
# earlier volcano sections)
DE_results_df <- as.data.frame(DE_results)

# Print the column names the plot below relies on
colnames(DE_results_df)
 [1] "cell_type"     "gene"          "avg_logFC"     "Malignant.pct" "Control.pct"   "Malignant.exp" "Control.exp"   "p_val"        
 [9] "p_val_adj"     "de_family"     "de_method"     "de_type"      
# Five-way classification. The 1e-20 tier is tested first, so a gene passing
# both cut-offs is reported as "Highly ..."; the 0.05 tier catches the rest.
volcano_data <- mutate(
  DE_results_df,
  significance = case_when(
    avg_logFC > 2 & p_val_adj < 1e-20 ~ "Highly Upregulated",
    avg_logFC < -2 & p_val_adj < 1e-20 ~ "Highly Downregulated",
    avg_logFC > 2 & p_val_adj < 0.05 ~ "Upregulated",
    avg_logFC < -2 & p_val_adj < 0.05 ~ "Downregulated",
    TRUE ~ "Not Significant"
  )
)

# Genes to highlight and label: adjusted p < 0.05 AND |log2FC| > 2
top_genes <- filter(volcano_data, p_val_adj < 0.05 & abs(avg_logFC) > 2)

ggplot(
  volcano_data,
  aes(avg_logFC, -log10(p_val_adj), colour = significance)
) +
  # All genes
  geom_point(size = 2.5, alpha = 0.7) +
  # Highlighted genes redrawn as larger filled black circles
  geom_point(
    data = top_genes,
    shape = 21,
    size = 3,
    colour = "black",
    fill = "black"
  ) +
  # Names of the highlighted genes, without connector segments
  geom_text_repel(
    data = top_genes,
    mapping = aes(label = gene),
    size = 4,
    box.padding = 0.5,
    max.overlaps = 10,
    segment.color = NA
  ) +
  # Dashed guides at the cut-offs
  geom_vline(xintercept = c(-2, 2), colour = "black", linetype = "dashed") +
  geom_hline(yintercept = -log10(0.05), colour = "black", linetype = "dashed") +
  # One colour per class
  scale_colour_manual(
    values = c(
      "Highly Upregulated" = "darkred",
      "Highly Downregulated" = "darkblue",
      "Upregulated" = "red",
      "Downregulated" = "blue",
      "Not Significant" = "grey"
    )
  ) +
  labs(
    title = "Volcano Plot: Pseudobulk DESeq2 Analysis",
    x = "Log2 Fold Change",
    y = "-Log10 Adjusted P-Value",
    colour = "Significance"
  ) +
  theme_minimal(base_size = 14) +
  # Restrict the y axis to 0-50
  ylim(0, 50)

NA
NA

5. Summarize Markers

markers <- DE_results_df

#' Print summary counts for a differential-expression table
#'
#' Counts genes with an adjusted p-value of exactly 0 or exactly 1, genes with
#' adjusted p < 0.05, and genes with avg_logFC above 1 or below -1 (the logFC
#' counts do not look at the p-value). Missing values are ignored, so a single
#' NA no longer turns a count into NA.
#'
#' @param markers Data frame with numeric columns `p_val_adj` and `avg_logFC`.
#' @return Invisibly, a named integer vector with the five counts. The counts
#'   are also written to the console with `cat()`.
summarize_markers <- function(markers) {
  count_true <- function(flag) sum(flag, na.rm = TRUE)

  counts <- c(
    pval0 = count_true(markers$p_val_adj == 0),
    pval1 = count_true(markers$p_val_adj == 1),
    significant = count_true(markers$p_val_adj < 0.05),
    upregulated = count_true(markers$avg_logFC > 1),
    downregulated = count_true(markers$avg_logFC < -1)
  )

  cat("Number of genes with p_val_adj = 0:", counts[["pval0"]], "\n")
  cat("Number of genes with p_val_adj = 1:", counts[["pval1"]], "\n")
  cat("Number of significant genes (p_val_adj < 0.05):", counts[["significant"]], "\n")
  cat("Number of upregulated genes (avg_logFC > 1):", counts[["upregulated"]], "\n")
  # The label now states the threshold that is actually applied (-1, not 1)
  cat("Number of downregulated genes (avg_logFC < -1):", counts[["downregulated"]], "\n")

  invisible(counts)
}

# Header for the summary; names the method that produced `markers`
# (Libra pseudobulk DESeq2 LRT), not MAST.
cat("Markers Summary (Pseudobulk DESeq2 LRT via Libra):\n")
Markers1 Summary (MAST with Batch Correction):
summarize_markers(markers)
Number of genes with p_val_adj = 0: 0 
Number of genes with p_val_adj = 1: 164 
Number of significant genes (p_val_adj < 0.05): 5582 
Number of upregulated genes (avg_logFC > 1): 5591 
Number of downregulated genes (avg_logFC < 1): 4822 

9. EnhancedVolcano plot


library(dplyr)
library(EnhancedVolcano)

# Sort the full marker table by adjusted p-value, breaking ties by larger
# |log2FC|. No gene is removed.
filtered_genes <- arrange(markers, p_val_adj, desc(abs(avg_logFC)))

# Volcano plot of all genes. A gene keeps its name only when
# p_val_adj <= 0.0000905 and |log2FC| >= 1; selectLab (p_val_adj <= 0.05) is
# the looser of the two label filters.
EnhancedVolcano(
  filtered_genes,
  x = "avg_logFC",
  y = "p_val_adj",
  lab = with(
    filtered_genes,
    ifelse(p_val_adj <= 0.0000905 & abs(avg_logFC) >= 1.0, gene, NA)
  ),
  selectLab = with(
    filtered_genes,
    gene[p_val_adj <= 0.05 & abs(avg_logFC) >= 1.0]
  ),
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 0.0000905,
  FCcutoff = 1.0,
  col = c('grey70', 'lightblue', 'blue', 'red'),
  pointSize = 3.0,
  labSize = 5.0,
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,
  legendPosition = 'right'
)

EnhancedVolcano plot


library(dplyr)
library(EnhancedVolcano)

# Sort the full marker table by adjusted p-value, breaking ties by larger
# |log2FC|. No gene is removed.
filtered_genes <- arrange(markers, p_val_adj, desc(abs(avg_logFC)))

# Same plot as the previous chunk with pCutoff = 1e-4 and a black second
# colour. The label rule is unchanged (p_val_adj <= 0.0000905, |log2FC| >= 1).
EnhancedVolcano(
  filtered_genes,
  x = "avg_logFC",
  y = "p_val_adj",
  lab = with(
    filtered_genes,
    ifelse(p_val_adj <= 0.0000905 & abs(avg_logFC) >= 1.0, gene, NA)
  ),
  selectLab = with(
    filtered_genes,
    gene[p_val_adj <= 0.05 & abs(avg_logFC) >= 1.0]
  ),
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 1e-4,
  FCcutoff = 1.0,
  col = c('grey70', 'black', 'blue', 'red'),
  pointSize = 3.0,
  labSize = 5.0,
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,
  legendPosition = 'right'
)

NA
NA
NA

Create the EnhancedVolcano plot

 Malignant_CD4Tcells_vs_Normal_CD4Tcells <- markers

# Volcano plot of every gene in the pseudobulk DESeq2 LRT table (cut-offs:
# adjusted p < 0.05, |log2FC| > 1). The title names the method that produced
# `markers`; the previous "MAST with Batch Correction" text did not match.
EnhancedVolcano(Malignant_CD4Tcells_vs_Normal_CD4Tcells,
                lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
                x = "avg_logFC",
                y = "p_val_adj",
                title = "Pseudobulk DESeq2 LRT (All Genes)",
                pCutoff = 0.05,
                FCcutoff = 1.0)



# Volcano plot that names only a hand-picked gene panel (boxed labels joined
# to their points by grey connectors). Cut-offs: adjusted p < 0.05,
# |log2FC| > 1.5.
EnhancedVolcano(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells,
  x = "avg_logFC",
  y = "p_val_adj",
  lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
  selectLab = c(
    'EPCAM', 'BCAT1', 'KIR3DL2', 'FOXM1', 'TWIST1', 'TNFSF9',
    'CD80', 'IL1B', 'RPS4Y1',
    'IL7R', 'TCF7', 'MKI67', 'CD70',
    'IL2RA', 'TRBV6-2', 'TRBV10-3', 'TRBV4-2', 'TRBV9', 'TRBV7-9',
    'TRAV12-1', 'CD8B', 'FCGR3A', 'GNLY', 'FOXP3', 'SELL',
    'GIMAP1', 'RIPOR2', 'LEF1', 'HOXC9', 'SP5',
    'CCL17', 'ETV4', 'THY1', 'FOXA2', 'ITGAD', 'S100P', 'TBX4',
    'ID1', 'XCL1', 'SOX2', 'CD27', 'CD28', 'PLS3', 'CD70', 'RAB25',
    'TRBV27', 'TRBV2'
  ),
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  xlab = bquote(~Log[2]~ 'fold change'),
  pCutoff = 0.05,
  FCcutoff = 1.5,
  # Points
  pointSize = 3.0,
  colAlpha = 0.5,
  # Labels and connectors
  labSize = 5.0,
  boxedLabels = TRUE,
  drawConnectors = TRUE,
  widthConnectors = 0.5,
  colConnectors = 'grey50',
  arrowheads = FALSE,
  max.overlaps = 30,
  # Legend
  legendPosition = 'right',
  legendLabSize = 10,
  legendIconSize = 4.0
)



library(dplyr)
library(EnhancedVolcano)

# Sort the comparison table by adjusted p-value, breaking ties by larger
# |log2FC|. No gene is removed.
filtered_genes <- arrange(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells,
  p_val_adj,
  desc(abs(avg_logFC))
)

# Volcano plot; genes are named when p_val_adj <= 0.05 and |log2FC| >= 1
EnhancedVolcano(
  filtered_genes,
  x = "avg_logFC",
  y = "p_val_adj",
  lab = with(
    filtered_genes,
    ifelse(p_val_adj <= 0.05 & abs(avg_logFC) >= 1.0, gene, NA)
  ),
  selectLab = with(
    filtered_genes,
    gene[p_val_adj <= 0.05 & abs(avg_logFC) >= 1.0]
  ),
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 0.05,
  FCcutoff = 1.0,
  col = c('grey70', 'black', 'blue', 'red'),
  pointSize = 3.0,
  labSize = 5.0,
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,
  legendPosition = 'right'
)




# Variant with a subtitle, major grid lines only and slightly transparent
# points; label rule and cut-offs as in the previous plot.
EnhancedVolcano(
  filtered_genes,
  x = "avg_logFC",
  y = "p_val_adj",
  lab = with(
    filtered_genes,
    ifelse(p_val_adj <= 0.05 & abs(avg_logFC) >= 1.0, gene, NA)
  ),
  selectLab = with(
    filtered_genes,
    gene[p_val_adj <= 0.05 & abs(avg_logFC) >= 1.0]
  ),
  title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
  subtitle = "Highlighting differentially expressed genes",
  pCutoff = 0.05,
  FCcutoff = 1.0,
  col = c('grey70', 'black', 'blue', 'red'),
  colAlpha = 0.8,
  gridlines.major = TRUE,
  gridlines.minor = FALSE,
  legendPosition = 'right'
)

NA
NA

ggplot2 for Volcano

library(ggplot2)
library(ggrepel)

# Classify each gene once and reuse that classification for colours and
# labels. A gene is up/down-regulated only when it is significant
# (adjusted p < 0.05) AND beyond the fold-change cut-off (|log2FC| > 0.5).
# Rows with a missing p-value or fold change are "Nonsignificant" instead of
# producing NA rows in the label tables.
Malignant_CD4Tcells_vs_Normal_CD4Tcells$color <- with(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells,
  ifelse(
    !is.na(p_val_adj) & p_val_adj < 0.05 &
      !is.na(avg_logFC) & abs(avg_logFC) > 0.5,
    ifelse(avg_logFC > 0, "Upregulated genes", "Downregulated genes"),
    "Nonsignificant"
  )
)

# Genes that receive a text label
top_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$color == "Upregulated genes", ]
bottom_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$color == "Downregulated genes", ]

# Create a volcano plot
ggplot(Malignant_CD4Tcells_vs_Normal_CD4Tcells, aes(x = avg_logFC, y = -log10(p_val_adj))) +
  geom_point(aes(color = color), alpha = 0.7, size = 2) +

  # Add labels for up- and down-regulated genes
  geom_text_repel(data = top_genes, aes(label = gene), color = "black", vjust = 1, fontface = "bold") +
  geom_text_repel(data = bottom_genes, aes(label = gene), color = "black", vjust = -1, fontface = "bold") +

  # Axis titles; the y axis shows the adjusted p-value
  labs(title = "Volcano Plot",
       x = "log2 Fold Change",
       y = "-log10(adjusted p-value)") +

  # Guide lines at the same cut-offs used for the classification
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +
  geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", color = "black") +

  # Colours per class
  scale_color_manual(values = c("Upregulated genes" = "red", "Downregulated genes" = "blue", "Nonsignificant" = "darkgrey")) +

  theme_minimal()

NA
NA
NA
NA
NA

Create the EnhancedVolcano plot


library(ggplot2)
library(EnhancedVolcano)
library(dplyr)

# Folder that receives every figure of this section (created if missing)
output_dir <- "Malignant_vs_Control"
dir.create(output_dir, showWarnings = FALSE)

# First volcano plot: all genes, adjusted p < 1e-4 and |log2FC| > 1
p1 <- EnhancedVolcano(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells,
  lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
  x = "avg_logFC",
  y = "p_val_adj",
  title = "Malignant_CD4Tcells_vs_Normal_CD4Tcells",
  pCutoff = 1e-4,
  FCcutoff = 1.0
)
print(p1)  # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot1.png"), plot = p1, width = 14, height = 10, dpi = 300)

# Second volcano plot: only a hand-picked gene panel is labelled
p2 <- EnhancedVolcano(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells,
  lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
  x = "avg_logFC",
  y = "p_val_adj",
  selectLab = c('EPCAM', 'BCAT1', 'KIR3DL2', 'FOXM1', 'TWIST1', 'TNFSF9',
                'CD80',  'IL1B', 'RPS4Y1',
                'IL7R', 'TCF7',  'MKI67', 'CD70',
                'IL2RA','TRBV6-2', 'TRBV10-3', 'TRBV4-2', 'TRBV9', 'TRBV7-9',
                'TRAV12-1', 'CD8B', 'FCGR3A', 'GNLY', 'FOXP3', 'SELL',
                'GIMAP1', 'RIPOR2', 'LEF1', 'HOXC9', 'SP5',
                'CCL17', 'ETV4', 'THY1', 'FOXA2', 'ITGAD', 'S100P', 'TBX4',
                'ID1', 'XCL1', 'SOX2', 'CD27', 'CD28','PLS3','CD70','RAB25' , 'TRBV27', 'TRBV2'),
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  xlab = bquote(~Log[2]~ 'fold change'),
  pCutoff = 0.05,
  FCcutoff = 1.5,
  pointSize = 3.0,
  labSize = 5.0,
  boxedLabels = TRUE,
  colAlpha = 0.5,
  legendPosition = 'right',
  legendLabSize = 10,
  legendIconSize = 4.0,
  drawConnectors = TRUE,
  widthConnectors = 0.5,
  colConnectors = 'grey50',
  arrowheads = FALSE,
  max.overlaps = 30
)
print(p2)  # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot2.png"), plot = p2, width = 14, height = 10, dpi = 300)

# Sort by adjusted p-value (ties: larger |log2FC| first); no gene is removed
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  arrange(p_val_adj, desc(abs(avg_logFC)))

# Third volcano plot: names only genes with p_val_adj <= 1e-4 and |log2FC| >= 1
p3 <- EnhancedVolcano(
  filtered_genes,
  lab = ifelse(filtered_genes$p_val_adj <= 1e-4 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_logFC",
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 1e-4,
  FCcutoff = 1.0,
  legendPosition = 'right',
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),  # Customize point colors
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]
)
print(p3)  # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot3.png"), plot = p3, width = 14, height = 10, dpi = 300)

# Fourth volcano plot: same label rule, with subtitle and major grid lines
p4 <- EnhancedVolcano(
  filtered_genes,
  lab = ifelse(filtered_genes$p_val_adj <= 1e-4 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_logFC",
  y = "p_val_adj",
  title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
  subtitle = "Highlighting differentially expressed genes",
  pCutoff = 1e-4,
  FCcutoff = 1.0,
  legendPosition = 'right',
  colAlpha = 0.8,  # Slight transparency for points
  col = c('grey70', 'black', 'blue', 'red'),  # Custom color scheme
  gridlines.major = TRUE,
  gridlines.minor = FALSE,
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]
)
print(p4)  # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot4.png"), plot = p4, width = 14, height = 10, dpi = 300)

# Report the folder the files were actually written to (taken from output_dir
# rather than a hard-coded name that can drift out of sync).
message("All volcano plots have been displayed and saved successfully in the '", output_dir, "' folder.")
All volcano plots have been displayed and saved successfully in the 'L1_vs_Control' folder.

10. Enrichment Analysis-1

# Load necessary libraries
library(clusterProfiler)

Registered S3 methods overwritten by 'treeio':
  method              from    
  MRCA.phylo          tidytree
  MRCA.treedata       tidytree
  Nnode.treedata      tidytree
  Ntip.treedata       tidytree
  ancestor.phylo      tidytree
  ancestor.treedata   tidytree
  child.phylo         tidytree
  child.treedata      tidytree
  full_join.phylo     tidytree
  full_join.treedata  tidytree
  groupClade.phylo    tidytree
  groupClade.treedata tidytree
  groupOTU.phylo      tidytree
  groupOTU.treedata   tidytree
  is.rooted.treedata  tidytree
  nodeid.phylo        tidytree
  nodeid.treedata     tidytree
  nodelab.phylo       tidytree
  nodelab.treedata    tidytree
  offspring.phylo     tidytree
  offspring.treedata  tidytree
  parent.phylo        tidytree
  parent.treedata     tidytree
  root.treedata       tidytree
  rootnode.phylo      tidytree
  sibling.phylo       tidytree
clusterProfiler v4.6.2  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/

If you use clusterProfiler in published research, please cite:
T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141

Attachement du package : ‘clusterProfiler’

L'objet suivant est masqué depuis ‘package:IRanges’:

    slice

L'objet suivant est masqué depuis ‘package:S4Vectors’:

    rename

L'objet suivant est masqué depuis ‘package:stats’:

    filter
library(org.Hs.eg.db)
Le chargement a nécessité le package : AnnotationDbi

Attachement du package : ‘AnnotationDbi’

L'objet suivant est masqué depuis ‘package:clusterProfiler’:

    select

L'objet suivant est masqué depuis ‘package:dplyr’:

    select
library(enrichplot)
library(ReactomePA)
ReactomePA v1.42.0  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/

If you use ReactomePA in published research, please cite:
Guangchuang Yu, Qing-Yu He. ReactomePA: an R/Bioconductor package for reactome pathway analysis and visualization. Molecular BioSystems 2016, 12(2):477-479
library(DOSE) # For GSEA analysis
DOSE v3.24.2  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/

If you use DOSE in published research, please cite:
Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609
library(ggplot2) # Ensure ggplot2 is available for plotting

# Cut-offs for calling a gene differentially expressed
pval_threshold <- 0.05          # adjusted p-value must be below this
logFC_up_threshold <- 1         # upregulated: avg_logFC above this
logFC_down_threshold <- -1      # downregulated: avg_logFC below this

# The comparison table is expected to be in memory already; it could instead
# be re-read from disk:
# Malignant_CD4Tcells_vs_Normal_CD4Tcells <- read.csv("Your_DE_Results_File.csv")

# Pick the rows passing both cut-offs, then drop every row that contains an
# NA in any column (this also removes the all-NA rows that logical indexing
# creates when a condition evaluates to NA).
upregulated_genes <- na.omit(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells[
    with(
      Malignant_CD4Tcells_vs_Normal_CD4Tcells,
      avg_logFC > logFC_up_threshold & p_val_adj < pval_threshold
    ),
  ]
)

downregulated_genes <- na.omit(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells[
    with(
      Malignant_CD4Tcells_vs_Normal_CD4Tcells,
      avg_logFC < logFC_down_threshold & p_val_adj < pval_threshold
    ),
  ]
)

# Write both gene tables next to the figures
write.csv(
  upregulated_genes,
  "Malignant_vs_Control/upregulated_genes.csv",
  row.names = FALSE
)
write.csv(
  downregulated_genes,
  "Malignant_vs_Control/downregulated_genes.csv",
  row.names = FALSE
)

# Map the upregulated gene symbols to Entrez IDs; the IDs feed the KEGG and
# Reactome enrichment calls further down. The run output reports that part of
# the symbols fail to map.
upregulated_entrez <- bitr(upregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
'select()' returned 1:1 mapping between keys and columns
Avis : 14.39% of input gene IDs are fail to map...
# Same symbol -> Entrez ID mapping for the downregulated genes
downregulated_entrez <- bitr(downregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
'select()' returned 1:1 mapping between keys and columns
Avis : 26.94% of input gene IDs are fail to map...
# Gene symbols without an Entrez ID. The mapping table returned by bitr() has
# one row per mapped symbol (column SYMBOL), so it is shorter than the input
# and cannot be used as a positional mask over the input genes. The unmapped
# symbols are the inputs that do not appear among the successfully mapped ones.
missing_upregulated <- setdiff(
  upregulated_genes$gene,
  upregulated_entrez$SYMBOL[!is.na(upregulated_entrez$ENTREZID)]
)
missing_downregulated <- setdiff(
  downregulated_genes$gene,
  downregulated_entrez$SYMBOL[!is.na(downregulated_entrez$ENTREZID)]
)

# Print out the missing gene symbols for debugging
cat("Missing upregulated genes:\n", missing_upregulated, "\n")
Missing upregulated genes:
  
cat("Missing downregulated genes:\n", missing_downregulated, "\n")
Missing downregulated genes:
  
# Reduce each mapping table to a plain vector of its non-missing Entrez IDs
upregulated_entrez <- with(upregulated_entrez, ENTREZID[!is.na(ENTREZID)])
downregulated_entrez <- with(downregulated_entrez, ENTREZID[!is.na(ENTREZID)])

# Shared tail of the safe_enrich* helpers: report an empty result, or draw the
# dot plot, save it as "<filename without extension>_dotplot.png" and write the
# result table to `filename`, both inside `output_dir`.
.report_enrichment <- function(result, title, filename, empty_msg, output_dir) {
  if (is.null(result) || nrow(as.data.frame(result)) == 0) {
    message(paste(empty_msg, title))
    return(invisible(NULL))
  }

  # Make sure the target folder exists before anything is written
  dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

  p <- dotplot(result, showCategory = 10, title = title)
  print(p)

  # Strip the real extension only; a pattern such as ".csv" would also match
  # "xcsv" in the middle of a name because "." is a regex wildcard.
  plot_file <- paste0(tools::file_path_sans_ext(filename), "_dotplot.png")
  ggsave(file.path(output_dir, plot_file), plot = p, width = 8, height = 6)
  write.csv(as.data.frame(result), file = file.path(output_dir, filename), row.names = FALSE)

  invisible(result)
}

#' GO (Biological Process) enrichment with plotting and saving
#'
#' @param gene_list Character vector of gene symbols.
#' @param title Plot title, also used in status messages.
#' @param filename CSV file name for the result table.
#' @param output_dir Folder for the CSV and the dot plot.
#' @return Invisibly, the enrichment result, or NULL when nothing was produced.
safe_enrichGO <- function(gene_list, title, filename,
                          output_dir = "Malignant_vs_Control") {
  gene_list <- gene_list[!is.na(gene_list)]
  if (length(gene_list) == 0) {
    message(paste("No genes found for:", title))
    return(invisible(NULL))
  }
  result <- enrichGO(gene = gene_list, OrgDb = org.Hs.eg.db, keyType = "SYMBOL",
                     ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.05)
  .report_enrichment(result, title, filename,
                     "No significant enrichment found for:", output_dir)
}

#' KEGG pathway enrichment with plotting and saving
#'
#' @param entrez_list Character vector of Entrez gene IDs.
#' @param title Plot title, also used in status messages.
#' @param filename CSV file name for the result table.
#' @param output_dir Folder for the CSV and the dot plot.
#' @return Invisibly, the enrichment result, or NULL when nothing was produced.
safe_enrichKEGG <- function(entrez_list, title, filename,
                            output_dir = "Malignant_vs_Control") {
  entrez_list <- entrez_list[!is.na(entrez_list)]
  if (length(entrez_list) == 0) {
    message(paste("No genes found for:", title))
    return(invisible(NULL))
  }
  result <- enrichKEGG(gene = entrez_list, organism = "hsa", pvalueCutoff = 0.05)
  .report_enrichment(result, title, filename,
                     "No significant KEGG pathways found for:", output_dir)
}

#' Reactome pathway enrichment with plotting and saving
#'
#' @param entrez_list Character vector of Entrez gene IDs.
#' @param title Plot title, also used in status messages.
#' @param filename CSV file name for the result table.
#' @param output_dir Folder for the CSV and the dot plot.
#' @return Invisibly, the enrichment result, or NULL when nothing was produced.
safe_enrichReactome <- function(entrez_list, title, filename,
                                output_dir = "Malignant_vs_Control") {
  entrez_list <- entrez_list[!is.na(entrez_list)]
  if (length(entrez_list) == 0) {
    message(paste("No genes found for:", title))
    return(invisible(NULL))
  }
  result <- enrichPathway(gene = entrez_list, organism = "human", pvalueCutoff = 0.05)
  .report_enrichment(result, title, filename,
                     "No significant Reactome pathways found for:", output_dir)
}

# GO enrichment for the upregulated genes: plot, save figure and table
safe_enrichGO(upregulated_genes$gene, "GO Enrichment for Upregulated Genes", "upregulated_GO_results.csv")

# Open the gene tables in the data viewer only when someone is at the console;
# View() needs an interactive viewer and has no place in an unattended run.
if (interactive()) {
  View(upregulated_genes)
  View(downregulated_genes)
}

# GO enrichment for the downregulated genes
safe_enrichGO(downregulated_genes$gene, "GO Enrichment for Downregulated Genes", "downregulated_GO_results.csv")


safe_enrichKEGG(upregulated_entrez, "KEGG Pathway Enrichment for Upregulated Genes", "upregulated_KEGG_results.csv")
Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...
Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...

safe_enrichKEGG(downregulated_entrez, "KEGG Pathway Enrichment for Downregulated Genes", "downregulated_KEGG_results.csv")


safe_enrichReactome(upregulated_entrez, "Reactome Pathway Enrichment for Upregulated Genes", "upregulated_Reactome_results.csv")

safe_enrichReactome(downregulated_entrez, "Reactome Pathway Enrichment for Downregulated Genes", "downregulated_Reactome_results.csv")
No significant Reactome pathways found for: Reactome Pathway Enrichment for Downregulated Genes

Enrichment Analysis-2-Hallmark


# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(msigdbr)
library(enrichplot)

# Load Hallmark gene sets from msigdbr ("H" is the Hallmark collection)
hallmark_sets <- msigdbr(species = "Homo sapiens", category = "H")

# Match gene symbols exactly as they are. Upper-casing them would corrupt
# mixed-case human symbols (for example the "C<n>orf<m>" family), which then
# no longer match the gene set table, and it would also rewrite the gene
# column of the up/down tables as a side effect.
upregulated_in_hallmark <- intersect(upregulated_genes$gene, hallmark_sets$gene_symbol)
downregulated_in_hallmark <- intersect(downregulated_genes$gene, hallmark_sets$gene_symbol)

# Print the number of overlapping genes for both upregulated and downregulated genes
cat("Number of upregulated genes in Hallmark gene sets:", length(upregulated_in_hallmark), "\n")
Number of upregulated genes in Hallmark gene sets: 588 
cat("Number of downregulated genes in Hallmark gene sets:", length(downregulated_in_hallmark), "\n")
Number of downregulated genes in Hallmark gene sets: 264 
# Folder prefix for every file written below (note the trailing slash)
output_folder <- "Malignant_vs_Control/"

# ---- Upregulated genes ----
if (length(upregulated_in_hallmark) == 0) {
  cat("No upregulated genes overlap with Hallmark gene sets.\n")
} else {
  # Over-representation test against the Hallmark sets
  # (TERM2GENE = gene set name + gene symbol)
  hallmark_up <- enricher(
    gene = upregulated_in_hallmark,
    TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],
    pvalueCutoff = 0.05
  )

  if (is.null(hallmark_up) || nrow(hallmark_up) == 0) {
    cat("No significant enrichment found for upregulated genes.\n")
  } else {
    # Dot plot of up to 20 gene sets: show it, then save it
    up_dotplot <- dotplot(
      hallmark_up,
      showCategory = 20,
      title = "Hallmark Pathway Enrichment for Upregulated Genes"
    )
    print(up_dotplot)
    ggsave(
      paste0(output_folder, "hallmark_upregulated_dotplot.png"),
      plot = up_dotplot,
      width = 10,
      height = 8
    )

    # Result table as CSV
    write.csv(
      as.data.frame(hallmark_up),
      file = paste0(output_folder, "hallmark_upregulated_enrichment.csv"),
      row.names = FALSE
    )
  }
}


# ---- Downregulated genes ----
if (length(downregulated_in_hallmark) == 0) {
  cat("No downregulated genes overlap with Hallmark gene sets.\n")
} else {
  hallmark_down <- enricher(
    gene = downregulated_in_hallmark,
    TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],
    pvalueCutoff = 0.05
  )

  if (is.null(hallmark_down) || nrow(hallmark_down) == 0) {
    cat("No significant enrichment found for downregulated genes.\n")
  } else {
    down_dotplot <- dotplot(
      hallmark_down,
      showCategory = 20,
      title = "Hallmark Pathway Enrichment for Downregulated Genes"
    )
    print(down_dotplot)
    ggsave(
      paste0(output_folder, "hallmark_downregulated_dotplot.png"),
      plot = down_dotplot,
      width = 10,
      height = 8
    )

    write.csv(
      as.data.frame(hallmark_down),
      file = paste0(output_folder, "hallmark_downregulated_enrichment.csv"),
      row.names = FALSE
    )
  }
}

NA
NA
