4. Volcano Plot
library(ggplot2)
library(dplyr)
Attachement du package : ‘dplyr’
L'objet suivant est masqué depuis ‘package:Biobase’:
combine
Les objets suivants sont masqués depuis ‘package:GenomicRanges’:
intersect, setdiff, union
L'objet suivant est masqué depuis ‘package:GenomeInfoDb’:
intersect
Les objets suivants sont masqués depuis ‘package:IRanges’:
collapse, desc, intersect, setdiff, slice, union
Les objets suivants sont masqués depuis ‘package:S4Vectors’:
first, intersect, rename, setdiff, setequal, union
Les objets suivants sont masqués depuis ‘package:BiocGenerics’:
combine, intersect, setdiff, union
L'objet suivant est masqué depuis ‘package:matrixStats’:
count
Les objets suivants sont masqués depuis ‘package:stats’:
filter, lag
Les objets suivants sont masqués depuis ‘package:base’:
intersect, setdiff, setequal, union
# Extract results from Libra
DE_results_df <- as.data.frame(DE_results)
# Save results to CSV
write.csv(DE_results_df, file = "1-Pseudobulk_DEseq2_LRT_DE_with_libra.csv", row.names = FALSE)
# Ensure logFC and p-value columns are named correctly
colnames(DE_results_df)
[1] "cell_type" "gene" "avg_logFC" "Malignant.pct" "Control.pct" "Malignant.exp" "Control.exp" "p_val"
[9] "p_val_adj" "de_family" "de_method" "de_type"
# Filtering for significant genes:
volcano_data <- DE_results_df %>%
mutate(
significance = case_when(
p_val_adj < 0.05 & avg_logFC > 2 ~ "Upregulated",
p_val_adj < 0.05 & avg_logFC < -2 ~ "Downregulated",
TRUE ~ "Not Significant"
)
)
ggplot(volcano_data, aes(x = avg_logFC, y = -log10(p_val_adj), color = significance)) +
geom_point(alpha = 0.6, size = 2) +
scale_color_manual(values = c("Upregulated" = "red", "Downregulated" = "blue", "Not Significant" = "grey")) +
theme_minimal() +
labs(title = "Volcano Plot: Pseudobulk DESeq2 Analysis",
x = "Log2 Fold Change",
y = "-Log10 Adjusted P-Value",
color = "Significance") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +
geom_vline(xintercept = c(-2, 2), linetype = "dashed", color = "black")

NA
NA
Volcano Plot
library(EnhancedVolcano)
Le chargement a nécessité le package : ggrepel
EnhancedVolcano(
DE_results,
lab = DE_results$gene, # Labels for genes
x = 'avg_logFC', # Log2 Fold Change (corrected name)
y = 'p_val_adj', # Adjusted P-value
title = 'Volcano Plot: Pseudobulk DESeq2 Analysis',
subtitle = 'Malignant vs Normal CD4T Cells',
pCutoff = 0.05, # Adjusted p-value threshold
FCcutoff = 2, # Fold Change threshold
pointSize = 2.0, # Adjust point size
labSize = 4.0, # Adjust label size
col = c("black", "grey", "blue", "red"), # Color coding
colAlpha = 0.6, # Transparency for points
legendLabels = c("NS", "Log2FC", "P-value", "Both"),
legendPosition = "right",
legendLabSize = 10,
legendIconSize = 4.0,
drawConnectors = TRUE, # Connect gene labels
widthConnectors = 0.5,
vline = c(-2, 2), # Vertical lines at logFC ±2
hline = -log10(0.05), # Horizontal line at adjusted p-value 0.05
border = "full"
)

Volcano Plot
library(ggplot2)
library(dplyr)
library(ggrepel)
# Extract results from Libra
DE_results_df <- as.data.frame(DE_results)
# Ensure correct column names
colnames(DE_results_df)
[1] "cell_type" "gene" "avg_logFC" "Malignant.pct" "Control.pct" "Malignant.exp" "Control.exp" "p_val"
[9] "p_val_adj" "de_family" "de_method" "de_type"
# Define significance categories
volcano_data <- DE_results_df %>%
mutate(
significance = case_when(
p_val_adj < 0.05 & avg_logFC > 2 ~ "Upregulated",
p_val_adj < 0.05 & avg_logFC < -2 ~ "Downregulated",
TRUE ~ "Not Significant"
)
)
# Select genes to label: p_val_adj < 1e-50 OR logFC > 2 OR logFC < -2
top_genes <- volcano_data %>%
filter(p_val_adj < 0.05 | avg_logFC > 2 | avg_logFC < -2)
ggplot(volcano_data, aes(x = avg_logFC, y = -log10(p_val_adj), color = significance)) +
geom_point(alpha = 0.6, size = 2) + # Main points
scale_color_manual(values = c("Upregulated" = "red", "Downregulated" = "blue", "Not Significant" = "grey")) +
theme_minimal() +
labs(title = "Volcano Plot: Pseudobulk DESeq2 Analysis",
x = "Log2 Fold Change",
y = "-Log10 Adjusted P-Value",
color = "Significance") +
# Add gene labels WITHOUT any lines connecting them
geom_text_repel(data = top_genes,
aes(label = gene),
size = 3, box.padding = 0.3, max.overlaps = 15, segment.color = NA) +
# Add threshold lines
geom_vline(xintercept = c(-2, 2), linetype = "dashed", color = "black") + # logFC thresholds
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") + # p-value threshold
ylim(0, 70) # Set max y-axis limit to avoid extreme values

NA
NA
Volcano Plot
library(ggplot2)
library(dplyr)
library(ggrepel)
# Extract results from Libra
DE_results_df <- as.data.frame(DE_results)
# Ensure correct column names
colnames(DE_results_df)
[1] "cell_type" "gene" "avg_logFC" "Malignant.pct" "Control.pct" "Malignant.exp" "Control.exp" "p_val"
[9] "p_val_adj" "de_family" "de_method" "de_type"
# Define significance categories
volcano_data <- DE_results_df %>%
mutate(
significance = case_when(
p_val_adj < 1e-20 & avg_logFC > 2 ~ "Highly Upregulated",
p_val_adj < 1e-20 & avg_logFC < -2 ~ "Highly Downregulated",
p_val_adj < 0.05 & avg_logFC > 2 ~ "Upregulated",
p_val_adj < 0.05 & avg_logFC < -2 ~ "Downregulated",
TRUE ~ "Not Significant"
)
)
# Select only very significant genes for labeling
top_genes <- volcano_data %>%
filter(p_val_adj < 0.05 & (avg_logFC > 2 | avg_logFC < -2))
ggplot(volcano_data, aes(x = avg_logFC, y = -log10(p_val_adj), color = significance)) +
# Main points
geom_point(alpha = 0.7, size = 2.5) +
# Highlight highly significant genes with larger points
geom_point(data = top_genes, aes(x = avg_logFC, y = -log10(p_val_adj)),
color = "black", size = 3, shape = 21, fill = "black") +
# Custom color scheme
scale_color_manual(values = c(
"Highly Upregulated" = "darkred",
"Highly Downregulated" = "darkblue",
"Upregulated" = "red",
"Downregulated" = "blue",
"Not Significant" = "grey"
)) +
# Add gene labels (only for highly significant genes)
geom_text_repel(data = top_genes, aes(label = gene),
size = 4, box.padding = 0.5, max.overlaps = 10, segment.color = NA) +
# Add threshold lines
geom_vline(xintercept = c(-2, 2), linetype = "dashed", color = "black") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +
# Improve theme
theme_minimal(base_size = 14) +
labs(title = "Volcano Plot: Pseudobulk DESeq2 Analysis",
x = "Log2 Fold Change",
y = "-Log10 Adjusted P-Value",
color = "Significance") +
ylim(0, 50) # Avoid extreme scaling issues

NA
NA
9. EnhancedVolcano plot
library(dplyr)
library(EnhancedVolcano)
# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Filter genes based on lowest p-values but include all genes
filtered_genes <- markers %>%
arrange(p_val_adj, desc(abs(avg_logFC)))
# Create the EnhancedVolcano plot with the filtered data
EnhancedVolcano(
filtered_genes,
lab = ifelse(filtered_genes$p_val_adj <= 0.0000905 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
x = "avg_logFC",
y = "p_val_adj",
title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
pCutoff = 0.0000905,
FCcutoff = 1.0,
legendPosition = 'right',
labCol = 'black',
labFace = 'bold',
boxedLabels = FALSE, # Set to FALSE to remove boxed labels
pointSize = 3.0,
labSize = 5.0,
col = c('grey70', 'lightblue', 'blue', 'red'), # Customize point colors
selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0] # Only label significant genes
)

EnhancedVolcano plot
library(dplyr)
library(EnhancedVolcano)
# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Filter genes based on lowest p-values but include all genes
filtered_genes <- markers %>%
arrange(p_val_adj, desc(abs(avg_logFC)))
# Create the EnhancedVolcano plot with the filtered data
EnhancedVolcano(
filtered_genes,
lab = ifelse(filtered_genes$p_val_adj <= 0.0000905 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
x = "avg_logFC",
y = "p_val_adj",
title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
pCutoff = 1e-4,
FCcutoff = 1.0,
legendPosition = 'right',
labCol = 'black',
labFace = 'bold',
boxedLabels = FALSE, # Set to FALSE to remove boxed labels
pointSize = 3.0,
labSize = 5.0,
col = c('grey70', 'black', 'blue', 'red'), # Customize point colors
selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0] # Only label significant genes
)

NA
NA
NA
Create the EnhancedVolcano plot
Malignant_CD4Tcells_vs_Normal_CD4Tcells <- markers
EnhancedVolcano(Malignant_CD4Tcells_vs_Normal_CD4Tcells,
lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
x = "avg_logFC",
y = "p_val_adj",
title = "MAST with Batch Correction (All Genes)",
pCutoff = 0.05,
FCcutoff = 1.0)

EnhancedVolcano(Malignant_CD4Tcells_vs_Normal_CD4Tcells,
lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
x = "avg_logFC",
y = "p_val_adj",
selectLab = c('EPCAM', 'BCAT1', 'KIR3DL2', 'FOXM1', 'TWIST1', 'TNFSF9',
'CD80', 'IL1B', 'RPS4Y1',
'IL7R', 'TCF7', 'MKI67', 'CD70',
'IL2RA','TRBV6-2', 'TRBV10-3', 'TRBV4-2', 'TRBV9', 'TRBV7-9',
'TRAV12-1', 'CD8B', 'FCGR3A', 'GNLY', 'FOXP3', 'SELL',
'GIMAP1', 'RIPOR2', 'LEF1', 'HOXC9', 'SP5',
'CCL17', 'ETV4', 'THY1', 'FOXA2', 'ITGAD', 'S100P', 'TBX4',
'ID1', 'XCL1', 'SOX2', 'CD27', 'CD28','PLS3','CD70','RAB25' , 'TRBV27', 'TRBV2'),
title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
xlab = bquote(~Log[2]~ 'fold change'),
pCutoff = 0.05,
FCcutoff = 1.5,
pointSize = 3.0,
labSize = 5.0,
boxedLabels = TRUE,
colAlpha = 0.5,
legendPosition = 'right',
legendLabSize = 10,
legendIconSize = 4.0,
drawConnectors = TRUE,
widthConnectors = 0.5,
colConnectors = 'grey50',
arrowheads = FALSE,
max.overlaps = 30)

library(dplyr)
library(EnhancedVolcano)
# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Filter genes based on lowest p-values but include all genes
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
arrange(p_val_adj, desc(abs(avg_logFC)))
# Create the EnhancedVolcano plot with the filtered data
EnhancedVolcano(
filtered_genes,
lab = ifelse(filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
x = "avg_logFC",
y = "p_val_adj",
title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
pCutoff = 0.05,
FCcutoff = 1.0,
legendPosition = 'right',
labCol = 'black',
labFace = 'bold',
boxedLabels = FALSE, # Set to FALSE to remove boxed labels
pointSize = 3.0,
labSize = 5.0,
col = c('grey70', 'black', 'blue', 'red'), # Customize point colors
selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0] # Only label significant genes
)

EnhancedVolcano(
filtered_genes,
lab = ifelse(filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
x = "avg_logFC",
y = "p_val_adj",
title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
subtitle = "Highlighting differentially expressed genes",
pCutoff = 0.05,
FCcutoff = 1.0,
legendPosition = 'right',
colAlpha = 0.8, # Slight transparency for non-significant points
col = c('grey70', 'black', 'blue', 'red'), # Custom color scheme
gridlines.major = TRUE,
gridlines.minor = FALSE,
selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]
)

NA
NA
ggplot2 for Volcano
library(ggplot2)
library(ggrepel)
# Identify top and bottom genes
top_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.05 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_logFC > 0.5, ]
bottom_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.05 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_logFC < -0.5, ]
# Create a new column for color based on significance
Malignant_CD4Tcells_vs_Normal_CD4Tcells$color <- ifelse(Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_logFC > 0.5, "Upregulated genes",
ifelse(Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_logFC < -0.5, "Downregulated genes", "Nonsignificant"))
# Create a volcano plot
ggplot(Malignant_CD4Tcells_vs_Normal_CD4Tcells, aes(x = avg_logFC, y = -log10(p_val_adj))) +
geom_point(aes(color = color), alpha = 0.7, size = 2) +
# Add labels for top and bottom genes
geom_text_repel(data = top_genes, aes(label = gene), color = "black", vjust = 1, fontface = "bold") +
geom_text_repel(data = bottom_genes, aes(label = gene), color = "black", vjust = -1, fontface = "bold") +
# Customize labels and title
labs(title = "Volcano Plot",
x = "log2 Fold Change",
y = "-log10(p-value)") +
# # Add significance threshold lines
geom_hline(yintercept = -log10(0.00001), linetype = "dashed", color = "black") +
geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", color = "black") +
# Set colors for top and bottom genes
scale_color_manual(values = c("Upregulated genes" = "red", "Downregulated genes" = "blue", "Nonsignificant" = "darkgrey")) +
# Customize theme if needed
theme_minimal()

NA
NA
NA
NA
NA
Create the EnhancedVolcano plot
library(ggplot2)
library(EnhancedVolcano)
library(dplyr)
# Define the output directory
output_dir <- "Malignant_vs_Control"
dir.create(output_dir, showWarnings = FALSE)
# First Volcano Plot
p1 <- EnhancedVolcano(
Malignant_CD4Tcells_vs_Normal_CD4Tcells,
lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
x = "avg_logFC",
y = "p_val_adj",
title = "Malignant_CD4Tcells_vs_Normal_CD4Tcells",
pCutoff = 1e-4,
FCcutoff = 1.0
)
print(p1) # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot1.png"), plot = p1, width = 14, height = 10, dpi = 300)
# Second Volcano Plot with selected genes
p2 <- EnhancedVolcano(
Malignant_CD4Tcells_vs_Normal_CD4Tcells,
lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
x = "avg_logFC",
y = "p_val_adj",
selectLab = c('EPCAM', 'BCAT1', 'KIR3DL2', 'FOXM1', 'TWIST1', 'TNFSF9',
'CD80', 'IL1B', 'RPS4Y1',
'IL7R', 'TCF7', 'MKI67', 'CD70',
'IL2RA','TRBV6-2', 'TRBV10-3', 'TRBV4-2', 'TRBV9', 'TRBV7-9',
'TRAV12-1', 'CD8B', 'FCGR3A', 'GNLY', 'FOXP3', 'SELL',
'GIMAP1', 'RIPOR2', 'LEF1', 'HOXC9', 'SP5',
'CCL17', 'ETV4', 'THY1', 'FOXA2', 'ITGAD', 'S100P', 'TBX4',
'ID1', 'XCL1', 'SOX2', 'CD27', 'CD28','PLS3','CD70','RAB25' , 'TRBV27', 'TRBV2'),
title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
xlab = bquote(~Log[2]~ 'fold change'),
pCutoff = 0.05,
FCcutoff = 1.5,
pointSize = 3.0,
labSize = 5.0,
boxedLabels = TRUE,
colAlpha = 0.5,
legendPosition = 'right',
legendLabSize = 10,
legendIconSize = 4.0,
drawConnectors = TRUE,
widthConnectors = 0.5,
colConnectors = 'grey50',
arrowheads = FALSE,
max.overlaps = 30
)
print(p2) # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot2.png"), plot = p2, width = 14, height = 10, dpi = 300)
# Filtering genes
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
arrange(p_val_adj, desc(abs(avg_logFC)))
# Third Volcano Plot - Filtering by p-value and logFC
p3 <- EnhancedVolcano(
filtered_genes,
lab = ifelse(filtered_genes$p_val_adj <= 1e-4 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
x = "avg_logFC",
y = "p_val_adj",
title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
pCutoff = 1e-4,
FCcutoff = 1.0,
legendPosition = 'right',
labCol = 'black',
labFace = 'bold',
boxedLabels = FALSE, # Remove boxed labels
pointSize = 3.0,
labSize = 5.0,
col = c('grey70', 'black', 'blue', 'red'), # Customize point colors
selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]
)
print(p3) # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot3.png"), plot = p3, width = 14, height = 10, dpi = 300)
# Fourth Volcano Plot - More refined filtering
p4 <- EnhancedVolcano(
filtered_genes,
lab = ifelse(filtered_genes$p_val_adj <= 1e-4 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
x = "avg_logFC",
y = "p_val_adj",
title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
subtitle = "Highlighting differentially expressed genes",
pCutoff = 1e-4,
FCcutoff = 1.0,
legendPosition = 'right',
colAlpha = 0.8, # Slight transparency for non-significant points
col = c('grey70', 'black', 'blue', 'red'), # Custom color scheme
gridlines.major = TRUE,
gridlines.minor = FALSE,
selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]
)
print(p4) # Display in notebook

ggsave(filename = file.path(output_dir, "VolcanoPlot4.png"), plot = p4, width = 14, height = 10, dpi = 300)
message("All volcano plots have been displayed and saved successfully in the 'L1_vs_Control' folder.")
All volcano plots have been displayed and saved successfully in the 'L1_vs_Control' folder.
10. Enrichment Analysis-1
# Load necessary libraries
library(clusterProfiler)
Registered S3 methods overwritten by 'treeio':
method from
MRCA.phylo tidytree
MRCA.treedata tidytree
Nnode.treedata tidytree
Ntip.treedata tidytree
ancestor.phylo tidytree
ancestor.treedata tidytree
child.phylo tidytree
child.treedata tidytree
full_join.phylo tidytree
full_join.treedata tidytree
groupClade.phylo tidytree
groupClade.treedata tidytree
groupOTU.phylo tidytree
groupOTU.treedata tidytree
is.rooted.treedata tidytree
nodeid.phylo tidytree
nodeid.treedata tidytree
nodelab.phylo tidytree
nodelab.treedata tidytree
offspring.phylo tidytree
offspring.treedata tidytree
parent.phylo tidytree
parent.treedata tidytree
root.treedata tidytree
rootnode.phylo tidytree
sibling.phylo tidytree
clusterProfiler v4.6.2 For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
If you use clusterProfiler in published research, please cite:
T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
Attachement du package : ‘clusterProfiler’
L'objet suivant est masqué depuis ‘package:IRanges’:
slice
L'objet suivant est masqué depuis ‘package:S4Vectors’:
rename
L'objet suivant est masqué depuis ‘package:stats’:
filter
library(org.Hs.eg.db)
Le chargement a nécessité le package : AnnotationDbi
Attachement du package : ‘AnnotationDbi’
L'objet suivant est masqué depuis ‘package:clusterProfiler’:
select
L'objet suivant est masqué depuis ‘package:dplyr’:
select
library(enrichplot)
library(ReactomePA)
ReactomePA v1.42.0 For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
If you use ReactomePA in published research, please cite:
Guangchuang Yu, Qing-Yu He. ReactomePA: an R/Bioconductor package for reactome pathway analysis and visualization. Molecular BioSystems 2016, 12(2):477-479
library(DOSE) # For GSEA analysis
DOSE v3.24.2 For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
If you use DOSE in published research, please cite:
Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609
library(ggplot2) # Ensure ggplot2 is available for plotting
# Define threshold for differential expression selection (modified thresholds)
logFC_up_threshold <- 1 # Upregulated logFC threshold
logFC_down_threshold <- -1 # Downregulated logFC threshold
pval_threshold <- 0.05 # p-value threshold as specified
# Load your differential expression results (modify based on actual data structure)
# Malignant_CD4Tcells_vs_Normal_CD4Tcells <- read.csv("Your_DE_Results_File.csv")
# Select upregulated and downregulated genes
upregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[
Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_logFC > logFC_up_threshold &
Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < pval_threshold, ]
downregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[
Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_logFC < logFC_down_threshold &
Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < pval_threshold, ]
# Check for missing genes (NAs) in the gene column and remove them
upregulated_genes <- na.omit(upregulated_genes)
downregulated_genes <- na.omit(downregulated_genes)
# Save upregulated and downregulated gene results to CSV
write.csv(upregulated_genes, "Malignant_vs_Control/upregulated_genes.csv", row.names = FALSE)
write.csv(downregulated_genes, "Malignant_vs_Control/downregulated_genes.csv", row.names = FALSE)
# Convert gene symbols to Entrez IDs for enrichment analysis, with checks for missing values
upregulated_entrez <- bitr(upregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
'select()' returned 1:1 mapping between keys and columns
Avis : 14.39% of input gene IDs are fail to map...
downregulated_entrez <- bitr(downregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
'select()' returned 1:1 mapping between keys and columns
Avis : 26.94% of input gene IDs are fail to map...
# Check for missing Entrez IDs
missing_upregulated <- upregulated_genes$gene[is.na(upregulated_entrez$ENTREZID)]
missing_downregulated <- downregulated_genes$gene[is.na(downregulated_entrez$ENTREZID)]
# Print out the missing gene symbols for debugging
cat("Missing upregulated genes:\n", missing_upregulated, "\n")
Missing upregulated genes:
cat("Missing downregulated genes:\n", missing_downregulated, "\n")
Missing downregulated genes:
# Remove genes that couldn't be mapped to Entrez IDs
upregulated_entrez <- upregulated_entrez$ENTREZID[!is.na(upregulated_entrez$ENTREZID)]
downregulated_entrez <- downregulated_entrez$ENTREZID[!is.na(downregulated_entrez$ENTREZID)]
# Define a function to safely run enrichment, plot results, and save them
safe_enrichGO <- function(gene_list, title, filename) {
if (length(gene_list) > 0) {
result <- enrichGO(gene = gene_list, OrgDb = org.Hs.eg.db, keyType = "SYMBOL",
ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.05)
if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
p <- dotplot(result, showCategory = 10, title = title)
print(p)
ggsave(paste0("Malignant_vs_Control/", gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
write.csv(as.data.frame(result), file = paste0("Malignant_vs_Control/", filename), row.names = FALSE)
} else {
message(paste("No significant enrichment found for:", title))
}
} else {
message(paste("No genes found for:", title))
}
}
safe_enrichKEGG <- function(entrez_list, title, filename) {
if (length(entrez_list) > 0) {
result <- enrichKEGG(gene = entrez_list, organism = "hsa", pvalueCutoff = 0.05)
if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
p <- dotplot(result, showCategory = 10, title = title)
print(p)
ggsave(paste0("Malignant_vs_Control/", gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
write.csv(as.data.frame(result), file = paste0("Malignant_vs_Control/", filename), row.names = FALSE)
} else {
message(paste("No significant KEGG pathways found for:", title))
}
} else {
message(paste("No genes found for:", title))
}
}
safe_enrichReactome <- function(entrez_list, title, filename) {
if (length(entrez_list) > 0) {
result <- enrichPathway(gene = entrez_list, organism = "human", pvalueCutoff = 0.05)
if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
p <- dotplot(result, showCategory = 10, title = title)
print(p)
ggsave(paste0("Malignant_vs_Control/", gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
write.csv(as.data.frame(result), file = paste0("Malignant_vs_Control/", filename), row.names = FALSE)
} else {
message(paste("No significant Reactome pathways found for:", title))
}
} else {
message(paste("No genes found for:", title))
}
}
# Perform enrichment analyses, generate plots, and save results
safe_enrichGO(upregulated_genes$gene, "GO Enrichment for Upregulated Genes", "upregulated_GO_results.csv")
View(upregulated_genes)
View(downregulated_genes)

safe_enrichGO(downregulated_genes$gene, "GO Enrichment for Downregulated Genes", "downregulated_GO_results.csv")

safe_enrichKEGG(upregulated_entrez, "KEGG Pathway Enrichment for Upregulated Genes", "upregulated_KEGG_results.csv")
Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...
Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...

safe_enrichKEGG(downregulated_entrez, "KEGG Pathway Enrichment for Downregulated Genes", "downregulated_KEGG_results.csv")

safe_enrichReactome(upregulated_entrez, "Reactome Pathway Enrichment for Upregulated Genes", "upregulated_Reactome_results.csv")

safe_enrichReactome(downregulated_entrez, "Reactome Pathway Enrichment for Downregulated Genes", "downregulated_Reactome_results.csv")
No significant Reactome pathways found for: Reactome Pathway Enrichment for Downregulated Genes
Enrichment Analysis-2-Hallmark
# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(msigdbr)
library(enrichplot)
# Load Hallmark gene sets from msigdbr
hallmark_sets <- msigdbr(species = "Homo sapiens", category = "H") # "H" is for Hallmark gene sets
# Convert gene symbols to uppercase for consistency
upregulated_genes$gene <- toupper(upregulated_genes$gene)
downregulated_genes$gene <- toupper(downregulated_genes$gene)
# Check for overlap between your upregulated/downregulated genes and Hallmark gene sets
upregulated_in_hallmark <- intersect(upregulated_genes$gene, hallmark_sets$gene_symbol)
downregulated_in_hallmark <- intersect(downregulated_genes$gene, hallmark_sets$gene_symbol)
# Print the number of overlapping genes for both upregulated and downregulated genes
cat("Number of upregulated genes in Hallmark gene sets:", length(upregulated_in_hallmark), "\n")
Number of upregulated genes in Hallmark gene sets: 588
cat("Number of downregulated genes in Hallmark gene sets:", length(downregulated_in_hallmark), "\n")
Number of downregulated genes in Hallmark gene sets: 264
# Define the output folder where the results will be saved
output_folder <- "Malignant_vs_Control/"
# If there are genes to analyze, proceed with enrichment analysis
if (length(upregulated_in_hallmark) > 0) {
# Perform enrichment analysis for upregulated genes using Hallmark gene sets
hallmark_up <- enricher(gene = upregulated_in_hallmark,
TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")], # Ensure TERM2GENE uses correct columns
pvalueCutoff = 0.05)
# Check if results exist
if (!is.null(hallmark_up) && nrow(hallmark_up) > 0) {
# Visualize results if available
up_dotplot <- dotplot(hallmark_up, showCategory = 20, title = "Hallmark Pathway Enrichment for Upregulated Genes")
# Display the plot in the notebook
print(up_dotplot)
# Save the dotplot to a PNG file
ggsave(paste0(output_folder, "hallmark_upregulated_dotplot.png"), plot = up_dotplot, width = 10, height = 8)
# Optionally, save the results as CSV
write.csv(as.data.frame(hallmark_up), file = paste0(output_folder, "hallmark_upregulated_enrichment.csv"), row.names = FALSE)
} else {
cat("No significant enrichment found for upregulated genes.\n")
}
} else {
cat("No upregulated genes overlap with Hallmark gene sets.\n")
}

if (length(downregulated_in_hallmark) > 0) {
# Perform enrichment analysis for downregulated genes using Hallmark gene sets
hallmark_down <- enricher(gene = downregulated_in_hallmark,
TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")], # Ensure TERM2GENE uses correct columns
pvalueCutoff = 0.05)
# Check if results exist
if (!is.null(hallmark_down) && nrow(hallmark_down) > 0) {
# Visualize results if available
down_dotplot <- dotplot(hallmark_down, showCategory = 20, title = "Hallmark Pathway Enrichment for Downregulated Genes")
# Display the plot in the notebook
print(down_dotplot)
# Save the dotplot to a PNG file
ggsave(paste0(output_folder, "hallmark_downregulated_dotplot.png"), plot = down_dotplot, width = 10, height = 8)
# Optionally, save the results as CSV
write.csv(as.data.frame(hallmark_down), file = paste0(output_folder, "hallmark_downregulated_enrichment.csv"), row.names = FALSE)
} else {
cat("No significant enrichment found for downregulated genes.\n")
}
} else {
cat("No downregulated genes overlap with Hallmark gene sets.\n")
}

NA
NA
---
title: "PseudoBulk Analysis using Libra RNA assay-Deseq2-LRT"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  # pdf_document: default
  # word_document: default
  # html_document: default
  #rmdformats::readthedown
  html_notebook:
    toc: true
    toc_float: true
    toc_collapsed: true
---

# 1. load libraries
```{r setup, include=FALSE}

# Load libraries
library(Seurat)
library(Matrix)
library(SingleCellExperiment)
library(DESeq2)
library(Libra)

```


# 2. load seurat object
```{r load_seurat}
#Load Seurat Object L7
load("/home/nabbasi/isilon/To_Transfer_between_computers/23-Harmony_Integration/0-robj/5-Harmony_Integrated_All_samples_Merged_CD4Tcells_final_Resolution_Selected_0.8_ADT_Normalized_cleaned_mt.robj")

pseudobulk_seurat <- All_samples_Merged

# Assign labels and ensure 'Control' is the reference
pseudobulk_seurat$label <- factor(
  ifelse(pseudobulk_seurat$orig.ident %in% c("PBMC", "PBMC10x"), 
         "Control", 
         "Malignant"),
  levels = c("Malignant", "Control")  
)


# Double-check the reference level
print(levels(pseudobulk_seurat$label))  # Should print "Control" first

# Verify factor levels
print(levels(pseudobulk_seurat$label))  # Should show Control first

# Ensure 'replicate' is a factor
pseudobulk_seurat$replicate <- as.factor(pseudobulk_seurat$cell_line)

# Rename the cell type column
pseudobulk_seurat$cell_type <- "CD4T"
```

# 3. DE using LIBRA
```{r , fig.height=14, fig.width=18}
libra_test <-pseudobulk_seurat



DE_results <- run_de(
  pseudobulk_seurat,
  cell_type_col = "cell_type", 
  replicate_col = "replicate",
  label_col = "label",
  de_family = "pseudobulk",   
  de_method = "DESeq2",        
  de_type = "LRT",
)

```


# 4. Volcano Plot
```{r , fig.height=14, fig.width=18}
library(ggplot2)
library(dplyr)

# Extract results from Libra
DE_results_df <- as.data.frame(DE_results)

# Save results to CSV
write.csv(DE_results_df, file = "1-Pseudobulk_DEseq2_LRT_DE_with_libra.csv", row.names = FALSE)

# Ensure logFC and p-value columns are named correctly
colnames(DE_results_df)

# Filtering for significant genes:
volcano_data <- DE_results_df %>%
  mutate(
    significance = case_when(
      p_val_adj < 0.05 & avg_logFC > 2 ~ "Upregulated",
      p_val_adj < 0.05 & avg_logFC < -2 ~ "Downregulated",
      TRUE ~ "Not Significant"
    )
  )
ggplot(volcano_data, aes(x = avg_logFC, y = -log10(p_val_adj), color = significance)) +
  geom_point(alpha = 0.6, size = 2) +
  scale_color_manual(values = c("Upregulated" = "red", "Downregulated" = "blue", "Not Significant" = "grey")) +
  theme_minimal() +
  labs(title = "Volcano Plot: Pseudobulk DESeq2 Analysis",
       x = "Log2 Fold Change",
       y = "-Log10 Adjusted P-Value",
       color = "Significance") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +
  geom_vline(xintercept = c(-2, 2), linetype = "dashed", color = "black")


```




## Volcano Plot
```{r , fig.height=14, fig.width=18}
library(EnhancedVolcano)

EnhancedVolcano(
  DE_results, 
  lab = DE_results$gene,  # Labels for genes
  x = 'avg_logFC',             # Log2 Fold Change (corrected name)
  y = 'p_val_adj',             # Adjusted P-value
  title = 'Volcano Plot: Pseudobulk DESeq2 Analysis',
  subtitle = 'Malignant vs Normal CD4T Cells',
  pCutoff = 0.05,              # Adjusted p-value threshold
  FCcutoff = 2,                # Fold Change threshold
  pointSize = 2.0,             # Adjust point size
  labSize = 4.0,               # Adjust label size
  col = c("black", "grey", "blue", "red"), # Color coding
  colAlpha = 0.6,              # Transparency for points
  legendLabels = c("NS", "Log2FC", "P-value", "Both"),
  legendPosition = "right",
  legendLabSize = 10,
  legendIconSize = 4.0,
  drawConnectors = TRUE,       # Connect gene labels
  widthConnectors = 0.5,
  vline = c(-2, 2),            # Vertical lines at logFC ±2
  hline = -log10(0.05),        # Horizontal line at adjusted p-value 0.05
  border = "full"
)

```




## Volcano Plot
```{r , fig.height=14, fig.width=18}
library(ggplot2)
library(dplyr)
library(ggrepel)

# Extract results from Libra
DE_results_df <- as.data.frame(DE_results)

# Ensure correct column names
colnames(DE_results_df)

# Define significance categories
volcano_data <- DE_results_df %>%
  mutate(
    significance = case_when(
      p_val_adj < 0.05 & avg_logFC > 2 ~ "Upregulated",
      p_val_adj < 0.05 & avg_logFC < -2 ~ "Downregulated",
      TRUE ~ "Not Significant"
    )
  )

# Select genes to label: p_val_adj < 1e-50 OR logFC > 2 OR logFC < -2
top_genes <- volcano_data %>%
  filter(p_val_adj < 0.05 | avg_logFC > 2 | avg_logFC < -2)

ggplot(volcano_data, aes(x = avg_logFC, y = -log10(p_val_adj), color = significance)) +
  geom_point(alpha = 0.6, size = 2) +  # Main points
  scale_color_manual(values = c("Upregulated" = "red", "Downregulated" = "blue", "Not Significant" = "grey")) +
  theme_minimal() +
  labs(title = "Volcano Plot: Pseudobulk DESeq2 Analysis",
       x = "Log2 Fold Change",
       y = "-Log10 Adjusted P-Value",
       color = "Significance") +

  # Add gene labels WITHOUT any lines connecting them
  geom_text_repel(data = top_genes, 
                  aes(label = gene),  
                  size = 3, box.padding = 0.3, max.overlaps = 15, segment.color = NA) +  

  # Add threshold lines
  geom_vline(xintercept = c(-2, 2), linetype = "dashed", color = "black") +  # logFC thresholds
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +  # p-value threshold

  ylim(0, 70)  # Set max y-axis limit to avoid extreme values


```
## Volcano Plot
```{r , fig.height=14, fig.width=18}
library(ggplot2)
library(dplyr)
library(ggrepel)

# Extract results from Libra
DE_results_df <- as.data.frame(DE_results)

# Ensure correct column names
colnames(DE_results_df)

# Define significance categories
volcano_data <- DE_results_df %>%
  mutate(
    significance = case_when(
      p_val_adj < 1e-20 & avg_logFC > 2 ~ "Highly Upregulated",
      p_val_adj < 1e-20 & avg_logFC < -2 ~ "Highly Downregulated",
      p_val_adj < 0.05 & avg_logFC > 2 ~ "Upregulated",
      p_val_adj < 0.05 & avg_logFC < -2 ~ "Downregulated",
      TRUE ~ "Not Significant"
    )
  )

# Select only very significant genes for labeling
top_genes <- volcano_data %>%
  filter(p_val_adj < 0.05 & (avg_logFC > 2 | avg_logFC < -2))

ggplot(volcano_data, aes(x = avg_logFC, y = -log10(p_val_adj), color = significance)) +
  
  # Main points
  geom_point(alpha = 0.7, size = 2.5) +
  
  # Highlight highly significant genes with larger points
  geom_point(data = top_genes, aes(x = avg_logFC, y = -log10(p_val_adj)), 
             color = "black", size = 3, shape = 21, fill = "black") +

  # Custom color scheme
  scale_color_manual(values = c(
    "Highly Upregulated" = "darkred",
    "Highly Downregulated" = "darkblue",
    "Upregulated" = "red",
    "Downregulated" = "blue",
    "Not Significant" = "grey"
  )) +

  # Add gene labels (only for highly significant genes)
  geom_text_repel(data = top_genes, aes(label = gene),  
                  size = 4, box.padding = 0.5, max.overlaps = 10, segment.color = NA) +
  
  # Add threshold lines
  geom_vline(xintercept = c(-2, 2), linetype = "dashed", color = "black") +  
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +  

  # Improve theme
  theme_minimal(base_size = 14) +
  labs(title = "Volcano Plot: Pseudobulk DESeq2 Analysis",
       x = "Log2 Fold Change",
       y = "-Log10 Adjusted P-Value",
       color = "Significance") +

  ylim(0, 50)  # Avoid extreme scaling issues


```

# 5. Summarize Markers
```{r , fig.height=12, fig.width=14}
markers <- DE_results_df

summarize_markers <- function(markers) {
  num_pval0 <- sum(markers$p_val_adj == 0)
  num_pval1 <- sum(markers$p_val_adj == 1)
  num_significant <- sum(markers$p_val_adj < 0.05)
  num_upregulated <- sum(markers$avg_logFC > 1)
  num_downregulated <- sum(markers$avg_logFC < -1)
  
  cat("Number of genes with p_val_adj = 0:", num_pval0, "\n")
  cat("Number of genes with p_val_adj = 1:", num_pval1, "\n")
  cat("Number of significant genes (p_val_adj < 0.05):", num_significant, "\n")
  cat("Number of upregulated genes (avg_logFC > 1):", num_upregulated, "\n")
  cat("Number of downregulated genes (avg_logFC < 1):", num_downregulated, "\n")
}

cat("Markers1 Summary (MAST with Batch Correction):\n")

summarize_markers(markers)
```


# 9. EnhancedVolcano plot
```{r , fig.height=16, fig.width=20}

library(dplyr)
library(EnhancedVolcano)

# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Filter genes based on lowest p-values but include all genes
filtered_genes <- markers %>%
  arrange(p_val_adj, desc(abs(avg_logFC)))

# Create the EnhancedVolcano plot with the filtered data
EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 0.0000905 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_logFC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 0.0000905,
  FCcutoff = 1.0,
  legendPosition = 'right', 
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Set to FALSE to remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'lightblue', 'blue', 'red'),  # Customize point colors
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]  # Only label significant genes
)

```



## EnhancedVolcano plot
```{r , fig.height=12, fig.width=16}

library(dplyr)
library(EnhancedVolcano)

# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Filter genes based on lowest p-values but include all genes
filtered_genes <- markers %>%
  arrange(p_val_adj, desc(abs(avg_logFC)))

# Create the EnhancedVolcano plot with the filtered data
EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 0.0000905 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_logFC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 1e-4,
  FCcutoff = 1.0,
  legendPosition = 'right', 
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Set to FALSE to remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),  # Customize point colors
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]  # Only label significant genes
)



```

## Create the EnhancedVolcano plot
```{r enhancedV, fig.height=12, fig.width=16}
 Malignant_CD4Tcells_vs_Normal_CD4Tcells <- markers

EnhancedVolcano(Malignant_CD4Tcells_vs_Normal_CD4Tcells,
                lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
                x = "avg_logFC",
                y = "p_val_adj",
                title = "MAST with Batch Correction (All Genes)",
                pCutoff = 0.05,
                FCcutoff = 1.0)


EnhancedVolcano(Malignant_CD4Tcells_vs_Normal_CD4Tcells, 
                lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
                x = "avg_logFC", 
                y = "p_val_adj",
                selectLab = c('EPCAM', 'BCAT1', 'KIR3DL2', 'FOXM1', 'TWIST1', 'TNFSF9', 
                              'CD80',  'IL1B', 'RPS4Y1', 
                              'IL7R', 'TCF7',  'MKI67', 'CD70', 
                              'IL2RA','TRBV6-2', 'TRBV10-3', 'TRBV4-2', 'TRBV9', 'TRBV7-9', 
                              'TRAV12-1', 'CD8B', 'FCGR3A', 'GNLY', 'FOXP3', 'SELL', 
                              'GIMAP1', 'RIPOR2', 'LEF1', 'HOXC9', 'SP5',
                              'CCL17', 'ETV4', 'THY1', 'FOXA2', 'ITGAD', 'S100P', 'TBX4', 
                              'ID1', 'XCL1', 'SOX2', 'CD27', 'CD28','PLS3','CD70','RAB25' , 'TRBV27', 'TRBV2'),
                title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
                xlab = bquote(~Log[2]~ 'fold change'),
                pCutoff = 0.05,
                FCcutoff = 1.5, 
                pointSize = 3.0,
                labSize = 5.0,
                boxedLabels = TRUE,
                colAlpha = 0.5,
                legendPosition = 'right',
                legendLabSize = 10,
                legendIconSize = 4.0,
                drawConnectors = TRUE,
                widthConnectors = 0.5,
                colConnectors = 'grey50',
                arrowheads = FALSE,
                max.overlaps = 30)


library(dplyr)
library(EnhancedVolcano)

# Assuming you have a data frame named Malignant_CD4Tcells_vs_Normal_CD4Tcells
# Filter genes based on lowest p-values but include all genes
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  arrange(p_val_adj, desc(abs(avg_logFC)))

# Create the EnhancedVolcano plot with the filtered data
EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_logFC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 0.05,
  FCcutoff = 1.0,
  legendPosition = 'right', 
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Set to FALSE to remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),  # Customize point colors
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]  # Only label significant genes
)



EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_logFC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
  subtitle = "Highlighting differentially expressed genes",
  pCutoff = 0.05,
  FCcutoff = 1.0,
  legendPosition = 'right',
  colAlpha = 0.8,  # Slight transparency for non-significant points
  col = c('grey70', 'black', 'blue', 'red'),  # Custom color scheme
  gridlines.major = TRUE,
  gridlines.minor = FALSE,
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]
) 


```

## ggplot2 for Volcano
```{r , fig.height=8, fig.width=12}
library(ggplot2)
library(ggrepel)

# Identify top and bottom genes
top_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.05 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_logFC > 0.5, ]
bottom_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < 0.05 & Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_logFC < -0.5, ]

# Create a new column for color based on significance
Malignant_CD4Tcells_vs_Normal_CD4Tcells$color <- ifelse(Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_logFC > 0.5, "Upregulated genes",
                                                   ifelse(Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_logFC < -0.5, "Downregulated genes", "Nonsignificant"))

# Create a volcano plot
ggplot(Malignant_CD4Tcells_vs_Normal_CD4Tcells, aes(x = avg_logFC, y = -log10(p_val_adj))) +
  geom_point(aes(color = color), alpha = 0.7, size = 2) +
  
  # Add labels for top and bottom genes
  geom_text_repel(data = top_genes, aes(label = gene), color = "black", vjust = 1, fontface = "bold") +
  geom_text_repel(data = bottom_genes, aes(label = gene), color = "black", vjust = -1, fontface = "bold") +
  
  # Customize labels and title
  labs(title = "Volcano Plot",
       x = "log2 Fold Change",
       y = "-log10(p-value)") +
  
  # # Add significance threshold lines
   geom_hline(yintercept = -log10(0.00001), linetype = "dashed", color = "black") +
   geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", color = "black") +
  
  # Set colors for top and bottom genes
  scale_color_manual(values = c("Upregulated genes" = "red", "Downregulated genes" = "blue", "Nonsignificant" = "darkgrey")) +
  
  # Customize theme if needed
  theme_minimal()





```

## Create the EnhancedVolcano plot
```{r , fig.height=8, fig.width=12}

library(ggplot2)
library(EnhancedVolcano)
library(dplyr)

# Define the output directory
output_dir <- "Malignant_vs_Control"
dir.create(output_dir, showWarnings = FALSE)

# First Volcano Plot
p1 <- EnhancedVolcano(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells,
  lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
  x = "avg_logFC",
  y = "p_val_adj",
  title = "Malignant_CD4Tcells_vs_Normal_CD4Tcells",
  pCutoff = 1e-4,
  FCcutoff = 1.0
)
print(p1)  # Display in notebook
ggsave(filename = file.path(output_dir, "VolcanoPlot1.png"), plot = p1, width = 14, height = 10, dpi = 300)

# Second Volcano Plot with selected genes
p2 <- EnhancedVolcano(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells, 
  lab = Malignant_CD4Tcells_vs_Normal_CD4Tcells$gene,
  x = "avg_logFC", 
  y = "p_val_adj",
  selectLab = c('EPCAM', 'BCAT1', 'KIR3DL2', 'FOXM1', 'TWIST1', 'TNFSF9', 
                'CD80',  'IL1B', 'RPS4Y1', 
                'IL7R', 'TCF7',  'MKI67', 'CD70', 
                'IL2RA','TRBV6-2', 'TRBV10-3', 'TRBV4-2', 'TRBV9', 'TRBV7-9', 
                'TRAV12-1', 'CD8B', 'FCGR3A', 'GNLY', 'FOXP3', 'SELL', 
                'GIMAP1', 'RIPOR2', 'LEF1', 'HOXC9', 'SP5',
                'CCL17', 'ETV4', 'THY1', 'FOXA2', 'ITGAD', 'S100P', 'TBX4', 
                'ID1', 'XCL1', 'SOX2', 'CD27', 'CD28','PLS3','CD70','RAB25' , 'TRBV27', 'TRBV2'),
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  xlab = bquote(~Log[2]~ 'fold change'),
  pCutoff = 0.05,
  FCcutoff = 1.5, 
  pointSize = 3.0,
  labSize = 5.0,
  boxedLabels = TRUE,
  colAlpha = 0.5,
  legendPosition = 'right',
  legendLabSize = 10,
  legendIconSize = 4.0,
  drawConnectors = TRUE,
  widthConnectors = 0.5,
  colConnectors = 'grey50',
  arrowheads = FALSE,
  max.overlaps = 30
)
print(p2)  # Display in notebook
ggsave(filename = file.path(output_dir, "VolcanoPlot2.png"), plot = p2, width = 14, height = 10, dpi = 300)

# Filtering genes
filtered_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells %>%
  arrange(p_val_adj, desc(abs(avg_logFC)))

# Third Volcano Plot - Filtering by p-value and logFC
p3 <- EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 1e-4 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_logFC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 1e-4,
  FCcutoff = 1.0,
  legendPosition = 'right', 
  labCol = 'black',
  labFace = 'bold',
  boxedLabels = FALSE,  # Remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c('grey70', 'black', 'blue', 'red'),  # Customize point colors
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]
)
print(p3)  # Display in notebook
ggsave(filename = file.path(output_dir, "VolcanoPlot3.png"), plot = p3, width = 14, height = 10, dpi = 300)

# Fourth Volcano Plot - More refined filtering
p4 <- EnhancedVolcano(
  filtered_genes, 
  lab = ifelse(filtered_genes$p_val_adj <= 1e-4 & abs(filtered_genes$avg_logFC) >= 1.0, filtered_genes$gene, NA),
  x = "avg_logFC", 
  y = "p_val_adj",
  title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
  subtitle = "Highlighting differentially expressed genes",
  pCutoff = 1e-4,
  FCcutoff = 1.0,
  legendPosition = 'right',
  colAlpha = 0.8,  # Slight transparency for non-significant points
  col = c('grey70', 'black', 'blue', 'red'),  # Custom color scheme
  gridlines.major = TRUE,
  gridlines.minor = FALSE,
  selectLab = filtered_genes$gene[filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0]
)
print(p4)  # Display in notebook
ggsave(filename = file.path(output_dir, "VolcanoPlot4.png"), plot = p4, width = 14, height = 10, dpi = 300)

message("All volcano plots have been displayed and saved successfully in the 'L1_vs_Control' folder.")



```


# 10. Enrichment Analysis-1
```{r , fig.height=6, fig.width=8}
# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(ReactomePA)
library(DOSE) # For GSEA analysis
library(ggplot2) # Ensure ggplot2 is available for plotting

# Define threshold for differential expression selection (modified thresholds)
logFC_up_threshold <- 1          # Upregulated logFC threshold
logFC_down_threshold <- -1       # Downregulated logFC threshold
pval_threshold <- 0.05  # p-value threshold as specified

# Load your differential expression results (modify based on actual data structure)
# Malignant_CD4Tcells_vs_Normal_CD4Tcells <- read.csv("Your_DE_Results_File.csv")

# Select upregulated and downregulated genes
upregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_logFC > logFC_up_threshold & 
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < pval_threshold, ]

downregulated_genes <- Malignant_CD4Tcells_vs_Normal_CD4Tcells[
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$avg_logFC < logFC_down_threshold & 
  Malignant_CD4Tcells_vs_Normal_CD4Tcells$p_val_adj < pval_threshold, ]

# Check for missing genes (NAs) in the gene column and remove them
upregulated_genes <- na.omit(upregulated_genes)
downregulated_genes <- na.omit(downregulated_genes)

# Save upregulated and downregulated gene results to CSV
write.csv(upregulated_genes, "Malignant_vs_Control/upregulated_genes.csv", row.names = FALSE)
write.csv(downregulated_genes, "Malignant_vs_Control/downregulated_genes.csv", row.names = FALSE)

# Convert gene symbols to Entrez IDs for enrichment analysis, with checks for missing values
upregulated_entrez <- bitr(upregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
downregulated_entrez <- bitr(downregulated_genes$gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)

# Check for missing Entrez IDs
missing_upregulated <- upregulated_genes$gene[is.na(upregulated_entrez$ENTREZID)]
missing_downregulated <- downregulated_genes$gene[is.na(downregulated_entrez$ENTREZID)]

# Print out the missing gene symbols for debugging
cat("Missing upregulated genes:\n", missing_upregulated, "\n")
cat("Missing downregulated genes:\n", missing_downregulated, "\n")

# Remove genes that couldn't be mapped to Entrez IDs
upregulated_entrez <- upregulated_entrez$ENTREZID[!is.na(upregulated_entrez$ENTREZID)]
downregulated_entrez <- downregulated_entrez$ENTREZID[!is.na(downregulated_entrez$ENTREZID)]

# Define a function to safely run enrichment, plot results, and save them
safe_enrichGO <- function(gene_list, title, filename) {
  if (length(gene_list) > 0) {
    result <- enrichGO(gene = gene_list, OrgDb = org.Hs.eg.db, keyType = "SYMBOL",
                       ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)  
      ggsave(paste0("Malignant_vs_Control/", gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0("Malignant_vs_Control/", filename), row.names = FALSE)
    } else {
      message(paste("No significant enrichment found for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

safe_enrichKEGG <- function(entrez_list, title, filename) {
  if (length(entrez_list) > 0) {
    result <- enrichKEGG(gene = entrez_list, organism = "hsa", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)
      ggsave(paste0("Malignant_vs_Control/", gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0("Malignant_vs_Control/", filename), row.names = FALSE)
    } else {
      message(paste("No significant KEGG pathways found for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

safe_enrichReactome <- function(entrez_list, title, filename) {
  if (length(entrez_list) > 0) {
    result <- enrichPathway(gene = entrez_list, organism = "human", pvalueCutoff = 0.05)
    if (!is.null(result) && nrow(as.data.frame(result)) > 0) {
      p <- dotplot(result, showCategory = 10, title = title)
      print(p)
      ggsave(paste0("Malignant_vs_Control/", gsub(".csv", "_dotplot.png", filename)), plot = p, width = 8, height = 6)
      write.csv(as.data.frame(result), file = paste0("Malignant_vs_Control/", filename), row.names = FALSE)
    } else {
      message(paste("No significant Reactome pathways found for:", title))
    }
  } else {
    message(paste("No genes found for:", title))
  }
}

# Perform enrichment analyses, generate plots, and save results
safe_enrichGO(upregulated_genes$gene, "GO Enrichment for Upregulated Genes", "upregulated_GO_results.csv")
safe_enrichGO(downregulated_genes$gene, "GO Enrichment for Downregulated Genes", "downregulated_GO_results.csv")

safe_enrichKEGG(upregulated_entrez, "KEGG Pathway Enrichment for Upregulated Genes", "upregulated_KEGG_results.csv")
safe_enrichKEGG(downregulated_entrez, "KEGG Pathway Enrichment for Downregulated Genes", "downregulated_KEGG_results.csv")

safe_enrichReactome(upregulated_entrez, "Reactome Pathway Enrichment for Upregulated Genes", "upregulated_Reactome_results.csv")
safe_enrichReactome(downregulated_entrez, "Reactome Pathway Enrichment for Downregulated Genes", "downregulated_Reactome_results.csv")


```




## Enrichment Analysis-2-Hallmark
```{r , fig.height=6, fig.width=8}

# Load necessary libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(msigdbr)
library(enrichplot)

# Load Hallmark gene sets from msigdbr
hallmark_sets <- msigdbr(species = "Homo sapiens", category = "H")  # "H" is for Hallmark gene sets

# Convert gene symbols to uppercase for consistency
upregulated_genes$gene <- toupper(upregulated_genes$gene)
downregulated_genes$gene <- toupper(downregulated_genes$gene)

# Check for overlap between your upregulated/downregulated genes and Hallmark gene sets
upregulated_in_hallmark <- intersect(upregulated_genes$gene, hallmark_sets$gene_symbol)
downregulated_in_hallmark <- intersect(downregulated_genes$gene, hallmark_sets$gene_symbol)

# Print the number of overlapping genes for both upregulated and downregulated genes
cat("Number of upregulated genes in Hallmark gene sets:", length(upregulated_in_hallmark), "\n")
cat("Number of downregulated genes in Hallmark gene sets:", length(downregulated_in_hallmark), "\n")

# Define the output folder where the results will be saved
output_folder <- "Malignant_vs_Control/"

# If there are genes to analyze, proceed with enrichment analysis
if (length(upregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for upregulated genes using Hallmark gene sets
  hallmark_up <- enricher(gene = upregulated_in_hallmark, 
                          TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                          pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_up) && nrow(hallmark_up) > 0) {
    # Visualize results if available
    up_dotplot <- dotplot(hallmark_up, showCategory = 20, title = "Hallmark Pathway Enrichment for Upregulated Genes")
    
    # Display the plot in the notebook
    print(up_dotplot)
    
    # Save the dotplot to a PNG file
    ggsave(paste0(output_folder, "hallmark_upregulated_dotplot.png"), plot = up_dotplot, width = 10, height = 8)
    
    # Optionally, save the results as CSV
    write.csv(as.data.frame(hallmark_up), file = paste0(output_folder, "hallmark_upregulated_enrichment.csv"), row.names = FALSE)
  } else {
    cat("No significant enrichment found for upregulated genes.\n")
  }
} else {
  cat("No upregulated genes overlap with Hallmark gene sets.\n")
}

if (length(downregulated_in_hallmark) > 0) {
  # Perform enrichment analysis for downregulated genes using Hallmark gene sets
  hallmark_down <- enricher(gene = downregulated_in_hallmark, 
                            TERM2GENE = hallmark_sets[, c("gs_name", "gene_symbol")],  # Ensure TERM2GENE uses correct columns
                            pvalueCutoff = 0.05)
  # Check if results exist
  if (!is.null(hallmark_down) && nrow(hallmark_down) > 0) {
    # Visualize results if available
    down_dotplot <- dotplot(hallmark_down, showCategory = 20, title = "Hallmark Pathway Enrichment for Downregulated Genes")
    
    # Display the plot in the notebook
    print(down_dotplot)
    
    # Save the dotplot to a PNG file
    ggsave(paste0(output_folder, "hallmark_downregulated_dotplot.png"), plot = down_dotplot, width = 10, height = 8)
    
    # Optionally, save the results as CSV
    write.csv(as.data.frame(hallmark_down), file = paste0(output_folder, "hallmark_downregulated_enrichment.csv"), row.names = FALSE)
  } else {
    cat("No significant enrichment found for downregulated genes.\n")
  }
} else {
  cat("No downregulated genes overlap with Hallmark gene sets.\n")
}


```







