1. Load libraries
3. Create the EnhancedVolcano plot
library(dplyr)
library(EnhancedVolcano)
# Malignant_CD4Tcells_vs_Normal_CD4Tcells is expected to exist already; it is
# not created in this section.
# Despite the name, no rows are dropped here: arrange() only reorders them,
# smallest adjusted p-value first, ties broken by largest |avg_logFC|.
filtered_genes <- arrange(
  Malignant_CD4Tcells_vs_Normal_CD4Tcells,
  p_val_adj,
  desc(abs(avg_logFC))
)
# Create the EnhancedVolcano plot from the sorted differential-expression table.
#
# Genes are labelled when adjusted p <= 0.05 and |avg_logFC| >= 1. These label
# thresholds are looser than the pCutoff / FCcutoff passed below.
# which() drops rows where either statistic is NA instead of letting NA leak
# into the label selection.
label_rows <- which(
  filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0
)
# as.character() guards against `gene` being a factor: ifelse() would silently
# turn a factor into its integer codes and the plot would show numbers.
gene_names <- as.character(filtered_genes$gene)
volcano_labels <- rep(NA_character_, length(gene_names))
volcano_labels[label_rows] <- gene_names[label_rows]

EnhancedVolcano(
  filtered_genes,
  lab = volcano_labels,
  x = "avg_logFC",
  y = "p_val_adj",
  title = "Malignant CD4 T cells(cell lines) vs normal CD4 T cells",
  pCutoff = 1e-10,
  FCcutoff = 1.5,
  legendPosition = "right",
  labCol = "black",
  labFace = "bold",
  boxedLabels = FALSE, # Set to FALSE to remove boxed labels
  pointSize = 3.0,
  labSize = 5.0,
  col = c("grey70", "black", "blue", "red"), # Customize point colors
  selectLab = gene_names[label_rows] # Only label significant genes
)

# Second volcano plot of the same table, with a subtitle, slightly transparent
# points and major gridlines only.
#
# Genes are labelled when adjusted p <= 0.05 and |avg_logFC| >= 1. These label
# thresholds are looser than the pCutoff / FCcutoff passed below.
# which() drops rows where either statistic is NA instead of letting NA leak
# into the label selection.
label_rows <- which(
  filtered_genes$p_val_adj <= 0.05 & abs(filtered_genes$avg_logFC) >= 1.0
)
# as.character() guards against `gene` being a factor: ifelse() would silently
# turn a factor into its integer codes and the plot would show numbers.
gene_names <- as.character(filtered_genes$gene)
volcano_labels <- rep(NA_character_, length(gene_names))
volcano_labels[label_rows] <- gene_names[label_rows]

EnhancedVolcano(
  filtered_genes,
  lab = volcano_labels,
  x = "avg_logFC",
  y = "p_val_adj",
  title = "Malignant CD4 T cells (cell lines) vs Normal CD4 T cells",
  subtitle = "Highlighting differentially expressed genes",
  pCutoff = 1e-10,
  FCcutoff = 1.5,
  legendPosition = "right",
  colAlpha = 0.8, # Slight transparency for the plotted points
  col = c("grey70", "black", "blue", "red"), # Custom color scheme
  gridlines.major = TRUE,
  gridlines.minor = FALSE,
  selectLab = gene_names[label_rows]
)

NA
NA
7. Save Hallmark and KEGG results to CSV
# Export the fgsea result tables to CSV.
# The code reads the Hallmark results from 'fgsea_result' and the KEGG results
# from 'fgsea_result_kegg'; neither is created in this section.
# NOTE(review): 'fgsea_result' is presumably the Hallmark run - confirm.

# Collapse every list column of `results` into one comma-separated string per
# row (via toString) so the table can be written as a flat CSV.
# vapply() is used instead of sapply() so the column is always character, even
# for a zero-row table, where sapply() would return an empty list and
# write.csv() would then fail on the list column.
flatten_list_columns <- function(results) {
  results %>%
    mutate(
      across(
        where(is.list),
        function(col) vapply(col, toString, character(1))
      )
    )
}

# Hallmark results
fgsea_result_hallmark_flattened <- flatten_list_columns(fgsea_result)
write.csv(
  fgsea_result_hallmark_flattened,
  "fgsea_results_hallmark.csv",
  row.names = FALSE
)

# KEGG results
fgsea_result_kegg_flattened <- flatten_list_columns(fgsea_result_kegg)
write.csv(
  fgsea_result_kegg_flattened,
  "fgsea_results_kegg.csv",
  row.names = FALSE
)
# Dot plot of the first 50 entries of 'gene_list' (a named numeric vector
# defined outside this section): gene symbol on x, rank statistic on y.
# head() is used instead of [1:50] so a list shorter than 50 genes is plotted
# as-is rather than being padded with NA rows.
# NOTE(review): ggplot2 is not attached explicitly before this point in the
# visible code - presumably it comes in via EnhancedVolcano; confirm.
top_ranked <- head(gene_list, 50)
ggplot(
  data.frame(gene_symbol = names(top_ranked), ranks = top_ranked),
  aes(gene_symbol, ranks)
) +
  geom_point() +
  theme_classic() +
  # Rotate the gene symbols so they do not overlap
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

8. Hallmark genes found in multiple pathways
# Load necessary libraries
library(dplyr)
library(tidyr)
library(ggplot2)
# Count how often each leading-edge gene occurs across the significant pathways
# in 'fgsea_result' (not created in this section).
# NOTE(review): 'fgsea_result' is presumably the Hallmark fgsea output - confirm.

# Steps 1-2: keep pathways with padj < 0.05, then expand the 'leadingEdge' list
# column so every (pathway, gene) pair becomes its own row.
significant_gene_sets <- fgsea_result %>%
  filter(padj < 0.05) %>%
  select(pathway, leadingEdge) %>%
  unnest(cols = leadingEdge)

# Step 3: number of significant pathways each gene appears in, most frequent
# genes first.
gene_count <- significant_gene_sets %>%
  group_by(leadingEdge) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

# Step 4: keep genes that occur in more than one pathway and plot them.
top_regulator_genes <- gene_count %>%
  filter(count > 1)

ggplot(top_regulator_genes, aes(x = reorder(leadingEdge, -count), y = count)) +
  geom_bar(stat = "identity", fill = "salmon") +
  coord_flip() +
  labs(
    title = "Top Regulator Genes Involved in Multiple Pathways",
    x = "Gene",
    y = "Number of Pathways"
  ) +
  theme_minimal()

# Step 5: save the per-gene counts for further inspection.
# The file name is Hallmark-specific because the KEGG version of this analysis
# also exports its counts; sharing one file name would let the later write
# overwrite this one. row.names = FALSE matches the other CSV exports in this
# script and avoids a meaningless index column.
write.csv(
  gene_count,
  "gene_count_in_multiple_pathways_hallmark.csv",
  row.names = FALSE
)
8. KEGG genes found in multiple pathways
# Load necessary libraries
library(dplyr)
library(tidyr)
library(ggplot2)
# Count how often each leading-edge gene occurs across the significant pathways
# in 'fgsea_result_kegg' (the KEGG fgsea output; not created in this section).
# Note: this reuses and overwrites the variables significant_gene_sets,
# gene_count and top_regulator_genes.

# Steps 1-2: keep pathways with padj < 0.05, then expand the 'leadingEdge' list
# column so every (pathway, gene) pair becomes its own row.
significant_gene_sets <- fgsea_result_kegg %>%
  filter(padj < 0.05) %>%
  select(pathway, leadingEdge) %>%
  unnest(cols = leadingEdge)

# Step 3: number of significant pathways each gene appears in, most frequent
# genes first.
gene_count <- significant_gene_sets %>%
  group_by(leadingEdge) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

# Step 4: keep genes that occur in more than one pathway and plot them.
top_regulator_genes <- gene_count %>%
  filter(count > 1)

ggplot(top_regulator_genes, aes(x = reorder(leadingEdge, -count), y = count)) +
  geom_bar(stat = "identity", fill = "salmon") +
  coord_flip() +
  labs(
    title = "Top Regulator Genes Involved in Multiple Pathways",
    x = "Gene",
    y = "Number of Pathways"
  ) +
  theme_minimal()

# Step 5: save the per-gene counts for further inspection.
# The file name is KEGG-specific so these counts do not overwrite the counts
# exported by the Hallmark version of this analysis. row.names = FALSE matches
# the other CSV exports in this script and avoids a meaningless index column.
write.csv(
  gene_count,
  "gene_count_in_multiple_pathways_kegg.csv",
  row.names = FALSE
)
