#Differential Expression Analysis
#Load Seurat Object L7
load("../../0-IMP-OBJECTS/All_CD4_Tcells_Merged_1-13_res-0.9.Robj")
All_samples_Merged
An object of class Seurat
62625 features across 46976 samples within 6 assays
Active assay: SCT (25901 features, 3000 variable features)
3 layers present: counts, data, scale.data
5 other assays present: RNA, ADT, prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3
4 dimensional reductions calculated: pca, umap, integrated_dr, ref.umap
#Differential Expression Analysis
DefaultAssay(All_samples_Merged) <- "SCT"
Idents(All_samples_Merged) <- "cell_line"
all_markers <- FindAllMarkers(All_samples_Merged, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
Calculating cluster L1
Calculating cluster L2
Calculating cluster L3
Calculating cluster L4
Calculating cluster L5
Calculating cluster L6
Calculating cluster L7
Calculating cluster PBMC
write.csv(all_markers, "all_markers.csv")
head(all_markers)
NA
NA
top10_markers <- all_markers %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC) %>%
arrange(cluster, desc(avg_log2FC))
top10_genes <- unique(top10_markers$gene)
heatmap_data <- GetAssayData(All_samples_Merged, assay = "SCT", slot = "data")[top10_genes, ]
Avis : The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
Please use the `layer` argument instead.
heatmap_data <- heatmap_data - rowMeans(heatmap_data)
cell_line_colors <- rainbow(length(unique(All_samples_Merged$cell_line)))
names(cell_line_colors) <- unique(All_samples_Merged$cell_line)
annotation_colors <- list(cell_line = cell_line_colors)
p <- pheatmap(heatmap_data,
show_rownames = TRUE,
show_colnames = FALSE,
annotation_col = All_samples_Merged@meta.data[, "cell_line", drop = FALSE],
annotation_colors = annotation_colors,
main = "Top 10 Upregulated Genes per Cell Line (Sorted by avg_log2FC)",
fontsize_row = 6,
treeheight_col = 0,
cluster_rows = FALSE,
cluster_cols = FALSE)
print(p)
png("heatmap_top10_log2fc_sorted.png", width = 12, height = 8, units = "in", res = 300)
print(p)
dev.off()
png
2
top10_markers_seurat <- all_markers %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC)
top10_genes_seurat <- unique(top10_markers_seurat$gene)
heatmap_data_seurat <- GetAssayData(All_samples_Merged, assay = "SCT", slot = "data")[top10_genes_seurat, ]
heatmap_data_seurat <- heatmap_data_seurat - rowMeans(heatmap_data_seurat)
p2 <- pheatmap(heatmap_data_seurat,
show_rownames = TRUE,
show_colnames = FALSE,
annotation_col = All_samples_Merged@meta.data[, "cell_line", drop = FALSE],
annotation_colors = annotation_colors,
main = "Top 10 Markers per Cell Line (Seurat Default)",
fontsize_row = 6,
treeheight_col = 0)
View(top10_markers)
print(p2)
png("heatmap_top10_seurat.png", width = 12, height = 8, units = "in", res = 300)
print(p2)
dev.off()
png
2
library(EnhancedVolcano)
perform_comparison_and_volcano <- function(All_samples_Merged, ident1, ident2) {
Idents(All_samples_Merged) <- "cell_line"
markers <- FindMarkers(All_samples_Merged, ident.1 = ident1, ident.2 = ident2, assay = "SCT")
write.csv(markers, paste0("comparison_", ident1, "_vs_", ident2, ".csv"))
# Create volcano plot
volcano_plot <- EnhancedVolcano(markers,
lab = rownames(markers),
x = 'avg_log2FC',
y = 'p_val_adj',
title = paste(ident1, 'vs', ident2),
pCutoff = 0.05,
FCcutoff = 1,
pointSize = 1.5,
labSize = 4.0,
col = c('grey', 'darkgreen', 'blue', 'red'),
colAlpha = 0.5,
legendPosition = 'right',
legendLabSize = 10,
legendIconSize = 4.0,
drawConnectors = TRUE,
widthConnectors = 0.5)
print(volcano_plot)
png(paste0("volcano_", ident1, "_vs_", ident2, ".png"), width = 12, height = 10, units = "in", res = 300)
print(volcano_plot)
dev.off()
return(markers)
}
# Patient 1
p1_comparison <- perform_comparison_and_volcano(All_samples_Merged, "L1", "L2")
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
head(p1_comparison)
# Patient 2
p2_comparison <- perform_comparison_and_volcano(All_samples_Merged, "L3", "L4")
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
head(p2_comparison)
# Patient 3
p3_comparison_L5L6 <- perform_comparison_and_volcano(All_samples_Merged, "L5", "L6")
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
p3_comparison_L5L7 <- perform_comparison_and_volcano(All_samples_Merged, "L5", "L7")
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
p3_comparison_L6L7 <- perform_comparison_and_volcano(All_samples_Merged, "L6", "L7")
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
head(p3_comparison_L5L6)
head(p3_comparison_L5L7)
head(p3_comparison_L6L7)
NA
NA
perform_go_enrichment <- function(gene_list, gene_universe, title) {
ego <- enrichGO(gene = gene_list,
universe = gene_universe,
OrgDb = org.Hs.eg.db,
keyType = "SYMBOL",
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE)
p <- dotplot(ego, showCategory = 20, title = paste("GO -", title)) +
theme(axis.text.y = element_text(size = 8))
print(p)
png(paste0("GO_enrichment_", gsub(" ", "_", title), ".png"), width = 12, height = 8, units = "in", res = 300)
print(p)
dev.off()
return(ego)
}
perform_kegg_enrichment <- function(gene_list, gene_universe, title) {
# Convert gene symbols to Entrez IDs
entrez_ids <- bitr(gene_list, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID
universe_entrez <- bitr(gene_universe, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID
ekegg <- enrichKEGG(gene = entrez_ids,
universe = universe_entrez,
organism = 'hsa',
keyType = "kegg",
pvalueCutoff = 0.05,
pAdjustMethod = "BH")
p <- dotplot(ekegg, showCategory = 20, title = paste("KEGG -", title)) +
theme(axis.text.y = element_text(size = 8))
print(p)
png(paste0("KEGG_enrichment_", gsub(" ", "_", title), ".png"), width = 12, height = 8, units = "in", res = 300)
print(p)
dev.off()
return(ekegg)
}
gene_universe <- rownames(All_samples_Merged)
# Patient 1 (P1) comparison: L1 vs L2
upregulated_genes_P1 <- rownames(p1_comparison[p1_comparison$avg_log2FC > 1 & p1_comparison$p_val_adj < 0.05, ])
downregulated_genes_P1 <- rownames(p1_comparison[p1_comparison$avg_log2FC < -1 & p1_comparison$p_val_adj < 0.05, ])
go_up_P1 <- perform_go_enrichment(upregulated_genes_P1, gene_universe, "Upregulated Genes in L1 vs L2")
go_down_P1 <- perform_go_enrichment(downregulated_genes_P1, gene_universe, "Downregulated Genes in L1 vs L2")
kegg_up_P1 <- perform_kegg_enrichment(upregulated_genes_P1, gene_universe, "Upregulated Genes in L1 vs L2")
'select()' returned 1:1 mapping between keys and columns
Avis : 10.42% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 27.64% of input gene IDs are fail to map...Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...
Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...
kegg_down_P1 <- perform_kegg_enrichment(downregulated_genes_P1, gene_universe, "Downregulated Genes in L1 vs L2")
'select()' returned 1:many mapping between keys and columns
Avis : 9.97% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 27.64% of input gene IDs are fail to map...
# Patient 2 (P2) comparison: L3 vs L4
upregulated_genes_P2 <- rownames(p2_comparison[p2_comparison$avg_log2FC > 1 & p2_comparison$p_val_adj < 0.05, ])
downregulated_genes_P2 <- rownames(p2_comparison[p2_comparison$avg_log2FC < -1 & p2_comparison$p_val_adj < 0.05, ])
go_up_P2 <- perform_go_enrichment(upregulated_genes_P2, gene_universe, "Upregulated Genes in L3 vs L4")
go_down_P2 <- perform_go_enrichment(downregulated_genes_P2, gene_universe, "Downregulated Genes in L3 vs L4")
kegg_up_P2 <- perform_kegg_enrichment(upregulated_genes_P2, gene_universe, "Upregulated Genes in L3 vs L4")
'select()' returned 1:1 mapping between keys and columns
Avis : 10.94% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 27.64% of input gene IDs are fail to map...
kegg_down_P2 <- perform_kegg_enrichment(downregulated_genes_P2, gene_universe, "Downregulated Genes in L3 vs L4")
'select()' returned 1:many mapping between keys and columns
Avis : 13.7% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 27.64% of input gene IDs are fail to map...
# Patient 3 (P3) comparisons
# L5 vs L6
upregulated_genes_P3_L5L6 <- rownames(p3_comparison_L5L6[p3_comparison_L5L6$avg_log2FC > 1 & p3_comparison_L5L6$p_val_adj < 0.05, ])
downregulated_genes_P3_L5L6 <- rownames(p3_comparison_L5L6[p3_comparison_L5L6$avg_log2FC < -1 & p3_comparison_L5L6$p_val_adj < 0.05, ])
go_up_P3_L5L6 <- perform_go_enrichment(upregulated_genes_P3_L5L6, gene_universe, "Upregulated Genes in L5 vs L6")
go_down_P3_L5L6 <- perform_go_enrichment(downregulated_genes_P3_L5L6, gene_universe, "Downregulated Genes in L5 vs L6")
kegg_up_P3_L5L6 <- perform_kegg_enrichment(upregulated_genes_P3_L5L6, gene_universe, "Upregulated Genes in L5 vs L6")
'select()' returned 1:1 mapping between keys and columns
Avis : 9.53% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 27.64% of input gene IDs are fail to map...
kegg_down_P3_L5L6 <- perform_kegg_enrichment(downregulated_genes_P3_L5L6, gene_universe, "Downregulated Genes in L5 vs L6")
'select()' returned 1:1 mapping between keys and columns
Avis : 15.5% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 27.64% of input gene IDs are fail to map...
# L5 vs L7
upregulated_genes_P3_L5L7 <- rownames(p3_comparison_L5L7[p3_comparison_L5L7$avg_log2FC > 1 & p3_comparison_L5L7$p_val_adj < 0.05, ])
downregulated_genes_P3_L5L7 <- rownames(p3_comparison_L5L7[p3_comparison_L5L7$avg_log2FC < -1 & p3_comparison_L5L7$p_val_adj < 0.05, ])
go_up_P3_L5L7 <- perform_go_enrichment(upregulated_genes_P3_L5L7, gene_universe, "Upregulated Genes in L5 vs L7")
go_down_P3_L5L7 <- perform_go_enrichment(downregulated_genes_P3_L5L7, gene_universe, "Downregulated Genes in L5 vs L7")
kegg_up_P3_L5L7 <- perform_kegg_enrichment(upregulated_genes_P3_L5L7, gene_universe, "Upregulated Genes in L5 vs L7")
'select()' returned 1:1 mapping between keys and columns
Avis : 13.71% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 27.64% of input gene IDs are fail to map...
kegg_down_P3_L5L7 <- perform_kegg_enrichment(downregulated_genes_P3_L5L7, gene_universe, "Downregulated Genes in L5 vs L7")
'select()' returned 1:many mapping between keys and columns
Avis : 12.59% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 27.64% of input gene IDs are fail to map...
# L6 vs L7
upregulated_genes_P3_L6L7 <- rownames(p3_comparison_L6L7[p3_comparison_L6L7$avg_log2FC > 1 & p3_comparison_L6L7$p_val_adj < 0.05, ])
downregulated_genes_P3_L6L7 <- rownames(p3_comparison_L6L7[p3_comparison_L6L7$avg_log2FC < -1 & p3_comparison_L6L7$p_val_adj < 0.05, ])
go_up_P3_L6L7 <- perform_go_enrichment(upregulated_genes_P3_L6L7, gene_universe, "Upregulated Genes in L6 vs L7")
go_down_P3_L6L7 <- perform_go_enrichment(downregulated_genes_P3_L6L7, gene_universe, "Downregulated Genes in L6 vs L7")
kegg_up_P3_L6L7 <- perform_kegg_enrichment(upregulated_genes_P3_L6L7, gene_universe, "Upregulated Genes in L6 vs L7")
'select()' returned 1:1 mapping between keys and columns
Avis : 16.76% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 27.64% of input gene IDs are fail to map...
kegg_down_P3_L6L7 <- perform_kegg_enrichment(downregulated_genes_P3_L6L7, gene_universe, "Downregulated Genes in L6 vs L7")
'select()' returned 1:many mapping between keys and columns
Avis : 8.58% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 27.64% of input gene IDs are fail to map...
# Function to get top genes from a comparison
library(igraph)
library(STRINGdb)
library(ggraph)
library(tidyverse)
library(tibble)
get_top_genes <- function(comparison_result, n = 50) {
top_genes <- comparison_result %>%
rownames_to_column("gene") %>%
arrange(desc(abs(avg_log2FC))) %>%
head(n) %>%
pull(gene)
return(top_genes)
}
# Combine top genes from all comparisons
all_top_genes <- unique(c(
get_top_genes(p1_comparison),
get_top_genes(p2_comparison),
get_top_genes(p3_comparison_L5L6),
get_top_genes(p3_comparison_L5L7),
get_top_genes(p3_comparison_L6L7)
))
# Initialize STRINGdb
string_db <- STRINGdb$new(version="11", species=9606, score_threshold=700)
# Map genes to STRING identifiers
mapped_genes <- string_db$map(data.frame(gene=all_top_genes), "gene", removeUnmappedRows = TRUE)
Warning: we couldn't map to STRING 21% of your identifiers
# Get interactions
interactions <- string_db$get_interactions(mapped_genes$STRING_id)
# Create igraph object
g <- graph_from_data_frame(interactions, directed = FALSE)
# Calculate node degrees
V(g)$degree <- degree(g)
# Calculate betweenness centrality
V(g)$betweenness <- betweenness(g)
# Identify communities
communities <- cluster_louvain(g)
V(g)$community <- communities$membership
# Plot the network
set.seed(123) # for reproducibility
ggraph(g, layout = "fr") +
geom_edge_link(aes(edge_alpha = combined_score), show.legend = FALSE) +
geom_node_point(aes(color = factor(community), size = degree)) +
geom_node_text(aes(label = name), repel = TRUE, size = 3) +
scale_color_brewer(palette = "Set1") +
theme_void() +
labs(title = "Gene Interaction Network",
subtitle = "Based on top differentially expressed genes",
color = "Community",
size = "Degree")
# Save the plot
ggsave("gene_interaction_network.png", width = 12, height = 10, dpi = 300)
# Identify hub genes
hub_genes <- V(g)$name[order(V(g)$degree, decreasing = TRUE)][1:10]
cat("Top 10 hub genes:\n")
Top 10 hub genes:
print(hub_genes)
[1] "9606.ENSP00000385675" "9606.ENSP00000362795" "9606.ENSP00000426022" "9606.ENSP00000482259" "9606.ENSP00000229135"
[6] "9606.ENSP00000216341" "9606.ENSP00000245451" "9606.ENSP00000292301" "9606.ENSP00000398568" "9606.ENSP00000177694"
# Identify genes with high betweenness centrality
high_betweenness_genes <- V(g)$name[order(V(g)$betweenness, decreasing = TRUE)][1:10]
cat("\nTop 10 genes with high betweenness centrality:\n")
Top 10 genes with high betweenness centrality:
print(high_betweenness_genes)
[1] "9606.ENSP00000385675" "9606.ENSP00000426022" "9606.ENSP00000398568" "9606.ENSP00000362795" "9606.ENSP00000245451"
[6] "9606.ENSP00000482259" "9606.ENSP00000216341" "9606.ENSP00000381448" "9606.ENSP00000292301" "9606.ENSP00000262768"
# Calculate and print some network statistics
cat("\nNetwork Statistics:\n")
Network Statistics:
cat("Number of nodes:", vcount(g), "\n")
Number of nodes: 57
cat("Number of edges:", ecount(g), "\n")
Number of edges: 216
cat("Network density:", edge_density(g), "\n")
Network density: 0.1353383
cat("Average path length:", mean_distance(g), "\n")
Average path length: 3.02935
cat("Clustering coefficient:", transitivity(g), "\n")
Clustering coefficient: 0.450727
# Extract edge information
edges_df <- data.frame(
from = ends(g, E(g))[,1],
to = ends(g, E(g))[,2]
)
# Add edge attributes if any
edge_attrs <- edge_attr(g)
for (attr in names(edge_attrs)) {
edges_df[[attr]] <- edge_attrs[[attr]]
}
# Save the edges data frame
write.csv(edges_df, "gene_network_edges.csv", row.names = FALSE)
# Extract node information
nodes_df <- data.frame(
id = V(g)$name,
degree = degree(g),
betweenness = betweenness(g)
)
# Add other vertex attributes if any
vertex_attrs <- vertex_attr(g)
for (attr in names(vertex_attrs)) {
if (attr != "name") { # Skip 'name' as we already have it as 'id'
nodes_df[[attr]] <- vertex_attrs[[attr]]
}
}
# Save the nodes data frame
write.csv(nodes_df, "gene_network_nodes.csv", row.names = FALSE)
# Print a summary of the saved data
cat("Saved network data:\n")
Saved network data:
cat("Edges file: gene_network_edges.csv -", nrow(edges_df), "rows\n")
Edges file: gene_network_edges.csv - 216 rows
cat("Nodes file: gene_network_nodes.csv -", nrow(nodes_df), "rows\n")
Nodes file: gene_network_nodes.csv - 57 rows
# Function to get top genes from a comparison
library(igraph)
library(STRINGdb)
library(ggraph)
library(tidyverse)
library(tibble)
get_top_genes <- function(comparison_result, n = 50) {
top_genes <- comparison_result %>%
rownames_to_column("gene") %>%
arrange(desc(abs(avg_log2FC))) %>%
head(n) %>%
pull(gene)
return(top_genes)
}
# Combine top genes from all comparisons
all_top_genes <- unique(c(
get_top_genes(p1_comparison),
get_top_genes(p2_comparison),
get_top_genes(p3_comparison_L5L6),
get_top_genes(p3_comparison_L5L7),
get_top_genes(p3_comparison_L6L7)
))
# Initialize STRINGdb
string_db <- STRINGdb$new(version="11", species=9606, score_threshold=700)
# Map genes to STRING identifiers
mapped_genes <- string_db$map(data.frame(gene=all_top_genes), "gene", removeUnmappedRows = TRUE)
Warning: we couldn't map to STRING 21% of your identifiers
# Get interactions
interactions <- string_db$get_interactions(mapped_genes$STRING_id)
# Map STRING identifiers back to gene symbols
interactions$from <- mapped_genes$gene[match(interactions$from, mapped_genes$STRING_id)]
interactions$to <- mapped_genes$gene[match(interactions$to, mapped_genes$STRING_id)]
# Create igraph object
g <- graph_from_data_frame(interactions, directed = FALSE)
# Calculate node degrees
V(g)$degree <- degree(g)
# Calculate betweenness centrality
V(g)$betweenness <- betweenness(g)
# Identify communities
communities <- cluster_louvain(g)
V(g)$community <- communities$membership
# Plot the network
set.seed(123) # for reproducibility
ggraph(g, layout = "fr") +
geom_edge_link(aes(edge_alpha = combined_score), show.legend = FALSE) +
geom_node_point(aes(color = factor(community), size = degree)) +
geom_node_text(aes(label = name), repel = TRUE, size = 3) +
scale_color_brewer(palette = "Set1") +
theme_void() +
labs(title = "Gene Interaction Network",
subtitle = "Based on top differentially expressed genes",
color = "Community",
size = "Degree")
# Save the plot
ggsave("gene_interaction_network.png", width = 12, height = 10, dpi = 300)
# Identify hub genes
hub_genes <- V(g)$name[order(V(g)$degree, decreasing = TRUE)][1:10]
cat("Top 10 hub genes:\n")
Top 10 hub genes:
print(hub_genes)
[1] "IL6" "CXCR3" "GNGT2" "CCL4" "IFNG" "GZMB" "BMP4" "CCR2" "PRF1" "TBX21"
# Identify genes with high betweenness centrality
high_betweenness_genes <- V(g)$name[order(V(g)$betweenness, decreasing = TRUE)][1:10]
cat("\nTop 10 genes with high betweenness centrality:\n")
Top 10 genes with high betweenness centrality:
print(high_betweenness_genes)
[1] "IL6" "GNGT2" "PRF1" "CXCR3" "BMP4" "CCL4" "GZMB" "CST3" "CCR2" "TIMP2"
# Calculate and print some network statistics
cat("\nNetwork Statistics:\n")
Network Statistics:
cat("Number of nodes:", vcount(g), "\n")
Number of nodes: 57
cat("Number of edges:", ecount(g), "\n")
Number of edges: 216
cat("Network density:", edge_density(g), "\n")
Network density: 0.1353383
cat("Average path length:", mean_distance(g), "\n")
Average path length: 3.02935
cat("Clustering coefficient:", transitivity(g), "\n")
Clustering coefficient: 0.450727
# Extract edge information
edges_df <- data.frame(
from = ends(g, E(g))[,1],
to = ends(g, E(g))[,2]
)
# Add edge attributes if any
edge_attrs <- edge_attr(g)
for (attr in names(edge_attrs)) {
edges_df[[attr]] <- edge_attrs[[attr]]
}
# Save the edges data frame
write.csv(edges_df, "gene_network_edges.csv", row.names = FALSE)
# Extract node information
nodes_df <- data.frame(
id = V(g)$name,
degree = degree(g),
betweenness = betweenness(g)
)
# Add other vertex attributes if any
vertex_attrs <- vertex_attr(g)
for (attr in names(vertex_attrs)) {
if (attr != "name") { # Skip 'name' as we already have it as 'id'
nodes_df[[attr]] <- vertex_attrs[[attr]]
}
}
# Save the nodes data frame
write.csv(nodes_df, "gene_network_nodes.csv", row.names = FALSE)
# Print a summary of the saved data
cat("Saved network data:\n")
Saved network data:
cat("Edges file: gene_network_edges.csv -", nrow(edges_df), "rows\n")
Edges file: gene_network_edges.csv - 216 rows
cat("Nodes file: gene_network_nodes.csv -", nrow(nodes_df), "rows\n")
Nodes file: gene_network_nodes.csv - 57 rows
# Load required libraries
library(igraph)
library(ggraph)
library(tidyverse)
library(tibble)
library(org.Hs.eg.db)
library(GO.db)
library(AnnotationDbi)
library(dplyr)
# Function to get top genes from comparison results
get_top_genes <- function(comparison_result, n = 50) {
top_genes <- comparison_result %>%
tibble::rownames_to_column("gene") %>%
dplyr::arrange(desc(abs(avg_log2FC))) %>%
head(n) %>%
dplyr::pull(gene)
return(top_genes)
}
# Combine top genes from all comparisons
all_top_genes <- unique(c(
get_top_genes(p1_comparison),
get_top_genes(p2_comparison),
get_top_genes(p3_comparison_L5L6),
get_top_genes(p3_comparison_L5L7),
get_top_genes(p3_comparison_L6L7)
))
# Get GO terms for all top genes
go_terms <- AnnotationDbi::select(org.Hs.eg.db, keys = all_top_genes,
columns = c("SYMBOL", "GO", "ONTOLOGY"),
keytype = "SYMBOL")
'select()' returned 1:many mapping between keys and columns
# Filter for biological process GO terms and remove NA values
go_terms_bp <- go_terms %>%
dplyr::filter(ONTOLOGY == "BP") %>%
dplyr::filter(!is.na(GO))
# Create edges dataframe
edges <- go_terms_bp %>%
dplyr::select(from = SYMBOL, to = GO)
# Print summary of edges
print(paste("Number of edges:", nrow(edges)))
[1] "Number of edges: 2309"
print(head(edges))
# If edges dataframe is empty, stop here
if (nrow(edges) == 0) {
stop("No GO terms found for any genes. Cannot create network.")
}
# Create graph
g <- igraph::graph_from_data_frame(edges, directed = FALSE)
# Calculate node degrees
V(g)$degree <- igraph::degree(g)
# Calculate betweenness centrality
V(g)$betweenness <- igraph::betweenness(g)
# Identify communities
communities <- igraph::cluster_louvain(g)
V(g)$community <- communities$membership
# Get GO term descriptions
go_terms_desc <- AnnotationDbi::select(GO.db, keys = unique(edges$to),
columns = "TERM", keytype = "GOID")
'select()' returned 1:1 mapping between keys and columns
# Add GO term descriptions to the graph
V(g)$description <- go_terms_desc$TERM[match(V(g)$name, go_terms_desc$GOID)]
# Plot the network
set.seed(123) # for reproducibility
p <- ggraph(g, layout = "fr") +
geom_edge_link(alpha = 0.1) +
geom_node_point(aes(color = factor(community), size = degree)) +
geom_node_text(aes(label = ifelse(degree > quantile(degree, 0.95), name, "")),
repel = TRUE, size = 3) +
scale_color_brewer(palette = "Set1") +
theme_void() +
labs(title = "Gene-GO Term Interaction Network",
subtitle = "Based on top differentially expressed genes",
color = "Community",
size = "Degree")
# Save the plot
ggsave("gene_go_network.png", p, width = 12, height = 10, dpi = 300)
# Identify hub genes
hub_genes <- V(g)$name[V(g)$name %in% all_top_genes][order(V(g)$degree[V(g)$name %in% all_top_genes], decreasing = TRUE)][1:10]
cat("Top 10 hub genes:\n")
Top 10 hub genes:
print(hub_genes)
[1] "BMP4" "IL6" "SNCA" "IFNG" "NKX2-5" "SIX1" "WNT11" "SOX4" "CCR2" "IL1A"
# Identify genes with high betweenness centrality
high_betweenness_genes <- V(g)$name[V(g)$name %in% all_top_genes][order(V(g)$betweenness[V(g)$name %in% all_top_genes], decreasing = TRUE)][1:10]
cat("\nTop 10 genes with high betweenness centrality:\n")
Top 10 genes with high betweenness centrality:
print(high_betweenness_genes)
[1] "BMP4" "IL6" "SNCA" "IFNG" "NKX2-5" "SIX1" "WNT11" "IL1A" "HMGA2" "SOX4"
# Calculate and print some network statistics
cat("\nNetwork Statistics:\n")
Network Statistics:
cat("Number of nodes:", igraph::vcount(g), "\n")
Number of nodes: 1572
cat("Number of edges:", igraph::ecount(g), "\n")
Number of edges: 2309
cat("Network density:", igraph::edge_density(g), "\n")
Network density: 0.001869929
cat("Average path length:", igraph::mean_distance(g), "\n")
Average path length: 4.994568
cat("Clustering coefficient:", igraph::transitivity(g), "\n")
Clustering coefficient: 0
# Extract edge information
edges_df <- igraph::as_data_frame(g, what = "edges")
# Save the edges data frame
write.csv(edges_df, "gene_go_edges.csv", row.names = FALSE)
# Extract node information
nodes_df <- igraph::as_data_frame(g, what = "vertices")
# Save the nodes data frame
write.csv(nodes_df, "gene_go_nodes.csv", row.names = FALSE)
# Print a summary of the saved data
cat("Saved network data:\n")
Saved network data:
cat("Edges file: gene_go_edges.csv -", nrow(edges_df), "rows\n")
Edges file: gene_go_edges.csv - 2309 rows
cat("Nodes file: gene_go_nodes.csv -", nrow(nodes_df), "rows\n")
Nodes file: gene_go_nodes.csv - 1572 rows
# Print top GO terms
top_go_terms <- V(g)$name[!(V(g)$name %in% all_top_genes)][order(V(g)$degree[!(V(g)$name %in% all_top_genes)], decreasing = TRUE)][1:20]
cat("\nTop 20 GO terms:\n")
Top 20 GO terms:
for (term in top_go_terms) {
cat(term, "-", V(g)$description[V(g)$name == term], "\n")
}
GO:0045944 - positive regulation of transcription by RNA polymerase II
GO:0000122 - negative regulation of transcription by RNA polymerase II
GO:0007165 - signal transduction
GO:0006357 - regulation of transcription by RNA polymerase II
GO:0045893 - positive regulation of DNA-templated transcription
GO:0045892 - negative regulation of DNA-templated transcription
GO:0010628 - positive regulation of gene expression
GO:0007155 - cell adhesion
GO:0006955 - immune response
GO:0007186 - G protein-coupled receptor signaling pathway
GO:0030154 - cell differentiation
GO:0006954 - inflammatory response
GO:0008284 - positive regulation of cell population proliferation
GO:0006915 - apoptotic process
GO:0045087 - innate immune response
GO:0043066 - negative regulation of apoptotic process
GO:0002250 - adaptive immune response
GO:0090090 - negative regulation of canonical Wnt signaling pathway
GO:0070374 - positive regulation of ERK1 and ERK2 cascade
GO:0098609 - cell-cell adhesion
#save(All_samples_Merged, file = "All_samples_Merged_DE.Robj")