#Differential Expression Analysis
#Load Seurat Object L7
load("../../0-IMP-OBJECTS/Harmony_integrated_All_samples_Merged_with_PBMC10x.Robj")
All_samples_Merged
An object of class Seurat
64169 features across 59355 samples within 6 assays
Active assay: SCT (27417 features, 3000 variable features)
3 layers present: counts, data, scale.data
5 other assays present: RNA, ADT, prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3
6 dimensional reductions calculated: integrated_dr, ref.umap, pca, umap, harmony, umap.harmony
DimPlot(All_samples_Merged, reduction = "umap.harmony", group.by = "cell_line",label = T, label.box = T)
#Differential Expression Analysis
DefaultAssay(All_samples_Merged) <- "SCT"
Idents(All_samples_Merged) <- "cell_line"
all_markers <- FindAllMarkers(All_samples_Merged, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
Calculating cluster L1
Calculating cluster L2
Calculating cluster L3
Calculating cluster L4
Calculating cluster L5
Calculating cluster L6
Calculating cluster L7
Calculating cluster PBMC
Calculating cluster PBMC_10x
write.csv(all_markers, "all_markers.csv")
head(all_markers)
NA
NA
top10_markers <- all_markers %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC) %>%
arrange(cluster, desc(avg_log2FC))
top10_genes <- unique(top10_markers$gene)
heatmap_data <- GetAssayData(All_samples_Merged, assay = "SCT", slot = "data")[top10_genes, ]
Avis : The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
Please use the `layer` argument instead.
heatmap_data <- heatmap_data - rowMeans(heatmap_data)
cell_line_colors <- rainbow(length(unique(All_samples_Merged$cell_line)))
names(cell_line_colors) <- unique(All_samples_Merged$cell_line)
annotation_colors <- list(cell_line = cell_line_colors)
p <- pheatmap(heatmap_data,
show_rownames = TRUE,
show_colnames = FALSE,
annotation_col = All_samples_Merged@meta.data[, "cell_line", drop = FALSE],
annotation_colors = annotation_colors,
main = "Top 10 Upregulated Genes per Cell Line (Sorted by avg_log2FC)",
fontsize_row = 6,
treeheight_col = 0,
cluster_rows = FALSE,
cluster_cols = FALSE)
print(p)
png("heatmap_top10_log2fc_sorted.png", width = 12, height = 8, units = "in", res = 300)
print(p)
dev.off()
png
2
top10_markers_seurat <- all_markers %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC)
top10_genes_seurat <- unique(top10_markers_seurat$gene)
heatmap_data_seurat <- GetAssayData(All_samples_Merged, assay = "SCT", slot = "data")[top10_genes_seurat, ]
heatmap_data_seurat <- heatmap_data_seurat - rowMeans(heatmap_data_seurat)
p2 <- pheatmap(heatmap_data_seurat,
show_rownames = TRUE,
show_colnames = FALSE,
annotation_col = All_samples_Merged@meta.data[, "cell_line", drop = FALSE],
annotation_colors = annotation_colors,
main = "Top 10 Markers per Cell Line (Seurat Default)",
fontsize_row = 6,
treeheight_col = 0)
print(p2)
png("heatmap_top10_seurat.png", width = 12, height = 8, units = "in", res = 300)
print(p2)
dev.off()
png
2
library(EnhancedVolcano)
Le chargement a nécessité le package : ggrepel
perform_comparison_and_volcano <- function(All_samples_Merged, ident1, ident2) {
Idents(All_samples_Merged) <- "cell_line"
markers <- FindMarkers(All_samples_Merged, ident.1 = ident1, ident.2 = ident2, assay = "SCT")
write.csv(markers, paste0("comparison_", ident1, "_vs_", ident2, ".csv"))
# Create volcano plot
volcano_plot <- EnhancedVolcano(markers,
lab = rownames(markers),
x = 'avg_log2FC',
y = 'p_val_adj',
title = paste(ident1, 'vs', ident2),
pCutoff = 0.05,
FCcutoff = 1,
pointSize = 1.5,
labSize = 4.0,
col = c('grey', 'darkgreen', 'blue', 'red'),
colAlpha = 0.5,
legendPosition = 'right',
legendLabSize = 10,
legendIconSize = 4.0,
drawConnectors = TRUE,
widthConnectors = 0.5)
print(volcano_plot)
png(paste0("volcano_", ident1, "_vs_", ident2, ".png"), width = 12, height = 10, units = "in", res = 300)
print(volcano_plot)
dev.off()
return(markers)
}
# Patient 1
p1_comparison <- perform_comparison_and_volcano(All_samples_Merged, "L1", "L2")
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
head(p1_comparison)
# Patient 2
p2_comparison <- perform_comparison_and_volcano(All_samples_Merged, "L3", "L4")
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
head(p2_comparison)
# Patient 3
p3_comparison_L5L6 <- perform_comparison_and_volcano(All_samples_Merged, "L5", "L6")
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
p3_comparison_L5L7 <- perform_comparison_and_volcano(All_samples_Merged, "L5", "L7")
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
p3_comparison_L6L7 <- perform_comparison_and_volcano(All_samples_Merged, "L6", "L7")
Avis : One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
head(p3_comparison_L5L6)
head(p3_comparison_L5L7)
head(p3_comparison_L6L7)
NA
NA
perform_go_enrichment <- function(gene_list, gene_universe, title) {
ego <- enrichGO(gene = gene_list,
universe = gene_universe,
OrgDb = org.Hs.eg.db,
keyType = "SYMBOL",
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE)
p <- dotplot(ego, showCategory = 20, title = paste("GO -", title)) +
theme(axis.text.y = element_text(size = 8))
print(p)
png(paste0("GO_enrichment_", gsub(" ", "_", title), ".png"), width = 12, height = 8, units = "in", res = 300)
print(p)
dev.off()
return(ego)
}
perform_kegg_enrichment <- function(gene_list, gene_universe, title) {
# Convert gene symbols to Entrez IDs
entrez_ids <- bitr(gene_list, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID
universe_entrez <- bitr(gene_universe, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID
ekegg <- enrichKEGG(gene = entrez_ids,
universe = universe_entrez,
organism = 'hsa',
keyType = "kegg",
pvalueCutoff = 0.05,
pAdjustMethod = "BH")
p <- dotplot(ekegg, showCategory = 20, title = paste("KEGG -", title)) +
theme(axis.text.y = element_text(size = 8))
print(p)
png(paste0("KEGG_enrichment_", gsub(" ", "_", title), ".png"), width = 12, height = 8, units = "in", res = 300)
print(p)
dev.off()
return(ekegg)
}
gene_universe <- rownames(All_samples_Merged)
# Patient 1 (P1) comparison: L1 vs L2
upregulated_genes_P1 <- rownames(p1_comparison[p1_comparison$avg_log2FC > 1 & p1_comparison$p_val_adj < 0.05, ])
downregulated_genes_P1 <- rownames(p1_comparison[p1_comparison$avg_log2FC < -1 & p1_comparison$p_val_adj < 0.05, ])
go_up_P1 <- perform_go_enrichment(upregulated_genes_P1, gene_universe, "Upregulated Genes in L1 vs L2")
go_down_P1 <- perform_go_enrichment(downregulated_genes_P1, gene_universe, "Downregulated Genes in L1 vs L2")
kegg_up_P1 <- perform_kegg_enrichment(upregulated_genes_P1, gene_universe, "Upregulated Genes in L1 vs L2")
'select()' returned 1:1 mapping between keys and columns
Avis : 14.13% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 28.75% of input gene IDs are fail to map...Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...
Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...
kegg_down_P1 <- perform_kegg_enrichment(downregulated_genes_P1, gene_universe, "Downregulated Genes in L1 vs L2")
'select()' returned 1:many mapping between keys and columns
Avis : 12.15% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 28.75% of input gene IDs are fail to map...
# Patient 2 (P2) comparison: L3 vs L4
upregulated_genes_P2 <- rownames(p2_comparison[p2_comparison$avg_log2FC > 1 & p2_comparison$p_val_adj < 0.05, ])
downregulated_genes_P2 <- rownames(p2_comparison[p2_comparison$avg_log2FC < -1 & p2_comparison$p_val_adj < 0.05, ])
go_up_P2 <- perform_go_enrichment(upregulated_genes_P2, gene_universe, "Upregulated Genes in L3 vs L4")
go_down_P2 <- perform_go_enrichment(downregulated_genes_P2, gene_universe, "Downregulated Genes in L3 vs L4")
kegg_up_P2 <- perform_kegg_enrichment(upregulated_genes_P2, gene_universe, "Upregulated Genes in L3 vs L4")
'select()' returned 1:1 mapping between keys and columns
Avis : 16.94% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 28.75% of input gene IDs are fail to map...
kegg_down_P2 <- perform_kegg_enrichment(downregulated_genes_P2, gene_universe, "Downregulated Genes in L3 vs L4")
'select()' returned 1:many mapping between keys and columns
Avis : 13.02% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 28.75% of input gene IDs are fail to map...
# Patient 3 (P3) comparisons
# L5 vs L6
upregulated_genes_P3_L5L6 <- rownames(p3_comparison_L5L6[p3_comparison_L5L6$avg_log2FC > 1 & p3_comparison_L5L6$p_val_adj < 0.05, ])
downregulated_genes_P3_L5L6 <- rownames(p3_comparison_L5L6[p3_comparison_L5L6$avg_log2FC < -1 & p3_comparison_L5L6$p_val_adj < 0.05, ])
go_up_P3_L5L6 <- perform_go_enrichment(upregulated_genes_P3_L5L6, gene_universe, "Upregulated Genes in L5 vs L6")
go_down_P3_L5L6 <- perform_go_enrichment(downregulated_genes_P3_L5L6, gene_universe, "Downregulated Genes in L5 vs L6")
kegg_up_P3_L5L6 <- perform_kegg_enrichment(upregulated_genes_P3_L5L6, gene_universe, "Upregulated Genes in L5 vs L6")
'select()' returned 1:1 mapping between keys and columns
Avis : 11.77% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 28.75% of input gene IDs are fail to map...
kegg_down_P3_L5L6 <- perform_kegg_enrichment(downregulated_genes_P3_L5L6, gene_universe, "Downregulated Genes in L5 vs L6")
'select()' returned 1:1 mapping between keys and columns
Avis : 18.73% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 28.75% of input gene IDs are fail to map...
# L5 vs L7
upregulated_genes_P3_L5L7 <- rownames(p3_comparison_L5L7[p3_comparison_L5L7$avg_log2FC > 1 & p3_comparison_L5L7$p_val_adj < 0.05, ])
downregulated_genes_P3_L5L7 <- rownames(p3_comparison_L5L7[p3_comparison_L5L7$avg_log2FC < -1 & p3_comparison_L5L7$p_val_adj < 0.05, ])
go_up_P3_L5L7 <- perform_go_enrichment(upregulated_genes_P3_L5L7, gene_universe, "Upregulated Genes in L5 vs L7")
go_down_P3_L5L7 <- perform_go_enrichment(downregulated_genes_P3_L5L7, gene_universe, "Downregulated Genes in L5 vs L7")
kegg_up_P3_L5L7 <- perform_kegg_enrichment(upregulated_genes_P3_L5L7, gene_universe, "Upregulated Genes in L5 vs L7")
'select()' returned 1:1 mapping between keys and columns
Avis : 16.38% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 28.75% of input gene IDs are fail to map...
kegg_down_P3_L5L7 <- perform_kegg_enrichment(downregulated_genes_P3_L5L7, gene_universe, "Downregulated Genes in L5 vs L7")
'select()' returned 1:many mapping between keys and columns
Avis : 14.49% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 28.75% of input gene IDs are fail to map...
# L6 vs L7
upregulated_genes_P3_L6L7 <- rownames(p3_comparison_L6L7[p3_comparison_L6L7$avg_log2FC > 1 & p3_comparison_L6L7$p_val_adj < 0.05, ])
downregulated_genes_P3_L6L7 <- rownames(p3_comparison_L6L7[p3_comparison_L6L7$avg_log2FC < -1 & p3_comparison_L6L7$p_val_adj < 0.05, ])
go_up_P3_L6L7 <- perform_go_enrichment(upregulated_genes_P3_L6L7, gene_universe, "Upregulated Genes in L6 vs L7")
go_down_P3_L6L7 <- perform_go_enrichment(downregulated_genes_P3_L6L7, gene_universe, "Downregulated Genes in L6 vs L7")
kegg_up_P3_L6L7 <- perform_kegg_enrichment(upregulated_genes_P3_L6L7, gene_universe, "Upregulated Genes in L6 vs L7")
'select()' returned 1:1 mapping between keys and columns
Avis : 19.91% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 28.75% of input gene IDs are fail to map...
kegg_down_P3_L6L7 <- perform_kegg_enrichment(downregulated_genes_P3_L6L7, gene_universe, "Downregulated Genes in L6 vs L7")
'select()' returned 1:many mapping between keys and columns
Avis : 11.42% of input gene IDs are fail to map...'select()' returned 1:many mapping between keys and columns
Avis : 28.75% of input gene IDs are fail to map...
# Function to get top genes from a comparison
library(igraph)
Attachement du package : ‘igraph’
L'objet suivant est masqué depuis ‘package:IRanges’:
union
L'objet suivant est masqué depuis ‘package:S4Vectors’:
union
Les objets suivants sont masqués depuis ‘package:BiocGenerics’:
normalize, path, union
L'objet suivant est masqué depuis ‘package:clusterProfiler’:
simplify
Les objets suivants sont masqués depuis ‘package:dplyr’:
as_data_frame, groups, union
L'objet suivant est masqué depuis ‘package:Seurat’:
components
Les objets suivants sont masqués depuis ‘package:stats’:
decompose, spectrum
L'objet suivant est masqué depuis ‘package:base’:
union
library(STRINGdb)
library(ggraph)
Attachement du package : ‘ggraph’
L'objet suivant est masqué depuis ‘package:sp’:
geometry
library(tidyverse)
── Attaching core tidyverse packages ───────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ lubridate 1.9.3 ✔ tibble 3.2.1
✔ purrr 1.0.2 ✔ tidyr 1.3.1
✔ readr 2.1.5 ── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ lubridate::%--%() masks igraph::%--%()
✖ lubridate::%within%() masks IRanges::%within%()
✖ tibble::as_data_frame() masks igraph::as_data_frame(), dplyr::as_data_frame()
✖ IRanges::collapse() masks dplyr::collapse()
✖ Biobase::combine() masks BiocGenerics::combine(), dplyr::combine()
✖ purrr::compose() masks igraph::compose()
✖ tidyr::crossing() masks igraph::crossing()
✖ IRanges::desc() masks dplyr::desc()
✖ tidyr::expand() masks S4Vectors::expand()
✖ clusterProfiler::filter() masks dplyr::filter(), stats::filter()
✖ S4Vectors::first() masks dplyr::first()
✖ dplyr::lag() masks stats::lag()
✖ BiocGenerics::Position() masks ggplot2::Position(), base::Position()
✖ purrr::reduce() masks IRanges::reduce()
✖ S4Vectors::rename() masks clusterProfiler::rename(), dplyr::rename()
✖ lubridate::second() masks S4Vectors::second()
✖ lubridate::second<-() masks S4Vectors::second<-()
✖ AnnotationDbi::select() masks clusterProfiler::select(), dplyr::select()
✖ purrr::simplify() masks igraph::simplify(), clusterProfiler::simplify()
✖ IRanges::slice() masks clusterProfiler::slice(), dplyr::slice()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(tibble)
get_top_genes <- function(comparison_result, n = 50) {
top_genes <- comparison_result %>%
rownames_to_column("gene") %>%
arrange(desc(abs(avg_log2FC))) %>%
head(n) %>%
pull(gene)
return(top_genes)
}
# Combine top genes from all comparisons
all_top_genes <- unique(c(
get_top_genes(p1_comparison),
get_top_genes(p2_comparison),
get_top_genes(p3_comparison_L5L6),
get_top_genes(p3_comparison_L5L7),
get_top_genes(p3_comparison_L6L7)
))
# Initialize STRINGdb
string_db <- STRINGdb$new(version="11", species=9606, score_threshold=700)
# Map genes to STRING identifiers
mapped_genes <- string_db$map(data.frame(gene=all_top_genes), "gene", removeUnmappedRows = TRUE)
essai de l'URL 'https://stringdb-static.org/download/protein.aliases.v11.0/9606.protein.aliases.v11.0.txt.gz'
Content type 'application/octet-stream' length 13253303 bytes (12.6 MB)
==================================================
downloaded 12.6 MB
essai de l'URL 'https://stringdb-static.org/download/protein.info.v11.0/9606.protein.info.v11.0.txt.gz'
Content type 'application/octet-stream' length 1894048 bytes (1.8 MB)
==================================================
downloaded 1.8 MB
Warning: we couldn't map to STRING 24% of your identifiers
# Get interactions
interactions <- string_db$get_interactions(mapped_genes$STRING_id)
essai de l'URL 'https://stringdb-static.org/download/protein.links.v11.0/9606.protein.links.v11.0.txt.gz'
Content type 'application/octet-stream' length 71241372 bytes (67.9 MB)
==================================================
downloaded 67.9 MB
# Create igraph object
g <- graph_from_data_frame(interactions, directed = FALSE)
# Calculate node degrees
V(g)$degree <- degree(g)
# Calculate betweenness centrality
V(g)$betweenness <- betweenness(g)
# Identify communities
communities <- cluster_louvain(g)
V(g)$community <- communities$membership
# Plot the network
set.seed(123) # for reproducibility
ggraph(g, layout = "fr") +
geom_edge_link(aes(edge_alpha = combined_score), show.legend = FALSE) +
geom_node_point(aes(color = factor(community), size = degree)) +
geom_node_text(aes(label = name), repel = TRUE, size = 3) +
scale_color_brewer(palette = "Set1") +
theme_void() +
labs(title = "Gene Interaction Network",
subtitle = "Based on top differentially expressed genes",
color = "Community",
size = "Degree")
# Save the plot
ggsave("gene_interaction_network.png", width = 12, height = 10, dpi = 300)
# Identify hub genes
hub_genes <- V(g)$name[order(V(g)$degree, decreasing = TRUE)][1:10]
cat("Top 10 hub genes:\n")
Top 10 hub genes:
print(hub_genes)
[1] "9606.ENSP00000385675" "9606.ENSP00000362795" "9606.ENSP00000248564" "9606.ENSP00000426022"
[5] "9606.ENSP00000482259" "9606.ENSP00000292301" "9606.ENSP00000216341" "9606.ENSP00000229135"
[9] "9606.ENSP00000398568" "9606.ENSP00000177694"
# Identify genes with high betweenness centrality
high_betweenness_genes <- V(g)$name[order(V(g)$betweenness, decreasing = TRUE)][1:10]
cat("\nTop 10 genes with high betweenness centrality:\n")
Top 10 genes with high betweenness centrality:
print(high_betweenness_genes)
[1] "9606.ENSP00000385675" "9606.ENSP00000362795" "9606.ENSP00000398568" "9606.ENSP00000245451"
[5] "9606.ENSP00000216341" "9606.ENSP00000482259" "9606.ENSP00000292301" "9606.ENSP00000248564"
[9] "9606.ENSP00000426022" "9606.ENSP00000228280"
# Calculate and print some network statistics
cat("\nNetwork Statistics:\n")
Network Statistics:
cat("Number of nodes:", vcount(g), "\n")
Number of nodes: 60
cat("Number of edges:", ecount(g), "\n")
Number of edges: 228
cat("Network density:", edge_density(g), "\n")
Network density: 0.1288136
cat("Average path length:", mean_distance(g), "\n")
Average path length: 3.090728
cat("Clustering coefficient:", transitivity(g), "\n")
Clustering coefficient: 0.4502229
# Extract edge information
edges_df <- data.frame(
from = ends(g, E(g))[,1],
to = ends(g, E(g))[,2]
)
# Add edge attributes if any
edge_attrs <- edge_attr(g)
for (attr in names(edge_attrs)) {
edges_df[[attr]] <- edge_attrs[[attr]]
}
# Save the edges data frame
write.csv(edges_df, "gene_network_edges.csv", row.names = FALSE)
# Extract node information
nodes_df <- data.frame(
id = V(g)$name,
degree = degree(g),
betweenness = betweenness(g)
)
# Add other vertex attributes if any
vertex_attrs <- vertex_attr(g)
for (attr in names(vertex_attrs)) {
if (attr != "name") { # Skip 'name' as we already have it as 'id'
nodes_df[[attr]] <- vertex_attrs[[attr]]
}
}
# Save the nodes data frame
write.csv(nodes_df, "gene_network_nodes.csv", row.names = FALSE)
# Print a summary of the saved data
cat("Saved network data:\n")
Saved network data:
cat("Edges file: gene_network_edges.csv -", nrow(edges_df), "rows\n")
Edges file: gene_network_edges.csv - 228 rows
cat("Nodes file: gene_network_nodes.csv -", nrow(nodes_df), "rows\n")
Nodes file: gene_network_nodes.csv - 60 rows
# Function to get top genes from a comparison
library(igraph)
library(STRINGdb)
library(ggraph)
library(tidyverse)
library(tibble)
get_top_genes <- function(comparison_result, n = 50) {
top_genes <- comparison_result %>%
rownames_to_column("gene") %>%
arrange(desc(abs(avg_log2FC))) %>%
head(n) %>%
pull(gene)
return(top_genes)
}
# Combine top genes from all comparisons
all_top_genes <- unique(c(
get_top_genes(p1_comparison),
get_top_genes(p2_comparison),
get_top_genes(p3_comparison_L5L6),
get_top_genes(p3_comparison_L5L7),
get_top_genes(p3_comparison_L6L7)
))
# Initialize STRINGdb
string_db <- STRINGdb$new(version="11", species=9606, score_threshold=700)
# Map genes to STRING identifiers
mapped_genes <- string_db$map(data.frame(gene=all_top_genes), "gene", removeUnmappedRows = TRUE)
Warning: we couldn't map to STRING 24% of your identifiers
# Get interactions
interactions <- string_db$get_interactions(mapped_genes$STRING_id)
# Map STRING identifiers back to gene symbols
interactions$from <- mapped_genes$gene[match(interactions$from, mapped_genes$STRING_id)]
interactions$to <- mapped_genes$gene[match(interactions$to, mapped_genes$STRING_id)]
# Create igraph object
g <- graph_from_data_frame(interactions, directed = FALSE)
# Calculate node degrees
V(g)$degree <- degree(g)
# Calculate betweenness centrality
V(g)$betweenness <- betweenness(g)
# Identify communities
communities <- cluster_louvain(g)
V(g)$community <- communities$membership
# Plot the network
set.seed(123) # for reproducibility
ggraph(g, layout = "fr") +
geom_edge_link(aes(edge_alpha = combined_score), show.legend = FALSE) +
geom_node_point(aes(color = factor(community), size = degree)) +
geom_node_text(aes(label = name), repel = TRUE, size = 3) +
scale_color_brewer(palette = "Set1") +
theme_void() +
labs(title = "Gene Interaction Network",
subtitle = "Based on top differentially expressed genes",
color = "Community",
size = "Degree")
# Save the plot
ggsave("gene_interaction_network.png", width = 12, height = 10, dpi = 300)
# Identify hub genes
hub_genes <- V(g)$name[order(V(g)$degree, decreasing = TRUE)][1:10]
cat("Top 10 hub genes:\n")
Top 10 hub genes:
print(hub_genes)
[1] "IL6" "CXCR3" "GNG11" "GNGT2" "CCL4" "CCR2" "GZMB" "IFNG" "PRF1" "TBX21"
# Identify genes with high betweenness centrality
high_betweenness_genes <- V(g)$name[order(V(g)$betweenness, decreasing = TRUE)][1:10]
cat("\nTop 10 genes with high betweenness centrality:\n")
Top 10 genes with high betweenness centrality:
print(high_betweenness_genes)
[1] "IL6" "CXCR3" "PRF1" "BMP4" "GZMB" "CCL4" "CCR2" "GNG11" "GNGT2" "KITLG"
# Calculate and print some network statistics
cat("\nNetwork Statistics:\n")
Network Statistics:
cat("Number of nodes:", vcount(g), "\n")
Number of nodes: 60
cat("Number of edges:", ecount(g), "\n")
Number of edges: 228
cat("Network density:", edge_density(g), "\n")
Network density: 0.1288136
cat("Average path length:", mean_distance(g), "\n")
Average path length: 3.090728
cat("Clustering coefficient:", transitivity(g), "\n")
Clustering coefficient: 0.4502229
# Extract edge information
edges_df <- data.frame(
from = ends(g, E(g))[,1],
to = ends(g, E(g))[,2]
)
# Add edge attributes if any
edge_attrs <- edge_attr(g)
for (attr in names(edge_attrs)) {
edges_df[[attr]] <- edge_attrs[[attr]]
}
# Save the edges data frame
write.csv(edges_df, "gene_network_edges.csv", row.names = FALSE)
# Extract node information
nodes_df <- data.frame(
id = V(g)$name,
degree = degree(g),
betweenness = betweenness(g)
)
# Add other vertex attributes if any
vertex_attrs <- vertex_attr(g)
for (attr in names(vertex_attrs)) {
if (attr != "name") { # Skip 'name' as we already have it as 'id'
nodes_df[[attr]] <- vertex_attrs[[attr]]
}
}
# Save the nodes data frame
write.csv(nodes_df, "gene_network_nodes.csv", row.names = FALSE)
# Print a summary of the saved data
cat("Saved network data:\n")
Saved network data:
cat("Edges file: gene_network_edges.csv -", nrow(edges_df), "rows\n")
Edges file: gene_network_edges.csv - 228 rows
cat("Nodes file: gene_network_nodes.csv -", nrow(nodes_df), "rows\n")
Nodes file: gene_network_nodes.csv - 60 rows
# Load required libraries
library(igraph)
library(ggraph)
library(tidyverse)
library(tibble)
library(org.Hs.eg.db)
library(GO.db)
library(AnnotationDbi)
library(dplyr)
# Function to get top genes from comparison results
get_top_genes <- function(comparison_result, n = 50) {
top_genes <- comparison_result %>%
tibble::rownames_to_column("gene") %>%
dplyr::arrange(desc(abs(avg_log2FC))) %>%
head(n) %>%
dplyr::pull(gene)
return(top_genes)
}
# Combine top genes from all comparisons
all_top_genes <- unique(c(
get_top_genes(p1_comparison),
get_top_genes(p2_comparison),
get_top_genes(p3_comparison_L5L6),
get_top_genes(p3_comparison_L5L7),
get_top_genes(p3_comparison_L6L7)
))
# Get GO terms for all top genes
go_terms <- AnnotationDbi::select(org.Hs.eg.db, keys = all_top_genes,
columns = c("SYMBOL", "GO", "ONTOLOGY"),
keytype = "SYMBOL")
'select()' returned 1:many mapping between keys and columns
# Filter for biological process GO terms and remove NA values
go_terms_bp <- go_terms %>%
dplyr::filter(ONTOLOGY == "BP") %>%
dplyr::filter(!is.na(GO))
# Create edges dataframe
edges <- go_terms_bp %>%
dplyr::select(from = SYMBOL, to = GO)
# Print summary of edges
print(paste("Number of edges:", nrow(edges)))
[1] "Number of edges: 2401"
print(head(edges))
# If edges dataframe is empty, stop here
if (nrow(edges) == 0) {
stop("No GO terms found for any genes. Cannot create network.")
}
# Create graph
g <- igraph::graph_from_data_frame(edges, directed = FALSE)
# Calculate node degrees
V(g)$degree <- igraph::degree(g)
# Calculate betweenness centrality
V(g)$betweenness <- igraph::betweenness(g)
# Identify communities
communities <- igraph::cluster_louvain(g)
V(g)$community <- communities$membership
# Get GO term descriptions
go_terms_desc <- AnnotationDbi::select(GO.db, keys = unique(edges$to),
columns = "TERM", keytype = "GOID")
'select()' returned 1:1 mapping between keys and columns
# Add GO term descriptions to the graph
V(g)$description <- go_terms_desc$TERM[match(V(g)$name, go_terms_desc$GOID)]
# Plot the network
set.seed(123) # for reproducibility
p <- ggraph(g, layout = "fr") +
geom_edge_link(alpha = 0.1) +
geom_node_point(aes(color = factor(community), size = degree)) +
geom_node_text(aes(label = ifelse(degree > quantile(degree, 0.95), name, "")),
repel = TRUE, size = 3) +
scale_color_brewer(palette = "Set1") +
theme_void() +
labs(title = "Gene-GO Term Interaction Network",
subtitle = "Based on top differentially expressed genes",
color = "Community",
size = "Degree")
# Save the plot
ggsave("gene_go_network.png", p, width = 12, height = 10, dpi = 300)
# Identify hub genes
hub_genes <- V(g)$name[V(g)$name %in% all_top_genes][order(V(g)$degree[V(g)$name %in% all_top_genes], decreasing = TRUE)][1:10]
cat("Top 10 hub genes:\n")
Top 10 hub genes:
print(hub_genes)
[1] "BMP4" "FGFR2" "IL6" "SNCA" "IFNG" "NKX2-5" "SIX1" "WNT11" "TWIST1" "AR"
# Identify genes with high betweenness centrality
high_betweenness_genes <- V(g)$name[V(g)$name %in% all_top_genes][order(V(g)$betweenness[V(g)$name %in% all_top_genes], decreasing = TRUE)][1:10]
cat("\nTop 10 genes with high betweenness centrality:\n")
Top 10 genes with high betweenness centrality:
print(high_betweenness_genes)
[1] "BMP4" "IL6" "SNCA" "FGFR2" "IFNG" "NKX2-5" "AR" "SIX1" "WNT11" "TWIST1"
# Calculate and print some network statistics
cat("\nNetwork Statistics:\n")
Network Statistics:
cat("Number of nodes:", igraph::vcount(g), "\n")
Number of nodes: 1609
cat("Number of edges:", igraph::ecount(g), "\n")
Number of edges: 2401
cat("Network density:", igraph::edge_density(g), "\n")
Network density: 0.001856009
cat("Average path length:", igraph::mean_distance(g), "\n")
Average path length: 4.87717
cat("Clustering coefficient:", igraph::transitivity(g), "\n")
Clustering coefficient: 0
# Extract edge information
edges_df <- igraph::as_data_frame(g, what = "edges")
# Save the edges data frame
write.csv(edges_df, "gene_go_edges.csv", row.names = FALSE)
# Extract node information
nodes_df <- igraph::as_data_frame(g, what = "vertices")
# Save the nodes data frame
write.csv(nodes_df, "gene_go_nodes.csv", row.names = FALSE)
# Print a summary of the saved data
cat("Saved network data:\n")
Saved network data:
cat("Edges file: gene_go_edges.csv -", nrow(edges_df), "rows\n")
Edges file: gene_go_edges.csv - 2401 rows
cat("Nodes file: gene_go_nodes.csv -", nrow(nodes_df), "rows\n")
Nodes file: gene_go_nodes.csv - 1609 rows
# Print top GO terms
top_go_terms <- V(g)$name[!(V(g)$name %in% all_top_genes)][order(V(g)$degree[!(V(g)$name %in% all_top_genes)], decreasing = TRUE)][1:20]
cat("\nTop 20 GO terms:\n")
Top 20 GO terms:
for (term in top_go_terms) {
cat(term, "-", V(g)$description[V(g)$name == term], "\n")
}
GO:0045944 - positive regulation of transcription by RNA polymerase II
GO:0000122 - negative regulation of transcription by RNA polymerase II
GO:0007165 - signal transduction
GO:0045893 - positive regulation of DNA-templated transcription
GO:0006357 - regulation of transcription by RNA polymerase II
GO:0008284 - positive regulation of cell population proliferation
GO:0010628 - positive regulation of gene expression
GO:0045892 - negative regulation of DNA-templated transcription
GO:0007155 - cell adhesion
GO:0007186 - G protein-coupled receptor signaling pathway
GO:0006955 - immune response
GO:0030154 - cell differentiation
GO:0043066 - negative regulation of apoptotic process
GO:0006915 - apoptotic process
GO:0006954 - inflammatory response
GO:0045087 - innate immune response
GO:0002250 - adaptive immune response
GO:0090090 - negative regulation of canonical Wnt signaling pathway
GO:0098609 - cell-cell adhesion
GO:0070374 - positive regulation of ERK1 and ERK2 cascade
#save(All_samples_Merged, file = "All_samples_Merged_DE.Robj")