This report demonstrates the visualization and analysis of network communities using graph-based methods in R.
## Data Preparation
### First, we load the dataset and inspect its structure.
# Load the tab-separated interaction table (update the file path to your
# dataset). The result is stored in `data` and used by every later section.
data_path <- "redesteroles.tsv"
data <- tryCatch(
  # stringsAsFactors = FALSE keeps the node identifiers as character vectors
  # on R < 4.0 as well, where the default was still TRUE.
  read.table(data_path, header = TRUE, sep = "\t", stringsAsFactors = FALSE),
  error = function(e) {
    # conditionMessage() is the documented accessor for a condition's text;
    # call. = FALSE keeps the internal handler call out of the error message.
    stop(
      "Error reading the file. Ensure the file path and format are correct: ",
      conditionMessage(e),
      call. = FALSE
    )
  }
)
# Display the first few rows
head(data)
## node1 node2 node1_string_id
## 1 HMG1 PGSC0003DMT400008586 4113.PGSC0003DMT400035542
## 2 HMG1 PGSC0003DMT400012593 4113.PGSC0003DMT400035542
## 3 HMG1 PGSC0003DMT400013082 4113.PGSC0003DMT400035542
## 4 HMG1 Sgt2 4113.PGSC0003DMT400035542
## 5 HMG1 PGSC0003DMT400058561 4113.PGSC0003DMT400035542
## 6 HMG1 PGSC0003DMT400058564 4113.PGSC0003DMT400035542
## node2_string_id neighborhood_on_chromosome gene_fusion
## 1 4113.PGSC0003DMT400008586 0.000 0
## 2 4113.PGSC0003DMT400012593 0.000 0
## 3 4113.PGSC0003DMT400013082 0.000 0
## 4 4113.PGSC0003DMT400045138 0.000 0
## 5 4113.PGSC0003DMT400058561 0.148 0
## 6 4113.PGSC0003DMT400058564 0.148 0
## phylogenetic_cooccurrence homology coexpression
## 1 0 0 0.582
## 2 0 0 0.582
## 3 0 0 0.582
## 4 0 0 0.000
## 5 0 0 0.301
## 6 0 0 0.301
## experimentally_determined_interaction database_annotated automated_textmining
## 1 0 0.000 0.639
## 2 0 0.000 0.579
## 3 0 0.000 0.374
## 4 0 0.000 0.735
## 5 0 0.418 0.358
## 6 0 0.787 0.358
## combined_score
## 1 0.842
## 2 0.816
## 3 0.727
## 4 0.735
## 5 0.747
## 6 0.907
# List the column names of the loaded table
names(data)
## [1] "node1"
## [2] "node2"
## [3] "node1_string_id"
## [4] "node2_string_id"
## [5] "neighborhood_on_chromosome"
## [6] "gene_fusion"
## [7] "phylogenetic_cooccurrence"
## [8] "homology"
## [9] "coexpression"
## [10] "experimentally_determined_interaction"
## [11] "database_annotated"
## [12] "automated_textmining"
## [13] "combined_score"
## Network Construction
### We construct the graph using the igraph package.
# Build an undirected igraph object from the interaction table
graph <- graph_from_data_frame(d = data, directed = FALSE)
# Report the size of the network: vertex count first, then edge count
node_total <- vcount(graph)
cat("Number of nodes:", node_total, "\n")
## Number of nodes: 39
edge_total <- ecount(graph)
cat("Number of edges:", edge_total, "\n")
## Number of edges: 254
# Overview plot of the whole network using the "fr" layout.
# The plot is assembled layer by layer and printed at the end.
network_plot <- ggraph(graph, layout = "fr")
# Edges: transparency follows combined_score
network_plot <- network_plot +
  geom_edge_link(
    aes(edge_alpha = combined_score),
    color = "orange",
    show.legend = TRUE
  )
# Nodes: size follows degree, colour follows closeness centrality
network_plot <- network_plot +
  geom_node_point(
    aes(size = degree(graph), color = closeness(graph)),
    show.legend = TRUE
  ) +
  scale_color_gradient(
    low = "blue",
    high = "red",
    name = "Closeness Centrality"
  )
# Labels (repelled so they do not overlap) and a blank background
network_plot <- network_plot +
  geom_node_text(aes(label = name), repel = TRUE, size = 3) +
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5, size = 14))
network_plot
## Community Detection
### We detect communities using the Louvain method and assign community-based colors.
# Community detection with the Louvain method; the community index of every
# node is stored as the vertex attribute `community`.
# NOTE(review): the partition may differ between runs; consider calling
# set.seed() before this step if the report has to be reproducible.
communities <- cluster_louvain(graph)
V(graph)$community <- as.vector(membership(communities))
# Assign one colour per community.
# brewer.pal() only supports 3 to 12 colours for "Set3": with fewer than 3 it
# warns and returns 3 colours, with more than 12 it warns and returns only 12,
# which later makes scale_color_manual() fail for lack of values. Request a
# valid size and trim it, or interpolate when there are too many communities.
n_communities <- length(unique(V(graph)$community))
set3_max <- brewer.pal.info["Set3", "maxcolors"]
community_colors <- if (n_communities <= set3_max) {
  brewer.pal(max(3L, n_communities), "Set3")[seq_len(n_communities)]
} else {
  colorRampPalette(brewer.pal(set3_max, "Set3"))(n_communities)
}
## Enhanced Visualization
### We visualize the graph with enhanced aesthetics.
# Community-coloured view of the network, assembled layer by layer.
community_plot <- ggraph(graph, layout = "fr")
# Edges: thin gray links whose transparency follows combined_score
community_plot <- community_plot +
  geom_edge_link(
    aes(edge_alpha = combined_score),
    color = "gray",
    edge_width = 0.5
  )
# Nodes: size follows degree, colour follows the community attribute
community_plot <- community_plot +
  geom_node_point(
    aes(size = degree(graph), color = factor(community)),
    alpha = 0.9
  ) +
  scale_size_continuous(range = c(3, 8), name = "Node Degree") +
  scale_color_manual(values = community_colors, name = "Communities")
# Labels: only nodes whose degree is above the 90th percentile get their name;
# every other node gets an empty label
community_plot <- community_plot +
  geom_node_text(
    aes(
      label = ifelse(
        degree(graph) > quantile(degree(graph), 0.9),
        name,
        ""
      )
    ),
    repel = TRUE,
    size = 3,
    color = "black"
  )
# Title, blank background and legend styling
community_plot <- community_plot +
  theme_void() +
  ggtitle("Enhanced Network Community Visualization") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    legend.position = "right",
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )
community_plot
## Centrality Analysis
### We calculate centrality metrics and identify the top hub nodes.
# Per-node centrality scores: degree, plus normalized betweenness and closeness
node_degree <- degree(graph, mode = "all")
node_betweenness <- betweenness(graph, normalized = TRUE)
node_closeness <- closeness(graph, normalized = TRUE)
# One row per node; the columns are the scores computed above
centrality_metrics <- data.frame(
  node = V(graph)$name,
  degree = node_degree,
  betweenness = node_betweenness,
  closeness = node_closeness
)
# The ten nodes with the highest degree, largest first
hub_genes <- head(arrange(centrality_metrics, desc(degree)), 10)
# Show the hub table
print(hub_genes)
## node degree betweenness closeness
## PGSC0003DMT400008586 PGSC0003DMT400008586 29 0.10826498 0.8372093
## PGSC0003DMT400012593 PGSC0003DMT400012593 29 0.10826498 0.8372093
## PGSC0003DMT400013082 PGSC0003DMT400013082 28 0.10826498 0.8181818
## PGSC0003DMT400042257 PGSC0003DMT400042257 25 0.03396964 0.7500000
## PGSC0003DMT400058561 PGSC0003DMT400058561 22 0.05756814 0.7200000
## PGSC0003DMT400058564 PGSC0003DMT400058564 22 0.05756814 0.7200000
## PGSC0003DMT400005519 PGSC0003DMT400005519 20 0.02366686 0.6792453
## PGSC0003DMT400078196 PGSC0003DMT400078196 20 0.02366686 0.6792453
## PGSC0003DMT400010023 PGSC0003DMT400010023 19 0.01869158 0.6666667
## PGSC0003DMT400017668 PGSC0003DMT400017668 19 0.01458762 0.6666667
## Subgraph Analysis
We create subgraphs based on specific attributes.
# Keep only the interactions whose coexpression score exceeds 0.7 and build
# an undirected graph from them
high_coexpression <- filter(data, coexpression > 0.7)
subgraph_coexpression <- graph_from_data_frame(
  d = high_coexpression,
  directed = FALSE
)
# Plot the co-expression subgraph: blue edges whose transparency follows the
# coexpression score, fixed-size red nodes and repelled labels
coexpression_plot <- ggraph(subgraph_coexpression, layout = "fr")
coexpression_plot <- coexpression_plot +
  geom_edge_link(
    aes(edge_alpha = coexpression),
    color = "blue",
    show.legend = TRUE
  )
coexpression_plot <- coexpression_plot +
  geom_node_point(color = "red", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE, size = 3)
coexpression_plot <- coexpression_plot +
  theme_minimal() +
  ggtitle("High Co-expression Subgraph")
coexpression_plot
## Additional Descriptive Statistics and Subgraph Analysis
# Mean neighborhood and phylogenetic co-occurrence scores over all
# interactions, ignoring missing values
attribute_extended_summary <- summarise(
  data,
  avg_neighborhood = mean(neighborhood_on_chromosome, na.rm = TRUE),
  avg_phylogenetic = mean(phylogenetic_cooccurrence, na.rm = TRUE)
)
print(attribute_extended_summary)
## avg_neighborhood avg_phylogenetic
## 1 0.02534646 0.03283465
# Keep the interactions with a non-zero `phylogenetic_cooccurrence` score
high_phylogenetic <- data %>%
  filter(phylogenetic_cooccurrence > 0)
# Build an undirected subgraph from the filtered interactions
subgraph_phylogenetic <- graph_from_data_frame(high_phylogenetic, directed = FALSE)
# Assign colors for nodes by cycling through the 8-color Set2 palette.
# seq_len(vcount(...)) replaces 1:length(V(...)): when no interaction passes
# the filter, 1:0 would expand to c(1, 0) and produce two colors for zero
# nodes. The color given to each node is otherwise unchanged.
node_colors <- brewer.pal(n = 8, name = "Set2")
V(subgraph_phylogenetic)$color <-
  node_colors[(seq_len(vcount(subgraph_phylogenetic)) %% length(node_colors)) + 1]
# Plot of the phylogenetic co-occurrence subgraph, assembled layer by layer.
phylogenetic_plot <- ggraph(subgraph_phylogenetic, layout = "fr")
# Edges: width and transparency both follow the co-occurrence score
phylogenetic_plot <- phylogenetic_plot +
  geom_edge_link(
    aes(
      edge_width = phylogenetic_cooccurrence,
      edge_alpha = phylogenetic_cooccurrence
    ),
    color = "orange",
    show.legend = TRUE
  ) +
  scale_edge_width(range = c(0.2, 2), name = "Co-occurrence Strength") +
  scale_edge_alpha(range = c(0.4, 1), name = "Edge Transparency")
# Nodes: size follows degree; the colour is taken as-is (via I()) from the
# `color` vertex attribute, and no node legend is drawn
phylogenetic_plot <- phylogenetic_plot +
  geom_node_point(
    aes(
      size = degree(subgraph_phylogenetic),
      color = I(V(subgraph_phylogenetic)$color)
    ),
    alpha = 0.8,
    show.legend = FALSE
  ) +
  scale_size_continuous(range = c(3, 10), name = "Node Degree")
# Labels: repelled node names
phylogenetic_plot <- phylogenetic_plot +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    size = 3,
    color = "black",
    family = "Helvetica"
  )
# Title, blank background and legend styling
phylogenetic_plot <- phylogenetic_plot +
  theme_void() +
  ggtitle("Phylogenetic Co-occurrence Network") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    legend.position = "bottom",
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )
phylogenetic_plot
## Conclusion
This analysis demonstrates the utility of graph-based methods for understanding network structure, community detection, and the role of key nodes in biological networks.