## Introduction

This report demonstrates the visualization and analysis of network communities using graph-based methods in R.


## Data Preparation

### First, we load the dataset and inspect its structure.

# Load the tab-separated edge table (update `data_path` to your dataset).
# Any read error is re-raised with a hint about the likely cause; the
# original condition message is appended so the underlying reason is kept.
data_path <- "redesteroles.tsv"
data <- tryCatch(
  read.table(data_path, header = TRUE, sep = "\t"),
  error = function(e) {
    # call. = FALSE keeps the internal handler frame out of the reported
    # error, so the reader only sees the message below.
    stop(
      "Error reading the file. Ensure the file path and format are correct: ",
      conditionMessage(e),
      call. = FALSE
    )
  }
)

# Display the first few rows
head(data)
##   node1                node2           node1_string_id
## 1  HMG1 PGSC0003DMT400008586 4113.PGSC0003DMT400035542
## 2  HMG1 PGSC0003DMT400012593 4113.PGSC0003DMT400035542
## 3  HMG1 PGSC0003DMT400013082 4113.PGSC0003DMT400035542
## 4  HMG1                 Sgt2 4113.PGSC0003DMT400035542
## 5  HMG1 PGSC0003DMT400058561 4113.PGSC0003DMT400035542
## 6  HMG1 PGSC0003DMT400058564 4113.PGSC0003DMT400035542
##             node2_string_id neighborhood_on_chromosome gene_fusion
## 1 4113.PGSC0003DMT400008586                      0.000           0
## 2 4113.PGSC0003DMT400012593                      0.000           0
## 3 4113.PGSC0003DMT400013082                      0.000           0
## 4 4113.PGSC0003DMT400045138                      0.000           0
## 5 4113.PGSC0003DMT400058561                      0.148           0
## 6 4113.PGSC0003DMT400058564                      0.148           0
##   phylogenetic_cooccurrence homology coexpression
## 1                         0        0        0.582
## 2                         0        0        0.582
## 3                         0        0        0.582
## 4                         0        0        0.000
## 5                         0        0        0.301
## 6                         0        0        0.301
##   experimentally_determined_interaction database_annotated automated_textmining
## 1                                     0              0.000                0.639
## 2                                     0              0.000                0.579
## 3                                     0              0.000                0.374
## 4                                     0              0.000                0.735
## 5                                     0              0.418                0.358
## 6                                     0              0.787                0.358
##   combined_score
## 1          0.842
## 2          0.816
## 3          0.727
## 4          0.735
## 5          0.747
## 6          0.907
# List the column names of the loaded table
names(data)
##  [1] "node1"                                
##  [2] "node2"                                
##  [3] "node1_string_id"                      
##  [4] "node2_string_id"                      
##  [5] "neighborhood_on_chromosome"           
##  [6] "gene_fusion"                          
##  [7] "phylogenetic_cooccurrence"            
##  [8] "homology"                             
##  [9] "coexpression"                         
## [10] "experimentally_determined_interaction"
## [11] "database_annotated"                   
## [12] "automated_textmining"                 
## [13] "combined_score"

## Network Construction

### We construct the graph using the igraph package.

# Build an undirected igraph object from the edge table: the first two
# columns name the endpoints, the remaining columns become edge attributes.
graph <- graph_from_data_frame(d = data, directed = FALSE)

# Report the size of the network (vertex count, then edge count)
cat("Number of nodes:", gorder(graph), "\n")
## Number of nodes: 39
cat("Number of edges:", gsize(graph), "\n")
## Number of edges: 254
# Overview plot of the full network (force-directed "fr" layout).
# Edge transparency follows the `combined_score` edge attribute, node size
# follows degree and node colour follows closeness centrality.
ggraph(graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = combined_score),
    color = "orange",
    show.legend = TRUE
  ) +
  geom_node_point(
    aes(size = degree(graph), color = closeness(graph)),
    show.legend = TRUE
  ) +
  geom_node_text(aes(label = name), repel = TRUE, size = 3) +
  # Give the size legend a readable title instead of the raw expression
  # "degree(graph)", matching the other plots in this report.
  scale_size_continuous(name = "Node Degree") +
  scale_color_gradient(
    low = "blue",
    high = "red",
    name = "Closeness Centrality"
  ) +
  theme_void() +
  # The title styling below had no title to apply to; set one explicitly.
  ggtitle("Network Overview") +
  theme(plot.title = element_text(hjust = 0.5, size = 14))

## Community Detection

### We detect communities using the Louvain method and assign community-based colors.

# Community detection (Louvain). The membership vector is stored as a vertex
# attribute so later plots can map it to colour.
# NOTE(review): the partition may differ between runs; consider calling
# set.seed() beforehand if the report must be reproducible -- confirm.
communities <- cluster_louvain(graph)
V(graph)$community <- communities$membership

# Assign colors to communities.
# brewer.pal() supports only 3 to 12 classes for "Set3": asking for fewer
# warns, and asking for more returns just 12 colours, which would leave
# scale_color_manual() short of values. Request a size within the supported
# range, then trim it (few communities) or interpolate it (many communities)
# so exactly one colour per community is returned.
n_communities <- length(unique(V(graph)$community))
set3_max <- brewer.pal.info["Set3", "maxcolors"]
set3_colors <- brewer.pal(min(max(n_communities, 3), set3_max), "Set3")
community_colors <- if (n_communities > set3_max) {
  grDevices::colorRampPalette(set3_colors)(n_communities)
} else {
  set3_colors[seq_len(n_communities)]
}

## Enhanced Visualization

### We visualize the graph with enhanced aesthetics.

# Community-coloured network plot. Node size follows degree, and only nodes
# whose degree exceeds the 90th percentile are labelled.
node_degree <- degree(graph)
label_cutoff <- quantile(node_degree, 0.9)

ggraph(graph, layout = "fr") +
  # Layers: edges first, then nodes, then labels on top
  geom_edge_link(
    aes(edge_alpha = combined_score),
    color = "gray",
    edge_width = 0.5
  ) +
  geom_node_point(
    aes(size = node_degree, color = factor(community)),
    alpha = 0.9
  ) +
  geom_node_text(
    aes(label = ifelse(node_degree > label_cutoff, name, "")),
    repel = TRUE,
    size = 3,
    color = "black"
  ) +
  # Scales
  scale_size_continuous(range = c(3, 8), name = "Node Degree") +
  scale_color_manual(values = community_colors, name = "Communities") +
  # Title and theme
  ggtitle("Enhanced Network Community Visualization") +
  theme_void() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    legend.position = "right",
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

## Centrality Analysis

### We calculate centrality metrics and identify the top hub nodes.

# Per-node centrality table: degree plus normalised betweenness and closeness
centrality_metrics <- data.frame(
  node = V(graph)$name,
  degree = degree(graph, mode = "all"),
  betweenness = betweenness(graph, normalized = TRUE),
  closeness = closeness(graph, normalized = TRUE)
)

# Rank nodes by descending degree and keep the ten best-connected ones
hub_genes <- head(arrange(centrality_metrics, desc(degree)), n = 10)

# Show the resulting hub table
print(hub_genes)
##                                      node degree betweenness closeness
## PGSC0003DMT400008586 PGSC0003DMT400008586     29  0.10826498 0.8372093
## PGSC0003DMT400012593 PGSC0003DMT400012593     29  0.10826498 0.8372093
## PGSC0003DMT400013082 PGSC0003DMT400013082     28  0.10826498 0.8181818
## PGSC0003DMT400042257 PGSC0003DMT400042257     25  0.03396964 0.7500000
## PGSC0003DMT400058561 PGSC0003DMT400058561     22  0.05756814 0.7200000
## PGSC0003DMT400058564 PGSC0003DMT400058564     22  0.05756814 0.7200000
## PGSC0003DMT400005519 PGSC0003DMT400005519     20  0.02366686 0.6792453
## PGSC0003DMT400078196 PGSC0003DMT400078196     20  0.02366686 0.6792453
## PGSC0003DMT400010023 PGSC0003DMT400010023     19  0.01869158 0.6666667
## PGSC0003DMT400017668 PGSC0003DMT400017668     19  0.01458762 0.6666667

## Subgraph Analysis

We create subgraphs based on specific attributes.

# Keep only the edges whose coexpression score exceeds 0.7 and build an
# undirected graph from that reduced edge table
high_coexpression <- filter(data, coexpression > 0.7)
subgraph_coexpression <- graph_from_data_frame(
  d = high_coexpression,
  directed = FALSE
)

# Plot the co-expression subgraph; edge transparency follows the score
ggraph(subgraph_coexpression, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = coexpression),
    color = "blue",
    show.legend = TRUE
  ) +
  geom_node_point(color = "red", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE, size = 3) +
  ggtitle("High Co-expression Subgraph") +
  theme_minimal()

## Additional Descriptive Statistics and Subgraph Analysis

# Mean neighbourhood and phylogenetic co-occurrence scores across all edges
# (missing values are ignored)
attribute_extended_summary <- summarise(
  data,
  avg_neighborhood = mean(neighborhood_on_chromosome, na.rm = TRUE),
  avg_phylogenetic = mean(phylogenetic_cooccurrence, na.rm = TRUE)
)
print(attribute_extended_summary)
##   avg_neighborhood avg_phylogenetic
## 1       0.02534646       0.03283465
# Filter high `phylogenetic_cooccurrence`: keep edges with a non-zero score
high_phylogenetic <- data %>%
  filter(phylogenetic_cooccurrence > 0)

# Create subgraph
subgraph_phylogenetic <- graph_from_data_frame(high_phylogenetic, directed = FALSE)

# Assign colors for nodes by cycling through the 8-colour "Set2" palette.
# seq_len() is used instead of 1:n because 1:0 evaluates to c(1, 0); with an
# empty filter result that would try to assign two colours to zero vertices.
# For a non-empty graph the colour assigned to each vertex is unchanged.
node_colors <- brewer.pal(n = 8, name = "Set2")
color_index <- seq_len(vcount(subgraph_phylogenetic)) %% length(node_colors) + 1
V(subgraph_phylogenetic)$color <- node_colors[color_index]

# Enhanced visualization: edge width and transparency both follow the
# phylogenetic co-occurrence score; node size follows degree.
ggraph(subgraph_phylogenetic, layout = "fr") +
  geom_edge_link(
    aes(
      edge_width = phylogenetic_cooccurrence,
      edge_alpha = phylogenetic_cooccurrence
    ),
    color = "orange",
    show.legend = TRUE
  ) +
  scale_edge_width(range = c(0.2, 2), name = "Co-occurrence Strength") +
  scale_edge_alpha(range = c(0.4, 1), name = "Edge Transparency") +
  # I() passes the precomputed colour values through as-is.
  # show.legend = FALSE hides the legends of this layer, so the size scale
  # name below is not displayed.
  geom_node_point(
    aes(
      size = degree(subgraph_phylogenetic),
      color = I(V(subgraph_phylogenetic)$color)
    ),
    alpha = 0.8,
    show.legend = FALSE
  ) +
  scale_size_continuous(range = c(3, 10), name = "Node Degree") +
  # NOTE(review): whether the "Helvetica" family is available presumably
  # depends on the graphics device in use -- confirm on the target platform.
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    size = 3,
    color = "black",
    family = "Helvetica"
  ) +
  theme_void() +
  ggtitle("Phylogenetic Co-occurrence Network") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    legend.position = "bottom",
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

## Conclusion

This analysis demonstrates the utility of graph-based methods for understanding network structure, community detection, and the role of key nodes in biological networks.