Purpose: Identify and characterize trans-regulatory networks mediated by genes within the Inv4m introgression region. This analysis integrates differential expression results with co-expression data from MaizeNetome to construct regulatory networks.
Approach: 1. Define genomic regions (Inv4m inversion, flanking introgression, background) 2. Test for enrichment of DEGs within cis vs trans regions 3. Query MaizeNetome for reference co-expression edges involving introgressed genes 4. Calculate bipartite cis-trans co-expression network from expression data 5. Classify edges as Reference (in MaizeNetome) or Novel (not in MaizeNetome) 6. Visualize networks with igraph/visNetwork
Expected outcomes: - Identification of trans-regulatory hubs within Inv4m - Network visualization showing regulatory relationships - Statistical enrichment of DEGs by genomic region (cis vs trans) - Output files for downstream analysis and manuscript figures
library(dplyr)
library(tidyr)
library(rtracklayer)
library(GenomicRanges)
library(igraph)
library(visNetwork)
#' Calculate mutual information from correlation coefficient
#'
#' Converts Pearson correlation to mutual information estimate.
#'
#' @param r Correlation coefficient
#'
#' @return Mutual information estimate
mutual_info <- function(r) {
0.5 * log2(1 / (1 - r^2))
}
#' Calculate robust distance metric
#'
#' Distance metric from https://doi.org/10.1186/s12859-023-05161-y
#'
#' @param r Correlation coefficient
#' @param n Sample size
#'
#' @return Robust distance metric
robust_dist <- function(r, n) {
sqrt(1 - abs(r))
}
#' Calculate standardized euclidean distance for MST construction
#'
#' For standardized data this is the euclidean distance between vectors
#' https://cmci.colorado.edu/classes/INFO-1301/files/borgatti.htm
#' Approach follows Chenxin Li:
#' https://github.com/cxli233/SimpleTidy_GeneCoEx#z-score
#'
#' @param r_scaled Correlation coefficient from scaled expression data
#' @param n Sample size
#'
#' @return Standardized euclidean distance
std_dist <- function(r_scaled, n) {
sqrt(2 * n *(1 - abs(r_scaled)))
}
#' Format nodes for network visualization
#'
#' Creates standardized node data frame with region colors, labels, and
#' styling.
#'
#' @param node_ids Character vector of gene IDs
#' @param effects_df Data frame with gene expression data
#' @param custom_labels Named vector of custom gene labels
#'
#' @return Data frame with formatted node attributes
format_network_nodes <- function(node_ids, effects_df,
custom_labels = NULL) {
nodes <- data.frame(id = node_ids) %>%
left_join(
effects_df %>%
group_by(gene) %>%
dplyr::slice(1) %>%
ungroup() %>%
dplyr::select(gene, locus_label, logFC),
by = c("id" = "gene")
) %>%
mutate(
region = case_when(
id %in% inv4m_regulators ~ "inv4m",
id %in% flanking_regulators ~ "flank",
TRUE ~ "out"
),
upregulated = logFC > 0,
color.border = case_when(
region == "inv4m" ~ "#551A8B",
region == "flank" ~ "#FF1493",
region == "out" ~ "#FFD700"
),
color.background = case_when(
upregulated & region == "inv4m" ~ "#551A8B",
upregulated & region == "flank" ~ "#FF1493",
upregulated & region == "out" ~ "#FFD700",
!upregulated ~ "white"
),
borderWidth = 4,
label = coalesce(locus_label, id)
)
dup_labels <- nodes %>%
filter(!is.na(locus_label), locus_label != "") %>%
group_by(locus_label) %>%
filter(n() > 1) %>%
pull(id)
if (length(dup_labels) > 0) {
nodes <- nodes %>%
mutate(label = ifelse(
id %in% dup_labels,
paste0(locus_label, " (", id, ")"),
label
))
}
if (!is.null(custom_labels) && any(names(custom_labels) %in% nodes$id)) {
cat("Custom labels found for nodes:",
sum(names(custom_labels) %in% nodes$id), "\n")
for (gene_id in names(custom_labels)) {
if (gene_id %in% nodes$id) {
nodes$label[nodes$id == gene_id] <- custom_labels[gene_id]
}
}
}
nodes$label <- paste0("<i>", nodes$label, "</i>")
nodes
}
#' Create MST visualization from edge list
#'
#' Builds minimum spanning tree and creates interactive visNetwork plot.
#'
#' @param edges Data frame with from, to, r, corr_std, robust_dist, std_dist,
#' mi columns
#' @param nodes Data frame with formatted node attributes
#' @param effects_df Data frame with logFC for arrow type calculation
#' @param layout igraph layout algorithm name (used if edges >= 100)
#' @param font_size Node label font size
#'
#' @return visNetwork htmlwidget
visualize_mst <- function(edges, nodes, effects_df,
layout = "layout_nicely", font_size = 20) {
cat("\n=== DIAGNOSTIC: Checking node labels ===\n")
cat("Total nodes:", nrow(nodes), "\n")
cat("Unique node IDs:", length(unique(nodes$id)), "\n")
cat("Unique labels:", length(unique(nodes$label)), "\n")
if (any(duplicated(nodes$label))) {
dup_labels_vec <- nodes$label[duplicated(nodes$label)]
cat("DUPLICATE LABELS FOUND:", length(dup_labels_vec), "\n")
cat("Duplicates:\n")
dup_info <- nodes %>%
filter(label %in% dup_labels_vec) %>%
dplyr::select(id, locus_label, label) %>%
arrange(label)
for (i in 1:nrow(dup_info)) {
cat(sprintf(" %s | locus_label: %s | label: %s\n",
dup_info$id[i],
ifelse(is.na(dup_info$locus_label[i]), "NA",
dup_info$locus_label[i]),
dup_info$label[i]))
}
cat("\n")
stop("Cannot proceed with duplicate labels")
}
g <- graph_from_data_frame(
d = edges %>% dplyr::select(from, to, r, corr_std, robust_dist,
std_dist, mi),
directed = TRUE,
vertices = nodes
)
g_mst <- mst(g, weights = E(g)$std_dist)
cat("MST - Edges:", length(E(g_mst)), "| Nodes:", length(V(g_mst)), "\n")
mst_edges <- igraph::as_data_frame(g_mst, what = "edges") %>%
left_join(
effects_df %>%
group_by(gene) %>%
dplyr::slice(1) %>%
ungroup() %>%
dplyr::select(gene, logFC_from = logFC),
by = c("from" = "gene")
) %>%
left_join(
effects_df %>%
group_by(gene) %>%
dplyr::slice(1) %>%
ungroup() %>%
dplyr::select(gene, logFC_to = logFC),
by = c("to" = "gene")
) %>%
mutate(
is_concordant = sign(logFC_from) == sign(logFC_to),
width = mi
)
g_mst_vis <- g_mst %>%
set_edge_attr(
"arrows.to.type",
value = ifelse(mst_edges$is_concordant, "arrow", "bar")
) %>%
set_edge_attr("width", value = mst_edges$width) %>%
set_edge_attr("color", value = "DarkGray")
V(g_mst_vis)$name <- V(g_mst_vis)$label
if (length(E(g_mst)) < 100) {
cat("Using visNetwork with default physics (MST edges < 100)\n")
# Filter nodes to only those in the MST
mst_node_ids <- unique(c(mst_edges$from, mst_edges$to))
mst_nodes <- nodes %>% filter(id %in% mst_node_ids)
mst_edges_vis <- mst_edges %>%
mutate(
arrows.to.type = ifelse(is_concordant, "arrow", "bar"),
color = "DarkGray"
) %>%
dplyr::select(from, to, width, arrows.to.type, color)
visNetwork(mst_nodes, mst_edges_vis) %>%
visNodes(font = list(size = font_size, multi = "html")) %>%
visEdges(arrows = "to") %>%
visOptions(
highlightNearest = list(enabled = TRUE, degree = 1, hover = TRUE),
nodesIdSelection = TRUE
)
} else {
cat("Using igraph layout (MST edges >= 100)\n")
visIgraph(g_mst_vis) %>%
visNodes(font = list(size = font_size, multi = "html")) %>%
visIgraphLayout(layout = layout) %>%
visOptions(
highlightNearest = list(enabled = TRUE, degree = 1, hover = TRUE),
nodesIdSelection = TRUE
)
}
}
#' Visualize full network (non-MST)
#'
#' Creates interactive network visualization for smaller networks.
#'
#' @param edges Data frame with from, to, mi columns
#' @param nodes Data frame with formatted node attributes
#' @param effects_df Data frame with logFC for arrow type calculation
#' @param font_size Node label font size
#'
#' @return visNetwork htmlwidget
visualize_full_network <- function(edges, nodes, effects_df, font_size = 20) {
edge_df <- edges %>%
left_join(
effects_df %>%
group_by(gene) %>%
dplyr::slice(1) %>%
ungroup() %>%
dplyr::select(gene, logFC_from = logFC),
by = c("from" = "gene")
) %>%
left_join(
effects_df %>%
group_by(gene) %>%
dplyr::slice(1) %>%
ungroup() %>%
dplyr::select(gene, logFC_to = logFC),
by = c("to" = "gene")
) %>%
mutate(
is_concordant = sign(logFC_from) == sign(logFC_to),
arrows.to.type = ifelse(is_concordant, "arrow", "bar"),
color = "DarkGray",
width = mi
) %>%
dplyr::select(from, to, width, arrows.to.type, color)
cat("Full Network - Edges:", nrow(edge_df), "| Nodes:", nrow(nodes), "\n")
visNetwork(nodes, edge_df) %>%
visNodes(font = list(size = font_size, multi = "html")) %>%
visEdges(arrows = "to") %>%
visOptions(
highlightNearest = list(enabled = TRUE, degree = 1, hover = TRUE),
nodesIdSelection = TRUE
)
}
#' Main network visualization wrapper
#'
#' Filters edges by variant, prepares nodes, and calls appropriate
#' visualization.
#'
#' @param edges Data frame with from, to, r, corr_std columns
#' (bipartite: cis → trans)
#' @param effects_df Data frame with DEG statistics
#' @param variant Character: "full", "flanking", or "inv4m"
#' @param module_name Character: network title prefix
#' @param custom_labels Named vector of custom gene labels
#' @param edge_threshold Threshold for switching to MST
#' @param font_size Font size for labels
#' @param n_samples Sample size for distance calculation
#' @param use_mst_below_100 If TRUE, use MST for networks < 100 edges;
#' if FALSE, show full network
#'
#' @return visNetwork htmlwidget
visualize_trans_network <- function(edges,
effects_df,
variant = c("full", "flanking", "inv4m"),
module_name = "Network",
custom_labels = NULL,
edge_threshold = 100,
font_size = 20,
n_samples,
use_mst_below_100 = TRUE) {
variant <- match.arg(variant)
filtered_edges <- switch(
variant,
"full" = edges,
"flanking" = edges %>% filter(from %in% flanking_regulators),
"inv4m" = edges %>% filter(from %in% inv4m_regulators)
)
variant_label <- switch(
variant,
"full" = "Full Trans Co-expression",
"flanking" = "Flanking Trans Co-expression",
"inv4m" = "Inv4m Trans Co-expression"
)
title <- paste(module_name, "-", variant_label, "Network")
cat("\n===", title, "===\n")
cat("Edges:", nrow(filtered_edges), "\n")
if (nrow(filtered_edges) == 0) {
cat("No edges for this variant\n")
return(NULL)
}
node_ids <- unique(c(filtered_edges$from, filtered_edges$to))
nodes <- format_network_nodes(node_ids, effects_df, custom_labels)
cat("Nodes:", nrow(nodes), "\n")
prepared_edges <- filtered_edges %>%
mutate(
robust_dist = robust_dist(r, n_samples),
std_dist = std_dist(corr_std, n_samples),
mi = mutual_info(r)
)
if (nrow(filtered_edges) < 100) {
if (use_mst_below_100) {
cat("Using MST with visNetwork physics (edges < 100)\n")
visualize_mst(prepared_edges, nodes, effects_df, font_size = font_size)
} else {
cat("Showing full network with physics (edges < 100)\n")
visualize_full_network(prepared_edges, nodes, effects_df,
font_size = font_size)
}
} else if (nrow(filtered_edges) >= edge_threshold) {
cat("Using MST with igraph layout (edges >=", edge_threshold, ")\n")
visualize_mst(prepared_edges, nodes, effects_df, font_size = font_size)
} else {
cat("Showing full network\n")
visualize_full_network(prepared_edges, nodes, effects_df,
font_size = font_size)
}
}
v5_gff <- "~/ref/zea/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.59.chr.gff3"
v5 <- rtracklayer::import(v5_gff) %>%
subset(type == "gene" & seqnames %in% 1:10)
genes <- as.data.frame(v5)
genes$ID <- gsub("gene:", "", genes$ID)
cat("Total genes loaded:", nrow(genes), "\n")
## Total genes loaded: 43459
inv4m_start <- genes[genes$ID == "Zm00001eb190470", "start"]
inv4m_end <- genes[genes$ID == "Zm00001eb194800", "end"]
cat("Inv4m region:", inv4m_start, "-", inv4m_end, "\n")
## Inv4m region: 172883675 - 188132113
inv4m_gene_ids <- genes %>%
filter(
seqnames == 4,
start >= inv4m_start,
end <= inv4m_end
) %>%
pull(ID)
cat("Genes in Inv4m:", length(inv4m_gene_ids), "\n")
## Genes in Inv4m: 434
introgression_start <- 157012149
introgression_end <- 195900523
cis_introgression_gene_ids <- genes %>%
filter(
seqnames == 4,
start >= introgression_start,
end <= introgression_end
) %>%
pull(ID)
cat("Genes in cis introgression:", length(cis_introgression_gene_ids), "\n")
## Genes in cis introgression: 1099
flanking_introgression_gene_ids <- cis_introgression_gene_ids[
!cis_introgression_gene_ids %in% inv4m_gene_ids
]
cat("Genes in flanking regions:", length(flanking_introgression_gene_ids),
"\n")
## Genes in flanking regions: 665
effects_df <- read.csv(
"~/Desktop/predictor_effects_leaf_interaction_model.csv",
na.strings = c("NA", NA, "")
) %>%
filter(predictor == "Inv4m")
curated <- read.csv("~/Desktop/selected_DEGs_curated_locus_label_2.csv",
na.strings = c("NA", NA, "")) %>%
dplyr::select(gene, locus_label)
effects_df$locus_label[effects_df$locus_label == "A0A1D6NG77"] <- NA
effects_df <- effects_df %>%
left_join(curated, by = "gene", suffix = c("", "_curated")) %>%
mutate(locus_label = coalesce(locus_label_curated, locus_label)) %>%
dplyr::select(-locus_label_curated)
curated_updates <- tibble::tribble(
~gene, ~locus_label,
"Zm00001eb384770", "zcn26",
"Zm00001eb192330", "roc4-1",
"Zm00001eb193260", "trxl2",
"Zm00001eb194300", "paspa",
"Zm00001eb213980", "dywl",
"Zm00001eb194080", "ugt85a2",
"Zm00001eb112470", "engd2",
"Zm00001eb332450", "roc4-2",
"Zm00001eb193270", "trxl5",
"Zm00001eb066620", "tipin",
"Zm00001eb241410", "etfa",
"Zm00001eb013800", "mrlk",
"Zm00001eb249540", "prpl29",
"Zm00001eb000540", "prd3",
"Zm00001eb060540", "actin-1",
"Zm00001eb192580", "map4k8",
"Zm00001eb193120", "o3l1",
"Zm00001eb043750", "cdrk9",
"Zm00001eb034810", "mgd2",
"Zm00001eb241920", "gpx1",
"Zm00001eb162710", "spx4",
"Zm00001eb038730", "pht7",
"Zm00001eb370610", "rfk1",
"Zm00001eb386270", "spx6",
"Zm00001eb335670", "sqd3",
"Zm00001eb151650", "pap1",
"Zm00001eb154650", "ppa1",
"Zm00001eb280120", "pfk1",
"Zm00001eb036910", "gpx3",
"Zm00001eb406610", "glk4",
"Zm00001eb048730", "spx2",
"Zm00001eb361620", "ppa2",
"Zm00001eb297970", "sqd2",
"Zm00001eb347070", "sqd1",
"Zm00001eb144680", "rns1",
"Zm00001eb430590", "nrx3",
"Zm00001eb247580", "ppck3",
"Zm00001eb351780", "ugp3",
"Zm00001eb222510", "pht1",
"Zm00001eb126380", "phos1",
"Zm00001eb047070", "pht2",
"Zm00001eb041390", "rns3",
"Zm00001eb116580", "spd1",
"Zm00001eb069630", "oct4",
"Zm00001eb083520", "dgd1",
"Zm00001eb130570", "sag21",
"Zm00001eb239700", "ppa3",
"Zm00001eb258130", "mgd3",
"Zm00001eb202100", "pap14",
"Zm00001eb144670", "rns2",
"Zm00001eb087720", "pht13",
"Zm00001eb277280", "gst19",
"Zm00001eb339870", "pld16",
"Zm00001eb289800", "pah1",
"Zm00001eb369590", "nrx1",
"Zm00001eb238670", "pep2"
)
effects_df <- effects_df %>%
left_join(curated_updates, by = "gene", suffix = c("", "_new")) %>%
mutate(locus_label = coalesce(locus_label_new, locus_label)) %>%
dplyr::select(-locus_label_new)
field_degs <- effects_df %>%
filter(is_hiconf_DEG) %>%
pull(gene) %>%
unique()
cat("Total field DEGs:", length(field_degs), "\n")
## Total field DEGs: 165
cis_trans_table <- with(effects_df, table(in_cis, is_DEG))
cis_trans_test <- fisher.test(cis_trans_table)
cat("=== CIS VS TRANS ENRICHMENT ===\n")
## === CIS VS TRANS ENRICHMENT ===
print(cis_trans_table)
## is_DEG
## in_cis FALSE TRUE
## FALSE 22932 475
## TRUE 307 303
cat("\nFisher's p-value:", format(cis_trans_test$p.value, scientific = TRUE),
"\n")
##
## Fisher's p-value: 1.688983e-301
cat("Odds ratio:", round(cis_trans_test$estimate, 3), "\n\n")
## Odds ratio: 47.613
inv4m_trans_table <- with(
effects_df %>% filter(!in_flank),
table(in_Inv4m, is_DEG)
)
inv4m_trans_test <- fisher.test(inv4m_trans_table)
cat("=== INV4M VS TRANS ENRICHMENT ===\n")
## === INV4M VS TRANS ENRICHMENT ===
print(inv4m_trans_table)
## is_DEG
## in_Inv4m FALSE TRUE
## FALSE 22932 475
## TRUE 125 130
cat("\nFisher's p-value:", format(inv4m_trans_test$p.value, scientific = TRUE),
"\n")
##
## Fisher's p-value: 9.83221e-140
cat("Odds ratio:", round(inv4m_trans_test$estimate, 3), "\n\n")
## Odds ratio: 50.163
flanking_trans_table <- with(
effects_df %>% filter(in_flank | in_trans),
table(in_flank, is_DEG)
)
flanking_trans_test <- fisher.test(flanking_trans_table)
cat("=== FLANKING VS TRANS ENRICHMENT ===\n")
## === FLANKING VS TRANS ENRICHMENT ===
print(flanking_trans_table)
## is_DEG
## in_flank FALSE TRUE
## FALSE 22932 475
## TRUE 182 173
cat("\nFisher's p-value:", format(flanking_trans_test$p.value,
scientific = TRUE), "\n")
##
## Fisher's p-value: 3.080185e-178
cat("Odds ratio:", round(flanking_trans_test$estimate, 3), "\n\n")
## Odds ratio: 45.853
inv4m_flanking_table <- with(
effects_df %>% filter(in_cis),
table(in_Inv4m, is_DEG)
)
inv4m_flanking_test <- fisher.test(inv4m_flanking_table)
cat("=== INV4M VS FLANKING ENRICHMENT ===\n")
## === INV4M VS FLANKING ENRICHMENT ===
print(inv4m_flanking_table)
## is_DEG
## in_Inv4m FALSE TRUE
## FALSE 182 173
## TRUE 125 130
cat("\nFisher's p-value:", format(inv4m_flanking_test$p.value,
scientific = TRUE), "\n")
##
## Fisher's p-value: 6.225037e-01
cat("Odds ratio:", round(inv4m_flanking_test$estimate, 3), "\n\n")
## Odds ratio: 1.094
left_flanking_maizenetome <- read.csv(
"~/Desktop/inv4m/coexpression/left_MaizeNetome_coexpression_edges.csv"
)
Inv4m_maizenetome <- read.csv(
"~/Desktop/inv4m/coexpression/inv4m_MaizeNetome_coexpression_edges.csv"
)
right_flanking_maizenetome <- read.csv(
"~/Desktop/inv4m/coexpression/right_MaizeNetome_coexpression_edges.csv"
)
MaizeNetome <- rbind(
left_flanking_maizenetome %>%
separate_longer_delim(cols = Direct.Connetation.Node, delim = "\n") %>%
mutate(network = "coexpression"),
Inv4m_maizenetome %>%
separate_longer_delim(cols = Direct.Connetation.Node, delim = "\n") %>%
mutate(network = "coexpression"),
right_flanking_maizenetome %>%
separate_longer_delim(cols = Direct.Connetation.Node, delim = "\n") %>%
mutate(network = "coexpression")
) %>%
dplyr::rename(Direct.Connection = Direct.Connetation.Node)
cat("Total MaizeNetome edges:", nrow(MaizeNetome), "\n")
## Total MaizeNetome edges: 640599
v4_v5 <- read.table(
file = "/Users/fvrodriguez/Desktop/B73_gene_xref/B73v4_to_B73v5.tsv",
sep = "\t",
header = FALSE
) %>%
dplyr::rename(v4 = V1, v5 = V2) %>%
separate_longer_delim(cols = v5, delim = ",")
cat("v4 to v5 mappings:", nrow(v4_v5), "\n")
## v4 to v5 mappings: 43680
MaizeNetome_v5 <- MaizeNetome %>%
left_join(v4_v5, by = c("Element.ID" = "v4")) %>%
dplyr::rename(Element.ID.v5 = v5) %>%
filter(!is.na(Element.ID.v5))
MaizeNetome_v5 <- MaizeNetome_v5 %>%
left_join(v4_v5, by = c("Direct.Connection" = "v4")) %>%
dplyr::rename(Direct.Connection.v5 = v5) %>%
filter(!is.na(Direct.Connection.v5))
cat("Edges after v4->v5 conversion:", nrow(MaizeNetome_v5), "\n")
## Edges after v4->v5 conversion: 767533
cis_degs <- effects_df %>%
filter(in_cis == TRUE, is_hiconf_DEG == TRUE) %>%
arrange(adj.P.Val) %>%
dplyr::select(gene:desc_merged, logFC, adj.P.Val) %>%
inner_join(v4_v5, by = c(gene = "v5")) %>%
group_by(gene) %>%
arrange(adj.P.Val)
trans_degs <- effects_df %>%
filter(in_trans == TRUE, is_hiconf_DEG == TRUE) %>%
arrange(adj.P.Val) %>%
dplyr::select(gene:desc_merged, logFC, adj.P.Val) %>%
inner_join(v4_v5, by = c(gene = "v5")) %>%
group_by(gene) %>%
arrange(adj.P.Val)
cat("Cis genes:", length(unique(cis_degs$gene)), "\n")
## Cis genes: 107
cat("Trans genes:", length(unique(trans_degs$gene)), "\n")
## Trans genes: 39
cis_genes_all <- cis_degs$gene %>% sort() %>% unique()
trans_genes_all <- trans_degs$gene %>% sort() %>% unique()
inv4m_regulators <- cis_genes_all[cis_genes_all %in% inv4m_gene_ids]
flanking_regulators <- cis_genes_all[
cis_genes_all %in% flanking_introgression_gene_ids
]
cat("Inv4m regulators:", length(inv4m_regulators), "\n")
## Inv4m regulators: 37
cat("Flanking regulators:", length(flanking_regulators), "\n")
## Flanking regulators: 70
reference_edges_maizenetome <- MaizeNetome_v5 %>%
filter(
Element.ID.v5 %in% cis_genes_all,
Direct.Connection.v5 %in% trans_genes_all
) %>%
dplyr::rename(
cis_gene = Element.ID.v5,
trans_gene = Direct.Connection.v5
) %>%
dplyr::select(cis_gene, trans_gene, network)
cat("Reference edges from MaizeNetome:", nrow(reference_edges_maizenetome),
"\n")
## Reference edges from MaizeNetome: 103
voomR <- readRDS("~/Desktop/normalized_expression_voom_object.rda")
n_samples <- ncol(voomR$E)
cat("Expression matrix:", nrow(voomR$E), "genes x", ncol(voomR$E),
"samples\n")
## Expression matrix: 24249 genes x 43 samples
linked <- read.table("inv4m_linked_genes_to_filter.txt", header = TRUE)
cat("LD-linked genes loaded:", nrow(linked), "\n")
## LD-linked genes loaded: 58
trans_genes_filtered <- trans_genes_all[!trans_genes_all %in% linked$gene_id]
cat("Trans genes before LD filtering:", length(trans_genes_all), "\n")
## Trans genes before LD filtering: 39
cat("Trans genes after LD filtering:", length(trans_genes_filtered), "\n")
## Trans genes after LD filtering: 37
reference_edges_maizenetome <- reference_edges_maizenetome %>%
filter(trans_gene %in% trans_genes_filtered)
cat("Reference edges after LD filtering:", nrow(reference_edges_maizenetome),
"\n")
## Reference edges after LD filtering: 103
cis_genes <- cis_genes_all[cis_genes_all %in% rownames(voomR$E)]
trans_genes <- trans_genes_filtered[trans_genes_filtered %in% rownames(voomR$E)]
cat("Cis genes in expression data:", length(cis_genes), "\n")
## Cis genes in expression data: 107
cat("Trans genes in expression data:", length(trans_genes), "\n")
## Trans genes in expression data: 37
# Original correlations (for visualization and robust_dist)
expr_cis <- t(voomR$E)[, cis_genes]
expr_trans <- t(voomR$E)[, trans_genes]
corr_bipartite <- cor(expr_cis, expr_trans, use = "pairwise.complete.obs")
# Scaled correlations for std_dist MST construction
# Following Chenxin Li: https://github.com/cxli233/SimpleTidy_GeneCoEx#z-score
expr_cis_scaled <- scale(expr_cis)
expr_trans_scaled <- scale(expr_trans)
corr_bipartite_scaled <- cor(expr_cis_scaled, expr_trans_scaled,
use = "pairwise.complete.obs")
p_values <- matrix(NA, nrow = nrow(corr_bipartite), ncol = ncol(corr_bipartite))
for (i in 1:nrow(corr_bipartite)) {
for (j in 1:ncol(corr_bipartite)) {
r <- corr_bipartite[i, j]
t_stat <- r * sqrt((n_samples - 2) / (1 - r^2))
p_values[i, j] <- 2 * pt(-abs(t_stat), df = n_samples - 2)
}
}
p_adj <- matrix(p.adjust(c(p_values), method = "fdr"), nrow = nrow(p_values))
corr_bipartite_filtered <- corr_bipartite
corr_bipartite_filtered[p_adj > 0.05] <- 0
cat("Significant pairs (FDR < 0.05):", sum(corr_bipartite_filtered != 0), "\n")
## Significant pairs (FDR < 0.05): 3769
edges_idx <- which(corr_bipartite_filtered != 0, arr.ind = TRUE)
bipartite_edges <- data.frame(
from = rownames(corr_bipartite_filtered)[edges_idx[, 1]],
to = colnames(corr_bipartite_filtered)[edges_idx[, 2]],
r = corr_bipartite_filtered[edges_idx],
corr_std = corr_bipartite_scaled[edges_idx]
)
cat("Bipartite network edges:", nrow(bipartite_edges), "\n")
## Bipartite network edges: 3769
bipartite_edges$in_maizenetome <- mapply(
function(f, t) {
any(reference_edges_maizenetome$cis_gene == f &
reference_edges_maizenetome$trans_gene == t)
},
bipartite_edges$from,
bipartite_edges$to
)
bipartite_edges$network_module <- ifelse(
bipartite_edges$in_maizenetome,
"Reference",
"Novel"
)
cat("Edge classification:\n")
## Edge classification:
print(table(bipartite_edges$network_module))
##
## Novel Reference
## 3673 96
reference_edges <- bipartite_edges %>% filter(network_module == "Reference")
novel_edges <- bipartite_edges %>% filter(network_module == "Novel")
cat("\nReference module edges:", nrow(reference_edges), "\n")
##
## Reference module edges: 96
cat("Novel module edges:", nrow(novel_edges), "\n")
## Novel module edges: 3673
visualize_trans_network(
edges = bipartite_edges,
effects_df = effects_df,
variant = "full",
module_name = "Trans Coexpression",
n_samples = n_samples
)
##
## === Trans Coexpression - Full Trans Co-expression Network ===
## Edges: 3769
## Nodes: 144
## Using MST with igraph layout (edges >= 100 )
##
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 144
## Unique node IDs: 144
## Unique labels: 144
## MST - Edges: 143 | Nodes: 144
## Using igraph layout (MST edges >= 100)
reference_inv4m_edges <- reference_edges %>%
filter(from %in% inv4m_regulators)
novel_inv4m_edges <- novel_edges %>%
filter(from %in% inv4m_regulators)
ref_inv4m_nodes <- unique(c(reference_inv4m_edges$from,
reference_inv4m_edges$to))
novel_inv4m_nodes <- unique(c(novel_inv4m_edges$from, novel_inv4m_edges$to))
novel_only_nodes <- setdiff(novel_inv4m_nodes, ref_inv4m_nodes)
novel_only_edges <- novel_inv4m_edges %>%
filter(from %in% novel_only_nodes & to %in% novel_only_nodes)
visualize_trans_network(
edges = novel_only_edges,
effects_df = effects_df,
variant = "full",
module_name = "NOVEL-ONLY (Inv4m, excluding reference nodes)",
edge_threshold = 100,
font_size = 18,
n_samples = n_samples
)
##
## === NOVEL-ONLY (Inv4m, excluding reference nodes) - Full Trans Co-expression Network ===
## Edges: 558
## Nodes: 49
## Using MST with igraph layout (edges >= 100 )
##
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 49
## Unique node IDs: 49
## Unique labels: 49
## MST - Edges: 48 | Nodes: 49
## Using visNetwork with default physics (MST edges < 100)
visualize_trans_network(
edges = bipartite_edges,
effects_df = effects_df,
variant = "flanking",
module_name = "Trans Coexpression",
n_samples = n_samples
)
##
## === Trans Coexpression - Flanking Trans Co-expression Network ===
## Edges: 2470
## Nodes: 107
## Using MST with igraph layout (edges >= 100 )
##
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 107
## Unique node IDs: 107
## Unique labels: 107
## MST - Edges: 106 | Nodes: 107
## Using igraph layout (MST edges >= 100)
visualize_trans_network(
edges = bipartite_edges,
effects_df = effects_df,
variant = "inv4m",
module_name = "Trans Coexpression",
n_samples = n_samples
)
##
## === Trans Coexpression - Inv4m Trans Co-expression Network ===
## Edges: 1299
## Nodes: 74
## Using MST with igraph layout (edges >= 100 )
##
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 74
## Unique node IDs: 74
## Unique labels: 74
## MST - Edges: 73 | Nodes: 74
## Using visNetwork with default physics (MST edges < 100)
targets_with_flanking <- bipartite_edges %>%
filter(from %in% flanking_regulators) %>%
pull(to) %>%
unique()
edges_no_flanking_targets <- bipartite_edges %>%
filter(
from %in% inv4m_regulators,
!to %in% targets_with_flanking
)
visualize_trans_network(
edges = edges_no_flanking_targets,
effects_df = effects_df,
variant = "full",
module_name = "Inv4m → No Flanking Connections",
edge_threshold = 100,
font_size = 18,
n_samples = n_samples
)
##
## === Inv4m → No Flanking Connections - Full Trans Co-expression Network ===
## Edges: 0
## No edges for this variant
## NULL
strongest_edge_per_target <- bipartite_edges %>%
mutate(abs_r = abs(r)) %>%
group_by(to) %>%
slice_max(abs_r, n = 1, with_ties = FALSE) %>%
ungroup()
targets_strongest_nonflanking <- strongest_edge_per_target %>%
filter(!from %in% flanking_regulators) %>%
pull(to)
edges_strongest_nonflanking_inv4m <- bipartite_edges %>%
filter(
to %in% targets_strongest_nonflanking,
from %in% inv4m_regulators
)
visualize_trans_network(
edges = edges_strongest_nonflanking_inv4m,
effects_df = effects_df,
variant = "full",
module_name = "Inv4m → Strongest Regulator Not Flanking",
edge_threshold = 100,
font_size = 18,
n_samples = n_samples
)
##
## === Inv4m → Strongest Regulator Not Flanking - Full Trans Co-expression Network ===
## Edges: 460
## Nodes: 50
## Using MST with igraph layout (edges >= 100 )
##
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 50
## Unique node IDs: 50
## Unique labels: 50
## MST - Edges: 49 | Nodes: 50
## Using visNetwork with default physics (MST edges < 100)
visualize_trans_network(
edges = novel_edges,
effects_df = effects_df,
variant = "full",
module_name = "NOVEL",
n_samples = n_samples
)
##
## === NOVEL - Full Trans Co-expression Network ===
## Edges: 3673
## Nodes: 144
## Using MST with igraph layout (edges >= 100 )
##
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 144
## Unique node IDs: 144
## Unique labels: 144
## MST - Edges: 143 | Nodes: 144
## Using igraph layout (MST edges >= 100)
visualize_trans_network(
edges = novel_edges,
effects_df = effects_df,
variant = "flanking",
module_name = "NOVEL",
n_samples = n_samples
)
##
## === NOVEL - Flanking Trans Co-expression Network ===
## Edges: 2411
## Nodes: 107
## Using MST with igraph layout (edges >= 100 )
##
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 107
## Unique node IDs: 107
## Unique labels: 107
## MST - Edges: 106 | Nodes: 107
## Using igraph layout (MST edges >= 100)
visualize_trans_network(
edges = novel_edges,
effects_df = effects_df,
variant = "inv4m",
module_name = "NOVEL",
n_samples = n_samples
)
##
## === NOVEL - Inv4m Trans Co-expression Network ===
## Edges: 1262
## Nodes: 74
## Using MST with igraph layout (edges >= 100 )
##
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 74
## Unique node IDs: 74
## Unique labels: 74
## MST - Edges: 73 | Nodes: 74
## Using visNetwork with default physics (MST edges < 100)
visualize_trans_network(
edges = reference_edges,
effects_df = effects_df,
variant = "full",
module_name = "REFERENCE",
n_samples = n_samples
)
##
## === REFERENCE - Full Trans Co-expression Network ===
## Edges: 96
## Nodes: 49
## Using MST with visNetwork physics (edges < 100)
##
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 49
## Unique node IDs: 49
## Unique labels: 49
## MST - Edges: 47 | Nodes: 49
## Using visNetwork with default physics (MST edges < 100)
visualize_trans_network(
edges = reference_edges,
effects_df = effects_df,
variant = "flanking",
module_name = "REFERENCE",
n_samples = n_samples
)
##
## === REFERENCE - Flanking Trans Co-expression Network ===
## Edges: 59
## Nodes: 32
## Using MST with visNetwork physics (edges < 100)
##
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 32
## Unique node IDs: 32
## Unique labels: 32
## MST - Edges: 29 | Nodes: 32
## Using visNetwork with default physics (MST edges < 100)
visualize_trans_network(
edges = reference_edges,
effects_df = effects_df,
variant = "inv4m",
module_name = "REFERENCE",
n_samples = n_samples
)
##
## === REFERENCE - Inv4m Trans Co-expression Network ===
## Edges: 37
## Nodes: 25
## Using MST with visNetwork physics (edges < 100)
##
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 25
## Unique node IDs: 25
## Unique labels: 25
## MST - Edges: 24 | Nodes: 25
## Using visNetwork with default physics (MST edges < 100)
reference_genes <- unique(c(reference_edges$from, reference_edges$to))
novel_genes <- unique(c(novel_edges$from, novel_edges$to))
reference_gene_info <- effects_df %>%
filter(gene %in% reference_genes, predictor == "Inv4m") %>%
dplyr::select(gene, locus_label, desc_merged, logFC, adj.P.Val) %>%
distinct() %>%
mutate(gene_location = ifelse(gene %in% cis_genes, "cis", "trans"))
write.csv(reference_gene_info,
"~/Desktop/reference_trans_network_genes.csv",
row.names = FALSE)
write.csv(reference_edges,
"~/Desktop/reference_trans_network_edges.csv",
row.names = FALSE)
novel_gene_info <- effects_df %>%
filter(gene %in% novel_genes, predictor == "Inv4m") %>%
dplyr::select(gene, locus_label, desc_merged, logFC, adj.P.Val) %>%
distinct() %>%
mutate(gene_location = ifelse(gene %in% cis_genes, "cis", "trans"))
write.csv(novel_gene_info,
"~/Desktop/novel_trans_network_genes.csv",
row.names = FALSE)
write.csv(novel_edges,
"~/Desktop/novel_trans_network_edges.csv",
row.names = FALSE)
cat("Network modules exported to ~/Desktop/\n")
## Network modules exported to ~/Desktop/
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] visNetwork_2.1.4 igraph_2.2.1 rtracklayer_1.70.0
## [4] GenomicRanges_1.62.0 Seqinfo_1.0.0 IRanges_2.44.0
## [7] S4Vectors_0.48.0 BiocGenerics_0.56.0 generics_0.1.4
## [10] tidyr_1.3.1 dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] SummarizedExperiment_1.40.0 rjson_0.2.23
## [3] xfun_0.54 bslib_0.9.0
## [5] htmlwidgets_1.6.4 Biobase_2.70.0
## [7] lattice_0.22-7 vctrs_0.6.5
## [9] tools_4.5.1 bitops_1.0-9
## [11] curl_7.0.0 parallel_4.5.1
## [13] tibble_3.3.0 pkgconfig_2.0.3
## [15] Matrix_1.7-4 cigarillo_1.0.0
## [17] lifecycle_1.0.4 stringr_1.6.0
## [19] compiler_4.5.1 Rsamtools_2.26.0
## [21] Biostrings_2.78.0 statmod_1.5.1
## [23] codetools_0.2-20 htmltools_0.5.8.1
## [25] sass_0.4.10 RCurl_1.98-1.17
## [27] yaml_2.3.10 pillar_1.11.1
## [29] crayon_1.5.3 jquerylib_0.1.4
## [31] BiocParallel_1.44.0 limma_3.66.0
## [33] cachem_1.1.0 DelayedArray_0.36.0
## [35] abind_1.4-8 tidyselect_1.2.1
## [37] digest_0.6.39 stringi_1.8.7
## [39] purrr_1.2.0 restfulr_0.0.16
## [41] fastmap_1.2.0 grid_4.5.1
## [43] cli_3.6.5 SparseArray_1.10.2
## [45] magrittr_2.0.4 S4Arrays_1.10.0
## [47] XML_3.99-0.20 withr_3.0.2
## [49] rmarkdown_2.30 XVector_0.50.0
## [51] httr_1.4.7 matrixStats_1.5.0
## [53] evaluate_1.0.5 knitr_1.50
## [55] BiocIO_1.20.0 rlang_1.1.6
## [57] glue_1.8.0 rstudioapi_0.17.1
## [59] jsonlite_2.0.0 R6_2.6.1
## [61] MatrixGenerics_1.22.0 GenomicAlignments_1.46.0