Overview

Purpose: Identify and characterize trans-regulatory networks mediated by genes within the Inv4m introgression region. This analysis integrates differential expression results with co-expression data from MaizeNetome to construct regulatory networks.

Approach: 1. Define genomic regions (Inv4m inversion, flanking introgression, background) 2. Test for enrichment of DEGs within cis vs trans regions 3. Query MaizeNetome for reference co-expression edges involving introgressed genes 4. Calculate bipartite cis-trans co-expression network from expression data 5. Classify edges as Reference (in MaizeNetome) or Novel (not in MaizeNetome) 6. Visualize networks with igraph/visNetwork

Expected outcomes: - Identification of trans-regulatory hubs within Inv4m - Network visualization showing regulatory relationships - Statistical enrichment of DEGs by genomic region (cis vs trans) - Output files for downstream analysis and manuscript figures

Setup and Dependencies

library(dplyr)
library(tidyr)
library(rtracklayer)
library(GenomicRanges)
library(igraph)
library(visNetwork)

Helper Functions

#' Calculate mutual information from correlation coefficient
#'
#' Converts Pearson correlation to mutual information estimate.
#'
#' @param r Correlation coefficient
#'
#' @return Mutual information estimate
mutual_info <- function(r) {
  0.5 * log2(1 / (1 - r^2))
}

#' Calculate robust distance metric
#'
#' Distance metric from https://doi.org/10.1186/s12859-023-05161-y
#'
#' @param r Correlation coefficient
#' @param n Sample size
#'
#' @return Robust distance metric
robust_dist <- function(r, n) {
  sqrt(1 - abs(r))
}

#' Calculate standardized euclidean distance for MST construction
#'
#' For standardized data this is the euclidean distance between vectors
#' https://cmci.colorado.edu/classes/INFO-1301/files/borgatti.htm
#' Approach follows Chenxin Li: 
#' https://github.com/cxli233/SimpleTidy_GeneCoEx#z-score
#'
#' @param r_scaled Correlation coefficient from scaled expression data
#' @param n Sample size
#'
#' @return Standardized euclidean distance
std_dist <- function(r_scaled, n) {
  sqrt(2 * n *(1 -  abs(r_scaled)))
}

#' Format nodes for network visualization
#'
#' Creates standardized node data frame with region colors, labels, and 
#' styling.
#'
#' @param node_ids Character vector of gene IDs
#' @param effects_df Data frame with gene expression data
#' @param custom_labels Named vector of custom gene labels
#'
#' @return Data frame with formatted node attributes
format_network_nodes <- function(node_ids, effects_df, 
                                 custom_labels = NULL) {
  nodes <- data.frame(id = node_ids) %>%
    left_join(
      effects_df %>%
        group_by(gene) %>%
        dplyr::slice(1) %>%
        ungroup() %>%
        dplyr::select(gene, locus_label, logFC),
      by = c("id" = "gene")
    ) %>%
    mutate(
      region = case_when(
        id %in% inv4m_regulators ~ "inv4m",
        id %in% flanking_regulators ~ "flank",
        TRUE ~ "out"
      ),
      upregulated = logFC > 0,
      color.border = case_when(
        region == "inv4m" ~ "#551A8B",
        region == "flank" ~ "#FF1493",
        region == "out" ~ "#FFD700"
      ),
      color.background = case_when(
        upregulated & region == "inv4m" ~ "#551A8B",
        upregulated & region == "flank" ~ "#FF1493",
        upregulated & region == "out" ~ "#FFD700",
        !upregulated ~ "white"
      ),
      borderWidth = 4,
      label = coalesce(locus_label, id)
    )
  
  dup_labels <- nodes %>%
    filter(!is.na(locus_label), locus_label != "") %>%
    group_by(locus_label) %>%
    filter(n() > 1) %>%
    pull(id)
  
  if (length(dup_labels) > 0) {
    nodes <- nodes %>%
      mutate(label = ifelse(
        id %in% dup_labels,
        paste0(locus_label, " (", id, ")"),
        label
      ))
  }
  
  if (!is.null(custom_labels) && any(names(custom_labels) %in% nodes$id)) {
    cat("Custom labels found for nodes:", 
        sum(names(custom_labels) %in% nodes$id), "\n")
    for (gene_id in names(custom_labels)) {
      if (gene_id %in% nodes$id) {
        nodes$label[nodes$id == gene_id] <- custom_labels[gene_id]
      }
    }
  }
  
  nodes$label <- paste0("<i>", nodes$label, "</i>")
  
  nodes
}

#' Create MST visualization from edge list
#'
#' Builds minimum spanning tree and creates interactive visNetwork plot.
#'
#' @param edges Data frame with from, to, r, corr_std, robust_dist, std_dist, 
#'   mi columns
#' @param nodes Data frame with formatted node attributes
#' @param effects_df Data frame with logFC for arrow type calculation
#' @param layout igraph layout algorithm name (used if edges >= 100)
#' @param font_size Node label font size
#'
#' @return visNetwork htmlwidget
visualize_mst <- function(edges, nodes, effects_df, 
                          layout = "layout_nicely", font_size = 20) {
  
  cat("\n=== DIAGNOSTIC: Checking node labels ===\n")
  cat("Total nodes:", nrow(nodes), "\n")
  cat("Unique node IDs:", length(unique(nodes$id)), "\n")
  cat("Unique labels:", length(unique(nodes$label)), "\n")
  
  if (any(duplicated(nodes$label))) {
    dup_labels_vec <- nodes$label[duplicated(nodes$label)]
    cat("DUPLICATE LABELS FOUND:", length(dup_labels_vec), "\n")
    cat("Duplicates:\n")
    
    dup_info <- nodes %>% 
      filter(label %in% dup_labels_vec) %>% 
      dplyr::select(id, locus_label, label) %>%
      arrange(label)
    
    for (i in 1:nrow(dup_info)) {
      cat(sprintf("  %s | locus_label: %s | label: %s\n", 
                  dup_info$id[i], 
                  ifelse(is.na(dup_info$locus_label[i]), "NA", 
                         dup_info$locus_label[i]),
                  dup_info$label[i]))
    }
    cat("\n")
    
    stop("Cannot proceed with duplicate labels")
  }
  
  g <- graph_from_data_frame(
    d = edges %>% dplyr::select(from, to, r, corr_std, robust_dist, 
                                 std_dist, mi),
    directed = TRUE,
    vertices = nodes
  )
  
  g_mst <- mst(g, weights = E(g)$std_dist)
  
  cat("MST - Edges:", length(E(g_mst)), "| Nodes:", length(V(g_mst)), "\n")
  
  mst_edges <- igraph::as_data_frame(g_mst, what = "edges") %>%
    left_join(
      effects_df %>% 
        group_by(gene) %>%
        dplyr::slice(1) %>%
        ungroup() %>%
        dplyr::select(gene, logFC_from = logFC),
      by = c("from" = "gene")
    ) %>%
    left_join(
      effects_df %>% 
        group_by(gene) %>%
        dplyr::slice(1) %>%
        ungroup() %>%
        dplyr::select(gene, logFC_to = logFC),
      by = c("to" = "gene")
    ) %>%
    mutate(
      is_concordant = sign(logFC_from) == sign(logFC_to),
      width = mi
    )
  
  g_mst_vis <- g_mst %>%
    set_edge_attr(
      "arrows.to.type", 
      value = ifelse(mst_edges$is_concordant, "arrow", "bar")
    ) %>%
    set_edge_attr("width", value = mst_edges$width) %>%
    set_edge_attr("color", value = "DarkGray")
  
  V(g_mst_vis)$name <- V(g_mst_vis)$label
  
  if (length(E(g_mst)) < 100) {
    cat("Using visNetwork with default physics (MST edges < 100)\n")
    
    # Filter nodes to only those in the MST
    mst_node_ids <- unique(c(mst_edges$from, mst_edges$to))
    mst_nodes <- nodes %>% filter(id %in% mst_node_ids)
    
    mst_edges_vis <- mst_edges %>%
      mutate(
        arrows.to.type = ifelse(is_concordant, "arrow", "bar"),
        color = "DarkGray"
      ) %>%
      dplyr::select(from, to, width, arrows.to.type, color)
    
    visNetwork(mst_nodes, mst_edges_vis) %>%
      visNodes(font = list(size = font_size, multi = "html")) %>%
      visEdges(arrows = "to") %>%
      visOptions(
        highlightNearest = list(enabled = TRUE, degree = 1, hover = TRUE),
        nodesIdSelection = TRUE
      )
  } else {
    cat("Using igraph layout (MST edges >= 100)\n")
    visIgraph(g_mst_vis) %>%
      visNodes(font = list(size = font_size, multi = "html")) %>%
      visIgraphLayout(layout = layout) %>%
      visOptions(
        highlightNearest = list(enabled = TRUE, degree = 1, hover = TRUE),
        nodesIdSelection = TRUE
      )
  }
}

#' Visualize full network (non-MST)
#'
#' Creates interactive network visualization for smaller networks.
#'
#' @param edges Data frame with from, to, mi columns
#' @param nodes Data frame with formatted node attributes
#' @param effects_df Data frame with logFC for arrow type calculation
#' @param font_size Node label font size
#'
#' @return visNetwork htmlwidget
visualize_full_network <- function(edges, nodes, effects_df, font_size = 20) {
  
  edge_df <- edges %>%
    left_join(
      effects_df %>% 
        group_by(gene) %>%
        dplyr::slice(1) %>%
        ungroup() %>%
        dplyr::select(gene, logFC_from = logFC),
      by = c("from" = "gene")
    ) %>%
    left_join(
      effects_df %>% 
        group_by(gene) %>%
        dplyr::slice(1) %>%
        ungroup() %>%
        dplyr::select(gene, logFC_to = logFC),
      by = c("to" = "gene")
    ) %>%
    mutate(
      is_concordant = sign(logFC_from) == sign(logFC_to),
      arrows.to.type = ifelse(is_concordant, "arrow", "bar"),
      color = "DarkGray",
      width = mi
    ) %>%
    dplyr::select(from, to, width, arrows.to.type, color)
  
  cat("Full Network - Edges:", nrow(edge_df), "| Nodes:", nrow(nodes), "\n")
  
  visNetwork(nodes, edge_df) %>%
    visNodes(font = list(size = font_size, multi = "html")) %>%
    visEdges(arrows = "to") %>%
    visOptions(
      highlightNearest = list(enabled = TRUE, degree = 1, hover = TRUE),
      nodesIdSelection = TRUE
    )
}

#' Main network visualization wrapper
#'
#' Filters edges by variant, prepares nodes, and calls appropriate 
#' visualization.
#'
#' @param edges Data frame with from, to, r, corr_std columns 
#'   (bipartite: cis → trans)
#' @param effects_df Data frame with DEG statistics
#' @param variant Character: "full", "flanking", or "inv4m"
#' @param module_name Character: network title prefix
#' @param custom_labels Named vector of custom gene labels
#' @param edge_threshold Threshold for switching to MST
#' @param font_size Font size for labels
#' @param n_samples Sample size for distance calculation
#' @param use_mst_below_100 If TRUE, use MST for networks < 100 edges; 
#'   if FALSE, show full network
#'
#' @return visNetwork htmlwidget
visualize_trans_network <- function(edges, 
                                    effects_df,
                                    variant = c("full", "flanking", "inv4m"),
                                    module_name = "Network",
                                    custom_labels = NULL,
                                    edge_threshold = 100,
                                    font_size = 20,
                                    n_samples,
                                    use_mst_below_100 = TRUE) {
  variant <- match.arg(variant)
  
  filtered_edges <- switch(
    variant,
    "full" = edges,
    "flanking" = edges %>% filter(from %in% flanking_regulators),
    "inv4m" = edges %>% filter(from %in% inv4m_regulators)
  )
  
  variant_label <- switch(
    variant,
    "full" = "Full Trans Co-expression",
    "flanking" = "Flanking Trans Co-expression",
    "inv4m" = "Inv4m Trans Co-expression"
  )
  
  title <- paste(module_name, "-", variant_label, "Network")
  
  cat("\n===", title, "===\n")
  cat("Edges:", nrow(filtered_edges), "\n")
  
  if (nrow(filtered_edges) == 0) {
    cat("No edges for this variant\n")
    return(NULL)
  }
  
  node_ids <- unique(c(filtered_edges$from, filtered_edges$to))
  nodes <- format_network_nodes(node_ids, effects_df, custom_labels)
  
  cat("Nodes:", nrow(nodes), "\n")
  
  prepared_edges <- filtered_edges %>%
    mutate(
      robust_dist = robust_dist(r, n_samples),
      std_dist = std_dist(corr_std, n_samples),
      mi = mutual_info(r)
    )
  
  if (nrow(filtered_edges) < 100) {
    if (use_mst_below_100) {
      cat("Using MST with visNetwork physics (edges < 100)\n")
      visualize_mst(prepared_edges, nodes, effects_df, font_size = font_size)
    } else {
      cat("Showing full network with physics (edges < 100)\n")
      visualize_full_network(prepared_edges, nodes, effects_df, 
                             font_size = font_size)
    }
  } else if (nrow(filtered_edges) >= edge_threshold) {
    cat("Using MST with igraph layout (edges >=", edge_threshold, ")\n")
    visualize_mst(prepared_edges, nodes, effects_df, font_size = font_size)
  } else {
    cat("Showing full network\n")
    visualize_full_network(prepared_edges, nodes, effects_df, 
                           font_size = font_size)
  }
}

Define Genomic Regions

v5_gff <- "~/ref/zea/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.59.chr.gff3"
v5 <- rtracklayer::import(v5_gff) %>%
  subset(type == "gene" & seqnames %in% 1:10)

genes <- as.data.frame(v5)
genes$ID <- gsub("gene:", "", genes$ID)

cat("Total genes loaded:", nrow(genes), "\n")

## Total genes loaded: 43459

inv4m_start <- genes[genes$ID == "Zm00001eb190470", "start"]
inv4m_end <- genes[genes$ID == "Zm00001eb194800", "end"]

cat("Inv4m region:", inv4m_start, "-", inv4m_end, "\n")

## Inv4m region: 172883675 - 188132113

inv4m_gene_ids <- genes %>%
  filter(
    seqnames == 4,
    start >= inv4m_start,
    end <= inv4m_end
  ) %>% 
  pull(ID)

cat("Genes in Inv4m:", length(inv4m_gene_ids), "\n")

## Genes in Inv4m: 434

introgression_start <- 157012149
introgression_end <- 195900523

cis_introgression_gene_ids <- genes %>%
  filter(
    seqnames == 4,
    start >= introgression_start,
    end <= introgression_end
  ) %>% 
  pull(ID)

cat("Genes in cis introgression:", length(cis_introgression_gene_ids), "\n")

## Genes in cis introgression: 1099

flanking_introgression_gene_ids <- cis_introgression_gene_ids[
  !cis_introgression_gene_ids %in% inv4m_gene_ids
]

cat("Genes in flanking regions:", length(flanking_introgression_gene_ids), 
    "\n")

## Genes in flanking regions: 665

Load Differential Expression Results

effects_df <- read.csv(
  "~/Desktop/predictor_effects_leaf_interaction_model.csv", 
  na.strings = c("NA", NA, "")
) %>%
  filter(predictor == "Inv4m")

curated <- read.csv("~/Desktop/selected_DEGs_curated_locus_label_2.csv", 
                    na.strings = c("NA", NA, "")) %>%
  dplyr::select(gene, locus_label)

effects_df$locus_label[effects_df$locus_label == "A0A1D6NG77"] <- NA

effects_df <- effects_df %>%
  left_join(curated, by = "gene", suffix = c("", "_curated")) %>%
  mutate(locus_label = coalesce(locus_label_curated, locus_label)) %>%
  dplyr::select(-locus_label_curated)

curated_updates <- tibble::tribble(
  ~gene,              ~locus_label,
  "Zm00001eb384770",  "zcn26",
  "Zm00001eb192330",  "roc4-1",
  "Zm00001eb193260",  "trxl2",
  "Zm00001eb194300",  "paspa",
  "Zm00001eb213980",  "dywl",
  "Zm00001eb194080",  "ugt85a2",
  "Zm00001eb112470",  "engd2",
  "Zm00001eb332450",  "roc4-2",
  "Zm00001eb193270",  "trxl5",
  "Zm00001eb066620",  "tipin",
  "Zm00001eb241410",  "etfa",
  "Zm00001eb013800",  "mrlk",
  "Zm00001eb249540",  "prpl29",
  "Zm00001eb000540",  "prd3",
  "Zm00001eb060540",  "actin-1",
  "Zm00001eb192580",  "map4k8",
  "Zm00001eb193120",  "o3l1",
  "Zm00001eb043750",  "cdrk9",
  "Zm00001eb034810",  "mgd2",
  "Zm00001eb241920",  "gpx1",
  "Zm00001eb162710",  "spx4",
  "Zm00001eb038730",  "pht7",
  "Zm00001eb370610",  "rfk1",
  "Zm00001eb386270",  "spx6",
  "Zm00001eb335670",  "sqd3",
  "Zm00001eb151650",  "pap1",
  "Zm00001eb154650",  "ppa1",
  "Zm00001eb280120",  "pfk1",
  "Zm00001eb036910",  "gpx3",
  "Zm00001eb406610",  "glk4",
  "Zm00001eb048730",  "spx2",
  "Zm00001eb361620",  "ppa2",
  "Zm00001eb297970",  "sqd2",
  "Zm00001eb347070",  "sqd1",
  "Zm00001eb144680",  "rns1",
  "Zm00001eb430590",  "nrx3",
  "Zm00001eb247580",  "ppck3",
  "Zm00001eb351780",  "ugp3",
  "Zm00001eb222510",  "pht1",
  "Zm00001eb126380",  "phos1",
  "Zm00001eb047070",  "pht2",
  "Zm00001eb041390",  "rns3",
  "Zm00001eb116580",  "spd1",
  "Zm00001eb069630",  "oct4",
  "Zm00001eb083520",  "dgd1",
  "Zm00001eb130570",  "sag21",
  "Zm00001eb239700",  "ppa3",  
  "Zm00001eb258130",  "mgd3",
  "Zm00001eb202100",  "pap14",
  "Zm00001eb144670",  "rns2",
  "Zm00001eb087720",  "pht13",
  "Zm00001eb277280",  "gst19",
  "Zm00001eb339870",  "pld16",
  "Zm00001eb289800",  "pah1",
  "Zm00001eb369590",  "nrx1",
  "Zm00001eb238670",  "pep2"
)

effects_df <- effects_df %>%
  left_join(curated_updates, by = "gene", suffix = c("", "_new")) %>%
  mutate(locus_label = coalesce(locus_label_new, locus_label)) %>%
  dplyr::select(-locus_label_new)

field_degs <- effects_df %>% 
  filter(is_hiconf_DEG) %>% 
  pull(gene) %>% 
  unique()

cat("Total field DEGs:", length(field_degs), "\n")

## Total field DEGs: 165

DEG Enrichment by Genomic Region

cis_trans_table <- with(effects_df, table(in_cis, is_DEG))
cis_trans_test <- fisher.test(cis_trans_table)

cat("=== CIS VS TRANS ENRICHMENT ===\n")

## === CIS VS TRANS ENRICHMENT ===

print(cis_trans_table)

##        is_DEG
## in_cis  FALSE  TRUE
##   FALSE 22932   475
##   TRUE    307   303

cat("\nFisher's p-value:", format(cis_trans_test$p.value, scientific = TRUE), 
    "\n")

## 
## Fisher's p-value: 1.688983e-301

cat("Odds ratio:", round(cis_trans_test$estimate, 3), "\n\n")

## Odds ratio: 47.613

inv4m_trans_table <- with(
  effects_df %>% filter(!in_flank),
  table(in_Inv4m, is_DEG)
)
inv4m_trans_test <- fisher.test(inv4m_trans_table)

cat("=== INV4M VS TRANS ENRICHMENT ===\n")

## === INV4M VS TRANS ENRICHMENT ===

print(inv4m_trans_table)

##         is_DEG
## in_Inv4m FALSE  TRUE
##    FALSE 22932   475
##    TRUE    125   130

cat("\nFisher's p-value:", format(inv4m_trans_test$p.value, scientific = TRUE), 
    "\n")

## 
## Fisher's p-value: 9.83221e-140

cat("Odds ratio:", round(inv4m_trans_test$estimate, 3), "\n\n")

## Odds ratio: 50.163

flanking_trans_table <- with(
  effects_df %>% filter(in_flank | in_trans),
  table(in_flank, is_DEG)
)
flanking_trans_test <- fisher.test(flanking_trans_table)

cat("=== FLANKING VS TRANS ENRICHMENT ===\n")

## === FLANKING VS TRANS ENRICHMENT ===

print(flanking_trans_table)

##         is_DEG
## in_flank FALSE  TRUE
##    FALSE 22932   475
##    TRUE    182   173

cat("\nFisher's p-value:", format(flanking_trans_test$p.value, 
                                   scientific = TRUE), "\n")

## 
## Fisher's p-value: 3.080185e-178

cat("Odds ratio:", round(flanking_trans_test$estimate, 3), "\n\n")

## Odds ratio: 45.853

inv4m_flanking_table <- with(
  effects_df %>% filter(in_cis),
  table(in_Inv4m, is_DEG)
)
inv4m_flanking_test <- fisher.test(inv4m_flanking_table)

cat("=== INV4M VS FLANKING ENRICHMENT ===\n")

## === INV4M VS FLANKING ENRICHMENT ===

print(inv4m_flanking_table)

##         is_DEG
## in_Inv4m FALSE TRUE
##    FALSE   182  173
##    TRUE    125  130

cat("\nFisher's p-value:", format(inv4m_flanking_test$p.value, 
                                   scientific = TRUE), "\n")

## 
## Fisher's p-value: 6.225037e-01

cat("Odds ratio:", round(inv4m_flanking_test$estimate, 3), "\n\n")

## Odds ratio: 1.094

Load MaizeNetome Reference Data

left_flanking_maizenetome <- read.csv(
  "~/Desktop/inv4m/coexpression/left_MaizeNetome_coexpression_edges.csv"
)

Inv4m_maizenetome <- read.csv(
  "~/Desktop/inv4m/coexpression/inv4m_MaizeNetome_coexpression_edges.csv"
)

right_flanking_maizenetome <- read.csv(
  "~/Desktop/inv4m/coexpression/right_MaizeNetome_coexpression_edges.csv"
)

MaizeNetome <- rbind(
  left_flanking_maizenetome %>%
    separate_longer_delim(cols = Direct.Connetation.Node, delim = "\n") %>%
    mutate(network = "coexpression"),
  Inv4m_maizenetome %>%
    separate_longer_delim(cols = Direct.Connetation.Node, delim = "\n") %>%
    mutate(network = "coexpression"),
  right_flanking_maizenetome %>%
    separate_longer_delim(cols = Direct.Connetation.Node, delim = "\n") %>%
    mutate(network = "coexpression")
) %>% 
  dplyr::rename(Direct.Connection = Direct.Connetation.Node)

cat("Total MaizeNetome edges:", nrow(MaizeNetome), "\n")

## Total MaizeNetome edges: 640599

Convert Gene IDs from v4 to v5

v4_v5 <- read.table(
  file = "/Users/fvrodriguez/Desktop/B73_gene_xref/B73v4_to_B73v5.tsv", 
  sep = "\t", 
  header = FALSE
) %>%
  dplyr::rename(v4 = V1, v5 = V2) %>%
  separate_longer_delim(cols = v5, delim = ",")

cat("v4 to v5 mappings:", nrow(v4_v5), "\n")

## v4 to v5 mappings: 43680

MaizeNetome_v5 <- MaizeNetome %>%
  left_join(v4_v5, by = c("Element.ID" = "v4")) %>%
  dplyr::rename(Element.ID.v5 = v5) %>%
  filter(!is.na(Element.ID.v5))

MaizeNetome_v5 <- MaizeNetome_v5 %>%
  left_join(v4_v5, by = c("Direct.Connection" = "v4")) %>%
  dplyr::rename(Direct.Connection.v5 = v5) %>%
  filter(!is.na(Direct.Connection.v5))

cat("Edges after v4->v5 conversion:", nrow(MaizeNetome_v5), "\n")

## Edges after v4->v5 conversion: 767533

Define Cis and Trans Gene Sets

cis_degs <- effects_df %>%
  filter(in_cis == TRUE, is_hiconf_DEG == TRUE) %>%
  arrange(adj.P.Val) %>%
  dplyr::select(gene:desc_merged, logFC, adj.P.Val) %>%
  inner_join(v4_v5, by = c(gene = "v5")) %>%
  group_by(gene) %>%
  arrange(adj.P.Val)

trans_degs <- effects_df %>%
  filter(in_trans == TRUE, is_hiconf_DEG == TRUE) %>%
  arrange(adj.P.Val) %>%
  dplyr::select(gene:desc_merged, logFC, adj.P.Val) %>%
  inner_join(v4_v5, by = c(gene = "v5")) %>%
  group_by(gene) %>%
  arrange(adj.P.Val)

cat("Cis genes:", length(unique(cis_degs$gene)), "\n")

## Cis genes: 107

cat("Trans genes:", length(unique(trans_degs$gene)), "\n")

## Trans genes: 39

Extract Reference Edges

cis_genes_all <- cis_degs$gene %>% sort() %>% unique()
trans_genes_all <- trans_degs$gene %>% sort() %>% unique()

inv4m_regulators <- cis_genes_all[cis_genes_all %in% inv4m_gene_ids]
flanking_regulators <- cis_genes_all[
  cis_genes_all %in% flanking_introgression_gene_ids
]

cat("Inv4m regulators:", length(inv4m_regulators), "\n")

## Inv4m regulators: 37

cat("Flanking regulators:", length(flanking_regulators), "\n")

## Flanking regulators: 70

reference_edges_maizenetome <- MaizeNetome_v5 %>%
  filter(
    Element.ID.v5 %in% cis_genes_all,
    Direct.Connection.v5 %in% trans_genes_all
  ) %>%
  dplyr::rename(
    cis_gene = Element.ID.v5,
    trans_gene = Direct.Connection.v5
  ) %>%
  dplyr::select(cis_gene, trans_gene, network)

cat("Reference edges from MaizeNetome:", nrow(reference_edges_maizenetome), 
    "\n")

## Reference edges from MaizeNetome: 103

Calculate Bipartite Trans Co-expression Network

voomR <- readRDS("~/Desktop/normalized_expression_voom_object.rda")

n_samples <- ncol(voomR$E)

cat("Expression matrix:", nrow(voomR$E), "genes x", ncol(voomR$E), 
    "samples\n")

## Expression matrix: 24249 genes x 43 samples

linked <- read.table("inv4m_linked_genes_to_filter.txt", header = TRUE)

cat("LD-linked genes loaded:", nrow(linked), "\n")

## LD-linked genes loaded: 58

trans_genes_filtered <- trans_genes_all[!trans_genes_all %in% linked$gene_id]

cat("Trans genes before LD filtering:", length(trans_genes_all), "\n")

## Trans genes before LD filtering: 39

cat("Trans genes after LD filtering:", length(trans_genes_filtered), "\n")

## Trans genes after LD filtering: 37

reference_edges_maizenetome <- reference_edges_maizenetome %>%
  filter(trans_gene %in% trans_genes_filtered)

cat("Reference edges after LD filtering:", nrow(reference_edges_maizenetome), 
    "\n")

## Reference edges after LD filtering: 103

cis_genes <- cis_genes_all[cis_genes_all %in% rownames(voomR$E)]
trans_genes <- trans_genes_filtered[trans_genes_filtered %in% rownames(voomR$E)]

cat("Cis genes in expression data:", length(cis_genes), "\n")

## Cis genes in expression data: 107

cat("Trans genes in expression data:", length(trans_genes), "\n")

## Trans genes in expression data: 37

# Original correlations (for visualization and robust_dist)
expr_cis <- t(voomR$E)[, cis_genes]
expr_trans <- t(voomR$E)[, trans_genes]
corr_bipartite <- cor(expr_cis, expr_trans, use = "pairwise.complete.obs")

# Scaled correlations for std_dist MST construction
# Following Chenxin Li: https://github.com/cxli233/SimpleTidy_GeneCoEx#z-score
expr_cis_scaled <- scale(expr_cis)
expr_trans_scaled <- scale(expr_trans)
corr_bipartite_scaled <- cor(expr_cis_scaled, expr_trans_scaled, 
                              use = "pairwise.complete.obs")

p_values <- matrix(NA, nrow = nrow(corr_bipartite), ncol = ncol(corr_bipartite))

for (i in 1:nrow(corr_bipartite)) {
  for (j in 1:ncol(corr_bipartite)) {
    r <- corr_bipartite[i, j]
    t_stat <- r * sqrt((n_samples - 2) / (1 - r^2))
    p_values[i, j] <- 2 * pt(-abs(t_stat), df = n_samples - 2)
  }
}

p_adj <- matrix(p.adjust(c(p_values), method = "fdr"), nrow = nrow(p_values))

corr_bipartite_filtered <- corr_bipartite
corr_bipartite_filtered[p_adj > 0.05] <- 0

cat("Significant pairs (FDR < 0.05):", sum(corr_bipartite_filtered != 0), "\n")

## Significant pairs (FDR < 0.05): 3769

edges_idx <- which(corr_bipartite_filtered != 0, arr.ind = TRUE)

bipartite_edges <- data.frame(
  from = rownames(corr_bipartite_filtered)[edges_idx[, 1]],
  to = colnames(corr_bipartite_filtered)[edges_idx[, 2]],
  r = corr_bipartite_filtered[edges_idx],
  corr_std = corr_bipartite_scaled[edges_idx]
)

cat("Bipartite network edges:", nrow(bipartite_edges), "\n")

## Bipartite network edges: 3769

Classify Edges as Reference or Novel

bipartite_edges$in_maizenetome <- mapply(
  function(f, t) {
    any(reference_edges_maizenetome$cis_gene == f & 
        reference_edges_maizenetome$trans_gene == t)
  },
  bipartite_edges$from,
  bipartite_edges$to
)

bipartite_edges$network_module <- ifelse(
  bipartite_edges$in_maizenetome,
  "Reference",
  "Novel"
)

cat("Edge classification:\n")

## Edge classification:

print(table(bipartite_edges$network_module))

## 
##     Novel Reference 
##      3673        96

reference_edges <- bipartite_edges %>% filter(network_module == "Reference")
novel_edges <- bipartite_edges %>% filter(network_module == "Novel")

cat("\nReference module edges:", nrow(reference_edges), "\n")

## 
## Reference module edges: 96

cat("Novel module edges:", nrow(novel_edges), "\n")

## Novel module edges: 3673

Visualize Whole Trans Networks

visualize_trans_network(
  edges = bipartite_edges,
  effects_df = effects_df,
  variant = "full",
  module_name = "Trans Coexpression",
  n_samples = n_samples
)

## 
## === Trans Coexpression - Full Trans Co-expression Network ===
## Edges: 3769 
## Nodes: 144 
## Using MST with igraph layout (edges >= 100 )
## 
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 144 
## Unique node IDs: 144 
## Unique labels: 144 
## MST - Edges: 143 | Nodes: 144 
## Using igraph layout (MST edges >= 100)

reference_inv4m_edges <- reference_edges %>% 
  filter(from %in% inv4m_regulators)
novel_inv4m_edges <- novel_edges %>% 
  filter(from %in% inv4m_regulators)

ref_inv4m_nodes <- unique(c(reference_inv4m_edges$from, 
                             reference_inv4m_edges$to))
novel_inv4m_nodes <- unique(c(novel_inv4m_edges$from, novel_inv4m_edges$to))
novel_only_nodes <- setdiff(novel_inv4m_nodes, ref_inv4m_nodes)

novel_only_edges <- novel_inv4m_edges %>%
  filter(from %in% novel_only_nodes & to %in% novel_only_nodes)

visualize_trans_network(
  edges = novel_only_edges,
  effects_df = effects_df,
  variant = "full",
  module_name = "NOVEL-ONLY (Inv4m, excluding reference nodes)",
  edge_threshold = 100,
  font_size = 18,
  n_samples = n_samples
)

## 
## === NOVEL-ONLY (Inv4m, excluding reference nodes) - Full Trans Co-expression Network ===
## Edges: 558 
## Nodes: 49 
## Using MST with igraph layout (edges >= 100 )
## 
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 49 
## Unique node IDs: 49 
## Unique labels: 49 
## MST - Edges: 48 | Nodes: 49 
## Using visNetwork with default physics (MST edges < 100)

visualize_trans_network(
  edges = bipartite_edges,
  effects_df = effects_df,
  variant = "flanking",
  module_name = "Trans Coexpression",
  n_samples = n_samples
)

## 
## === Trans Coexpression - Flanking Trans Co-expression Network ===
## Edges: 2470 
## Nodes: 107 
## Using MST with igraph layout (edges >= 100 )
## 
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 107 
## Unique node IDs: 107 
## Unique labels: 107 
## MST - Edges: 106 | Nodes: 107 
## Using igraph layout (MST edges >= 100)

visualize_trans_network(
  edges = bipartite_edges,
  effects_df = effects_df,
  variant = "inv4m",
  module_name = "Trans Coexpression",
  n_samples = n_samples
)

## 
## === Trans Coexpression - Inv4m Trans Co-expression Network ===
## Edges: 1299 
## Nodes: 74 
## Using MST with igraph layout (edges >= 100 )
## 
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 74 
## Unique node IDs: 74 
## Unique labels: 74 
## MST - Edges: 73 | Nodes: 74 
## Using visNetwork with default physics (MST edges < 100)

targets_with_flanking <- bipartite_edges %>%
  filter(from %in% flanking_regulators) %>%
  pull(to) %>%
  unique()

edges_no_flanking_targets <- bipartite_edges %>%
  filter(
    from %in% inv4m_regulators,
    !to %in% targets_with_flanking
  )

visualize_trans_network(
  edges = edges_no_flanking_targets,
  effects_df = effects_df,
  variant = "full",
  module_name = "Inv4m → No Flanking Connections",
  edge_threshold = 100,
  font_size = 18,
  n_samples = n_samples
)

## 
## === Inv4m → No Flanking Connections - Full Trans Co-expression Network ===
## Edges: 0 
## No edges for this variant

## NULL

strongest_edge_per_target <- bipartite_edges %>%
  mutate(abs_r = abs(r)) %>%
  group_by(to) %>%
  slice_max(abs_r, n = 1, with_ties = FALSE) %>%
  ungroup()

targets_strongest_nonflanking <- strongest_edge_per_target %>%
  filter(!from %in% flanking_regulators) %>%
  pull(to)

edges_strongest_nonflanking_inv4m <- bipartite_edges %>%
  filter(
    to %in% targets_strongest_nonflanking,
    from %in% inv4m_regulators
  )

visualize_trans_network(
  edges = edges_strongest_nonflanking_inv4m,
  effects_df = effects_df,
  variant = "full",
  module_name = "Inv4m → Strongest Regulator Not Flanking",
  edge_threshold = 100,
  font_size = 18,
  n_samples = n_samples
)

## 
## === Inv4m → Strongest Regulator Not Flanking - Full Trans Co-expression Network ===
## Edges: 460 
## Nodes: 50 
## Using MST with igraph layout (edges >= 100 )
## 
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 50 
## Unique node IDs: 50 
## Unique labels: 50 
## MST - Edges: 49 | Nodes: 50 
## Using visNetwork with default physics (MST edges < 100)

Visualize Novel Trans Networks

visualize_trans_network(
  edges = novel_edges,
  effects_df = effects_df,
  variant = "full",
  module_name = "NOVEL",
  n_samples = n_samples
)

## 
## === NOVEL - Full Trans Co-expression Network ===
## Edges: 3673 
## Nodes: 144 
## Using MST with igraph layout (edges >= 100 )
## 
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 144 
## Unique node IDs: 144 
## Unique labels: 144 
## MST - Edges: 143 | Nodes: 144 
## Using igraph layout (MST edges >= 100)

visualize_trans_network(
  edges = novel_edges,
  effects_df = effects_df,
  variant = "flanking",
  module_name = "NOVEL",
  n_samples = n_samples
)

## 
## === NOVEL - Flanking Trans Co-expression Network ===
## Edges: 2411 
## Nodes: 107 
## Using MST with igraph layout (edges >= 100 )
## 
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 107 
## Unique node IDs: 107 
## Unique labels: 107 
## MST - Edges: 106 | Nodes: 107 
## Using igraph layout (MST edges >= 100)

visualize_trans_network(
  edges = novel_edges,
  effects_df = effects_df,
  variant = "inv4m",
  module_name = "NOVEL",
  n_samples = n_samples
)

## 
## === NOVEL - Inv4m Trans Co-expression Network ===
## Edges: 1262 
## Nodes: 74 
## Using MST with igraph layout (edges >= 100 )
## 
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 74 
## Unique node IDs: 74 
## Unique labels: 74 
## MST - Edges: 73 | Nodes: 74 
## Using visNetwork with default physics (MST edges < 100)

Visualize Reference Trans Networks

visualize_trans_network(
  edges = reference_edges,
  effects_df = effects_df,
  variant = "full",
  module_name = "REFERENCE",
  n_samples = n_samples
)

## 
## === REFERENCE - Full Trans Co-expression Network ===
## Edges: 96 
## Nodes: 49 
## Using MST with visNetwork physics (edges < 100)
## 
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 49 
## Unique node IDs: 49 
## Unique labels: 49 
## MST - Edges: 47 | Nodes: 49 
## Using visNetwork with default physics (MST edges < 100)

visualize_trans_network(
  edges = reference_edges,
  effects_df = effects_df,
  variant = "flanking",
  module_name = "REFERENCE",
  n_samples = n_samples
)

## 
## === REFERENCE - Flanking Trans Co-expression Network ===
## Edges: 59 
## Nodes: 32 
## Using MST with visNetwork physics (edges < 100)
## 
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 32 
## Unique node IDs: 32 
## Unique labels: 32 
## MST - Edges: 29 | Nodes: 32 
## Using visNetwork with default physics (MST edges < 100)

visualize_trans_network(
  edges = reference_edges,
  effects_df = effects_df,
  variant = "inv4m",
  module_name = "REFERENCE",
  n_samples = n_samples
)

## 
## === REFERENCE - Inv4m Trans Co-expression Network ===
## Edges: 37 
## Nodes: 25 
## Using MST with visNetwork physics (edges < 100)
## 
## === DIAGNOSTIC: Checking node labels ===
## Total nodes: 25 
## Unique node IDs: 25 
## Unique labels: 25 
## MST - Edges: 24 | Nodes: 25 
## Using visNetwork with default physics (MST edges < 100)

Export Network Modules

reference_genes <- unique(c(reference_edges$from, reference_edges$to))
novel_genes <- unique(c(novel_edges$from, novel_edges$to))

reference_gene_info <- effects_df %>%
  filter(gene %in% reference_genes, predictor == "Inv4m") %>%
  dplyr::select(gene, locus_label, desc_merged, logFC, adj.P.Val) %>%
  distinct() %>%
  mutate(gene_location = ifelse(gene %in% cis_genes, "cis", "trans"))

write.csv(reference_gene_info, 
          "~/Desktop/reference_trans_network_genes.csv", 
          row.names = FALSE)
write.csv(reference_edges, 
          "~/Desktop/reference_trans_network_edges.csv", 
          row.names = FALSE)

novel_gene_info <- effects_df %>%
  filter(gene %in% novel_genes, predictor == "Inv4m") %>%
  dplyr::select(gene, locus_label, desc_merged, logFC, adj.P.Val) %>%
  distinct() %>%
  mutate(gene_location = ifelse(gene %in% cis_genes, "cis", "trans"))

write.csv(novel_gene_info, 
          "~/Desktop/novel_trans_network_genes.csv", 
          row.names = FALSE)
write.csv(novel_edges, 
          "~/Desktop/novel_trans_network_edges.csv", 
          row.names = FALSE)

cat("Network modules exported to ~/Desktop/\n")

## Network modules exported to ~/Desktop/

Session Information

sessionInfo()

## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] visNetwork_2.1.4     igraph_2.2.1         rtracklayer_1.70.0  
##  [4] GenomicRanges_1.62.0 Seqinfo_1.0.0        IRanges_2.44.0      
##  [7] S4Vectors_0.48.0     BiocGenerics_0.56.0  generics_0.1.4      
## [10] tidyr_1.3.1          dplyr_1.1.4         
## 
## loaded via a namespace (and not attached):
##  [1] SummarizedExperiment_1.40.0 rjson_0.2.23               
##  [3] xfun_0.54                   bslib_0.9.0                
##  [5] htmlwidgets_1.6.4           Biobase_2.70.0             
##  [7] lattice_0.22-7              vctrs_0.6.5                
##  [9] tools_4.5.1                 bitops_1.0-9               
## [11] curl_7.0.0                  parallel_4.5.1             
## [13] tibble_3.3.0                pkgconfig_2.0.3            
## [15] Matrix_1.7-4                cigarillo_1.0.0            
## [17] lifecycle_1.0.4             stringr_1.6.0              
## [19] compiler_4.5.1              Rsamtools_2.26.0           
## [21] Biostrings_2.78.0           statmod_1.5.1              
## [23] codetools_0.2-20            htmltools_0.5.8.1          
## [25] sass_0.4.10                 RCurl_1.98-1.17            
## [27] yaml_2.3.10                 pillar_1.11.1              
## [29] crayon_1.5.3                jquerylib_0.1.4            
## [31] BiocParallel_1.44.0         limma_3.66.0               
## [33] cachem_1.1.0                DelayedArray_0.36.0        
## [35] abind_1.4-8                 tidyselect_1.2.1           
## [37] digest_0.6.39               stringi_1.8.7              
## [39] purrr_1.2.0                 restfulr_0.0.16            
## [41] fastmap_1.2.0               grid_4.5.1                 
## [43] cli_3.6.5                   SparseArray_1.10.2         
## [45] magrittr_2.0.4              S4Arrays_1.10.0            
## [47] XML_3.99-0.20               withr_3.0.2                
## [49] rmarkdown_2.30              XVector_0.50.0             
## [51] httr_1.4.7                  matrixStats_1.5.0          
## [53] evaluate_1.0.5              knitr_1.50                 
## [55] BiocIO_1.20.0               rlang_1.1.6                
## [57] glue_1.8.0                  rstudioapi_0.17.1          
## [59] jsonlite_2.0.0              R6_2.6.1                   
## [61] MatrixGenerics_1.22.0       GenomicAlignments_1.46.0

Trans-regulatory Network Analysis of Inv4m Introgression

Fausto Rodriguez

2025-12-09