Overview

Purpose: Identify and characterize trans-regulatory networks mediated by genes within the Inv4m introgression region. This analysis integrates differential expression results with co-expression data from MaizeNetome to construct regulatory networks.

Approach: 1. Define genomic regions (Inv4m inversion, flanking introgression, in_trans) 2. Test for enrichment of DEGs within these regions 3. Query MaizeNetome for co-expression edges involving introgressed genes 4. Filter for trans-regulatory relationships (introgressed → in_trans targets) 5. Validate using expression correlation from RNA-seq data 6. Visualize networks with igraph/visNetwork

Expected outcomes: - Identification of trans-regulatory hubs within Inv4m - Network visualization showing regulatory relationships - Statistical enrichment of DEGs by genomic region - Output files for downstream analysis and manuscript figures

Setup and Dependencies

library(dplyr)
library(tidyr)
library(rtracklayer)
library(GenomicRanges)
library(igraph)
library(visNetwork)

Helper functions

#' Compute correlation matrix with FDR-adjusted p-values
#'
#' Calculates pairwise correlations and adjusts p-values for multiple testing.
#'
#' @param x Numeric matrix or data frame
#' @param type Correlation method: "pearson" or "spearman"
#' @param use Handling of missing values
#' @param adjust Multiple testing correction method
#'
#' @return List with correlation matrix (R), adjusted p-values (P), 
#'   unadjusted p-values (P.unadj), correlation type, and adjustment method
adjust.corr <- function(x, 
                        type = c("pearson", "spearman"), 
                        use = c("complete.obs", "pairwise.complete.obs"), 
                        adjust = "holm") {
  opt <- options(scipen = 5)
  on.exit(options(opt))
  
  type <- match.arg(type)
  use <- match.arg(use)
  
  x <- if (use == "complete.obs") {
    as.matrix(na.omit(x))
  } else {
    as.matrix(x)
  }
  
  R <- Hmisc::rcorr(x, type = type)
  P <- P.unadj <- R$P
  p <- P[lower.tri(P)]
  adj.p <- p.adjust(p, method = adjust)
  P[lower.tri(P)] <- adj.p
  P[upper.tri(P)] <- 0
  P <- P + t(P)
  P <- ifelse(P < 1e-04, 0, P)
  P <- format(round(P, 4))
  diag(P) <- ""
  P[c(grep("0.0000", P), grep("^ 0$", P))] <- "<.0001"
  P[grep("0.000$", P)] <- "<.001"
  
  P.unadj <- ifelse(P.unadj < 1e-04, 0, P.unadj)
  P.unadj <- format(round(P.unadj, 4))
  diag(P.unadj) <- ""
  P.unadj[c(grep("0.0000$", P.unadj), grep("^ 0$", P.unadj))] <- "<.0001"
  P.unadj[grep("0.000$", P.unadj)] <- "<.001"
  
  result <- list(
    R = R, 
    P = P, 
    P.unadj = P.unadj, 
    type = type, 
    adjustr = adjust
  )
  class(result) <- "adjust.corr"
  result
}

Define Genomic Regions

Load B73 v5 Gene Annotations

Purpose: Extract gene coordinates for defining introgression boundaries.

Approach: Import GFF3 file and subset to protein-coding genes on chromosomes 1-10.

v5_gff <- "~/ref/zea/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.59.chr.gff3"
v5 <- rtracklayer::import(v5_gff) %>%
  subset(type == "gene" & seqnames %in% 1:10)

genes <- as.data.frame(v5)
genes$ID <- gsub("gene:", "", genes$ID)

cat("Total genes loaded:", nrow(genes), "\n")

## Total genes loaded: 43459

Define Inv4m Inversion Coordinates

Purpose: Identify genes within the structural inversion boundaries.

Approach: Use known breakpoint genes to define Inv4m start/end positions.

# Breakpoint genes
inv4m_start <- genes[genes$ID == "Zm00001eb190470", "start"]
inv4m_end <- genes[genes$ID == "Zm00001eb194800", "end"]

cat("Inv4m region:", inv4m_start, "-", inv4m_end, "\n")

## Inv4m region: 172883675 - 188132113

# Extract all genes within inversion
inv4m_gene_ids <- genes %>%
  filter(
    seqnames == 4,
    start >= inv4m_start,
    end <= inv4m_end
  ) %>% 
  pull(ID)

cat("Genes in Inv4m:", length(inv4m_gene_ids), "\n")

## Genes in Inv4m: 434

cat("Breakpoints included:", 
    "Zm00001eb190470" %in% inv4m_gene_ids, 
    "Zm00001eb194800" %in% inv4m_gene_ids, "\n")

## Breakpoints included: TRUE TRUE

Define Shared Introgression Region

Purpose: Identify the larger introgression region that includes both Inv4m and flanking sequences.

Approach: Use manually curated coordinates from genotype analysis.

# Coordinates from RNA-seq genotype table
introgression_start <- 157012149
introgression_end <- 195900523

cis_introgression_gene_ids <- genes %>%
  filter(
    seqnames == 4,
    start >= introgression_start,
    end <= introgression_end
  ) %>% 
  pull(ID)

cat("Genes in shared introgression:", 
    length(cis_introgression_gene_ids), "\n")

## Genes in shared introgression: 1099

Define Flanking Introgression Region

Purpose: Separate introgressed genes in_trans the inversion from those within it.

flanking_introgression_gene_ids <- cis_introgression_gene_ids[
  !cis_introgression_gene_ids %in% inv4m_gene_ids
]

cat("Genes in flanking regions:", 
    length(flanking_introgression_gene_ids), "\n")

## Genes in flanking regions: 665

cat("All Inv4m genes within shared introgression:", 
    all(inv4m_gene_ids %in% cis_introgression_gene_ids), "\n")

## All Inv4m genes within shared introgression: TRUE

Load and Annotate Differential Expression Results

Purpose: Load DEG analysis results and annotate with genomic region information.

Approach: Read predictor effects from leaf developmental gradient analysis, filter for Inv4m genotype effect, and classify genes by genomic location.

Expected outcome: Data frame with DEG statistics and region annotations. ## Load Field DEG List

# Load differential expression results
effects_df <- read.csv("~/Desktop/predictor_effects_leaf_interaction_model.csv") %>%
  filter(predictor == "Inv4m")

# Load curated labels
curated <- read.csv("~/Desktop/selected_DEGs_curated_locus_label_2.csv") %>%
  dplyr::select(gene, locus_label)

effects_df$locus_label[effects_df$locus_label == "A0A1D6NG77"] <- NA


# Merge curated labels
effects_df <- effects_df %>%
  left_join(curated, by = "gene", suffix = c("", "_curated")) %>%
  mutate(locus_label = coalesce(locus_label_curated, locus_label)) %>%
  dplyr::select(-locus_label_curated)

# Curated updates: only gene and locus_label
curated <- tibble::tribble(
  ~gene,              ~locus_label,
  # custom labels for trans network
  "Zm00001eb384770",  "zcn26",
  "Zm00001eb192330",  "roc4-1",
  "Zm00001eb193260",  "trxl2",
  "Zm00001eb194300",  "paspa",
  "Zm00001eb213980",  "dywl",
  "Zm00001eb194080",  "ugt85a2",
  "Zm00001eb112470",  "engd2",
  "Zm00001eb332450",  "roc4-2",
  "Zm00001eb193270",  "trxl5",
  "Zm00001eb066620",  "tipin",
  "Zm00001eb241410",  "etfa",
  "Zm00001eb013800",  "mrlk",
  "Zm00001eb249540",  "prpl29",
  "Zm00001eb000540",  "prd3",
  "Zm00001eb060540",  "actin-1",
  "Zm00001eb192580",  "map4k8",
  "Zm00001eb193120",  "o3l1",
  "Zm00001eb043750",  "cdrk9",
  # From original curated list
  "Zm00001eb034810",  "mgd2",
  "Zm00001eb241920",  "gpx1",
  "Zm00001eb162710",  "spx4",
  "Zm00001eb038730",  "pht7",
  "Zm00001eb370610",  "rfk1",
  "Zm00001eb386270",  "spx6",
  "Zm00001eb335670",  "sqd3",
  "Zm00001eb151650",  "pap1",
  "Zm00001eb154650",  "ppa1",
  "Zm00001eb280120",  "pfk1",
  "Zm00001eb036910",  "gpx3",
  "Zm00001eb406610",  "glk4",
  "Zm00001eb048730",  "spx2",
  "Zm00001eb361620",  "ppa2",
  "Zm00001eb297970",  "sqd2",
  "Zm00001eb347070",  "sqd1",
  "Zm00001eb144680",  "rns1",
  "Zm00001eb430590",  "nrx3",
  "Zm00001eb247580",  "ppck3",
  "Zm00001eb351780",  "ugp3",
  "Zm00001eb222510",  "pht1",
  "Zm00001eb126380",  "phos1",
  "Zm00001eb047070",  "pht2",
  "Zm00001eb041390",  "rns3",
  "Zm00001eb116580",  "spd1",
  "Zm00001eb069630",  "oct4",
  "Zm00001eb083520",  "dgd1",
  "Zm00001eb130570",  "sag21",
  "Zm00001eb239700",  "ppa3",  
  "Zm00001eb258130",  "mgd3",
  "Zm00001eb202100",  "pap14",
  "Zm00001eb144670",  "rns2",
  "Zm00001eb087720",  "pht13",
  "Zm00001eb277280",  "gst19",
  "Zm00001eb339870",  "pld16",
  "Zm00001eb289800",  "pah1",
  "Zm00001eb369590",  "nrx1",
  "Zm00001eb238670",  "pep2"
)

# Update only the locus_label using curated values
effects_df <- effects_df %>%
  left_join(curated, by = "gene", suffix = c("", "_new")) %>%
  mutate(locus_label = coalesce(locus_label_new, locus_label)) %>%
  dplyr::select(-locus_label_new)

# Replace with your actual field DEG gene IDs
field_degs <- effects_df %>% filter(is_hiconf_DEG) %>% pull(gene) %>% unique()

cat("\nField DEG list loaded:\n")

## 
## Field DEG list loaded:

cat("  Total field DEGs:", length(field_degs), "\n")

##   Total field DEGs: 165

DEG Enrichment by Genomic Region

Purpose: Test whether DEGs are enriched within introgression regions.

Approach: Fisher’s exact tests comparing odds of significance across genomic regions.

DEG Enrichment Tests

Shared Introgression vs Outside

cis_outside_table <- with(
  effects_df,
  table(in_cis, is_DEG)
)

cis_outside_test <- fisher.test(cis_outside_table)

cat("Contingency table:\n")

## Contingency table:

print(cis_outside_table)

##        is_DEG
## in_cis  FALSE  TRUE
##   FALSE 22932   475
##   TRUE    307   303

cat("\nFisher's exact test p-value:", 
    format(cis_outside_test$p.value, scientific = TRUE), "\n")

## 
## Fisher's exact test p-value: 1.688983e-301

cat("Odds ratio:", round(cis_outside_test$estimate, 3), "\n")

## Odds ratio: 47.613

Inv4m vs Outside (excluding flanking)

inv4m_outside_table <- with(
  effects_df %>% filter(!in_flank),
  table(in_Inv4m, is_DEG)
)

inv4m_outside_test <- fisher.test(inv4m_outside_table)

cat("Contingency table:\n")

## Contingency table:

print(inv4m_outside_table)

##         is_DEG
## in_Inv4m FALSE  TRUE
##    FALSE 22932   475
##    TRUE    125   130

cat("\nFisher's exact test p-value:", 
    format(inv4m_outside_test$p.value, scientific = TRUE), "\n")

## 
## Fisher's exact test p-value: 9.83221e-140

cat("Odds ratio:", round(inv4m_outside_test$estimate, 3), "\n")

## Odds ratio: 50.163

Flanking vs Outside

flanking_outside_table <- with(
  effects_df %>% filter(in_flank | in_trans),
  table(in_flank, is_DEG)
)

flanking_outside_test <- fisher.test(flanking_outside_table)

cat("Contingency table:\n")

## Contingency table:

print(flanking_outside_table)

##         is_DEG
## in_flank FALSE  TRUE
##    FALSE 22932   475
##    TRUE    182   173

cat("\nFisher's exact test p-value:", 
    format(flanking_outside_test$p.value, scientific = TRUE), "\n")

## 
## Fisher's exact test p-value: 3.080185e-178

cat("Odds ratio:", round(flanking_outside_test$estimate, 3), "\n")

## Odds ratio: 45.853

Inv4m vs Flanking (within shared introgression)

inv4m_flanking_table <- with(
  effects_df %>% filter(in_cis),
  table(in_Inv4m, is_DEG)
)

inv4m_flanking_test <- fisher.test(inv4m_flanking_table)

cat("Contingency table:\n")

## Contingency table:

print(inv4m_flanking_table)

##         is_DEG
## in_Inv4m FALSE TRUE
##    FALSE   182  173
##    TRUE    125  130

cat("\nFisher's exact test p-value:", 
    format(inv4m_flanking_test$p.value, scientific = TRUE), "\n")

## 
## Fisher's exact test p-value: 6.225037e-01

cat("Odds ratio:", round(inv4m_flanking_test$estimate, 3), "\n")

## Odds ratio: 1.094

High Confidence DEG Enrichment Tests

Purpose: Test enrichment of highly differentially expressed genes (|log2FC| > 1.5).

cat("Total High Confidence DEGs:", sum(effects_df$is_hiconf_DEG), "\n")

## Total High Confidence DEGs: 167

Shared Introgression vs Outside (High Confidence DEGs)

cis_outside_hiconf <- with(
  effects_df,
  table(in_cis, is_hiconf_DEG)
)

cis_outside_hiconf_test <- fisher.test(cis_outside_hiconf)

cat("Contingency table:\n")

## Contingency table:

print(cis_outside_hiconf)

##        is_hiconf_DEG
## in_cis  FALSE  TRUE
##   FALSE 23361    46
##   TRUE    489   121

cat("\nFisher's exact test p-value:", 
    format(cis_outside_hiconf_test$p.value, scientific = TRUE), "\n")

## 
## Fisher's exact test p-value: 4.837737e-158

Inv4m vs Outside (High Confidence DEGs)

inv4m_outside_hiconf <- with(
  effects_df %>% filter(!in_flank),
  table(in_Inv4m, is_hiconf_DEG)
)

inv4m_outside_hiconf_test <- fisher.test(inv4m_outside_hiconf)

cat("Contingency table:\n")

## Contingency table:

print(inv4m_outside_hiconf)

##         is_hiconf_DEG
## in_Inv4m FALSE  TRUE
##    FALSE 23361    46
##    TRUE    210    45

cat("\nFisher's exact test p-value:", 
    format(inv4m_outside_hiconf_test$p.value, scientific = TRUE), "\n")

## 
## Fisher's exact test p-value: 6.673456e-65

Flanking vs Outside (High Confidence DEGs)

flanking_outside_hiconf <- with(
  effects_df %>% filter(in_flank | in_trans),
  table(in_flank, is_hiconf_DEG)
)

flanking_outside_hiconf_test <- fisher.test(flanking_outside_hiconf)

cat("Contingency table:\n")

## Contingency table:

print(flanking_outside_hiconf)

##         is_hiconf_DEG
## in_flank FALSE  TRUE
##    FALSE 23361    46
##    TRUE    279    76

cat("\nFisher's exact test p-value:", 
    format(flanking_outside_hiconf_test$p.value, scientific = TRUE), "\n")

## 
## Fisher's exact test p-value: 1.919309e-109

Inv4m vs Flanking (High Confidence DEGs)

inv4m_flanking_hiconf <- with(
  effects_df %>% filter(in_cis),
  table(in_Inv4m, is_hiconf_DEG)
)

inv4m_flanking_hiconf_test <- fisher.test(inv4m_flanking_hiconf)

cat("Contingency table:\n")

## Contingency table:

print(inv4m_flanking_hiconf)

##         is_hiconf_DEG
## in_Inv4m FALSE TRUE
##    FALSE   279   76
##    TRUE    210   45

cat("\nFisher's exact test p-value:", 
    format(inv4m_flanking_hiconf_test$p.value, scientific = TRUE), "\n")

## 
## Fisher's exact test p-value: 2.594766e-01

Load MaizeNetome Co-expression Data

Purpose: Import co-expression edges from MaizeNetome for the chromosome 4 introgression region.

Approach: Load three separate queries covering left flank, Inv4m, and right flank, then combine.

Expected outcome: Edge list with co-expression relationships.

# Files were collected manually by searching for the genes in these v4 
# coordinate limits in the search tool of MaizeNetome

# Left: 4:155195539-170747954
left_flanking_maizenetome <- read.csv(
  "~/Desktop/inv4m/coexpression/left_MaizeNetome_coexpression_edges.csv"
)

# Inv4m: 4:170747955-186145605
Inv4m_maizenetome <- read.csv(
  "~/Desktop/inv4m/coexpression/inv4m_MaizeNetome_coexpression_edges.csv"
)

# Right: 4:186145606-195900523
right_flanking_maizenetome <- read.csv(
  "~/Desktop/inv4m/coexpression/right_MaizeNetome_coexpression_edges.csv"
)

MaizeNetome <- rbind(
  left_flanking_maizenetome %>%
    separate_longer_delim(cols = Direct.Connetation.Node, delim = "\n") %>%
    mutate(network = "coexpression"),
  Inv4m_maizenetome %>%
    separate_longer_delim(cols = Direct.Connetation.Node, delim = "\n") %>%
    mutate(network = "coexpression"),
  right_flanking_maizenetome %>%
    separate_longer_delim(cols = Direct.Connetation.Node, delim = "\n") %>%
    mutate(network = "coexpression")
) %>% 
  dplyr::rename(Direct.Connection = Direct.Connetation.Node)

cat("Total edges:", nrow(MaizeNetome), "\n")

## Total edges: 640599

cat("Unique query genes:", 
    length(unique(MaizeNetome$Element.ID)), "\n")

## Unique query genes: 717

cat("Unique connected genes:", 
    length(unique(MaizeNetome$Direct.Connection)), "\n")

## Unique connected genes: 23748

Convert Gene IDs from v4 to v5

Purpose: MaizeNetome uses B73 v4 gene IDs; convert to v5 for integration with current analysis.

Approach: Load v4-to-v5 crossreference table and apply to both query genes and connection targets.

v4_v5 <- read.table(
  file = "/Users/fvrodriguez/Desktop/B73_gene_xref/B73v4_to_B73v5.tsv", 
  sep = "\t", 
  header = FALSE
) %>%
  dplyr::rename(v4 = V1, v5 = V2) %>%
  separate_longer_delim(cols = v5, delim = ",")

cat("v4 to v5 mappings:", nrow(v4_v5), "\n")

## v4 to v5 mappings: 43680

# Load v4 GFF for gene metadata
v4_gff <- "~/ref/zea/Zm-B73-REFERENCE-GRAMENE-4.0_Zm00001d.2.gff3"
v4 <- rtracklayer::import(v4_gff) %>%
  subset(type == "gene")

v4_genes <- as.data.frame(v4)
v4_genes$ID <- gsub("gene:", "", v4_genes$ID)

cat("v4 genes loaded:", nrow(v4_genes), "\n")

## v4 genes loaded: 39498

# Convert query gene IDs (Element.ID)
MaizeNetome_v5 <- MaizeNetome %>%
  left_join(v4_v5, by = c("Element.ID" = "v4")) %>%
  dplyr::rename(Element.ID.v5 = v5) %>%
  filter(!is.na(Element.ID.v5))

# Convert target gene IDs (Direct.Connection)
MaizeNetome_v5 <- MaizeNetome_v5 %>%
  left_join(v4_v5, by = c("Direct.Connection" = "v4")) %>%
  dplyr::rename(Direct.Connection.v5 = v5) %>%
  filter(!is.na(Direct.Connection.v5))

cat("Edges after v4->v5 conversion:", nrow(MaizeNetome_v5), "\n")

## Edges after v4->v5 conversion: 767533

cat("Unique v5 query genes:", 
    length(unique(MaizeNetome_v5$Element.ID.v5)), "\n")

## Unique v5 query genes: 706

cat("Unique v5 target genes:", 
    length(unique(MaizeNetome_v5$Direct.Connection.v5)), "\n")

## Unique v5 target genes: 21854

Load GRASSIUS TF Annotations

Purpose: Annotate transcription factors for network interpretation.

grassius <- read.csv(
  "~/Desktop/grassius_annotation_v5.csv", 
  na.strings = ""
) %>%
  inner_join(v4_v5, by = c("gene.ID" = "v5")) %>%
  distinct()

cat("TF annotations:", nrow(grassius), "\n")

## TF annotations: 3487

Identify Trans-regulatory Candidates

Purpose: Define potential regulators (introgressed DEGs) and targets (in_trans DEGs).

Approach: Filter DEGs by genomic location and significance thresholds.

# Regulators: DEGs within shared introgression
regulators <- effects_df %>%
  filter(in_cis == TRUE, is_hiconf_DEG == TRUE) %>%
  arrange(adj.P.Val) %>%
  select(gene:desc_merged, logFC, adj.P.Val) %>%
  inner_join(v4_v5, by = c(gene = "v5")) %>%
  group_by(gene) %>%
  arrange(adj.P.Val)

# Targets: DEGs in_trans introgression
targets <- effects_df %>%
  filter(in_trans == TRUE, is_hiconf_DEG == TRUE) %>%
  arrange(adj.P.Val) %>%
  select(gene:desc_merged, logFC, adj.P.Val) %>%
  inner_join(v4_v5, by = c(gene = "v5")) %>%
  group_by(gene) %>%
  arrange(adj.P.Val)

cat("Candidate regulators (introgressed DEGs):", 
    length(unique(regulators$gene)), "\n")

## Candidate regulators (introgressed DEGs): 107

cat("Candidate targets (in_trans DEGs):", 
    length(unique(targets$gene)), "\n")

## Candidate targets (in_trans DEGs): 39

Filter for Trans-regulatory Edges

Purpose: Extract co-expression edges connecting introgressed genes to genes in_trans the introgression.

Approach: Filter MaizeNetome edges where query gene is in shared introgression and target is in_trans.

# Define gene sets
introgressed_regulators <- regulators$gene %>% sort() %>% unique()
outside_targets <- targets$gene %>% sort() %>% unique()

# Further subdivide introgressed regulators
inv4m_regulators <- introgressed_regulators[
  introgressed_regulators %in% inv4m_gene_ids
]
flanking_regulators <- introgressed_regulators[
  introgressed_regulators %in% flanking_introgression_gene_ids
]

cat("Inv4m regulators:", length(inv4m_regulators), "\n")

## Inv4m regulators: 37

cat("Flanking regulators:", length(flanking_regulators), "\n")

## Flanking regulators: 70

# Extract trans edges from MaizeNetome
trans_edges <- MaizeNetome_v5 %>%
  filter(
    Element.ID.v5 %in% introgressed_regulators,
    Direct.Connection.v5 %in% outside_targets
  ) %>%
  dplyr::rename(
    introgressed_gene = Element.ID.v5,
    target_gene = Direct.Connection.v5
  ) %>%
  select(introgressed_gene, target_gene, network)

cat("Trans-regulatory edges from MaizeNetome:", nrow(trans_edges), "\n")

## Trans-regulatory edges from MaizeNetome: 103

cat("Unique regulators with trans edges:", 
    length(unique(trans_edges$introgressed_gene)), "\n")

## Unique regulators with trans edges: 33

cat("Unique targets with trans edges:", 
    length(unique(trans_edges$target_gene)), "\n")

## Unique targets with trans edges: 17

Validate Edges with Expression Correlation

Purpose: Validate MaizeNetome co-expression edges using actual RNA-seq expression data.

Approach: Calculate pairwise correlations for all putative trans-regulatory pairs and filter by significance.

voomR <- readRDS("~/Desktop/normalized_expression_voom_object.rda")

cat("Expression matrix dimensions:", 
    nrow(voomR$E), "genes x", ncol(voomR$E), "samples\n")

## Expression matrix dimensions: 24249 genes x 43 samples

# Genes involved in trans network
trans_network_genes <- union(
  trans_edges$introgressed_gene,
  trans_edges$target_gene
)

cat("Genes in trans network (before filtering):", 
    length(trans_network_genes), "\n")

## Genes in trans network (before filtering): 50

# Filter to genes present in expression data
genes_in_expr <- trans_network_genes %in% rownames(voomR$E)
trans_network_genes <- trans_network_genes[genes_in_expr]

cat("Genes in trans network (in expression data):", 
    length(trans_network_genes), "\n")

## Genes in trans network (in expression data): 50

cat("Genes missing from expression data:", 
    sum(!genes_in_expr), "\n")

## Genes missing from expression data: 0

# Filter regulator and target sets
outside_targets_expr <- outside_targets[
  outside_targets %in% rownames(voomR$E)
]
introgressed_regulators_expr <- introgressed_regulators[
  introgressed_regulators %in% rownames(voomR$E)
]

cat("Regulators in expression data:", 
    length(introgressed_regulators_expr), 
    "/", length(introgressed_regulators), "\n")

## Regulators in expression data: 107 / 107

cat("Targets in expression data:", 
    length(outside_targets_expr), 
    "/", length(outside_targets), "\n")

## Targets in expression data: 39 / 39

# Calculate correlation matrix for network genes only
expr_matrix <- t(voomR$E)[, trans_network_genes]
corr_matrix <- round(cor(expr_matrix, use = "pairwise.complete.obs"), 2)
p_adj <- adjust.corr(expr_matrix, adjust = "fdr")

# Filter correlations by significance
corr_filtered <- corr_matrix
corr_filtered[p_adj$P > 0.05] <- 0

cat("Correlation matrix dimensions:", 
    nrow(corr_matrix), "x", ncol(corr_matrix), "\n")

## Correlation matrix dimensions: 50 x 50

cat("Significant correlations:", 
    sum(corr_filtered != 0), "\n")

## Significant correlations: 2464

Construct Validated Edge List

Purpose: Create edge list with correlation coefficients and network distances for validated trans-regulatory pairs.

Approach: Extract correlations for introgressed → in_trans pairs, filter by significance and direction.

# Helper functions for edge weights
n_samples <- ncol(voomR$E)

mutual_info <- function(r) {
  0.5 * log2(1 / (1 - r^2))
}

# https://doi.org/10.1186/s12859-023-05161-y
robust_d <- function(r, n) {
  sqrt(1 - abs(r))
}

# for standardized data This is the euclidean distance between
# vectors
# https://cmci.colorado.edu/classes/INFO-1301/files/borgatti.htm
standard_d <- function(r, n) {
   srt(2*n*abs(1-r))
}

# Validate gene IDs against correlation matrix
corr_genes <- colnames(corr_filtered)
regulators_valid <- introgressed_regulators_expr[
  introgressed_regulators_expr %in% corr_genes
]
targets_valid <- outside_targets_expr[
  outside_targets_expr %in% corr_genes
]

cat("Original regulators:", length(introgressed_regulators_expr), 
    "| Valid:", length(regulators_valid), "\n")

## Original regulators: 107 | Valid: 33

cat("Original targets:", length(outside_targets_expr), 
    "| Valid:", length(targets_valid), "\n")

## Original targets: 39 | Valid: 17

# Extract correlations for trans pairs using validated IDs
corr_trans <- corr_filtered[
  regulators_valid,
  targets_valid
]

cat("Correlation submatrix:", 
    nrow(corr_trans), "regulators x", 
    ncol(corr_trans), "targets\n")

## Correlation submatrix: 33 regulators x 17 targets

# Convert to edge list
edge_idx <- cbind(
  c(row(corr_trans)), 
  c(col(corr_trans)), 
  c(corr_trans)
)

all_edges <- data.frame(
  from = rownames(corr_trans)[edge_idx[, 1]],
  to = colnames(corr_trans)[edge_idx[, 2]],
  r = edge_idx[, 3]
)

cat("All possible edges:", nrow(all_edges), "\n")

## All possible edges: 561

cat("Non-zero correlations:", sum(all_edges$r != 0), "\n")

## Non-zero correlations: 551

# Filter to edges matching MaizeNetome predictions
two_way <- trans_edges %>%
  filter(
    introgressed_gene %in% regulators_valid,
    target_gene %in% targets_valid
  ) %>%
  select(from = introgressed_gene, to = target_gene)

cat("MaizeNetome edges (validated):", nrow(two_way), "\n")

## MaizeNetome edges (validated): 103

edge_list <- all_edges %>%
  inner_join(two_way, by = c("from", "to")) %>%
  mutate(
    e = case_when(abs(r) > 0 ~ 1, .default = 0),
    weight = mutual_info(r),
    distance = robust_d(r, n_samples)
  ) %>%
  filter(e == 1, from != to) %>%
  mutate(color = "lightgrey") %>%
  arrange(from, -abs(r))

cat("Validated trans-regulatory edges:", nrow(edge_list), "\n")

## Validated trans-regulatory edges: 96

cat("Unique regulators:", 
    length(unique(edge_list$from)), "\n")

## Unique regulators: 32

cat("Unique targets:", 
    length(unique(edge_list$to)), "\n")

## Unique targets: 17

Visualize Networks

Network Visualization Setup

Purpose: Create reusable functions for network formatting and visualization.

#' Format nodes for network visualization
#'
#' Creates standardized node data frame with region colors, labels, and styling.
#'
#' @param node_ids Character vector of gene IDs
#' @param effects_df Data frame with gene expression data
#' @param custom_labels Named vector of custom gene labels
#'
#' @return Data frame with formatted node attributes
format_network_nodes <- function(node_ids, effects_df, custom_labels =NULL) {
  nodes <- data.frame(id = node_ids) %>%
    left_join(
      effects_df %>%
        group_by(gene) %>%
        dplyr::slice(1) %>%
        ungroup() %>%
        select(gene, locus_label, logFC),
      by = c("id" = "gene")
    ) %>%
    mutate(
      region = case_when(
        id %in% inv4m_regulators ~ "inv4m",
        id %in% flanking_regulators ~ "flank",
        TRUE ~ "out"
      ),
      upregulated = logFC > 0,
      color.border = case_when(
        region == "inv4m" ~ "#551A8B",
        region == "flank" ~ "#FF1493",
        region == "out" ~ "#FFD700"
      ),
      color.background = case_when(
        upregulated & region == "inv4m" ~ "#551A8B",
        upregulated & region == "flank" ~ "#FF1493",
        upregulated & region == "out" ~ "#FFD700",
        !upregulated ~ "white"
      ),
      borderWidth = 4,
      label = coalesce(locus_label, id)
    )
    if(!is.null(custom_labels) && any(names(custom_labels) %in% nodes$id)) {
      cat("Custom labels found for nodes:", 
          sum(names(custom_labels) %in% nodes$id), "\n")
      # Apply custom labels
      for (gene_id in names(custom_labels)) {
        if (gene_id %in% nodes$id) {
          nodes$label[nodes$id == gene_id] <- custom_labels[gene_id]
        }
      }
  }
  # Add italics
  nodes$label <- paste0("<i>", nodes$label, "</i>")
  
  nodes
}

#' Create MST visualization from edge list
#'
#' Builds minimum spanning tree and creates interactive visNetwork plot.
#'
#' @param edges Data frame with from, to, distance, r, color, weight columns
#' @param nodes Data frame with formatted node attributes
#' @param effects_df Data frame with logFC for arrow type calculation
#' @param layout igraph layout algorithm name
#' @param font_size Node label font size
#'
#' @return visNetwork htmlwidget
visualize_mst <- function(edges, nodes, effects_df, 
                          layout = "layout_nicely", font_size = 20) {
  # Create igraph object
  g <- graph_from_data_frame(
    d = edges %>% select(from, to, distance, r, color, weight),
    directed = TRUE,
    vertices = nodes
  )
  
  # Compute MST
  g_mst <- mst(g, weights = E(g)$distance)
  
  cat("MST - Edges:", length(E(g_mst)), "| Nodes:", length(V(g_mst)), "\n")
  
  # Check labels here
  cat("Sample labels:", head(V(g_mst)$label, 10), "\n")
  
  # Get edge data for arrow types
  mst_edges <- as_data_frame(g_mst, what = "edges") %>%
    left_join(
      effects_df %>% 
        group_by(gene) %>%
        dplyr::slice(1) %>%
        ungroup() %>%
        select(gene, logFC_from = logFC),
      by = c("from" = "gene")
    ) %>%
    left_join(
      effects_df %>% 
        group_by(gene) %>%
        dplyr::slice(1) %>%
        ungroup() %>%
        select(gene, logFC_to = logFC),
      by = c("to" = "gene")
    ) %>%
    mutate(
      concordant = sign(logFC_from) == sign(logFC_to),
      color = ifelse(is.na(color), "lightgrey", color)
    )
  
  # Set edge attributes
  g_mst_vis <- g_mst %>%
    set_edge_attr(
      "arrows.to.type", 
      value = ifelse(mst_edges$concordant, "arrow", "bar")
    ) %>%
    set_edge_attr("width", value = 3) %>%
    set_edge_attr("color", value = mst_edges$color)
  
  # CRITICAL FIX: Copy label to name so visIgraph uses it
  V(g_mst_vis)$name <- V(g_mst_vis)$label
  
  # Plot
  visIgraph(g_mst_vis) %>%
    visNodes(font = list(size = font_size, multi = "html")) %>%
    visIgraphLayout(layout = layout) %>%
    visOptions(
      highlightNearest = list(enabled = TRUE, degree = 1, hover = TRUE),
      nodesIdSelection = TRUE
    )
}

Complete Trans Co-expression Network (MST)

Purpose: Minimum spanning tree of all validated trans-regulatory edges from introgressed genes (Inv4m + flanking) to in_trans targets.

complete_trans_edges <- edge_list %>%
  filter(from %in% c(inv4m_regulators, flanking_regulators))

cat("Complete trans co-expression network:\n")

## Complete trans co-expression network:

cat("  Edges:", nrow(complete_trans_edges), "\n")

##   Edges: 96

cat("  Regulators:", length(unique(complete_trans_edges$from)), 
    "(Inv4m:", sum(unique(complete_trans_edges$from) %in% inv4m_regulators),
    "| Flanking:", 
    sum(unique(complete_trans_edges$from) %in% flanking_regulators), ")\n")

##   Regulators: 32 (Inv4m: 13 | Flanking: 19 )

cat("  Targets:", length(unique(complete_trans_edges$to)), "\n\n")

##   Targets: 17

complete_trans_node_ids <- unique(c(complete_trans_edges$from, 
                                     complete_trans_edges$to))
complete_trans_nodes <- format_network_nodes(
  complete_trans_node_ids, 
  effects_df
)

visualize_mst(complete_trans_edges, complete_trans_nodes, effects_df)

## MST - Edges: 47 | Nodes: 49 
## Sample labels: <i>Zm00001eb186400</i> <i>Zm00001eb186410</i> <i>Zm00001eb186840</i> <i>gpm458</i> <i>Zm00001eb187400</i> <i>his402</i> <i>Zm00001eb187440</i> <i>smn</i> <i>Zm00001eb188030</i> <i>his2b5</i>

Flanking Trans Co-expression Network (MST)

Purpose: Minimum spanning tree of validated trans-regulatory relationships from flanking introgression genes to in_trans targets.

flanking_trans_edges <- edge_list %>% 
  filter(
    from %in% flanking_regulators,
    !to %in% flanking_regulators
  )

cat("Flanking trans co-expression network:\n")

## Flanking trans co-expression network:

cat("  Edges:", nrow(flanking_trans_edges), "\n")

##   Edges: 59

cat("  Regulators:", length(unique(flanking_trans_edges$from)), "\n")

##   Regulators: 19

cat("  Targets:", length(unique(flanking_trans_edges$to)), "\n\n")

##   Targets: 13

flanking_trans_node_ids <- unique(c(flanking_trans_edges$from, 
                                     flanking_trans_edges$to))
flanking_trans_nodes <- format_network_nodes(
  flanking_trans_node_ids, 
  effects_df
)

visualize_mst(flanking_trans_edges, flanking_trans_nodes, effects_df, 
              font_size = 25)

## MST - Edges: 29 | Nodes: 32 
## Sample labels: <i>Zm00001eb186400</i> <i>Zm00001eb186410</i> <i>Zm00001eb186840</i> <i>gpm458</i> <i>Zm00001eb187400</i> <i>his402</i> <i>Zm00001eb187440</i> <i>smn</i> <i>Zm00001eb188030</i> <i>his2b5</i>

Inv4m Trans Co-expression Network (MST)

Purpose: Minimum spanning tree of validated trans-regulatory relationships from Inv4m inversion genes to in_trans targets.

inv4m_trans_edges <- edge_list %>%
  filter(
    from %in% inv4m_regulators,
    !to %in% flanking_regulators
  )

cat("Inv4m trans co-expression network:\n")

## Inv4m trans co-expression network:

cat("  Edges:", nrow(inv4m_trans_edges), "\n")

##   Edges: 37

cat("  Regulators:", length(unique(inv4m_trans_edges$from)), "\n")

##   Regulators: 13

cat("  Targets:", length(unique(inv4m_trans_edges$to)), "\n\n")

##   Targets: 12

inv4m_trans_node_ids <- unique(c(inv4m_trans_edges$from, 
                                  inv4m_trans_edges$to))
inv4m_trans_nodes <- format_network_nodes(
  inv4m_trans_node_ids, 
  effects_df
)

visualize_mst(inv4m_trans_edges, inv4m_trans_nodes, effects_df, font_size = 25)

## MST - Edges: 24 | Nodes: 25 
## Sample labels: <i>jmj21</i> <i>jmj2</i> <i>pcna2</i> <i>roc4-1</i> <i>map4k8</i> <i>cct</i> <i>sae2</i> <i>o3l1</i> <i>trxl2</i> <i>ugt85a2</i>

Inv4m Exclusive Trans Network (MST)

Purpose: Minimum spanning tree excluding targets that are also regulated by flanking genes.

flanking_targets <- edge_list %>%
  filter(from %in% flanking_regulators) %>%
  pull(to) %>%
  unique()

inv4m_exclusive_trans_edges <- edge_list %>%
  filter(
    from %in% inv4m_regulators,
    !to %in% inv4m_regulators,
    !to %in% flanking_regulators,
    !to %in% flanking_targets
  )

cat("Inv4m exclusive trans co-expression network:\n")

## Inv4m exclusive trans co-expression network:

cat("  Edges:", nrow(inv4m_exclusive_trans_edges), "\n")

##   Edges: 4

cat("  Regulators:", length(unique(inv4m_exclusive_trans_edges$from)), "\n")

##   Regulators: 2

cat("  Targets:", length(unique(inv4m_exclusive_trans_edges$to)), "\n")

##   Targets: 4

cat("  (Excludes", length(flanking_targets), 
    "targets of flanking regulators)\n\n")

##   (Excludes 13 targets of flanking regulators)

inv4m_exclusive_node_ids <- unique(c(inv4m_exclusive_trans_edges$from, 
                                      inv4m_exclusive_trans_edges$to))
inv4m_exclusive_nodes <- format_network_nodes(
  inv4m_exclusive_node_ids, 
  effects_df
)

visualize_mst(inv4m_exclusive_trans_edges, inv4m_exclusive_nodes, effects_df, 
              font_size = 25)

## MST - Edges: 4 | Nodes: 6 
## Sample labels: <i>jmj2</i> <i>trxl2</i> <i>actin-1</i> <i>prpl29</i> <i>dywl</i> <i>roc4-2</i>

Export Trans Co-expression Network Gene Sets

Purpose: Export gene sets from the validated trans-regulatory network for GO enrichment analysis.

# Trans network regulators
trans_regulators <- data.frame(
  gene = unique(edge_list$from)
) %>%
  mutate(
    region = case_when(
      gene %in% inv4m_regulators ~ "inv4m",
      gene %in% flanking_regulators ~ "flanking",
      TRUE ~ "other"
    )
  ) %>%
  left_join(
    effects_df %>% select(gene, locus_label, desc_merged, logFC, adj.P.Val),
    by = "gene"
  ) %>%
  arrange(region, desc(abs(logFC)))

# Trans network targets
trans_targets <- data.frame(
  gene = unique(edge_list$to)
) %>%
  left_join(
    effects_df %>% select(gene, locus_label, desc_merged, logFC, adj.P.Val),
    by = "gene"
  ) %>%
  arrange(desc(abs(logFC)))

# All trans network genes
trans_network_all <- data.frame(
  gene = unique(c(edge_list$from, edge_list$to))
) %>%
  mutate(
    node_type = case_when(
      gene %in% edge_list$from & gene %in% edge_list$to ~ "both",
      gene %in% edge_list$from ~ "regulator",
      gene %in% edge_list$to ~ "target"
    ),
    region = case_when(
      gene %in% inv4m_regulators ~ "inv4m",
      gene %in% flanking_regulators ~ "flanking",
      TRUE ~ "in_trans"
    )
  ) %>%
  left_join(
    effects_df %>% select(gene, locus_label, desc_merged, logFC, adj.P.Val),
    by = "gene"
  )

# Edge list with annotations
trans_edges_annotated <- edge_list %>%
  left_join(
    effects_df %>% 
      select(gene, regulator_symbol = locus_label, 
             regulator_desc = desc_merged, regulator_logFC = logFC),
    by = c("from" = "gene")
  ) %>%
  left_join(
    effects_df %>% 
      select(gene, target_symbol = locus_label, 
             target_desc = desc_merged, target_logFC = logFC),
    by = c("to" = "gene")
  ) %>%
  mutate(
    regulator_region = case_when(
      from %in% inv4m_regulators ~ "inv4m",
      from %in% flanking_regulators ~ "flanking",
      TRUE ~ "other"
    ),
    concordant = sign(regulator_logFC) == sign(target_logFC)
  )

# Write outputs
write.csv(
  trans_regulators,
  file = "~/Desktop/trans_network_regulators.csv",
  row.names = FALSE
)

write.csv(
  trans_targets,
  file = "~/Desktop/trans_network_targets.csv",
  row.names = FALSE
)

write.csv(
  trans_network_all,
  file = "~/Desktop/trans_network_all_genes.csv",
  row.names = FALSE
)

write.csv(
  trans_edges_annotated,
  file = "~/Desktop/trans_network_edges_annotated.csv",
  row.names = FALSE
)

cat("\n=== TRANS CO-EXPRESSION NETWORK SUMMARY ===\n")

## 
## === TRANS CO-EXPRESSION NETWORK SUMMARY ===

cat("Regulators (introgressed DEGs):", nrow(trans_regulators), "\n")

## Regulators (introgressed DEGs): 34

cat("  Inv4m:", sum(trans_regulators$region == "inv4m"), "\n")

##   Inv4m: 15

cat("  Flanking:", sum(trans_regulators$region == "flanking"), "\n")

##   Flanking: 19

cat("\nTargets (in_trans DEGs):", nrow(trans_targets), "\n")

## 
## Targets (in_trans DEGs): 17

cat("\nTotal network genes:", nrow(trans_network_all), "\n")

## 
## Total network genes: 51

cat("Total validated edges:", nrow(edge_list), "\n")

## Total validated edges: 96

cat("  Concordant (same direction):", 
    sum(trans_edges_annotated$concordant), "\n")

##   Concordant (same direction): 45

cat("  Discordant (opposite direction):", 
    sum(!trans_edges_annotated$concordant), "\n")

##   Discordant (opposite direction): 57

Session Information

sessionInfo()

## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] visNetwork_2.1.4     igraph_2.2.1         rtracklayer_1.70.0  
##  [4] GenomicRanges_1.62.0 Seqinfo_1.0.0        IRanges_2.44.0      
##  [7] S4Vectors_0.48.0     BiocGenerics_0.56.0  generics_0.1.4      
## [10] tidyr_1.3.1          dplyr_1.1.4         
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1            farver_2.1.2               
##  [3] Biostrings_2.78.0           S7_0.2.1                   
##  [5] bitops_1.0-9                fastmap_1.2.0              
##  [7] RCurl_1.98-1.17             GenomicAlignments_1.46.0   
##  [9] XML_3.99-0.20               digest_0.6.39              
## [11] rpart_4.1.24                lifecycle_1.0.4            
## [13] cluster_2.1.8.1             statmod_1.5.1              
## [15] magrittr_2.0.4              compiler_4.5.1             
## [17] rlang_1.1.6                 Hmisc_5.2-4                
## [19] sass_0.4.10                 tools_4.5.1                
## [21] yaml_2.3.10                 data.table_1.17.8          
## [23] knitr_1.50                  S4Arrays_1.10.0            
## [25] htmlwidgets_1.6.4           curl_7.0.0                 
## [27] DelayedArray_0.36.0         RColorBrewer_1.1-3         
## [29] abind_1.4-8                 BiocParallel_1.44.0        
## [31] withr_3.0.2                 foreign_0.8-90             
## [33] purrr_1.2.0                 nnet_7.3-20                
## [35] grid_4.5.1                  colorspace_2.1-2           
## [37] ggplot2_4.0.1               scales_1.4.0               
## [39] SummarizedExperiment_1.40.0 cli_3.6.5                  
## [41] rmarkdown_2.30              crayon_1.5.3               
## [43] rstudioapi_0.17.1           httr_1.4.7                 
## [45] rjson_0.2.23                cachem_1.1.0               
## [47] stringr_1.6.0               parallel_4.5.1             
## [49] XVector_0.50.0              restfulr_0.0.16            
## [51] matrixStats_1.5.0           base64enc_0.1-3            
## [53] vctrs_0.6.5                 Matrix_1.7-4               
## [55] jsonlite_2.0.0              Formula_1.2-5              
## [57] htmlTable_2.4.3             limma_3.66.0               
## [59] jquerylib_0.1.4             glue_1.8.0                 
## [61] codetools_0.2-20            stringi_1.8.7              
## [63] gtable_0.3.6                BiocIO_1.20.0              
## [65] tibble_3.3.0                pillar_1.11.1              
## [67] htmltools_0.5.8.1           R6_2.6.1                   
## [69] evaluate_1.0.5              lattice_0.22-7             
## [71] Biobase_2.70.0              backports_1.5.0            
## [73] Rsamtools_2.26.0            cigarillo_1.0.0            
## [75] bslib_0.9.0                 gridExtra_2.3              
## [77] SparseArray_1.10.2          checkmate_2.3.3            
## [79] xfun_0.54                   MatrixGenerics_1.22.0      
## [81] pkgconfig_2.0.3

Trans-regulatory Network Analysis of Inv4m Introgression

Fausto Rodriguez

2025-12-04