Purpose: Identify and characterize trans-regulatory networks mediated by genes within the Inv4m introgression region. This analysis integrates differential expression results with co-expression data from MaizeNetome to construct regulatory networks.
Approach: 1. Define genomic regions (Inv4m inversion, flanking introgression, in_trans) 2. Test for enrichment of DEGs within these regions 3. Query MaizeNetome for co-expression edges involving introgressed genes 4. Filter for trans-regulatory relationships (introgressed → in_trans targets) 5. Validate using expression correlation from RNA-seq data 6. Visualize networks with igraph/visNetwork
Expected outcomes: - Identification of trans-regulatory hubs within Inv4m - Network visualization showing regulatory relationships - Statistical enrichment of DEGs by genomic region - Output files for downstream analysis and manuscript figures
library(dplyr)
library(tidyr)
library(rtracklayer)
library(GenomicRanges)
library(igraph)
library(visNetwork)
#' Compute correlation matrix with FDR-adjusted p-values
#'
#' Calculates pairwise correlations and adjusts p-values for multiple testing.
#'
#' @param x Numeric matrix or data frame
#' @param type Correlation method: "pearson" or "spearman"
#' @param use Handling of missing values
#' @param adjust Multiple testing correction method
#'
#' @return List with correlation matrix (R), adjusted p-values (P),
#' unadjusted p-values (P.unadj), correlation type, and adjustment method
adjust.corr <- function(x,
type = c("pearson", "spearman"),
use = c("complete.obs", "pairwise.complete.obs"),
adjust = "holm") {
opt <- options(scipen = 5)
on.exit(options(opt))
type <- match.arg(type)
use <- match.arg(use)
x <- if (use == "complete.obs") {
as.matrix(na.omit(x))
} else {
as.matrix(x)
}
R <- Hmisc::rcorr(x, type = type)
P <- P.unadj <- R$P
p <- P[lower.tri(P)]
adj.p <- p.adjust(p, method = adjust)
P[lower.tri(P)] <- adj.p
P[upper.tri(P)] <- 0
P <- P + t(P)
P <- ifelse(P < 1e-04, 0, P)
P <- format(round(P, 4))
diag(P) <- ""
P[c(grep("0.0000", P), grep("^ 0$", P))] <- "<.0001"
P[grep("0.000$", P)] <- "<.001"
P.unadj <- ifelse(P.unadj < 1e-04, 0, P.unadj)
P.unadj <- format(round(P.unadj, 4))
diag(P.unadj) <- ""
P.unadj[c(grep("0.0000$", P.unadj), grep("^ 0$", P.unadj))] <- "<.0001"
P.unadj[grep("0.000$", P.unadj)] <- "<.001"
result <- list(
R = R,
P = P,
P.unadj = P.unadj,
type = type,
adjustr = adjust
)
class(result) <- "adjust.corr"
result
}
Purpose: Extract gene coordinates for defining introgression boundaries.
Approach: Import GFF3 file and subset to protein-coding genes on chromosomes 1-10.
v5_gff <- "~/ref/zea/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.59.chr.gff3"
v5 <- rtracklayer::import(v5_gff) %>%
subset(type == "gene" & seqnames %in% 1:10)
genes <- as.data.frame(v5)
genes$ID <- gsub("gene:", "", genes$ID)
cat("Total genes loaded:", nrow(genes), "\n")
## Total genes loaded: 43459
Purpose: Identify genes within the structural inversion boundaries.
Approach: Use known breakpoint genes to define Inv4m start/end positions.
# Breakpoint genes
inv4m_start <- genes[genes$ID == "Zm00001eb190470", "start"]
inv4m_end <- genes[genes$ID == "Zm00001eb194800", "end"]
cat("Inv4m region:", inv4m_start, "-", inv4m_end, "\n")
## Inv4m region: 172883675 - 188132113
# Extract all genes within inversion
inv4m_gene_ids <- genes %>%
filter(
seqnames == 4,
start >= inv4m_start,
end <= inv4m_end
) %>%
pull(ID)
cat("Genes in Inv4m:", length(inv4m_gene_ids), "\n")
## Genes in Inv4m: 434
cat("Breakpoints included:",
"Zm00001eb190470" %in% inv4m_gene_ids,
"Zm00001eb194800" %in% inv4m_gene_ids, "\n")
## Breakpoints included: TRUE TRUE
Purpose: Separate introgressed genes in_trans the inversion from those within it.
flanking_introgression_gene_ids <- cis_introgression_gene_ids[
!cis_introgression_gene_ids %in% inv4m_gene_ids
]
cat("Genes in flanking regions:",
length(flanking_introgression_gene_ids), "\n")
## Genes in flanking regions: 665
cat("All Inv4m genes within shared introgression:",
all(inv4m_gene_ids %in% cis_introgression_gene_ids), "\n")
## All Inv4m genes within shared introgression: TRUE
Purpose: Load DEG analysis results and annotate with genomic region information.
Approach: Read predictor effects from leaf developmental gradient analysis, filter for Inv4m genotype effect, and classify genes by genomic location.
Expected outcome: Data frame with DEG statistics and region annotations. ## Load Field DEG List
# Load differential expression results
effects_df <- read.csv("~/Desktop/predictor_effects_leaf_interaction_model.csv") %>%
filter(predictor == "Inv4m")
# Load curated labels
curated <- read.csv("~/Desktop/selected_DEGs_curated_locus_label_2.csv") %>%
dplyr::select(gene, locus_label)
effects_df$locus_label[effects_df$locus_label == "A0A1D6NG77"] <- NA
# Merge curated labels
effects_df <- effects_df %>%
left_join(curated, by = "gene", suffix = c("", "_curated")) %>%
mutate(locus_label = coalesce(locus_label_curated, locus_label)) %>%
dplyr::select(-locus_label_curated)
# Curated updates: only gene and locus_label
curated <- tibble::tribble(
~gene, ~locus_label,
# custom labels for trans network
"Zm00001eb384770", "zcn26",
"Zm00001eb192330", "roc4-1",
"Zm00001eb193260", "trxl2",
"Zm00001eb194300", "paspa",
"Zm00001eb213980", "dywl",
"Zm00001eb194080", "ugt85a2",
"Zm00001eb112470", "engd2",
"Zm00001eb332450", "roc4-2",
"Zm00001eb193270", "trxl5",
"Zm00001eb066620", "tipin",
"Zm00001eb241410", "etfa",
"Zm00001eb013800", "mrlk",
"Zm00001eb249540", "prpl29",
"Zm00001eb000540", "prd3",
"Zm00001eb060540", "actin-1",
"Zm00001eb192580", "map4k8",
"Zm00001eb193120", "o3l1",
"Zm00001eb043750", "cdrk9",
# From original curated list
"Zm00001eb034810", "mgd2",
"Zm00001eb241920", "gpx1",
"Zm00001eb162710", "spx4",
"Zm00001eb038730", "pht7",
"Zm00001eb370610", "rfk1",
"Zm00001eb386270", "spx6",
"Zm00001eb335670", "sqd3",
"Zm00001eb151650", "pap1",
"Zm00001eb154650", "ppa1",
"Zm00001eb280120", "pfk1",
"Zm00001eb036910", "gpx3",
"Zm00001eb406610", "glk4",
"Zm00001eb048730", "spx2",
"Zm00001eb361620", "ppa2",
"Zm00001eb297970", "sqd2",
"Zm00001eb347070", "sqd1",
"Zm00001eb144680", "rns1",
"Zm00001eb430590", "nrx3",
"Zm00001eb247580", "ppck3",
"Zm00001eb351780", "ugp3",
"Zm00001eb222510", "pht1",
"Zm00001eb126380", "phos1",
"Zm00001eb047070", "pht2",
"Zm00001eb041390", "rns3",
"Zm00001eb116580", "spd1",
"Zm00001eb069630", "oct4",
"Zm00001eb083520", "dgd1",
"Zm00001eb130570", "sag21",
"Zm00001eb239700", "ppa3",
"Zm00001eb258130", "mgd3",
"Zm00001eb202100", "pap14",
"Zm00001eb144670", "rns2",
"Zm00001eb087720", "pht13",
"Zm00001eb277280", "gst19",
"Zm00001eb339870", "pld16",
"Zm00001eb289800", "pah1",
"Zm00001eb369590", "nrx1",
"Zm00001eb238670", "pep2"
)
# Update only the locus_label using curated values
effects_df <- effects_df %>%
left_join(curated, by = "gene", suffix = c("", "_new")) %>%
mutate(locus_label = coalesce(locus_label_new, locus_label)) %>%
dplyr::select(-locus_label_new)
# Replace with your actual field DEG gene IDs
field_degs <- effects_df %>% filter(is_hiconf_DEG) %>% pull(gene) %>% unique()
cat("\nField DEG list loaded:\n")
##
## Field DEG list loaded:
cat(" Total field DEGs:", length(field_degs), "\n")
## Total field DEGs: 165
Purpose: Test whether DEGs are enriched within introgression regions.
Approach: Fisher’s exact tests comparing odds of significance across genomic regions.
inv4m_outside_table <- with(
effects_df %>% filter(!in_flank),
table(in_Inv4m, is_DEG)
)
inv4m_outside_test <- fisher.test(inv4m_outside_table)
cat("Contingency table:\n")
## Contingency table:
print(inv4m_outside_table)
## is_DEG
## in_Inv4m FALSE TRUE
## FALSE 22932 475
## TRUE 125 130
cat("\nFisher's exact test p-value:",
format(inv4m_outside_test$p.value, scientific = TRUE), "\n")
##
## Fisher's exact test p-value: 9.83221e-140
cat("Odds ratio:", round(inv4m_outside_test$estimate, 3), "\n")
## Odds ratio: 50.163
flanking_outside_table <- with(
effects_df %>% filter(in_flank | in_trans),
table(in_flank, is_DEG)
)
flanking_outside_test <- fisher.test(flanking_outside_table)
cat("Contingency table:\n")
## Contingency table:
print(flanking_outside_table)
## is_DEG
## in_flank FALSE TRUE
## FALSE 22932 475
## TRUE 182 173
cat("\nFisher's exact test p-value:",
format(flanking_outside_test$p.value, scientific = TRUE), "\n")
##
## Fisher's exact test p-value: 3.080185e-178
cat("Odds ratio:", round(flanking_outside_test$estimate, 3), "\n")
## Odds ratio: 45.853
Purpose: Test enrichment of highly differentially expressed genes (|log2FC| > 1.5).
cat("Total High Confidence DEGs:", sum(effects_df$is_hiconf_DEG), "\n")
## Total High Confidence DEGs: 167
inv4m_outside_hiconf <- with(
effects_df %>% filter(!in_flank),
table(in_Inv4m, is_hiconf_DEG)
)
inv4m_outside_hiconf_test <- fisher.test(inv4m_outside_hiconf)
cat("Contingency table:\n")
## Contingency table:
print(inv4m_outside_hiconf)
## is_hiconf_DEG
## in_Inv4m FALSE TRUE
## FALSE 23361 46
## TRUE 210 45
cat("\nFisher's exact test p-value:",
format(inv4m_outside_hiconf_test$p.value, scientific = TRUE), "\n")
##
## Fisher's exact test p-value: 6.673456e-65
flanking_outside_hiconf <- with(
effects_df %>% filter(in_flank | in_trans),
table(in_flank, is_hiconf_DEG)
)
flanking_outside_hiconf_test <- fisher.test(flanking_outside_hiconf)
cat("Contingency table:\n")
## Contingency table:
print(flanking_outside_hiconf)
## is_hiconf_DEG
## in_flank FALSE TRUE
## FALSE 23361 46
## TRUE 279 76
cat("\nFisher's exact test p-value:",
format(flanking_outside_hiconf_test$p.value, scientific = TRUE), "\n")
##
## Fisher's exact test p-value: 1.919309e-109
inv4m_flanking_hiconf <- with(
effects_df %>% filter(in_cis),
table(in_Inv4m, is_hiconf_DEG)
)
inv4m_flanking_hiconf_test <- fisher.test(inv4m_flanking_hiconf)
cat("Contingency table:\n")
## Contingency table:
print(inv4m_flanking_hiconf)
## is_hiconf_DEG
## in_Inv4m FALSE TRUE
## FALSE 279 76
## TRUE 210 45
cat("\nFisher's exact test p-value:",
format(inv4m_flanking_hiconf_test$p.value, scientific = TRUE), "\n")
##
## Fisher's exact test p-value: 2.594766e-01
Purpose: Import co-expression edges from MaizeNetome for the chromosome 4 introgression region.
Approach: Load three separate queries covering left flank, Inv4m, and right flank, then combine.
Expected outcome: Edge list with co-expression relationships.
# Files were collected manually by searching for the genes in these v4
# coordinate limits in the search tool of MaizeNetome
# Left: 4:155195539-170747954
left_flanking_maizenetome <- read.csv(
"~/Desktop/inv4m/coexpression/left_MaizeNetome_coexpression_edges.csv"
)
# Inv4m: 4:170747955-186145605
Inv4m_maizenetome <- read.csv(
"~/Desktop/inv4m/coexpression/inv4m_MaizeNetome_coexpression_edges.csv"
)
# Right: 4:186145606-195900523
right_flanking_maizenetome <- read.csv(
"~/Desktop/inv4m/coexpression/right_MaizeNetome_coexpression_edges.csv"
)
MaizeNetome <- rbind(
left_flanking_maizenetome %>%
separate_longer_delim(cols = Direct.Connetation.Node, delim = "\n") %>%
mutate(network = "coexpression"),
Inv4m_maizenetome %>%
separate_longer_delim(cols = Direct.Connetation.Node, delim = "\n") %>%
mutate(network = "coexpression"),
right_flanking_maizenetome %>%
separate_longer_delim(cols = Direct.Connetation.Node, delim = "\n") %>%
mutate(network = "coexpression")
) %>%
dplyr::rename(Direct.Connection = Direct.Connetation.Node)
cat("Total edges:", nrow(MaizeNetome), "\n")
## Total edges: 640599
cat("Unique query genes:",
length(unique(MaizeNetome$Element.ID)), "\n")
## Unique query genes: 717
cat("Unique connected genes:",
length(unique(MaizeNetome$Direct.Connection)), "\n")
## Unique connected genes: 23748
Purpose: MaizeNetome uses B73 v4 gene IDs; convert to v5 for integration with current analysis.
Approach: Load v4-to-v5 crossreference table and apply to both query genes and connection targets.
v4_v5 <- read.table(
file = "/Users/fvrodriguez/Desktop/B73_gene_xref/B73v4_to_B73v5.tsv",
sep = "\t",
header = FALSE
) %>%
dplyr::rename(v4 = V1, v5 = V2) %>%
separate_longer_delim(cols = v5, delim = ",")
cat("v4 to v5 mappings:", nrow(v4_v5), "\n")
## v4 to v5 mappings: 43680
# Load v4 GFF for gene metadata
v4_gff <- "~/ref/zea/Zm-B73-REFERENCE-GRAMENE-4.0_Zm00001d.2.gff3"
v4 <- rtracklayer::import(v4_gff) %>%
subset(type == "gene")
v4_genes <- as.data.frame(v4)
v4_genes$ID <- gsub("gene:", "", v4_genes$ID)
cat("v4 genes loaded:", nrow(v4_genes), "\n")
## v4 genes loaded: 39498
# Convert query gene IDs (Element.ID)
MaizeNetome_v5 <- MaizeNetome %>%
left_join(v4_v5, by = c("Element.ID" = "v4")) %>%
dplyr::rename(Element.ID.v5 = v5) %>%
filter(!is.na(Element.ID.v5))
# Convert target gene IDs (Direct.Connection)
MaizeNetome_v5 <- MaizeNetome_v5 %>%
left_join(v4_v5, by = c("Direct.Connection" = "v4")) %>%
dplyr::rename(Direct.Connection.v5 = v5) %>%
filter(!is.na(Direct.Connection.v5))
cat("Edges after v4->v5 conversion:", nrow(MaizeNetome_v5), "\n")
## Edges after v4->v5 conversion: 767533
cat("Unique v5 query genes:",
length(unique(MaizeNetome_v5$Element.ID.v5)), "\n")
## Unique v5 query genes: 706
cat("Unique v5 target genes:",
length(unique(MaizeNetome_v5$Direct.Connection.v5)), "\n")
## Unique v5 target genes: 21854
Purpose: Annotate transcription factors for network interpretation.
grassius <- read.csv(
"~/Desktop/grassius_annotation_v5.csv",
na.strings = ""
) %>%
inner_join(v4_v5, by = c("gene.ID" = "v5")) %>%
distinct()
cat("TF annotations:", nrow(grassius), "\n")
## TF annotations: 3487
Purpose: Define potential regulators (introgressed DEGs) and targets (in_trans DEGs).
Approach: Filter DEGs by genomic location and significance thresholds.
# Regulators: DEGs within shared introgression
regulators <- effects_df %>%
filter(in_cis == TRUE, is_hiconf_DEG == TRUE) %>%
arrange(adj.P.Val) %>%
select(gene:desc_merged, logFC, adj.P.Val) %>%
inner_join(v4_v5, by = c(gene = "v5")) %>%
group_by(gene) %>%
arrange(adj.P.Val)
# Targets: DEGs in_trans introgression
targets <- effects_df %>%
filter(in_trans == TRUE, is_hiconf_DEG == TRUE) %>%
arrange(adj.P.Val) %>%
select(gene:desc_merged, logFC, adj.P.Val) %>%
inner_join(v4_v5, by = c(gene = "v5")) %>%
group_by(gene) %>%
arrange(adj.P.Val)
cat("Candidate regulators (introgressed DEGs):",
length(unique(regulators$gene)), "\n")
## Candidate regulators (introgressed DEGs): 107
cat("Candidate targets (in_trans DEGs):",
length(unique(targets$gene)), "\n")
## Candidate targets (in_trans DEGs): 39
Purpose: Extract co-expression edges connecting introgressed genes to genes in_trans the introgression.
Approach: Filter MaizeNetome edges where query gene is in shared introgression and target is in_trans.
# Define gene sets
introgressed_regulators <- regulators$gene %>% sort() %>% unique()
outside_targets <- targets$gene %>% sort() %>% unique()
# Further subdivide introgressed regulators
inv4m_regulators <- introgressed_regulators[
introgressed_regulators %in% inv4m_gene_ids
]
flanking_regulators <- introgressed_regulators[
introgressed_regulators %in% flanking_introgression_gene_ids
]
cat("Inv4m regulators:", length(inv4m_regulators), "\n")
## Inv4m regulators: 37
cat("Flanking regulators:", length(flanking_regulators), "\n")
## Flanking regulators: 70
# Extract trans edges from MaizeNetome
trans_edges <- MaizeNetome_v5 %>%
filter(
Element.ID.v5 %in% introgressed_regulators,
Direct.Connection.v5 %in% outside_targets
) %>%
dplyr::rename(
introgressed_gene = Element.ID.v5,
target_gene = Direct.Connection.v5
) %>%
select(introgressed_gene, target_gene, network)
cat("Trans-regulatory edges from MaizeNetome:", nrow(trans_edges), "\n")
## Trans-regulatory edges from MaizeNetome: 103
cat("Unique regulators with trans edges:",
length(unique(trans_edges$introgressed_gene)), "\n")
## Unique regulators with trans edges: 33
cat("Unique targets with trans edges:",
length(unique(trans_edges$target_gene)), "\n")
## Unique targets with trans edges: 17
Purpose: Validate MaizeNetome co-expression edges using actual RNA-seq expression data.
Approach: Calculate pairwise correlations for all putative trans-regulatory pairs and filter by significance.
voomR <- readRDS("~/Desktop/normalized_expression_voom_object.rda")
cat("Expression matrix dimensions:",
nrow(voomR$E), "genes x", ncol(voomR$E), "samples\n")
## Expression matrix dimensions: 24249 genes x 43 samples
# Genes involved in trans network
trans_network_genes <- union(
trans_edges$introgressed_gene,
trans_edges$target_gene
)
cat("Genes in trans network (before filtering):",
length(trans_network_genes), "\n")
## Genes in trans network (before filtering): 50
# Filter to genes present in expression data
genes_in_expr <- trans_network_genes %in% rownames(voomR$E)
trans_network_genes <- trans_network_genes[genes_in_expr]
cat("Genes in trans network (in expression data):",
length(trans_network_genes), "\n")
## Genes in trans network (in expression data): 50
cat("Genes missing from expression data:",
sum(!genes_in_expr), "\n")
## Genes missing from expression data: 0
# Filter regulator and target sets
outside_targets_expr <- outside_targets[
outside_targets %in% rownames(voomR$E)
]
introgressed_regulators_expr <- introgressed_regulators[
introgressed_regulators %in% rownames(voomR$E)
]
cat("Regulators in expression data:",
length(introgressed_regulators_expr),
"/", length(introgressed_regulators), "\n")
## Regulators in expression data: 107 / 107
cat("Targets in expression data:",
length(outside_targets_expr),
"/", length(outside_targets), "\n")
## Targets in expression data: 39 / 39
# Calculate correlation matrix for network genes only
expr_matrix <- t(voomR$E)[, trans_network_genes]
corr_matrix <- round(cor(expr_matrix, use = "pairwise.complete.obs"), 2)
p_adj <- adjust.corr(expr_matrix, adjust = "fdr")
# Filter correlations by significance
corr_filtered <- corr_matrix
corr_filtered[p_adj$P > 0.05] <- 0
cat("Correlation matrix dimensions:",
nrow(corr_matrix), "x", ncol(corr_matrix), "\n")
## Correlation matrix dimensions: 50 x 50
cat("Significant correlations:",
sum(corr_filtered != 0), "\n")
## Significant correlations: 2464
Purpose: Create edge list with correlation coefficients and network distances for validated trans-regulatory pairs.
Approach: Extract correlations for introgressed → in_trans pairs, filter by significance and direction.
# Helper functions for edge weights
n_samples <- ncol(voomR$E)
mutual_info <- function(r) {
0.5 * log2(1 / (1 - r^2))
}
# https://doi.org/10.1186/s12859-023-05161-y
robust_d <- function(r, n) {
sqrt(1 - abs(r))
}
# for standardized data This is the euclidean distance between
# vectors
# https://cmci.colorado.edu/classes/INFO-1301/files/borgatti.htm
standard_d <- function(r, n) {
srt(2*n*abs(1-r))
}
# Validate gene IDs against correlation matrix
corr_genes <- colnames(corr_filtered)
regulators_valid <- introgressed_regulators_expr[
introgressed_regulators_expr %in% corr_genes
]
targets_valid <- outside_targets_expr[
outside_targets_expr %in% corr_genes
]
cat("Original regulators:", length(introgressed_regulators_expr),
"| Valid:", length(regulators_valid), "\n")
## Original regulators: 107 | Valid: 33
cat("Original targets:", length(outside_targets_expr),
"| Valid:", length(targets_valid), "\n")
## Original targets: 39 | Valid: 17
# Extract correlations for trans pairs using validated IDs
corr_trans <- corr_filtered[
regulators_valid,
targets_valid
]
cat("Correlation submatrix:",
nrow(corr_trans), "regulators x",
ncol(corr_trans), "targets\n")
## Correlation submatrix: 33 regulators x 17 targets
# Convert to edge list
edge_idx <- cbind(
c(row(corr_trans)),
c(col(corr_trans)),
c(corr_trans)
)
all_edges <- data.frame(
from = rownames(corr_trans)[edge_idx[, 1]],
to = colnames(corr_trans)[edge_idx[, 2]],
r = edge_idx[, 3]
)
cat("All possible edges:", nrow(all_edges), "\n")
## All possible edges: 561
cat("Non-zero correlations:", sum(all_edges$r != 0), "\n")
## Non-zero correlations: 551
# Filter to edges matching MaizeNetome predictions
two_way <- trans_edges %>%
filter(
introgressed_gene %in% regulators_valid,
target_gene %in% targets_valid
) %>%
select(from = introgressed_gene, to = target_gene)
cat("MaizeNetome edges (validated):", nrow(two_way), "\n")
## MaizeNetome edges (validated): 103
edge_list <- all_edges %>%
inner_join(two_way, by = c("from", "to")) %>%
mutate(
e = case_when(abs(r) > 0 ~ 1, .default = 0),
weight = mutual_info(r),
distance = robust_d(r, n_samples)
) %>%
filter(e == 1, from != to) %>%
mutate(color = "lightgrey") %>%
arrange(from, -abs(r))
cat("Validated trans-regulatory edges:", nrow(edge_list), "\n")
## Validated trans-regulatory edges: 96
cat("Unique regulators:",
length(unique(edge_list$from)), "\n")
## Unique regulators: 32
cat("Unique targets:",
length(unique(edge_list$to)), "\n")
## Unique targets: 17
Purpose: Create reusable functions for network formatting and visualization.
#' Format nodes for network visualization
#'
#' Creates standardized node data frame with region colors, labels, and styling.
#'
#' @param node_ids Character vector of gene IDs
#' @param effects_df Data frame with gene expression data
#' @param custom_labels Named vector of custom gene labels
#'
#' @return Data frame with formatted node attributes
format_network_nodes <- function(node_ids, effects_df, custom_labels =NULL) {
nodes <- data.frame(id = node_ids) %>%
left_join(
effects_df %>%
group_by(gene) %>%
dplyr::slice(1) %>%
ungroup() %>%
select(gene, locus_label, logFC),
by = c("id" = "gene")
) %>%
mutate(
region = case_when(
id %in% inv4m_regulators ~ "inv4m",
id %in% flanking_regulators ~ "flank",
TRUE ~ "out"
),
upregulated = logFC > 0,
color.border = case_when(
region == "inv4m" ~ "#551A8B",
region == "flank" ~ "#FF1493",
region == "out" ~ "#FFD700"
),
color.background = case_when(
upregulated & region == "inv4m" ~ "#551A8B",
upregulated & region == "flank" ~ "#FF1493",
upregulated & region == "out" ~ "#FFD700",
!upregulated ~ "white"
),
borderWidth = 4,
label = coalesce(locus_label, id)
)
if(!is.null(custom_labels) && any(names(custom_labels) %in% nodes$id)) {
cat("Custom labels found for nodes:",
sum(names(custom_labels) %in% nodes$id), "\n")
# Apply custom labels
for (gene_id in names(custom_labels)) {
if (gene_id %in% nodes$id) {
nodes$label[nodes$id == gene_id] <- custom_labels[gene_id]
}
}
}
# Add italics
nodes$label <- paste0("<i>", nodes$label, "</i>")
nodes
}
#' Create MST visualization from edge list
#'
#' Builds minimum spanning tree and creates interactive visNetwork plot.
#'
#' @param edges Data frame with from, to, distance, r, color, weight columns
#' @param nodes Data frame with formatted node attributes
#' @param effects_df Data frame with logFC for arrow type calculation
#' @param layout igraph layout algorithm name
#' @param font_size Node label font size
#'
#' @return visNetwork htmlwidget
visualize_mst <- function(edges, nodes, effects_df,
layout = "layout_nicely", font_size = 20) {
# Create igraph object
g <- graph_from_data_frame(
d = edges %>% select(from, to, distance, r, color, weight),
directed = TRUE,
vertices = nodes
)
# Compute MST
g_mst <- mst(g, weights = E(g)$distance)
cat("MST - Edges:", length(E(g_mst)), "| Nodes:", length(V(g_mst)), "\n")
# Check labels here
cat("Sample labels:", head(V(g_mst)$label, 10), "\n")
# Get edge data for arrow types
mst_edges <- as_data_frame(g_mst, what = "edges") %>%
left_join(
effects_df %>%
group_by(gene) %>%
dplyr::slice(1) %>%
ungroup() %>%
select(gene, logFC_from = logFC),
by = c("from" = "gene")
) %>%
left_join(
effects_df %>%
group_by(gene) %>%
dplyr::slice(1) %>%
ungroup() %>%
select(gene, logFC_to = logFC),
by = c("to" = "gene")
) %>%
mutate(
concordant = sign(logFC_from) == sign(logFC_to),
color = ifelse(is.na(color), "lightgrey", color)
)
# Set edge attributes
g_mst_vis <- g_mst %>%
set_edge_attr(
"arrows.to.type",
value = ifelse(mst_edges$concordant, "arrow", "bar")
) %>%
set_edge_attr("width", value = 3) %>%
set_edge_attr("color", value = mst_edges$color)
# CRITICAL FIX: Copy label to name so visIgraph uses it
V(g_mst_vis)$name <- V(g_mst_vis)$label
# Plot
visIgraph(g_mst_vis) %>%
visNodes(font = list(size = font_size, multi = "html")) %>%
visIgraphLayout(layout = layout) %>%
visOptions(
highlightNearest = list(enabled = TRUE, degree = 1, hover = TRUE),
nodesIdSelection = TRUE
)
}
Purpose: Minimum spanning tree of all validated trans-regulatory edges from introgressed genes (Inv4m + flanking) to in_trans targets.
complete_trans_edges <- edge_list %>%
filter(from %in% c(inv4m_regulators, flanking_regulators))
cat("Complete trans co-expression network:\n")
## Complete trans co-expression network:
cat(" Edges:", nrow(complete_trans_edges), "\n")
## Edges: 96
cat(" Regulators:", length(unique(complete_trans_edges$from)),
"(Inv4m:", sum(unique(complete_trans_edges$from) %in% inv4m_regulators),
"| Flanking:",
sum(unique(complete_trans_edges$from) %in% flanking_regulators), ")\n")
## Regulators: 32 (Inv4m: 13 | Flanking: 19 )
cat(" Targets:", length(unique(complete_trans_edges$to)), "\n\n")
## Targets: 17
complete_trans_node_ids <- unique(c(complete_trans_edges$from,
complete_trans_edges$to))
complete_trans_nodes <- format_network_nodes(
complete_trans_node_ids,
effects_df
)
visualize_mst(complete_trans_edges, complete_trans_nodes, effects_df)
## MST - Edges: 47 | Nodes: 49
## Sample labels: <i>Zm00001eb186400</i> <i>Zm00001eb186410</i> <i>Zm00001eb186840</i> <i>gpm458</i> <i>Zm00001eb187400</i> <i>his402</i> <i>Zm00001eb187440</i> <i>smn</i> <i>Zm00001eb188030</i> <i>his2b5</i>
Purpose: Minimum spanning tree of validated trans-regulatory relationships from flanking introgression genes to in_trans targets.
flanking_trans_edges <- edge_list %>%
filter(
from %in% flanking_regulators,
!to %in% flanking_regulators
)
cat("Flanking trans co-expression network:\n")
## Flanking trans co-expression network:
cat(" Edges:", nrow(flanking_trans_edges), "\n")
## Edges: 59
cat(" Regulators:", length(unique(flanking_trans_edges$from)), "\n")
## Regulators: 19
cat(" Targets:", length(unique(flanking_trans_edges$to)), "\n\n")
## Targets: 13
flanking_trans_node_ids <- unique(c(flanking_trans_edges$from,
flanking_trans_edges$to))
flanking_trans_nodes <- format_network_nodes(
flanking_trans_node_ids,
effects_df
)
visualize_mst(flanking_trans_edges, flanking_trans_nodes, effects_df,
font_size = 25)
## MST - Edges: 29 | Nodes: 32
## Sample labels: <i>Zm00001eb186400</i> <i>Zm00001eb186410</i> <i>Zm00001eb186840</i> <i>gpm458</i> <i>Zm00001eb187400</i> <i>his402</i> <i>Zm00001eb187440</i> <i>smn</i> <i>Zm00001eb188030</i> <i>his2b5</i>
Purpose: Minimum spanning tree of validated trans-regulatory relationships from Inv4m inversion genes to in_trans targets.
inv4m_trans_edges <- edge_list %>%
filter(
from %in% inv4m_regulators,
!to %in% flanking_regulators
)
cat("Inv4m trans co-expression network:\n")
## Inv4m trans co-expression network:
cat(" Edges:", nrow(inv4m_trans_edges), "\n")
## Edges: 37
cat(" Regulators:", length(unique(inv4m_trans_edges$from)), "\n")
## Regulators: 13
cat(" Targets:", length(unique(inv4m_trans_edges$to)), "\n\n")
## Targets: 12
inv4m_trans_node_ids <- unique(c(inv4m_trans_edges$from,
inv4m_trans_edges$to))
inv4m_trans_nodes <- format_network_nodes(
inv4m_trans_node_ids,
effects_df
)
visualize_mst(inv4m_trans_edges, inv4m_trans_nodes, effects_df, font_size = 25)
## MST - Edges: 24 | Nodes: 25
## Sample labels: <i>jmj21</i> <i>jmj2</i> <i>pcna2</i> <i>roc4-1</i> <i>map4k8</i> <i>cct</i> <i>sae2</i> <i>o3l1</i> <i>trxl2</i> <i>ugt85a2</i>
Purpose: Minimum spanning tree excluding targets that are also regulated by flanking genes.
flanking_targets <- edge_list %>%
filter(from %in% flanking_regulators) %>%
pull(to) %>%
unique()
inv4m_exclusive_trans_edges <- edge_list %>%
filter(
from %in% inv4m_regulators,
!to %in% inv4m_regulators,
!to %in% flanking_regulators,
!to %in% flanking_targets
)
cat("Inv4m exclusive trans co-expression network:\n")
## Inv4m exclusive trans co-expression network:
cat(" Edges:", nrow(inv4m_exclusive_trans_edges), "\n")
## Edges: 4
cat(" Regulators:", length(unique(inv4m_exclusive_trans_edges$from)), "\n")
## Regulators: 2
cat(" Targets:", length(unique(inv4m_exclusive_trans_edges$to)), "\n")
## Targets: 4
cat(" (Excludes", length(flanking_targets),
"targets of flanking regulators)\n\n")
## (Excludes 13 targets of flanking regulators)
inv4m_exclusive_node_ids <- unique(c(inv4m_exclusive_trans_edges$from,
inv4m_exclusive_trans_edges$to))
inv4m_exclusive_nodes <- format_network_nodes(
inv4m_exclusive_node_ids,
effects_df
)
visualize_mst(inv4m_exclusive_trans_edges, inv4m_exclusive_nodes, effects_df,
font_size = 25)
## MST - Edges: 4 | Nodes: 6
## Sample labels: <i>jmj2</i> <i>trxl2</i> <i>actin-1</i> <i>prpl29</i> <i>dywl</i> <i>roc4-2</i>
Purpose: Export gene sets from the validated trans-regulatory network for GO enrichment analysis.
# Trans network regulators
trans_regulators <- data.frame(
gene = unique(edge_list$from)
) %>%
mutate(
region = case_when(
gene %in% inv4m_regulators ~ "inv4m",
gene %in% flanking_regulators ~ "flanking",
TRUE ~ "other"
)
) %>%
left_join(
effects_df %>% select(gene, locus_label, desc_merged, logFC, adj.P.Val),
by = "gene"
) %>%
arrange(region, desc(abs(logFC)))
# Trans network targets
trans_targets <- data.frame(
gene = unique(edge_list$to)
) %>%
left_join(
effects_df %>% select(gene, locus_label, desc_merged, logFC, adj.P.Val),
by = "gene"
) %>%
arrange(desc(abs(logFC)))
# All trans network genes
trans_network_all <- data.frame(
gene = unique(c(edge_list$from, edge_list$to))
) %>%
mutate(
node_type = case_when(
gene %in% edge_list$from & gene %in% edge_list$to ~ "both",
gene %in% edge_list$from ~ "regulator",
gene %in% edge_list$to ~ "target"
),
region = case_when(
gene %in% inv4m_regulators ~ "inv4m",
gene %in% flanking_regulators ~ "flanking",
TRUE ~ "in_trans"
)
) %>%
left_join(
effects_df %>% select(gene, locus_label, desc_merged, logFC, adj.P.Val),
by = "gene"
)
# Edge list with annotations
trans_edges_annotated <- edge_list %>%
left_join(
effects_df %>%
select(gene, regulator_symbol = locus_label,
regulator_desc = desc_merged, regulator_logFC = logFC),
by = c("from" = "gene")
) %>%
left_join(
effects_df %>%
select(gene, target_symbol = locus_label,
target_desc = desc_merged, target_logFC = logFC),
by = c("to" = "gene")
) %>%
mutate(
regulator_region = case_when(
from %in% inv4m_regulators ~ "inv4m",
from %in% flanking_regulators ~ "flanking",
TRUE ~ "other"
),
concordant = sign(regulator_logFC) == sign(target_logFC)
)
# Write outputs
write.csv(
trans_regulators,
file = "~/Desktop/trans_network_regulators.csv",
row.names = FALSE
)
write.csv(
trans_targets,
file = "~/Desktop/trans_network_targets.csv",
row.names = FALSE
)
write.csv(
trans_network_all,
file = "~/Desktop/trans_network_all_genes.csv",
row.names = FALSE
)
write.csv(
trans_edges_annotated,
file = "~/Desktop/trans_network_edges_annotated.csv",
row.names = FALSE
)
cat("\n=== TRANS CO-EXPRESSION NETWORK SUMMARY ===\n")
##
## === TRANS CO-EXPRESSION NETWORK SUMMARY ===
cat("Regulators (introgressed DEGs):", nrow(trans_regulators), "\n")
## Regulators (introgressed DEGs): 34
cat(" Inv4m:", sum(trans_regulators$region == "inv4m"), "\n")
## Inv4m: 15
cat(" Flanking:", sum(trans_regulators$region == "flanking"), "\n")
## Flanking: 19
cat("\nTargets (in_trans DEGs):", nrow(trans_targets), "\n")
##
## Targets (in_trans DEGs): 17
cat("\nTotal network genes:", nrow(trans_network_all), "\n")
##
## Total network genes: 51
cat("Total validated edges:", nrow(edge_list), "\n")
## Total validated edges: 96
cat(" Concordant (same direction):",
sum(trans_edges_annotated$concordant), "\n")
## Concordant (same direction): 45
cat(" Discordant (opposite direction):",
sum(!trans_edges_annotated$concordant), "\n")
## Discordant (opposite direction): 57
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] visNetwork_2.1.4 igraph_2.2.1 rtracklayer_1.70.0
## [4] GenomicRanges_1.62.0 Seqinfo_1.0.0 IRanges_2.44.0
## [7] S4Vectors_0.48.0 BiocGenerics_0.56.0 generics_0.1.4
## [10] tidyr_1.3.1 dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 farver_2.1.2
## [3] Biostrings_2.78.0 S7_0.2.1
## [5] bitops_1.0-9 fastmap_1.2.0
## [7] RCurl_1.98-1.17 GenomicAlignments_1.46.0
## [9] XML_3.99-0.20 digest_0.6.39
## [11] rpart_4.1.24 lifecycle_1.0.4
## [13] cluster_2.1.8.1 statmod_1.5.1
## [15] magrittr_2.0.4 compiler_4.5.1
## [17] rlang_1.1.6 Hmisc_5.2-4
## [19] sass_0.4.10 tools_4.5.1
## [21] yaml_2.3.10 data.table_1.17.8
## [23] knitr_1.50 S4Arrays_1.10.0
## [25] htmlwidgets_1.6.4 curl_7.0.0
## [27] DelayedArray_0.36.0 RColorBrewer_1.1-3
## [29] abind_1.4-8 BiocParallel_1.44.0
## [31] withr_3.0.2 foreign_0.8-90
## [33] purrr_1.2.0 nnet_7.3-20
## [35] grid_4.5.1 colorspace_2.1-2
## [37] ggplot2_4.0.1 scales_1.4.0
## [39] SummarizedExperiment_1.40.0 cli_3.6.5
## [41] rmarkdown_2.30 crayon_1.5.3
## [43] rstudioapi_0.17.1 httr_1.4.7
## [45] rjson_0.2.23 cachem_1.1.0
## [47] stringr_1.6.0 parallel_4.5.1
## [49] XVector_0.50.0 restfulr_0.0.16
## [51] matrixStats_1.5.0 base64enc_0.1-3
## [53] vctrs_0.6.5 Matrix_1.7-4
## [55] jsonlite_2.0.0 Formula_1.2-5
## [57] htmlTable_2.4.3 limma_3.66.0
## [59] jquerylib_0.1.4 glue_1.8.0
## [61] codetools_0.2-20 stringi_1.8.7
## [63] gtable_0.3.6 BiocIO_1.20.0
## [65] tibble_3.3.0 pillar_1.11.1
## [67] htmltools_0.5.8.1 R6_2.6.1
## [69] evaluate_1.0.5 lattice_0.22-7
## [71] Biobase_2.70.0 backports_1.5.0
## [73] Rsamtools_2.26.0 cigarillo_1.0.0
## [75] bslib_0.9.0 gridExtra_2.3
## [77] SparseArray_1.10.2 checkmate_2.3.3
## [79] xfun_0.54 MatrixGenerics_1.22.0
## [81] pkgconfig_2.0.3