Overview

This analysis performs differential gene expression analysis on RNA-seq data from maize leaf samples across multiple experimental factors:

  • Leaf tissue position: Continuous gradient from apical to basal (leaf 1-4)
  • Phosphorus treatment: High P (+P) vs Low P (-P)
  • Genotype: Control (CTRL) vs Inversion 4m (Inv4m)

The analysis uses the limma-voom pipeline to identify genes that respond to:

  1. Inv4m: Genotype effect (Inversion 4m vs Control)
  2. leaf: Leaf stage effect
  3. -P: Phosphorus deficiency effect
  4. leaf:-P: Interaction between leaf position and P treatment

Fold Change Thresholds

  • Leaf effect: ±0.7 log2FC
  • P x Leaf interaction: ±0.7 log2FC
  • Other effects (-P, Inv4m): ±2.0 log2FC

Key Genomic Regions

  • Inv4m inversion: Chr4:172,883,675-188,132,113
  • Shared introgression: Chr4:157,012,149-195,900,523

Genes are classified as:

  • in_Inv4m: Within the inversion proper
  • in_cis: Within the shared introgression (includes inversion + flanking)
  • in_trans: Outside the shared introgression region

DEG Classification Tiers

Three levels of stringency for different analyses:

  1. Significant DEGs: for Manhattan plots, FDR < 0.05 (is_DEG).

  2. High-confidence DEGs: for GO enrichment, FDR < 0.05 AND \(log_2 FC \pm 2\) (is_hiconf_DEG).

  3. Selected DEGs: for manuscript main tables, the union (\(\cup\)) of the top 20 by significance and the top 20 by Mahalanobis distance (is_selected_DEG).

Libraries

library(edgeR)          # Differential expression analysis
library(limma)          # Linear models for RNA-seq
library(rtracklayer)    # Genomic annotation handling
library(GenomicRanges)  # Genomic ranges operations
library(dplyr)          # Data manipulation
library(tidyr)          # Data manipulation
library(stringr)        # String manipulation
library(ggplot2)        # Plotting
library(ggpubr)         # Publication ready plots
library(ggtext)         # Formatted text in plots
library(robustbase)     # Robust statistics (MCD for Mahalanobis)

Data Loading and Preprocessing

Load Expression Counts

counts <- read.csv("../data/inv4mRNAseq_gene_sample_exp.csv")

cat("Loaded expression data:\n")
## Loaded expression data:
cat("  Dimensions:", dim(counts), "\n")
##   Dimensions: 39756 62
cat("  Genes:", nrow(counts), "\n")
##   Genes: 39756
cat("  Samples:", ncol(counts) - 2, "\n")
##   Samples: 60

Load Sample Metadata

sampleInfo <- read.csv("../data/PSU-PHO22_Metadata.csv")

cat("\nSample metadata:\n")
## 
## Sample metadata:
cat("  Total samples:", nrow(sampleInfo), "\n")
##   Total samples: 64
cat("  Genotypes:", paste(unique(sampleInfo$Genotype), collapse = ", "), "\n")
##   Genotypes: CTRL, INV4
cat("  Treatments:", paste(unique(sampleInfo$Treatment), collapse = ", "), "\n")
##   Treatments: Low_P, High_P

Prepare Count Matrix

# Create sample ID mapping
tag <- sampleInfo$side_tag
names(tag) <- sampleInfo$library

# Extract gene IDs
gene_ids <- data.frame(gene = counts[, 2])

# Convert to matrix and set row names
counts <- as.matrix(counts[, -c(1:2)])
rownames(counts) <- gene_ids$gene

# Map sample names using tags
sampleNames <- tag[colnames(counts)]
colnames(counts) <- sampleNames

# Reorder metadata to match count matrix column order
sampleInfo <- sampleInfo[match(sampleNames, sampleInfo$side_tag), ]

cat("\nAll samples in metadata:", 
    all(sampleNames %in% sampleInfo$side_tag), "\n")
## 
## All samples in metadata: TRUE
cat("Count matrix prepared:\n")
## Count matrix prepared:
cat("  Genes:", nrow(counts), "\n")
##   Genes: 39756
cat("  Samples:", ncol(counts), "\n")
##   Samples: 60

Create DGEList Object

# Create DGEList with counts and sample information
y <- DGEList(counts = counts, samples = sampleInfo)

# Define groups from Treatment and Genotype interaction
y$group <- interaction(y$samples$Treatment, y$samples$Genotype)

cat("\nDGEList object created\n")
## 
## DGEList object created
head(y$samples)

Expression Filtering and Sample Quality Control

Filter Genes by Expression Level

Using filterByExpr to remove genes with low counts across samples.

# Keep genes with sufficient expression
keep <- filterByExpr(y, group = y$group)
y_filtered <- y[keep, ]

{
cat("\nExpression filtering:\n")
cat("  Genes before:", nrow(y), "\n")
cat("  Genes after:", nrow(y_filtered), "\n")
cat("  Genes removed:", sum(!keep), "\n")
}
## 
## Expression filtering:
##   Genes before: 39756 
##   Genes after: 24249 
##   Genes removed: 15507

Initial MDS: Quality Control Check

Compute MDS on all libraries (after gene filtering but before sample filtering) to identify low-quality samples based on library size.

# MDS with all samples (before library size filtering)
mds_all <- plotMDS(y_filtered, pch = 21, plot = TRUE)

# Histogram of library sizes
hist(
  y$samples$lib.size / 1e6,
  main = "Library Size Distribution",
  xlab = "Library Size (millions of reads)",
  breaks = 20
)

{
cat("\nLibrary size summary:\n")
print(summary(y$samples$lib.size))
cat("\nSamples with lib.size < 20 million:", 
    sum(y$samples$lib.size < 2e7), "\n")
}
## 
## Library size summary:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   327979 10716231 31446964 27205445 36129276 56805083 
## 
## Samples with lib.size < 20 million: 17
# Prepare data for plotting
mds_qc_data <- y_filtered$samples %>%
  mutate(
    dim1 = mds_all$x,
    dim2 = mds_all$y,
    lib_size_millions = lib.size / 1e6
  )

# Plot MDS colored by library size
ggplot(mds_qc_data, aes(x = dim1, y = dim2, color = lib_size_millions)) +
  geom_point(size = 3) +
  scale_color_viridis_c(option = "plasma") +
  labs(
    x = paste0("Dim1 (", round(100 * mds_all$var.explained[1]), "%)"),
    y = paste0("Dim2 (", round(100 * mds_all$var.explained[2]), "%)"),
    color = "Library Size\n(millions)"
  ) +
  theme_classic2(base_size = 15)

Filter Low Quality Libraries

Samples with library size < 20 million reads are considered low quality.

# Flag low count libraries
y_filtered$samples$lowCount <- y_filtered$samples$lib.size < 2e7

# Remove low quality samples
y_filtered_bySample <- y_filtered[, !y_filtered$samples$lowCount]

{
  cat("\nLow quality libraries:\n")
  table(y_filtered$samples$lowCount)
  cat("\nSamples after filtering:\n")
  cat("  Retained:", ncol(y_filtered_bySample), "\n")
  cat("  Removed:", sum(y_filtered$samples$lowCount), "\n")
}
## 
## Low quality libraries:
## 
## Samples after filtering:
##   Retained: 43 
##   Removed: 17

Sample Distribution Diagnostics

Verify experimental design balance across factors after quality filtering.

{
  with(y_filtered_bySample$samples, {
    cat("\n=== Sample Distribution by Factors ===\n")
    cat("\n-- Treatment --\n")
    table(Treatment)
    cat("\n-- Genotype --\n")
    table(Genotype)
    cat("\n-- Leaf Tissue --\n")
    table(leaf_tissue)
    cat("\n-- Treatment × Leaf Tissue --\n")
    table(Treatment, leaf_tissue)
    cat("\n-- Genotype × Leaf Tissue --\n")
    table(Genotype, leaf_tissue)
    cat("\n-- Treatment × Genotype × Leaf Tissue (3-way) --\n")
    table(Treatment, Genotype, leaf_tissue)
  })
}
## 
## === Sample Distribution by Factors ===
## 
## -- Treatment --
## 
## -- Genotype --
## 
## -- Leaf Tissue --
## 
## -- Treatment × Leaf Tissue --
## 
## -- Genotype × Leaf Tissue --
## 
## -- Treatment × Genotype × Leaf Tissue (3-way) --
## , , leaf_tissue = 1
## 
##          Genotype
## Treatment CTRL INV4
##    High_P    3    3
##    Low_P     1    4
## 
## , , leaf_tissue = 2
## 
##          Genotype
## Treatment CTRL INV4
##    High_P    3    3
##    Low_P     3    2
## 
## , , leaf_tissue = 3
## 
##          Genotype
## Treatment CTRL INV4
##    High_P    3    3
##    Low_P     2    3
## 
## , , leaf_tissue = 4
## 
##          Genotype
## Treatment CTRL INV4
##    High_P    3    3
##    Low_P     2    2

Quality Control: Batch Effects

Compute MDS Dimensions

MDS reveals sources of variation in gene expression across samples. Dimensions 3 and 4 are extracted from eigenvectors scaled by square root of variance explained.

mds <- plotMDS(
  y_filtered_bySample,
  pch = 21,
  label = y_filtered_bySample$samples$side_tag,
  plot = FALSE
)

# Store MDS coordinates in sample data
d <- y_filtered_bySample$samples
d$dim1 <- mds$x
d$dim2 <- mds$y
d$dim3 <- mds$eigen.vectors[, 3] * sqrt(mds$var.explained[3])
d$dim4 <- mds$eigen.vectors[, 4] * sqrt(mds$var.explained[4])

# Prepare factors for plotting
d$Treatment <- factor(d$Treatment)
levels(d$Treatment) <- c("+P", "-P")
d$Genotype <- factor(d$Genotype)
d$RNA_Plant <- factor(d$RNA_Plant)

# Variance explained by each dimension
tibble(
  dimension = paste("Dim", 1:4),
  var_explained = round(mds$var.explained[1:4], 4)
)

MDS: Dimensions 1 and 2 Colored by Multiple Factors

Exploring which experimental factors drive the main dimensions of variation.

# Treatment
p1 <- ggplot(d, aes(x = dim1, y = dim2, color = Treatment)) +
  geom_point(size = 3) +
  theme_classic2(base_size = 15) +
  ggtitle("Treatment")

# Row position
p2 <- ggplot(d, aes(x = dim1, y = dim2, color = row, shape = Treatment)) +
  geom_point(size = 3) +
  theme_classic2(base_size = 15) +
  ggtitle("Row Position")

# Collection time
p3 <- ggplot(d, aes(x = dim1, y = dim2, color = decimal_time)) +
  geom_point(size = 3) +
  theme_classic2(base_size = 15) +
  ggtitle("Collection Time")

# Collector
p4 <- ggplot(d, aes(x = dim1, y = dim2, color = COLLECTOR)) +
  geom_point(size = 3) +
  theme_classic2(base_size = 15) +
  ggtitle("Collector")

# Genotype
p5 <- ggplot(d, aes(x = dim1, y = dim2, color = Genotype)) +
  geom_point(size = 3) +
  theme_classic2(base_size = 15) +
  ggtitle("Genotype")

# Leaf tissue
p6 <- d %>%
  mutate(leaf = factor(leaf_tissue)) %>%
  ggplot(aes(x = dim1, y = dim2, color = leaf)) +
  geom_point(size = 3) +
  theme_classic2(base_size = 15) +
  ggtitle("Leaf Tissue")

ggarrange(p1, p2, p3, p4, p5, p6, ncol = 3, nrow = 2)

MDS: Primary Plot (Leaf Tissue and Treatment)

d %>%
  mutate(leaf = factor(leaf_tissue)) %>%
  ggplot(aes(x = dim1, y = dim2)) +
  xlab(paste0("dim1 (", round(100 * mds$var.explained[1]), "%)")) +
  ylab(paste0("dim2 (", round(100 * mds$var.explained[2]), "%)")) +
  geom_point(aes(fill = leaf, shape = Treatment), size = 4) +
  scale_fill_viridis_d() +
  scale_shape_manual(values = c(24, 21)) +
  guides(
    shape = guide_legend(
      title = "Treatment",
      order = 1,
      override.aes = list(size = 7)
    ),
    fill = guide_legend(
      title = "Leaf",
      order = 2,
      override.aes = list(geom = "point", shape = 22, size = 7)
    )
  ) +
  theme_classic2(base_size = 25) +
  theme(
    legend.box = "horizontal",
    legend.spacing = unit(0, "line"),
    legend.box.spacing = unit(0, "in"),
    legend.position = c(0.75, 0.17)
  )

MDS: Dimensions 3 and 4

The third dimension separates by genotype.

# Set up genotype labels with italic formatting
labels <- c("CTRL", "*Inv4m*")
names(labels) <- c("CTRL", "INV4")

d %>%
  ggplot(aes(x = dim3, y = dim4, fill = Genotype, shape = Treatment)) +
  xlab(paste0("dim3 (", round(100 * mds$var.explained[3]), "%)")) +
  ylab(paste0("dim4 (", round(100 * mds$var.explained[4]), "%)")) +
  geom_point(size = 4) +
  scale_fill_viridis_d(direction = -1, labels = labels) +
  scale_shape_manual(values = c(24, 21)) +
  guides(
    shape = "none",
    fill = guide_legend(
      title = "Genotype",
      order = 2,
      override.aes = list(geom = "point", shape = 22, size = 7, reverse = TRUE)
    )
  ) +
  theme_classic2(base_size = 25) +
  theme(
    legend.position = c(0.89, 0.9),
    legend.text = element_markdown(),
    legend.spacing = unit(0, "line"),
    legend.box.spacing = unit(0, "line")
  )

MDS Dimension Correlations

# Calculate correlations and p-values
mds_cor_results <- tibble(
  dimension = c("Dim1", "Dim2", "Dim3"),
  factor = c("Treatment", "Leaf tissue", "Genotype"),
  correlation = c(
    cor(d$dim1, as.numeric(d$Treatment)),
    cor(d$dim2, d$leaf_tissue),
    cor(d$dim3, as.numeric(d$Genotype))
  ),
  p_value = c(
    cor.test(d$dim1, as.numeric(d$Treatment))$p.value,
    cor.test(d$dim2, d$leaf_tissue)$p.value,
    cor.test(d$dim3, as.numeric(d$Genotype))$p.value
  )
) %>%
  mutate(
    adj_p_value = p.adjust(p_value, method = "fdr"),
    correlation = round(correlation, 3),
    p_value = signif(p_value, 3),
    adj_p_value = signif(adj_p_value, 3)
  )

mds_cor_results

Normalization and Design Matrix

Apply TMM (Trimmed Mean of M-values) normalization to account for sequencing depth differences, then fit a linear model including spatial covariates (Plot_Row, Plot_Column), biological factors (Genotype, leaf_tissue, Treatment), and the leaf_tissue:Treatment interaction. Voom transformation estimates mean-variance relationships and computes precision weights for each observation.

# TMM normalization
y_filtered_bySample <- calcNormFactors(y_filtered_bySample)

# Design matrix: spatial covariates + biological factors + interaction
design <- model.matrix(
  ~ Plot_Column + Plot_Row + leaf_tissue + Treatment + Genotype + leaf_tissue:Treatment,
  d
)

# Voom transformation with precision weights
voomR <- voom(y_filtered_bySample, design = design, plot = FALSE)

# Save normalized expression for downstream analyses
saveRDS(voomR$E, file = "~/Desktop/normalized_expression_logCPM_leaf_trt.rda")
saveRDS(voomR, file = "~/Desktop/normalized_expression_voom_object_leaf_trt.rda")

{
cat("Normalization factors range:", 
    range(y_filtered_bySample$samples$norm.factors), "\n\n\n")
cat("Design matrix:", nrow(design), "samples ×", ncol(design), 
    "coefficients\n")
design %>% as.data.frame %>% tibble() %>% print()

cat("Voom expression matrix:", nrow(voomR$E), "genes ×", 
    ncol(voomR$E), "samples\n")
}
## Normalization factors range: 0.8389793 1.136351 
## 
## 
## Design matrix: 43 samples × 7 coefficients
## # A tibble: 43 × 7
##    `(Intercept)` Plot_Column Plot_Row leaf_tissue `Treatment-P` GenotypeINV4
##            <dbl>       <dbl>    <dbl>       <dbl>         <dbl>        <dbl>
##  1             1           3        4           1             1            1
##  2             1           3        4           2             1            1
##  3             1           3        4           3             1            1
##  4             1           3        4           4             1            1
##  5             1           4        6           1             1            1
##  6             1           4        6           3             1            1
##  7             1           4        5           1             1            0
##  8             1           4        5           2             1            0
##  9             1           4        5           3             1            0
## 10             1           4        5           4             1            0
## # ℹ 33 more rows
## # ℹ 1 more variable: `leaf_tissue:Treatment-P` <dbl>
## Voom expression matrix: 24249 genes × 43 samples

Linear Model Fitting

Fit linear model to voom-transformed data, then apply robust empirical Bayes moderation to stabilize variance estimates and improve power for differential expression testing.

# Fit linear model
fit <- lmFit(voomR)

# Empirical Bayes moderation (robust = TRUE for outlier resistance)
ebfit <- eBayes(fit, robust = TRUE)

{
  cat("Model fitted:", nrow(fit$coefficients), "genes ×", 
      ncol(fit$coefficients), "coefficients\n")
  cat("\nSignificant genes per coefficient (FDR < 0.05):\n")
  print(colSums(abs(decideTests(ebfit))))
}
## Model fitted: 24249 genes × 7 coefficients
## 
## Significant genes per coefficient (FDR < 0.05):
##             (Intercept)             Plot_Column                Plot_Row 
##                   19081                     847                       0 
##             leaf_tissue             Treatment-P            GenotypeINV4 
##                   10024                      48                    3155 
## leaf_tissue:Treatment-P 
##                    3189

Effect Estimation and Confidence Intervals

Extract Coefficients of Interest

We focus on biological effects while accounting for spatial covariates.

# Define predictors of interest with ORIGINAL names from model
predictors_original <- c(
  "leaf_tissue",
  "Treatment-P",
  "GenotypeINV4",
  "leaf_tissue:Treatment-P"
)

topTable(ebfit)%>% colnames()
##  [1] "Plot_Column"             "Plot_Row"               
##  [3] "leaf_tissue"             "Treatment.P"            
##  [5] "GenotypeINV4"            "leaf_tissue.Treatment.P"
##  [7] "AveExpr"                 "F"                      
##  [9] "P.Value"                 "adj.P.Val"
predictors_toptable <- c(
"leaf_tissue",
"Treatment.P",
"GenotypeINV4",
"leaf_tissue.Treatment.P"
)

# Define STANDARDIZED names for output
predictors_standard <- c(
  "leaf",
  "-P",
  "Inv4m",
  "leaf:-P"
)

# Create mapping
predictor_map <- setNames(predictors_standard, predictors_original)

cat("\nExtracting coefficients:\n")
## 
## Extracting coefficients:
for (i in seq_along(predictors_original)) {
  cat("  ", predictors_original[i], "→", predictors_standard[i], "\n")
}
##    leaf_tissue → leaf 
##    Treatment-P → -P 
##    GenotypeINV4 → Inv4m 
##    leaf_tissue:Treatment-P → leaf:-P

Calculate Effects and Confidence Intervals

For each predictor, extract results and calculate 95% confidence intervals.

results <- list()

for (x in predictors_original) {
  # Extract topTable results
  r <- topTable(
    ebfit,
    coef = x,
    sort.by = "none",
    n = Inf
  ) %>%
    tibble::rownames_to_column("gene") %>%
    mutate(predictor = predictor_map[x])
  
  # Calculate 95% confidence intervals
  t_quantile <- qt(0.975, ebfit$df.residual + ebfit$df.prior)
  standard_error <- ebfit$stdev.unscaled[, x] * sqrt(ebfit$s2.post)
  critical_value <- t_quantile * standard_error
  
  r$upper <- r$logFC + critical_value
  r$lower <- r$logFC - critical_value
  
  results[[predictor_map[x]]] <- r
}

cat("\nEffects extracted for", length(results), "predictors\n")
## 
## Effects extracted for 4 predictors
cat("  Genes per predictor:", nrow(results[[1]]), "\n")
##   Genes per predictor: 24249

Create Combined Effects Table

Combine all predictor results and annotate with gene information.

# Define factor level order for predictors
effect_order <- c("Inv4m",  "leaf", "-P", "leaf:-P")

effects_df <- results %>%
  bind_rows() %>%
  mutate(predictor = factor(predictor, levels = effect_order)) %>%
  # Add negative log10 p-value for visualization
  mutate(neglogP = -log10(adj.P.Val))

effects_df <- effects_df %>%
  mutate(is_DEG = adj.P.Val < 0.05) %>%
  mutate(regulation = case_when(
    is_DEG & logFC > 0 ~ "Upregulated",
    is_DEG & logFC < 0 ~ "Downregulated",
    .default = "Unregulated"
  ))

cat("\nCombined effects table created:\n")
## 
## Combined effects table created:
with(effects_df, {
  table(predictor, is_DEG)
  table(predictor, regulation)
})
##          regulation
## predictor Downregulated Unregulated Upregulated
##   Inv4m            1482       21094        1673
##   leaf             4960       14225        5064
##   -P                  3       24201          45
##   leaf:-P          1575       21060        1614

Define Robust Outlier Detection Function

Identifies extreme differentially expressed genes using robust Mahalanobis distance based on the Minimum Covariance Determinant (MCD) method. This approach is resistant to the influence of outliers themselves, providing more reliable outlier detection than classical methods.

The MCD method estimates location and covariance using only the most central 75% of observations (alpha = 0.75), making it robust to contamination.

# Helper function: Calculate robust Mahalanobis distance for one predictor
calculate_robust_distance <- function(per_predictor, mcd_alpha) {
  # Extract bivariate data (logFC and -log10(FDR))
  bivariate <- per_predictor %>%
    select(logFC, neglogP) %>%
    as.matrix()
  
  # Compute robust location and covariance using MCD
  mcd_result <- covMcd(bivariate, alpha = mcd_alpha)
  
  # Calculate robust Mahalanobis distances
  per_predictor$mahalanobis <- mahalanobis(
    x = bivariate,
    center = mcd_result$center,
    cov = mcd_result$cov
  )
  
  per_predictor
}

# Main function: Add robust Mahalanobis outlier flags
add_mahalanobis_outliers <- function(
    data = NULL,
    distance_quantile = 0.05,
    FDR = 0.05,
    mcd_alpha = 0.75
) {
  # Calculate robust Mahalanobis distance per predictor
  data <- split(data, factor(data$predictor)) %>%
    lapply(calculate_robust_distance, mcd_alpha = mcd_alpha) %>%
    bind_rows()
  
  # Chi-square cutoff for bivariate data (df = 2)
  cutoff <- qchisq(p = 1 - distance_quantile, df = 2)
  
  # Flag outliers: significant AND extreme distance
  data$is_mh_outlier <- (data$adj.P.Val < FDR) & (data$mahalanobis > cutoff)
  
  # Sort by distance within groups
  data %>%
    ungroup() %>%
    group_by(predictor, regulation) %>%
    arrange(-mahalanobis, .by_group = TRUE) %>%
    ungroup()
}

Calculate Mahalanobis Distances and Outlier Flags

effects_df <- add_mahalanobis_outliers(
  effects_df, 
  distance_quantile = 0.05, 
  FDR = 0.05
)

cat("\nMahalanobis outliers detected:\n")
## 
## Mahalanobis outliers detected:
with(effects_df, {
  table(predictor, is_mh_outlier)
})
##          is_mh_outlier
## predictor FALSE  TRUE
##   Inv4m   21094  3155
##   leaf    18724  5525
##   -P      24201    48
##   leaf:-P 21060  3189

Gene Annotation

Load gene symbols, functional descriptions (Pannzer), and genomic coordinates (B73 v5 GFF3). Gene IDs are cleaned and coordinates imported as both GRanges (for overlap operations) and data.frame (for dplyr filtering).

# Gene symbols and locus names
gene_symbol <- read.table(
  "/Users/fvrodriguez/Library/CloudStorage/GoogleDrive-frodrig4@ncsu.edu/My Drive/repos/inv4mRNA/data/gene_symbol.tab",
  quote = "",
  header = TRUE,
  sep = "\t",
  na.strings = ""
)

# Pannzer functional annotations
pannzer <- read.table(
  "../data/PANNZER_DESC.tab",
  quote = "",
  header = TRUE,
  sep = "\t"
) %>%
  group_by(gene_model) %>%
  slice(1) %>%
  select(gene_model, desc)

# Merge annotations
gene_pannzer <- gene_symbol %>%
  left_join(pannzer, by = c("gene_model" = "gene_model"))

# Genomic coordinates (GRanges + data.frame)
v5_gff_file <- "/Users/fvrodriguez/ref/zea/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.59.chr.gff3"
genes_gr <- rtracklayer::import(v5_gff_file) %>%
  subset(type == "gene" & seqnames %in% 1:10)
genes <- as.data.frame(genes_gr)
genes$ID <- gsub("gene:", "", genes$ID)

{
cat("Annotations loaded:\n")
cat("  Gene symbols:", nrow(gene_symbol), "\n")
cat("  Pannzer descriptions:", nrow(pannzer), "\n")
cat("  Merged annotations:", nrow(gene_pannzer), "\n")
cat("  Genomic features:", nrow(genes), "genes across", 
    length(unique(genes$seqnames)), "chromosomes\n")
}
## Annotations loaded:
##   Gene symbols: 44364 
##   Pannzer descriptions: 28308 
##   Merged annotations: 44364 
##   Genomic features: 43459 genes across 10 chromosomes

Define Genomic Regions of Interest

Define three nested regions on chromosome 4: (1) Inv4m inversion proper (gene-defined boundaries), (2) shared introgression including flanking regions (manually verified from genotyping data), and (3) flanking regions (in introgression but outside inversion).

# Inv4m inversion boundaries (defined by specific genes)
inv4m_start <- genes[genes$ID == "Zm00001eb190470", "start"]
inv4m_end <- genes[genes$ID == "Zm00001eb194800", "end"]

# Shared introgression boundaries (from RNAseq genotype verification)
introgression_start <- 157012149
introgression_end <- 195900523

# Extract gene IDs for each region
inv4m_gene_ids <- genes %>%
  filter(seqnames == 4, start >= inv4m_start, end <= inv4m_end) %>%
  pull(ID)

shared_introgression_gene_ids <- genes %>%
  filter(seqnames == 4, start >= introgression_start, end <= introgression_end) %>%
  pull(ID)

flanking_introgression_gene_ids <- shared_introgression_gene_ids[
  !(shared_introgression_gene_ids %in% inv4m_gene_ids)
]

cat("Inv4m inversion: Chr4:", inv4m_start, "-", inv4m_end, 
    "(", length(inv4m_gene_ids), "genes )\n")
## Inv4m inversion: Chr4: 172883675 - 188132113 ( 434 genes )
cat("Shared introgression: Chr4:", introgression_start, "-", introgression_end,
    "(", length(shared_introgression_gene_ids), "genes )\n")
## Shared introgression: Chr4: 157012149 - 195900523 ( 1099 genes )
cat("Introgressed flanking:", length(flanking_introgression_gene_ids), "genes\n")
## Introgressed flanking: 665 genes

Three-Tier DEG Classification

The goal of this section is to classify differentially expressed genes (DEGs) into three tiers of stringency based on significance (FDR) and effect size (\(\log_2\text{FC}\)), as well as a final set of genes selected for deep-dive analysis.

Methodology for Fold Change Thresholds

The effect sizes are interpreted differently based on the predictor:

Leaf Position (leaf predictor): This was modeled as a continuous slope (\(\beta\)). To be considered a large effect, the total change across the 3 units (Leaf 1 \(\rightarrow\) Leaf 4) must meet the \(|\log_2\text{FC}| > 2\) criterion.

\[\\ |\text{Total Change}| = |3\beta| > 2 \implies |\beta| > \frac{2}{3} \approx 0.67\]

We use \(|\beta| > 0.7\) as the threshold for the leaf slope an interacition with -P.

Categorical Predictors (-P, Inv4m, Interaction): The standard large effect size threshold of \(|\log_2\text{FC}| > 2.0\) is applied directly.

Three-Tier Definitions

  • Significant DEGs (is_DEG): All genes with FDR (adjusted P-value) \(< 0.05\).

  • High-confidence DEGs (is_hiconf_DEG): Significant DEGs that also meet the large effect size thresholds defined above.

  • Selected DEGs (is_selected_DEG): The union of the top 20 genes ranked by P-value and the top 20 high-confidence genes ranked by the Mahalanobis distance to the non-significant centroid near the origin of the \((log_2(FC) \times-log_{10}FDR)\) plane.

Significant DEGs

This first tier identifies all genes that pass the statistical significance threshold, regardless of effect size. We define is_DEG as any gene where the adjusted P-value (is_significant) is less than 0.05.

{
  cat("\nSignificant DEGs (is_DEG, FDR < 0.05) Count:\n")
  print(with(effects_df, table(predictor, is_DEG)))
}
## 
## Significant DEGs (is_DEG, FDR < 0.05) Count:
##          is_DEG
## predictor FALSE  TRUE
##   Inv4m   21094  3155
##   leaf    14225 10024
##   -P      24201    48
##   leaf:-P 21060  3189

High-Confidence DEGs

The second tier filters the significant DEGs further by applying the custom large effect size thresholds. This step defines is_hiconf_DEG as only those genes that are significant AND meet the \(\log_2\text{FC}\) threshold specific to their predictor type (0.7 for leaf slope, 2.0 for categorical terms).

is_large_effect <- rep(FALSE, nrow(effects_df))
is_leaf <- effects_df$predictor == "leaf"
is_interaction <- effects_df$predictor == "leaf:-P"
is_large_effect[is_leaf & abs(effects_df$logFC) > 0.7] <- TRUE
is_large_effect[is_interaction  & abs(effects_df$logFC) > 0.7] <- TRUE
is_large_effect[!is_leaf & abs(effects_df$logFC) > 2] <- TRUE

effects_df <- effects_df %>%
  mutate(is_hiconf_DEG = is_DEG & is_large_effect)
{
  cat("\nHigh-Confidence DEGs (is_hiconf_DEG) Count:\n")
  print(with(effects_df, table(predictor, is_hiconf_DEG)))
}
## 
## High-Confidence DEGs (is_hiconf_DEG) Count:
##          is_hiconf_DEG
## predictor FALSE  TRUE
##   Inv4m   24096   153
##   leaf    23465   784
##   -P      24232    17
##   leaf:-P 23861   388

Selected DEGs (Rank-Based)

The final tier, is_selected_DEG, selects the most interesting genes for visualization and detailed annotation. A gene is selected if it is among the top N by P-value (among all DEGs) OR among the top N by Mahalanobis distance (among high-confidence DEGs). This step requires calculating intra-group rankings first.

rank_threshold <- 20

effects_df <- effects_df %>%
  group_by(is_hiconf_DEG, predictor, regulation) %>%
  mutate(
    pval_rank = row_number(desc(neglogP)),
    mahal_rank = row_number(desc(mahalanobis))
  ) %>%
  ungroup() %>%
  mutate(
    is_selected_DEG = (pval_rank <= rank_threshold & is_hiconf_DEG) |
                      (mahal_rank <= rank_threshold & is_hiconf_DEG)
  )

{
  cat(sprintf("\nSelected DEGs (is_selected_DEG, Top N=%d by Rank) Count:\n", 
              rank_threshold))
  
  with(
    effects_df %>% filter(is_selected_DEG & regulation != "Unregulated"), 
    table(regulation, predictor, is_selected_DEG)
  )
}
## 
## Selected DEGs (is_selected_DEG, Top N=20 by Rank) Count:
## , , is_selected_DEG = TRUE
## 
##                predictor
## regulation      Inv4m leaf -P leaf:-P
##   Downregulated    22   36  1      30
##   Upregulated      22   29 16      33

Annotate with Gene Information

# Join gene annotations
effects_df <- effects_df %>%
  left_join(
    gene_pannzer,
    by = c(gene = "gene_model"), 
    relationship = "many-to-many"
  ) %>%
  mutate(desc_merged = coalesce(locus_name, desc)) %>%
  select(predictor, regulation, gene, locus_symbol, desc_merged, everything()) %>%
  inner_join(
    genes %>%
      select(gene = ID, CHR = seqnames, BP = start) %>%
      mutate(CHR = as.character(CHR) %>% as.integer()),
    by = "gene"
  )

{
  cat("\nAnnotations added:\n")
  cat("  Final columns:", ncol(effects_df), "\n")
  cat("  Genes with coordinates:\n")
  with(effects_df, table(predictor, !is.na(CHR)))
}
## 
## Annotations added:
##   Final columns: 25 
##   Genes with coordinates:
##          
## predictor  TRUE
##   Inv4m   24067
##   leaf    24067
##   -P      24067
##   leaf:-P 24067

Add Region Classification

Classify genes by genomic location relative to Inv4m.

effects_df <- effects_df %>%
  mutate(
    in_Inv4m = gene %in% inv4m_gene_ids,
    in_cis = gene %in% shared_introgression_gene_ids,
    in_flank = gene %in% flanking_introgression_gene_ids,
    in_trans = !in_cis
  )
{
  cat("\nRegion classification:\n")
  cat("  Inv4m genes:", sum(effects_df$in_Inv4m)/4, "\n")
  cat("  Cis genes (shared introgression):", sum(effects_df$in_cis)/4, "\n")
  cat("  Flanking genes:", sum(effects_df$in_flank)/4, "\n")
  cat("  Trans genes:", sum(effects_df$in_trans)/4, "\n")
}
## 
## Region classification:
##   Inv4m genes: 255 
##   Cis genes (shared introgression): 612 
##   Flanking genes: 357 
##   Trans genes: 23455

Regional Enrichment Analysis

Test whether DEGs are enriched in specific genomic regions using Fisher’s exact test.

# Helper function for Fisher's exact test
run_fisher <- function(data, region_col, deg_col) {
  contingency <- table(data[[region_col]], data[[deg_col]])
  fisher_result <- fisher.test(contingency)
  
  tibble(
    in_region_DEG = contingency[2, 2],
    in_region_total = sum(contingency[2, ]),
    outside_DEG = contingency[1, 2],
    outside_total = sum(contingency[1, ]),
    odds_ratio = fisher_result$estimate,
    p_value = fisher_result$p.value,
    enrichment = (contingency[2, 2] / sum(contingency[2, ])) / 
                 (contingency[1, 2] / sum(contingency[1, ]))
  )
}

DEG Enrichment by Region

# Create separate data frame for Inv4m predictor
Region_effects <- effects_df %>%
  filter(predictor == "Inv4m") %>%
  mutate(outside = in_trans)

# All DEG enrichment tests
deg_tests <- bind_rows(
  run_fisher(Region_effects, "in_cis", "is_DEG") %>%
    mutate(comparison = "Shared introgression vs Outside"),
  run_fisher(
    Region_effects %>% filter(in_flank | outside),
    "in_flank",
    "is_DEG"
  ) %>%
    mutate(comparison = "Flanking vs Outside"),
  run_fisher(
    Region_effects %>% filter(in_Inv4m | outside),
    "in_Inv4m",
    "is_DEG"
  ) %>%
    mutate(comparison = "Inv4m vs Outside"),
  run_fisher(
    Region_effects %>% filter(in_cis),
    "in_Inv4m",
    "is_DEG"
  ) %>%
    mutate(comparison = "Inv4m vs Flanking")
) %>%
  select(comparison, everything())

cat("DEG Enrichment (FDR < 0.05):\n")
## DEG Enrichment (FDR < 0.05):
deg_tests
# High-confidence DEG enrichment tests
hiconf_deg_tests <- bind_rows(
  run_fisher(Region_effects, "in_cis", "is_hiconf_DEG") %>%
    mutate(comparison = "Shared introgression vs Outside"),
  run_fisher(
    Region_effects %>% filter(in_flank | outside),
    "in_flank",
    "is_hiconf_DEG"
  ) %>%
    mutate(comparison = "Flanking vs Outside"),
  run_fisher(
    Region_effects %>% filter(in_Inv4m | outside),
    "in_Inv4m",
    "is_hiconf_DEG"
  ) %>%
    mutate(comparison = "Inv4m vs Outside"),
  run_fisher(
    Region_effects %>% filter(in_cis),
    "in_Inv4m",
    "is_hiconf_DEG"
  ) %>%
    mutate(comparison = "Inv4m vs Flanking")
) %>%
  select(comparison, everything())

{
cat("\nHigh-confidence DEG Enrichment (FDR < 0.05, |logFC| > 2):\n")
hiconf_deg_tests
}
## 
## High-confidence DEG Enrichment (FDR < 0.05, |logFC| > 2):
{
cat("\n=== Key Findings ===\n")
cat("1. DEGs are enriched in Inv4m region vs genome-wide\n")
cat("2. Both Inv4m and flanking show enrichment vs outside\n")
cat("3. Regional enrichment pattern observed for high-confidence DEGs\n")
cat("\nInterpretation: The introgression region (inversion + flanking)\n")
cat("shows elevated differential expression patterns.\n")
}
## 
## === Key Findings ===
## 1. DEGs are enriched in Inv4m region vs genome-wide
## 2. Both Inv4m and flanking show enrichment vs outside
## 3. Regional enrichment pattern observed for high-confidence DEGs
## 
## Interpretation: The introgression region (inversion + flanking)
## shows elevated differential expression patterns.

Quality Control Tables

Top 10 DEGs per Predictor and Regulation

Showing top differentially expressed genes by adjusted p-value.

top_degs_qc <- effects_df %>%
  filter(is_DEG, regulation != "Unregulated") %>%
  group_by(predictor, regulation) %>%
  arrange(-neglogP, .by_group = TRUE) %>%
  slice(1:10) %>%
  select(
    predictor, gene, locus_symbol, 
    desc_merged, logFC, neglogP
  ) %>%
  arrange(-neglogP)

{
cat("\nTop 10 DEGs per predictor and regulation:\n")
top_degs_qc
}
## 
## Top 10 DEGs per predictor and regulation:

Top Mahalanobis Outliers

Most extreme differentially expressed genes.

top_outliers_qc <- effects_df %>%
  filter(is_mh_outlier) %>%
  group_by(predictor, regulation) %>%
  arrange(-mahalanobis, .by_group = TRUE) %>%
  slice(1:10) %>%
  select(
    predictor, regulation, gene, 
    locus_symbol, desc_merged,
    logFC, neglogP, mahalanobis
  ) %>%
  arrange(-neglogP)
{
cat("\nTop Mahalanobis outliers per predictor and regulation:\n")
top_outliers_qc
}
## 
## Top Mahalanobis outliers per predictor and regulation:

DEG Summary Statistics

# Overall DEG counts by predictor
overall_summary <- effects_df %>%
  group_by(predictor) %>%
  summarise(
    total_genes = n(),
    n_significant = sum(is_DEG),
    n_DEG = sum(is_DEG),
    n_hiconf_DEG = sum(is_hiconf_DEG),
    n_selected_DEG = sum(is_selected_DEG),
    pct_DEG = round(100 * n_DEG / total_genes, 2)
  )

# Region distribution for Inv4m effect
inv4m_region_summary <- effects_df %>%
  filter(predictor == "Inv4m", is_DEG) %>%
  group_by(regulation, in_Inv4m, in_cis) %>%
  summarise(n = n(), .groups = "drop")

{
cat("\n=== DEG Summary Statistics ===\n")
overall_summary

cat("\nInv4m DEGs by region and regulation:\n")
inv4m_region_summary
}
## 
## === DEG Summary Statistics ===
## 
## Inv4m DEGs by region and regulation:

Selected DEGs for Manuscript

Extract selected DEGs for detailed presentation in manuscript tables.

selected_degs <- effects_df %>%
  filter(is_selected_DEG) %>%
  select(
    predictor,
    regulation,
    gene,
    locus_symbol,
    desc_merged,
    logFC,
    neglogP,
    pval_rank,
    mahal_rank
  ) %>%
  arrange(predictor, regulation, -neglogP)

{
cat("\n=== Selected DEGs for Manuscript ===\n")
cat("Total selected DEGs:", nrow(selected_degs), "\n\n")
cat("Counts by predictor and regulation:\n")
print(with(selected_degs, table(predictor, regulation)))
}
## 
## === Selected DEGs for Manuscript ===
## Total selected DEGs: 188 
## 
## Counts by predictor and regulation:
##          regulation
## predictor Downregulated Upregulated
##   Inv4m              24          22
##   leaf               36          29
##   -P                  1          16
##   leaf:-P            30          30

Selected DEGs: Phosphorus Effect

Export selected DEGs specific to the phosphorus effect with pannzer description.

p_selected <- selected_degs %>%
  mutate(
    regulation = factor(
      regulation,      
      levels = c("Upregulated", "Downregulated")
    )
  ) %>%
  filter(predictor == "-P" & !is.na(desc_merged)) %>%
  arrange(regulation, -neglogP)
{
cat("\nPhosphorus effect selected DEGs:\n")
cat("  Upregulated:", sum(p_selected$regulation == "Upregulated"), "\n")
cat("  Downregulated:", sum(p_selected$regulation == "Downregulated"), "\n")

p_selected
}
## 
## Phosphorus effect selected DEGs:
##   Upregulated: 11 
##   Downregulated: 1

Selected DEGs: Leaf

leaf_selected <- selected_degs %>%
  mutate(
    regulation = factor(
      regulation,      
      levels = c("Upregulated", "Downregulated")
    )
  ) %>%
  filter(predictor == "leaf" & !is.na(desc_merged)) %>%
  arrange(regulation, -neglogP)
{
cat("\nLeaf stage effect selected DEGs:\n")
cat("  Upregulated:", sum(p_selected$regulation == "Upregulated"), "\n")
cat("  Downregulated:", sum(p_selected$regulation == "Downregulated"), "\n")

leaf_selected
}
## 
## Leaf stage effect selected DEGs:
##   Upregulated: 11 
##   Downregulated: 1

Selected DEGs: Leaf:Treatment Interaction

Export selected DEGs specific to the leaf:treatment interaction.

interaction_selected <- effects_df %>%
  # selected_degs %>%
  mutate(
    regulation = factor(
      regulation,      
      levels = c("Upregulated", "Downregulated")
    )
  ) %>%
  group_by(regulation) %>%
  filter(is_hiconf_DEG) %>%
  # filter(predictor == "leaf:-P" & !is.na(desc_merged)) %>%
  filter(predictor == "leaf:-P") %>%
  arrange(regulation, -neglogP)
{
cat("\nLeaf:Treatment interaction selected DEGs:\n")
cat("  Upregulated:", sum(interaction_selected$regulation == "Upregulated"), "\n")
cat("  Downregulated:", 
    sum(interaction_selected$regulation == "Downregulated"), "\n")

interaction_selected
}
## 
## Leaf:Treatment interaction selected DEGs:
##   Upregulated: 251 
##   Downregulated: 138

Plot interactions

# Load required libraries
library(stringr)

# Load normalized expression data
voomR <- readRDS("~/Desktop/normalized_expression_voom_object_leaf_trt.rda")

# Define genes of interest - ALL selected interaction genes
interaction_genes <- interaction_selected %>%
  filter(predictor == "leaf:-P") %>%
  pull(gene) %>%
  unique()

# Extract expression matrix for genes of interest
expr_matrix <- voomR$E[interaction_genes, ]

# Convert to long format and join with sample metadata
expr_long <- as.data.frame(t(expr_matrix)) %>%
  tibble::rownames_to_column("sample") %>%
  pivot_longer(
    cols = -sample,
    names_to = "gene",
    values_to = "log2CPM"
  ) %>%
  left_join(
    voomR$targets %>% 
      tibble::rownames_to_column("sample") %>%
      select(sample, Treatment, leaf_tissue, Genotype),
    by = "sample"
  ) %>%
  left_join(
    interaction_selected %>%
      filter(predictor == "leaf:-P") %>%
      select(gene, regulation) %>%
      distinct(),
    by = "gene"
  )

# Recode treatment levels
expr_long$Treatment <- factor(expr_long$Treatment)
levels(expr_long$Treatment) <- c("+P", "-P")

expr_long$leafxP <- factor(expr_long$regulation)
levels(expr_long$leafxP ) <- c("Positive", "Negative")

# Calculate mean expression per gene per leaf stage per treatment
expr_summary <- expr_long %>%
  group_by(gene, Treatment, leaf_tissue, leafxP) %>%
  summarise(mean_log2CPM = mean(log2CPM), .groups = "drop")

# Create spaghetti plot with all genes superimposed, faceted by regulation
p_spaghetti <- expr_summary %>%
  ggplot(aes(x = leaf_tissue, y = mean_log2CPM, color = Treatment)) +
  geom_line(
    aes(group = interaction(gene, Treatment)),
    alpha = 0.1,
    linewidth = 0.8
  ) +
  geom_smooth(
    aes(group = Treatment),
    method = "lm",
    formula = y ~ x,
    se = FALSE,
    linewidth = 2
  ) +
  scale_color_manual(
    values = c("+P" = "darkgoldenrod", "-P" = "purple4"),
    name = "Treatment"
  ) +
  facet_wrap(~ leafxP, ncol = 2) +
  labs(
    x = "Leaf Stage",
    y = expression(log[2]*"(CPM)"),
    legend ="",
    title = "Leaf Stage × Phosphorus Interaction"
  ) +
  guides(color = guide_legend(reverse = TRUE)) +
  theme_classic(base_size = 25) +
  theme(
    strip.background = element_blank(),
    strip.text = element_text(size = 24, face = "bold"),
    plot.title = element_text(size = 24, face = "bold",hjust =0.5),
    legend.position = c(0.07,0.95),
    legend.box.background = element_rect(fill = "transparent", color = NA), 
    legend.title = element_blank()
  )

p_spaghetti

# Save plot
ggsave(
  "~/Desktop/leaf_P_interaction_spaghetti.pdf",
  plot = p_spaghetti,
  width = 12,
  height = 6
)
{
cat("Spaghetti plot created: leaf_P_interaction_spaghetti.pdf\n")
cat("All selected interaction genes trajectories by regulation category\n")
cat("Total unique genes plotted:", length(interaction_genes), "\n")
}
## Spaghetti plot created: leaf_P_interaction_spaghetti.pdf
## All selected interaction genes trajectories by regulation category
## Total unique genes plotted: 385
# Load normalized expression data
voomR <- readRDS("~/Desktop/normalized_expression_voom_object_leaf_trt.rda")

# Define genes of interest
interaction_genes <- interaction_selected %>%
  filter(!is.na(desc)) %>%
  group_by(regulation) %>%
  filter(predictor == "leaf:-P") %>%
  arrange(-neglogP) %>%
  slice(1:8)


# Extract interaction effect sizes to order genes
gene_order <- effects_df %>%
  group_by(regulation) %>%
  filter(gene %in% interaction_genes$gene, predictor == "leaf:-P") %>%
  arrange(desc(regulation),-neglogP) %>%
  pull(gene) 

# Extract expression matrix for genes of interest
expr_matrix <- voomR$E[interaction_genes$gene, ]

# Convert to long format and join with sample metadata
expr_long <- as.data.frame(t(expr_matrix)) %>%
  tibble::rownames_to_column("sample") %>%
  pivot_longer(
    cols = -sample,
    names_to = "gene",
    values_to = "log2CPM"
  ) %>%
  left_join(
    voomR$targets %>% 
      tibble::rownames_to_column("sample") %>%
      select(sample, Treatment, leaf_tissue, Genotype),
    by = "sample"
  ) %>% inner_join(
    interaction_genes, by = "gene"
  )

# Recode treatment levels
expr_long$Treatment <- factor(expr_long$Treatment)
levels(expr_long$Treatment) <- c("+P", "-P")

expr_long$leafxP <- factor(expr_long$regulation)
levels(expr_long$leafxP ) <- c("Positive", "Negative")

# Add gene annotations with wrapped descriptions
gene_annotations <- gene_pannzer %>%
  filter(gene_model %in% interaction_genes$gene) %>%
  mutate(gene_label = paste0(
    gene_model, 
    "\n", 
    str_wrap(coalesce(locus_name, desc), width = 30, whitespace_only = FALSE)
  ))

expr_long <- expr_long %>%
  left_join(
    gene_annotations %>% select(gene = gene_model, gene_label),
    by = "gene"
  ) %>%
  mutate(gene_label = if_else(is.na(gene_label), gene, gene_label))

# Order gene labels by interaction effect
expr_long <- expr_long %>%
  mutate(
    gene = factor(gene, levels = gene_order),
    gene_label = factor(gene_label, levels = unique(gene_label[order(gene)]))
  )

# Create reaction norm plot
p <- expr_long %>%
  ggplot(
    aes(x = leaf_tissue, y = log2CPM, color = Treatment, group = Treatment)
  ) +
  stat_summary(fun = mean, geom = "line", linewidth = 1) +
  stat_summary(fun = mean, geom = "point", size = 2.5) +
  stat_summary(
    fun.data = mean_se, 
    geom = "errorbar", 
    width = 0.1,
    linewidth = 0.5
  ) +
  scale_color_manual(
    values = c("+P" = "gold", "-P" = "purple4"),
    name = "Treatment"
  ) +
  facet_wrap(~ gene_label, scales = "free_y", ncol = 4) +
  labs(
    x = "Leaf Stage",
    y = expression(log[2]*"(CPM)"),
    title = "Leaf × Phosphorus Interaction"
  ) +
  theme_classic(base_size = 25) +
  theme(
    strip.background = element_blank(),
    strip.text = element_text(size = 9, face = "bold",hjust =0),
    legend.position = "top"
  )

p

# Save plot
ggsave(
  "~/Desktop/leaf_P_interaction_reaction_norms.pdf",
  plot = p,
  width = 10,
  height = 8
)
{
cat("Reaction norm plot created: leaf_P_interaction_reaction_norms.pdf\n")
cat("Genes ordered by interaction logFC (positive to negative)\n")
}
## Reaction norm plot created: leaf_P_interaction_reaction_norms.pdf
## Genes ordered by interaction logFC (positive to negative)

Export Results

Export Full Effects Table

# Full effects table with all columns
write.csv(
  effects_df,
  file = "~/Desktop/predictor_effects_leaf_interaction_model.csv",
  row.names = FALSE
)

cat("\nFull effects table exported:\n")
cat("  predictor_effects_leaf_interaction_model.csv\n")

Export Selected DEGs

# Selected DEGs for manuscript
write.csv(
  selected_degs,
  file = "~/Desktop/selected_DEGs_leaf_interaction_model.csv",
  row.names = FALSE
)

# Phosphorus selected DEGs
write.csv(
  p_selected,
  file = "~/Desktop/p_selected_DEGs_leaf_interaction_model.csv",
  row.names = FALSE
)

write.csv(
  p_selected,
  file = "~/Desktop/leaf_selected_DEGs_leaf_interaction_model.csv",
  row.names = FALSE
)

# Inv4m selected DEGs
write.csv(
  selected_degs %>% filter(predictor == "Inv4m"),
  file = "~/Desktop/inv4m_selected_DEGs_leaf_interaction_model.csv",
  row.names = FALSE
)

# Leaf:Treatment interaction selected DEGs
write.csv(
  interaction_selected,
  file = "~/Desktop/leafxP_selected_DEGs_leaf_interaction_model.csv",
  row.names = FALSE
)

{
cat("\nSelected DEG tables exported:\n")
cat("  selected_DEGs_leaf_interaction_model.csv - All selected DEGs\n")
cat("  p_selected_DEGs_leaf_interaction_model.csv - Phosphorus effect\n")
cat("  p_selected_DEGs_leaf_interaction_model.csv - Leaf effect\n")
cat("  inv4m_selected_DEGs_leaf_interaction_model.csv - Inv4m effect\n")
cat("  leafxP_selected_DEGs_leaf_interaction_model.csv - Leaf:Treatment interaction\n")
}

Summary

This analysis identified differentially expressed genes across four main effects:

  1. Inv4m genotype: Genes with different expression in Inv4m vs Control (|logFC| > 2, FDR < 0.05)

  2. Leaf position gradient: Genes showing expression changes along the apical-basal axis (|logFC| > 0.7, FDR < 0.05)

  3. Phosphorus deficiency: Genes responding to low P treatment (|logFC| > 2, FDR < 0.05)

  4. Leaf × Treatment interaction: Genes where the leaf gradient differs between P treatments (|logFC| > 2, FDR < 0.05)

Key Findings

  • MDS analysis revealed that:

    • Dimension 1 correlates with phosphorus treatment
    • Dimension 2 correlates with leaf tissue position
    • Dimension 3 correlates with genotype
  • Spatial covariates (Plot_Row, Plot_Column) were included to account for field position effects

  • Region enrichment: DEGs show enrichment in the Inv4m region, with patterns observed across both inversion and flanking regions

  • Three-tier classification:

    • Significant DEGs (FDR < 0.05): 16408 genes
    • High-confidence DEGs (FDR < 0.05, |logFC| > 2): 1345 genes
    • Selected DEGs (top 20 by significance/Mahalanobis): 188 genes

Output Files

All results include:

  • Effect sizes (log2 fold change) with 95% confidence intervals
  • FDR-adjusted p-values
  • Gene annotations (symbols and descriptions)
  • Genomic coordinates
  • Region classification (cis/trans, within Inv4m)
  • Mahalanobis outlier flags
  • Three-tier DEG classifications

The full effects table contains 96268 gene × predictor combinations.

Session Information

sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] robustbase_0.99-6    ggtext_0.1.2         ggpubr_0.6.2        
##  [4] ggplot2_4.0.0        stringr_1.5.2        tidyr_1.3.1         
##  [7] dplyr_1.1.4          rtracklayer_1.69.1   GenomicRanges_1.61.5
## [10] Seqinfo_0.99.2       IRanges_2.43.5       S4Vectors_0.47.4    
## [13] BiocGenerics_0.55.4  generics_0.1.4       edgeR_4.7.6         
## [16] limma_3.65.7        
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1            viridisLite_0.4.2          
##  [3] farver_2.1.2                Biostrings_2.77.2          
##  [5] S7_0.2.0                    bitops_1.0-9               
##  [7] fastmap_1.2.0               RCurl_1.98-1.17            
##  [9] GenomicAlignments_1.45.4    XML_3.99-0.19              
## [11] digest_0.6.37               lifecycle_1.0.4            
## [13] statmod_1.5.1               magrittr_2.0.4             
## [15] compiler_4.5.1              rlang_1.1.6                
## [17] sass_0.4.10                 tools_4.5.1                
## [19] yaml_2.3.10                 knitr_1.50                 
## [21] ggsignif_0.6.4              S4Arrays_1.9.1             
## [23] labeling_0.4.3              curl_7.0.0                 
## [25] DelayedArray_0.35.3         xml2_1.4.0                 
## [27] RColorBrewer_1.1-3          abind_1.4-8                
## [29] BiocParallel_1.43.4         withr_3.0.2                
## [31] purrr_1.1.0                 grid_4.5.1                 
## [33] scales_1.4.0                SummarizedExperiment_1.39.2
## [35] cli_3.6.5                   rmarkdown_2.30             
## [37] crayon_1.5.3                ragg_1.5.0                 
## [39] rstudioapi_0.17.1           httr_1.4.7                 
## [41] rjson_0.2.23                commonmark_2.0.0           
## [43] cachem_1.1.0                splines_4.5.1              
## [45] parallel_4.5.1              XVector_0.49.1             
## [47] restfulr_0.0.16             matrixStats_1.5.0          
## [49] vctrs_0.6.5                 Matrix_1.7-4               
## [51] jsonlite_2.0.0              carData_3.0-5              
## [53] litedown_0.7                car_3.1-3                  
## [55] rstatix_0.7.3               Formula_1.2-5              
## [57] systemfonts_1.3.1           locfit_1.5-9.12            
## [59] jquerylib_0.1.4             glue_1.8.0                 
## [61] DEoptimR_1.1-4              codetools_0.2-20           
## [63] cowplot_1.2.0               stringi_1.8.7              
## [65] gtable_0.3.6                BiocIO_1.19.0              
## [67] tibble_3.3.0                pillar_1.11.1              
## [69] htmltools_0.5.8.1           R6_2.6.1                   
## [71] textshaping_1.0.4           evaluate_1.0.5             
## [73] lattice_0.22-7              Biobase_2.69.1             
## [75] markdown_2.0                backports_1.5.0            
## [77] Rsamtools_2.25.3            gridtext_0.1.5             
## [79] broom_1.0.10                bslib_0.9.0                
## [81] Rcpp_1.1.0                  nlme_3.1-168               
## [83] SparseArray_1.9.1           mgcv_1.9-3                 
## [85] xfun_0.53                   MatrixGenerics_1.21.0      
## [87] pkgconfig_2.0.3