Overview

This analysis performs differential gene expression analysis on RNA-seq data from Crow et al. (2020), examining maize apical tissue samples across multiple experimental factors:

  • Tissue type: V1_Root, V1_Leaf,V3_Leaf, V3_Root, V3_SAM
  • temperature treatment: CTRL (warm) vs cold
  • Genotype: CTRL (B73) vs Inv4m (inverted karyotype)
  • Donor: PT vs Mi21 (treated as batch effect)

The analysis uses the limma-voom pipeline to identify genes that respond to:

  1. temp (cold): temperature treatment effect
  2. Inv4m: Genotype effect
  3. temp:Inv4m: Interaction between temperature and genotype

Linear Model Specification

This model describes the log-transformed expression \(y_{g,i}\) of gene \(g\) in sample \(i\):

\[ \begin{aligned} y_{g,i} &= \beta_{g,0} + \beta_{g,D}\,D_i + \sum_{t=1}^{4}\beta_{g,T_t}\,T_{t,i} + \beta_{g,\text{temp}}\,\text{temp}_i \\ &\quad + \beta_{g,G}\,G_i + \beta_{g,\text{temp} \times G}\,(\text{temp}_i \times G_i) + \varepsilon_{g,i} \end{aligned} \]

with \(\varepsilon_{g,i} \sim N(0, \phi_i\sigma^2_{g})\)

Model Components

Symbol Description Notes
\(y_{g,i}\) log\(_2\)(CPM) expression Response variable
\(\beta_{g,0}\) Intercept Baseline: PT, V1_Root, CTRL temp, CTRL genotype
\(\beta_{g,D}\) Donor effect (Mi21 vs PT) Batch correction
\(\beta_{g,T_t}\) Tissue effects 4 dummy variables (V1_Root is reference)
\(\beta_{g,\text{temp}}\) Cold temperature effect Main effect
\(\beta_{g,G}\) Inv4m genotype effect Main effect
\(\beta_{g,\text{temp} \times G}\) temperature × Genotype Interaction (key effect)
\(\varepsilon_{g,i}\) Residual error Voom-weighted

Fold Change Thresholds

  • Interaction (temp:Inv4m): ±0.5 log2FC
  • Main effects (temp, Inv4m): ±1.5 log2FC

DEG Classification Tiers

  1. Significant DEGs: FDR < 0.05 (is_DEG)
  2. High-confidence DEGs: FDR < 0.05 AND appropriate |log2FC| threshold (is_hiconf_DEG)
  3. Selected DEGs: Top 10 by significance + top 10 by Mahalanobis distance (is_selected_DEG)

Libraries

library(edgeR)          # Differential expression analysis
library(limma)          # Linear models for RNA-seq
library(rtracklayer)    # Genomic annotation handling
library(GenomicRanges)  # Genomic ranges operations
library(dplyr)          # Data manipulation
library(tidyr)          # Data manipulation
library(stringr)        # String manipulation
library(ggplot2)        # Plotting
library(ggpubr)         # Publication ready plots
library(ggfx)           # Extra effects for ggplots (shadows)
library(ggtext)         # Formatted text in plots
library(robustbase)     # Robust statistics (MCD for Mahalanobis)

Data Loading and Preprocessing

Load Expression Counts

counts <- read.csv("~/Desktop/crow2020_gene_xp.csv")

{
  cat("Loaded expression data:\n")
  cat("  Dimensions:", dim(counts), "\n")
  cat("  Genes:", nrow(counts), "\n")
  cat("  Samples:", ncol(counts) - 1, "\n")
}
## Loaded expression data:
##   Dimensions: 39756 347 
##   Genes: 39756 
##   Samples: 346

Load Sample Metadata

quant_dir <- "/Volumes/DOE_CAREER/inv4m/run/results_quantify/kallisto_quant"

# Define paths
sra_file <- "/Volumes/DOE_CAREER/inv4m/crow2020/SraRunTable.csv"
sample_submission_file <- "/Volumes/DOE_CAREER/inv4m/crow2020/gc7_sample_submission.csv"

# Load data
sra_meta <- read.csv(sra_file, stringsAsFactors = FALSE, check.names = FALSE)
submission_meta <- read.csv(sample_submission_file, stringsAsFactors = FALSE)

# Standardize SRA metadata
sra_meta <- sra_meta %>%
  rename(
    samp = `Library Name`,
    genotype = genotype,
    temp = temp,
    tissue = tissue,
    Run = Run
  )

# Join core metadata and filter to NIL lines of interest
metadata <- submission_meta %>%
  filter(old_line %in% c("PT_NIL", "Mi21_NIL")) %>%
  select(samp, old_line, ID, source, harvest_date, tube_num_full) %>%
  inner_join(sra_meta, by = "samp") %>%
  arrange(Run)

# Create analysis factors and sampling covariates
metadata <- metadata %>%
  mutate(
    # === Biological factors ===
    # Donor
    Donor = case_when(
      grepl("^PT_", old_line) ~ "PT",
      grepl("^Mi21_", old_line) ~ "Mi21",
      TRUE ~ NA_character_
    ),
    Donor = factor(Donor, levels = c("PT", "Mi21")),
    
    # Genotype
    Genotype = factor(genotype, levels = c("B73", "Inv4m")),
    Genotype = recode(Genotype, "B73" = "CTRL", "Inv4m" = "Inv4m"),
    
    # temperature
    temp = factor(temp, levels = c("warm", "cold")),
    temp = recode(temp, "warm" = "CTRL", "cold" = "cold"),
    
    # Tissue — with leaf subtypes
    Tissue = case_when(
      tissue == "V1_Leaf_tissue"    ~ "V1_Leaf",
      tissue == "V1_Primary_root"   ~ "V1_Root",
      tissue == "V3_Leaf_tissue"    ~ "V3_Leaf",
      tissue == "V3_leaf_base"      ~ "V3_Leaf_Base",
      tissue == "V3_leaf_sheath"    ~ "V3_Leaf_Sheath",
      tissue == "V3_S2_leaf_base"   ~ "V3_S2_Leaf_Base",
      tissue == "V3_S2_leaf_tip"    ~ "V3_S2_Leaf_Tip",
      tissue == "V3_Primary_root"   ~ "V3_Root",
      tissue == "V3_Stem_SAM"       ~ "V3_SAM",
      TRUE ~ NA_character_
    ),
    Tissue = factor(Tissue, levels = c(
      "V1_Root", "V3_Root", "V1_Leaf", "V3_Leaf", 
      "V3_Leaf_Base", "V3_Leaf_Sheath", 
      "V3_S2_Leaf_Base", "V3_S2_Leaf_Tip", "V3_SAM"
    )),
    
    # === Sampling & technical covariates ===
    # Unique plant identifier (from submission metadata)
    plant_id = ID,
    
    # Collection team (source column = technician/team ID)
    collector_team = as.factor(source),
    
    # Harvest date (biological sampling day)
    harvest_date = as.Date(harvest_date, format = "%m/%d/%Y"),
    
    # Plant serial number (numeric suffix of ID; proxy for planting/sampling order)
    plant_serial_number = as.numeric(stringr::str_sub(ID, 5)),
    
    # Within-plant tissue order (0 = leaf, 1 = root, 2 = base, etc.)
    tissue_order_index = as.numeric(
      stringr::str_extract(tube_num_full, "\\.[0-9]+")
    ) %>% 
      dplyr::coalesce(0) %>% 
      abs(),
    
    # SRA upload time (proxy for library prep + sequencing batch)
    sra_upload_time = as.POSIXct(create_date, format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
    
    # Combined harvest batch (date + team)
    harvest_batch = interaction(harvest_date, collector_team, drop = TRUE)
  ) %>%
  # Remove samples with unmapped tissue
  filter(!is.na(Tissue))

metadata$harvest_date_f <- factor(metadata$harvest_date)

{
  cat("\nSample metadata:\n")
  cat("  Total samples:", nrow(metadata), "\n")
  cat("  Genotypes:", paste(unique(metadata$Genotype), collapse = ", "), "\n")
  cat("  temperatures:", paste(unique(metadata$temp), collapse = ", "), "\n")
  cat("  Donors:", paste(unique(metadata$Donor), collapse = ", "), "\n")
  cat("  Tissues:", paste(levels(metadata$Tissue), collapse = ", "), "\n")
  cat("  Collection teams:", paste(levels(metadata$collector_team), collapse = ", "), "\n")
  cat("  Harvest dates:", paste(sort(unique(metadata$harvest_date)), collapse = ", "), "\n")
  cat("\nTissue counts:\n")
  print(sort(table(metadata$Tissue), decreasing = TRUE))
}
## 
## Sample metadata:
##   Total samples: 346 
##   Genotypes: CTRL, Inv4m 
##   temperatures: CTRL, cold 
##   Donors: PT, Mi21 
##   Tissues: V1_Root, V3_Root, V1_Leaf, V3_Leaf, V3_Leaf_Base, V3_Leaf_Sheath, V3_S2_Leaf_Base, V3_S2_Leaf_Tip, V3_SAM 
##   Collection teams: 1, 2 
##   Harvest dates: 2017-03-07, 2017-03-08, 2017-03-09, 2017-03-13, 2017-03-14, 2017-03-15, 2017-03-16, 2017-03-17, 2017-03-24, 2017-03-28, 2017-04-27, 2017-04-28, 2017-05-02, 2017-05-03, 2017-05-04, 2017-05-12, 2017-05-15, 2017-05-16 
## 
## Tissue counts:
## 
##         V3_Root         V1_Root          V3_SAM         V1_Leaf V3_S2_Leaf_Base 
##              42              41              40              39              38 
##         V3_Leaf  V3_Leaf_Sheath    V3_Leaf_Base  V3_S2_Leaf_Tip 
##              37              37              36              36

Verify Experimental Design Balance

{
  cat("\n=== Experimental Design Balance ===\n")
  
  cat("\nDonor × Genotype × temperature:\n")
  print(with(metadata, table(Donor, Genotype, temp)))
  
  cat("\nTissue × Genotype × temperature:\n")
  print(with(metadata, table(Tissue, Genotype, temp)))
  
  cat("\nDonor × Genotype (overall):\n")
  print(with(metadata, table(Donor, Genotype)))
}
## 
## === Experimental Design Balance ===
## 
## Donor × Genotype × temperature:
## , , temp = CTRL
## 
##       Genotype
## Donor  CTRL Inv4m
##   PT     58    40
##   Mi21   51    50
## 
## , , temp = cold
## 
##       Genotype
## Donor  CTRL Inv4m
##   PT     42    31
##   Mi21   39    35
## 
## 
## Tissue × Genotype × temperature:
## , , temp = CTRL
## 
##                  Genotype
## Tissue            CTRL Inv4m
##   V1_Root           11    10
##   V3_Root           13    11
##   V1_Leaf           10     9
##   V3_Leaf           12     9
##   V3_Leaf_Base      12    11
##   V3_Leaf_Sheath    13    11
##   V3_S2_Leaf_Base   13     9
##   V3_S2_Leaf_Tip    12     9
##   V3_SAM            13    11
## 
## , , temp = cold
## 
##                  Genotype
## Tissue            CTRL Inv4m
##   V1_Root           11     9
##   V3_Root           11     7
##   V1_Leaf           12     8
##   V3_Leaf           10     6
##   V3_Leaf_Base       6     7
##   V3_Leaf_Sheath     7     6
##   V3_S2_Leaf_Base    8     8
##   V3_S2_Leaf_Tip     8     7
##   V3_SAM             8     8
## 
## 
## Donor × Genotype (overall):
##       Genotype
## Donor  CTRL Inv4m
##   PT    100    71
##   Mi21   90    85

Prepare Count Matrix

# Extract gene IDs
gene_ids <- counts[, 1]

# Convert to matrix
counts_matrix <- as.matrix(counts[, -1])
rownames(counts_matrix) <- gene_ids

# Match sample names to metadata
# Assuming column names in counts are Run IDs
sample_order <- match(colnames(counts_matrix), metadata$Run)
metadata <- metadata[sample_order, ]

{
  cat("\nCount matrix prepared:\n")
  cat("  Genes:", nrow(counts_matrix), "\n")
  cat("  Samples:", ncol(counts_matrix), "\n")
  cat("  All samples matched:", 
      all(colnames(counts_matrix) == metadata$Run), "\n")
}
## 
## Count matrix prepared:
##   Genes: 39756 
##   Samples: 346 
##   All samples matched: TRUE

Create DGEList Object

# Create DGEList with counts and sample information
y <- DGEList(counts = counts_matrix, samples = metadata)

# Define groups from experimental factors
y$samples$group <- interaction(
  y$samples$temp,
  y$samples$Genotype,
  y$samples$Tissue
)

{
  cat("\nDGEList object created\n")
  cat("  Number of groups:", nlevels(y$samples$group), "\n")
  head(y$samples)
}
## 
## DGEList object created
##   Number of groups: 36

Expression Filtering and Sample Quality Control

Filter Genes by Expression Level

# Keep genes with sufficient expression
keep <- filterByExpr(y, group = y$samples$group)
y_filtered <- y[keep, ]

{
  cat("\nExpression filtering:\n")
  cat("  Genes before:", nrow(y), "\n")
  cat("  Genes after:", nrow(y_filtered), "\n")
  cat("  Genes removed:", sum(!keep), "\n")
}
## 
## Expression filtering:
##   Genes before: 39756 
##   Genes after: 26490 
##   Genes removed: 13266

Library Size Quality Control

# Histogram of library sizes
hist(
  y_filtered$samples$lib.size / 1e6,
  main = "Library Size Distribution",
  xlab = "Library Size (millions of reads)",
  breaks = 20
)

{
  cat("\nLibrary size summary:\n")
  print(summary(y_filtered$samples$lib.size))
  cat("\nSamples with lib.size < 10 million:", 
      sum(y_filtered$samples$lib.size < 1e7), "\n")
}
## 
## Library size summary:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   203085  1797064  3155701  3597346  4886950 18016248 
## 
## Samples with lib.size < 10 million: 336

Filter Low Quality Libraries

# Flag and remove low count libraries (< 10M reads)
y_filtered$samples$lowCount <- y_filtered$samples$lib.size < 1e5
y_filtered <- y_filtered[, !y_filtered$samples$lowCount]

{
  cat("\nLow quality libraries removed:\n")
  cat("  Retained:", ncol(y_filtered), "\n")
  cat("  Removed:", sum(y$samples$lib.size < 1e5), "\n")
}
## 
## Low quality libraries removed:
##   Retained: 346 
##   Removed: 0

Sample Distribution After Filtering

{
  with(y_filtered$samples, {
    cat("\n=== Sample Distribution After QC ===\n")
    cat("\n-- Genotype --\n")
    print(table(Genotype))
    cat("\n-- Temperature --\n")
    print(table(temp))
    cat("\n-- Tissue --\n")
    print(table(Tissue))
    cat("\n-- Donor --\n")
    print(table(Donor))
  })
}
## 
## === Sample Distribution After QC ===
## 
## -- Genotype --
## Genotype
##  CTRL Inv4m 
##   190   156 
## 
## -- Temperature --
## temp
## CTRL cold 
##  199  147 
## 
## -- Tissue --
## Tissue
##         V1_Root         V3_Root         V1_Leaf         V3_Leaf    V3_Leaf_Base 
##              41              42              39              37              36 
##  V3_Leaf_Sheath V3_S2_Leaf_Base  V3_S2_Leaf_Tip          V3_SAM 
##              37              38              36              40 
## 
## -- Donor --
## Donor
##   PT Mi21 
##  171  175

Quality Control: Batch Effects

Compute MDS Dimensions (10 dimensions for diagnostics)

# TMM normalization
y_filtered <- calcNormFactors(y_filtered, method = "TMM")

# Compute MDS
mds <- plotMDS(y_filtered, plot = FALSE)

# Use 10 dimensions based on scree plot
n_mds_dims <- 10

# Step 1: Prepare metadata columns (from your processed metadata)
meta_cols <- metadata %>%
  select(
    Run,
    temp, Genotype, Donor, Tissue,
    plant_id, collector_team, harvest_date,
    plant_serial_number, tissue_order_index,
    sra_upload_time, harvest_batch
  )

# Step 2: Prepare DGE sample info (from y_filtered)
dge_cols <- y_filtered$samples %>%
  select(Run, lib.size, norm.factors, group)

# Step 3: Prepare MDS coordinates
mds_coords <- tibble(Run = y_filtered$samples$Run)
for (i in 1:n_mds_dims) {
  mds_coords[[paste0("dim", i)]] <- mds$eigen.vectors[, i] * sqrt(mds$var.explained[i])
}

# Step 4: Merge all three cleanly
d <- meta_cols %>%
  inner_join(dge_cols, by = "Run") %>%
  inner_join(mds_coords, by = "Run")

# Report variance explained
var_explained <- mds$var.explained[1:n_mds_dims]
cumvar_pct <- cumsum(var_explained) / sum(mds$var.explained[mds$var.explained > 0]) * 100

var_explained_df <- tibble(
  dimension = paste0("dim", 1:n_mds_dims),
  var_explained = round(var_explained, 4),
  cum_var_pct = round(cumvar_pct, 1)
)

cat("Variance explained by first", n_mds_dims, "MDS dimensions:\n")
## Variance explained by first 10 MDS dimensions:
print(var_explained_df)
## # A tibble: 10 × 3
##    dimension var_explained cum_var_pct
##    <chr>             <dbl>       <dbl>
##  1 dim1             0.305         30.5
##  2 dim2             0.112         41.7
##  3 dim3             0.0524        47  
##  4 dim4             0.0416        51.1
##  5 dim5             0.0292        54  
##  6 dim6             0.0243        56.5
##  7 dim7             0.0201        58.5
##  8 dim8             0.0119        59.7
##  9 dim9             0.0107        60.7
## 10 dim10            0.0104        61.8

MDS Dimension Correlations: Biological & Technical Covariates

# Define all covariates to test
biological_covs <- c("temp", "Genotype", "Donor", "Tissue")
technical_covs <- c(
  "collector_team", 
  "harvest_date", 
  "plant_serial_number", 
  "tissue_order_index", 
  "sra_upload_time"
)
all_covs <- c(biological_covs, technical_covs)

# Prepare a numeric version of d for correlation
d_numeric <- d %>%
  mutate(
    # Convert factors to numeric (ordered levels)
    temp_n = as.numeric(temp),
    Genotype_n = as.numeric(Genotype),
    Donor_n = as.numeric(Donor),
    Tissue_n = as.numeric(Tissue),
    collector_team_n = as.numeric(collector_team),
    # Date/time as numeric (days since origin)
    harvest_date_n = as.numeric(harvest_date),
    sra_upload_time_n = as.numeric(sra_upload_time)
    # plant_serial_number and tissue_order_index are already numeric
  )

# Get MDS dimension names (dim1 to dim10)
mds_dims <- paste0("dim", 1:10)

# Initialize results tibble
cor_results <- tibble(
  dimension = character(),
  covariate = character(),
  correlation = numeric(),
  p_value = numeric()
)

# Perform correlations
for (dim in mds_dims) {
  for (cov in all_covs) {
    cov_n <- paste0(cov, "_n")
    
    # Skip if numeric version doesn't exist (e.g., already numeric)
    if (!cov_n %in% names(d_numeric)) {
      if (cov %in% c("plant_serial_number", "tissue_order_index")) {
        cov_n <- cov  # these are already numeric
      } else {
        next
      }
    }
    
    x <- d_numeric[[dim]]
    y <- d_numeric[[cov_n]]
    
    # Skip if no variation
    if (sd(x, na.rm = TRUE) == 0 || sd(y, na.rm = TRUE) == 0) {
      next
    }
    
    # Spearman correlation (robust, handles ties)
    test_result <- tryCatch({
      cor.test(x, y, method = "spearman", exact = FALSE)
    }, error = function(e) {
      NULL
    })
    
    if (!is.null(test_result)) {
      cor_results <- cor_results %>%
        add_row(
          dimension = dim,
          covariate = cov,
          correlation = test_result$estimate,
          p_value = test_result$p.value
        )
    }
  }
}

# Adjust p-values (FDR across all tests)
cor_results <- cor_results %>%
  mutate(
    adj_p_value = p.adjust(p_value, method = "fdr"),
    significant = adj_p_value < 0.05,
    correlation = round(correlation, 3),
    p_value = signif(p_value, 3),
    adj_p_value = signif(adj_p_value, 3)
  ) %>%
  arrange(desc(abs(correlation)))

# Display top significant results
cat("\nTop significant MDS correlations (FDR < 0.05):\n")
## 
## Top significant MDS correlations (FDR < 0.05):
print(cor_results %>% filter(significant) %>% head(15))
## # A tibble: 15 × 6
##    dimension covariate           correlation  p_value adj_p_value significant
##    <chr>     <chr>                     <dbl>    <dbl>       <dbl> <lgl>      
##  1 dim2      tissue_order_index       -0.702 1.03e-52    9.25e-51 TRUE       
##  2 dim7      temp                      0.578 3.29e-32    1.48e-30 TRUE       
##  3 dim2      Tissue                   -0.563 2.35e-30    7.05e-29 TRUE       
##  4 dim5      temp                     -0.557 1.35e-29    3.05e-28 TRUE       
##  5 dim7      plant_serial_number       0.545 3.38e-28    6.08e-27 TRUE       
##  6 dim3      Tissue                    0.535 5.10e-27    7.65e-26 TRUE       
##  7 dim6      harvest_date              0.479 2.97e-21    3.82e-20 TRUE       
##  8 dim4      temp                     -0.397 1.7 e-14    1.72e-13 TRUE       
##  9 dim5      plant_serial_number      -0.397 1.72e-14    1.72e-13 TRUE       
## 10 dim7      harvest_date              0.352 1.54e-11    1.38e-10 TRUE       
## 11 dim4      plant_serial_number      -0.35  2.15e-11    1.76e-10 TRUE       
## 12 dim8      harvest_date              0.321 1.04e- 9    7.81e- 9 TRUE       
## 13 dim1      Tissue                    0.307 5.50e- 9    3.81e- 8 TRUE       
## 14 dim10     sra_upload_time          -0.244 4.20e- 6    2.70e- 5 TRUE       
## 15 dim5      harvest_date             -0.22  3.66e- 5    2.2 e- 4 TRUE
# Optional: Save full results for later
mds_cor_full <- cor_results

MDS: Multiple Factors

library(ggpubr)

# 1. strongest effect
p1 <- ggplot(d, aes(x = dim1, y = dim2, color = harvest_date)) +
  geom_point(size = 3) +
  scale_color_viridis_c(option = "plasma", name = "Harvest Date") +
  theme_classic(base_size = 14) +
  theme(legend.position = "top")

# 2. Collection team (technical)
p2 <- ggplot(d, aes(x = dim1, y = dim2, color = collector_team)) +
  geom_point(size = 3) +
  theme_classic(base_size = 14) +
  theme(legend.position = "top",
         plot.margin = margin(r = 60))

# 3. Temperature (biological)
p3 <- ggplot(d, aes(x = dim1, y = dim2, color = temp)) +
  geom_point(size = 3) +
  theme_classic(base_size = 14) +
  theme(legend.position = "top")

# 4. Genotype (biological)
p4 <- ggplot(d, aes(x = dim1, y = dim2, color = Genotype)) +
  geom_point(size = 3) +
  theme_classic(base_size = 14) +
  theme(legend.position = "top",
         plot.margin = margin(r = 60))

# 5. Donor (biological)
p5 <- ggplot(d, aes(x = dim1, y = dim2, color = Donor)) +
  geom_point(size = 3) +
  theme_classic(base_size = 14) +
  theme(legend.position = "top",
         plot.margin = margin(r = 60))

# 6. Tissue (biological)
p6 <- ggplot(d, aes(x = dim1, y = dim2, color = Tissue)) +
  geom_point(size = 3) +
  theme_classic(base_size = 14) +
  theme(legend.position = "top",
         plot.margin = margin(r = 80))

# Arrange in 2 columns, 3 rows
ggarrange(p1, p2, p3, p4, p5, p6, ncol = 2, nrow = 3)

Normalization and Design Matrix

# Design matrix
design <- model.matrix(
  ~ Donor + Tissue*Genotype+ temp*Genotype,
  data = y_filtered$samples
)

# Voom transformation with precision weights
voomR <- voom(y_filtered, design = design, plot = TRUE)

# Save normalized expression
saveRDS(voomR$E, file = "../results/normalized_expression_logCPM_crow2020.rds")
saveRDS(voomR, file = "../results/normalized_expression_voom_object_crow2020.rds")

{
  cat("Normalization factors range:", 
      range(y_filtered$samples$norm.factors), "\n\n")
  cat("Design matrix:", nrow(design), "samples ×", ncol(design), 
      "coefficients\n")
  cat("\nCoefficients:\n")
  print(colnames(design))
  cat("\nVoom expression matrix:", nrow(voomR$E), "genes ×", 
      ncol(voomR$E), "samples\n")
}
## Normalization factors range: 0.5098395 1.649046 
## 
## Design matrix: 346 samples × 21 coefficients
## 
## Coefficients:
##  [1] "(Intercept)"                         "DonorMi21"                          
##  [3] "TissueV3_Root"                       "TissueV1_Leaf"                      
##  [5] "TissueV3_Leaf"                       "TissueV3_Leaf_Base"                 
##  [7] "TissueV3_Leaf_Sheath"                "TissueV3_S2_Leaf_Base"              
##  [9] "TissueV3_S2_Leaf_Tip"                "TissueV3_SAM"                       
## [11] "GenotypeInv4m"                       "tempcold"                           
## [13] "TissueV3_Root:GenotypeInv4m"         "TissueV1_Leaf:GenotypeInv4m"        
## [15] "TissueV3_Leaf:GenotypeInv4m"         "TissueV3_Leaf_Base:GenotypeInv4m"   
## [17] "TissueV3_Leaf_Sheath:GenotypeInv4m"  "TissueV3_S2_Leaf_Base:GenotypeInv4m"
## [19] "TissueV3_S2_Leaf_Tip:GenotypeInv4m"  "TissueV3_SAM:GenotypeInv4m"         
## [21] "GenotypeInv4m:tempcold"             
## 
## Voom expression matrix: 26490 genes × 346 samples

Batch-Corrected Expression (Visualization Only)

# Remove collection order and donor batch effect for visualization
# Design matrix with ONLY technical effects to remove
design_batch <- model.matrix(
  ~  Donor,
  data = metadata
)

# Remove ONLY these effects
corrected_logCPM <- removeBatchEffect(
  voomR$E,
  design = design_batch
)

# Save for downstream visualization
saveRDS(
  corrected_logCPM, 
  "../results/normalized_expression_batch_corrected_crow2020.rds"
)

{
  cat("\nBatch-corrected expression saved (for visualization only)\n")
  cat("Use voomR for differential expression analysis\n")
}
## 
## Batch-corrected expression saved (for visualization only)
## Use voomR for differential expression analysis

Linear Model Fitting

# Fit linear model
fit <- lmFit(voomR, design)

# Apply empirical Bayes moderation
ebfit <- eBayes(fit)

{
  cat("Model fitted:", nrow(fit$coefficients), "genes ×", 
      ncol(fit$coefficients), "coefficients\n")
  cat("\nSignificant genes per coefficient (FDR < 0.05):\n")
  print(colSums(abs(decideTests(ebfit))))
}
## Model fitted: 26490 genes × 21 coefficients
## 
## Significant genes per coefficient (FDR < 0.05):
##                         (Intercept)                           DonorMi21 
##                               24030                                2009 
##                       TissueV3_Root                       TissueV1_Leaf 
##                                 197                               20368 
##                       TissueV3_Leaf                  TissueV3_Leaf_Base 
##                               20153                               19640 
##                TissueV3_Leaf_Sheath               TissueV3_S2_Leaf_Base 
##                               18818                               18863 
##                TissueV3_S2_Leaf_Tip                        TissueV3_SAM 
##                               20418                               14898 
##                       GenotypeInv4m                            tempcold 
##                                 175                               14600 
##         TissueV3_Root:GenotypeInv4m         TissueV1_Leaf:GenotypeInv4m 
##                                   0                                  26 
##         TissueV3_Leaf:GenotypeInv4m    TissueV3_Leaf_Base:GenotypeInv4m 
##                                  29                                  28 
##  TissueV3_Leaf_Sheath:GenotypeInv4m TissueV3_S2_Leaf_Base:GenotypeInv4m 
##                                  19                                  44 
##  TissueV3_S2_Leaf_Tip:GenotypeInv4m          TissueV3_SAM:GenotypeInv4m 
##                                  30                                   8 
##              GenotypeInv4m:tempcold 
##                                  14

Effect Estimation and Confidence Intervals

Extract Coefficients of Interest

# Map coefficients (excluding Intercept and Donor batch)
predictor_map <- c(
  "DonorMi21"                       = "Mi21",
  "TissueV1_Leaf"                   = "V1L",
  "TissueV3_Root"                   = "V3R",
  "TissueV3_Leaf"                   = "V3L",
  "TissueV3_Leaf_Base"              = "V3LBase",
  "TissueV3_Leaf_Sheath"            = "V3LSheath",
  "TissueV3_S2_Leaf_Base"           = "V3L2Base",
  "TissueV3_S2_Leaf_Tip"            = "V3L2Tip",
  "TissueV3_SAM"                    = "V3SAM",
  "GenotypeInv4m"                   = "Inv4m",
  "tempcold"                        = "cold",
  "TissueV3_Root:GenotypeInv4m"     = "V3R:Inv4m",
  "TissueV1_Leaf:GenotypeInv4m"     = "V1L:Inv4m",
  "TissueV3_Leaf:GenotypeInv4m"     = "V3L:Inv4m",
  "TissueV3_Leaf_Base:GenotypeInv4m"= "V3LB:Inv4m",
  "TissueV3_Leaf_Sheath:GenotypeInv4m" = "V3LS:Inv4m",
  "TissueV3_S2_Leaf_Base:GenotypeInv4m" = "V3L2B:Inv4m",
  "TissueV3_S2_Leaf_Tip:GenotypeInv4m"  = "V3L2T:Inv4m",
  "TissueV3_SAM:GenotypeInv4m"      = "V3SAM:Inv4m",
  "GenotypeInv4m:tempcold"          = "Inv4m:cold"   # ← FIXED: matches actual coef name
)

{
  cat("\nExtracting coefficients:\n")
  for (i in seq_along(predictor_map)) {
    cat("  ", names(predictor_map)[i], "→", predictor_map[i], "\n")
  }
}
## 
## Extracting coefficients:
##    DonorMi21 → Mi21 
##    TissueV1_Leaf → V1L 
##    TissueV3_Root → V3R 
##    TissueV3_Leaf → V3L 
##    TissueV3_Leaf_Base → V3LBase 
##    TissueV3_Leaf_Sheath → V3LSheath 
##    TissueV3_S2_Leaf_Base → V3L2Base 
##    TissueV3_S2_Leaf_Tip → V3L2Tip 
##    TissueV3_SAM → V3SAM 
##    GenotypeInv4m → Inv4m 
##    tempcold → cold 
##    TissueV3_Root:GenotypeInv4m → V3R:Inv4m 
##    TissueV1_Leaf:GenotypeInv4m → V1L:Inv4m 
##    TissueV3_Leaf:GenotypeInv4m → V3L:Inv4m 
##    TissueV3_Leaf_Base:GenotypeInv4m → V3LB:Inv4m 
##    TissueV3_Leaf_Sheath:GenotypeInv4m → V3LS:Inv4m 
##    TissueV3_S2_Leaf_Base:GenotypeInv4m → V3L2B:Inv4m 
##    TissueV3_S2_Leaf_Tip:GenotypeInv4m → V3L2T:Inv4m 
##    TissueV3_SAM:GenotypeInv4m → V3SAM:Inv4m 
##    GenotypeInv4m:tempcold → Inv4m:cold

Extract Results Function

#' Extract differential expression results for specified predictors
#'
#' @param ebfit An eBayes fitted model object from limma
#' @param predictor_map Named vector mapping coefficient names to display names
#'
#' @return A data frame with DE results for all specified predictors
extract_predictor_effects <- function(ebfit, predictor_map) {
  coef_names <- colnames(coef(ebfit))
  coef_indices <- match(names(predictor_map), coef_names)
  
  if (any(is.na(coef_indices))) {
    missing <- names(predictor_map)[is.na(coef_indices)]
    stop("Coefficients not found: ", paste(missing, collapse = ", "),
         "\nAvailable: ", paste(coef_names, collapse = ", "))
  }
  
  result_list <- lapply(seq_along(coef_indices), function(i) {
    idx <- coef_indices[i]
    
    tt <- topTable(ebfit, coef = idx, sort.by = "none", n = Inf)
    
    # Calculate 95% confidence intervals
    crit_value <- qt(0.975, ebfit$df.residual + ebfit$df.prior)
    std_errors <- ebfit$stdev.unscaled[, idx] * sqrt(ebfit$s2.post)
    tt$std_err <- std_errors
    tt$upper <- tt$logFC + crit_value * std_errors
    tt$lower <- tt$logFC - crit_value * std_errors
    tt$predictor <- predictor_map[i]
    tt$response <- rownames(tt)
    
    tt
  })
  
  effects <- do.call(rbind, result_list)
  rownames(effects) <- NULL
  
  effects %>%
    select(predictor, response, everything()) %>%
    arrange(adj.P.Val)
}

Extract Effects

# Define effect order by number of DEGs detected
effect_order <- c(
  "V3L2Tip",
  "V1L",
  "V3L",
  "V3LBase",
  "V3L2Base",
  "V3LSheath",
  "V3SAM",
  "cold",
  "Mi21",
  "V3R",
  "Inv4m",
  "V3L2B:Inv4m",
  "V3L2T:Inv4m",
  "V3L:Inv4m",
  "V3LB:Inv4m",
  "V1L:Inv4m",
  "V3LS:Inv4m",
  "Inv4m:cold",
  "V3SAM:Inv4m",
  "V3R:Inv4m"
)

effects_df <- extract_predictor_effects(ebfit, predictor_map) %>%
  dplyr::rename(gene = response) %>%
  mutate(predictor = factor(predictor, levels = effect_order)) %>%
  mutate(neglogP = -log10(adj.P.Val))

{
  cat("\nTotal tests:", nrow(effects_df), "\n")
  cat("Tests per predictor:\n")
  print(table(effects_df$predictor))
}
## 
## Total tests: 529800 
## Tests per predictor:
## 
##     V3L2Tip         V1L         V3L     V3LBase    V3L2Base   V3LSheath 
##       26490       26490       26490       26490       26490       26490 
##       V3SAM        cold        Mi21         V3R       Inv4m V3L2B:Inv4m 
##       26490       26490       26490       26490       26490       26490 
## V3L2T:Inv4m   V3L:Inv4m  V3LB:Inv4m   V1L:Inv4m  V3LS:Inv4m  Inv4m:cold 
##       26490       26490       26490       26490       26490       26490 
## V3SAM:Inv4m   V3R:Inv4m 
##       26490       26490

Annotate with DEG Status

effects_df <- effects_df %>%
  mutate(is_DEG = adj.P.Val < 0.05) %>%
  mutate(regulation = case_when(
    is_DEG & logFC > 0 ~ "Upregulated",
    is_DEG & logFC < 0 ~ "Downregulated",
    .default = "Unregulated"
  ))

{
  cat("\nDEG counts:\n")
  print(with(effects_df, table(predictor, is_DEG)))
  cat("\nRegulation:\n")
  print(with(effects_df, table(predictor, regulation)))
}
## 
## DEG counts:
##              is_DEG
## predictor     FALSE  TRUE
##   V3L2Tip      6072 20418
##   V1L          6122 20368
##   V3L          6337 20153
##   V3LBase      6850 19640
##   V3L2Base     7627 18863
##   V3LSheath    7672 18818
##   V3SAM       11592 14898
##   cold        11890 14600
##   Mi21        24481  2009
##   V3R         26293   197
##   Inv4m       26315   175
##   V3L2B:Inv4m 26446    44
##   V3L2T:Inv4m 26460    30
##   V3L:Inv4m   26461    29
##   V3LB:Inv4m  26462    28
##   V1L:Inv4m   26464    26
##   V3LS:Inv4m  26471    19
##   Inv4m:cold  26476    14
##   V3SAM:Inv4m 26482     8
##   V3R:Inv4m   26490     0
## 
## Regulation:
##              regulation
## predictor     Downregulated Unregulated Upregulated
##   V3L2Tip             11313        6072        9105
##   V1L                 11139        6122        9229
##   V3L                 11112        6337        9041
##   V3LBase             10819        6850        8821
##   V3L2Base            10230        7627        8633
##   V3LSheath           10227        7672        8591
##   V3SAM                6729       11592        8169
##   cold                 7719       11890        6881
##   Mi21                  913       24481        1096
##   V3R                    99       26293          98
##   Inv4m                  98       26315          77
##   V3L2B:Inv4m            16       26446          28
##   V3L2T:Inv4m            17       26460          13
##   V3L:Inv4m              10       26461          19
##   V3LB:Inv4m              9       26462          19
##   V1L:Inv4m               8       26464          18
##   V3LS:Inv4m             10       26471           9
##   Inv4m:cold              5       26476           9
##   V3SAM:Inv4m             6       26482           2
##   V3R:Inv4m               0       26490           0

Define Robust Outlier Detection Function

# Modified helper function with sample size check
calculate_robust_distance <- function(per_predictor, mcd_alpha) {
  # Require at least 10 significant DEGs for robust estimation
  if (nrow(per_predictor) < 10) {
    per_predictor$mahalanobis <- NA_real_
    return(per_predictor)
  }
  
  bivariate <- per_predictor %>%
    select(logFC, neglogP) %>%
    as.matrix()
  
  # Check for variance (all same values would cause singularity)
  if (var(bivariate[, 1]) < 1e-10 || var(bivariate[, 2]) < 1e-10) {
    per_predictor$mahalanobis <- NA_real_
    return(per_predictor)
  }
  
  tryCatch({
    mcd_result <- covMcd(bivariate, alpha = mcd_alpha)
    
    per_predictor$mahalanobis <- mahalanobis(
      x = bivariate,
      center = mcd_result$center,
      cov = mcd_result$cov
    )
  }, error = function(e) {
    warning("Could not calculate Mahalanobis distance for group: ", 
            unique(per_predictor$predictor), " - ", 
            unique(per_predictor$regulation))
    per_predictor$mahalanobis <- NA_real_
  })
  
  per_predictor
}

# Modified main function
add_mahalanobis_outliers <- function(
    data = NULL,
    distance_quantile = 0.05,
    FDR = 0.05,
    mcd_alpha = 0.75
) {
  # Calculate distances per predictor AND regulation
  data <- data %>%
    group_by(predictor, regulation) %>%
    group_split() %>%
    lapply(calculate_robust_distance, mcd_alpha = mcd_alpha) %>%
    bind_rows()
  
  cutoff <- qchisq(p = 1 - distance_quantile, df = 2)
  
  data$is_mh_outlier <- (data$adj.P.Val < FDR) & 
                        (data$mahalanobis > cutoff) & 
                        !is.na(data$mahalanobis)
  
  data %>%
    ungroup() %>%
    group_by(predictor, regulation) %>%
    arrange(-mahalanobis, .by_group = TRUE) %>%
    ungroup()
}

Calculate Mahalanobis Distances

effects_df <- add_mahalanobis_outliers(
  effects_df, 
  distance_quantile = 0.05, 
  FDR = 0.05
)

{
  cat("\nMahalanobis outliers detected:\n")
  print(with(effects_df, table(predictor, is_mh_outlier, useNA = "ifany")))
  
  cat("\nGroups with insufficient data (NA):\n")
  na_summary <- effects_df %>%
    filter(is.na(mahalanobis)) %>%
    count(predictor, regulation)
  print(na_summary)
}
## 
## Mahalanobis outliers detected:
##              is_mh_outlier
## predictor     FALSE  TRUE
##   V3L2Tip     21859  4631
##   V1L         21701  4789
##   V3L         21866  4624
##   V3LBase     21966  4524
##   V3L2Base    21604  4886
##   V3LSheath   22051  4439
##   V3SAM       22789  3701
##   cold        23183  3307
##   Mi21        25963   527
##   V3R         26445    45
##   Inv4m       26446    44
##   V3L2B:Inv4m 26486     4
##   V3L2T:Inv4m 26488     2
##   V3L:Inv4m   26487     3
##   V3LB:Inv4m  26489     1
##   V1L:Inv4m   26488     2
##   V3LS:Inv4m  26489     1
##   Inv4m:cold  26490     0
##   V3SAM:Inv4m 26490     0
##   V3R:Inv4m   26490     0
## 
## Groups with insufficient data (NA):
## # A tibble: 15 × 3
##    predictor   regulation        n
##    <fct>       <chr>         <int>
##  1 V3L2B:Inv4m Unregulated   26446
##  2 V3L2T:Inv4m Unregulated   26460
##  3 V3L:Inv4m   Unregulated   26461
##  4 V3LB:Inv4m  Downregulated     9
##  5 V3LB:Inv4m  Unregulated   26462
##  6 V1L:Inv4m   Downregulated     8
##  7 V1L:Inv4m   Unregulated   26464
##  8 V3LS:Inv4m  Unregulated   26471
##  9 V3LS:Inv4m  Upregulated       9
## 10 Inv4m:cold  Downregulated     5
## 11 Inv4m:cold  Upregulated       9
## 12 V3SAM:Inv4m Downregulated     6
## 13 V3SAM:Inv4m Unregulated   26482
## 14 V3SAM:Inv4m Upregulated       2
## 15 V3R:Inv4m   Unregulated   26490

Gene Annotation

# Gene symbols and locus names
gene_symbol <- read.table(
  "../data/gene_symbol.tab",
  quote = "",
  header = TRUE,
  sep = "\t",
  na.strings = ""
)

# Pannzer functional annotations
pannzer <- read.table(
  "../data/PANNZER_DESC.tab",
  quote = "",
  header = TRUE,
  sep = "\t"
) %>%
  group_by(gene_model) %>%
  dplyr::slice(1) %>%
  select(gene_model, desc)

# Merge annotations
gene_symbol_unique <- gene_symbol %>%
  group_by(gene_model, locus_symbol) %>%
  dplyr::slice(1) %>%
  ungroup()

gene_pannzer <- gene_symbol_unique %>%
  left_join(pannzer, by = "gene_model") %>%
  group_by(gene_model) %>%
  dplyr::slice(1) %>%
  ungroup() %>%
  select(gene_model, locus_symbol, locus_name, desc)

# Genomic coordinates
v5_gff_file <- "/Users/fvrodriguez/ref/zea/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.59.chr.gff3"
genes_gr <- rtracklayer::import(v5_gff_file) %>%
  subset(type == "gene" & seqnames %in% 1:10)
genes <- as.data.frame(genes_gr)
genes$ID <- gsub("gene:", "", genes$ID)

{
  cat("Annotations loaded:\n")
  cat("  Gene symbols:", nrow(gene_symbol), "\n")
  cat("  Pannzer descriptions:", nrow(pannzer), "\n")
  cat("  Genomic features:", nrow(genes), "genes\n")
}
## Annotations loaded:
##   Gene symbols: 44364 
##   Pannzer descriptions: 28308 
##   Genomic features: 43459 genes

Define Genomic Regions

# Inv4m inversion boundaries
inv4m_start <- genes[genes$ID == "Zm00001eb190470", "start"]
inv4m_end <- genes[genes$ID == "Zm00001eb194800", "end"]

# Shared introgression boundaries
introgression_start <- 157012149
introgression_end <- 195900523

# Extract gene IDs
inv4m_gene_ids <- genes %>%
  filter(seqnames == 4, start >= inv4m_start, end <= inv4m_end) %>%
  pull(ID)

shared_introgression_gene_ids <- genes %>%
  filter(seqnames == 4, start >= introgression_start, end <= introgression_end) %>%
  pull(ID)

flanking_introgression_gene_ids <- shared_introgression_gene_ids[
  !(shared_introgression_gene_ids %in% inv4m_gene_ids)
]

{
  cat("Inv4m inversion:", length(inv4m_gene_ids), "genes\n")
  cat("Shared introgression:", length(shared_introgression_gene_ids), "genes\n")
  cat("Flanking:", length(flanking_introgression_gene_ids), "genes\n")
}
## Inv4m inversion: 434 genes
## Shared introgression: 1099 genes
## Flanking: 665 genes

Three-Tier DEG Classification

High-Confidence DEGs

# Define fold change thresholds
is_large_effect <- rep(FALSE, nrow(effects_df))
is_interaction <- effects_df$predictor == "cold:Inv4m"
is_large_effect[is_interaction & abs(effects_df$logFC) > 0.5] <- TRUE
is_large_effect[!is_interaction & abs(effects_df$logFC) > 1.5] <- TRUE

effects_df <- effects_df %>%
  mutate(is_hiconf_DEG = is_DEG & is_large_effect)

{
  cat("\nHigh-Confidence DEGs:\n")
  print(with(effects_df, table(predictor, is_hiconf_DEG)))
}
## 
## High-Confidence DEGs:
##              is_hiconf_DEG
## predictor     FALSE  TRUE
##   V3L2Tip     15181 11309
##   V1L         15236 11254
##   V3L         15368 11122
##   V3LBase     15898 10592
##   V3L2Base    17614  8876
##   V3LSheath   17189  9301
##   V3SAM       21776  4714
##   cold        26172   318
##   Mi21        26388   102
##   V3R         26436    54
##   Inv4m       26392    98
##   V3L2B:Inv4m 26456    34
##   V3L2T:Inv4m 26465    25
##   V3L:Inv4m   26469    21
##   V3LB:Inv4m  26467    23
##   V1L:Inv4m   26466    24
##   V3LS:Inv4m  26476    14
##   Inv4m:cold  26482     8
##   V3SAM:Inv4m 26483     7
##   V3R:Inv4m   26490     0

Selected DEGs

rank_threshold <- 10

effects_df <- effects_df %>%
  group_by(is_hiconf_DEG, predictor, regulation) %>%
  mutate(
    pval_rank = row_number(dplyr::desc(neglogP)),
    mahal_rank = row_number(dplyr::desc(mahalanobis))
  ) %>%
  ungroup() %>%
  mutate(
    is_selected_DEG = (pval_rank <= rank_threshold & is_hiconf_DEG) |
                      (mahal_rank <= rank_threshold & is_hiconf_DEG)
  )

{
  cat("\nSelected DEGs (Top", rank_threshold, "by rank):\n")
  print(with(
    effects_df %>% filter(is_selected_DEG & regulation != "Unregulated"),
    table(regulation, predictor)
  ))
}
## 
## Selected DEGs (Top 10 by rank):
##                predictor
## regulation      V3L2Tip V1L V3L V3LBase V3L2Base V3LSheath V3SAM cold Mi21 V3R
##   Downregulated      14  14  13      12       13        11    11   10   11  10
##   Upregulated        10  12  11      11       14        11    10   11   11  11
##                predictor
## regulation      Inv4m V3L2B:Inv4m V3L2T:Inv4m V3L:Inv4m V3LB:Inv4m V1L:Inv4m
##   Downregulated    11          12          12         7          5         6
##   Upregulated      10          13          11        12         14        12
##                predictor
## regulation      V3LS:Inv4m Inv4m:cold V3SAM:Inv4m V3R:Inv4m
##   Downregulated          6          1           5         0
##   Upregulated            8          7           2         0

Annotate with Gene Information

effects_df <- effects_df %>%
  left_join(
    gene_pannzer,
    by = c(gene = "gene_model"),
    relationship = "many-to-many"
  ) %>%
  mutate(desc_merged = coalesce(locus_name, desc)) %>%
  select(predictor, regulation, gene, locus_symbol, desc_merged, everything()) %>%
  inner_join(
    genes %>%
      select(gene = ID, CHR = seqnames, BP = start) %>%
      mutate(CHR = as.character(CHR) %>% as.integer()),
    by = "gene"
  ) %>%
  mutate(
    locus_label = case_when(
      is.na(locus_symbol) ~ NA_character_,
      grepl("^si\\d*[a-h]", locus_symbol) ~ NA_character_,
      grepl("^umc", locus_symbol) ~ NA_character_,
      grepl("^Zm00001eb", locus_symbol) ~ NA_character_,
      TRUE ~ locus_symbol
    )
  )

{
  cat("\nAnnotations added:", ncol(effects_df), "columns\n")
}
## 
## Annotations added: 27 columns

Add Region Classification

effects_df <- effects_df %>%
  mutate(
    in_Inv4m = gene %in% inv4m_gene_ids,
    in_cis = gene %in% shared_introgression_gene_ids,
    in_flank = gene %in% flanking_introgression_gene_ids,
    in_trans = !in_cis
  )

# Summarize genomic region classification by predictor
region_summary <- effects_df %>%
  group_by(predictor) %>%
  summarise(
    in_Inv4m = sum(in_Inv4m, na.rm = TRUE),
    in_cis   = sum(in_cis,   na.rm = TRUE),
    in_flank = sum(in_flank, na.rm = TRUE),
    in_trans = sum(in_trans, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(match(predictor, effect_order))  # keep your desired order

# Print as a clean table
{
  cat("\nRegion classification by predictor:\n")
  print(region_summary, row.names = FALSE)
  }
## 
## Region classification by predictor:
## # A tibble: 20 × 5
##    predictor   in_Inv4m in_cis in_flank in_trans
##    <fct>          <int>  <int>    <int>    <int>
##  1 V3L2Tip          293    682      389    25587
##  2 V1L              293    682      389    25587
##  3 V3L              293    682      389    25587
##  4 V3LBase          293    682      389    25587
##  5 V3L2Base         293    682      389    25587
##  6 V3LSheath        293    682      389    25587
##  7 V3SAM            293    682      389    25587
##  8 cold             293    682      389    25587
##  9 Mi21             293    682      389    25587
## 10 V3R              293    682      389    25587
## 11 Inv4m            293    682      389    25587
## 12 V3L2B:Inv4m      293    682      389    25587
## 13 V3L2T:Inv4m      293    682      389    25587
## 14 V3L:Inv4m        293    682      389    25587
## 15 V3LB:Inv4m       293    682      389    25587
## 16 V1L:Inv4m        293    682      389    25587
## 17 V3LS:Inv4m       293    682      389    25587
## 18 Inv4m:cold       293    682      389    25587
## 19 V3SAM:Inv4m      293    682      389    25587
## 20 V3R:Inv4m        293    682      389    25587

Quality Control Tables

Top DEGs per Predictor

top_degs_qc <- effects_df %>%
  filter(is_DEG, regulation != "Unregulated") %>%
  group_by(predictor, regulation) %>%
  arrange(-neglogP, .by_group = TRUE) %>%
  dplyr::slice(1:10) %>%
  select(predictor, gene, locus_symbol, desc_merged, logFC, neglogP) %>%
  arrange(predictor, regulation, -neglogP)

top_degs_qc

DEG Summary Statistics

overall_summary <- effects_df %>%
  group_by(predictor) %>%
  summarise(
    total_genes = n(),
    n_DEG = sum(is_DEG),
    n_hiconf_DEG = sum(is_hiconf_DEG),
    n_selected_DEG = sum(is_selected_DEG),
    pct_DEG = round(100 * n_DEG / total_genes, 2)
  )

overall_summary
trans_effects <- read.csv("~/Desktop/network_effects.csv") 

Selected DEGs for Manuscript

selected_degs <- effects_df %>%
  filter(is_selected_DEG) %>%
  select(
    predictor,
    regulation,
    gene,
    locus_symbol,
    locus_label,
    desc_merged,
    logFC,
    neglogP,
    mahalanobis,
    pval_rank,
    mahal_rank
  ) %>%
  arrange(predictor, regulation, -neglogP)

{
  cat("\nSelected DEGs for manuscript:", nrow(selected_degs), "\n")
  print(with(selected_degs, table(predictor, regulation)))
}
## 
## Selected DEGs for manuscript: 383 
##              regulation
## predictor     Downregulated Upregulated
##   V3L2Tip                14          10
##   V1L                    14          12
##   V3L                    13          11
##   V3LBase                12          11
##   V3L2Base               13          14
##   V3LSheath              11          11
##   V3SAM                  11          10
##   cold                   10          11
##   Mi21                   11          11
##   V3R                    10          11
##   Inv4m                  11          10
##   V3L2B:Inv4m            12          13
##   V3L2T:Inv4m            12          11
##   V3L:Inv4m               7          12
##   V3LB:Inv4m              5          14
##   V1L:Inv4m               6          12
##   V3LS:Inv4m              6           8
##   Inv4m:cold              1           5
##   V3SAM:Inv4m             5           2
##   V3R:Inv4m               0           0

Export Results

# Full effects table
write.csv(
  effects_df,
  file = "../results/predictor_effects_crow2020.csv",
  row.names = FALSE
)

# Selected DEGs
write.csv(
  selected_degs,
  file = "../results/selected_DEGs_crow2020.csv",
  row.names = FALSE
)

{
  cat("\nResults exported:\n")
  cat("  predictor_effects_crow2020.csv\n")
  cat("  selected_DEGs_crow2020.csv\n")
}

Summary

This analysis identified differentially expressed genes across three main effects:

  1. Cold temperature: 14486 DEGs
  2. Inv4m genotype: 175 DEGs
  3. temp × Genotype interaction: 0 DEGs

Donor batch effect was controlled as a fixed effect in the model. Expression data was normalized using TMM and transformed with voom for variance stabilization.

Session Information

sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] robustbase_0.99-6    ggtext_0.1.2         ggfx_1.0.3          
##  [4] ggpubr_0.6.2         ggplot2_4.0.1        stringr_1.6.0       
##  [7] tidyr_1.3.1          dplyr_1.1.4          rtracklayer_1.70.0  
## [10] GenomicRanges_1.62.0 Seqinfo_1.0.0        IRanges_2.44.0      
## [13] S4Vectors_0.48.0     BiocGenerics_0.56.0  generics_0.1.4      
## [16] edgeR_4.8.0          limma_3.66.0        
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1            viridisLite_0.4.2          
##  [3] farver_2.1.2                Biostrings_2.78.0          
##  [5] S7_0.2.1                    bitops_1.0-9               
##  [7] fastmap_1.2.0               RCurl_1.98-1.17            
##  [9] GenomicAlignments_1.46.0    XML_3.99-0.20              
## [11] digest_0.6.39               lifecycle_1.0.4            
## [13] statmod_1.5.1               magrittr_2.0.4             
## [15] compiler_4.5.1              rlang_1.1.6                
## [17] sass_0.4.10                 tools_4.5.1                
## [19] utf8_1.2.6                  yaml_2.3.10                
## [21] knitr_1.50                  ggsignif_0.6.4             
## [23] labeling_0.4.3              S4Arrays_1.10.0            
## [25] curl_7.0.0                  DelayedArray_0.36.0        
## [27] xml2_1.5.0                  RColorBrewer_1.1-3         
## [29] abind_1.4-8                 BiocParallel_1.44.0        
## [31] withr_3.0.2                 purrr_1.2.0                
## [33] grid_4.5.1                  scales_1.4.0               
## [35] SummarizedExperiment_1.40.0 cli_3.6.5                  
## [37] rmarkdown_2.30              crayon_1.5.3               
## [39] ragg_1.5.0                  rstudioapi_0.17.1          
## [41] httr_1.4.7                  rjson_0.2.23               
## [43] cachem_1.1.0                parallel_4.5.1             
## [45] XVector_0.50.0              restfulr_0.0.16            
## [47] matrixStats_1.5.0           vctrs_0.6.5                
## [49] Matrix_1.7-4                jsonlite_2.0.0             
## [51] carData_3.0-5               car_3.1-3                  
## [53] rstatix_0.7.3               Formula_1.2-5              
## [55] systemfonts_1.3.1           magick_2.9.0               
## [57] locfit_1.5-9.12             jquerylib_0.1.4            
## [59] glue_1.8.0                  DEoptimR_1.1-4             
## [61] codetools_0.2-20            cowplot_1.2.0              
## [63] stringi_1.8.7               gtable_0.3.6               
## [65] BiocIO_1.20.0               tibble_3.3.0               
## [67] pillar_1.11.1               htmltools_0.5.8.1          
## [69] R6_2.6.1                    textshaping_1.0.4          
## [71] evaluate_1.0.5              lattice_0.22-7             
## [73] Biobase_2.70.0              backports_1.5.0            
## [75] Rsamtools_2.26.0            cigarillo_1.0.0            
## [77] gridtext_0.1.5              broom_1.0.10               
## [79] bslib_0.9.0                 Rcpp_1.1.0                 
## [81] SparseArray_1.10.2          xfun_0.54                  
## [83] MatrixGenerics_1.22.0       pkgconfig_2.0.3