Overview

Purpose: Construct expression indices for photosynthesis and senescence gene sets to proxy physiological trajectories across leaf developmental stages.

Approach:

  1. Define gene sets from literature and GO annotations
  2. Calculate aggregate expression indices across developmental stages
  3. Validate patterns with individual marker genes
  4. Test for enrichment in differentially expressed genes

Setup

Libraries

library(dplyr)
library(ggplot2)
library(ggpubr)
library(tidyr)
library(clusterProfiler)
library(ggtext) 
library(ggrepel) 

Load Core Data

We load differential expression results, normalized expression values, and gene annotations. Curated locus labels are merged to improve gene annotation quality.

# Load differential expression results
effects_df <- read.csv("~/Desktop/DEG_effects.csv")

# Load curated labels from external file
curated <- read.csv("~/Desktop/selected_DEGs_curated_locus_label_2.csv") %>%
  select(gene, locus_label)

effects_df$locus_label[effects_df$locus_label == "A0A1D6NG77"] <- NA

# Coalesce curated locus_label into effects_df
effects_df <- effects_df %>%
  left_join(curated, by = "gene", suffix = c("", "_curated")) %>%
  mutate(locus_label = coalesce(locus_label_curated, locus_label)) %>%
  select(-locus_label_curated)

# Expression data
normalized_expression <- readRDS(
  "~/Desktop/normalized_expression_logCPM.rda"
)

# Gene universe
universe <- effects_df %>%
  filter(predictor == "Leaf") %>%
  pull(gene) %>%
  unique()

# GO annotations
TERM2GENE <- readRDS(
  "/Users/fvrodriguez/Desktop/GOMAP_maize_B73_NAM5/TERM2GENE_Fattel_2024_full.rds"
)

cat("Expression matrix:", dim(normalized_expression), "\n")
## Expression matrix: 24249 43
cat("Universe size:", length(universe), "genes\n")
## Universe size: 24011 genes

Gene Set Definitions

Version Cross-reference

B73 genome versions need to be cross-referenced to map literature genes to the current v5 annotation.

v3_v5 <- read.table(
  file = "/Users/fvrodriguez/Desktop/B73_gene_xref/B73v3_to_B73v5.tsv",
  sep = "\t",
  header = FALSE
) %>%
  rename(v3 = "V1", v5 = "V2") %>%
  separate_longer_delim(cols = v5, delim = ",")

v4_v5 <- read.table(
  file = "/Users/fvrodriguez/Desktop/B73_gene_xref/B73v4_to_B73v5.tsv",
  sep = "\t",
  header = FALSE
) %>%
  rename(v4 = "V1", v5 = "V2") %>%
  separate_longer_delim(cols = v5, delim = ",")

Literature-based Senescence Gene Sets

We compile senescence-associated genes from multiple literature sources:

  • SAG orthologs from cross-species comparisons
  • Staygreen network genes (Sekhon et al. 2019)
  • Natural senescence genes
# SAG orthologs
sag_orthologs <- read.csv("~/Desktop/SAG_orthologs.csv") %>%
  janitor::clean_names() %>%
  inner_join(v3_v5, by = c(gene_id_maize = "v3"))

# Staygreen network
staygreen_network <- read.csv(
  "~/Desktop/staygreen_network_sekhon2019.csv", 
  na.strings = ""
) %>%
  janitor::clean_names() %>%
  inner_join(v4_v5, by = c(gene = "v4"))

# Natural senescence
natural_senescence <- read.csv("~/Desktop/natural_senescence.csv") %>%
  janitor::clean_names() %>%
  inner_join(v3_v5, by = c(gene_id = "v3"))

cat("SAG orthologs:", length(unique(sag_orthologs$v5)), "\n")
## SAG orthologs: 47
cat("Staygreen network:", length(unique(staygreen_network$v5)), "\n")
## Staygreen network: 695
cat("Natural senescence:", length(unique(natural_senescence$v5)), "\n")
## Natural senescence: 4722

GO-based Gene Sets

Photosynthesis and leaf senescence gene sets are defined using GO terms.

photosynthesis_genes <- TERM2GENE %>%
  filter(GO %in% c(
    "GO:0015979", # photosynthesis
    "GO:0019684", # photosynthesis, light reaction
    "GO:0009768"  # photosynthesis, light harvesting
  )) %>%
  pull(gene) %>%
  unique()

leaf_senescence_genes <- TERM2GENE %>%
  filter(GO %in% c(
    "GO:0010150" # leaf senescence
  )) %>%
  pull(gene) %>%
  unique()

cat("Photosynthesis genes:", length(photosynthesis_genes), "\n")
## Photosynthesis genes: 672
cat("Leaf senescence genes:", length(leaf_senescence_genes), "\n")
## Leaf senescence genes: 200

Build Custom Annotation

Combine all gene sets into a unified annotation structure.

custom_annotation <- c(
  list(
    staygreen = staygreen_network$v5,
    sag = sag_orthologs$v5,
    photosynthesis = intersect(photosynthesis_genes, universe),
    leaf_senescence = intersect(leaf_senescence_genes, universe)
  ),
  natural_senescence %>%
    select(term = set, gene = v5) %>%
    distinct() %>%
    split(.$term) %>%
    lapply(function(x) x$gene)
)

cat("Custom annotation terms:", length(custom_annotation), "\n")
## Custom annotation terms: 6
lapply(custom_annotation, length)
## $staygreen
## [1] 771
## 
## $sag
## [1] 48
## 
## $photosynthesis
## [1] 525
## 
## $leaf_senescence
## [1] 157
## 
## $natural
## [1] 3681
## 
## $natural_induced
## [1] 1075

Expression Index Construction

Index Calculation Function

We aggregate gene set expression using mean expression across all genes in each set. This provides a single metric representing the activity of the entire pathway.

#' Calculate Gene Set Expression Indices
#'
#' @param expression_matrix Matrix with genes as rows, samples as columns
#' @param gene_sets Named list of gene ID vectors
#' @param method Aggregation method: "sum", "mean", or "median"
#' @return Data frame with sample-level indices
calculate_gene_set_indices <- function(expression_matrix,
                                       gene_sets,
                                       method = "sum") {
  indices <- lapply(names(gene_sets), function(set_name) {
    genes_in_set <- gene_sets[[set_name]]
    genes_available <- intersect(genes_in_set, rownames(expression_matrix))
    
    if (length(genes_available) == 0) {
      warning(paste("No genes from", set_name, "found in expression matrix"))
      return(rep(NA, ncol(expression_matrix)))
    }
    
    set_expression <- expression_matrix[genes_available, , drop = FALSE]
    
    index <- switch(method,
      "sum" = colSums(set_expression, na.rm = TRUE),
      "mean" = colMeans(set_expression, na.rm = TRUE),
      "median" = apply(set_expression, 2, median, na.rm = TRUE),
      stop("method must be 'sum', 'mean', or 'median'")
    )
    
    cat(sprintf(
      "%s: %d/%d genes found\n",
      set_name, length(genes_available), length(genes_in_set)
    ))
    
    index
  })
  
  indices_df <- as.data.frame(indices)
  names(indices_df) <- names(gene_sets)
  indices_df$sample <- colnames(expression_matrix)
  
  indices_df %>% select(sample, everything())
}

Calculate Indices

Compute photosynthesis and leaf senescence indices for all samples.

gene_set_list <- list(
  photosynthesis = custom_annotation$photosynthesis,
  senescence = custom_annotation$leaf_senescence
)

indices <- calculate_gene_set_indices(
  expression_matrix = normalized_expression,
  gene_sets = gene_set_list,
  method = "mean"
) %>%
  mutate(
    leaf_stage = as.numeric(sub(".*L([0-9]+)[LR]$", "\\1", sample)),
    treatment = factor(substr(sample, 1, 2), levels = c("+P", "-P"))
  )
## photosynthesis: 525/525 genes found
## senescence: 157/157 genes found
head(indices)
##                  sample photosynthesis senescence leaf_stage treatment
## -P1MICH_L1L -P1MICH_L1L       6.497981   3.308076          1        -P
## -P1MICH_L2L -P1MICH_L2L       6.429558   3.314505          2        -P
## -P1MICH_L3L -P1MICH_L3L       6.177238   3.335557          3        -P
## -P1MICH_L4L -P1MICH_L4L       6.047347   3.448313          4        -P
## -P2MICH_L1L -P2MICH_L1L       6.553928   3.236783          1        -P
## -P2MICH_L3L -P2MICH_L3L       6.195773   3.355784          3        -P

Statistical Models

Test the relationship between indices and leaf stage, including treatment interactions.

cat("=== Photosynthesis Index ===\n")
## === Photosynthesis Index ===
lm(photosynthesis ~ treatment * leaf_stage, data = indices) %>%
  summary() %>%
  print()
## 
## Call:
## lm(formula = photosynthesis ~ treatment * leaf_stage, data = indices)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43946 -0.08044 -0.00287  0.08854  0.36700 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             6.36169    0.07536  84.418  < 2e-16 ***
## treatment-P             0.37115    0.11295   3.286 0.002155 ** 
## leaf_stage             -0.10768    0.02752  -3.913 0.000355 ***
## treatment-P:leaf_stage -0.15544    0.04196  -3.704 0.000656 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1507 on 39 degrees of freedom
## Multiple R-squared:  0.6837, Adjusted R-squared:  0.6594 
## F-statistic:  28.1 on 3 and 39 DF,  p-value: 7.591e-10
cat("\n=== Senescence Index ===\n")
## 
## === Senescence Index ===
lm(senescence ~ treatment * leaf_stage, data = indices) %>%
  summary() %>%
  print()
## 
## Call:
## lm(formula = senescence ~ treatment * leaf_stage, data = indices)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.19688 -0.06820  0.01176  0.06429  0.19348 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             3.23068    0.05011  64.477  < 2e-16 ***
## treatment-P            -0.10184    0.07510  -1.356  0.18290    
## leaf_stage              0.03167    0.01830   1.731  0.09137 .  
## treatment-P:leaf_stage  0.08774    0.02790   3.144  0.00318 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1002 on 39 degrees of freedom
## Multiple R-squared:  0.5489, Adjusted R-squared:  0.5142 
## F-statistic: 15.82 on 3 and 39 DF,  p-value: 6.946e-07

Main Figure: Gene Set Trajectories

This section creates the two-panel figure showing normalized gene set indices and individual gene expression patterns (spaghetti plots).

Prepare Data for Spaghetti Plots

Calculate gene-centered expression for all genes in each set to visualize individual gene trajectories.

# Prepare expression data for photosynthesis genes - centered per gene
xp_photo_genes <- intersect(
  custom_annotation$photosynthesis,
  rownames(normalized_expression)
)
xp_photosynthesis <- normalized_expression[xp_photo_genes, ] %>%
  as.data.frame() %>%
  tibble::rownames_to_column("gene") %>%
  pivot_longer(-gene, names_to = "sample", values_to = "expression") %>%
  mutate(
    leaf_stage = as.numeric(sub(".*L([0-9]+)[LR]$", "\\1", sample))
  ) %>%
  group_by(gene) %>%
  mutate(centered_expr = expression - mean(expression)) %>%
  ungroup() %>%
  group_by(gene, leaf_stage) %>%
  summarise(mean_expr = mean(centered_expr), .groups = "drop") %>%
  arrange(gene) %>%
  mutate(gene_set = "Photosynthesis")

# Prepare expression data for senescence genes - centered per gene
xp_senescence_genes <- intersect(
  custom_annotation$leaf_senescence,
  rownames(normalized_expression)
)
xp_senescence <- normalized_expression[xp_senescence_genes, ] %>%
  as.data.frame() %>%
  tibble::rownames_to_column("gene") %>%
  pivot_longer(-gene, names_to = "sample", values_to = "expression") %>%
  mutate(
    leaf_stage = as.numeric(sub(".*L([0-9]+)[LR]$", "\\1", sample))
  ) %>%
  group_by(gene) %>%
  mutate(centered_expr = expression - mean(expression)) %>%
  ungroup() %>%
  group_by(gene, leaf_stage) %>%
  summarise(mean_expr = mean(centered_expr), .groups = "drop") %>%
  arrange(gene) %>%
  mutate(gene_set = "Senescence")

cat("Photosynthesis genes:", length(xp_photo_genes), "\n")
## Photosynthesis genes: 525
cat("Senescence genes:", length(xp_senescence_genes), "\n")
## Senescence genes: 157

Panel A: Normalized Gene Set Indices

Normalize indices to 0-1 scale and fit linear models to quantify temporal trends.

# Normalize indices
indices_normalized <- indices %>%
  mutate(
    photosynthesis = (photosynthesis - min(photosynthesis)) /
      (max(photosynthesis) - min(photosynthesis)),
    senescence = (senescence - min(senescence)) /
      (max(senescence) - min(senescence))
  )

cat("=== Photosynthesis Index (Normalized) ===\n")
## === Photosynthesis Index (Normalized) ===
lm_photo <- lm(photosynthesis ~ leaf_stage, data = indices_normalized)
summary(lm_photo) %>% print()
## 
## Call:
## lm(formula = photosynthesis ~ leaf_stage, data = indices_normalized)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44587 -0.07017  0.00795  0.10089  0.19383 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.97702    0.04848  20.154  < 2e-16 ***
## leaf_stage  -0.13279    0.01794  -7.402 4.49e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1302 on 41 degrees of freedom
## Multiple R-squared:  0.572,  Adjusted R-squared:  0.5615 
## F-statistic: 54.79 on 1 and 41 DF,  p-value: 4.486e-09
cat("\n=== Senescence Index (Normalized) ===\n")
## 
## === Senescence Index (Normalized) ===
lm_sen <- lm(senescence ~ leaf_stage, data = indices_normalized)
summary(lm_sen) %>% print()
## 
## Call:
## lm(formula = senescence ~ leaf_stage, data = indices_normalized)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45703 -0.15322  0.00116  0.11296  0.49468 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.15554    0.07944   1.958 0.057056 .  
## leaf_stage   0.11659    0.02940   3.966 0.000286 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2134 on 41 degrees of freedom
## Multiple R-squared:  0.2773, Adjusted R-squared:  0.2597 
## F-statistic: 15.73 on 1 and 41 DF,  p-value: 0.0002864
# Extract regression statistics
photo_stats <- summary(lm_photo)
sen_stats <- summary(lm_sen)

photo_r2 <- photo_stats$r.squared
photo_p <- coef(photo_stats)["leaf_stage", "Pr(>|t|)"]

sen_r2 <- sen_stats$r.squared
sen_p <- coef(sen_stats)["leaf_stage", "Pr(>|t|)"]

# Format p-values
format_pval <- function(p) {
  if (p < 0.001) return("p < 0.001")
  if (p < 0.01) return(sprintf("p = %.3f", p))
  return(sprintf("p = %.2f", p))
}

# Create annotation text
stats_text <- sprintf(
  "R² = %.3f\n %s\n\n\nR² = %.3f\n %s",
  photo_r2, format_pval(photo_p),
  sen_r2, format_pval(sen_p)
)

# Summary statistics for normalized indices
indices_summary <- indices_normalized %>%
  group_by(leaf_stage) %>%
  summarise(
    photo_mean = mean(photosynthesis),
    photo_se = sd(photosynthesis) / sqrt(n()),
    senescence_mean = mean(senescence),
    senescence_se = sd(senescence) / sqrt(n()),
    .groups = "drop"
  ) %>%
  pivot_longer(
    cols = c(photo_mean, senescence_mean),
    names_to = "index_type",
    values_to = "mean"
  ) %>%
  mutate(
    se = ifelse(index_type == "photo_mean", photo_se, senescence_se),
    index_type = factor(
      index_type,
      levels = c("photo_mean", "senescence_mean"),
      labels = c("Photosynthesis", "Leaf\nSenescence")
    )
  )

# Normalized trajectories plot
base_size <- 30
base_family <- "Helvetica"

p_normalized <- ggplot(
  indices_summary,
  aes(x = leaf_stage, y = mean, color = index_type)
) +
  geom_line(linewidth = 1.5) +
  geom_point(size = 4) +
  geom_errorbar(
    aes(ymin = mean - se, ymax = mean + se),
    width = 0.2,
    linewidth = 1
  ) +
  scale_color_manual(
    values = c("Photosynthesis" = "darkgreen", "Leaf\nSenescence" = "orange")
  ) +
  annotate(
    "text",
    x = 1.0,
    y = 0.45,
    label = stats_text,
    hjust = 0,
    vjust = 0,
    size = 4
  ) +
  labs(
    x = "Leaf",
    y = "Normalized Expression Index",
    color = NULL
  ) +
  ggpubr::theme_classic2(base_size = base_size) +
  theme(
    legend.position = c(0.6, 0.9),
    legend.text = element_text(size = 20),
    legend.background = element_rect(fill = "transparent"),
    legend.key = element_blank(),
  # plot.margin = margin(5.5, 7, 70, 5.5, "pt")
    plot.margin = margin(5.5, 5.5, 50, 7, "pt")
  )

print(p_normalized)

Panel B: Spaghetti Plot

Show individual gene trajectories (centered expression) with linear trends for each gene set.

p_spaghetti <- xp_photosynthesis %>%
  arrange(gene_set) %>%
  ggplot(aes(x = leaf_stage, y = mean_expr, group = gene) ) +
  geom_line(
    alpha = 0.1,
    linewidth = 0.5,
    color = "darkgreen"
  ) +
  geom_smooth(
    data = xp_photosynthesis,
    aes(x = leaf_stage, y = mean_expr, group = 1),
    method = "lm",
    se = FALSE,
    linewidth = 2,
    color = "darkgreen"
  ) +
   geom_line(
   data= xp_senescence,
   aes(x = leaf_stage, y = mean_expr, group = gene),
    alpha = 0.1,
    linewidth = 0.5,
    color = "orange"
  )  +
    geom_smooth(
    data = xp_senescence,
    aes(x = leaf_stage, y = mean_expr, group = 1),
    method = "lm",
    se = FALSE,
    linewidth = 2,
    color = "orange"
  ) +
  labs(
    x = "Leaf",
    y = expression("Centered " * log[10] * "(CPM)")
  ) +
  coord_cartesian(ylim = c(-1, 1)) +
  scale_y_continuous(position = "right") +
  ggpubr::theme_classic2(base_size = base_size) +
  theme(
    axis.title.y.right = element_text(margin = margin(l = -5)),
    plot.margin = margin(5.5, 7, 50, 5.5, "pt")
  )

print(p_spaghetti)

Combine Top Panels

Create the two-panel figure showing both normalized indices and spaghetti plots side-by-side.

p_top_panel <- ggarrange(
  p_normalized + rremove("xlab"),
  p_spaghetti + rremove("xlab"), ncol = 2) %>%
  annotate_figure(
      bottom = text_grob("Leaf",  size = base_size,vjust = -2),
    )

Marker Gene Validation

Select representative marker gene pairs to validate the broader gene set patterns. Each pair contains one photosynthesis and one senescence gene.

Define and Extract Marker Genes

# Define marker gene pairs (photosynthesis, senescence)
marker_genes <- data.frame(
  locus_label = c("pep1", "salt1", "ssu1", "mir3",
                  "nye2", "sgrl1", "cab1", "dapat3"),
  gene_set = c("Photosynthesis", "Senescence",
                "Photosynthesis", "Senescence",
                "Senescence", "Senescence",
                "Photosynthesis", "Senescence"),
  pair = c("Pair1", "Pair1", "Pair2", "Pair2",
           "Pair3", "Pair3", "Pair4", "Pair4"),
  stringsAsFactors = FALSE
)

# Extract gene IDs for each locus
gene_ids <- effects_df %>%
  filter(locus_label %in% marker_genes$locus_label, !is.na(locus_label)) %>%
  select(gene, locus_label, desc_merged) %>%
  distinct() %>%
  group_by(locus_label) %>%
  slice(1) %>%
  ungroup()

# Extract expression data for all marker genes
marker_data <- lapply(seq_len(nrow(gene_ids)), function(i) {
  g <- gene_ids$gene[i]
  data.frame(
    sample = colnames(normalized_expression),
    expression = normalized_expression[g, ],
    gene = g
  )
}) %>%
  bind_rows() %>%
  left_join(gene_ids, by = "gene") %>%
  left_join(marker_genes, by = "locus_label") %>%
  mutate(
    leaf_stage = as.numeric(sub(".*L([0-9]+)[LR]$", "\\1", sample))
  )

# Calculate summary statistics (across all samples, no treatment split)
marker_summary <- marker_data %>%
  group_by(gene, locus_label, gene_set, desc_merged, pair, leaf_stage) %>%
  summarise(
    mean_expr = mean(expression),
    se_expr = sd(expression) / sqrt(n()),
    .groups = "drop"
  )

Sort Pairs by Expression Level

Order gene pairs from highest to lowest maximum expression to highlight the most strongly expressed markers.

# Calculate maximum expression per pair for sorting
pair_max_expr <- marker_summary %>%
  group_by(pair) %>%
  summarise(max_expr = max(mean_expr), .groups = "drop") %>%
  arrange(desc(max_expr))

# Create colored HTML labels for each gene
marker_summary_colored <- marker_summary %>%
  select(pair, locus_label, desc_merged, gene_set) %>%
  distinct() %>%
  mutate(
    # Wrap text
    wrapped_text = stringr::str_wrap(desc_merged, width = 25),
    # Replace \n with <br>
    with_br = gsub("\n", "<br>", wrapped_text),
    # Add color tags based on gene set
    with_color = case_when(
      gene_set == "Photosynthesis" ~ 
        paste0("<span style='color:darkgreen'>", with_br, "</span>"),
      gene_set == "Senescence" ~ 
        paste0("<span style='color:orange'>", with_br, "</span>"),
      TRUE ~ with_br
    ),
    # Create sort order: Photosynthesis = 1, Senescence = 2
    sort_order = ifelse(gene_set == "Photosynthesis", 1, 2)
  )

# Create facet labels by combining both genes in each pair
# Sort so Photosynthesis appears first (will be on top)
pair_labels <- marker_summary_colored %>%
  arrange(pair, sort_order) %>%  # Sort by pair, then by sort_order
  group_by(pair) %>%
  summarise(
    facet_label = paste(with_color, collapse = "<br><br>"),
    .groups = "drop"
  ) %>%
  left_join(pair_max_expr, by = "pair") %>%
  arrange(desc(max_expr))

# Apply sorted factor levels
marker_summary <- marker_summary %>%
  left_join(
    pair_labels %>% select(pair, facet_label, max_expr),
    by = "pair"
  ) %>%
  mutate(
    facet_label = factor(facet_label, levels = pair_labels$facet_label)
  )

Then update the plot theme:

p_markers <- ggplot(
  marker_summary,
  aes(x = leaf_stage, y = mean_expr, color = gene_set, group = gene)
) +
  geom_line(linewidth = 1.5) +
  geom_point(size = 5) +
  geom_errorbar(
    aes(ymin = mean_expr - se_expr, ymax = mean_expr + se_expr),
    width = 0.2,
    linewidth = 0.8
  ) +
  geom_text(
    data = marker_summary %>% filter(leaf_stage == 1),
    aes(x = leaf_stage, 
        y = mean_expr,
        label = locus_label, 
        color = gene_set),
    nudge_x = -0.5,  # Nudge slightly to the left of the point
    hjust = 1,        # Right-align the text (so it sits to the left of the point)
    fontface = "bold",
    size = 5
  ) +
  scale_color_manual(
    values = c("Photosynthesis" = "darkgreen", "Senescence" = "orange")
  ) +
  scale_x_continuous(expand = expansion(add = c(1.5, 0.2)),
                     breaks=1:4) +
  facet_wrap(
    ~ facet_label,
    nrow = 2,
    scales = "free_y"
  ) +
  labs(
    x = "Leaf",
    y = expression(log[10] * "(CPM)"),
    color = NULL
  ) +
  theme_classic(base_size = base_size ) +
  theme(
    strip.background = element_blank(),
    strip.text = element_markdown(size = 15, hjust = 0,face = "bold"),
    # plot.margin = margin(5.5, 7, 50, 5.5, "pt")
    plot.margin = margin(-50, 90, 5.5, 5.5, "pt"),
    axis.title.y = element_text(margin = margin(r = -10)),
    panel.spacing.x = unit(0, "cm"),
    panel.spacing.y = unit(0, "cm"),
    legend.position = "none"
  )
p_markers

Combine All Panels for Main Figure Create final composite figure with gene set indices (top) and marker genes (bottom).

left_panel <- cowplot::plot_grid(
  p_top_panel, p_markers,
  ncol = 1,
  align='h',
  axis='lr'
  # labels = LETTERS[1:2],
  # label_size = 45
)


ggsave(
  "~/Desktop/left_panel.png",
  plot = left_panel,
  width = 9,
  height = 15,
  dpi = 150
)
print(left_panel)

Supplementary Visualizations

Individual Index Plots

Compare raw index values across leaf stages and treatments.

p1 <- indices %>%
  ggplot(aes(
    y = photosynthesis,
    x = leaf_stage,
    fill = factor(leaf_stage),
    shape = treatment
  )) +
  geom_point(size = 3, color = "black") +
  scale_shape_manual(values = c(21, 25)) +
  scale_fill_viridis_d() +
  labs(
    x = "Leaf",
    y = "Photosynthesis Index",
    fill = "Leaf",
    shape = "Treatment"
  ) +
  theme_classic(base_size = 14)

p2 <- indices %>%
  ggplot(aes(
    y = senescence,
    x = leaf_stage,
    fill = factor(leaf_stage),
    shape = treatment
  )) +
  geom_point(size = 3, color = "black") +
  scale_shape_manual(values = c(21, 25)) +
  scale_fill_viridis_d() +
  labs(
    x = "Leaf",
    y = "Senescence Index",
    fill = "Leaf",
    shape = "Treatment"
  ) +
  theme_classic(base_size = 14)

p_individual <- ggarrange(p1, p2, ncol = 2, 
                          common.legend = TRUE, legend = "right")
print(p_individual)

Bivariate Relationship

Examine the correlation between photosynthesis and senescence indices.

p_bivariate <- indices %>%
  ggplot(aes(
    x = photosynthesis,
    y = senescence,
    fill = factor(leaf_stage),
    shape = factor(treatment)
  )) +
  geom_point(size = 3, color = "black") +
  scale_fill_viridis_d() +
  scale_shape_manual(values = c(21, 25)) +
  guides(fill = guide_legend(override.aes = list(shape = 21))) +
  labs(
    x = "Photosynthesis Index",
    y = "Senescence Index",
    fill = "Leaf",
    shape = "Treatment"
  ) +
  theme_classic(base_size = 14)

print(p_bivariate)

Gene Set Enrichment Analysis

Test whether our custom gene sets are enriched in differentially expressed genes for leaf stage and phosphorus treatment effects.

Enrichment Function

#' Fisher's Exact Test for Custom Gene Set Enrichment
#'
#' @param deg_lists Named list of DEG vectors by cluster
#' @param annotation Named list of gene sets
#' @param universe Background gene set
#' @return Data frame with enrichment results
test_custom_enrichment <- function(deg_lists, annotation, universe) {
  annotation_filtered <- lapply(annotation, function(x) intersect(x, universe))
  annotation_filtered <- annotation_filtered[
    sapply(annotation_filtered, length) > 0
  ]
  
  results <- list()
  
  for (cluster_name in names(deg_lists)) {
    deg_genes <- deg_lists[[cluster_name]]
    
    for (term_name in names(annotation_filtered)) {
      term_genes <- annotation_filtered[[term_name]]
      overlap <- intersect(deg_genes, term_genes)
      
      in_both <- length(overlap)
      in_deg_only <- length(setdiff(deg_genes, term_genes))
      in_term_only <- length(setdiff(term_genes, deg_genes))
      in_neither <- length(universe) - in_both - in_deg_only - in_term_only
      
      contingency <- matrix(
        c(in_both, in_deg_only, in_term_only, in_neither),
        nrow = 2
      )
      fisher_result <- fisher.test(contingency, alternative = "greater")
      
      results[[length(results) + 1]] <- data.frame(
        Cluster = cluster_name,
        ID = term_name,
        Description = term_name,
        GeneRatio = paste0(in_both, "/", length(deg_genes)),
        BgRatio = paste0(length(term_genes), "/", length(universe)),
        pvalue = fisher_result$p.value,
        geneID = paste(overlap, collapse = "/"),
        Count = in_both,
        stringsAsFactors = FALSE
      )
    }
  }
  
  results_df <- do.call(rbind, results)
  results_df$p.adjust <- p.adjust(results_df$pvalue, method = "BH")
  
  results_df %>% arrange(p.adjust)
}

Run Enrichment Tests

# Prepare DEG lists
DEGs <- with(effects_df %>% filter(is_hiconf_DEG), {
  list(
    Leaf.Down = unique(gene[
      predictor == "Leaf" & regulation == "Downregulated"
    ]),
    Leaf.Up = unique(gene[predictor == "Leaf" & regulation == "Upregulated"]),
    `-P.Down` = unique(gene[
      predictor == "-P" & regulation == "Downregulated"
    ]),
    `-P.Up` = unique(gene[predictor == "-P" & regulation == "Upregulated"])
  )
})

custom_enrichment_results <- test_custom_enrichment(
  DEGs,
  custom_annotation,
  universe
) %>%
  filter(p.adjust < 0.05)

custom_enrichment_results %>%
  select(-geneID) %>%
  print()
##   Cluster              ID     Description GeneRatio    BgRatio       pvalue
## 1 Leaf.Up         natural         natural   175/634 3284/24011 7.312839e-21
## 2 Leaf.Up natural_induced natural_induced    83/634  991/24011 8.520661e-21
## 3   -P.Up natural_induced natural_induced    83/661  991/24011 1.205018e-19
## 4   -P.Up         natural         natural   161/661 3284/24011 6.002531e-14
## 5 Leaf.Up             sag             sag     8/634   40/24011 8.242430e-06
## 6 Leaf.Up       staygreen       staygreen    32/634  572/24011 6.252709e-05
## 7   -P.Up       staygreen       staygreen    27/661  572/24011 4.943556e-03
## 8   -P.Up             sag             sag     5/661   40/24011 4.620103e-03
##   Count     p.adjust
## 1   175 1.022479e-19
## 2    83 1.022479e-19
## 3    83 9.640141e-19
## 4   161 3.601519e-13
## 5     8 3.956366e-05
## 6    32 2.501084e-04
## 7    27 1.483067e-02
## 8     5 1.483067e-02

Gene-Level Statistics

Extract detailed statistics for genes in enriched terms.

#' Extract DEG Stats for Custom Annotation
#'
#' @param annotation_id Term ID to filter
#' @param gene_term_df Data frame with gene-term mappings
#' @param effects_data DEG results
#' @param filter_hiconf Filter to high-confidence DEGs
#' @return Tibble with DEG statistics
get_annotation_degs <- function(annotation_id,
                                gene_term_df,
                                effects_data,
                                filter_hiconf = TRUE) {
  genes_in_category <- gene_term_df %>%
    filter(ID == annotation_id) %>%
    pull(gene)
  
  result <- effects_data %>%
    filter(gene %in% genes_in_category)
  
  if (filter_hiconf) {
    result <- result %>% filter(is_hiconf_DEG)
  }
  
  result %>%
    select(
      predictor:gene, locus_label, desc_merged,
      logFC, neglogP, mahalanobis
    ) %>%
    group_by(predictor, regulation) %>%
    arrange(predictor, regulation, -mahalanobis) %>%
    tibble()
}

# Extract genes from enriched terms
genes_in_custom_terms <- custom_enrichment_results %>%
  separate_longer_delim(geneID, delim = "/") %>%
  rename(gene = "geneID") %>%
  select(Cluster, ID, Description, gene) %>%
  inner_join(
    effects_df %>% select(gene, locus_label, desc_merged) %>% distinct(),
    by = "gene"
  )

# Example: Natural senescence genes
get_annotation_degs(
  annotation_id = "natural",
  gene_term_df = genes_in_custom_terms,
  effects_data = effects_df,
  filter_hiconf = TRUE
) %>%
  filter(predictor == "Leaf") %>%
  print(n = 50)
## # A tibble: 176 × 8
##    predictor regulation gene  locus_label desc_merged  logFC neglogP mahalanobis
##    <chr>     <chr>      <chr> <chr>       <chr>        <dbl>   <dbl>       <dbl>
##  1 Leaf      Downregul… Zm00… glp1        germin-lik… -0.787    4.28        30.0
##  2 Leaf      Upregulat… Zm00… osm1        osmotin-li…  3.19     4.03       382. 
##  3 Leaf      Upregulat… Zm00… nactf122    NAC-transc…  2.12     6.07       177. 
##  4 Leaf      Upregulat… Zm00… ox1         Peroxidase   2.02     5.41       160. 
##  5 Leaf      Upregulat… Zm00… pht2        phosphate …  1.97     6.67       157. 
##  6 Leaf      Upregulat… Zm00… <NA>        Indole-2-m…  1.91     7.53       152. 
##  7 Leaf      Upregulat… Zm00… <NA>        <NA>         1.92     4.22       142. 
##  8 Leaf      Upregulat… Zm00… <NA>        Ergosterol…  1.76     7.14       131. 
##  9 Leaf      Upregulat… Zm00… fad15       fatty acid…  1.76     6.39       127. 
## 10 Leaf      Upregulat… Zm00… <NA>        Outer arm …  1.66     8.84       127. 
## 11 Leaf      Upregulat… Zm00… fad17       fatty acid…  1.75     6.07       126. 
## 12 Leaf      Upregulat… Zm00… fomt4       flavonoid …  1.78     4.57       124. 
## 13 Leaf      Upregulat… Zm00… <NA>        Chemocyanin  1.75     4.20       119. 
## 14 Leaf      Upregulat… Zm00… prp11       pathogenes…  1.72     5.05       117. 
## 15 Leaf      Upregulat… Zm00… prt1        Putative c…  1.45    10.9        116. 
## 16 Leaf      Upregulat… Zm00… <NA>        Tryptamine…  1.67     4.62       110. 
## 17 Leaf      Upregulat… Zm00… <NA>        Germin-lik…  1.66     3.06       105. 
## 18 Leaf      Upregulat… Zm00… prp4        pathogenes…  1.54     6.77       103. 
## 19 Leaf      Upregulat… Zm00… <NA>        Cytochrome…  1.61     4.56       102. 
## 20 Leaf      Upregulat… Zm00… <NA>        Cellulose …  1.43     8.23        97.9
## 21 Leaf      Upregulat… Zm00… <NA>        Peptidase …  1.59     3.72        97.8
## 22 Leaf      Upregulat… Zm00… chn33       chitinase33  1.52     5.35        95.0
## 23 Leaf      Upregulat… Zm00… <NA>        Putative a…  1.54     4.39        93.4
## 24 Leaf      Upregulat… Zm00… <NA>        Mono-/di-a…  1.37     8.52        92.8
## 25 Leaf      Upregulat… Zm00… prp9        pathogenes…  1.51     5.10        92.7
## 26 Leaf      Upregulat… Zm00… <NA>        Carbohydra…  1.46     6.12        90.7
## 27 Leaf      Upregulat… Zm00… <NA>        Armadillo    1.41     7.25        90.5
## 28 Leaf      Upregulat… Zm00… <NA>        Alpha-gluc…  1.44     5.91        87.6
## 29 Leaf      Upregulat… Zm00… pip2d       plasma mem…  1.50     3.55        87.5
## 30 Leaf      Upregulat… Zm00… <NA>        Probable p…  1.32     8.12        85.2
## 31 Leaf      Upregulat… Zm00… <NA>        Benzyl alc…  1.40     6.27        84.7
## 32 Leaf      Upregulat… Zm00… <NA>        <NA>         1.40     6.25        84.7
## 33 Leaf      Upregulat… Zm00… ks4         kaurene sy…  1.37     6.77        83.9
## 34 Leaf      Upregulat… Zm00… lbd41       LBD-transc…  1.46     3.58        83.2
## 35 Leaf      Upregulat… Zm00… <NA>        sphinganin…  1.40     5.74        82.7
## 36 Leaf      Upregulat… Zm00… <NA>        Cysteine-r…  1.40     5.25        81.3
## 37 Leaf      Upregulat… Zm00… <NA>        Epimerase …  1.32     6.72        79.1
## 38 Leaf      Upregulat… Zm00… nactf108    NAC-transc…  1.17     9.20        78.5
## 39 Leaf      Upregulat… Zm00… <NA>        <NA>         1.28     7.17        77.4
## 40 Leaf      Upregulat… Zm00… <NA>        Putative r…  1.40     3.49        76.7
## 41 Leaf      Upregulat… Zm00… <NA>        Receptor-l…  1.14     9.33        76.3
## 42 Leaf      Upregulat… Zm00… <NA>        Cysteine d…  1.08    10.00        76.0
## 43 Leaf      Upregulat… Zm00… <NA>        Bowman-Bir…  1.40     3.46        75.8
## 44 Leaf      Upregulat… Zm00… <NA>        Putrescine…  1.17     8.76        75.7
## 45 Leaf      Upregulat… Zm00… <NA>        Type IV in…  1.30     5.91        73.8
## 46 Leaf      Upregulat… Zm00… red1        Tropinone …  0.960   10.8         72.5
## 47 Leaf      Upregulat… Zm00… <NA>        Protein ki…  1.20     7.74        72.3
## 48 Leaf      Upregulat… Zm00… <NA>        Fucosyltra…  1.33     3.90        70.7
## 49 Leaf      Upregulat… Zm00… bhlh62      bHLH-trans…  1.18     7.71        70.4
## 50 Leaf      Upregulat… Zm00… chn17       chitinase17  1.24     6.43        70.0
## # ℹ 126 more rows

Export Results

write.csv(
  indices,
  file = "~/Desktop/expression_indices.csv",
  row.names = FALSE
)
write.csv(
  custom_enrichment_results,
  file = "~/Desktop/custom_enrichment_results.csv",
  row.names = FALSE
)

Session Info

sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggrepel_0.9.6          ggtext_0.1.2           clusterProfiler_4.17.0
## [4] tidyr_1.3.1            ggpubr_0.6.2           ggplot2_4.0.0         
## [7] dplyr_1.1.4           
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3      rstudioapi_0.17.1       jsonlite_2.0.0         
##   [4] magrittr_2.0.4          ggtangle_0.0.7          farver_2.1.2           
##   [7] rmarkdown_2.30          fs_1.6.6                ragg_1.5.0             
##  [10] vctrs_0.6.5             memoise_2.0.1           ggtree_3.99.2          
##  [13] rstatix_0.7.3           janitor_2.2.1           htmltools_0.5.8.1      
##  [16] broom_1.0.10            Formula_1.2-5           gridGraphics_0.5-1     
##  [19] sass_0.4.10             bslib_0.9.0             htmlwidgets_1.6.4      
##  [22] plyr_1.8.9              lubridate_1.9.4         cachem_1.1.0           
##  [25] commonmark_2.0.0        igraph_2.2.0            lifecycle_1.0.4        
##  [28] pkgconfig_2.0.3         Matrix_1.7-4            R6_2.6.1               
##  [31] fastmap_1.2.0           gson_0.1.0              snakecase_0.11.1       
##  [34] digest_0.6.37           aplot_0.2.9             enrichplot_1.29.3      
##  [37] patchwork_1.3.2         AnnotationDbi_1.71.2    S4Vectors_0.47.4       
##  [40] textshaping_1.0.4       RSQLite_2.4.3           labeling_0.4.3         
##  [43] timechange_0.3.0        httr_1.4.7              abind_1.4-8            
##  [46] mgcv_1.9-3              compiler_4.5.1          bit64_4.6.0-1          
##  [49] fontquiver_0.2.1        withr_3.0.2             S7_0.2.0               
##  [52] backports_1.5.0         BiocParallel_1.43.4     carData_3.0-5          
##  [55] DBI_1.2.3               R.utils_2.13.0          ggsignif_0.6.4         
##  [58] rappdirs_0.3.3          tools_4.5.1             ape_5.8-1              
##  [61] R.oo_1.27.1             glue_1.8.0              nlme_3.1-168           
##  [64] GOSemSim_2.35.2         gridtext_0.1.5          grid_4.5.1             
##  [67] reshape2_1.4.4          fgsea_1.35.8            generics_0.1.4         
##  [70] gtable_0.3.6            R.methodsS3_1.8.2       data.table_1.17.8      
##  [73] xml2_1.4.0              car_3.1-3               utf8_1.2.6             
##  [76] XVector_0.49.1          BiocGenerics_0.55.4     pillar_1.11.1          
##  [79] markdown_2.0            stringr_1.5.2           yulab.utils_0.2.1      
##  [82] splines_4.5.1           treeio_1.33.0           lattice_0.22-7         
##  [85] bit_4.6.0               tidyselect_1.2.1        fontLiberation_0.1.0   
##  [88] GO.db_3.22.0            Biostrings_2.77.2       knitr_1.50             
##  [91] fontBitstreamVera_0.1.1 gridExtra_2.3           litedown_0.7           
##  [94] IRanges_2.43.5          Seqinfo_0.99.2          stats4_4.5.1           
##  [97] xfun_0.53               Biobase_2.69.1          stringi_1.8.7          
## [100] lazyeval_0.2.2          ggfun_0.2.0             yaml_2.3.10            
## [103] evaluate_1.0.5          codetools_0.2-20        gdtools_0.4.4          
## [106] tibble_3.3.0            qvalue_2.41.0           ggplotify_0.1.3        
## [109] cli_3.6.5               systemfonts_1.3.1       jquerylib_0.1.4        
## [112] Rcpp_1.1.0              png_0.1-8               parallel_4.5.1         
## [115] blob_1.2.4              DOSE_4.3.0              viridisLite_0.4.2      
## [118] tidytree_0.4.6          ggiraph_0.9.2           scales_1.4.0           
## [121] purrr_1.1.0             crayon_1.5.3            rlang_1.1.6            
## [124] cowplot_1.2.0           fastmatch_1.1-6         KEGGREST_1.49.2