Overview

Purpose: GO over-representation analysis for differential expression across three experimental predictors.

Experimental predictors: - Leaf: Leaf development/senescence (FC threshold: ±0.7) - -P: Phosphorus deficiency (FC threshold: ±2) - Inv4m: Inversion genotype (FC threshold: ±2)

Outputs: - Full enrichment plot (all three predictors) - Focused plot (Leaf and -P only) - Venn diagrams comparing annotated vs all high-confidence DEGs - Gene-centric table of enriched terms

Libraries

library(dplyr)
library(ggplot2)
library(ggpubr)
library(clusterProfiler)
library(tidyr)
library(forcats)
library(ggvenn)

Data Import

# Load differential expression results
effects_df <- read.csv("~/Desktop/DEG_effects.csv")

# Load curated labels
curated <- read.csv("~/Desktop/selected_DEGs_curated_locus_label_2.csv") %>%
  select(gene, locus_label)

# Clean and coalesce labels
effects_df$locus_label[effects_df$locus_label == "A0A1D6NG77"] <- NA

effects_df <- effects_df %>%
  left_join(curated, by = "gene", suffix = c("", "_curated")) %>%
  mutate(locus_label = coalesce(locus_label_curated, locus_label)) %>%
  select(-locus_label_curated)

# Gene universe for enrichment
universe <- effects_df %>%
  filter(predictor == "Leaf") %>%
  pull(gene) %>%
  unique()

cat("Universe size:", length(universe), "genes\n")
## Universe size: 24011 genes

DEG Summary

deg_table <- with(
  effects_df %>% filter(is_DEG),
  table(regulation, predictor)
)

print(addmargins(deg_table))
##                predictor
## regulation         -P -P:Inv4m Inv4m  Leaf   Sum
##   Downregulated  4996        3   397  6861 12257
##   Upregulated    5579        0   583  7393 13555
##   Sum           10575        3   980 14254 25812

GO Enrichment Analysis

Load GO Annotations

TERM2NAME <- readRDS(
  "/Users/fvrodriguez/Desktop/GOMAP_maize_B73_NAM5/TERM2NAME.rds"
)

TERM2GENE <- readRDS(
  "/Users/fvrodriguez/Desktop/GOMAP_maize_B73_NAM5/TERM2GENE_Fattel_2024_full.rds"
)

Enrichment Function

#' Perform GO Enrichment Analysis
#'
#' @param x Character vector of gene IDs
#' @param ontology GO ontology ("BP", "MF", or "CC")
#' @return enrichResult object
ego <- function(x, ontology = "BP") {
  enricher(
    gene = x,
    pvalueCutoff = 0.05,
    pAdjustMethod = "none",
    universe = universe,
    minGSSize = 10,
    maxGSSize = 500,
    qvalueCutoff = 0.2,
    TERM2GENE = TERM2GENE[TERM2GENE$GO %in% TERM2NAME[[ontology]]$go_id, ],
    TERM2NAME = TERM2NAME[[ontology]]
  )
}

Prepare DEG Lists

DEGs <- with(effects_df %>% filter(is_hiconf_DEG), {
  list(
    Leaf.Down = unique(gene[predictor == "Leaf" & regulation == "Downregulated"]),
    Leaf.Up = unique(gene[predictor == "Leaf" & regulation == "Upregulated"]),
    `-P.Down` = unique(gene[predictor == "-P" & regulation == "Downregulated"]),
    `-P.Up` = unique(gene[predictor == "-P" & regulation == "Upregulated"]),
    inv4m.Down = unique(gene[predictor == "Inv4m" & regulation == "Downregulated"]),
    inv4m.Up = unique(gene[predictor == "Inv4m" & regulation == "Upregulated"])
  )
})

n_genes <- sapply(DEGs, length)
cluster_order <- names(DEGs)

print(n_genes)
##  Leaf.Down    Leaf.Up    -P.Down      -P.Up inv4m.Down   inv4m.Up 
##        468        634        129        661         70         69

Run Enrichment

compGO_BP <- compareCluster(geneCluster = DEGs, fun = ego)

levels(compGO_BP@compareClusterResult$Cluster) <- cluster_order

cat("Enriched terms per cluster:\n")
## Enriched terms per cluster:
print(table(compGO_BP@compareClusterResult$Cluster))
## 
##  Leaf.Down    Leaf.Up    -P.Down      -P.Up inv4m.Down   inv4m.Up 
##         88        152          0        134          1          2

Filter to GO Slim

slim <- read.csv("~/slim_to_plot.csv")

slim_comp <- compGO_BP
slim_comp@compareClusterResult <- slim_comp@compareClusterResult %>%
  filter(ID %in% slim$ID)

cat("Curated terms per cluster:\n")
## Curated terms per cluster:
print(table(slim_comp@compareClusterResult$Cluster))
## 
##  Leaf.Down    Leaf.Up    -P.Down      -P.Up inv4m.Down   inv4m.Up 
##          3          9          0          8          0          0

Visualization

Identify Annotated Terms

with_known_gene <- slim_comp@compareClusterResult %>%
  separate_longer_delim(geneID, delim = "/") %>%
  rename(gene = "geneID") %>%
  inner_join(
    effects_df %>%
      select(gene, locus_label, desc_merged) %>%
      distinct(),
    relationship = "many-to-many"
  ) %>%
  filter(!is.na(locus_label) & !is.na(desc_merged)) %>%
  pull(ID) %>%
  unique()

cat("GO terms with annotated genes:", length(with_known_gene), "\n")
## GO terms with annotated genes: 11

Select Top Terms

sorted_by_p <- slim_comp@compareClusterResult %>%
  filter(ID %in% with_known_gene) %>%
  mutate(
    neglogP = -log10(p.adjust),
    Cluster = factor(Cluster, levels = cluster_order)
  ) %>%
  group_by(Cluster) %>%
  arrange(Cluster, p.adjust) %>%
  slice(1:7) %>%
  mutate(
    label_order = 1:n(),
    Description = fct_reorder(Description, label_order)
  )

top_terms <- levels(sorted_by_p$Description)

Prepare Plot Data

to_plot <- slim_comp@compareClusterResult %>%
  filter(ID %in% with_known_gene, Description %in% top_terms) %>%
  mutate(
    neglogP = -log10(p.adjust),
    Cluster = factor(Cluster, levels = cluster_order),
    Description = factor(Description, levels = top_terms)
  ) %>%
  group_by(Cluster) %>%
  arrange(Cluster, p.adjust) %>%
  mutate(
    label_order = 1:n(),
    Description = fct_reorder(Description, label_order)
  )

# Gene metadata
effects_gene_label <- effects_df %>%
  filter(is_hiconf_DEG) %>%
  mutate(Cluster = case_when(
    regulation == "Upregulated" ~ paste(predictor, "Up", sep = "."),
    regulation == "Downregulated" ~ paste(predictor, "Down", sep = ".")
  )) %>%
  select(
    predictor, Cluster, gene, locus_label, locus_symbol,
    desc_merged, logFC, P = adj.P.Val, mahalanobis
  ) %>%
  filter(!is.na(locus_label) & !is.na(Cluster)) %>%
  mutate(neglogPG = -log10(P))

# Top gene per term
term2gene_label <- to_plot %>%
  filter(ID %in% with_known_gene) %>%
  separate_longer_delim(geneID, delim = "/") %>%
  rename(gene = "geneID") %>%
  inner_join(effects_gene_label, relationship = "many-to-many") %>%
  select(
    Cluster, gene, locus_label, ID, Description,
    p.adjust, Count, mahalanobis, logFC, P, neglogPG
  ) %>%
  distinct() %>%
  group_by(Cluster, ID) %>%
  arrange(Cluster, ID, -mahalanobis) %>%
  slice(1)

levels(to_plot$Cluster) <- cluster_order
levels(term2gene_label$Cluster) <- cluster_order

Full Plot (All Predictors)

labels <- paste0(
  gsub(".", " ", cluster_order, fixed = TRUE),
  "\n(", n_genes, ")"
)
names(labels) <- names(n_genes)

bg <- to_plot %>%
  ggplot(aes(x = Cluster, y = Description, size = Count, fill = neglogP)) +
  ggtitle("GO Biological Process Annotation of DEGs") +
  scale_fill_continuous(
    low = "dodgerblue",
    high = "tomato",
    name = "-log10(FDR)"
  ) +
  scale_y_discrete(limits = rev(levels(to_plot$Description))) +
  scale_x_discrete(labels = labels, drop = FALSE) +
  geom_point(shape = 21) +
  scale_size(range = c(2, 8)) +
  ylab(NULL) +
  xlab(NULL) +
  DOSE::theme_dose(font.size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 20),
    legend.position = c(0.87, 0.67),
    legend.box = "horizontal",
    axis.text.x = element_text(vjust = 0.5)
  )

annotation_plot <- bg +
  geom_text(
    data = term2gene_label,
    position = position_nudge(0.1),
    hjust = 0,
    vjust = 0.5,
    fontface = "italic",
    mapping = aes(x = Cluster, y = Description, label = locus_label),
    inherit.aes = FALSE
  )

print(annotation_plot)

Gene-Centric Table

genes_in_enriched_terms <- to_plot %>%
  separate_longer_delim(geneID, delim = "/") %>%
  rename(gene = "geneID") %>%
  select(Cluster, ID, Description, gene)

cluster_codes <- c(
  "Leaf.Down" = "LD", "Leaf.Up" = "LU",
  "-P.Down" = "PD", "-P.Up" = "PU",
  "inv4m.Down" = "ID", "inv4m.Up" = "IU"
)

gene_summary <- genes_in_enriched_terms %>%
  group_by(gene) %>%
  summarise(
    n_terms = n_distinct(ID),
    cluster_string = paste(
      sort(unique(cluster_codes[as.character(Cluster)])),
      collapse = ""
    ),
    go_categories = paste(
      sort(unique(Description)),
      collapse = "; "
    ),
    .groups = "drop"
  )

curated_genes <- effects_gene_label %>%
  filter(predictor != "Inv4m", Cluster != "-P.Down") %>%
  group_by(Cluster) %>%
  mutate(relative_mahalanobis = mahalanobis / max(mahalanobis, na.rm = TRUE)) %>%
  ungroup() %>%
  inner_join(gene_summary, by = "gene") %>%
  group_by(cluster_string, gene) %>%
  arrange(cluster_string, gene, -relative_mahalanobis) %>%
  slice(1) %>%
  ungroup() %>%
  select(-predictor, -Cluster) %>%
  select(cluster_string, locus_label, desc_merged, everything())

cat("Total genes in enriched terms:", nrow(curated_genes), "\n")
## Total genes in enriched terms: 104
cat("Terms per gene range:", range(curated_genes$n_terms), "\n")
## Terms per gene range: 1 4

Venn Diagrams

Prepare Gene Sets

hiconf_leaf <- effects_df %>%
  filter(is_hiconf_DEG, predictor == "Leaf") %>%
  pull(gene) %>%
  unique()

hiconf_p <- effects_df %>%
  filter(is_hiconf_DEG, predictor == "-P") %>%
  pull(gene) %>%
  unique()

annotated_genes <- compGO_BP@compareClusterResult %>%
  separate_longer_delim(geneID, delim = "/") %>%
  pull(geneID) %>%
  unique()

annotated_leaf <- intersect(hiconf_leaf, annotated_genes)
annotated_p <- intersect(hiconf_p, annotated_genes)

cat("High-confidence DEGs:\n")
## High-confidence DEGs:
cat("  Leaf:", length(hiconf_leaf), "| -P:", length(hiconf_p), "\n")
##   Leaf: 1102 | -P: 790
cat("Annotated DEGs:\n")
## Annotated DEGs:
cat("  Leaf:", length(annotated_leaf), "| -P:", length(annotated_p), "\n")
##   Leaf: 440 | -P: 295

Create Venn Diagrams

p1 <- ggvenn(
  list(Leaf = annotated_leaf, `-P` = annotated_p),
  fill_color = c("#0073C2FF", "#EFC000FF"),
  stroke_size = 0.5,
  set_name_size = 5,
  text_size = 4
) +
  ggtitle("Annotated High Confidence DEGs") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))

p2 <- ggvenn(
  list(Leaf = hiconf_leaf, `-P` = hiconf_p),
  fill_color = c("#0073C2FF", "#EFC000FF"),
  stroke_size = 0.5,
  set_name_size = 5,
  text_size = 4
) +
  ggtitle("High Confidence DEGs") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))

combined_venn <- ggarrange(p1, p2, ncol = 2, nrow = 1)
print(combined_venn)

Statistical Test

dual_annotated <- length(intersect(annotated_leaf, annotated_p))
single_annotated <- length(union(
  setdiff(annotated_leaf, annotated_p),
  setdiff(annotated_p, annotated_leaf)
))

dual_all <- length(intersect(hiconf_leaf, hiconf_p))
single_all <- length(union(
  setdiff(hiconf_leaf, hiconf_p),
  setdiff(hiconf_p, hiconf_leaf)
))

contingency <- matrix(
  c(dual_annotated, single_annotated, dual_all, single_all),
  nrow = 2,
  ncol = 2,
  byrow = TRUE,
  dimnames = list(
    Status = c("Annotated", "All"),
    Response = c("Dual", "Single")
  )
)

fisher_result <- fisher.test(contingency, alternative = "greater")

cat("\n--- Fisher's Exact Test ---\n")
## 
## --- Fisher's Exact Test ---
print(contingency)
##            Response
## Status      Dual Single
##   Annotated  181    373
##   All        366   1160
cat("OR =", round(fisher_result$estimate, 2),
    "| P =", format.pval(fisher_result$p.value, digits = 2), "\n")
## OR = 1.54 | P = 5.4e-05

Focused Analysis: Leaf and -P

DEGs_leaf_p <- DEGs[!grepl("inv4m", names(DEGs))]
n_genes_leaf_p <- sapply(DEGs_leaf_p, length)
cluster_order_leaf_p <- cluster_order[!grepl("inv4m", cluster_order)]

to_plot_leaf_p <- to_plot %>%
  filter(!grepl("inv4m", Cluster)) %>%
  mutate(Cluster = factor(Cluster, levels = cluster_order_leaf_p))

term2gene_label_leaf_p <- term2gene_label %>%
  filter(!grepl("inv4m", Cluster)) %>%
  mutate(Cluster = factor(Cluster, levels = cluster_order_leaf_p)) %>%
  droplevels()

cat("Gene counts (Leaf/-P):\n")
## Gene counts (Leaf/-P):
print(n_genes_leaf_p)
## Leaf.Down   Leaf.Up   -P.Down     -P.Up 
##       468       634       129       661
labels_leaf_p <- paste0(
  gsub(".", " ", cluster_order_leaf_p, fixed = TRUE),
  "\n(", n_genes_leaf_p, ")"
)
names(labels_leaf_p) <- names(n_genes_leaf_p)

bg_leaf_p <- to_plot_leaf_p %>%
  ggplot(aes(x = Cluster, y = Description, size = Count, fill = neglogP)) +
  ggtitle("GO Biological Process Annotation of DEGs") +
  scale_fill_continuous(
    low = "dodgerblue",
    high = "tomato",
    name = "-log10(FDR)"
  ) +
  scale_y_discrete(limits = rev(levels(to_plot_leaf_p$Description))) +
  scale_x_discrete(labels = labels_leaf_p, drop = FALSE) +
  geom_point(shape = 21) +
  scale_size(range = c(2, 8)) +
  ylab(NULL) +
  xlab(NULL) +
  DOSE::theme_dose(font.size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 20),
    legend.position = c(0.15, 0.2),
    legend.box = "horizontal",
    axis.text.x = element_text(vjust = 0.5)
  )

annotation_plot_leaf_p <- bg_leaf_p +
  geom_text(
    data = term2gene_label_leaf_p,
    position = position_nudge(0.1),
    hjust = 0,
    vjust = 0.5,
    fontface = "italic",
    mapping = aes(x = Cluster, y = Description, label = locus_label),
    inherit.aes = FALSE
  )

print(annotation_plot_leaf_p)

Export

ggsave(annotation_plot, file = "~/Desktop/FC_annotation_plot_all.svg",
       height = 6.75, width = 12)

ggsave(annotation_plot_leaf_p, file = "~/Desktop/FC_annotation_plot_leaf_P.svg",
       height = 6.75, width = 10)

ggsave(combined_venn, file = "~/Desktop/venn_hiconf_degs.svg",
       height = 6, width = 12)

write.csv(curated_genes, file = "~/Desktop/selected_degs_GO.csv", row.names = FALSE)

Session Info

sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggvenn_0.1.19          forcats_1.0.1          tidyr_1.3.1           
## [4] clusterProfiler_4.17.0 ggpubr_0.6.2           ggplot2_4.0.0         
## [7] dplyr_1.1.4           
## 
## loaded via a namespace (and not attached):
##   [1] DBI_1.2.3               gson_0.1.0              rlang_1.1.6            
##   [4] magrittr_2.0.4          DOSE_4.3.0              compiler_4.5.1         
##   [7] RSQLite_2.4.3           systemfonts_1.3.1       png_0.1-8              
##  [10] vctrs_0.6.5             reshape2_1.4.4          stringr_1.5.2          
##  [13] pkgconfig_2.0.3         crayon_1.5.3            fastmap_1.2.0          
##  [16] backports_1.5.0         XVector_0.49.1          labeling_0.4.3         
##  [19] rmarkdown_2.30          enrichplot_1.29.3       purrr_1.1.0            
##  [22] bit_4.6.0               xfun_0.53               cachem_1.1.0           
##  [25] aplot_0.2.9             jsonlite_2.0.0          blob_1.2.4             
##  [28] BiocParallel_1.43.4     broom_1.0.10            parallel_4.5.1         
##  [31] R6_2.6.1                bslib_0.9.0             stringi_1.8.7          
##  [34] RColorBrewer_1.1-3      car_3.1-3               jquerylib_0.1.4        
##  [37] GOSemSim_2.35.2         Rcpp_1.1.0              Seqinfo_0.99.2         
##  [40] knitr_1.50              ggtangle_0.0.7          R.utils_2.13.0         
##  [43] IRanges_2.43.5          Matrix_1.7-4            splines_4.5.1          
##  [46] igraph_2.2.0            tidyselect_1.2.1        qvalue_2.41.0          
##  [49] rstudioapi_0.17.1       abind_1.4-8             yaml_2.3.10            
##  [52] codetools_0.2-20        lattice_0.22-7          tibble_3.3.0           
##  [55] plyr_1.8.9              treeio_1.33.0           Biobase_2.69.1         
##  [58] withr_3.0.2             KEGGREST_1.49.2         S7_0.2.0               
##  [61] evaluate_1.0.5          gridGraphics_0.5-1      Biostrings_2.77.2      
##  [64] pillar_1.11.1           ggtree_3.99.2           carData_3.0-5          
##  [67] stats4_4.5.1            ggfun_0.2.0             generics_0.1.4         
##  [70] S4Vectors_0.47.4        scales_1.4.0            tidytree_0.4.6         
##  [73] glue_1.8.0              gdtools_0.4.4           lazyeval_0.2.2         
##  [76] tools_4.5.1             data.table_1.17.8       fgsea_1.35.8           
##  [79] ggsignif_0.6.4          ggiraph_0.9.2           fs_1.6.6               
##  [82] fastmatch_1.1-6         cowplot_1.2.0           grid_4.5.1             
##  [85] ape_5.8-1               AnnotationDbi_1.71.2    nlme_3.1-168           
##  [88] patchwork_1.3.2         Formula_1.2-5           cli_3.6.5              
##  [91] rappdirs_0.3.3          fontBitstreamVera_0.1.1 gtable_0.3.6           
##  [94] R.methodsS3_1.8.2       rstatix_0.7.3           yulab.utils_0.2.1      
##  [97] fontquiver_0.2.1        sass_0.4.10             digest_0.6.37          
## [100] BiocGenerics_0.55.4     ggrepel_0.9.6           ggplotify_0.1.3        
## [103] htmlwidgets_1.6.4       farver_2.1.2            memoise_2.0.1          
## [106] htmltools_0.5.8.1       R.oo_1.27.1             lifecycle_1.0.4        
## [109] httr_1.4.7              GO.db_3.22.0            fontLiberation_0.1.0   
## [112] bit64_4.6.0-1