Purpose: GO over-representation analysis for differential expression across three experimental predictors.
Experimental predictors: - Leaf: Leaf development/senescence (FC threshold: ±0.7) - -P: Phosphorus deficiency (FC threshold: ±2) - Inv4m: Inversion genotype (FC threshold: ±2)
Outputs: - Full enrichment plot (all three predictors) - Focused plot (Leaf and -P only) - Venn diagrams comparing annotated vs all high-confidence DEGs - Gene-centric table of enriched terms
library(dplyr)
library(ggplot2)
library(ggpubr)
library(clusterProfiler)
library(tidyr)
library(forcats)
library(ggvenn)
# Load differential expression results
effects_df <- read.csv("~/Desktop/DEG_effects.csv")
# Load curated labels
curated <- read.csv("~/Desktop/selected_DEGs_curated_locus_label_2.csv") %>%
select(gene, locus_label)
# Clean and coalesce labels
effects_df$locus_label[effects_df$locus_label == "A0A1D6NG77"] <- NA
effects_df <- effects_df %>%
left_join(curated, by = "gene", suffix = c("", "_curated")) %>%
mutate(locus_label = coalesce(locus_label_curated, locus_label)) %>%
select(-locus_label_curated)
# Gene universe for enrichment
universe <- effects_df %>%
filter(predictor == "Leaf") %>%
pull(gene) %>%
unique()
cat("Universe size:", length(universe), "genes\n")
## Universe size: 24011 genes
deg_table <- with(
effects_df %>% filter(is_DEG),
table(regulation, predictor)
)
print(addmargins(deg_table))
## predictor
## regulation -P -P:Inv4m Inv4m Leaf Sum
## Downregulated 4996 3 397 6861 12257
## Upregulated 5579 0 583 7393 13555
## Sum 10575 3 980 14254 25812
TERM2NAME <- readRDS(
"/Users/fvrodriguez/Desktop/GOMAP_maize_B73_NAM5/TERM2NAME.rds"
)
TERM2GENE <- readRDS(
"/Users/fvrodriguez/Desktop/GOMAP_maize_B73_NAM5/TERM2GENE_Fattel_2024_full.rds"
)
#' Perform GO Enrichment Analysis
#'
#' @param x Character vector of gene IDs
#' @param ontology GO ontology ("BP", "MF", or "CC")
#' @return enrichResult object
ego <- function(x, ontology = "BP") {
enricher(
gene = x,
pvalueCutoff = 0.05,
pAdjustMethod = "none",
universe = universe,
minGSSize = 10,
maxGSSize = 500,
qvalueCutoff = 0.2,
TERM2GENE = TERM2GENE[TERM2GENE$GO %in% TERM2NAME[[ontology]]$go_id, ],
TERM2NAME = TERM2NAME[[ontology]]
)
}
DEGs <- with(effects_df %>% filter(is_hiconf_DEG), {
list(
Leaf.Down = unique(gene[predictor == "Leaf" & regulation == "Downregulated"]),
Leaf.Up = unique(gene[predictor == "Leaf" & regulation == "Upregulated"]),
`-P.Down` = unique(gene[predictor == "-P" & regulation == "Downregulated"]),
`-P.Up` = unique(gene[predictor == "-P" & regulation == "Upregulated"]),
inv4m.Down = unique(gene[predictor == "Inv4m" & regulation == "Downregulated"]),
inv4m.Up = unique(gene[predictor == "Inv4m" & regulation == "Upregulated"])
)
})
n_genes <- sapply(DEGs, length)
cluster_order <- names(DEGs)
print(n_genes)
## Leaf.Down Leaf.Up -P.Down -P.Up inv4m.Down inv4m.Up
## 468 634 129 661 70 69
compGO_BP <- compareCluster(geneCluster = DEGs, fun = ego)
levels(compGO_BP@compareClusterResult$Cluster) <- cluster_order
cat("Enriched terms per cluster:\n")
## Enriched terms per cluster:
print(table(compGO_BP@compareClusterResult$Cluster))
##
## Leaf.Down Leaf.Up -P.Down -P.Up inv4m.Down inv4m.Up
## 88 152 0 134 1 2
slim <- read.csv("~/slim_to_plot.csv")
slim_comp <- compGO_BP
slim_comp@compareClusterResult <- slim_comp@compareClusterResult %>%
filter(ID %in% slim$ID)
cat("Curated terms per cluster:\n")
## Curated terms per cluster:
print(table(slim_comp@compareClusterResult$Cluster))
##
## Leaf.Down Leaf.Up -P.Down -P.Up inv4m.Down inv4m.Up
## 3 9 0 8 0 0
with_known_gene <- slim_comp@compareClusterResult %>%
separate_longer_delim(geneID, delim = "/") %>%
rename(gene = "geneID") %>%
inner_join(
effects_df %>%
select(gene, locus_label, desc_merged) %>%
distinct(),
relationship = "many-to-many"
) %>%
filter(!is.na(locus_label) & !is.na(desc_merged)) %>%
pull(ID) %>%
unique()
cat("GO terms with annotated genes:", length(with_known_gene), "\n")
## GO terms with annotated genes: 11
sorted_by_p <- slim_comp@compareClusterResult %>%
filter(ID %in% with_known_gene) %>%
mutate(
neglogP = -log10(p.adjust),
Cluster = factor(Cluster, levels = cluster_order)
) %>%
group_by(Cluster) %>%
arrange(Cluster, p.adjust) %>%
slice(1:7) %>%
mutate(
label_order = 1:n(),
Description = fct_reorder(Description, label_order)
)
top_terms <- levels(sorted_by_p$Description)
to_plot <- slim_comp@compareClusterResult %>%
filter(ID %in% with_known_gene, Description %in% top_terms) %>%
mutate(
neglogP = -log10(p.adjust),
Cluster = factor(Cluster, levels = cluster_order),
Description = factor(Description, levels = top_terms)
) %>%
group_by(Cluster) %>%
arrange(Cluster, p.adjust) %>%
mutate(
label_order = 1:n(),
Description = fct_reorder(Description, label_order)
)
# Gene metadata
effects_gene_label <- effects_df %>%
filter(is_hiconf_DEG) %>%
mutate(Cluster = case_when(
regulation == "Upregulated" ~ paste(predictor, "Up", sep = "."),
regulation == "Downregulated" ~ paste(predictor, "Down", sep = ".")
)) %>%
select(
predictor, Cluster, gene, locus_label, locus_symbol,
desc_merged, logFC, P = adj.P.Val, mahalanobis
) %>%
filter(!is.na(locus_label) & !is.na(Cluster)) %>%
mutate(neglogPG = -log10(P))
# Top gene per term
term2gene_label <- to_plot %>%
filter(ID %in% with_known_gene) %>%
separate_longer_delim(geneID, delim = "/") %>%
rename(gene = "geneID") %>%
inner_join(effects_gene_label, relationship = "many-to-many") %>%
select(
Cluster, gene, locus_label, ID, Description,
p.adjust, Count, mahalanobis, logFC, P, neglogPG
) %>%
distinct() %>%
group_by(Cluster, ID) %>%
arrange(Cluster, ID, -mahalanobis) %>%
slice(1)
levels(to_plot$Cluster) <- cluster_order
levels(term2gene_label$Cluster) <- cluster_order
labels <- paste0(
gsub(".", " ", cluster_order, fixed = TRUE),
"\n(", n_genes, ")"
)
names(labels) <- names(n_genes)
bg <- to_plot %>%
ggplot(aes(x = Cluster, y = Description, size = Count, fill = neglogP)) +
ggtitle("GO Biological Process Annotation of DEGs") +
scale_fill_continuous(
low = "dodgerblue",
high = "tomato",
name = "-log10(FDR)"
) +
scale_y_discrete(limits = rev(levels(to_plot$Description))) +
scale_x_discrete(labels = labels, drop = FALSE) +
geom_point(shape = 21) +
scale_size(range = c(2, 8)) +
ylab(NULL) +
xlab(NULL) +
DOSE::theme_dose(font.size = 14) +
theme(
plot.title = element_text(face = "bold", size = 20),
legend.position = c(0.87, 0.67),
legend.box = "horizontal",
axis.text.x = element_text(vjust = 0.5)
)
annotation_plot <- bg +
geom_text(
data = term2gene_label,
position = position_nudge(0.1),
hjust = 0,
vjust = 0.5,
fontface = "italic",
mapping = aes(x = Cluster, y = Description, label = locus_label),
inherit.aes = FALSE
)
print(annotation_plot)
genes_in_enriched_terms <- to_plot %>%
separate_longer_delim(geneID, delim = "/") %>%
rename(gene = "geneID") %>%
select(Cluster, ID, Description, gene)
cluster_codes <- c(
"Leaf.Down" = "LD", "Leaf.Up" = "LU",
"-P.Down" = "PD", "-P.Up" = "PU",
"inv4m.Down" = "ID", "inv4m.Up" = "IU"
)
gene_summary <- genes_in_enriched_terms %>%
group_by(gene) %>%
summarise(
n_terms = n_distinct(ID),
cluster_string = paste(
sort(unique(cluster_codes[as.character(Cluster)])),
collapse = ""
),
go_categories = paste(
sort(unique(Description)),
collapse = "; "
),
.groups = "drop"
)
curated_genes <- effects_gene_label %>%
filter(predictor != "Inv4m", Cluster != "-P.Down") %>%
group_by(Cluster) %>%
mutate(relative_mahalanobis = mahalanobis / max(mahalanobis, na.rm = TRUE)) %>%
ungroup() %>%
inner_join(gene_summary, by = "gene") %>%
group_by(cluster_string, gene) %>%
arrange(cluster_string, gene, -relative_mahalanobis) %>%
slice(1) %>%
ungroup() %>%
select(-predictor, -Cluster) %>%
select(cluster_string, locus_label, desc_merged, everything())
cat("Total genes in enriched terms:", nrow(curated_genes), "\n")
## Total genes in enriched terms: 104
cat("Terms per gene range:", range(curated_genes$n_terms), "\n")
## Terms per gene range: 1 4
hiconf_leaf <- effects_df %>%
filter(is_hiconf_DEG, predictor == "Leaf") %>%
pull(gene) %>%
unique()
hiconf_p <- effects_df %>%
filter(is_hiconf_DEG, predictor == "-P") %>%
pull(gene) %>%
unique()
annotated_genes <- compGO_BP@compareClusterResult %>%
separate_longer_delim(geneID, delim = "/") %>%
pull(geneID) %>%
unique()
annotated_leaf <- intersect(hiconf_leaf, annotated_genes)
annotated_p <- intersect(hiconf_p, annotated_genes)
cat("High-confidence DEGs:\n")
## High-confidence DEGs:
cat(" Leaf:", length(hiconf_leaf), "| -P:", length(hiconf_p), "\n")
## Leaf: 1102 | -P: 790
cat("Annotated DEGs:\n")
## Annotated DEGs:
cat(" Leaf:", length(annotated_leaf), "| -P:", length(annotated_p), "\n")
## Leaf: 440 | -P: 295
p1 <- ggvenn(
list(Leaf = annotated_leaf, `-P` = annotated_p),
fill_color = c("#0073C2FF", "#EFC000FF"),
stroke_size = 0.5,
set_name_size = 5,
text_size = 4
) +
ggtitle("Annotated High Confidence DEGs") +
theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))
p2 <- ggvenn(
list(Leaf = hiconf_leaf, `-P` = hiconf_p),
fill_color = c("#0073C2FF", "#EFC000FF"),
stroke_size = 0.5,
set_name_size = 5,
text_size = 4
) +
ggtitle("High Confidence DEGs") +
theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))
combined_venn <- ggarrange(p1, p2, ncol = 2, nrow = 1)
print(combined_venn)
dual_annotated <- length(intersect(annotated_leaf, annotated_p))
single_annotated <- length(union(
setdiff(annotated_leaf, annotated_p),
setdiff(annotated_p, annotated_leaf)
))
dual_all <- length(intersect(hiconf_leaf, hiconf_p))
single_all <- length(union(
setdiff(hiconf_leaf, hiconf_p),
setdiff(hiconf_p, hiconf_leaf)
))
contingency <- matrix(
c(dual_annotated, single_annotated, dual_all, single_all),
nrow = 2,
ncol = 2,
byrow = TRUE,
dimnames = list(
Status = c("Annotated", "All"),
Response = c("Dual", "Single")
)
)
fisher_result <- fisher.test(contingency, alternative = "greater")
cat("\n--- Fisher's Exact Test ---\n")
##
## --- Fisher's Exact Test ---
print(contingency)
## Response
## Status Dual Single
## Annotated 181 373
## All 366 1160
cat("OR =", round(fisher_result$estimate, 2),
"| P =", format.pval(fisher_result$p.value, digits = 2), "\n")
## OR = 1.54 | P = 5.4e-05
DEGs_leaf_p <- DEGs[!grepl("inv4m", names(DEGs))]
n_genes_leaf_p <- sapply(DEGs_leaf_p, length)
cluster_order_leaf_p <- cluster_order[!grepl("inv4m", cluster_order)]
to_plot_leaf_p <- to_plot %>%
filter(!grepl("inv4m", Cluster)) %>%
mutate(Cluster = factor(Cluster, levels = cluster_order_leaf_p))
term2gene_label_leaf_p <- term2gene_label %>%
filter(!grepl("inv4m", Cluster)) %>%
mutate(Cluster = factor(Cluster, levels = cluster_order_leaf_p)) %>%
droplevels()
cat("Gene counts (Leaf/-P):\n")
## Gene counts (Leaf/-P):
print(n_genes_leaf_p)
## Leaf.Down Leaf.Up -P.Down -P.Up
## 468 634 129 661
labels_leaf_p <- paste0(
gsub(".", " ", cluster_order_leaf_p, fixed = TRUE),
"\n(", n_genes_leaf_p, ")"
)
names(labels_leaf_p) <- names(n_genes_leaf_p)
bg_leaf_p <- to_plot_leaf_p %>%
ggplot(aes(x = Cluster, y = Description, size = Count, fill = neglogP)) +
ggtitle("GO Biological Process Annotation of DEGs") +
scale_fill_continuous(
low = "dodgerblue",
high = "tomato",
name = "-log10(FDR)"
) +
scale_y_discrete(limits = rev(levels(to_plot_leaf_p$Description))) +
scale_x_discrete(labels = labels_leaf_p, drop = FALSE) +
geom_point(shape = 21) +
scale_size(range = c(2, 8)) +
ylab(NULL) +
xlab(NULL) +
DOSE::theme_dose(font.size = 14) +
theme(
plot.title = element_text(face = "bold", size = 20),
legend.position = c(0.15, 0.2),
legend.box = "horizontal",
axis.text.x = element_text(vjust = 0.5)
)
annotation_plot_leaf_p <- bg_leaf_p +
geom_text(
data = term2gene_label_leaf_p,
position = position_nudge(0.1),
hjust = 0,
vjust = 0.5,
fontface = "italic",
mapping = aes(x = Cluster, y = Description, label = locus_label),
inherit.aes = FALSE
)
print(annotation_plot_leaf_p)
ggsave(annotation_plot, file = "~/Desktop/FC_annotation_plot_all.svg",
height = 6.75, width = 12)
ggsave(annotation_plot_leaf_p, file = "~/Desktop/FC_annotation_plot_leaf_P.svg",
height = 6.75, width = 10)
ggsave(combined_venn, file = "~/Desktop/venn_hiconf_degs.svg",
height = 6, width = 12)
write.csv(curated_genes, file = "~/Desktop/selected_degs_GO.csv", row.names = FALSE)
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggvenn_0.1.19 forcats_1.0.1 tidyr_1.3.1
## [4] clusterProfiler_4.17.0 ggpubr_0.6.2 ggplot2_4.0.0
## [7] dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] DBI_1.2.3 gson_0.1.0 rlang_1.1.6
## [4] magrittr_2.0.4 DOSE_4.3.0 compiler_4.5.1
## [7] RSQLite_2.4.3 systemfonts_1.3.1 png_0.1-8
## [10] vctrs_0.6.5 reshape2_1.4.4 stringr_1.5.2
## [13] pkgconfig_2.0.3 crayon_1.5.3 fastmap_1.2.0
## [16] backports_1.5.0 XVector_0.49.1 labeling_0.4.3
## [19] rmarkdown_2.30 enrichplot_1.29.3 purrr_1.1.0
## [22] bit_4.6.0 xfun_0.53 cachem_1.1.0
## [25] aplot_0.2.9 jsonlite_2.0.0 blob_1.2.4
## [28] BiocParallel_1.43.4 broom_1.0.10 parallel_4.5.1
## [31] R6_2.6.1 bslib_0.9.0 stringi_1.8.7
## [34] RColorBrewer_1.1-3 car_3.1-3 jquerylib_0.1.4
## [37] GOSemSim_2.35.2 Rcpp_1.1.0 Seqinfo_0.99.2
## [40] knitr_1.50 ggtangle_0.0.7 R.utils_2.13.0
## [43] IRanges_2.43.5 Matrix_1.7-4 splines_4.5.1
## [46] igraph_2.2.0 tidyselect_1.2.1 qvalue_2.41.0
## [49] rstudioapi_0.17.1 abind_1.4-8 yaml_2.3.10
## [52] codetools_0.2-20 lattice_0.22-7 tibble_3.3.0
## [55] plyr_1.8.9 treeio_1.33.0 Biobase_2.69.1
## [58] withr_3.0.2 KEGGREST_1.49.2 S7_0.2.0
## [61] evaluate_1.0.5 gridGraphics_0.5-1 Biostrings_2.77.2
## [64] pillar_1.11.1 ggtree_3.99.2 carData_3.0-5
## [67] stats4_4.5.1 ggfun_0.2.0 generics_0.1.4
## [70] S4Vectors_0.47.4 scales_1.4.0 tidytree_0.4.6
## [73] glue_1.8.0 gdtools_0.4.4 lazyeval_0.2.2
## [76] tools_4.5.1 data.table_1.17.8 fgsea_1.35.8
## [79] ggsignif_0.6.4 ggiraph_0.9.2 fs_1.6.6
## [82] fastmatch_1.1-6 cowplot_1.2.0 grid_4.5.1
## [85] ape_5.8-1 AnnotationDbi_1.71.2 nlme_3.1-168
## [88] patchwork_1.3.2 Formula_1.2-5 cli_3.6.5
## [91] rappdirs_0.3.3 fontBitstreamVera_0.1.1 gtable_0.3.6
## [94] R.methodsS3_1.8.2 rstatix_0.7.3 yulab.utils_0.2.1
## [97] fontquiver_0.2.1 sass_0.4.10 digest_0.6.37
## [100] BiocGenerics_0.55.4 ggrepel_0.9.6 ggplotify_0.1.3
## [103] htmlwidgets_1.6.4 farver_2.1.2 memoise_2.0.1
## [106] htmltools_0.5.8.1 R.oo_1.27.1 lifecycle_1.0.4
## [109] httr_1.4.7 GO.db_3.22.0 fontLiberation_0.1.0
## [112] bit64_4.6.0-1