BugSigDB is a manually curated database of published microbial signatures — sets of taxa reported as differentially abundant in human or animal microbiome studies. It covers a wide range of conditions, body sites, and experimental designs.
This report explores the landscape of studies in BugSigDB with an emphasis on meta-analysis entries, which aggregate findings across multiple primary studies. Understanding which conditions and body sites are well-represented by meta-analyses can help guide future curation priorities.
library(bugsigdbr) # BugSigDB data access
library(dplyr) # Data manipulation
library(ggplot2) # Visualisation
library(tidyr) # Data tidying
library(forcats) # Factor reordering helpers
library(knitr) # Table rendering

# Load the BugSigDB table through bugsigdbr; later chunks treat each row as
# one signature.
studies <- bugsigdbr::importBugSigDB()
cat("Dimensions:", nrow(studies), "rows x", ncol(studies), "columns\n")
## Dimensions: 8163 rows x 50 columns
# High-level counts: one row per headline metric.
# n_distinct() keeps its default na.rm = FALSE, so a column containing NA
# contributes one extra "unique" value.
tibble(
  Metric = c(
    "Total signatures",
    "Unique studies (PMIDs)",
    "Unique conditions",
    "Unique body sites",
    "Unique study designs"
  ),
  Count = c(
    nrow(studies),
    n_distinct(studies$PMID),
    n_distinct(studies$Condition),
    n_distinct(studies$`Body site`),
    n_distinct(studies$`Study design`)
  )
) |>
  kable(caption = "BugSigDB at a glance")
| Metric | Count |
|---|---|
| Total signatures | 8163 |
| Unique studies (PMIDs) | 1374 |
| Unique conditions | 589 |
| Unique body sites | 238 |
| Unique study designs | 17 |
# Horizontal bar chart: number of signatures per study design, with the
# largest design at the top (fct_reorder orders the bars by count).
studies |>
  count(`Study design`, name = "n_signatures") |>
  mutate(`Study design` = fct_reorder(`Study design`, n_signatures)) |>
  ggplot(aes(x = `Study design`, y = n_signatures, fill = n_signatures)) +
  geom_col(show.legend = FALSE) +
  scale_fill_gradient(low = "#AED6F1", high = "#1A5276") +
  coord_flip() +
  theme_minimal(base_size = 12) +
  theme(panel.grid.major.y = element_blank()) +
  labs(
    x = NULL,
    y = "Number of signatures",
    title = "Frequency of study designs in BugSigDB"
  )
Number of microbial signatures per study design in BugSigDB.
BugSigDB contains three study-design values that include meta-analysis.
# Study-design labels that denote a meta-analysis, either alone or combined
# with another design.
meta_designs <- c(
  "meta-analysis",
  "case-control,meta-analysis",
  "laboratory experiment,meta-analysis"
)

# Keep only signatures whose study design is one of the labels above.
meta <- studies |>
  filter(`Study design` %in% meta_designs)

cat("Meta-analysis signatures:", nrow(meta), "\n")
# The rendered output reports the number of distinct PMIDs as well, so print it.
cat("Unique studies (PMIDs):", n_distinct(meta$PMID), "\n")
## Meta-analysis signatures: 139
## Unique studies (PMIDs): 22
# Signatures per meta-analysis sub-type, most frequent first.
meta |>
  count(`Study design`, name = "Signatures", sort = TRUE) |>
  kable(caption = "Meta-analysis sub-types in BugSigDB")
| Study design | Signatures |
|---|---|
| meta-analysis | 117 |
| case-control,meta-analysis | 14 |
| laboratory experiment,meta-analysis | 8 |
# Horizontal bar chart of the 15 conditions with the most meta-analysis
# signatures.
meta |>
  count(Condition, name = "n") |>
  # with_ties = FALSE caps the chart at exactly 15 bars; the default keeps
  # every condition tied with the 15th, which contradicts the "Top 15" title.
  slice_max(n, n = 15, with_ties = FALSE) |>
  mutate(Condition = fct_reorder(Condition, n)) |>
  ggplot(aes(x = Condition, y = n, fill = n)) +
  geom_col(show.legend = FALSE) +
  scale_fill_gradient(low = "#A9DFBF", high = "#1D8348") +
  coord_flip() +
  theme_minimal(base_size = 12) +
  theme(panel.grid.major.y = element_blank()) +
  labs(
    x = NULL,
    y = "Number of signatures",
    title = "Top 15 conditions in meta-analysis studies"
  )
Top 15 conditions studied in BugSigDB meta-analyses by number of signatures.
# Horizontal bar chart of the body sites with the most meta-analysis
# signatures. Missing body sites are dropped before ranking.
meta |>
  count(`Body site`, name = "n") |>
  filter(!is.na(`Body site`)) |>
  # slice_max() keeps ties by default, so more than 15 sites can appear when
  # several share the 15th-largest count.
  slice_max(n, n = 15) |>
  mutate(`Body site` = fct_reorder(`Body site`, n)) |>
  ggplot(aes(x = `Body site`, y = n, fill = n)) +
  geom_col(show.legend = FALSE) +
  scale_fill_gradient(low = "#F9E79F", high = "#B7950B") +
  coord_flip() +
  theme_minimal(base_size = 12) +
  theme(panel.grid.major.y = element_blank()) +
  labs(
    x = NULL,
    y = "Number of signatures",
    title = "Top body sites in meta-analysis studies"
  )
Body sites represented in BugSigDB meta-analysis signatures.
Meta-analyses report taxa as either increased or decreased relative to a reference group. Examining this balance reveals whether the literature tends to highlight enrichment or depletion signals.
# Bar chart of how many meta-analysis signatures report each direction of
# change; rows with a missing direction are excluded.
meta |>
  filter(!is.na(`Abundance in Group 1`)) |>
  count(`Abundance in Group 1`, name = "n") |>
  ggplot(
    aes(
      x = `Abundance in Group 1`,
      y = n,
      fill = `Abundance in Group 1`
    )
  ) +
  geom_col(width = 0.5, show.legend = FALSE) +
  # Fixed colours: red for increased, blue for decreased.
  scale_fill_manual(
    values = c(
      "increased" = "#E74C3C",
      "decreased" = "#2980B9"
    )
  ) +
  theme_minimal(base_size = 13) +
  theme(panel.grid.major.x = element_blank()) +
  labs(
    x = "Reported direction",
    y = "Number of signatures",
    title = "Direction of dysbiosis in meta-analysis signatures"
  )
Direction of microbial change reported in meta-analysis signatures.
The heatmap below shows which condition–body-site pairs have the most meta-analysis signatures, making gaps in coverage immediately visible.
# Identify top conditions and body sites to keep the plot readable.
# with_ties = FALSE caps each list at exactly 12 entries, matching the
# "top 12 of each" caption; the default would keep every tied value.
top_conditions <- meta |>
  count(Condition) |>
  slice_max(n, n = 12, with_ties = FALSE) |>
  pull(Condition)

top_bodysites <- meta |>
  count(`Body site`) |>
  filter(!is.na(`Body site`)) |>
  slice_max(n, n = 12, with_ties = FALSE) |>
  pull(`Body site`)

# Heatmap of signature counts for every condition x body-site pair among the
# selected values; pairs with no signatures are filled in as 0.
meta |>
  filter(Condition %in% top_conditions, `Body site` %in% top_bodysites) |>
  count(Condition, `Body site`, name = "n") |>
  complete(Condition, `Body site`, fill = list(n = 0L)) |>
  ggplot(aes(x = `Body site`, y = Condition, fill = n)) +
  geom_tile(colour = "white", linewidth = 0.4) +
  # Label non-empty tiles only. Text is white on the darker half of the scale
  # and dark on the lighter half, so low counts stay legible on pale tiles.
  geom_text(
    aes(
      label = if_else(n > 0, as.character(n), ""),
      colour = n > max(n) / 2
    ),
    size = 3,
    fontface = "bold"
  ) +
  scale_colour_manual(
    values = c("TRUE" = "white", "FALSE" = "#1A5276"),
    guide = "none"
  ) +
  scale_fill_gradient(low = "#EBF5FB", high = "#1A5276", name = "Signatures") +
  theme_minimal(base_size = 11) +
  theme(
    axis.text.x = element_text(angle = 40, hjust = 1),
    panel.grid = element_blank()
  ) +
  labs(
    x = "Body site",
    y = "Condition",
    title = "Meta-analysis coverage: condition × body site"
  )
Heatmap of meta-analysis signature counts by condition and body site (top 12 of each).
# Cumulative count of meta-analysis signatures by publication year; rows
# without a year are excluded.
meta |>
  filter(!is.na(Year)) |>
  count(Year, name = "n") |>
  # cumsum() depends on row order, so sort by year explicitly.
  arrange(Year) |>
  mutate(cumulative = cumsum(n)) |>
  ggplot(aes(x = Year, y = cumulative)) +
  geom_area(fill = "#D7BDE2", alpha = 0.6) +
  geom_line(colour = "#6C3483", linewidth = 1) +
  geom_point(colour = "#6C3483", size = 2) +
  theme_minimal(base_size = 12) +
  theme(panel.grid.minor = element_blank()) +
  labs(
    x = "Publication year",
    y = "Cumulative signatures",
    title = "Growth of meta-analysis signatures in BugSigDB over time"
  )
Cumulative number of meta-analysis signatures added to BugSigDB by publication year.
# Summary table of the headline findings.
# Each "most common" lookup uses with_ties = FALSE so it returns exactly one
# value; with ties kept, Finding would be longer than Aspect and tibble()
# would fail on the length mismatch.
tibble(
  Aspect = c(
    "Total meta-analysis signatures",
    "Unique studies (PMIDs)",
    "Most common condition",
    "Most common body site",
    "Most common direction"
  ),
  Finding = c(
    as.character(nrow(meta)),
    as.character(n_distinct(meta$PMID)),
    meta |>
      count(Condition) |>
      slice_max(n, n = 1, with_ties = FALSE) |>
      pull(Condition),
    meta |>
      count(`Body site`) |>
      filter(!is.na(`Body site`)) |>
      slice_max(n, n = 1, with_ties = FALSE) |>
      pull(`Body site`),
    meta |>
      filter(!is.na(`Abundance in Group 1`)) |>
      count(`Abundance in Group 1`) |>
      slice_max(n, n = 1, with_ties = FALSE) |>
      pull(`Abundance in Group 1`)
  )
) |>
  kable(caption = "Key findings from the exploratory analysis")
| Aspect | Finding |
|---|---|
| Total meta-analysis signatures | 139 |
| Unique studies (PMIDs) | 22 |
| Most common condition | Health study participation |
| Most common body site | Feces |
| Most common direction | increased |
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Africa/Lagos
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.51 forcats_1.0.1 tidyr_1.3.2 ggplot2_4.0.2
## [5] dplyr_1.2.0 bugsigdbr_1.17.2
##
## loaded via a namespace (and not attached):
## [1] bit_4.6.0 gtable_0.3.6 jsonlite_2.0.0
## [4] compiler_4.5.2 filelock_1.0.3 tidyselect_1.2.1
## [7] blob_1.3.0 jquerylib_0.1.4 scales_1.4.0
## [10] yaml_2.3.12 fastmap_1.2.0 R6_2.6.1
## [13] labeling_0.4.3 generics_0.1.4 curl_7.0.0
## [16] httr2_1.2.2 tibble_3.3.1 DBI_1.3.0
## [19] bslib_0.10.0 pillar_1.11.1 RColorBrewer_1.1-3
## [22] rlang_1.1.7 cachem_1.1.0 xfun_0.57
## [25] sass_0.4.10 S7_0.2.1 bit64_4.6.0-1
## [28] otel_0.2.0 memoise_2.0.1 RSQLite_2.4.6
## [31] cli_3.6.5 withr_3.0.2 magrittr_2.0.4
## [34] digest_0.6.39 grid_4.5.2 rstudioapi_0.18.0
## [37] dbplyr_2.5.2 rappdirs_0.3.4 BiocFileCache_3.0.0
## [40] lifecycle_1.0.5 vctrs_0.7.2 evaluate_1.0.5
## [43] glue_1.8.0 farver_2.1.2 rmarkdown_2.30
## [46] purrr_1.2.1 tools_4.5.2 pkgconfig_2.0.3
## [49] htmltools_0.5.9