Analysis objectives

Import, recode, and subset data from bugsigdb.org
Create a table of studies
Create a clustered heatmap showing similarity of signatures from independent studies
Calculate the frequency of appearance of each taxa in independent signatures, and identify the most frequently reported taxa
Estimate the probability of the most frequently identified taxa occuring so frequently by chance

Packages installation

Install packages (not evaluated in vignette)

install.packages(c("devtools", "tidyverse", "kableExtra", "gt", "glue"))
devtools::install_github("waldronlab/bugSigSimple")
devtools::install_github("waldronlab/BugSigDBStats")
devtools::install_github("waldronlab/bugsigdbr")

Data import, recoding, and subset

library(bugSigSimple)
dat <- bugsigdbr::importBugSigDB(cache = FALSE)

## Rows: 2032 Columns: 11

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Signature page name, Experiment, Study, Source, Curated date, Cura...

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

## Rows: 1204 Columns: 27

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (21): Experiment page name, Study, Location of subjects, Host species, B...
## dbl  (3): Group 0 sample size, Significance threshold, LDA Score above
## lgl  (2): 16s variable region, MHT correction

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

## Rows: 532 Columns: 10

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Study page name, Study design, DOI, URL, Authors, Title, Journal, A...
## dbl (2): PMID, Year

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

dim(dat)

## [1] 2073   45

names(dat)

##  [1] "Study"                      "Study design"              
##  [3] "PMID"                       "DOI"                       
##  [5] "URL"                        "Authors"                   
##  [7] "Title"                      "Journal"                   
##  [9] "Year"                       "Abstract"                  
## [11] "Experiment"                 "Location of subjects"      
## [13] "Host species"               "Body site"                 
## [15] "Condition"                  "Group 0 name"              
## [17] "Group 1 name"               "Group 1 definition"        
## [19] "Group 0 sample size"        "Group 1 sample size"       
## [21] "Antibiotics exclusion"      "Sequencing type"           
## [23] "16s variable region"        "Sequencing platform"       
## [25] "Statistical test"           "Significance threshold"    
## [27] "MHT correction"             "LDA Score above"           
## [29] "Matched on"                 "Confounders controlled for"
## [31] "Pielou"                     "Shannon"                   
## [33] "Chao1"                      "Simpson"                   
## [35] "Inverse Simpson"            "Richness"                  
## [37] "Signature page name"        "Source"                    
## [39] "Curated date"               "Curator"                   
## [41] "Revision editor"            "Description"               
## [43] "Abundance in Group 1"       "MetaPhlAn taxon names"     
## [45] "NCBI Taxonomy IDs"

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.3     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

condition_of_interest <- c("irritable bowel sydrome")
dat_condition <- subsetByCondition(dat, condition_of_interest) %>%
  mutate(comparison1 = paste(`Group 0 name`, `Group 1 name`, sep = " vs "))

Table of studies

bugSigSimple::createStudyTable(dat_condition)

Study	Condition	Cases	Controls	Study Design
Biagi 2011	irritable bowel sydrome	62	46	case-control,prospective cohort
Carroll 2012	irritable bowel sydrome	23	23	case-control
Duboc 2012	irritable bowel sydrome	14	18	case-control
Kerckhoffs 2009	irritable bowel sydrome	41	26	case-control
Maccaferri 2012	irritable bowel sydrome	19	24	randomized controlled trial
Mei 2021	irritable bowel sydrome	30	30	case-control
Saulnier 2011	irritable bowel sydrome	22	22	case-control
Shukla 2015	irritable bowel sydrome	47	30	case-control
Tana 2010	irritable bowel sydrome	26	26	case-control

Taxon frequency tables by body site

gut_sigs <- filter(dat_condition,
                   `Body site` %in% c("feces,mucosa of small intestine", "feces"))

In this table, the Binomial Test p-value corresponds to the null hypothesis

H0: the proportion of signatures in which the taxon is reported increased or decreased, relative to the total number of signatures in which it is reported, is equal to 0.5

kableExtra::kbl(bugSigSimple::createTaxonTable(gut_sigs))

Taxon Name	Taxonomic Level	total_signatures	increased_signatures	decreased_signatures	Binomial Test pval	kingdom	phylum	class	order	family	genus	species	n_signatures	metaphlan_name
Bifidobacterium catenulatum	species	7	1	6	0.130	Bacteria	Actinobacteria	Actinomycetia	Bifidobacteriales	Bifidobacteriaceae	Bifidobacterium	Bifidobacterium catenulatum	7	k__Bacteria\|p__Actinobacteria\|c__Actinomycetia\|o__Bifidobacteriales\|f__Bifidobacteriaceae\|g__Bifidobacterium\|s__Bifidobacterium catenulatum
Bacteroides	genus	7	4	3	1.000	Bacteria	Bacteroidetes	Bacteroidia	Bacteroidales	Bacteroidaceae	Bacteroides	NA	12	k__Bacteria\|p__Bacteroidetes\|c__Bacteroidia\|o__Bacteroidales\|f__Bacteroidaceae\|g__Bacteroides
Enterobacteriaceae	family	7	6	1	0.130	Bacteria	Proteobacteria	Gammaproteobacteria	Enterobacterales	Enterobacteriaceae	NA	NA	8	k__Bacteria\|p__Proteobacteria\|c__Gammaproteobacteria\|o__Enterobacterales\|f__Enterobacteriaceae
Bifidobacterium adolescentis	species	6	2	4	0.690	Bacteria	Actinobacteria	Actinomycetia	Bifidobacteriales	Bifidobacteriaceae	Bifidobacterium	Bifidobacterium adolescentis	6	k__Bacteria\|p__Actinobacteria\|c__Actinomycetia\|o__Bifidobacteriales\|f__Bifidobacteriaceae\|g__Bifidobacterium\|s__Bifidobacterium adolescentis
Bifidobacterium longum	species	6	4	2	0.690	Bacteria	Actinobacteria	Actinomycetia	Bifidobacteriales	Bifidobacteriaceae	Bifidobacterium	Bifidobacterium longum	6	k__Bacteria\|p__Actinobacteria\|c__Actinomycetia\|o__Bifidobacteriales\|f__Bifidobacteriaceae\|g__Bifidobacterium\|s__Bifidobacterium longum
Blautia	genus	6	5	1	0.220	Bacteria	Firmicutes	Clostridia	Eubacteriales	Lachnospiraceae	Blautia	NA	6	k__Bacteria\|p__Firmicutes\|c__Clostridia\|o__Eubacteriales\|f__Lachnospiraceae\|g__Blautia
Clostridioides difficile	species	6	6	0	0.031	Bacteria	Firmicutes	Clostridia	Eubacteriales	Peptostreptococcaceae	Clostridioides	Clostridioides difficile	6	k__Bacteria\|p__Firmicutes\|c__Clostridia\|o__Eubacteriales\|f__Peptostreptococcaceae\|g__Clostridioides\|s__Clostridioides difficile
Veillonella	genus	6	4	2	0.690	Bacteria	Firmicutes	Negativicutes	Veillonellales	Veillonellaceae	Veillonella	NA	6	k__Bacteria\|p__Firmicutes\|c__Negativicutes\|o__Veillonellales\|f__Veillonellaceae\|g__Veillonella
Bifidobacteriaceae	family	5	5	0	0.062	Bacteria	Actinobacteria	Actinomycetia	Bifidobacteriales	Bifidobacteriaceae	NA	NA	19	k__Bacteria\|p__Actinobacteria\|c__Actinomycetia\|o__Bifidobacteriales\|f__Bifidobacteriaceae
Bifidobacterium	genus	4	1	3	0.630	Bacteria	Actinobacteria	Actinomycetia	Bifidobacteriales	Bifidobacteriaceae	Bifidobacterium	NA	17	k__Bacteria\|p__Actinobacteria\|c__Actinomycetia\|o__Bifidobacteriales\|f__Bifidobacteriaceae\|g__Bifidobacterium

Cluster analysis

Note, this EDA should really be done before hypothesis testing.

First calculate pairwise overlaps for all signatures of length > 1:

allsigs <- bugsigdbr::getSignatures(dat_condition, tax.id.type = "taxname")
allsigs <- allsigs[sapply(allsigs, length) > 1] #require length > 1
length(allsigs)

## [1] 32

mydists <- BugSigDBStats::calcPairwiseOverlaps(allsigs)
dim(mydists)

## [1] 496   8

What is the distribution of signature lengths?

library(ggplot2)
siglengths <- sapply(allsigs, length)
siglengths.df <- data.frame(siglengths = siglengths)
ggplot(siglengths.df, aes(x=siglengths)) +
  geom_bar()

table(siglengths)

## siglengths
##  2  3  4  5  6  8  9 11 13 14 15 16 17 18 19 
##  7  5  3  2  2  1  2  2  1  1  1  2  1  1  1

Create a matrix of Jaccard similarities (0 for no overlap, 1 for 100% overlap)

signames <- unique(c(mydists$name1, mydists$name2))
jmat <- matrix(NA, nrow=length(signames), ncol=length(signames), dimnames=list(signames, signames))
diag(jmat) <- 1
for (i in 1:nrow(mydists)){
  jmat[mydists[i, "name2"], mydists[i, "name1"]] <- mydists[i, "jaccard"]
  jmat[mydists[i, "name1"], mydists[i, "name2"]] <- mydists[i, "jaccard"]
}

library(ComplexHeatmap)

## Loading required package: grid

## ========================================
## ComplexHeatmap version 2.8.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================

ha <- HeatmapAnnotation(`Signature Length` = anno_barplot(siglengths))
hr <- rowAnnotation(`Signature Length` = anno_barplot(siglengths))
hm <- Heatmap(
  jmat,
  top_annotation = ha, left_annotation = hr,
  row_names_max_width = unit(20, "cm"),
  column_names_max_height = unit(20, "cm"),
#  row_labels = sub(".+:", "", rownames(jmat)),  #get rid of study labels
  column_labels = sub(".+:", "", colnames(jmat))
)
hm

Use this interactively to make an interactive heatmap. Some expanding of the default size is required to see anything. Creating a sub-heatmap, then exporting it as a table, allows in-depth identification of the subgroups.

library(InteractiveComplexHeatmap)
hm <- draw(hm)
htShiny(hm)

hc <- hclust(dist(jmat))
plot(hc)

This tree can be cut to show the clusters, for example. The clusters of more than 1 signature but less than ~10 signatures are most likely to be something interesting.

clusts <- sort(cutree(hc, k = 8))  #set the number of clusters here with k
lapply(unique(clusts), function(i) names(clusts)[clusts == i])

## [[1]]
## [1] "bsdb:192/1/1_irritable-bowel-sydrome:diarrhea-predominant-irritable-bowel-syndrome_vs_healthy-controls_UP"
## [2] "bsdb:192/2/1_irritable-bowel-sydrome:diarrhea-predominant-irritable-bowel-syndrome_vs_healthy-controls_UP"
## [3] "bsdb:488/1/1_irritable-bowel-sydrome:IBS-D_vs_Healthy-Control_UP"                                         
## [4] "bsdb:488/1/2_irritable-bowel-sydrome:IBS-D_vs_Healthy-Control_DOWN"                                       
## [5] "bsdb:491/1/1_irritable-bowel-sydrome:IBS_vs_Healthy-Control_UP"                                           
## [6] "bsdb:491/1/2_irritable-bowel-sydrome:IBS_vs_Healthy-Control_DOWN"                                         
## [7] "bsdb:502/2/1_irritable-bowel-sydrome:IBS-C_vs_Healthy-Control_UP"                                         
## [8] "bsdb:502/4/1_irritable-bowel-sydrome:IBS-M_vs_Healthy-Control_UP"                                         
## [9] "bsdb:505/1/1_irritable-bowel-sydrome:IBS_vs_Healthy-Control_UP"                                           
## 
## [[2]]
## [1] "bsdb:443/1/1_irritable-bowel-sydrome:IBS_vs_Health-Control_UP"    
## [2] "bsdb:443/2/1_irritable-bowel-sydrome:IBS-C_vs_Health-Control_UP"  
## [3] "bsdb:443/3/1_irritable-bowel-sydrome:IBS-D_vs_Health-Control_UP"  
## [4] "bsdb:443/4/2_irritable-bowel-sydrome:IBS-U_vs_Health-Control_DOWN"
## 
## [[3]]
## [1] "bsdb:443/3/2_irritable-bowel-sydrome:IBS-D_vs_Health-Control_DOWN"
## [2] "bsdb:443/4/1_irritable-bowel-sydrome:IBS-U_vs_Health-Control_UP"  
## 
## [[4]]
## [1] "bsdb:499/1/1_irritable-bowel-sydrome:IBS-A_vs_Healthy-Control_DOWN"             
## [2] "bsdb:499/2/1_irritable-bowel-sydrome:IBS-C_vs_Healthy-Control_DOWN"             
## [3] "bsdb:499/3/1_irritable-bowel-sydrome:IBS-D_vs_Healthy-Control_DOWN"             
## [4] "bsdb:499/4/1_irritable-bowel-sydrome:IBS-(All-patients)_vs_Healthy-Control_DOWN"
## 
## [[5]]
## [1] "bsdb:499/1/2_irritable-bowel-sydrome:IBS-A_vs_Healthy-Control_UP"             
## [2] "bsdb:499/2/2_irritable-bowel-sydrome:IBS-C_vs_Healthy-Control_UP"             
## [3] "bsdb:499/3/2_irritable-bowel-sydrome:IBS-D_vs_Healthy-Control_UP"             
## [4] "bsdb:499/4/2_irritable-bowel-sydrome:IBS-(All-patients)_vs_Healthy-Control_UP"
## 
## [[6]]
## [1] "bsdb:502/1/1_irritable-bowel-sydrome:IBS_vs_Healthy-Control_UP"  
## [2] "bsdb:502/3/1_irritable-bowel-sydrome:IBS-D_vs_Healthy-Control_UP"
## 
## [[7]]
## [1] "bsdb:502/1/2_irritable-bowel-sydrome:IBS_vs_Healthy-Control_DOWN"  
## [2] "bsdb:502/2/2_irritable-bowel-sydrome:IBS-C_vs_Healthy-Control_DOWN"
## [3] "bsdb:502/3/2_irritable-bowel-sydrome:IBS-D_vs_Healthy-Control_DOWN"
## 
## [[8]]
## [1] "bsdb:505/1/2_irritable-bowel-sydrome:IBS_vs_Healthy-Control_DOWN"  
## [2] "bsdb:505/2/2_irritable-bowel-sydrome:IBS-D_vs_Healthy-Control_DOWN"
## [3] "bsdb:505/3/2_irritable-bowel-sydrome:IBS-A_vs_Healthy-Control_DOWN"
## [4] "bsdb:505/4/2_irritable-bowel-sydrome:IBS-C_vs_Healthy-Control_DOWN"

Create a wide-format dataframe

This would be suitable for regression analysis.

dat_withsigs <- filter(dat_condition, !is.na(dat_condition$`NCBI Taxonomy IDs`))
sigs <- bugsigdbr::getSignatures(dat_withsigs, tax.id.type = "taxname")
cmat <- t(safe::getCmatrix(sigs, as.matrix = TRUE, min.size = 0, prune = FALSE))

## WARNING: rows are sorted elements of keyword.list
## 41 categories formed

cdf <- data.frame(cmat, stringsAsFactors = FALSE, check.names = FALSE)
cdf <- cbind(dat_withsigs, cdf)
colnames(cdf)[1:54]

##  [1] "Study"                      "Study design"              
##  [3] "PMID"                       "DOI"                       
##  [5] "URL"                        "Authors"                   
##  [7] "Title"                      "Journal"                   
##  [9] "Year"                       "Abstract"                  
## [11] "Experiment"                 "Location of subjects"      
## [13] "Host species"               "Body site"                 
## [15] "Condition"                  "Group 0 name"              
## [17] "Group 1 name"               "Group 1 definition"        
## [19] "Group 0 sample size"        "Group 1 sample size"       
## [21] "Antibiotics exclusion"      "Sequencing type"           
## [23] "16s variable region"        "Sequencing platform"       
## [25] "Statistical test"           "Significance threshold"    
## [27] "MHT correction"             "LDA Score above"           
## [29] "Matched on"                 "Confounders controlled for"
## [31] "Pielou"                     "Shannon"                   
## [33] "Chao1"                      "Simpson"                   
## [35] "Inverse Simpson"            "Richness"                  
## [37] "Signature page name"        "Source"                    
## [39] "Curated date"               "Curator"                   
## [41] "Revision editor"            "Description"               
## [43] "Abundance in Group 1"       "MetaPhlAn taxon names"     
## [45] "NCBI Taxonomy IDs"          "comparison1"               
## [47] "[Clostridium] cellulosi"    "[Clostridium] leptum"      
## [49] "[Clostridium] symbiosum"    "[Eubacterium] rectale"     
## [51] "[Ruminococcus] gnavus"      "[Ruminococcus] lactaris"   
## [53] "Actinobacteria"             "Aeromonadaceae"

Note this has a number of columns that are mostly zeros, it could be filtered significantly for any regression or machine learning analysis:

table(cdf[["Bifidobacterium catenulatum"]])

## 
##  0  1 
## 34  7

Create another heatmap on correlations of presence/absence of taxa. This is not necessary because the previous Jaccard Index heatmap is probably better, it is just a demonstration of doing something with the taxa presence/absence directly.

sigcors <- cor(t(cmat))
siglengths <- sapply(sigs, length)
ha <- HeatmapAnnotation(`Signature Length` = anno_barplot(siglengths))
hr <- rowAnnotation(`Signature Length` = anno_barplot(siglengths))
hm <- Heatmap(
  sigcors,
  top_annotation = ha, left_annotation = hr,
  row_names_max_width = unit(20, "cm"),
  column_names_max_height = unit(20, "cm"),
 # row_labels = sub(".+:", "", rownames(sigcors)), ##removing study just to make signature names legible
  column_labels = sub(".+:", "", colnames(sigcors))
)
hm

Use this interactively to make an interactive heatmap:

library(InteractiveComplexHeatmap)
hm <- draw(hm)
htShiny(hm)

Irritable Bowel Syndrome

Kweku Amoo