library(bugSetEnrichment)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
All signatures
full.dat <- importBugSigDB()
## Using cached version from 2021-03-26 15:06:02
## Using cached version from 2021-03-26 15:11:16
## Using cached version from 2021-03-26 15:11:26
Extract all signatures by one curator (choosing only those of decreased abundance in group 1 for example). See argument tax.level in ?extractBugSigs
to restrict e.g. to genus; default is all taxonomic levels.
my.dat <- full.dat[full.dat$Curator == "Mst Afroza Parvin", ] # data subset
my.sigs <- extractBugSigs(my.dat, direction = "decreased") # signatures
my.siglengths <- sapply(my.sigs, length) #signature lengths
sort(my.siglengths, decreasing = TRUE)
## PMID:31976177_2 PMID:29186529_1 PMID:30267022_1 PMID:30366118_1
## 45 38 21 19
## PMID:31226994_2 PMID:31226994_1 PMID:25444008_2 PMID:25444008_3
## 11 10 9 9
## PMID:25444008_4 PMID:29186529_3 PMID:30534614_1 PMID:25444008_1
## 9 6 6 5
## PMID:31006170_1 PMID:31006170_4 PMID:29935270_1 PMID:25444008_9
## 5 5 5 4
## PMID:31006170_2 PMID:30534614_2 PMID:25444008_7 PMID:31006170_3
## 4 4 3 3
## PMID:29554907_1 PMID:29554907_2 PMID:30568265_1 PMID:32143621_2
## 3 3 3 3
## PMID:32429742_1 PMID:25761741_1 PMID:25761741_2 PMID:26971496_1
## 3 3 3 2
## PMID:28789705_1 PMID:28789705_2 PMID:25444008_6 PMID:25444008_10
## 2 2 2 2
## PMID:24819377_1 PMID:32380969_1 PMID:32380969_2 PMID:31976177_1
## 2 2 2 2
## PMID:30568265_2 PMID:30568265_3 PMID:31934770_1 PMID:22546742_1
## 2 2 2 2
## PMID:26745497_1 PMID:29186529_2 PMID:26971496_2 PMID:28789705_3
## 2 1 1 1
## PMID:25444008_5 PMID:25444008_8 PMID:25444008_11 PMID:32012716_1
## 1 1 1 1
## PMID:31337807_1 PMID:31337807_2 PMID:31337807_3 PMID:31337807_4
## 1 1 1 1
## PMID:30042470_1 PMID:28808302_1 PMID:28808302_2 PMID:26551842_1
## 1 1 1 1
## PMID:26551842_2 PMID:26551842_3 PMID:32143621_1 PMID:23631345_1
## 1 1 1 1
## PMID:23459324_1 PMID:23459324_2 PMID:29935270_2
## 1 1 1
Relevant background data (may want to choose more carefully, this just chooses all signatures of the same body site(s))
relevant.dat <- full.dat[full.dat$`Body site` %in% my.dat$`Body site`, ]
relevant.sigs <- extractBugSigs(my.dat)
## Zero length signatures dropped:PMID:26745497_1
Perform the simulation. This is how frequent the most commonly observed taxon would be expected to be seen by chance when randomly drawing taxa from the “relevant background” (alpha = 0.05).
getCriticalN(relevant.sigs, my.siglengths)
## 95%
## 11
And compare to the most frequently observed taxa:
frequencySigs(my.sigs)
## g__Bacteroides f__Bifidobacteriaceae g__Bifidobacterium
## 12 10 9
## f__Bacteroidaceae f__Porphyromonadaceae p__Actinobacteria
## 7 6 5
## f__Clostridiaceae g__Clostridium g__Blautia
## 5 5 5
## g__Oscillospira
## 5
Extract all signatures by one curator (choosing only those of decreased abundance in group 1 for example). See argument tax.level in ?extractBugSigs
to restrict e.g. to genus; default is all taxonomic levels.
curator <- "Cynthia Anderson"
bodysites <- c("cervicovaginal secretion", "endocervix", "posterior fornix of vagina", "uterine cervix", "vagina")
my.dat <- full.dat[full.dat$Curator == curator & full.dat$`Body site` %in% bodysites, ] # data subset
my.sigs <- extractBugSigs(my.dat, direction = "decreased") # signatures
## Zero length signatures dropped:PMID:28860468_2 PMID:23717441_1
my.siglengths <- sapply(my.sigs, length) #signature lengths
sort(my.siglengths, decreasing = TRUE)
## PMID:29765068_4 PMID:29765068_7 PMID:29765068_3 PMID:33241010_1
## 23 20 19 19
## PMID:29765068_2 PMID:31555603_1 PMID:30986570_2 PMID:29765068_1
## 16 15 14 12
## PMID:30463989_1 PMID:29563617_1 PMID:30986570_1 PMID:32842982_1
## 11 10 9 9
## PMID:32842982_2 PMID:32842982_3 PMID:32842982_4 PMID:29765068_5
## 9 9 9 8
## PMID:29765068_8 PMID:26935422_3 PMID:30782659_3 PMID:26062721_1
## 8 7 6 6
## PMID:29765068_9 PMID:31370796_2 PMID:30782659_2 PMID:32332850_1
## 6 6 5 5
## PMID:29313867_1 PMID:29313867_2 PMID:28860468_1 PMID:29765068_10
## 5 5 4 4
## PMID:31749298_1 PMID:29955075_1 PMID:33241010_2 PMID:29765068_6
## 4 4 4 3
## PMID:31370796_1 PMID:30782659_1 PMID:26574055_1 PMID:32214382_1
## 3 2 2 2
## PMID:32214382_2 PMID:30463989_3 PMID:30463989_4 PMID:26935422_1
## 2 2 2 2
## PMID:30405584_1 PMID:30841606_1 PMID:30841606_2 PMID:30463989_2
## 2 2 2 1
## PMID:30463989_5 PMID:23717441_2 PMID:26935422_2 PMID:30405584_2
## 1 1 1 1
## PMID:30405584_3 PMID:30405584_4
## 1 1
Relevant background data (may want to choose more carefully, this just chooses all signatures of the same body site(s))
relevant.dat <- full.dat[full.dat$`Body site` %in% my.dat$`Body site`, ]
relevant.sigs <- extractBugSigs(my.dat)
## Zero length signatures dropped:PMID:28860468_4 PMID:23717441_2
Perform the simulation. This is how frequent the most commonly observed taxon would be expected to be seen by chance when randomly drawing taxa from the “relevant background” (alpha = 0.05).
getCriticalN(relevant.sigs, my.siglengths)
## 95%
## 12
And compare to the most frequently observed taxa:
frequencySigs(my.sigs)
## g__Lactobacillus c__Bacilli
## 16 10
## o__Lactobacillales f__Lactobacillaceae
## 10 9
## p__Firmicutes s__Peptoniphilus grossensis
## 8 7
## g__Bifidobacterium s__Lactobacillus gasseri
## 6 6
## o__Coriobacteriales f__Coriobacteriaceae
## 5 5