SIAMCAT is used for analysis
library(curatedMetagenomicData)
library(dplyr)
library(mia)
library(SIAMCAT)
submeta$study_name %>% table()
.
RaymondF_2016 VincentC_2016
72 229
Table should match the metadata table
colData(rayvince)$study_name %>% table()
.
RaymondF_2016 VincentC_2016
72 229
assay(rayvince) %>% colnames()
[1] "P10E0" "P10E7" "P10E90" "P11E0" "P11E7" "P11E90" "P12E0" "P12E7"
[9] "P12E90" "P13E0" "P13E7" "P13E90" "P14E0" "P14E7" "P14E90" "P15E0"
[17] "P15E7" "P15E90" "P17E0" "P17E7" "P17E90" "P18E0" "P18E7" "P18E90"
[25] "P19E0" "P19E7" "P19E90" "P1E0" "P1E7" "P1E90" "P20E0" "P20E7"
[33] "P20E90" "P21E0" "P21E7" "P21E90" "P22E0" "P22E7" "P22E90" "P23C0"
[41] "P23C7" "P23C90" "P25C0" "P25C7" "P25C90" "P2E0" "P2E7" "P2E90"
[49] "P38C0" "P38C7" "P38C90" "P3E0" "P3E7" "P3E90" "P4E0" "P4E7"
[57] "P4E90" "P5E0" "P5E7" "P5E90" "P6C0" "P6C7" "P6C90" "P7C0"
[65] "P7C7" "P7C90" "P8C0" "P8C7" "P8C90" "P9E0" "P9E7" "P9E90"
[73] "MM001.1" "MM001.2" "MM001.3" "MM001.4" "MM002.1" "MM002.2" "MM003.1" "MM003.2"
[81] "MM004.1" "MM004.2" "MM005.1" "MM005.2" "MM005.3" "MM006.1" "MM006.2" "MM007"
[89] "MM008.1" "MM008.2" "MM008.3" "MM009.1" "MM009.2" "MM010" "MM011" "MM012.1"
[97] "MM012.2" "MM013.1" "MM013.2" "MM013.3" "MM014" "MM015.1" "MM015.2" "MM016.1"
[105] "MM016.2" "MM016.3" "MM017" "MM018" "MM019.1" "MM019.2" "MM019.3" "MM020.1"
[113] "MM020.2" "MM020.3" "MM021.1" "MM021.2" "MM021.3" "MM021.4" "MM022.1" "MM022.2"
[121] "MM022.3" "MM023.1" "MM023.2" "MM024.1" "MM024.2" "MM024.3" "MM025" "MM026.1"
[129] "MM026.2" "MM027.1" "MM027.2" "MM027.3" "MM028.1" "MM028.2" "MM029.1" "MM029.2"
[137] "MM029.3" "MM030.1" "MM030.2" "MM031" "MM032.1" "MM032.2" "MM032.3" "MM032.4"
[145] "MM033.1" "MM033.2" "MM033.3" "MM033.4" "MM034.1" "MM034.2" "MM034.3" "MM035.1"
[153] "MM035.2" "MM036" "MM037.1" "MM037.2" "MM038" "MM039" "MM040.1" "MM040.2"
[161] "MM041.1" "MM041.2" "MM042.1" "MM042.2" "MM043.1" "MM043.2" "MM044.1" "MM044.2"
[169] "MM045.1" "MM045.2" "MM046.1" "MM046.2" "MM046.3" "MM047.1" "MM047.2" "MM047.3"
[177] "MM048.1" "MM048.2" "MM049.1" "MM049.2" "MM050.1" "MM050.2" "MM050.3" "MM050.4"
[185] "MM051" "MM052.1" "MM052.2" "MM053" "MM054.1" "MM054.2" "MM055.1" "MM055.3"
[193] "MM055.4" "MM055.5" "MM055.6" "MM055.7" "MM056.1" "MM056.2" "MM057.1" "MM057.10"
[201] "MM057.11" "MM057.12" "MM057.2" "MM057.3" "MM057.4" "MM057.5" "MM057.6" "MM057.7"
[209] "MM057.8" "MM057.9" "MM058.1" "MM058.2" "MM059.1" "MM059.2" "MM060.1" "MM060.2"
[217] "MM061" "MM062.1" "MM062.2" "MM063" "MM064.1" "MM064.2" "MM065" "MM066"
[225] "MM068.1" "MM068.2" "MM069" "MM070.1" "MM070.2" "MM070.3" "MM071.2" "MM071.3"
[233] "MM072.1" "MM072.2" "MM074.1" "MM074.2" "MM075.1" "MM075.2" "MM077" "MM078"
[241] "MM079.1" "MM079.2" "MM080" "MM081" "MM082" "MM083.1" "MM083.2" "MM083.3"
[249] "MM083.4" "MM083.5" "MM084" "MM085.1" "MM085.2" "MM086.2" "MM086.3" "MM086.5"
[257] "MM086.6" "MM087.1" "MM087.2" "MM088" "MM089.1" "MM089.2" "MM090.1" "MM090.2"
[265] "MM090.3" "MM091.1" "MM091.2" "MM091.3" "MM091.4" "MM092" "MM093" "MM094.1"
[273] "MM094.2" "MM096" "MM098.1" "MM098.10" "MM098.11" "MM098.12" "MM098.13" "MM098.14"
[281] "MM098.15" "MM098.2" "MM098.3" "MM098.4" "MM098.5" "MM098.6" "MM098.7" "MM098.8"
[289] "MM098.9" "MM099.1" "MM099.2" "MM099.3" "MM099.4" "MM100.1" "MM100.2" "MM101.1"
[297] "MM101.2" "MM103.1" "MM103.2" "MM103.3" "MM104"
#Subset to just Raymond post-exposure cases
There should be 24 data points (18 cases, 6 controls)
colData(ray)$study_name %>% table()
.
RaymondF_2016
24
phyloray <- makePhyloseqFromTreeSummarizedExperiment(ray, abund_values = "relative_abundance")
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by ‘tidytree’
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by ‘tidytree’
rowTree is pruned to match rownames.Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by ‘tidytree’
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by ‘tidytree’
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by ‘tidytree’
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by ‘tidytree’
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by ‘tidytree’
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by ‘tidytree’
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by ‘tidytree’
Not sure that this is necessary. I think SIAMCAT can just read the phyloseq object.
sample.phyloray <- sample_data(phyloray)
feat.phyloray <- otu_table(phyloray)
Remove two variables that are identical with condition of interest as they were causing issues down the line
sample.phyloray <- sample.phyloray[, -which(names(sample.phyloray) %in% c("antibiotics_family", "antibiotics_current_use"))]
label.ray <- create.label(meta=sample.phyloray, label="study_condition", case="cephalosporins")
Label used as case:
cephalosporins
Label used as control:
control
+ finished create.label.from.metadata in 0.007 s
sc.obj <- siamcat(feat=feat.phyloray, label=label.ray, meta=sample.phyloray)
+ starting validate.data
+++ checking overlap between labels and features
+ Keeping labels of 24 sample(s).
+++ checking sample number per class
Data set has a limited number of training examples:
control 6
cephalosporins 18
Note that a dataset this small/skewed is not necessarily suitable for analysis in this pipeline.
+++ checking overlap between samples and metadata
+ finished validate.data in 0.045 s
Cutoff is set at 0.001. Might want to experiment with this.
sc.filt <- filter.features(sc.obj, filter.method = "abundance", cutoff = 0.001)
Features successfully filtered
sc.assc <- check.associations(
sc.filt,
sort.by = 'fc',
alpha = 0.05,
fn.plot = 'assocation_plots.pdf',
mult.corr = "fdr",
feature.type = "filtered",
detect.lim = 10 ^-6,
plot.type = "quantile.box",
panels = c("fc", "prevalence", "auroc"))
[===========================================>------------------------------------------] 51%
[============================================>-----------------------------------------] 52%
[=============================================>----------------------------------------] 53%
[=============================================>----------------------------------------] 54%
[==============================================>---------------------------------------] 54%
[==============================================>---------------------------------------] 55%
[===============================================>--------------------------------------] 55%
[===============================================>--------------------------------------] 56%
[================================================>-------------------------------------] 56%
[================================================>-------------------------------------] 57%
[=================================================>------------------------------------] 58%
[==================================================>-----------------------------------] 59%
[===================================================>----------------------------------] 60%
[===================================================>----------------------------------] 61%
[====================================================>---------------------------------] 61%
[====================================================>---------------------------------] 62%
[=====================================================>--------------------------------] 62%
[=====================================================>--------------------------------] 63%
[======================================================>-------------------------------] 64%
[=======================================================>------------------------------] 65%
[========================================================>-----------------------------] 66%
[=========================================================>----------------------------] 67%
[=========================================================>----------------------------] 68%
[==========================================================>---------------------------] 68%
[==========================================================>---------------------------] 69%
[===========================================================>--------------------------] 69%
[===========================================================>--------------------------] 70%
[============================================================>-------------------------] 71%
[=============================================================>------------------------] 72%
[==============================================================>-----------------------] 73%
[==============================================================>-----------------------] 74%
[===============================================================>----------------------] 74%
[===============================================================>----------------------] 75%
[================================================================>---------------------] 75%
[================================================================>---------------------] 76%
[=================================================================>--------------------] 76%
[=================================================================>--------------------] 77%
[==================================================================>-------------------] 78%
[===================================================================>------------------] 79%
[====================================================================>-----------------] 80%
[====================================================================>-----------------] 81%
[=====================================================================>----------------] 81%
[=====================================================================>----------------] 82%
[======================================================================>---------------] 82%
[======================================================================>---------------] 83%
[=======================================================================>--------------] 84%
[========================================================================>-------------] 85%
[=========================================================================>------------] 86%
[==========================================================================>-----------] 87%
[==========================================================================>-----------] 88%
[===========================================================================>----------] 88%
[===========================================================================>----------] 89%
[============================================================================>---------] 89%
[============================================================================>---------] 90%
[=============================================================================>--------] 91%
[==============================================================================>-------] 92%
[===============================================================================>------] 93%
[===============================================================================>------] 94%
[================================================================================>-----] 94%
[================================================================================>-----] 95%
[=================================================================================>----] 95%
[=================================================================================>----] 96%
[==================================================================================>---] 96%
[==================================================================================>---] 97%
[===================================================================================>--] 98%
[====================================================================================>-] 99%
[======================================================================================] 100%
This checks confounders–does not work if any variable 100% matches condition of interest which is why we removed those two other variables earlier.
sc.conf <- check.confounders(
sc.filt,
fn.plot = 'confounder_plots.pdf',
meta.in = NULL,
feature.type = 'filtered', verbose = 1
)
++ metadata variables:
study_condition
++ are nested inside the label and have been removed from this analysis
++ remove metadata variables, since all subjects have the same value
study_namebody_sitediseaseage_categorycountrynon_westernizedsequencing_platformDNA_extraction_kitPMIDminimum_read_lengthmedian_read_lengthpregnantlactatingcuratordays_from_first_collectionsmoker
I’m guessing due to ranked unit normalization this is using Rankits
sc.norm <- normalize.features(
sc.filt,
norm.method = "rank.unit"
)
Features normalized successfully.
Split the data, not sure the difference between folds and resamples and how to select the right number–right now I just picked numbers that didn’t give an error message
sc.split <- create.data.split(
sc.norm,
num.folds = 6,
num.resample = 2
)
Features splitted for cross-validation successfully.
Lasso also seems to work–others gave weird results (weird ROCs)
sc.mod <- train.model(
sc.split,
method = "randomForest"
)
[=============>------------------------------------------------------------------------] 17%
[=====================>----------------------------------------------------------------] 25%
[============================>---------------------------------------------------------] 33%
[===================================>--------------------------------------------------] 42%
[==========================================>-------------------------------------------] 50%
[=================================================>------------------------------------] 58%
[========================================================>-----------------------------] 67%
[===============================================================>----------------------] 75%
[=======================================================================>--------------] 83%
[==============================================================================>-------] 92%
[======================================================================================] 100%
Trained randomForest models successfully.
sc.pred <- make.predictions(sc.mod)
Made predictions successfully.
pred_matrix <- pred_matrix(sc.pred)
sc.eval <- evaluate.predictions(sc.pred)
Evaluated predictions successfully.
model.evaluation.plot(sc.eval)
model.interpretation.plot(
sc.eval,
fn.plot = 'interpretation.pdf',
consens.thres = 0.01,
limits = c(-3, 3),
heatmap.type = 'zscore'
)
WARNING: restricting amount of features to be plotted to 50
vince <- subset(rayvince, select =colData(rayvince)$study_name=="VincentC_2016")
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by ‘tidytree’
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by ‘tidytree’
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by ‘tidytree’
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by ‘tidytree’
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by ‘tidytree’
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by ‘tidytree’
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by ‘tidytree’
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by ‘tidytree’
colData(vince)$antibiotics_family %>% str_detect("cephalosporins")
[1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[16] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[31] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[46] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[61] NA NA NA NA NA TRUE TRUE NA NA NA NA NA NA NA NA
[76] NA NA NA NA FALSE FALSE FALSE NA NA NA NA NA NA NA NA
[91] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[106] NA NA NA NA NA NA NA NA NA NA NA NA NA FALSE FALSE
[121] FALSE FALSE FALSE FALSE NA NA NA NA NA NA NA NA NA NA NA
[136] NA NA NA NA NA NA NA NA NA NA NA NA FALSE NA NA
[151] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[166] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[181] NA NA NA NA NA FALSE FALSE NA NA NA NA NA NA NA NA
[196] NA NA NA NA NA NA NA FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[211] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE NA NA NA NA
[226] NA NA NA NA
colData(vince)$abx <- "control"
colData(vince)$abx[colData(vince)$antibiotics_family %>% str_detect("cephalosporins")] <- "cephalosporins"
colData(vince)$study_name %>% table()
.
VincentC_2016
229
phylovince <- makePhyloseqFromTreeSummarizedExperiment(vince, abund_values = "relative_abundance")
sample.phylovince <- sample_data(phylovince)
feat.phylovince <- otu_table(phylovince)
sample.phylovince <- sample.phylovince[, -which(names(sample.phylovince) %in% c("antibiotics_family", "antibiotics_current_use"))]
label.vince <- create.label(meta=sample.phylovince, label="abx", case="cephalosporins")