We take the differential abundance analysis of 2 groups for illustration. The data was generated from Poisson distribution for each group with 30 individuals and 100 microbes. We set the first 10 microbes to be differetial by multipling a effect size vector for the abundance of one group, which is the log2FC from a norm distribution \(N(4, 1)\).
set.seed(123)
data <- matrix(rpois(100*60, 10), 60)
b <- rnorm(10, 4)
data[31:60,1:10] <- t(t(data[31:60,1:10])*2^b)
group <- rep(c(0, 1), each=30)
rownames(data) <- paste0('sample', 1:60)
colnames(data) <- paste0('microbe', 1:100)
names(b) <- paste0('microbe', 1:10)
# take a look of the log2FC
kable_classic(kbl(t(round(b,4)),align= "l",caption='log2FC for the first 10 microbes'),
html_font = "Cambria") | microbe1 | microbe2 | microbe3 | microbe4 | microbe5 | microbe6 | microbe7 | microbe8 | microbe9 | microbe10 |
|---|---|---|---|---|---|---|---|---|---|
| 3.7867 | 5.1053 | 4.5898 | 3.3323 | 5.6418 | 3.1743 | 2.7887 | 2.5838 | 4.2968 | 3.4753 |
# take a look of the data
kable_classic(kbl(data[1:10, 1:6], align = "l", caption='Data at a glance'),
full_width = T, html_font = "Cambria") | microbe1 | microbe2 | microbe3 | microbe4 | microbe5 | microbe6 | |
|---|---|---|---|---|---|---|
| sample1 | 8 | 11 | 10 | 9 | 10 | 8 |
| sample2 | 9 | 12 | 8 | 10 | 11 | 7 |
| sample3 | 14 | 9 | 10 | 10 | 10 | 8 |
| sample4 | 10 | 9 | 17 | 13 | 4 | 10 |
| sample5 | 10 | 7 | 16 | 8 | 7 | 7 |
| sample6 | 15 | 7 | 5 | 6 | 9 | 10 |
| sample7 | 11 | 11 | 6 | 10 | 10 | 11 |
| sample8 | 5 | 9 | 11 | 18 | 10 | 9 |
| sample9 | 4 | 7 | 5 | 10 | 10 | 20 |
| sample10 | 13 | 10 | 12 | 9 | 15 | 12 |
Here other parameters are set as default.
fit <- fastANCOM(Y=data, x=group)
final_fit <- fit$results$final
kable_classic(kbl(summary(fit), caption = 'Results summary'),
full_width = T, html_font = "Cambria")| Length | Class | Mode | |
|---|---|---|---|
| global | 0 | -none- | NULL |
| results | 3 | -none- | list |
kable_classic(kbl(head(final_fit, 10), align = "l",
caption='Results of fastANCOM'), full_width = T, html_font = "Cambria")| log2FC | log2FC.SD | log2FC.pval | log2FC.qval | Reject.number | REJECT | |
|---|---|---|---|---|---|---|
| microbe1 | 3.911488 | 0.1406454 | 0 | 0 | 99 | TRUE |
| microbe2 | 5.066569 | 0.1080591 | 0 | 0 | 99 | TRUE |
| microbe3 | 4.605962 | 0.1536873 | 0 | 0 | 98 | TRUE |
| microbe4 | 3.057501 | 0.1459191 | 0 | 0 | 97 | TRUE |
| microbe5 | 5.643832 | 0.1470910 | 0 | 0 | 99 | TRUE |
| microbe6 | 3.049002 | 0.1445558 | 0 | 0 | 97 | TRUE |
| microbe7 | 2.847731 | 0.1346457 | 0 | 0 | 97 | TRUE |
| microbe8 | 2.510676 | 0.1128027 | 0 | 0 | 99 | TRUE |
| microbe9 | 4.322188 | 0.1337157 | 0 | 0 | 98 | TRUE |
| microbe10 | 3.456564 | 0.1125187 | 0 | 0 | 99 | TRUE |
Take a look of the computation time with 100 replicates, fastANCOM thousands of times faster than original ANCOM
system.time(replicate(100, fit <- fastANCOM(Y=data, x=group, effect = T)))
#> user system elapsed
#> 0.402 0.052 0.466
system.time(replicate(100, fit <- fastANCOM(Y=data, x=group, effect = F)))
#> user system elapsed
#> 0.276 0.039 0.316
system.time(replicate(1, fit <- ANCOM(Y=data, x=group, tfun = wilcox.test)))
#> user system elapsed
#> 7.539 0.055 7.646
system.time(replicate(1, fit <- ANCOM(Y=data, x=group, tfun = t.test)))
#> user system elapsed
#> 5.246 0.023 5.283
system.time(replicate(100, fit <- ANCOM(Y=data, x=group, tfun = 't2')))
#> user system elapsed
#> 5.587 2.174 7.764We present plots of reject number of each microbe, valcano plot, and the log2 fold changes estimation compared to the true settings.
library(ggplot2)
library(ggpubr)
wp <- ggplot(data=final_fit, aes(x=1:100,y=Reject.number, color=REJECT, shape=REJECT)) +
geom_bar(aes(y=Reject.number), stat = "identity", width = 0.005, color='lightgrey') +
# geom_vline(xintercept = 1:100, linetype='dashed', size=0.1) +
geom_point(size=2) + theme_linedraw() +
labs(x='Microbe',y='W', title='Reject number') +
scale_colour_brewer(palette = "Set1", direction=-1) +
theme(legend.position = c(0.85, 0.78),
legend.background = element_rect(fill = "white", colour = "black"))
wp
vp <-ggplot(data=final_fit,aes(x=log2FC,y=-log10(log2FC.pval), color=REJECT, shape=REJECT)) +
geom_hline(yintercept=-log10(0.05/nrow(final_fit)),
linetype="dashed",color="black",size=0.5)+
geom_point(size=2) + theme_linedraw() + labs(y='log2(p-value)', title='Volcano plot') +
theme(legend.position='none') +
scale_colour_brewer(palette = "Set1",direction=-1)
vp
tmpb <- data.frame(Microbe=factor(1:10),log2FC=c(b, final_fit[final_fit$REJECT, 1]),
se=c(rep(0, 10), final_fit[final_fit$REJECT, 2]),
Type=c(rep('Truth', 10), rep('Estimation', 10)))
ep <- ggplot(data=tmpb, aes(x=Microbe, y=log2FC, color=Type, shape=Type)) +
geom_point(size=2, position = position_dodge(0.2) ) +
geom_errorbar(aes(ymin=log2FC-2*se, ymax=log2FC+2*se),
width=0.2, position = position_dodge(0.2)) +
theme_linedraw() + labs(title='Log2FC estimation') +
scale_colour_brewer(palette = "Set1", direction=-1) +
theme(legend.position = c(0.7, 0.8),
legend.background = element_rect(fill = "white", colour = "black"))
epsessionInfo()
#> R version 4.0.3 (2020-10-10)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Mojave 10.14.6
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] zh_CN.UTF-8/zh_CN.UTF-8/zh_CN.UTF-8/C/zh_CN.UTF-8/zh_CN.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] ggpubr_0.4.0 ggplot2_3.3.2 fastANCOM_0.0.4 kableExtra_1.3.1
#>
#> loaded via a namespace (and not attached):
#> [1] tidyselect_1.1.0 xfun_0.25 bslib_0.2.5.1 purrr_0.3.4
#> [5] haven_2.3.1 lattice_0.20-41 carData_3.0-4 colorspace_1.4-1
#> [9] vctrs_0.3.4 generics_0.1.0 htmltools_0.5.1.1 viridisLite_0.3.0
#> [13] yaml_2.2.1 rlang_0.4.11 jquerylib_0.1.4 pillar_1.4.6
#> [17] foreign_0.8-80 glue_1.4.2 withr_2.3.0 RColorBrewer_1.1-2
#> [21] readxl_1.3.1 lifecycle_0.2.0 stringr_1.4.0 cellranger_1.1.0
#> [25] munsell_0.5.0 ggsignif_0.6.0 gtable_0.3.0 zip_2.1.1
#> [29] rvest_0.3.6 evaluate_0.14 labeling_0.4.2 forcats_0.5.0
#> [33] knitr_1.33 rio_0.5.16 curl_4.3 highr_0.8
#> [37] Rcpp_1.0.5 broom_0.7.2 scales_1.1.1 backports_1.1.10
#> [41] webshot_0.5.2 jsonlite_1.7.1 abind_1.4-5 farver_2.0.3
#> [45] hms_0.5.3 digest_0.6.27 openxlsx_4.2.3 stringi_1.5.3
#> [49] rstatix_0.6.0 dplyr_1.0.2 grid_4.0.3 tools_4.0.3
#> [53] magrittr_2.0.1 sass_0.4.0 tibble_3.0.4 crayon_1.3.4
#> [57] tidyr_1.1.2 car_3.0-10 pkgconfig_2.0.3 ellipsis_0.3.1
#> [61] data.table_1.13.2 xml2_1.3.2 rmarkdown_2.10 httr_1.4.2
#> [65] rstudioapi_0.11 R6_2.5.0 nlme_3.1-149 compiler_4.0.3