Load the libraries

As is customary, the first step when working with each R script is to load the required libraries that will be utilized in the upcoming lesson. The libraries we will need are as follows

  1. ggplot2. ggplot is a package used to make beautiful plots
  2. ggpubr. add functionalities to ggplot
  3. reshape2. helps with table manipulation
  4. RColorBrewer. A collection of color palettes.
  5. phyloseq. A package to manage microbiome data
  6. ANCOMBC. Differential abundance analysis
# load libraries
library(ggplot2)
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
if(!require("ggpubr")){      # an if condition, if ggpubr is not installed:
  install.packages("ggpubr") #install ggpubr
}
## Loading required package: ggpubr
if(!require("reshape2")){      # an if condition, if ggpubr is not installed:
  install.packages("reshape2") #install ggpubr
}
## Loading required package: reshape2
if(!require("RColorBrewer")){      # an if condition, if ggpubr is not installed:
  install.packages("RColorBrewer") #install ggpubr
}
## Loading required package: RColorBrewer
if (!require("BiocManager", quietly = TRUE)){
    install.packages("BiocManager")
}
## Bioconductor version '3.15' is out-of-date; the current release version '3.18'
##   is available with R version '4.3'; see https://bioconductor.org/install
if (!require("ANCOMBC", quietly = TRUE)){
       BiocManager::install("ANCOMBC")
}
## Registered S3 methods overwritten by 'treeio':
##   method              from    
##   MRCA.phylo          tidytree
##   MRCA.treedata       tidytree
##   Nnode.treedata      tidytree
##   Ntip.treedata       tidytree
##   ancestor.phylo      tidytree
##   ancestor.treedata   tidytree
##   child.phylo         tidytree
##   child.treedata      tidytree
##   full_join.phylo     tidytree
##   full_join.treedata  tidytree
##   groupClade.phylo    tidytree
##   groupClade.treedata tidytree
##   groupOTU.phylo      tidytree
##   groupOTU.treedata   tidytree
##   is.rooted.treedata  tidytree
##   nodeid.phylo        tidytree
##   nodeid.treedata     tidytree
##   nodelab.phylo       tidytree
##   nodelab.treedata    tidytree
##   offspring.phylo     tidytree
##   offspring.treedata  tidytree
##   parent.phylo        tidytree
##   parent.treedata     tidytree
##   root.treedata       tidytree
##   rootnode.phylo      tidytree
##   sibling.phylo       tidytree
if (!require("phyloseq", quietly = TRUE)){
source('http://bioconductor.org/biocLite.R')
biocLite('phyloseq')
}

Load the files

As is typical, we need to set the working directory. This is the default directory from which the R script will save and load files. Please note that while I have set it to MY working directory, you should adjust it to match YOURS. Specifically, I have set it to the working directory where the outputs from the previous exercise using dada2 were saved The next step is to load the files that we will be using in this exercise. Specifically, we require both the ASV count table and the taxonomy table that we generated in the previous exercise using dada2.

setwd("/Users/gabri/Desktop/lesson_microbiome/")
ASVtab = readRDS("asv_table.RDS")
TAXtab = readRDS("taxa_table_silva.RDS")
TAXtab = as.data.frame(TAXtab)

As with the previous exercise, the mapping file will be downloaded automatically from a shared folder.”

metadata = read.csv("mapping_file.csv")
metadata$Diet = factor(metadata$Diet)

First part, loading the data into ancombc

We need to create first a phyloseq obect to load the data inside ancombc. A phyloseq object stores ASVtable, metadata table and Taxonomy table. In this case we are not loading a taxonomy table as we do not need it for our analysis

rownames(metadata) = metadata$Name # First we need to make the same rownmes for both 
rownames(metadata)  == rownames(ASVtab)
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#metadata and ASVtab otherwise phyloseq will throw an error
physeq <- phyloseq( sample_data(metadata),
                    otu_table(ASVtab, taxa_are_rows = FALSE))

Now we run ancombc. There is also ancombc two but it takes very long to run. So we use the one. After that, we extract the results in a dataframe, as ancombc results are stored in a weird way.

out = ancombc(data = physeq, formula = "Diet", p_adj_method = "fdr")
## 'ancombc' is deprecated 
## Use 'ancombc2' instead
## `tax_level` is not speficified 
## The lowest taxonomic level will be used: Species
## Otherwise, please speficy `tax_level` by one of the following: 
## Species
results_ancom_bc = data.frame(ASV = rownames(out$res$lfc),
                              lcf = out$res$lfc$DietNormal, 
                              W = out$res$W$DietNormal, 
                              p_val = out$res$p_val$DietNormal, 
                              q_value = out$res$q_val$DietNormal, 
                              Diff_ab =  out$res$diff_abn$DietNormal)
results_ancom_bc$lcf = results_ancom_bc$lcf * -1

Now we have all we need for plotting the results, let’s first do a vulcano plot.

results_ancom_bc$association = ifelse(results_ancom_bc$q_value < 0.05 & results_ancom_bc$lcf > 0, "High fat", "Not singificant")
results_ancom_bc$association = ifelse(results_ancom_bc$q_value < 0.05 & results_ancom_bc$lcf < 0, "Normal",results_ancom_bc$association)

ggplot(results_ancom_bc, aes(x = as.numeric(lcf), y = -log10(as.numeric(q_value)), color = association)) +
  geom_point() +
  labs(x = "Log2 Fold Change", y = "-log10(p-value)",
       title = "Volcano Plot") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray") +
  theme_bw()

Let’s see detailed results by making a lollypop plot. before, we assign the names to each sequence, by combining a uniue numerical ID (ASV1, ASV2, ASV3) and the genus it belongs. In case the genus is not assigned, we will retrieve

name_tmp = sprintf("ASV%s",seq(1:nrow(TAXtab))) 
#now let's add some information to this. For example, we want to add the genus, and if not available, the last taxonomic information available.
TAXtab$name =  TAXtab$Genus
TAXtab$name = ifelse(is.na(TAXtab$name), TAXtab$Family, TAXtab$name)
TAXtab$name = ifelse(is.na(TAXtab$name), TAXtab$Order, TAXtab$name)
TAXtab$name = ifelse(is.na(TAXtab$name), TAXtab$Class, TAXtab$name)
TAXtab$name = ifelse(is.na(TAXtab$name), TAXtab$Phylum, TAXtab$name)
unique(is.na(TAXtab$name)) # Ok, we have all names
## [1] FALSE  TRUE
## now let us apply a the prefix to each one of them
TAXtab$name = paste(name_tmp, TAXtab$name, sep = "-")
TAXtab$name[1:10] # ok it looks good.
##  [1] "ASV1-Lactococcus"                  "ASV2-Alistipes"                   
##  [3] "ASV3-Clostridia UCG-014"           "ASV4-Faecalibaculum"              
##  [5] "ASV5-Ligilactobacillus"            "ASV6-Ileibacterium"               
##  [7] "ASV7-Muribaculaceae"               "ASV8-RF39"                        
##  [9] "ASV9-Alloprevotella"               "ASV10-Rikenellaceae RC9 gut group"
results_ancom_bc$name = TAXtab$name[match(results_ancom_bc$ASV ,rownames(TAXtab))]

Let’s see how many signficant ASVs we have. If they are still too many, we filter them and plot only the top 50 most abundant.

results_sig = results_ancom_bc[results_ancom_bc$q_value < 0.05,]
table(results_sig$association)
## 
## High fat   Normal 
##      105      193
### We still have 200 (193 and 105) differential abundant ASVs
### I wanna only plot the first 50 most across the entire study, otherwise I will end up with a huge plot
Sum_abundance= colSums(decostand(ASVtab, method = "total"))
results_sig$sum_abundance = Sum_abundance[match(results_sig$ASV, names(Sum_abundance))]
results_sig = results_sig[order(results_sig$sum_abundance, decreasing = TRUE),]
results_sig = results_sig[1:25,]
results_sig$name = factor(results_sig$name, levels = rev(results_sig$name))
a = ggplot(results_sig, aes(lcf, name, color = association ))+
  geom_point() +theme_bw() +
  geom_segment( aes(x=0, xend=lcf, y=name, yend=name, color = association)) +
  geom_vline(xintercept = 0, size=0.3) +xlab("log2FoldChange") +ylab(NULL)# +
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
  #theme(legend.position="bottom")
a

Looks good. Let’s look make some boxplots to put besides it.

ASVtab_rel = decostand(ASVtab, method = "total")
AASVtab_sig = data.frame(ASVtab_rel[,as.numeric(rownames(results_sig))])
colnames(AASVtab_sig) = results_sig$name
AASVtab_sig$Diet = metadata$Diet
melted = reshape2::melt(AASVtab_sig, values = "Diet")
## Using Diet as id variables
melted$variable = factor(melted$variable, levels = rev(results_sig$name))
b = ggplot(melted, aes(value, variable, fill = Diet)) +
  geom_boxplot(outlier.size = 0.5) + theme_bw() + xlab("Relative abundance") +
    theme( axis.text.y=element_blank()) + ylab(NULL)

b 

ggarrange(a,b, ncol = 2, nrow = 1,align = "h", common.legend = TRUE, widths = c(1.85, 0.9))

As we said in the slides, there are multiple tools developed for differential abundance analysis that can be adapted to a moltitude of experimental designs. Some of the include:

  1. ALDEx2.
  2. ANCOM-BC.
  3. DEseq2.
  4. MaAslinn2.
  5. LefSE.
  6. metagenomeSeq.

Explore all different tools and try apply them on your dataset. Be critical with their results.