As is customary, the first step when working with each R script is to load the required libraries that will be utilized in the upcoming lesson. The libraries we will need are as follows
# load libraries
library(ggplot2)
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
if(!require("ggpubr")){ # an if condition, if ggpubr is not installed:
install.packages("ggpubr") #install ggpubr
}
## Loading required package: ggpubr
if(!require("reshape2")){ # an if condition, if ggpubr is not installed:
install.packages("reshape2") #install ggpubr
}
## Loading required package: reshape2
if(!require("RColorBrewer")){ # an if condition, if ggpubr is not installed:
install.packages("RColorBrewer") #install ggpubr
}
## Loading required package: RColorBrewer
if (!require("BiocManager", quietly = TRUE)){
install.packages("BiocManager")
}
## Bioconductor version '3.15' is out-of-date; the current release version '3.18'
## is available with R version '4.3'; see https://bioconductor.org/install
if (!require("ANCOMBC", quietly = TRUE)){
BiocManager::install("ANCOMBC")
}
## Registered S3 methods overwritten by 'treeio':
## method from
## MRCA.phylo tidytree
## MRCA.treedata tidytree
## Nnode.treedata tidytree
## Ntip.treedata tidytree
## ancestor.phylo tidytree
## ancestor.treedata tidytree
## child.phylo tidytree
## child.treedata tidytree
## full_join.phylo tidytree
## full_join.treedata tidytree
## groupClade.phylo tidytree
## groupClade.treedata tidytree
## groupOTU.phylo tidytree
## groupOTU.treedata tidytree
## is.rooted.treedata tidytree
## nodeid.phylo tidytree
## nodeid.treedata tidytree
## nodelab.phylo tidytree
## nodelab.treedata tidytree
## offspring.phylo tidytree
## offspring.treedata tidytree
## parent.phylo tidytree
## parent.treedata tidytree
## root.treedata tidytree
## rootnode.phylo tidytree
## sibling.phylo tidytree
if (!require("phyloseq", quietly = TRUE)){
source('http://bioconductor.org/biocLite.R')
biocLite('phyloseq')
}
As is typical, we need to set the working directory. This is the default directory from which the R script will save and load files. Please note that while I have set it to MY working directory, you should adjust it to match YOURS. Specifically, I have set it to the working directory where the outputs from the previous exercise using dada2 were saved The next step is to load the files that we will be using in this exercise. Specifically, we require both the ASV count table and the taxonomy table that we generated in the previous exercise using dada2.
setwd("/Users/gabri/Desktop/lesson_microbiome/")
ASVtab = readRDS("asv_table.RDS")
TAXtab = readRDS("taxa_table_silva.RDS")
TAXtab = as.data.frame(TAXtab)
As with the previous exercise, the mapping file will be downloaded automatically from a shared folder.”
metadata = read.csv("mapping_file.csv")
metadata$Diet = factor(metadata$Diet)
We need to create first a phyloseq obect to load the data inside ancombc. A phyloseq object stores ASVtable, metadata table and Taxonomy table. In this case we are not loading a taxonomy table as we do not need it for our analysis
rownames(metadata) = metadata$Name # First we need to make the same rownmes for both
rownames(metadata) == rownames(ASVtab)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#metadata and ASVtab otherwise phyloseq will throw an error
physeq <- phyloseq( sample_data(metadata),
otu_table(ASVtab, taxa_are_rows = FALSE))
Now we run ancombc. There is also ancombc two but it takes very long to run. So we use the one. After that, we extract the results in a dataframe, as ancombc results are stored in a weird way.
out = ancombc(data = physeq, formula = "Diet", p_adj_method = "fdr")
## 'ancombc' is deprecated
## Use 'ancombc2' instead
## `tax_level` is not speficified
## The lowest taxonomic level will be used: Species
## Otherwise, please speficy `tax_level` by one of the following:
## Species
results_ancom_bc = data.frame(ASV = rownames(out$res$lfc),
lcf = out$res$lfc$DietNormal,
W = out$res$W$DietNormal,
p_val = out$res$p_val$DietNormal,
q_value = out$res$q_val$DietNormal,
Diff_ab = out$res$diff_abn$DietNormal)
results_ancom_bc$lcf = results_ancom_bc$lcf * -1
Now we have all we need for plotting the results, let’s first do a vulcano plot.
results_ancom_bc$association = ifelse(results_ancom_bc$q_value < 0.05 & results_ancom_bc$lcf > 0, "High fat", "Not singificant")
results_ancom_bc$association = ifelse(results_ancom_bc$q_value < 0.05 & results_ancom_bc$lcf < 0, "Normal",results_ancom_bc$association)
ggplot(results_ancom_bc, aes(x = as.numeric(lcf), y = -log10(as.numeric(q_value)), color = association)) +
geom_point() +
labs(x = "Log2 Fold Change", y = "-log10(p-value)",
title = "Volcano Plot") +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray") +
theme_bw()
Let’s see detailed results by making a lollypop plot. before, we assign the names to each sequence, by combining a uniue numerical ID (ASV1, ASV2, ASV3) and the genus it belongs. In case the genus is not assigned, we will retrieve
name_tmp = sprintf("ASV%s",seq(1:nrow(TAXtab)))
#now let's add some information to this. For example, we want to add the genus, and if not available, the last taxonomic information available.
TAXtab$name = TAXtab$Genus
TAXtab$name = ifelse(is.na(TAXtab$name), TAXtab$Family, TAXtab$name)
TAXtab$name = ifelse(is.na(TAXtab$name), TAXtab$Order, TAXtab$name)
TAXtab$name = ifelse(is.na(TAXtab$name), TAXtab$Class, TAXtab$name)
TAXtab$name = ifelse(is.na(TAXtab$name), TAXtab$Phylum, TAXtab$name)
unique(is.na(TAXtab$name)) # Ok, we have all names
## [1] FALSE TRUE
## now let us apply a the prefix to each one of them
TAXtab$name = paste(name_tmp, TAXtab$name, sep = "-")
TAXtab$name[1:10] # ok it looks good.
## [1] "ASV1-Lactococcus" "ASV2-Alistipes"
## [3] "ASV3-Clostridia UCG-014" "ASV4-Faecalibaculum"
## [5] "ASV5-Ligilactobacillus" "ASV6-Ileibacterium"
## [7] "ASV7-Muribaculaceae" "ASV8-RF39"
## [9] "ASV9-Alloprevotella" "ASV10-Rikenellaceae RC9 gut group"
results_ancom_bc$name = TAXtab$name[match(results_ancom_bc$ASV ,rownames(TAXtab))]
Let’s see how many signficant ASVs we have. If they are still too many, we filter them and plot only the top 50 most abundant.
results_sig = results_ancom_bc[results_ancom_bc$q_value < 0.05,]
table(results_sig$association)
##
## High fat Normal
## 105 193
### We still have 200 (193 and 105) differential abundant ASVs
### I wanna only plot the first 50 most across the entire study, otherwise I will end up with a huge plot
Sum_abundance= colSums(decostand(ASVtab, method = "total"))
results_sig$sum_abundance = Sum_abundance[match(results_sig$ASV, names(Sum_abundance))]
results_sig = results_sig[order(results_sig$sum_abundance, decreasing = TRUE),]
results_sig = results_sig[1:25,]
results_sig$name = factor(results_sig$name, levels = rev(results_sig$name))
a = ggplot(results_sig, aes(lcf, name, color = association ))+
geom_point() +theme_bw() +
geom_segment( aes(x=0, xend=lcf, y=name, yend=name, color = association)) +
geom_vline(xintercept = 0, size=0.3) +xlab("log2FoldChange") +ylab(NULL)# +
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#theme(legend.position="bottom")
a
Looks good. Let’s look make some boxplots to put besides it.
ASVtab_rel = decostand(ASVtab, method = "total")
AASVtab_sig = data.frame(ASVtab_rel[,as.numeric(rownames(results_sig))])
colnames(AASVtab_sig) = results_sig$name
AASVtab_sig$Diet = metadata$Diet
melted = reshape2::melt(AASVtab_sig, values = "Diet")
## Using Diet as id variables
melted$variable = factor(melted$variable, levels = rev(results_sig$name))
b = ggplot(melted, aes(value, variable, fill = Diet)) +
geom_boxplot(outlier.size = 0.5) + theme_bw() + xlab("Relative abundance") +
theme( axis.text.y=element_blank()) + ylab(NULL)
b
ggarrange(a,b, ncol = 2, nrow = 1,align = "h", common.legend = TRUE, widths = c(1.85, 0.9))
As we said in the slides, there are multiple tools developed for
differential abundance analysis that can be adapted to a moltitude of
experimental designs. Some of the include:
Explore all different tools and try apply them on your dataset. Be critical with their results.