This lab performs differential abundance analysis in the Rampelli Italy/Tanzania dataset, optionally correcting for gender as a potential confounding factor.
Create a phyloseq object from the Rampelli Italy/Tanzania dataset, converting relative abundance to approximate counts:
suppressPackageStartupMessages(library(curatedMetagenomicData))
Rampelli = curatedMetagenomicData("RampelliS_2015.metaphlan_bugs_list.stool",
dryrun = FALSE, counts = TRUE,
bugs.as.phyloseq = TRUE)[[1]]
Question: Look at the sample data by doing View(sample_data(Rampelli))
Keep only taxa with at least 10 total counts across all samples. This is a good way to improve your power to detect differentially abundant taxa because these low-abundance taxa are less likely to be differentially abundant and increase multiple testing.
suppressPackageStartupMessages(library(phyloseq))
Rampelli = prune_taxa(taxa_sums(Rampelli) > 10, Rampelli)
Rampelli
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 727 taxa and 38 samples ]
## sample_data() Sample Data: [ 38 samples by 18 sample variables ]
## tax_table() Taxonomy Table: [ 727 taxa by 8 taxonomic ranks ]
How many taxa were present before and after pruning?
Prepare a DESeq2 object for a linear model with country as the predictor. More help on converting to DESeq2 from various formats here.
dds.data = phyloseq_to_deseq2(Rampelli, ~country)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
Excellent DESeq2 manual here or vignettes(package="DESeq2")
dds = DESeq(dds.data)
res = results(dds)
res = res[order(res$padj, na.last=NA), ]
alpha = 0.01
sigtab = res[(res$padj < alpha), ]
sigtab = cbind(as(sigtab, "data.frame"),
as(tax_table(Rampelli)[rownames(sigtab), ], "matrix"))
head(sigtab)
## baseMean log2FoldChange lfcSE stat
## s__Prevotella_stercorea 1026450.69 22.49632 1.069746 21.02960
## t__GCF_000235885 1026450.69 22.49632 1.069746 21.02960
## p__Spirochaetes 572913.49 21.64745 1.176744 18.39606
## c__Spirochaetia 572913.49 21.64745 1.176744 18.39606
## o__Spirochaetales 572913.49 21.64745 1.176744 18.39606
## g__Catenibacterium 51752.17 30.00000 1.670805 17.95542
## pvalue padj Kingdom Phylum
## s__Prevotella_stercorea 3.516377e-98 8.087668e-96 Bacteria Bacteroidetes
## t__GCF_000235885 3.516377e-98 8.087668e-96 Bacteria Bacteroidetes
## p__Spirochaetes 1.412718e-75 1.299700e-73 Bacteria Spirochaetes
## c__Spirochaetia 1.412718e-75 1.299700e-73 Bacteria Spirochaetes
## o__Spirochaetales 1.412718e-75 1.299700e-73 Bacteria Spirochaetes
## g__Catenibacterium 4.353075e-72 2.503018e-70 Bacteria Firmicutes
## Class Order
## s__Prevotella_stercorea Bacteroidia Bacteroidales
## t__GCF_000235885 Bacteroidia Bacteroidales
## p__Spirochaetes <NA> <NA>
## c__Spirochaetia Spirochaetia <NA>
## o__Spirochaetales Spirochaetia Spirochaetales
## g__Catenibacterium Erysipelotrichia Erysipelotrichales
## Family Genus
## s__Prevotella_stercorea Prevotellaceae Prevotella
## t__GCF_000235885 Prevotellaceae Prevotella
## p__Spirochaetes <NA> <NA>
## c__Spirochaetia <NA> <NA>
## o__Spirochaetales <NA> <NA>
## g__Catenibacterium Erysipelotrichaceae Catenibacterium
## Species Strain
## s__Prevotella_stercorea Prevotella_stercorea <NA>
## t__GCF_000235885 Prevotella_stercorea GCF_000235885
## p__Spirochaetes <NA> <NA>
## c__Spirochaetia <NA> <NA>
## o__Spirochaetales <NA> <NA>
## g__Catenibacterium <NA> <NA>
dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
plotDispEsts(dds)
library("ggplot2")
theme_set(theme_bw())
sigtabgen = subset(sigtab, !is.na(Family))
# Phylum order
x = tapply(sigtabgen$log2FoldChange, sigtabgen$Phylum, function(x) max(x))
x = sort(x, TRUE)
sigtabgen$Phylum = factor(as.character(sigtabgen$Phylum), levels=names(x))
# Family order
x = tapply(sigtabgen$log2FoldChange, sigtabgen$Family, function(x) max(x))
x = sort(x, TRUE)
sigtabgen$Family = factor(as.character(sigtabgen$Family), levels=names(x))
ggplot(sigtabgen, aes(y=Family, x=log2FoldChange, color=Phylum)) +
geom_vline(xintercept = 0.0, color = "gray", size = 0.5) +
geom_point(size=6) +
theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
table(sample_data(Rampelli)$country, sample_data(Rampelli)$gender)
##
## female male
## ITA 7 4
## TZA 9 18
dds2 = phyloseq_to_deseq2(Rampelli, ~country + gender)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds2 = DESeq(dds2)
resultsNames(dds2)
## [1] "Intercept" "country_TZA_vs_ITA" "gender_male_vs_female"
The way the contrast is specified here sets ITA (Italy) as the numerator and TZA (Tanzania) as the denominator for fold-change calculations.
res2 = results(dds2, contrast=c("country", "ITA", "TZA"))
res2 = res2[order(res$padj, na.last=NA), ]
alpha = 0.01
sigtab2 = res2[which(res2$padj < alpha), ]
sigtab2 = cbind(as(sigtab2, "data.frame"), as(tax_table(Rampelli)[rownames(sigtab2), ], "matrix"))
head(sigtab2)
## baseMean log2FoldChange lfcSE stat
## p__Proteobacteria 809241.1 -3.233544 0.8065032 -4.009338
## p__Spirochaetes 572913.5 -21.664881 1.1506633 -18.828168
## p__Actinobacteria 581258.0 3.407046 0.8373670 4.068761
## c__Gammaproteobacteria 746714.3 -4.933632 0.9337371 -5.283748
## c__Spirochaetia 572913.5 -21.664881 1.1506633 -18.828168
## c__Actinobacteria 581258.0 3.407046 0.8373670 4.068761
## pvalue padj Kingdom Phylum
## p__Proteobacteria 6.088923e-05 1.049316e-04 Bacteria Proteobacteria
## p__Spirochaetes 4.438547e-79 3.870413e-77 Bacteria Spirochaetes
## p__Actinobacteria 4.726379e-05 8.177385e-05 Bacteria Actinobacteria
## c__Gammaproteobacteria 1.265670e-07 2.508328e-07 Bacteria Proteobacteria
## c__Spirochaetia 4.438547e-79 3.870413e-77 Bacteria Spirochaetes
## c__Actinobacteria 4.726379e-05 8.177385e-05 Bacteria Actinobacteria
## Class Order Family Genus Species
## p__Proteobacteria <NA> <NA> <NA> <NA> <NA>
## p__Spirochaetes <NA> <NA> <NA> <NA> <NA>
## p__Actinobacteria <NA> <NA> <NA> <NA> <NA>
## c__Gammaproteobacteria Gammaproteobacteria <NA> <NA> <NA> <NA>
## c__Spirochaetia Spirochaetia <NA> <NA> <NA> <NA>
## c__Actinobacteria Actinobacteria <NA> <NA> <NA> <NA>
## Strain
## p__Proteobacteria <NA>
## p__Spirochaetes <NA>
## p__Actinobacteria <NA>
## c__Gammaproteobacteria <NA>
## c__Spirochaetia <NA>
## c__Actinobacteria <NA>
Log fold-change vs. mean shows how well the homoskedasticity assumption holds, and identifies unusual OTUs where fold-change is at a limit.
plotMA(res, main="Difference vs. Average")
legend("bottomright", legend="differentially abundant", lty=-1, pch=1, col="red", bty='n')
par(mfrow=c(1,2))
plotCounts(dds2, gene="p__Actinobacteria", intgroup="country")
plotCounts(dds2, gene="p__Actinobacteria", intgroup="gender")
select <- rownames(sigtab2)
nt <- normTransform(dds2) # defaults to log2(x+1)
log2.norm.counts <- assay(nt)[select, ]
df <- as.data.frame(colData(dds2)[,c("country", "gender")])
pheatmap::pheatmap(log2.norm.counts, annotation_col=df, main="log2(counts + 1)")
Shannon Alpha diversity:
shannon = estimate_richness(Rampelli, measures="Shannon")
## Warning in estimate_richness(Rampelli, measures = "Shannon"): The data you have provided does not have
## any singletons. This is highly suspicious. Results of richness
## estimates (for example) are probably unreliable, or wrong, if you have already
## trimmed low-abundance taxa from the data.
##
## We recommended that you find the un-trimmed data and retry.
PCoA with Bray-Curtis dissimilarity:
ord = ordinate(Rampelli, method="PCoA", distance="bray")
Prepare a data.frame
with country, Shannon alpha diversity, and the first five Bray-Curtis ordination scores:
df = cbind(sample_data(Rampelli)[, "country", drop=FALSE],
shannon,
ord$vectors[, 1:3])
df$country <- factor(df$country)
par(mfrow=c(2,2))
for (i in seq(2, ncol(df))){
boxplot(df[, i] ~ df$country, main=colnames(df)[i])
}
Multivariate is problematic because using more than one PCoA score (axis) predicts country perfectly, causing numerical instability that prevents inference on the individual scores. Below limits the predictors to Shannon alpha diversity and only the first PCoA score. You could use all columns in df
except for country as predictors using the model formula country ~ .
.
fit = glm(country ~ Shannon + Axis.1,
control=glm.control(maxit=1000),
data=df, family=binomial("logit"))
summary(fit)
##
## Call:
## glm(formula = country ~ Shannon + Axis.1, family = binomial("logit"),
## data = df, control = glm.control(maxit = 1000))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9572 -0.5800 0.3504 0.6317 1.7853
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 16.224 8.422 1.926 0.05405 .
## Shannon -4.194 2.331 -1.799 0.07201 .
## Axis.1 5.367 2.015 2.664 0.00772 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 45.728 on 37 degrees of freedom
## Residual deviance: 31.831 on 35 degrees of freedom
## AIC: 37.831
##
## Number of Fisher Scoring iterations: 5
Univariate regression:
unires <- sapply(2:ncol(df), function(i){
fit = glm(df$country ~ df[, i], family=binomial("logit"))
summary(fit)$coefficients[2, ]
})
colnames(unires) = colnames(df)[2:ncol(df)]
unires
## Shannon Axis.1 Axis.2 Axis.3
## Estimate -3.43736988 5.266361941 -24.913582499 5.90950116
## Std. Error 1.96396817 2.003823924 9.340239466 2.75057706
## z value -1.75021669 2.628156036 -2.667338733 2.14845868
## Pr(>|z|) 0.08008093 0.008584913 0.007645457 0.03167734
write.csv(unires, file="univariate_shannonPCoA.csv")
Preparing the data object:
Rampellieset = curatedMetagenomicData("RampelliS_2015.metaphlan_bugs_list.stool",
dryrun = FALSE, counts = TRUE,
bugs.as.phyloseq = FALSE)[[1]]
## Working on RampelliS_2015.metaphlan_bugs_list.stool
## snapshotDate(): 2017-06-09
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## loading from cache '/Users/lwaldron//.ExperimentHub/481'
mseq = ExpressionSet2MRexperiment(Rampellieset)
## Loading required namespace: metagenomeSeq
Normalization as per package vignette (browseVignettes("metagenomeSeq")
):
library(metagenomeSeq)
## Loading required package: limma
##
## Attaching package: 'limma'
## The following object is masked from 'package:DESeq2':
##
## plotMA
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
## Loading required package: glmnet
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
## Loading required package: foreach
## Loaded glmnet 2.0-10
## Loading required package: RColorBrewer
p = cumNormStatFast(mseq)
## Default value being used.
mseq = cumNorm(mseq, p = p)
Performing the regression analysis:
pd <- pData(mseq)
mod <- model.matrix(~1 + country, data = pd)
mseqres = fitFeatureModel(mseq, mod)
View(MRcoefs(mseqres))
View(MRtable(mseqres))
The recurring problem with “perfect” models interfering with convergence is still present.
NOTE: It appears that metagenomeSeq uses the same model matrix for both the zero-inflation logistic model, and the log-linear negative binomial model. This is OK for inference on the count model, but I don’t like it for the sake of interpretability of the coefficients. I believe there is a way to over-ride this default, but I’m not yet sure how. However, it does not seem to have affected the coefficients of the linear model very much:
res.matched <- res[match(rownames(MRcoefs(mseqres)), rownames(res)), ]
plot(res.matched$log2FoldChange, MRcoefs(mseqres)$logFC)