This lab performs differential abundance analysis in the Rampelli Italy/Tanzania dataset, optionally correcting for gender as a potential confounding factor.

Load data

Create a phyloseq object from the Rampelli Italy/Tanzania dataset, converting relative abundance to approximate counts:

suppressPackageStartupMessages(library(curatedMetagenomicData))
Rampelli = curatedMetagenomicData("RampelliS_2015.metaphlan_bugs_list.stool", 
                       dryrun = FALSE, counts = TRUE,
                       bugs.as.phyloseq = TRUE)[[1]]

Question: Look at the sample data by doing View(sample_data(Rampelli))

Sample selection and taxa pruning

Keep only taxa with at least 10 total counts across all samples. This is a good way to improve your power to detect differentially abundant taxa because these low-abundance taxa are less likely to be differentially abundant and increase multiple testing.

suppressPackageStartupMessages(library(phyloseq))
Rampelli = prune_taxa(taxa_sums(Rampelli) > 10, Rampelli)
Rampelli
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 727 taxa and 38 samples ]
## sample_data() Sample Data:       [ 38 samples by 18 sample variables ]
## tax_table()   Taxonomy Table:    [ 727 taxa by 8 taxonomic ranks ]

How many taxa were present before and after pruning?

Conversion to DESeq2 object

Prepare a DESeq2 object for a linear model with country as the predictor. More help on converting to DESeq2 from various formats here.

dds.data = phyloseq_to_deseq2(Rampelli, ~country)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

Negative Binomial log-linear model with DESeq2

Excellent DESeq2 manual here or vignettes(package="DESeq2")

dds = DESeq(dds.data)
res = results(dds)
res = res[order(res$padj, na.last=NA), ]
alpha = 0.01
sigtab = res[(res$padj < alpha), ]
sigtab = cbind(as(sigtab, "data.frame"), 
               as(tax_table(Rampelli)[rownames(sigtab), ], "matrix"))
head(sigtab)
##                           baseMean log2FoldChange    lfcSE     stat
## s__Prevotella_stercorea 1026450.69       22.49632 1.069746 21.02960
## t__GCF_000235885        1026450.69       22.49632 1.069746 21.02960
## p__Spirochaetes          572913.49       21.64745 1.176744 18.39606
## c__Spirochaetia          572913.49       21.64745 1.176744 18.39606
## o__Spirochaetales        572913.49       21.64745 1.176744 18.39606
## g__Catenibacterium        51752.17       30.00000 1.670805 17.95542
##                               pvalue         padj  Kingdom        Phylum
## s__Prevotella_stercorea 3.516377e-98 8.087668e-96 Bacteria Bacteroidetes
## t__GCF_000235885        3.516377e-98 8.087668e-96 Bacteria Bacteroidetes
## p__Spirochaetes         1.412718e-75 1.299700e-73 Bacteria  Spirochaetes
## c__Spirochaetia         1.412718e-75 1.299700e-73 Bacteria  Spirochaetes
## o__Spirochaetales       1.412718e-75 1.299700e-73 Bacteria  Spirochaetes
## g__Catenibacterium      4.353075e-72 2.503018e-70 Bacteria    Firmicutes
##                                    Class              Order
## s__Prevotella_stercorea      Bacteroidia      Bacteroidales
## t__GCF_000235885             Bacteroidia      Bacteroidales
## p__Spirochaetes                     <NA>               <NA>
## c__Spirochaetia             Spirochaetia               <NA>
## o__Spirochaetales           Spirochaetia     Spirochaetales
## g__Catenibacterium      Erysipelotrichia Erysipelotrichales
##                                      Family           Genus
## s__Prevotella_stercorea      Prevotellaceae      Prevotella
## t__GCF_000235885             Prevotellaceae      Prevotella
## p__Spirochaetes                        <NA>            <NA>
## c__Spirochaetia                        <NA>            <NA>
## o__Spirochaetales                      <NA>            <NA>
## g__Catenibacterium      Erysipelotrichaceae Catenibacterium
##                                      Species        Strain
## s__Prevotella_stercorea Prevotella_stercorea          <NA>
## t__GCF_000235885        Prevotella_stercorea GCF_000235885
## p__Spirochaetes                         <NA>          <NA>
## c__Spirochaetia                         <NA>          <NA>
## o__Spirochaetales                       <NA>          <NA>
## g__Catenibacterium                      <NA>          <NA>

Bayesian estimation of dispersion

dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
plotDispEsts(dds)

Plot results

library("ggplot2")
theme_set(theme_bw())
sigtabgen = subset(sigtab, !is.na(Family))
# Phylum order
x = tapply(sigtabgen$log2FoldChange, sigtabgen$Phylum, function(x) max(x))
x = sort(x, TRUE)
sigtabgen$Phylum = factor(as.character(sigtabgen$Phylum), levels=names(x))
# Family order
x = tapply(sigtabgen$log2FoldChange, sigtabgen$Family, function(x) max(x))
x = sort(x, TRUE)
sigtabgen$Family = factor(as.character(sigtabgen$Family), levels=names(x))
ggplot(sigtabgen, aes(y=Family, x=log2FoldChange, color=Phylum)) + 
  geom_vline(xintercept = 0.0, color = "gray", size = 0.5) +
  geom_point(size=6) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))

Correcting for gender as a potential confounder

table(sample_data(Rampelli)$country, sample_data(Rampelli)$gender)
##      
##       female male
##   ITA      7    4
##   TZA      9   18
dds2 = phyloseq_to_deseq2(Rampelli, ~country + gender)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds2 = DESeq(dds2)
resultsNames(dds2)
## [1] "Intercept"             "country_TZA_vs_ITA"    "gender_male_vs_female"

Correcting for gender as a potential confounder

The way the contrast is specified here sets ITA (Italy) as the numerator and TZA (Tanzania) as the denominator for fold-change calculations.

res2 = results(dds2, contrast=c("country", "ITA", "TZA"))
res2 = res2[order(res$padj, na.last=NA), ]
alpha = 0.01
sigtab2 = res2[which(res2$padj < alpha), ]
sigtab2 = cbind(as(sigtab2, "data.frame"), as(tax_table(Rampelli)[rownames(sigtab2), ], "matrix"))
head(sigtab2)
##                        baseMean log2FoldChange     lfcSE       stat
## p__Proteobacteria      809241.1      -3.233544 0.8065032  -4.009338
## p__Spirochaetes        572913.5     -21.664881 1.1506633 -18.828168
## p__Actinobacteria      581258.0       3.407046 0.8373670   4.068761
## c__Gammaproteobacteria 746714.3      -4.933632 0.9337371  -5.283748
## c__Spirochaetia        572913.5     -21.664881 1.1506633 -18.828168
## c__Actinobacteria      581258.0       3.407046 0.8373670   4.068761
##                              pvalue         padj  Kingdom         Phylum
## p__Proteobacteria      6.088923e-05 1.049316e-04 Bacteria Proteobacteria
## p__Spirochaetes        4.438547e-79 3.870413e-77 Bacteria   Spirochaetes
## p__Actinobacteria      4.726379e-05 8.177385e-05 Bacteria Actinobacteria
## c__Gammaproteobacteria 1.265670e-07 2.508328e-07 Bacteria Proteobacteria
## c__Spirochaetia        4.438547e-79 3.870413e-77 Bacteria   Spirochaetes
## c__Actinobacteria      4.726379e-05 8.177385e-05 Bacteria Actinobacteria
##                                      Class Order Family Genus Species
## p__Proteobacteria                     <NA>  <NA>   <NA>  <NA>    <NA>
## p__Spirochaetes                       <NA>  <NA>   <NA>  <NA>    <NA>
## p__Actinobacteria                     <NA>  <NA>   <NA>  <NA>    <NA>
## c__Gammaproteobacteria Gammaproteobacteria  <NA>   <NA>  <NA>    <NA>
## c__Spirochaetia               Spirochaetia  <NA>   <NA>  <NA>    <NA>
## c__Actinobacteria           Actinobacteria  <NA>   <NA>  <NA>    <NA>
##                        Strain
## p__Proteobacteria        <NA>
## p__Spirochaetes          <NA>
## p__Actinobacteria        <NA>
## c__Gammaproteobacteria   <NA>
## c__Spirochaetia          <NA>
## c__Actinobacteria        <NA>

MA plots

Log fold-change vs. mean shows how well the homoskedasticity assumption holds, and identifies unusual OTUs where fold-change is at a limit.

plotMA(res, main="Difference vs. Average")
legend("bottomright", legend="differentially abundant", lty=-1, pch=1, col="red", bty='n')

Plot individual counts

par(mfrow=c(1,2))
plotCounts(dds2, gene="p__Actinobacteria", intgroup="country")
plotCounts(dds2, gene="p__Actinobacteria", intgroup="gender")

Heatmap of differentially abundant taxa

select <- rownames(sigtab2)
nt <- normTransform(dds2) # defaults to log2(x+1)
log2.norm.counts <- assay(nt)[select, ]
df <- as.data.frame(colData(dds2)[,c("country", "gender")])
pheatmap::pheatmap(log2.norm.counts, annotation_col=df, main="log2(counts + 1)")

Regression on ordination vectors and alpha diversity

Shannon Alpha diversity:

shannon = estimate_richness(Rampelli, measures="Shannon")
## Warning in estimate_richness(Rampelli, measures = "Shannon"): The data you have provided does not have
## any singletons. This is highly suspicious. Results of richness
## estimates (for example) are probably unreliable, or wrong, if you have already
## trimmed low-abundance taxa from the data.
## 
## We recommended that you find the un-trimmed data and retry.

PCoA with Bray-Curtis dissimilarity:

ord = ordinate(Rampelli, method="PCoA", distance="bray")

Prepare a data.frame with country, Shannon alpha diversity, and the first five Bray-Curtis ordination scores:

df = cbind(sample_data(Rampelli)[, "country", drop=FALSE],
           shannon,
           ord$vectors[, 1:3])
df$country <- factor(df$country)
par(mfrow=c(2,2))
for (i in seq(2, ncol(df))){
  boxplot(df[, i] ~ df$country, main=colnames(df)[i])
}

Regression on ordination vectors and alpha diversity (cont’d)

Multivariate is problematic because using more than one PCoA score (axis) predicts country perfectly, causing numerical instability that prevents inference on the individual scores. Below limits the predictors to Shannon alpha diversity and only the first PCoA score. You could use all columns in df except for country as predictors using the model formula country ~ ..

fit = glm(country ~ Shannon + Axis.1, 
          control=glm.control(maxit=1000),
          data=df, family=binomial("logit"))
summary(fit)
## 
## Call:
## glm(formula = country ~ Shannon + Axis.1, family = binomial("logit"), 
##     data = df, control = glm.control(maxit = 1000))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9572  -0.5800   0.3504   0.6317   1.7853  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   16.224      8.422   1.926  0.05405 . 
## Shannon       -4.194      2.331  -1.799  0.07201 . 
## Axis.1         5.367      2.015   2.664  0.00772 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 45.728  on 37  degrees of freedom
## Residual deviance: 31.831  on 35  degrees of freedom
## AIC: 37.831
## 
## Number of Fisher Scoring iterations: 5

Univariate regression:

unires <- sapply(2:ncol(df), function(i){
  fit = glm(df$country ~ df[, i], family=binomial("logit"))
  summary(fit)$coefficients[2, ]
})
colnames(unires) = colnames(df)[2:ncol(df)]
unires
##                Shannon      Axis.1        Axis.2     Axis.3
## Estimate   -3.43736988 5.266361941 -24.913582499 5.90950116
## Std. Error  1.96396817 2.003823924   9.340239466 2.75057706
## z value    -1.75021669 2.628156036  -2.667338733 2.14845868
## Pr(>|z|)    0.08008093 0.008584913   0.007645457 0.03167734
write.csv(unires, file="univariate_shannonPCoA.csv")

Fitting a zero-inflated log-normal model with metagenomeSeq

metagenomeSeq

Preparing the data object:

Rampellieset = curatedMetagenomicData("RampelliS_2015.metaphlan_bugs_list.stool", 
                       dryrun = FALSE, counts = TRUE,
                       bugs.as.phyloseq = FALSE)[[1]]
## Working on RampelliS_2015.metaphlan_bugs_list.stool
## snapshotDate(): 2017-06-09
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## loading from cache '/Users/lwaldron//.ExperimentHub/481'
mseq = ExpressionSet2MRexperiment(Rampellieset)
## Loading required namespace: metagenomeSeq

Normalization as per package vignette (browseVignettes("metagenomeSeq")):

library(metagenomeSeq)
## Loading required package: limma
## 
## Attaching package: 'limma'
## The following object is masked from 'package:DESeq2':
## 
##     plotMA
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
## Loading required package: glmnet
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## Loading required package: foreach
## Loaded glmnet 2.0-10
## Loading required package: RColorBrewer
p = cumNormStatFast(mseq)
## Default value being used.
mseq = cumNorm(mseq, p = p)

Performing the regression analysis:

pd <- pData(mseq)
mod <- model.matrix(~1 + country, data = pd)
mseqres = fitFeatureModel(mseq, mod)
View(MRcoefs(mseqres))
View(MRtable(mseqres))

The recurring problem with “perfect” models interfering with convergence is still present.

NOTE: It appears that metagenomeSeq uses the same model matrix for both the zero-inflation logistic model, and the log-linear negative binomial model. This is OK for inference on the count model, but I don’t like it for the sake of interpretability of the coefficients. I believe there is a way to over-ride this default, but I’m not yet sure how. However, it does not seem to have affected the coefficients of the linear model very much:

res.matched <- res[match(rownames(MRcoefs(mseqres)), rownames(res)), ]
plot(res.matched$log2FoldChange, MRcoefs(mseqres)$logFC)