In this analysis we are looking at the relative abundance of microbiome samples 250 healthy subjects from HMP. The relative abundance table gives us the “percentage” for each microbe detected in the sample (so for each sample the sum of relative abundances of all microbes should add to 1.0). The relative abundance table is available for 5 taxonimc levels, but here we start with the genus, followed by family, followed by order. The general method that we use here is PCA. For a given abundance table, we compute the top 2 PC’s and visualize the data in the PC space. We label the samples that are in PC space with their sampleID. This can visually tell us if there is any signal in the data. After viewing the distribution in PC space, we want to know which microbes are driving the top 2 PC’s. For this, we plot the top 2 PC loadings: that is, the weight that PCA assigns to a microbe for the PC space. A higher weight (positve or negative), indicates that the microbe contributes more the PC’s. We select the top microbes (microbes with largest loadings) and visualize how their abundance changes for different patients.
In summary the steps are as follows:
- Compute PCA on the relative abundance table and visualize how samples change in the top 2 PC space
- Visualize the loadings of the microbes for the top 2 PC and select microbes with highest magnitude loadings
- Visualize the relative abundance of the highest miagnitude microbes across different subjects
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:graphics':
##
## layout
library(reshape)
load_clean_data = function(file_path, tax.name){
df = read.csv(file_path, header = TRUE, stringsAsFactors = FALSE)
tax.level = which(names(df) == tax.name)
microbe.id = apply( df[ , names(df)[c(3:tax.level)]] , 1 , paste , collapse = "-" )
df.rel = cbind(data.frame(microbe.id = microbe.id), df[,grep("HE.", names(df))])
row.names(df.rel) = df.rel$microbe.id
df.T = t(df.rel[,-1])
colnames(df.T) = row.names(df.rel)
df.res = cbind(data.frame(sample.id = row.names(df.T)), df.T)
return(df.res)
}
pca.computer = function(df.x, take.log = TRUE, imputing.val = 1e-7){
numeric.df = df.x[,-1]
if (take.log == TRUE){
numeric.df = log(imputing.val + numeric.df)
}
pca = prcomp(numeric.df)
pca.res = cbind(df.x$sample.id, as.data.frame(pca$x))
names(pca.res)[1] = "sample.id"
pca.loadings = as.data.frame(pca$rotation)
pca.loadings$microbe = row.names(pca.loadings)
return(list(pca.res, pca.loadings))
}
top.microbes = function(pc.loadings, num.pcs = 6, additional.microbe = NULL, df.x){
loadings.mag = apply(as.matrix(pc.loadings[,c(1:2)]), 1, FUN = function(x) norm(x, type = "2"))
top.loadings = names(loadings.mag[order(loadings.mag, decreasing = TRUE)[c(1:num.pcs)]])
top.microbe.df = df.x[,c("sample.id", top.loadings, additional.microbe)]
top.microbe.m = melt(top.microbe.df, id.vars = "sample.id")
return(top.microbe.m)
}
plot.pca.proj = function(pca.proj){
ggplot(pca.proj, aes(x = PC1, y = PC2, label = sample.id)) +
geom_text(size = 8) -> p
return(p)
}
plot.pca.loadings = function(pca.loadings){
ggplot(pca.loadings, aes(x = PC1, y = PC2, label = microbe)) +
geom_text()-> p
return(p)
}
print.top.microbes = function(pca.loadings, num.microbes){
loadings.mag = apply(as.matrix(pca.loadings[,c(1:2)]), 1, FUN = function(x) norm(x, type = "2"))
return(names(loadings.mag[order(loadings.mag, decreasing = TRUE)[c(1:num.microbes)]]))
}
plot.time.series = function(pc.loadings, num.pcs = 6, additional.microbe = NULL, df.x){
df.m = top.microbes(pc.loadings, num.pcs, additional.microbe, df.x)
ggplot(df.m, aes(x = sample.id, y = value)) + facet_wrap(~variable, ncol = 1, scales = "free_y") +
geom_point() + xlab("") + ylab("") -> p
return(p)
}
df.genus = load_clean_data(tax.name = "genus",
file_path = "~/Documents/microbiome/processedData/ref_cov_abundance-clean/ref_genus_cov_abundance.txt-Table 1.csv")
pca.genus = pca.computer(df.genus, take.log = TRUE, imputing.val = 1e-7)
genus.proj = pca.genus[[1]]
genus.loadings = pca.genus[[2]]
p = plot.pca.proj(genus.proj) + ggtitle("PCA Genus")
ggplotly(p)
Plot the loadings for the genus PCA
p = plot.pca.loadings(genus.loadings) + ggtitle("Genus PCA Loadings")
ggplotly(p)
print.top.microbes(genus.loadings, num.microbes = 40)
## [1] "Bacteria-Verrucomicrobia-Verrucomicrobiae-Verrucomicrobiales-Verrucomicrobiaceae-Akkermansia"
## [2] "Bacteria-Actinobacteria-Actinobacteria-Coriobacteriales-Coriobacteriaceae-Olsenella"
## [3] "Bacteria-Actinobacteria-Actinobacteria-Coriobacteriales-Coriobacteriaceae-Slackia"
## [4] "Bacteria-Firmicutes-Clostridia-Clostridiales-Ruminococcaceae-Ethanoligenens"
## [5] "Bacteria-Proteobacteria-Betaproteobacteria-Burkholderiales-Oxalobacteraceae-Oxalobacter"
## [6] "Bacteria-Tenericutes-Mollicutes-Mycoplasmatales-Mycoplasmataceae-Mycoplasma"
## [7] "Bacteria-Firmicutes-Bacilli-Bacillales-Paenibacillaceae-Paenibacillus"
## [8] "Bacteria-Firmicutes-Clostridia-Clostridiales-Lachnospiraceae-Shuttleworthia"
## [9] "Bacteria-Firmicutes-Clostridia-Clostridiales-Clostridiales Family XIII. Incertae Sedis-Mogibacterium"
## [10] "Bacteria-Firmicutes-Clostridia-Clostridiales-Peptococcaceae-Desulfotomaculum"
## [11] "Bacteria-Firmicutes-Clostridia-Clostridiales-Eubacteriaceae-Anaerofustis"
## [12] "Bacteria-Actinobacteria-Actinobacteria-Actinomycetales-Corynebacteriaceae-Corynebacterium"
## [13] "Bacteria-Actinobacteria-Actinobacteria-Actinomycetales-Actinomycetaceae-Mobiluncus"
## [14] "Bacteria-Proteobacteria-Deltaproteobacteria-Desulfovibrionales-Desulfovibrionaceae-Desulfovibrio"
## [15] "Bacteria-Firmicutes-Clostridia-Clostridiales-Eubacteriaceae-Pseudoramibacter"
## [16] "Bacteria-Actinobacteria-Actinobacteria-Coriobacteriales-Coriobacteriaceae-Gordonibacter"
## [17] "Bacteria-Firmicutes-Negativicutes-Selenomonadales-Veillonellaceae-Mitsuokella"
## [18] "Bacteria-Synergistetes-Synergistia-Synergistales-Synergistaceae-Pyramidobacter"
## [19] "Bacteria-Firmicutes-Clostridia-Clostridiales-Peptococcaceae-Desulfitobacterium"
## [20] "Bacteria-Firmicutes-Erysipelotrichi-Erysipelotrichales-Erysipelotrichaceae-Solobacterium"
## [21] "Bacteria-Actinobacteria-Actinobacteria-Coriobacteriales-Coriobacteriaceae-Atopobium"
## [22] "Bacteria-Firmicutes-Clostridia-Clostridiales-Ruminococcaceae-Acetivibrio"
## [23] "Bacteria-Proteobacteria-Deltaproteobacteria-Desulfuromonadales-Geobacteraceae-Geobacter"
## [24] "Bacteria-Firmicutes-Clostridia-Clostridiales-Clostridiales Family XI. Incertae Sedis-Finegoldia"
## [25] "Bacteria-Cyanobacteria-none-Chroococcales-none-Synechococcus"
## [26] "Bacteria-Firmicutes-Bacilli-Bacillales-none-Gemella"
## [27] "Bacteria-Firmicutes-Clostridia-Clostridiales-Lachnospiraceae-Johnsonella"
## [28] "Bacteria-Firmicutes-Clostridia-Clostridiales-Lachnospiraceae-Stomatobaculum"
## [29] "Bacteria-Firmicutes-Clostridia-Clostridiales-Clostridiales Family XI. Incertae Sedis-Anaerococcus"
## [30] "Bacteria-Firmicutes-Erysipelotrichi-Erysipelotrichales-Erysipelotrichaceae-Eggerthia"
## [31] "Bacteria-Firmicutes-Clostridia-Thermoanaerobacterales-Thermoanaerobacterales Family III. Incertae Sedis-Thermoanaerobacterium"
## [32] "Bacteria-Firmicutes-Clostridia-Clostridiales-Clostridiaceae-Candidatus Arthromitus"
## [33] "Bacteria-Firmicutes-Negativicutes-Selenomonadales-Veillonellaceae-Selenomonas"
## [34] "Bacteria-Firmicutes-Clostridia-Thermoanaerobacterales-Thermoanaerobacterales Family III. Incertae Sedis-Caldicellulosiruptor"
## [35] "Bacteria-Spirochaetes-Spirochaetia-Spirochaetales-Brachyspiraceae-Brachyspira"
## [36] "Bacteria-Firmicutes-Bacilli-Bacillales-Staphylococcaceae-Staphylococcus"
## [37] "Bacteria-Firmicutes-Erysipelotrichi-Erysipelotrichales-Erysipelotrichaceae-Catenibacterium"
## [38] "Bacteria-Firmicutes-Clostridia-Clostridiales-Peptostreptococcaceae-Filifactor"
## [39] "Bacteria-Firmicutes-Clostridia-Clostridiales-Clostridiales Family XI. Incertae Sedis-Parvimonas"
## [40] "Bacteria-Tenericutes-Mollicutes-Acholeplasmatales-Acholeplasmataceae-Candidatus Phytoplasma"
Plot the top genus based on PC loadings for all subjects
p = plot.time.series(genus.loadings, df.x = df.genus) + ggtitle("Top Genus from PC1 and PC2 loadings")
ggplotly(p)
df.family = load_clean_data(tax.name = "family",
file_path = "~/Documents/microbiome/processedData/ref_cov_abundance-clean/ref_family_cov_abundance.txt-Table 1.csv")
pca.family = pca.computer(df.family, take.log = TRUE, imputing.val = 1e-7)
family.proj = pca.family[[1]]
family.loadings = pca.family[[2]]
p = plot.pca.proj(family.proj) + ggtitle("PCA Family")
ggplotly(p)
Plot the loadings for the family PCA
p = plot.pca.loadings(family.loadings) + ggtitle("Family PCA Loadings")
ggplotly(p)
print.top.microbes(family.loadings, num.microbes = 40)
## [1] "Bacteria-Verrucomicrobia-Verrucomicrobiae-Verrucomicrobiales-Verrucomicrobiaceae"
## [2] "Bacteria-Firmicutes-Bacilli-Bacillales-Paenibacillaceae"
## [3] "Bacteria-Proteobacteria-Alphaproteobacteria-Rhodobacterales-Rhodobacteraceae"
## [4] "Bacteria-Proteobacteria-Betaproteobacteria-Burkholderiales-Comamonadaceae"
## [5] "Bacteria-Proteobacteria-Gammaproteobacteria-Aeromonadales-Aeromonadaceae"
## [6] "Bacteria-Proteobacteria-Alphaproteobacteria-Rhizobiales-Rhizobiaceae"
## [7] "Bacteria-Proteobacteria-Alphaproteobacteria-Rhodospirillales-Rhodospirillaceae"
## [8] "Bacteria-Actinobacteria-Actinobacteria-Actinomycetales-Micrococcaceae"
## [9] "Bacteria-Proteobacteria-Gammaproteobacteria-Xanthomonadales-Xanthomonadaceae"
## [10] "Bacteria-Proteobacteria-Alphaproteobacteria-Rhizobiales-Methylobacteriaceae"
## [11] "Bacteria-Actinobacteria-Actinobacteria-Actinomycetales-Corynebacteriaceae"
## [12] "Bacteria-Proteobacteria-Gammaproteobacteria-Pseudomonadales-Pseudomonadaceae"
## [13] "Bacteria-Synergistetes-Synergistia-Synergistales-Synergistaceae"
## [14] "Bacteria-Chlorobi-Chlorobia-Chlorobiales-Chlorobiaceae"
## [15] "Bacteria-Proteobacteria-Deltaproteobacteria-Desulfuromonadales-Geobacteraceae"
## [16] "Bacteria-Proteobacteria-Alphaproteobacteria-Rhizobiales-Bradyrhizobiaceae"
## [17] "Bacteria-Actinobacteria-Actinobacteria-Actinomycetales-Streptomycetaceae"
## [18] "Bacteria-Proteobacteria-Alphaproteobacteria-Rhodospirillales-Acetobacteraceae"
## [19] "Bacteria-Deinococcus-Thermus-Deinococci-Thermales-Thermaceae"
## [20] "Bacteria-Proteobacteria-Gammaproteobacteria-Thiotrichales-Francisellaceae"
## [21] "Bacteria-Proteobacteria-Betaproteobacteria-Burkholderiales-Burkholderiaceae"
## [22] "Bacteria-Proteobacteria-Alphaproteobacteria-Caulobacterales-Caulobacteraceae"
## [23] "Bacteria-Deinococcus-Thermus-Deinococci-Deinococcales-Deinococcaceae"
## [24] "Archaea-Euryarchaeota-Methanococci-Methanococcales-Methanococcaceae"
## [25] "Bacteria-Aquificae-Aquificae-Aquificales-Hydrogenothermaceae"
## [26] "Bacteria-Cyanobacteria-none-Prochlorales-Prochlorococcaceae"
## [27] "Bacteria-Dictyoglomi-Dictyoglomia-Dictyoglomales-Dictyoglomaceae"
## [28] "Bacteria-Deferribacteres-Deferribacteres-Deferribacterales-Deferribacteraceae"
## [29] "Bacteria-Firmicutes-Clostridia-Thermoanaerobacterales-Thermoanaerobacteraceae"
## [30] "Bacteria-Aquificae-Aquificae-Aquificales-Desulfurobacteriaceae"
## [31] "Bacteria-none-none-Haloplasmatales-Haloplasmataceae"
## [32] "Bacteria-Proteobacteria-Epsilonproteobacteria-Nautiliales-Nautiliaceae"
## [33] "Bacteria-Firmicutes-Bacilli-Bacillales-Listeriaceae"
## [34] "Eukaryota-Ascomycota-Saccharomycetes-Saccharomycetales-Saccharomycetaceae"
## [35] "Bacteria-Nitrospirae-Nitrospira-Nitrospirales-Nitrospiraceae"
## [36] "Bacteria-Proteobacteria-Betaproteobacteria-Burkholderiales-Oxalobacteraceae"
## [37] "Bacteria-Thermodesulfobacteria-Thermodesulfobacteria-Thermodesulfobacteriales-Thermodesulfobacteriaceae"
## [38] "Bacteria-Proteobacteria-Gammaproteobacteria-Chromatiales-Ectothiorhodospiraceae"
## [39] "Archaea-Euryarchaeota-Methanobacteria-Methanobacteriales-Methanobacteriaceae"
## [40] "Bacteria-Firmicutes-Clostridia-Halanaerobiales-Halanaerobiaceae"
Plot the top family based on PC loadings for all subjects
p = plot.time.series(family.loadings, df.x = df.family) + ggtitle("Top Family from PC1 and PC2 loadings")
ggplotly(p)
df.order = load_clean_data(tax.name = "order",
file_path = "~/Documents/microbiome/processedData/ref_cov_abundance-clean/ref_order_cov_abundance.txt-Table 1.csv")
pca.order = pca.computer(df.order, take.log = TRUE, imputing.val = 1e-7)
order.proj = pca.order[[1]]
order.loadings = pca.order[[2]]
p = plot.pca.proj(order.proj) + ggtitle("PCA Order")
ggplotly(p)
Plot the loadings for the order PCA
p = plot.pca.loadings(order.loadings) + ggtitle("Order PCA Loadings")
ggplotly(p)
print.top.microbes(order.loadings, num.microbes = 40)
## [1] "Bacteria-Verrucomicrobia-Verrucomicrobiae-Verrucomicrobiales"
## [2] "Bacteria-Proteobacteria-Alphaproteobacteria-Rhodobacterales"
## [3] "Bacteria-Proteobacteria-Alphaproteobacteria-Rhodospirillales"
## [4] "Bacteria-Proteobacteria-Deltaproteobacteria-Desulfuromonadales"
## [5] "Bacteria-Proteobacteria-Gammaproteobacteria-Chromatiales"
## [6] "Bacteria-Synergistetes-Synergistia-Synergistales"
## [7] "Bacteria-Cyanobacteria-none-Chroococcales"
## [8] "Bacteria-Chlorobi-Chlorobia-Chlorobiales"
## [9] "Bacteria-Proteobacteria-Gammaproteobacteria-Xanthomonadales"
## [10] "Bacteria-Proteobacteria-Gammaproteobacteria-Aeromonadales"
## [11] "Archaea-Euryarchaeota-Methanococci-Methanococcales"
## [12] "Bacteria-Tenericutes-Mollicutes-Mycoplasmatales"
## [13] "Eukaryota-Ascomycota-Saccharomycetes-Saccharomycetales"
## [14] "Bacteria-Proteobacteria-Gammaproteobacteria-Alteromonadales"
## [15] "Bacteria-Dictyoglomi-Dictyoglomia-Dictyoglomales"
## [16] "Bacteria-Nitrospirae-Nitrospira-Nitrospirales"
## [17] "Bacteria-Deferribacteres-Deferribacteres-Deferribacterales"
## [18] "Bacteria-Cyanobacteria-none-Prochlorales"
## [19] "Bacteria-Thermodesulfobacteria-Thermodesulfobacteria-Thermodesulfobacteriales"
## [20] "Archaea-Euryarchaeota-Methanomicrobia-Methanosarcinales"
## [21] "Bacteria-Deinococcus-Thermus-Deinococci-Thermales"
## [22] "Bacteria-Tenericutes-Mollicutes-Acholeplasmatales"
## [23] "Bacteria-Firmicutes-Clostridia-Thermoanaerobacterales"
## [24] "Eukaryota-none-none-Dictyosteliida"
## [25] "Bacteria-Deinococcus-Thermus-Deinococci-Deinococcales"
## [26] "Archaea-Euryarchaeota-Methanobacteria-Methanobacteriales"
## [27] "Bacteria-Ignavibacteria-Ignavibacteria-Ignavibacteriales"
## [28] "Bacteria-none-none-Haloplasmatales"
## [29] "Bacteria-Cyanobacteria-none-Nostocales"
## [30] "Bacteria-Thermotogae-Thermotogae-Thermotogales"
## [31] "Bacteria-Proteobacteria-Alphaproteobacteria-Rickettsiales"
## [32] "Bacteria-Proteobacteria-Epsilonproteobacteria-Nautiliales"
## [33] "Bacteria-Aquificae-Aquificae-Aquificales"
## [34] "Bacteria-Firmicutes-Clostridia-Halanaerobiales"
## [35] "Bacteria-Proteobacteria-Alphaproteobacteria-Caulobacterales"
## [36] "Bacteria-Fibrobacteres-Fibrobacteria-Fibrobacterales"
## [37] "Archaea-Euryarchaeota-Methanomicrobia-Methanomicrobiales"
## [38] "Bacteria-Chloroflexi-Dehalococcoidia-Dehalococcoidales"
## [39] "Bacteria-Firmicutes-Clostridia-Natranaerobiales"
## [40] "Bacteria-Proteobacteria-Gammaproteobacteria-Thiotrichales"
Plot top order based on PC loadings for all subjects
p = plot.time.series(order.loadings, df.x = df.order) + ggtitle("Top Order from PC1 and PC2 loadings")
ggplotly(p)