#clear R env
rm(list = ls())
#load libraries
library(lefser)
require(dplyr)
require(tibble)
library(qiime2R)
library(circlize)
library(ggplot2)
library(microbiomeMarker)
library(RColorBrewer)
library(ggthemr)
library(plyr)
library(phyloseq)
library(tidyverse)
library(ComplexHeatmap)
library(cowplot)
library(corncob)
#set theme
ggthemr('fresh')
#convert qiime2 output to phyloseq object
phy<-qza_to_phyloseq("table_EcoZUR.qza",
"rooted_tree_masked_aligned_rep_set_EcoZUR.qza",
"taxonomy_EcoZUR.qza",
"EcoZUR_meta_updated_pathotypes.tsv")
phy
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 4304 taxa and 401 samples ]
## sample_data() Sample Data: [ 401 samples by 540 sample variables ]
## tax_table() Taxonomy Table: [ 4304 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 4304 tips and 4253 internal nodes ]
#filter dataset
#remove taxa where phylum is ambiguous (these are likely artifacts)
phy <- subset_taxa(phy, !is.na(Phylum) & !Phylum %in% c("", "uncharacterized"))
#remove chloroplast and mitochondrial sequences
phy<-subset_taxa(phy, (Class!="Cloroplast") | is.na(Class))
phy<-subset_taxa(phy, (Family!="mitochondria") | is.na(Class))
#remove taxa that occur <2x and in <10% of samples
phy = filter_taxa(phy, function(x) sum(x > 2) > (0.1*length(x)), TRUE)
#remove rotavirus-positive samples
phy<- subset_samples(phy, rota!="1")
#glom at the family level
phy_fam <-phy%>%
tax_glom("Family")
#run corncob to test for differences in relative abundance, adjusting for age category and sex on abundance and age category, sex, and cc + pathotype on variability
d <- differentialTest(formula = ~pathotype_cc+age_cat+sex,
phi.formula = ~pathotype_cc+age_cat+sex,
formula_null = ~age_cat+sex,
phi.formula_null = ~pathotype_cc+age_cat+sex,
data = phy_fam,
test = "Wald", boot = FALSE,
fdr_cutoff = .05)
plot(d, level="Family")
sig_marker_corncob<-as.data.frame(otu_to_taxonomy(OTU = d$significant_taxa, data = phy_fam))
sig_marker_corncob
## otu_to_taxonomy(OTU = d$significant_taxa, data = phy_fam)
## d46e2205f0c6ecf67b51f83d111c509c Bacteria_Proteobacteria_Gammaproteobacteria_Enterobacteriales_Enterobacteriaceae
## 394eda29c886632f514dd94b58381186 Bacteria_Proteobacteria_Gammaproteobacteria_Pasteurellales_Pasteurellaceae
## 4b7712254e4dd8d5bc585d4a6c5c2e88 Bacteria_Fusobacteria_Fusobacteriia_Fusobacteriales_Fusobacteriaceae
## 7b28c20e72c6c95b3e604f0849245770 Bacteria_Verrucomicrobia_Verrucomicrobiae_Verrucomicrobiales_Verrucomicrobiaceae
## 2c982937754e6321f861027032db80f7 Bacteria_Bacteroidetes_Bacteroidia_Bacteroidales_Bacteroidaceae
#run lefse
lefse_cc_path_fam <- run_lefse(
phy_fam,
wilcoxon_cutoff = 0.05,
group = "pathotype_cc",
kw_cutoff = 0.05,
multigrp_strat = TRUE,
lda_cutoff = 3,
taxa_rank="Family"
)
lefse_cc_path_fam
## microbiomeMarker-class inherited from phyloseq-class
## normalization method: [ CPM ]
## microbiome marker identity method: [ lefse ]
## marker_table() Marker Table: [ 15 microbiome markers with 5 variables ]
## otu_table() OTU Table: [ 30 taxa and 366 samples ]
## sample_data() Sample Data: [ 366 samples by 540 sample variables ]
## tax_table() Taxonomy Table: [ 30 taxa by 1 taxonomic ranks ]
sig_marker_lefse<-data.frame(marker_table(lefse_cc_path_fam))
plot_abundance(lefse_cc_path_fam, group="pathotype_cc")
plot_ef_bar(lefse_cc_path_fam)
#plot boxplots of significant taxa
phy_fam<-phy_fam %>%
transform_sample_counts(function(x) x / sum(x) )
phy_fam_sig<-subset_taxa(phy_fam, Family=="[Paraprevotellaceae]" | Family=="Bacteroidaceae" | Family == "Verrucomicrobiaceae"| Family == "Bifidobacteriaceae" | Family == "Clostridiaceae" | Family == "Erysipelotrichaceae" | Family == "Enterobacteriaceae" | Family == "Porphyromonadaceae" | Family == "Methanobacteriaceae" | Family == "Pasteurellaceae" | Family == "Fusobacteriaceae" | Family == "GZKB119" | Family == "Rikenellaceae" | Family== "Eubacteriaceae" | Family == "Elusimicrobiaceae")
taxa_names(phy_fam_sig) <- tax_table(phy_fam_sig)[ ,"Family"]
sample_data(phy_fam_sig)$pathotype_cc<-factor(sample_data(phy_fam_sig)$pathotype_cc, levels=c("pathpos_Case", "pathpos_Control", "pathneg_Case", "pathneg_Control"))
a<-psmelt(phy_fam_sig) %>%
ggplot(data = ., aes(x = pathotype_cc, y = Abundance, fill=NA)) +
geom_jitter(aes(color = pathotype_cc), height = 0, width = .4) +
geom_boxplot(outlier.shape = NA) +
labs(x = "", y = "Abundance\n") +
facet_wrap(~ OTU, scales = "free")+
theme(panel.grid.major = element_blank(), axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank(),legend.title=element_blank(), legend.position=c(0.9,0.1))+
scale_color_manual(values =c("coral2", "steelblue3", "darkgoldenrod2", "darkolivegreen4"), labels= c("Symptptomatic\nDEC infections", "Asymptomatic\nDEC infections", "Uninfected cases", "Uninfected controls"))
a
#sqrt transformed boxplots to improve visualization of low-abundance taxa
phy_fam_sig_transformed <-transform_sample_counts(phy_fam_sig, function(x) sqrt(x))
sample_data(phy_fam_sig)$pathotype_cc<-factor(sample_data(phy_fam_sig)$pathotype_cc, levels=c("pathpos_Case", "pathpos_Control", "pathneg_Case", "pathneg_Control"))
b<-psmelt(phy_fam_sig_transformed) %>%
ggplot(data = ., aes(x = pathotype_cc, y = Abundance, fill=NA)) +
geom_jitter(aes(color = pathotype_cc), height = 0, width = .4, alpha=0.75) +
geom_boxplot(outlier.shape = NA) +
labs(x = "", y = "Abundance\n") +
facet_wrap(~ OTU, scales = "free")+
theme(panel.grid.major = element_blank(), axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank(),legend.title=element_blank(), legend.position=c(0.9,0.1))+
scale_color_manual(values =c("coral2", "steelblue3", "darkgoldenrod2", "darkolivegreen4"), labels= c("Symptptomatic\nDEC infections", "Asymptomatic\nDEC infections", "Uninfected cases", "Uninfected controls"))
b
#heatmap
taxa_names(phy_fam_sig) <- tax_table(phy_fam_sig)[ ,"Family"]
sample_data<-phy_fam_sig %>%
sample_data() %>%
as.data.frame()%>%
rownames_to_column()
samp<-data.frame(sample_data$rowname, sample_data$pathotype_cc)
samp<-na.omit(samp)
names(samp)[1] <- "rowname"
rel_abund<-otu_table(phy_fam_sig, taxa_are_rows = TRUE)
rel_abund_t<-rel_abund %>%
t()%>%
as.data.frame()%>%
tibble::rownames_to_column()
rel_abund_join<-merge(rel_abund_t, samp, by="rowname")
rel_abund_pathpos_case<-rel_abund_join%>%
subset(sample_data.pathotype_cc == "pathpos_Case")%>%
subset(select=-c(1,17))
rel_abund_pathpos_control<-rel_abund_join%>%
subset(sample_data.pathotype_cc == "pathpos_Control")%>%
subset(select=-c(1,17))
rel_abund_pathneg_case<-rel_abund_join%>%
subset(sample_data.pathotype_cc == "pathneg_Case")%>%
subset(select=-c(1,17))
rel_abund_pathneg_control<-rel_abund_join%>%
subset(sample_data.pathotype_cc == "pathneg_Control")%>%
subset(select=-c(1,17))
mean_pathpos_case<-colMeans(rel_abund_pathpos_case)
mean_pathpos_control<-colMeans(rel_abund_pathpos_control)
mean_pathneg_case<-colMeans(rel_abund_pathneg_case)
mean_pathneg_control<-colMeans(rel_abund_pathneg_control)
mean_rel_abunds<-data.frame(
pathpos_case=round(mean_pathpos_case, 5),
pathpos_control=round(mean_pathpos_control, 5),
pathneg_case=round(mean_pathneg_case, 5),
pathneg_control=round(mean_pathneg_control, 5))
mean_rel_abunds_mat<-data.matrix(mean_rel_abunds)
heatmap_annot<-read.csv("mean_heatmap_annot v2.csv")
stat_annot<-read.csv("corncob_lefse_annot.csv") %>%
column_to_rownames(var="X")%>%
as.data.frame.matrix()
col = list(pathotype_cc=c("Symptomatic DEC infections"="coral2", "Asymptomatic DEC infections"="steelblue3", "Uninfected cases"="darkgoldenrod2", "Uninfected controls"="darkolivegreen4"), cc = c("Case" = "gray60", "Control" = "gray98"), path=c("Pathotype-negative"="gray98", "Pathotype-positive"="gray60"), path2=c("+"="gray60", "-"="gray98"))
levels=c("Symptomatic DEC infections", "Asymptomatic DEC infections", "Uninfected cases", "Uninfected controls")
ha<-HeatmapAnnotation(pathotype_cc=heatmap_annot$pathotype_cc, path=heatmap_annot$pathotype, cc=heatmap_annot$cc, col=col, annotation_label=c("Sample group","DEC infection status", "Diarrhea status"))
mycols_mean <- colorRamp2(breaks = c(0,(max(mean_rel_abunds_mat)/2),max(mean_rel_abunds_mat)), colors = c("gray98", "mediumpurple", "mediumpurple4"))
h_KW<-Heatmap(mean_rel_abunds_mat,
row_names_gp = gpar(fontsize = 12),
show_column_names=FALSE,
show_column_dend=FALSE,
column_order = c("pathpos_case", "pathpos_control", "pathneg_case", "pathneg_control"),
col=mycols_mean,
show_row_dend = FALSE,
row_names_max_width=max_text_width(rownames(mean_rel_abunds_mat)),
bottom_annotation=ha,
#right_annotation=ha2,
#rect_gp=gpar(col="white", lwd=2),
heatmap_legend_param=list(title=c("Rel. abund"), legend_height=unit(3, "cm")))
stat<-Heatmap(stat_annot, col=c("lightpink1", "darkslategray3"), width=unit(5, "mm"), name="Method", show_column_names = FALSE, show_row_names = TRUE, heatmap_legend_param=list(labels=c("LEfSe", "Corncob+LEfSe")))
h_KW+stat
#sqrt-transformed heatmap
taxa_names(phy_fam_sig_transformed) <- tax_table(phy_fam_sig_transformed)[ ,"Family"]
sample_data<-phy_fam_sig_transformed %>%
sample_data() %>%
as.data.frame()%>%
rownames_to_column()
samp<-data.frame(sample_data$rowname, sample_data$pathotype_cc)
samp<-na.omit(samp)
names(samp)[1] <- "rowname"
rel_abund<-otu_table(phy_fam_sig_transformed, taxa_are_rows = TRUE)
rel_abund_t<-rel_abund %>%
t()%>%
as.data.frame()%>%
tibble::rownames_to_column()
rel_abund_join<-merge(rel_abund_t, samp, by="rowname")
rel_abund_pathpos_case<-rel_abund_join%>%
subset(sample_data.pathotype_cc == "pathpos_Case")%>%
subset(select=-c(1,17))
rel_abund_pathpos_control<-rel_abund_join%>%
subset(sample_data.pathotype_cc == "pathpos_Control")%>%
subset(select=-c(1,17))
rel_abund_pathneg_case<-rel_abund_join%>%
subset(sample_data.pathotype_cc == "pathneg_Case")%>%
subset(select=-c(1,17))
rel_abund_pathneg_control<-rel_abund_join%>%
subset(sample_data.pathotype_cc == "pathneg_Control")%>%
subset(select=-c(1,17))
mean_pathpos_case<-colMeans(rel_abund_pathpos_case)
mean_pathpos_control<-colMeans(rel_abund_pathpos_control)
mean_pathneg_case<-colMeans(rel_abund_pathneg_case)
mean_pathneg_control<-colMeans(rel_abund_pathneg_control)
mean_rel_abunds<-data.frame(
pathpos_case=round(mean_pathpos_case, 5),
pathpos_control=round(mean_pathpos_control, 5),
pathneg_case=round(mean_pathneg_case, 5),
pathneg_control=round(mean_pathneg_control, 5))
mean_rel_abunds_mat<-data.matrix(mean_rel_abunds)
heatmap_annot<-read.csv("mean_heatmap_annot v2.csv")
stat_annot<-read.csv("corncob_lefse_annot.csv") %>%
column_to_rownames(var="X")%>%
as.data.frame.matrix()
col = list(pathotype_cc=c("Symptomatic DEC infections"="coral2", "Asymptomatic DEC infections"="steelblue3", "Uninfected cases"="darkgoldenrod2", "Uninfected controls"="darkolivegreen4"), cc = c("Case" = "gray60", "Control" = "gray98"), path=c("Pathotype-negative"="gray98", "Pathotype-positive"="gray60"), path2=c("+"="gray60", "-"="gray98"))
ha<-HeatmapAnnotation(pathotype_cc=heatmap_annot$pathotype_cc, path=heatmap_annot$pathotype, cc=heatmap_annot$cc, col=col, annotation_label=c("Sample group","DEC infection status", "Diarrhea status"))
mycols_mean <- colorRamp2(breaks = c(0,(max(mean_rel_abunds_mat)/2),max(mean_rel_abunds_mat)), colors = c("gray98", "mediumpurple", "mediumpurple4"))
h_KW<-Heatmap(mean_rel_abunds_mat,
row_names_gp = gpar(fontsize = 12),
show_column_names=FALSE,
show_column_dend=FALSE,
column_order = c("pathpos_case", "pathpos_control", "pathneg_case", "pathneg_control"),
col=mycols_mean,
show_row_dend = FALSE,
row_names_max_width=max_text_width(rownames(mean_rel_abunds_mat)),
bottom_annotation=ha,
#right_annotation=ha2,
#rect_gp=gpar(col="white", lwd=2),
heatmap_legend_param=list(title=c("Sqrt rel. abund"), legend_height=unit(3, "cm")))
stat<-Heatmap(stat_annot, col=c("lightpink1", "darkslategray3"), width=unit(5, "mm"), name="Method", show_column_names = FALSE, show_row_names = TRUE, heatmap_legend_param=list(labels=c("LEfSe", "Corncob+LEfSe")))
j_KW<-h_KW+stat
j_KW