#clear R env

rm(list = ls())

#load libraries

library(lefser)
require(dplyr)
require(tibble)
library(qiime2R)
library(circlize)
library(ggplot2)
library(microbiomeMarker)
library(RColorBrewer)
library(ggthemr)
library(plyr)
library(phyloseq)
library(tidyverse)
library(ComplexHeatmap)
library(cowplot)
library(corncob)

#set theme

ggthemr('fresh')

#convert qiime2 output to phyloseq object

phy<-qza_to_phyloseq("table_EcoZUR.qza", 
                     "rooted_tree_masked_aligned_rep_set_EcoZUR.qza", 
                     "taxonomy_EcoZUR.qza", 
                     "EcoZUR_meta_updated_pathotypes.tsv")
phy
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 4304 taxa and 401 samples ]
## sample_data() Sample Data:       [ 401 samples by 540 sample variables ]
## tax_table()   Taxonomy Table:    [ 4304 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 4304 tips and 4253 internal nodes ]

#filter dataset

#remove taxa where phylum is ambiguous (these are likely artifacts)
phy <- subset_taxa(phy, !is.na(Phylum) & !Phylum %in% c("", "uncharacterized"))

#remove chloroplast and mitochondrial sequences
phy<-subset_taxa(phy, (Class!="Cloroplast") | is.na(Class))
phy<-subset_taxa(phy, (Family!="mitochondria") | is.na(Class))

#remove taxa that occur <2x and in <10% of samples
phy = filter_taxa(phy, function(x) sum(x > 2) > (0.1*length(x)), TRUE)

#remove rotavirus-positive samples
phy<- subset_samples(phy, rota!="1")

#glom at the family level
phy_fam <-phy%>%
  tax_glom("Family")

#run corncob to test for differences in relative abundance, adjusting for age category and sex on abundance and age category, sex, and cc + pathotype on variability

d <- differentialTest(formula = ~pathotype_cc+age_cat+sex,
                                 phi.formula = ~pathotype_cc+age_cat+sex,
                                 formula_null = ~age_cat+sex,
                                 phi.formula_null = ~pathotype_cc+age_cat+sex,
                                 data = phy_fam,
                                 test = "Wald", boot = FALSE,
                                 fdr_cutoff = .05)

plot(d, level="Family")

sig_marker_corncob<-as.data.frame(otu_to_taxonomy(OTU = d$significant_taxa, data = phy_fam))
sig_marker_corncob
##                                                         otu_to_taxonomy(OTU = d$significant_taxa, data = phy_fam)
## d46e2205f0c6ecf67b51f83d111c509c Bacteria_Proteobacteria_Gammaproteobacteria_Enterobacteriales_Enterobacteriaceae
## 394eda29c886632f514dd94b58381186       Bacteria_Proteobacteria_Gammaproteobacteria_Pasteurellales_Pasteurellaceae
## 4b7712254e4dd8d5bc585d4a6c5c2e88             Bacteria_Fusobacteria_Fusobacteriia_Fusobacteriales_Fusobacteriaceae
## 7b28c20e72c6c95b3e604f0849245770 Bacteria_Verrucomicrobia_Verrucomicrobiae_Verrucomicrobiales_Verrucomicrobiaceae
## 2c982937754e6321f861027032db80f7                  Bacteria_Bacteroidetes_Bacteroidia_Bacteroidales_Bacteroidaceae

#run lefse

lefse_cc_path_fam <- run_lefse(
    phy_fam,
    wilcoxon_cutoff = 0.05,
    group = "pathotype_cc",
    kw_cutoff = 0.05,
    multigrp_strat = TRUE,
    lda_cutoff = 3,
    taxa_rank="Family"
)
lefse_cc_path_fam
## microbiomeMarker-class inherited from phyloseq-class
## normalization method:              [ CPM ]
## microbiome marker identity method: [ lefse ]
## marker_table() Marker Table:       [ 15 microbiome markers with 5 variables ]
## otu_table()    OTU Table:          [ 30 taxa and  366 samples ]
## sample_data()  Sample Data:        [ 366 samples by  540 sample variables ]
## tax_table()    Taxonomy Table:     [ 30 taxa by 1 taxonomic ranks ]
sig_marker_lefse<-data.frame(marker_table(lefse_cc_path_fam))

plot_abundance(lefse_cc_path_fam, group="pathotype_cc")

plot_ef_bar(lefse_cc_path_fam)

#plot boxplots of significant taxa

phy_fam<-phy_fam %>%
  transform_sample_counts(function(x) x / sum(x) )

phy_fam_sig<-subset_taxa(phy_fam, Family=="[Paraprevotellaceae]" | Family=="Bacteroidaceae" | Family == "Verrucomicrobiaceae"| Family == "Bifidobacteriaceae" | Family == "Clostridiaceae"  | Family == "Erysipelotrichaceae" | Family == "Enterobacteriaceae" | Family == "Porphyromonadaceae" | Family == "Methanobacteriaceae" | Family == "Pasteurellaceae" | Family == "Fusobacteriaceae" | Family == "GZKB119" | Family == "Rikenellaceae" | Family== "Eubacteriaceae" | Family == "Elusimicrobiaceae") 
taxa_names(phy_fam_sig) <- tax_table(phy_fam_sig)[ ,"Family"]

sample_data(phy_fam_sig)$pathotype_cc<-factor(sample_data(phy_fam_sig)$pathotype_cc, levels=c("pathpos_Case", "pathpos_Control", "pathneg_Case", "pathneg_Control"))

a<-psmelt(phy_fam_sig) %>%
  ggplot(data = ., aes(x = pathotype_cc, y = Abundance, fill=NA)) +
    geom_jitter(aes(color = pathotype_cc), height = 0, width = .4) +
    geom_boxplot(outlier.shape  = NA) +
    labs(x = "", y = "Abundance\n") +
    facet_wrap(~ OTU, scales = "free")+
    theme(panel.grid.major = element_blank(), axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank(),legend.title=element_blank(), legend.position=c(0.9,0.1))+
    scale_color_manual(values =c("coral2", "steelblue3", "darkgoldenrod2", "darkolivegreen4"), labels= c("Symptptomatic\nDEC infections", "Asymptomatic\nDEC infections", "Uninfected cases", "Uninfected controls"))

a

#sqrt transformed boxplots to improve visualization of low-abundance taxa

phy_fam_sig_transformed <-transform_sample_counts(phy_fam_sig, function(x) sqrt(x))

sample_data(phy_fam_sig)$pathotype_cc<-factor(sample_data(phy_fam_sig)$pathotype_cc, levels=c("pathpos_Case", "pathpos_Control", "pathneg_Case", "pathneg_Control"))

b<-psmelt(phy_fam_sig_transformed) %>%
  ggplot(data = ., aes(x = pathotype_cc, y = Abundance, fill=NA)) +
    geom_jitter(aes(color = pathotype_cc), height = 0, width = .4, alpha=0.75) +
    geom_boxplot(outlier.shape  = NA) +
    labs(x = "", y = "Abundance\n") +
    facet_wrap(~ OTU, scales = "free")+
    theme(panel.grid.major = element_blank(), axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank(),legend.title=element_blank(), legend.position=c(0.9,0.1))+
  scale_color_manual(values =c("coral2", "steelblue3", "darkgoldenrod2", "darkolivegreen4"), labels= c("Symptptomatic\nDEC infections", "Asymptomatic\nDEC infections", "Uninfected cases", "Uninfected controls"))
b

#heatmap

taxa_names(phy_fam_sig) <- tax_table(phy_fam_sig)[ ,"Family"]
sample_data<-phy_fam_sig %>%
  sample_data() %>%
  as.data.frame()%>%
  rownames_to_column()

samp<-data.frame(sample_data$rowname, sample_data$pathotype_cc)
samp<-na.omit(samp)
names(samp)[1] <- "rowname"

rel_abund<-otu_table(phy_fam_sig, taxa_are_rows = TRUE)
rel_abund_t<-rel_abund %>%
  t()%>%
  as.data.frame()%>%
  tibble::rownames_to_column()

rel_abund_join<-merge(rel_abund_t, samp, by="rowname")

rel_abund_pathpos_case<-rel_abund_join%>%
    subset(sample_data.pathotype_cc == "pathpos_Case")%>%
    subset(select=-c(1,17))
rel_abund_pathpos_control<-rel_abund_join%>%
  subset(sample_data.pathotype_cc == "pathpos_Control")%>%
    subset(select=-c(1,17))
rel_abund_pathneg_case<-rel_abund_join%>%
    subset(sample_data.pathotype_cc == "pathneg_Case")%>%
    subset(select=-c(1,17))
rel_abund_pathneg_control<-rel_abund_join%>%
  subset(sample_data.pathotype_cc == "pathneg_Control")%>%
    subset(select=-c(1,17))

mean_pathpos_case<-colMeans(rel_abund_pathpos_case)
mean_pathpos_control<-colMeans(rel_abund_pathpos_control)
mean_pathneg_case<-colMeans(rel_abund_pathneg_case)
mean_pathneg_control<-colMeans(rel_abund_pathneg_control)

mean_rel_abunds<-data.frame(
  pathpos_case=round(mean_pathpos_case, 5),
  pathpos_control=round(mean_pathpos_control, 5),
  pathneg_case=round(mean_pathneg_case, 5), 
  pathneg_control=round(mean_pathneg_control, 5))
mean_rel_abunds_mat<-data.matrix(mean_rel_abunds)

heatmap_annot<-read.csv("mean_heatmap_annot v2.csv")
stat_annot<-read.csv("corncob_lefse_annot.csv") %>%
  column_to_rownames(var="X")%>%
  as.data.frame.matrix() 

col = list(pathotype_cc=c("Symptomatic DEC infections"="coral2", "Asymptomatic DEC infections"="steelblue3", "Uninfected cases"="darkgoldenrod2", "Uninfected controls"="darkolivegreen4"), cc = c("Case" = "gray60", "Control" = "gray98"), path=c("Pathotype-negative"="gray98", "Pathotype-positive"="gray60"), path2=c("+"="gray60", "-"="gray98"))

levels=c("Symptomatic DEC infections", "Asymptomatic DEC infections", "Uninfected cases", "Uninfected controls")

ha<-HeatmapAnnotation(pathotype_cc=heatmap_annot$pathotype_cc, path=heatmap_annot$pathotype, cc=heatmap_annot$cc, col=col, annotation_label=c("Sample group","DEC infection status", "Diarrhea status"))

mycols_mean <- colorRamp2(breaks = c(0,(max(mean_rel_abunds_mat)/2),max(mean_rel_abunds_mat)), colors = c("gray98", "mediumpurple", "mediumpurple4"))
h_KW<-Heatmap(mean_rel_abunds_mat, 
              row_names_gp = gpar(fontsize = 12), 
              show_column_names=FALSE,  
              show_column_dend=FALSE,
              column_order = c("pathpos_case", "pathpos_control", "pathneg_case", "pathneg_control"),
              col=mycols_mean, 
              show_row_dend = FALSE, 
              row_names_max_width=max_text_width(rownames(mean_rel_abunds_mat)), 
              bottom_annotation=ha,   
              #right_annotation=ha2,
              #rect_gp=gpar(col="white", lwd=2),
              heatmap_legend_param=list(title=c("Rel. abund"), legend_height=unit(3, "cm")))

stat<-Heatmap(stat_annot, col=c("lightpink1", "darkslategray3"), width=unit(5, "mm"), name="Method", show_column_names = FALSE, show_row_names = TRUE, heatmap_legend_param=list(labels=c("LEfSe", "Corncob+LEfSe")))

h_KW+stat

#sqrt-transformed heatmap

taxa_names(phy_fam_sig_transformed) <- tax_table(phy_fam_sig_transformed)[ ,"Family"]
sample_data<-phy_fam_sig_transformed %>%
  sample_data() %>%
  as.data.frame()%>%
  rownames_to_column()

samp<-data.frame(sample_data$rowname, sample_data$pathotype_cc)
samp<-na.omit(samp)
names(samp)[1] <- "rowname"

rel_abund<-otu_table(phy_fam_sig_transformed, taxa_are_rows = TRUE)
rel_abund_t<-rel_abund %>%
  t()%>%
  as.data.frame()%>%
  tibble::rownames_to_column()

rel_abund_join<-merge(rel_abund_t, samp, by="rowname")

rel_abund_pathpos_case<-rel_abund_join%>%
    subset(sample_data.pathotype_cc == "pathpos_Case")%>%
    subset(select=-c(1,17))
rel_abund_pathpos_control<-rel_abund_join%>%
  subset(sample_data.pathotype_cc == "pathpos_Control")%>%
    subset(select=-c(1,17))
rel_abund_pathneg_case<-rel_abund_join%>%
    subset(sample_data.pathotype_cc == "pathneg_Case")%>%
    subset(select=-c(1,17))
rel_abund_pathneg_control<-rel_abund_join%>%
  subset(sample_data.pathotype_cc == "pathneg_Control")%>%
    subset(select=-c(1,17))

mean_pathpos_case<-colMeans(rel_abund_pathpos_case)
mean_pathpos_control<-colMeans(rel_abund_pathpos_control)
mean_pathneg_case<-colMeans(rel_abund_pathneg_case)
mean_pathneg_control<-colMeans(rel_abund_pathneg_control)

mean_rel_abunds<-data.frame(
  pathpos_case=round(mean_pathpos_case, 5),
  pathpos_control=round(mean_pathpos_control, 5),
  pathneg_case=round(mean_pathneg_case, 5), 
  pathneg_control=round(mean_pathneg_control, 5))
mean_rel_abunds_mat<-data.matrix(mean_rel_abunds)

heatmap_annot<-read.csv("mean_heatmap_annot v2.csv")
stat_annot<-read.csv("corncob_lefse_annot.csv") %>%
  column_to_rownames(var="X")%>%
  as.data.frame.matrix()

col = list(pathotype_cc=c("Symptomatic DEC infections"="coral2", "Asymptomatic DEC infections"="steelblue3", "Uninfected cases"="darkgoldenrod2", "Uninfected controls"="darkolivegreen4"), cc = c("Case" = "gray60", "Control" = "gray98"), path=c("Pathotype-negative"="gray98", "Pathotype-positive"="gray60"), path2=c("+"="gray60", "-"="gray98"))

ha<-HeatmapAnnotation(pathotype_cc=heatmap_annot$pathotype_cc, path=heatmap_annot$pathotype, cc=heatmap_annot$cc, col=col, annotation_label=c("Sample group","DEC infection status", "Diarrhea status"))

mycols_mean <- colorRamp2(breaks = c(0,(max(mean_rel_abunds_mat)/2),max(mean_rel_abunds_mat)), colors = c("gray98", "mediumpurple", "mediumpurple4"))
h_KW<-Heatmap(mean_rel_abunds_mat, 
              row_names_gp = gpar(fontsize = 12), 
              show_column_names=FALSE,  
              show_column_dend=FALSE,
              column_order = c("pathpos_case", "pathpos_control", "pathneg_case", "pathneg_control"),
              col=mycols_mean, 
              show_row_dend = FALSE, 
              row_names_max_width=max_text_width(rownames(mean_rel_abunds_mat)), 
              bottom_annotation=ha, 
              #right_annotation=ha2,
              #rect_gp=gpar(col="white", lwd=2),
              heatmap_legend_param=list(title=c("Sqrt rel. abund"), legend_height=unit(3, "cm")))

stat<-Heatmap(stat_annot, col=c("lightpink1", "darkslategray3"), width=unit(5, "mm"), name="Method", show_column_names = FALSE, show_row_names = TRUE, heatmap_legend_param=list(labels=c("LEfSe", "Corncob+LEfSe"))) 
j_KW<-h_KW+stat
j_KW