Core Configuration & Libraries

library(dada2)
library(phyloseq)
library(DECIPHER)
library(phangorn)
library(biomformat)
library(ggplot2)
library(vegan)
library(ape)
library(writexl)
library(DESeq2)
library(dplyr)
library(DT)
library(microbiome)
library(Biostrings)
library(ggpubr)
library(indicspecies)
library(betapart)
library(igraph)
library(Hmisc)
library(reshape2)
library(ggrepel)

# ---- CORE CONFIGURATION ----
path <- "C:/Users/Owner/Documents/BSP_2022/BSP_2025/Metabarcoding_training"
cutadapt_path <- "C:/Users/Owner/Downloads/cutadapt" # Verify this matches your system installation
system2(cutadapt_path, args = "--version") # Run shell commands from R

# Standard Universal Primers (To be tested via Cutadapt)
fwd_16S <- "CCTACGGGNGGCWGCAG"        # V3 Forward
rev_16S <- "GACTACHVGGGTATCTAATCC"   # V4 Reverse
fwd_ITS <- "GCATCGATGAAGAACGCAGC"    # ITS3 Forward
rev_ITS <- "TCCTCCGCTTATTGATATGC"    # ITS4 Reverse

PHASE 1: BACTERIAL 16S PIPELINE

cat("\n\n======================================================\n")
cat("STARTING 16S PIPELINE - WATCH CUTADAPT RETENTION RATES\n")
cat("======================================================\n\n")

fnFs_16S <- sort(list.files(path, pattern = "__1_1\\.fastq\\.gz$", full.names = TRUE))
fnRs_16S <- sort(list.files(path, pattern = "__1_2\\.fastq\\.gz$", full.names = TRUE))
sample.names_16S <- sapply(strsplit(basename(fnFs_16S), "__1"), `[`, 1)

path_cut_16S <- file.path(path, "cutadapt_16S")
dir.create(path_cut_16S, showWarnings = FALSE)
fnFs_cut_16S <- file.path(path_cut_16S, paste0(sample.names_16S, "_F_cut.fastq.gz"))
fnRs_cut_16S <- file.path(path_cut_16S, paste0(sample.names_16S, "_R_cut.fastq.gz"))

# Cutadapt Trimming
for(i in seq_along(fnFs_16S)) {
  system2(cutadapt_path, args = c("-g", fwd_16S, "-G", rev_16S,
                                  "-o", shQuote(fnFs_cut_16S[i]), "-p", shQuote(fnRs_cut_16S[i]),
                                  shQuote(fnFs_16S[i]), shQuote(fnRs_16S[i]),
                                  "--discard-untrimmed", "--minimum-length=100"))
}

# DADA2 Filtering
filtFs_16S <- file.path(path, "filtered_16S", paste0(sample.names_16S, "_F_filt.fastq.gz"))
filtRs_16S <- file.path(path, "filtered_16S", paste0(sample.names_16S, "_R_filt.fastq.gz"))
names(filtFs_16S) <- sample.names_16S; names(filtRs_16S) <- sample.names_16S

out_16S <- filterAndTrim(fnFs_cut_16S, filtFs_16S, fnRs_cut_16S, filtRs_16S, 
                         truncLen=c(250, 200), maxN=0, maxEE=c(2,2), truncQ=2, 
                         rm.phix=TRUE, compress=TRUE, multithread=TRUE)

# 16S Quality Control Plots
print(plotQualityProfile(fnFs_16S[1:2]) + ggtitle("RAW Forward Reads (16S)"))

print(plotQualityProfile(filtFs_16S[1:2]) + ggtitle("FILTERED Forward Reads (16S)"))

# Error Learning & Core Algorithm
errF_16S <- learnErrors(filtFs_16S, multithread=TRUE)
errR_16S <- learnErrors(filtRs_16S, multithread=TRUE)
mergers_16S <- mergePairs(dada(filtFs_16S, err=errF_16S, multithread=TRUE), filtFs_16S, 
                          dada(filtRs_16S, err=errR_16S, multithread=TRUE), filtRs_16S, verbose=TRUE)
seqtab_nochim_16S <- removeBimeraDenovo(rowsum(makeSequenceTable(mergers_16S), group = sample.names_16S), method="consensus", multithread=TRUE)

# Taxonomy (SILVA)
seqtab_final_16S <- seqtab_nochim_16S[, nchar(getSequences(seqtab_nochim_16S)) >= 350]
taxa_16S <- assignTaxonomy(seqtab_final_16S, file.path(path, "silva_nr99_v138.2_toGenus_trainset.fa.gz"), multithread=TRUE)
taxa_16S <- addSpecies(taxa_16S, file.path(path, "silva_v138.2_assignSpecies.fa.gz"))

PHASE 2: FUNGAL / YEAST ITS PIPELINE

cat("\n\n======================================================\n")
cat("STARTING ITS PIPELINE - WATCH CUTADAPT RETENTION RATES\n")
cat("======================================================\n\n")

fnFs_ITS <- sort(list.files(path, pattern = "__2_1\\.fastq\\.gz$", full.names = TRUE))
fnRs_ITS <- sort(list.files(path, pattern = "__2_2\\.fastq\\.gz$", full.names = TRUE))
sample.names_ITS <- sapply(strsplit(basename(fnFs_ITS), "__2"), `[`, 1)

path_cut_ITS <- file.path(path, "cutadapt_ITS")
dir.create(path_cut_ITS, showWarnings = FALSE)
fnFs_cut_ITS <- file.path(path_cut_ITS, paste0(sample.names_ITS, "_F_cut.fastq.gz"))
fnRs_cut_ITS <- file.path(path_cut_ITS, paste0(sample.names_ITS, "_R_cut.fastq.gz"))

# Cutadapt Trimming
for(i in seq_along(fnFs_ITS)) {
  system2(cutadapt_path, args = c("-g", fwd_ITS, "-G", rev_ITS,
                                  "-o", shQuote(fnFs_cut_ITS[i]), "-p", shQuote(fnRs_cut_ITS[i]),
                                  shQuote(fnFs_ITS[i]), shQuote(fnRs_ITS[i]),
                                  "--discard-untrimmed", "--minimum-length=50"))
}

# DADA2 Filtering
filtFs_ITS <- file.path(path, "filtered_ITS", paste0(sample.names_ITS, "_F_filt.fastq.gz"))
filtRs_ITS <- file.path(path, "filtered_ITS", paste0(sample.names_ITS, "_R_filt.fastq.gz"))
names(filtFs_ITS) <- sample.names_ITS; names(filtRs_ITS) <- sample.names_ITS

out_ITS <- filterAndTrim(fnFs_cut_ITS, filtFs_ITS, fnRs_cut_ITS, filtRs_ITS, 
                         maxN=0, maxEE=c(2,2), truncQ=2, minLen=50, 
                         rm.phix=TRUE, compress=TRUE, multithread=TRUE)

# ITS Quality Control Plots
print(plotQualityProfile(fnFs_ITS[1:2]) + ggtitle("RAW Forward Reads (ITS)"))

print(plotQualityProfile(filtFs_ITS[1:2]) + ggtitle("FILTERED Forward Reads (ITS)"))

# Error Learning & Core Algorithm
errF_ITS <- learnErrors(filtFs_ITS, multithread=TRUE)
errR_ITS <- learnErrors(filtRs_ITS, multithread=TRUE)
mergers_ITS <- mergePairs(dada(filtFs_ITS, err=errF_ITS, multithread=TRUE), filtFs_ITS, 
                          dada(filtRs_ITS, err=errR_ITS, multithread=TRUE), filtRs_ITS, verbose=TRUE)
seqtab_final_ITS <- removeBimeraDenovo(rowsum(makeSequenceTable(mergers_ITS), group = sample.names_ITS), method="consensus", multithread=TRUE)

# Taxonomy (UNITE - Single Threaded for Windows Stability)
taxa_ITS <- assignTaxonomy(seqtab_final_ITS, file.path(path, "sh_general_release_dynamic_19.02.2025.fasta"), multithread=FALSE)

PHASE 3: PHYLOGENETIC TREES & PHYLOSEQ ASSEMBLY

cat("\n--- Building Phylogenetic Trees & Phyloseq Objects ---\n")

## 
## --- Building Phylogenetic Trees & Phyloseq Objects ---

metadata <- data.frame(
  SampleID = rownames(seqtab_final_16S),
  TimePoint = factor(ifelse(grepl("^0", rownames(seqtab_final_16S)), "Start", "Finish"), levels=c("Start", "Finish")),
  Treatment = factor(ifelse(grepl("CET", rownames(seqtab_final_16S)), "Controlled", "Spontaneous"))
)
rownames(metadata) <- metadata$SampleID

# 16S Assembly (Includes ML Tree)
generate_ml_tree <- function(seqtab_matrix) {
  seqs <- getSequences(seqtab_matrix); names(seqs) <- seqs
  phang_align <- phyDat(as(AlignSeqs(DNAStringSet(seqs), anchor=NA, verbose=FALSE), "matrix"), type="DNA")
  return(optim.pml(pml(NJ(dist.ml(phang_align)), data=phang_align), model="GTR", optInv=TRUE, optGamma=TRUE, rearrangement="stochastic", control=pml.control(trace=0))$tree)
}

ps_16S <- phyloseq(otu_table(seqtab_final_16S, taxa_are_rows=FALSE), sample_data(metadata), 
                   tax_table(taxa_16S), phy_tree(generate_ml_tree(seqtab_final_16S)))
dna_16S <- Biostrings::DNAStringSet(taxa_names(ps_16S)); names(dna_16S) <- taxa_names(ps_16S)
ps_16S <- merge_phyloseq(ps_16S, dna_16S)
taxa_names(ps_16S) <- paste0("ASV_16S_", seq(ntaxa(ps_16S)))

# Filter Host Contaminants
ps_16S <- subset_taxa(ps_16S, (Order != "Chloroplast") | is.na(Order))
ps_16S <- subset_taxa(ps_16S, (Family != "Mitochondria") | is.na(Family))

# ITS Assembly (No Tree)
ps_ITS <- phyloseq(otu_table(seqtab_final_ITS, taxa_are_rows=FALSE), sample_data(metadata), tax_table(taxa_ITS))
dna_ITS <- Biostrings::DNAStringSet(taxa_names(ps_ITS)); names(dna_ITS) <- taxa_names(ps_ITS)
ps_ITS <- merge_phyloseq(ps_ITS, dna_ITS)
taxa_names(ps_ITS) <- paste0("ASV_ITS_", seq(ntaxa(ps_ITS)))

# Overwrite the broken .rds files with the fixed, sequence-containing objects
saveRDS(ps_16S, file = file.path(path, "Saved_R_Data", "ps_16S_final.rds"))
saveRDS(ps_ITS, file = file.path(path, "Saved_R_Data", "ps_ITS_final.rds"))

cat("\nPhyloseq objects successfully rebuilt with embedded DNA sequences.\n")

## 
## Phyloseq objects successfully rebuilt with embedded DNA sequences.

5. Bacterial Community Analysis (16S)

5.1 16S Setup & Background Exports

# Clean and Transform
ps_clean_16S <- prune_samples(sample_sums(ps_16S) > 1000, ps_16S)
ps_rel_16S <- transform_sample_counts(ps_clean_16S, function(x) x / sum(x))

# Background File Exports
writeXStringSet(refseq(ps_clean_16S), file.path(path, "Bacteria_16S_rep.fasta"), format="fasta")
if (!is.null(phy_tree(ps_clean_16S, errorIfNULL = FALSE))) { 
  write.tree(phy_tree(ps_clean_16S), file.path(path, "Bacteria_16S_phylo.tre")) 
}

krona_16S <- psmelt(ps_clean_16S) %>% 
  filter(Abundance > 0) %>% 
  select(Abundance, Kingdom, Phylum, Class, Order, Family, Genus, Species) %>% 
  mutate(across(everything(), ~tidyr::replace_na(as.character(.), "Unclassified")))
write.table(krona_16S, file.path(path, "Bacteria_16S_Krona_Input.txt"), sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE)

5.2 16S Alpha Diversity

alpha_df_16S <- estimate_richness(ps_clean_16S, measures=c("Shannon", "Simpson", "Chao1", "Observed"))
write_xlsx(cbind(SampleID = rownames(alpha_df_16S), alpha_df_16S), file.path(path, "Bacteria_16S_Alpha_Diversity.xlsx"))

htmltools::tagList(datatable(alpha_df_16S, options = list(pageLength = 5, scrollX = TRUE), caption = "16S Alpha Diversity Indices"))

plot_richness(ps_clean_16S, x="TimePoint", color="Treatment", measures=c("Shannon", "Simpson")) +
  geom_boxplot(outlier.shape = NA, alpha=0.7) + 
  geom_jitter(width=0.1, alpha=0.6) + 
  theme_classic() + 
  labs(title="Bacteria 16S: Alpha Diversity Significance") +
  stat_compare_means(aes(group = Treatment), label = "p.signif", method = "wilcox.test")

5.3 16S Beta Diversity & PERMANOVA

dist_bray_16S <- phyloseq::distance(ps_clean_16S, method="bray")
write.csv(as.matrix(dist_bray_16S), file.path(path, "Bacteria_16S_BrayCurtis.csv"))

# PERMANOVA
sample_df_16S <- as(sample_data(ps_clean_16S), "data.frame")
permanova_16S <- adonis2(dist_bray_16S ~ Treatment * TimePoint, data = sample_df_16S, permutations = 999)
htmltools::tagList(datatable(as.data.frame(permanova_16S), options = list(dom = 't'), caption = "16S PERMANOVA Results"))

# PCoA Plot
plot_ordination(ps_rel_16S, ordinate(ps_rel_16S, method="PCoA", distance="bray"), color="TimePoint", shape="Treatment") +
  geom_point(size=4) + theme_minimal() + labs(title="Bacteria 16S: PCoA (Bray-Curtis)")

# UPGMA Tree
png(file.path(path, "Bacteria_16S_UPGMA.png"), width=800, height=600, res=100)
plot(hclust(dist_bray_16S, method="average"), main="Bacteria 16S: UPGMA Clustering", xlab="Samples", sub="")
dev.off()

## png 
##   2

plot(hclust(dist_bray_16S, method="average"), main="Bacteria 16S: UPGMA Clustering", xlab="Samples", sub="")

5.4 16S Taxonomic Composition

top40_16S <- names(sort(taxa_sums(ps_clean_16S), decreasing=TRUE))[1:40]

# Barplot (Top 20)
plot_bar(prune_taxa(top40_16S[1:20], ps_clean_16S), x="SampleID", fill="Genus") + 
  facet_wrap(~Treatment, scales="free_x") + theme_bw() + labs(title="Bacteria 16S: Top 20 Genera")

# Heatmap (Top 40)
plot_heatmap(prune_taxa(top40_16S, ps_clean_16S), method="NMDS", distance="bray", 
             sample.order="Treatment", taxa.label="Genus", taxa.order="Phylum",
             low="#000033", high="#FF3300", na.value="white") + labs(title="Bacteria 16S: Taxonomic Heatmap")

5.5 16S Core Microbiome & DESeq2

# Core Microbiome
ps_ctrl_16S <- subset_samples(ps_clean_16S, Treatment == "Controlled")
core_taxa_16S <- core_members(ps_ctrl_16S, detection = 0.001, prevalence = 0.8)

if(length(core_taxa_16S) > 0) {
  cat("\n**16S Core Taxa Detected:**\n")
  htmltools::tagList(datatable(as.data.frame(tax_table(ps_clean_16S)[core_taxa_16S, ]), options = list(pageLength = 5, scrollX = TRUE)))
}

## 
## **16S Core Taxa Detected:**

# DESeq2
ds2_16S <- DESeq(phyloseq_to_deseq2(ps_clean_16S, ~ TimePoint + Treatment), test="Wald", fitType="parametric", sfType="poscounts")
res_16S <- results(ds2_16S, contrast=c("Treatment", "Controlled", "Spontaneous"))
sigtab_16S <- res_16S[which(res_16S$padj < 0.05), ]

# DESeq2 Volcano Plot Visualization (With Taxonomy Annotations)
tax_df_16S <- as.data.frame(tax_table(ps_clean_16S))
volcano_data_16S <- cbind(as.data.frame(res_16S), tax_df_16S[rownames(res_16S), ])

volcano_data_16S <- volcano_data_16S[!is.na(volcano_data_16S$padj), ]

volcano_data_16S$Significance <- "Not Significant"
volcano_data_16S$Significance[volcano_data_16S$log2FoldChange > 1 & volcano_data_16S$padj < 0.05] <- "Significantly Increased (Controlled)"
volcano_data_16S$Significance[volcano_data_16S$log2FoldChange < -1 & volcano_data_16S$padj < 0.05] <- "Significantly Decreased (Spontaneous)"

volcano_data_16S$Label <- NA 
sig_rows_16S <- volcano_data_16S$Significance != "Not Significant"
volcano_data_16S$Label[sig_rows_16S] <- volcano_data_16S$Genus[sig_rows_16S] 

ggplot(volcano_data_16S, aes(x = log2FoldChange, y = -log10(padj), color = Significance)) +
  geom_point(alpha = 0.7, size = 2.5) +
  scale_color_manual(values = c("Not Significant" = "grey80", 
                                "Significantly Increased (Controlled)" = "#2CA02C", 
                                "Significantly Decreased (Spontaneous)" = "#D62728")) +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "black") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +
  geom_text_repel(aes(label = Label), size = 3.5, max.overlaps = 20, 
                  box.padding = 0.5, point.padding = 0.3, color = "black", show.legend = FALSE) +
  theme_bw() +
  labs(title = "DESeq2 Volcano Plot: Controlled vs. Spontaneous Fermentation",
       x = "Log2 Fold Change (Biological Magnitude)",
       y = "-Log10 Adjusted p-value (Statistical Confidence)") +
  theme(legend.position = "bottom")

if(nrow(sigtab_16S) > 0) {
  sig_df_16S <- cbind(as(sigtab_16S, "data.frame"), as(tax_table(ps_clean_16S)[rownames(sigtab_16S), ], "matrix"))
  write_xlsx(sig_df_16S, file.path(path, "Bacteria_16S_DESeq2.xlsx"))
  cat("\n**16S Significant DESeq2 Taxa:**\n")
  htmltools::tagList(datatable(sig_df_16S, options = list(pageLength = 5, scrollX = TRUE)))
}

## 
## **16S Significant DESeq2 Taxa:**

5.6 16S Indicator Species Analysis (IndVal)

otu_mat_16S <- as(otu_table(ps_clean_16S), "matrix")
if(taxa_are_rows(ps_clean_16S)){ otu_mat_16S <- t(otu_mat_16S) }

ind_val_16S <- multipatt(otu_mat_16S, sample_data(ps_clean_16S)$Treatment, func = "IndVal.g", control = how(nperm=999))
ind_sum_16S <- ind_val_16S$sign %>% filter(p.value <= 0.05)

if(nrow(ind_sum_16S) > 0) {
  ind_final_16S <- cbind(as.data.frame(tax_table(ps_clean_16S)[rownames(ind_sum_16S), ]), ind_sum_16S)
  write_xlsx(ind_final_16S, file.path(path, "Bacteria_16S_Indicator.xlsx"))
  htmltools::tagList(datatable(ind_final_16S, options = list(pageLength = 5, scrollX = TRUE), caption = "16S Indicator Species"))
} else {
  cat("\n*No significant indicator species found for 16S.*\n")
}

5.7 16S Beta Diversity Partitioning (Turnover vs. Nestedness)

otu_mat_part <- as(otu_table(ps_clean_16S), "matrix")
if(taxa_are_rows(ps_clean_16S)){ otu_mat_part <- t(otu_mat_part) }

beta_part <- beta.pair.abund(otu_mat_part)

mean_turnover <- mean(as.matrix(beta_part$beta.bray.bal)[lower.tri(as.matrix(beta_part$beta.bray.bal))])
mean_nestedness <- mean(as.matrix(beta_part$beta.bray.gra)[lower.tri(as.matrix(beta_part$beta.bray.gra))])
mean_total <- mean_turnover + mean_nestedness

part_df <- data.frame(
  Component = c("Turnover (Species Replacement)", "Nestedness (Species Loss/Gain)"),
  Percentage = c((mean_turnover/mean_total)*100, (mean_nestedness/mean_total)*100)
)

ggplot(part_df, aes(x = "", y = Percentage, fill = Component)) +
  geom_bar(stat = "identity", width = 0.5, color="black") +
  coord_flip() + theme_minimal() + 
  scale_fill_manual(values=c("#E69F00", "#56B4E9")) +
  labs(title = "Bacteria 16S: Drivers of Community Dissimilarity", x = "", y = "% Contribution to Beta Diversity")

5.8 16S Target Genera Ratio Dynamics (LAB vs. Spoilage)

melt_16S <- psmelt(ps_rel_16S)

lab_genera <- c("Lactobacillus", "Leuconostoc", "Lactiplantibacillus", "Weissella", "Pediococcus")
spoilage_genera <- c("Acetobacter", "Gluconobacter", "Enterobacter", "Klebsiella", "Pantoea")

ratio_df <- melt_16S %>%
  group_by(SampleID, Treatment, TimePoint) %>%
  dplyr::summarize(
    LAB_Abundance = sum(Abundance[Genus %in% lab_genera], na.rm = TRUE),
    Spoilage_Abundance = sum(Abundance[Genus %in% spoilage_genera], na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(LAB_to_Spoilage_Ratio = (LAB_Abundance + 0.0001) / (Spoilage_Abundance + 0.0001))

ggplot(ratio_df, aes(x = TimePoint, y = log10(LAB_to_Spoilage_Ratio), fill = Treatment)) +
  geom_boxplot(alpha = 0.8, color="black", outlier.shape = NA) +
  geom_jitter(position = position_dodge(0.75), size = 2, alpha = 0.6) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  theme_bw() +
  scale_fill_manual(values=c("#2CA02C", "#D62728")) +
  labs(title = "16S Shift: LAB vs Spoilage/AAB Dynamics", 
       y = "Log10 (LAB / Spoilage Ratio)", 
       subtitle = "Values above 0 (red line) indicate Lactic Acid Bacteria dominance")

6. Fungal Community Analysis (ITS)

6.1 ITS Setup & Background Exports

ps_clean_ITS <- prune_samples(sample_sums(ps_ITS) > 1000, ps_ITS)
ps_rel_ITS <- transform_sample_counts(ps_clean_ITS, function(x) x / sum(x))

writeXStringSet(refseq(ps_clean_ITS), file.path(path, "Fungi_ITS_rep.fasta"), format="fasta")

krona_ITS <- psmelt(ps_clean_ITS) %>% 
  filter(Abundance > 0) %>% 
  select(Abundance, Kingdom, Phylum, Class, Order, Family, Genus, Species) %>% 
  mutate(across(everything(), ~tidyr::replace_na(as.character(.), "Unclassified")))
write.table(krona_ITS, file.path(path, "Fungi_ITS_Krona_Input.txt"), sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE)

6.2 ITS Alpha Diversity

alpha_df_ITS <- estimate_richness(ps_clean_ITS, measures=c("Shannon", "Simpson", "Chao1", "Observed"))
write_xlsx(cbind(SampleID = rownames(alpha_df_ITS), alpha_df_ITS), file.path(path, "Fungi_ITS_Alpha_Diversity.xlsx"))

htmltools::tagList(datatable(alpha_df_ITS, options = list(pageLength = 5, scrollX = TRUE), caption = "ITS Alpha Diversity Indices"))

plot_richness(ps_clean_ITS, x="TimePoint", color="Treatment", measures=c("Shannon", "Simpson")) +
  geom_boxplot(outlier.shape = NA, alpha=0.7) + 
  geom_jitter(width=0.1, alpha=0.6) + 
  theme_classic() + 
  labs(title="Fungi ITS: Alpha Diversity Significance") +
  stat_compare_means(aes(group = Treatment), label = "p.signif", method = "wilcox.test")

6.3 ITS Beta Diversity & PERMANOVA

dist_bray_ITS <- phyloseq::distance(ps_clean_ITS, method="bray")
write.csv(as.matrix(dist_bray_ITS), file.path(path, "Fungi_ITS_BrayCurtis.csv"))

# PERMANOVA
sample_df_ITS <- as(sample_data(ps_clean_ITS), "data.frame")
permanova_ITS <- adonis2(dist_bray_ITS ~ Treatment * TimePoint, data = sample_df_ITS, permutations = 999)
htmltools::tagList(datatable(as.data.frame(permanova_ITS), options = list(dom = 't'), caption = "ITS PERMANOVA Results"))

# PCoA Plot
plot_ordination(ps_rel_ITS, ordinate(ps_rel_ITS, method="PCoA", distance="bray"), color="TimePoint", shape="Treatment") +
  geom_point(size=4) + theme_minimal() + labs(title="Fungi ITS: PCoA (Bray-Curtis)")

# UPGMA Tree
png(file.path(path, "Fungi_ITS_UPGMA.png"), width=800, height=600, res=100)
plot(hclust(dist_bray_ITS, method="average"), main="Fungi ITS: UPGMA Clustering", xlab="Samples", sub="")
dev.off()

## png 
##   2

plot(hclust(dist_bray_ITS, method="average"), main="Fungi ITS: UPGMA Clustering", xlab="Samples", sub="")

6.4 ITS Taxonomic Composition

top40_ITS <- names(sort(taxa_sums(ps_clean_ITS), decreasing=TRUE))[1:40]

# Barplot (Top 20)
plot_bar(prune_taxa(top40_ITS[1:20], ps_clean_ITS), x="SampleID", fill="Genus") + 
  facet_wrap(~Treatment, scales="free_x") + theme_bw() + labs(title="Fungi ITS: Top 20 Genera")

# Heatmap (Top 40)
plot_heatmap(prune_taxa(top40_ITS, ps_clean_ITS), method="NMDS", distance="bray", 
             sample.order="Treatment", taxa.label="Genus", taxa.order="Phylum",
             low="#000033", high="#FF3300", na.value="white") + labs(title="Fungi ITS: Taxonomic Heatmap")

6.5 ITS Core Microbiome & DESeq2

# Core Microbiome
ps_ctrl_ITS <- subset_samples(ps_clean_ITS, Treatment == "Controlled")
core_taxa_ITS <- core_members(ps_ctrl_ITS, detection = 0.001, prevalence = 0.8)

if(length(core_taxa_ITS) > 0) {
  cat("\n**ITS Core Taxa Detected:**\n")
  htmltools::tagList(datatable(as.data.frame(tax_table(ps_clean_ITS)[core_taxa_ITS, ]), options = list(pageLength = 5, scrollX = TRUE)))
}

## 
## **ITS Core Taxa Detected:**

# DESeq2
ds2_ITS <- DESeq(phyloseq_to_deseq2(ps_clean_ITS, ~ TimePoint + Treatment), test="Wald", fitType="parametric", sfType="poscounts")
res_ITS <- results(ds2_ITS, contrast=c("Treatment", "Controlled", "Spontaneous"))
sigtab_ITS <- res_ITS[which(res_ITS$padj < 0.05), ]

if(nrow(sigtab_ITS) > 0) {
  sig_df_ITS <- cbind(as(sigtab_ITS, "data.frame"), as(tax_table(ps_clean_ITS)[rownames(sigtab_ITS), ], "matrix"))
  write_xlsx(sig_df_ITS, file.path(path, "Fungi_ITS_DESeq2.xlsx"))
  cat("\n**ITS Significant DESeq2 Taxa:**\n")
  htmltools::tagList(datatable(sig_df_ITS, options = list(pageLength = 5, scrollX = TRUE)))
}

## 
## **ITS Significant DESeq2 Taxa:**

# Annotated Volcano Plot
tax_df_ITS <- as.data.frame(tax_table(ps_clean_ITS))
volcano_data_ITS <- cbind(as.data.frame(res_ITS), tax_df_ITS[rownames(res_ITS), ])

volcano_data_ITS <- volcano_data_ITS[!is.na(volcano_data_ITS$padj), ]

volcano_data_ITS$Significance <- "Not Significant"
volcano_data_ITS$Significance[volcano_data_ITS$log2FoldChange > 1 & volcano_data_ITS$padj < 0.05] <- "Significantly Increased (Controlled)"
volcano_data_ITS$Significance[volcano_data_ITS$log2FoldChange < -1 & volcano_data_ITS$padj < 0.05] <- "Significantly Decreased (Spontaneous)"

volcano_data_ITS$Label <- NA 
sig_rows_ITS <- volcano_data_ITS$Significance != "Not Significant"
volcano_data_ITS$Label[sig_rows_ITS] <- volcano_data_ITS$Genus[sig_rows_ITS] 
volcano_data_ITS$Label[sig_rows_ITS] <- paste0(rownames(volcano_data_ITS)[sig_rows_ITS], " (", volcano_data_ITS$Genus[sig_rows_ITS], ")")

ggplot(volcano_data_ITS, aes(x = log2FoldChange, y = -log10(padj), color = Significance)) +
  geom_point(alpha = 0.7, size = 2.5) +
  scale_color_manual(values = c("Not Significant" = "grey80", 
                                "Significantly Increased (Controlled)" = "#2CA02C", 
                                "Significantly Decreased (Spontaneous)" = "#D62728")) +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "black") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +
  geom_text_repel(aes(label = Label), size = 3.5, max.overlaps = 20, 
                  box.padding = 0.5, point.padding = 0.3, color = "black", show.legend = FALSE) +
  theme_bw() +
  labs(title = "ITS DESeq2 Volcano Plot: Fungal Shifts in Fermentation",
       x = "Log2 Fold Change (Biological Magnitude)",
       y = "-Log10 Adjusted p-value (Statistical Confidence)") +
  theme(legend.position = "bottom")

6.6 ITS Indicator Species Analysis (IndVal)

otu_mat_ITS <- as(otu_table(ps_clean_ITS), "matrix")
if(taxa_are_rows(ps_clean_ITS)){ otu_mat_ITS <- t(otu_mat_ITS) }

ind_val_ITS <- multipatt(otu_mat_ITS, sample_data(ps_clean_ITS)$Treatment, func = "IndVal.g", control = how(nperm=999))
ind_sum_ITS <- ind_val_ITS$sign %>% filter(p.value <= 0.05)

if(nrow(ind_sum_ITS) > 0) {
  ind_final_ITS <- cbind(as.data.frame(tax_table(ps_clean_ITS)[rownames(ind_sum_ITS), ]), ind_sum_ITS)
  write_xlsx(ind_final_ITS, file.path(path, "Fungi_ITS_Indicator.xlsx"))
  htmltools::tagList(datatable(ind_final_ITS, options = list(pageLength = 5, scrollX = TRUE), caption = "ITS Indicator Species"))
} else {
  cat("\n*No significant indicator species found for ITS.*\n")
}

6.7 ITS Beta Diversity Partitioning (Turnover vs. Nestedness)

otu_mat_part <- as(otu_table(ps_clean_ITS), "matrix")
if(taxa_are_rows(ps_clean_ITS)){ otu_mat_part <- t(otu_mat_part) }

beta_part <- beta.pair.abund(otu_mat_part)

mean_turnover <- mean(as.matrix(beta_part$beta.bray.bal)[lower.tri(as.matrix(beta_part$beta.bray.bal))])
mean_nestedness <- mean(as.matrix(beta_part$beta.bray.gra)[lower.tri(as.matrix(beta_part$beta.bray.gra))])
mean_total <- mean_turnover + mean_nestedness

part_df <- data.frame(
  Component = c("Turnover (Replacement)", "Nestedness (Loss/Gain)"),
  Percentage = c((mean_turnover/mean_total)*100, (mean_nestedness/mean_total)*100)
)

ggplot(part_df, aes(x = "", y = Percentage, fill = Component)) +
  geom_bar(stat = "identity", width = 0.5, color="black") +
  coord_flip() + theme_minimal() + 
  scale_fill_manual(values=c("#E69F00", "#56B4E9")) +
  labs(title = "Fungi ITS: Drivers of Community Dissimilarity", x = "", y = "% Contribution to Beta Diversity")

6.8 ITS Co-occurrence Network Analysis

top30_taxa_ITS <- names(sort(taxa_sums(ps_clean_ITS), decreasing=TRUE))[1:30]
ps_net_ITS <- prune_taxa(top30_taxa_ITS, ps_clean_ITS)

net_otu <- as(otu_table(ps_net_ITS), "matrix")
if(taxa_are_rows(ps_net_ITS)){ net_otu <- t(net_otu) }
colnames(net_otu) <- as.data.frame(tax_table(ps_net_ITS))$Genus

cor_res <- rcorr(as.matrix(net_otu), type="spearman")
R_mat <- cor_res$r
P_mat <- cor_res$P

edges <- melt(R_mat, varnames = c("Taxon_A", "Taxon_B"), value.name = "Correlation")
p_edges <- melt(P_mat, varnames = c("Taxon_A", "Taxon_B"), value.name = "p_value")
edges$p_value <- p_edges$p_value

edges_sig <- edges %>%
  filter(Taxon_A != Taxon_B) %>%
  filter(p_value < 0.05) %>%
  filter(abs(Correlation) > 0.6)

if(nrow(edges_sig) > 0) {
  net_graph <- graph_from_data_frame(edges_sig, directed = FALSE)
  net_graph <- simplify(net_graph, edge.attr.comb = "first") 
  
  E(net_graph)$color <- ifelse(E(net_graph)$Correlation > 0, "#0072B2", "#D55E00")
  E(net_graph)$weight <- abs(E(net_graph)$Correlation) * 8 
  
  set.seed(42) 
  plot(net_graph, 
       layout = layout_with_fr(net_graph), 
       vertex.size = 12, 
       vertex.label.cex = 0.8, 
       vertex.label.color = "black",
       vertex.color = "lightblue",
       edge.width = E(net_graph)$weight,
       main = "Fungi ITS: Co-occurrence Network (Top 30 Genera)",
       sub = "Blue = Positive Correlation | Red = Negative Correlation")
} else {
  cat("\n*No significant strong correlations found among the top taxa.*\n")
}

MASTER METABARCODING ANALYSIS PIPELINE: 16S & ITS

Jeshurun Asher Tarun, Ph.D.

2026-06-21

Core Configuration & Libraries

PHASE 1: BACTERIAL 16S PIPELINE

PHASE 2: FUNGAL / YEAST ITS PIPELINE

PHASE 3: PHYLOGENETIC TREES & PHYLOSEQ ASSEMBLY

5. Bacterial Community Analysis (16S)

5.1 16S Setup & Background Exports

5.2 16S Alpha Diversity

5.3 16S Beta Diversity & PERMANOVA

5.4 16S Taxonomic Composition

5.5 16S Core Microbiome & DESeq2

5.6 16S Indicator Species Analysis (IndVal)

5.7 16S Beta Diversity Partitioning (Turnover vs. Nestedness)

5.8 16S Target Genera Ratio Dynamics (LAB vs. Spoilage)

6. Fungal Community Analysis (ITS)

6.1 ITS Setup & Background Exports

6.2 ITS Alpha Diversity

6.3 ITS Beta Diversity & PERMANOVA

6.4 ITS Taxonomic Composition

6.5 ITS Core Microbiome & DESeq2

6.6 ITS Indicator Species Analysis (IndVal)

6.7 ITS Beta Diversity Partitioning (Turnover vs. Nestedness)

6.8 ITS Co-occurrence Network Analysis