This section initializes the required bioinformatic libraries and establishes the core file paths and validated primer sequences for the Coffea excelsa fermentation dataset.
library(dada2)
library(phyloseq)
library(DECIPHER)
library(phangorn)
library(biomformat)
library(ggplot2)
library(vegan)
library(ape)
library(writexl)
library(DESeq2)
library(dplyr)
# Core Paths
path <- "C:/Users/Owner/Documents/BSP_2022/BSP_2025/Metabarcoding_training"
cutadapt_path <- "C:/Users/Owner/Downloads/cutadapt"
# Validated Project Primers
fwd_16S <- "CCTACGGGNGGCWGCAG" # V3 Forward
rev_16S <- "GACTACHVGGGTATCTAATCC" # V4 Reverse
fwd_ITS <- "GCATCGATGAAGAACGCAGC" # ITS3 Forward
rev_ITS <- "TCCTCCGCTTATTGATATGC" # ITS4 ReverseProcessing the bacterial community using the DADA2 algorithm. Reads are trimmed, filtered (truncating at 250F/200R), merged, and assigned taxonomy using the SILVA v138.2 database.
fnFs_16S <- sort(list.files(path, pattern = "__1_1\\.fastq\\.gz$", full.names = TRUE))
fnRs_16S <- sort(list.files(path, pattern = "__1_2\\.fastq\\.gz$", full.names = TRUE))
sample.names_16S <- sapply(strsplit(basename(fnFs_16S), "__1"), `[`, 1)
path_cut_16S <- file.path(path, "cutadapt_16S")
dir.create(path_cut_16S, showWarnings = FALSE)
fnFs_cut_16S <- file.path(path_cut_16S, paste0(sample.names_16S, "_F_cut.fastq.gz"))
fnRs_cut_16S <- file.path(path_cut_16S, paste0(sample.names_16S, "_R_cut.fastq.gz"))
# 1. Cutadapt
for(i in seq_along(fnFs_16S)) {
system2(cutadapt_path, args = c("-g", fwd_16S, "-G", rev_16S,
"-o", shQuote(fnFs_cut_16S[i]), "-p", shQuote(fnRs_cut_16S[i]),
shQuote(fnFs_16S[i]), shQuote(fnRs_16S[i]),
"--discard-untrimmed", "--minimum-length=100"))
}
# 2. DADA2 Core
filtFs_16S <- file.path(path, "filtered_16S", paste0(sample.names_16S, "_F_filt.fastq.gz"))
filtRs_16S <- file.path(path, "filtered_16S", paste0(sample.names_16S, "_R_filt.fastq.gz"))
names(filtFs_16S) <- sample.names_16S; names(filtRs_16S) <- sample.names_16S
out_16S <- filterAndTrim(fnFs_cut_16S, filtFs_16S, fnRs_cut_16S, filtRs_16S,
truncLen=c(250, 200), maxN=0, maxEE=c(2,2), truncQ=2,
rm.phix=TRUE, compress=TRUE, multithread=TRUE)
errF_16S <- learnErrors(filtFs_16S, multithread=TRUE)
errR_16S <- learnErrors(filtRs_16S, multithread=TRUE)
mergers_16S <- mergePairs(dada(filtFs_16S, err=errF_16S, multithread=TRUE), filtFs_16S,
dada(filtRs_16S, err=errR_16S, multithread=TRUE), filtRs_16S, verbose=TRUE)
seqtab_nochim_16S <- removeBimeraDenovo(rowsum(makeSequenceTable(mergers_16S), group = sample.names_16S), method="consensus", multithread=TRUE)
# 3. SILVA Taxonomy
seqtab_final_16S <- seqtab_nochim_16S[, nchar(getSequences(seqtab_nochim_16S)) >= 350]
taxa_16S <- assignTaxonomy(seqtab_final_16S, file.path(path, "silva_nr99_v138.2_toGenus_trainset.fa.gz"), multithread=TRUE)
taxa_16S <- addSpecies(taxa_16S, file.path(path, "silva_v138.2_assignSpecies.fa.gz"))Processing the yeast/fungal community. Due to biological length variation in the ITS region, fixed truncation is disabled. Taxonomy is assigned via the UNITE dynamic database (single-threaded for stability).
fnFs_ITS <- sort(list.files(path, pattern = "__2_1\\.fastq\\.gz$", full.names = TRUE))
fnRs_ITS <- sort(list.files(path, pattern = "__2_2\\.fastq\\.gz$", full.names = TRUE))
sample.names_ITS <- sapply(strsplit(basename(fnFs_ITS), "__2"), `[`, 1)
path_cut_ITS <- file.path(path, "cutadapt_ITS")
dir.create(path_cut_ITS, showWarnings = FALSE)
fnFs_cut_ITS <- file.path(path_cut_ITS, paste0(sample.names_ITS, "_F_cut.fastq.gz"))
fnRs_cut_ITS <- file.path(path_cut_ITS, paste0(sample.names_ITS, "_R_cut.fastq.gz"))
# 1. Cutadapt
for(i in seq_along(fnFs_ITS)) {
system2(cutadapt_path, args = c("-g", fwd_ITS, "-G", rev_ITS,
"-o", shQuote(fnFs_cut_ITS[i]), "-p", shQuote(fnRs_cut_ITS[i]),
shQuote(fnFs_ITS[i]), shQuote(fnRs_ITS[i]),
"--discard-untrimmed", "--minimum-length=50"))
}
# 2. DADA2 Core
filtFs_ITS <- file.path(path, "filtered_ITS", paste0(sample.names_ITS, "_F_filt.fastq.gz"))
filtRs_ITS <- file.path(path, "filtered_ITS", paste0(sample.names_ITS, "_R_filt.fastq.gz"))
names(filtFs_ITS) <- sample.names_ITS; names(filtRs_ITS) <- sample.names_ITS
out_ITS <- filterAndTrim(fnFs_cut_ITS, filtFs_ITS, fnRs_cut_ITS, filtRs_ITS,
maxN=0, maxEE=c(2,2), truncQ=2, minLen=50,
rm.phix=TRUE, compress=TRUE, multithread=TRUE)
errF_ITS <- learnErrors(filtFs_ITS, multithread=TRUE)
errR_ITS <- learnErrors(filtRs_ITS, multithread=TRUE)
mergers_ITS <- mergePairs(dada(filtFs_ITS, err=errF_ITS, multithread=TRUE), filtFs_ITS,
dada(filtRs_ITS, err=errR_ITS, multithread=TRUE), filtRs_ITS, verbose=TRUE)
seqtab_final_ITS <- removeBimeraDenovo(rowsum(makeSequenceTable(mergers_ITS), group = sample.names_ITS), method="consensus", multithread=TRUE)
# 3. UNITE Taxonomy
taxa_ITS <- assignTaxonomy(seqtab_final_ITS, file.path(path, "sh_general_release_dynamic_19.02.2025.fasta"), multithread=FALSE)Integrating OTU tables, taxonomy arrays, and phylogenetic trees (16S
only) into master phyloseq objects. Host plant DNA
(Chloroplast/Mitochondria) is filtered out of the bacterial dataset.
library(Biostrings)
metadata <- data.frame(
SampleID = rownames(seqtab_final_16S),
TimePoint = factor(ifelse(grepl("^0", rownames(seqtab_final_16S)), "Start", "Finish"), levels=c("Start", "Finish")),
Treatment = factor(ifelse(grepl("CET", rownames(seqtab_final_16S)), "Controlled", "Spontaneous"))
)
rownames(metadata) <- metadata$SampleID
generate_ml_tree <- function(seqtab_matrix) {
seqs <- getSequences(seqtab_matrix); names(seqs) <- seqs
phang_align <- phyDat(as(AlignSeqs(DNAStringSet(seqs), anchor=NA, verbose=FALSE), "matrix"), type="DNA")
return(optim.pml(pml(NJ(dist.ml(phang_align)), data=phang_align), model="GTR", optInv=TRUE, optGamma=TRUE, rearrangement="stochastic", control=pml.control(trace=0))$tree)
}
# 16S Assembly
ps_16S <- phyloseq(otu_table(seqtab_final_16S, taxa_are_rows=FALSE), sample_data(metadata),
tax_table(taxa_16S), phy_tree(generate_ml_tree(seqtab_final_16S)))
dna_16S <- Biostrings::DNAStringSet(taxa_names(ps_16S)); names(dna_16S) <- taxa_names(ps_16S)
ps_16S <- merge_phyloseq(ps_16S, dna_16S)
taxa_names(ps_16S) <- paste0("ASV_16S_", seq(ntaxa(ps_16S)))
ps_16S <- subset_taxa(ps_16S, (Order != "Chloroplast") | is.na(Order))
ps_16S <- subset_taxa(ps_16S, (Family != "Mitochondria") | is.na(Family))
# ITS Assembly
ps_ITS <- phyloseq(otu_table(seqtab_final_ITS, taxa_are_rows=FALSE), sample_data(metadata), tax_table(taxa_ITS))
dna_ITS <- Biostrings::DNAStringSet(taxa_names(ps_ITS)); names(dna_ITS) <- taxa_names(ps_ITS)
ps_ITS <- merge_phyloseq(ps_ITS, dna_ITS)
taxa_names(ps_ITS) <- paste0("ASV_ITS_", seq(ntaxa(ps_ITS)))
# Data Save Checkpoint
dir.create(file.path(path, "Saved_R_Data"), showWarnings = FALSE)
saveRDS(ps_16S, file = file.path(path, "Saved_R_Data", "ps_16S_final.rds"))
saveRDS(ps_ITS, file = file.path(path, "Saved_R_Data", "ps_ITS_final.rds"))This section exports all finalized tables to the hard drive while directly rendering the publication-ready figures below.
export_master_pipeline <- function(ps_obj, prefix) {
ps_clean <- prune_samples(sample_sums(ps_obj) > 1000, ps_obj)
if(nsamples(ps_clean) < 2) { return(NULL) }
# Structural Exports
writeXStringSet(refseq(ps_clean), file.path(path, paste0(prefix, "_rep.fasta")), format="fasta")
if (!is.null(phy_tree(ps_clean, errorIfNULL = FALSE))) {
write.tree(phy_tree(ps_clean), file.path(path, paste0(prefix, "_phylo.tre")))
}
ps_rel <- transform_sample_counts(ps_clean, function(x) x / sum(x))
write_xlsx(list(
Count = cbind(as.data.frame(tax_table(ps_clean)), as.data.frame(t(otu_table(ps_clean)))),
Ratio = cbind(as.data.frame(tax_table(ps_clean)), as.data.frame(t(otu_table(ps_rel))))
), file.path(path, paste0(prefix, "_TAXONOMY_Assignment.xlsx")))
cat("\n### Results for:", prefix, "\n")
# --- Alpha Diversity ---
alpha_df <- estimate_richness(ps_clean, measures=c("Shannon", "Simpson", "Chao1", "Observed"))
write_xlsx(cbind(SampleID = rownames(alpha_df), alpha_df), file.path(path, paste0(prefix, "_Alpha_Diversity_Table.xlsx")))
p_alpha <- plot_richness(ps_clean, x="TimePoint", color="Treatment", measures=c("Shannon", "Simpson")) +
geom_boxplot(outlier.shape = NA) + geom_jitter(width=0.1) + theme_classic()
ggsave(file.path(path, paste0(prefix, "_Alpha_Diversity_Boxplot.png")), p_alpha, width=8, height=5)
print(p_alpha)
# --- Beta Diversity ---
dist_bray <- phyloseq::distance(ps_clean, method="bray")
write.csv(as.matrix(dist_bray), file.path(path, paste0(prefix, "_BrayCurtis_DistanceMatrix.csv")))
png(file.path(path, paste0(prefix, "_UPGMA_Tree.png")), width=800, height=600, res=100)
plot(hclust(dist_bray, method="average"), main=paste(prefix, "UPGMA Clustering"), xlab="Samples", sub="")
dev.off()
p_beta <- plot_ordination(ps_rel, ordinate(ps_rel, method="PCoA", distance="bray"), color="TimePoint", shape="Treatment") +
geom_point(size=4) + theme_minimal() + labs(title=paste("PCoA (Bray-Curtis):", prefix))
ggsave(file.path(path, paste0(prefix, "_Beta_Diversity_PCoA.png")), p_beta, width=7, height=5)
print(p_beta)
# --- Taxonomic Composition ---
top40 <- names(sort(taxa_sums(ps_clean), decreasing=TRUE))[1:40]
p_bar <- plot_bar(prune_taxa(top40[1:20], ps_clean), x="SampleID", fill="Genus") +
facet_wrap(~Treatment, scales="free_x") + theme_bw() + labs(title=paste(prefix, "Top 20 Genus Composition"))
ggsave(file.path(path, paste0(prefix, "_Composition_Barplot.png")), p_bar, width=10, height=6)
print(p_bar)
p_heat <- plot_heatmap(prune_taxa(top40, ps_clean), method="NMDS", distance="bray",
sample.order="Treatment", taxa.label="Genus", taxa.order="Phylum",
low="#000033", high="#FF3300", na.value="white") + labs(title=paste(prefix, "Top 40 Heatmap"))
ggsave(file.path(path, paste0(prefix, "_Taxonomic_Heatmap.png")), p_heat, width=10, height=8)
print(p_heat)
# --- Krona Output ---
krona_export <- psmelt(ps_clean) %>% filter(Abundance > 0) %>%
select(Abundance, Kingdom, Phylum, Class, Order, Family, Genus, Species) %>%
mutate(across(everything(), ~tidyr::replace_na(as.character(.), "Unclassified")))
write.table(krona_export, file.path(path, paste0(prefix, "_Krona_Input.txt")), sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE)
# --- DESeq2 Stats ---
ds2 <- DESeq(phyloseq_to_deseq2(ps_clean, ~ TimePoint + Treatment), test="Wald", fitType="parametric", sfType="poscounts")
res <- results(ds2, contrast=c("Treatment", "Controlled", "Spontaneous"))
sigtab <- res[which(res$padj < 0.05), ]
if(nrow(sigtab) > 0) {
write_xlsx(cbind(as(sigtab, "data.frame"), as(tax_table(ps_clean)[rownames(sigtab), ], "matrix")),
file.path(path, paste0(prefix, "_DESeq2_Significant.xlsx")))
}
}
# Execute for Both Communities
export_master_pipeline(ps_16S, "Bacteria_16S")##
## ### Results for: Bacteria_16S
##
## ### Results for: Fungi_ITS