This report provides the R code used to process Illumina MiSeq data from the wood mycobiome of Populus trichocarpa. The original code is saved in the following scripts:
Additionally, I used Pheniqs to demultiplex the samples, and a shell script running Cutadapt and SeqPurge to remove primer tails from all sequences. These programs were run on a combined MiSeq library of the Populus wood samples and Douglas-fir seed samples. The output of this processing pipeline is a single phyloseq object, which includes the identified OTUs and their taxonomy, their sequences, their read abundances across samples, and metadata for the samples.
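For reference, the primer-trimming call took roughly the following form. This is a minimal sketch rather than the original shell script: the primer sequences and file names are placeholders, and the SeqPurge step is omitted.
#sketch of the Cutadapt primer-trimming step, called from R (placeholder values)
fwd.primer <- "FWD_PRIMER_SEQ" #placeholder, not the actual primer used
rev.primer <- "REV_PRIMER_SEQ" #placeholder, not the actual primer used
system2("cutadapt",
        args = c("-g", fwd.primer, "-G", rev.primer,
                 "--discard-untrimmed",
                 "-o", "trimmed.R1.fq.gz", "-p", "trimmed.R2.fq.gz",
                 "raw.R1.fq.gz", "raw.R2.fq.gz"))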
library(tidyverse)
library(magrittr)
library(dada2)
library(ShortRead)
library(phyloseq)
library(Biostrings)
library(ggplot2)
library(stringr)
library(readxl)
library(ggthemes)
# these last two packages are to knit certain visual aids in this markdown
library(knitr)
library(kableExtra)
After demultiplexing the sequences into samples and trimming off primer sequences, the first step is to denoise the sequences. The input for this step is the trimmed raw sequence files, which include both the wood samples and seed samples from another project. We first read in all the sequence files, remove the seed samples, and then plot sequence quality for a randomly selected subset of samples. The outputs here are two quality plots: the first for forward reads, the second for reverse reads.
#path for output
out.path <- "output/dada"
#Identify paths to trimmed fwd and rev reads
in.path <- file.path("../../MiSeq_March2020/output/trim/wood")
path.fwd <- in.path %>% list.files(pattern="R1.fq.gz",full.names = T) %>% sort #forward reads
path.rev <- in.path %>% list.files(pattern="R2.fq.gz",full.names = T) %>% sort #reverse reads
#Filtering out seed samples
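#sample IDs are the 12-character substrings at positions 37-48 of the file paths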
f_samp <- path.fwd %>% str_sub(., 37, 48)
r_samp <- path.rev %>% str_sub(., 37, 48)
Wood_Samps <- read_excel("../data/GBergmann_IlluminaMiSeq_Samples.xlsx", sheet = "Wood")
real <- Wood_Samps$Samps
path.fwd %<>% .[f_samp %in% real]
path.rev %<>% .[r_samp %in% real]
#make file names / paths for trimmed and filtered files
filt.path <- file.path(out.path, "filt")
filt.fwd <- path.fwd %>% gsub(in.path,filt.path,.)
filt.rev <- path.rev %>% gsub(in.path,filt.path,.)
#preview read quality of 6 randomly selected samples before trimming
qual.samps <- sample(1:length(path.fwd),6)
plotQualityProfile(path.fwd[qual.samps])
plotQualityProfile(path.rev[qual.samps])
After plotting sequence quality, we can filter out sequences based on maximum expected errors (maxEE), trim the remaining sequences, and check the quality again. As before, the first quality plot is for forward reads and the second is for reverse reads.
#Filter and trim
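#maxN=0 drops reads containing Ns; maxEE=c(2,2) drops read pairs with >2 expected errors in either read; truncQ=2 truncates reads at the first base with quality <= 2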
trim <- filterAndTrim(path.fwd, filt.fwd,
                      path.rev, filt.rev,
                      maxN=0, maxEE=c(2,2), truncQ=2,
                      multithread=TRUE)
#Update list of trimmed file paths to exclude samples with no reads passing filters
filt.fwd <- list.files(filt.path, pattern = "R1.fq.gz",full.names = T)
filt.rev <- list.files(filt.path, pattern = "R2.fq.gz",full.names = T)
#Check quality of trimmed and filtered reads
qual.samps <- sample(1:length(filt.fwd),12)
plotQualityProfile(filt.fwd[qual.samps])
plotQualityProfile(filt.rev[qual.samps])
We can see from these updated plots that filtering has improved overall quality. We can now dereplicate (combine) identical sequences, identify the true sequences in the community (denoise), make a table of those sequences, and remove chimeras. After these steps, we can create a datasheet summarizing the denoising process (e.g., how many sequences remain after each step). The denoising step takes a long time, since hundreds of millions of bases are randomly sampled from the reads to learn the error rates.
#dereplicate identical sequences
derep.fwd <- derepFastq(filt.fwd, verbose=F)
derep.rev <- derepFastq(filt.rev, verbose=F)
#Trim names of derep objects to just sample names
names(derep.fwd) %<>% gsub(".R1.fq.gz","",.)
names(derep.rev) %<>% gsub(".R2.fq.gz","",.)
#Learn errors
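#nbases = 3e8 raises the number of bases sampled for error learning above the dada2 default (1e8)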
err.fwd <- learnErrors(filt.fwd, multithread=TRUE, nbases = 3e08)
plotErrors(err.fwd, nominalQ=TRUE)
err.rev <- learnErrors(filt.rev, multithread=TRUE, nbases = 3e08)
plotErrors(err.rev, nominalQ=TRUE)
#Denoise
dada.fwd <- dada(derep.fwd, err=err.fwd, multithread=TRUE)
dada.rev <- dada(derep.rev, err=err.rev, multithread=TRUE)
#Merge reads
merged <- mergePairs(dada.fwd, derep.fwd, dada.rev, derep.rev, trimOverhang = T)
#Make sequence table
seqtab <- merged %>% makeSequenceTable
#Remove chimeras
seqtab.nonChimeras <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE)
#Make summary report of what happened
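#getN() sums read counts over the unique sequences in a dada-class or merged-pairs object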
getN <- function(x) sum(getUniques(x))
trim.summary <- trim %>% data.frame %>% rownames_to_column("Sample")
trim.summary$Sample %<>% str_sub(., 1, 12)
track <- cbind(sapply(dada.fwd, getN),
               sapply(dada.rev, getN),
               sapply(merged, getN),
               rowSums(seqtab.nonChimeras)) %>%
  data.frame %>% rownames_to_column("Sample")
track %<>% left_join(trim.summary,.)
colnames(track) <- c("sample", "input", "filtered", "denoised.fwd", "denoised.rev", "merged", "ChimeraFiltered")
print(head(track))
file.path(out.path,"dadaSummary.csv") %>% write.csv(track,.,row.names = F)
## sample input filtered denoised.fwd denoised.rev merged ChimeraFiltered
## 1 myco.03.01.a 1557 976 958 958 958 958
## 2 myco.03.01.b 12248 10215 10173 10177 10054 10054
## 3 myco.03.01.c 11357 8669 8538 8593 8063 8063
## 4 myco.03.01.d 5355 4154 4131 4147 3921 3921
## 5 myco.03.01.e 12862 11314 11243 11251 10844 10844
## 6 myco.03.01.f 18892 16359 16315 16313 15048 15048
This table shows how many reads are left after each step of the denoising stage. Based on this table, it seems that there is decent sequencing depth for most of the samples. We can now save the sequence table (seqTab) for further processing.
# Save output
file.path(out.path,"seqTab.rds") %>% saveRDS(seqtab.nonChimeras,.)
After denoising and filtering the sequences, they must be compiled with sample metadata into a phyloseq object. This object type allows for a variety of data visualizations and analyses. To set this up, we first read in the input sequence table and metadata, pull the sequences out of the data file, rename them and write them into a fasta file. We also want to download the most recent taxonomy database from UNITE if it’s not already saved in the project (it was saved here, so the necessary script is commented out at the bottom).
# read in sample metadata (row.names refers to column one, where the sample IDs are)
meta <- read.csv("../data/wood_meta2.csv",as.is=T,row.names=1)
meta$Disease[meta$Disease=="healthy"] <- "Healthy" # I had forgotten to capitalize healthy for a few of the samples, so this line is here to correct that
# read in denoised sequence table
seqTab <- readRDS("../output/dada/seqTab.rds")
#extract sequences from OTU table
seqs <- getSequences(seqTab) %>% DNAStringSet
#rename sequences
OTU.names <- paste0("OTU.",1:ncol(seqTab))
colnames(seqTab) <- OTU.names
names(seqs) <- OTU.names
#Create data frame for summary of processing
compile.summary <- data.frame(SampID=rownames(seqTab),
                              denoised=rowSums(seqTab))
# create scratch directory
dir.create("../output/scratch")
## Warning in dir.create("../output/scratch"): '../output/scratch' already exists
# write temp file
writeXStringSet(seqs,file="../output/scratch/tmp.fasta",width=1000)
#download a version of the UNITE database if you dont have it
## This link downloads the most recent Fungi 2 database (https://dx.doi.org/10.15156/BIO/786369)
### There are newer versions than this. You can find them on the UNITE website under general fasta release. You can either download them to your computer and then upload via RStudio Server, or get the URL to the file from the website (right-click the download link and copy) and use the download.file() function to do it directly.
#if(!file.exists("data/taxonomy_db")){
# download.file("https://files.plutof.ut.ee/public/orig/1D/B9/1DB95C8AC0A80108BECAF1162D761A8D379AF43E2A4295A3EF353DD1632B645B.gz",
# "data/taxonomy_db.zip",
# method="wget")
# unzip("data/taxonomy_db.zip",exdir="data/taxonomy_db")
# file.remove("data/taxonomy_db.zip")}
After preparing the data, we remove host (Populus trichocarpa) sequences and extract the ITS2 region from the remaining sequences. The ITS2 region is needed to assign taxonomy, and extracting it also lets us filter out sequences that don't match the expected region (such as very short reads).
# remove sequences matching the P.trich. genome with Bowtie2
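## this assumes the Bowtie2 index ../data/Ptri.v.3.db was built beforehand, e.g. with "bowtie2-build <P.trichocarpa genome fasta> ../data/Ptri.v.3.db"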
system("/home/busbylab/miniconda3/bin/bowtie2 --threads 10 -x ../data/Ptri.v.3.db -f ../output/scratch/tmp.fasta --un ../output/scratch/noHost.fa --al ../output/scratch/host.fa --fast")
seqs.noHost <- readDNAStringSet("../output/scratch/noHost.fa")
compile.summary$noHost <- seqTab %>% .[,names(seqs.noHost)] %>% rowSums()
#Extract ITS region with ITSx
## ITSx parameters
ITSx.flags <- paste("-i ../output/scratch/noHost.fa",
                    "-t 'fungi,tracheophyta'",
                    "--preserve T",
                    "--complement T",
                    "--summary T",
                    "-o ../output/scratch/ITSx",
                    "--only_full T",
                    "-E 1e-2")
system2("ITSx", args = ITSx.flags)
#read in ITS2 sequences from ITSx output and filter out very short reads
seqs.ITS2 <- readDNAStringSet("../output/scratch/ITSx.ITS2.fasta") %>% .[width(.) > 50]
#remove OTUs from seqTab that are not in ITSx output
seqTab %<>% .[,names(seqs.ITS2)]
#collapse any identical sequences after isolating ITS2
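#collapseNoMismatch() sums the counts of sequence variants that became identical (up to shifts/length) once trimmed to ITS2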
colnames(seqTab) <- seqs.ITS2 %>% as.character %>% unname
seqTab %<>% collapseNoMismatch
Now that the non-host ITS2 sequences have been extracted, we can assign taxonomy and combine all of the data into the phyloseq object. One additional step added here was to filter out sequences identified as non-fungal eukaryotes. Taxonomy was then re-assigned to the remaining fungal sequences with the fungal-only taxonomy database from UNITE, to make the taxonomy as accurate as possible. The step saving the phyloseq object to the R project is not shown here.
#assign taxonomy with DADA2 implementation of RDP classifier (needs 12+ gb ram) - using all eukaryotes here
tax <- assignTaxonomy(seqTab,"../data/taxonomy_db/sh_general_release_dynamic_02.02.2019.fasta",multithread=T)
#set final OTU names
final.names <- paste0("OTU.",1:ncol(seqTab))
final.seqs <- getSequences(seqTab) %>% DNAStringSet()
names(final.seqs) <- final.names
colnames(seqTab) <- final.names
rownames(tax) <- final.names
#make phyloseq object
phy <- phyloseq(otu_table(seqTab,taxa_are_rows = F),
                sample_data(meta),
                tax_table(tax),
                refseq(final.seqs))
# remove non-fungal taxa from phyloseq object
min.reads <- 0
keep.samples <- sample_sums(phy) > min.reads
phy %<>% subset_taxa(., Kingdom=="k__Fungi") %>% prune_samples(keep.samples,.) %>% filter_taxa(function(x) {sum(x) > 0}, TRUE)
# re-assign taxonomy with fungal-only database
phy.seq <- refseq(phy)
phy.tax <- assignTaxonomy(phy.seq,"../data/sh_general_release_dynamic_s_04.02.2020.fasta",multithread=T)
rownames(phy.tax) <- taxa_names(phy)
tax_table(phy) <- tax_table(phy.tax)
One last thing we can look at in the compiling step is the summary of what happened. This shows how many sequence reads remain for each sample after removal of host DNA.
print(head(compile.summary))
## X SampID denoised noHost
## 1 myco.03.01.a myco.03.01.a 958 958
## 2 myco.03.01.b myco.03.01.b 10054 8422
## 3 myco.03.01.c myco.03.01.c 8063 120
## 4 myco.03.01.d myco.03.01.d 3921 1032
## 5 myco.03.01.e myco.03.01.e 10844 8766
## 6 myco.03.01.f myco.03.01.f 15048 7991
From this table, we can see that some of the samples were dominated by host sequences. This ultimately limited the quality of the final data set.
Now that we have our phyloseq object, we can do some additional filtering steps. This includes removing PCR and extraction controls, removing Amplicon Sequence Variants (ASVs) that are contaminants, removing samples with low sequencing depth, and removing singleton taxa. I also clustered sequences into Operational Taxonomic Units (OTUs) at 97% similarity, but this is optional.
We must first remove contaminants: taxa for which at least a threshold percentage of their reads occurs in the negative controls. The threshold percentage is based partly on whether the taxa make sense as endophytes versus some other fungal group (e.g., animal parasites). To determine an appropriate threshold, we calculated the percentage of reads found in controls for all taxa and printed these calculations in the table below (subset to show only OTUs with non-zero read counts in the controls).
# subsetting the phyloseq into controls and samples
controls.phy <- subset_samples(phy, Control=="Yes")
samps.phy <- subset_samples(phy, Control=="No")
# creating a table
controls.phy.tab <- tax_table(controls.phy) %>% data.frame %>%
  transmute(
    OTU=taxa_names(controls.phy),
    totSeqs=taxa_sums(phy %>% prune_taxa(taxa_names(controls.phy),.)),
    totNegSeqs = taxa_sums(controls.phy),
    pctSeqsNeg = round((100*totNegSeqs/totSeqs),4),
    Genus=Genus
  )
controls.head.tab <- subset(controls.phy.tab, pctSeqsNeg>0)
controls.head.tab %>% kable(caption = "OTUs in negative controls, putative contaminants highlighted (OTUs not in negative controls omitted)") %>%
  kable_styling(full_width = F) %>%
  row_spec(which(controls.head.tab$pctSeqsNeg>6), background = "#FFCCCC")
|  | OTU | totSeqs | totNegSeqs | pctSeqsNeg | Genus |
|---|---|---|---|---|---|
| 1 | OTU.1 | 562064 | 1342 | 0.2388 | g__Sphaerulina |
| 2 | OTU.2 | 312338 | 528 | 0.1690 | g__Mycosphaerella |
| 4 | OTU.4 | 168360 | 3458 | 2.0539 | g__Cladosporium |
| 7 | OTU.7 | 80866 | 39234 | 48.5173 | g__Arthonia |
| 8 | OTU.8 | 74152 | 3877 | 5.2284 | g__Cryptosphaeria |
| 13 | OTU.13 | 21660 | 18521 | 85.5078 | NA |
| 14 | OTU.14 | 21423 | 5461 | 25.4913 | g__Coniosporium |
| 25 | OTU.25 | 10114 | 9517 | 94.0973 | g__Diaporthe |
| 44 | OTU.44 | 4771 | 393 | 8.2373 | g__Rhabdocline |
| 59 | OTU.59 | 2998 | 2958 | 98.6658 | NA |
| 61 | OTU.61 | 2944 | 1882 | 63.9266 | g__Venturia |
| 82 | OTU.82 | 1970 | 1412 | 71.6751 | NA |
| 121 | OTU.121 | 953 | 120 | 12.5918 | NA |
| 125 | OTU.125 | 917 | 768 | 83.7514 | g__Knufia |
| 130 | OTU.130 | 810 | 792 | 97.7778 | NA |
| 144 | OTU.144 | 622 | 622 | 100.0000 | g__Aureobasidium |
| 145 | OTU.145 | 607 | 84 | 13.8386 | NA |
| 164 | OTU.164 | 444 | 444 | 100.0000 | NA |
| 172 | OTU.172 | 399 | 123 | 30.8271 | g__Ramularia |
| 176 | OTU.176 | 379 | 336 | 88.6544 | g__Malassezia |
| 203 | OTU.203 | 297 | 137 | 46.1279 | g__Phaeococcomyces |
| 210 | OTU.210 | 269 | 269 | 100.0000 | g__Venturia |
| 231 | OTU.231 | 224 | 88 | 39.2857 | g__Venturia |
| 241 | OTU.241 | 200 | 200 | 100.0000 | g__Penicillium |
| 311 | OTU.311 | 127 | 127 | 100.0000 | NA |
| 331 | OTU.331 | 112 | 107 | 95.5357 | g__Malassezia |
| 426 | OTU.426 | 65 | 65 | 100.0000 | g__Venturia |
| 495 | OTU.495 | 48 | 48 | 100.0000 | g__Venturia |
| 879 | OTU.879 | 10 | 10 | 100.0000 | NA |
Based on this table, a 6% threshold was selected to remove putative contaminants; for example, OTU.7 (48.5% of its reads in controls) is removed, while OTU.8 (5.2%) is retained. After this, we plotted the sequencing depth (number of sequences) per sample to identify a minimum depth cutoff. We tried three depth cutoffs: 250, 500, and 1000 reads.
## pruning out OTUs where 6% or more of reads are in the controls (created table of pruned OTUs just to check)
samps.phy %<>% prune_taxa(!(taxa_names(.) %in% (controls.phy.tab %>% filter(pctSeqsNeg>6) %>% .$OTU)),.)
samps.pruned.tab <- otu_table(samps.phy) %>% as.data.frame()
# Look at sequencing depth
plotSeqDepth <- function(phy,cutoff){
  pctPass <- phy %>% sample_data %>% data.frame %>%
    mutate(SeqDepth=sample_sums(phy),
           pass=SeqDepth>cutoff) %>%
    #group_by(DataSet) %>%
    summarise(nSamps=n(),pctPass=round(100*sum(pass)/n(),1))
  phy %>% sample_data %>% data.frame %>%
    mutate(SeqDepth=sample_sums(phy)) %>%
    .[order(.$SeqDepth),] %>%
    mutate(Index=factor(seq(nrow(.)))) %>%
    ggplot(aes(x=Index, y=SeqDepth,color=SeqDepth<cutoff)) +
    geom_point(position=position_jitter(width=50),alpha=0.5,shape=19) +
    geom_text(aes(x=-Inf,y=Inf,label=paste0(nSamps," samples")),data=pctPass,inherit.aes = F, hjust=-0.1, vjust=1.5) +
    geom_text(aes(x=-Inf,y=Inf,label=paste0(pctPass, "% > ",cutoff, " seqs")),data=pctPass,inherit.aes = F, hjust=-0.1, vjust=3) +
    scale_color_calc(name=paste0("Sequencing depth > ",cutoff,"\n")) +
    #facet_wrap(~DataSet,scales="free") +
    theme_classic() +
    theme(legend.position = "none",axis.text.x = element_blank(),axis.ticks.x = element_blank())
}
plotSeqDepth(samps.phy,250)
plotSeqDepth(samps.phy,500)
plotSeqDepth(samps.phy,1000)
Based on these plots, we selected 500 reads as the minimum sequencing depth. Samples below this cutoff were removed, along with taxa that had zero reads when the low-depth samples were removed.
minDepth <- 500 # this minDepth is based off the cutoffs tested with the plot
# Remove samples below sequencing depth cutoff
(samps.phy %<>% prune_samples(sample_sums(.)>minDepth,.) %>% filter_taxa(function(x) {sum(x) > 0}, TRUE))
Next, the ASVs were clustered into OTUs with the DECIPHER package. Taxonomy must be reassigned after the OTUs are clustered, which can be done with the method from the compiling stage.
# Cluster similar sequence variants (optional)
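## single-linkage clustering at 0.03 dissimilarity corresponds to the 97% similarity threshold described above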
cluster <- function(phy.in,method="single",dissimilarity=0.03){
  require(DECIPHER)
  clusters <- DistanceMatrix(refseq(phy.in), includeTerminalGaps = T, processors=NULL) %>%
    IdClusters(method=method, cutoff=dissimilarity, processors=NULL)
  clusters[,1] %<>% as.character()
  tax_table(phy.in) <- clusters %>% as.matrix %>% tax_table
  phy.in %<>% speedyseq::tax_glom("cluster")
  tax_table(phy.in) <- NULL
  return(phy.in)
}
clustered.phy <- cluster(samps.phy)
## Loading required package: DECIPHER
## Loading required package: RSQLite
## Warning in DistanceMatrix(refseq(phy.in), includeTerminalGaps = T, processors = NULL):
## 133 different sequence lengths.
## Using shorter length in each comparison.
# re-assign taxonomy with fungal-only Unite database for clustered phy
clust.seq <- refseq(clustered.phy)
clust.tax <- assignTaxonomy(clust.seq,"../data/sh_general_release_dynamic_s_04.02.2020.fasta",multithread=T)
rownames(clust.tax) <- taxa_names(clustered.phy)
tax_table(clustered.phy) <- tax_table(clust.tax)
Finally, singletons (OTUs present in only one sample) were removed, and the raw sequence read counts were scaled to even depth as proportional abundances (in place of rarefying).
# Remove low abundance OTUs by prevalence (singletons)
(clustered.phy %<>% filter_taxa(function(x) { sum(x>0) > 0.00412*ntaxa(.) }, TRUE) %>%
   prune_samples(sample_sums(.)>minDepth,.) %>%
   filter_taxa(function(x) {sum(x) > 0}, TRUE))
# Convert to proportional abundance (rarefaction)
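## scales each sample so its counts sum to the minimum library size across samples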
clustered.phy %<>% transform_sample_counts(function(x){x*min(sample_sums(.)/sum(x))})
We now have a depth-normalized phyloseq object with 162 OTUs in 290 samples that is ready for visualization and preliminary analysis.
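As a quick first look at the finished object, an ordination can be plotted directly from it. The following is a minimal sketch using phyloseq's ordinate() and plot_ordination() helpers; the choice of Bray-Curtis NMDS and coloring by the Disease metadata column are illustrative assumptions, not part of the original analysis.
#example first look: NMDS ordination colored by disease status (illustrative only)
ord <- ordinate(clustered.phy, method = "NMDS", distance = "bray")
plot_ordination(clustered.phy, ord, color = "Disease") + theme_classic()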