Processing wood mycobiome data from Populus trichocarpa

Gillian E. Bergmann (gillian.bergmann@gmail.com)

This report provides the r code used to process Illumina MiSeq data from the wood mycobiome of Populus trichocarpa. The original code is saved in the following scripts:

denoise_wood.R
compile_wood.R
analysis_wood.R

Additionally, I used Pheniqs to demultiplex the samples, and a shell script with the programs Cutadapt and Seqpurge to remove primer tails from all sequences. These programs were used on a combined MiSeq library of the Populus wood samples and Douglas-fir seed samples. The output of this processing method is a single phyloseq object, which includes the identified OTUs and their taxonomy, their sequences, their read abundances across samples, and metadata for the samples.

Loading relevant packages and data

library(tidyverse)
library(magrittr)
library(dada2)
library(ShortRead)
library(phyloseq)
library(Biostrings)
library(ggplot2)
library(stringr)
library(readxl)
library(ggthemes)
# these last two packages are to knit certain visual aids in this markdown
library(knitr)
library(kableExtra)

Denoising

After demultiplexing the sequences into samples and trimming off primer sequences, the first step is to denoise sequences. The input for this step is the trimmed raw sequence files, which includes wood samples and seed samples from another project. We first read in all the sequence files, remove the seed sequences, and then plot the quality of the sequences. The sequence quality plots are randomly selected from the available samples. The outputs here are two plots of sequence quality from a random sample subset. The first plot is for forward reads, and the second plot is for reverse reads.

#path for output
out.path <- "output/dada"

#Identify paths to trimmed fwd and rev reads
in.path <- file.path("../../MiSeq_March2020/output/trim/wood")
path.fwd <- in.path %>% list.files(pattern="R1.fq.gz",full.names = T) %>% sort #forward reads
path.rev <- in.path %>% list.files(pattern="R2.fq.gz",full.names = T) %>% sort #reverse reads

#Filtering out seed samples
f_samp <- path.fwd %>% str_sub(., 37, 48)
r_samp <- path.rev %>% str_sub(., 37, 48)

Wood_Samps <- read_excel("../data/GBergmann_IlluminaMiSeq_Samples.xlsx", sheet = "Wood")
real <- Wood_Samps$Samps

path.fwd %<>% .[f_samp %in% real] 
path.rev %<>% .[r_samp %in% real]

#make file names / paths for trimmed and filtered files
filt.path <- file.path(out.path, "filt")
filt.fwd <- path.fwd %>% gsub(in.path,filt.path,.)
filt.rev <- path.rev %>% gsub(in.path,filt.path,.)
  
#preview read quality of 12 randomly selected samples before trimming
qual.samps <- sample(1:length(path.fwd),6)
plotQualityProfile(path.fwd[qual.samps]) 
plotQualityProfile(path.rev[qual.samps])

After plotting sequence quality, we can filter out sequences based on maximum error (maxEE), trim the remaining sequences, and check the quality again. As with before, the first quality plot is for forward reads, and the second plot is for reverse reads.

#Filter and trim
trim <- filterAndTrim(path.fwd, filt.fwd,
                      path.rev, filt.rev,
                      maxN=0, maxEE=c(2,2), truncQ=2,
                      multithread=TRUE)
  
#Update list of trimmed file paths to exclude samples with no reads passing filters
filt.fwd <- list.files(filt.path, pattern = "R1.fq.gz",full.names = T)
filt.rev <- list.files(filt.path, pattern = "R2.fq.gz",full.names = T)
  
#Check quality of trimmed and filtered reads
qual.samps <- sample(1:length(filt.fwd),12)
plotQualityProfile(filt.fwd[qual.samps])
plotQualityProfile(filt.rev[qual.samps])

We can see from these updated plots that filtering has improved overall quality. We can now dereplicate (combine) identical sequences, identify the true sequences in the community (denoise), make a table of those sequences, and remove chimeras. After these steps, we can create a datasheet summarizing the denoising process (e.g. how many sequences are present after each step.) The denoising step takes a long time, since millions of sequences are randomly selected from the samples to learn errors for this step.

#dereplicate identical sequences
derep.fwd <- derepFastq(filt.fwd, verbose=F)
derep.rev <- derepFastq(filt.rev, verbose=F) 

#Trim names of derep objects to just sample names 
names(derep.fwd) %<>% gsub(".R1.fq.gz","",.)
names(derep.rev) %<>% gsub(".R2.fq.gz","",.)
  
#Learn errors 
err.fwd <- learnErrors(filt.fwd, multithread=TRUE, nbases = 3e08) 
plotErrors(err.fwd, nominalQ=TRUE)
err.rev <- learnErrors(filt.rev, multithread=TRUE, nbases = 3e08)
plotErrors(err.rev, nominalQ=TRUE)

#Denoise
dada.fwd <- dada(derep.fwd, err=err.fwd, multithread=TRUE)
dada.rev <- dada(derep.rev, err=err.rev, multithread=TRUE)

#Merge reads
merged <- mergePairs(dada.fwd, derep.fwd, dada.rev, derep.rev, trimOverhang = T)
  
#Make sequence table
seqtab <- merged %>% makeSequenceTable
  
#Remove chimeras
seqtab.nonChimeras <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE)
  
#Make summary report of what happened
getN <- function(x) sum(getUniques(x))
trim.summary <- trim %>% data.frame %>% rownames_to_column("Sample") 
trim.summary$Sample %<>% str_sub(., 1, 12)
track <- cbind(sapply(dada.fwd, getN), 
               sapply(dada.rev, getN), 
               sapply(merged, getN),
               rowSums(seqtab.nonChimeras)) %>% 
  data.frame %>% rownames_to_column("Sample") 
track %<>% left_join(trim.summary,.)
colnames(track) <- c("sample", "input", "filtered", "denoised.fwd", "denoised.rev", "merged", "ChimeraFiltered")
print(head(track))
file.path(out.path,"dadaSummary.csv") %>% write.csv(track,.,row.names = F)

##         sample input filtered denoised.fwd denoised.rev merged ChimeraFiltered
## 1 myco.03.01.a  1557      976          958          958    958             958
## 2 myco.03.01.b 12248    10215        10173        10177  10054           10054
## 3 myco.03.01.c 11357     8669         8538         8593   8063            8063
## 4 myco.03.01.d  5355     4154         4131         4147   3921            3921
## 5 myco.03.01.e 12862    11314        11243        11251  10844           10844
## 6 myco.03.01.f 18892    16359        16315        16313  15048           15048

This table shows how many reads are left after each step of the denoising stage. Based on this table, it seems that there is decent sequencing depth for most of the samples. We can now save the sequence table (seqTab) for further processing.

# Save output
file.path(out.path,"seqTab.rds") %>% saveRDS(seqtab.nonChimeras,.)

Compiling

After denoising and filtering the sequences, they must be compiled with sample metadata into a phyloseq object. This object type allows for a variety of data visualizations and analyses. To set this up, we first read in the input sequence table and metadata, pull the sequences out of the data file, rename them and write them into a fasta file. We also want to download the most recent taxonomy database from UNITE if it’s not already saved in the project (it was saved here, so the necessary script is commented out at the bottom).

# read in sample metadata (row.names refers to column one, where the sample IDs are)
meta <- read.csv("../data/wood_meta2.csv",as.is=T,row.names=1)
meta$Disease[meta$Disease=="healthy"] <- "Healthy" # I had forgotten to capitalize healthy for a few of the samples, so this line is here to correct that

# read in denoised sequence table
seqTab <- readRDS("../output/dada/seqTab.rds")
#extract sequences from OTU table 
seqs <- getSequences(seqTab) %>% DNAStringSet
#rename sequences
OTU.names <- paste0("OTU.",1:ncol(seqTab))
colnames(seqTab) <- OTU.names
names(seqs) <- OTU.names
#Create data frame for summary of processing
compile.summary <- data.frame(SampID=rownames(seqTab),
                              denoised=rowSums(seqTab))
# create scratch directory
dir.create("../output/scratch")

## Warning in dir.create("../output/scratch"): '../output/scratch' already exists

# write temp file
writeXStringSet(seqs,file="../output/scratch/tmp.fasta",width=1000)

#download a version of the UNITE database if you dont have it
  ## This link downloads the most recent Fungi 2 database (https://dx.doi.org/10.15156/BIO/786369)
  ### There are newer versions than this. You can find them on the UNITE website under general fasta release. You can either download them to you computer and then upload via rstudio server or get the url to the file from the website (right click the download link and copy) and use the download.file() function to do it directly.
#if(!file.exists("data/taxonomy_db")){
#  download.file("https://files.plutof.ut.ee/public/orig/1D/B9/1DB95C8AC0A80108BECAF1162D761A8D379AF43E2A4295A3EF353DD1632B645B.gz",
#                "data/taxonomy_db.zip",
#                method="wget")
#  unzip("data/taxonomy_db.zip",exdir="data/taxonomy_db")
#  file.remove("data/taxonomy_db.zip")}

After preparing the data, we remove host (Populus trichocarpa) sequences and extract the ITS2 region from the sequences. We need to extract the ITS2 region to assign taxonomy, and it allows us to filter out other sequences that don’t match up (like very short reads).

# remove sequences matching the P.trich. genome with Bowtie2 
system("/home/busbylab/miniconda3/bin/bowtie2 --threads 10 -x ../data/Ptri.v.3.db -f ../output/scratch/tmp.fasta --un ../output/scratch/noHost.fa --al ../output/scratch/host.fa --fast")
seqs.noHost <- readDNAStringSet("../output/scratch/noHost.fa")
compile.summary$noHost <- seqTab %>% .[,names(seqs.noHost)] %>% rowSums()

#Extract ITS region with ITSx
  ## ITSx parameters 
ITSx.flags <- paste("-i ../output/scratch/noHost.fa",
                    "-t 'fungi,tracheophyta'",
                    "--preserve T",
                    "--complement T",
                    "--summary T",
                    "-o ../output/scratch/ITSx",
                    "--only_full T",
                    "-E 1e-2")
system2("ITSx", args = ITSx.flags)

#read in ITS2 sequences from ITSx output and filter out very short reads
seqs.ITS2 <- readDNAStringSet("output/scratch/ITSx.ITS2.fasta") %>% .[.@ranges@width > 50]

#remove OTUs from seqTab that are not in ITSx output
seqTab %<>% .[,names(seqs.ITS2)]
#collapse any identical sequences after isolating ITS2 
colnames(seqTab) <- seqs.ITS2 %>% as.character %>% unname
seqTab %<>% collapseNoMismatch

Now that the non-host ITS2 sequences have been extracted, we can assign taxonomy and combine all of the data into the phyloseq object. One additional step that I added here was to filter out sequences identified to non-fungal eukaryotes. I then re-assigned taxonomy to the fungal sequences with the fungal taxonomy database from UNITE. This was to ensure that the taxonomy would be as accurate as possible. The step saving the phyloseq object to the R project is not shown here.

#assign taxonomy with DADA2 implementation of RDP classifier (needs 12+ gb ram) - using all eukaryotes here
tax <- assignTaxonomy(seqTab,"../data/taxonomy_db/sh_general_release_dynamic_02.02.2019.fasta",multithread=T)

#set final OTU names
final.names <- paste0("OTU.",1:ncol(seqTab))

final.seqs <- getSequences(seqTab) %>% DNAStringSet()
names(final.seqs) <- final.names
colnames(seqTab) <- final.names
rownames(tax) <- final.names

#make phyloseq object
phy <- phyloseq(otu_table(seqTab,taxa_are_rows = F), 
                sample_data(meta),
                tax_table(tax),
                refseq(final.seqs))

# remove non-fungal taxa from phyloseq object
min.reads <- 0
keep.samples <- sample_sums(phy) > min.reads
phy %<>% subset_taxa(., Kingdom=="k__Fungi") %>% prune_samples(keep.samples,.) %>% filter_taxa(function(x) {sum(x) > 0}, TRUE)

# re-assign taxonomy with fungal-only database
phy.seq <- refseq(phy)
phy.tax <- assignTaxonomy(phy.seq,"../data/sh_general_release_dynamic_s_04.02.2020.fasta",multithread=T)
rownames(phy.tax) <- taxa_names(phy)
tax_table(phy) <- tax_table(phy.tax)
print(head(compile.summary))

One last thing we can look at in the compiling step is the summary of what happened. This shows us how many sequence reads are left for each sample following removal of host DNA.

##              X       SampID denoised noHost
## 1 myco.03.01.a myco.03.01.a      958    958
## 2 myco.03.01.b myco.03.01.b    10054   8422
## 3 myco.03.01.c myco.03.01.c     8063    120
## 4 myco.03.01.d myco.03.01.d     3921   1032
## 5 myco.03.01.e myco.03.01.e    10844   8766
## 6 myco.03.01.f myco.03.01.f    15048   7991

From this table, we can see that some of the samples were dominated by host sequences. This ultimately limited the quality of the final data set.

Final quality filtering

Now that we have our phyloseq object, we can do some additional filtering steps. This includes removing PCR and extraction controls, removing Amplicon Sequence Variants (ASVs) that are contaminants, removing samples with low sequencing depth, and removing singleton taxa. I also clustered sequences into Operational Taxonomic Units (OTUs) at 97% similarity, but this is optional.

We must first remove contaminants. These are identified taxa where a certain threshold percentage of their reads is present in the controls. The threshold percentage is partially based on if the taxa make sense as endophytes vs. some other fungal group (e.g. animal parasites). To determine an appropriate threshold, we calculated the percentage of sequences in controls for all taxa, and printed these calculations in a table, shown below (subset to only show OTUs with non-zero read counts in the controls).

# subsetting the phyloseq into controls and samples
controls.phy <- subset_samples(phy, Control=="Yes")
samps.phy <- subset_samples(phy, Control=="No")

# creating a table
controls.phy.tab <- tax_table(controls.phy) %>% data.frame %>% 
  transmute(
    OTU=taxa_names(controls.phy),
    totSeqs=taxa_sums(phy %>% prune_taxa(taxa_names(controls.phy),.)),
    totNegSeqs = taxa_sums(controls.phy),
    pctSeqsNeg = round((100*totNegSeqs/totSeqs),4),
    Genus=Genus
  )
controls.head.tab <- subset(controls.phy.tab, pctSeqsNeg>0)
controls.head.tab %>% kable(caption = "OTUs in negative controls, putative contaminants highlighted (OTUs not in negative controls ommitted)") %>%
  kable_styling(full_width = F) %>% 
  row_spec(which(controls.head.tab$pctSeqsNeg>6), background = "#FFCCCC")

OTUs in negative controls, putative contaminants highlighted (OTUs not in negative controls ommitted)
	OTU	totSeqs	totNegSeqs	pctSeqsNeg	Genus
1	OTU.1	562064	1342	0.2388	g__Sphaerulina
2	OTU.2	312338	528	0.1690	g__Mycosphaerella
4	OTU.4	168360	3458	2.0539	g__Cladosporium
7	OTU.7	80866	39234	48.5173	g__Arthonia
8	OTU.8	74152	3877	5.2284	g__Cryptosphaeria
13	OTU.13	21660	18521	85.5078	NA
14	OTU.14	21423	5461	25.4913	g__Coniosporium
25	OTU.25	10114	9517	94.0973	g__Diaporthe
44	OTU.44	4771	393	8.2373	g__Rhabdocline
59	OTU.59	2998	2958	98.6658	NA
61	OTU.61	2944	1882	63.9266	g__Venturia
82	OTU.82	1970	1412	71.6751	NA
121	OTU.121	953	120	12.5918	NA
125	OTU.125	917	768	83.7514	g__Knufia
130	OTU.130	810	792	97.7778	NA
144	OTU.144	622	622	100.0000	g__Aureobasidium
145	OTU.145	607	84	13.8386	NA
164	OTU.164	444	444	100.0000	NA
172	OTU.172	399	123	30.8271	g__Ramularia
176	OTU.176	379	336	88.6544	g__Malassezia
203	OTU.203	297	137	46.1279	g__Phaeococcomyces
210	OTU.210	269	269	100.0000	g__Venturia
231	OTU.231	224	88	39.2857	g__Venturia
241	OTU.241	200	200	100.0000	g__Penicillium
311	OTU.311	127	127	100.0000	NA
331	OTU.331	112	107	95.5357	g__Malassezia
426	OTU.426	65	65	100.0000	g__Venturia
495	OTU.495	48	48	100.0000	g__Venturia
879	OTU.879	10	10	100.0000	NA

Based on this table, a 6% threshold was selected to remove putative contaminants. After this, we plotted the sequencing depth (number of sequences) per sample to identify a minimum depth cutoff. We tried three depth cutoffs: 250 reads, 500 reads and 1000 reads.

## pruning out OTUs where 6% or more of reads are in the controls (created table of pruned OTUs just to check)
samps.phy %<>% prune_taxa(!(taxa_names(.) %in% (controls.phy.tab %>% filter(pctSeqsNeg>6) %>% .$OTU)),.) 
samps.pruned.tab <- otu_table(samps.phy) %>% as.data.frame()
# Look at sequencing depth
plotSeqDepth <- function(phy,cutoff){
  pctPass <- samps.phy %>% sample_data %>% data.frame %>% 
    mutate(SeqDepth=sample_sums(phy),
           pass=SeqDepth>cutoff) %>% 
    #group_by(DataSet) %>%
    summarise(nSamps=n(),pctPass=round(100*sum(pass)/n(),1))
  samps.phy %>% sample_data %>% data.frame %>%  
    mutate(SeqDepth=sample_sums(samps.phy)) %>% 
    .[order(.$SeqDepth),] %>%
    mutate(Index=factor(seq(nrow(.)))) %>%
    ggplot(aes(x=Index, y=SeqDepth,color=SeqDepth<cutoff)) + 
    geom_point(position=position_jitter(width=50),alpha=0.5,shape=19) +
    geom_text(aes(x=-Inf,y=Inf,label=paste0(nSamps," samples")),data=pctPass,inherit.aes = F, hjust=-0.1, vjust=1.5) +
    geom_text(aes(x=-Inf,y=Inf,label=paste0(pctPass, "% > ",cutoff, " seqs")),data=pctPass,inherit.aes = F, hjust=-0.1, vjust=3) +
    scale_color_calc(name=paste0("Sequencing depth > ",cutoff,"\n")) +
    #facet_wrap(~DataSet,scales="free") +
    theme_classic() + 
    theme(legend.position = "none",axis.text.x = element_blank(),axis.ticks.x = element_blank()) 
}
plotSeqDepth(samps.phy,250)

plotSeqDepth(samps.phy,500)

plotSeqDepth(samps.phy,1000)

Based on these plots, we selected 500 reads as the minimum sequencing depth. Samples below this cutoff were removed, along with taxa that had zero reads when the low-depth samples were removed.

minDepth <- 500 # this minDepth is based off the cutoffs tested with the plot
# Remove samples below sequencing depth cutoff
(samps.phy %<>% prune_samples(sample_sums(.)>minDepth,.) %>% filter_taxa(function(x) {sum(x) > 0}, TRUE))

Next, the ASVs were clustered into OTUs with the DECIPHER package. Taxonomy must be reassigned after the OTUs are clustered, which can be done with the method from the compiling stage.

# Cluster similar sequence variants (optional)
cluster <- function(phy.in,method="single",dissimilarity=0.03){
  require(DECIPHER)
  clusters <- DistanceMatrix(refseq(phy.in), includeTerminalGaps = T, processors=NULL) %>%
    IdClusters(method=method, cutoff=dissimilarity, processors=NULL) 
  clusters[,1] %<>% as.character()
  tax_table(phy.in) <- clusters %>% as.matrix %>% tax_table
  phy.in %<>% speedyseq::tax_glom("cluster")
  tax_table(phy.in) <- NULL
  return(phy.in)
}  
clustered.phy <- cluster(samps.phy)

## Loading required package: DECIPHER

## Loading required package: RSQLite

## Warning in DistanceMatrix(refseq(phy.in), includeTerminalGaps = T, processors = NULL): 
## 133 different sequence lengths.
## Using shorter length in each comparison.

# re-assign taxonomy with fungal-only Unite database for clustered phy
clust.seq <- refseq(clustered.phy)
clust.tax <- assignTaxonomy(clust.seq,"../data/sh_general_release_dynamic_s_04.02.2020.fasta",multithread=T)
rownames(clust.tax) <- taxa_names(clustered.phy)
tax_table(clustered.phy) <- tax_table(clust.tax)

Finally, singletons (OTUs present in only one sample) were removed, and the raw sequence read counts were rarefied to proportional abundance.

# Remove low abundance OTUs by prevalance (singletons)
(clustered.phy %<>% filter_taxa(function(x) { sum(x>0) > 0.00412*ntaxa(.) }, TRUE) %>%
    prune_samples(sample_sums(.)>minDepth,.) %>% 
    filter_taxa(function(x) {sum(x) > 0}, TRUE))

# Convert to proportional abundance (rarefaction)
clustered.phy %<>% transform_sample_counts(function(x){x*min(sample_sums(.)/sum(x))})

We now have a rarefied phyloseq object with 162 OTUs in 290 samples that is ready for visualization and preliminary analysis.