This is step 2 of my project workflow. Here, I use cutadapt to trim the primers off of my reads.

This workflow is adapted from the following pipeline: https://benjjneb.github.io/dada2/ITS_workflow.html

Get started

install R packages

install.packages("BiocManager")
BiocManager::install("dada2")
# ShortRead is a Bioconductor package, so install it through BiocManager as well
BiocManager::install("ShortRead")

load packages

library(BiocManager)
library(ShortRead)
library("dada2")

To get cutadapt, I downloaded the Miniconda installer using curl.

install cutadapt software

#download miniconda from url
cd ../software
curl -O https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-MacOSX-arm64.sh

I then moved into that directory by typing cd software in the terminal and started the setup process by running bash Miniconda3-py310_23.3.1-0-MacOSX-arm64.sh. Conda is located in /Users/hailaschultz/miniconda3.
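
For reference, the setup steps look roughly like this (a sketch of the commands described above; the installer itself is interactive):

cd software
bash Miniconda3-py310_23.3.1-0-MacOSX-arm64.sh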

I checked that the installation worked by typing conda list in the terminal. I then set up the bioconda channels and strict channel priority:

conda config --add channels defaults
conda config --add channels bioconda
conda config --add channels conda-forge
conda config --set channel_priority strict

Then I created a dedicated environment with cutadapt installed: conda create -n cutadaptenv cutadapt

On my personal computer (Apple Silicon), I had to use CONDA_SUBDIR=osx-64 conda create -n cutadaptenv cutadapt, which tells conda to install the Intel (osx-64) builds instead.

I then ran conda init bash so that conda is available in new terminal sessions.

To activate the conda environment, in the terminal I typed conda activate cutadaptenv

Remove Primers

download fastqc

curl -O https://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.11.7.zip
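
The download is a zip archive, so it has to be unpacked and the fastqc script made executable before it can be run (a sketch, assuming the zip sits in the software directory):

unzip fastqc_v0.11.7.zip
chmod +x FastQC/fastqc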

use fastqc

cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files
/Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/software/FastQC/fastqc *fastq.gz -o /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/output/raw-read-qc
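
FastQC expects the output directory to exist already, so if needed it can be created first (a small extra step, not in my original notes):

mkdir -p /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/output/raw-read-qc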

make multiqc report

To install MultiQC, I ran conda install multiqc in the terminal.

cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/output/raw-read-qc

multiqc .

screenshot of multiqc mean quality scores

Make lists of forward and reverse read files

list files for forward and reverse reads

#set file path
path <- "../data/fastq-files"
#make matched list of forward and reverse reads
fnFs <- sort(list.files(path, pattern = "_R1_001.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(path, pattern = "_R2_001.fastq.gz", full.names = TRUE))

#specify primer sequences
FWD <- "GGWACWGGWTGAACWGTWTAYCCYCC"  
REV <- "TANACYTCNGGRTGNCCRAARAAYCA"  
# in the reverse primer, I replaced the inosine (I) bases with N, since N matches any base
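
As a quick sanity check (not part of the original pipeline), the forward and reverse file lists should be the same length:

length(fnFs)
length(fnRs)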

Check if primers are actually in data

get orientations of primers

allOrients <- function(primer) {
    # Create all orientations of the input sequence
    require(Biostrings)
    dna <- DNAString(primer)  # Biostrings works with DNAString objects rather than character vectors
    orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna), 
        RevComp = reverseComplement(dna))
    return(sapply(orients, toString))  # Convert back to character vector
}
FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)
FWD.orients
##                      Forward                   Complement 
## "GGWACWGGWTGAACWGTWTAYCCYCC" "CCWTGWCCWACTTGWCAWATRGGRGG" 
##                      Reverse                      RevComp 
## "CCYCCYATWTGWCAAGTWGGWCAWGG" "GGRGGRTAWACWGTTCAWCCWGTWCC"

count the number of reads containing each primer orientation in the first sample

primerHits <- function(primer, fn) {
    # Counts number of reads in which the primer is found
    nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
    return(sum(nhits > 0))
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs[[1]]), 
    FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs[[1]]), 
    REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs[[1]]), 
    REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs[[1]]))
##                  Forward Complement Reverse RevComp
## FWD.ForwardReads   63428          0       0       0
## FWD.ReverseReads       0          0       0       0
## REV.ForwardReads       0          0       0       0
## REV.ReverseReads   60704          0       0       0

Tell R the path to the cutadapt command

cutadapt <- "/Users/hailaschultz/miniconda3/envs/cutadaptenv/bin/cutadapt" 
system2(cutadapt, args = "--version") 

use cutadapt

path.cut <- file.path(path, "cutadapt")
if(!dir.exists(path.cut)) dir.create(path.cut)
fnFs.cut <- file.path(path.cut, basename(fnFs))
fnRs.cut <- file.path(path.cut, basename(fnRs))

FWD.RC <- dada2:::rc(FWD)
REV.RC <- dada2:::rc(REV)
# Trim FWD and the reverse-complement of REV off of R1 (forward reads)
R1.flags <- paste("-g", FWD, "-a", REV.RC) 
# Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
R2.flags <- paste("-G", REV, "-A", FWD.RC)

Run Cutadapt

for(i in seq_along(fnFs)) {
  system2(cutadapt, args = c(R1.flags, R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
                             "-o", fnFs.cut[i], "-p", fnRs.cut[i], # output files
                             fnFs[i], fnRs[i])) # input files
}
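
To sanity-check what actually gets passed to cutadapt, the assembled arguments for the first sample pair can be printed (just an illustrative check, not part of the original pipeline):

cat(cutadapt, R1.flags, R2.flags, "-n", 2, "-o", fnFs.cut[1], "-p", fnRs.cut[1], fnFs[1], fnRs[1], "\n")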

Check if primers were trimmed

rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]), 
    FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]), 
    REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[[1]]), 
    REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]))
##                  Forward Complement Reverse RevComp
## FWD.ForwardReads       0          0       0       0
## FWD.ReverseReads       0          0       0       0
## REV.ForwardReads       0          0       0       0
## REV.ReverseReads       5          0       0       0
# a few reverse reads still contained the REV primer, but overall it looks good

After trimming the primers, about half of the files ended up with zero reads. I think it is an issue with the read names; the error I get is along the lines of “error in sequence file at unknown line reads are improperly paired, read names don’t match”. I need to figure out what happened here, but for now I will proceed with the files that are okay and have matching pairs.
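
One way to spot-check the pairing problem is to compare the first read header in the R1 and R2 files of an affected sample (a diagnostic sketch; SAMPLE is a placeholder for a real file name):

gzip -cd SAMPLE_R1_001.fastq.gz | head -1
gzip -cd SAMPLE_R2_001.fastq.gz | head -1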

delete files that didn't make it through cutadapt (anything smaller than 1 MB in the cutadapt directory)

cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt
find . -type f -name "*.gz" -size -1M -delete
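
Before running the destructive version, the same find can be run with -print instead of -delete to preview which files would be removed (a cautious extra step, not in the original workflow):

find . -type f -name "*.gz" -size -1M -print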

QC check on files after primers were trimmed

use fastqc

cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt
/Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/software/FastQC/fastqc *fastq.gz -o /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/output/after-cutadapt-qc

make multiqc report to check files after trimming primers

cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/output/after-cutadapt-qc

multiqc .

screenshot of multiqc mean quality scores after cutadapt

Filter Reads

This is step 3 of my project workflow (03-filter-reads). Here, I filter the primer-trimmed reads using dada2.

load packages

library(BiocManager)
library(ShortRead)
library("dada2")

Inspect read quality

Get sample names

# Forward and reverse fastq filenames have the format: SAMPLENAME_R1_001.fastq.gz and SAMPLENAME_R2_001.fastq.gz
cutFs <- sort(list.files(path.cut, pattern = "_R1_001.fastq.gz", full.names = TRUE))
cutRs <- sort(list.files(path.cut, pattern = "_R2_001.fastq.gz", full.names = TRUE))

# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq.gz
get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1]
sample.names <- unname(sapply(cutFs, get.sample.name))
head(sample.names)
## [1] "2018APRP12Ve-COI-48samples" "2018APRP12VeA-COI-P1"      
## [3] "2018APRP22Ve-COI-48samples" "2018APRP22VeA-COI-P1"      
## [5] "2018APRP28Ve-COI-48samples" "2018APRP28VeA-COI-P1"

plot forward read quality for first two files

plotQualityProfile(cutFs[1:2])

plot reverse read quality for first two files

plotQualityProfile(cutRs[1:2])

The forward reads are better quality, as we might expect.

filtFs <- file.path(path.cut, "filtered", basename(cutFs))
filtRs <- file.path(path.cut, "filtered", basename(cutRs))
# get sample names to see which files match
get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1]
sample.namesF <- unname(sapply(cutFs, get.sample.name))
sample.namesR <- unname(sapply(cutRs, get.sample.name))

Check for differences between lists and remove files that don’t have a matching pair

difsR <- setdiff(sample.namesR,sample.namesF)
difsR
## character(0)
difsF <- setdiff(sample.namesF,sample.namesR)
difsF
## character(0)

It looks like there are no differences, so we can proceed.

filter reads using quality thresholds

maxN: allow zero Ns
maxEE: set to 5 for both forward and reverse reads
truncQ: truncate each read at the first quality score of 2 or lower
minLen: discard reads shorter than 100 bases after truncation
rm.phix: remove reads that match the PhiX genome

This step takes a long time.

out <- filterAndTrim(cutFs, filtFs, cutRs, filtRs, maxN = 0, maxEE = c(5, 5), 
    truncQ = 2, minLen = 100, rm.phix = TRUE, compress = TRUE, multithread = TRUE) 
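
The matrix returned by filterAndTrim records the number of reads going in and coming out for each sample; it is worth a quick look (as in the dada2 tutorial):

head(out)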

Check how many samples went through filtering

cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt

ls | wc -l
##       97

cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt/filtered

ls | wc -l
##       96

The count in the cutadapt directory presumably includes the filtered/ subdirectory itself, so it looks like all 96 trimmed files made it through filtering.

learn the error rates

forward reads

errF <- learnErrors(filtFs, multithread=TRUE)
## 118001220 total bases in 432610 reads from 8 samples will be used for learning the error rates.

reverse reads

errR <- learnErrors(filtRs, multithread=TRUE)
## 118551091 total bases in 432610 reads from 8 samples will be used for learning the error rates.

plot the errors

plotErrors(errF, nominalQ=TRUE)

#looks okay! good to proceed
plotErrors(errR, nominalQ=TRUE)

#looks okay! good to proceed

QC check on files after filtering

use fastqc

cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt/filtered
/Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/software/FastQC/fastqc *fastq.gz -o /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/output/after-filtering-qc

make multiqc report to check files after filtering reads

cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/output/after-filtering-qc

multiqc .

screenshot of multiqc mean quality scores after filtering