This is step 2 of my project workflow. Here, I use cutadapt to trim the primers off of my reads.
This workflow is adapted from the following pipeline: https://benjjneb.github.io/dada2/ITS_workflow.html
install r packages
install.packages('ShortRead')
install.packages("BiocManager")
BiocManager::install("dada2")
install.packages('ShortRead')
load packages
library(BiocManager)
library(ShortRead)
library("dada2")
To get cutadapt, I downloaded the miniconda installer using curl
install cutadapt software
#download miniconda from url
cd ../software
curl -O https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-MacOSX-arm64.sh
and then moved to the directory it was in by typing
cd software in the terminal. I then typed bash
Miniconda3-py310_23.3.1-0-MacOSX-arm64.sh in the terminal to start the
setup process. Conda is located in: /Users/hailaschultz/miniconda3.
I checked that installation worked by typing conda list
in the terminal. In the terminal I then typed:
conda config --add channels defaults conda config --add channels bioconda conda config --add channels conda-forge conda config --set channel_priority strict
and then conda create -n cutadaptenv cutadapt
On my personal computer, I had to use
CONDA_SUBDIR=osx-64 conda create -n cutadaptenv cutadapt
I ran conda init bash
To activate the conda environment, in the terminal I typed
conda activate cutadaptenv
download fastqc
curl -O https://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.11.7.zip
use fastqc
cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files
/Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/software/FastQC/fastqc *fastq.gz -o /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/output/raw-read-qc
make multiqc report in the terminal I ran
conda install multiqc to install multiqc
cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/output/raw-read-qc
multiqc .
screenshot of multiqc mean qualit scores
Make lists of forward and reverse read files
list files for forward and reverse reads
#set file path
path <- "../data/fastq-files"
#make matched list of forward and reverse reads
fnFs <- sort(list.files(path, pattern = "_R1_001.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(path, pattern = "_R2_001.fastq.gz", full.names = TRUE))
#specify primer sequences
FWD <- "GGWACWGGWTGAACWGTWTAYCCYCC"
REV <- "TANACYTCNGGRTGNCCRAARAAYCA"
#in the reverse primer, I replaced I with N
Check if primers are actually in data
get orientations of primers
allOrients <- function(primer) {
# Create all orientations of the input sequence
require(Biostrings)
dna <- DNAString(primer) # The Biostrings works w/ DNAString objects rather than character vectors
orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna),
RevComp = reverseComplement(dna))
return(sapply(orients, toString)) # Convert back to character vector
}
FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)
FWD.orients
## Forward Complement
## "GGWACWGGWTGAACWGTWTAYCCYCC" "CCWTGWCCWACTTGWCAWATRGGRGG"
## Reverse RevComp
## "CCYCCYATWTGWCAAGTWGGWCAWGG" "GGRGGRTAWACWGTTCAWCCWGTWCC"
count number of times primers occur in the first sample
primerHits <- function(primer, fn) {
# Counts number of reads in which the primer is found
nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
return(sum(nhits > 0))
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs[[1]]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs[[1]]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs[[1]]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs[[1]]))
## Forward Complement Reverse RevComp
## FWD.ForwardReads 63428 0 0 0
## FWD.ReverseReads 0 0 0 0
## REV.ForwardReads 0 0 0 0
## REV.ReverseReads 60704 0 0 0
Tell R the path to the cutadapt command
cutadapt <- "/Users/hailaschultz/miniconda3/envs/cutadaptenv/bin/cutadapt"
system2(cutadapt, args = "--version")
use cutadapt
path.cut <- file.path(path, "cutadapt")
if(!dir.exists(path.cut)) dir.create(path.cut)
fnFs.cut <- file.path(path.cut, basename(fnFs))
fnRs.cut <- file.path(path.cut, basename(fnRs))
FWD.RC <- dada2:::rc(FWD)
REV.RC <- dada2:::rc(REV)
# Trim FWD and the reverse-complement of REV off of R1 (forward reads)
R1.flags <- paste("-g", FWD, "-a", REV.RC)
# Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
R2.flags <- paste("-G", REV, "-A", FWD.RC)
Run Cutadapt
for(i in seq_along(fnFs)) {
system2(cutadapt, args = c(R1.flags, R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
"-o", fnFs.cut[i], "-p", fnRs.cut[i], # output files
fnFs[i], fnRs[i])) # input files
}
Check if adapters were trimmed
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[[1]]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]))
## Forward Complement Reverse RevComp
## FWD.ForwardReads 0 0 0 0
## FWD.ReverseReads 0 0 0 0
## REV.ForwardReads 0 0 0 0
## REV.ReverseReads 5 0 0 0
#there were a few reverse reads that were not trimmed, but overall it looks good
After trimming the primers, about half of the files ended up with zero data/reads. I think it is an issue with the read names. The error I get is along the lines of “error in sequence file at unknown line reads are improperly paired, read names don’t match”, I need to figure out what happened here, but for now I will proceed with the files that are okay and have pairs.
delete files that didn’t make it through cutadapt - smaller than 1 MB in the cutadapt directory
cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt
find . -type f -name "*.gz" -size -1M -delete
QC check on files after primers were trimmed
use fastqc
cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt
/Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/software/FastQC/fastqc *fastq.gz -o /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/output/after-cutadapt-qc
make multiqc report to check files after trimming primers
cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/output/after-cutadapt-qc
multiqc .
screenshot of multiqc mean quality scores after cutadapt
title: “03-filter-reads” output: html_document date: “2023-05-04” —
load packages
library(BiocManager)
library(ShortRead)
library("dada2")
Get sample names
# Forward and reverse fastq filenames have the format:
cutFs <- sort(list.files(path.cut, pattern = "_R1_001.fastq.gz", full.names = TRUE))
cutRs <- sort(list.files(path.cut, pattern = "_R2_001.fastq.gz", full.names = TRUE))
# Extract sample names, assuming filenames have format:
get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1]
sample.names <- unname(sapply(cutFs, get.sample.name))
head(sample.names)
## [1] "2018APRP12Ve-COI-48samples" "2018APRP12VeA-COI-P1"
## [3] "2018APRP22Ve-COI-48samples" "2018APRP22VeA-COI-P1"
## [5] "2018APRP28Ve-COI-48samples" "2018APRP28VeA-COI-P1"
plot forward read quality for first two files
plotQualityProfile(cutFs[1:2])
plot reverse read quality for first two files
plotQualityProfile(cutRs[1:2])
The forward reads are better quality, as we might expect.
filtFs <- file.path(path.cut, "filtered", basename(cutFs))
filtRs <- file.path(path.cut, "filtered", basename(cutRs))
# get sample names to see which files match
get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1]
sample.namesF <- unname(sapply(cutFs, get.sample.name))
sample.namesR <- unname(sapply(cutRs, get.sample.name))
Check for differences between lists and remove files that don’t have a matching pair
difsR <- setdiff(sample.namesR,sample.namesF)
difsR
## character(0)
difsF <- setdiff(sample.namesF,sample.namesR)
difsF
## character(0)
It looks like there are no differences, so we can proceed
maxN: allows zero Ns max EE: set to 5 for forward and reverse truncQ: set to 2 minlen: minimum length is 100 reads rm.phix:remove phix this step takes a long time
out <- filterAndTrim(cutFs, filtFs, cutRs, filtRs, maxN = 0, maxEE = c(5, 5),
truncQ = 2, minLen = 100, rm.phix = TRUE, compress = TRUE, multithread = TRUE)
Check how many samples went through filtering
cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt
ls | wc -l
cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt/filtered
ls | wc -l
## 97
## 96
learn the error rates
forward reads
errF <- learnErrors(filtFs, multithread=TRUE)
## 118001220 total bases in 432610 reads from 8 samples will be used for learning the error rates.
reverse reads
errR <- learnErrors(filtRs, multithread=TRUE)
## 118551091 total bases in 432610 reads from 8 samples will be used for learning the error rates.
plot the errors
plotErrors(errF, nominalQ=TRUE)
#looks okay! good to proceed
plotErrors(errR, nominalQ=TRUE)
#looks okay! good to proceed
QC check on files after filtering were trimmed
use fastqc
cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt/filtered
/Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/software/FastQC/fastqc *fastq.gz -o /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/output/after-filtering-qc
make multiqc report to check files after trimming primers
cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/output/after-filtering-qc
multiqc .
screenshot of multiqc mean quality scores after filtering