Now activate the environment you created last week.
conda activate bfblab
#Install FASTQC for sequence quality check
conda install -c bioconda fastqc
#Install fastp for FASTQ trimming
conda install -c bioconda fastp
#Install Megahit for de novo assembly
conda install -c bioconda megahit
#Then, this will show the user manual:
megahit -h
#Install QUAST for fasta quality check
conda install -c bioconda quast
#Install gdown for file downloading
conda install -c conda-forge gdown
#or you can install all of them at once
conda install -c bioconda -c conda-forge fastqc fastp megahit quast gdown
conda create --name newenv
conda activate newenv
conda install -c bioconda -c conda-forge quast gdown
First, create a separate folder for the lab exercises:
cd ~
mkdir Lab07
cd Lab07
Download the data using this link using wget:
#Download the first read:
gdown https://drive.google.com/uc?id=1CiRkrUcP3S_oNiluGQ1Mh1qSU-3FpKdF
#Download the second read:
gdown https://drive.google.com/uc?id=14di4CJ_J8TrRwISRQm0Nbt9_tILcYF9N
Open and visualize the sequencing reads in one of the fastq files. Pay attention to using ‘|’ (pipe)!
gunzip -c reads_1.fq.gz | less
Check the quality of sequences: Run the FASTQC software on both read_1 and read_2 files as follows:
conda activate bfblab
# *.gz means that select any file which ends with ".gz" ( these characters are called wildcards )
fastqc *.gz
Open the html outputs generated using a web browser. Investigate the results.
Trimming/cleaning the data may be required after QC analysis
#Trimming using fastp (paired-end) (accepts both .fq and fq.gz)
fastp -i reads_1.fq.gz -I reads_2.fq.gz -o out.reads_1.fq.gz -O out.reads_2.fq.gz
#for single read
fastp -i read.fq -o out.read.fq
After quality check and trimming(if required) we may now assemble the reads into contigs
megahit -1 out.reads_1.fq.gz -2 out.reads_2.fq.gz -o Unknown_genome_megahit
This command will create a folder called ‘Unknown_genome_megahit’. Please, navigate into that folder.
cd Unknown_genome_megahit/
ls -al
#And open the final contigs fasta file
less final.contigs.fa
#QUAST reads the fasta file and creates several statistical reports regarding that file
quast final.contigs.fa
#after opening the reports folder
less report.txt
#you can see and analyse the end report of QUAST and answer the previous questions
Example bash script to wrap up all the code above:
write these in a file using a text editor (gedit)
#!/usr/bin/bash
#set the arguments to variables
FASTQ_1=$1
FASTQ_2=$2
#Check the quality of input reads
fastqc $FASTQ_1
fastqc $FASTQ_2
#Trimming using fastp (paired-end) (accepts both .fq and fq.gz)
fastp -i $FASTQ_1 -I $FASTQ_2 -o trimmed."${FASTQ_1}" -O trimmed."${FASTQ_2}"
#Run de novo assembly using MegaHit
megahit -1 trimmed."${FASTQ_1}" -2 trimmed."${FASTQ_2}" -o Unknown_genome_megahit