Data for analysis are downloaded from ftp.sra.ebi.ac.uk/vol1/fastq and data for reference is downloaded from NBCI. The genome is of Mycobacterium tuberculosis H37Rv, assession is SRR28714667.
Codes below are runned in Linux:
setwd("/Users/nnthieu/Rosalind/")
# Genotype: CalA_resC6
system("wget -P ./raw/ ftp.sra.ebi.ac.uk/vol1/fastq/SRR287/078/SRR28714678/SRR28714678_1.fastq.gz")
system("wget -P ./raw/ ftp.sra.ebi.ac.uk/vol1/fastq/SRR287/078/SRR28714678/SRR28714678_2.fastq.gz")
Download reference genome from NBCI then save it to directory ‘/ref/ASM19595v2’:
# reference genome
wget \
'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/195/955/GCA_000195955.2_ASM19595v2/GCA_000195955.2_ASM19595v2_genomic.fna.gz' \
-P './ref/ASM19595v2/'
# Masking repeat regions
gzip -c -d "./ref/ASM19595v2/GCA_000195955.2_ASM19595v2_genomic.fna.gz" > "./ref/ASM19595v2/GCA_000195955.2_ASM19595v2.fasta"
## Comparisons among reference genome to identify genome repeat location
mkdir -p "ref/ASM19595v2/masked/"
nucmer \
-p "./ref/masked/reference" \
--maxmatch --nosimplify \
"./ref/ASM19595v2/GCA_000195955.2_ASM19595v2.fasta" \
"./ref/ASM19595v2/GCA_000195955.2_ASM19595v2.fasta"
## BED file locations to be masked
show-coords -r -T -H "ref/masked/reference.delta" > "ref/masked/reference.coords"
awk '{if ($1 != $3 && $2 != $4) print $0}' ref/masked/reference.coords > ref/masked/masked_ref_BEFORE_ORDER.bed
awk '{print $8"\t"$1"\t"$2}' ref/masked/masked_ref_BEFORE_ORDER.bed > "ref/masked/masked_ref.bed"
## Masking regions in the bed file
bedtools maskfasta \
-fi "./ref/ASM19595v2/GCA_000195955.2_ASM19595v2.fasta" \
-bed "./ref/masked/masked_ref.bed" \
-fo "./ref/masked/masked_reference.fa"
# Indexing (.fai and .dict)
mkdir -p "./ref/fai/" "./ref/dict/"
samtools faidx \
"./ref/masked/masked_reference.fa" \
-o "ref/fai/masked_reference.fa.fai"
gatk CreateSequenceDictionary \
-R "./ref/masked/masked_reference.fa" \
-O "./ref/dict/masked_reference.dict"
# BWA indexing
mkdir -p "ref/BWAIndex/"
bwa index \
-p "./ref/BWAIndex/reference" \
"./ref/masked/masked_reference.fa"
# Run FastQC
fastqc \
--threads 4 \
--memory 4096 \
-o "./output/FastQC/" \
"./raw/SRR28714678_1.fastq.gz" "./raw/SRR28714678_2.fastq.gz"
# Trimming
fastp \
--in1 "raw/SRR28714678_1.fastq.gz" \
--in2 "raw/SRR28714678_2.fastq.gz" \
--out1 "./output/trimmed/SRR28714678_1.trimmed.fastq.gz" \
--out2 "./output/trimmed/SRR28714678_2.trimmed.fastq.gz" \
--json "./output/trimmed/SRR28714678.fastp.json" \
--html "./output/trimmed/SRR28714678.fastp.html" \
--thread 4 \
--detect_adapter_for_pe \
--overrepresentation_analysis \
2>&1 | tee "./output/trimmed/SRR28714678.fastp.log"
# BWA mapping
bwa mem \
-K 100000000 -Y -R "@RG\tID:HAWRWADXX.CalA_resC6.1\tPU:1\tSM:SRR28714678_CalA_resC6\tLB:CalA_resC6\tDS:lecture9/ref/masked/masked_reference.fa\tPL:ILLUMINA" \
-t 4 \
"ref/BWAIndex/reference" \
"output/trimmed/SRR28714678_1.trimmed.fastq.gz" "output/trimmed/SRR28714678_2.trimmed.fastq.gz" \
> "output/alignment/SRR28714678_aln.sam"
# Covert to BAM format and sorting
samtools sort \
--threads 4 \
-o "./output/alignment/SRR28714678_aln_sorted.bam" \
"./output/alignment/SRR28714678_aln.sam"
rm "./output/alignment/SRR28714678_aln.sam"
Mark dupplicates
mkdir -p "./output/alignment/markduplicates/"
gatk --java-options "-Xmx4G" MarkDuplicates \
--INPUT "./output/alignment/SRR28714678_aln_sorted.bam" \
--OUTPUT "./output/alignment/markduplicates/SRR28714678_sorted_md.bam" \
--METRICS_FILE "./output/alignment/markduplicates/SRR28714678_md.metrics" \
--TMP_DIR "./output/alignment/markduplicates/" \
--REFERENCE_SEQUENCE "./ref/masked/masked_reference.fa" \
-REMOVE_DUPLICATES false \
-VALIDATION_STRINGENCY LENIENT
Clean bam file after mark duplicate
gatk --java-options "-Xmx4G" CleanSam \
-I "output/alignment/markduplicates/SRR28714678_sorted_md.bam" \
-O "output/alignment/markduplicates/SRR28714678_cleaned_md.bam"
Ensuring all mate-pair information is in sync between each read and its mate pair
gatk --java-options "-Xmx4G" FixMateInformation \
-I "output/alignment/markduplicates/SRR28714678_cleaned_md.bam" \
-O "output/alignment/markduplicates/SRR28714678_fixmate_md.bam" \
--VALIDATION_STRINGENCY LENIENT
mv "output/alignment/markduplicates/SRR28714678_fixmate_md.bam" "output/alignment/markduplicates/SRR28714678_final.bam"
Indexing preprocessed bam
samtools index "output/alignment/markduplicates/SRR28714678_final.bam"
mkdir -p "./output/alignment/stats/"
samtools stat \
--threads 1 \
--reference "./ref/masked/masked_reference.fa" \
"output/alignment/markduplicates/SRR28714678_final.bam" \
> "output/alignment/stats/SRR28714678_final.bam.stats"
mosdepth \
--threads 4 \
--fasta "./ref/masked/masked_reference.fa" \
-n --fast-mode --by 500 \
"./output/alignment/stats/mosdepth/SRR28714678_final" \
"output/alignment/markduplicates/SRR28714678_final.bam"
mkdir -p "output/alignment/stats/qualimap/"
qualimap bamqc \
-bam "output/alignment/markduplicates/SRR28714678_final.bam" \
-p non-strand-specific \
--collect-overlap-pairs \
-outdir "output/alignment/stats/qualimap/" \
-nt 2
samtools coverage \
"output/alignment/markduplicates/SRR28714678_final.bam" \
> "./output/alignment/stats/SRR28714678_final_cov.stats"
samtools flagstats \
"output/alignment/markduplicates/SRR28714678_final.bam" \
> "./output/alignment/stats/SRR28714678_final.flagstats"
mkdir -p "output/results/"
multiqc . -f --filename "output/results/microbial_WGS_multiqc_report.html"