My post is aimed at showing nessessory steps to download and process data to call variants from sequencing fastq files mapping to the reference genome file.
I will use human raw data from 1000 Genome project to demonstrate variant calling using GATK4 best practice. For saving memory space, I just download chromosome 21 from the genome of each indivisual.
These fastq files that I need to download is listed in the ids.txt file:
knitr::include_graphics("/Volumes/Extreme/vcfgatk/p1.jpg")
For downloading file by file of fastq files, I use this code
system("sam-dump --verbose --fastq --aligned-region chr21 --output-file ERR1019044_chr21.fastq ERR1019044
")
For getting consecutively the fastq files I use this download_fastq.sh file:
# Read the contents of the .sh file
sh_contents <- suppressWarnings(readLines("/Volumes/Extreme/vcfgatk/fastq/download_fastq.sh"))
# Display the file's contents
cat(sh_contents, sep = "\n")
## #!/bin/bash
##
## # Check if ids.txt exists
## if [ ! -f ids.txt ]; then
## echo "ids.txt file not found!"
## exit 1
## fi
##
## # Read each line of ids.txt
## while IFS= read -r line; do
## # Extract the first field (sample ID) from the line
## sample_id=$(echo $line | awk '{print $1}')
##
## # Set the output file name based on the sample ID
## output_file="${sample_id}_chr21.fastq"
##
## # Run the sam-dump command for the current sample
## echo "Processing $sample_id..."
## sam-dump --verbose --fastq --aligned-region chr21 --output-file "$output_file" "$sample_id"
##
## # Check if sam-dump was successful
## if [ $? -eq 0 ]; then
## echo "Finished processing $sample_id"
## else
## echo "Error processing $sample_id"
## exit 1
## fi
##
## done < ids.txt
##
## echo "All samples processed successfully!"
output fastq files are:
knitr::include_graphics("/Volumes/Extreme/vcfgatk/p2.jpg")
Check file quality
# check if fastq files contain 0 bp
system("cat ./fastq/ERR1019034_chr21.fastq | head -n 25")
system("cd refgenome")
system("wget ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.fna.gz")
system("gatk CreateSequenceDictionary \
-R ./GCF_000001405.40_GRCh38.p14_genomic.fna \
-O ./GCF_000001405.40_GRCh38.p14_genomic.dict ")
system("samtools faidx ./GCF_000001405.40_GRCh38.p14_genomic.fna")
system("f=$(ls *.fasta)")
system("samtools faidx ${f}")
system("bwa index ${f}")
system("cd ../")
knitr::include_graphics("/Volumes/Extreme/vcfgatk/p2b.jpg")
run consecutively all fastq files by runing run_bwa_mem.sh file
# Read the contents of the .sh file
sh_contents <- suppressWarnings(readLines("/Volumes/Extreme/vcfgatk/fastq/run_bwa_mem.sh"))
# Display the file's contents
cat(sh_contents, sep = "\n")
## #!/bin/bash
## ref=$(ls ../refgenome/*.fasta)
## for i in $(ls *.fastq | rev | cut -c 13- | rev);
## do
## bwa mem -M -t 4 \
## -R “@RG\tID:${i}\tSM:${i}” \
## ${ref} \
## ${i}_chr21.fastq > \
## ../sam/${i}_chr21.sam 2> ../sam/${i}_chr21.log;
## done
The ouput files are:
knitr::include_graphics("/Volumes/Extreme/vcfgatk/p3.jpg")
# Read the contents of the .sh file
sh_contents <- suppressWarnings(readLines("/Volumes/Extreme/vcfgatk/sam_to_bam.sh"))
# Display the file's contents
cat(sh_contents, sep = "\n")
## #!/bin/bash
##
## # Loop through all SAM files in the 'sam' directory
## for sam_file in ./sam/*.sam; do
## # Extract the base filename without the directory and extension
## sample_name=$(basename "${sam_file}" .sam)
##
## # Print status message
## echo "Converting ${sample_name}.sam to BAM..."
##
## # Convert SAM to BAM one file at a time to save memory
## samtools view -S -b "${sam_file}" > "./bam/${sample_name}.bam"
##
## # Optional: Free up memory after each conversion (if available)
## sync && sudo purge
##
## # Print completion message
## echo "${sample_name}.sam has been successfully converted to BAM."
## done
##
## echo "All SAM files have been converted to BAM."
The output bam files are:
knitr::include_graphics("/Volumes/Extreme/vcfgatk/p4.jpg")
# Read the contents of the .sh file
sh_contents <- suppressWarnings(readLines("/Volumes/Extreme/vcfgatk/sorting_bam.sh"))
# Display the file's contents
cat(sh_contents, sep = "\n")
## #!/bin/bash
## #cd bam
## for i in $(ls *.bam);
## do
## samtools sort -T ../sortedbam/tmp.sort -o ../sortedbam/${i} ${i}
## samtools index ../sortedbam/${i}
## done
Identifying variants from whole genome requires large memory and storage space. Therefore, for demonstration and the sake of simplicity, I will focus only on chromosome 21 of human genome. The other chromosomes come from the reference genome.
system("mkdir chr21")
system("cd sortedbam")
system("for i in $(ls *.bam|rev|cut -c 5-|rev);
do
samtools view -b ${i}.bam chr21 > ../chr21/${i}.bam
samtools index ../chr21/${i}.bam
done ")
Install picard.jar into local directory then use it to mark dupplicates.
system("mkdir dedup")
system("cd chr21")
system("for i in $(ls *.bam|rev|cut -c 5-|rev);
do
java -Xmx7g -jar ./picard.jar MarkDuplicates \
-INPUT ${i}.bam \
-OUTPUT ../dedup/${i}.dedup.bam \
-METRICS_FILE ../dedup/${i}.dedup_metrics.txt
samtools index ../dedup/${i}.dedup.bam
done")
RGs are identified in the BAM file by a number of tags that are defined according to a specific format. We can use the following samtools command to display the RG tags on a BAM file:
samtools view -H filename.bam | grep “^@RG”
Using Picard and indexing the resulted BAM files with samtools.
# Read the contents of the .sh file
sh_contents <- suppressWarnings(readLines("/Volumes/Extreme/vcfgatk/adding_RG.sh"))
# Display the file's contents
cat(sh_contents, sep = "\n")
## #!/bin/bash
## for i in $(ls *.bam|rev|cut -c 5-|rev);
## do
## java -jar ../picard.jar AddOrReplaceReadGroups \
## I=${i}.bam \
## O=../RG/${i}.RG.bam \
## RGID=${i} \
## RGLB=lib RGPL=ILLUMINA \
## SORT_ORDER=coordinate \
## RGPU=bar1 RGSM=${i}
## samtools index ../RG/${i}.RG.bam
## done
For detecting systematic errors made by the sequencer. Building a BQSR model (Base Quality Score Recalitation) to minimize QC systemic errors, downloads a standard human SNPs and InDels VCF files for reference.
system("wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz")
system("wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi")
system("wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz")
system("wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi")
Istall gatk-4.6.0.0 into local directory. Build a model.
mkdir BQSR
cd RG
# Read the contents of the .sh file
sh_contents <- suppressWarnings(readLines("/Volumes/Extreme/vcfgatk/build_BQSR.sh"))
# Display the file's contents
cat(sh_contents, sep = "\n")
## #!/bin/bash
## ref=$(ls ../refgenome/*.fasta)
## for i in $(ls *.bam|rev|cut -c 5-|rev);
## do
## /Users/nnthieu/Downloads/gatk-4.6.0.0/gatk --java-options \
## -Xmx4g BaseRecalibrator \
## -I ${i}.bam \
## -R ${ref} \
## --known-sites ../refvcf/1000G_phase1.snps.high_confidence.hg38.vcf.gz \
## --known-sites ../refvcf/Homo_sapiens_assembly38.known_indels.vcf.gz \
## -O ../BQSR/${i}.table
## done
Applying a model
mkdir applyBQSR
cd RG
system("mkdir applyBQSR")
system("cd RG")"
system("ref=$(ls ../refgenome/*.fasta)
for i in $(ls *.bam|rev|cut -c 5-|rev);
do
/Users/nnthieu/Downloads/gatk-4.6.0.0/gatk \
--java-options \
-Xmx4g ApplyBQSR \
-I ${i}.bam \
-R ${ref} \
--bqsr-recal-file ../BQSR/${i}.table \
-O ../applyBQSR/${i}.bqr.bam
done ")"
The output files are:
knitr::include_graphics("/Volumes/Extreme/vcfgatk/p4b.jpg")
The bam files are now ready for variant calling.
Per-sample variant calling using the “HaplotypeCaller” GATK4 function
mkdir gvcf
cd applyBQSR
system("ref=$(ls ../refgenome/*.fasta)
for i in $(ls *.bam|rev|cut -c 5-|rev);
do
/Users/nnthieu/Downloads/gatk-4.6.0.0/gatk \
--java-options \
-Xmx10g HaplotypeCaller \
-I ${i}.bam \
-R ${ref} \
-L chr21 \
-ERC GVCF \
-O ../gvcf/${i}.g.vcf.gz
done")
The output files are:
knitr::include_graphics("/Volumes/Extreme/vcfgatk/p5.jpg")
Making a cohort sample map file. The cohort sample map file is a plain text file that contains two tab-separated columns; the first column is for the sample IDs and the second column is for the names of the gVCF files.
system("cd gvcf")
#a- make file name and absolute path
system("system("find “$PWD”/*_chr21.dedup.RG.bqsr.g.vcf.gz -type f -printf ‘%f%h/%f\n’ > ../tmp.txt ")
#b- remove _1/2.fastq
system("awk ‘{ gsub(/_chr21.dedup.RG.bqsr.g.vcf.gz/,”,”, $1); print } ‘../tmp.txt >../tmp2.txt ")
system("rm ../tmp.txt")
#remove space
system("cat ../tmp2.txt | sed -r ‘s/\s+//g’ > ../tmp3.txt")
system("rm ../tmp2.txt")
#replace comma with tab
system("sed -e ‘s/\,\+/\t/g’ ../tmp3.txt > ../cohort.sample_map")
system("rm ../tmp3.txt")
Once the cohort sample map file is created, I can run GenomicsDBImport tool to import gVCF sample files and GenotypeGVCFs tool to consolidate the variants of 13 samples in a single VCF file.
system("ref=$(ls ../refgenome/*.fasta)
gatk --java-options -Xmx10g \
GenomicsDBImport \
-R ${ref} \
--genomicsdb-workspace-path ../gvcf21db \
--batch-size 50 \
--sample-name-map ../cohort.sample_map \
-L chr21 \
--tmp-dir ../tmp \
--reader-threads 4 ")
and
system("mkdir vcf")
system("ref=$(ls refgenome/*.fasta)
/Users/nnthieu/Downloads/gatk-4.6.0.0/gatk \
--java-options -Xmx10g \
GenotypeGVCFs \
-R ${ref} \
-V gendb://gvcf21db \
-O vcf/allsamples_chr21.vcf ")
Separating the identified SNPs and InDels for analyzing.
#SNPS
system("cd ./vcf")
system("/Users/nnthieu/Downloads/gatk-4.6.0.0/gatk \
--java-options \
-Xmx10g SelectVariants \
-V vcf/allsamples_chr21.vcf \
-select-type SNP \
-O vcf/allsamplesSNP_chr21.vcf")
#INDEL
system("cd ./vcf
/Users/nnthieu/Downloads/gatk-4.6.0.0/gatk \
--java-options \
-Xmx4g SelectVariants \
-V vcf/allsamples_chr21.vcf \
-select-type INDEL \
-O vcf/allsamplesIndels_chr21.vcf")
The output files are:
knitr::include_graphics("/Volumes/Extreme/vcfgatk/p6.jpg")
system("cd refvcf")
system("wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf")
system("wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx")
system("cd ..")
system("mkdir VQSR")
system("cd vcf")
system("/Users/nnthieu/Downloads/gatk-4.6.0.0/gatk --java-options -Xmx10g VariantRecalibrator \
-R ../refgenome/Homo_sapiens_assembly38.fasta \
-V ./allsamplesSNP_chr21.vcf \
--trust-all-polymorphic \
-tranche 100.0 \
-tranche 99.95 \
-tranche 99.90 \
-tranche 99.85 \
-tranche 99.80 \
-tranche 99.00 \
-tranche 98.00 \
-tranche 97.00 \
-tranche 90.00 \
--max-gaussians 6 \
--resource:1000G,known=false,training=true,truth=true,prior=10.0 \
../refvcf/1000G_phase1.snps.high_confidence.hg38.vcf.gz \
--resource:dbsnp,known=false,training=true,truth=false,prior=2.0 \
../refvcf/Homo_sapiens_assembly38.dbsnp138.vcf \
-an QD -an MQ -an MQRankSum \
-an ReadPosRankSum -an FS -an SOR \
-mode SNP \
-O ../VQSR/tranches.out \
--tranches-file ../VQSR/vqsrplots.R ")
The output files are:
knitr::include_graphics("/Volumes/Extreme/vcfgatk/p7.jpg")
Applying the model on the variants using ApplyVQSR by tranche sensitivity thresholds to filter variants and adding PASS to FILTER field of the variants that have VQSLOD above the threshold while the variants with VQSLOD below the threshold are labeled with tranche name
system("mkdir filteredVCF")
system("cd ./vcf")
system("/Users/nnthieu/Downloads/gatk-4.6.0.0/gatk \
--java-options -Xmx4G ApplyVQSR \
-V ./allsamplesSNP_chr21.vcf \
-O ../filteredVCF/humanSNP.vcf \
--truth-sensitivity-filter-level 99.7 \
--tranches-file ../VQSR/vqsrplots.R \
--recal-file ../VQSR/tranches.out \
-mode SNP \
--create-output-variant-index true ")
system("cd vcf")
system("/Users/nnthieu/Downloads/gatk-4.6.0.0/gatk \
--java-options \
-Xmx10G VariantFiltration \
-V ./allsamplesSNP_chr21.vcf \
-filter "QD<2.0" --filter-name "QD2" \
-filter "QUAL<30.0" --filter-name "QUAL30" \
-filter "SOR>3.0" --filter-name "SOR3" \
-filter "FS>60.0" --filter-name "FS60" \
-filter "MQ<40.0" --filter-name "MQ40" \
-filter "MQRankSum<-12.5" --filter-name "MQRankSum-12.5" \
-filter "ReadPostRankSum<-8.0" --filter-name "ReadPostRankSum-8" \
-O ../filteredVCF/hardfilteredVCF.vcf")
system("cd vcf")
system("/Users/nnthieu/Downloads/gatk-4.6.0.0/gatk \
--java-options -Xmx10G VariantFiltration \
-V ./allsamplesIndels_chr21.vcf \
-filter "QD<2.0" --filter-name "QD2" \
-filter "QUAL<30.0" --filter-name "QUAL30" \
-filter "FS>200.0" --filter-name "FS200" \
-filter "ReadPosRankSum<-20.0" --filter-name "ReadPosRankSum-20" \
-O ../filteredVCF/hardfilteredIndels.vcf ")
Use IGV to visualize variants. Chart shows the allele fractions and genotypes for each of the InDels and SNPs of the samples. The dark blue color indicates heterozygous genotype, cyan indicates homozygous genotype, and gray indicates the same genotype as the reference.
knitr::include_graphics("/Volumes/Extreme/vcfgatk/p8.jpg")
Variant annotation involves adding information and knowledge to high-confidence variants in an effort to enhance assessment of variants that are likely to impact functions. Prioritizing the variants with potential associations to phenotypes of interest is usually the key target of the variant annotation.
Variant annotation must be conducted after filtering variants as dis- cussed above to avoid misinterpretation, false positive, and false negative. Generally, we can define variant annotation as the process of assigning functional or phenotype infor- mation to genetic variants such as SNPs, InDels, or copy number variants.
SIFT relies on the assumption that substitutions in conserved regions are more likely to be deleterious if the missense SNV resulted in a nonsynonymous codon that is translated into an amino acid with different physicochemical properties.
I need a reference annotated genome that cab be downloaded by:
system("mkdir sift4g")
system("cd ./sift4g")
system("wget https://sift.bii.a-star.edu.sg/sift4g/public/Homo_sapiens/
GRCh38.78.zip")
sysytem("unzip GRCh38.78.zip")
Download SIFT 4G Annotator Java executable file (.jar) in a directory or in your working directory:
system("wget https://github.com/paulineng/SIFT4G_Annotator/raw/master/SIFT4G_Annotator.jar")
Create annotation xls and vcf file.
system("java -jar SIFT4G_Annotator.jar \
-c -i humanSNP.vcf \
-d GRCh38.78 \
-r output -t ")
oThe output xls file
knitr::include_graphics("/Volumes/Extreme/vcfgatk/p9.jpg")
Open SIFT interface
java -jar SIFT4G_Annotator.jar