Reference:

https://www.youtube.com/watch?v=XZ8scaScfjw

https://github.com/kpatel427/YouTubeTutorials/blob/main/somatic_variant_calling_get_reads_1.sh

1. Data download

Script to call germline variants in a human WGS paired end reads 2 X 100bp Following GATK4 best practices workflow - https://gatk.broadinstitute.org/hc/en-us/articles/360035535932-Germline-short-variant-discovery-SNPs-Indels-

directories:

ref=“/Users/nnthieu/vcfgatk/refgenome/hg38.fa”

known_sites=“/Users/nnthieu/vcfgatk/refvcf/Homo_sapiens_assembly38.dbsnp138.vcf”

aligned_reads=“/Users/nnthieu/vcfgatk/aligned_reads”

reads=“/Users/nnthieu/vcfgatk/reads”

results=“/Users/nnthieu/vcfgatk/results”

data=“/Users/nnthieu/vcfgatk/data”

Download fastq files

system("wget ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/phase3/data/HG00096/sequence_read/SRR062634_1.filt.fastq.gz")

system("wget ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/phase3/data/HG00096/sequence_read/SRR062634_2.filt.fastq.gz")

The output file are data/SRR062634_1.filt.fastq and data/SRR062634_2.filt.fastq

download reference files

Download reference hg38.fa.gz into ‘/regenome’

system("wget -P /Users/nnthieu/vcfgatk/refgenome https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz")

system("gunzip /Users/nnthieu/vcfgatk/refgenome/hg38.fa.gz")

# index ref - .fai file before running haplotype caller
system("samtools faidx /Users/nnthieu/vcfgatk/refgenome/hg38.fa")

ref dict - .dict file before running haplotype caller

system("gatk CreateSequenceDictionary R=/Users/nnthieu/vcfgatk/refgenome/hg38.fa O=/Users/nnthieu/vcfgatk/refgenome/hg38.dict")

download known-sites files for BQSR from GATK resource bundle

system("wget -P /Users/nnthieu/vcfgatk/refgenome/https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf")

system("wget -P /Users/nnthieu/vcfgatk/refgenome/https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx")

2. QC - Run fastqc

system("fastqc ${reads}/SRR062634_1.filt.fastq.gz -o ${reads}/")
system("fastqc ${reads}/SRR062634_2.filt.fastq.gz -o ${reads}/")

The output html files show the acceptable quality so I will not run trimming fastq files.

3. Pre-proccessing data

Map to reference using BWA-MEM

# BWA index reference 
system("bwa index ${ref}")

# BWA alignment
system("bwa mem -t 4 -R "@RG\tID:SRR062634\tPL:ILLUMINA\tSM:SRR062634" ${ref} ${reads}/SRR062634_1.filt.fastq.gz ${reads}/SRR062634_2.filt.fastq.gz > ${aligned_reads}/SRR062634.paired.sam ")

#check sam file quality
system("samtools quickcheck -v ../aligned_reads/SRR062634.paired.sam")

The output file is RR062634.paired.sam

Mark Duplicates and Sort - GATK4

system("samtools view -Sb ${aligned_reads}/SRR062634.paired.sam -o ${aligned_reads}/SRR062634.paired.bam")

system("gatk MarkDuplicatesSpark -I ${aligned_reads}/SRR062634.paired.bam -O ${aligned_reads}/SRR062634_sorted_dedup_reads.bam")

Base quality recalibration

# 1. build the model
system("gatk BaseRecalibrator -I ${aligned_reads}/SRR062634_sorted_dedup_reads.bam -R ${ref} --known-sites ${known_sites} -O ${data}/recal_data.table")

# 2. Apply the model to adjust the base quality scores

system("gatk ApplyBQSR -I ${aligned_reads}/SRR062634_sorted_dedup_reads.bam -R ${ref} --bqsr-recal-file ${data}/recal_data.table -O ${aligned_reads}/SRR062634_sorted_dedup_bqsr_reads.bam")

Collect Alignment & Insert Size Metrics

system("gatk CollectAlignmentSummaryMetrics R=${ref} I=${aligned_reads}/SRR062634_sorted_dedup_bqsr_reads.bam O=${aligned_reads}/alignment_metrics.txt")

system("gatk CollectInsertSizeMetrics INPUT=${aligned_reads}/SRR062634_sorted_dedup_bqsr_reads.bam OUTPUT=${aligned_reads}/insert_size_metrics.txt HISTOGRAM_FILE=${aligned_reads}/insert_size_histogram.pdf")

Show the insert_size_histogram.jpg

4. Call Variants - gatk haplotype caller

system("gatk HaplotypeCaller -R ${ref} -I ${aligned_reads}/SRR062634_sorted_dedup_bqsr_reads.bam -O ${results}/raw_variants.vcf
")

extract SNPs & INDELS

system("gatk SelectVariants -R ${ref} -V ${results}/raw_variants.vcf --select-type SNP -O ${results}/raw_snps.vcf")
## Warning in system("gatk SelectVariants -R ${ref} -V ${results}/raw_variants.vcf
## --select-type SNP -O ${results}/raw_snps.vcf"): error in running command
system("gatk SelectVariants -R ${ref} -V ${results}/raw_variants.vcf --select-type INDEL -O ${results}/raw_indels.vcf")
## Warning in system("gatk SelectVariants -R ${ref} -V ${results}/raw_variants.vcf
## --select-type INDEL -O ${results}/raw_indels.vcf"): error in running command

The output files are

/results/raw_snps.vcf

/results/raw_indels.vcf

Filter Variants - GATK4

Filter SNPs

system("
gatk VariantFiltration \
    -R ${ref} \
    -V ${results}/raw_snps.vcf \
    -O ${results}/filtered_snps.vcf \
    -filter-name "QD_filter" -filter "QD < 2.0" \
    -filter-name "FS_filter" -filter "FS > 60.0" \
    -filter-name "MQ_filter" -filter "MQ < 40.0" \
    -filter-name "SOR_filter" -filter "SOR > 4.0" \
    -filter-name "MQRankSum_filter" -filter "MQRankSum < -12.5" \
    -filter-name "ReadPosRankSum_filter" -filter "ReadPosRankSum < -8.0" \
    -genotype-filter-expression "DP < 10" \
    -genotype-filter-name "DP_filter" \
    -genotype-filter-expression "GQ < 10" \
    -genotype-filter-name "GQ_filter"
")

Filter INDELS

system("
gatk VariantFiltration \
    -R ${ref} \
    -V ${results}/raw_indels.vcf \
    -O ${results}/filtered_indels.vcf \
    -filter-name "QD_filter" -filter "QD < 2.0" \
    -filter-name "FS_filter" -filter "FS > 200.0" \
    -filter-name "SOR_filter" -filter "SOR > 10.0" \
    -genotype-filter-expression "DP < 10" \
    -genotype-filter-name "DP_filter" \
    -genotype-filter-expression "GQ < 10" \
    -genotype-filter-name "GQ_filter"       
")

Select Variants that PASS filters

system("
gatk SelectVariants \
    --exclude-filtered \
    -V ${results}/filtered_snps.vcf \
    -O ${results}/analysis-ready-snps.vcf    
       ")

system("
gatk SelectVariants \
    --exclude-filtered \
    -V ${results}/filtered_indels.vcf \
    -O ${results}/analysis-ready-indels.vcf       
")

Exclude variants that failed genotype filters

system("#cat ${results}/analysis-ready-snps.vcf|grep -v -E "DP_filter|GQ_filter" > ${results}/analysis-ready-snps-filteredGT.vcf ")

system("cat ${results}/analysis-ready-indels.vcf| grep -v -E "DP_filter|GQ_filter" > ${results}/analysis-ready-indels-filteredGT.vcf ")

The output files are

analysis-ready-snps-filteredGT.vcf

analysis-ready-indels-filteredGT.vcf

5. Variant annotation using GATK4 Funcotator

Download funcotator_dataSources.v1.7.20200521g.tar.gz from ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/funcotator/

for snps

 system("
gatk Funcotator \
    --variant ${results}/analysis-ready-snps-filteredGT.vcf \
    --reference ${ref} \
    --ref-version hg38 \
    --data-sources-path /Users/nnthieu/vcfgatk/funcotator/funcotator_dataSources.v1.7.20200521g \
    --output ${results}/analysis-ready-snps-filteredGT-functotated.vcf \
    --output-file-format VCF      
      ")

for indels

system("
gatk Funcotator \
    --variant ${results}/analysis-ready-indels-filteredGT.vcf \
    --reference ${ref} \
    --ref-version hg38 \
    --data-sources-path /Users/nnthieu/vcfgatk/funcotator/funcotator_dataSources.v1.7.20200521g \
    --output ${results}/analysis-ready-indels-filteredGT-functotated.vcf \
    --output-file-format VCF
       ")

Extract fields from a VCF file to a tab-delimited table

system("
gatk VariantsToTable \
    -V ${results}/analysis-ready-snps-filteredGT-functotated.vcf -F AC -F AN -F DP -F AF -F FUNCOTATION \
    -O ${results}/output_snps.table ")

system("gatk VariantsToTable \
    -V ${results}/analysis-ready-indels-filteredGT-functotated.vcf -F AC -F AN -F DP -F AF -F FUNCOTATION \
    -O ${results}/output_indels.table ")
       
       ")

creating variant calling tables gor gene ’NBPF1”

The data I want to see is that belong to gene ‘NBPF1’ so I filter for this gene:

system("
cat ${results}/analysis-ready-snps-filteredGT-functotated.vcf | grep " Funcotation fields are: " | sed 's/|/\t/g' > ${results}/output_curated_variants.txt ")

system("cat ${results}/output_snps.table | cut -f 5 | grep "NBPF1" | sed 's/|/\t/g' >> ${results}/output_curated_variants.txt
")

Show the table, swipt to see whole table width.

## Warning: `includeHTML()` was provided a `path` that appears to be a complete HTML document.
## ✖ Path: /tmp/curated_variants.html
## ℹ Use `tags$iframe()` to include an HTML document. You can either ensure `path` is accessible in your app or document (see e.g. `shiny::addResourcePath()`) and pass the relative path to the `src` argument. Or you can read the contents of `path` and pass the contents to `srcdoc`.
table output
Curated Variants
X..INFO..ID.FUNCOTATION.Number.A.Type.String.Description..Functional.annotation.from.the.Funcotator.tool...Funcotation.fields.are..Gencode_34_hugoSymbol Gencode_34_ncbiBuild Gencode_34_chromosome Gencode_34_start Gencode_34_end Gencode_34_variantClassification Gencode_34_secondaryVariantClassification Gencode_34_variantType Gencode_34_refAllele Gencode_34_tumorSeqAllele1 Gencode_34_tumorSeqAllele2 Gencode_34_genomeChange Gencode_34_annotationTranscript Gencode_34_transcriptStrand Gencode_34_transcriptExon Gencode_34_transcriptPos Gencode_34_cDnaChange Gencode_34_codonChange Gencode_34_proteinChange Gencode_34_gcContent Gencode_34_referenceContext Gencode_34_otherTranscripts ACMGLMMLof_LOF_Mechanism ACMGLMMLof_Mode_of_Inheritance ACMGLMMLof_Notes ACMG_recommendation_Disease_Name ClinVar_VCF_AF_ESP ClinVar_VCF_AF_EXAC ClinVar_VCF_AF_TGP ClinVar_VCF_ALLELEID ClinVar_VCF_CLNDISDB ClinVar_VCF_CLNDISDBINCL ClinVar_VCF_CLNDN ClinVar_VCF_CLNDNINCL ClinVar_VCF_CLNHGVS ClinVar_VCF_CLNREVSTAT ClinVar_VCF_CLNSIG ClinVar_VCF_CLNSIGCONF ClinVar_VCF_CLNSIGINCL ClinVar_VCF_CLNVC ClinVar_VCF_CLNVCSO ClinVar_VCF_CLNVI ClinVar_VCF_DBVARID ClinVar_VCF_GENEINFO ClinVar_VCF_MC ClinVar_VCF_ORIGIN ClinVar_VCF_RS ClinVar_VCF_SSR ClinVar_VCF_ID ClinVar_VCF_FILTER LMMKnown_LMM_FLAGGED LMMKnown_ID LMMKnown_FILTER..
[NBPF1 hg38 chr1 16581585 16581585 INTRON NA SNP G G A g.chr1:16581585G>A ENST00000430580.6 - NA NA c.e15+93C>T NA NA 0.4588529 ACATCTATGTGTGGTTCAGCA NBPF1_ENST00000432949.5_INTRON NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA FALSE NA ]
[NBPF1 hg38 chr1 16581620 16581620 INTRON NA SNP A A G g.chr1:16581620A>G ENST00000430580.6 - NA NA c.e15+128T>C NA NA 0.4613466 GTCATGTTTTATCTTTCACAA NBPF1_ENST00000432949.5_INTRON NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA FALSE NA ]
[NBPF1 hg38 chr1 16584690 16584690 INTRON NA SNP G G C g.chr1:16584690G>C ENST00000430580.6 - NA NA c.e12-799C>G NA NA 0.4588529 GTCACCCAGAGTGGAGTGCAA NBPF1_ENST00000432949.5_INTRON NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA FALSE NA ]
[NBPF1 hg38 chr1 16584711 16584711 INTRON NA SNP T T A g.chr1:16584711T>A ENST00000430580.6 - NA NA c.e12-778A>T NA NA 0.4663342 TGGCAAAATCTTGGCTCACTG NBPF1_ENST00000432949.5_INTRON NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA FALSE NA ]
[NBPF1 hg38 chr1 16609282 16609282 INTRON NA SNP A A G g.chr1:16609282A>G ENST00000430580.6 - NA NA c.e2+503T>C NA NA 0.3965087 TGCTTATTAAAGTAGCTTTGG NBPF1_ENST00000432949.5_INTRON NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA FALSE NA ]
[NBPF1 hg38 chr1 16609286 16609286 INTRON NA SNP G G T g.chr1:16609286G>T ENST00000430580.6 - NA NA c.e2+507C>A NA NA 0.3940150 TATTAAAGTAGCTTTGGAATC NBPF1_ENST00000432949.5_INTRON NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA FALSE NA ]
[NBPF1 hg38 chr1 16609833 16609833 INTRON NA SNP A A G g.chr1:16609833A>G ENST00000430580.6 - NA NA c.e2+1054T>C NA NA 0.4014963 TGTGATCCCAACACTTTGGGA NBPF1_ENST00000432949.5_INTRON NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA FALSE NA ]
[NBPF1 hg38 chr1 16612769 16612769 INTRON NA SNP C C A g.chr1:16612769C>A ENST00000430580.6 - NA NA c.e1-569G>T NA NA 0.5361596 ACCCCACAATCCAATCTACCC NBPF1_ENST00000432949.5_INTRON/AL137798.1_ENST00000607700.1_FIVE_PRIME_FLANK NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA FALSE NA ]
[NBPF1 hg38 chr1 16614430 16614430 FIVE_PRIME_FLANK NA SNP G G A g.chr1:16614430G>A ENST00000430580.6 - NA NA NA NA 0.4239401 TATCTCCTCCGTCACCACCCT NBPF1_ENST00000432949.5_FIVE_PRIME_FLANK/AL137798.1_ENST00000607700.1_FIVE_PRIME_FLANK NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA FALSE NA ]
[NBPF1 hg38 chr1 16615597 16615597 FIVE_PRIME_FLANK NA SNP T T A g.chr1:16615597T>A ENST00000430580.6 - NA NA NA NA 0.4413965 CAGAGCAAGATCTTGTCTCAA NBPF1_ENST00000432949.5_FIVE_PRIME_FLANK/AL137798.1_ENST00000607700.1_FIVE_PRIME_FLANK NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA FALSE NA ]

#———–