Reference:
https://www.youtube.com/watch?v=XZ8scaScfjw
https://github.com/kpatel427/YouTubeTutorials/blob/main/somatic_variant_calling_get_reads_1.sh
Script to call germline variants in a human WGS paired end reads 2 X 100bp Following GATK4 best practices workflow - https://gatk.broadinstitute.org/hc/en-us/articles/360035535932-Germline-short-variant-discovery-SNPs-Indels-
directories:
ref=“/Users/nnthieu/vcfgatk/refgenome/hg38.fa”
known_sites=“/Users/nnthieu/vcfgatk/refvcf/Homo_sapiens_assembly38.dbsnp138.vcf”
aligned_reads=“/Users/nnthieu/vcfgatk/aligned_reads”
reads=“/Users/nnthieu/vcfgatk/reads”
results=“/Users/nnthieu/vcfgatk/results”
data=“/Users/nnthieu/vcfgatk/data”
system("wget ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/phase3/data/HG00096/sequence_read/SRR062634_1.filt.fastq.gz")
system("wget ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/phase3/data/HG00096/sequence_read/SRR062634_2.filt.fastq.gz")
The output file are data/SRR062634_1.filt.fastq and data/SRR062634_2.filt.fastq
Download reference hg38.fa.gz into ‘/regenome’
system("wget -P /Users/nnthieu/vcfgatk/refgenome https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz")
system("gunzip /Users/nnthieu/vcfgatk/refgenome/hg38.fa.gz")
# index ref - .fai file before running haplotype caller
system("samtools faidx /Users/nnthieu/vcfgatk/refgenome/hg38.fa")
ref dict - .dict file before running haplotype caller
system("gatk CreateSequenceDictionary R=/Users/nnthieu/vcfgatk/refgenome/hg38.fa O=/Users/nnthieu/vcfgatk/refgenome/hg38.dict")
system("wget -P /Users/nnthieu/vcfgatk/refgenome/https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf")
system("wget -P /Users/nnthieu/vcfgatk/refgenome/https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx")
system("fastqc ${reads}/SRR062634_1.filt.fastq.gz -o ${reads}/")
system("fastqc ${reads}/SRR062634_2.filt.fastq.gz -o ${reads}/")
The output html files show the acceptable quality so I will not run trimming fastq files.
# BWA index reference
system("bwa index ${ref}")
# BWA alignment
system("bwa mem -t 4 -R "@RG\tID:SRR062634\tPL:ILLUMINA\tSM:SRR062634" ${ref} ${reads}/SRR062634_1.filt.fastq.gz ${reads}/SRR062634_2.filt.fastq.gz > ${aligned_reads}/SRR062634.paired.sam ")
#check sam file quality
system("samtools quickcheck -v ../aligned_reads/SRR062634.paired.sam")
The output file is RR062634.paired.sam
system("samtools view -Sb ${aligned_reads}/SRR062634.paired.sam -o ${aligned_reads}/SRR062634.paired.bam")
system("gatk MarkDuplicatesSpark -I ${aligned_reads}/SRR062634.paired.bam -O ${aligned_reads}/SRR062634_sorted_dedup_reads.bam")
# 1. build the model
system("gatk BaseRecalibrator -I ${aligned_reads}/SRR062634_sorted_dedup_reads.bam -R ${ref} --known-sites ${known_sites} -O ${data}/recal_data.table")
# 2. Apply the model to adjust the base quality scores
system("gatk ApplyBQSR -I ${aligned_reads}/SRR062634_sorted_dedup_reads.bam -R ${ref} --bqsr-recal-file ${data}/recal_data.table -O ${aligned_reads}/SRR062634_sorted_dedup_bqsr_reads.bam")
system("gatk CollectAlignmentSummaryMetrics R=${ref} I=${aligned_reads}/SRR062634_sorted_dedup_bqsr_reads.bam O=${aligned_reads}/alignment_metrics.txt")
system("gatk CollectInsertSizeMetrics INPUT=${aligned_reads}/SRR062634_sorted_dedup_bqsr_reads.bam OUTPUT=${aligned_reads}/insert_size_metrics.txt HISTOGRAM_FILE=${aligned_reads}/insert_size_histogram.pdf")
Show the insert_size_histogram.jpg
system("gatk HaplotypeCaller -R ${ref} -I ${aligned_reads}/SRR062634_sorted_dedup_bqsr_reads.bam -O ${results}/raw_variants.vcf
")
system("gatk SelectVariants -R ${ref} -V ${results}/raw_variants.vcf --select-type SNP -O ${results}/raw_snps.vcf")
## Warning in system("gatk SelectVariants -R ${ref} -V ${results}/raw_variants.vcf
## --select-type SNP -O ${results}/raw_snps.vcf"): error in running command
system("gatk SelectVariants -R ${ref} -V ${results}/raw_variants.vcf --select-type INDEL -O ${results}/raw_indels.vcf")
## Warning in system("gatk SelectVariants -R ${ref} -V ${results}/raw_variants.vcf
## --select-type INDEL -O ${results}/raw_indels.vcf"): error in running command
The output files are
/results/raw_snps.vcf
/results/raw_indels.vcf
Filter SNPs
system("
gatk VariantFiltration \
-R ${ref} \
-V ${results}/raw_snps.vcf \
-O ${results}/filtered_snps.vcf \
-filter-name "QD_filter" -filter "QD < 2.0" \
-filter-name "FS_filter" -filter "FS > 60.0" \
-filter-name "MQ_filter" -filter "MQ < 40.0" \
-filter-name "SOR_filter" -filter "SOR > 4.0" \
-filter-name "MQRankSum_filter" -filter "MQRankSum < -12.5" \
-filter-name "ReadPosRankSum_filter" -filter "ReadPosRankSum < -8.0" \
-genotype-filter-expression "DP < 10" \
-genotype-filter-name "DP_filter" \
-genotype-filter-expression "GQ < 10" \
-genotype-filter-name "GQ_filter"
")
Filter INDELS
system("
gatk VariantFiltration \
-R ${ref} \
-V ${results}/raw_indels.vcf \
-O ${results}/filtered_indels.vcf \
-filter-name "QD_filter" -filter "QD < 2.0" \
-filter-name "FS_filter" -filter "FS > 200.0" \
-filter-name "SOR_filter" -filter "SOR > 10.0" \
-genotype-filter-expression "DP < 10" \
-genotype-filter-name "DP_filter" \
-genotype-filter-expression "GQ < 10" \
-genotype-filter-name "GQ_filter"
")
system("
gatk SelectVariants \
--exclude-filtered \
-V ${results}/filtered_snps.vcf \
-O ${results}/analysis-ready-snps.vcf
")
system("
gatk SelectVariants \
--exclude-filtered \
-V ${results}/filtered_indels.vcf \
-O ${results}/analysis-ready-indels.vcf
")
system("#cat ${results}/analysis-ready-snps.vcf|grep -v -E "DP_filter|GQ_filter" > ${results}/analysis-ready-snps-filteredGT.vcf ")
system("cat ${results}/analysis-ready-indels.vcf| grep -v -E "DP_filter|GQ_filter" > ${results}/analysis-ready-indels-filteredGT.vcf ")
The output files are
analysis-ready-snps-filteredGT.vcf
analysis-ready-indels-filteredGT.vcf
Download funcotator_dataSources.v1.7.20200521g.tar.gz from ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/funcotator/
for snps
system("
gatk Funcotator \
--variant ${results}/analysis-ready-snps-filteredGT.vcf \
--reference ${ref} \
--ref-version hg38 \
--data-sources-path /Users/nnthieu/vcfgatk/funcotator/funcotator_dataSources.v1.7.20200521g \
--output ${results}/analysis-ready-snps-filteredGT-functotated.vcf \
--output-file-format VCF
")
for indels
system("
gatk Funcotator \
--variant ${results}/analysis-ready-indels-filteredGT.vcf \
--reference ${ref} \
--ref-version hg38 \
--data-sources-path /Users/nnthieu/vcfgatk/funcotator/funcotator_dataSources.v1.7.20200521g \
--output ${results}/analysis-ready-indels-filteredGT-functotated.vcf \
--output-file-format VCF
")
system("
gatk VariantsToTable \
-V ${results}/analysis-ready-snps-filteredGT-functotated.vcf -F AC -F AN -F DP -F AF -F FUNCOTATION \
-O ${results}/output_snps.table ")
system("gatk VariantsToTable \
-V ${results}/analysis-ready-indels-filteredGT-functotated.vcf -F AC -F AN -F DP -F AF -F FUNCOTATION \
-O ${results}/output_indels.table ")
")
The data I want to see is that belong to gene ‘NBPF1’ so I filter for this gene:
system("
cat ${results}/analysis-ready-snps-filteredGT-functotated.vcf | grep " Funcotation fields are: " | sed 's/|/\t/g' > ${results}/output_curated_variants.txt ")
system("cat ${results}/output_snps.table | cut -f 5 | grep "NBPF1" | sed 's/|/\t/g' >> ${results}/output_curated_variants.txt
")
Show the table, swipt to see whole table width.
## Warning: `includeHTML()` was provided a `path` that appears to be a complete HTML document.
## ✖ Path: /tmp/curated_variants.html
## ℹ Use `tags$iframe()` to include an HTML document. You can either ensure `path` is accessible in your app or document (see e.g. `shiny::addResourcePath()`) and pass the relative path to the `src` argument. Or you can read the contents of `path` and pass the contents to `srcdoc`.
| X..INFO..ID.FUNCOTATION.Number.A.Type.String.Description..Functional.annotation.from.the.Funcotator.tool...Funcotation.fields.are..Gencode_34_hugoSymbol | Gencode_34_ncbiBuild | Gencode_34_chromosome | Gencode_34_start | Gencode_34_end | Gencode_34_variantClassification | Gencode_34_secondaryVariantClassification | Gencode_34_variantType | Gencode_34_refAllele | Gencode_34_tumorSeqAllele1 | Gencode_34_tumorSeqAllele2 | Gencode_34_genomeChange | Gencode_34_annotationTranscript | Gencode_34_transcriptStrand | Gencode_34_transcriptExon | Gencode_34_transcriptPos | Gencode_34_cDnaChange | Gencode_34_codonChange | Gencode_34_proteinChange | Gencode_34_gcContent | Gencode_34_referenceContext | Gencode_34_otherTranscripts | ACMGLMMLof_LOF_Mechanism | ACMGLMMLof_Mode_of_Inheritance | ACMGLMMLof_Notes | ACMG_recommendation_Disease_Name | ClinVar_VCF_AF_ESP | ClinVar_VCF_AF_EXAC | ClinVar_VCF_AF_TGP | ClinVar_VCF_ALLELEID | ClinVar_VCF_CLNDISDB | ClinVar_VCF_CLNDISDBINCL | ClinVar_VCF_CLNDN | ClinVar_VCF_CLNDNINCL | ClinVar_VCF_CLNHGVS | ClinVar_VCF_CLNREVSTAT | ClinVar_VCF_CLNSIG | ClinVar_VCF_CLNSIGCONF | ClinVar_VCF_CLNSIGINCL | ClinVar_VCF_CLNVC | ClinVar_VCF_CLNVCSO | ClinVar_VCF_CLNVI | ClinVar_VCF_DBVARID | ClinVar_VCF_GENEINFO | ClinVar_VCF_MC | ClinVar_VCF_ORIGIN | ClinVar_VCF_RS | ClinVar_VCF_SSR | ClinVar_VCF_ID | ClinVar_VCF_FILTER | LMMKnown_LMM_FLAGGED | LMMKnown_ID | LMMKnown_FILTER.. |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| [NBPF1 | hg38 | chr1 | 16581585 | 16581585 | INTRON | NA | SNP | G | G | A | g.chr1:16581585G>A | ENST00000430580.6 | - | NA | NA | c.e15+93C>T | NA | NA | 0.4588529 | ACATCTATGTGTGGTTCAGCA | NBPF1_ENST00000432949.5_INTRON | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | FALSE | NA | ] |
| [NBPF1 | hg38 | chr1 | 16581620 | 16581620 | INTRON | NA | SNP | A | A | G | g.chr1:16581620A>G | ENST00000430580.6 | - | NA | NA | c.e15+128T>C | NA | NA | 0.4613466 | GTCATGTTTTATCTTTCACAA | NBPF1_ENST00000432949.5_INTRON | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | FALSE | NA | ] |
| [NBPF1 | hg38 | chr1 | 16584690 | 16584690 | INTRON | NA | SNP | G | G | C | g.chr1:16584690G>C | ENST00000430580.6 | - | NA | NA | c.e12-799C>G | NA | NA | 0.4588529 | GTCACCCAGAGTGGAGTGCAA | NBPF1_ENST00000432949.5_INTRON | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | FALSE | NA | ] |
| [NBPF1 | hg38 | chr1 | 16584711 | 16584711 | INTRON | NA | SNP | T | T | A | g.chr1:16584711T>A | ENST00000430580.6 | - | NA | NA | c.e12-778A>T | NA | NA | 0.4663342 | TGGCAAAATCTTGGCTCACTG | NBPF1_ENST00000432949.5_INTRON | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | FALSE | NA | ] |
| [NBPF1 | hg38 | chr1 | 16609282 | 16609282 | INTRON | NA | SNP | A | A | G | g.chr1:16609282A>G | ENST00000430580.6 | - | NA | NA | c.e2+503T>C | NA | NA | 0.3965087 | TGCTTATTAAAGTAGCTTTGG | NBPF1_ENST00000432949.5_INTRON | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | FALSE | NA | ] |
| [NBPF1 | hg38 | chr1 | 16609286 | 16609286 | INTRON | NA | SNP | G | G | T | g.chr1:16609286G>T | ENST00000430580.6 | - | NA | NA | c.e2+507C>A | NA | NA | 0.3940150 | TATTAAAGTAGCTTTGGAATC | NBPF1_ENST00000432949.5_INTRON | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | FALSE | NA | ] |
| [NBPF1 | hg38 | chr1 | 16609833 | 16609833 | INTRON | NA | SNP | A | A | G | g.chr1:16609833A>G | ENST00000430580.6 | - | NA | NA | c.e2+1054T>C | NA | NA | 0.4014963 | TGTGATCCCAACACTTTGGGA | NBPF1_ENST00000432949.5_INTRON | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | FALSE | NA | ] |
| [NBPF1 | hg38 | chr1 | 16612769 | 16612769 | INTRON | NA | SNP | C | C | A | g.chr1:16612769C>A | ENST00000430580.6 | - | NA | NA | c.e1-569G>T | NA | NA | 0.5361596 | ACCCCACAATCCAATCTACCC | NBPF1_ENST00000432949.5_INTRON/AL137798.1_ENST00000607700.1_FIVE_PRIME_FLANK | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | FALSE | NA | ] |
| [NBPF1 | hg38 | chr1 | 16614430 | 16614430 | FIVE_PRIME_FLANK | NA | SNP | G | G | A | g.chr1:16614430G>A | ENST00000430580.6 | - | NA | NA | NA | NA | 0.4239401 | TATCTCCTCCGTCACCACCCT | NBPF1_ENST00000432949.5_FIVE_PRIME_FLANK/AL137798.1_ENST00000607700.1_FIVE_PRIME_FLANK | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | FALSE | NA | ] | |
| [NBPF1 | hg38 | chr1 | 16615597 | 16615597 | FIVE_PRIME_FLANK | NA | SNP | T | T | A | g.chr1:16615597T>A | ENST00000430580.6 | - | NA | NA | NA | NA | 0.4413965 | CAGAGCAAGATCTTGTCTCAA | NBPF1_ENST00000432949.5_FIVE_PRIME_FLANK/AL137798.1_ENST00000607700.1_FIVE_PRIME_FLANK | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | FALSE | NA | ] |
#———–