RでVCFを処理する. VCFを扱うためのパッケージとしてVariantAnnotationがあるが, ここでは汎用性の面を考えてdplyr, tidyr等を使ってVCFを編集する.
VCFに記述されている各種統計値を用いてフィルターの条件を考える. 最終的には, IGV等で確認.
変異アノテーションはゲノムデータが存在しないような非モデル生物を想定している.
VCFファイル読み込み
readr::read_delimで *.vcf.gzをそのまま読み込める.
comment = "##"でヘッダーを読み込まない
- 1列目の列名は
#CHROMになっているので変更する.
- サンプルデータはRNA-seqでの変異検出のデータ
- transdecoderで予測されたCDSに対してマッピングしている(対応するアミノ酸配列がある)
- 6サンプルに対して変異コールしている.
bcftools callは-mオプション(デフォルト)で実行
filtered vcf
|
CHROM
|
POS
|
ID
|
REF
|
ALT
|
QUAL
|
FILTER
|
INFO
|
FORMAT
|
m1
|
m2
|
m3
|
m4
|
m5
|
p
|
|
TRINITY_DN26150_c0_g1_i1.p1
|
1323
|
.
|
G
|
A
|
999
|
PASS
|
DP=620;VDB=1.27605e-09;SGB=-221.314;RPB=0.922765;MQB=1;BQB=0.163893;MQ0F=0;AF1=0.5;G3=2.2024e-19,1,9.0548e-29;HWE=0.00392592;AC1=6;DP4=0,383,0,237;MQ=60;FQ=999;PV4=1,1,1,1
|
GT:PL
|
0/1:159,0,157
|
0/1:122,0,152
|
0/1:124,0,154
|
0/1:148,0,165
|
0/1:84,0,131
|
0/1:138,0,163
|
|
TRINITY_DN18902_c0_g1_i2.p1
|
113
|
.
|
T
|
C
|
999
|
PASS
|
DP=82;VDB=7.63329e-28;SGB=33.285;MQ0F=0;AF1=1;AC1=12;DP4=0,0,0,82;MQ=60;FQ=-33.6828
|
GT:PL
|
1/1:168,33,0
|
1/1:187,84,0
|
1/1:187,90,0
|
1/1:171,39,0
|
1/1:0,0,0
|
1/1:0,0,0
|
|
TRINITY_DN24281_c0_g2_i1.p1
|
54
|
.
|
G
|
C
|
999
|
EndDistBias
|
DP=344;VDB=3.60596e-11;SGB=-9.52537;RPB=4.64287e-27;MQB=5.0105e-27;BQB=0.419832;MQ0F=0.00872093;AF1=1;AC1=12;DP4=0,50,0,294;MQ=55;FQ=-53.391;PV4=1,0.0378862,1,1.77239e-06
|
GT:PL
|
1/1:174,16,0
|
1/1:193,91,0
|
1/1:194,121,0
|
1/1:200,132,0
|
1/1:187,59,0
|
1/1:187,81,0
|
FILTER
- PASS:
- MinDP
- MinAB
- MinMQ
- Qual
- SnpGap
- EndDistBias
- GapWin
- StrandBias
QUAL

INFO
INDELとSNVを分ける
bcftools mpileup --skip-indelsでindelをskipできる.
snps
|
CHROM
|
POS
|
ID
|
REF
|
ALT
|
QUAL
|
FILTER
|
INFO
|
FORMAT
|
m1
|
m2
|
m3
|
m4
|
m5
|
p
|
|
TRINITY_DN9398_c0_g2_i1.p1
|
533
|
.
|
C
|
G
|
999
|
PASS
|
DP=73;VDB=0.99882;SGB=-9.68064;RPB=0.427108;MQB=1;BQB=0.454231;MQ0F=0;AF1=0.648935;AC1=8;DP4=0,22,0,51;MQ=60;FQ=999;PV4=1,0.267188,1,0.0222946
|
GT:PL
|
0/1:104,0,139
|
0/1:142,0,61
|
0/1:143,0,28
|
1/1:164,33,0
|
0/1:124,0,99
|
1/1:47,6,0
|
|
TRINITY_DN26453_c0_g2_i3.p1
|
77
|
.
|
T
|
A
|
999
|
PASS
|
DP=224;VDB=5.69349e-05;SGB=-83.4022;RPB=0.632444;MQB=1;BQB=0.228378;MQ0F=0;AF1=0.499992;G3=1.83863e-14,1,2.93887e-37;HWE=0.00392635;AC1=6;DP4=0,140,0,84;MQ=60;FQ=999;PV4=1,1,1,1
|
GT:PL
|
0/1:149,0,154
|
0/1:154,0,146
|
0/1:157,0,140
|
0/1:127,0,154
|
0/1:37,0,113
|
0/1:101,0,150
|
|
TRINITY_DN20483_c0_g1_i1.p1
|
45
|
.
|
A
|
T
|
999
|
PASS
|
DP=1692;VDB=2.0676e-40;SGB=-103.465;RPB=0.0109055;MQB=1;MQSB=1;BQB=0.720885;MQ0F=0;AF1=0.666667;AC1=8;DP4=19,496,20,1023;MQ=60;FQ=999;PV4=0.0393977,0.10993,1,1
|
GT:PL
|
0/1:255,0,199
|
0/1:229,0,233
|
1/1:255,255,0
|
1/1:255,255,0
|
0/1:255,0,255
|
0/1:255,0,255
|
indels
|
CHROM
|
POS
|
ID
|
REF
|
ALT
|
QUAL
|
FILTER
|
INFO
|
FORMAT
|
m1
|
m2
|
m3
|
m4
|
m5
|
p
|
|
TRINITY_DN22332_c0_g2_i1.p1
|
125
|
.
|
TGCAG
|
TG
|
87.4902
|
PASS
|
INDEL;IDV=2;IMF=0.153846;DP=55;VDB=0.0682743;SGB=-6.97471;MQ0F=0;AF1=0.253417;AC1=3;DP4=0,51,0,4;MQ=59;FQ=88.7612;PV4=1,0.0395552,1,1
|
GT:PL
|
0/0:0,12,184
|
0/0:0,30,255
|
0/0:0,36,255
|
0/1:33,0,248
|
0/1:39,0,216
|
0/1:72,0,255
|
|
TRINITY_DN18026_c0_g1_i1.p1
|
1484
|
.
|
CAA
|
CAAA
|
999.0000
|
PASS
|
INDEL;IDV=75;IMF=0.892857;DP=419;VDB=0.895445;SGB=-4.15888;MQ0F=0;AF1=1;AC1=12;DP4=0,35,0,384;MQ=60;FQ=-58.8853;PV4=1,1,1,1
|
GT:PL
|
1/1:139,33,0
|
1/1:148,71,0
|
1/1:140,62,0
|
1/1:176,114,0
|
1/1:127,14,0
|
1/1:160,73,0
|
|
TRINITY_DN39391_c0_g1_i1.p1
|
505
|
.
|
GAAAAAAAA
|
GAAAAAAA
|
52.5467
|
PASS
|
INDEL;IDV=8;IMF=0.888889;DP=20;VDB=5.58693e-06;SGB=0.0952813;MQ0F=0;AF1=0.824379;AC1=10;DP4=0,4,0,16;MQ=60;FQ=-29.3019;PV4=1,1,1,1
|
GT:PL
|
1/1:22,6,0
|
1/1:39,14,0
|
1/1:18,0,0
|
1/1:22,6,0
|
0/1:0,3,12
|
0/1:6,0,6
|
INFO列を分割し,列持ちデータに変換
- 以下の順にINFO列を処理. 値がない場所にはNAが入る.
- INFO列を分割,サイトごとにユニークなIDを付与
- 行持ちデータに変換. 対応するCHROM列を追加
- INFO値のそれぞれをkeyとvalueに分割
- data.frameに変換
- tidyr::spreadで列持ちデータに変換
# INFO値を行持ちデータに置き換えた後,列持ちデータに変換
info.dat <- strsplit(snp$INFO, ";") %>% # INFO列分割
setNames(., 1:nrow(snp)) %>% # リストにユニークIDを付与
stack() %>% # 行持ちデータに変換
mutate(id = snp$CHROM[.$ind]) %>% # 対応するCHROM列を追加
{cbind(.$ind, .$id, do.call(rbind, strsplit(.$values, "=")))} %>% # keyとvalueに分割
data.frame(., stringsAsFactors = F) %>% # data.frameに変換
setNames(., c("ind","CHROM", "k", "v")) %>%
tidyr::spread(., "k", "v") %>% # 列持ちデータに変換
arrange(as.integer(ind))
INFO
|
ind
|
CHROM
|
AC1
|
AF1
|
BQB
|
DP
|
DP4
|
FQ
|
G3
|
HWE
|
MQ
|
MQ0F
|
MQB
|
MQSB
|
PV4
|
RPB
|
SGB
|
VDB
|
|
1
|
TRINITY_DN9398_c0_g2_i1.p1
|
8
|
0.648935
|
0.454231
|
73
|
0,22,0,51
|
999
|
NA
|
NA
|
60
|
0
|
1
|
NA
|
1,0.267188,1,0.0222946
|
0.427108
|
-9.68064
|
0.99882
|
|
2
|
TRINITY_DN26453_c0_g2_i3.p1
|
6
|
0.499992
|
0.228378
|
224
|
0,140,0,84
|
999
|
1.83863e-14,1,2.93887e-37
|
0.00392635
|
60
|
0
|
1
|
NA
|
1,1,1,1
|
0.632444
|
-83.4022
|
5.69349e-05
|
|
3
|
TRINITY_DN20483_c0_g1_i1.p1
|
8
|
0.666667
|
0.720885
|
1692
|
19,496,20,1023
|
999
|
NA
|
NA
|
60
|
0
|
1
|
1
|
0.0393977,0.10993,1,1
|
0.0109055
|
-103.465
|
2.0676e-40
|
INFO列の各値を眺めてみる.
- DP: Depth, DP=13なら13回読まれている
# 各値のhist
par(mfrow = c(3,3))
hist(as.numeric(info.dat$AF1), main = "AF1");
hist(as.numeric(info.dat$FQ), main = "FQ");
hist(as.numeric(info.dat$DP), main = "DP");
hist(as.numeric(info.dat$HWE), main = "HWE");
hist(as.numeric(info.dat$MQ), main = "MQ");
hist(as.numeric(info.dat$MQSB), main = "MQSB")
hist(as.numeric(info.dat$RPB), main = "RPB")
hist(as.numeric(info.dat$SGB), main = "SGB")
hist(as.numeric(info.dat$VDB), main = "VDB")

DP4を分割
DP4=0,22,0,51の値はそれぞれREFのread depth(forward), REFのread depth(reverse), ALTのread depth(forward), ALTのread depth(reverse)を表している.
|
ind
|
CHROM
|
rf
|
rr
|
af
|
ar
|
|
1
|
TRINITY_DN9398_c0_g2_i1.p1
|
0
|
22
|
0
|
51
|
|
2
|
TRINITY_DN26453_c0_g2_i3.p1
|
0
|
140
|
0
|
84
|
|
3
|
TRINITY_DN20483_c0_g1_i1.p1
|
19
|
496
|
20
|
1023
|
|
4
|
TRINITY_DN27484_c0_g1_i7.p1
|
0
|
10
|
0
|
24
|
|
5
|
TRINITY_DN26150_c0_g1_i1.p1
|
0
|
383
|
0
|
237
|
|
6
|
TRINITY_DN18902_c0_g1_i2.p1
|
0
|
0
|
0
|
82
|
GT(Genotype)とPL(Phred-scaled genotype likelihoods)
- GT: 0/0(REFのホモ), 0/1(REFとALTのヘテロ), 1/1(ALTのホモ)
- PL: 191,0,255のようにカンマ区切りの値は, 各GTのもっともらしさ. 低いほど信頼度高
- REF=C, ALT=Gの場合
- GTの値が0/0(CC), 0/1(CG), 1/1(GG)に対応する.
- PLの値(104,0,139)はそれぞれCC,CG,GGに対応しており,CG(0/1)が採用.
- REF=T, ALT=C,G (ALTの候補が2つある)の場合
- PL(255,255,0,255,255,255)はそれぞれ, TT,TC,CC,TG,CG,GGに対応しているこの場合GTはCC(1/1)が採用
# サンプル別の列ラベル作成
smp <- names(snp)[10:ncol(snp)]
gt.smp <- paste("GT", smp, sep = ".")
pl.smp <- paste("PL", smp, sep = ".")
# GT,PL
gtpl <- lapply(snp[10:ncol(snp)], function(x){
data.frame(do.call(rbind, strsplit(x, ":")))
}) %>%
{
cbind(do.call(cbind, lapply(., "[", 1)), do.call(cbind, lapply(., "[", 2)))
} %>%
setNames(., c(gt.smp, pl.smp)) %>%
bind_cols(snp[1:5], .)
# 複数候補があるサイトを取り出す
mgt <- gtpl[sapply(strsplit(gtpl$ALT, ","), length) > 1,]
GT & PL
|
CHROM
|
POS
|
ID
|
REF
|
ALT
|
GT.m1
|
GT.m2
|
GT.m3
|
GT.m4
|
GT.m5
|
GT.p
|
PL.m1
|
PL.m2
|
PL.m3
|
PL.m4
|
PL.m5
|
PL.p
|
|
TRINITY_DN9398_c0_g2_i1.p1
|
533
|
.
|
C
|
G
|
0/1
|
0/1
|
0/1
|
1/1
|
0/1
|
1/1
|
104,0,139
|
142,0,61
|
143,0,28
|
164,33,0
|
124,0,99
|
47,6,0
|
|
TRINITY_DN26453_c0_g2_i3.p1
|
77
|
.
|
T
|
A
|
0/1
|
0/1
|
0/1
|
0/1
|
0/1
|
0/1
|
149,0,154
|
154,0,146
|
157,0,140
|
127,0,154
|
37,0,113
|
101,0,150
|
|
TRINITY_DN20483_c0_g1_i1.p1
|
45
|
.
|
A
|
T
|
0/1
|
0/1
|
1/1
|
1/1
|
0/1
|
0/1
|
255,0,199
|
229,0,233
|
255,255,0
|
255,255,0
|
255,0,255
|
255,0,255
|
multiple
|
CHROM
|
POS
|
ID
|
REF
|
ALT
|
GT.m1
|
GT.m2
|
GT.m3
|
GT.m4
|
GT.m5
|
GT.p
|
PL.m1
|
PL.m2
|
PL.m3
|
PL.m4
|
PL.m5
|
PL.p
|
|
TRINITY_DN12760_c0_g1_i1.p2
|
480
|
.
|
T
|
C,G
|
1/1
|
1/1
|
1/1
|
1/1
|
1/1
|
1/1
|
255,255,0,255,255,255
|
255,255,0,255,255,255
|
255,255,0,255,255,255
|
245,255,0,246,255,243
|
36,3,0,36,3,36
|
0,0,0,0,0,0
|
|
TRINITY_DN24514_c0_g3_i1.p1
|
9
|
.
|
C
|
T,G
|
1/1
|
1/1
|
1/1
|
1/1
|
1/1
|
1/1
|
87,9,0,87,9,87
|
116,24,0,116,24,116
|
159,30,0,158,18,155
|
136,21,0,136,21,136
|
128,21,0,128,21,128
|
166,39,0,166,39,166
|
|
TRINITY_DN32770_c0_g1_i1.p1
|
172
|
.
|
G
|
A,T
|
1/1
|
1/1
|
1/1
|
1/1
|
1/1
|
1/1
|
181,39,0,181,39,181
|
137,18,0,137,18,137
|
183,42,0,183,42,183
|
151,27,0,151,27,151
|
163,30,0,163,30,163
|
164,33,0,163,21,160
|
変異アノテーション.
同義置換と非同義置換.
- ここではTransDecoderで予測されたORFにマッピングしているので, 抽出された変異で置換したcds配列を翻訳して, そのまま同義/非同義置換を確認する.
fastaファイル読み込み.
Biostrings::readDNAStringSetでfasta配列を読み込む
- vcfのCHROMとfasta形式のヘッダーと対応させる. もしくは読み込んだあとで変換する.
- ここで用いている配列はTransdecoderの出力(部分配列も含むORFのcds).
vcfとcdsから同義/非同義置換を判定する関数
Biostrings::translateで翻訳する際no.init.codon = Tにする(使用しているcdsは部分配列も含むので).
- PROVEAN PROTEINのwebフォームに入力する書式も出力する.
snv.anno <- function(vcf, cds){
# VCFに記述されているCHROMに対応するCDSを取り出し, 翻訳する.
cds <- cds[match(vcf$CHROM, names(cds))]
trcds <- Biostrings::translate(cds[match(vcf$CHROM, names(cds))], no.init.codon = T)
# length(cds); length(trcds)
# all.equal(Biostrings::width(cds), Biostrings::width(trcds)*3)
# mismatch positionを見つける関数
seqdiff <- function(seq1, seq2){
seq <- strsplit(c(seq1, seq2), split = '')
mismatches <- which(seq[[1]] != seq[[2]])
return(mismatches)
}
# annotation 付加 vcfを一行づつ処理
pos_alt <- vcf[c(1,2,4,5)]
snv_anno <- unlist(sapply(1:nrow(pos_alt), function(i){
# extract vcf data ----
idx <- as.character(unlist(pos_alt[i,]))
id <- idx[1]; pos <- as.integer(idx[2]); ref <- idx[3]; alt <- idx[4]
# id; pos; ref; alt;pos_alt[i,]
# REF cds, ALT cds ----
ref_cds <- toString(cds[[i]])
alt_cds <- ref_cds
if (substr(ref_cds, pos, pos) == ref) {
substr(alt_cds, pos, pos) <- alt
} else {
cat(paste0('The "REF" nuc is different from "', ref, '".'))
}
# REF aa, ALT aa ----
ref_pep <- toString(trcds[[i]])
alt_pep <- toString(Biostrings::translate(Biostrings::DNAString(alt_cds),
no.init.codon = T))
mismatch_pos <- seqdiff(ref_pep, alt_pep)
# compare REF aa, ALT aa -> synonymous or non-synonymous ----
if (length(mismatch_pos)) {
ref_aa <- substr(ref_pep, mismatch_pos, mismatch_pos)
alt_aa <- substr(alt_pep, mismatch_pos, mismatch_pos)
paste0(mismatch_pos,":", ref_aa, "->", alt_aa)
} else {
"synonymous"
}
}))
# substitution data and aa seq for provean
prove <- sapply(strsplit(snv_anno, ":|->"), function(x){
if (length(x) != 1) {
paste(x[c(2,1,3)], collapse = "")
} else {
"synonymous"
}
})
vtrcds <- sapply(trcds, function(x) toString(x))
prove.seq <- ifelse(snv_anno != "synonymous", vtrcds, NA)
snv_flt <- pos_alt %>%
mutate(aa = snv_anno,
provean = prove,
prove.seq) %>%
mutate(nuc = paste0(POS,":",REF,"->",ALT)) %>%
select(1:4,8,5,6,7)
return(snv_flt)
}
res <- snv.anno(snp, cds)
synonymous/non-synonymous
|
CHROM
|
POS
|
REF
|
ALT
|
nuc
|
aa
|
provean
|
prove.seq
|
|
TRINITY_DN9398_c0_g2_i1.p1
|
533
|
C
|
G
|
533:C->G
|
178:T->S
|
T178S
|
CIIGAGMSGILAFKYLLNTDVNLTVFEAKEGIGGVWRETFASTTLQTPRPAYQFTDFPWPSHVTDRLPTQSQVMDYLNSYAENFGLMDRIQFNSKVVGIRYIGEQSMECSGLWSKNGGPYTEGRAWEVGVQKRNGGSIEWHHFDFVVMCIGRYGDVPNIPSFPPNKGPEVFEGKVLHTMDYSALNKTEAYELLRGKRVIVVGFQKSALDFAVECAEANQGGGGKSCTMIFRTVHWTVPYDHKCWGVPL
|
|
TRINITY_DN26453_c0_g2_i3.p1
|
77
|
T
|
A
|
77:T->A
|
26:L->*
|
L26*
|
MALEPVNVSEFSELARQVLPKMVYDLYAGGAEDEWTLKQNIAAFQRIRLRPRVLVDVSRVDLSTSILGFKISSPIMIAPTGLHKLAHPEGEVATARAAADANTIMIVSFSSSRTVEEIAATGDAVRFHQIYIYKNRNISAEIVRRAERAGYKAIVLTVDTPRIGRREADMRNKLVVPTPGNFEGLISVDIDTEKGSGLQSYASQTLDSSFTWKDVRWLQSITNLPILMKGILTAEDAELAIQAGAAGIIVSNHGARQLDFSPASITALEEEDLRVLQLRERHPATQGLRDRLT*
|
|
TRINITY_DN20483_c0_g1_i1.p1
|
45
|
A
|
T
|
45:A->T
|
15:E->D
|
E15D
|
MASPEESPVIENKEEVETSVRERGLFNKEKKEKEEKDEDKEEEKGGFIDKVKDFIHDIGEKIEHAIGFGKPTADISGIHIPSINLKKAELVVDVLITNPNPIPIPLVDIDYLVESDGRKLISGLIPDAGTISVHGSETIKIPITLIYDDIKDTFHDIKPGSIIPYKIKIALIADVPVFGKITLPLEKEGEIPVPYKPDVDLDKVEFDKFSFEETSATLHMKLENKNDFDLALNALEYDIWLSDVNIGNAKMTKSSKIEKNGISYIEVPISFRPKDFGSALWDMIRGKGTGYAMKGNLEVDTPFGPMQMPFNREGGSTRLKKKNKEEGDEDDDED*
|
|
TRINITY_DN27484_c0_g1_i7.p1
|
408
|
G
|
A
|
408:G->A
|
synonymous
|
synonymous
|
NA
|
|
TRINITY_DN26150_c0_g1_i1.p1
|
1323
|
G
|
A
|
1323:G->A
|
synonymous
|
synonymous
|
NA
|
|
TRINITY_DN18902_c0_g1_i2.p1
|
113
|
T
|
C
|
113:T->C
|
38:F->S
|
F38S
|
NPYLHLPFFFAHLLGSSIHRFCSEQVLIEETAYQKSVFVSVTYTDFGAMSIVGIIVLLTVINGAWAVTFTLVNNCEYTVWPGFLSNGGVAALGTTGFELPASTSRVVETPAKWSGRMWGRTGCSFDSQGKGKCATGDC
|
環境.
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] ja_JP.UTF-8/ja_JP.UTF-8/ja_JP.UTF-8/C/ja_JP.UTF-8/ja_JP.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tidyr_0.8.3 dplyr_0.8.0.1 readr_1.3.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.1 highr_0.7 pillar_1.3.1
## [4] compiler_3.5.2 plyr_1.8.4 XVector_0.22.0
## [7] tools_3.5.2 zlibbioc_1.28.0 digest_0.6.18
## [10] rskodat_0.1.0 viridisLite_0.3.0 evaluate_0.13
## [13] tibble_2.1.1 pkgconfig_2.0.2 rlang_0.3.3
## [16] rstudioapi_0.9.0 yaml_2.2.0 parallel_3.5.2
## [19] xfun_0.5 rsko_0.1.0 kableExtra_1.0.1
## [22] stringr_1.4.0 httr_1.4.0 knitr_1.21
## [25] xml2_1.2.0 Biostrings_2.50.2 S4Vectors_0.20.1
## [28] hms_0.4.2 IRanges_2.16.0 webshot_0.5.1
## [31] stats4_3.5.2 tidyselect_0.2.5 glue_1.3.1
## [34] R6_2.4.0 rmarkdown_1.11 purrr_0.3.2
## [37] magrittr_1.5 scales_1.0.0 htmltools_0.3.6
## [40] BiocGenerics_0.28.0 assertthat_0.2.1 rvest_0.3.2
## [43] colorspace_1.4-1 stringi_1.4.3 munsell_0.5.0
## [46] crayon_1.3.4