santosha — Oct 29, 2013, 10:44 AM
# This tsv file is coming from the script crisp_vcf2tsv.sh
vcf = read.table("hsr.vcf.tsv", header=T, sep="\t", colClasses="character")
n.vcf = nrow(vcf)
# First split the vcf file according to the variou kind of variants.
vcf.ins = vcf[grep("INSERTION", vcf$INFO), ] #INSERTION and some with INS+SNV (or INS+DEL)
vcf.del = vcf[grep("DELETION", vcf$INFO), ] #DELETION and some with DEL+SNV (or INS+DEL)
vcf.snv = vcf[grep("VT=SNV;", vcf$INFO), ] #only SNV
n.ins = nrow(vcf.ins)
n.del = nrow(vcf.del)
cat("\tINS+: ", n.ins, "\tDEL+: ", n.del)
INS+: 404 DEL+: 670
# INS and DEL sometime occur with other combination, like INS+DEL, INS+SNV or DEL+SNV
# Let's make sure that the numbers are correct
n.indel = length(union(rownames(vcf.ins), rownames(vcf.del)))
n.snv = nrow(vcf.snv)
cat("Total: ", n.vcf, "\tSNV: ", n.snv, "\tINDEL: ", n.indel)
Total: 15274 SNV: 14227 INDEL: 1047
### Let's see the statistics of SNVs only
snv = vcf.snv
# at some of the positions, there are multiple SNVs. Let's discard them for now as the calculation becomes difficult for multiple SNVs, with multiple values pVAls and CT-vals.
snv = snv[snv$ALT %in% c("A", "C", "T", "G"),] # match only one allele in ALT
cat("Single allelic SNVs: ", nrow(snv))
Single allelic SNVs: 14113
# The info data frame
info=as.character((snv$INFO))
which(is.na(info)) # Is any info missing?
integer(0)
info.lst=strsplit(info, ";")
# Returns the value of "key" in info-field if present, else "".
keyValue = function(data, key) {
x=grep(key, data)
ifelse(!length(x), "", data[x])
}
reqd.keys = c("CT", "QVpf", "QVpr", "VP")
# Initialize reqd.values with the correnct num of rows
reqd.values = snv[-(1:nrow(snv))]
for (i in reqd.keys) {
value = sapply(info.lst, keyValue, i)
value = as.numeric(gsub(".+=", "", value))
reqd.values = cbind(reqd.values, value)
}
colnames(reqd.values) = reqd.keys
any(is.na(reqd.values$VP)) # Any NA values after co-ercing etc?
[1] FALSE
# Check the stats
# VP = Number of pools with variant allelles
vp = reqd.values$VP
table(vp)
vp
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
7260 1099 440 285 219 189 176 160 151 106 122 102 68 84 59
16 17 18 19 20 21 22 23 24 25
86 93 85 123 125 149 208 270 426 2028
hist(vp, breaks=1:25)
# CT = contingency table p-value
ct = reqd.values$CT
hist(ct)
# Check hist of > -100
hist(ct[ct>-100])
sum(ct> -5) # Num CT > -5
[1] 2756
sum(ct< -20) # Num CT < -20
[1] 8415
# QVpf/QVpr = Q.val based p-valueusing fwd/rev strand reads
qvpf = reqd.values$QVpf
qvpr = reqd.values$QVpr
hist(qvpf)
hist(qvpf[qvpf>-10])
hist(qvpr)
hist(qvpr[qvpr>-10])