processVCF.R

santosha — Oct 29, 2013, 10:44 AM

# This tsv file is coming from the script crisp_vcf2tsv.sh
vcf = read.table("hsr.vcf.tsv", header=T, sep="\t", colClasses="character")
n.vcf = nrow(vcf)

# First split the vcf file according to the variou kind of variants.
vcf.ins = vcf[grep("INSERTION", vcf$INFO), ] #INSERTION and some with INS+SNV (or INS+DEL)
vcf.del = vcf[grep("DELETION", vcf$INFO), ] #DELETION and some with DEL+SNV (or INS+DEL)
vcf.snv = vcf[grep("VT=SNV;", vcf$INFO), ] #only SNV

n.ins = nrow(vcf.ins)
n.del = nrow(vcf.del)

cat("\tINS+: ", n.ins, "\tDEL+: ", n.del)
    INS+:  404  DEL+:  670

# INS and DEL sometime occur with other combination, like INS+DEL, INS+SNV or DEL+SNV
# Let's make sure that the numbers are correct
n.indel = length(union(rownames(vcf.ins), rownames(vcf.del)))
n.snv = nrow(vcf.snv)
cat("Total: ", n.vcf, "\tSNV: ", n.snv, "\tINDEL: ", n.indel) 
Total:  15274   SNV:  14227     INDEL:  1047

### Let's see the statistics of SNVs only
snv = vcf.snv

# at some of the positions, there are multiple SNVs. Let's discard them for now as the calculation becomes difficult for multiple SNVs, with multiple values pVAls and CT-vals.
snv = snv[snv$ALT %in% c("A", "C", "T", "G"),] # match  only one allele in ALT
cat("Single allelic SNVs: ", nrow(snv))
Single allelic SNVs:  14113

# The info data frame
info=as.character((snv$INFO))
which(is.na(info)) # Is any info missing?
integer(0)

info.lst=strsplit(info, ";")

# Returns the value of "key" in info-field if present, else "".
keyValue = function(data, key) {
  x=grep(key, data)
  ifelse(!length(x), "", data[x])
}

reqd.keys = c("CT", "QVpf", "QVpr", "VP")

# Initialize reqd.values with the correnct num of rows
reqd.values = snv[-(1:nrow(snv))]

for (i in reqd.keys) {
    value = sapply(info.lst, keyValue, i)
    value = as.numeric(gsub(".+=", "", value))

    reqd.values = cbind(reqd.values, value)
}

colnames(reqd.values) = reqd.keys
any(is.na(reqd.values$VP)) # Any NA values after co-ercing etc?
[1] FALSE

# Check the stats

# VP = Number of pools with variant allelles
vp = reqd.values$VP
table(vp)
vp
   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
7260 1099  440  285  219  189  176  160  151  106  122  102   68   84   59 
  16   17   18   19   20   21   22   23   24   25 
  86   93   85  123  125  149  208  270  426 2028 
hist(vp, breaks=1:25)

plot of chunk unnamed-chunk-1


# CT = contingency table p-value
ct = reqd.values$CT
hist(ct)

plot of chunk unnamed-chunk-1


# Check hist of > -100
hist(ct[ct>-100])

plot of chunk unnamed-chunk-1


sum(ct> -5) # Num CT > -5
[1] 2756
sum(ct< -20) # Num CT < -20
[1] 8415

# QVpf/QVpr = Q.val based p-valueusing fwd/rev strand reads
qvpf = reqd.values$QVpf
qvpr = reqd.values$QVpr

hist(qvpf)

plot of chunk unnamed-chunk-1

hist(qvpf[qvpf>-10])

plot of chunk unnamed-chunk-1


hist(qvpr)

plot of chunk unnamed-chunk-1

hist(qvpr[qvpr>-10])

plot of chunk unnamed-chunk-1