Explore a VCF file

Shaun Jackman — Mar 5, 2013, 5:26 PM

# Explore a VCF file

#source('http://bioconductor.org/biocLite.R')
#biocLite('VariantAnnotation')

library('lattice')
library('reshape2')
library('VariantAnnotation')
Loading required package: GenomicRanges
Loading required package: BiocGenerics
Attaching package: 'BiocGenerics'
The following object(s) are masked from 'package:stats':

xtabs
The following object(s) are masked from 'package:base':

anyDuplicated, cbind, colnames, duplicated, eval, Filter, Find, get,
intersect, lapply, Map, mapply, mget, order, paste, pmax, pmax.int, pmin,
pmin.int, Position, rbind, Reduce, rep.int, rownames, sapply, setdiff,
table, tapply, union, unique
Loading required package: IRanges
Loading required package: Rsamtools
Loading required package: Biostrings

vcf <- readVcf('98E.vcf.gz', 'NC_018521')

# Histogram of the quality values.
qual <- fixed(vcf)$QUAL
histogram(qual, main='Histogram of quality')

plot of chunk unnamed-chunk-1


# Violin plot of the quality vs. genotype.
gt <- data.frame(geno(vcf)$GT)
Warning: some row.names duplicated:
3101,13143,14066,23915,30230,32614,37500,44514,45040 --> row.names NOT
used
qual.gt <- melt(cbind(qual, gt), id.vars='qual',
    variable.name='sample', value.name='gt')
qual.gt$gt <- factor(qual.gt$gt)
bwplot(qual ~ gt, qual.gt, panel = panel.violin,
    main='Quality vs. genotype',
    ylab='Quality', xlab='Genotype')

plot of chunk unnamed-chunk-1

bwplot(qual ~ gt | sample, qual.gt, panel = panel.violin,
    main='Quality vs. genotype',
    ylab='Quality', xlab='Genotype')

plot of chunk unnamed-chunk-1


# Examine genotype qualities.
gt.gq <- data.frame(gt = geno(vcf)$GT, gq=geno(vcf)$GQ)
Warning: some row.names duplicated:
3101,13143,14066,23915,30230,32614,37500,44514,45040 --> row.names NOT
used
Warning: some row.names duplicated:
3101,13143,14066,23915,30230,32614,37500,44514,45040 --> row.names NOT
used
gt <- melt(t(geno(vcf)$GT), value.name='gt')
gq <- melt(t(geno(vcf)$GQ), value.name='gq')
gt.gq <- cbind(gt[1], gt$gt, gq$gq)
colnames(gt.gq) <- c('sample', 'gt', 'gq')
histogram(gt.gq$gq, main='Histogram of genotype quality')

plot of chunk unnamed-chunk-1

bwplot(gq ~ gt, gt.gq, panel = panel.violin,
    main='Genotype quality vs. genotype',
    ylab='Genotype quality', xlab='Genotype')

plot of chunk unnamed-chunk-1

bwplot(gq ~ gt | sample, gt.gq, panel = panel.violin,
    main='Genotype quality vs. genotype',
    ylab='Genotype quality', xlab='Genotype')

plot of chunk unnamed-chunk-1