This is an exercise to refactor R code for parsing a Strelka VCF file such that it uses “tidy principles” (i.e., using Hadley Wickam’s package such as dplyr and tidyr). I also make use of the purrr package for the first time.
# Load magrittr for R pipes (%>%)
library(magrittr)
# Strelka output VCF file (all variants, 1.3 MB)
x <- "snvs.vcf"
These are helper functions that are used in the alternative parsers
row_tibble <- function(x, col_names) {
tibble::as_tibble(rbind(setNames(x, col_names)))
}
parse_format <- function(format, sample) {
purrr::map2(
strsplit(sample, ":", fixed = TRUE),
strsplit(format, ":", fixed = TRUE),
row_tibble)
}
parse_info <- function(info) {
strsplit(info, ";", fixed = TRUE) %>%
purrr::map(~row_tibble(sub("^.*=(.*)", "\\1", .x), sub("^(.*)=.*", "\\1", .x)))
}
split_tiers <- function(x) {
name <- names(x)
x$`1` <- x[[1]] %>%
strsplit(",", fixed = TRUE) %>%
purrr::map(~as.integer(.x[1])) %>%
purrr::flatten_int()
x$`2` <- x[[1]] %>%
strsplit(",", fixed = TRUE) %>%
purrr::map(~as.integer(.x[2])) %>%
purrr::flatten_int()
names(x)[2:3] <- paste0(name, "_", names(x)[2:3])
x[2:3]
}
calc_counts <- function(x) {
T_REF_COUNT_KEY <- paste0("T_", x$REF, "U_", x$TQSS_NT)
T_ALT_COUNT_KEY <- paste0("T_", x$ALT, "U_", x$TQSS_NT)
N_REF_COUNT_KEY <- paste0("N_", x$REF, "U_", x$TQSS_NT)
N_ALT_COUNT_KEY <- paste0("N_", x$ALT, "U_", x$TQSS_NT)
T_REF_COUNT <- x[[T_REF_COUNT_KEY]]
T_ALT_COUNT <- x[[T_ALT_COUNT_KEY]]
N_REF_COUNT <- x[[N_REF_COUNT_KEY]]
N_ALT_COUNT <- x[[N_ALT_COUNT_KEY]]
T_VAF <- T_ALT_COUNT / (T_ALT_COUNT + T_REF_COUNT)
N_VAF <- N_ALT_COUNT / (N_ALT_COUNT + N_REF_COUNT)
data.frame(T_VAF = T_VAF, N_VAF = N_VAF)
}
This is the original Strelka VCF file parser.
parse_vcf_orig <- function(x) {
vcf.colnames <- c("CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT","NORMAL","TUMOR")
somatic.snv.vcf <- read.delim(x,header=F,comment.char='#',col.names=vcf.colnames)
expand_strelka_vcf <- function(vcf){
# Expand INFO column into separate columns for each field
NT <- character()
QSS <- numeric()
QSS_NT <- numeric()
SGT <- character()
TQSS <- numeric()
TQSS_NT <- numeric()
# Expand the normal FORMAT column into separate columns for each field
N_DP <- numeric()
N_FDP <- numeric()
N_SDP <- numeric()
N_SUBDP <- numeric()
N_AU_1 <- numeric()
N_AU_2 <- numeric()
N_CU_1 <- numeric()
N_CU_2 <- numeric()
N_GU_1 <- numeric()
N_GU_2 <- numeric()
N_TU_1 <- numeric()
N_TU_2 <- numeric()
# Expand the tumour FORMAT column into separate columns for each field
T_DP <- numeric()
T_FDP <- numeric()
T_SDP <- numeric()
T_SUBDP <- numeric()
T_AU_1 <- numeric()
T_AU_2 <- numeric()
T_CU_1 <- numeric()
T_CU_2 <- numeric()
T_GU_1 <- numeric()
T_GU_2 <- numeric()
T_TU_1 <- numeric()
T_TU_2 <- numeric()
for(row_n in 1:nrow(vcf)){
# Order NT,QSS,QSS_NT,SGT,"SOMATIC",TQSS,TQSS_NT
info.field_values <- unlist(strsplit(as.character(vcf$INFO[row_n]),";"))
NT <- c(NT, unlist(strsplit(info.field_values[1],"="))[2])
QSS <- c(QSS, unlist(strsplit(info.field_values[2],"="))[2])
QSS_NT <- c(QSS_NT, as.numeric(unlist(strsplit(info.field_values[3],"=")))[2])
SGT <- c(SGT, unlist(strsplit(info.field_values[4],"="))[2])
TQSS <- c(TQSS, unlist(strsplit(info.field_values[6],"="))[2])
TQSS_NT <- c(TQSS_NT, unlist(strsplit(info.field_values[7],"="))[2])
# Order N_DP,N_FDP,N_SDP,N_SUBDP,N_AU_1,N_AU_2,N_CU_1,N_CU_2,N_GU_1,N_GU_2,N_TU_1,N_TU_2
normal.format_values <- unlist(strsplit(as.character(vcf$NORMAL[row_n]),":"))
N_DP <- c(N_DP, normal.format_values[1])
N_FDP <- c(N_FDP, normal.format_values[2])
N_SDP <- c(N_SDP, normal.format_values[3])
N_SUBDP <- c(N_SUBDP, normal.format_values[4])
N_AU_1 <- c(N_AU_1, as.numeric(unlist(strsplit(as.character(normal.format_values[5]),","))[1]))
N_AU_2 <- c(N_AU_2, as.numeric(unlist(strsplit(as.character(normal.format_values[5]),","))[2]))
N_CU_1 <- c(N_CU_1, as.numeric(unlist(strsplit(as.character(normal.format_values[6]),","))[1]))
N_CU_2 <- c(N_CU_2, as.numeric(unlist(strsplit(as.character(normal.format_values[6]),","))[2]))
N_GU_1 <- c(N_GU_1, as.numeric(unlist(strsplit(as.character(normal.format_values[7]),","))[1]))
N_GU_2 <- c(N_GU_2, as.numeric(unlist(strsplit(as.character(normal.format_values[7]),","))[2]))
N_TU_1 <- c(N_TU_1, as.numeric(unlist(strsplit(as.character(normal.format_values[8]),","))[1]))
N_TU_2 <- c(N_TU_2, as.numeric(unlist(strsplit(as.character(normal.format_values[8]),","))[2]))
# Order T_DP,T_FDP,T_SDP,T_SUBDP,T_AU_1,T_AU_2,T_CU_1,T_CU_2,T_GU_1,T_GU_2,T_TU_1,T_TU_2
tumour.field_values <- unlist(strsplit(as.character(vcf$TUMOR[row_n]),":"))
T_DP <- c(T_DP, tumour.field_values[1])
T_FDP <- c(T_FDP, tumour.field_values[2])
T_SDP <- c(T_SDP, tumour.field_values[3])
T_SUBDP <- c(T_SUBDP, tumour.field_values[4])
T_AU_1 <- c(T_AU_1, as.numeric(unlist(strsplit(as.character(tumour.field_values[5]),","))[1]))
T_AU_2 <- c(T_AU_2, as.numeric(unlist(strsplit(as.character(tumour.field_values[5]),","))[2]))
T_CU_1 <- c(T_CU_1, as.numeric(unlist(strsplit(as.character(tumour.field_values[6]),","))[1]))
T_CU_2 <- c(T_CU_2, as.numeric(unlist(strsplit(as.character(tumour.field_values[6]),","))[2]))
T_GU_1 <- c(T_GU_1, as.numeric(unlist(strsplit(as.character(tumour.field_values[7]),","))[1]))
T_GU_2 <- c(T_GU_2, as.numeric(unlist(strsplit(as.character(tumour.field_values[7]),","))[2]))
T_TU_1 <- c(T_TU_1, as.numeric(unlist(strsplit(as.character(tumour.field_values[8]),","))[1]))
T_TU_2 <- c(T_TU_2, as.numeric(unlist(strsplit(as.character(tumour.field_values[8]),","))[2]))
}
expanded.vcf <- cbind.data.frame(somatic.snv.vcf,NT,QSS,QSS_NT,SGT,TQSS,TQSS_NT,N_DP,N_FDP,N_SDP,N_SUBDP,N_AU_1,N_AU_2,N_CU_1,N_CU_2,N_GU_1,N_GU_2,N_TU_1,N_TU_2,T_DP,T_FDP,T_SDP,T_SUBDP,T_AU_1,T_AU_2,T_CU_1,T_CU_2,T_GU_1,T_GU_2,T_TU_1,T_TU_2)
return(expanded.vcf)
}
calculate_alt_vafs_in_normal <- function(expanded.vcf){
# Plot the ALT allele VAF in normal
# 1. Determine which tier is used for QSS_NT
# 1.1. Calculate the total read depth in normal depending on the data tier (Skip totals <= 0)
# 1.1.1. (tier 1) Total = Tier 1 REF count + Tier 2 ALT count
# 1.1.2. (tier 2) Total = Tier 2 REF count + Tier 2 ALT count
# 2. Find VAF of ALT allele in normal (Skip ALT columns with two values or if ALT == ".")
# 2.1. Get correct ALT allele read count depending on the data tier
# 2.1.1. (tier 1) First number
# 2.1.2. (tier 2) Second number
# 2.2. VAF = ALT allele read count / Total
VAFs <- numeric()
for(row_n in 1:nrow(expanded.vcf)){
tqss_nt <- as.numeric(expanded.vcf$TQSS_NT[row_n])
ref_allele <- as.character(expanded.vcf$REF[row_n])
alt_allele <- as.character(expanded.vcf$ALT[row_n])
if(length(unlist(strsplit(alt_allele,","))) > 1 | alt_allele == "." ){
next
}
ref_count <- NULL
alt_count <- NULL
if(tqss_nt == 1){
if(ref_allele == "A"){
ref_count <- as.numeric(expanded.vcf$N_AU_1[row_n])
}else if(ref_allele == "C"){
ref_count <- as.numeric(expanded.vcf$N_CU_1[row_n])
}else if(ref_allele == "G"){
ref_count <- as.numeric(expanded.vcf$N_GU_1[row_n])
}else if(ref_allele == "T"){
ref_count <- as.numeric(expanded.vcf$N_TU_1[row_n])
}
if(alt_allele == "A"){
alt_count <- as.numeric(expanded.vcf$N_AU_1[row_n])
}else if(alt_allele == "C"){
alt_count <- as.numeric(expanded.vcf$N_CU_1[row_n])
}else if(alt_allele == "G"){
alt_count <- as.numeric(expanded.vcf$N_GU_1[row_n])
}else if(alt_allele == "T"){
alt_count <- as.numeric(expanded.vcf$N_TU_1[row_n])
}
}else if(tqss_nt == 2){
if(ref_allele == "A"){
ref_count <- as.numeric(expanded.vcf$N_AU_2[row_n])
}else if(ref_allele == "C"){
ref_count <- as.numeric(expanded.vcf$N_CU_2[row_n])
}else if(ref_allele == "G"){
ref_count <- as.numeric(expanded.vcf$N_GU_2[row_n])
}else if(ref_allele == "T"){
ref_count <- as.numeric(expanded.vcf$N_TU_2[row_n])
}
if(alt_allele == "A"){
alt_count <- as.numeric(expanded.vcf$N_AU_2[row_n])
}else if(alt_allele == "C"){
alt_count <- as.numeric(expanded.vcf$N_CU_2[row_n])
}else if(alt_allele == "G"){
alt_count <- as.numeric(expanded.vcf$N_GU_2[row_n])
}else if(alt_allele == "T"){
alt_count <- as.numeric(expanded.vcf$N_TU_2[row_n])
}
}
total_depth <- ref_count + alt_count
if(total_depth <= 0){
VAFs <- c(VAFs, NA)
next
}
VAF <- alt_count/total_depth
VAFs <- c(VAFs, VAF)
}
return(VAFs)
}
calculate_alt_vafs_in_tumour <- function(expanded.vcf){
# Plot the ALT allele VAF in normal
# 1. Determine which tier is used for QSS_NT
# 1.1. Calculate the total read depth in normal depending on the data tier (Skip totals <= 0)
# 1.1.1. (tier 1) Total = Tier 1 REF count + Tier 2 ALT count
# 1.1.2. (tier 2) Total = Tier 2 REF count + Tier 2 ALT count
# 2. Find VAF of ALT allele in tumour (Skip ALT columns with two values or if ALT == ".")
# 2.1. Get correct ALT allele read count depending on the data tier
# 2.1.1. (tier 1) First number
# 2.1.2. (tier 2) Second number
# 2.2. VAF = ALT allele read count / Total
VAFs <- numeric()
for(row_n in 1:nrow(expanded.vcf)){
tqss_nt <- as.numeric(expanded.vcf$TQSS_NT[row_n])
ref_allele <- as.character(expanded.vcf$REF[row_n])
alt_allele <- as.character(expanded.vcf$ALT[row_n])
if(length(unlist(strsplit(alt_allele,","))) > 1 | alt_allele == "." ){
next
}
ref_count <- NULL
alt_count <- NULL
if(tqss_nt == 1){
if(ref_allele == "A"){
ref_count <- as.numeric(expanded.vcf$T_AU_1[row_n])
}else if(ref_allele == "C"){
ref_count <- as.numeric(expanded.vcf$T_CU_1[row_n])
}else if(ref_allele == "G"){
ref_count <- as.numeric(expanded.vcf$T_GU_1[row_n])
}else if(ref_allele == "T"){
ref_count <- as.numeric(expanded.vcf$T_TU_1[row_n])
}
if(alt_allele == "A"){
alt_count <- as.numeric(expanded.vcf$T_AU_1[row_n])
}else if(alt_allele == "C"){
alt_count <- as.numeric(expanded.vcf$T_CU_1[row_n])
}else if(alt_allele == "G"){
alt_count <- as.numeric(expanded.vcf$T_GU_1[row_n])
}else if(alt_allele == "T"){
alt_count <- as.numeric(expanded.vcf$T_TU_1[row_n])
}
}else if(tqss_nt == 2){
if(ref_allele == "A"){
ref_count <- as.numeric(expanded.vcf$T_AU_2[row_n])
}else if(ref_allele == "C"){
ref_count <- as.numeric(expanded.vcf$T_CU_2[row_n])
}else if(ref_allele == "G"){
ref_count <- as.numeric(expanded.vcf$T_GU_2[row_n])
}else if(ref_allele == "T"){
ref_count <- as.numeric(expanded.vcf$T_TU_2[row_n])
}
if(alt_allele == "A"){
alt_count <- as.numeric(expanded.vcf$T_AU_2[row_n])
}else if(alt_allele == "C"){
alt_count <- as.numeric(expanded.vcf$T_CU_2[row_n])
}else if(alt_allele == "G"){
alt_count <- as.numeric(expanded.vcf$T_GU_2[row_n])
}else if(alt_allele == "T"){
alt_count <- as.numeric(expanded.vcf$T_TU_2[row_n])
}
}
total_depth <- ref_count + alt_count
if(total_depth <= 0){
VAFs <- c(VAFs, NA)
next
}
VAF <- alt_count/total_depth
VAFs <- c(VAFs, VAF)
}
return(VAFs)
}
somatic.snv.vcf.XInfo.XNormFormat.XTumFormat <- expand_strelka_vcf(somatic.snv.vcf)
somatic.snv.vcf.XInfo.XNormFormat.XTumFormat.alt_filtered1 <- somatic.snv.vcf.XInfo.XNormFormat.XTumFormat[!seq(1,nrow(somatic.snv.vcf.XInfo.XNormFormat.XTumFormat)) %in% grep(",",somatic.snv.vcf.XInfo.XNormFormat.XTumFormat$ALT),]
snv.filtered <- somatic.snv.vcf.XInfo.XNormFormat.XTumFormat.alt_filtered1[somatic.snv.vcf.XInfo.XNormFormat.XTumFormat.alt_filtered1$ALT != ".",]
normal.pass.alt_vaf <- calculate_alt_vafs_in_normal(snv.filtered)
tumour.pass.alt.vaf <- calculate_alt_vafs_in_tumour(snv.filtered)
snv.filtered.alt_vaf <- cbind.data.frame(snv.filtered,vaf_norm=normal.pass.alt_vaf,vaf_tumour=tumour.pass.alt.vaf)
snv.filtered.alt_vaf
}
These are alternative parsers that attempt to perform the parsing in a more tidy fashion (using Hadley Wickam’s packages like dplyr and tidyr).
parse_vcf_alt1 <- function(x) {
vcf <- readr::read_tsv(x, comment = "##")
vcf <- dplyr::rename(vcf, CHROM = `#CHROM`)
vcf <- dplyr::mutate(vcf, ALT = strsplit(ALT, ",") %>% purrr::map_chr(1),
info = parse_info(INFO),
normal = parse_format(FORMAT, NORMAL),
tumour = parse_format(FORMAT, TUMOR))
vcf <- tidyr::unnest(vcf, info)
vcf <- tidyr::unnest(vcf, N = normal, T = tumour, .sep = "_")
vcf <- purrr::lmap_at(vcf, dplyr::matches("[TN]_[ACTG]U", vars = names(vcf)), split_tiers)
vcf <- readr::type_convert(vcf)
vcf <- tidyr::gather(vcf, allele, count, dplyr::matches("[TN]_[ACTG]U_[12]"))
vcf <- tidyr::separate(vcf, allele, c("sample", "base", "tier"), sep = "[_U]+", remove = FALSE)
vcf <- dplyr::group_by(vcf, CHROM, POS, REF, ALT)
vcf <- dplyr::mutate(vcf, T_REF_COUNT = count[REF == base & sample == "T" & tier == TQSS_NT],
T_ALT_COUNT = count[ALT == base & sample == "T" & tier == TQSS_NT],
N_REF_COUNT = count[REF == base & sample == "N" & tier == TQSS_NT],
N_ALT_COUNT = count[ALT == base & sample == "N" & tier == TQSS_NT],
T_VAF = T_ALT_COUNT / (T_ALT_COUNT + T_REF_COUNT),
N_VAF = N_ALT_COUNT / (N_ALT_COUNT + N_REF_COUNT))
vcf <- dplyr::select(vcf, -sample, -base, -tier)
vcf <- tidyr::spread(vcf, allele, count) # Takes up most of the time
vcf
}
parse_vcf_alt2 <- function(x) {
vcf <- readr::read_tsv(x, comment = "##")
vcf <- dplyr::rename(vcf, CHROM = `#CHROM`)
vcf <- dplyr::mutate(vcf, ALT = purrr::map_chr(strsplit(ALT, ","), 1),
info = parse_info(INFO),
normal = parse_format(FORMAT, NORMAL),
tumour = parse_format(FORMAT, TUMOR))
vcf <- tidyr::unnest(vcf, info)
vcf <- tidyr::unnest(vcf, N = normal, T = tumour, .sep = "_")
vcf <- purrr::lmap_at(vcf, dplyr::matches("[TN]_[ACTG]U", vars = names(vcf)), split_tiers)
vcf <- readr::type_convert(vcf)
vcf <- purrr::by_row(vcf, calc_counts)
vcf <- tidyr::unnest(vcf, .out)
vcf
}
Here, I’m benchmarking the three parsers to confirm whether the tidy approach is indeed faster.
microbenchmark::microbenchmark(
original = parse_vcf_orig(x),
alt1 = parse_vcf_alt1(x),
alt2 = parse_vcf_alt2(x),
times = 3
)
Unit: seconds
expr min lq mean median uq max neval
original 9.806034 10.55204 12.623073 11.298051 14.03159 16.765134 3
alt1 11.407053 11.50608 11.557087 11.605101 11.63210 11.659108 3
alt2 5.176408 5.21499 5.332022 5.253572 5.40983 5.566088 3