Parsing Strelka VCF Files in R

This is an exercise to refactor R code for parsing a Strelka VCF file such that it uses “tidy principles” (i.e., using Hadley Wickam’s package such as dplyr and tidyr). I also make use of the purrr package for the first time.

Setup

# Load magrittr for R pipes (%>%)
library(magrittr)

# Strelka output VCF file (all variants, 1.3 MB)
x <- "snvs.vcf"

Helper Functions

These are helper functions that are used in the alternative parsers

row_tibble <- function(x, col_names) {
  tibble::as_tibble(rbind(setNames(x, col_names)))
}

parse_format <- function(format, sample) {
  purrr::map2(
    strsplit(sample, ":", fixed = TRUE),
    strsplit(format, ":", fixed = TRUE),
    row_tibble)
}

parse_info <- function(info) {
  strsplit(info, ";", fixed = TRUE) %>%
    purrr::map(~row_tibble(sub("^.*=(.*)", "\\1", .x), sub("^(.*)=.*", "\\1", .x)))
}

split_tiers <- function(x) {
  name <- names(x)
  x$`1` <- x[[1]] %>%
    strsplit(",", fixed = TRUE) %>%
    purrr::map(~as.integer(.x[1])) %>%
    purrr::flatten_int()
  x$`2` <- x[[1]] %>%
    strsplit(",", fixed = TRUE) %>%
    purrr::map(~as.integer(.x[2])) %>%
    purrr::flatten_int()
  names(x)[2:3] <- paste0(name, "_", names(x)[2:3])
  x[2:3]
}

calc_counts <- function(x) {
  T_REF_COUNT_KEY <- paste0("T_", x$REF, "U_", x$TQSS_NT)
  T_ALT_COUNT_KEY <- paste0("T_", x$ALT, "U_", x$TQSS_NT)
  N_REF_COUNT_KEY <- paste0("N_", x$REF, "U_", x$TQSS_NT)
  N_ALT_COUNT_KEY <- paste0("N_", x$ALT, "U_", x$TQSS_NT)
  T_REF_COUNT <- x[[T_REF_COUNT_KEY]]
  T_ALT_COUNT <- x[[T_ALT_COUNT_KEY]]
  N_REF_COUNT <- x[[N_REF_COUNT_KEY]]
  N_ALT_COUNT <- x[[N_ALT_COUNT_KEY]]
  T_VAF <- T_ALT_COUNT / (T_ALT_COUNT + T_REF_COUNT)
  N_VAF <- N_ALT_COUNT / (N_ALT_COUNT + N_REF_COUNT)
  data.frame(T_VAF = T_VAF, N_VAF = N_VAF)
}

Original Parser

This is the original Strelka VCF file parser.

parse_vcf_orig <- function(x) {
  vcf.colnames <- c("CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT","NORMAL","TUMOR")

  somatic.snv.vcf <- read.delim(x,header=F,comment.char='#',col.names=vcf.colnames)

  expand_strelka_vcf <- function(vcf){

    # Expand INFO column into separate columns for each field
    NT <- character()
    QSS <- numeric()
    QSS_NT <- numeric()
    SGT <- character()
    TQSS <- numeric()
    TQSS_NT <- numeric()

    # Expand the normal FORMAT column into separate columns for each field
    N_DP <- numeric()
    N_FDP <- numeric()
    N_SDP <- numeric()
    N_SUBDP <- numeric()
    N_AU_1 <- numeric()
    N_AU_2 <- numeric()
    N_CU_1 <- numeric()
    N_CU_2 <- numeric()
    N_GU_1 <- numeric()
    N_GU_2 <- numeric()
    N_TU_1 <- numeric()
    N_TU_2 <- numeric()

    # Expand the tumour FORMAT column into separate columns for each field
    T_DP <- numeric()
    T_FDP <- numeric()
    T_SDP <- numeric()
    T_SUBDP <- numeric()
    T_AU_1 <- numeric()
    T_AU_2 <- numeric()
    T_CU_1 <- numeric()
    T_CU_2 <- numeric()
    T_GU_1 <- numeric()
    T_GU_2 <- numeric()
    T_TU_1 <- numeric()
    T_TU_2 <- numeric()

    for(row_n in 1:nrow(vcf)){
      # Order NT,QSS,QSS_NT,SGT,"SOMATIC",TQSS,TQSS_NT
      info.field_values <- unlist(strsplit(as.character(vcf$INFO[row_n]),";"))
      NT <- c(NT, unlist(strsplit(info.field_values[1],"="))[2])
      QSS <- c(QSS, unlist(strsplit(info.field_values[2],"="))[2])
      QSS_NT <- c(QSS_NT, as.numeric(unlist(strsplit(info.field_values[3],"=")))[2])
      SGT <- c(SGT, unlist(strsplit(info.field_values[4],"="))[2])
      TQSS <- c(TQSS, unlist(strsplit(info.field_values[6],"="))[2])
      TQSS_NT <- c(TQSS_NT, unlist(strsplit(info.field_values[7],"="))[2])

      # Order N_DP,N_FDP,N_SDP,N_SUBDP,N_AU_1,N_AU_2,N_CU_1,N_CU_2,N_GU_1,N_GU_2,N_TU_1,N_TU_2
      normal.format_values <- unlist(strsplit(as.character(vcf$NORMAL[row_n]),":"))
      N_DP <- c(N_DP, normal.format_values[1])
      N_FDP <- c(N_FDP, normal.format_values[2])
      N_SDP <- c(N_SDP, normal.format_values[3])
      N_SUBDP <- c(N_SUBDP, normal.format_values[4])
      N_AU_1 <- c(N_AU_1, as.numeric(unlist(strsplit(as.character(normal.format_values[5]),","))[1]))
      N_AU_2 <- c(N_AU_2, as.numeric(unlist(strsplit(as.character(normal.format_values[5]),","))[2]))
      N_CU_1 <- c(N_CU_1, as.numeric(unlist(strsplit(as.character(normal.format_values[6]),","))[1]))
      N_CU_2 <- c(N_CU_2, as.numeric(unlist(strsplit(as.character(normal.format_values[6]),","))[2]))
      N_GU_1 <- c(N_GU_1, as.numeric(unlist(strsplit(as.character(normal.format_values[7]),","))[1]))
      N_GU_2 <- c(N_GU_2, as.numeric(unlist(strsplit(as.character(normal.format_values[7]),","))[2]))
      N_TU_1 <- c(N_TU_1, as.numeric(unlist(strsplit(as.character(normal.format_values[8]),","))[1]))
      N_TU_2 <- c(N_TU_2, as.numeric(unlist(strsplit(as.character(normal.format_values[8]),","))[2]))

      # Order T_DP,T_FDP,T_SDP,T_SUBDP,T_AU_1,T_AU_2,T_CU_1,T_CU_2,T_GU_1,T_GU_2,T_TU_1,T_TU_2
      tumour.field_values <- unlist(strsplit(as.character(vcf$TUMOR[row_n]),":"))
      T_DP <- c(T_DP, tumour.field_values[1])
      T_FDP <- c(T_FDP, tumour.field_values[2])
      T_SDP <- c(T_SDP, tumour.field_values[3])
      T_SUBDP <- c(T_SUBDP, tumour.field_values[4])
      T_AU_1 <- c(T_AU_1, as.numeric(unlist(strsplit(as.character(tumour.field_values[5]),","))[1]))
      T_AU_2 <- c(T_AU_2, as.numeric(unlist(strsplit(as.character(tumour.field_values[5]),","))[2]))
      T_CU_1 <- c(T_CU_1, as.numeric(unlist(strsplit(as.character(tumour.field_values[6]),","))[1]))
      T_CU_2 <- c(T_CU_2, as.numeric(unlist(strsplit(as.character(tumour.field_values[6]),","))[2]))
      T_GU_1 <- c(T_GU_1, as.numeric(unlist(strsplit(as.character(tumour.field_values[7]),","))[1]))
      T_GU_2 <- c(T_GU_2, as.numeric(unlist(strsplit(as.character(tumour.field_values[7]),","))[2]))
      T_TU_1 <- c(T_TU_1, as.numeric(unlist(strsplit(as.character(tumour.field_values[8]),","))[1]))
      T_TU_2 <- c(T_TU_2, as.numeric(unlist(strsplit(as.character(tumour.field_values[8]),","))[2]))
    }

    expanded.vcf <- cbind.data.frame(somatic.snv.vcf,NT,QSS,QSS_NT,SGT,TQSS,TQSS_NT,N_DP,N_FDP,N_SDP,N_SUBDP,N_AU_1,N_AU_2,N_CU_1,N_CU_2,N_GU_1,N_GU_2,N_TU_1,N_TU_2,T_DP,T_FDP,T_SDP,T_SUBDP,T_AU_1,T_AU_2,T_CU_1,T_CU_2,T_GU_1,T_GU_2,T_TU_1,T_TU_2)

    return(expanded.vcf)
  }

  calculate_alt_vafs_in_normal <- function(expanded.vcf){
    # Plot the ALT allele VAF in normal
    # 1. Determine which tier is used for QSS_NT
    #   1.1. Calculate the total read depth in normal depending on the data tier (Skip totals <= 0)
    #     1.1.1. (tier 1) Total = Tier 1 REF count + Tier 2 ALT count
    #     1.1.2. (tier 2) Total = Tier 2 REF count + Tier 2 ALT count
    # 2. Find VAF of ALT allele in normal (Skip ALT columns with two values or if ALT == ".")
    #   2.1. Get correct ALT allele read count depending on the data tier
    #     2.1.1. (tier 1) First number
    #     2.1.2. (tier 2) Second number
    #   2.2. VAF = ALT allele read count / Total
    VAFs <- numeric()
    for(row_n in 1:nrow(expanded.vcf)){
      tqss_nt <- as.numeric(expanded.vcf$TQSS_NT[row_n])

      ref_allele <- as.character(expanded.vcf$REF[row_n])
      alt_allele <- as.character(expanded.vcf$ALT[row_n])

      if(length(unlist(strsplit(alt_allele,","))) > 1 | alt_allele == "." ){
        next
      }

      ref_count <- NULL
      alt_count <- NULL
      if(tqss_nt == 1){
        if(ref_allele == "A"){
          ref_count <- as.numeric(expanded.vcf$N_AU_1[row_n])
        }else if(ref_allele == "C"){
          ref_count <- as.numeric(expanded.vcf$N_CU_1[row_n])
        }else if(ref_allele == "G"){
          ref_count <- as.numeric(expanded.vcf$N_GU_1[row_n])
        }else if(ref_allele == "T"){
          ref_count <- as.numeric(expanded.vcf$N_TU_1[row_n])
        }
        if(alt_allele == "A"){
          alt_count <- as.numeric(expanded.vcf$N_AU_1[row_n])
        }else if(alt_allele == "C"){
          alt_count <- as.numeric(expanded.vcf$N_CU_1[row_n])
        }else if(alt_allele == "G"){
          alt_count <- as.numeric(expanded.vcf$N_GU_1[row_n])
        }else if(alt_allele == "T"){
          alt_count <- as.numeric(expanded.vcf$N_TU_1[row_n])
        }
      }else if(tqss_nt == 2){
        if(ref_allele == "A"){
          ref_count <- as.numeric(expanded.vcf$N_AU_2[row_n])
        }else if(ref_allele == "C"){
          ref_count <- as.numeric(expanded.vcf$N_CU_2[row_n])
        }else if(ref_allele == "G"){
          ref_count <- as.numeric(expanded.vcf$N_GU_2[row_n])
        }else if(ref_allele == "T"){
          ref_count <- as.numeric(expanded.vcf$N_TU_2[row_n])
        }
        if(alt_allele == "A"){
          alt_count <- as.numeric(expanded.vcf$N_AU_2[row_n])
        }else if(alt_allele == "C"){
          alt_count <- as.numeric(expanded.vcf$N_CU_2[row_n])
        }else if(alt_allele == "G"){
          alt_count <- as.numeric(expanded.vcf$N_GU_2[row_n])
        }else if(alt_allele == "T"){
          alt_count <- as.numeric(expanded.vcf$N_TU_2[row_n])
        }
      }

      total_depth <- ref_count + alt_count

      if(total_depth <= 0){
        VAFs <- c(VAFs, NA)
        next
      }

      VAF <- alt_count/total_depth
      VAFs <- c(VAFs, VAF)

    }
    return(VAFs)
  }

  calculate_alt_vafs_in_tumour <- function(expanded.vcf){
    # Plot the ALT allele VAF in normal
    # 1. Determine which tier is used for QSS_NT
    #   1.1. Calculate the total read depth in normal depending on the data tier (Skip totals <= 0)
    #     1.1.1. (tier 1) Total = Tier 1 REF count + Tier 2 ALT count
    #     1.1.2. (tier 2) Total = Tier 2 REF count + Tier 2 ALT count
    # 2. Find VAF of ALT allele in tumour (Skip ALT columns with two values or if ALT == ".")
    #   2.1. Get correct ALT allele read count depending on the data tier
    #     2.1.1. (tier 1) First number
    #     2.1.2. (tier 2) Second number
    #   2.2. VAF = ALT allele read count / Total
    VAFs <- numeric()
    for(row_n in 1:nrow(expanded.vcf)){
      tqss_nt <- as.numeric(expanded.vcf$TQSS_NT[row_n])

      ref_allele <- as.character(expanded.vcf$REF[row_n])
      alt_allele <- as.character(expanded.vcf$ALT[row_n])

      if(length(unlist(strsplit(alt_allele,","))) > 1 | alt_allele == "." ){
        next
      }

      ref_count <- NULL
      alt_count <- NULL
      if(tqss_nt == 1){
        if(ref_allele == "A"){
          ref_count <- as.numeric(expanded.vcf$T_AU_1[row_n])
        }else if(ref_allele == "C"){
          ref_count <- as.numeric(expanded.vcf$T_CU_1[row_n])
        }else if(ref_allele == "G"){
          ref_count <- as.numeric(expanded.vcf$T_GU_1[row_n])
        }else if(ref_allele == "T"){
          ref_count <- as.numeric(expanded.vcf$T_TU_1[row_n])
        }
        if(alt_allele == "A"){
          alt_count <- as.numeric(expanded.vcf$T_AU_1[row_n])
        }else if(alt_allele == "C"){
          alt_count <- as.numeric(expanded.vcf$T_CU_1[row_n])
        }else if(alt_allele == "G"){
          alt_count <- as.numeric(expanded.vcf$T_GU_1[row_n])
        }else if(alt_allele == "T"){
          alt_count <- as.numeric(expanded.vcf$T_TU_1[row_n])
        }
      }else if(tqss_nt == 2){
        if(ref_allele == "A"){
          ref_count <- as.numeric(expanded.vcf$T_AU_2[row_n])
        }else if(ref_allele == "C"){
          ref_count <- as.numeric(expanded.vcf$T_CU_2[row_n])
        }else if(ref_allele == "G"){
          ref_count <- as.numeric(expanded.vcf$T_GU_2[row_n])
        }else if(ref_allele == "T"){
          ref_count <- as.numeric(expanded.vcf$T_TU_2[row_n])
        }
        if(alt_allele == "A"){
          alt_count <- as.numeric(expanded.vcf$T_AU_2[row_n])
        }else if(alt_allele == "C"){
          alt_count <- as.numeric(expanded.vcf$T_CU_2[row_n])
        }else if(alt_allele == "G"){
          alt_count <- as.numeric(expanded.vcf$T_GU_2[row_n])
        }else if(alt_allele == "T"){
          alt_count <- as.numeric(expanded.vcf$T_TU_2[row_n])
        }
      }

      total_depth <- ref_count + alt_count

      if(total_depth <= 0){
        VAFs <- c(VAFs, NA)
        next
      }

      VAF <- alt_count/total_depth
      VAFs <- c(VAFs, VAF)

    }
    return(VAFs)
  }

  somatic.snv.vcf.XInfo.XNormFormat.XTumFormat <- expand_strelka_vcf(somatic.snv.vcf)
  somatic.snv.vcf.XInfo.XNormFormat.XTumFormat.alt_filtered1 <- somatic.snv.vcf.XInfo.XNormFormat.XTumFormat[!seq(1,nrow(somatic.snv.vcf.XInfo.XNormFormat.XTumFormat)) %in% grep(",",somatic.snv.vcf.XInfo.XNormFormat.XTumFormat$ALT),]
  snv.filtered <- somatic.snv.vcf.XInfo.XNormFormat.XTumFormat.alt_filtered1[somatic.snv.vcf.XInfo.XNormFormat.XTumFormat.alt_filtered1$ALT != ".",]

  normal.pass.alt_vaf <- calculate_alt_vafs_in_normal(snv.filtered)
  tumour.pass.alt.vaf <- calculate_alt_vafs_in_tumour(snv.filtered)
  snv.filtered.alt_vaf <- cbind.data.frame(snv.filtered,vaf_norm=normal.pass.alt_vaf,vaf_tumour=tumour.pass.alt.vaf)
  snv.filtered.alt_vaf
}

Alternative Parsers

These are alternative parsers that attempt to perform the parsing in a more tidy fashion (using Hadley Wickam’s packages like dplyr and tidyr).

parse_vcf_alt1 <- function(x) {
    vcf <- readr::read_tsv(x, comment = "##")
    vcf <- dplyr::rename(vcf, CHROM = `#CHROM`)
    vcf <- dplyr::mutate(vcf, ALT = strsplit(ALT, ",") %>% purrr::map_chr(1),
                         info = parse_info(INFO),
                         normal = parse_format(FORMAT, NORMAL),
                         tumour = parse_format(FORMAT, TUMOR))
    vcf <- tidyr::unnest(vcf, info)
    vcf <- tidyr::unnest(vcf, N = normal, T = tumour, .sep = "_")
    vcf <- purrr::lmap_at(vcf, dplyr::matches("[TN]_[ACTG]U", vars = names(vcf)), split_tiers)
    vcf <- readr::type_convert(vcf)
    vcf <- tidyr::gather(vcf, allele, count, dplyr::matches("[TN]_[ACTG]U_[12]"))
    vcf <- tidyr::separate(vcf, allele, c("sample", "base", "tier"), sep = "[_U]+", remove = FALSE)
    vcf <- dplyr::group_by(vcf, CHROM, POS, REF, ALT)
    vcf <- dplyr::mutate(vcf, T_REF_COUNT = count[REF == base & sample == "T" & tier == TQSS_NT],
                         T_ALT_COUNT = count[ALT == base & sample == "T" & tier == TQSS_NT],
                         N_REF_COUNT = count[REF == base & sample == "N" & tier == TQSS_NT],
                         N_ALT_COUNT = count[ALT == base & sample == "N" & tier == TQSS_NT],
                         T_VAF = T_ALT_COUNT / (T_ALT_COUNT + T_REF_COUNT),
                         N_VAF = N_ALT_COUNT / (N_ALT_COUNT + N_REF_COUNT))
    vcf <- dplyr::select(vcf, -sample, -base, -tier)
    vcf <- tidyr::spread(vcf, allele, count)  # Takes up most of the time
    vcf
}

parse_vcf_alt2 <- function(x) {
    vcf <- readr::read_tsv(x, comment = "##")
    vcf <- dplyr::rename(vcf, CHROM = `#CHROM`)
    vcf <- dplyr::mutate(vcf, ALT = purrr::map_chr(strsplit(ALT, ","), 1),
                         info = parse_info(INFO),
                         normal = parse_format(FORMAT, NORMAL),
                         tumour = parse_format(FORMAT, TUMOR))
    vcf <- tidyr::unnest(vcf, info)
    vcf <- tidyr::unnest(vcf, N = normal, T = tumour, .sep = "_")
    vcf <- purrr::lmap_at(vcf, dplyr::matches("[TN]_[ACTG]U", vars = names(vcf)), split_tiers)
    vcf <- readr::type_convert(vcf)
    vcf <- purrr::by_row(vcf, calc_counts)
    vcf <- tidyr::unnest(vcf, .out)
    vcf
}

Benchmarking

Here, I’m benchmarking the three parsers to confirm whether the tidy approach is indeed faster.

microbenchmark::microbenchmark(
    original = parse_vcf_orig(x), 
    alt1 = parse_vcf_alt1(x), 
    alt2 = parse_vcf_alt2(x), 
    times = 3
)

Unit: seconds
     expr       min       lq      mean    median       uq       max neval
 original  9.806034 10.55204 12.623073 11.298051 14.03159 16.765134     3
     alt1 11.407053 11.50608 11.557087 11.605101 11.63210 11.659108     3
     alt2  5.176408  5.21499  5.332022  5.253572  5.40983  5.566088     3