library(tidyverse)
library(rtracklayer)
library(GenomicRanges)
library(BSgenome)
library(VariantAnnotation)
grch37 <- "BSgenome.Hsapiens.1000genomes.hs37d5"
hg005_bedfile <- "data/HG005_GRCh37_highconf_CG-IllFB-IllGATKHC-Ion-SOLID_CHROM1-22_v.3.3.2_highconf.bed"
hg005_bed <- import(hg005_bedfile, genome = grch37)
hg006_bedfile <- "data/HG006_GIAB_GRCh37_highconf_CG-IllFB-IllSNT-10X_CHROM1-22_v.3.3.2_highconf.bed"
hg006_bed <- import(hg006_bedfile, genome = grch37)
hg007_bedfile <- "data/HG007_GIAB_GRCh37_highconf_CG-IllFB-IllSNT-10X_CHROM1-22_v.3.3.2_highconf.bed"
hg007_bed <- import(hg007_bedfile, genome = grch37)

hg005_vcffile <- "data/HG005_GRCh37_highconf_CG-IllFB-IllGATKHC-Ion-SOLID_CHROM1-22_v.3.3.2_highconf.vcf.gz"
hg005_vcf <- readVcf(hg005_vcffile, genome = grch37)
hg006_vcffile <- "data/HG006_GIAB_GRCh37_highconf_CG-IllFB-IllSNT-10X_CHROM1-22_v.3.3.2_highconf.vcf.gz"
hg006_vcf <- readVcf(hg006_vcffile, genome = grch37)
hg007_vcffile <- "data/HG007_GIAB_GRCh37_highconf_CG-IllFB-IllSNT-10X_CHROM1-22_v.3.3.2_highconf.vcf.gz"
hg007_vcf <- readVcf(hg007_vcffile, genome = grch37)
get_alpha_freq <- function(i){
    BSgenome.Hsapiens.1000genomes.hs37d5[[i]] %>% 
        alphabetFrequency() %>% 
        as.data.frame()
}
alpha_freq_df <- as.list(1:22) %>% 
    map_dfc(get_alpha_freq)

colnames(alpha_freq_df) <- paste0("chr",1:22)

alpha_freq_df <- alpha_freq_df %>% 
    filter(chr1 != 0) %>% 
    add_column(base = c("A","C","G","T","N"))

Chromosome 1-22 total length and number of non-N bases

chromosome_lengths <- alpha_freq_df %>% 
    gather(key = "chrom", value = "nbases", -base) %>% 
    group_by(chrom) %>% 
    mutate(base_type = if_else(base == "N", "N", "non_N")) %>% 
    group_by(chrom, base_type) %>% 
    summarise(n_bases = sum(nbases)) %>% 
    spread(base_type, n_bases) %>% 
    mutate(len = N + non_N)

chromosome_lengths <- data_frame(chrom = "genome", 
                                 N = sum(chromosome_lengths$N),
                                 non_N = sum(chromosome_lengths$non_N),
                                 len = sum(chromosome_lengths$len)) %>% 
    bind_rows(chromosome_lengths)
get_cov_df <- function(bed){
    cov <- coverage(bed)[1:22]
    tablist <- List(lapply(cov, table))
    mcols(tablist)$len <- lengths(cov, use.names = FALSE)
    covhist <- stack(tablist, "seqnames", "count", "coverage")
    margin <- aggregate(covhist, ~coverage, 
                        count = sum(NumericList(count)))[-1L]
    margin <- DataFrame(seqnames = Rle("genome"), 
                        margin, 
                        len = sum(as.numeric(lengths(cov))))
    covhist <- rbind(covhist, margin)
    ans <- within(covhist, fraction <- count/len)
    
    as.data.frame(ans) %>% 
        filter(seqnames %in% c(1:22, "genome"), 
               coverage == 1)
}
cov_df <- list(HG005 = hg005_bed, 
               HG006 = hg006_bed, 
               HG007 = hg007_bed) %>% 
    map_dfr(get_cov_df, .id = "hgref")

cov_df <- cov_df %>% 
    mutate(seqnames = as.character(seqnames)) %>%
    mutate(chrom = if_else( seqnames != "genome", 
                            paste0("chr", seqnames), 
                            as.character(seqnames))) %>% 
    dplyr::select(hgref, chrom, count, len, fraction)
cov_df <- cov_df %>% 
    left_join(chromosome_lengths) %>% 
    mutate(frac_non_N = count/non_N)
## Joining, by = c("chrom", "len")

0 used to indicate full genome.

cov_df %>% 
    ggplot() + 
    geom_bar(aes(x = chrom, y = fraction, fill = hgref), 
             width = 0.4, position = "dodge", stat = "identity") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = -45, hjust = 0)) +
    coord_cartesian(xlim = NULL, ylim = c(0.5, 1))
Fraction of genome, chrom 0, and individual chromosomes covered by high confidence regions.

Fraction of genome, chrom 0, and individual chromosomes covered by high confidence regions.

Need to fix order

cov_df %>% 
    ggplot() + 
    geom_bar(aes(x = chrom, y = frac_non_N, fill = hgref), 
             width = 0.4, position = "dodge", stat = "identity") + 
    theme_bw() + 
    theme(axis.text.x = element_text(angle = -45, hjust = 0)) +
    coord_cartesian(xlim = NULL, ylim = c(0.75, 1))
Fraction of genome, chrom 0, and individual chromosomes covered by high confidence regions.

Fraction of genome, chrom 0, and individual chromosomes covered by high confidence regions.

Coverage slightly lower for HG006 and HG007. Full genome length 313,745,450 bp (including N’s).

cov_df %>% filter(chrom == "genome") %>%
    dplyr::select(hgref, fraction, frac_non_N) %>% 
    knitr::kable(digits = 4)
hgref fraction frac_non_N
HG005 0.8460 0.9079
HG006 0.8308 0.8916
HG007 0.8311 0.8919

Variant Summaries

get_var_summary_df <- function(vcf){
    ## full genome
    nvar <- nrow(vcf)
    ti_lgl <- isTransition(vcf)
    titv <- sum(ti_lgl)/sum(!ti_lgl)
    
    ## Metrics for individual chromosomes
    for(i in 1:22){
        chrom_vcf <- subset(vcf, seqnames(vcf) == i)
        nvar <- c(nvar, nrow(chrom_vcf))
        ti_lgl <- isTransition(chrom_vcf)
        titv <- c(titv, sum(ti_lgl)/sum(!ti_lgl))
    }
    data.frame(CHROM = 0:22, nvar, titv)
}
variant_df <- list(HG005 = hg005_vcf, 
                   HG006 = hg006_vcf, 
                   HG007 = hg007_vcf) %>% 
    map_dfr(get_var_summary_df, .id = "hgref") 

Number of variants by chromosome

variant_df %>% 
    ggplot() + 
    geom_bar(aes(x = CHROM, y = nvar, fill = hgref), 
             width = 0.4, position = "dodge", stat = "identity") + 
    theme_bw() + 
    # theme(axis.text.x = element_text(angle = -45, hjust = 0)) + 
    scale_y_log10() + 
    annotation_logticks(base = 10, sides = "rl") +
    labs(y = "Number of Variants") + 
    coord_cartesian(xlim = NULL, ylim = c(1e4, 1e7))

Note values are not correct Ti/Tv for whole genome (CHROM 0) and individual chromosomes.

variant_df %>% dplyr::select(-nvar) %>% spread(hgref, titv)
##    CHROM    HG005    HG006    HG007
## 1      0 1.477984 1.474381 1.472636
## 2      1 1.520015 1.519046 1.513190
## 3      2 1.472985 1.463891 1.464626
## 4      3 1.449761 1.443425 1.447414
## 5      4 1.448618 1.456218 1.452431
## 6      5 1.433584 1.432964 1.423877
## 7      6 1.483034 1.476101 1.481471
## 8      7 1.468434 1.476196 1.465349
## 9      8 1.403261 1.403403 1.396826
## 10     9 1.446309 1.435683 1.434456
## 11    10 1.525333 1.527103 1.508354
## 12    11 1.469162 1.467356 1.476335
## 13    12 1.504614 1.487735 1.493365
## 14    13 1.469425 1.471442 1.478295
## 15    14 1.473790 1.476800 1.481621
## 16    15 1.487022 1.487725 1.473470
## 17    16 1.405125 1.385997 1.400156
## 18    17 1.603757 1.567680 1.568899
## 19    18 1.508162 1.509984 1.513959
## 20    19 1.585010 1.554630 1.549298
## 21    20 1.593320 1.583853 1.581084
## 22    21 1.426933 1.443853 1.430394
## 23    22 1.644711 1.646919 1.638870

Stats from Trio Harmonizer

Stats from command rtg vcfstats clean-sp.vcf.gz

Passed Filters               : 4233607
Sample Name: child
SNPs                         : 2744098
MNPs                         : 0
Insertions                   : 139976
Deletions                    : 144263
Indels                       : 3350
Same as reference            : 1201920
Phased Genotypes             : 100.0% (4233607/4233607)
SNP Transitions/Transversions: 2.11 (2751892/1302496)
Total Het/Hom ratio          : 1.13 (1605906/1425781)
SNP Het/Hom ratio            : 1.10 (1434543/1309555)
MNP Het/Hom ratio            : - (0/0)
Insertion Het/Hom ratio      : 1.25 (77853/62123)
Deletion Het/Hom ratio       : 1.68 (90337/53926)
Indel Het/Hom ratio          : 17.93 (3173/177)
Insertion/Deletion ratio     : 0.97 (139976/144263)
Indel/SNP+MNP ratio          : 0.10 (287589/2744098)
Sample Name: dad
SNPs                         : 2764794
MNPs                         : 0
Insertions                   : 162676
Deletions                    : 168444
Indels                       : 2507
Same as reference            : 1135186
SNP Transitions/Transversions: 2.11 (2770738/1314603)
Total Het/Hom ratio          : 1.16 (1664056/1434365)
SNP Het/Hom ratio            : 1.09 (1444985/1319809)
MNP Het/Hom ratio            : - (0/0)
Insertion Het/Hom ratio      : 1.65 (101219/61457)
Deletion Het/Hom ratio       : 2.18 (115530/52914)
Indel Het/Hom ratio          : 12.55 (2322/185)
Insertion/Deletion ratio     : 0.97 (162676/168444)
Indel/SNP+MNP ratio          : 0.12 (333627/2764794)
Sample Name: mom
SNPs                         : 2781532
MNPs                         : 0
Insertions                   : 164031
Deletions                    : 171409
Indels                       : 2378
Same as reference            : 1114257
SNP Transitions/Transversions: 2.11 (2777219/1318918)
Total Het/Hom ratio          : 1.19 (1692145/1427205)
SNP Het/Hom ratio            : 1.12 (1467677/1313855)
MNP Het/Hom ratio            : - (0/0)
Insertion Het/Hom ratio      : 1.69 (102954/61077)
Deletion Het/Hom ratio       : 2.29 (119308/52101)
Indel Het/Hom ratio          : 12.83 (2206/172)
Insertion/Deletion ratio     : 0.96 (164031/171409)
Indel/SNP+MNP ratio          : 0.12 (337818/2781532)

VCF Eval: rtg vcfeval --ref-overlap --Xobey-phase=true,true --XXcom.rtg.vcf.eval.custom-path-processor=phase-transfer -t rtgsdf -b clean-sp.vcf.gz -c unphased-calls.vcf.gz -o phase-transfer-sp --sample child,INTEGRATION

Threshold  True-pos-baseline  True-pos-call  False-pos  False-neg  Precision  Sensitivity  F-measure
----------------------------------------------------------------------------------------------------
  117.000            3009782        3009795     564636      21905     0.8420       0.9928     0.9112
     None            3031649        3031670     595626         38     0.8358       1.0000     0.9105

For comparison values from AJ Trio. Similar number of TP, FP, but order of magnitude fewer FN with AJ compared to Asian trio. Not sure if this is due to different threshold values.

rtg vcfeval --ref-overlap --Xobey-phase=true,true --XXcom.rtg.vcf.eval.custom-path-processor=phase-transfer -t rtgsdf -b clean-sp.vcf.gz -c unphased-calls.vcf.gz -o phase-transfer-sp --sample child,INTEGRATION

Threshold  True-pos-baseline  True-pos-call  False-pos  False-neg  Precision  Sensitivity  F-measure
----------------------------------------------------------------------------------------------------
  100.000            3026791        3026854     563160       2805     0.8431       0.9991     0.9145
     None            3029515        3029578     566543         81     0.8425       1.0000     0.9145

Trio Inconsistencies

trio_vcffile <- "data/HG005_HG006_HG007_trioinconsistent.vcf.gz"
trioinconsistent_vcf <- readVcf(trio_vcffile, genome = grch37)
## Warning in .bcfHeaderAsSimpleList(header): duplicate keys in header will be
## forced to unique rownames

Numbers for Integration manuscript

Annotating Variant Type

trio_incon_df <- geno(trioinconsistent_vcf)[["GT"]] %>% 
    data.frame(stringsAsFactors = FALSE) %>% 
    rownames_to_column(var = "vcf_rowname") %>% 
    # add_column(indel_single = isIndel(trioinconsistent_vcf, singleAltOnly = TRUE)) %>% 
    add_column(indel = isIndel(trioinconsistent_vcf, singleAltOnly = FALSE)) %>% 
    # add_column(insertion = isInsertion(trioinconsistent_vcf)) %>% 
    # add_column(delection = isDeletion(trioinconsistent_vcf)) %>% 
    add_column(delins = isDelins(trioinconsistent_vcf)) %>% 
    add_column(snp = isSNV(trioinconsistent_vcf, singleAltOnly = FALSE)) %>% 
    add_column(substitution = isSubstitution(trioinconsistent_vcf, singleAltOnly = FALSE))

Total Violations

length(trioinconsistent_vcf)
## [1] 425

Indels

sum(isIndel(trioinconsistent_vcf, singleAltOnly = FALSE))
## [1] 216

SNPs

sum(isSNV(trioinconsistent_vcf))
## [1] 208

One position not classified as a SNV or Indel delins - deletion followed by an insertion

trio_incon_df  %>% filter(snp == FALSE, indel == FALSE)
##                vcf_rowname child dad mom indel delins   snp substitution
## 1 15:49763681_AAAT/AAATAAT   0|0 1/1 0/0 FALSE   TRUE FALSE        FALSE

Likely cell-line or germline de novo mutations - son heterozygous Total (SNP, Indels)

denovo_son <- geno(trioinconsistent_vcf)[["GT"]] %>% 
    data.frame(stringsAsFactors = FALSE) %>% 
    rownames_to_column(var = "vcf_rowname") %>% 
    left_join(trio_incon_df) %>% 
    filter(dad == "0/0", mom == "0/0", child != "1|1")
## Joining, by = c("vcf_rowname", "child", "dad", "mom")

Total

nrow(denovo_son)
## [1] 146

Number of SNP and Indels

denovo_son %>% 
    dplyr::select(vcf_rowname, snp, indel) %>%
    gather("var_type","var_lgl", -vcf_rowname) %>% 
    filter(var_lgl == TRUE) %>% 
    group_by(var_type) %>% 
    summarise(count = n())
## # A tibble: 2 x 2
##   var_type count
##   <chr>    <int>
## 1 indel       43
## 2 snp        103
denovo_son_var <- rownames(trioinconsistent_vcf) %in% denovo_son$vcf_rowname
trioinconsistent_vcf[denovo_son_var,] %>% 
    writeVcf(filename = "data/AsianTrio_inconsistent_denovo_son.vcf.gz",
             index = TRUE)
rowRanges(trioinconsistent_vcf) %>% as.data.frame() %>% 
    dplyr::select(-seqnames, -strand, -paramRangeID, -QUAL, -FILTER) %>% 
    DT::datatable()
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html