library(tidyverse)
library(rtracklayer)
library(GenomicRanges)
library(BSgenome)
library(VariantAnnotation)
grch37 <- "BSgenome.Hsapiens.1000genomes.hs37d5"
hg005_bedfile <- "data/HG005_GRCh37_highconf_CG-IllFB-IllGATKHC-Ion-SOLID_CHROM1-22_v.3.3.2_highconf.bed"
hg005_bed <- import(hg005_bedfile, genome = grch37)
hg006_bedfile <- "data/HG006_GIAB_GRCh37_highconf_CG-IllFB-IllSNT-10X_CHROM1-22_v.3.3.2_highconf.bed"
hg006_bed <- import(hg006_bedfile, genome = grch37)
hg007_bedfile <- "data/HG007_GIAB_GRCh37_highconf_CG-IllFB-IllSNT-10X_CHROM1-22_v.3.3.2_highconf.bed"
hg007_bed <- import(hg007_bedfile, genome = grch37)
hg005_vcffile <- "data/HG005_GRCh37_highconf_CG-IllFB-IllGATKHC-Ion-SOLID_CHROM1-22_v.3.3.2_highconf.vcf.gz"
hg005_vcf <- readVcf(hg005_vcffile, genome = grch37)
hg006_vcffile <- "data/HG006_GIAB_GRCh37_highconf_CG-IllFB-IllSNT-10X_CHROM1-22_v.3.3.2_highconf.vcf.gz"
hg006_vcf <- readVcf(hg006_vcffile, genome = grch37)
hg007_vcffile <- "data/HG007_GIAB_GRCh37_highconf_CG-IllFB-IllSNT-10X_CHROM1-22_v.3.3.2_highconf.vcf.gz"
hg007_vcf <- readVcf(hg007_vcffile, genome = grch37)
get_alpha_freq <- function(i){
BSgenome.Hsapiens.1000genomes.hs37d5[[i]] %>%
alphabetFrequency() %>%
as.data.frame()
}
alpha_freq_df <- as.list(1:22) %>%
map_dfc(get_alpha_freq)
colnames(alpha_freq_df) <- paste0("chr",1:22)
alpha_freq_df <- alpha_freq_df %>%
filter(chr1 != 0) %>%
add_column(base = c("A","C","G","T","N"))
Chromosome 1-22 total length and number of non-N bases
chromosome_lengths <- alpha_freq_df %>%
gather(key = "chrom", value = "nbases", -base) %>%
group_by(chrom) %>%
mutate(base_type = if_else(base == "N", "N", "non_N")) %>%
group_by(chrom, base_type) %>%
summarise(n_bases = sum(nbases)) %>%
spread(base_type, n_bases) %>%
mutate(len = N + non_N)
chromosome_lengths <- data_frame(chrom = "genome",
N = sum(chromosome_lengths$N),
non_N = sum(chromosome_lengths$non_N),
len = sum(chromosome_lengths$len)) %>%
bind_rows(chromosome_lengths)
get_cov_df <- function(bed){
cov <- coverage(bed)[1:22]
tablist <- List(lapply(cov, table))
mcols(tablist)$len <- lengths(cov, use.names = FALSE)
covhist <- stack(tablist, "seqnames", "count", "coverage")
margin <- aggregate(covhist, ~coverage,
count = sum(NumericList(count)))[-1L]
margin <- DataFrame(seqnames = Rle("genome"),
margin,
len = sum(as.numeric(lengths(cov))))
covhist <- rbind(covhist, margin)
ans <- within(covhist, fraction <- count/len)
as.data.frame(ans) %>%
filter(seqnames %in% c(1:22, "genome"),
coverage == 1)
}
cov_df <- list(HG005 = hg005_bed,
HG006 = hg006_bed,
HG007 = hg007_bed) %>%
map_dfr(get_cov_df, .id = "hgref")
cov_df <- cov_df %>%
mutate(seqnames = as.character(seqnames)) %>%
mutate(chrom = if_else( seqnames != "genome",
paste0("chr", seqnames),
as.character(seqnames))) %>%
dplyr::select(hgref, chrom, count, len, fraction)
cov_df <- cov_df %>%
left_join(chromosome_lengths) %>%
mutate(frac_non_N = count/non_N)
## Joining, by = c("chrom", "len")
0 used to indicate full genome.
cov_df %>%
ggplot() +
geom_bar(aes(x = chrom, y = fraction, fill = hgref),
width = 0.4, position = "dodge", stat = "identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = -45, hjust = 0)) +
coord_cartesian(xlim = NULL, ylim = c(0.5, 1))
Fraction of genome, chrom 0, and individual chromosomes covered by high confidence regions.
Need to fix order
cov_df %>%
ggplot() +
geom_bar(aes(x = chrom, y = frac_non_N, fill = hgref),
width = 0.4, position = "dodge", stat = "identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = -45, hjust = 0)) +
coord_cartesian(xlim = NULL, ylim = c(0.75, 1))
Fraction of genome, chrom 0, and individual chromosomes covered by high confidence regions.
Coverage slightly lower for HG006 and HG007. Full genome length 313,745,450 bp (including N’s).
cov_df %>% filter(chrom == "genome") %>%
dplyr::select(hgref, fraction, frac_non_N) %>%
knitr::kable(digits = 4)
| hgref | fraction | frac_non_N |
|---|---|---|
| HG005 | 0.8460 | 0.9079 |
| HG006 | 0.8308 | 0.8916 |
| HG007 | 0.8311 | 0.8919 |
get_var_summary_df <- function(vcf){
## full genome
nvar <- nrow(vcf)
ti_lgl <- isTransition(vcf)
titv <- sum(ti_lgl)/sum(!ti_lgl)
## Metrics for individual chromosomes
for(i in 1:22){
chrom_vcf <- subset(vcf, seqnames(vcf) == i)
nvar <- c(nvar, nrow(chrom_vcf))
ti_lgl <- isTransition(chrom_vcf)
titv <- c(titv, sum(ti_lgl)/sum(!ti_lgl))
}
data.frame(CHROM = 0:22, nvar, titv)
}
variant_df <- list(HG005 = hg005_vcf,
HG006 = hg006_vcf,
HG007 = hg007_vcf) %>%
map_dfr(get_var_summary_df, .id = "hgref")
Number of variants by chromosome
variant_df %>%
ggplot() +
geom_bar(aes(x = CHROM, y = nvar, fill = hgref),
width = 0.4, position = "dodge", stat = "identity") +
theme_bw() +
# theme(axis.text.x = element_text(angle = -45, hjust = 0)) +
scale_y_log10() +
annotation_logticks(base = 10, sides = "rl") +
labs(y = "Number of Variants") +
coord_cartesian(xlim = NULL, ylim = c(1e4, 1e7))
Note values are not correct Ti/Tv for whole genome (CHROM 0) and individual chromosomes.
variant_df %>% dplyr::select(-nvar) %>% spread(hgref, titv)
## CHROM HG005 HG006 HG007
## 1 0 1.477984 1.474381 1.472636
## 2 1 1.520015 1.519046 1.513190
## 3 2 1.472985 1.463891 1.464626
## 4 3 1.449761 1.443425 1.447414
## 5 4 1.448618 1.456218 1.452431
## 6 5 1.433584 1.432964 1.423877
## 7 6 1.483034 1.476101 1.481471
## 8 7 1.468434 1.476196 1.465349
## 9 8 1.403261 1.403403 1.396826
## 10 9 1.446309 1.435683 1.434456
## 11 10 1.525333 1.527103 1.508354
## 12 11 1.469162 1.467356 1.476335
## 13 12 1.504614 1.487735 1.493365
## 14 13 1.469425 1.471442 1.478295
## 15 14 1.473790 1.476800 1.481621
## 16 15 1.487022 1.487725 1.473470
## 17 16 1.405125 1.385997 1.400156
## 18 17 1.603757 1.567680 1.568899
## 19 18 1.508162 1.509984 1.513959
## 20 19 1.585010 1.554630 1.549298
## 21 20 1.593320 1.583853 1.581084
## 22 21 1.426933 1.443853 1.430394
## 23 22 1.644711 1.646919 1.638870
Stats from command rtg vcfstats clean-sp.vcf.gz
Passed Filters : 4233607
Sample Name: child
SNPs : 2744098
MNPs : 0
Insertions : 139976
Deletions : 144263
Indels : 3350
Same as reference : 1201920
Phased Genotypes : 100.0% (4233607/4233607)
SNP Transitions/Transversions: 2.11 (2751892/1302496)
Total Het/Hom ratio : 1.13 (1605906/1425781)
SNP Het/Hom ratio : 1.10 (1434543/1309555)
MNP Het/Hom ratio : - (0/0)
Insertion Het/Hom ratio : 1.25 (77853/62123)
Deletion Het/Hom ratio : 1.68 (90337/53926)
Indel Het/Hom ratio : 17.93 (3173/177)
Insertion/Deletion ratio : 0.97 (139976/144263)
Indel/SNP+MNP ratio : 0.10 (287589/2744098)
Sample Name: dad
SNPs : 2764794
MNPs : 0
Insertions : 162676
Deletions : 168444
Indels : 2507
Same as reference : 1135186
SNP Transitions/Transversions: 2.11 (2770738/1314603)
Total Het/Hom ratio : 1.16 (1664056/1434365)
SNP Het/Hom ratio : 1.09 (1444985/1319809)
MNP Het/Hom ratio : - (0/0)
Insertion Het/Hom ratio : 1.65 (101219/61457)
Deletion Het/Hom ratio : 2.18 (115530/52914)
Indel Het/Hom ratio : 12.55 (2322/185)
Insertion/Deletion ratio : 0.97 (162676/168444)
Indel/SNP+MNP ratio : 0.12 (333627/2764794)
Sample Name: mom
SNPs : 2781532
MNPs : 0
Insertions : 164031
Deletions : 171409
Indels : 2378
Same as reference : 1114257
SNP Transitions/Transversions: 2.11 (2777219/1318918)
Total Het/Hom ratio : 1.19 (1692145/1427205)
SNP Het/Hom ratio : 1.12 (1467677/1313855)
MNP Het/Hom ratio : - (0/0)
Insertion Het/Hom ratio : 1.69 (102954/61077)
Deletion Het/Hom ratio : 2.29 (119308/52101)
Indel Het/Hom ratio : 12.83 (2206/172)
Insertion/Deletion ratio : 0.96 (164031/171409)
Indel/SNP+MNP ratio : 0.12 (337818/2781532)
VCF Eval: rtg vcfeval --ref-overlap --Xobey-phase=true,true --XXcom.rtg.vcf.eval.custom-path-processor=phase-transfer -t rtgsdf -b clean-sp.vcf.gz -c unphased-calls.vcf.gz -o phase-transfer-sp --sample child,INTEGRATION
Threshold True-pos-baseline True-pos-call False-pos False-neg Precision Sensitivity F-measure
----------------------------------------------------------------------------------------------------
117.000 3009782 3009795 564636 21905 0.8420 0.9928 0.9112
None 3031649 3031670 595626 38 0.8358 1.0000 0.9105
For comparison values from AJ Trio. Similar number of TP, FP, but order of magnitude fewer FN with AJ compared to Asian trio. Not sure if this is due to different threshold values.
rtg vcfeval --ref-overlap --Xobey-phase=true,true --XXcom.rtg.vcf.eval.custom-path-processor=phase-transfer -t rtgsdf -b clean-sp.vcf.gz -c unphased-calls.vcf.gz -o phase-transfer-sp --sample child,INTEGRATION
Threshold True-pos-baseline True-pos-call False-pos False-neg Precision Sensitivity F-measure
----------------------------------------------------------------------------------------------------
100.000 3026791 3026854 563160 2805 0.8431 0.9991 0.9145
None 3029515 3029578 566543 81 0.8425 1.0000 0.9145
trio_vcffile <- "data/HG005_HG006_HG007_trioinconsistent.vcf.gz"
trioinconsistent_vcf <- readVcf(trio_vcffile, genome = grch37)
## Warning in .bcfHeaderAsSimpleList(header): duplicate keys in header will be
## forced to unique rownames
Numbers for Integration manuscript
Annotating Variant Type
trio_incon_df <- geno(trioinconsistent_vcf)[["GT"]] %>%
data.frame(stringsAsFactors = FALSE) %>%
rownames_to_column(var = "vcf_rowname") %>%
# add_column(indel_single = isIndel(trioinconsistent_vcf, singleAltOnly = TRUE)) %>%
add_column(indel = isIndel(trioinconsistent_vcf, singleAltOnly = FALSE)) %>%
# add_column(insertion = isInsertion(trioinconsistent_vcf)) %>%
# add_column(delection = isDeletion(trioinconsistent_vcf)) %>%
add_column(delins = isDelins(trioinconsistent_vcf)) %>%
add_column(snp = isSNV(trioinconsistent_vcf, singleAltOnly = FALSE)) %>%
add_column(substitution = isSubstitution(trioinconsistent_vcf, singleAltOnly = FALSE))
Total Violations
length(trioinconsistent_vcf)
## [1] 425
Indels
sum(isIndel(trioinconsistent_vcf, singleAltOnly = FALSE))
## [1] 216
SNPs
sum(isSNV(trioinconsistent_vcf))
## [1] 208
One position not classified as a SNV or Indel delins - deletion followed by an insertion
trio_incon_df %>% filter(snp == FALSE, indel == FALSE)
## vcf_rowname child dad mom indel delins snp substitution
## 1 15:49763681_AAAT/AAATAAT 0|0 1/1 0/0 FALSE TRUE FALSE FALSE
Likely cell-line or germline de novo mutations - son heterozygous Total (SNP, Indels)
denovo_son <- geno(trioinconsistent_vcf)[["GT"]] %>%
data.frame(stringsAsFactors = FALSE) %>%
rownames_to_column(var = "vcf_rowname") %>%
left_join(trio_incon_df) %>%
filter(dad == "0/0", mom == "0/0", child != "1|1")
## Joining, by = c("vcf_rowname", "child", "dad", "mom")
Total
nrow(denovo_son)
## [1] 146
Number of SNP and Indels
denovo_son %>%
dplyr::select(vcf_rowname, snp, indel) %>%
gather("var_type","var_lgl", -vcf_rowname) %>%
filter(var_lgl == TRUE) %>%
group_by(var_type) %>%
summarise(count = n())
## # A tibble: 2 x 2
## var_type count
## <chr> <int>
## 1 indel 43
## 2 snp 103
denovo_son_var <- rownames(trioinconsistent_vcf) %in% denovo_son$vcf_rowname
trioinconsistent_vcf[denovo_son_var,] %>%
writeVcf(filename = "data/AsianTrio_inconsistent_denovo_son.vcf.gz",
index = TRUE)
rowRanges(trioinconsistent_vcf) %>% as.data.frame() %>%
dplyr::select(-seqnames, -strand, -paramRangeID, -QUAL, -FILTER) %>%
DT::datatable()
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html