A number of false positive and negative sites were randomly sampled to further evaluate the potential for using CCS callsets to correct errors in benchmark sets. A total of 60 variants were randomly sampled for manual curation.
Along with 5 SNP and 5 Indel FP allele match errors.
Target categories - lowcmp_AllRepeats_gt95identity_slop5 and lowcmp_SimpleRepeat_imperfecthomopolymer_gt10_slop5
Target categories defined to include most homopolymers as based on our preliminary analysis these were the largest source of errors in the variant callsets.
Estimating number of how many extra variants and regions CCS might help add to the GIAB benchmark.
Used CallableLoci - see scripts/ccs_callableLoci.sh
Subtracted stratifications for homopolymers and excluding HG002 SVs and segmental duplications (superdups) then compare to benchmark regions - see scripts/calc_extend_bench.sh.
Calculate (non-N) genome coverage for resulting regions.
This estimate represents an upper limit for what how much we expect the CCS data to expand the current benchmark regions.
Example LINE (IGV screenshot) and an estimate of total count.
## Loading data
hap_list <- list(
GATK4 = "data/happy_output/results/result_1",
DeepVar = "data/happy_output/results/result_2",
DeepVarHap = "data/happy_output_deepVarHap/results/result_1",
GATK4_retyped = "data/happy_output_jebler_retyped/results/result_1",
DeepVar_retyped = "data/happy_output_jebler_retyped/results/result_2") %>%
map(read_happy)
## Creating a tidy data frame
extend_df <- hap_list %>%
map("extended") %>%
bind_rows(.id = "query_method")
ext_trim_df <- extend_df %>%
## Excluding non-HG002 genome specific stratifications
filter(!str_detect(Subset, "HG00[1,3,4,5]")) %>%
## excluding strat with < 1000 obs - lack of statistical power
filter(Subset.Size > 1000) %>%
## Subset must have at least one variant in region
filter(Subset.IS_CONF.Size > 0)
benchmark_metrics_tbl <- ext_trim_df %>%
filter(Subtype == "*") %>%
filter(Subset %in% "*") %>%
filter(Filter == "PASS") %>%
rename(Callset = query_method,
Recall = METRIC.Recall,
Precision = METRIC.Precision,
F1 = METRIC.F1_Score) %>%
dplyr::select(Callset, Type, Recall, Precision, F1) %>%
arrange(Type, -F1)
write_csv(benchmark_metrics_tbl, "results/benchmark_metrics_tbl.csv")
Metrics for high level comparison
benchmark_metrics_tbl
## Do we have all difficult regions not in HG002allgenomespecific
high_level_strat <- c("*",
"alldifficultregions",
"notinalldifficultregions")
bench_overview_df <- ext_trim_df %>%
filter(Subtype == "*") %>%
filter(Filter == "PASS") %>%
filter(Subset %in% high_level_strat) %>%
dplyr::select(Type, Subset, query_method, contains("METRIC"), Subset.Size) %>%
gather("Metric","Value", -Type, -Subset,
-query_method, -Subset.Size) %>%
mutate(Metric = str_remove(Metric, "METRIC.")) %>%
mutate(Metric = if_else(Metric != "Frac_NA", paste("1 -", Metric), Metric),
Value = if_else(Metric != "Frac_NA", 1 - Value, Value)) %>%
filter(Metric != "Frac_NA") %>%
mutate(Subset = fct_recode(Subset, "All" = "*",
"Diff" = "alldifficultregions",
"Not Diff" = "notinalldifficultregions"))
bench_overview_df %>%
ggplot(aes(x = Subset, y = Value,
fill = query_method, group=query_method)) +
geom_linerange(aes(ymin = 0.00001, ymax = Value),
position=position_dodge(width=0.5)) +
geom_point(shape = 21, color = "grey40", position=position_dodge(width=0.5)) +
scale_y_log10() +
facet_grid(Type~Metric, scales = "free") +
theme_bw() +
theme(legend.position = "bottom") +
annotation_logticks(sides = "l") +
labs(fill = "Callset")
Figure 4.1: Benchmarking performance metrics for CCS variant callsets for HG002 benchmark regions, Diff- all difficult regions, and Not Diff - not in difficult regions.
ggsave("results/benchmark_highlevel_strats.png",
dpi = 600,width = 6, height = 3)
TODO * Stratification indel numbers for homopolymers > 2bp - JZ will provide code and input files for analysis - for use in estimating the percentage of discordant indels in homopolymer runs
Overall high precision and recall for SNPs, and indels not in difficult regions.
stratifications <- c("*",
"alldifficultregions",
"lowcmp_AllRepeats_gt95identity_slop5",
"lowcmp_SimpleRepeat_imperfecthomopolymer_gt10_slop5",
# "lowcmp_SimpleRepeat_homopolymer_6to10",
# "lowcmp_SimpleRepeat_homopolymer_gt10",
# "map_l250_m0_e0",
"notinalldifficultregions",
"notinlowcmp_AllRepeats_gt95identity_slop5")
bench_strat_df <- ext_trim_df %>%
filter(Subtype == "*", query_method == "DeepVar") %>%
filter(Filter == "PASS") %>%
filter(Subset %in% stratifications) %>%
dplyr::select(Type, Subset, query_method,
contains("METRIC"), Subset.Size) %>%
gather("Metric","Value", -Type, -Subset,
-query_method, -Subset.Size) %>%
mutate(Metric = str_remove(Metric, "METRIC.")) %>%
mutate(Metric = if_else(Metric != "Frac_NA",
paste("1 -", Metric), Metric),
Value = if_else(Metric != "Frac_NA",
1 - Value, Value)) %>%
filter(Metric != "Frac_NA") %>%
## Relevel factors
mutate(Subset = fct_recode(Subset,
"All" = "*",
"Diff" = "alldifficultregions",
"Not Diff" = "notinalldifficultregions",
"All Reps." = "lowcmp_AllRepeats_gt95identity_slop5",
"Not All Reps." = "notinlowcmp_AllRepeats_gt95identity_slop5",
"Imp. Homopolymer" = "lowcmp_SimpleRepeat_imperfecthomopolymer_gt10_slop5" #,
# "Homopolymer 6 - 10 bp" = "lowcmp_SimpleRepeat_homopolymer_6to10",
# "Homopolymer >10 bp" = "lowcmp_SimpleRepeat_homopolymer_gt10",
# "Map 250" = "map_l250_m0_e0"
)
)
bench_strat_df %>%
ggplot(aes(x = Subset, y = Value,
fill = Subset,
group=query_method)) +
geom_linerange(aes(ymin = 0.000025, ymax = Value),
position=position_dodge(width=0.5)) +
geom_point(shape = 21, color = "grey20", position=position_dodge(width=0.5)) +
scale_y_log10() +
scale_fill_brewer(type = "qual", palette = 2) +
facet_grid(Type~Metric, scales = "free") +
theme_bw() +
theme(legend.position = "bottom",
# axis.text.x = element_text(angle = -45, hjust = 0)
axis.text.x = element_blank(),
axis.title.x = element_blank()) +
annotation_logticks(sides = "l") +
labs(fill = "Stratifications")
Figure 4.2: Performance metrics stratification results.
ggsave("results/benchmark_strats.png",
dpi = 600,width = 8, height = 4)
Based on stratified analysis of the benchmarking results variant caller accuracy was poor for low complexity repeats with greater than 95% similarity (including homopolymers and tandem repeats). Variant caller performance was similar for indels when excluding low complexity repeats with > 95% similarity to when excluding all difficult regions indicating that this stratification is responsible for most of the discrepancies between the HG002 v3.3.2 benchmark callset and the CCS callsets.
ext_trim_df %>%
filter(Subtype == "*", Filter == "PASS", query_method == "DeepVar") %>%
filter(str_detect(Subset,"homopolymer")) %>%
filter(!str_detect(Subset,"_unit")) %>%
filter(Subset.IS_CONF.Size > 0) %>%
mutate(Subset = str_remove(Subset, "lowcmp_SimpleRepeat_")) %>%
mutate(Subset = str_remove(Subset, "_slop5")) %>%
mutate(Subset = str_replace(Subset, "_", " ")) %>%
mutate(Subset = str_replace(Subset, "to", " to ")) %>%
mutate(Subset = str_replace(Subset, "gt", ">")) %>%
dplyr::select(Type, Subset, query_method, contains("METRIC"), Subset.Size) %>%
gather("Metric","Value", -Type, -Subset,
-query_method, -Subset.Size) %>%
mutate(Metric = str_remove(Metric, "METRIC.")) %>%
mutate(Metric = if_else(Metric != "Frac_NA", paste("1 -", Metric), Metric),
Value = if_else(Metric != "Frac_NA", 1 - Value, Value)) %>%
filter(Metric != "Frac_NA") %>%
ggplot() + geom_bar(aes(x = query_method, y = Value, fill = Subset),
color = "grey40", width = 0.5,
position = "dodge", stat = "identity") +
facet_grid(Type~Metric, scales = "free") +
theme_bw() +
theme(legend.position = "bottom",axis.text.x = element_text(angle = -45, hjust = 0)) +
labs(x = "Simple Repeat Type", fill = "Homopolymer Type")
Figure 4.3: Performance in homopolymers.
ext_trim_df %>%
filter(Subtype == "*", query_method == "DeepVar") %>%
filter(Filter == "PASS") %>%
filter(str_detect(Subset,"Human_Full_Genome"),
!str_detect(Subset,"unit")) %>%
filter(Subset.IS_CONF.Size > 0) %>%
mutate(Subset = str_remove(Subset, "lowcmp_Human_Full_Genome_TRDB_hg19_150331_")) %>%
mutate(Subset = str_remove(Subset, "_gt95identity_merged")) %>%
mutate(Subset = str_replace(Subset, "_", " ")) %>%
mutate(Subset = str_replace(Subset, "to", " to ")) %>%
mutate(Subset = str_replace(Subset, "lt", " <")) %>%
mutate(Subset = str_replace(Subset, "gt", " >")) %>%
dplyr::select(Type, Subset, query_method, contains("METRIC"), Subset.Size) %>%
gather("Metric","Value", -Type, -Subset,
-query_method, -Subset.Size) %>%
mutate(Metric = str_remove(Metric, "METRIC.")) %>%
mutate(Metric = if_else(Metric != "Frac_NA", paste("1 -", Metric), Metric),
Value = if_else(Metric != "Frac_NA", 1 - Value, Value)) %>%
filter(Metric != "Frac_NA") %>%
ggplot() + geom_bar(aes(x = Subset, y = Value, fill = query_method),
color = "grey40", width = 0.5,
position = "dodge", stat = "identity") +
facet_grid(Type~Metric, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = -90)) +
labs(x = "Tandem Repeat Type", fill = "Callset")
Figure 4.4: Benchmarking metrics for TRDB.
Impact of re-genotyping on benchmarking results (Table 1, Supplemental Fig R6-1)
## Do we have all difficult regions not in HG002allgenomespecific
high_level_strat <- c("*",
"alldifficultregions",
"notinalldifficultregions")
bench_overview_df <- ext_trim_df %>%
filter(Subtype == "*", query_method %in% c("GATK4","DeepVar", "GATK4_retyped","DeepVar_retyped")) %>%
filter(Filter == "PASS") %>%
filter(Subset %in% high_level_strat) %>%
dplyr::select(Type, Subset, query_method, contains("METRIC"), Subset.Size) %>%
gather("Metric","Value", -Type, -Subset,
-query_method, -Subset.Size) %>%
mutate(Metric = str_remove(Metric, "METRIC.")) %>%
mutate(Metric = if_else(Metric != "Frac_NA", paste("1 -", Metric), Metric),
Value = if_else(Metric != "Frac_NA", 1 - Value, Value)) %>%
filter(Metric != "Frac_NA") %>%
mutate(Subset = fct_recode(Subset, "All" = "*",
"Diff" = "alldifficultregions",
"Not Diff" = "notinalldifficultregions"))
bench_overview_df %>%
ggplot() + geom_col(aes(x = Subset, y = Value, fill = query_method),
color = "grey40", width = 0.5,
position = "dodge") +
scale_y_log10() +
facet_grid(Type~Metric, scales = "free") +
theme_bw() +
theme(legend.position = "bottom") +
labs(fill = "Callset")
Figure 4.5: Impact of re-genotyping using WhatsHap on GATK4 and DeepVariant callset benchmark metrics. Diff- all difficult regions, and Not Diff - not in difficult regions.
strat_of_interest <- c("*","HG002allgenomespecific",
"HG002allgenomespecificandifficult",
"alldifficultregions","decoy",
"lowcmp_AllRepeats_51to200bp_gt95identity_merged_slop5",
"lowcmp_AllRepeats_gt200bp_gt95identity_merged_slop5",
"lowcmp_AllRepeats_gt95identity_slop5",
"lowcmp_AllRepeats_lt51bpTRs_gt95identity_merged_slop5",
"lowcmp_AllRepeats_lt51bp_gt95identity_merged_slop5",
"lowcmp_SimpleRepeat_homopolymer_6to10",
"lowcmp_SimpleRepeat_homopolymer_gt10",
"lowcmp_SimpleRepeat_imperfecthomopolymer_gt10_slop5",
"map_all","map_l250_m0_e0",
"notinHG002allgenomespecificandifficult",
"notinalldifficultregions",
"notinlowcmp_AllRepeats_gt95identity_slop5","segdup")
gg <- ext_trim_df %>%
filter(Type == "INDEL", Subtype == "*", Filter == "PASS") %>%
filter(Subset %in% strat_of_interest) %>%
ggplot() +
geom_point(aes(x = METRIC.Recall,
y = METRIC.Precision,
alpha = log10(Subset.Size),
text = Subset),
shape = 19, stroke = 0.25) +
facet_wrap(~query_method) +
theme_bw() +
labs(x = "Recall", y = "Precision")
plotly::ggplotly(gg)
gg <- ext_trim_df %>%
filter(Type == "SNP", Subtype == "*", Filter == "PASS") %>%
filter(Subset %in% strat_of_interest) %>%
ggplot() +
geom_point(aes(x = METRIC.Recall,
y = METRIC.Precision,
alpha = log10(Subset.Size),
text = Subset),
shape = 19, stroke = 0.25) +
facet_wrap(~query_method) +
theme_bw() +
labs(x = "Recall", y = "Precision")
plotly::ggplotly(gg)
ext_trim_df %>%
filter(Subtype == "*", Filter == "PASS") %>%
filter(Subset %in% strat_of_interest) %>%
dplyr::select(query_method, Type, Subset, contains("METRIC")) %>%
DT::datatable(caption = "The three query methods (variant callsets) performance metrics for high level stratifications and stratifications with know poor performance, e.g. homopolymers.")
ext_trim_df %>%
filter(Type == "INDEL", Filter == "PASS") %>%
filter(Subset %in% strat_of_interest) %>%
filter(TRUTH.TOTAL > 0)%>%
dplyr::select(query_method, Type, Subtype,Subset, contains("METRIC"), TRUTH.TOTAL) %>%
DT::datatable()
Performance metrics are consistent across homopolymer units for homopolymers between 6 and 10 bp but but varies for homopolymers greater than 10 bp.
ext_trim_df %>%
filter(Subtype == "*") %>%
filter(Filter == "PASS") %>%
filter(str_detect(Subset,"homopolymer")) %>%
filter(!str_detect(Subset,"_unit")) %>%
filter(Subset.IS_CONF.Size > 0) %>%
mutate(Subset = str_remove(Subset, "lowcmp_SimpleRepeat_")) %>%
mutate(Subset = str_remove(Subset, "_slop5")) %>%
mutate(Subset = str_replace(Subset, "_", " ")) %>%
mutate(Subset = str_replace(Subset, "to", " to ")) %>%
mutate(Subset = str_replace(Subset, "gt", ">")) %>%
dplyr::select(Type, Subset, query_method, contains("METRIC"), Subset.Size) %>%
gather("Metric","Value", -Type, -Subset,
-query_method, -Subset.Size) %>%
mutate(Metric = str_remove(Metric, "METRIC.")) %>%
mutate(Metric = if_else(Metric != "Frac_NA", paste("1 -", Metric), Metric),
Value = if_else(Metric != "Frac_NA", 1 - Value, Value)) %>%
filter(Metric != "Frac_NA") %>%
ggplot() + geom_bar(aes(x = query_method, y = Value, fill = Subset),
color = "grey40", width = 0.5,
position = "dodge", stat = "identity") +
facet_grid(Type~Metric, scales = "free") +
theme_bw() +
theme(legend.position = "bottom",axis.text.x = element_text(angle = -45, hjust = 0)) +
labs(x = "Simple Repeat Type", fill = "Homopolymer Type")
Figure 5.1: Performance in homopolymers.
Performance varies by homopolymer unit. Similar performance is expected for A and T as observed. For G and C similar performance is not observed, inconsistent with expectation.
ext_trim_df %>%
filter(Subtype == "*") %>%
filter(Filter == "PASS") %>%
filter(str_detect(Subset,"homopolymer")) %>%
filter(str_detect(Subset,"_unit")) %>%
filter(Subset.IS_CONF.Size > 0) %>%
mutate(Subset = str_remove(Subset, "lowcmp_SimpleRepeat_")) %>%
mutate(Subset = str_remove(Subset, "_slop5")) %>%
mutate(Subset = str_replace(Subset, "_", " ")) %>%
mutate(Subset = str_replace(Subset, "to", " to ")) %>%
mutate(Subset = str_replace(Subset, "gt", ">")) %>%
dplyr::select(Type, Subset, query_method, contains("METRIC"), Subset.Size) %>%
gather("Metric","Value", -Type, -Subset,
-query_method, -Subset.Size) %>%
separate(Subset, c("Size", "Unit"), sep = "_") %>%
mutate(Unit = str_remove(Unit, "unit=")) %>%
mutate(Size = str_remove(Size, "homopolymer ")) %>%
mutate(Metric = str_remove(Metric, "METRIC.")) %>%
mutate(Metric = if_else(Metric != "Frac_NA", paste("1 -", Metric), Metric),
Value = if_else(Metric != "Frac_NA", 1 - Value, Value)) %>%
filter(Metric != "Frac_NA") %>%
filter(query_method == "DeepVar") %>%
ggplot() + geom_bar(aes(x = Size, y = Value, fill = Unit),
color = "grey40", width = 0.5,
position = "dodge", stat = "identity") +
facet_grid(Type~Metric, scales = "free") +
theme_bw() +
theme(legend.position = "bottom") +
labs(x = "Simple Repeat Type", fill = "Callset")
ext_trim_df %>%
filter(Subtype == "*") %>%
filter(Filter == "PASS") %>%
filter(str_detect(Subset,"homopolymer")) %>%
filter(str_detect(Subset,"_unit")) %>%
filter(Subset.IS_CONF.Size > 0) %>%
mutate(Subset = str_remove(Subset, "lowcmp_SimpleRepeat_")) %>%
mutate(Subset = str_remove(Subset, "_slop5")) %>%
mutate(Subset = str_replace(Subset, "_", " ")) %>%
mutate(Subset = str_replace(Subset, "to", " to ")) %>%
mutate(Subset = str_replace(Subset, "gt", ">")) %>%
dplyr::select(Type, Subset, query_method, contains("METRIC"), Subset.Size) %>%
gather("Metric","Value", -Type, -Subset,
-query_method, -Subset.Size) %>%
separate(Subset, c("Size", "Unit"), sep = "_") %>%
mutate(Unit = str_remove(Unit, "unit=")) %>%
mutate(Size = str_remove(Size, "homopolymer ")) %>%
mutate(Metric = str_remove(Metric, "METRIC.")) %>%
mutate(Metric = if_else(Metric != "Frac_NA", paste("1 -", Metric), Metric),
Value = if_else(Metric != "Frac_NA", 1 - Value, Value)) %>%
filter(Metric != "Frac_NA") %>%
filter(query_method == "DeepVarHap") %>%
ggplot() + geom_bar(aes(x = Size, y = Value, fill = Unit),
color = "grey40", width = 0.5,
position = "dodge", stat = "identity") +
facet_grid(Type~Metric, scales = "free") +
theme_bw() +
theme(legend.position = "bottom") +
labs(x = "Simple Repeat Type", fill = "Callset")
Figure 5.2: DeepVariant with haplotype informaton performance metrics for homopolymers by unit.
ext_trim_df %>%
filter(Subtype == "*") %>%
filter(Filter == "PASS") %>%
filter(str_detect(Subset,"homopolymer")) %>%
filter(str_detect(Subset,"_unit")) %>%
filter(Subset.IS_CONF.Size > 0) %>%
mutate(Subset = str_remove(Subset, "lowcmp_SimpleRepeat_")) %>%
mutate(Subset = str_remove(Subset, "_slop5")) %>%
mutate(Subset = str_replace(Subset, "_", " ")) %>%
mutate(Subset = str_replace(Subset, "to", " to ")) %>%
mutate(Subset = str_replace(Subset, "gt", ">")) %>%
dplyr::select(Type, Subset, query_method, contains("METRIC"), Subset.Size) %>%
gather("Metric","Value", -Type, -Subset,
-query_method, -Subset.Size) %>%
separate(Subset, c("Size", "Unit"), sep = "_") %>%
mutate(Unit = str_remove(Unit, "unit=")) %>%
mutate(Size = str_remove(Size, "homopolymer ")) %>%
mutate(Metric = str_remove(Metric, "METRIC.")) %>%
mutate(Metric = if_else(Metric != "Frac_NA", paste("1 -", Metric), Metric),
Value = if_else(Metric != "Frac_NA", 1 - Value, Value)) %>%
filter(Metric != "Frac_NA") %>%
filter(query_method == "GATK4") %>%
mutate(Unit = fct_relevel(Unit, c("A","T","C","G"))) %>%
ggplot() + geom_bar(aes(x = Size, y = Value, fill = Unit),
color = "grey40", width = 0.5,
position = "dodge", stat = "identity") +
facet_grid(Type~Metric, scales = "free") +
theme_bw() +
theme(legend.position = "bottom") +
labs(x = "Simple Repeat Type", fill = "Unit")
ext_trim_df %>%
filter(Type == "INDEL") %>%
filter(Filter == "PASS") %>%
filter(!str_detect(Subtype, "C")) %>%
filter(str_detect(Subset,"homopolymer"),
!str_detect(Subset,"unit")) %>%
filter(Subset.IS_CONF.Size > 0) %>%
mutate(Subset = str_remove(Subset, "lowcmp_SimpleRepeat_")) %>%
mutate(Subset = str_remove(Subset, "_slop5")) %>%
mutate(Subset = str_replace(Subset, "_", " ")) %>%
mutate(Subset = str_replace(Subset, "to", " to ")) %>%
mutate(Subset = str_replace(Subset, "gt", ">")) %>%
dplyr::select(Subtype, Subset, query_method, contains("METRIC"), Subset.Size) %>%
gather("Metric","Value", -Subtype, -Subset,
-query_method, -Subset.Size) %>%
mutate(Metric = str_remove(Metric, "METRIC.")) %>%
mutate(Metric = if_else(Metric != "Frac_NA", paste("1 -", Metric), Metric),
Value = if_else(Metric != "Frac_NA", 1 - Value, Value)) %>%
filter(Metric != "Frac_NA", Metric != "1 - F1_Score") %>%
ggplot() + geom_bar(aes(x = Subtype, y = Value, fill = query_method),
color = "grey40", width = 0.5,
position = "dodge", stat = "identity") +
facet_grid(Metric~Subset, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = -90),
legend.position = "bottom") +
labs(x = "Indel Subtype", fill = "Callset")
get_var_df <- function(vcffile){
## Read VCF
vcf <- readVcf(vcffile, genome = "BSgenome.Hsapiens.1000genomes.hs37d5")
## Generate tidy data frame
## Truth table classification
tt_class_df <- geno(vcf)[['BD']] %>%
as.data.frame() %>%
rownames_to_column(var = "variant") %>%
filter(QUERY == "FP" | TRUTH == "FN") %>%
as_tibble()
## Subsetting vcf for FP and FN
fpfn_positions <- str_remove(tt_class_df$variant, "_.*")
vcf_positions <- rownames(vcf) %>% str_remove("_.*")
fpfn_vcf <- vcf[vcf_positions %in% fpfn_positions ]
## Truth table classifications for Fp and Fn positions
tt_fpfn_df <- geno(fpfn_vcf )[['BD']] %>%
as.data.frame() %>%
rownames_to_column(var = "variant") %>%
as_tibble()
## Benchmarking Type
bk_type_df <- geno(fpfn_vcf)[['BK']] %>%
as.data.frame() %>%
rownames_to_column(var = "variant") %>%
dplyr::rename(Q.bk = QUERY, T.bk = TRUTH) %>%
as_tibble() %>%
left_join(tt_class_df, by = c("variant" = "variant"))
## Genotype
gt_type_df <- geno(fpfn_vcf)[['GT']] %>%
as.data.frame() %>%
rownames_to_column(var = "variant") %>%
dplyr::rename(Q.gt = QUERY, T.gt = TRUTH) %>%
as_tibble() %>%
left_join(bk_type_df)
## Variant type
fpfn_df <- geno(fpfn_vcf)[['BVT']] %>%
as.data.frame() %>%
rownames_to_column(var = "variant") %>%
dplyr::rename(Q.Var = QUERY, T.Var = TRUTH) %>%
left_join(gt_type_df)
## Combining into single data frame
fpfn_df %>% add_column(FILTER = rowRanges(fpfn_vcf)$FILTER,
Regions = as.list(info(fpfn_vcf)[['Regions']]))
}
annotate_var_df <- function(var_df){
var_anno_df <- var_df %>%
mutate(allrepeats = map_lgl(Regions, ~("lowcmp_AllRepeats_gt95identity_slop5" %in% .)),
imphomo = map_lgl(Regions, ~("lowcmp_SimpleRepeat_imperfecthomopolymer_gt10_slop5" %in% .)),
target_cat = if_else(allrepeats + imphomo == 0, "non-target","target")) %>%
dplyr::select(-allrepeats, -imphomo)
mutate(var_anno_df,
var_cat = case_when(
(Q.bk == "am" | T.bk == "am") & (Q.Var == "INDEL" | T.Var == "INDEL") ~ "AM.INDEL",
(Q.bk == "am" | T.bk == "am") & (Q.Var == "SNP" | T.Var == "SNP") ~ "AM.SNP",
# Q.bk == "am" | T.bk == "am" ~ "AM",
QUERY == "FP" & Q.Var == "INDEL" ~ "FP.INDEL",
QUERY == "FP" & Q.Var == "SNP" ~ "FP.SNP",
QUERY == "." & T.Var == "INDEL" ~ "FN.INDEL",
QUERY == "." & T.Var == "SNP" ~ "FN.SNP",
TRUE ~ "other")
)
}
## Getting random subset
anno_df <- list(gatk = "data/happy_output/results/result_1.vcf.gz",
deep = "data/happy_output/results/result_2.vcf.gz",
deepHap = "data/happy_output_deepVarHap/results/result_1.vcf.gz") %>%
map(get_var_df) %>%
map_dfr(annotate_var_df, .id = "callset") %>%
# Cleaner variant ids
mutate(CHROM = str_extract(variant, ".*(?=:)"),
POS = str_extract(variant, "(?<=:).*(?=_)"),
ALT = str_extract(variant, "(?<=_).*"))
## Joining, by = "variant"
## Joining, by = "variant"
## Joining, by = "variant"
## Joining, by = "variant"
## Joining, by = "variant"
## Joining, by = "variant"
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
set.seed(531)
var_random_df <- anno_df %>%
dplyr::filter(var_cat != "other") %>%
mutate(CHROM = str_extract(variant, ".*(?=:)")) %>%
mutate(POS = str_extract(variant, "(?<=:).*(?=_)")) %>%
## Random subsetting to include multiple variant groups
group_by(callset, var_cat, target_cat) %>%
sample_n(size = 5) %>%
ungroup() %>%
## Includes ALTs at the same position
dplyr::select(callset, CHROM, POS) %>%
left_join(anno_df)
## Joining, by = c("callset", "CHROM", "POS")
deepvar_random_df <- var_random_df %>%
filter(callset == "deepHap") %>%
mutate(variant = str_remove(variant, ".*_")) %>%
dplyr::rename(VAR = variant) %>%
dplyr::select(-Regions, -ALT) %>%
select(callset, target_cat, var_cat, CHROM, POS, everything())
## Total number of variants in each variant category for target and non-target regions.
anno_df %>%
dplyr::select(callset, CHROM, POS, target_cat, var_cat) %>%
distinct() %>%
filter(callset == "deepHap") %>%
group_by(target_cat, var_cat) %>%
summarise(count = n()) %>%
spread(target_cat, count)
var_region_heatmap_df <- var_random_df %>%
filter(callset == "deepHap") %>%
dplyr::select(variant, var_cat, target_cat, Regions) %>%
mutate(region_df = map(Regions, as_data_frame)) %>%
dplyr::select(-Regions) %>%
unnest()
var_region_heatmap_df %>%
filter(!str_detect(value, "HG00[1,3,4,5]"), value !="CONF") %>%
mutate(CHROM_POS = str_extract(variant, ".*(?=_)")) %>%
mutate(ALT = str_extract(variant, "(?<=_).*")) %>%
ggplot() +
geom_raster(aes(y = value, x = CHROM_POS, fill = target_cat)) +
theme(axis.text.x = element_text(angle = -90)) +
facet_wrap(~var_cat, nrow = 1, scales = "free_x")
### Manual curation spreadsheets
## Already generate files
# tmp_tsv <- tempfile(fileext = ".tsv")
# deepvar_random_df %>%
# add_column(PacBio = "", GIAB = "", Notes = "") %>%
# write_tsv(path = tmp_tsv)
#
# gs_upload(file = tmp_tsv, sheet_title = "hg002-ccs-deepvar-curate-JZ")
# gs_upload(file = tmp_tsv, sheet_title = "hg002-ccs-deepvar-curate-JM")
# gs_upload(file = tmp_tsv, sheet_title = "hg002-ccs-deepvar-curate-NDO")
get_genome <- function(genome){
if (genome == "hs37d5"){
require(BSgenome.Hsapiens.1000genomes.hs37d5)
return(BSgenome.Hsapiens.1000genomes.hs37d5)
} else if (genome == "GRCh38") {
require(BSgenome.Hsapiens.NCBI.GRCh38)
return(BSgenome.Hsapiens.NCBI.GRCh38)
} else {
stop("Genome not `hs37d5` or `GRCh38`")
}
}
get_chrom_sizes <- function(genome = "hs37d5"){
genome_obj <- get_genome(genome)
get_alpha_freq <- function(i){
genome_obj[[i]] %>%
alphabetFrequency() %>%
data.frame()
}
alpha_freq_df <- as.list(1:22) %>%
map_dfc(get_alpha_freq)
colnames(alpha_freq_df) <- paste0("chr",1:22)
alpha_freq_df <- alpha_freq_df %>%
## Removing bases not included in counts and non-standard bases
filter(chr1 >100) %>%
add_column(base = c("A","C","G","T","N"))
chromosome_lengths <- alpha_freq_df %>%
tidyr::gather(key = "chrom", value = "nbases", -base) %>%
group_by(chrom) %>%
mutate(base_type = if_else(base == "N", "N", "non_N")) %>%
group_by(chrom, base_type) %>%
summarise(n_bases = sum(nbases)) %>%
tidyr::spread(base_type, n_bases) %>%
mutate(len = N + non_N) %>%
dplyr::select(-N)
## data frame with total length and number of non-N bases
data_frame(chrom = "genome",
non_N = sum(chromosome_lengths$non_N),
len = sum(chromosome_lengths$len)) %>%
bind_rows(chromosome_lengths)
}
### High confidence coverage ###################################################
### Bed chromosome coverage lengths from bed
get_bed_cov_by_chrom <- function(bed_file){
## Read as tsv
bed_df <- read_tsv(bed_file,
col_names = c("chrom","start","end","info"), col_types = "ciic") %>%
## Compute region size
mutate(region_size = end - start) %>%
## Only looking at Chromosomes 1-22
filter(chrom %in% c(1:22, paste0("chr", 1:22)))
## Compute bases per chromosome
chrom_cov <- bed_df %>%
## Changing to chromosome names to chr
mutate(chrom = paste0("chr", chrom)) %>%
group_by(chrom,info) %>%
summarise(nbases = sum(region_size))
## NBases data frame
data_frame(chrom = "genome",
nbases = NA,
info = "") %>%
bind_rows(chrom_cov)
}
chrom_size_df <- get_chrom_sizes()
## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.
bench_extend_df <- get_bed_cov_by_chrom("data/benchmark_extend/ccs_extended.bed")
Supplemental Figure
bench_extend_df %>%
filter(info == "CALLABLE") %>%
left_join(chrom_size_df) %>%
mutate(extend = nbases/non_N) %>%
select(chrom, nbases, extend) %>%
mutate(chrom = str_remove(chrom, "chr"),
chrom = factor(chrom, levels = 1:22)) %>%
ggplot() + geom_bar(aes(x = chrom, y = extend), stat = "identity") +
theme_bw() +
labs(x = "Chromosome", y = "Proportion Extend")
## Joining, by = "chrom"
Extend is the fraction of additional non-N bases in the genome that may be covered by benchmark set when using CCS DeepVariant with haplotype callset to generate benchmark regions.
bench_extend_df %>%
filter(info == "CALLABLE") %>%
summarise(nbases = sum(nbases)) %>%
mutate(chrom = "genome") %>%
left_join(chrom_size_df) %>%
mutate(extend = nbases/non_N)
## Joining, by = "chrom"
bcftools view -R ccs_extended.bed ../kolesnikov/pacbio-15kb-hapsort-wgs.vcf.gz -o deepvar_extend_bcftools.vcf
SN 0 number of samples: 1
SN 0 number of records: 418649
SN 0 number of no-ALTs: 0
SN 0 number of SNPs: 210184
SN 0 number of MNPs: 0
SN 0 number of indels: 208691
SN 0 number of others: 0
SN 0 number of multiallelic sites: 764
SN 0 number of multiallelic SNP sites: 123
False negatives in LINEs were frequently observed. For example LINE at 21:42288851, where paired end short reads have no evidence but CC and mate-pair do (Fig. ADD REF). Additionally, 10x does for some other phased SNPs in the region. Including PacBio CCS callsets as an additional input for the NIST integration pipeline can potentially correct errors in LINEs. TODO estimate for total number of errors in LINEs
Most variant call errors in CCS callsets were adjacent to homopolymers, TODO SNP estimate with UCI-LCI and INDEL estimate with UCI and LCI (Table ccsErrorEst). TODO relationship between number of AM error, FP, and FN.
## Get JZ manual curation results
mc_jz <- gs_title(x = "hg002-ccs-deepvar-curate-JZ - November 19, 2:38 AM")
## Auto-refreshing stale OAuth token.
## Sheet successfully identified: "hg002-ccs-deepvar-curate-JZ - November 19, 2:38 AM"
mc_df <- gs_read(ss = mc_jz)
## Accessing worksheet titled 'hg002-ccs-deepvar-curate-JZ'.
## Parsed with column specification:
## cols(
## callset = col_character(),
## target_cat = col_character(),
## var_cat = col_character(),
## CHROM = col_double(),
## POS = col_double(),
## VAR = col_character(),
## T.Var = col_character(),
## Q.Var = col_character(),
## T.gt = col_character(),
## Q.gt = col_character(),
## T.bk = col_character(),
## Q.bk = col_character(),
## TRUTH = col_character(),
## QUERY = col_character(),
## FILTER = col_character(),
## PacBio = col_character(),
## GIAB = col_character(),
## Notes = col_character()
## )
mc_error <- mc_df %>%
group_by(target_cat, var_cat, PacBio) %>%
summarise(count = n()) %>%
filter(PacBio != "-") %>%
spread(PacBio, count, fill = 0) %>%
mutate(error_rate = N/(N + Y))
var_counts <- anno_df %>%
filter(callset == "deepHap") %>%
dplyr::select(callset, CHROM, POS, target_cat, var_cat) %>%
distinct() %>%
group_by(callset, target_cat, var_cat) %>%
summarise(count = n())
error_est_df <- var_counts %>%
left_join(mc_error) %>%
filter(var_cat != "other") %>%
mutate(bconf = map(Y, Hmisc::binconf, n = 5),
est = map_dbl(bconf, 1) * count,
lci = map_dbl(bconf, 2) * count,
uci = map_dbl(bconf, 3) * count)
## Joining, by = c("target_cat", "var_cat")
sum(error_est_df$count)
## [1] 18469
error_est_df %>% filter(target_cat == "non-target",
!str_detect(var_cat, "AM")) %>%
.$est %>% sum()
## [1] 2434.4
error_est_df %>% filter(target_cat == "non-target",
!str_detect(var_cat, "AM")) %>% .$lci %>% sum()
## [1] 1312.758
error_est_df %>% filter(target_cat == "non-target",
!str_detect(var_cat, "AM")) %>% .$uci %>% sum()
## [1] 2610.634
2434 (1313 - 2611)
error_est_df %>%
select(-callset, -bconf) %>%
knitr::kable(caption = "Estimated number of variant call errors in the DeepVariant callset with haplotype informed model.")
## Adding missing grouping variables: `callset`
| callset | target_cat | var_cat | count | N | Y | error_rate | est | lci | uci |
|---|---|---|---|---|---|---|---|---|---|
| deepHap | non-target | AM.INDEL | 36 | 3 | 2 | 0.6 | 14.4 | 4.234348 | 27.69393 |
| deepHap | non-target | AM.SNP | 73 | 2 | 3 | 0.4 | 43.8 | 16.842872 | 64.41368 |
| deepHap | non-target | FN.INDEL | 134 | 3 | 2 | 0.6 | 53.6 | 15.761184 | 103.08295 |
| deepHap | non-target | FN.SNP | 338 | 2 | 3 | 0.4 | 202.8 | 77.984807 | 298.24418 |
| deepHap | non-target | FP.INDEL | 165 | 1 | 4 | 0.2 | 132.0 | 61.963214 | 163.30732 |
| deepHap | non-target | FP.SNP | 2046 | 0 | 5 | 0.0 | 2046.0 | 1157.048877 | 2046.00000 |
| deepHap | target | AM.INDEL | 7051 | 5 | 0 | 1.0 | 0.0 | 0.000000 | 3063.53586 |
| deepHap | target | AM.SNP | 308 | 5 | 0 | 1.0 | 0.0 | 0.000000 | 133.82060 |
| deepHap | target | FN.INDEL | 5303 | 5 | 0 | 1.0 | 0.0 | 0.000000 | 2304.06051 |
| deepHap | target | FN.SNP | 140 | 5 | 0 | 1.0 | 0.0 | 0.000000 | 60.82755 |
| deepHap | target | FP.INDEL | 2653 | 4 | 1 | 0.8 | 530.6 | 27.216222 | 1656.70663 |
| deepHap | target | FP.SNP | 222 | 4 | 1 | 0.8 | 44.4 | 2.277422 | 138.63131 |
TODO
gg <- ext_trim_df %>%
filter(!str_detect(Subset, "unit=")) %>%
filter(Subset.IS_CONF.Size > 0) %>%
filter(Subtype == "*", Filter == "PASS") %>%
ggplot() +
geom_point(aes(x = METRIC.Recall,
y = METRIC.Precision,
alpha = log10(Subset.Size),
text = Subset),
shape = 19, stroke = 0.25) +
facet_wrap(query_method~Type, scales = "free") +
theme_bw() +
labs(x = "Recall", y = "Precision")
## Warning: Ignoring unknown aesthetics: text
plotly::ggplotly(gg)
Figure 7.1: Relationship between precision and recall for all stratifications, excluding different repeat units.
GATK4 performance impacted by changes in GC more than DeepVariant and DeepVariant with haplotype information. Trends in performance relative GC are potentially confounded with low complexity regions.
ext_trim_df %>%
filter(Subtype == "*") %>%
filter(Filter == "PASS") %>%
filter(str_detect(Subset,"gc")) %>%
mutate(Subset = str_remove(Subset, "gc_l100_gc")) %>%
mutate(Subset = str_remove(Subset, "_slop50")) %>%
mutate(Subset = str_replace(Subset, "or", " or ")) %>%
mutate(Subset = str_replace(Subset, "to", " to ")) %>%
mutate(Subset = str_replace(Subset, "lt", "<")) %>%
mutate(Subset = str_replace(Subset, "gt", ">")) %>%
mutate(Subset = fct_relevel(Subset,
c(">85","<25 or >65", "<30 or >55"),
after = Inf)) %>%
dplyr::select(Type, Subset, query_method, contains("METRIC"), Subset.Size) %>%
gather("Metric","Value", -Type, -Subset,
-query_method, -Subset.Size) %>%
mutate(Metric = str_remove(Metric, "METRIC.")) %>%
mutate(Metric = if_else(Metric != "Frac_NA", paste("1 -", Metric), Metric),
Value = if_else(Metric != "Frac_NA", 1 - Value, Value)) %>%
filter(Metric != "Frac_NA") %>%
ggplot() + geom_bar(aes(x = Subset, y = Value, fill = query_method),
color = "grey40", width = 0.5,
position = "dodge", stat = "identity") +
facet_grid(Type~Metric, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = -90),
legend.position = "bottom") +
labs(x = "% GC for 100 bp windows", fill = "Callset")
Figure 7.2: Benchmarking metrics by GC content for 100 bp windows with 50 bp slop.
ext_trim_df %>%
filter(Subtype == "*") %>%
filter(Filter == "PASS") %>%
filter(str_detect(Subset,"lowcmp_SimpleRepeat"),
!str_detect(Subset,"unit")) %>%
filter(Subset.IS_CONF.Size > 0) %>%
mutate(Subset = str_remove(Subset, "lowcmp_SimpleRepeat_")) %>%
mutate(Subset = str_remove(Subset, "_slop5")) %>%
mutate(Subset = str_replace(Subset, "_", " ")) %>%
mutate(Subset = str_replace(Subset, "to", " to ")) %>%
mutate(Subset = str_replace(Subset, "gt", ">")) %>%
dplyr::select(Type, Subset, query_method, contains("METRIC"), Subset.Size) %>%
gather("Metric","Value", -Type, -Subset,
-query_method, -Subset.Size) %>%
mutate(Metric = str_remove(Metric, "METRIC.")) %>%
mutate(Metric = if_else(Metric != "Frac_NA", paste("1 -", Metric), Metric),
Value = if_else(Metric != "Frac_NA", 1 - Value, Value)) %>%
filter(Metric != "Frac_NA") %>%
ggplot() + geom_bar(aes(x = Subset, y = Value, fill = query_method),
color = "grey40", width = 0.5,
position = "dodge", stat = "identity") +
facet_grid(Type~Metric, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = -90),
legend.position = "bottom") +
labs(x = "Simple Repeat Type", fill = "Callset")
Figure 7.3: Benchmarking metrics for different simple repeat stratifications.
ext_trim_df %>%
filter(Subtype == "*") %>%
filter(Filter == "PASS") %>%
filter(str_detect(Subset,"Human_Full_Genome"),
!str_detect(Subset,"unit")) %>%
filter(Subset.IS_CONF.Size > 0) %>%
mutate(Subset = str_remove(Subset, "lowcmp_Human_Full_Genome_TRDB_hg19_150331_")) %>%
mutate(Subset = str_remove(Subset, "_gt95identity_merged")) %>%
mutate(Subset = str_replace(Subset, "_", " ")) %>%
mutate(Subset = str_replace(Subset, "to", " to ")) %>%
mutate(Subset = str_replace(Subset, "lt", " <")) %>%
mutate(Subset = str_replace(Subset, "gt", " >")) %>%
dplyr::select(Type, Subset, query_method, contains("METRIC"), Subset.Size) %>%
gather("Metric","Value", -Type, -Subset,
-query_method, -Subset.Size) %>%
mutate(Metric = str_remove(Metric, "METRIC.")) %>%
mutate(Metric = if_else(Metric != "Frac_NA", paste("1 -", Metric), Metric),
Value = if_else(Metric != "Frac_NA", 1 - Value, Value)) %>%
filter(Metric != "Frac_NA") %>%
ggplot() + geom_bar(aes(x = Subset, y = Value, fill = query_method),
color = "grey40", width = 0.5,
position = "dodge", stat = "identity") +
facet_grid(Type~Metric, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = -90)) +
labs(x = "Tandem Repeat Type", fill = "Callset")
Figure 7.4: Benchmarking metrics for TRDB.
ext_trim_df %>%
filter(Subtype == "*") %>%
filter(Filter == "PASS") %>%
filter(str_detect(Subset,"AllRepeats"),
!str_detect(Subset,"unit")) %>%
filter(Subset.IS_CONF.Size > 0) %>%
mutate(Subset = str_remove(Subset, "lowcmp_AllRepeats_")) %>%
mutate(Subset = str_remove(Subset, "_gt95identity_merged")) %>%
mutate(Subset = str_replace(Subset, "_", " ")) %>%
mutate(Subset = str_replace(Subset, "to", " to ")) %>%
mutate(Subset = str_replace(Subset, "lt", "<")) %>%
mutate(Subset = str_replace(Subset, "gt", ">")) %>%
dplyr::select(Type, Subset, query_method, contains("METRIC"), Subset.Size) %>%
gather("Metric","Value", -Type, -Subset,
-query_method, -Subset.Size) %>%
mutate(Metric = str_remove(Metric, "METRIC.")) %>%
mutate(Metric = if_else(Metric != "Frac_NA", paste("1 -", Metric), Metric),
Value = if_else(Metric != "Frac_NA", 1 - Value, Value)) %>%
filter(Metric != "Frac_NA") %>%
ggplot() + geom_bar(aes(x = Subset, y = Value, fill = query_method),
color = "grey40", width = 0.5,
position = "dodge", stat = "identity") +
facet_grid(Type~Metric, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = -90)) +
labs(x = "Repeat Type", fill = "Callset")
Figure 7.5: Benchmarking metrics for all repeat stratifications.
ext_trim_df %>%
filter(Subtype == "*") %>%
filter(Filter == "PASS") %>%
filter(str_detect(Subset,"segdup")) %>%
dplyr::select(Type, Subset, query_method, contains("METRIC"), Subset.Size) %>%
gather("Metric","Value", -Type, -Subset,
-query_method, -Subset.Size) %>%
mutate(Metric = str_remove(Metric, "METRIC.")) %>%
mutate(Metric = if_else(Metric != "Frac_NA", paste("1 -", Metric), Metric),
Value = if_else(Metric != "Frac_NA", 1 - Value, Value)) %>%
filter(Metric != "Frac_NA") %>%
ggplot() + geom_bar(aes(x = Subset, y = Value, fill = query_method),
color = "grey40", width = 0.5,
position = "dodge", stat = "identity") +
facet_grid(Type~Metric, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = -90)) +
labs(x = "Segmental Duplication Category", fill = "Callset")
Figure 7.6: Performance metrics for segmental duplication stratifications.
ext_trim_df %>%
filter(Subtype == "*") %>%
filter(Filter == "PASS") %>%
filter(Subset %in% c("map_all", "map_siren", "notinmap_all")) %>%
filter(Subset.IS_CONF.Size > 0) %>%
mutate(Subset = str_remove(Subset, "map_l")) %>%
mutate(Subset = str_replace_all(Subset, "_", " ")) %>%
dplyr::select(Type, Subset, query_method, contains("METRIC"), Subset.Size) %>%
gather("Metric","Value", -Type, -Subset,
-query_method, -Subset.Size) %>%
mutate(Metric = str_remove(Metric, "METRIC.")) %>%
mutate(Metric = if_else(Metric != "Frac_NA", paste("1 -", Metric), Metric),
Value = if_else(Metric != "Frac_NA", 1 - Value, Value)) %>%
filter(Metric != "Frac_NA") %>%
ggplot() + geom_bar(aes(x = Subset, y = Value, fill = query_method),
color = "grey40", width = 0.5,
position = "dodge", stat = "identity") +
facet_grid(Type~Metric, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = -90)) +
labs(x = "Mapping Category", fill = "Callset")
ext_trim_df %>%
filter(Subtype == "*") %>%
filter(Filter == "PASS") %>%
filter(str_detect(Subset,"map"),
!str_detect(Subset,"unit")) %>%
filter(Subset.IS_CONF.Size > 0) %>%
mutate(Subset = str_remove(Subset, "map_l")) %>%
mutate(Subset = str_replace_all(Subset, "_", " ")) %>%
dplyr::select(Type, Subset, query_method, contains("METRIC"), Subset.Size) %>%
gather("Metric","Value", -Type, -Subset,
-query_method, -Subset.Size) %>%
mutate(Metric = str_remove(Metric, "METRIC.")) %>%
mutate(Metric = if_else(Metric != "Frac_NA", paste("1 -", Metric), Metric),
Value = if_else(Metric != "Frac_NA", 1 - Value, Value)) %>%
filter(Metric != "Frac_NA") %>%
ggplot() + geom_bar(aes(x = Subset, y = Value, fill = query_method),
color = "grey40", width = 0.5,
position = "dodge", stat = "identity") +
facet_grid(Type~Metric, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = -90)) +
labs(x = "Mapping Category", fill = "Callset")
Figure 7.7: Performance metrics for mappability stratifications.
ext_trim_df %>%
filter(Subtype == "*") %>%
filter(Filter == "PASS") %>%
filter(Subset %in% c("notinfunc_cds","func_cds",
"notinsegdupall", "segdupall",
"notinalldifficultregions", "alldifficultregions")) %>%
mutate(Subset = fct_relevel(Subset, "func_cds", after = 2)) %>%
mutate(Subset = fct_relevel(Subset, "notinsegdupall", after = Inf)) %>%
dplyr::select(Type, Subset, query_method, contains("METRIC"), Subset.Size) %>%
gather("Metric","Value", -Type, -Subset,
-query_method, -Subset.Size) %>%
mutate(Metric = str_remove(Metric, "METRIC.")) %>%
mutate(Metric = if_else(Metric != "Frac_NA", paste("1 -", Metric), Metric),
Value = if_else(Metric != "Frac_NA", 1 - Value, Value)) %>%
filter(Metric != "Frac_NA") %>%
ggplot() + geom_bar(aes(x = Subset, y = Value, fill = query_method),
color = "grey40", width = 0.5,
position = "dodge", stat = "identity") +
facet_grid(Type~Metric, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = -90)) +
labs(x = "Stratification", fill = "Callset")
Figure 7.8: Benchmarking metrics for in/not-in stratification comparisons.
Statifications with high precision and recall
ext_trim_df %>%
# filter(Subtype == "*") %>%
filter(Filter == "PASS") %>%
filter(!str_detect(Subset, "unit")) %>%
filter(METRIC.Recall >= 0.999,
METRIC.Precision > 0.98) %>%
dplyr::select(query_method, Type, Subtype, Subset,
contains("METRIC"),
Subset.Size, Subset.IS_CONF.Size)
Matching md5 check sums between vcf files run on benchmarking app and vcf files in “final_callsets”. The MD5 check sums match therefore we do not need to run the benchmarking app on the files in “final_callsets”.
vcf_md5 <- c("b6ce8fbd50b983ea4a3bad5e4d4fc18b jebler/phased/pacbio_minimap2_15kb_69500_b37_wgs.all.callset_phased.vcf.gz",
"120a15cc95138cd5f1ed107e1e1cd85d jebler/phased/pacbio_minimap2_15kb_69500_b37_wgs.all.callset_phased.vcf.gz.tbi",
"bbb714c95ca7e6e8369ccfb22e4ae586 jebler/phased/pacbio_minimap2_15kb_69500_b37_wgs.all.retyped_phased.vcf.gz",
"000c980bbc4fb7040fe20947d1762ee3 jebler/phased/pacbio_minimap2_15kb_69500_b37_wgs.all.retyped_phased.vcf.gz.tbi",
"5a049ef09dd22d07b8378668276b4bad jebler/retyped/pacbio_minimap2_15kb_69500_b37_wgs.all.retyped.vcf.gz",
"f90a2d11cb8135dc951c2c9d5925f816 jebler/retyped/pacbio_minimap2_15kb_69500_b37_wgs.all.retyped.vcf.gz.tbi",
"f756d13a231e660909d16bc69b355059 jebler/retyped/pacbio_pbmm2_15kb_GATK4_hs37d5.all.retyped.vcf.gz",
"f764e0c44772f0db7b241e6c48bbfaba jebler/retyped/pacbio_pbmm2_15kb_GATK4_hs37d5.all.retyped.vcf.gz.tbi",
"5a049ef09dd22d07b8378668276b4bad for_jzook/final_callsets/DeepVariant-CCS-WhatsHap.vcf.gz",
"f90a2d11cb8135dc951c2c9d5925f816 for_jzook/final_callsets/DeepVariant-CCS-WhatsHap.vcf.gz.tbi",
"166703dfb3c15b201c5886dde0ace70b for_jzook/final_callsets/DeepVariant-CCS-hapsort.vcf.gz",
"f868b7e2ecd42d72e6223a9fc9819246 for_jzook/final_callsets/DeepVariant-CCS-hapsort.vcf.gz.tbi",
"a21eb94b7d8ee530244d6d4a28194a59 for_jzook/final_callsets/DeepVariant-CCS.vcf.gz",
"db771e12c949639c0015ad5eea7edf7f for_jzook/final_callsets/DeepVariant-CCS.vcf.gz.tbi",
"f756d13a231e660909d16bc69b355059 for_jzook/final_callsets/GATKHC-WhatsHap.vcf.gz",
"f764e0c44772f0db7b241e6c48bbfaba for_jzook/final_callsets/GATKHC-WhatsHap.vcf.gz.tbi",
"0075ff0ca7df131c0b02e426847c28cf for_jzook/final_callsets/GATKHC.vcf.gz",
"8c1432ec042fd7dc8c14329cd1a37100 for_jzook/final_callsets/GATKHC.vcf.gz.tbi",
"166703dfb3c15b201c5886dde0ace70b kolesnikov/pacbio-15kb-hapsort-wgs.vcf.gz",
"16577af07101b1105dab9e42d95c71c8 kolesnikov/pacbio-15kb-hapsort-wgs.vcf.gz.csi",
"a21eb94b7d8ee530244d6d4a28194a59 kolesnikov/pacbio_minimap2_15kb_69500_b37_wgs.vcf.gz",
"0075ff0ca7df131c0b02e426847c28cf wrowell/pacbio_pbmm2_15kb_GATK4_hs37d5.vcf.gz",
"8c1432ec042fd7dc8c14329cd1a37100 wrowell/pacbio_pbmm2_15kb_GATK4_hs37d5.vcf.gz.tbi",
"17fc4f33579bb916a130ebe865b8f128 wrowell/pacbio_pbmm2_15kb_GATK4_hs37d5_whatshap_phased.vcf.gz",
"89c63e379a19cd7616f605bdfb60ca1e wrowell/pacbio_pbmm2_15kb_GATK4_hs37d5_whatshap_phased.vcf.gz.tbi")
data_frame(vcf_md5) %>% separate(vcf_md5, c("md5","vcffile"),
sep = " ",
remove = TRUE) %>%
arrange(md5) %>%
filter(str_detect(vcffile, "vcf.gz$")) %>%
filter(!str_detect(vcffile, "phased"))
sessioninfo::platform_info()
## setting value
## version R version 3.5.2 (2018-12-20)
## os macOS High Sierra 10.13.6
## system x86_64, darwin17.7.0
## ui unknown
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/New_York
## date 2019-02-21
sessioninfo::package_info() %>%
knitr::kable(booktabs = TRUE)
| package | ondiskversion | loadedversion | path | loadedpath | attached | is_base | date | source | md5ok | library | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| acepack | acepack | 1.4.1 | 1.4.1 | /usr/local/lib/R/3.5/site-library/acepack | /usr/local/lib/R/3.5/site-library/acepack | FALSE | FALSE | 2016-10-29 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| AnnotationDbi | AnnotationDbi | 1.42.1 | 1.42.1 | /usr/local/lib/R/3.5/site-library/AnnotationDbi | /usr/local/lib/R/3.5/site-library/AnnotationDbi | FALSE | FALSE | 2018-05-09 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| askpass | askpass | 1.1 | 1.1 | /usr/local/lib/R/3.5/site-library/askpass | /usr/local/lib/R/3.5/site-library/askpass | FALSE | FALSE | 2019-01-13 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| assertthat | assertthat | 0.2.0 | 0.2.0 | /usr/local/lib/R/3.5/site-library/assertthat | /usr/local/lib/R/3.5/site-library/assertthat | FALSE | FALSE | 2017-04-11 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| backports | backports | 1.1.3 | 1.1.3 | /usr/local/lib/R/3.5/site-library/backports | /usr/local/lib/R/3.5/site-library/backports | FALSE | FALSE | 2018-12-14 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| base64enc | base64enc | 0.1.3 | 0.1-3 | /usr/local/lib/R/3.5/site-library/base64enc | /usr/local/lib/R/3.5/site-library/base64enc | FALSE | FALSE | 2015-07-28 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| bindr | bindr | 0.1.1 | 0.1.1 | /usr/local/lib/R/3.5/site-library/bindr | /usr/local/lib/R/3.5/site-library/bindr | FALSE | FALSE | 2018-03-13 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| bindrcpp | bindrcpp | 0.2.2 | 0.2.2 | /usr/local/lib/R/3.5/site-library/bindrcpp | /usr/local/lib/R/3.5/site-library/bindrcpp | TRUE | FALSE | 2018-03-29 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| Biobase | Biobase | 2.40.0 | 2.40.0 | /usr/local/lib/R/3.5/site-library/Biobase | /usr/local/lib/R/3.5/site-library/Biobase | TRUE | FALSE | 2018-05-08 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| BiocGenerics | BiocGenerics | 0.26.0 | 0.26.0 | /usr/local/lib/R/3.5/site-library/BiocGenerics | /usr/local/lib/R/3.5/site-library/BiocGenerics | TRUE | FALSE | 2018-05-08 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| BiocParallel | BiocParallel | 1.14.2 | 1.14.2 | /usr/local/lib/R/3.5/site-library/BiocParallel | /usr/local/lib/R/3.5/site-library/BiocParallel | TRUE | FALSE | 2018-07-08 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| biomaRt | biomaRt | 2.36.1 | 2.36.1 | /usr/local/lib/R/3.5/site-library/biomaRt | /usr/local/lib/R/3.5/site-library/biomaRt | FALSE | FALSE | 2018-06-28 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| Biostrings | Biostrings | 2.48.0 | 2.48.0 | /usr/local/lib/R/3.5/site-library/Biostrings | /usr/local/lib/R/3.5/site-library/Biostrings | TRUE | FALSE | 2018-05-08 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| bit | bit | 1.1.14 | 1.1-14 | /usr/local/lib/R/3.5/site-library/bit | /usr/local/lib/R/3.5/site-library/bit | FALSE | FALSE | 2018-05-29 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| bit64 | bit64 | 0.9.7 | 0.9-7 | /usr/local/lib/R/3.5/site-library/bit64 | /usr/local/lib/R/3.5/site-library/bit64 | FALSE | FALSE | 2017-05-08 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| bitops | bitops | 1.0.6 | 1.0-6 | /usr/local/lib/R/3.5/site-library/bitops | /usr/local/lib/R/3.5/site-library/bitops | FALSE | FALSE | 2013-08-17 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| blob | blob | 1.1.1 | 1.1.1 | /usr/local/lib/R/3.5/site-library/blob | /usr/local/lib/R/3.5/site-library/blob | FALSE | FALSE | 2018-03-25 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| bookdown | bookdown | 0.9 | 0.9 | /usr/local/lib/R/3.5/site-library/bookdown | /usr/local/lib/R/3.5/site-library/bookdown | FALSE | FALSE | 2018-12-21 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| broom | broom | 0.5.1 | 0.5.1 | /usr/local/lib/R/3.5/site-library/broom | /usr/local/lib/R/3.5/site-library/broom | FALSE | FALSE | 2018-12-05 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| BSgenome | BSgenome | 1.48.0 | 1.48.0 | /usr/local/lib/R/3.5/site-library/BSgenome | /usr/local/lib/R/3.5/site-library/BSgenome | TRUE | FALSE | 2018-05-08 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| BSgenome.Hsapiens.1000genomes.hs37d5 | BSgenome.Hsapiens.1000genomes.hs37d5 | 0.99.1 | 0.99.1 | /usr/local/lib/R/3.5/site-library/BSgenome.Hsapiens.1000genomes.hs37d5 | /usr/local/lib/R/3.5/site-library/BSgenome.Hsapiens.1000genomes.hs37d5 | TRUE | FALSE | 2018-08-06 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| Cairo | Cairo | 1.5.9 | 1.5-9 | /usr/local/lib/R/3.5/site-library/Cairo | /usr/local/lib/R/3.5/site-library/Cairo | FALSE | FALSE | 2015-09-26 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| cellranger | cellranger | 1.1.0 | 1.1.0 | /usr/local/lib/R/3.5/site-library/cellranger | /usr/local/lib/R/3.5/site-library/cellranger | FALSE | FALSE | 2016-07-27 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| checkmate | checkmate | 1.9.1 | 1.9.1 | /usr/local/lib/R/3.5/site-library/checkmate | /usr/local/lib/R/3.5/site-library/checkmate | FALSE | FALSE | 2019-01-15 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| cli | cli | 1.0.1 | 1.0.1 | /usr/local/lib/R/3.5/site-library/cli | /usr/local/lib/R/3.5/site-library/cli | FALSE | FALSE | 2018-09-25 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| cluster | cluster | 2.0.7.1 | 2.0.7-1 | /usr/local/lib/R/3.5/site-library/cluster | /usr/local/lib/R/3.5/site-library/cluster | FALSE | FALSE | 2018-04-09 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| colorspace | colorspace | 1.4.0 | 1.4-0 | /usr/local/lib/R/3.5/site-library/colorspace | /usr/local/lib/R/3.5/site-library/colorspace | FALSE | FALSE | 2019-01-13 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| crayon | crayon | 1.3.4 | 1.3.4 | /usr/local/lib/R/3.5/site-library/crayon | /usr/local/lib/R/3.5/site-library/crayon | FALSE | FALSE | 2017-09-16 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| crosstalk | crosstalk | 1.0.0 | 1.0.0 | /usr/local/lib/R/3.5/site-library/crosstalk | /usr/local/lib/R/3.5/site-library/crosstalk | FALSE | FALSE | 2016-12-21 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| curl | curl | 3.3 | 3.3 | /usr/local/lib/R/3.5/site-library/curl | /usr/local/lib/R/3.5/site-library/curl | FALSE | FALSE | 2019-01-10 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| data.table | data.table | 1.12.0 | 1.12.0 | /usr/local/lib/R/3.5/site-library/data.table | /usr/local/lib/R/3.5/site-library/data.table | FALSE | FALSE | 2019-01-13 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| DBI | DBI | 1.0.0 | 1.0.0 | /usr/local/lib/R/3.5/site-library/DBI | /usr/local/lib/R/3.5/site-library/DBI | FALSE | FALSE | 2018-05-02 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| DelayedArray | DelayedArray | 0.6.6 | 0.6.6 | /usr/local/lib/R/3.5/site-library/DelayedArray | /usr/local/lib/R/3.5/site-library/DelayedArray | TRUE | FALSE | 2018-09-11 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| digest | digest | 0.6.18 | 0.6.18 | /usr/local/lib/R/3.5/site-library/digest | /usr/local/lib/R/3.5/site-library/digest | FALSE | FALSE | 2018-10-10 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| dplyr | dplyr | 0.7.8 | 0.7.8 | /usr/local/lib/R/3.5/site-library/dplyr | /usr/local/lib/R/3.5/site-library/dplyr | TRUE | FALSE | 2018-11-10 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| evaluate | evaluate | 0.12 | 0.12 | /usr/local/lib/R/3.5/site-library/evaluate | /usr/local/lib/R/3.5/site-library/evaluate | FALSE | FALSE | 2018-10-09 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| forcats | forcats | 0.3.0 | 0.3.0 | /usr/local/lib/R/3.5/site-library/forcats | /usr/local/lib/R/3.5/site-library/forcats | TRUE | FALSE | 2018-02-19 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| foreign | foreign | 0.8.71 | 0.8-71 | /usr/local/lib/R/3.5/site-library/foreign | /usr/local/lib/R/3.5/site-library/foreign | FALSE | FALSE | 2018-07-20 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| Formula | Formula | 1.2.3 | 1.2-3 | /usr/local/lib/R/3.5/site-library/Formula | /usr/local/lib/R/3.5/site-library/Formula | FALSE | FALSE | 2018-05-03 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| generics | generics | 0.0.2 | 0.0.2 | /usr/local/lib/R/3.5/site-library/generics | /usr/local/lib/R/3.5/site-library/generics | FALSE | FALSE | 2018-11-29 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| GenomeInfoDb | GenomeInfoDb | 1.16.0 | 1.16.0 | /usr/local/lib/R/3.5/site-library/GenomeInfoDb | /usr/local/lib/R/3.5/site-library/GenomeInfoDb | TRUE | FALSE | 2018-05-08 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| GenomeInfoDbData | GenomeInfoDbData | 1.1.0 | 1.1.0 | /usr/local/lib/R/3.5/site-library/GenomeInfoDbData | /usr/local/lib/R/3.5/site-library/GenomeInfoDbData | FALSE | FALSE | 2018-05-08 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| GenomicAlignments | GenomicAlignments | 1.16.0 | 1.16.0 | /usr/local/lib/R/3.5/site-library/GenomicAlignments | /usr/local/lib/R/3.5/site-library/GenomicAlignments | FALSE | FALSE | 2018-05-08 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| GenomicFeatures | GenomicFeatures | 1.32.3 | 1.32.3 | /usr/local/lib/R/3.5/site-library/GenomicFeatures | /usr/local/lib/R/3.5/site-library/GenomicFeatures | FALSE | FALSE | 2018-10-05 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| GenomicRanges | GenomicRanges | 1.32.7 | 1.32.7 | /usr/local/lib/R/3.5/site-library/GenomicRanges | /usr/local/lib/R/3.5/site-library/GenomicRanges | TRUE | FALSE | 2018-09-20 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| ggplot2 | ggplot2 | 3.1.0 | 3.1.0 | /usr/local/lib/R/3.5/site-library/ggplot2 | /usr/local/lib/R/3.5/site-library/ggplot2 | TRUE | FALSE | 2018-10-25 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| glue | glue | 1.3.0 | 1.3.0 | /usr/local/lib/R/3.5/site-library/glue | /usr/local/lib/R/3.5/site-library/glue | FALSE | FALSE | 2018-07-17 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| googlesheets | googlesheets | 0.3.0 | 0.3.0 | /usr/local/lib/R/3.5/site-library/googlesheets | /usr/local/lib/R/3.5/site-library/googlesheets | TRUE | FALSE | 2018-06-29 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| gridExtra | gridExtra | 2.3 | 2.3 | /usr/local/lib/R/3.5/site-library/gridExtra | /usr/local/lib/R/3.5/site-library/gridExtra | FALSE | FALSE | 2017-09-09 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| gtable | gtable | 0.2.0 | 0.2.0 | /usr/local/lib/R/3.5/site-library/gtable | /usr/local/lib/R/3.5/site-library/gtable | FALSE | FALSE | 2016-02-26 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| happyR | happyR | 1.0.0 | 1.0.0 | /usr/local/lib/R/3.5/site-library/happyR | /usr/local/lib/R/3.5/site-library/happyR | TRUE | FALSE | 2018-09-07 | Github (Illumina/happyR@4ef8832) | NA | /usr/local/lib/R/3.5/site-library |
| haven | haven | 2.0.0 | 2.0.0 | /usr/local/lib/R/3.5/site-library/haven | /usr/local/lib/R/3.5/site-library/haven | FALSE | FALSE | 2018-11-22 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| highr | highr | 0.7 | 0.7 | /usr/local/lib/R/3.5/site-library/highr | /usr/local/lib/R/3.5/site-library/highr | FALSE | FALSE | 2018-06-09 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| Hmisc | Hmisc | 4.2.0 | 4.2-0 | /usr/local/lib/R/3.5/site-library/Hmisc | /usr/local/lib/R/3.5/site-library/Hmisc | FALSE | FALSE | 2019-01-26 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| hms | hms | 0.4.2 | 0.4.2 | /usr/local/lib/R/3.5/site-library/hms | /usr/local/lib/R/3.5/site-library/hms | FALSE | FALSE | 2018-03-10 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| htmlTable | htmlTable | 1.13.1 | 1.13.1 | /usr/local/lib/R/3.5/site-library/htmlTable | /usr/local/lib/R/3.5/site-library/htmlTable | FALSE | FALSE | 2019-01-07 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| htmltools | htmltools | 0.3.6 | 0.3.6 | /usr/local/lib/R/3.5/site-library/htmltools | /usr/local/lib/R/3.5/site-library/htmltools | FALSE | FALSE | 2017-04-28 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| htmlwidgets | htmlwidgets | 1.3 | 1.3 | /usr/local/lib/R/3.5/site-library/htmlwidgets | /usr/local/lib/R/3.5/site-library/htmlwidgets | FALSE | FALSE | 2018-09-30 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| httpuv | httpuv | 1.4.5.1 | 1.4.5.1 | /usr/local/lib/R/3.5/site-library/httpuv | /usr/local/lib/R/3.5/site-library/httpuv | FALSE | FALSE | 2018-12-18 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| httr | httr | 1.4.0 | 1.4.0 | /usr/local/lib/R/3.5/site-library/httr | /usr/local/lib/R/3.5/site-library/httr | FALSE | FALSE | 2018-12-11 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| IRanges | IRanges | 2.14.12 | 2.14.12 | /usr/local/lib/R/3.5/site-library/IRanges | /usr/local/lib/R/3.5/site-library/IRanges | TRUE | FALSE | 2018-09-20 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| jsonlite | jsonlite | 1.6 | 1.6 | /usr/local/lib/R/3.5/site-library/jsonlite | /usr/local/lib/R/3.5/site-library/jsonlite | FALSE | FALSE | 2018-12-07 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| knitr | knitr | 1.21 | 1.21 | /usr/local/lib/R/3.5/site-library/knitr | /usr/local/lib/R/3.5/site-library/knitr | FALSE | FALSE | 2018-12-10 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| labeling | labeling | 0.3 | 0.3 | /usr/local/lib/R/3.5/site-library/labeling | /usr/local/lib/R/3.5/site-library/labeling | FALSE | FALSE | 2014-08-23 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| later | later | 0.7.5 | 0.7.5 | /usr/local/lib/R/3.5/site-library/later | /usr/local/lib/R/3.5/site-library/later | FALSE | FALSE | 2018-09-18 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| lattice | lattice | 0.20.38 | 0.20-38 | /usr/local/lib/R/3.5/site-library/lattice | /usr/local/lib/R/3.5/site-library/lattice | FALSE | FALSE | 2018-11-04 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| latticeExtra | latticeExtra | 0.6.28 | 0.6-28 | /usr/local/lib/R/3.5/site-library/latticeExtra | /usr/local/lib/R/3.5/site-library/latticeExtra | FALSE | FALSE | 2016-02-09 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| lazyeval | lazyeval | 0.2.1 | 0.2.1 | /usr/local/lib/R/3.5/site-library/lazyeval | /usr/local/lib/R/3.5/site-library/lazyeval | FALSE | FALSE | 2017-10-29 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| lubridate | lubridate | 1.7.4 | 1.7.4 | /usr/local/lib/R/3.5/site-library/lubridate | /usr/local/lib/R/3.5/site-library/lubridate | FALSE | FALSE | 2018-04-11 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| magrittr | magrittr | 1.5 | 1.5 | /usr/local/lib/R/3.5/site-library/magrittr | /usr/local/lib/R/3.5/site-library/magrittr | FALSE | FALSE | 2014-11-22 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| Matrix | Matrix | 1.2.15 | 1.2-15 | /usr/local/lib/R/3.5/site-library/Matrix | /usr/local/lib/R/3.5/site-library/Matrix | FALSE | FALSE | 2018-11-01 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| matrixStats | matrixStats | 0.54.0 | 0.54.0 | /usr/local/lib/R/3.5/site-library/matrixStats | /usr/local/lib/R/3.5/site-library/matrixStats | TRUE | FALSE | 2018-07-23 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| memoise | memoise | 1.1.0 | 1.1.0 | /usr/local/lib/R/3.5/site-library/memoise | /usr/local/lib/R/3.5/site-library/memoise | FALSE | FALSE | 2017-04-21 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| mime | mime | 0.6 | 0.6 | /usr/local/lib/R/3.5/site-library/mime | /usr/local/lib/R/3.5/site-library/mime | FALSE | FALSE | 2018-10-05 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| modelr | modelr | 0.1.2 | 0.1.2 | /usr/local/lib/R/3.5/site-library/modelr | /usr/local/lib/R/3.5/site-library/modelr | FALSE | FALSE | 2018-05-11 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| munsell | munsell | 0.5.0 | 0.5.0 | /usr/local/lib/R/3.5/site-library/munsell | /usr/local/lib/R/3.5/site-library/munsell | FALSE | FALSE | 2018-06-12 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| nlme | nlme | 3.1.137 | 3.1-137 | /usr/local/lib/R/3.5/site-library/nlme | /usr/local/lib/R/3.5/site-library/nlme | FALSE | FALSE | 2018-04-07 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| nnet | nnet | 7.3.12 | 7.3-12 | /usr/local/lib/R/3.5/site-library/nnet | /usr/local/lib/R/3.5/site-library/nnet | FALSE | FALSE | 2016-02-02 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| openssl | openssl | 1.2.1 | 1.2.1 | /usr/local/lib/R/3.5/site-library/openssl | /usr/local/lib/R/3.5/site-library/openssl | FALSE | FALSE | 2019-01-17 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| pillar | pillar | 1.3.1 | 1.3.1 | /usr/local/lib/R/3.5/site-library/pillar | /usr/local/lib/R/3.5/site-library/pillar | FALSE | FALSE | 2018-12-15 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| pkgconfig | pkgconfig | 2.0.2 | 2.0.2 | /usr/local/lib/R/3.5/site-library/pkgconfig | /usr/local/lib/R/3.5/site-library/pkgconfig | FALSE | FALSE | 2018-08-16 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| plotly | plotly | 4.8.0 | 4.8.0 | /usr/local/lib/R/3.5/site-library/plotly | /usr/local/lib/R/3.5/site-library/plotly | FALSE | FALSE | 2018-07-20 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| plyr | plyr | 1.8.4 | 1.8.4 | /usr/local/lib/R/3.5/site-library/plyr | /usr/local/lib/R/3.5/site-library/plyr | FALSE | FALSE | 2016-06-08 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| prettyunits | prettyunits | 1.0.2 | 1.0.2 | /usr/local/lib/R/3.5/site-library/prettyunits | /usr/local/lib/R/3.5/site-library/prettyunits | FALSE | FALSE | 2015-07-13 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| progress | progress | 1.2.0 | 1.2.0 | /usr/local/lib/R/3.5/site-library/progress | /usr/local/lib/R/3.5/site-library/progress | FALSE | FALSE | 2018-06-14 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| promises | promises | 1.0.1 | 1.0.1 | /usr/local/lib/R/3.5/site-library/promises | /usr/local/lib/R/3.5/site-library/promises | FALSE | FALSE | 2018-04-13 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| purrr | purrr | 0.3.0 | 0.3.0 | /usr/local/lib/R/3.5/site-library/purrr | /usr/local/lib/R/3.5/site-library/purrr | TRUE | FALSE | 2019-01-27 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| R6 | R6 | 2.3.0 | 2.3.0 | /usr/local/lib/R/3.5/site-library/R6 | /usr/local/lib/R/3.5/site-library/R6 | FALSE | FALSE | 2018-10-04 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| RColorBrewer | RColorBrewer | 1.1.2 | 1.1-2 | /usr/local/lib/R/3.5/site-library/RColorBrewer | /usr/local/lib/R/3.5/site-library/RColorBrewer | FALSE | FALSE | 2014-12-07 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| Rcpp | Rcpp | 1.0.0 | 1.0.0 | /usr/local/lib/R/3.5/site-library/Rcpp | /usr/local/lib/R/3.5/site-library/Rcpp | FALSE | FALSE | 2018-11-07 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| RCurl | RCurl | 1.95.4.11 | 1.95-4.11 | /usr/local/lib/R/3.5/site-library/RCurl | /usr/local/lib/R/3.5/site-library/RCurl | FALSE | FALSE | 2018-07-15 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| readr | readr | 1.3.1 | 1.3.1 | /usr/local/lib/R/3.5/site-library/readr | /usr/local/lib/R/3.5/site-library/readr | TRUE | FALSE | 2018-12-21 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| readxl | readxl | 1.2.0 | 1.2.0 | /usr/local/lib/R/3.5/site-library/readxl | /usr/local/lib/R/3.5/site-library/readxl | FALSE | FALSE | 2018-12-19 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| reshape2 | reshape2 | 1.4.3 | 1.4.3 | /usr/local/lib/R/3.5/site-library/reshape2 | /usr/local/lib/R/3.5/site-library/reshape2 | FALSE | FALSE | 2017-12-11 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| rlang | rlang | 0.3.1 | 0.3.1 | /usr/local/lib/R/3.5/site-library/rlang | /usr/local/lib/R/3.5/site-library/rlang | FALSE | FALSE | 2019-01-08 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| rmarkdown | rmarkdown | 1.11 | 1.11 | /usr/local/lib/R/3.5/site-library/rmarkdown | /usr/local/lib/R/3.5/site-library/rmarkdown | FALSE | FALSE | 2018-12-08 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| rpart | rpart | 4.1.13 | 4.1-13 | /usr/local/lib/R/3.5/site-library/rpart | /usr/local/lib/R/3.5/site-library/rpart | FALSE | FALSE | 2018-02-23 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| Rsamtools | Rsamtools | 1.32.3 | 1.32.3 | /usr/local/lib/R/3.5/site-library/Rsamtools | /usr/local/lib/R/3.5/site-library/Rsamtools | TRUE | FALSE | 2018-08-22 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| RSQLite | RSQLite | 2.1.1 | 2.1.1 | /usr/local/lib/R/3.5/site-library/RSQLite | /usr/local/lib/R/3.5/site-library/RSQLite | FALSE | FALSE | 2018-05-06 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| rstudioapi | rstudioapi | 0.9.0 | 0.9.0 | /usr/local/lib/R/3.5/site-library/rstudioapi | /usr/local/lib/R/3.5/site-library/rstudioapi | FALSE | FALSE | 2019-01-09 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| rtracklayer | rtracklayer | 1.40.6 | 1.40.6 | /usr/local/lib/R/3.5/site-library/rtracklayer | /usr/local/lib/R/3.5/site-library/rtracklayer | TRUE | FALSE | 2018-08-31 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| rvest | rvest | 0.3.2 | 0.3.2 | /usr/local/lib/R/3.5/site-library/rvest | /usr/local/lib/R/3.5/site-library/rvest | FALSE | FALSE | 2016-06-17 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| S4Vectors | S4Vectors | 0.18.3 | 0.18.3 | /usr/local/lib/R/3.5/site-library/S4Vectors | /usr/local/lib/R/3.5/site-library/S4Vectors | TRUE | FALSE | 2018-06-28 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| scales | scales | 1.0.0 | 1.0.0 | /usr/local/lib/R/3.5/site-library/scales | /usr/local/lib/R/3.5/site-library/scales | FALSE | FALSE | 2018-08-09 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| sessioninfo | sessioninfo | 1.1.1 | 1.1.1 | /usr/local/lib/R/3.5/site-library/sessioninfo | /usr/local/lib/R/3.5/site-library/sessioninfo | FALSE | FALSE | 2018-11-05 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| shiny | shiny | 1.2.0 | 1.2.0 | /usr/local/lib/R/3.5/site-library/shiny | /usr/local/lib/R/3.5/site-library/shiny | FALSE | FALSE | 2018-11-02 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| stringi | stringi | 1.2.4 | 1.2.4 | /usr/local/lib/R/3.5/site-library/stringi | /usr/local/lib/R/3.5/site-library/stringi | FALSE | FALSE | 2018-07-20 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| stringr | stringr | 1.3.1 | 1.3.1 | /usr/local/lib/R/3.5/site-library/stringr | /usr/local/lib/R/3.5/site-library/stringr | TRUE | FALSE | 2018-05-10 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| SummarizedExperiment | SummarizedExperiment | 1.10.1 | 1.10.1 | /usr/local/lib/R/3.5/site-library/SummarizedExperiment | /usr/local/lib/R/3.5/site-library/SummarizedExperiment | TRUE | FALSE | 2018-06-28 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| survival | survival | 2.43.3 | 2.43-3 | /usr/local/lib/R/3.5/site-library/survival | /usr/local/lib/R/3.5/site-library/survival | FALSE | FALSE | 2018-11-26 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| tibble | tibble | 2.0.1 | 2.0.1 | /usr/local/lib/R/3.5/site-library/tibble | /usr/local/lib/R/3.5/site-library/tibble | TRUE | FALSE | 2019-01-12 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| tidyr | tidyr | 0.8.2 | 0.8.2 | /usr/local/lib/R/3.5/site-library/tidyr | /usr/local/lib/R/3.5/site-library/tidyr | TRUE | FALSE | 2018-10-28 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| tidyselect | tidyselect | 0.2.5 | 0.2.5 | /usr/local/lib/R/3.5/site-library/tidyselect | /usr/local/lib/R/3.5/site-library/tidyselect | FALSE | FALSE | 2018-10-11 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| tidyverse | tidyverse | 1.2.1 | 1.2.1 | /usr/local/lib/R/3.5/site-library/tidyverse | /usr/local/lib/R/3.5/site-library/tidyverse | TRUE | FALSE | 2017-11-14 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| VariantAnnotation | VariantAnnotation | 1.26.1 | 1.26.1 | /usr/local/lib/R/3.5/site-library/VariantAnnotation | /usr/local/lib/R/3.5/site-library/VariantAnnotation | TRUE | FALSE | 2018-07-04 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| viridisLite | viridisLite | 0.3.0 | 0.3.0 | /usr/local/lib/R/3.5/site-library/viridisLite | /usr/local/lib/R/3.5/site-library/viridisLite | FALSE | FALSE | 2018-02-01 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| withr | withr | 2.1.2 | 2.1.2 | /usr/local/lib/R/3.5/site-library/withr | /usr/local/lib/R/3.5/site-library/withr | FALSE | FALSE | 2018-03-15 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| xfun | xfun | 0.4 | 0.4 | /usr/local/lib/R/3.5/site-library/xfun | /usr/local/lib/R/3.5/site-library/xfun | FALSE | FALSE | 2018-10-23 | CRAN (R 3.5.2) | NA | /usr/local/lib/R/3.5/site-library |
| XML | XML | 3.98.1.16 | 3.98-1.16 | /usr/local/lib/R/3.5/site-library/XML | /usr/local/lib/R/3.5/site-library/XML | FALSE | FALSE | 2018-08-19 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| xml2 | xml2 | 1.2.0 | 1.2.0 | /usr/local/lib/R/3.5/site-library/xml2 | /usr/local/lib/R/3.5/site-library/xml2 | FALSE | FALSE | 2018-01-24 | CRAN (R 3.5.0) | NA | /usr/local/lib/R/3.5/site-library |
| xtable | xtable | 1.8.3 | 1.8-3 | /usr/local/lib/R/3.5/site-library/xtable | /usr/local/lib/R/3.5/site-library/xtable | FALSE | FALSE | 2018-08-29 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| XVector | XVector | 0.20.0 | 0.20.0 | /usr/local/lib/R/3.5/site-library/XVector | /usr/local/lib/R/3.5/site-library/XVector | TRUE | FALSE | 2018-05-08 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |
| yaml | yaml | 2.2.0 | 2.2.0 | /usr/local/lib/R/3.5/site-library/yaml | /usr/local/lib/R/3.5/site-library/yaml | FALSE | FALSE | 2018-07-25 | CRAN (R 3.5.1) | NA | /usr/local/lib/R/3.5/site-library |
| zlibbioc | zlibbioc | 1.26.0 | 1.26.0 | /usr/local/lib/R/3.5/site-library/zlibbioc | /usr/local/lib/R/3.5/site-library/zlibbioc | FALSE | FALSE | 2018-05-08 | Bioconductor | NA | /usr/local/lib/R/3.5/site-library |