0.1 Overview

Nanopore sequencing of the GIAB HG002 was performed at three sites, JIMB, NOTT, and BIRM. The following analysis summarizes the sequencing runs generated at the sites comparing run throughput, N50, and coverage by read length. Sequence metrics were calculated from the alignment files (CRAM). Therefore the run metrics is likely lower than metrics calculated from the basecall sequence files (fastq).

Nanopore sequencing platforms

0.2 Data Preparation

Downloading run metadata from googlesheet.

nanopore_gs <- gs_title("Nanopore runs")
jimb_metadata <- nanopore_gs %>%
  gs_read(ws = "GM24385")
nott_metadata <- nanopore_gs %>% 
    gs_read(ws = "Nott")

Pulling alignment statistics from stanford oak filesystem, requires mounting stanford oak filesystem as stanford_oak directory.

The data were saved locally to make it easier to re-compile Rmarkdown.

if (TRUE) {
    stat_files <- list.files("nanopore/stats/", 
                         pattern = ".*stats.tsv.gz", full.names = TRUE,
                         recursive = TRUE)

    # BAM statistics combined bam stat files
    col_names <- c("read_id", 
                   "aln_lengthsum", 
                   "aln_lengthmax",
                   "aln_count",
                   "ref_lengthsum",
                   "ref_lengthmax",
                   "ref_lengthcount",
                   "bases")

    aln_stat_files <- grep("/aln/", stat_files, value = TRUE)
    aln_names <- str_extract(aln_stat_files, "(?<=stats//).*(?=/aln)")
    aln_stat_df <- aln_stat_files %>%  
        set_names(aln_names) %>% 
        map_dfr(read_tsv, 
                #col_names = col_names, 
                #col_types = "ciiiiiid", 
                .id = "Run")
    saveRDS(aln_stat_df, "aln_stat_df.RDS")
} else {
    aln_stat_df <- readRDS("aln_stat_df.RDS")
}

0.3 Calculating Run Statistics

Run statistics calculated using alignment lengths from bam statistic files.

Run Statistics
- N50 in Kb, length of read where shorter reads represent half the run half the run throughput. - Throughput in Mb total length of aligned reads. - Coverage by read length thresholds: all, 50kb, 100kb, 250kb, 5000kb.

## Coverage by length bins
cov_breaks <- c(0,50000,100000, 250000, 500000)
cov_df <- aln_stat_df %>% 
    mutate(length_bins = cut(aln_lengthmax, 
                             breaks = c(cov_breaks, max(.$aln_lengthmax)),
                             labels = cov_breaks)) %>%
    group_by(Run, length_bins) %>% 
    summarise(coverage = sum(as.numeric(aln_lengthmax))/3.1e9,
              n_reads = n()) %>% 
    group_by(Run) %>% 
    arrange(desc(length_bins)) %>% 
    mutate(cum_coverage = cumsum(coverage)) %>% 
    mutate(length_bins = as.numeric(as.character(length_bins)))

nested_cov_df <- cov_df %>% group_by(Run) %>% nest()

## N50 and throughput
calc_n50 <- function(seq_lengths){
    sorted_lengths <- sort(seq_lengths)
    cum_lengths <- cumsum(sorted_lengths)
    min(sorted_lengths[cum_lengths >= max(cum_lengths)*0.5])
}

aln_summary_stats_df <- aln_stat_df %>% 
    mutate(aln_lengthmax = as.numeric(aln_lengthmax)) %>% 
    group_by(Run) %>% 
    summarise(N50 = calc_n50(aln_lengthmax),
           throughput = sum(aln_lengthmax))

# Annotating with run metadata
jimb_run_metrics <- jimb_metadata %>% 
    filter(!is.na(Run)) %>% 
    select(-N50) %>% 
    left_join(aln_summary_stats_df) %>% 
    mutate(Date = mdy(Date)) %>% 
    filter(!is.na(N50)) %>% 
    mutate(lab = "JIMB",
           platform = "minion")
## Joining, by = "Run"
nott_run_metrics <- nott_metadata %>%
    mutate(Date = mdy(Date)) %>%
    left_join(aln_summary_stats_df) %>% 
    mutate(lab = "NOTT",
           platform = case_when(str_detect(Run, "_UB_") ~ "gridion",
                                str_detect(Run, "Prom") ~ "promethion",
                                TRUE ~ "minion"),
           lab = if_else(platform == "gridion", "BIRM", lab))
## Joining, by = "Run"
run_metrics_df <- 
    bind_rows(nott_run_metrics, jimb_run_metrics) %>%
    rename(Flowcell = `Flowcell ID`) %>% 
    select(lab, platform, Flowcell, Date, Run, N50, throughput, optimized_protocol) %>% 
    mutate(N50 = N50/1e3, throughput = throughput/1e6) %>% 
    left_join(nested_cov_df)
## Joining, by = "Run"

0.4 Run summaries

Read length distributions were similar across runs. The x-axis scales vary in the following plots making it hard to compare runs. In general runs with distributions shifted to the right have shorter reads.

aln_stat_df %>% group_by(Run) %>% 
    nest() %>%
    mutate(panel = map_plot(data, ~ ggplot(.) + 
                                       geom_histogram(aes(x = aln_lengthmax)) +
                                       theme_bw())) %>%
  trelliscope(name = "Read Length Distribution", nrow = 2, ncol = 2, 
              self_contained = TRUE)

0.4.1 JIMB Run Summary

Variation in N50 and throughput during JIMB run optimization. Most recent runs have high throughput and N50.

Orange points represent runs performed using the optimized protocol.

scatter_dat <- run_metrics_df %>% filter(lab == "JIMB")
ggplot(scatter_dat) + 
    geom_point(aes(x = Date, y = N50)) + 
    geom_point(data = filter(scatter_dat, optimized_protocol == 1), 
               aes(x = Date, y = N50), color = "darkorange") + 
    geom_smooth(aes(x = Date, y = N50)) + 
    geom_hline(aes(yintercept = 100), linetype = 2) +
    theme_bw() +
    labs(y = "N50 (Kb)")

scatter_dat <- run_metrics_df %>% filter(lab == "JIMB")
ggplot(scatter_dat) + 
    geom_point(aes(x = Date, y = throughput)) + 
    geom_smooth(aes(x = Date, y = throughput)) + 
        geom_point(data = filter(scatter_dat, optimized_protocol == 1), 
               aes(x = Date, y = throughput), color = "darkorange") + 
    theme_bw() + 
    scale_y_log10() + 
    labs(y = "Throughput (Mb)")
Throughput over time.

Figure 1: Throughput over time.

pore_counts <- jimb_metadata %>%
    select(`Flowcell ID`, Date, `Flowcell Well Count`) %>% 
    dplyr::rename(Flowcell = `Flowcell ID`, pore_num = `Flowcell Well Count`) %>% 
    group_by(Flowcell) %>% 
    summarise(pore_num = as.numeric(max(pore_num))) %>% 
    filter(!is.na(pore_num))
## Warning in evalq(as.numeric(max(pore_num)), <environment>): NAs introduced
## by coercion
run_metrics_df %>% filter(lab == "JIMB") %>% 
    right_join(pore_counts) %>% 
    ggplot() + 
    geom_point(aes(x = Date, y = throughput/pore_num)) + 
    geom_smooth(aes(x = Date, y = throughput/pore_num)) + 
    theme_bw() + 
    scale_y_log10() + 
    labs(y = "Throughput (Mb)/Pores")
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
Throughput normalized by flowcell pores

Figure 2: Throughput normalized by flowcell pores

long_scatter_dat <- scatter_dat %>% 
    select(Date, Run, N50, throughput) %>% 
    mutate(throughput = log10(throughput)) %>% 
    gather("Stat","Value", -Date, -Run)

ggplot(long_scatter_dat) + 
    geom_smooth(aes(x = Date, y = Value)) + 
    geom_point(data = filter(long_scatter_dat, Date < "2018-07-22"),
               aes(x = Date, y = Value)) + 
    geom_point(data = filter(long_scatter_dat, Date > "2018-07-20"), 
               aes(x = Date, y = Value), color = "darkorange") + 
    theme_bw() +
    facet_wrap(~Stat, ncol = 1, scales = "free_y") + 
    labs(y = "Statistic")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

scatter_dat <- run_metrics_df %>% filter(lab == "JIMB") %>% 
    gather()
## Warning: attributes are not identical across measure variables;
## they will be dropped

0.5 N50 and Throughput Lab Comparison

NOTT runs have consistently high throughput but lower N50. JIMB runs from July have high N50 and throughput.

plt <- run_metrics_df %>% 
    ggplot(aes(text = paste("Date:", Date))) + 
    geom_point(aes(x = throughput, y = N50, color = lab, shape = platform)) + 
    scale_x_log10() + 
    scale_color_brewer(palette = 2, type = "qual") +
    theme_bw() + 
    labs(x = "Throughput (Mb)", y = "N50 (Kb)", 
         shape = "Platform", color = "Lab") + 
    theme(legend.position = "bottom")

ggplotly(plt)

Figure 3: Relationship between throughput and N50.

plt <- run_metrics_df %>% group_by(lab) %>% top_n(n = 10, wt = Date) %>% 
    ggplot(aes(text = paste("Date:", Date))) + 
        geom_point(aes(x = throughput, y = N50, 
                       color = lab, shape = platform)) + 
        scale_x_log10() + 
        scale_color_brewer(palette = 2, type = "qual") +
        theme_bw() + 
        labs(x = "Throughput (Mb)", y = "N50 (Kb)", 
             shape = "Platform", color = "Lab")  + 
    theme(legend.position = "bottom")

ggplotly(plt)

Figure 4: Relationship between throughput and N50 for 10 latest runs for each lab.

run_metrics_df %>% filter(throughput > 500, N50 > 59) %>% 
    select(-data) %>% 
    arrange(-N50) %>% 
    knitr::kable(digits = 2, caption = "Runs with N50 > 65Kb and throughput > 400Mb. The `high_precip` Run was performed across flowcells on the two dates. The N50 and throughput were calculated using data from both dates. ")
Table 1: Runs with N50 > 65Kb and throughput > 400Mb. The high_precip Run was performed across flowcells on the two dates. The N50 and throughput were calculated using data from both dates.
lab platform Flowcell Date Run N50 throughput optimized_protocol
JIMB minion FAH92627 2018-07-25 gm24385_180714DNAd 144.09 1201.61 1
JIMB minion FAJ14777 2018-08-23 gm24385_180807DNA2a 141.65 2485.94 1
JIMB minion FAJ18538 2018-08-23 gm24385_180807DNA2c 119.10 1453.43 1
JIMB minion FAH96481 2018-07-22 gm24385_180714DNA 107.23 935.97 1
BIRM gridion FAH86477 2018-06-22 GIAB_GM24385_UB_Run6 105.56 1014.92 NA
JIMB minion FAJ18554 2018-08-23 gm24385_180807DNA2b 102.97 1463.25 1
BIRM gridion FAH86902 2018-06-25 GIAB_GM24385_UB_Run8 97.06 1788.24 NA
JIMB minion FAH95006 2018-07-25 gm24385_180714DNAc 93.70 883.56 1
JIMB minion FAH96699 2018-07-22 gm24385_180714DNAb 93.69 920.29 1
JIMB minion FAJ14951 2018-08-20 gm24385_180807DNA1f 84.53 2803.01 1
JIMB minion FAH86511 2018-06-04 high_precip 84.05 1702.10 0
JIMB minion FAH96539 2018-06-27 high_precip 84.05 1702.10 0
BIRM gridion FAH86787 2018-06-22 GIAB_GM24385_UB_Run5 81.45 961.91 NA
JIMB minion FAH31830 2017-12-04 20171204_2345_171204_24385_PCIA_2 79.65 612.86 NA
JIMB minion FAJ14969 2018-08-14 gm24385_180807DNA1a 63.38 2344.53 1
JIMB minion FAH41164 2017-12-20 20171220_1902_20171220_PCIAx2_DIALYSIS_RAD4 60.95 698.39 NA
JIMB minion FAJ15116 2018-08-14 gm24385_180807DNA1c 60.65 756.58 1
BIRM gridion FAJ04298 2018-06-20 GIAB_GM24385_UB_Run4 59.99 3322.82 NA

0.6 Coverage Comparison

# Coverage plots for all runs. 
cov_plot <- function(data){
    plt <- data %>% 
        mutate(cov_pct = coverage/max(cum_coverage)) %>% 
        ggplot() + 
        geom_path(aes(x = length_bins/1000, y = cum_coverage)) +
            geom_point(aes(x = length_bins/1000, y = cum_coverage)) + 
            geom_text(aes(x = length_bins/1000, y = cum_coverage, 
                          label = signif(coverage,2)), nudge_x = 15, nudge_y = 0.4) + 
            geom_text(aes(x = length_bins/1000, y = cum_coverage, 
                          label = signif(cov_pct,2)*100), nudge_x = 15, nudge_y = 0.25) + 
            geom_text(aes(x = length_bins/1000, y = cum_coverage, 
                          label = n_reads), nudge_x = 15, nudge_y = 0.1) + 
            theme_bw() + 
            labs(x = "Read Length (Kb)", y = "Coverage by Reads > length")
    return(plt)
}

run_metrics_df %>% 
    mutate(panel = map_plot(data, ~cov_plot(.))) %>%
  trelliscope(name = "Coverage By Read Length", nrow = 2, ncol = 2, 
              self_contained = TRUE)
run_metrics_df %>% 
    filter(throughput > 500, N50 > 59, Date != ymd("2018-06-04")) %>%
    unnest() %>% 
ggplot() + 
    geom_path(aes(x = length_bins/1000, y = cum_coverage)) +
    geom_point(aes(x = length_bins/1000, y = cum_coverage, 
               color = lab, shape = platform)) + 
    theme_bw() + 
    labs(x = "Read Length (Kb)", y = "Coverage by Reads > length") + 
    facet_wrap(~Run)
Coverage distribution plots for run with high throughput and N50

Figure 5: Coverage distribution plots for run with high throughput and N50

run_metrics_df %>% 
    filter(str_detect(Run, "Prom")) %>%
    unnest() %>% 
    mutate(cov_pct = coverage/max(cum_coverage)) %>% 
ggplot() + 
    geom_path(aes(x = length_bins/1000, y = cum_coverage)) +
    geom_point(aes(x = length_bins/1000, y = cum_coverage)) + 
    geom_text(aes(x = length_bins/1000, y = cum_coverage, 
                  label = signif(coverage,2)), nudge_x = 15, nudge_y = 0.4) + 
    geom_text(aes(x = length_bins/1000, y = cum_coverage, 
                  label = signif(cov_pct,2)*100), nudge_x = 15, nudge_y = 0.25) + 
    geom_text(aes(x = length_bins/1000, y = cum_coverage, 
                  label = n_reads), nudge_x = 15, nudge_y = 0.1) + 
    theme_bw() + 
    labs(x = "Read Length (Kb)", y = "Coverage by Reads > length")
Prometion run coverage. Text coverage by length bin, percent of run total, and number of reads.

Figure 6: Prometion run coverage. Text coverage by length bin, percent of run total, and number of reads.

total_summary <-  run_metrics_df %>% 
    unnest() %>% 
    group_by(length_bins) %>% 
    summarise(coverage = sum(coverage), 
              n_reads = sum(n_reads),
              cum_coverage = sum(cum_coverage)) %>% 
    mutate(cov_pct = coverage/max(cum_coverage))

ggplot(total_summary) + 
    geom_path(aes(x = length_bins/1000, y = cum_coverage)) +
    geom_point(aes(x = length_bins/1000, y = cum_coverage)) + 
    geom_text(aes(x = length_bins/1000, y = cum_coverage, 
                  label = signif(coverage,2)), 
              nudge_x = 17, nudge_y = 2) + 
    geom_text(aes(x = length_bins/1000, y = cum_coverage, 
                  label = paste0(signif(cov_pct,2)*100,"%")), 
              nudge_x = 15, nudge_y = 1.25) + 
    geom_text(aes(x = length_bins/1000, y = cum_coverage, 
                  label = round(n_reads/1000,1)), 
              nudge_x = 15, nudge_y = 0.5) + 
    theme_bw() + 
    labs(x = "Read Length (Kb)", y = "Coverage by Reads > length")
Total coverage across all runs. Text coverage by length bin, percent of run total, and number of reads (1k).

Figure 7: Total coverage across all runs. Text coverage by length bin, percent of run total, and number of reads (1k).

aln_stat_df %>% 
    filter(aln_lengthmax > 500000) %>% 
    ggplot() + 
    geom_histogram(aes(x = aln_lengthmax/100000)) + 
    theme_bw() + 
    labs(x = "Read Length (100kb)", y = "Number of Reads")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

aln_stat_df %>% arrange(-aln_lengthmax) %>% 
    top_n(10) %>% 
    select(Run, aln_lengthmax) %>% 
    mutate(aln_lengthmax = formatC(aln_lengthmax, 
                                format="d", 
                                big.mark=",")) %>% 
    knitr::kable(align = c("l","r"))
## Selecting by bases
Run aln_lengthmax
gm24385_180807DNA2a 1,569,394
gm24385_180807DNA1a 1,262,407
NoDip 1,191,269
gm24385_180807DNA2c 1,191,025
NoDip 1,181,879
gm24385_180807DNA2a 980,211
gm24385_180807DNA1c 870,155
20171201_0030_171130_PCIA_2 774,983
gm24385_180807DNA2a 643,092
gm24385_180807DNA2c 539,559
long_bin_metrics <- run_metrics_df %>% 
    unnest() %>% 
    mutate(cov_pct = coverage/max(cum_coverage)*100) %>% 
    select(lab, platform, Run, length_bins, 
           coverage, cum_coverage, cov_pct, n_reads) %>% 
    gather("metric","value", -lab, -platform, -Run, -length_bins) 

ggplot(long_bin_metrics) +
    geom_boxplot(aes(x = factor(length_bins), y = value, fill = lab)) +
    scale_y_log10() + 
    theme_bw() + 
    facet_wrap(~metric, scales = "free_y")
Distribution coverage metrics and number of reads by read length bins. Coverage percent (cov_pct) is the percent of run total coverage for reads in a bin. Coverage is genome coverage by read length bin. Cumulative coverage is genome coverage by reads longer than read bin. Number of reads (n_reads) is the number of reads in each read bin.

Figure 8: Distribution coverage metrics and number of reads by read length bins. Coverage percent (cov_pct) is the percent of run total coverage for reads in a bin. Coverage is genome coverage by read length bin. Cumulative coverage is genome coverage by reads longer than read bin. Number of reads (n_reads) is the number of reads in each read bin.

long_bin_metrics <- run_metrics_df %>% 
    group_by(lab) %>% 
    top_n(10, wt = Date) %>% 
    unnest() %>% 
    mutate(cov_pct = coverage/max(cum_coverage)*100) %>% 
    select(lab, platform, Run, length_bins, 
           coverage, cum_coverage, cov_pct, n_reads) %>% 
    gather("metric","value", -lab, -platform, -Run, -length_bins) 

ggplot(long_bin_metrics) +
    geom_boxplot(aes(x = factor(length_bins), y = value, fill = lab)) +
    scale_y_log10() + 
    theme_bw() + 
    facet_wrap(~metric, scales = "free_y")
Distribution coverage metrics and number of reads by read length bins for last 10 runs from each lab. Coverage percent (cov_pct) is the percent of run total coverage for reads in a bin. Coverage is genome coverage by read length bin. Cumulative coverage is genome coverage by reads longer than read bin. Number of reads (n_reads) is the number of reads in each read bin.

Figure 9: Distribution coverage metrics and number of reads by read length bins for last 10 runs from each lab. Coverage percent (cov_pct) is the percent of run total coverage for reads in a bin. Coverage is genome coverage by read length bin. Cumulative coverage is genome coverage by reads longer than read bin. Number of reads (n_reads) is the number of reads in each read bin.

0.6.1 Run Metrics

run_metrics_df %>% 
    mutate(N50 = signif(N50, 2), throughput = signif(throughput, 3)) %>%
    unnest() %>% 
    filter(length_bins == 0) %>% 
    mutate(`Total Coverage` = cum_coverage) %>% 
    select(lab, platform, Date, Run, N50, throughput, `Total Coverage`) %>% 
    datatable(caption = "Summary metrics for all runs", rownames = FALSE)