Nanopore sequencing of the GIAB HG002 was performed at three sites, JIMB, NOTT, and BIRM. The following analysis summarizes the sequencing runs generated at the sites comparing run throughput, N50, and coverage by read length. Sequence metrics were calculated from the alignment files (CRAM). Therefore the run metrics is likely lower than metrics calculated from the basecall sequence files (fastq).
Nanopore sequencing platforms
Downloading run metadata from googlesheet.
nanopore_gs <- gs_title("Nanopore runs")
jimb_metadata <- nanopore_gs %>%
gs_read(ws = "GM24385")
nott_metadata <- nanopore_gs %>%
gs_read(ws = "Nott")
Pulling alignment statistics from stanford oak filesystem,
requires mounting stanford oak filesystem as stanford_oak directory.
The data were saved locally to make it easier to re-compile Rmarkdown.
if (TRUE) {
stat_files <- list.files("nanopore/stats/",
pattern = ".*stats.tsv.gz", full.names = TRUE,
recursive = TRUE)
# BAM statistics combined bam stat files
col_names <- c("read_id",
"aln_lengthsum",
"aln_lengthmax",
"aln_count",
"ref_lengthsum",
"ref_lengthmax",
"ref_lengthcount",
"bases")
aln_stat_files <- grep("/aln/", stat_files, value = TRUE)
aln_names <- str_extract(aln_stat_files, "(?<=stats//).*(?=/aln)")
aln_stat_df <- aln_stat_files %>%
set_names(aln_names) %>%
map_dfr(read_tsv,
#col_names = col_names,
#col_types = "ciiiiiid",
.id = "Run")
saveRDS(aln_stat_df, "aln_stat_df.RDS")
} else {
aln_stat_df <- readRDS("aln_stat_df.RDS")
}
Run statistics calculated using alignment lengths from bam statistic files.
Run Statistics
- N50 in Kb, length of read where shorter reads represent half the run half the run throughput.
- Throughput in Mb total length of aligned reads.
- Coverage by read length thresholds: all, 50kb, 100kb, 250kb, 5000kb.
## Coverage by length bins
cov_breaks <- c(0,50000,100000, 250000, 500000)
cov_df <- aln_stat_df %>%
mutate(length_bins = cut(aln_lengthmax,
breaks = c(cov_breaks, max(.$aln_lengthmax)),
labels = cov_breaks)) %>%
group_by(Run, length_bins) %>%
summarise(coverage = sum(as.numeric(aln_lengthmax))/3.1e9,
n_reads = n()) %>%
group_by(Run) %>%
arrange(desc(length_bins)) %>%
mutate(cum_coverage = cumsum(coverage)) %>%
mutate(length_bins = as.numeric(as.character(length_bins)))
nested_cov_df <- cov_df %>% group_by(Run) %>% nest()
## N50 and throughput
calc_n50 <- function(seq_lengths){
sorted_lengths <- sort(seq_lengths)
cum_lengths <- cumsum(sorted_lengths)
min(sorted_lengths[cum_lengths >= max(cum_lengths)*0.5])
}
aln_summary_stats_df <- aln_stat_df %>%
mutate(aln_lengthmax = as.numeric(aln_lengthmax)) %>%
group_by(Run) %>%
summarise(N50 = calc_n50(aln_lengthmax),
throughput = sum(aln_lengthmax))
# Annotating with run metadata
jimb_run_metrics <- jimb_metadata %>%
filter(!is.na(Run)) %>%
select(-N50) %>%
left_join(aln_summary_stats_df) %>%
mutate(Date = mdy(Date)) %>%
filter(!is.na(N50)) %>%
mutate(lab = "JIMB",
platform = "minion")
## Joining, by = "Run"
nott_run_metrics <- nott_metadata %>%
mutate(Date = mdy(Date)) %>%
left_join(aln_summary_stats_df) %>%
mutate(lab = "NOTT",
platform = case_when(str_detect(Run, "_UB_") ~ "gridion",
str_detect(Run, "Prom") ~ "promethion",
TRUE ~ "minion"),
lab = if_else(platform == "gridion", "BIRM", lab))
## Joining, by = "Run"
run_metrics_df <-
bind_rows(nott_run_metrics, jimb_run_metrics) %>%
rename(Flowcell = `Flowcell ID`) %>%
select(lab, platform, Flowcell, Date, Run, N50, throughput, optimized_protocol) %>%
mutate(N50 = N50/1e3, throughput = throughput/1e6) %>%
left_join(nested_cov_df)
## Joining, by = "Run"
Read length distributions were similar across runs. The x-axis scales vary in the following plots making it hard to compare runs. In general runs with distributions shifted to the right have shorter reads.
aln_stat_df %>% group_by(Run) %>%
nest() %>%
mutate(panel = map_plot(data, ~ ggplot(.) +
geom_histogram(aes(x = aln_lengthmax)) +
theme_bw())) %>%
trelliscope(name = "Read Length Distribution", nrow = 2, ncol = 2,
self_contained = TRUE)
Variation in N50 and throughput during JIMB run optimization. Most recent runs have high throughput and N50.
Orange points represent runs performed using the optimized protocol.
scatter_dat <- run_metrics_df %>% filter(lab == "JIMB")
ggplot(scatter_dat) +
geom_point(aes(x = Date, y = N50)) +
geom_point(data = filter(scatter_dat, optimized_protocol == 1),
aes(x = Date, y = N50), color = "darkorange") +
geom_smooth(aes(x = Date, y = N50)) +
geom_hline(aes(yintercept = 100), linetype = 2) +
theme_bw() +
labs(y = "N50 (Kb)")
scatter_dat <- run_metrics_df %>% filter(lab == "JIMB")
ggplot(scatter_dat) +
geom_point(aes(x = Date, y = throughput)) +
geom_smooth(aes(x = Date, y = throughput)) +
geom_point(data = filter(scatter_dat, optimized_protocol == 1),
aes(x = Date, y = throughput), color = "darkorange") +
theme_bw() +
scale_y_log10() +
labs(y = "Throughput (Mb)")
Figure 1: Throughput over time.
pore_counts <- jimb_metadata %>%
select(`Flowcell ID`, Date, `Flowcell Well Count`) %>%
dplyr::rename(Flowcell = `Flowcell ID`, pore_num = `Flowcell Well Count`) %>%
group_by(Flowcell) %>%
summarise(pore_num = as.numeric(max(pore_num))) %>%
filter(!is.na(pore_num))
## Warning in evalq(as.numeric(max(pore_num)), <environment>): NAs introduced
## by coercion
run_metrics_df %>% filter(lab == "JIMB") %>%
right_join(pore_counts) %>%
ggplot() +
geom_point(aes(x = Date, y = throughput/pore_num)) +
geom_smooth(aes(x = Date, y = throughput/pore_num)) +
theme_bw() +
scale_y_log10() +
labs(y = "Throughput (Mb)/Pores")
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
Figure 2: Throughput normalized by flowcell pores
long_scatter_dat <- scatter_dat %>%
select(Date, Run, N50, throughput) %>%
mutate(throughput = log10(throughput)) %>%
gather("Stat","Value", -Date, -Run)
ggplot(long_scatter_dat) +
geom_smooth(aes(x = Date, y = Value)) +
geom_point(data = filter(long_scatter_dat, Date < "2018-07-22"),
aes(x = Date, y = Value)) +
geom_point(data = filter(long_scatter_dat, Date > "2018-07-20"),
aes(x = Date, y = Value), color = "darkorange") +
theme_bw() +
facet_wrap(~Stat, ncol = 1, scales = "free_y") +
labs(y = "Statistic")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
scatter_dat <- run_metrics_df %>% filter(lab == "JIMB") %>%
gather()
## Warning: attributes are not identical across measure variables;
## they will be dropped
NOTT runs have consistently high throughput but lower N50. JIMB runs from July have high N50 and throughput.
plt <- run_metrics_df %>%
ggplot(aes(text = paste("Date:", Date))) +
geom_point(aes(x = throughput, y = N50, color = lab, shape = platform)) +
scale_x_log10() +
scale_color_brewer(palette = 2, type = "qual") +
theme_bw() +
labs(x = "Throughput (Mb)", y = "N50 (Kb)",
shape = "Platform", color = "Lab") +
theme(legend.position = "bottom")
ggplotly(plt)
Figure 3: Relationship between throughput and N50.
plt <- run_metrics_df %>% group_by(lab) %>% top_n(n = 10, wt = Date) %>%
ggplot(aes(text = paste("Date:", Date))) +
geom_point(aes(x = throughput, y = N50,
color = lab, shape = platform)) +
scale_x_log10() +
scale_color_brewer(palette = 2, type = "qual") +
theme_bw() +
labs(x = "Throughput (Mb)", y = "N50 (Kb)",
shape = "Platform", color = "Lab") +
theme(legend.position = "bottom")
ggplotly(plt)
Figure 4: Relationship between throughput and N50 for 10 latest runs for each lab.
run_metrics_df %>% filter(throughput > 500, N50 > 59) %>%
select(-data) %>%
arrange(-N50) %>%
knitr::kable(digits = 2, caption = "Runs with N50 > 65Kb and throughput > 400Mb. The `high_precip` Run was performed across flowcells on the two dates. The N50 and throughput were calculated using data from both dates. ")
| lab | platform | Flowcell | Date | Run | N50 | throughput | optimized_protocol |
|---|---|---|---|---|---|---|---|
| JIMB | minion | FAH92627 | 2018-07-25 | gm24385_180714DNAd | 144.09 | 1201.61 | 1 |
| JIMB | minion | FAJ14777 | 2018-08-23 | gm24385_180807DNA2a | 141.65 | 2485.94 | 1 |
| JIMB | minion | FAJ18538 | 2018-08-23 | gm24385_180807DNA2c | 119.10 | 1453.43 | 1 |
| JIMB | minion | FAH96481 | 2018-07-22 | gm24385_180714DNA | 107.23 | 935.97 | 1 |
| BIRM | gridion | FAH86477 | 2018-06-22 | GIAB_GM24385_UB_Run6 | 105.56 | 1014.92 | NA |
| JIMB | minion | FAJ18554 | 2018-08-23 | gm24385_180807DNA2b | 102.97 | 1463.25 | 1 |
| BIRM | gridion | FAH86902 | 2018-06-25 | GIAB_GM24385_UB_Run8 | 97.06 | 1788.24 | NA |
| JIMB | minion | FAH95006 | 2018-07-25 | gm24385_180714DNAc | 93.70 | 883.56 | 1 |
| JIMB | minion | FAH96699 | 2018-07-22 | gm24385_180714DNAb | 93.69 | 920.29 | 1 |
| JIMB | minion | FAJ14951 | 2018-08-20 | gm24385_180807DNA1f | 84.53 | 2803.01 | 1 |
| JIMB | minion | FAH86511 | 2018-06-04 | high_precip | 84.05 | 1702.10 | 0 |
| JIMB | minion | FAH96539 | 2018-06-27 | high_precip | 84.05 | 1702.10 | 0 |
| BIRM | gridion | FAH86787 | 2018-06-22 | GIAB_GM24385_UB_Run5 | 81.45 | 961.91 | NA |
| JIMB | minion | FAH31830 | 2017-12-04 | 20171204_2345_171204_24385_PCIA_2 | 79.65 | 612.86 | NA |
| JIMB | minion | FAJ14969 | 2018-08-14 | gm24385_180807DNA1a | 63.38 | 2344.53 | 1 |
| JIMB | minion | FAH41164 | 2017-12-20 | 20171220_1902_20171220_PCIAx2_DIALYSIS_RAD4 | 60.95 | 698.39 | NA |
| JIMB | minion | FAJ15116 | 2018-08-14 | gm24385_180807DNA1c | 60.65 | 756.58 | 1 |
| BIRM | gridion | FAJ04298 | 2018-06-20 | GIAB_GM24385_UB_Run4 | 59.99 | 3322.82 | NA |
# Coverage plots for all runs.
cov_plot <- function(data){
plt <- data %>%
mutate(cov_pct = coverage/max(cum_coverage)) %>%
ggplot() +
geom_path(aes(x = length_bins/1000, y = cum_coverage)) +
geom_point(aes(x = length_bins/1000, y = cum_coverage)) +
geom_text(aes(x = length_bins/1000, y = cum_coverage,
label = signif(coverage,2)), nudge_x = 15, nudge_y = 0.4) +
geom_text(aes(x = length_bins/1000, y = cum_coverage,
label = signif(cov_pct,2)*100), nudge_x = 15, nudge_y = 0.25) +
geom_text(aes(x = length_bins/1000, y = cum_coverage,
label = n_reads), nudge_x = 15, nudge_y = 0.1) +
theme_bw() +
labs(x = "Read Length (Kb)", y = "Coverage by Reads > length")
return(plt)
}
run_metrics_df %>%
mutate(panel = map_plot(data, ~cov_plot(.))) %>%
trelliscope(name = "Coverage By Read Length", nrow = 2, ncol = 2,
self_contained = TRUE)
run_metrics_df %>%
filter(throughput > 500, N50 > 59, Date != ymd("2018-06-04")) %>%
unnest() %>%
ggplot() +
geom_path(aes(x = length_bins/1000, y = cum_coverage)) +
geom_point(aes(x = length_bins/1000, y = cum_coverage,
color = lab, shape = platform)) +
theme_bw() +
labs(x = "Read Length (Kb)", y = "Coverage by Reads > length") +
facet_wrap(~Run)
Figure 5: Coverage distribution plots for run with high throughput and N50
run_metrics_df %>%
filter(str_detect(Run, "Prom")) %>%
unnest() %>%
mutate(cov_pct = coverage/max(cum_coverage)) %>%
ggplot() +
geom_path(aes(x = length_bins/1000, y = cum_coverage)) +
geom_point(aes(x = length_bins/1000, y = cum_coverage)) +
geom_text(aes(x = length_bins/1000, y = cum_coverage,
label = signif(coverage,2)), nudge_x = 15, nudge_y = 0.4) +
geom_text(aes(x = length_bins/1000, y = cum_coverage,
label = signif(cov_pct,2)*100), nudge_x = 15, nudge_y = 0.25) +
geom_text(aes(x = length_bins/1000, y = cum_coverage,
label = n_reads), nudge_x = 15, nudge_y = 0.1) +
theme_bw() +
labs(x = "Read Length (Kb)", y = "Coverage by Reads > length")
Figure 6: Prometion run coverage. Text coverage by length bin, percent of run total, and number of reads.
total_summary <- run_metrics_df %>%
unnest() %>%
group_by(length_bins) %>%
summarise(coverage = sum(coverage),
n_reads = sum(n_reads),
cum_coverage = sum(cum_coverage)) %>%
mutate(cov_pct = coverage/max(cum_coverage))
ggplot(total_summary) +
geom_path(aes(x = length_bins/1000, y = cum_coverage)) +
geom_point(aes(x = length_bins/1000, y = cum_coverage)) +
geom_text(aes(x = length_bins/1000, y = cum_coverage,
label = signif(coverage,2)),
nudge_x = 17, nudge_y = 2) +
geom_text(aes(x = length_bins/1000, y = cum_coverage,
label = paste0(signif(cov_pct,2)*100,"%")),
nudge_x = 15, nudge_y = 1.25) +
geom_text(aes(x = length_bins/1000, y = cum_coverage,
label = round(n_reads/1000,1)),
nudge_x = 15, nudge_y = 0.5) +
theme_bw() +
labs(x = "Read Length (Kb)", y = "Coverage by Reads > length")
Figure 7: Total coverage across all runs. Text coverage by length bin, percent of run total, and number of reads (1k).
aln_stat_df %>%
filter(aln_lengthmax > 500000) %>%
ggplot() +
geom_histogram(aes(x = aln_lengthmax/100000)) +
theme_bw() +
labs(x = "Read Length (100kb)", y = "Number of Reads")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
aln_stat_df %>% arrange(-aln_lengthmax) %>%
top_n(10) %>%
select(Run, aln_lengthmax) %>%
mutate(aln_lengthmax = formatC(aln_lengthmax,
format="d",
big.mark=",")) %>%
knitr::kable(align = c("l","r"))
## Selecting by bases
| Run | aln_lengthmax |
|---|---|
| gm24385_180807DNA2a | 1,569,394 |
| gm24385_180807DNA1a | 1,262,407 |
| NoDip | 1,191,269 |
| gm24385_180807DNA2c | 1,191,025 |
| NoDip | 1,181,879 |
| gm24385_180807DNA2a | 980,211 |
| gm24385_180807DNA1c | 870,155 |
| 20171201_0030_171130_PCIA_2 | 774,983 |
| gm24385_180807DNA2a | 643,092 |
| gm24385_180807DNA2c | 539,559 |
long_bin_metrics <- run_metrics_df %>%
unnest() %>%
mutate(cov_pct = coverage/max(cum_coverage)*100) %>%
select(lab, platform, Run, length_bins,
coverage, cum_coverage, cov_pct, n_reads) %>%
gather("metric","value", -lab, -platform, -Run, -length_bins)
ggplot(long_bin_metrics) +
geom_boxplot(aes(x = factor(length_bins), y = value, fill = lab)) +
scale_y_log10() +
theme_bw() +
facet_wrap(~metric, scales = "free_y")
Figure 8: Distribution coverage metrics and number of reads by read length bins. Coverage percent (cov_pct) is the percent of run total coverage for reads in a bin. Coverage is genome coverage by read length bin. Cumulative coverage is genome coverage by reads longer than read bin. Number of reads (n_reads) is the number of reads in each read bin.
long_bin_metrics <- run_metrics_df %>%
group_by(lab) %>%
top_n(10, wt = Date) %>%
unnest() %>%
mutate(cov_pct = coverage/max(cum_coverage)*100) %>%
select(lab, platform, Run, length_bins,
coverage, cum_coverage, cov_pct, n_reads) %>%
gather("metric","value", -lab, -platform, -Run, -length_bins)
ggplot(long_bin_metrics) +
geom_boxplot(aes(x = factor(length_bins), y = value, fill = lab)) +
scale_y_log10() +
theme_bw() +
facet_wrap(~metric, scales = "free_y")
Figure 9: Distribution coverage metrics and number of reads by read length bins for last 10 runs from each lab. Coverage percent (cov_pct) is the percent of run total coverage for reads in a bin. Coverage is genome coverage by read length bin. Cumulative coverage is genome coverage by reads longer than read bin. Number of reads (n_reads) is the number of reads in each read bin.
run_metrics_df %>%
mutate(N50 = signif(N50, 2), throughput = signif(throughput, 3)) %>%
unnest() %>%
filter(length_bins == 0) %>%
mutate(`Total Coverage` = cum_coverage) %>%
select(lab, platform, Date, Run, N50, throughput, `Total Coverage`) %>%
datatable(caption = "Summary metrics for all runs", rownames = FALSE)