fastqのkmerカウント(リードの長さが同じ場合).
fastq読み込み.
- ShortReadパッケージのサンプルデータを使用.
Biostrings::readDNAStringSet(filepath, format)でfastqを指定した場合,quality値は読み込まれない.
Biostrings::subseqを用いてリードのpositionごとにkmerカウント.
Biostrings::subseqでは全配列の指定positionから部分配列をまとめて取得可能.
- startをベクトルで与えることができないため, リードの長さが同じでなければk-merの取得ができない.
k <- 7
wd <- unique(BiocGenerics::width(fa))
if (length(wd) == 1) {
starts <- 1:(wd - k + 1)
}
kmer_content_bsub <- lapply(seq_along(starts), function(i){
as.character(Biostrings::subseq(fa, start = starts[i], width = k))
}) %>%
setNames(., sprintf("s%02d", 1:length(.))) %>%
do.call(cbind, .) %>%
tibble::as_tibble(.name_repair = "minimal") %>%
tidyr::gather(., "start", "kmer") %>%
dplyr::group_by(start) %>%
dplyr::count(., kmer) %>%
tidyr::spread("start", "n") %>%
dplyr::filter(., !grepl("N", kmer)) %>%
replace(., is.na(.), 0)
kmers_bsub <- kmer_content_bsub %>%
{mutate(., count = rowSums(.[-1]))} %>%
select(kmer, count) %>%
arrange(desc(count))
# kmer_table表示
kableExtra::kable(kmer_content_bsub[1:6,1:6], format = "html", align = 'cr',
caption = "k-mer contents using Biostrings::subseq") %>%
kableExtra::kable_styling(bootstrap_options = "striped", full_width = F, position = "float_left")
kableExtra::kable(kmers_bsub[1:6,], format = "html", align = 'cr', caption = "total k-mer") %>%
kableExtra::kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
k-mer contents using Biostrings::subseq
|
kmer
|
s01
|
s02
|
s03
|
s04
|
s05
|
|
AAAAAAA
|
13
|
6
|
4
|
4
|
3
|
|
AAAAAAC
|
8
|
6
|
0
|
0
|
0
|
|
AAAAAAG
|
6
|
7
|
1
|
1
|
2
|
|
AAAAAAT
|
9
|
3
|
1
|
0
|
0
|
|
AAAAACA
|
5
|
9
|
4
|
2
|
0
|
|
AAAAACC
|
2
|
6
|
6
|
0
|
0
|
total k-mer
|
kmer
|
count
|
|
GGGGGGG
|
1497
|
|
CCCCCCC
|
1409
|
|
CAGCAGC
|
608
|
|
GCTGCTG
|
588
|
|
GCAGCAG
|
540
|
|
CTGCTGC
|
530
|
stringr::str_subを使ってリード毎にk-merカウント.
stringr::str_subはstartとendをベクトルで与えることができる.
k <- 7
starts <- 1:unique(width(fa) - k + 1)
ends <- starts + k - 1
kmers_contents_ssub <- unlist(stringr::str_split(toString(fa), ", ")) %>%
lapply(., function(x) stringr::str_sub(x, starts, ends)) %>%
do.call(rbind, .) %>%
as_tibble(.name_repair = "minimal") %>%
setNames(., sprintf("s%02d", 1:ncol(.))) %>%
tidyr::gather(., "start", "kmer") %>%
group_by(start) %>%
dplyr::count(., kmer) %>%
tidyr::spread("start", "n") %>%
dplyr::filter(., !grepl("N", kmer)) %>%
replace(., is.na(.), 0)
kmers_ssub <- kmers_contents_ssub %>%
{mutate(., count = rowSums(.[-1]))} %>%
select(kmer, count) %>%
arrange(desc(count))
kableExtra::kable(kmers_contents_ssub[1:6,1:6], format = "html", align = 'cr',
caption = "k-mer content using stringr::str_sub") %>%
kableExtra::kable_styling(bootstrap_options = "striped", full_width = F, position = "float_left")
kableExtra::kable(kmers_ssub[1:6,], format = "html", align = 'cr', caption = "total k-mer") %>%
kableExtra::kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
k-mer content using stringr::str_sub
|
kmer
|
s01
|
s02
|
s03
|
s04
|
s05
|
|
AAAAAAA
|
13
|
6
|
4
|
4
|
3
|
|
AAAAAAC
|
8
|
6
|
0
|
0
|
0
|
|
AAAAAAG
|
6
|
7
|
1
|
1
|
2
|
|
AAAAAAT
|
9
|
3
|
1
|
0
|
0
|
|
AAAAACA
|
5
|
9
|
4
|
2
|
0
|
|
AAAAACC
|
2
|
6
|
6
|
0
|
0
|
total k-mer
|
kmer
|
count
|
|
GGGGGGG
|
1497
|
|
CCCCCCC
|
1409
|
|
CAGCAGC
|
608
|
|
GCTGCTG
|
588
|
|
GCAGCAG
|
540
|
|
CTGCTGC
|
530
|
Biostrings::oligonucleotideFrequencyを使って全リードのkmerカウント.
oligonucleotideFrequency(fas, width) widthにkの数を指定. リード数が大きくなるとエラー
# リード毎のkmerカウントを取得して集計
kmers_of <- Biostrings::oligonucleotideFrequency(fa, width = 7) %>%
apply(., 2, sum) %>%
as.data.frame %>%
setNames(c("count")) %>%
tibble::rownames_to_column(., var = "kmer") %>%
dplyr::filter(., !grepl("N", kmer)) %>%
arrange(desc(count))
# kmer_table表示
kableExtra::kable(kmers_of[1:6,], format = "html", align = 'cr',
caption = "total k-mer with oligonucleotideFrequency") %>%
kableExtra::kable_styling(bootstrap_options = "striped", full_width = F, position = "left") %>%
kableExtra::column_spec(2, width = "10em")
total k-mer with oligonucleotideFrequency
|
kmer
|
count
|
|
GGGGGGG
|
1497
|
|
CCCCCCC
|
1409
|
|
CAGCAGC
|
608
|
|
GCTGCTG
|
588
|
|
GCAGCAG
|
540
|
|
CTGCTGC
|
530
|
fastqのk-merカウント(リード毎に長さが異なる場合).
リード毎にstringr::str_subを用いてk-merカウント.
- k-merのリストを取得後,
NAでfillしてデータフレームに変換後, dplyr, tidyrを使って集計
- リードの方をNで埋めて
Biostrings::subseqを用いた場合, リード数が増えても時間はたいしてかわらなかった.
k <- 7
wd <- Biostrings::width(fa2)
kmer_contents_ssub <- unlist(stringr::str_split(toString(fa2), ", ")) %>%
{
if (length(unique(wd)) != 1) {
lapply(., function(x){
st <- 1:(nchar(x) - k + 1)
ed <- st + k - 1
res <- stringr::str_sub(x, st, ed)
mxlen <- max(wd)
c(res, rep(NA, mxlen - length(res)))
})
} else {
starts <- 1:unique(wd - k + 1)
ends <- starts + k - 1
lapply(., function(x) stringr::str_sub(x, starts, ends))
}
} %>%
do.call(rbind, .) %>%
tibble::as_tibble(.name_repair = "minimal") %>%
setNames(., sprintf("s%02d", 1:ncol(.))) %>%
tidyr::gather(., "start", "kmer") %>%
group_by(start) %>%
count(., kmer) %>%
tidyr::spread("start", "n") %>%
filter(., !grepl("N", kmer)) %>%
replace(., is.na(.), 0)
kmers_ssub <- kmer_contents_ssub %>%
{mutate(., count = rowSums(.[-1]))} %>%
select(kmer, count) %>%
filter(kmer != "0") %>%
arrange(desc(count))
# k-mer_table表示
kableExtra::kable(kmer_contents_ssub[1:6,1:6], format = "html", align = 'cr',
caption = "k-mer content for different read length") %>%
kableExtra::kable_styling(bootstrap_options = "striped", full_width = F, position = "float_left")
# k-mer_content
kableExtra::kable(kmers_ssub[1:6,], format = "html", align = 'cr', caption = "total k-mer") %>%
kableExtra::kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
k-mer content for different read length
|
kmer
|
s01
|
s02
|
s03
|
s04
|
s05
|
|
AAAAAAA
|
13
|
5
|
3
|
3
|
2
|
|
AAAAAAC
|
8
|
6
|
0
|
0
|
0
|
|
AAAAAAG
|
6
|
7
|
1
|
1
|
2
|
|
AAAAAAT
|
9
|
3
|
1
|
0
|
0
|
|
AAAAACA
|
5
|
9
|
4
|
2
|
0
|
|
AAAAACC
|
2
|
6
|
6
|
0
|
0
|
total k-mer
|
kmer
|
count
|
|
CAGCAGC
|
511
|
|
GCTGCTG
|
501
|
|
GCAGCAG
|
448
|
|
CTGCTGC
|
446
|
|
GGAGGAG
|
443
|
|
CAGCAGG
|
426
|
fastqファイルを分割して並列処理.
stringr::str_subを使ってリード毎にk-merカウントする関数
- 入力はfastqファイルパスもしくはDNAStringsetオブジェクト
- 戻り値の形を
resfmtで指定する. リードポジションごとのk-mer数 “pos” もしくは総リードのk-mer数 “total”. それぞれデータフレームで返す.
kmer_cnt <- function(x, k, resfmt = "total"){
# x is a file path of fastq or DNAStringSet object
if (class(x) != "DNAStringSet") {
fa <- Biostrings::readDNAStringSet(x, format = "fastq")
} else if (class(x) == "DNAStringSet") {
fa <- x
} else {
stop("'x' must be a fastq file path or a DNAStringSet object.")
}
# k-mer table per start position
wd <- Biostrings::width(fa)
kmer_tab <- unlist(stringr::str_split(toString(fa), ", ")) %>%
{
if (length(unique(wd)) != 1) {
lapply(., function(x){
st <- 1:(nchar(x) - k + 1)
ed <- st + k - 1
res <- stringr::str_sub(x, st, ed)
mxst <- max(wd) - k + 1
c(res, rep(NA, mxst - length(res)))
})
} else {
starts <- 1:(unique(wd) - k + 1)
ends <- starts + k - 1
lapply(., function(x) stringr::str_sub(x, starts, ends))
}
} %>%
do.call(rbind, .) %>%
tibble::as_tibble(.name_repair = "minimal") %>%
setNames(., sprintf(paste0("s%0",nchar(ncol(.)), "d"), 1:ncol(.))) %>%
tidyr::gather(., "start", "kmer") %>%
dplyr::group_by(start) %>%
dplyr::count(., kmer) %>%
tidyr::spread("start", "n") %>%
dplyr::filter(., !grepl("N", kmer)) %>%
replace(., is.na(.), 0) %>%
{
if (resfmt == "pos") { # k-mer content at read position
return(.)
} else if (resfmt == "total") { # total k-mer table
{dplyr::mutate(., count = rowSums(.[-1]))} %>%
dplyr::select(kmer, count) %>%
dplyr::filter(kmer != "0") %>%
dplyr::arrange(desc(count))
}
}
return(kmer_tab)
}
リードを分割して並列処理を行う.
kmer_strm <- function(x, k, n, resfmt = "total", type = "PSOCK"){
# x is a file path of fastq or DNAStringSet object
if (class(x) != "DNAStringSet") {
fa <- Biostrings::readDNAStringSet(x, format = "fastq")
} else if (class(x) == "DNAStringSet") {
fa <- x
} else {
stop("'x' must be a fastq file path or a DNAStringSet object.")
}
# function of splitting fasta sequences to chunk.
chunk <- function(x, n) split(x, ceiling(seq_along(x)/n))
chnk_fq <- chunk(fa, n)
# parallel processing of fasta sequence chunks
cores <- parallel::detectCores(logical = FALSE)
cluster <- parallel::makeCluster(cores, type)
if (type == "PSOCK") {
parallel::clusterEvalQ(cluster, library(dplyr))
parallel::clusterExport(cluster, "kmer_cnt")
}
doParallel::registerDoParallel(cluster)
res_chnks <- foreach::foreach(fa = chnk_fq) %dopar% {
kmer_cnt(fa, k, resfmt)
}
parallel::stopCluster(cluster)
# merge results of parallel processing
if (resfmt == "total") {
res <- dplyr::bind_rows(res_chnks) %>%
group_by(kmer) %>%
mutate(count = sum(count)) %>%
slice(1) %>%
arrange(desc(count))
} else if (resfmt == "pos") {
res <- dplyr::bind_rows(res_chnks) %>%
group_by(kmer) %>%
mutate_at(-1, funs(sum)) %>%
slice(1)
}
return(res)
}
時間を計測
- 250000リードずつに分割
- kの数が増えると差が大きくなる.
library(dplyr); library(doParallel); library(foreach); library(microbenchmark)
# 引数
fp <- system.file("extdata/E-MTAB-1147", package = "ShortRead") # ShortReadのサンプルデータ場所
fqs <- list.files(fp, "fastq.gz", full.names = T) # fastqファイルパス
fa <- Biostrings::readDNAStringSet(fqs[1], "fastq") # fastqファイル読み込み(2万リード)
fas <- sample(fa, 10^6, replace = T) # 1M リード
# k-mer count
k <- 7; n <- 250000
system.time(res_1 <- kmer_cnt(fas, k, "total"))
system.time(res_2 <- kmer_strm(fas, k, n, "total", "PSOCK"))
system.time(res_3 <- kmer_strm(fas, k, n, "total", "FORK"))
# sapply(list(as.matrix(res_2), as.matrix(res_3)), FUN = identical, as.matrix(res_1))
# # 32スレッドで実行
# fas <- sample(fa, 10^7, replace = T) # 10M リード
# system.time(res2 <- kmer_strm(fas, k = 25, n = 312500))
# # ユーザ システム 経過
# # 2860.128 190.620 161.032
# system.time(res1 <- kmer_strm(fas, k = 25))
# # ユーザ システム 経過
# # 526.348 14.028 540.365
## user system elapsed
## 51.185 3.750 58.147
## user system elapsed
## 1.267 0.189 48.550
## user system elapsed
## 91.378 9.973 63.667
リードボジション毎のkmer出現確率
# positionごとのにkmerカウントデータからkmer出現確率のggplot
gg_kmer <- function(x, h = 10, topn = 10){
# サンプル毎のkmer出現率の期待値のリスト
obs_exp <- cbind(kmer = x$kmer, sweep(x[-1], 1, apply(x[-1], 1, mean), "/"))
obs_exp <- obs_exp %>%
mutate(kmer = paste(rownames(obs_exp), kmer, sep = ":")) # 行番号で比較するため
# 出現確率top10のkmer
obs_exp_top <- obs_exp[order(apply(x[-1], 1, sum), decreasing = T)[1:topn],]
gg_top <- obs_exp_top %>%
mutate(kmer = factor(kmer, levels = kmer)) %>%
tidyr::gather(key = start, value = count, -kmer, factor_key = T) %>%
ggplot2::ggplot(ggplot2::aes(x = start, y = count, colour = kmer, group = kmer)) +
ggplot2::theme_bw() +
ggplot2::theme(axis.text.x = ggplot2::element_text(size = 8, angle = 90, hjust = 1)) +
ggplot2::labs(title = "top 10 of total positions", y = "Obs/Exp") +
ggplot2::geom_line()
# 先頭x塩基で出現確率Obs/Expがtopのkmer
gg_head <-
obs_exp[apply(obs_exp[,2:(1 + h)], 2, which.max),] %>%
tidyr::gather(key = start, value = count, -kmer, factor_key = T) %>%
ggplot2::ggplot(ggplot2::aes(x = start, y = count, colour = kmer, group = kmer)) +
ggplot2::theme_bw() +
ggplot2::theme(axis.text.x = ggplot2::element_text(size = 8, angle = 90, hjust = 1)) +
ggplot2::labs(title = "top 10 at head 10 nuc", y = "Obs/Exp") +
ggplot2::geom_line()
return(list(gg_top, gg_head))
}
# k-mer content
kmer_pos <- kmer_cnt(fa, k = 7, "pos")
# create ggplot object
gg_km <- gg_kmer(x = kmer_pos, h = 10, topn = 10)
do.call(gridExtra::grid.arrange, c(gg_km, list(ncol = 2)))

環境
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] ja_JP.UTF-8/ja_JP.UTF-8/ja_JP.UTF-8/C/ja_JP.UTF-8/ja_JP.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] microbenchmark_1.4-6 gridExtra_2.3
## [3] ggplot2_3.1.0 doParallel_1.0.14
## [5] iterators_1.0.10 foreach_1.4.4
## [7] tibble_2.0.1 tidyr_0.8.2
## [9] dplyr_0.8.0.1 stringr_1.3.1
## [11] ShortRead_1.38.0 GenomicAlignments_1.16.0
## [13] SummarizedExperiment_1.10.1 DelayedArray_0.6.6
## [15] matrixStats_0.54.0 Biobase_2.40.0
## [17] Rsamtools_1.32.3 GenomicRanges_1.32.7
## [19] GenomeInfoDb_1.16.0 Biostrings_2.48.0
## [21] XVector_0.20.0 IRanges_2.14.12
## [23] S4Vectors_0.18.3 BiocParallel_1.14.2
## [25] BiocGenerics_0.26.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.0 lattice_0.20-38 assertthat_0.2.0
## [4] digest_0.6.18 R6_2.3.0 plyr_1.8.4
## [7] evaluate_0.12 highr_0.7 httr_1.4.0
## [10] pillar_1.3.1 zlibbioc_1.26.0 rlang_0.3.1
## [13] lazyeval_0.2.1 rstudioapi_0.9.0 Matrix_1.2-15
## [16] rmarkdown_1.11 labeling_0.3 webshot_0.5.1
## [19] readr_1.3.1 RCurl_1.95-4.11 munsell_0.5.0
## [22] compiler_3.5.1 xfun_0.4 pkgconfig_2.0.2
## [25] htmltools_0.3.6 tidyselect_0.2.5 GenomeInfoDbData_1.1.0
## [28] codetools_0.2-16 viridisLite_0.3.0 crayon_1.3.4
## [31] withr_2.1.2 bitops_1.0-6 grid_3.5.1
## [34] gtable_0.2.0 pacman_0.5.0 magrittr_1.5
## [37] scales_1.0.0 stringi_1.2.4 hwriter_1.3.2
## [40] latticeExtra_0.6-28 xml2_1.2.0 kableExtra_1.0.0
## [43] RColorBrewer_1.1-2 tools_3.5.1 glue_1.3.0
## [46] purrr_0.3.0 hms_0.4.2 yaml_2.2.0
## [49] colorspace_1.4-0 rvest_0.3.2 knitr_1.21