.Renvironを作成しPATH=~/program/ncbi-blast-2.2.31+/binのように記述# # コンソールから設定する場合は
# Sys.setenv(PATH = "~/program/ncbi-blast-2.2.31+/bin")
# Sys.getenv("PATH")MYSYSBLAST <- function(in_f, out_f, program, db, ...){
# usege: MYSYBLAST(in_f="./hoge.fst", # 入力ファイルパス
# out_f="./out.tab", # 出力ファイルパス, - : stdout
# program="blastn", # プログラム
# db="./db/nuc.blastdb", # データベース
# "-num_threads 4" # 追加のオプション
# )
suppressMessages(suppressWarnings(library(data.table)))
# コマンド
com <- paste(program, "-query", in_f, "-db", db, "-out", out_f,
"-outfmt", "\"6 std qlen slen salltitles\"" , ..., sep = " ")
cat(com)
# 実行
bout <- system(command = com, intern = T, wait = T)
# 標準出力に読み込む場合
if(out_f=="-"){
bout <- lapply(bout, function(x){unlist(strsplit(x, "\t"))})
bout <- data.frame(Reduce(rbind, bout), row.names=NULL, stringsAsFactors = F)
# ファイルから読み込む場合
}else{
bout <- fread(out_f, sep = "\t", data.table = F, stringsAsFactors = F)
}
names(bout) <- c("query", "hit", "identity", "length", "mismatch", "gapopen",
"q_start", "q_end", "h_start", "h_end", "evalue", "bitscore",
"q_len", "h_len", "description")
bout[c(3:14)] <- lapply(bout[c(3:14)], as.numeric)
return(bout)
}# オプション
p1 <- "~/pub/dat/sampledata/multiple_alignment/AtMlos.fna"
bndb = "~/db/cdna/TAIR10_cdna"
out <- "-"
# blast実行
bnout <-MYSYSBLAST(in_f = p1, out_f = out, program = "blastn", db = bndb, "-num_threads 4")## blastn -query ~/pub/dat/sampledata/multiple_alignment/AtMlos.fna -db ~/db/cdna/TAIR10_cdna -out - -outfmt "6 std qlen slen salltitles" -num_threads 4
# 3行表示
knitr::kable(bnout[1:3,], format = "pandoc", caption="blastn")| query | hit | identity | length | mismatch | gapopen | q_start | q_end | h_start | h_end | evalue | bitscore | q_len | h_len | description |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| NM_116494.5 | AT4G02600.1 | 100 | 1976 | 0 | 0 | 112 | 2087 | 1 | 1976 | 0 | 3650 | 2240 | 1976 | | Symbols: MLO1, ATMLO1 | Seven transmembrane MLO family protein | chr4:1143999-1147409 FORWARD LENGTH=1976 |
| NM_116494.5 | AT4G02600.2 | 100 | 1852 | 0 | 0 | 249 | 2100 | 91 | 1942 | 0 | 3421 | 2240 | 1942 | | Symbols: MLO1, ATMLO1 | Seven transmembrane MLO family protein | chr4:1143962-1147422 FORWARD LENGTH=1942 |
| NM_116494.5 | AT4G02600.2 | 100 | 91 | 0 | 0 | 75 | 165 | 1 | 91 | 0 | 169 | 2240 | 1942 | | Symbols: MLO1, ATMLO1 | Seven transmembrane MLO family protein | chr4:1143962-1147422 FORWARD LENGTH=1942 |
# オプション(ファイルから読み込む場合)
p2 <- "~/pub/dat/sampledata/multiple_alignment/AtMLOs.faa"
bpdb = "~/db/protein/TAIR10_pep"
out <- "bpout.tab"
# blastp実行
bpout <- MYSYSBLAST(in_f = p2, out_f = out, program = "blastp", db = bpdb, "-num_threads 4")## blastp -query ~/pub/dat/sampledata/multiple_alignment/AtMLOs.faa -db ~/db/protein/TAIR10_pep -out bpout.tab -outfmt "6 std qlen slen salltitles" -num_threads 4
# 3行表示
knitr::kable(bpout[1:3,], format = "pandoc", caption="blastn")| query | hit | identity | length | mismatch | gapopen | q_start | q_end | h_start | h_end | evalue | bitscore | q_len | h_len | description |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AAM45040.1 | AT4G02600.2 | 100.00 | 526 | 0 | 0 | 1 | 526 | 1 | 526 | 0 | 1085 | 526 | 526 | | Symbols: MLO1, ATMLO1 | Seven transmembrane MLO family protein | chr4:1144141-1147156 FORWARD LENGTH=526 |
| AAM45040.1 | AT4G02600.1 | 100.00 | 526 | 0 | 0 | 1 | 526 | 1 | 526 | 0 | 1085 | 526 | 526 | | Symbols: MLO1, ATMLO1 | Seven transmembrane MLO family protein | chr4:1144141-1147156 FORWARD LENGTH=526 |
| AAM45040.1 | AT2G44110.2 | 59.34 | 482 | 182 | 6 | 4 | 485 | 2 | 469 | 0 | 584 | 526 | 497 | | Symbols: MLO15, ATMLO15 | Seven transmembrane MLO family protein | chr2:18245489-18248316 REVERSE LENGTH=497 |
## tblastn(aa -> nuc)
p3 <- "~/pub/dat/sampledata/multiple_alignment/AtMLOs.faa"
tbndb = "~/db/cdna/TAIR10_cdna"
bnp <- "tblastn"
tbnout <-MYSYSBLAST(in_f=p3, out_f="-", program=bnp, db=tbndb, "-num_threads 4")## tblastn -query ~/pub/dat/sampledata/multiple_alignment/AtMLOs.faa -db ~/db/cdna/TAIR10_cdna -out - -outfmt "6 std qlen slen salltitles" -num_threads 4
# 3行表示
knitr::kable(tbnout[1:3,], format = "pandoc", caption="tblastn")| query | hit | identity | length | mismatch | gapopen | q_start | q_end | h_start | h_end | evalue | bitscore | q_len | h_len | description |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AAM45040.1 | AT4G02600.1 | 100.00 | 526 | 0 | 0 | 1 | 526 | 143 | 1720 | 0 | 1016 | 526 | 1976 | | Symbols: MLO1, ATMLO1 | Seven transmembrane MLO family protein | chr4:1143999-1147409 FORWARD LENGTH=1976 |
| AAM45040.1 | AT4G02600.2 | 100.00 | 526 | 0 | 0 | 1 | 526 | 96 | 1673 | 0 | 1015 | 526 | 1942 | | Symbols: MLO1, ATMLO1 | Seven transmembrane MLO family protein | chr4:1143962-1147422 FORWARD LENGTH=1942 |
| AAM45040.1 | AT2G44110.2 | 60.55 | 469 | 171 | 6 | 4 | 472 | 17 | 1381 | 0 | 581 | 526 | 1639 | | Symbols: MLO15, ATMLO15 | Seven transmembrane MLO family protein | chr2:18245357-18248329 REVERSE LENGTH=1639 |
pacman::p_load(RColorBrewer, gridExtra, dplyr, ggplot2, zoo)
# レジェンドラベル
vlab <-
c(paste0(zoo::rollapply(c(0,seq(70,100,5)), width=2, by=1,
function(x){paste(x, collapse = ":")}), "(-)"),
zoo::rollapply(c(0,seq(70,100,5)), width=2, by=1,
function(x){paste(x, collapse = ":")}))
# identityに対応したカラーコード
vcol <- c(brewer.pal(7, "Reds"), brewer.pal(7, "Blues"))
# queryごとにplot
blast_out <- bnout # blast_out <- bpout
gg_blast <-
lapply(unique(blast_out$query),
function(x){
# queryごとのデータフレーム
qdat <-blast_out %>%
# identityを離散値に変換
mutate(identity=as.integer(cut(identity, breaks=c(0,seq(70,100,5)), 1:7))) %>%
# hitした向きで値を変える
mutate(identity=ifelse((h_start-h_end) > 0, identity, identity+7)) %>%
# queryで行選択
filter(query==x) %>%
# identity, hitをfactorにして水準を逆に
mutate(identity=factor(identity, levels=rev(levels(factor(identity)))),
hit=factor(hit, levels=rev(unique(hit))))
# queryごとのカラーコードとレジェンドラベル
col_qdat <- vcol[as.integer(levels(factor(qdat$identity)))]
lab_qdat <- vlab[as.integer(levels(factor(qdat$identity)))]
#
ggplot(qdat, aes(x=q_start,xend=q_end, y=hit, yend=hit, colour=identity)) +
theme_bw() + labs(title=x) +
geom_segment(size=3) +
#theme(legend.position="none") +
scale_color_manual(values = col_qdat, name="identity", labels = lab_qdat)
}
)
# 6つだけ
do.call(grid.arrange, c(gg_blast[1:6], list(ncol=2)))# blast結果(データフレーム)とtop hit 数を指定与えて、ggplot objectを返す関数 ----
ggblast <- function(blast_out, num_hit){
# usege: ggblast(blastout=bnout, # blast結果data.frame
# num_query=c(1,3,5), # uniqueなqueryの番号、もしくはid
# num_hit=10 # top10
# )
pacman::p_load(RColorBrewer, zoo, dplyr, ggplot2)
# レジェンドラベル
vlab <- c(paste0(zoo::rollapply(c(0,seq(70,100,5)), width=2, by=1,
function(x){paste(x, collapse = ":")}), "(-)"),
zoo::rollapply(c(0,seq(70,100,5)), width=2, by=1,
function(x){paste(x, collapse = ":")}))
# identityに対応したカラーコード
vcol <- c(brewer.pal(7, "Reds"), brewer.pal(7, "Blues"))
if(missing(num_hit)){ num_hit <- sum(blast_out$query %in% x) }
# queryごとにplot -----
gg_blast <-
lapply(unique(blast_out$query),
function(x){
# queryごとのデータフレーム
qdat <-blast_out %>%
# identityを離散値に変換
mutate(identity=as.integer(cut(identity, breaks=c(0,seq(70,100,5)), 1:7))) %>%
# hitした向きで値を変える
mutate(identity=ifelse((h_start-h_end) > 0, identity, identity+7)) %>%
# queryで行選択
filter(query==x) %>%
# top hitの数を指定
filter(row_number() %in%
1:ifelse(is.null(num_hit), sum(blast_out$query %in% x), num_hit)) %>%
# identity, hitをfactorにして水準を逆に
mutate(identity=factor(identity, levels=rev(levels(factor(identity)))),
hit=factor(hit, levels=rev(unique(hit))))
# queryごとのカラーコードとレジェンドラベル
col_qdat <- vcol[as.integer(levels(factor(qdat$identity)))]
lab_qdat <- vlab[as.integer(levels(factor(qdat$identity)))]
#
ggplot(qdat, aes(x=q_start,xend=q_end, y=hit, yend=hit, colour=identity)) +
theme_bw() + labs(title=x) +
geom_segment(size=3) +
#theme(legend.position="none") +
scale_color_manual(values = col_qdat, name="identity", labels = lab_qdat)
}
)
return(gg_blast)
}gg_bpout <- ggblast(blast_out = bpout, num_hit = 10)
do.call(grid.arrange, c(gg_bpout[1:4], list(ncol=2)))gg_bpout <- ggblast(blast_out = bpout, num_hit = NULL)
do.call(grid.arrange, c(gg_bpout[1:4], list(ncol=2)))gg_bpout <- ggblast(blast_out = tbnout, num_hit = NULL)
do.call(grid.arrange, c(gg_bpout[1:6], list(ncol=2)))sessionInfo()## R version 3.4.1 (2017-06-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] ja_JP.UTF-8/ja_JP.UTF-8/ja_JP.UTF-8/C/ja_JP.UTF-8/ja_JP.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2 zoo_1.8-0 ggplot2_2.2.1
## [4] dplyr_0.7.2 gridExtra_2.2.1 RColorBrewer_1.1-2
## [7] data.table_1.10.4
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.12 knitr_1.16 bindr_0.1 magrittr_1.5
## [5] munsell_0.4.3 lattice_0.20-35 colorspace_1.3-2 R6_2.2.2
## [9] rlang_0.1.1 plyr_1.8.4 stringr_1.2.0 highr_0.6
## [13] tools_3.4.1 grid_3.4.1 gtable_0.2.0 pacman_0.4.6
## [17] htmltools_0.3.6 lazyeval_0.2.0 yaml_2.1.14 rprojroot_1.2
## [21] digest_0.6.12 assertthat_0.2.0 tibble_1.3.3 glue_1.1.1
## [25] evaluate_0.10.1 rmarkdown_1.6 labeling_0.3 stringi_1.1.5
## [29] compiler_3.4.1 scales_0.4.1 backports_1.1.0 pkgconfig_2.0.1