• Rで外部コマンドとしてblastを呼び出して実行する。

1 PATH環境変数の設定

  • ホームディレクトリに.Renvironを作成しPATH=~/program/ncbi-blast-2.2.31+/binのように記述
# # コンソールから設定する場合は
# Sys.setenv(PATH = "~/program/ncbi-blast-2.2.31+/bin")
# Sys.getenv("PATH")

2 blastを外部コマンドとして実行する関数

MYSYSBLAST <- function(in_f, out_f, program, db,  ...){
  # usege: MYSYBLAST(in_f="./hoge.fst",   # 入力ファイルパス
  #                 out_f="./out.tab",     # 出力ファイルパス,  - : stdout
  #                 program="blastn",    # プログラム
  #                 db="./db/nuc.blastdb", # データベース
  #                 "-num_threads 4"       # 追加のオプション
  #                 ) 
  suppressMessages(suppressWarnings(library(data.table)))
  
  # コマンド
  com  <- paste(program, "-query", in_f, "-db", db, "-out", out_f,  
               "-outfmt", "\"6 std qlen slen salltitles\"" , ...,  sep = " ")
  cat(com)
  # 実行
  bout <- system(command = com, intern = T, wait = T)
  
  # 標準出力に読み込む場合
  if(out_f=="-"){
    bout <- lapply(bout, function(x){unlist(strsplit(x, "\t"))})
    bout <- data.frame(Reduce(rbind, bout), row.names=NULL, stringsAsFactors = F)

  # ファイルから読み込む場合
  }else{
    bout <- fread(out_f, sep = "\t", data.table = F, stringsAsFactors = F)
  }
  
  names(bout) <- c("query", "hit", "identity", "length", "mismatch", "gapopen", 
                     "q_start", "q_end", "h_start", "h_end", "evalue", "bitscore",
                     "q_len", "h_len", "description")

  bout[c(3:14)] <- lapply(bout[c(3:14)], as.numeric)
  return(bout)
}

2.1 blastn

# オプション
p1 <- "~/pub/dat/sampledata/multiple_alignment/AtMlos.fna"
bndb = "~/db/cdna/TAIR10_cdna"
out <- "-"

# blast実行
bnout <-MYSYSBLAST(in_f = p1, out_f = out, program = "blastn", db = bndb, "-num_threads 4")
## blastn -query ~/pub/dat/sampledata/multiple_alignment/AtMlos.fna -db ~/db/cdna/TAIR10_cdna -out - -outfmt "6 std qlen slen salltitles" -num_threads 4
# 3行表示
knitr::kable(bnout[1:3,], format = "pandoc", caption="blastn")
blastn
query hit identity length mismatch gapopen q_start q_end h_start h_end evalue bitscore q_len h_len description
NM_116494.5 AT4G02600.1 100 1976 0 0 112 2087 1 1976 0 3650 2240 1976 | Symbols: MLO1, ATMLO1 | Seven transmembrane MLO family protein | chr4:1143999-1147409 FORWARD LENGTH=1976
NM_116494.5 AT4G02600.2 100 1852 0 0 249 2100 91 1942 0 3421 2240 1942 | Symbols: MLO1, ATMLO1 | Seven transmembrane MLO family protein | chr4:1143962-1147422 FORWARD LENGTH=1942
NM_116494.5 AT4G02600.2 100 91 0 0 75 165 1 91 0 169 2240 1942 | Symbols: MLO1, ATMLO1 | Seven transmembrane MLO family protein | chr4:1143962-1147422 FORWARD LENGTH=1942

2.2 blastp

# オプション(ファイルから読み込む場合)
p2 <- "~/pub/dat/sampledata/multiple_alignment/AtMLOs.faa"
bpdb = "~/db/protein/TAIR10_pep"
out <- "bpout.tab"

# blastp実行
bpout <- MYSYSBLAST(in_f = p2, out_f = out, program = "blastp", db = bpdb, "-num_threads 4")
## blastp -query ~/pub/dat/sampledata/multiple_alignment/AtMLOs.faa -db ~/db/protein/TAIR10_pep -out bpout.tab -outfmt "6 std qlen slen salltitles" -num_threads 4
# 3行表示
knitr::kable(bpout[1:3,], format = "pandoc", caption="blastn")
blastn
query hit identity length mismatch gapopen q_start q_end h_start h_end evalue bitscore q_len h_len description
AAM45040.1 AT4G02600.2 100.00 526 0 0 1 526 1 526 0 1085 526 526 | Symbols: MLO1, ATMLO1 | Seven transmembrane MLO family protein | chr4:1144141-1147156 FORWARD LENGTH=526
AAM45040.1 AT4G02600.1 100.00 526 0 0 1 526 1 526 0 1085 526 526 | Symbols: MLO1, ATMLO1 | Seven transmembrane MLO family protein | chr4:1144141-1147156 FORWARD LENGTH=526
AAM45040.1 AT2G44110.2 59.34 482 182 6 4 485 2 469 0 584 526 497 | Symbols: MLO15, ATMLO15 | Seven transmembrane MLO family protein | chr2:18245489-18248316 REVERSE LENGTH=497

2.3 tblastn

## tblastn(aa -> nuc)
p3 <- "~/pub/dat/sampledata/multiple_alignment/AtMLOs.faa"
tbndb = "~/db/cdna/TAIR10_cdna"
bnp  <- "tblastn"

tbnout <-MYSYSBLAST(in_f=p3, out_f="-", program=bnp, db=tbndb, "-num_threads 4")
## tblastn -query ~/pub/dat/sampledata/multiple_alignment/AtMLOs.faa -db ~/db/cdna/TAIR10_cdna -out - -outfmt "6 std qlen slen salltitles" -num_threads 4
# 3行表示
knitr::kable(tbnout[1:3,], format = "pandoc", caption="tblastn")
tblastn
query hit identity length mismatch gapopen q_start q_end h_start h_end evalue bitscore q_len h_len description
AAM45040.1 AT4G02600.1 100.00 526 0 0 1 526 143 1720 0 1016 526 1976 | Symbols: MLO1, ATMLO1 | Seven transmembrane MLO family protein | chr4:1143999-1147409 FORWARD LENGTH=1976
AAM45040.1 AT4G02600.2 100.00 526 0 0 1 526 96 1673 0 1015 526 1942 | Symbols: MLO1, ATMLO1 | Seven transmembrane MLO family protein | chr4:1143962-1147422 FORWARD LENGTH=1942
AAM45040.1 AT2G44110.2 60.55 469 171 6 4 472 17 1381 0 581 526 1639 | Symbols: MLO15, ATMLO15 | Seven transmembrane MLO family protein | chr2:18245357-18248329 REVERSE LENGTH=1639

3 クエリごとの結果を描画する

  • 逆向きのhitは赤で表示する
pacman::p_load(RColorBrewer, gridExtra, dplyr, ggplot2, zoo)
# レジェンドラベル
vlab <- 
  c(paste0(zoo::rollapply(c(0,seq(70,100,5)), width=2, by=1, 
                   function(x){paste(x, collapse = ":")}), "(-)"),
    zoo::rollapply(c(0,seq(70,100,5)), width=2, by=1, 
                   function(x){paste(x, collapse = ":")}))

# identityに対応したカラーコード
vcol <- c(brewer.pal(7, "Reds"), brewer.pal(7, "Blues"))

# queryごとにplot
blast_out <- bnout # blast_out <- bpout
gg_blast <-
  lapply(unique(blast_out$query),
         function(x){
           # queryごとのデータフレーム
           qdat <-blast_out %>%
             # identityを離散値に変換
             mutate(identity=as.integer(cut(identity, breaks=c(0,seq(70,100,5)), 1:7))) %>%
             # hitした向きで値を変える
             mutate(identity=ifelse((h_start-h_end) > 0, identity, identity+7)) %>%
             # queryで行選択
             filter(query==x) %>%
             # identity, hitをfactorにして水準を逆に
             mutate(identity=factor(identity, levels=rev(levels(factor(identity)))),
                    hit=factor(hit, levels=rev(unique(hit))))
           
           # queryごとのカラーコードとレジェンドラベル
           col_qdat <- vcol[as.integer(levels(factor(qdat$identity)))]
           lab_qdat <- vlab[as.integer(levels(factor(qdat$identity)))]
           
           # 
           ggplot(qdat, aes(x=q_start,xend=q_end, y=hit, yend=hit, colour=identity)) +
            theme_bw() + labs(title=x) +
            geom_segment(size=3) + 
            #theme(legend.position="none") +
            scale_color_manual(values = col_qdat, name="identity", labels = lab_qdat)
           }
         )

# 6つだけ
do.call(grid.arrange, c(gg_blast[1:6], list(ncol=2)))

4 tophit

4.1 関数

# blast結果(データフレーム)とtop hit 数を指定与えて、ggplot objectを返す関数 ----
ggblast <- function(blast_out, num_hit){
  # usege: ggblast(blastout=bnout,       # blast結果data.frame
  #                num_query=c(1,3,5),    # uniqueなqueryの番号、もしくはid
  #                num_hit=10             # top10
  #                 ) 
  pacman::p_load(RColorBrewer, zoo, dplyr, ggplot2)
  
  
  # レジェンドラベル 
  vlab <- c(paste0(zoo::rollapply(c(0,seq(70,100,5)), width=2, by=1, 
                                  function(x){paste(x, collapse = ":")}), "(-)"),
            zoo::rollapply(c(0,seq(70,100,5)), width=2, by=1, 
                           function(x){paste(x, collapse = ":")}))
  # identityに対応したカラーコード
  vcol <- c(brewer.pal(7, "Reds"), brewer.pal(7, "Blues"))
  if(missing(num_hit)){ num_hit <- sum(blast_out$query %in% x) }
  # queryごとにplot -----
  gg_blast <-
    lapply(unique(blast_out$query),
        function(x){
          # queryごとのデータフレーム
          qdat <-blast_out %>%
            # identityを離散値に変換
            mutate(identity=as.integer(cut(identity, breaks=c(0,seq(70,100,5)), 1:7))) %>%
            # hitした向きで値を変える
            mutate(identity=ifelse((h_start-h_end) > 0, identity, identity+7)) %>%
            # queryで行選択
            filter(query==x) %>%
            # top hitの数を指定
            filter(row_number() %in% 
                     1:ifelse(is.null(num_hit), sum(blast_out$query %in% x), num_hit)) %>%
            # identity, hitをfactorにして水準を逆に
            mutate(identity=factor(identity, levels=rev(levels(factor(identity)))),
                   hit=factor(hit, levels=rev(unique(hit))))
             
            # queryごとのカラーコードとレジェンドラベル
            col_qdat <- vcol[as.integer(levels(factor(qdat$identity)))]
            lab_qdat <- vlab[as.integer(levels(factor(qdat$identity)))]
             
            # 
             ggplot(qdat, aes(x=q_start,xend=q_end, y=hit, yend=hit, colour=identity)) +
              theme_bw() + labs(title=x) +
              geom_segment(size=3) + 
              #theme(legend.position="none") +
              scale_color_manual(values = col_qdat, name="identity", labels = lab_qdat)
             }
    )
  return(gg_blast)
}

4.2 blastp top10

gg_bpout <- ggblast(blast_out = bpout, num_hit = 10)
do.call(grid.arrange, c(gg_bpout[1:4], list(ncol=2)))

4.3 blastp hit数指定なし

gg_bpout <- ggblast(blast_out = bpout, num_hit = NULL)
do.call(grid.arrange, c(gg_bpout[1:4], list(ncol=2)))

4.4 tblastn hit数指定なし

gg_bpout <- ggblast(blast_out = tbnout, num_hit = NULL)
do.call(grid.arrange, c(gg_bpout[1:6], list(ncol=2)))

5 環境

sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.5
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] ja_JP.UTF-8/ja_JP.UTF-8/ja_JP.UTF-8/C/ja_JP.UTF-8/ja_JP.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] bindrcpp_0.2       zoo_1.8-0          ggplot2_2.2.1     
## [4] dplyr_0.7.2        gridExtra_2.2.1    RColorBrewer_1.1-2
## [7] data.table_1.10.4 
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.12     knitr_1.16       bindr_0.1        magrittr_1.5    
##  [5] munsell_0.4.3    lattice_0.20-35  colorspace_1.3-2 R6_2.2.2        
##  [9] rlang_0.1.1      plyr_1.8.4       stringr_1.2.0    highr_0.6       
## [13] tools_3.4.1      grid_3.4.1       gtable_0.2.0     pacman_0.4.6    
## [17] htmltools_0.3.6  lazyeval_0.2.0   yaml_2.1.14      rprojroot_1.2   
## [21] digest_0.6.12    assertthat_0.2.0 tibble_1.3.3     glue_1.1.1      
## [25] evaluate_0.10.1  rmarkdown_1.6    labeling_0.3     stringi_1.1.5   
## [29] compiler_3.4.1   scales_0.4.1     backports_1.1.0  pkgconfig_2.0.1