Libraries and file paths
# Load libraries ----
pacman::p_load(ShortRead, Biostrings, dplyr, tidyr, stringr, ggplot2, gridExtra, data.table)
options(stringsAsFactors = F)
# Get the file names ----
p <- system.file("extdata/E-MTAB-1147", package = "ShortRead")
f <- list.files(p, "fastq.gz", full.names = T)
Reading fastq files
readFastq reads every read in the file
FastqSampler draws a random subset of reads (1M reads by default)
FastqStreamer processes all reads in successive chunks of 1M reads (chunk size configurable)
# Reading fastq files ----
## a. readFastq (20,000 reads)
system.time(fq1 <- readFastq(f[1]))
## user system elapsed
## 0.087 0.004 0.093
## b. FastqSampler (a random sample; 1M reads by default)
system.time(
{
sampler <- FastqSampler(f[1]) # n sets the number of reads to sample
fqs <- yield(sampler)
close(sampler)
}
)
## user system elapsed
## 0.360 0.013 0.375
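# FastqSampler draws reads at random; it uses R's random number generator
# (an assumption worth verifying for your ShortRead version), so setting a
# seed before yield() should make the subsample reproducible:
set.seed(123)
spl <- FastqSampler(f[1], n = 1000)
fq_rep <- yield(spl)
close(spl)
length(fq_rep) # 1000 reads requested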
## c. FastqStreamer: process reads in successive chunks (1M reads by default)
system.time(
{
strm <- FastqStreamer(f[1], n = 10000) # here, 10,000 reads per chunk
repeat {
fq <- yield(strm)
if (length(fq) == 0)
break
## process chunk
print(length(fq))
}
close(strm)
}
)
## [1] 10000
## [1] 10000
## user system elapsed
## 0.336 0.008 0.347
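# To actually process each chunk (a minimal sketch replacing the print() above),
# accumulate statistics across iterations, e.g. total reads and reads containing N:
strm2 <- FastqStreamer(f[1], n = 10000)
total <- withN <- 0
repeat {
fq <- yield(strm2)
if (length(fq) == 0)
break
total <- total + length(fq)
withN <- withN + (length(fq) - sum(nFilter()(fq))) # reads rejected by the N filter
}
close(strm2)
c(total = total, withN = withN)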
Inspecting the contents of a fastq object
sread extract the sequences
quality extract the quality information
id extract the descriptions (read ids)
width read lengths
tables duplicated reads
writeFastq write out a fastq file
# Inspect the object
fq1
## class: ShortReadQ
## length: 20000 reads; width: 72 cycles
# An object of class ShortReadQ
showClass("ShortReadQ")
## Class "ShortReadQ" [package "ShortRead"]
##
## Slots:
##
## Name: quality sread id
## Class: QualityScore DNAStringSet BStringSet
##
## Extends:
## Class "ShortRead", directly
## Class ".ShortReadBase", by class "ShortRead", distance 2
##
## Known Subclasses: "AlignedRead"
# Class attributes (slots)
names(attributes(fq1)) # "quality" "sread" "id" "class"
## [1] "quality" "sread" "id" "class"
# Extract the quality values
quality(fq1)
## class: FastqQuality
## quality:
## A BStringSet instance of length 20000
## width seq
## [1] 72 HHHHHHHHHHHHHHHHHHHHEBDBB?B:B...FBDBD@DDECEE3>:?;@@@>?=BAB?##
## [2] 72 IIIIHIIIGIIIIIIIHIIIIEGBGHIII...IIIHIIIIIGIIIEGIIGBGE@DDGGGIG
## [3] 72 GGHBHGBGGGHHHHDHHHHHHHHHFGHHH...HHHHGHFHHHHHHHHHHHHHH8AGDGGG>
## [4] 72 IIIIIIIIIIIIIIIIIHIIIGHIIIIII...IIIIIIIHIIIIHIIHIGDIIHIHIIIHF
## [5] 72 GGEGGE8EEEDDBD?EEEDEGGG3GE<EE...CB8BBB?E>E?CGEGG<G@F>GGB>D###
## ... ... ...
## [19996] 72 IIEIIIIIIGIIIIIIHEGDIIIIIHGII...IIIIIIEIIHGIIIDIFGHIGGIDHIIIG
## [19997] 72 HHHHHHHHHHHHHHHFBHHHHH@GHHHHH...HHHHHHHHHDHHHHHHHHHHFGHHHHHHD
## [19998] 72 IIIIHIIIIIIIIIIIHIIIHIHIIIFIG...@.@EC>@AACAC>GDDD7668<B;=??<?
## [19999] 72 IIIIIIIIIIIIIIIIHIIIFIGIGHHII...GBA?C@C?C>C;AACCA+78;;BBB@BB<
## [20000] 72 FFFECBEGGGGGGBGGBGGGGEDG@ECCA...#############################
toString(quality(fq1)[[1]])
## [1] "HHHHHHHHHHHHHHHHHHHHEBDBB?B:BBGG<DDAA?AABFEFBDBD@DDECEE3>:?;@@@>?=BAB?##"
as(PhredQuality(toString(quality(fq1)[[1]])), "IntegerList")
## IntegerList of length 1
## [[1]] 39 39 39 39 39 39 39 39 39 39 39 ... 31 31 29 30 28 33 32 33 30 2 2
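# The full character-to-score mapping for this encoding can be listed with
# encoding() (a quick aside; encoding() should work on FastqQuality objects):
head(encoding(quality(fq1)), 10)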
# Extract the sequences
sread(fq1) # returns a DNAStringSet object
## A DNAStringSet instance of length 20000
## width seq
## [1] 72 GTCTGCTGTATCTGTGTCGGCTGTCTCGC...ATGAAGGCCTGGAATGTCACTACCCCCAG
## [2] 72 CTAGGGCAATCTTTGCAGCAATGAATGCC...GGCTTTTGAGGCCAGAGCAGACCTTCGGG
## [3] 72 TGGGCTGTTCCTTCTCACTGTGGCCTGAC...ATTAAGAAAGAGTCACGTTTCCCAAGTCT
## [4] 72 CTCATCCACACCTTTGGTCTTGATGGCTG...GCATCCCGCTCAGCATCAAAGTTAGTATA
## [5] 72 GTTTGGATATATGGAGGATGGGGATTATT...GGATAGTAATAGGGCAAGGACGCCTCCTA
## ... ... ...
## [19996] 72 GCGGGAGCGGCCAAAATGAAGTTTAATCC...ACCGAAGCAAGAATCGCAAAAGGCATTTC
## [19997] 72 TTGTAATCTACTCTTGAACAAAGAATATT...GTTTGTTGGGCAGCTAATAGTGTGAACCA
## [19998] 72 TGTTGATGGTGCTGGTTACTGGGCCGTGG...GTCCCTGCAGTTACACACAGCCCTGCCTC
## [19999] 72 CGGAGGTGCAGCCCCCGCCCAAGCGGCGG...CACACACAACCTGCCCGAGTTCATTGTGA
## [20000] 72 GCAAGGGCGTCATGCTGGCCGTCAGCCAG...CAACGGGCTCAACATCCCCAACGAGGACT
toString(sread(fq1)[[1]])
## [1] "GTCTGCTGTATCTGTGTCGGCTGTCTCGCGGGACATGAAGTCAATGAAGGCCTGGAATGTCACTACCCCCAG"
# Extract the ids
ShortRead::id(fq1)
## A BStringSet instance of length 20000
## width seq
## [1] 53 ERR127302.8493430 HWI-EAS350_0441:1:34:16191:2123#0/1
## [2] 53 ERR127302.21406531 HWI-EAS350_0441:1:88:9330:2587#0/1
## [3] 55 ERR127302.22173106 HWI-EAS350_0441:1:91:10434:14757#0/1
## [4] 54 ERR127302.10402268 HWI-EAS350_0441:1:42:13757:9559#0/1
## [5] 54 ERR127302.19486260 HWI-EAS350_0441:1:80:13838:5544#0/1
## ... ... ...
## [19996] 56 ERR127302.24686961 HWI-EAS350_0441:1:101:10393:16964#0/1
## [19997] 54 ERR127302.8014168 HWI-EAS350_0441:1:32:16962:18478#0/1
## [19998] 54 ERR127302.14811006 HWI-EAS350_0441:1:61:5820:18247#0/1
## [19999] 51 ERR127302.1875050 HWI-EAS350_0441:1:8:6759:5325#0/1
## [20000] 54 ERR127302.14540580 HWI-EAS350_0441:1:60:3157:19203#0/1
# Number of reads
length(fq1)
## [1] 20000
# quality score ----
head(alphabetScore(fq1))
## [1] 2397 2804 2739 2857 2332 2623
alphabetScore(fq1[1])/width(fq1[1])
## [1] 33.29167
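# Per-cycle mean quality, as a complementary view (a sketch assuming all reads
# have the same width; FastqQuality can be coerced to an integer matrix of
# reads x cycles):
qmat <- as(quality(fq1), "matrix")
round(head(colMeans(qmat)), 1)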
# Read length (all reads in this sample data are 72 bp) ----
table(width(fq1))
##
## 72
## 20000
# Duplicated reads ----
ShortRead::tables(fq1, n=10)$top # top 10
## CTATTCTCTACAAACCACAAAGACATTGGAACACTATACCTATTATTCGGCGCATGAGCTGGAGTCCTAGGC
## 7
## GTTTGGTCTAGGGTGTAGCCTGAGAATAGGGGAAATCAGTGAATGAAGCCTCCTATGATGGCAAATACAGCT
## 7
## CGATAACGTTGTAGATGTGGTCGTTACCTAGAAGGTTGCCTGGCTGGCCCAGCTCGGCTCGAATAAGGAGGC
## 6
## CTAGCATTTACCATCTCACTTCTAGGAATACTAGTATATCGCTCACACCTCATATCCTCCCTACTATGCCTA
## 6
## CACGAGCATATTTCACCTCCGCTACCATAATCATCGCTATCCCCACCGGCGTCAAAGTATTTAGCTGACTCG
## 5
## CCTTGGTATGTGCTTTCTCGTGTTACATCGCGCCATCATTGGTATATGGTTAGTGTGTTGGTTAGTAGGCCT
## 5
## CTGAGATGTTAGTATTAGTTAGTTTTGTTGTGAGTGTTAGGAAAAGGGCATACAGGACTAGGAAGCAGATAA
## 5
## CAACGATCCCTCCCTTACCATCAAATCAATTGGCCACCAATGGTACTGAACCTACGAGTACACCGACTACGG
## 4
## CAAGACGCTACTTCCCCTATCATAGAAGAGCTTATCACCTTTCATGATCACGCCCTCATAATCATTTTCCTT
## 4
## CAAGGAGTCGCAGGTCGCCTGGTTCTAGGAATAATGGGGGAAGTATGTAGGAGTTGAAGATTAGTCCGCCGT
## 4
ShortRead::tables(fq1)$distribution # distribution of duplicated reads
## nOccurrences nReads
## 1 1 19291
## 2 2 247
## 3 3 34
## 4 4 18
## 5 5 3
## 6 6 2
## 7 7 2
# Writing out a fastq file ----
# writeFastq(object, "fq.gz", compress=T)
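# A runnable variant of the commented call above (a sketch): write the first
# 100 reads of fq1 to a temporary gzipped fastq file.
tmp <- tempfile(fileext = ".fq.gz")
writeFastq(fq1[1:100], tmp, compress = TRUE)
file.exists(tmp)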
Quality assessment report
# # qa report -----
# report(qa.sum, dest = "./qa.report")
Quality assessment
qa(dirPath="~/fqdat")
- By default, a random sample of 1M reads is read from each file.
- To read every read instead, call
qa(path1, sample=F).
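Following the note above, a full-read QA run might look like this (a sketch; slower and more memory-hungry on large fastq files):
## quality assessment without subsampling
qa.full <- qa(dirPath = f, sample = FALSE)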
# quality assessment
(qa.sum <- qa(dirPath = f)) # a vector of file names also works
## class: FastqQA(10)
## QA elements (access with qa[["elt"]]):
## readCounts: data.frame(2 3)
## baseCalls: data.frame(2 5)
## readQualityScore: data.frame(1024 4)
## baseQuality: data.frame(188 3)
## alignQuality: data.frame(2 3)
## frequentSequences: data.frame(100 4)
## sequenceDistribution: data.frame(14 4)
## perCycle: list(2)
## baseCall: data.frame(695 4)
## quality: data.frame(5322 5)
## perTile: list(2)
## readCounts: data.frame(0 4)
## medianReadQualityScore: data.frame(0 4)
## adapterContamination: data.frame(2 1)
# read counts
qa.sum[["readCounts"]]
## read filter aligned
## ERR127302_1_subset.fastq.gz 20000 NA NA
## ERR127302_2_subset.fastq.gz 20000 NA NA
# base call
qa.sum[["baseCalls"]]
## A C G T N
## ERR127302_1_subset.fastq.gz 328410 393391 391338 326039 822
## ERR127302_2_subset.fastq.gz 323860 399757 394210 320875 1298
# readQualityScore
head(qa.sum[["readQualityScore"]])
## quality density lane type
## 1 0.5202013 2.466628e-06 ERR127302_1_subset.fastq.gz read
## 2 0.6003299 3.944517e-06 ERR127302_1_subset.fastq.gz read
## 3 0.6804585 6.140242e-06 ERR127302_1_subset.fastq.gz read
## 4 0.7605871 9.310446e-06 ERR127302_1_subset.fastq.gz read
## 5 0.8407157 1.376064e-05 ERR127302_1_subset.fastq.gz read
## 6 0.9208443 1.983660e-05 ERR127302_1_subset.fastq.gz read
# baseQuality ascii
head(qa.sum[["baseQuality"]])
## score count lane
## 1 0 ERR127302_1_subset.fastq.gz
## 2 ! 0 ERR127302_1_subset.fastq.gz
## 3 " 0 ERR127302_1_subset.fastq.gz
## 4 # 87226 ERR127302_1_subset.fastq.gz
## 5 $ 6 ERR127302_1_subset.fastq.gz
## 6 % 185 ERR127302_1_subset.fastq.gz
# Frequently occurring reads: sequence, count, type, lane
head(qa.sum[["frequentSequences"]]) # top entries
## sequence
## 1 CTATTCTCTACAAACCACAAAGACATTGGAACACTATACCTATTATTCGGCGCATGAGCTGGAGTCCTAGGC
## 2 GTTTGGTCTAGGGTGTAGCCTGAGAATAGGGGAAATCAGTGAATGAAGCCTCCTATGATGGCAAATACAGCT
## 3 CGATAACGTTGTAGATGTGGTCGTTACCTAGAAGGTTGCCTGGCTGGCCCAGCTCGGCTCGAATAAGGAGGC
## 4 CTAGCATTTACCATCTCACTTCTAGGAATACTAGTATATCGCTCACACCTCATATCCTCCCTACTATGCCTA
## 5 CACGAGCATATTTCACCTCCGCTACCATAATCATCGCTATCCCCACCGGCGTCAAAGTATTTAGCTGACTCG
## 6 CCTTGGTATGTGCTTTCTCGTGTTACATCGCGCCATCATTGGTATATGGTTAGTGTGTTGGTTAGTAGGCCT
## count type lane
## 1 7 read ERR127302_1_subset.fastq.gz
## 2 7 read ERR127302_1_subset.fastq.gz
## 3 6 read ERR127302_1_subset.fastq.gz
## 4 6 read ERR127302_1_subset.fastq.gz
## 5 5 read ERR127302_1_subset.fastq.gz
## 6 5 read ERR127302_1_subset.fastq.gz
head(qa.sum[["frequentSequences"]]$sequence) # 重複リード配列
## [1] "CTATTCTCTACAAACCACAAAGACATTGGAACACTATACCTATTATTCGGCGCATGAGCTGGAGTCCTAGGC"
## [2] "GTTTGGTCTAGGGTGTAGCCTGAGAATAGGGGAAATCAGTGAATGAAGCCTCCTATGATGGCAAATACAGCT"
## [3] "CGATAACGTTGTAGATGTGGTCGTTACCTAGAAGGTTGCCTGGCTGGCCCAGCTCGGCTCGAATAAGGAGGC"
## [4] "CTAGCATTTACCATCTCACTTCTAGGAATACTAGTATATCGCTCACACCTCATATCCTCCCTACTATGCCTA"
## [5] "CACGAGCATATTTCACCTCCGCTACCATAATCATCGCTATCCCCACCGGCGTCAAAGTATTTAGCTGACTCG"
## [6] "CCTTGGTATGTGCTTTCTCGTGTTACATCGCGCCATCATTGGTATATGGTTAGTGTGTTGGTTAGTAGGCCT"
head(qa.sum[["frequentSequences"]]$count) # count
## [1] 7 7 6 6 5 5
# Distribution of duplicated reads
head(qa.sum[["sequenceDistribution"]])
## nOccurrences nReads type lane
## 1 1 19291 read ERR127302_1_subset.fastq.gz
## 2 2 247 read ERR127302_1_subset.fastq.gz
## 3 3 34 read ERR127302_1_subset.fastq.gz
## 4 4 18 read ERR127302_1_subset.fastq.gz
## 5 5 3 read ERR127302_1_subset.fastq.gz
## 6 6 2 read ERR127302_1_subset.fastq.gz
# To check this from a fastq object, use ShortRead::tables(fq1)
# per cycle baseCall, quality
head(qa.sum[["perCycle"]][[1]]) # baseCall
## Cycle Base Count lane
## 1 1 A 2248 ERR127302_1_subset.fastq.gz
## 2 1 C 10283 ERR127302_1_subset.fastq.gz
## 3 1 G 3105 ERR127302_1_subset.fastq.gz
## 4 1 T 4344 ERR127302_1_subset.fastq.gz
## 15 1 N 20 ERR127302_1_subset.fastq.gz
## 19 2 A 4965 ERR127302_1_subset.fastq.gz
head(qa.sum[["perCycle"]][[2]]) # quality
## Cycle Quality Score Count lane
## 4 1 # 2 5 ERR127302_1_subset.fastq.gz
## 6 1 % 4 2 ERR127302_1_subset.fastq.gz
## 7 1 & 5 9 ERR127302_1_subset.fastq.gz
## 8 1 ' 6 6 ERR127302_1_subset.fastq.gz
## 13 1 , 11 3 ERR127302_1_subset.fastq.gz
## 14 1 - 12 6 ERR127302_1_subset.fastq.gz
# ---
# qa.sum[["perTile"]]
# qa.sum[["perTile"]][[1]] #readCounts
# qa.sum[["perTile"]][[2]] #medianReadQualityScore
# Checking adapter contamination (should this be checked on all reads rather than via qa? a direct check follows below) ----
qa.sum[["adapterContamination"]]
## contamination
## ERR127302_1_subset.fastq.gz NA
## ERR127302_2_subset.fastq.gz NA
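One way to check this directly on all reads is to count adapter matches in the sequences with Biostrings (a sketch; the adapter below is only a placeholder, not taken from this data set):
# count reads containing a (hypothetical) adapter prefix, allowing 1 mismatch
adapter <- DNAString("AGATCGGAAGAGC")
hits <- vcountPattern(adapter, sread(fq1), max.mismatch = 1) > 0
sum(hits) # reads with at least one match
round(100 * mean(hits), 2) # percentage of reads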
k-mer counting
Per-position base (k-mer) frequencies, k = 7
- k-mer counting
readDNAStringSet currently cannot read a fastq file that contains reads of different lengths
- read with readFastq first, then extract the sequences: `sread(readFastq(f[1]))`
- counting k-mers per position is not meaningful in that case
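For overall (position-independent) k-mer counts, Biostrings' oligonucleotideFrequency() is a convenient shortcut (a sketch; simplify.as = "collapsed" should sum the counts over all reads), as opposed to the per-position counting implemented by the function below:
k7_total <- oligonucleotideFrequency(sread(readFastq(f[1])), width = 7,
simplify.as = "collapsed")
head(sort(k7_total, decreasing = TRUE))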
# k-mer counting for a fastq or fasta file
kmer_count <- function(filepath, format, k){
## usage: kmer_count(filepath="~/fastq.gz", format="fastq", k=7)
# 1. Read the file -----
if(format=="fastq"){
fas <- sread(readFastq(filepath))
}else if(format=="fasta"){
fas <- readDNAStringSet(filepath, format = "fasta")
}
# 2. Data frame of all possible k-mers ----
nuc <-c("A","T","G","C")
d <- as.data.frame(lapply(expand.grid(rep(list(nuc), k)), as.character))
kn <-data.frame(kmer=sapply(1:nrow(d),
function(x)paste(as.character(d[x,]), collapse = "")))
# 3.1 Reads of differing lengths (fastq) ----
if(length(unique(width(fas))) > 1 && format=="fastq"){
## k-mer list per sequence ----
kl <- lapply(fas,
function(x){
st <- 1:(length(x)-k+1)
ed <- st+k-1
str_sub(toString(x), start = st, end = ed)
})
## kmer table ----
kmer_table <- data.frame(kmer=names(table(unlist(kl))),
count=as.integer(table(unlist(kl))))
kmer_table <- left_join(kn, kmer_table, by="kmer")
kmer_table[is.na(kmer_table)] <- 0
## k-mer matrix: start positions (rows) x reads (columns) ----
kmer_mat <- sapply(kl, "[", 1:as.numeric(max(names(table(sapply(kl, length))))))
## tally k-mers per start position ----
l_ktab <- apply(kmer_mat, 1, function(x){
data.frame(kmer=names(table(x)), count=as.integer(table(x)))})
kmer_dat <- Reduce(function(x,y){left_join(x,y, by="kmer")}, c(list(kn), l_ktab))
## relabel the columns with the start-position base numbers ----
names(kmer_dat)[-1] <- 1:as.numeric(max(names(table(sapply(kl, length)))))
## NA -> 0 ----
kmer_dat[is.na(kmer_dat)] <- 0
# 3.2 fasta format (sequences of differing lengths) ----
}else if(length(unique(width(fas))) >1 && format=="fasta"){
## k-mer list per sequence ----
kl <- lapply(fas,
function(x){
st <- 1:(length(x)-k+1)
ed <- st+k-1
str_sub(toString(x), start = st, end = ed)
})
## kmer table ----
kmer_table <- data.frame(kmer=names(table(unlist(kl))),
count=as.integer(table(unlist(kl))))
kmer_dat <- left_join(kn, kmer_table, by="kmer")
kmer_dat[is.na(kmer_dat)] <- 0
# 4. Reads of equal length -----
}else if(length(unique(width(fas)))==1){
## derive the start positions from the read width ----
starts <- 1:(max(width(fas))-k+1)
## k-mer counts -----
kmer_cnts <- lapply(seq_along(starts), function(x){
as.data.frame(table(subseq(fas, start =starts[x], width = k)), stringsAsFactors = F)})
## relabel the count columns by start position ----
invisible(lapply(starts, function(x){names(kmer_cnts[[x]]) <<- c("kmer",x)}))
## build the k-mer table ----
kmer_dat <- Reduce(function(x,y){left_join(x, y, by="kmer")},
c(list(kn), kmer_cnts))
## replace NA with 0 ----
kmer_dat[is.na(kmer_dat)] <- 0
}
# 5. Return the k-mer table ----
return(kmer_dat)
}
# ggplot of k-mer observed/expected ratios from the per-position k-mer counts
kmer_gg <- function(kmercnt, h, topn, smp){
# observed/expected (mean across positions) ratio for each k-mer
obs_exp <- cbind(kmer=kmercnt$kmer,
sweep(kmercnt[-1], 1, apply(kmercnt[-1], 1, mean), "/"))
obs_exp <- obs_exp %>%
mutate(kmer=paste(rownames(obs_exp), kmer, sep=":")) # prefix with the row number for later matching
# k-mers with the topn highest total counts
obs_exp_top <- obs_exp[order(apply(kmercnt[-1], 1, sum), decreasing = T)[1:topn],]
gg_top <- obs_exp_top %>%
mutate(kmer=factor(kmer, levels=kmer)) %>%
tidyr::gather(key=start, value=count, -kmer, factor_key=T) %>%
ggplot(aes(x=start, y=count, colour=kmer, group=kmer))+
theme_bw() +
theme(axis.text.x = element_text(size=8, angle=90, hjust=1))+
labs(title=paste0(smp,"_kmer_count_top", topn), y="Obs/Exp") +
geom_line()
# k-mers with the top Obs/Exp at each of the first h positions
gg_head <-
obs_exp[apply(obs_exp[,2:(1+h)], 2, which.max),] %>%
tidyr::gather(key=start, value=count, -kmer, factor_key=T) %>%
ggplot(aes(x=start, y=count, colour=kmer, group=kmer)) +
theme_bw() +
theme(axis.text.x = element_text(size=8, angle=90, hjust=1))+
labs(title=paste0(smp, "_head_",h ), y="Obs/Exp") +
geom_line()
return(list(gg_top, gg_head))
}
# Build the k-mer table
k7_dat <- kmer_count(filepath = f[1], format = "fastq", k = 7)
# Build the ggplot objects of k-mer Obs/Exp
gg_kmer <- kmer_gg(kmercnt = k7_dat, h =10, topn = 10, smp = "R1")
# plot
do.call(grid.arrange, c(gg_kmer, list(ncol=2)))

Various plots
These are computed from the qa results (based on 1M sampled reads by default).
Number of reads
# nread_gg (takes the qa results and a file-name suffix, returns a ggplot of read counts) ----
nread_gg <- function(qadat, suffix){
# library
pacman::p_load(dplyr, ggplot2, gridExtra)
# data frame of read counts
readat <- data.frame(sample=sub(suffix, "", rownames(qadat[["readCounts"]])),
read=qadat[["readCounts"]]$read)
#write.table(readat, "./num_reads.txt", sep="\t", quote=F, row.names=F)
read_gg <- ggplot(data = readat, aes(x = sample, y = read, fill = sample)) +
geom_bar(stat="identity") +
theme_bw()+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
return(read_gg)
}
# ggplot ----
nread <- nread_gg(qadat = qa.sum, suffix = ".fastq.gz")
print(nread)

Per-base composition (sequence content)
# sc_gg (returns ggplot objects of per-base sequence content) ----
sc_gg <- function(qadat, suffix, facet_col){
pacman::p_load(dplyr, ggplot2)
# sequence content data frame
scdat <- qadat[["perCycle"]][[1]]
# write.table(ggdat, "./sc_ggdat.txt", sep="\t", quote=F, row.names=F)
scrate <- scdat %>%
mutate(Base=factor(Base, levels = c("A","T","G","C","N")),
lane=sub(suffix, "", lane)) %>%
group_by(Cycle, lane) %>%
mutate(Sequence_Content=Count/sum(Count)) # convert to proportions
# ggplot: sequence content per sample
scgg <- ggplot(scrate, aes(x=Cycle, y=Sequence_Content, group=Base, colour=Base))+
geom_line() +
theme_bw() +
facet_wrap(~lane, ncol=facet_col)
# ggplot: fraction of N per sample
nrgg <- scrate %>%
filter(Base=="N") %>%
ggplot(aes(x=Cycle, y=Sequence_Content)) +
geom_point() +
theme_bw() +
labs(y="Sequence Content(N)") +
facet_wrap(~lane, ncol=2)
# ggplot: sequence counts per base
scgg2 <- ggplot(scrate, aes(x=Cycle, y=Count, colour=Base, group=lane))+
geom_line(alpha=0.5, size=1) +
theme_bw() +
theme(text = element_text(size=20)) +
facet_wrap(~Base, ncol=2)
# ggplot: sequence proportions per base
scgg3 <- ggplot(scrate, aes(x=Cycle, y=Sequence_Content, colour=Base, group=lane))+
geom_line(alpha=0.5, size=1) +
theme_bw() +
theme(text = element_text(size=20)) +
facet_wrap(~Base, ncol=2)
return(list(scgg, nrgg, scgg2, scgg3))
}
# ggplot ----
scs <- sc_gg(qadat = qa.sum, suffix = ".fastq.gz", facet_col = 2 )
do.call(grid.arrange, c(scs, list(ncol=2)))

Quality score (per-position mean and median)
# qs_gg (returns ggplot objects of quality scores) -----
qs_gg <- function(qadat, suffix, facet_col){
qsdat <- qadat[["perCycle"]][[2]]
# per-sample quality score (mean, median)
qsds <- qsdat %>%
mutate(lane=sub(suffix,"", lane)) %>%
group_by(Cycle, lane) %>%
summarise(median=median(rep(Score, Count)), mean=mean(rep(Score, Count))) %>%
tidyr::gather(key = Qscore, value = Quality_score, median, mean)
qs_gg <- ggplot(qsds, aes(x=Cycle, y=Quality_score, group=Qscore, colour=Qscore))+
geom_point() +
theme_bw() +
facet_wrap(~lane, ncol=facet_col)
# mean quality score across all samples
meanqs <- qsds %>% dplyr::filter(Qscore=="mean")
gttle <- paste0("mean Quality Score(all ",length(unique(meanqs$lane)), " samples)")
qs_gg2 <-
ggplot(meanqs, aes(x=Cycle, y=Quality_score, colour=lane))+
scale_color_manual(values = rep(1,length(unique(meanqs$lane)))) +
geom_point(size=2, alpha=0.5) +
theme_light() +
theme(legend.position="none", text = element_text(size=20)) +
labs(y="mean Quality Score")
return(list(qs_gg, qs_gg2))
}
# ggplot
qs <- qs_gg(qadat = qa.sum, suffix = ".fastq.gz", facet_col = 2)
do.call(gridExtra::grid.arrange, c(qs, list(ncol=2)))

GC content
# GC content (computed per read) ----
## read 1M reads with FastqSampler
myGC_sampler<- function(file_path){
sampler_fq <- FastqSampler(file_path) # n sets the number of reads
fqs <- yield(sampler_fq)
close(sampler_fq)
# GC content per read
v_gc <- letterFrequency(sread(fqs), "GC", as.prob=TRUE)*100
return(v_gc)
}
## ggplot of GC content ----
gc_gg <- function(f, suffix, facet_col){
# list of per-read GC content for each sample
l_gc <- lapply(f, function(x)myGC_sampler(file_path = x))
# assign sample names
names(l_gc) <- sub(suffix, "", sapply(strsplit(f, "/"), function(x){tail(x, n=1)}))
# data frame
## pad vectors of differing lengths with NA so they fit in one data frame
cbind.fillna <- function(l, fill){
mlen <- max(sapply(l, length))
d <- data.frame(sapply(l, function(x){ c(x, rep(NA, mlen-length(x)))}) )
return(d)
}
## data frame for ggplot
gcdat <- tidyr::gather(cbind.fillna(l_gc), key="sample", value="GC")
if(any(is.na(gcdat$GC))){
gcdat <- gcdat[-which(is.na(gcdat$GC)),]
}
# ggplot
gggc <- ggplot(gcdat, aes(x=GC)) +
geom_histogram(bins = 20, alpha=0.5) +
theme_bw()+
labs(x="GC(%)")+
facet_wrap(~sample, ncol=facet_col)
return(gggc)
}
#
system.time(gggc <- gc_gg(f = f, suffix = ".fastq.gz", facet_col=2))
## user system elapsed
## 0.173 0.024 0.200
print(gggc)

# ## Function returning per-read GC rates (give the fastq file path) ##----
# myGCcontent <- function(file_path){
# gc.rate <- numeric()
# # open input stream
# stream <- FastqStreamer(file_path) # 1M reads per chunk by default
#
# # close(stream) is guaranteed to run when the function exits
# on.exit(close(stream))
#
# # loop over chunks
# repeat {
# ## input chunk
# fq <- yield(stream)
# if (length(fq) == 0)
# break
# # GC fraction per read
# gc.chunk <- letterFrequency(sread(fq), "GC", as.prob=TRUE)
# gc.rate <- c(gc.rate, gc.chunk)
# }
#
# # vector of per-read GC fractions
# return(gc.rate*100)
# }
#system.time(l_gc <- lapply(f, function(x)myGCcontent(file_path = x)))
# ## Function to colour part of a histogram ##----
# histcol2 = function(obj, lower, upper, probability = FALSE, ...) {
# if (class(obj) != "histogram") {
# obj = hist(data, plot = FALSE)
# }
# y = if (probability) obj$density else obj$counts
# breaks = obj$breaks
# n = length(y)
# paint = (upper[1] > breaks) & (breaks >= lower[1])
# if (length(lower) > 1) {
# for (i in 2:length(lower)) {
# paint = paint | ((upper[i] > breaks) & (breaks >= lower[i]))
# }
# }
# for (i in 1:n) {
# if (paint[i]) {
# rect(breaks[i], 0, breaks[i + 1], y[i], ...)
# }
# }
# }
#
# # Histogram of per-read GC fraction for each sample ----
# par(mfrow=c(1,2))
# invisible(lapply(f, function(x){ # f holds the full paths
# nsample <- sub(".fastq.gz","",sapply(strsplit(x, "/"), function(x){tail(x, n=1)}))
# v.gc <- myGCcontent(x)
# hist.obj <- hist(v.gc, breaks = 100, main=nsample, xlab = "GC(%)")
# histcol2(hist.obj, lower=0.49, upper=0.50, density=50, col="magenta")
# }
# )
# )
Distribution of duplicated reads
# Distribution of duplicated reads ----
d.dup <- qa.sum[["sequenceDistribution"]]
s <- unique(d.dup$lane)
par(mfrow=c(1,2))
invisible(lapply(s,
function(x){
d <- d.dup[d.dup$lane==x, "nReads"]
names(d) <- d.dup$nOccurrences[d.dup$lane==x]
barplot(log10(d), cex.names = 0.5, main = x,
xlab="Occurrences", ylab = "Reads(log10)", las=2)
}
)
)

MYFQDUP <- function(qa.sum){
d.dup <- qa.sum[["sequenceDistribution"]]
s <- unique(d.dup$lane)
invisible(lapply(s,
function(x){
d <- d.dup[d.dup$lane==x, "nReads"]
names(d) <- d.dup$nOccurrences[d.dup$lane==x]
barplot(log10(d), cex.names = 0.5, main= x,
xlab="Occurrences", ylab = "Reads(log10)", las=2)
}
)
)
}
Wrapping it all into one function
# Quality assessment (read counts, sequence content, N frequency, quality score) ----
qa_gg <- function(qadat, suffix, facet_col){
pacman::p_load(dplyr, ggplot2, gridExtra)
# number of reads -----
readat <- data.frame(sample=sub(suffix, "", rownames(qadat[["readCounts"]])),
read=qadat[["readCounts"]]$read)
read_gg <- ggplot(data = readat, aes(x = sample, y = read, fill = sample)) +
geom_bar(stat="identity") +
theme_bw()+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# sequence content ----
scdat <- qadat[["perCycle"]][[1]]
scrate <- scdat %>%
arrange(Base) %>%
mutate(Base=factor(Base, levels=c("A","T","G","C","N"))) %>%
mutate(lane=sub(suffix, "", lane)) %>%
group_by(Cycle, lane) %>%
mutate(sc=Count/sum(Count))
write.table(scrate, "./sc_ggdat.txt", sep="\t", quote=F, row.names=F)
sc_gg <- ggplot(scrate, aes(x=Cycle, y=sc, group=Base, colour=Base))+
geom_line() +
theme_bw() +
facet_wrap(~lane, ncol=facet_col)
# sequence content per base ----
sc_gg2 <- ggplot(scrate, aes(x=Cycle, y=Count, colour=Base, group=lane))+
geom_line(alpha=0.5, size=1.5) +
theme_bw() +
theme(text = element_text(size=20)) +
facet_wrap(~Base, ncol=2)
# frequency of N -----
nrates <- scdat %>%
mutate(lane=sub(suffix, "", lane)) %>%
group_by(Cycle, lane) %>%
mutate(freq_of_N=Count/sum(Count)) %>%
filter(Base=="N")
write.table(nrates, "./nr_ggdat.txt", sep="\t", quote=F, row.names=F)
nrate_gg <- ggplot(nrates, aes(x=Cycle, y=freq_of_N))+
geom_point() +
theme_bw() +
facet_wrap(~lane, ncol=facet_col)
# quality score per sample -----
qsdat <- qadat[["perCycle"]][[2]]
qsds <- qsdat %>%
mutate(lane=sub(suffix,"", lane)) %>%
group_by(Cycle, lane) %>%
summarise(median=median(rep(Score, Count)), mean=mean(rep(Score, Count))) %>%
tidyr::gather(key = Qscore, value = value, median, mean)
write.table(qsds, "./qs_ggdat.txt", sep="\t", quote=F, row.names=F)
qs_gg <- ggplot(qsds, aes(x=Cycle, y=value, group=Qscore, colour=Qscore))+
geom_point() +
theme_bw() +
facet_wrap(~lane, ncol=facet_col)
# quality score, all samples -----
meanqs <- qsds %>% dplyr::filter(Qscore=="mean")
gttle <- paste0("mean Quality Score(all ",length(unique(meanqs$lane)), " samples)")
qs_gg2 <-
ggplot(meanqs, aes(x=Cycle, y=value, colour=lane))+
scale_color_manual(values = rep(1,length(unique(meanqs$lane)))) +
geom_point(size=0.7, alpha=0.5) +
theme_light() +
theme(legend.position="none", text = element_text(size=20)) +
labs(y="mean Quality Score")
return(list(read_gg, sc_gg, sc_gg2, nrate_gg, qs_gg, qs_gg2))
}
## build the ggplot objects ----
ggall <- qa_gg(qadat = qa.sum, suffix = ".fastq.gz", facet_col = 2)
## draw them all together ----
grid.arrange(ggall[[1]], ggall[[2]],ggall[[3]],ggall[[4]],ggall[[5]],ggall[[6]], ncol=2)

do.call(gridExtra::grid.arrange, c(ggall, list(ncol=2)))

# ppl <- list(p1 = arrangeGrob(grobs=ggall[2]),
# p2 = arrangeGrob(grobs=ggall[4]),
# p3 = arrangeGrob(grobs=ggall[5]))
# class(ppl) <- c("arrangelist", class(ggall))
# ggsave("multipage.pdf", ppl)
Filtering and trimming
srFilter filter reads by user-specified criteria
trimTails, etc. trim low-quality bases
narrow extract a specified range (start/end)
tables summary of read occurrence frequencies
srduplicated, etc. identify duplicated reads
filterFastq filter reads from one file to another (see the sketch below)
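As a lighter-weight alternative to the streaming function defined below, filterFastq can write a filtered copy of a file directly (a sketch; it assumes filter may be a function returning the filtered ShortReadQ chunk, see ?filterFastq, and "filtered.fq.gz" is just an example output name):
filterFastq(f[1], "filtered.fq.gz",
filter = function(x) {
x <- x[nFilter()(x)] # drop reads containing N
x[alphabetScore(x) / width(x) >= 20] # keep reads with mean Phred >= 20
})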
# Function definition: trimming and quality filtering ----
myFilterAndTrim <-
## writes a trimmed/filtered fastq file next to the input fastq file
function(fl, destination=sprintf("%s", sub(".fastq.gz", ".qc.fq.gz", fl))){
## open input stream
stream <- FastqStreamer(fl)
fn <- unlist(strsplit(fl, "/"))[length(unlist(strsplit(fl, "/")))]
## close(stream) is guaranteed to run when the function exits
on.exit(close(stream))
## filtering/trimming loop
nfil <- numeric(); wfil <- numeric(); i <- 0
repeat {
## input chunk
fq <- yield(stream)
if (length(fq) == 0)
break
i <- i+1
len1 <- length(fq)
## keep reads without N: nFilter(threshold)
fq <- fq[nFilter()(fq)]
len2 <- length(fq)
nfil[i] <- len1 - len2
#cat(paste0("chunk", i, ": N_filterd\n"))
## drop the first 13 bp; to also drop the last 3 bp, use end = width(fq)-3
fq <- narrow(x = fq, start = 14 )
## trim tails where bases fall at or below quality "4" (Phred score < 20)
fq <- trimTailw(fq, k=2, a="4", halfwidth=2)
## discard reads shorter than 20 bp after trimming
fq <- fq[width(fq) >= 20]
wfil[i] <- len2 - length(fq)
#cat(paste0("chunk", i, ": trimmed_filterd\n"))
## write next to the input file; mode="a" appends,
## gzip compression by default
writeFastq(object = fq, file = destination, mode = "a")
}
cat(paste(fn, "N-containing", sum(nfil), "discard\n",sep = "\t"))
cat(paste(fn, "width of trimmed_fq < 20", sum(wfil), "discard\n",sep = "\t"))
}
# Run ----
## These 20,000-read subsets take well under a second; full-size fastq files take much longer.
system.time(myFilterAndTrim(fl = f[1])) #
## ERR127302_1_subset.fastq.gz N-containing 529 discard
## ERR127302_1_subset.fastq.gz width of trimmed_fq < 20 1212 discard
## user system elapsed
## 0.519 0.031 0.579
system.time(myFilterAndTrim(fl = f[2])) #
## ERR127302_2_subset.fastq.gz N-containing 529 discard
## ERR127302_2_subset.fastq.gz width of trimmed_fq < 20 2197 discard
## user system elapsed
## 0.778 0.015 0.809
# system.time(sapply(fls, myFilterAndTrim))
# # Drop reads in which the fraction of high-quality bases is below 20% (the code below uses QV > 30 as the cutoff) ----
# ## Far too slow: roughly 20 min for 1M reads
# system.time(
# {
# torf <- logical()
# pb <- txtProgressBar(min = 1, max = length(fq), style = 3) # progress bar
# for(i in 1:length(fq)){
# qsi <-unlist(as(PhredQuality(toString(quality(fq)[[i]])), "IntegerList"))
# torf[i] <- ifelse(length(qsi[qsi>30])/length(qsi) > 0.2, T, F)
# setTxtProgressBar(pb, i)
# }
# }
# )
# fq <- fq[torf]
# system.time(
# torf <- sapply(c(1:length(fq)), function(x){
# qsi <-unlist(as(PhredQuality(toString(quality(fq)[[x]])), "IntegerList"))
# ifelse(length(qsi[qsi>30])/length(qsi) > 0.2, T, F)
# })
# )
Paired-end: extracting the reads common to both files
- The approach depends on the platform; check the id format first (see the sketch after this list).
- Write the matched reads back out with writeFastq.
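For example, with newer Illumina (CASAVA 1.8+) style ids the mate number sits in a separate whitespace-delimited field instead of a "/1" or "/2" suffix, so the shared key is the first field (a sketch with made-up ids):
ids <- c("M00001:12:000000000-A1B2C:1:1101:15589:1332 1:N:0:1",
"M00001:12:000000000-A1B2C:1:1101:15589:1332 2:N:0:1")
sapply(strsplit(ids, " "), "[", 1) # identical key for both mates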
# List the QC'd paired-end fastq files ----
qcfq <- list.files(p, "qc.fq.gz", full.names = T)
# Read the trimmed fastq files ----
R1 <- readFastq(qcfq[1]); R2 <- readFastq(qcfq[2])
length(R1); length(R2)
## [1] 18259
## [1] 17274
# Check the id format ----
toString(ShortRead::id(R1)[1]); toString(ShortRead::id(R2)[1])
## [1] "ERR127302.8493430 HWI-EAS350_0441:1:34:16191:2123#0/1"
## [1] "ERR127302.8493430 HWI-EAS350_0441:1:34:16191:2123#0/2"
# Extract the ids ----
R1id <- as.vector(ShortRead::id(R1))
R2id <- as.vector(ShortRead::id(R2))
head(R1id)
## [1] "ERR127302.8493430 HWI-EAS350_0441:1:34:16191:2123#0/1"
## [2] "ERR127302.21406531 HWI-EAS350_0441:1:88:9330:2587#0/1"
## [3] "ERR127302.22173106 HWI-EAS350_0441:1:91:10434:14757#0/1"
## [4] "ERR127302.10402268 HWI-EAS350_0441:1:42:13757:9559#0/1"
## [5] "ERR127302.19486260 HWI-EAS350_0441:1:80:13838:5544#0/1"
## [6] "ERR127302.5145549 HWI-EAS350_0441:1:21:5902:5333#0/1"
head(R2id)
## [1] "ERR127302.8493430 HWI-EAS350_0441:1:34:16191:2123#0/2"
## [2] "ERR127302.21406531 HWI-EAS350_0441:1:88:9330:2587#0/2"
## [3] "ERR127302.22173106 HWI-EAS350_0441:1:91:10434:14757#0/2"
## [4] "ERR127302.10402268 HWI-EAS350_0441:1:42:13757:9559#0/2"
## [5] "ERR127302.5145549 HWI-EAS350_0441:1:21:5902:5333#0/2"
## [6] "ERR127302.643220 HWI-EAS350_0441:1:3:10784:4167#0/2"
# Strip the mate suffix (adjust to your files' id format) ----
R1id.pre <- sapply(strsplit(R1id, "/"), "[",1)
R2id.pre <- sapply(strsplit(R2id, "/"), "[",1)
# Ids present in both files ----
com.id <- intersect(R1id.pre, R2id.pre)
length(com.id); head(com.id)
## [1] 16033
## [1] "ERR127302.8493430 HWI-EAS350_0441:1:34:16191:2123#0"
## [2] "ERR127302.21406531 HWI-EAS350_0441:1:88:9330:2587#0"
## [3] "ERR127302.22173106 HWI-EAS350_0441:1:91:10434:14757#0"
## [4] "ERR127302.10402268 HWI-EAS350_0441:1:42:13757:9559#0"
## [5] "ERR127302.5145549 HWI-EAS350_0441:1:21:5902:5333#0"
## [6] "ERR127302.643220 HWI-EAS350_0441:1:3:10784:4167#0"
# Subset each fastq to the shared ids ----
R1.com <- R1[match(paste0(com.id, "/1"), as.vector(ShortRead::id(R1)))]
R2.com <- R2[match(paste0(com.id, "/2"), as.vector(ShortRead::id(R2)))]
# check
ShortRead::id(R1.com); ShortRead::id(R2.com)
## A BStringSet instance of length 16033
## width seq
## [1] 53 ERR127302.8493430 HWI-EAS350_0441:1:34:16191:2123#0/1
## [2] 53 ERR127302.21406531 HWI-EAS350_0441:1:88:9330:2587#0/1
## [3] 55 ERR127302.22173106 HWI-EAS350_0441:1:91:10434:14757#0/1
## [4] 54 ERR127302.10402268 HWI-EAS350_0441:1:42:13757:9559#0/1
## [5] 52 ERR127302.5145549 HWI-EAS350_0441:1:21:5902:5333#0/1
## ... ... ...
## [16029] 55 ERR127302.25561965 HWI-EAS350_0441:1:104:18378:4938#0/1
## [16030] 56 ERR127302.24686961 HWI-EAS350_0441:1:101:10393:16964#0/1
## [16031] 54 ERR127302.8014168 HWI-EAS350_0441:1:32:16962:18478#0/1
## [16032] 54 ERR127302.14811006 HWI-EAS350_0441:1:61:5820:18247#0/1
## [16033] 51 ERR127302.1875050 HWI-EAS350_0441:1:8:6759:5325#0/1
## A BStringSet instance of length 16033
## width seq
## [1] 53 ERR127302.8493430 HWI-EAS350_0441:1:34:16191:2123#0/2
## [2] 53 ERR127302.21406531 HWI-EAS350_0441:1:88:9330:2587#0/2
## [3] 55 ERR127302.22173106 HWI-EAS350_0441:1:91:10434:14757#0/2
## [4] 54 ERR127302.10402268 HWI-EAS350_0441:1:42:13757:9559#0/2
## [5] 52 ERR127302.5145549 HWI-EAS350_0441:1:21:5902:5333#0/2
## ... ... ...
## [16029] 55 ERR127302.25561965 HWI-EAS350_0441:1:104:18378:4938#0/2
## [16030] 56 ERR127302.24686961 HWI-EAS350_0441:1:101:10393:16964#0/2
## [16031] 54 ERR127302.8014168 HWI-EAS350_0441:1:32:16962:18478#0/2
## [16032] 54 ERR127302.14811006 HWI-EAS350_0441:1:61:5820:18247#0/2
## [16033] 51 ERR127302.1875050 HWI-EAS350_0441:1:8:6759:5325#0/2
# Write out the fastq files ----
writeFastq(R1.com, "R1.fq.gz", mode="a", compress=T) # mode="a" appends; compress=T writes gzip
writeFastq(R2.com, "R2.fq.gz", mode="a", compress=T)
Applying this to multiple files
# List the QC'd paired-end fastq files ----
qcfq <- list.files(p, "qc.fq.gz", full.names = T)
# Split into R1 / R2 file vectors ----
R1fqs <- grep("_1_", qcfq, value = T)
R2fqs <- grep("_2_", qcfq, value = T)
#
pairsfq <- function(x, y){
r1 <- sapply(strsplit(x, "/"), function(a)tail(a, 1)) # R1 file name
r2 <- sapply(strsplit(y, "/"), function(a)tail(a, 1)) # R2 file name
# check that the sample ids match
if(sapply(strsplit(r1, "_1_"), "[", 1) == sapply(strsplit(r2, "_2_"), "[", 1)){
# read the fastq files
R1 <- readFastq(x); R2 <- readFastq(y)
# read ids
R1id.pre <- sapply(strsplit(as.vector(ShortRead::id(R1)) , "/"), "[", 1)
R2id.pre <- sapply(strsplit(as.vector(ShortRead::id(R2)) , "/"), "[", 1)
# ids common to both
com.id <- intersect(R1id.pre, R2id.pre)
# extract the common reads
R1.com <- R1[match(paste0(com.id, "/1"), as.vector(ShortRead::id(R1)))]
R2.com <- R2[match(paste0(com.id, "/2"), as.vector(ShortRead::id(R2)))]
# write out the files
wx <- sub("qc.fq.gz", "com.qc.fq.gz", x)
wy <- sub("qc.fq.gz", "com.qc.fq.gz", y)
writeFastq(R1.com, wx, mode="a", compress=T)
writeFastq(R2.com, wy, mode="a", compress=T)
}
}
# run over all pairs with mapply
invisible(mapply(pairsfq, R1fqs, R2fqs))
Environment
sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] ja_JP.UTF-8/ja_JP.UTF-8/ja_JP.UTF-8/C/ja_JP.UTF-8/ja_JP.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] bindrcpp_0.2 data.table_1.10.4
## [3] gridExtra_2.2.1 ggplot2_2.2.1
## [5] dplyr_0.7.2 ShortRead_1.34.0
## [7] GenomicAlignments_1.12.1 SummarizedExperiment_1.6.3
## [9] DelayedArray_0.2.7 matrixStats_0.52.2
## [11] Biobase_2.36.2 Rsamtools_1.28.0
## [13] GenomicRanges_1.28.4 GenomeInfoDb_1.12.2
## [15] Biostrings_2.44.2 XVector_0.16.0
## [17] IRanges_2.10.2 S4Vectors_0.14.3
## [19] BiocParallel_1.10.1 BiocGenerics_0.22.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.12 plyr_1.8.4
## [3] bindr_0.1 compiler_3.4.1
## [5] RColorBrewer_1.1-2 bitops_1.0-6
## [7] tools_3.4.1 zlibbioc_1.22.0
## [9] digest_0.6.12 gtable_0.2.0
## [11] tibble_1.3.3 evaluate_0.10.1
## [13] lattice_0.20-35 pkgconfig_2.0.1
## [15] rlang_0.1.1 Matrix_1.2-10
## [17] yaml_2.1.14 GenomeInfoDbData_0.99.0
## [19] stringr_1.2.0 hwriter_1.3.2
## [21] knitr_1.16 rprojroot_1.2
## [23] grid_3.4.1 glue_1.1.1
## [25] R6_2.2.2 rmarkdown_1.6
## [27] pacman_0.4.6 latticeExtra_0.6-28
## [29] tidyr_0.6.3 magrittr_1.5
## [31] scales_0.4.1 backports_1.1.0
## [33] htmltools_0.3.6 assertthat_0.2.0
## [35] colorspace_1.3-2 labeling_0.3
## [37] stringi_1.1.5 lazyeval_0.2.0
## [39] munsell_0.4.3 RCurl_1.95-4.8