メタデータ読み込み
meta <- read.table("PRJEB4224.txt", header = T, sep = "\t", stringsAsFactors = F)
meta[1:3,1:10]
## study_accession sample_accession secondary_sample_accession
## 1 PRJEB4224 SAMEA2173964 ERS342022
## 2 PRJEB4224 SAMEA2173965 ERS342023
## 3 PRJEB4224 SAMEA2173966 ERS342024
## experiment_accession run_accession tax_id scientific_name
## 1 ERX306486 ERR333581 1118232 root metagenome
## 2 ERX306487 ERR333582 1118232 root metagenome
## 3 ERX306488 ERR333583 1118232 root metagenome
## instrument_model library_name library_layout
## 1 Illumina MiSeq RunB PAIRED
## 2 Illumina MiSeq RunB PAIRED
## 3 Illumina MiSeq RunB PAIRED
OTUテーブルの作成
- サンプル別のOTUテーブルの読み込み(usearch_globalの出力ファイル)
- サンプル別のOTUテーブルの結合
- taxonomy アノテーションをまとめる。
- OTUテーブル作成
- OTUテーブル書き出し
# 1. サンプル別のOTUテーブルの読み込み ----
fls <- list.files(".", ".otu.tab")
v.samples <- sub(".otu.tab", "", fls)
l.cnt <- list()
for(i in 1:length(fls)){
d <- read.table(fls[i], header=T, sep="\t", stringsAsFactors=F)
names(d) <- c("OTU_ID", v.samples[i], paste0(v.samples[i], "_tax"))
l.cnt[[i]] <- d
}
## check
head(l.cnt[[1]])
## OTU_ID ERR333581
## 1 Otu232 3273
## 2 Otu612 1692
## 3 Otu214 1343
## 4 Otu115 10280
## 5 Otu4312 174
## 6 Otu145 6623
## ERR333581_tax
## 1 d:Bacteria(1.0000),p:Cyanobacteria/Chloroplast(0.9997),c:Chloroplast(0.9987),f:Chloroplast(0.9976),g:Streptophyta(0.9898)
## 2 d:Bacteria(0.1029),p:Proteobacteria(0.0164),c:Alphaproteobacteria(0.0104),o:Rhodospirillales(0.0026),f:Acetobacteraceae(0.0012),g:Stella(0.0007)
## 3 d:Bacteria(1.0000),p:Cyanobacteria/Chloroplast(0.9997),c:Chloroplast(0.9987),f:Chloroplast(0.9976),g:Streptophyta(0.9898)
## 4 d:Bacteria(1.0000),p:Cyanobacteria/Chloroplast(0.9998),c:Chloroplast(0.9961),f:Chloroplast(0.9909),g:Streptophyta(0.9952)
## 5 d:Bacteria(1.0000),p:Cyanobacteria/Chloroplast(0.9999),c:Chloroplast(0.9996),f:Chloroplast(0.9994),g:Streptophyta(0.9957)
## 6 d:Bacteria(1.0000),p:Cyanobacteria/Chloroplast(0.9996),c:Chloroplast(0.9984),f:Chloroplast(0.9972),g:Streptophyta(0.9833)
# 2. サンプル別のOTUテーブルの結合 ----
suppressWarnings(suppressMessages(library(dplyr)))
otudat <- Reduce(function(x,y){full_join(x,y, by="OTU_ID")}, l.cnt)
## OTU id が付いていないやつを除外
otudat <- otudat[-which(otudat$OTU_ID==""),]
dim(otudat)
## [1] 15400 145
# 3. taxonomy アノテーションをまとめる。 ----
options(stringsAsFactors = F)
utax <- data.frame(t(otudat[, grep("_tax",names(otudat))]), stringsAsFactors = F)
utax <- unlist(lapply(utax, function(x){
x[which(is.na(x))] <- unique(x[which(!is.na(x))])}))
names(utax) <- otudat$OTU_ID
utax[1:3]
## Otu232
## "d:Bacteria(1.0000),p:Cyanobacteria/Chloroplast(0.9997),c:Chloroplast(0.9987),f:Chloroplast(0.9976),g:Streptophyta(0.9898)"
## Otu612
## "d:Bacteria(0.1029),p:Proteobacteria(0.0164),c:Alphaproteobacteria(0.0104),o:Rhodospirillales(0.0026),f:Acetobacteraceae(0.0012),g:Stella(0.0007)"
## Otu214
## "d:Bacteria(1.0000),p:Cyanobacteria/Chloroplast(0.9997),c:Chloroplast(0.9987),f:Chloroplast(0.9976),g:Streptophyta(0.9898)"
# 4. OTUテーブル作成 ----
cnt <- otudat[,c(-grep("_tax",names(otudat)))]
cnt[is.na(cnt)] <- 0
dat <- transform(cnt, tax.anno =utax ) # taxonomy annotation有りのOTUテーブル
rownames(cnt) <- cnt$OTU_ID; cnt <- cnt[-1] # カウントデータのみOTUテーブル
dim(cnt); dim(dat)
## [1] 15400 72
## [1] 15400 74
cnt[1:3,1:3]
## ERR333581 ERR333582 ERR333583
## Otu232 3273 877 1085
## Otu612 1692 558 645
## Otu214 1343 407 486
# 5. カウントテーブル書き出し ----
write.table(dat, "cnt.txt", sep="\t", quote=F, row.names = F)
taxonomy アノテーションの集計
- OTU配列に付与された分類階層(界 門 綱 目 科 属 種)ごとに集計する
# Taxonomy annotation リスト
l.taxon <- strsplit(dat$tax.anno, ",")
table(unlist(lapply(l.taxon, function(x){sub(":.*", "", x)})))
##
## c d f g o p
## 14543 15400 11604 15400 11391 15400
## ドメイン BacteriaもしくはArchaea
domains <- unlist(lapply(l.taxon, function(x){x[grep("d:", x)]}))
table(sub("\\(.*", "", domains))
##
## d:Archaea d:Bacteria
## 120 15280
## 門 phylum
phylums <- unlist(lapply(l.taxon, function(x){x[grep("p:", x)]}))
head(data.frame(sort(table(sub("\\(.*", "", phylums)), decreasing = T)))
## Var1 Freq
## 1 p:Proteobacteria 4513
## 2 p:Actinobacteria 2629
## 3 p:Acidobacteria 2101
## 4 p:Firmicutes 1178
## 5 p:Bacteroidetes 1028
## 6 p:Planctomycetes 864
## 綱 class
classes <- unlist(lapply(l.taxon, function(x){x[grep("c:", x)]}))
head(data.frame(sort(table(sub("\\(.*", "", classes)), decreasing = T)))
## Var1 Freq
## 1 c:Actinobacteria 2603
## 2 c:Alphaproteobacteria 1399
## 3 c:Deltaproteobacteria 1214
## 4 c:Gammaproteobacteria 1047
## 5 c:Planctomycetia 851
## 6 c:Betaproteobacteria 818
## 目 order
orders <- unlist(lapply(l.taxon, function(x){x[grep("o:", x)]}))
head(data.frame(sort(table(sub("\\(.*", "", orders)), decreasing = T)))
## Var1 Freq
## 1 o:Actinomycetales 1729
## 2 o:Planctomycetales 824
## 3 o:Myxococcales 820
## 4 o:Sphingobacteriales 622
## 5 o:Bacillales 594
## 6 o:Rhizobiales 567
## 科 family
family <- unlist(lapply(l.taxon, function(x){x[grep("f:", x)]}))
head(data.frame(sort(table(sub("\\(.*", "", family)), decreasing = T)))
## Var1 Freq
## 1 f:Planctomycetaceae 824
## 2 f:Chitinophagaceae 464
## 3 f:Streptomycetaceae 372
## 4 f:Gemmatimonadaceae 347
## 5 f:Xanthomonadaceae 327
## 6 f:Gaiellaceae 320
## 属 genus
genus <- unlist(lapply(l.taxon, function(x){x[grep("g:", x)]}))
head(data.frame(sort(table(sub("\\(.*", "", genus)), decreasing = T)))
## Var1 Freq
## 1 g:Gp6 523
## 2 g:Streptomyces 353
## 3 g:Subdivision3_genera_incertae_sedis 348
## 4 g:Gemmatimonas 347
## 5 g:Gaiella 320
## 6 g:Gp4 319
phylumレベルの解析
- phylumレベルのOTUテーブルの作成
- 信頼レベル0.97以上のOTUテーブルを取り出す。
- 頻度値に変換
- サンプルの種類
- サンプル別積み上げ頻度グラフ
# 1. phylumレベルのOTUテーブルの作成 ----
v.p <- unlist(lapply(l.taxon, function(x){x[grep("p:", x)]}))
uv.p <- v.p[as.numeric(sub("\\)", "", sub(".*\\(", "", v.p))) >= 0.97]
v.p <- sub("\\(.*", "", v.p); v.p <- gsub("\"", "", v.p)
uv.p <- sub("\\(.*", "", uv.p); uv.p <- gsub("\"", "", uv.p); uv.p <- unique(uv.p)
phylum.dat <- cbind(v.phylum=v.p, cnt)
## check
phylum.dat[1:5,1:5]
## v.phylum ERR333581 ERR333582 ERR333583
## Otu232 p:Cyanobacteria/Chloroplast 3273 877 1085
## Otu612 p:Proteobacteria 1692 558 645
## Otu214 p:Cyanobacteria/Chloroplast 1343 407 486
## Otu115 p:Cyanobacteria/Chloroplast 10280 0 3542
## Otu4312 p:Cyanobacteria/Chloroplast 174 50 54
## ERR333584
## Otu232 7
## Otu612 1737
## Otu214 6
## Otu115 53
## Otu4312 1
dim(phylum.dat)
## [1] 15400 73
# 2. 信頼レベル0.97以上のOTUテーブルを取り出す。----
up.cnt <- data.frame(matrix(NA, nrow=length(uv.p), ncol = 72))
names(up.cnt) <- names(phylum.dat)[-1]; rownames(up.cnt) <- uv.p
for(i in 1:length(uv.p)){
up.cnt[i,] <- apply(phylum.dat[phylum.dat$v.phylum == uv.p[i], c(2:ncol(phylum.dat))], 2, sum)
}
## check
up.cnt[1:5,1:5]
## ERR333581 ERR333582 ERR333583 ERR333584
## p:Cyanobacteria/Chloroplast 93341 23790 29547 577
## p:Proteobacteria 22213 6320 7391 22623
## p:Actinobacteria 7564 1838 2300 8257
## p:Bacteroidetes 759 179 219 842
## p:Acidobacteria 244 52 101 243
## ERR333585
## p:Cyanobacteria/Chloroplast 57612
## p:Proteobacteria 5659
## p:Actinobacteria 4681
## p:Bacteroidetes 417
## p:Acidobacteria 132
dim(up.cnt)
## [1] 23 72
# 3. 頻度値に変換 ----
up.dat <- data.frame(matrix(NA, nrow=length(uv.p), ncol = 72))
names(up.dat) <- names(up.cnt); rownames(up.dat) <- uv.p
for(i in 1:length(uv.p)){
up.dat[i,] <- (up.cnt[i,]/apply(up.cnt, 2, sum))*100
}
## check
up.dat[1:5,1:5]
## ERR333581 ERR333582 ERR333583 ERR333584
## p:Cyanobacteria/Chloroplast 68.9861349 67.2889266 68.1183143 1.353920
## p:Proteobacteria 16.4171052 17.8758309 17.0393766 53.084450
## p:Actinobacteria 5.5903743 5.1986989 5.3024714 19.374897
## p:Bacteroidetes 0.5609590 0.5062933 0.5048875 1.975737
## p:Acidobacteria 0.1803347 0.1470796 0.2328477 0.570195
## ERR333585
## p:Cyanobacteria/Chloroplast 82.0193047
## p:Proteobacteria 8.0564335
## p:Actinobacteria 6.6641041
## p:Bacteroidetes 0.5936619
## p:Acidobacteria 0.1879218
# 4. サンプルの種類----
sc.names <- meta$scientific_name[match(names(up.dat), meta$run_accession)]
soil.names <- names(up.dat)[sc.names =="soil metagenome"]
root.names <- names(up.dat)[sc.names =="root metagenome"]
leaf.names <- names(up.dat)[sc.names =="leaf metagenome"]
synt.names <- names(up.dat)[sc.names =="synthetic metagenome"]
# 5. サンプル別積み上げ頻度グラフ----
suppressMessages(suppressWarnings(library(RColorBrewer)))
phylum.col <- c(brewer.pal(12, "Set3"), brewer.pal(9, "Set1"), brewer.pal(4, "Dark2"))
par(mfrow=c(2,2))
barplot(as.matrix(up.dat[soil.names]), las=2, col = phylum.col, cex.names = 0.7)
barplot(as.matrix(up.dat[leaf.names]), las=2, col = phylum.col, cex.names = 0.7)
barplot(as.matrix(up.dat[root.names]), las=2, col = phylum.col, cex.names = 0.7)
plot(x = 1:200, y = 1:200, xaxt="n", yaxt="n", pch="", xlab=NA, ylab=NA, bty="n")
legend(20, 200, rownames(up.dat), col = phylum.col, pch=15, bty = "n", cex = 0.7)
