1 メタデータ読み込み

meta <- read.table("PRJEB4224.txt", header = T, sep = "\t", stringsAsFactors = F) 
meta[1:3,1:10]
##   study_accession sample_accession secondary_sample_accession
## 1       PRJEB4224     SAMEA2173964                  ERS342022
## 2       PRJEB4224     SAMEA2173965                  ERS342023
## 3       PRJEB4224     SAMEA2173966                  ERS342024
##   experiment_accession run_accession  tax_id scientific_name
## 1            ERX306486     ERR333581 1118232 root metagenome
## 2            ERX306487     ERR333582 1118232 root metagenome
## 3            ERX306488     ERR333583 1118232 root metagenome
##   instrument_model library_name library_layout
## 1   Illumina MiSeq         RunB         PAIRED
## 2   Illumina MiSeq         RunB         PAIRED
## 3   Illumina MiSeq         RunB         PAIRED

2 OTUテーブルの作成

  1. サンプル別のOTUテーブルの読み込み(usearch_globalの出力ファイル)
  2. サンプル別のOTUテーブルの結合
  3. taxonomy アノテーションをまとめる。
  4. OTUテーブル作成
  5. OTUテーブル書き出し
# 1. サンプル別のOTUテーブルの読み込み ----
fls <- list.files(".", ".otu.tab")
v.samples <- sub(".otu.tab", "", fls)

l.cnt <- list()
for(i in 1:length(fls)){
    d <- read.table(fls[i], header=T, sep="\t", stringsAsFactors=F)
    names(d) <- c("OTU_ID", v.samples[i], paste0(v.samples[i], "_tax"))
    l.cnt[[i]] <- d
}

## check
head(l.cnt[[1]])
##    OTU_ID ERR333581
## 1  Otu232      3273
## 2  Otu612      1692
## 3  Otu214      1343
## 4  Otu115     10280
## 5 Otu4312       174
## 6  Otu145      6623
##                                                                                                                                      ERR333581_tax
## 1                        d:Bacteria(1.0000),p:Cyanobacteria/Chloroplast(0.9997),c:Chloroplast(0.9987),f:Chloroplast(0.9976),g:Streptophyta(0.9898)
## 2 d:Bacteria(0.1029),p:Proteobacteria(0.0164),c:Alphaproteobacteria(0.0104),o:Rhodospirillales(0.0026),f:Acetobacteraceae(0.0012),g:Stella(0.0007)
## 3                        d:Bacteria(1.0000),p:Cyanobacteria/Chloroplast(0.9997),c:Chloroplast(0.9987),f:Chloroplast(0.9976),g:Streptophyta(0.9898)
## 4                        d:Bacteria(1.0000),p:Cyanobacteria/Chloroplast(0.9998),c:Chloroplast(0.9961),f:Chloroplast(0.9909),g:Streptophyta(0.9952)
## 5                        d:Bacteria(1.0000),p:Cyanobacteria/Chloroplast(0.9999),c:Chloroplast(0.9996),f:Chloroplast(0.9994),g:Streptophyta(0.9957)
## 6                        d:Bacteria(1.0000),p:Cyanobacteria/Chloroplast(0.9996),c:Chloroplast(0.9984),f:Chloroplast(0.9972),g:Streptophyta(0.9833)
# 2. サンプル別のOTUテーブルの結合 ----
suppressWarnings(suppressMessages(library(dplyr)))
otudat <- Reduce(function(x,y){full_join(x,y, by="OTU_ID")}, l.cnt)

## OTU id が付いていないやつを除外 
otudat <- otudat[-which(otudat$OTU_ID==""),]
dim(otudat)
## [1] 15400   145
# 3. taxonomy アノテーションをまとめる。 ----
options(stringsAsFactors = F)
utax <- data.frame(t(otudat[, grep("_tax",names(otudat))]), stringsAsFactors = F)
utax <- unlist(lapply(utax, function(x){
  x[which(is.na(x))] <- unique(x[which(!is.na(x))])}))
names(utax) <- otudat$OTU_ID
utax[1:3]
##                                                                                                                                             Otu232 
##                        "d:Bacteria(1.0000),p:Cyanobacteria/Chloroplast(0.9997),c:Chloroplast(0.9987),f:Chloroplast(0.9976),g:Streptophyta(0.9898)" 
##                                                                                                                                             Otu612 
## "d:Bacteria(0.1029),p:Proteobacteria(0.0164),c:Alphaproteobacteria(0.0104),o:Rhodospirillales(0.0026),f:Acetobacteraceae(0.0012),g:Stella(0.0007)" 
##                                                                                                                                             Otu214 
##                        "d:Bacteria(1.0000),p:Cyanobacteria/Chloroplast(0.9997),c:Chloroplast(0.9987),f:Chloroplast(0.9976),g:Streptophyta(0.9898)"
# 4. OTUテーブル作成 ----
cnt <- otudat[,c(-grep("_tax",names(otudat)))]
cnt[is.na(cnt)] <- 0

dat <- transform(cnt, tax.anno =utax )  # taxonomy annotation有りのOTUテーブル
rownames(cnt) <- cnt$OTU_ID; cnt <- cnt[-1] # カウントデータのみOTUテーブル

dim(cnt); dim(dat)
## [1] 15400    72
## [1] 15400    74
cnt[1:3,1:3]
##        ERR333581 ERR333582 ERR333583
## Otu232      3273       877      1085
## Otu612      1692       558       645
## Otu214      1343       407       486
# 5. カウントテーブル書き出し ----
write.table(dat, "cnt.txt",  sep="\t", quote=F, row.names = F)

3 taxonomy アノテーションの集計

  • OTU配列に付与された分類階層(界 門 綱 目 科 属 種)ごとに集計する
# Taxonomy annotation リスト
l.taxon <- strsplit(dat$tax.anno, ",")
table(unlist(lapply(l.taxon, function(x){sub(":.*", "", x)})))
## 
##     c     d     f     g     o     p 
## 14543 15400 11604 15400 11391 15400
## ドメイン BacteriaもしくはArchaea
domains <- unlist(lapply(l.taxon,  function(x){x[grep("d:", x)]})) 
table(sub("\\(.*", "", domains))
## 
##  d:Archaea d:Bacteria 
##        120      15280
## 門 phylum
phylums <- unlist(lapply(l.taxon,  function(x){x[grep("p:", x)]})) 
head(data.frame(sort(table(sub("\\(.*", "", phylums)), decreasing = T)))
##               Var1 Freq
## 1 p:Proteobacteria 4513
## 2 p:Actinobacteria 2629
## 3  p:Acidobacteria 2101
## 4     p:Firmicutes 1178
## 5  p:Bacteroidetes 1028
## 6 p:Planctomycetes  864
## 綱 class
classes <- unlist(lapply(l.taxon,  function(x){x[grep("c:", x)]})) 
head(data.frame(sort(table(sub("\\(.*", "", classes)), decreasing = T)))
##                    Var1 Freq
## 1      c:Actinobacteria 2603
## 2 c:Alphaproteobacteria 1399
## 3 c:Deltaproteobacteria 1214
## 4 c:Gammaproteobacteria 1047
## 5      c:Planctomycetia  851
## 6  c:Betaproteobacteria  818
## 目 order
orders <- unlist(lapply(l.taxon,  function(x){x[grep("o:", x)]})) 
head(data.frame(sort(table(sub("\\(.*", "", orders)), decreasing = T)))
##                   Var1 Freq
## 1    o:Actinomycetales 1729
## 2   o:Planctomycetales  824
## 3       o:Myxococcales  820
## 4 o:Sphingobacteriales  622
## 5         o:Bacillales  594
## 6        o:Rhizobiales  567
## 科 family
family <- unlist(lapply(l.taxon,  function(x){x[grep("f:", x)]}))
head(data.frame(sort(table(sub("\\(.*", "", family)), decreasing = T)))
##                  Var1 Freq
## 1 f:Planctomycetaceae  824
## 2  f:Chitinophagaceae  464
## 3 f:Streptomycetaceae  372
## 4 f:Gemmatimonadaceae  347
## 5  f:Xanthomonadaceae  327
## 6       f:Gaiellaceae  320
## 属 genus
genus <- unlist(lapply(l.taxon,  function(x){x[grep("g:", x)]})) 
head(data.frame(sort(table(sub("\\(.*", "", genus)), decreasing = T)))
##                                   Var1 Freq
## 1                                g:Gp6  523
## 2                       g:Streptomyces  353
## 3 g:Subdivision3_genera_incertae_sedis  348
## 4                       g:Gemmatimonas  347
## 5                            g:Gaiella  320
## 6                                g:Gp4  319

4 phylumレベルの解析

  1. phylumレベルのOTUテーブルの作成
  2. 信頼レベル0.97以上のOTUテーブルを取り出す。
  3. 頻度値に変換
  4. サンプルの種類
  5. サンプル別積み上げ頻度グラフ
# 1. phylumレベルのOTUテーブルの作成 ----
v.p <- unlist(lapply(l.taxon,  function(x){x[grep("p:", x)]}))
uv.p <- v.p[as.numeric(sub("\\)", "", sub(".*\\(", "", v.p))) >= 0.97]

v.p <- sub("\\(.*", "", v.p); v.p <- gsub("\"", "", v.p)
uv.p <- sub("\\(.*", "", uv.p); uv.p <- gsub("\"", "", uv.p); uv.p <- unique(uv.p)

phylum.dat <- cbind(v.phylum=v.p, cnt)

## check
phylum.dat[1:5,1:5]
##                            v.phylum ERR333581 ERR333582 ERR333583
## Otu232  p:Cyanobacteria/Chloroplast      3273       877      1085
## Otu612             p:Proteobacteria      1692       558       645
## Otu214  p:Cyanobacteria/Chloroplast      1343       407       486
## Otu115  p:Cyanobacteria/Chloroplast     10280         0      3542
## Otu4312 p:Cyanobacteria/Chloroplast       174        50        54
##         ERR333584
## Otu232          7
## Otu612       1737
## Otu214          6
## Otu115         53
## Otu4312         1
dim(phylum.dat)
## [1] 15400    73
# 2. 信頼レベル0.97以上のOTUテーブルを取り出す。----
up.cnt <- data.frame(matrix(NA, nrow=length(uv.p), ncol = 72))
names(up.cnt) <- names(phylum.dat)[-1]; rownames(up.cnt) <- uv.p

for(i in 1:length(uv.p)){
  up.cnt[i,] <- apply(phylum.dat[phylum.dat$v.phylum == uv.p[i], c(2:ncol(phylum.dat))], 2, sum)
}
## check
up.cnt[1:5,1:5]
##                             ERR333581 ERR333582 ERR333583 ERR333584
## p:Cyanobacteria/Chloroplast     93341     23790     29547       577
## p:Proteobacteria                22213      6320      7391     22623
## p:Actinobacteria                 7564      1838      2300      8257
## p:Bacteroidetes                   759       179       219       842
## p:Acidobacteria                   244        52       101       243
##                             ERR333585
## p:Cyanobacteria/Chloroplast     57612
## p:Proteobacteria                 5659
## p:Actinobacteria                 4681
## p:Bacteroidetes                   417
## p:Acidobacteria                   132
dim(up.cnt)
## [1] 23 72
# 3. 頻度値に変換 ----
up.dat <- data.frame(matrix(NA, nrow=length(uv.p), ncol = 72))
names(up.dat) <- names(up.cnt); rownames(up.dat) <- uv.p
for(i in 1:length(uv.p)){
  up.dat[i,] <- (up.cnt[i,]/apply(up.cnt, 2, sum))*100
}
## check
up.dat[1:5,1:5]
##                              ERR333581  ERR333582  ERR333583 ERR333584
## p:Cyanobacteria/Chloroplast 68.9861349 67.2889266 68.1183143  1.353920
## p:Proteobacteria            16.4171052 17.8758309 17.0393766 53.084450
## p:Actinobacteria             5.5903743  5.1986989  5.3024714 19.374897
## p:Bacteroidetes              0.5609590  0.5062933  0.5048875  1.975737
## p:Acidobacteria              0.1803347  0.1470796  0.2328477  0.570195
##                              ERR333585
## p:Cyanobacteria/Chloroplast 82.0193047
## p:Proteobacteria             8.0564335
## p:Actinobacteria             6.6641041
## p:Bacteroidetes              0.5936619
## p:Acidobacteria              0.1879218
# 4. サンプルの種類----
sc.names <- meta$scientific_name[match(names(up.dat), meta$run_accession)]
soil.names <- names(up.dat)[sc.names =="soil metagenome"]
root.names   <- names(up.dat)[sc.names =="root metagenome"]
leaf.names   <- names(up.dat)[sc.names =="leaf metagenome"]
synt.names  <- names(up.dat)[sc.names =="synthetic metagenome"]

# 5. サンプル別積み上げ頻度グラフ----
suppressMessages(suppressWarnings(library(RColorBrewer)))
phylum.col <- c(brewer.pal(12, "Set3"), brewer.pal(9, "Set1"), brewer.pal(4, "Dark2"))

par(mfrow=c(2,2))
barplot(as.matrix(up.dat[soil.names]), las=2, col = phylum.col, cex.names = 0.7)
barplot(as.matrix(up.dat[leaf.names]), las=2, col = phylum.col, cex.names = 0.7)
barplot(as.matrix(up.dat[root.names]), las=2, col = phylum.col, cex.names = 0.7)
plot(x = 1:200, y = 1:200, xaxt="n", yaxt="n", pch="", xlab=NA, ylab=NA, bty="n")
legend(20, 200, rownames(up.dat), col = phylum.col, pch=15, bty = "n", cex = 0.7)