require(vegan)
require(dplyr)
require(tidyr)
require(ggplot2)
require(stringr)
require(reshape2)
require(manipulate)
# require(gridExtra) # you could use with grid.arrange if you want to show several ggplot plots in one Figure,
########################################################
###### the color palette #####
########################################################
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7")
########################################################
###### show changes in OTUs between two conditions #####
########################################################
plotOTUChanges <- function(LongDataFrame){
LongDataFrame$OTU <- as.character(LongDataFrame$OTU)
LongDataFrame$Condition <- as.character(LongDataFrame$Condition)
Tr <- ggplot(LongDataFrame, aes(x = OTU, y = Value,
colour = Condition,
size = Condition,
shape = Condition))
Tr <- Tr + geom_point() +
facet_grid(AbundType ~., scales = "free") +
scale_size_discrete(range = c(6,4)) +
scale_colour_manual(values = c(cbPalette[1],cbPalette[2])) +
scale_shape_manual(values = c(16,17)) +
theme_bw(base_size = 18) +
theme(panel.grid.minor = element_blank()) +
# removes all minor gridlines
theme(panel.grid.major = element_line(colour = cbPalette[1],
size = 0.4)) +
# sets the grid lines thicker and in a new colour
theme(panel.background = element_rect(colour = cbPalette[1],
size = 2)) +
# adjust colour and size of the surrounding box
theme(panel.grid.major.y = element_blank()) +
# removes the horisontal major grid lines +
xlab("OTU") +
ylab("Abundance")
}
###############################################################
###### show changes in composition between two conditions #####
###############################################################
plotConditionChanges <- function(LongDataFrame){
LongDataFrame$OTU <- as.character(LongDataFrame$OTU)
LongDataFrame$Condition <- as.character(LongDataFrame$Condition)
Tr <- ggplot(LongDataFrame, aes(x = Condition, y = Value,
colour = OTU))
Tr <- Tr + geom_point(size = 5, alpha = .7) +
facet_grid(AbundType ~., scales = "free") +
scale_colour_manual(values = cbPalette[1:length(unique(LongDataFrame$OTU))]) +
theme_bw(base_size = 18) +
theme(panel.grid.minor = element_blank()) +
# removes all minor gridlines
theme(panel.grid.major = element_line(colour = cbPalette[1],
size = 0.4)) +
# sets the grid lines thicker and in a new colour
theme(panel.background = element_rect(colour = cbPalette[1],
size = 2)) +
# adjust colour and size of the surrounding box
theme(panel.grid.major.y = element_blank()) +
# removes the horisontal major grid lines +
xlab("Condition") +
ylab("Abundance")
}
###############################################################
###### function to show how well one OTU predicts the others #####
###############################################################
# predictor is just the OTU, so "A", "B" and so on.
# could be combined with manipulate but not in Markdown,see below and in text
plotPaulPredict <- function(WideDataFrame, Predictor){
WideDataFrame <- select(WideDataFrame, Condition_k, Condition_l, matches(Predictor))
names(WideDataFrame)[3] <- paste("Pred._from:",Predictor)
WideDataFrame$OTU <- LETTERS[1:nrow(WideDataFrame)]
#WideDataFrame <- melt(WideDataFrame, id.vars = "OTU")
WideDataFrame <- gather(WideDataFrame, Condition, Value, -OTU)
Tr <- ggplot(WideDataFrame, aes(x = OTU, y = Value,
colour = Condition,
size = Condition,
shape = Condition))
Tr <- Tr + geom_point() +
scale_size_discrete(range = c(6,4,7)) +
scale_colour_manual(values = c(cbPalette[1],cbPalette[2],cbPalette[3])) +
scale_shape_manual(values = c(16,17,4)) +
theme_bw(base_size = 18) +
theme(panel.grid.minor = element_blank()) +
# removes all minor gridlines
theme(panel.grid.major = element_line(colour = cbPalette[1],
size = 0.4)) +
# sets the grid lines thicker and in a new colour
theme(panel.background = element_rect(colour = cbPalette[1],
size = 2)) +
# adjust colour and size of the surrounding box
theme(panel.grid.major.y = element_blank()) +
# removes the horisontal major grid lines +
xlab("OTU") +
ylab("Rel. Abundance")
print(Tr)
}
## you can now do manipulate, but it does not work with this markdown
# manipulate(
# plotPaulPredict(PredAbund,Predictor),
# Predictor = picker("A","B","C","D","E"))
###############################################################
###### Functions to illustrate the prediction precision #####
###############################################################
illustratePrecision <- function(DF){
Tr <- ggplot(DF, aes(x = OTU, y = Ratio))
Tr <- Tr +
geom_hline(aes(yintercept = 1), colour = cbPalette[1], size = 1.5) +
geom_point(size = 6, colour = cbPalette[2]) +
theme_bw(base_size = 18) +
theme(panel.grid.minor = element_blank()) +
# removes all minor gridlines
theme(panel.grid.major = element_line(colour = cbPalette[1],
size = 0.4)) +
# sets the grid lines thicker and in a new colour
theme(panel.background = element_rect(colour = cbPalette[1],
size = 2)) +
# adjust colour and size of the surrounding box
theme(panel.grid.major.y = element_blank()) +
# removes the horisontal major grid lines +
xlab("predictor OTU") +
ylab("Ratio_PredictedToRealDifferences")
print(Tr)
}
###############################################################
###### Functions to illustrate the power measures #####
###############################################################
illustratePowers <- function(DF){
Tr <- ggplot(DF, aes(x = OTU, y = value, color = Measure,
shape = Measure, size = Measure))
Tr <- Tr +
geom_hline(aes(yintercept = 0), colour = cbPalette[1], size = 1.5) +
geom_point() +
scale_size_discrete(range = c(6,4)) +
scale_colour_manual(values = c(cbPalette[1],cbPalette[2])) +
scale_shape_manual(values = c(16,17)) +
theme_bw(base_size = 18) +
theme(panel.grid.minor = element_blank()) +
# removes all minor gridlines
theme(panel.grid.major = element_line(colour = cbPalette[1],
size = 0.4)) +
# sets the grid lines thicker and in a new colour
theme(panel.background = element_rect(colour = cbPalette[1],
size = 2)) +
# adjust colour and size of the surrounding box
theme(panel.grid.major.y = element_blank()) +
# removes the horisontal major grid lines +
xlab("predicted OTU")
print(Tr)
}
###############################################################
###### function to predict the relative abundance of OTU j in
# condition l from the relative abundances of OTU i in condition l
# and condtion k #####
###############################################################
# so k and l are the vectors of relative abundances in these conditions
# i and j are the indices of the OTUs of interst.
predictAbundance <- function( k, l, i, j ){
if( i == j ){
l[i]
}
else {
k[j]*(1 - l[i])/(1 - k[i])
}
}
###################################################
##### Jensen Shannon Distance #####
###################################################
dist.JSD <- function(inMatrix, pseudocount=0.000001, ...) {
KLD <- function(x,y) sum(x *log(x/y))
JSD<- function(x,y) sqrt(0.5 * KLD(x, (x+y)/2) + 0.5 * KLD(y, (x+y)/2))
matrixColSize <- length(colnames(inMatrix))
matrixRowSize <- length(rownames(inMatrix))
colnames <- colnames(inMatrix)
resultsMatrix <- matrix(0, matrixColSize, matrixColSize)
inMatrix = apply(inMatrix,1:2,function(x) ifelse (x==0,pseudocount,x))
for(i in 1:matrixColSize) {
for(j in 1:matrixColSize) {
resultsMatrix[i,j]=JSD(as.vector(inMatrix[,i]),
as.vector(inMatrix[,j]))
}
}
colnames -> colnames(resultsMatrix) -> rownames(resultsMatrix)
as.dist(resultsMatrix)->resultsMatrix
attr(resultsMatrix, "method") <- "dist"
return(resultsMatrix)
}
Always a comparison between two conditions. k is the index of the first condition and l of the second. The predictor OTU is i, the predicted OTU is j. Then the predicted rel abundance of j in condition l is (knowing the rel abundance of i in both k and l)
\[ \bar{x}_{jl} = x_{jk} \frac{1-x_{il}}{1-x_{ik}} \]
The equation is directly intuitiv: You assume that no OTU except the predictor changes. That means the relations among the other OTUs must remain the same (i.e. they are proportional). Thus, you just multiply the relative abundances of the predicted OTUs in condition k with the fraction they together make up in the new condition l, divided by the fraction they made up in condition k.
The next step is to compare the predicted relative abundances of condition l with the real relative abundances of condition l. I.e. does the prediction from the one OTU bring the composition closer to the real compositon. Closer, so there is a distance measure, we should test bray-curtis, Jensen-Shannon distance
In an iterative method it would be: - you have the two distributions of condition k and l - calculate for each OTU the prediction for condition l - find the OTU whose prediction gets closest to the composition of condition l - if there is a significant improvement, use the prediction as the new condition k
Further Definitions:
# Condition1 <- data.frame(OTU = c("A", "B", "C", "D", "E"),
# absolute = c(5E8, 3.5E8, 2E8, 1E8, .2E8))
# Condition2 <- data.frame(OTU = c("A", "B", "C", "D", "E"),
# absolute = c(5E8, 2E8, 3.5E8, 1E8, .2E8))
Condition1 <- data.frame(OTU = c("A", "B", "C", "D", "E"),
absolute = c(5E8, 2.5E8, 2E8, 1E8, .2E8))
Condition2 <- data.frame(OTU = c("A", "B", "C", "D", "E"),
absolute = c(10E8, 2.5E8, 2E8, 1E8, .2E8))
Tr
## A B C D E Condition_k
## A 0.63694268 0.51266118 0.50150084 0.48263182 0.47012436 0.46728972
## B 0.15923567 0.15923567 0.25075042 0.24131591 0.23506218 0.23364486
## C 0.12738854 0.20506447 0.12738854 0.19305273 0.18804974 0.18691589
## D 0.06369427 0.10253224 0.10030017 0.06369427 0.09402487 0.09345794
## E 0.01273885 0.02050645 0.02006003 0.01930527 0.01273885 0.01869159
## Condition_l
## A 0.63694268
## B 0.15923567
## C 0.12738854
## D 0.06369427
## E 0.01273885
plotPaulPredict(PredAbund,Predictor)
In this case, there are no compositional effects here (B goes really down, C really up, as their changes balance, no compositional effects). So all predictors should be bad.
I tested several methods here:
And the visualisation of this prescision measure:
illustratePrecision(Ratiodf)
Both for Euclidean and Bray Curtis Distance I fist use the vegan package. The problem with the vegdist() function is, that it calculates pairwise distances. I want to only calculate the distance to condition l and maybe k.
T_RA_Abundances <- t(RA_Abundances)
# the transformed data.frame where rows are samples and colums the OTUs
Matrix_RA_Abundances <- data.matrix(T_RA_Abundances)
DistEucl <- vegdist(Matrix_RA_Abundances, method = 'euclidean')
DistEucl <- as.matrix(DistEucl)
DistTo_k <- DistEucl[nrow(DistEucl)-1,]
DistTo_l <- DistEucl[nrow(DistEucl),]
# Since I want the data later in a nice data frame, I first
PredAReal_Dists <- data.frame(OTU = names(DistTo_l), Pred_Dists =
DistTo_l)
PredAReal_Dists <- PredAReal_Dists[-nrow(PredAReal_Dists),]
# removes the condition_l (distances always 0)
# Note again, you should not compare these distances to the real distance, but to the real distance with the distance from the predicting strain out.
# you can do this as follows
MatrixAllk <- t(replicate(5,Matrix_RA_Abundances[nrow(Matrix_RA_Abundances)-1,]))
diag(MatrixAllk) <- Matrix_RA_Abundances[nrow(Matrix_RA_Abundances),]
Matrix_RA_Abundances_ToComp <- Matrix_RA_Abundances
Matrix_RA_Abundances_ToComp[1:(nrow(Matrix_RA_Abundances)-2),] <- MatrixAllk
DistEucl_TC <- vegdist(Matrix_RA_Abundances_ToComp, method = 'euclidean')
DistEucl_TC <- as.matrix(DistEucl_TC)
DistTo_l_TC <- DistEucl_TC[nrow(DistEucl_TC),]
PredAReal_Dists$Real_Dists <- DistTo_l_TC[1:(length(DistTo_l_TC)-1)]
#The precision measure is the ratio of the Pred_Diffs to the Real_Diffs
PredAReal_Dists <- transform(PredAReal_Dists, PredRealRatio = Pred_Dists/Real_Dists)
PredAReal_Dists
## OTU Pred_Dists Real_Dists PredRealRatio
## A A 0.0000000 0.1000077 0.0000000
## B B 0.1518162 0.1823374 0.8326113
## C C 0.1676694 0.1877236 0.8931715
## D D 0.1868259 0.1946736 0.9596880
## E E 0.1953912 0.1968457 0.9926111
## Condition_k Condition_k 0.1969357 0.1969357 1.0000000
# ########################################
# #Calculate te distance on your own ######
# ########################################
#
# DistTo_k_S <- sqrt(colSums((RA_Abundances-RA_Abundances$Condition_k)^2))
# DistTo_l_S <- sqrt(colSums((RA_Abundances-RA_Abundances$Condition_l)^2))
# #all.equal(DistTo_k, DistTo_k_S)
# #all.equal(DistTo_l, DistTo_l_S)
#
# # Since I want the data later in a nice data frame, I first
# PredAReal_DistsS <- data.frame(OTU = names(DistTo_l_S), Pred_Dists =
# DistTo_l_S)
# PredAReal_DistsS <- PredAReal_DistsS[-nrow(PredAReal_DistsS),]
#
# Allk_TC <- replicate(nrow(RA_Abundances), RA_Abundances$Condition_k)
# colnames(Allk_TC) <- LETTERS[1:nrow(relativeAbundances)]
#
# diag(Allk_TC) <- RA_Abundances$Condition_l
#
# DistTo_l_TCS <- sqrt(colSums((Allk_TC-RA_Abundances$Condition_l)^2))
#
# PredAReal_DistsS$Real_Dists <- c(DistTo_l_TCS, DistTo_l_S[length(DistTo_l_S)-1])
#
# #The precision measure is the ratio of the Pred_Diffs to the Real_Diffs
#
# PredAReal_DistsS <- transform(PredAReal_DistsS, PredRealRatio = Pred_Dists/Real_Dists)
And the visualisation of this prescision measure:
illustratePrecision(Ratiodf)
T_RA_Abundances <- t(RA_Abundances)
# the transformed data.frame where rows are samples and colums the OTUs
Matrix_RA_Abundances <- data.matrix(T_RA_Abundances)
DistBray <- vegdist(Matrix_RA_Abundances, method = 'bray')
DistBray <- as.matrix(DistBray)
DistTo_k <- DistBray[nrow(DistBray)-1,]
DistTo_l <- DistBray[nrow(DistBray),]
# Since I want the data later in a nice data frame, I first
PredAReal_Dists <- data.frame(OTU = names(DistTo_l), Pred_Dists =
DistTo_l)
PredAReal_Dists <- PredAReal_Dists[-nrow(PredAReal_Dists),]
# removes the condition_l (distances always 0)
# Note again, you should not compare these distances to the real distance, but to the real distance with the distance from the predicting strain out.
# you can do this as follows
MatrixAllk <- t(replicate(nrow(relativeAbundances),relativeAbundances[,1]))
diag(MatrixAllk) <- relativeAbundances[,2]
Matrix_RA_Abundances_ToComp <- Matrix_RA_Abundances
Matrix_RA_Abundances_ToComp[1:nrow(relativeAbundances),] <- MatrixAllk
DistBray_TC <- vegdist(Matrix_RA_Abundances_ToComp, method = 'bray')
DistBray_TC <- as.matrix(DistBray_TC)
DistTo_l_TC <- DistBray_TC[nrow(DistBray_TC),]
PredAReal_Dists$Real_Dists <- DistTo_l_TC[1:(length(DistTo_l_TC)-1)]
#The precision measure is the ratio of the Pred_Diffs to the Real_Diffs
PredAReal_Dists <- transform(PredAReal_Dists, PredRealRatio = Pred_Dists/Real_Dists)
########################################
#Calculate the distance on your own ######
########################################
# DistTo_k_S <- colSums(abs(RA_Abundances-RA_Abundances$Condition_k))/colSums(
# RA_Abundances+RA_Abundances$Condition_k)
# DistTo_l_S <- colSums(abs(RA_Abundances-RA_Abundances$Condition_l))/colSums(
# RA_Abundances+RA_Abundances$Condition_l)
#
# #all.equal(DistTo_k, DistTo_k_S)
# #all.equal(DistTo_l, DistTo_l_S)
#
# # Since I want the data later in a nice data frame, I first
# PredAReal_DistsS <- data.frame(OTU = names(DistTo_l_S), Pred_Dists =
# DistTo_l_S)
# PredAReal_DistsS <- PredAReal_DistsS[-nrow(PredAReal_DistsS),]
#
# Allk_TC <- replicate(nrow(RA_Abundances), RA_Abundances$Condition_k)
# colnames(Allk_TC) <- LETTERS[1:nrow(relativeAbundances)]
#
# diag(Allk_TC) <- RA_Abundances$Condition_l
#
# DistTo_l_TCS <- colSums(abs(Allk_TC-RA_Abundances$Condition_l))/colSums(
# Allk_TC+RA_Abundances$Condition_l)
#
# PredAReal_DistsS$Real_Dists <- c(DistTo_l_TCS, DistTo_l_S[length(DistTo_l_S)-1])
#
# #The precision measure is the ratio of the Pred_Diffs to the Real_Diffs
#
# PredAReal_DistsS <- transform(PredAReal_DistsS, PredRealRatio = Pred_Dists/Real_Dists)
And the visualisation of this prescision measure:
illustratePrecision(Ratiodf)
(http://stackoverflow.com/questions/11226627/jensen-shannon-divergence-in-r) and of course (http://enterotype.embl.de/enterotypes.html) This gives the function dist.JSD, that I copied, but I also did my own again
# # Since dist.JSD works with a matrix where the samples are columns I do not have to transform here.
# Matrix_RA_Abundances <- data.matrix(RA_Abundances)
#
# DistJS <- dist.JSD(Matrix_RA_Abundances)
# # I was surprised it produced some NaNs, but I could not find a mistake in the
# # function
# # note the function directly produces an object of class dist
# DistJS <- as.matrix(DistJS)
#
# DistTo_l <- DistJS[nrow(DistJS),]
#
#
# # Since I want the data later in a nice data frame, I first
# PredAReal_Dists <- data.frame(OTU = names(DistTo_l), Pred_Dists =
# DistTo_l)
# PredAReal_Dists <- PredAReal_Dists[-nrow(PredAReal_Dists),]
# # removes the condition_l (distances always 0)
#
# # Note again, you should not compare these distances to the real distance, but to the real distance with the distance from the predicting strain out.
#
#
# MatrixAllk <- replicate(nrow(relativeAbundances),relativeAbundances[,1])
#
# diag(MatrixAllk) <- relativeAbundances[,2]
#
# Matrix_RA_Abundances_ToComp <- Matrix_RA_Abundances
# Matrix_RA_Abundances_ToComp[,1:nrow(relativeAbundances)] <- MatrixAllk
#
# DistJS_TC <- dist.JSD(Matrix_RA_Abundances_ToComp)
# DistJS_TC <- as.matrix(DistJS_TC)
#
# DistTo_l_TC <- DistJS_TC[nrow(DistJS_TC),]
#
# PredAReal_Dists$Real_Dists <- DistTo_l_TC[1:(length(DistTo_l_TC)-1)]
#
# #The precision measure is the ratio of the Pred_Diffs to the Real_Diffs
#
# PredAReal_Dists <- transform(PredAReal_Dists, PredRealRatio = Pred_Dists/Real_Dists)
########################################
#Calculate the distance on your own ######
########################################
# m <- 0.5 * (p + q)
# my p is the predictions, my q is condition l
Ms <- .5*(RA_Abundances+RA_Abundances$Condition_l)
JS <- 0.5 * (colSums(RA_Abundances * log(RA_Abundances / Ms)) + colSums(RA_Abundances$Condition_l * log(RA_Abundances$Condition_l / Ms)))
JS = ifelse(JS<0,0,JS) # Sometimes 0 is given as -3...e-17
DistTo_l_S <- sqrt(JS)
#all.equal(DistTo_l, DistTo_l_S)
# So this method really works
# Since I want the data later in a nice data frame, I first
PredAReal_DistsS <- data.frame(OTU = names(DistTo_l_S), Pred_Dists =
DistTo_l_S)
PredAReal_DistsS <- PredAReal_DistsS[-nrow(PredAReal_DistsS),]
Allk_TC <- replicate(nrow(RA_Abundances), RA_Abundances$Condition_k)
colnames(Allk_TC) <- LETTERS[1:nrow(relativeAbundances)]
diag(Allk_TC) <- RA_Abundances$Condition_l
Ms_TC <- .5*(Allk_TC+RA_Abundances$Condition_l)
JS_TC <- 0.5 * (colSums(Allk_TC * log(Allk_TC / Ms_TC)) + colSums(RA_Abundances$Condition_l * log(RA_Abundances$Condition_l / Ms_TC)))
DistTo_l_TCS <- sqrt(JS_TC)
PredAReal_DistsS$Real_Dists <- c(DistTo_l_TCS, DistTo_l_S[length(DistTo_l_S)-1])
#The precision measure is the ratio of the Pred_Diffs to the Real_Diffs
PredAReal_DistsS <- transform(PredAReal_DistsS, PredRealRatio = Pred_Dists/Real_Dists)
And the visualisation of this prescision measure:
illustratePrecision(Ratiodf)
Then do the same thing completely with the Zeller Matrix.
if(file.exists("Zeller.Rda")){
load("Zeller.Rda")
}
# I decide from now on I define an Abundance Table with the OTUs as rows and so
# samples as columns. Since Zeller is currently the other way around, I call it
# T_AT
T_AT <- Zeller2014AbundTable
Samples <- T_AT$Sample
Groups <- T_AT$Groups
T_AT <- T_AT[-2:-1]
AT <- as.data.frame(t(T_AT))
colnames(AT) <- Samples
# choosing the two samples to compare
k <- 5
l <- 14
AT_sub <- select(AT, k, l)
# I chose these because they are control and cancer and because they are very different in UNMAPPED
# renaming the samples and OTUs
colnames(AT_sub) <- c('k', 'l')
rownames(AT_sub) <- c('UM', paste('OTU_', 1:300, sep= ''))
# calculating the predicted relative abundances for each predictor.
pred_RAs <- sapply(seq(nrow(AT_sub)), function(i) sapply(seq(nrow(AT_sub)),
function(j) predictAbundance(AT_sub[,1], AT_sub[,2], i, j)))
# with 301 OTUs, pred_RAs is a 301x301 matrix, each column is the predicted composition predicted from one OTU
colnames(pred_RAs) <- rownames(AT_sub)
# to test pred_RAs-AT_sub$l should be 0 on the diagonal
# diag(pred_RAs-AT_sub$l) # indeed all 0
# turn into data frame and add the k and l compositions
pred_RAs <- as.data.frame(pred_RAs)
# names(PredAbund) <- LETTERS[1:ncol(PredAbund)]
pred_RAs$k <- AT_sub$k
pred_RAs$l <- AT_sub$l
# pred_RAs is now a data.frame, columns 1-301 are the predicted compositons predicted from each OTU, then there is the k and l compositions.
# This data frame will be used to calculate the distances
And the visualisation of this prescision measure:
illustratePrecision(Ratiodf)