require(vegan)
require(dplyr)
require(tidyr)
require(ggplot2)
require(stringr)
require(reshape2)
require(manipulate)
# require(gridExtra) # you could use with grid.arrange if you want to show several ggplot plots in one Figure,
########################################################
###### the color palette #####
########################################################
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7")
###############################################################
###### function to predict the relative abundance of OTU j in
# condition l from the relative abundances of OTU i in condition l
# and condtion k #####
###############################################################
# INPUT: k and l are the vectors of relative abundances at two conditions.
# i and j are the indices of the OTUs of interest.
# OUTPUT: the predicted abundance of OTU j in condition
# l
predictAbundance <- function( k, l, i, j ){
if( i == j ){
l[i]
}
else {
k[j]*(1 - l[i])/(1 - k[i])
}
}
###############################################################
###### function to calculate all predicted abundances, predicted
# from each OTU for each OTU >> matrix/data frame #####
###############################################################
# INPUT: AbTable_sub is an abundance table subset of two conditions, i.e.
# a data.frame with two variables k and l (the two compositions), rownames(AbTable_sup) should be the OTUs, e.g. OTU_1 to OTU_n, alternatively for the
# simulations, there can be an OTU column
# OUTPUT: pred_RAs is a data.frame. The columns are the predicted compositions
# using each OTU as a predictor. In addition, the last two colums are k and l
# from AbTable_sub. pred_RAs can be used as input of the distance functions.
calc_pred_RAs <- function(AbTable_sub) {
# to make sure OTUs are rownames and not the extra column
if ('OTU' %in% colnames(AbTable_sub)) {
rownames(AbTable_sub) <- AbTable_sub$OTU
AbTable_sub <- select(AbTable_sub, k, l)
}
pred_RAs <- sapply(seq(nrow(AbTable_sub)), function(i)
sapply(seq(nrow(AbTable_sub)),
function(j) predictAbundance(AbTable_sub$k, AbTable_sub$l, i, j)))
# calculates the predicted relative Abundance for each OTU as predictor.
# for n OTUs you get a nxn matrix
colnames(pred_RAs) <- rownames(AbTable_sub)
rownames(pred_RAs) <- rownames(AbTable_sub)
# turn into data frame and add the k and l compositions
pred_RAs <- as.data.frame(pred_RAs)
pred_RAs$k <- AbTable_sub$k
pred_RAs$l <- AbTable_sub$l
# Thus, for nOTUs pred_RAs is a data.frame with n+2 variables.
return(pred_RAs)
}
###############################################################
###### function to produce a data frame where each column is
# composition k only the abundance of the "predictor OTU" is
# from composition l. This to calculate the real distances
# between composition k and l without contribuition from the
# predictor OTU #####
###############################################################
# INPUT: AbTable_sub is an abundance table subset of two conditions, i.e.
# a data.frame with two variables k and l (the two compositions), rownames(AbTable_sup) should be the OTUs, e.g. OTU_1 to OTU_n, alternatively for the
# simulations, there can be an OTU column
# OUTPUT: All_k_Predictor_l is a similar data.frame as pred_RAs. Each column is
# the composition k but the predictor is l. k and l are the last two columns.
# can be used as input of the distance functions to calculate the real distances
# between composition k and l with the contribution of the predictor removed
make_AllkPredictorl <- function(AbTable_sub) {
# to make sure OTUs are rownames and not the extra column
if ('OTU' %in% colnames(AbTable_sub)) {
rownames(AbTable_sub) <- AbTable_sub$OTU
AbTable_sub <- select(AbTable_sub, k, l)
}
All_k_Predictor_l <- replicate(nrow(AbTable_sub), AbTable_sub$k)
# a square matrix, each column is composition k
colnames(All_k_Predictor_l) <- rownames(AbTable_sub)
rownames(All_k_Predictor_l) <- rownames(AbTable_sub)
diag(All_k_Predictor_l) <- AbTable_sub$l
# changes the "predictor" to the value of condition l
All_k_Predictor_l <- as.data.frame(All_k_Predictor_l)
All_k_Predictor_l$k <- AbTable_sub$k
All_k_Predictor_l$l <- AbTable_sub$l
return(All_k_Predictor_l)
}
###############################################################
###### combine calc_pred_RAs and make_AllkPredictorl #####
###############################################################
# INPUT: AbTable_sub is an abundance table subset of two conditions, i.e.
# a data.frame with two variables k and l (the two compositions), rownames(AbTable_sup) should be the OTUs, e.g. OTU_1 to OTU_n
# OUTPUT: Predicts is a list with the data.frames pred_RAs and
# All_k_Predictor_l
predictions <- function(AbTable_sub) {
pred_RAs <- calc_pred_RAs(AbTable_sub)
All_k_Predictor_l <- make_AllkPredictorl(AbTable_sub)
Predicts <- list(pred_RAs, All_k_Predictor_l)
return(Predicts)
}
#################################################
### Generate data.frames from simulated examples##
##################################################
# INPUT: the vectors k and l containing the absolute abundances
# of condition k and l
# OUTPUT: SimDFs, the three data frames WideDF (absolute and realtive
# abundances), AT_Sim the relative Abundance Table of the two conditions,
# LongDF, the long version of WideDF for plotting
GenerateSimDFs <- function(k,l){
WideDF <- data.frame(OTU = LETTERS[1:length(k)], absolute_k = k, absolute_l = l, relative_k = k/sum(k), relative_l =l/sum(l))
AT_Sim <- select(WideDF, OTU, k = relative_k, l = relative_l)
LongDF <- gather(WideDF, Condition, Value, -OTU)
LongDF <- separate(LongDF, Condition,
into = c('AbundType', 'Condition'))
SimDFs <- list(WideDF, AT_Sim, LongDF)
return(SimDFs)
}
### ATTENTION for each distance I always generate two functions.
### One that uses the input data.frame to calculate the distance.
### one that uses the Predicts input list, runs the basic functions
### two times and directly gives you the final PredAReal_DistCompare data.frame
###############################################################
###### sum of absolute differences for each OTU #####
###############################################################
# INPUT: for the basic the df such as pred_RAs, for the follow up
# the list Predicts with the data frames
# OUTPUT: The Distance vector for the basic, PredReal_DistComp for
# the follow up.
### BASIC
absoluteDiff <- function(df){
DistsTo_l <- colSums(abs(df-df$l))
return(DistsTo_l)
}
### FOLLOW UP
absoluteDiffs <- function(Predicts){
DistsPred <- absoluteDiff(Predicts[[1]])
DistsReal <- absoluteDiff(Predicts[[2]])
PredReal_DistComp <- data.frame(OTU = names(DistsPred), Pred_Dists =
DistsPred, Real_Dists = DistsReal)
if (!isTRUE(all.equal(0,
PredReal_DistComp$Pred_Dists[
nrow(PredReal_DistComp)],
PredReal_DistComp$Real_Dists[nrow(PredReal_DistComp)]))) {
stop("predicted or real distance to l was not zero")
}
PredReal_DistComp <- PredReal_DistComp[-nrow(PredReal_DistComp),]
# removes the l row because it is always 0 (tested above)
PredReal_DistComp <- transform(PredReal_DistComp, PredReal_Ratio =
Pred_Dists/Real_Dists)
return(PredReal_DistComp)
}
########################################
###### Euclidean Distance #####
#######################################
# INPUT: for the basic the df such as pred_RAs, for the follow up
# the list Predicts with the data frames
# OUTPUT: The Distance vector for the basic, PredReal_DistComp for
# the follow up.
### BASIC
EuclDist <- function(df){
DistsTo_l <- sqrt(colSums((df-df$l)^2))
return(DistsTo_l)
}
### FOLLOW UP
EuclDists <- function(Predicts){
DistsPred <- EuclDist(Predicts[[1]])
DistsReal <- EuclDist(Predicts[[2]])
PredReal_DistComp <- data.frame(OTU = names(DistsPred), Pred_Dists =
DistsPred, Real_Dists = DistsReal)
if (!isTRUE(all.equal(0,
PredReal_DistComp$Pred_Dists[
nrow(PredReal_DistComp)],
PredReal_DistComp$Real_Dists[nrow(PredReal_DistComp)]))) {
stop("predicted or real distance to l was not zero")
}
PredReal_DistComp <- PredReal_DistComp[-nrow(PredReal_DistComp),]
# removes the l row because it is always 0 (tested above)
PredReal_DistComp <- transform(PredReal_DistComp, PredReal_Ratio =
Pred_Dists/Real_Dists)
return(PredReal_DistComp)
}
########################################
###### Bray-Curtis Distance #####
#######################################
# ATTENTION: For the equation see ?vegdist() from the vegan package
# INPUT: for the basic the df such as pred_RAs, for the follow up
# the list Predicts with the data frames
# OUTPUT: The Distance vector for the basic, PredReal_DistComp for
# the follow up.
### BASIC
BrayCurtisDist <- function(df){
DistsTo_l <- colSums(abs(df-df$l))/colSums(df+df$l)
return(DistsTo_l)
}
### FOLLOW UP
BrayCurtisDists <- function(Predicts){
DistsPred <- BrayCurtisDist(Predicts[[1]])
DistsReal <- BrayCurtisDist(Predicts[[2]])
PredReal_DistComp <- data.frame(OTU = names(DistsPred), Pred_Dists =
DistsPred, Real_Dists = DistsReal)
if (!isTRUE(all.equal(0,
PredReal_DistComp$Pred_Dists[
nrow(PredReal_DistComp)],
PredReal_DistComp$Real_Dists[nrow(PredReal_DistComp)]))) {
stop("predicted or real distance to l was not zero")
}
PredReal_DistComp <- PredReal_DistComp[-nrow(PredReal_DistComp),]
# removes the l row because it is always 0 (tested above)
PredReal_DistComp <- transform(PredReal_DistComp, PredReal_Ratio =
Pred_Dists/Real_Dists)
return(PredReal_DistComp)
}
########################################
###### Jensen-Shannon Distance #####
#######################################
# ATTENTION: For the equation see:
# http://stackoverflow.com/questions/11226627/jensen-shannon-divergence-in-r
# the equation the guy came self up with plus the link to the enterotypes EMBL.
# wikipedia
# INPUT: for the basic the df such as pred_RAs, for the follow up
# the list Predicts with the data frames
# OUTPUT: The Distance vector for the basic, PredReal_DistComp for
# the follow up.
### BASIC
JSDist <- function(df){
# since it does not work with 0, I add a pseudocount, that was also
# added in the enterotype paper
pseudocount <- 0.000001
df <- df + pseudocount
Ms <- .5*(df+df$l)
JS <- 0.5 * (colSums(df * log(df/Ms)) + colSums(df$l * log(df$l/Ms)))
JS = ifelse(JS<0,0,JS) # Sometimes 0 is given as -3...e-17, then
# the following sqrt would give a NaN warning message
DistsTo_l <- sqrt(JS)
return(DistsTo_l)
}
### FOLLOW UP
JSDists <- function(Predicts){
DistsPred <- JSDist(Predicts[[1]])
DistsReal <- JSDist(Predicts[[2]])
PredReal_DistComp <- data.frame(OTU = names(DistsPred), Pred_Dists =
DistsPred, Real_Dists = DistsReal)
if (!isTRUE(all.equal(0,
PredReal_DistComp$Pred_Dists[
nrow(PredReal_DistComp)],
PredReal_DistComp$Real_Dists[nrow(PredReal_DistComp)]))) {
stop("predicted or real distance to l was not zero")
}
PredReal_DistComp <- PredReal_DistComp[-nrow(PredReal_DistComp),]
# removes the l row because it is always 0 (tested above)
PredReal_DistComp <- transform(PredReal_DistComp, PredReal_Ratio =
Pred_Dists/Real_Dists)
return(PredReal_DistComp)
}
###############################################################
###### Illustrate the OTU changes between condition k and l #####
###############################################################
# INPUT: AbTable_sub, so the subsetted Abundance Table, a data frame with
# the columns k and l
plotOTUChanges <- function(AbTable_sub){
# make sure you have the OTUs as variable and not just as rownames
if (! 'OTU' %in% colnames(AbTable_sub)) {
AbTable_sub$OTU <- rownames(AbTable_sub)
}
# the size of the dots and the writing depends on how many OTUs there
# are
if (nrow(AbTable_sub) < 20) {
size = c(6,4)
sizetextx = 18
} else if((nrow(AbTable_sub) > 19) & (nrow(AbTable_sub) < 100)) {
size = c(4,2)
sizetextx = 10
} else if (nrow(AbTable_sub) > 99) {
size = c(3,1.5)
sizetextx = 6
}
AbTable_sub$OTU <- factor(AbTable_sub$OTU)
LevelsWant <- rev(as.character(AbTable_sub$OTU)) #you had to reverse
for (i in 1:length(LevelsWant)) {
AbTable_sub$OTU <- relevel(AbTable_sub$OTU, ref = LevelsWant[i])
}
AbTable_sub_long <- gather(AbTable_sub, Condition, RA, -OTU)
#AbTable_sub_long$Condition <- as.character(AbTable_sub_long$Condition)
Tr <- ggplot(AbTable_sub_long, aes(x = OTU, y = RA,
colour = Condition,
size = Condition,
shape = Condition))
Tr <- Tr + geom_point() +
scale_size_discrete(range = size) +
scale_colour_manual(values = c(cbPalette[1],cbPalette[2])) +
scale_shape_manual(values = c(16,17)) +
#scale_x_discrete(expand = c(0.01,.01)) +
theme_bw(base_size = 18) +
theme(panel.grid.minor = element_blank()) +
# removes all minor gridlines
theme(panel.grid.major = element_line(colour = cbPalette[1],
size = 0.2)) +
# sets the grid lines thicker and in a new colour
theme(panel.background = element_rect(colour = cbPalette[1],
size = .5)) +
# adjust colour and size of the surrounding box
theme(panel.grid.major.y = element_blank()) +
# removes the horisontal major grid lines +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5,
size = sizetextx)) +
xlab("") +
ylab("relative Abundance") +
labs(colour = "", size = "", shape = "")
return(Tr)
}
###############################################################
###### Illustrate the prediction precision #####
###############################################################
# INPUT: usually PredReal_DistComp, so just a data frame with
# the columns OTU and PredReal_Ratio
illustratePrecision <- function(DF){
# the size of the dots and the writing depends on how many OTUs there
# are
if (nrow(DF) < 20) {
size = 6
sizetextx = 18
} else if((nrow(DF) > 19) & (nrow(DF) < 100)) {
size = 4
sizetextx = 14
} else if (nrow(DF) > 99) {
size = 3
sizetextx = 6
}
# get the order of the OTU right. OTU is used as a factor and levels
# in R are alphabetically. But you want the OTUs in the same order as
# they are in DF (rownames)
LevelsWant <- rev(as.character(DF$OTU)) #you had to reverse
for (i in 1:length(LevelsWant)) {
DF$OTU <- relevel(DF$OTU, ref = LevelsWant[i])
}
Tr <- ggplot(DF, aes(x = OTU, y = PredReal_Ratio))
Tr <- Tr +
geom_hline(aes(yintercept = 1), colour = cbPalette[1], size = 1.2) +
geom_point(size = size, colour = cbPalette[2]) +
theme_bw(base_size = 18) +
theme(panel.grid.minor = element_blank()) +
# removes all minor gridlines
theme(panel.grid.major = element_line(colour = cbPalette[1],
size = 0.2)) +
# sets the grid lines thicker and in a new colour
theme(panel.background = element_rect(colour = cbPalette[1],
size = 2)) +
# adjust colour and size of the surrounding box
theme(panel.grid.major.y = element_blank()) +
# removes the horisontal major grid lines +
theme(axis.text.x = element_text(angle = 90, vjust = .5, size = sizetextx)) +
xlab("") +
ylab("PredReal_DistanceRatio")
return(Tr)
}
###############################################################
###### Show Changes Relative vs Absolute for simulations #####
###############################################################
plotSimAbsRel <- function(LongDataFrame){
LongDataFrame$OTU <- as.character(LongDataFrame$OTU)
LongDataFrame$Condition <- as.character(LongDataFrame$Condition)
# do I need this?
Tr <- ggplot(LongDataFrame, aes(x = OTU, y = Value,
colour = Condition,
size = Condition,
shape = Condition))
Tr <- Tr + geom_point() +
facet_grid(AbundType ~., scales = "free") +
scale_size_discrete(range = c(6,4)) +
scale_colour_manual(values = c(cbPalette[1],cbPalette[2])) +
scale_shape_manual(values = c(16,17)) +
theme_bw(base_size = 18) +
theme(panel.grid.minor = element_blank()) +
# removes all minor gridlines
theme(panel.grid.major = element_line(colour = cbPalette[1],
size = 0.4)) +
# sets the grid lines thicker and in a new colour
theme(panel.background = element_rect(colour = cbPalette[1],
size = 2)) +
# adjust colour and size of the surrounding box
theme(panel.grid.major.y = element_blank()) +
# removes the horisontal major grid lines +
xlab("OTU") +
ylab("Abundance") +
labs(colour = "", size = "", shape = "")
return(Tr)
}
###############################################################
###### Plot Predictions dependent on Predictor #####
###############################################################
# INPUT: The data frame where each column is a prediction, last two columns are
# k and l. E.g. pred_RAs, so the outcome of calc_pred_RAs, or Predicts[[1]],
# being the outcome of the predictions function. Further you have to give the
# predictor, e.g. "A" or "OTU1"
# ATTENTION: This plot is nicest when combiend with manipulate
plotPaulPredict <- function(pred_RAs, Predictor){
rownames(pred_RAs) <- colnames(pred_RAs)[1:(ncol(pred_RAs)-2)]
DF <- select(pred_RAs, k, l, matches(Predictor))
names(DF)[3] <- paste("from:",Predictor)
DF$OTU <- rownames(DF)
if (nrow(DF) < 20) {
size = c(6,4,6)
sizetextx = 18
} else if((nrow(AbTable_sub) > 19) & (nrow(AbTable_sub) < 100)) {
size = c(4,2,4)
sizetextx = 10
} else if (nrow(AbTable_sub) > 99) {
size = c(3,1.5, 3)
sizetextx = 6
}
DF <- gather(DF, Condition, Value, -OTU)
Tr <- ggplot(DF, aes(x = OTU, y = Value,
colour = Condition,
size = Condition,
shape = Condition))
Tr <- Tr + geom_point() +
scale_size_discrete(range = size) +
scale_colour_manual(values = c(cbPalette[1:3])) +
scale_shape_manual(values = c(16,17,18)) +
theme_bw(base_size = 18) +
theme(panel.grid.minor = element_blank()) +
# removes all minor gridlines
theme(panel.grid.major = element_line(colour = cbPalette[1],
size = 0.2)) +
# sets the grid lines thicker and in a new colour
theme(panel.background = element_rect(colour = cbPalette[1],
size = 2)) +
# adjust colour and size of the surrounding box
theme(panel.grid.major.y = element_blank()) +
# removes the horisontal major grid lines +
xlab("") +
ylab("Relative Abundance") +
labs(colour = "", size = "", shape = "")
return(Tr)
}
# NOTE use with manipulate possible
# manipulate(
# plotPaulPredict(PredictsSim[[1]],Predictor),
# Predictor = picker(as.list(rownames(PredictsSim[[1]]))))
###############################################################
###### Plot Predictions dependent on Predictor LOG #####
###############################################################
plotPaulPredictLog <- function(pred_RAs, Predictor){
Tr <- plotPaulPredict(pred_RAs, Predictor)
Tr <- Tr + scale_y_log10()
return(Tr)
}
#NOTE Use with manipulate possible
# manipulate(
# plotPaulPredictLog(PredictsSim[[1]],Predictor),
# Predictor = picker(as.list(rownames(PredictsSim[[1]]))))
if(file.exists("Zeller.Rda")){
load("Zeller.Rda")
}
# An Abundance Table should have OTUs as rows and so samples as columns.
T_AT <- Zeller2014AbundTable
Samples <- T_AT$Sample
Groups <- T_AT$Groups
T_AT <- T_AT[-2:-1]
AT <- as.data.frame(t(T_AT))
colnames(AT) <- Samples
# choosing the two samples to compare
k <- 5
l <- 14
AT_sub <- select(AT, k, l)
# I chose these because they are control and cancer and because they are very different in UNMAPPED
# renaming the samples and OTUs
colnames(AT_sub) <- c('k', 'l')
rownames(AT_sub) <- c('UM', paste('OTU_', 1:300, sep= ''))
Trr <- plotOTUChanges(AT_sub)
Trr
Trr + scale_y_log10()
Trr + coord_cartesian(ylim = c(-0.01,.2))
Predicts <- predictions(AT_sub)
PredReal_DistComp <- absoluteDiffs(Predicts)
Filtered <- filter(PredReal_DistComp, PredReal_Ratio < 0.98)
Tr <- illustratePrecision(PredReal_DistComp)
Trr <- illustratePrecision(Filtered)
Tr
Trr
PredReal_DistComp <- EuclDists(Predicts)
Filtered <- filter(PredReal_DistComp, PredReal_Ratio < 0.98)
Tr <- illustratePrecision(PredReal_DistComp)
Trr <- illustratePrecision(Filtered)
Tr
Trr
PredReal_DistComp <- BrayCurtisDists(Predicts)
Filtered <- filter(PredReal_DistComp, PredReal_Ratio < 0.98)
Tr <- illustratePrecision(PredReal_DistComp)
Trr <- illustratePrecision(Filtered)
Tr
Trr
PredReal_DistComp <- JSDists(Predicts)
Filtered <- filter(PredReal_DistComp, PredReal_Ratio < 0.98)
Tr <- illustratePrecision(PredReal_DistComp)
Trr <- illustratePrecision(Filtered)
Tr
Trr
k <- c(5E8, 2.5E8, 2E8, 1E8, .2E8)
l= c(10E8, 2.5E8, 2E8, 1E8, .2E8)
SimDFs <- GenerateSimDFs(k,l)
# plot relative and absolute abundances under each other
Tr <- plotSimAbsRel(SimDFs[[3]])
Tr
# get all predictions
PredictsSim <- predictions(SimDFs[[2]])
Tr <- plotPaulPredict(PredictsSim[[1]], "A")
Tr
Diffs <- absoluteDiffs(PredictsSim)
Tr <- illustratePrecision(Diffs)
Diffs
Tr
DistsEu <- EuclDists(PredictsSim)
Tr <- illustratePrecision(DistsEu)
Euclidean
Tr
DistsBC <- BrayCurtisDists(PredictsSim)
Tr <- illustratePrecision(DistsBC)
Bray-Curtis
Tr
DistsJS <- JSDists(PredictsSim)
Tr <- illustratePrecision(DistsJS)
JSD
Tr