require(vegan)
require(dplyr)
require(tidyr)
require(ggplot2)
require(stringr)
require(reshape2)
require(manipulate)
# require(gridExtra) # you could use with grid.arrange if you want to show several ggplot plots in one Figure,

########################################################
###### the color palette                         #####
########################################################
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73",
               "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

Function definitions

From AbundanceTable to data.frames with the predictions and real data that are input of the Distance Functions.

###############################################################
###### function to predict the relative abundance of OTU j in 
# condition l from the relative abundances of OTU i in condition l
# and condtion k                                           #####
###############################################################
# INPUT:  k and l are the vectors of relative abundances at two conditions.
# i and j are the indices of the OTUs of interest.
# OUTPUT: the predicted abundance of OTU j in condition
# l


predictAbundance <- function( k, l, i, j ){
        if( i == j ){
                l[i]
              }
        else {
               k[j]*(1 - l[i])/(1 - k[i]) 
        }
}


###############################################################
###### function to calculate all predicted abundances, predicted
# from each OTU for each OTU >> matrix/data frame         #####
###############################################################
# INPUT: AbTable_sub is an abundance table subset of two conditions, i.e.
# a data.frame with two variables k and l (the two compositions), rownames(AbTable_sup) should be the OTUs, e.g. OTU_1 to OTU_n, alternatively for the
# simulations, there can be an OTU column
# OUTPUT: pred_RAs is a data.frame. The columns are the predicted compositions
# using each OTU as a predictor. In addition, the last two colums are k and l
# from AbTable_sub. pred_RAs can be used as input of the distance functions.

calc_pred_RAs <- function(AbTable_sub) {
        
        # to make sure OTUs are rownames and not the extra column
        if ('OTU' %in% colnames(AbTable_sub)) {
                rownames(AbTable_sub) <- AbTable_sub$OTU
                AbTable_sub <- select(AbTable_sub, k, l)
        }

        pred_RAs <- sapply(seq(nrow(AbTable_sub)), function(i) 
                sapply(seq(nrow(AbTable_sub)),
        function(j) predictAbundance(AbTable_sub$k, AbTable_sub$l, i, j)))
        # calculates the predicted relative Abundance for each OTU as predictor.
        # for n OTUs you get a nxn matrix
        colnames(pred_RAs) <- rownames(AbTable_sub)
        rownames(pred_RAs) <- rownames(AbTable_sub)
        # turn into data frame and add the k and l compositions
        pred_RAs <- as.data.frame(pred_RAs)
        pred_RAs$k <- AbTable_sub$k
        pred_RAs$l <- AbTable_sub$l
        # Thus, for nOTUs pred_RAs is a data.frame with n+2 variables.
        return(pred_RAs)
} 


###############################################################
###### function to produce a data frame where each column is 
# composition k only the abundance of the "predictor OTU" is
# from composition l. This to calculate the real distances 
# between composition k and l without contribuition from the
# predictor OTU                                          #####
###############################################################
# INPUT: AbTable_sub is an abundance table subset of two conditions, i.e.
# a data.frame with two variables k and l (the two compositions), rownames(AbTable_sup) should be the OTUs, e.g. OTU_1 to OTU_n, alternatively for the
# simulations, there can be an OTU column
# OUTPUT: All_k_Predictor_l is a similar data.frame as pred_RAs. Each column is
# the composition k but the predictor is l. k and l are the last two columns.
# can be used as input of the distance functions to calculate the real distances
# between composition k and l with the contribution of the predictor removed

make_AllkPredictorl <- function(AbTable_sub) {
        
        # to make sure OTUs are rownames and not the extra column
        if ('OTU' %in% colnames(AbTable_sub)) {
                rownames(AbTable_sub) <- AbTable_sub$OTU
                AbTable_sub <- select(AbTable_sub, k, l)
        }
        
        All_k_Predictor_l <- replicate(nrow(AbTable_sub), AbTable_sub$k)
        # a square matrix, each column is composition k
        colnames(All_k_Predictor_l) <- rownames(AbTable_sub)
        rownames(All_k_Predictor_l) <- rownames(AbTable_sub)
        diag(All_k_Predictor_l) <- AbTable_sub$l 
        # changes the "predictor" to the value of condition l
        All_k_Predictor_l <- as.data.frame(All_k_Predictor_l)
        All_k_Predictor_l$k <- AbTable_sub$k
        All_k_Predictor_l$l <- AbTable_sub$l
        return(All_k_Predictor_l)
} 

###############################################################
###### combine calc_pred_RAs and make_AllkPredictorl       #####
###############################################################
# INPUT: AbTable_sub is an abundance table subset of two conditions, i.e.
# a data.frame with two variables k and l (the two compositions), rownames(AbTable_sup) should be the OTUs, e.g. OTU_1 to OTU_n
# OUTPUT: Predicts is a list with the data.frames pred_RAs and
# All_k_Predictor_l

predictions <- function(AbTable_sub) {
        
        pred_RAs <- calc_pred_RAs(AbTable_sub)
        All_k_Predictor_l <- make_AllkPredictorl(AbTable_sub)
        Predicts <- list(pred_RAs, All_k_Predictor_l)
        return(Predicts)

} 


#################################################
### Generate data.frames from simulated examples##
##################################################
# INPUT: the vectors k and l containing the absolute abundances
# of condition k and l
# OUTPUT: SimDFs, the three data frames WideDF (absolute and realtive 
# abundances), AT_Sim the relative Abundance Table of the two conditions,
# LongDF, the long version of WideDF for plotting

GenerateSimDFs <- function(k,l){
        WideDF <- data.frame(OTU = LETTERS[1:length(k)], absolute_k = k, absolute_l = l, relative_k = k/sum(k), relative_l =l/sum(l))
        AT_Sim <- select(WideDF, OTU, k = relative_k, l = relative_l)
        LongDF <- gather(WideDF, Condition, Value, -OTU)
        LongDF <- separate(LongDF, Condition, 
                           into = c('AbundType', 'Condition'))
        SimDFs <- list(WideDF, AT_Sim, LongDF)
        return(SimDFs)
}

Distance Functions

### ATTENTION for each distance I always generate two functions.
### One that uses the input data.frame to calculate the distance.
### one that uses the Predicts input list, runs the basic functions
### two times and directly gives you the final PredAReal_DistCompare data.frame

###############################################################
######   sum of absolute differences for each OTU        #####
###############################################################
# INPUT:  for the basic the df such as pred_RAs, for the follow up
# the list Predicts with the data frames 
# OUTPUT: The Distance vector for the basic, PredReal_DistComp for
# the follow up.

### BASIC

absoluteDiff <- function(df){
        
        DistsTo_l <- colSums(abs(df-df$l))
        return(DistsTo_l)
}

### FOLLOW UP

absoluteDiffs <- function(Predicts){
        
        DistsPred <- absoluteDiff(Predicts[[1]])
        DistsReal <- absoluteDiff(Predicts[[2]])
        
        PredReal_DistComp <- data.frame(OTU = names(DistsPred), Pred_Dists =
                                      DistsPred, Real_Dists = DistsReal)
        
        if (!isTRUE(all.equal(0,
                              PredReal_DistComp$Pred_Dists[
                                      nrow(PredReal_DistComp)],
                  PredReal_DistComp$Real_Dists[nrow(PredReal_DistComp)]))) {
                stop("predicted or real distance to l was not zero")
                  }
        
        PredReal_DistComp <- PredReal_DistComp[-nrow(PredReal_DistComp),]
        # removes the l row because it is always 0 (tested above)
        
        PredReal_DistComp <- transform(PredReal_DistComp, PredReal_Ratio = 
                                               Pred_Dists/Real_Dists)
        return(PredReal_DistComp) 
}



########################################
######   Euclidean Distance       #####
#######################################
# INPUT:  for the basic the df such as pred_RAs, for the follow up
# the list Predicts with the data frames 
# OUTPUT: The Distance vector for the basic, PredReal_DistComp for
# the follow up.

### BASIC

EuclDist <- function(df){
        
        DistsTo_l <- sqrt(colSums((df-df$l)^2))
        return(DistsTo_l)
}

### FOLLOW UP

EuclDists <- function(Predicts){
        
        DistsPred <- EuclDist(Predicts[[1]])
        DistsReal <- EuclDist(Predicts[[2]])
        
        PredReal_DistComp <- data.frame(OTU = names(DistsPred), Pred_Dists =
                                      DistsPred, Real_Dists = DistsReal)
        
        if (!isTRUE(all.equal(0,
                              PredReal_DistComp$Pred_Dists[
                                      nrow(PredReal_DistComp)],
                  PredReal_DistComp$Real_Dists[nrow(PredReal_DistComp)]))) {
                stop("predicted or real distance to l was not zero")
                  }
        
        PredReal_DistComp <- PredReal_DistComp[-nrow(PredReal_DistComp),]
        # removes the l row because it is always 0 (tested above)
        
        PredReal_DistComp <- transform(PredReal_DistComp, PredReal_Ratio = 
                                               Pred_Dists/Real_Dists)
        return(PredReal_DistComp) 
}

########################################
######   Bray-Curtis Distance      #####
#######################################
# ATTENTION: For the equation see ?vegdist() from the vegan package
# INPUT:  for the basic the df such as pred_RAs, for the follow up
# the list Predicts with the data frames 
# OUTPUT: The Distance vector for the basic, PredReal_DistComp for
# the follow up.

### BASIC

BrayCurtisDist <- function(df){
        
        DistsTo_l <- colSums(abs(df-df$l))/colSums(df+df$l)
        return(DistsTo_l)
}

### FOLLOW UP

BrayCurtisDists <- function(Predicts){
        
        DistsPred <- BrayCurtisDist(Predicts[[1]])
        DistsReal <- BrayCurtisDist(Predicts[[2]])
        
        PredReal_DistComp <- data.frame(OTU = names(DistsPred), Pred_Dists =
                                      DistsPred, Real_Dists = DistsReal)
        
        if (!isTRUE(all.equal(0,
                              PredReal_DistComp$Pred_Dists[
                                      nrow(PredReal_DistComp)],
                  PredReal_DistComp$Real_Dists[nrow(PredReal_DistComp)]))) {
                stop("predicted or real distance to l was not zero")
                  }
        
        PredReal_DistComp <- PredReal_DistComp[-nrow(PredReal_DistComp),]
        # removes the l row because it is always 0 (tested above)
        
        PredReal_DistComp <- transform(PredReal_DistComp, PredReal_Ratio = 
                                               Pred_Dists/Real_Dists)
        return(PredReal_DistComp) 
}


########################################
######   Jensen-Shannon Distance      #####
#######################################
# ATTENTION: For the equation see:
# http://stackoverflow.com/questions/11226627/jensen-shannon-divergence-in-r
# the equation the guy came self up with plus the link to the enterotypes EMBL.
# wikipedia
# INPUT:  for the basic the df such as pred_RAs, for the follow up
# the list Predicts with the data frames 
# OUTPUT: The Distance vector for the basic, PredReal_DistComp for
# the follow up.

### BASIC

JSDist <- function(df){
        # since it does not work with 0, I add a pseudocount, that was also 
        # added in the enterotype paper
        pseudocount <- 0.000001
        df <- df + pseudocount
        Ms <- .5*(df+df$l)
        JS <- 0.5 * (colSums(df * log(df/Ms)) + colSums(df$l * log(df$l/Ms)))
        JS = ifelse(JS<0,0,JS) # Sometimes 0 is given as -3...e-17, then
        # the following sqrt would give a NaN warning message
        DistsTo_l <- sqrt(JS)
        return(DistsTo_l)
}

### FOLLOW UP

JSDists <- function(Predicts){
        
        DistsPred <- JSDist(Predicts[[1]])
        DistsReal <- JSDist(Predicts[[2]])
        
        PredReal_DistComp <- data.frame(OTU = names(DistsPred), Pred_Dists =
                                      DistsPred, Real_Dists = DistsReal)
        
        if (!isTRUE(all.equal(0,
                              PredReal_DistComp$Pred_Dists[
                                      nrow(PredReal_DistComp)],
                  PredReal_DistComp$Real_Dists[nrow(PredReal_DistComp)]))) {
                stop("predicted or real distance to l was not zero")
                  }
        
        PredReal_DistComp <- PredReal_DistComp[-nrow(PredReal_DistComp),]
        # removes the l row because it is always 0 (tested above)
        
        PredReal_DistComp <- transform(PredReal_DistComp, PredReal_Ratio = 
                                               Pred_Dists/Real_Dists)
        return(PredReal_DistComp) 
}

Illustration Functions

###############################################################
###### Illustrate the OTU changes between condition k and l #####
###############################################################
# INPUT: AbTable_sub, so the subsetted Abundance Table, a data frame with 
# the columns k and l

plotOTUChanges <- function(AbTable_sub){
        
        # make sure you have the OTUs as variable and not just as rownames
        if (! 'OTU' %in% colnames(AbTable_sub)) { 
                AbTable_sub$OTU <- rownames(AbTable_sub)       
        }
        
        # the size of the dots and the writing depends on how many OTUs there
        # are
        if (nrow(AbTable_sub) < 20) {
                size = c(6,4)
                sizetextx = 18
        } else if((nrow(AbTable_sub) > 19) & (nrow(AbTable_sub) < 100)) {
                size = c(4,2)
                sizetextx = 10
        } else if (nrow(AbTable_sub) > 99) {
                size = c(3,1.5)
                sizetextx = 6
        }
        
        AbTable_sub$OTU <- factor(AbTable_sub$OTU)
        LevelsWant <- rev(as.character(AbTable_sub$OTU)) #you had to reverse
        for (i in 1:length(LevelsWant)) {
                AbTable_sub$OTU <- relevel(AbTable_sub$OTU, ref = LevelsWant[i])
        }
        
        AbTable_sub_long <- gather(AbTable_sub, Condition, RA, -OTU)
        
        #AbTable_sub_long$Condition <- as.character(AbTable_sub_long$Condition)
        Tr <- ggplot(AbTable_sub_long, aes(x = OTU, y = RA, 
                                colour = Condition, 
                                size = Condition, 
                                shape = Condition))

        Tr <- Tr + geom_point()  +
        scale_size_discrete(range = size) +
        scale_colour_manual(values = c(cbPalette[1],cbPalette[2])) +
        scale_shape_manual(values = c(16,17)) +
        #scale_x_discrete(expand = c(0.01,.01)) +
        theme_bw(base_size = 18) +
        theme(panel.grid.minor = element_blank()) +
        # removes all minor gridlines
        theme(panel.grid.major = element_line(colour = cbPalette[1],
                                              size = 0.2)) +
        # sets the grid lines thicker and in a new colour
        theme(panel.background = element_rect(colour = cbPalette[1],
                                          size = .5)) +
        # adjust colour and size of the surrounding box
        theme(panel.grid.major.y = element_blank()) +
        # removes the horisontal major grid lines +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5,
                                         size = sizetextx)) +
        xlab("") +
        ylab("relative Abundance") +
        labs(colour = "", size = "", shape = "")
        return(Tr)
}



###############################################################
###### Illustrate the prediction precision               #####
###############################################################
# INPUT: usually PredReal_DistComp, so just a data frame with
# the columns OTU and PredReal_Ratio


illustratePrecision <- function(DF){
        
        # the size of the dots and the writing depends on how many OTUs there
        # are
        if (nrow(DF) < 20) {
                size = 6
                sizetextx = 18
        } else if((nrow(DF) > 19) & (nrow(DF) < 100)) {
                size = 4
                sizetextx = 14
        } else if (nrow(DF) > 99) {
                size = 3
                sizetextx = 6
        }
        
        # get the order of the OTU right. OTU is used as a factor and levels 
        # in R are alphabetically. But you want the OTUs in the same order as
        # they are in DF (rownames)
        LevelsWant <- rev(as.character(DF$OTU)) #you had to reverse
        for (i in 1:length(LevelsWant)) {
                DF$OTU <- relevel(DF$OTU, ref = LevelsWant[i])
        }
                
        Tr <- ggplot(DF, aes(x = OTU, y = PredReal_Ratio))
        Tr <- Tr +
        geom_hline(aes(yintercept = 1), colour = cbPalette[1], size = 1.2) +
        geom_point(size = size, colour = cbPalette[2]) +
        theme_bw(base_size = 18) +
        theme(panel.grid.minor = element_blank()) +
        # removes all minor gridlines
        theme(panel.grid.major = element_line(colour = cbPalette[1],
                                              size = 0.2)) +
        # sets the grid lines thicker and in a new colour
        theme(panel.background = element_rect(colour = cbPalette[1],
                                          size = 2)) +
        # adjust colour and size of the surrounding box
        theme(panel.grid.major.y = element_blank()) +
        # removes the horisontal major grid lines +
        theme(axis.text.x = element_text(angle = 90, vjust = .5, size = sizetextx)) +
        xlab("") +
        ylab("PredReal_DistanceRatio")
    return(Tr)
}

###############################################################
###### Show Changes Relative vs Absolute for simulations #####
###############################################################

plotSimAbsRel <- function(LongDataFrame){
        LongDataFrame$OTU <- as.character(LongDataFrame$OTU)
        LongDataFrame$Condition <- as.character(LongDataFrame$Condition)
        # do I need this?
        Tr <- ggplot(LongDataFrame, aes(x = OTU, y = Value, 
                                colour = Condition, 
                                size = Condition, 
                                shape = Condition))

        Tr <- Tr + geom_point() +
        facet_grid(AbundType ~., scales = "free") +
        scale_size_discrete(range = c(6,4)) +
        scale_colour_manual(values = c(cbPalette[1],cbPalette[2])) +
        scale_shape_manual(values = c(16,17)) +
        theme_bw(base_size = 18) +
        theme(panel.grid.minor = element_blank()) +
        # removes all minor gridlines
        theme(panel.grid.major = element_line(colour = cbPalette[1],
                                              size = 0.4)) +
        # sets the grid lines thicker and in a new colour
        theme(panel.background = element_rect(colour = cbPalette[1],
                                          size = 2)) +
        # adjust colour and size of the surrounding box
        theme(panel.grid.major.y = element_blank()) +
        # removes the horisontal major grid lines +
        xlab("OTU") +
        ylab("Abundance") +
        labs(colour = "", size = "", shape = "")
        return(Tr)
}

###############################################################
###### Plot Predictions dependent on Predictor            #####
###############################################################
# INPUT: The data frame where each column is a prediction, last two columns are
# k and l. E.g. pred_RAs, so the outcome of calc_pred_RAs, or Predicts[[1]],
# being the outcome of the predictions function. Further you have to give the
# predictor, e.g. "A" or "OTU1"
# ATTENTION: This plot is nicest when combiend with manipulate

plotPaulPredict <- function(pred_RAs, Predictor){
        
        rownames(pred_RAs) <- colnames(pred_RAs)[1:(ncol(pred_RAs)-2)]
        DF <- select(pred_RAs, k, l, matches(Predictor))
        names(DF)[3] <- paste("from:",Predictor)
        DF$OTU <- rownames(DF)
        
        if (nrow(DF) < 20) {
                size = c(6,4,6)
                sizetextx = 18
        } else if((nrow(AbTable_sub) > 19) & (nrow(AbTable_sub) < 100)) {
                size = c(4,2,4)
                sizetextx = 10
        } else if (nrow(AbTable_sub) > 99) {
                size = c(3,1.5, 3)
                sizetextx = 6
        }
        
        DF <- gather(DF, Condition, Value, -OTU)
        
        Tr <- ggplot(DF, aes(x = OTU, y = Value, 
                                colour = Condition, 
                                size = Condition, 
                                shape = Condition))

        Tr <- Tr + geom_point()  +
        scale_size_discrete(range = size) +
        scale_colour_manual(values = c(cbPalette[1:3])) +
        scale_shape_manual(values = c(16,17,18)) +
        theme_bw(base_size = 18) +
        theme(panel.grid.minor = element_blank()) +
        # removes all minor gridlines
        theme(panel.grid.major = element_line(colour = cbPalette[1],
                                              size = 0.2)) +
        # sets the grid lines thicker and in a new colour
        theme(panel.background = element_rect(colour = cbPalette[1],
                                          size = 2)) +
        # adjust colour and size of the surrounding box
        theme(panel.grid.major.y = element_blank()) +
        # removes the horisontal major grid lines +
        xlab("") +
        ylab("Relative Abundance") +
        labs(colour = "", size = "", shape = "")
        return(Tr)        
}

# NOTE use with manipulate possible
# manipulate(
# plotPaulPredict(PredictsSim[[1]],Predictor),
# Predictor = picker(as.list(rownames(PredictsSim[[1]]))))

###############################################################
###### Plot Predictions dependent on Predictor LOG           #####
###############################################################

plotPaulPredictLog <- function(pred_RAs, Predictor){
        Tr <- plotPaulPredict(pred_RAs, Predictor)
        Tr <- Tr + scale_y_log10()
        return(Tr)
}

#NOTE Use with manipulate possible
# manipulate(
# plotPaulPredictLog(PredictsSim[[1]],Predictor),
# Predictor = picker(as.list(rownames(PredictsSim[[1]]))))

Generating the abundance subtable (e.g. the two conditions to compare)

if(file.exists("Zeller.Rda")){
        load("Zeller.Rda")
}

# An Abundance Table should have OTUs as rows and so samples as columns. 

T_AT <- Zeller2014AbundTable
Samples <- T_AT$Sample
Groups <- T_AT$Groups
T_AT <- T_AT[-2:-1]
AT <- as.data.frame(t(T_AT))
colnames(AT) <- Samples

# choosing the two samples to compare
k <- 5
l <- 14
AT_sub <- select(AT, k, l)
# I chose these because they are control and cancer and because they are very different in UNMAPPED
# renaming the samples and OTUs
colnames(AT_sub) <- c('k', 'l')
rownames(AT_sub) <- c('UM', paste('OTU_', 1:300, sep= ''))
Trr <- plotOTUChanges(AT_sub)
Trr

Trr + scale_y_log10()

Trr + coord_cartesian(ylim = c(-0.01,.2))

Calculate the predictions

Predicts <- predictions(AT_sub)

Calculate and show Distances

Calculate the absolute Diffs

PredReal_DistComp <- absoluteDiffs(Predicts)
Filtered <- filter(PredReal_DistComp, PredReal_Ratio < 0.98) 

Tr <- illustratePrecision(PredReal_DistComp)
Trr <- illustratePrecision(Filtered)
Tr

Trr

Calculate and show the euclidean distance

PredReal_DistComp <- EuclDists(Predicts)
Filtered <- filter(PredReal_DistComp, PredReal_Ratio < 0.98) 

Tr <- illustratePrecision(PredReal_DistComp)
Trr <- illustratePrecision(Filtered)
Tr

Trr

Calculate and show the Bray-Curtis distance

PredReal_DistComp <- BrayCurtisDists(Predicts)
Filtered <- filter(PredReal_DistComp, PredReal_Ratio < 0.98) 

Tr <- illustratePrecision(PredReal_DistComp)
Trr <- illustratePrecision(Filtered)
Tr

Trr

Calculate and show the Jensen-Shannon distance

PredReal_DistComp <- JSDists(Predicts)
Filtered <- filter(PredReal_DistComp, PredReal_Ratio < 0.98) 

Tr <- illustratePrecision(PredReal_DistComp)
Trr <- illustratePrecision(Filtered)
Tr

Trr

Simulations

k <- c(5E8, 2.5E8, 2E8, 1E8, .2E8)
l= c(10E8, 2.5E8, 2E8, 1E8, .2E8)

SimDFs <- GenerateSimDFs(k,l)
# plot relative and absolute abundances under each other
Tr <- plotSimAbsRel(SimDFs[[3]])
Tr

# get all predictions
PredictsSim <- predictions(SimDFs[[2]])
Tr <- plotPaulPredict(PredictsSim[[1]], "A")
Tr

Diffs <- absoluteDiffs(PredictsSim)

Tr <- illustratePrecision(Diffs)

Diffs

Tr

DistsEu <- EuclDists(PredictsSim)

Tr <- illustratePrecision(DistsEu)

Euclidean

Tr

DistsBC <- BrayCurtisDists(PredictsSim)

Tr <- illustratePrecision(DistsBC)

Bray-Curtis

Tr

DistsJS <- JSDists(PredictsSim)

Tr <- illustratePrecision(DistsJS)

JSD

Tr