source('Functions/Dist_Functs.R')
source('Functions/Sim_Functs.R')
source('Functions/Predict_Functs.R')
source('Functions/Illus_Functs.R')
source('Functions/SubComp_Functs.R')

The current weighting corrections

We have the precentage explained \(\text{PE}\) by a prediction compared to the k-l distance.

no weighting
\(\frac{\text{PE}}{\text{ES}}\) with \[\text{ES} = |l - k|\]
\(\text{PE}*\text{ES1}\) with \[\text{ES1} = \frac{l}{k} ~~~ (\text{if ES1} < 1 ~~\text{so}~~ \text{ES1} = 1/\text{ES1})\]
\(\text{PE}*\text{ES2}\) with \[\text{ES2} = \frac{2*|l - k|}{k + l} = \frac{|l - k|}{mean(k,l)}\]
\(\text{PE}*\text{ES1P}\) with \[\text{ES1P} = |\log(l) - \log(k)|\]

Relation of ES1P to ES2

We found in our last meeting that ES1P results in almost the same correction as ES2, so \[ | \log(l) - \log(k) | = | log(\frac{l}{k})| \sim \frac{2*| l - k |}{l + k} \]

In fact ES2 is always a bit smaller than ES1P, so \[ | \log(l) - \log(k) | = | log(\frac{l}{k})| > \frac{2*| l - k |}{l + k} \]

but as long as the difference between k and l is not big, the difference is not so drastic

ES2 <- function(k,l){
        x <- 2*abs(l-k)/(l + k)
        return(x)
}

ES1P <- function(k,l){
        x <- abs(log(l) - log(k))
        return(x)
}

l <- seq(from = .1, by = 0.02, to = 1)
k <- rep(.1, length(l))

DF <- data.frame(k = k, l = l)
DF <- mutate(DF, Diff = abs(l-k), ES2 = ES2(k,l), ES1P = ES1P(k,l))
DF <- gather(DF, Corrector, Value, -k, -l, -Diff)
ggplot(data = DF, aes(x = Diff, y = Value, colour = Corrector)) +
        geom_line() + xlab('Difference l to k') + theme_bw(18)

It also depends on the ratio k/l but I skip this, ES2 and ES1P are pretty similar unless there is a massive jump in k to l.

The real changer Idea

Simulations

k <- c(10E8, 2.5E8, 2E8, 1E8, .2E8, 1E8, 1E8)
l <- c(10E8, 1E8, 4E8, 1E8, .2E8, 1E8, 1E8)

** FOR SURE the 0 removal isssue **

# Make a matrix out of k, where each colum contains the ratios of the 
# relative abundance of one OTU divided by the abudnances of the others

RatioMatrix_k <- sapply(1:nrow(AT), function(x) AT$k[x]/AT$k)
colnames(RatioMatrix_k) <- AT$OTU
rownames(RatioMatrix_k) <- AT$OTU

RatioMatrix_l <- sapply(1:nrow(AT), function(x) AT$l[x]/AT$l)
colnames(RatioMatrix_l) <- AT$OTU
rownames(RatioMatrix_l) <- AT$OTU

RatioCompareMatrix <- RatioMatrix_l/RatioMatrix_k

########## just to proof that this is independent of relative abundances##
## because we are talking simulations here, let's check these matrix from
## the absolute data
ATA <- AT
ATA$k <- k
ATA$l <- l

ARatioMatrix_k <- sapply(1:nrow(ATA), function(x) ATA$k[x]/ATA$k)
colnames(ARatioMatrix_k) <- ATA$OTU
rownames(ARatioMatrix_k) <- ATA$OTU

ARatioMatrix_l <- sapply(1:nrow(ATA), function(x) ATA$l[x]/ATA$l)
colnames(ARatioMatrix_l) <- ATA$OTU
rownames(ARatioMatrix_l) <- ATA$OTU

ARatioCompareMatrix <- ARatioMatrix_l/ARatioMatrix_k

all.equal(RatioMatrix_k,ARatioMatrix_k)

## [1] TRUE

all.equal(RatioMatrix_l,ARatioMatrix_l)

## [1] TRUE

all.equal(ARatioCompareMatrix,RatioCompareMatrix)

## [1] TRUE

# This is a very good sign, these Ratios are independent on whether they 
# were obtained from the relative or absolute abundance data!!
##################################################

How to use the RatioCompareMatrix

it is as shown independent of compositional effects (same for relative and absolute data)
each row is 1/column because
- in each column the OTU is numerator (so if OTUs absolute abundance goes down these values are probably low)
- in each row the OTU is denominator
determine for each OTU agains how many it increased and decreased
sort the list for increased against and decreased against

Plotting the data, first a heatmap, then a boxplot

##
heatmap(RatioCompareMatrix)

For the boxplot (ggplot2) and further calculations you need a data.frame

#diag(RatioCompareMatrix) = NA
RatioCompare <- data.frame(RatioCompareMatrix)
RatioCompare$OTU <- AT$OTU

# summarise_each(RatioCompare, funs(mean(., na.rm = TRUE), sd(., na.rm = TRUE)), -OTU)

RatioCompareLong <- gather(RatioCompare, OTUNom, Ratio, -OTU)

Tr <- ggplot(RatioCompareLong, aes(x = OTUNom, y = Ratio))
set.seed(124) # to get teh same jitter!
Tr <- Tr + 
        geom_hline(aes(yintercept = 1), colour = cbPalette[1], size = 1.2) + 
        geom_boxplot(alpha = 1/2,
                  fill = cbPalette[1],
                  outlier.colour = NA) + 
        geom_jitter(aes(color = OTU), position = position_jitter(width = .2), 
                    alpha =3/4, pch = 19) +
        xlab("") + 
        ylab("Compare Ratios") +
        theme_bw( 18 ) 
Tr

########### Determine for each OTU to how many OTUs its abundance decreased
# or increased ############################################

#### SOMETHING REALLY WEIRD, sometimes the 1s in the RatioCompareMatrix are not
# really 1s, try with the example k <- c(10E8, 2.5E8, 2E8, 1E8, .2E8)
# l <- c(10E8, 1E8, 2E8, 1E8, .2E8), RatioCompareMatrix < 1 at this point
## THEREFORE I say 0.99 to 1.01 are 1 for me here

DecreaseSum <- colSums(RatioCompareMatrix < .99)
DecreasePerc <- colMeans(RatioCompareMatrix < .99)
# FYI, could also be done with the cumulative distribution function,
# e.g. Distr <- RatioCompareMatrix[,2]
# ecdf(Distr)(.99) 
# see https://stat.ethz.ch/pipermail/r-help/2012-March/305368.html

IncreaseSum <- colSums(RatioCompareMatrix > 1.01)
IncreasePerc <- colMeans(RatioCompareMatrix > 1.01)
# 
UnchangedPerc <- 1-DecreasePerc-IncreasePerc

############################################

## Plot this data in a barplot
UpDownDF <- data.frame(OTU  = AT$OTU, Decr = DecreasePerc,
                       Equal = UnchangedPerc, Incr = IncreasePerc)
UpDownDFLong <- gather(UpDownDF, Change, Perc, -OTU)

Tr <- ggplot(UpDownDFLong, aes(x = OTU, y = Perc, fill = Change))

Tr + geom_bar(stat = 'identity') +
        scale_fill_manual(values = c("Equal" = cbPalette[1],
                                    "Incr" = cbPalette[2],
                                    "Decr" = cbPalette[3])) +
        theme_bw(base_size = 18) +
                theme(panel.grid.minor = element_blank()) +
                # removes all minor gridlines
                theme(panel.grid.major = element_line(colour = cbPalette[1],
                                                      size = 0.2)) +
                # sets the grid lines thicker and in a new colour
                theme(panel.background = element_rect(colour = cbPalette[1],
                                                      size = 2)) +
                # adjust colour and size of the surrounding box
                theme(panel.grid.major.y = element_blank()) +
                # removes the horisontal major grid lines +
                # theme(axis.text.x = element_text(angle = 90, vjust = .5, size = sizetextx, hjust = 1)) +
                xlab("") +
                ylab("Percentage of Ratio Comparison to other OTUs")

Maybe the plot would be even nicer when you see the ones that changed most closest to the y axis

# for this you have to change the levels of the OTUs, I want to show the ones
# that change most first, so lowest equal, then highest increase to break ties
LevelsWant <- as.character(arrange(UpDownDF, Equal, desc(Incr))$OTU)
UpDownDFLong <- within(UpDownDFLong, OTU <- factor(OTU, levels = LevelsWant))


Tr <- ggplot(UpDownDFLong, aes(x = OTU, y = Perc, fill = Change))

Tr + geom_bar(stat = 'identity') +
        scale_fill_manual(values = c("Equal" = cbPalette[1],
                                    "Incr" = cbPalette[2],
                                    "Decr" = cbPalette[3])) +
        theme_bw(base_size = 18) +
                theme(panel.grid.minor = element_blank()) +
                # removes all minor gridlines
                theme(panel.grid.major = element_line(colour = cbPalette[1],
                                                      size = 0.2)) +
                # sets the grid lines thicker and in a new colour
                theme(panel.background = element_rect(colour = cbPalette[1],
                                                      size = 2)) +
                # adjust colour and size of the surrounding box
                theme(panel.grid.major.y = element_blank()) +
                # removes the horisontal major grid lines +
                # theme(axis.text.x = element_text(angle = 90, vjust = .5, size = sizetextx, hjust = 1)) +
                xlab("") +
                ylab("Percentage of Comparisons to other OTUs")

Since barplots will be very difficult for real data, let’s do a dotplot as well

Tr <- ggplot(UpDownDFLong, aes(x = OTU, y = Perc, color = Change))
Tr + geom_point(size = 5) +
        scale_color_manual(values = c("Equal" = cbPalette[1],
                                    "Incr" = cbPalette[2],
                                    "Decr" = cbPalette[3])) +
        theme_bw(base_size = 18) +
                theme(panel.grid.minor = element_blank()) +
                # removes all minor gridlines
                theme(panel.grid.major = element_line(colour = cbPalette[1],
                                                      size = 0.2)) +
                # sets the grid lines thicker and in a new colour
                theme(panel.background = element_rect(colour = cbPalette[1],
                                                      size = 2)) +
                # adjust colour and size of the surrounding box
                theme(panel.grid.major.y = element_blank()) +
                # removes the horisontal major grid lines +
                # theme(axis.text.x = element_text(angle = 90, vjust = .5, size = sizetextx, hjust = 1)) +
                xlab("") +
                ylab("Percentage of Comparisons to other OTUs")

# Medians <- summarise_each(RatioCompare, funs(median(., na.rm = TRUE)),-OTU)
# Medians <- gather(Medians, OTU, Median)
# # a median below 1, means the OTU decreased compared to more than half of the OTUs
# # cool would be percentile
# 
# SDs <- summarise_each(RatioCompare, funs(sd(., na.rm = TRUE)),-OTU)
# SDs <- gather(SDs, OTU, SD)
# 
# DFCompare <- data.frame(OTU = Medians$OTU, Median = Medians$Median,
#                         SD = SDs$SD, ES = AT_sub2$l-AT_sub2$k, MeanRA = rowMeans(cbind(AT_sub2$l, AT_sub2$k)))

Simulation Examples and what you can learn

k <- c(10E8, 2.5E8, 2E8, 1E8, .2E8) l <- c(10E8, 1E8, 2E8, 1E8, .2E8)

Only B gets smaller, rest stays unaffected.

In the RatioCompareMatrix only the B row and column differs from 1
so it might make sense to use columns and rows?

Try real data

if(file.exists("Data/genus.profile.csv")){
        AT <- read.table("Data/genus.profile.csv", header = TRUE)
}

names(AT)[1] <- "OTU"

# choosing the two samples to compare
k <- 2
l <- 3
AT_sub <- select(AT, OTU, k, l)
# I chose these because they are control and cancer and because they are very different in UNMAPPED
# renaming the samples and OTUs
colnames(AT_sub)[2:3] <- c('k', 'l')

# rownames(AT_sub) <- c('UM', paste('OTU_', 1:300, sep= ''))


## remove all OTUs that are 0 in both conditions
AT_sub <- subset(AT_sub, (k != 0) | (l != 0))
# REMOVED QUITE SOME

# works also with filter but filter as all dplyr removes rownames
# AT_sub <- filter(AT_sub, (k != 0) | (l != 0))

## the zero issue again, here
# colSums(AT_sub[,1:2]) # not 1
# find a pseudocount
AT_sub2 <- AT_sub
AT_sub2[AT_sub2 == 0] <- NA
Pseudocount <- min(AT_sub2[,2:3], na.rm = TRUE)
# set all 0 to this min = Psuedocount Value
AT_sub2 <- mutate(AT_sub, k = ifelse(k == 0, Pseudocount, k), 
                  l = ifelse(l == 0, Pseudocount, l))
#colSums(AT_sub2[,2:3]) # k over 1, l still not 1!
AT_sub2 <- mutate(AT_sub2, k = k/sum(k), l = l/sum(l))
#colSums(AT_sub2[,2:3]) # indeed now they sum up to 1

# so the difference bw AT_sub and AT_sub2, is that AT_sub2 was added a 
# pseudocount, and renormalised (it actually sums up to 1)

# remove the factor levels from the OTU
AT_sub2$OTU <- as.character(AT_sub2$OTU)

# Make a matrix out of k, where each colum are the ratios of the relative 
# abundance of the respective OTU divided by the abudnances of the others

RatioMatrix_k <- sapply(1:nrow(AT_sub2), function(x) AT_sub2$k[x]/AT_sub2$k)
colnames(RatioMatrix_k) <- AT_sub2$OTU
rownames(RatioMatrix_k) <- AT_sub2$OTU

RatioMatrix_l <- sapply(1:nrow(AT_sub2), function(x) AT_sub2$l[x]/AT_sub2$l)
colnames(RatioMatrix_l) <- AT_sub2$OTU
rownames(RatioMatrix_l) <- AT_sub2$OTU

RatioCompareMatrix <- RatioMatrix_l/RatioMatrix_k

See who increased and decreased compared to most others

EqualProc <- 10
Min <- 1/(1+EqualProc/100)
Max <- 1/Min

DecreaseSum <- colSums(RatioCompareMatrix < Min)
DecreasePerc <- colMeans(RatioCompareMatrix < Min)
# FYI, could also be done with the cumulative distribution function,
# e.g. Distr <- RatioCompareMatrix[,2]
# ecdf(Distr)(.99) 
# see https://stat.ethz.ch/pipermail/r-help/2012-March/305368.html

IncreaseSum <- colSums(RatioCompareMatrix > Max)
IncreasePerc <- colMeans(RatioCompareMatrix > Max)
# 
# For the real data I decided to round
IncreasePerc <- round(IncreasePerc, 3)
DecreasePerc <- round(DecreasePerc, 3)
UnchangedPerc <- round(1-DecreasePerc-IncreasePerc, 3)


## For the Plotting
UpDownDF <- data.frame(OTU  = AT_sub2$OTU, Decr = DecreasePerc,
                       Equal = UnchangedPerc, Incr = IncreasePerc)
UpDownDFLong <- gather(UpDownDF, Change, Perc, -OTU)

# for this you have to change the levels of the OTUs, I want to show the ones
# that change most first, so lowest equal, then highest increase to break ties
LevelsWant <- as.character(arrange(UpDownDF, desc(Incr), Equal)$OTU)
UpDownDFLong <- within(UpDownDFLong, OTU <- factor(OTU, levels = LevelsWant))


if (length(levels(UpDownDFLong$OTU)) < 20) {
                size = 6
                sizetextx = 18
        } else if(( length(levels(UpDownDFLong$OTU)) > 19) & 
                          (length(levels(UpDownDFLong$OTU)) < 100)) {
                size = 4
                sizetextx = 14
        } else if (length(levels(UpDownDFLong$OTU)) > 99) {
                size = 3
                sizetextx = 6
        }

Tr <- ggplot(UpDownDFLong, aes(x = OTU, y = Perc, fill = Change))

Tr + geom_bar(stat = 'identity') +
        scale_fill_manual(values = c("Equal" = cbPalette[1],
                                    "Incr" = cbPalette[2],
                                    "Decr" = cbPalette[3])) +
        theme_bw(base_size = 18) +
                theme(panel.grid.minor = element_blank()) +
                # removes all minor gridlines
                theme(panel.grid.major = element_line(colour = cbPalette[1],
                                                      size = 0.2)) +
                # sets the grid lines thicker and in a new colour
                theme(panel.background = element_rect(colour = cbPalette[1],
                                                      size = 2)) +
                # adjust colour and size of the surrounding box
                theme(panel.grid.major.y = element_blank()) +
                # removes the horisontal major grid lines +
                theme(axis.text.x = element_text(angle = 90, vjust = .5, size = sizetextx, hjust = 1)) +
                xlab("") +
                ylab("Percentage of Comparisons to other OTUs")

## Warning in loop_apply(n, do.ply): position_stack requires constant width:
## output may be incorrect

Since barplots will be very difficult for real data, let’s do a dotplot as well

if (length(levels(UpDownDFLong$OTU)) < 20) {
                size = 6
                sizetextx = 18
        } else if(( length(levels(UpDownDFLong$OTU)) > 19) & 
                          (length(levels(UpDownDFLong$OTU)) < 100)) {
                size = 4
                sizetextx = 14
        } else if (length(levels(UpDownDFLong$OTU)) > 99) {
                size = 3
                sizetextx = 6
        }

Tr <- ggplot(UpDownDFLong, aes(x = OTU, y = Perc, color = Change))
Tr + geom_point(size = size) +
        scale_color_manual(values = c("Equal" = cbPalette[1],
                                    "Incr" = cbPalette[2],
                                    "Decr" = cbPalette[3])) +
        theme_bw(base_size = 18) +
                theme(panel.grid.minor = element_blank()) +
                # removes all minor gridlines
                theme(panel.grid.major = element_line(colour = cbPalette[1],
                                                      size = 0.2)) +
                # sets the grid lines thicker and in a new colour
                theme(panel.background = element_rect(colour = cbPalette[1],
                                                      size = 2)) +
                # adjust colour and size of the surrounding box
                theme(panel.grid.major.y = element_blank()) +
                # removes the horisontal major grid lines +
                theme(axis.text.x = element_text(angle = 90, vjust = .5, size = sizetextx, hjust = 1)) +
                xlab("") +
                ylab("Percentage of Comparisons to other OTUs")

since the comparison to 1 is not telling you much, you could

diag(RatioCompareMatrix) = NA

# RatioCompare <- data.frame(RatioCompareMatrix)
# RatioCompare$OTU <- AT_sub2$OTU
# 
# Medians <- summarise_each(RatioCompare, funs(median(., na.rm = TRUE)),-OTU)
# Medians <- gather(Medians, OTU, Median)
# SDs <- summarise_each(RatioCompare, funs(sd(., na.rm = TRUE)),-OTU)
# SDs <- gather(SDs, OTU, SD)
# 
# DFCompare <- data.frame(OTU = Medians$OTU, Median = Medians$Median,
#                         SD = SDs$SD, ES = AT_sub2$l-AT_sub2$k, MeanRA = rowMeans(cbind(AT_sub2$l, AT_sub2$k)))
# 
# RatioCompareLong <- gather(RatioCompare, OTUNom, Ratio, -OTU)

###############################################################
###### Illustrate the Percentage Explained               #####
###############################################################
# INPUT: usually PredReal_DistComp, 

# DF <- DFCompare
#         if (nrow(DF) < 20) {
#                 size = 6
#                 sizetextx = 18
#         } else if((nrow(DF) > 19) & (nrow(DF) < 100)) {
#                 size = 4
#                 sizetextx = 14
#         } else if (nrow(DF) > 99) {
#                 size = 3
#                 sizetextx = 6
#         }
#         
#         # get the order of the OTU right. OTU is used as a factor and levels 
#         # in R are alphabetically. But you want the OTUs in the same order as
#         # they are in DF (rownames)
#         LevelsWant <- rev(as.character(DF$OTU)) #you had to reverse
#         for (i in 1:length(LevelsWant)) {
#                 DF$OTU <- relevel(DF$OTU, ref = LevelsWant[i])
#         }
#         
#         Tr <- ggplot(DF, aes(x = OTU, y = Median))
#         Tr <- Tr +
#                 geom_hline(aes(yintercept = 1), colour = cbPalette[1], size = 1.2) +
#                 geom_point(size = size, colour = cbPalette[2]) +
#                 theme_bw(base_size = 18) +
#                 theme(panel.grid.minor = element_blank()) +
#                 # removes all minor gridlines
#                 theme(panel.grid.major = element_line(colour = cbPalette[1],
#                                                       size = 0.2)) +
#                 # sets the grid lines thicker and in a new colour
#                 theme(panel.background = element_rect(colour = cbPalette[1],
#                                                       size = 2)) +
#                 # adjust colour and size of the surrounding box
#                 theme(panel.grid.major.y = element_blank()) +
#                 # removes the horisontal major grid lines +
#                 theme(axis.text.x = element_text(angle = 90, vjust = .5, size = sizetextx, hjust = 1)) +
#                 xlab("") +
#                 ylab("Ratio Median")
# 
# Tr
# Tr + scale_y_log10()

# Tops <- filter(DFCompare, Median>3)
# head(arrange(Tops, desc(Median)),10)
# head(arrange(DFCompare, Median),10)

150604_RealChangers

Thorsten Brach