source('Functions/Dist_Functs.R')
source('Functions/Sim_Functs.R')
source('Functions/Predict_Functs.R')
source('Functions/Illus_Functs.R')
source('Functions/SubComp_Functs.R')
We have the precentage explained \(\text{PE}\) by a prediction compared to the k-l distance.
We found in our last meeting that ES1P results in almost the same correction as ES2, so \[ | \log(l) - \log(k) | = | log(\frac{l}{k})| \sim \frac{2*| l - k |}{l + k} \]
In fact ES2 is always a bit smaller than ES1P, so \[ | \log(l) - \log(k) | = | log(\frac{l}{k})| > \frac{2*| l - k |}{l + k} \]
but as long as the difference between k and l is not big, the difference is not so drastic
ES2 <- function(k,l){
x <- 2*abs(l-k)/(l + k)
return(x)
}
ES1P <- function(k,l){
x <- abs(log(l) - log(k))
return(x)
}
l <- seq(from = .1, by = 0.02, to = 1)
k <- rep(.1, length(l))
DF <- data.frame(k = k, l = l)
DF <- mutate(DF, Diff = abs(l-k), ES2 = ES2(k,l), ES1P = ES1P(k,l))
DF <- gather(DF, Corrector, Value, -k, -l, -Diff)
ggplot(data = DF, aes(x = Diff, y = Value, colour = Corrector)) +
geom_line() + xlab('Difference l to k') + theme_bw(18)
It also depends on the ratio k/l but I skip this, ES2 and ES1P are pretty similar unless there is a massive jump in k to l.
k <- c(10E8, 2.5E8, 2E8, 1E8, .2E8, 1E8, 1E8)
l <- c(10E8, 1E8, 4E8, 1E8, .2E8, 1E8, 1E8)
** FOR SURE the 0 removal isssue **
# Make a matrix out of k, where each colum contains the ratios of the
# relative abundance of one OTU divided by the abudnances of the others
RatioMatrix_k <- sapply(1:nrow(AT), function(x) AT$k[x]/AT$k)
colnames(RatioMatrix_k) <- AT$OTU
rownames(RatioMatrix_k) <- AT$OTU
RatioMatrix_l <- sapply(1:nrow(AT), function(x) AT$l[x]/AT$l)
colnames(RatioMatrix_l) <- AT$OTU
rownames(RatioMatrix_l) <- AT$OTU
RatioCompareMatrix <- RatioMatrix_l/RatioMatrix_k
########## just to proof that this is independent of relative abundances##
## because we are talking simulations here, let's check these matrix from
## the absolute data
ATA <- AT
ATA$k <- k
ATA$l <- l
ARatioMatrix_k <- sapply(1:nrow(ATA), function(x) ATA$k[x]/ATA$k)
colnames(ARatioMatrix_k) <- ATA$OTU
rownames(ARatioMatrix_k) <- ATA$OTU
ARatioMatrix_l <- sapply(1:nrow(ATA), function(x) ATA$l[x]/ATA$l)
colnames(ARatioMatrix_l) <- ATA$OTU
rownames(ARatioMatrix_l) <- ATA$OTU
ARatioCompareMatrix <- ARatioMatrix_l/ARatioMatrix_k
all.equal(RatioMatrix_k,ARatioMatrix_k)
## [1] TRUE
all.equal(RatioMatrix_l,ARatioMatrix_l)
## [1] TRUE
all.equal(ARatioCompareMatrix,RatioCompareMatrix)
## [1] TRUE
# This is a very good sign, these Ratios are independent on whether they
# were obtained from the relative or absolute abundance data!!
##################################################
How to use the RatioCompareMatrix
Plotting the data, first a heatmap, then a boxplot
##
heatmap(RatioCompareMatrix)
For the boxplot (ggplot2) and further calculations you need a data.frame
#diag(RatioCompareMatrix) = NA
RatioCompare <- data.frame(RatioCompareMatrix)
RatioCompare$OTU <- AT$OTU
# summarise_each(RatioCompare, funs(mean(., na.rm = TRUE), sd(., na.rm = TRUE)), -OTU)
RatioCompareLong <- gather(RatioCompare, OTUNom, Ratio, -OTU)
Tr <- ggplot(RatioCompareLong, aes(x = OTUNom, y = Ratio))
set.seed(124) # to get teh same jitter!
Tr <- Tr +
geom_hline(aes(yintercept = 1), colour = cbPalette[1], size = 1.2) +
geom_boxplot(alpha = 1/2,
fill = cbPalette[1],
outlier.colour = NA) +
geom_jitter(aes(color = OTU), position = position_jitter(width = .2),
alpha =3/4, pch = 19) +
xlab("") +
ylab("Compare Ratios") +
theme_bw( 18 )
Tr
########### Determine for each OTU to how many OTUs its abundance decreased
# or increased ############################################
#### SOMETHING REALLY WEIRD, sometimes the 1s in the RatioCompareMatrix are not
# really 1s, try with the example k <- c(10E8, 2.5E8, 2E8, 1E8, .2E8)
# l <- c(10E8, 1E8, 2E8, 1E8, .2E8), RatioCompareMatrix < 1 at this point
## THEREFORE I say 0.99 to 1.01 are 1 for me here
DecreaseSum <- colSums(RatioCompareMatrix < .99)
DecreasePerc <- colMeans(RatioCompareMatrix < .99)
# FYI, could also be done with the cumulative distribution function,
# e.g. Distr <- RatioCompareMatrix[,2]
# ecdf(Distr)(.99)
# see https://stat.ethz.ch/pipermail/r-help/2012-March/305368.html
IncreaseSum <- colSums(RatioCompareMatrix > 1.01)
IncreasePerc <- colMeans(RatioCompareMatrix > 1.01)
#
UnchangedPerc <- 1-DecreasePerc-IncreasePerc
############################################
## Plot this data in a barplot
UpDownDF <- data.frame(OTU = AT$OTU, Decr = DecreasePerc,
Equal = UnchangedPerc, Incr = IncreasePerc)
UpDownDFLong <- gather(UpDownDF, Change, Perc, -OTU)
Tr <- ggplot(UpDownDFLong, aes(x = OTU, y = Perc, fill = Change))
Tr + geom_bar(stat = 'identity') +
scale_fill_manual(values = c("Equal" = cbPalette[1],
"Incr" = cbPalette[2],
"Decr" = cbPalette[3])) +
theme_bw(base_size = 18) +
theme(panel.grid.minor = element_blank()) +
# removes all minor gridlines
theme(panel.grid.major = element_line(colour = cbPalette[1],
size = 0.2)) +
# sets the grid lines thicker and in a new colour
theme(panel.background = element_rect(colour = cbPalette[1],
size = 2)) +
# adjust colour and size of the surrounding box
theme(panel.grid.major.y = element_blank()) +
# removes the horisontal major grid lines +
# theme(axis.text.x = element_text(angle = 90, vjust = .5, size = sizetextx, hjust = 1)) +
xlab("") +
ylab("Percentage of Ratio Comparison to other OTUs")
Maybe the plot would be even nicer when you see the ones that changed most closest to the y axis
# for this you have to change the levels of the OTUs, I want to show the ones
# that change most first, so lowest equal, then highest increase to break ties
LevelsWant <- as.character(arrange(UpDownDF, Equal, desc(Incr))$OTU)
UpDownDFLong <- within(UpDownDFLong, OTU <- factor(OTU, levels = LevelsWant))
Tr <- ggplot(UpDownDFLong, aes(x = OTU, y = Perc, fill = Change))
Tr + geom_bar(stat = 'identity') +
scale_fill_manual(values = c("Equal" = cbPalette[1],
"Incr" = cbPalette[2],
"Decr" = cbPalette[3])) +
theme_bw(base_size = 18) +
theme(panel.grid.minor = element_blank()) +
# removes all minor gridlines
theme(panel.grid.major = element_line(colour = cbPalette[1],
size = 0.2)) +
# sets the grid lines thicker and in a new colour
theme(panel.background = element_rect(colour = cbPalette[1],
size = 2)) +
# adjust colour and size of the surrounding box
theme(panel.grid.major.y = element_blank()) +
# removes the horisontal major grid lines +
# theme(axis.text.x = element_text(angle = 90, vjust = .5, size = sizetextx, hjust = 1)) +
xlab("") +
ylab("Percentage of Comparisons to other OTUs")
Since barplots will be very difficult for real data, let’s do a dotplot as well
Tr <- ggplot(UpDownDFLong, aes(x = OTU, y = Perc, color = Change))
Tr + geom_point(size = 5) +
scale_color_manual(values = c("Equal" = cbPalette[1],
"Incr" = cbPalette[2],
"Decr" = cbPalette[3])) +
theme_bw(base_size = 18) +
theme(panel.grid.minor = element_blank()) +
# removes all minor gridlines
theme(panel.grid.major = element_line(colour = cbPalette[1],
size = 0.2)) +
# sets the grid lines thicker and in a new colour
theme(panel.background = element_rect(colour = cbPalette[1],
size = 2)) +
# adjust colour and size of the surrounding box
theme(panel.grid.major.y = element_blank()) +
# removes the horisontal major grid lines +
# theme(axis.text.x = element_text(angle = 90, vjust = .5, size = sizetextx, hjust = 1)) +
xlab("") +
ylab("Percentage of Comparisons to other OTUs")
# Medians <- summarise_each(RatioCompare, funs(median(., na.rm = TRUE)),-OTU)
# Medians <- gather(Medians, OTU, Median)
# # a median below 1, means the OTU decreased compared to more than half of the OTUs
# # cool would be percentile
#
# SDs <- summarise_each(RatioCompare, funs(sd(., na.rm = TRUE)),-OTU)
# SDs <- gather(SDs, OTU, SD)
#
# DFCompare <- data.frame(OTU = Medians$OTU, Median = Medians$Median,
# SD = SDs$SD, ES = AT_sub2$l-AT_sub2$k, MeanRA = rowMeans(cbind(AT_sub2$l, AT_sub2$k)))
k <- c(10E8, 2.5E8, 2E8, 1E8, .2E8) l <- c(10E8, 1E8, 2E8, 1E8, .2E8)
Only B gets smaller, rest stays unaffected.
if(file.exists("Data/genus.profile.csv")){
AT <- read.table("Data/genus.profile.csv", header = TRUE)
}
names(AT)[1] <- "OTU"
# choosing the two samples to compare
k <- 2
l <- 3
AT_sub <- select(AT, OTU, k, l)
# I chose these because they are control and cancer and because they are very different in UNMAPPED
# renaming the samples and OTUs
colnames(AT_sub)[2:3] <- c('k', 'l')
# rownames(AT_sub) <- c('UM', paste('OTU_', 1:300, sep= ''))
## remove all OTUs that are 0 in both conditions
AT_sub <- subset(AT_sub, (k != 0) | (l != 0))
# REMOVED QUITE SOME
# works also with filter but filter as all dplyr removes rownames
# AT_sub <- filter(AT_sub, (k != 0) | (l != 0))
## the zero issue again, here
# colSums(AT_sub[,1:2]) # not 1
# find a pseudocount
AT_sub2 <- AT_sub
AT_sub2[AT_sub2 == 0] <- NA
Pseudocount <- min(AT_sub2[,2:3], na.rm = TRUE)
# set all 0 to this min = Psuedocount Value
AT_sub2 <- mutate(AT_sub, k = ifelse(k == 0, Pseudocount, k),
l = ifelse(l == 0, Pseudocount, l))
#colSums(AT_sub2[,2:3]) # k over 1, l still not 1!
AT_sub2 <- mutate(AT_sub2, k = k/sum(k), l = l/sum(l))
#colSums(AT_sub2[,2:3]) # indeed now they sum up to 1
# so the difference bw AT_sub and AT_sub2, is that AT_sub2 was added a
# pseudocount, and renormalised (it actually sums up to 1)
# remove the factor levels from the OTU
AT_sub2$OTU <- as.character(AT_sub2$OTU)
# Make a matrix out of k, where each colum are the ratios of the relative
# abundance of the respective OTU divided by the abudnances of the others
RatioMatrix_k <- sapply(1:nrow(AT_sub2), function(x) AT_sub2$k[x]/AT_sub2$k)
colnames(RatioMatrix_k) <- AT_sub2$OTU
rownames(RatioMatrix_k) <- AT_sub2$OTU
RatioMatrix_l <- sapply(1:nrow(AT_sub2), function(x) AT_sub2$l[x]/AT_sub2$l)
colnames(RatioMatrix_l) <- AT_sub2$OTU
rownames(RatioMatrix_l) <- AT_sub2$OTU
RatioCompareMatrix <- RatioMatrix_l/RatioMatrix_k
See who increased and decreased compared to most others
EqualProc <- 10
Min <- 1/(1+EqualProc/100)
Max <- 1/Min
DecreaseSum <- colSums(RatioCompareMatrix < Min)
DecreasePerc <- colMeans(RatioCompareMatrix < Min)
# FYI, could also be done with the cumulative distribution function,
# e.g. Distr <- RatioCompareMatrix[,2]
# ecdf(Distr)(.99)
# see https://stat.ethz.ch/pipermail/r-help/2012-March/305368.html
IncreaseSum <- colSums(RatioCompareMatrix > Max)
IncreasePerc <- colMeans(RatioCompareMatrix > Max)
#
# For the real data I decided to round
IncreasePerc <- round(IncreasePerc, 3)
DecreasePerc <- round(DecreasePerc, 3)
UnchangedPerc <- round(1-DecreasePerc-IncreasePerc, 3)
## For the Plotting
UpDownDF <- data.frame(OTU = AT_sub2$OTU, Decr = DecreasePerc,
Equal = UnchangedPerc, Incr = IncreasePerc)
UpDownDFLong <- gather(UpDownDF, Change, Perc, -OTU)
# for this you have to change the levels of the OTUs, I want to show the ones
# that change most first, so lowest equal, then highest increase to break ties
LevelsWant <- as.character(arrange(UpDownDF, desc(Incr), Equal)$OTU)
UpDownDFLong <- within(UpDownDFLong, OTU <- factor(OTU, levels = LevelsWant))
if (length(levels(UpDownDFLong$OTU)) < 20) {
size = 6
sizetextx = 18
} else if(( length(levels(UpDownDFLong$OTU)) > 19) &
(length(levels(UpDownDFLong$OTU)) < 100)) {
size = 4
sizetextx = 14
} else if (length(levels(UpDownDFLong$OTU)) > 99) {
size = 3
sizetextx = 6
}
Tr <- ggplot(UpDownDFLong, aes(x = OTU, y = Perc, fill = Change))
Tr + geom_bar(stat = 'identity') +
scale_fill_manual(values = c("Equal" = cbPalette[1],
"Incr" = cbPalette[2],
"Decr" = cbPalette[3])) +
theme_bw(base_size = 18) +
theme(panel.grid.minor = element_blank()) +
# removes all minor gridlines
theme(panel.grid.major = element_line(colour = cbPalette[1],
size = 0.2)) +
# sets the grid lines thicker and in a new colour
theme(panel.background = element_rect(colour = cbPalette[1],
size = 2)) +
# adjust colour and size of the surrounding box
theme(panel.grid.major.y = element_blank()) +
# removes the horisontal major grid lines +
theme(axis.text.x = element_text(angle = 90, vjust = .5, size = sizetextx, hjust = 1)) +
xlab("") +
ylab("Percentage of Comparisons to other OTUs")
## Warning in loop_apply(n, do.ply): position_stack requires constant width:
## output may be incorrect
Since barplots will be very difficult for real data, let’s do a dotplot as well
if (length(levels(UpDownDFLong$OTU)) < 20) {
size = 6
sizetextx = 18
} else if(( length(levels(UpDownDFLong$OTU)) > 19) &
(length(levels(UpDownDFLong$OTU)) < 100)) {
size = 4
sizetextx = 14
} else if (length(levels(UpDownDFLong$OTU)) > 99) {
size = 3
sizetextx = 6
}
Tr <- ggplot(UpDownDFLong, aes(x = OTU, y = Perc, color = Change))
Tr + geom_point(size = size) +
scale_color_manual(values = c("Equal" = cbPalette[1],
"Incr" = cbPalette[2],
"Decr" = cbPalette[3])) +
theme_bw(base_size = 18) +
theme(panel.grid.minor = element_blank()) +
# removes all minor gridlines
theme(panel.grid.major = element_line(colour = cbPalette[1],
size = 0.2)) +
# sets the grid lines thicker and in a new colour
theme(panel.background = element_rect(colour = cbPalette[1],
size = 2)) +
# adjust colour and size of the surrounding box
theme(panel.grid.major.y = element_blank()) +
# removes the horisontal major grid lines +
theme(axis.text.x = element_text(angle = 90, vjust = .5, size = sizetextx, hjust = 1)) +
xlab("") +
ylab("Percentage of Comparisons to other OTUs")
diag(RatioCompareMatrix) = NA
# RatioCompare <- data.frame(RatioCompareMatrix)
# RatioCompare$OTU <- AT_sub2$OTU
#
# Medians <- summarise_each(RatioCompare, funs(median(., na.rm = TRUE)),-OTU)
# Medians <- gather(Medians, OTU, Median)
# SDs <- summarise_each(RatioCompare, funs(sd(., na.rm = TRUE)),-OTU)
# SDs <- gather(SDs, OTU, SD)
#
# DFCompare <- data.frame(OTU = Medians$OTU, Median = Medians$Median,
# SD = SDs$SD, ES = AT_sub2$l-AT_sub2$k, MeanRA = rowMeans(cbind(AT_sub2$l, AT_sub2$k)))
#
# RatioCompareLong <- gather(RatioCompare, OTUNom, Ratio, -OTU)
###############################################################
###### Illustrate the Percentage Explained #####
###############################################################
# INPUT: usually PredReal_DistComp,
# DF <- DFCompare
# if (nrow(DF) < 20) {
# size = 6
# sizetextx = 18
# } else if((nrow(DF) > 19) & (nrow(DF) < 100)) {
# size = 4
# sizetextx = 14
# } else if (nrow(DF) > 99) {
# size = 3
# sizetextx = 6
# }
#
# # get the order of the OTU right. OTU is used as a factor and levels
# # in R are alphabetically. But you want the OTUs in the same order as
# # they are in DF (rownames)
# LevelsWant <- rev(as.character(DF$OTU)) #you had to reverse
# for (i in 1:length(LevelsWant)) {
# DF$OTU <- relevel(DF$OTU, ref = LevelsWant[i])
# }
#
# Tr <- ggplot(DF, aes(x = OTU, y = Median))
# Tr <- Tr +
# geom_hline(aes(yintercept = 1), colour = cbPalette[1], size = 1.2) +
# geom_point(size = size, colour = cbPalette[2]) +
# theme_bw(base_size = 18) +
# theme(panel.grid.minor = element_blank()) +
# # removes all minor gridlines
# theme(panel.grid.major = element_line(colour = cbPalette[1],
# size = 0.2)) +
# # sets the grid lines thicker and in a new colour
# theme(panel.background = element_rect(colour = cbPalette[1],
# size = 2)) +
# # adjust colour and size of the surrounding box
# theme(panel.grid.major.y = element_blank()) +
# # removes the horisontal major grid lines +
# theme(axis.text.x = element_text(angle = 90, vjust = .5, size = sizetextx, hjust = 1)) +
# xlab("") +
# ylab("Ratio Median")
#
# Tr
# Tr + scale_y_log10()
# Tops <- filter(DFCompare, Median>3)
# head(arrange(Tops, desc(Median)),10)
# head(arrange(DFCompare, Median),10)