source('Functions/Dist_Functs.R')
source('Functions/Sim_Functs.R')
source('Functions/Predict_Functs.R')
source('Functions/Illus_Functs.R')
source('Functions/SubComp_Functs.R')

Background Discretization ideas

Simulations

k <- c(10E8, 2.5E8, 2E8, 1E8, .2E8, 1E8, 1E8)
l <- c(8E8, 2.5E8, 2E8, 1E8, .2E8, 1E8, 1E8)

** FOR SURE the 0 removal isssue **

ATsi <- AT[,-1]

RatioMatrixes <- lapply( 1:nrow(ATsi), function(i) {
        Mat <- sapply(1:ncol(ATsi), function(x) ATsi[i,x]/ATsi[,x])
        colnames(Mat) <- names(ATsi)
        rownames(Mat) <- AT$OTU
        return(Mat)
        }
        )

names(RatioMatrixes) <- AT$OTU
       
# Matrixes saying if bigger or smaller than median compared to each OTU

BinMatrixes <- lapply( 1:length(RatioMatrixes), function(i) {
        
        Bin <- sapply(1:nrow(ATsi), function(x) {
                Mat <- RatioMatrixes[[i]]
                Mat[x,] >= .999*median(Mat[x,])})
        Bin <- t(Bin)
        colnames(Bin) <- names(ATsi)
        rownames(Bin) <- AT$OTU
        return(Bin)
        })

names(BinMatrixes) <- AT$OTU

# Now create one Matrix, again the columns represent the samples,
# Let's call the OTU of a row again the Owner OTU. 
# The number is the percentage of OTUs for which the ratio Owner OTU/ OTU was
# above median in this sample

AssessMatrix <- sapply(1:length(BinMatrixes), function(x) colMeans(BinMatrixes[[x]]))
AssessMatrix <- t(AssessMatrix)
rownames(AssessMatrix) <- AT$OTU

ATsi
##            k          l
## 1 0.56497175 0.50955414
## 2 0.14124294 0.15923567
## 3 0.11299435 0.12738854
## 4 0.05649718 0.06369427
## 5 0.01129944 0.01273885
## 6 0.05649718 0.06369427
## 7 0.05649718 0.06369427
AssessMatrix
##           k         l
## A 1.0000000 0.1428571
## B 0.8571429 1.0000000
## C 0.8571429 1.0000000
## D 0.8571429 1.0000000
## E 0.8571429 1.0000000
## F 0.8571429 1.0000000
## G 0.8571429 1.0000000

Real Data, selection of the Gastric Bypass Data

Loading and subsetting the data

## tried the Zeller Data Before, and maybe again t
# load("Zeller2014AbundTable_ATA.RData")
# AT <- Zeller2014AbundTable_ATA[,-2:-1]
# AT <- t(AT)
# AT <- as.data.frame(AT)

## if we wanted species level ########
if(file.exists("Data/mOTU.species.profile.tsv")){
        AT <- read.table("Data/mOTU.species.profile.tsv", header = TRUE)
}

# ### if we wanted genus level, weirdly even more OTUs ####
# if(file.exists("Data/mOTU.genus.profile.tsv")){
#         AT <- read.table("Data/mOTU.genus.profile.tsv", header = TRUE)
# }

# Data before and 3 months after Gastric Bypass, from the bypass_read_information
# file, I chose the following 10 samples
Before <- c('X9437', 'X9439', 'X9441', 'X9460', 'X9464')
After <- c('X9438', 'X9440', 'X9442', 'X9462', 'X9465')

names(AT)[1] <- "OTU"

# select the 10 samples as our Abundance Table to work with
AT_sub <- select(AT, OTU, one_of(Before), one_of(After))

# test if the relative abundances of each sample sum up to 1
summarise_each(AT_sub[,-1], funs(sum))
##       X9437 X9439     X9441 X9460 X9464     X9438 X9440 X9442 X9462
## 1 0.9999997     1 0.9999995     1     1 0.9999998     1     1     1
##      X9465
## 1 1.000001
# yes they practically do
# are there no unmapped, what is the max?
summarise_each(AT_sub[,-1], funs(max))
##       X9437     X9439     X9441     X9460     X9464    X9438     X9440
## 1 0.2279863 0.1519103 0.1243271 0.2340772 0.2739951 0.224817 0.3986044
##       X9442     X9462     X9465
## 1 0.1911199 0.1719057 0.1127358
# ranging from 11 to 40%

# what is the max OTU in each case?
WhichMax <- function(x) { AT_sub$OTU[which(x == max(x))]}
summarise_each(AT_sub[,-1], funs(WhichMax))
##                    X9437            X9439            X9441
## 1 motu_linkage_group_585 Prevotella_copri Prevotella_copri
##                        X9460                      X9464            X9438
## 1 Bacteroides_dorei/vulgatus Bacteroides_dorei/vulgatus Prevotella_copri
##              X9440            X9442                  X9462
## 1 Prevotella_copri Prevotella_copri motu_linkage_group_127
##              X9465
## 1 Escherichia_coli
# so different, interesting

Removing all zero OTUs, replacing all zeros with Pseudocount

## remove all OTUs that are 0 in all conditions, i.e. rowSum != 0
AT_sub <- AT_sub[which(rowSums(AT_sub[,-1]) != 0),]


## replace all other 0 with Pseudocount
# change all 0 to NA
AT_sub[AT_sub == 0] <- NA
# find the minimum of all other values
Pseudocount <- min(AT_sub[,-1], na.rm = TRUE)
# set all NA to this min = Psuedocount Value
AT_sub[is.na(AT_sub)] <- Pseudocount

# how did the pseudocount affect the colsums?
summarise_each(AT_sub[,-1], funs(sum))
##      X9437    X9439  X9441    X9460    X9464  X9438    X9440    X9442
## 1 1.010041 1.010408 1.0091 1.012604 1.014226 1.0091 1.011192 1.008421
##      X9462    X9465
## 1 1.009623 1.011455
# since they went a bit up, clear, I renormalise
Norm <- function(x) { x/sum(x)}
AT_sub <- mutate_each(AT_sub, funs(Norm), -OTU)

summarise_each(AT_sub, funs(sum), -OTU)
##   X9437 X9439 X9441 X9460 X9464 X9438 X9440 X9442 X9462 X9465
## 1     1     1     1     1     1     1     1     1     1     1
# ok all 1, AT_sub is our final AT to work with

Calculating the RatioMatrix and the AssessMatrix

ATsi <- AT_sub[,-1] # remove the OTU column
dim(ATsi)
## [1] 358  10
## the following produces a list with RatioMatrixes, for each OTU one RatioMatrix.
# let's call this OTU, the Owner OTU of the Matrix.
# each column of a RatioMatrix contains the relative abundance ratios of the 
# Owner OTU to all other OTUs in this sample. 
# Consequently, each row of a RatioMatrix contains the ratios of the Owner OTU to
# the OTU of that row over all samples

RatioMatrixes <- lapply( 1:nrow(ATsi), function(i) {
        Mat <- sapply(1:ncol(ATsi), function(x) ATsi[i,x]/ATsi[,x])
        colnames(Mat) <- names(ATsi)
        #rownames(Mat) <- AT$OTU
        return(Mat)})

# names(RatioMatrixes) <- AT$OTU
       
### The Big question is how to use these ratio distributions (they are
## independent of compositional effects, that is clear)

## The current idea: For decide based on the ratios for each OTU, in which
## samples the Owner OTU is above median or below >> BinMatrixes: Each row
## consists of 1 and 0 (T and F) showing if the Ratio Owner OTU/row OTU was 
## above median in this sample.
# So for each Owner OTU and sample you have a vector showing whether the Ratio
# Owner OTU/ row OTU was above median

BinMatrixes <- lapply( 1:length(RatioMatrixes), function(i) {
                Bin <- sapply(1:nrow(ATsi), function(x) {
                Mat <- RatioMatrixes[[i]]
                Mat[x,] >= .999*median(Mat[x,])})
        Bin <- t(Bin)
        colnames(Bin) <- names(ATsi)
        #rownames(Bin) <- AT$OTU
        return(Bin)})

# names(BinMatrixes) <- AT$OTU

# Now create one Matrix in which each entry reflects the percentage of the ratios
# of the OTU that is above Median for this sample

AssessMatrix <- sapply(1:length(BinMatrixes), function(x) colMeans(BinMatrixes[[x]]))
AssessMatrix <- t(AssessMatrix)
rownames(AssessMatrix) <- AT_sub$OTUs
colnames(AssessMatrix) <- names(ATsi)

head(ATsi)
##          X9437        X9439        X9441        X9460        X9464
## 1 6.900708e-05 1.840294e-03 4.169122e-04 0.0005068051 4.341115e-04
## 2 6.039357e-05 6.491894e-04 7.087506e-04 0.1215882638 1.973230e-04
## 3 5.178006e-05 5.176128e-05 5.182838e-05 0.0000516490 5.156644e-05
## 4 2.223939e-04 3.394882e-03 4.169122e-04 0.0002472002 1.243137e-03
## 5 1.683189e-04 1.551187e-02 2.855849e-03 0.0018270898 1.267802e-02
## 6 5.178006e-05 2.496883e-04 2.501477e-04 0.0001036644 5.156644e-05
##          X9438        X9440        X9442        X9462        X9465
## 1 3.070082e-04 1.254707e-04 3.265532e-04 2.179937e-03 3.801704e-03
## 2 2.570681e-03 1.380182e-03 3.158821e-04 2.797351e-02 5.170771e-05
## 3 5.182836e-05 5.172113e-05 5.186328e-05 7.032326e-05 5.170771e-05
## 4 8.488505e-04 2.174829e-04 5.817592e-04 9.067649e-04 5.689627e-04
## 5 4.502785e-04 3.677872e-03 2.266665e-03 7.032326e-05 3.168091e-04
## 6 6.666460e-03 5.172113e-05 8.259887e-04 1.617375e-04 5.340487e-04
head(AssessMatrix)
##           X9437     X9439     X9441     X9460     X9464     X9438
## [1,] 0.02793296 0.8575419 0.4022346 0.7486034 0.8128492 0.1703911
## [2,] 0.02793296 0.4162011 0.6312849 0.9944134 0.2094972 0.7821229
## [3,] 0.72346369 0.6480447 0.6312849 0.7178771 0.8016760 0.5782123
## [4,] 0.12849162 0.8966480 0.3156425 0.2039106 0.8407821 0.6675978
## [5,] 0.01955307 0.9441341 0.7290503 0.5335196 0.9413408 0.1648045
## [6,] 0.12569832 0.7262570 0.6592179 0.3407821 0.1871508 0.9469274
##          X9440     X9442      X9462      X9465
## [1,] 0.1201117 0.2094972 0.79888268 0.87150838
## [2,] 0.8463687 0.1229050 0.95530726 0.03351955
## [3,] 0.7709497 0.5670391 0.63687151 0.64245810
## [4,] 0.1284916 0.5837989 0.68156425 0.56703911
## [5,] 0.8631285 0.6340782 0.01955307 0.16480447
## [6,] 0.1480447 0.7737430 0.45251397 0.77932961
# Let's look at the Escherichias
apply(AssessMatrix[which(grepl('Escher', AT_sub$OTU)),], 1, range)
##             [,1]      [,2]
## [1,] 0.002793296 0.5670391
## [2,] 0.960893855 0.8016760
AssessMatrix[which(grepl('Escher', AT_sub$OTU)),]
##            X9437       X9439       X9441     X9460     X9464     X9438
## [1,] 0.002793296 0.002793296 0.008379888 0.8044693 0.1312849 0.8938547
## [2,] 0.723463687 0.648044693 0.631284916 0.7178771 0.8016760 0.5782123
##          X9440     X9442     X9462     X9465
## [1,] 0.9413408 0.3854749 0.8826816 0.9608939
## [2,] 0.7709497 0.5670391 0.6368715 0.6424581
ATsi[which(grepl('Escher', AT_sub$OTU)),]
##            X9437        X9439        X9441       X9460        X9464
## 108 6.900708e-05 5.176128e-05 1.459192e-04 0.006483753 3.946469e-04
## 111 5.178006e-05 5.176128e-05 5.182838e-05 0.000051649 5.156644e-05
##            X9438        X9440        X9442        X9462        X9465
## 108 4.258816e-02 4.739956e-02 4.663953e-03 3.975924e-02 1.114591e-01
## 111 5.182836e-05 5.172113e-05 5.186328e-05 7.032326e-05 5.170771e-05