source('Functions/Dist_Functs.R')
source('Functions/Sim_Functs.R')
source('Functions/Predict_Functs.R')
source('Functions/Illus_Functs.R')
source('Functions/SubComp_Functs.R')

Background Discretization ideas

you want to decide for each sample, whether an OTU is present or not
this with regard to possible compositional effects, i.e.
- the relative abundance of an OTU can be higher in a sample where it’s absolute abundance is actually lower.

Simulations

k <- c(10E8, 2.5E8, 2E8, 1E8, .2E8, 1E8, 1E8)
l <- c(8E8, 2.5E8, 2E8, 1E8, .2E8, 1E8, 1E8)

** FOR SURE the 0 removal isssue **

ATsi <- AT[,-1]

RatioMatrixes <- lapply( 1:nrow(ATsi), function(i) {
        Mat <- sapply(1:ncol(ATsi), function(x) ATsi[i,x]/ATsi[,x])
        colnames(Mat) <- names(ATsi)
        rownames(Mat) <- AT$OTU
        return(Mat)
        }
        )

names(RatioMatrixes) <- AT$OTU
       
# Matrixes saying if bigger or smaller than median compared to each OTU

BinMatrixes <- lapply( 1:length(RatioMatrixes), function(i) {
        
        Bin <- sapply(1:nrow(ATsi), function(x) {
                Mat <- RatioMatrixes[[i]]
                Mat[x,] >= .999*median(Mat[x,])})
        Bin <- t(Bin)
        colnames(Bin) <- names(ATsi)
        rownames(Bin) <- AT$OTU
        return(Bin)
        })

names(BinMatrixes) <- AT$OTU

# Now create one Matrix, again the columns represent the samples,
# Let's call the OTU of a row again the Owner OTU. 
# The number is the percentage of OTUs for which the ratio Owner OTU/ OTU was
# above median in this sample

AssessMatrix <- sapply(1:length(BinMatrixes), function(x) colMeans(BinMatrixes[[x]]))
AssessMatrix <- t(AssessMatrix)
rownames(AssessMatrix) <- AT$OTU

ATsi

##            k          l
## 1 0.56497175 0.50955414
## 2 0.14124294 0.15923567
## 3 0.11299435 0.12738854
## 4 0.05649718 0.06369427
## 5 0.01129944 0.01273885
## 6 0.05649718 0.06369427
## 7 0.05649718 0.06369427

AssessMatrix

##           k         l
## A 1.0000000 0.1428571
## B 0.8571429 1.0000000
## C 0.8571429 1.0000000
## D 0.8571429 1.0000000
## E 0.8571429 1.0000000
## F 0.8571429 1.0000000
## G 0.8571429 1.0000000

Real Data, selection of the Gastric Bypass Data

Loading and subsetting the data

## tried the Zeller Data Before, and maybe again t
# load("Zeller2014AbundTable_ATA.RData")
# AT <- Zeller2014AbundTable_ATA[,-2:-1]
# AT <- t(AT)
# AT <- as.data.frame(AT)

## if we wanted species level ########
if(file.exists("Data/mOTU.species.profile.tsv")){
        AT <- read.table("Data/mOTU.species.profile.tsv", header = TRUE)
}

# ### if we wanted genus level, weirdly even more OTUs ####
# if(file.exists("Data/mOTU.genus.profile.tsv")){
#         AT <- read.table("Data/mOTU.genus.profile.tsv", header = TRUE)
# }

# Data before and 3 months after Gastric Bypass, from the bypass_read_information
# file, I chose the following 10 samples
Before <- c('X9437', 'X9439', 'X9441', 'X9460', 'X9464')
After <- c('X9438', 'X9440', 'X9442', 'X9462', 'X9465')

names(AT)[1] <- "OTU"

# select the 10 samples as our Abundance Table to work with
AT_sub <- select(AT, OTU, one_of(Before), one_of(After))

# test if the relative abundances of each sample sum up to 1
summarise_each(AT_sub[,-1], funs(sum))

##       X9437 X9439     X9441 X9460 X9464     X9438 X9440 X9442 X9462
## 1 0.9999997     1 0.9999995     1     1 0.9999998     1     1     1
##      X9465
## 1 1.000001

# yes they practically do
# are there no unmapped, what is the max?
summarise_each(AT_sub[,-1], funs(max))

##       X9437     X9439     X9441     X9460     X9464    X9438     X9440
## 1 0.2279863 0.1519103 0.1243271 0.2340772 0.2739951 0.224817 0.3986044
##       X9442     X9462     X9465
## 1 0.1911199 0.1719057 0.1127358

# ranging from 11 to 40%

# what is the max OTU in each case?
WhichMax <- function(x) { AT_sub$OTU[which(x == max(x))]}
summarise_each(AT_sub[,-1], funs(WhichMax))

##                    X9437            X9439            X9441
## 1 motu_linkage_group_585 Prevotella_copri Prevotella_copri
##                        X9460                      X9464            X9438
## 1 Bacteroides_dorei/vulgatus Bacteroides_dorei/vulgatus Prevotella_copri
##              X9440            X9442                  X9462
## 1 Prevotella_copri Prevotella_copri motu_linkage_group_127
##              X9465
## 1 Escherichia_coli

# so different, interesting

Removing all zero OTUs, replacing all zeros with Pseudocount

## remove all OTUs that are 0 in all conditions, i.e. rowSum != 0
AT_sub <- AT_sub[which(rowSums(AT_sub[,-1]) != 0),]


## replace all other 0 with Pseudocount
# change all 0 to NA
AT_sub[AT_sub == 0] <- NA
# find the minimum of all other values
Pseudocount <- min(AT_sub[,-1], na.rm = TRUE)
# set all NA to this min = Psuedocount Value
AT_sub[is.na(AT_sub)] <- Pseudocount

# how did the pseudocount affect the colsums?
summarise_each(AT_sub[,-1], funs(sum))

##      X9437    X9439  X9441    X9460    X9464  X9438    X9440    X9442
## 1 1.010041 1.010408 1.0091 1.012604 1.014226 1.0091 1.011192 1.008421
##      X9462    X9465
## 1 1.009623 1.011455

# since they went a bit up, clear, I renormalise
Norm <- function(x) { x/sum(x)}
AT_sub <- mutate_each(AT_sub, funs(Norm), -OTU)

summarise_each(AT_sub, funs(sum), -OTU)

##   X9437 X9439 X9441 X9460 X9464 X9438 X9440 X9442 X9462 X9465
## 1     1     1     1     1     1     1     1     1     1     1

# ok all 1, AT_sub is our final AT to work with

Calculating the RatioMatrix and the AssessMatrix

ATsi <- AT_sub[,-1] # remove the OTU column
dim(ATsi)

## [1] 358  10

## the following produces a list with RatioMatrixes, for each OTU one RatioMatrix.
# let's call this OTU, the Owner OTU of the Matrix.
# each column of a RatioMatrix contains the relative abundance ratios of the 
# Owner OTU to all other OTUs in this sample. 
# Consequently, each row of a RatioMatrix contains the ratios of the Owner OTU to
# the OTU of that row over all samples

RatioMatrixes <- lapply( 1:nrow(ATsi), function(i) {
        Mat <- sapply(1:ncol(ATsi), function(x) ATsi[i,x]/ATsi[,x])
        colnames(Mat) <- names(ATsi)
        #rownames(Mat) <- AT$OTU
        return(Mat)})

# names(RatioMatrixes) <- AT$OTU
       
### The Big question is how to use these ratio distributions (they are
## independent of compositional effects, that is clear)

## The current idea: For decide based on the ratios for each OTU, in which
## samples the Owner OTU is above median or below >> BinMatrixes: Each row
## consists of 1 and 0 (T and F) showing if the Ratio Owner OTU/row OTU was 
## above median in this sample.
# So for each Owner OTU and sample you have a vector showing whether the Ratio
# Owner OTU/ row OTU was above median

BinMatrixes <- lapply( 1:length(RatioMatrixes), function(i) {
                Bin <- sapply(1:nrow(ATsi), function(x) {
                Mat <- RatioMatrixes[[i]]
                Mat[x,] >= .999*median(Mat[x,])})
        Bin <- t(Bin)
        colnames(Bin) <- names(ATsi)
        #rownames(Bin) <- AT$OTU
        return(Bin)})

# names(BinMatrixes) <- AT$OTU

# Now create one Matrix in which each entry reflects the percentage of the ratios
# of the OTU that is above Median for this sample

AssessMatrix <- sapply(1:length(BinMatrixes), function(x) colMeans(BinMatrixes[[x]]))
AssessMatrix <- t(AssessMatrix)
rownames(AssessMatrix) <- AT_sub$OTUs
colnames(AssessMatrix) <- names(ATsi)

head(ATsi)

##          X9437        X9439        X9441        X9460        X9464
## 1 6.900708e-05 1.840294e-03 4.169122e-04 0.0005068051 4.341115e-04
## 2 6.039357e-05 6.491894e-04 7.087506e-04 0.1215882638 1.973230e-04
## 3 5.178006e-05 5.176128e-05 5.182838e-05 0.0000516490 5.156644e-05
## 4 2.223939e-04 3.394882e-03 4.169122e-04 0.0002472002 1.243137e-03
## 5 1.683189e-04 1.551187e-02 2.855849e-03 0.0018270898 1.267802e-02
## 6 5.178006e-05 2.496883e-04 2.501477e-04 0.0001036644 5.156644e-05
##          X9438        X9440        X9442        X9462        X9465
## 1 3.070082e-04 1.254707e-04 3.265532e-04 2.179937e-03 3.801704e-03
## 2 2.570681e-03 1.380182e-03 3.158821e-04 2.797351e-02 5.170771e-05
## 3 5.182836e-05 5.172113e-05 5.186328e-05 7.032326e-05 5.170771e-05
## 4 8.488505e-04 2.174829e-04 5.817592e-04 9.067649e-04 5.689627e-04
## 5 4.502785e-04 3.677872e-03 2.266665e-03 7.032326e-05 3.168091e-04
## 6 6.666460e-03 5.172113e-05 8.259887e-04 1.617375e-04 5.340487e-04

head(AssessMatrix)

##           X9437     X9439     X9441     X9460     X9464     X9438
## [1,] 0.02793296 0.8575419 0.4022346 0.7486034 0.8128492 0.1703911
## [2,] 0.02793296 0.4162011 0.6312849 0.9944134 0.2094972 0.7821229
## [3,] 0.72346369 0.6480447 0.6312849 0.7178771 0.8016760 0.5782123
## [4,] 0.12849162 0.8966480 0.3156425 0.2039106 0.8407821 0.6675978
## [5,] 0.01955307 0.9441341 0.7290503 0.5335196 0.9413408 0.1648045
## [6,] 0.12569832 0.7262570 0.6592179 0.3407821 0.1871508 0.9469274
##          X9440     X9442      X9462      X9465
## [1,] 0.1201117 0.2094972 0.79888268 0.87150838
## [2,] 0.8463687 0.1229050 0.95530726 0.03351955
## [3,] 0.7709497 0.5670391 0.63687151 0.64245810
## [4,] 0.1284916 0.5837989 0.68156425 0.56703911
## [5,] 0.8631285 0.6340782 0.01955307 0.16480447
## [6,] 0.1480447 0.7737430 0.45251397 0.77932961

# Let's look at the Escherichias
apply(AssessMatrix[which(grepl('Escher', AT_sub$OTU)),], 1, range)

##             [,1]      [,2]
## [1,] 0.002793296 0.5670391
## [2,] 0.960893855 0.8016760

AssessMatrix[which(grepl('Escher', AT_sub$OTU)),]

##            X9437       X9439       X9441     X9460     X9464     X9438
## [1,] 0.002793296 0.002793296 0.008379888 0.8044693 0.1312849 0.8938547
## [2,] 0.723463687 0.648044693 0.631284916 0.7178771 0.8016760 0.5782123
##          X9440     X9442     X9462     X9465
## [1,] 0.9413408 0.3854749 0.8826816 0.9608939
## [2,] 0.7709497 0.5670391 0.6368715 0.6424581

ATsi[which(grepl('Escher', AT_sub$OTU)),]

##            X9437        X9439        X9441       X9460        X9464
## 108 6.900708e-05 5.176128e-05 1.459192e-04 0.006483753 3.946469e-04
## 111 5.178006e-05 5.176128e-05 5.182838e-05 0.000051649 5.156644e-05
##            X9438        X9440        X9442        X9462        X9465
## 108 4.258816e-02 4.739956e-02 4.663953e-03 3.975924e-02 1.114591e-01
## 111 5.182836e-05 5.172113e-05 5.186328e-05 7.032326e-05 5.170771e-05

150608_SimulationsKyla

Thorsten Brach

8 Jun 2015

Background Discretization ideas

Simulations

Real Data, selection of the Gastric Bypass Data

Loading and subsetting the data

Removing all zero OTUs, replacing all zeros with Pseudocount

Calculating the RatioMatrix and the AssessMatrix