require(dplyr)
require(ggplot2)
require(RColorBrewer)
require(reshape2)
require(tidyr)
require(igraph) # for the clustering analysis
# require(compositions) # needed for the clr function
## the compositions package is only needed if you want to use Lovells functions (mainly if you want to calculate the logratio variance)

source(file = "../Functions/ProportionalityFuncs.R")
source(file = "../Functions/MemoryFuncs.R")

Background

You find a lot of background on the equations in 150618_JustThetaAndMore and also in 150521_LovelletAl2015_OwnReproduction
- and of course in the long supplement of the Lovelletal paper
It was confusing how the clr transformation is handled in the paper, initially I thought when he writes clr(X) he means you had to take the geometric mean for each OTU over each sample (so the rows of an AT), but this is not the case. Truth is: it is dividing the compositions (samples) by their geometric mean. Then you log.

Getting the Data from Marguerat et al 2012, i.e. preparing the AT (relative Data). Here you also have an absolute AT as comparison.

I did not follow and question all steps here (such as removing not complete cases), important is, in the end of the Getting the Data section you have:

Abs: the AT of absolute data (samples as columns)
Abs.t: transposed absolute AT
AT: the relative AT as you will have normally (later AT_T) will be produced too

RNA.seq           <- read.csv("./data/RNA.seq.csv",        header=T, skip=1)
microarray        <- read.csv("./data/microarray.csv",     header=T, skip=3)
names(microarray) <- sub("T", "timepoint", names(microarray))
go                <- read.csv("./data/Complex_annotation", header=T, sep="\t")
names(go)[6]      <- "Systematic.name"


tmp <- data.frame(Systematic.name=RNA.seq$Systematic.name,
                  RNA.seq=rowSums(
                    RNA.seq[,grep("MM[12].*cpc.*", names(RNA.seq))],
                    na.rm=TRUE
                    )/2
                  )

# Drop any mRNAs that have a zero count in the RNA-seq
tmp <- subset(tmp, RNA.seq > 0)

# Do an inner join of Abs and the microarray data based on the Systematic names
tmp <- merge(tmp, microarray, by="Systematic.name")

# Now use the relative abundances at each microarray timepoint to multiply
# the initial mRNA copies per cell. Remove any rows that contain NAs
multipliers   <- as.matrix(tmp[, grep("timepoint", names(tmp))])
Abs           <- data.frame(tmp$RNA.seq * multipliers)
rownames(Abs) <- tmp[,"Systematic.name"]
Abs           <- na.omit(Abs) # nice function to keep only complete cases
# Remember the convention, Abs has now the samples/timepoints as columns
Abs.t         <- as.data.frame(t(Abs))
# neither Abs nor Abs.t is tidy, an observation would be the abundance of an mRNA at a timepoint

Rel   <- sweep(Abs,2,colSums(Abs, na.rm=TRUE),"/")
AT <- Rel

Calculating Theta (Phi)

My way, without composition package, without logratio variances, including inversely proportionals

If you want logratio variance, just uncomment. A bit on the procedure and what I changed:

clr: centered logratio is to make the theta values comparable
- you divide the compositions by their geometric mean
- then you take the log for each value in the AT
- NB!: the logratio variance (not calculated here unless uncommented) is unaffected by dividing the compositions/samples by their geometric mean. If you want to calculate it, it is just var(X/Y), X from AT_clr, because you already took the log in the clr equation. So either var(log(X/Y)), X from AT or AT geometric mean adjusted, or var(X/Y), X from AT_clr
AT.sma: super efficient calculation of b the SMA slope and the cor that you need to calculate theta
You see two Theta equations:
- Theta: is the one Lovell implements via logratio variance/var(clr(X))
- ThetaAbs: is what you would get after his equation, and the one that also detects inversely proprotional OTUs/mRNAs in the relative data. The values for proporitonals are absolutely the same. Try: Theta/ThetaAbs if you want to test.
- So I would always use ThetaAbs, the inversely proportionals you can always trash later.

# ptm <- proc.time() # in case you want to time

OTUs <- rownames(AT) # or if there is an OTU column you have to remove it
AT.clr <-  mutate_each(AT, funs(clrSelf))
rownames(AT.clr) <- OTUs
AT.clr_t <- as.data.frame(t(AT.clr))

AT.sma <- sma.df.Short(AT.clr_t) 
# sma because it beta is the standardised major axis slope!

# # ## IF YOU ALSO WANT THE P VALUE OF THE SLOPE, YOU HAVE TO USE
# AT.sma <- sma.df(AT.clr_t)
# AT.sma$corre <- sign(AT.sma$b)*sqrt(AT.sma$r2)
#  

Thetas <- 1 + (AT.sma$b)^2 - 2*abs(AT.sma$b)*AT.sma$corre
ThetasAbs <- 1 + (AT.sma$b)^2 -2*AT.sma$b*AT.sma$corre
# which is the same as
# ThetaAbs2 <- 1 + (AT.sma$b)^2 -2*abs(AT.sma$b)*abs(AT.sma$corre)

# ## IF YOU WANT LOGRATIO VARIANCES, YOU SHOULD USE THE COMPOSITIONS PACKAGE
# # unless you figure out how to vectorize (sapply sucked in time, see
# # 150618_JustThetaAndMore)
# # LOOK up the vlr function, I assume no zeros allowed!
# LogVarRatios <- variation(acomp(t(AT))) 

# proc.time() - ptm # timer off

Finding all OTU pairs with theta < .05, defined as the highly proportionals and inversely proportionals (saved in AT.sma.lo.phi).

He just looks at the lower triangle of the theta matrix, though it is not symmetric. In principle it is still fine, because for the proportional ones theta is almost symmetric. You will see, I anyway considered all the data, I copied his approach with the lt (lower triangle) and added the upper triangle (simply by transposing the lt). So that way I detect pairs in which theta is below 0.05 only for the X/Y or Y/X combination.

Getting all the data in a large data.frame showing b, correlation, and the two thetas for each OTU pair. Filtering it for the proportionals.

lt     <- which(col(AT.sma$b)<row(AT.sma$b),arr.ind=FALSE)
# col(matrix) just puts the numbers of the column in the matrix
# try:
# x <- matrix(1:9,3,3)
# col(x)
# which(col(x)<row(x))
# which(col(x)<row(x), arr.ind = T)
# so lt is just an integer vector referring to the lower left triangle of
# the Theta, or b matrix
lt.ind             <- which(col(AT.sma$b)<row(AT.sma$b),arr.ind=TRUE)



# now he composes a large data frame

# I find growing the dataframe incrementally helps, as does getting rid 
# of any objects that aren't used from here on in
AT.sma.df         <- data.frame(
  row=factor(rownames(AT.sma$b)[lt.ind[,"row"]]),
  col=factor(colnames(AT.sma$b)[lt.ind[,"col"]])
  )
AT.sma.df$b       <- AT.sma$b[lt] # stores all b values of the lower triangle
#AT.sma.df$p       <- AT.sma$p[lt]
#AT.sma.df$r2      <- AT.sma$r2[lt]
#AT.sma.df$vlr     <- AT.vlr[lt]
AT.sma.df$cor     <- AT.sma$corre[lt]
AT.sma.df$theta <- ThetasAbs[lt]
AT.sma.df$thetaUT <- t(ThetasAbs)[lt]



rm(AT.sma, lt, lt.ind)

AT.sma.lo.phi <- filter(AT.sma.df, (theta <  0.05) | (thetaUT < 0.05))

I love the memory function, see the source

lsos()

##                     Type      Size    Rows Columns
## AT.sma.df     data.frame 184068360 4591965       6
## Thetas            matrix  73884104    3031    3031
## ThetasAbs         matrix  73884104    3031    3031
## Abs           data.frame    848320    3031      16
## AT.clr_t      data.frame    729024      16    3031
## Abs.t         data.frame    729024      16    3031
## AT            data.frame    584304    3031      16
## AT.clr        data.frame    584304    3031      16
## AT.sma.lo.phi data.frame    410920     529       6
## OTUs           character    194024    3031      NA

#AT.sma.df is huge, so might want to 
rm(AT.sma.df)

Splitting into proptionals and inversely proportionals. Plotting the rel abundances of the proportional and inversely proportional pairs.

these plots will give you an impression whether the pairs you found in the relative data are really proporional or inversely proportional

NB: the algorithm (theta equation) is based on clr data, so log data. It makes use of the fact that both proportional and inversely proprotional rel. abundance data after log transformation lies on linear lines with slope 1 or -1 respectively. I.e. slope (beta) = 1 and -1, and also cor is 1 and -1, respectively.

Short proof for the inversely proportional data (for the proportional I have done it elsewhere and it is easier.)

\[ X*Y = C ~~~~ C = \text{constant}\] \[ Y = 1/X * C \] \[ \log{Y} = \log{1/X} + \log{C} \] \[ \log{Y} = \log{C} + (-1)*\log{X}\] so when X and Y are inversely proporitonal their logarithms are on a straight line with slope -1 and intercept \(\log{C}\).

So if you would find pairs in the relative data that are not well proportional or inversely proportional something would be wrong with the algorithm

Very unlikely

## Separation into proportional and inverse proportional

AT.sma.lo.phi.pro <- filter(AT.sma.lo.phi, b >0)
AT.sma.lo.phi.inv <- filter(AT.sma.lo.phi, b <0)


# How many OTUs are in proportional associations?
# you could use the stack function on AT.sma.lo.phi, but because it ignores
# factors, you had to use as character first, so it is easier to use
# the following my.pairs
my.pairs <- cbind(as.character(AT.sma.lo.phi$row), as.character(AT.sma.lo.phi$col)) # if you want to do it with select, make SURE you change to character.

my.pairs.pro <- cbind(as.character(AT.sma.lo.phi.pro$row), as.character(AT.sma.lo.phi.pro$col))
my.pairs.inv <- cbind(as.character(AT.sma.lo.phi.inv$row), as.character(AT.sma.lo.phi.inv$col))

length(unique(c(my.pairs))) # tells you how many OTUs are in at least one proportional association,

## [1] 267

AT_t <- as.data.frame(t(AT))

# # plot all:
# if (nrow(my.pairs) > 0) {
#         my.data <- stack.pairs(AT_t, my.pairs)
#         # already a nice long format data.frame
#         my.data$timepoint <- factor(rep(1:ncol(AT),nrow(my.pairs)))
#         ggplot(data=my.data, aes(x=x, y=y, group=group)) +
#                 geom_path(aes(colour=group)) +
#                 labs(x=expression(OTU[1]), y=expression(OTU[2])) + 
#                 #scale_colour_discrete(name=expression(paste(OTU[1], " vs.", OTU[2]))) +
#                 theme(legend.position="none") 
#         
#         ggplot(data=my.data, aes(x=x, y=y, group=group)) +
#                 geom_path(aes(colour=group)) +
#                 labs(x=expression(mRNA[1]), y=expression(mRNA[2])) + 
#                 #scale_colour_discrete(name=expression(paste(mRNA[1], " vs.", mRNA[2]))) +
#                 theme(legend.position="none") + 
#                 scale_x_log10() +
#                 scale_y_log10() +
#                 coord_equal()
# }

# plot only proportionals:
if (nrow(my.pairs.pro) > 0) {
        my.data.pro <- stack.pairs(AT_t, my.pairs.pro)
        # already a nice long format data.frame
        my.data.pro$timepoint <- factor(rep(1:ncol(AT),nrow(my.pairs.pro)))
        Tr <- ggplot(data=my.data.pro, aes(x=x, y=y, group=group)) +
                geom_path(aes(colour=group)) +
                labs(x=expression(OTU[1]), y=expression(OTU[2])) + 
                #scale_colour_discrete(name=expression(paste(OTU[1], " vs.", OTU[2]))) +
                theme(legend.position="none") 
        
        Tr1 <- ggplot(data=my.data.pro, aes(x=x, y=y, group=group)) +
                geom_path(aes(colour=group)) +
                labs(x=expression(mRNA[1]), y=expression(mRNA[2])) + 
                #scale_colour_discrete(name=expression(paste(mRNA[1], " vs.", mRNA[2]))) +
                theme(legend.position="none") + 
                scale_x_log10() +
                scale_y_log10() +
                coord_equal()
}

Tr

Tr1

# plot only inversely proportionals:
if (nrow(my.pairs.inv) > 0) {
        my.data.inv <- stack.pairs(AT_t, my.pairs.inv)
        # already a nice long format data.frame
        my.data.inv$timepoint <- factor(rep(1:ncol(AT),nrow(my.pairs.inv)))
        Tr <- ggplot(data=my.data.inv, aes(x=x, y=y, group=group)) +
                geom_path(aes(colour=group)) +
                labs(x=expression(OTU[1]), y=expression(OTU[2])) + 
                #scale_colour_discrete(name=expression(paste(OTU[1], " vs.", OTU[2]))) +
                theme(legend.position="none") 
        
        Tr1 <- ggplot(data=my.data.inv, aes(x=x, y=y, group=group)) +
                geom_path(aes(colour=group)) +
                labs(x=expression(mRNA[1]), y=expression(mRNA[2])) + 
                #scale_colour_discrete(name=expression(paste(mRNA[1], " vs.", mRNA[2]))) +
                theme(legend.position="none") + 
                scale_x_log10() +
                scale_y_log10() +
                coord_equal()
}

Tr

Tr1

Finding clusters of proportional and inversely proportional OTUs and plotting their relative abundances (in comparison to their absolute abundances)

You found pairs of proportional and inversely proporitonal OTUs, They might be clustered (OTU1 is prop to OTU2, OTU2 also to OTU4 and so on).

The igraph package allows you to find these clusters, I did not work myself into package, so it is mostly copied. It works fine

Both proportional and inversely propotionals possibly in one cluster

Here I only plot the relative abundances for clusters with a minimum cluster size. So again, these will be proportional or inversely proportional otherwise the algorithm would be wrong.

#needs the igraph package

set.seed(2) # there is some randomness in the plot
g  <- graph.data.frame(AT.sma.lo.phi, directed=FALSE)
# NB: the function only uses the row and col column, but you should keep at least theta, see in the following commented out plot there is some weighting with it.

# # a plot that would simply show you clusters, not saying much.
# plot(
#   g, 
#   layout=layout.fruchterman.reingold.grid(g, weight=0.05/E(g)$theta), 
#   vertex.size=1, 
#   vertex.color="black", 
#   vertex.label=NA
#   )


g.clust     <- clusters(g) # also from the igraph package
#g.clust is a list with the entries membership, csize and no

g.df        <- data.frame(
  Name=V(g)$name, 
  cluster=g.clust$membership, 
  cluster.size=g.clust$csize[g.clust$membership]
  )
# g.df is a data frame, it contains first the names of the OTUs that are in a proportional association 
# (so nrow(g.df) should correspond with the number from above), in which cluster they are and how many OTUs are in that cluster.
# it might be worth being printed

#g.df
# save uncut g.df for autohpagy research

MinClusterSize <- 8
g.df <- filter(g.df, cluster.size > MinClusterSize-1)

ClusterToPlot <- unique(g.df$cluster)

TrList <- list()
counter = 0
for (i in ClusterToPlot) {
        
        counter <- counter + 1
        ClOTUs <- as.character(g.df$Name[g.df$cluster == i])
        dfPlot <- stack(AT_t, select=ClOTUs)
        colnames(dfPlot) <- c("relAbund", 'OTU')
        dfPlot$Sample <- rep(1:ncol(AT),g.df$cluster.size[g.df$cluster ==i][1])
        Tr <- ggplot(dfPlot, aes(x = Sample, y = relAbund, colour = OTU)) +
                geom_path() +
                scale_y_log10()
        TrList[[counter]] = Tr
        
}

TrList

## [[1]]

## 
## [[2]]

## 
## [[3]]

Only the proportionals

Here I also plot in comparison to the absolute abundances. Should be no problem for absolute proportionals. And I plot each cluster twice, second time log(Abundance).

set.seed(2) # there is some randomness in the plot
g.pro  <- graph.data.frame(AT.sma.lo.phi.pro, directed=FALSE)

g.pro.clust     <- clusters(g.pro) 
g.pro.df        <- data.frame(
  Name=V(g.pro)$name, 
  cluster=g.pro.clust$membership, 
  cluster.size=g.pro.clust$csize[g.pro.clust$membership]
  )

head(g.pro.df,15)

##             Name cluster cluster.size
## 1    SPBC543.02c       1            3
## 2  SPAC22H12.04c       2          116
## 3    SPBC19F8.08       2          116
## 4   SPBC21C3.15c       3            5
## 5    SPAC6B12.09       2          116
## 6  SPCPB16A4.04c       2          116
## 7     SPAC1F7.12       4            2
## 8    SPAC3H1.06c       5            2
## 9   SPAC18G6.14c       2          116
## 10   SPAC31G5.03       2          116
## 11   SPAC3A12.10       2          116
## 12    SPAC521.05       2          116
## 13    SPAC644.15       2          116
## 14   SPAC6G9.09c       2          116
## 15   SPAC9G1.03c       2          116

MinClusterSize <- 8
g.pro.df <- filter(g.pro.df, cluster.size > MinClusterSize-1)

ClusterToPlot <- unique(g.pro.df$cluster)

TrproList <- list()
counter = 0
for (i in ClusterToPlot) {
        
        counter <- counter + 1
        ClOTUs <- as.character(g.pro.df$Name[g.pro.df$cluster == i])
        dfPlot <- stack(AT_t, select=ClOTUs)
        dfPlot$Sample <- rep(1:ncol(AT),g.pro.df$cluster.size[g.pro.df$cluster ==i][1])
        dfPlot$Type <- 'relative'
        dfPlotAbs <- stack(Abs.t, select=ClOTUs)
        dfPlotAbs$Sample <- rep(1:ncol(AT),g.pro.df$cluster.size[g.pro.df$cluster ==i][1])
        dfPlotAbs$Type <- 'absolute'
        dfPlot <- rbind(dfPlot, dfPlotAbs)
        colnames(dfPlot)[1:2] <- c("Abundance", 'OTU')
        
        Tr <- ggplot(dfPlot, aes(x = Sample, y = Abundance, colour = OTU)) +
                geom_path() +
                facet_grid(Type ~., scales = "free")
        TrproList[[counter]] = Tr
        counter = counter + 1
        Tr <- Tr + scale_y_log10()
        TrproList[[counter]] = Tr
        
}

TrproList

## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

In addition I plot the clusters in association, i.e. the proportional pairs against each other as above for the entire data, but here in comparison to the absolute data

TrproList <- list()
CorProList <- list()
counter = 0
for (i in ClusterToPlot) {
        counter = counter + 1
        ClOTUs <- as.character(g.pro.df$Name[g.pro.df$cluster == i])
        SelectPairs <- matrix(my.pairs.pro %in% ClOTUs, nrow(my.pairs.pro),
                             ncol(my.pairs.pro))
        # Seems one would have been enough, row or columns
        SelectPairs <- my.pairs.pro[SelectPairs[,1] == TRUE & SelectPairs[,2] == TRUE,]

        my.data.pro <- stack.pairs(AT_t, SelectPairs)
        # already a nice long format data.frame
        my.data.pro$timepoint <- factor(rep(1:ncol(AT),nrow(SelectPairs)))
        my.data.pro$Type <- 'relative'
        my.data.pro.abs <- stack.pairs(Abs.t, SelectPairs)
        my.data.pro.abs$timepoint <- factor(rep(1:ncol(AT),nrow(SelectPairs)))
        my.data.pro.abs$Type <- 'absolute'
        my.data.pro <- rbind(my.data.pro, my.data.pro.abs)
        Tr <- ggplot(data=my.data.pro, aes(x=x, y=y, colour=group)) +
                geom_path() +
                labs(x=expression(OTU[1]), y=expression(OTU[2])) + 
                facet_wrap(~ Type, scales = "free") +
                theme(legend.position="none") 
        TrproList[[counter]] <- Tr
        
        CorRel <- numeric(nrow(SelectPairs))
        CorAbs <- numeric(nrow(SelectPairs))
        for (e in 1:nrow(SelectPairs)) {
                CorRel[e] <- cor(AT_t[,colnames(AT_t)==SelectPairs[e,1]],
                                 AT_t[,colnames(AT_t)==SelectPairs[e,2]])
                CorAbs[e] <- cor(Abs.t[,colnames(AT_t)==SelectPairs[e,1]],
                                 Abs.t[,colnames(AT_t)==SelectPairs[e,2]])
        }
       CorProList[[counter]] <- data.frame(row = SelectPairs[,1],
                                           col = SelectPairs[,2],
                                           Rel = CorRel,
                                           Abs = CorAbs)
        
        counter <- counter + 1
       
        Tr <- Tr + 
                scale_x_log10() +
                scale_y_log10() +
                coord_equal()
        TrproList[[counter]] <- Tr
       
       CorRel <- numeric(nrow(SelectPairs))
        CorAbs <- numeric(nrow(SelectPairs))
        for (e in 1:nrow(SelectPairs)) {
                CorRel[e] <- cor(log(AT_t[,colnames(AT_t)==SelectPairs[e,1]]),
                                 log(AT_t[,colnames(AT_t)==SelectPairs[e,2]]))
                CorAbs[e] <- cor(log(Abs.t[,colnames(AT_t)==SelectPairs[e,1]]),
                                 log(Abs.t[,colnames(AT_t)==SelectPairs[e,2]]))
        }
       CorProList[[counter]] <- data.frame(row = SelectPairs[,1],
                                           col = SelectPairs[,2],
                                           Rel = CorRel,
                                           Abs = CorAbs)

        }

for (i in 1:length(TrproList)) {
        print(TrproList[[i]])
        print(head(CorProList[[i]],10))
}

##              row          col       Rel       Abs
## 1  SPAC22H12.04c SPAC1071.07c 0.9704640 0.9928387
## 2    SPBC19F8.08 SPAC1071.07c 0.9852984 0.9983288
## 3    SPAC6B12.09  SPAC10F6.10 0.9692772 0.9953621
## 4  SPCPB16A4.04c  SPAC10F6.10 0.9696391 0.9974263
## 5   SPAC18G6.14c SPAC13G6.02c 0.9682165 0.9933528
## 6    SPAC31G5.03 SPAC13G6.02c 0.9578292 0.9947453
## 7    SPAC3A12.10 SPAC13G6.02c 0.9894862 0.9982237
## 8     SPAC521.05 SPAC13G6.02c 0.9825753 0.9990985
## 9     SPAC644.15 SPAC13G6.02c 0.9923702 0.9991513
## 10   SPAC6G9.09c SPAC13G6.02c 0.9306675 0.9955962

##              row          col       Rel       Abs
## 1  SPAC22H12.04c SPAC1071.07c 0.9861254 0.9945893
## 2    SPBC19F8.08 SPAC1071.07c 0.9793847 0.9938050
## 3    SPAC6B12.09  SPAC10F6.10 0.9806938 0.9846665
## 4  SPCPB16A4.04c  SPAC10F6.10 0.9767767 0.9799772
## 5   SPAC18G6.14c SPAC13G6.02c 0.9806453 0.9903803
## 6    SPAC31G5.03 SPAC13G6.02c 0.9797884 0.9887802
## 7    SPAC3A12.10 SPAC13G6.02c 0.9783663 0.9890688
## 8     SPAC521.05 SPAC13G6.02c 0.9846948 0.9895258
## 9     SPAC644.15 SPAC13G6.02c 0.9888088 0.9855606
## 10   SPAC6G9.09c SPAC13G6.02c 0.9743296 0.9930005

##              row           col       Rel       Abs
## 1  SPAC2E1P3.03c  SPAC19D5.09c 0.9983848 0.9997834
## 2       SPAC9.04  SPAC19D5.09c 0.9987617 0.9998971
## 3    SPBC1289.17  SPAC19D5.09c 0.9993006 0.9998295
## 4     SPBC1E8.04  SPAC19D5.09c 0.9994121 0.9997411
## 5   SPCC1494.11c  SPAC19D5.09c 0.9991882 0.9999118
## 6       SPAC9.04 SPAC2E1P3.03c 0.9991236 0.9997372
## 7    SPBC1289.17 SPAC2E1P3.03c 0.9984798 0.9999075
## 8     SPBC1E8.04 SPAC2E1P3.03c 0.9983695 0.9998668
## 9   SPCC1494.11c SPAC2E1P3.03c 0.9990796 0.9998995
## 10  SPNCRNA.1056 SPAC2E1P3.03c 0.9981839 0.9994390

##              row           col       Rel       Abs
## 1  SPAC2E1P3.03c  SPAC19D5.09c 0.9910678 0.9963638
## 2       SPAC9.04  SPAC19D5.09c 0.9934031 0.9970653
## 3    SPBC1289.17  SPAC19D5.09c 0.9953342 0.9982706
## 4     SPBC1E8.04  SPAC19D5.09c 0.9961045 0.9982650
## 5   SPCC1494.11c  SPAC19D5.09c 0.9939850 0.9975420
## 6       SPAC9.04 SPAC2E1P3.03c 0.9955767 0.9979794
## 7    SPBC1289.17 SPAC2E1P3.03c 0.9918664 0.9966562
## 8     SPBC1E8.04 SPAC2E1P3.03c 0.9921696 0.9966259
## 9   SPCC1494.11c SPAC2E1P3.03c 0.9953442 0.9980359
## 10  SPNCRNA.1056 SPAC2E1P3.03c 0.9843470 0.9949199

Finally the inversely proportional

Here it is getting exciting, does it make sense to look for invesely propotionals in relative abundacne data, is there a correlation in the absolute abundance data at all???

I fear not

set.seed(2) # there is some randomness in the plot
g.inv  <- graph.data.frame(AT.sma.lo.phi.inv, directed=FALSE)

# # a plot that would simply show you clusters, not saying much.
# plot(
#   g.inv, 
#   layout=layout.fruchterman.reingold.grid(g.inv, weight=0.05/E(g.inv)$thetaAbs), 
#   vertex.size=1, 
#   vertex.color="black", 
#   vertex.label=NA
#   )


g.inv.clust     <- clusters(g.inv) # also from the igraph package
#g.inv.clust is a list with the entries membership, csize and no

g.inv.df        <- data.frame(
  Name=V(g.inv)$name, 
  cluster=g.inv.clust$membership, 
  cluster.size=g.inv.clust$csize[g.inv.clust$membership]
  )
# g.df is a data frame, it contains first the names of the OTUs that are in a proportional association (should correspond with the number from above), in which cluster they are and how many OTUs are in that cluster.
# it is worth being printed
head(g.inv.df,15)

##            Name cluster cluster.size
## 1   SPAC27E2.09       1            2
## 2   SPAC4F10.20       2            2
## 3    SPAC6G9.08       3            2
## 4   SPCC576.15c       4            2
## 5   SPNCRNA.727       5            3
## 6   SPSNORNA.10       5            3
## 7   SPNCRNA.491       6            4
## 8   SPSNORNA.54       6            4
## 9   SPBC543.02c       7            3
## 10  SPCC306.09c       8            2
## 11 SPBC28F2.04c       9            2
## 12  SPBC24C6.02      10            2
## 13 SPBP16F5.03c      11            2
## 14  SPNCRNA.445      12            2
## 15  SPBC2A9.05c      13            2

MinClusterSize <- 4
g.inv.df <- filter(g.inv.df, cluster.size > MinClusterSize-1)

ClusterToPlot <- unique(g.inv.df$cluster)

TrInvList <- list()
counter = 0
for (i in ClusterToPlot) {
        
        counter <- counter + 1
        ClOTUs <- as.character(g.inv.df$Name[g.inv.df$cluster == i])
        dfPlot <- stack(AT_t, select=ClOTUs)
        dfPlot$Sample <- rep(1:ncol(AT),g.inv.df$cluster.size[g.inv.df$cluster ==i][1])
        dfPlot$Type <- 'relative'
        dfPlotAbs <- stack(Abs.t, select=ClOTUs)
        dfPlotAbs$Sample <- rep(1:ncol(AT),g.inv.df$cluster.size[g.inv.df$cluster ==i][1])
        dfPlotAbs$Type <- 'absolute'
        dfPlot <- rbind(dfPlot, dfPlotAbs)
        colnames(dfPlot)[1:2] <- c("Abundance", 'OTU')
        
        Tr <- ggplot(dfPlot, aes(x = Sample, y = Abundance, colour = OTU)) +
                geom_path() +
                facet_grid(Type ~., scales = "free")
        TrInvList[[counter]] = Tr
        counter = counter + 1
        Tr <- Tr + scale_y_log10()
        TrInvList[[counter]] = Tr
         
        
}

TrInvList

## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

TrInvList <- list()
CorInvList <- list()
counter = 0
for (i in ClusterToPlot) {
        counter = counter + 1
        ClOTUs <- as.character(g.inv.df$Name[g.inv.df$cluster == i])
        SelectPairs <- matrix(my.pairs.inv %in% ClOTUs, nrow(my.pairs.inv),
                             ncol(my.pairs.inv))
        # Seems one would have been enough, row or columns
        SelectPairs <- my.pairs.inv[SelectPairs[,1] == TRUE & SelectPairs[,2] == TRUE,]

        my.data.Inv <- stack.pairs(AT_t, SelectPairs)
        # already a nice long format data.frame
        my.data.Inv$timepoint <- factor(rep(1:ncol(AT),nrow(SelectPairs)))
        my.data.Inv$Type <- 'relative'
        my.data.Inv.abs <- stack.pairs(Abs.t, SelectPairs)
        my.data.Inv.abs$timepoint <- factor(rep(1:ncol(AT),nrow(SelectPairs)))
        my.data.Inv.abs$Type <- 'absolute'
        my.data.Inv <- rbind(my.data.Inv, my.data.Inv.abs)
        Tr <- ggplot(data=my.data.Inv, aes(x=x, y=y, colour=group)) +
                geom_path() +
                labs(x=expression(OTU[1]), y=expression(OTU[2])) + 
                facet_wrap(~ Type, scales = "free") +
                theme(legend.position="none") 
        TrInvList[[counter]] <- Tr
        
        CorRel <- numeric(nrow(SelectPairs))
        CorAbs <- numeric(nrow(SelectPairs))
        for (e in 1:nrow(SelectPairs)) {
                CorRel[e] <- cor(AT_t[,colnames(AT_t)==SelectPairs[e,1]],
                                 AT_t[,colnames(AT_t)==SelectPairs[e,2]])
                CorAbs[e] <- cor(Abs.t[,colnames(AT_t)==SelectPairs[e,1]],
                                 Abs.t[,colnames(AT_t)==SelectPairs[e,2]])
        }
       CorInvList[[counter]] <- data.frame(row = SelectPairs[,1],
                                           col = SelectPairs[,2],
                                           Rel = CorRel,
                                           Abs = CorAbs)
        
        counter <- counter + 1
       
        Tr <- Tr + 
                scale_x_log10() +
                scale_y_log10() +
                coord_equal()
        TrInvList[[counter]] <- Tr
       
       CorRel <- numeric(nrow(SelectPairs))
        CorAbs <- numeric(nrow(SelectPairs))
        for (e in 1:nrow(SelectPairs)) {
                CorRel[e] <- cor(log(AT_t[,colnames(AT_t)==SelectPairs[e,1]]),
                                 log(AT_t[,colnames(AT_t)==SelectPairs[e,2]]))
                CorAbs[e] <- cor(log(Abs.t[,colnames(AT_t)==SelectPairs[e,1]]),
                                 log(Abs.t[,colnames(AT_t)==SelectPairs[e,2]]))
        }
       CorInvList[[counter]] <- data.frame(row = SelectPairs[,1],
                                           col = SelectPairs[,2],
                                           Rel = CorRel,
                                           Abs = CorAbs)

        }

for (i in 1:length(TrInvList)) {
        print(TrInvList[[i]])
        print(head(CorInvList[[i]],10))
}

##           row           col        Rel       Abs
## 1 SPNCRNA.491 SPAC19G12.15c -0.8029015 0.7547520
## 2 SPSNORNA.54 SPAC19G12.15c -0.8707235 0.7299556
## 3 SPSNORNA.54    SPAC637.06 -0.8279560 0.7546649

##           row           col        Rel       Abs
## 1 SPNCRNA.491 SPAC19G12.15c -0.9356963 0.6886016
## 2 SPSNORNA.54 SPAC19G12.15c -0.9628782 0.7417726
## 3 SPSNORNA.54    SPAC637.06 -0.9448771 0.7599803

##            row          col        Rel       Abs
## 1   SPCC4G3.17 SPBC13E7.10c -0.9012894 0.9095629
## 2   SPBP8B7.11   SPBC337.12 -0.8372574 0.6966663
## 3   SPCC4G3.17   SPBC337.12 -0.8603512 0.7795014
## 4 SPNCRNA.1436   SPCC4G3.17 -0.8680173 0.7579243

##            row          col        Rel       Abs
## 1   SPCC4G3.17 SPBC13E7.10c -0.9434212 0.9743674
## 2   SPBP8B7.11   SPBC337.12 -0.9487379 0.8918364
## 3   SPCC4G3.17   SPBC337.12 -0.9267393 0.9158434
## 4 SPNCRNA.1436   SPCC4G3.17 -0.9132845 0.9026285

IF you just want to show how many of the OTUs are highly proportional to at least one other OTU

# # Return the row minimum of the lower triangle of X
# lt.row.min <- function(X){
#   result <- numeric(nrow(X) - 1) # just defines a vector with zeros of
#   # length: nrow(X) -1 
#   for(i in 2:nrow(X)){
#     result[i-1] <- min(X[i, 1:i-1])
#   }
#   result
# }
# 
# 
# # Find the row minimum of the lower triangle of phi
# AT.phi.min        <- lt.row.min(Thetas)
# # AT.phi.min is a vector, storing the minimal theta in each row (for each
# # species/mRNA) of the lower left triangle (draw it)
# 
# AT.phi.min.sort <- sort(AT.phi.min)
# 
# cum              <- seq(from=0,to=1,along.with=AT.phi.min.sort)
# ggplot(data=data.frame(AT.phi.min=AT.phi.min.sort,cum=cum), aes(x=AT.phi.min, y=cum)) +
#   geom_line() +
#   labs(x=expression(paste("min( ", phi, "(clr(", x[i], "), clr(", x[j], ") ) ")), y="cumulative distribution") +
#   coord_cartesian(xlim = c(0,0.05))
# rm(Rel.phi.min.sort, cum)

150618_ImplementingProportionality

Thorsten Brach

18 Jun 2015