source('Functions/Dist_Functs.R')
source('Functions/Sim_Functs.R')
source('Functions/Predict_Functs.R')
source('Functions/Illus_Functs.R')
source('Functions/SubComp_Functs.R')

Loading Data and AT_sub preparation

## if we wanted species level ########
if(file.exists("Data/mOTU.species.profile.tsv")){
        AT <- read.table("Data/mOTU.species.profile.tsv", header = TRUE)
}

# ### if we wanted genus level, weirdly even more OTUs ####
# if(file.exists("Data/mOTU.genus.profile.tsv")){
#         AT <- read.table("Data/mOTU.genus.profile.tsv", header = TRUE)
# }

Replacing all 0 with a pseudocount being the next smalles relative abundance

The data consists of microbiome compositions of persons who underwent gastric bypass. Compositions are before, 3 months after or a year after

Replacing all 0 with a pseudocount that corresponds to the next smalles rel abundance

# AT is a decent Abundance table with the OTUs as rows and the sample as variables
# i.e. columns, the OTU names is a column which I rename

names(AT)[1] <- "OTU"
# there is no problem with UNMAPPED
summarise_each(AT[,-1], funs(sum)) # all Samples sum up well
##   X9462    X9468     X9437 X9451 X9466 X9464     X9453     X9463     X9450
## 1     1 1.000001 0.9999997     1     1     1 0.9999997 0.9999998 0.9999997
##   X9460     X9456     X9441     X9447    X9465 X9461     X9444 X9443
## 1     1 0.9999994 0.9999995 0.9999994 1.000001     1 0.9999997     1
##       X9449 X9439     X9448 X9442     X9455 X9452     X9446     X9445
## 1 0.9999999     1 0.9999995     1 0.9999993     1 0.9999995 0.9999994
##   X9454     X9438 X9467     X9459     X9469 X9440     X9458    X9457
## 1     1 0.9999998     1 0.9999999 0.9999997     1 0.9999995 1.000001
# there are 477 OTUs in AT, are some of them 0 in all?
# min(rowSums(AT[,-1])) # answer no
# therefore you remove no OTU with this command
# AT <- AT[which(rowSums(AT[,-1]) != 0),]
# consequently I can spare it

## find the next smalles relative abundance and use it as pseudocount
ATM <- as.matrix(AT[,-1])
next.smallest <- (unique(sort(ATM)))[2]
AT[AT==0.0] <- next.smallest

#### you could or should? renormalise here, because of course the pseudocounts
# increase the colSums

Norm <- function(x) {x/sum(x)}
AT1 <- mutate_each(AT[,-1], funs(Norm))
AT[,-1] <- AT1
rm(AT1)

Generating the long data frame for the plot with ggplot2.

## I want a long df with the variables OTU, relAbundance, Person, Time

# first I rename the colnames based on Person and Time, for this I use the
# bypass_read_information Excel file

if(file.exists("Data/bypass_reads_information.csv")){
        Meta <- read.table("Data/bypass_reads_information.csv", header = TRUE,
                           sep = ";")
}

Meta <- Meta[,c(1,3)]
Meta$SampleName <- as.character(Meta$SampleName)

# replace the names in AT with the Sample Names from Meta
for( i in 1:nrow(Meta)) {
        names(AT)[grep(Meta$id[i], names(AT))] <- Meta$SampleName[i]
}


AT_Long <- gather(AT, ID, relAbund, -OTU)

# Now split the ID column into Person and Time Columns
AT_Long <- separate(AT_Long, ID, into = c("Person", "Time"), sep = "-")

# if you want to see the pseudocount
min(AT_Long$relAbund)
## [1] 5.057953e-05

The Plots for Escherichia and Faecalibacterium

grep("Escherichia", AT$OTU) # shows you there are two with Escherichia
## [1] 141 148
AT$OTU[grep("Escherichia", AT$OTU)] 
## [1] Escherichia_coli     Escherichia_albertii
## 477 Levels: Acidaminococcus_fermentans ... unclassified_Lachnospiraceae
# I want coli, and I
AT_Long_EColi <- filter(AT_Long, OTU == 'Escherichia_coli')

# because I hope to work later with a similar plot
df <- AT_Long_EColi
# to relevel
df$Time <- as.factor(df$Time)

LevelsWant <- (unique(as.character(df$Time)))[c(3,1,2)] #you had to reverse
        for (i in 1:length(LevelsWant)) {
                df$Time <- relevel(df$Time, ref = LevelsWant[i])
        }

Palette <- brewer.pal(8, "Dark2")
Palette <- c(Palette, cbPalette[2:6])

Palette2 <- brewer.pal(12, "Paired")
Palette2 <- c(Palette2, cbPalette[1])

Tr <- ggplot(df, aes(x = Time, y = relAbund)) +
        #geom_boxplot(fill = cbPalette[1]) + 
        geom_boxplot(fill = "white") + 
        geom_point(aes(color = Person, group = Person), size = 4, alpha  = 1) +
        geom_line(aes(color = Person, group = Person)) +
        scale_color_manual(values = Palette2) +
        theme_bw(18) +
        theme(panel.grid.minor = element_blank()) +
        ylab("relative Abundance") +
        xlab("")

Trr <- Tr + scale_y_log10()
Tr

Trr

grep("Faec", AT$OTU) # shows you there are two with Escherichia
## [1] 83
AT$OTU[grep("Faec", AT$OTU)] 
## [1] Faecalibacterium_prausnitzii
## 477 Levels: Acidaminococcus_fermentans ... unclassified_Lachnospiraceae
# I want coli, and I
AT_Long_Faec <- filter(AT_Long, OTU == 'Faecalibacterium_prausnitzii')

# because I hope to work later with a similar plot
df <- AT_Long_Faec
# to relevel
df$Time <- as.factor(df$Time)

LevelsWant <- (unique(as.character(df$Time)))[c(3,1,2)] #you had to reverse
        for (i in 1:length(LevelsWant)) {
                df$Time <- relevel(df$Time, ref = LevelsWant[i])
        }

Tr <- ggplot(df, aes(x = Time, y = relAbund)) +
        #geom_boxplot(fill = cbPalette[1]) + 
        geom_boxplot(fill = "white") + 
        geom_point(aes(color = Person, group = Person), size = 4, alpha  = 1) +
        geom_line(aes(color = Person, group = Person)) +
        scale_color_manual(values = Palette2) +
        theme_bw(18) +
        theme(panel.grid.minor = element_blank()) +
        ylab("relative Abundance") +
        xlab("")

Trr <- Tr + scale_y_log10()
Tr

Trr

Removing E.coli from the AT, renormalise and form the long data frame again.

ATwoEC <- filter(AT, OTU != "Escherichia_coli")

# renormalise
AT1 <- mutate_each(ATwoEC[,-1], funs(Norm))
ATwoEC[,-1] <- AT1
rm(AT1)

ATwoEC_Long <- gather(ATwoEC, ID, relAbund, -OTU)

# Now split the ID column into Person and Time Columns
ATwoEC_Long <- separate(ATwoEC_Long, ID, into = c("Person", "Time"), sep = "-")

ATwoEC_Long_Faec <- filter(ATwoEC_Long, OTU == 'Faecalibacterium_prausnitzii')

# because I hope to work later with a similar plot
df <- ATwoEC_Long_Faec
# to relevel
df$Time <- as.factor(df$Time)

LevelsWant <- (unique(as.character(df$Time)))[c(3,1,2)] #you had to reverse
        for (i in 1:length(LevelsWant)) {
                df$Time <- relevel(df$Time, ref = LevelsWant[i])
        }

Tr <- ggplot(df, aes(x = Time, y = relAbund)) +
        #geom_boxplot(fill = cbPalette[1]) + 
        geom_boxplot(fill = "white") + 
        geom_point(aes(color = Person, group = Person), size = 4, alpha  = 1) +
        geom_line(aes(color = Person, group = Person)) +
        scale_color_manual(values = Palette2) +
        theme_bw(18) +
        theme(panel.grid.minor = element_blank()) +
        ylab("relative Abundance") +
        xlab("")

Trr <- Tr + scale_y_log10()
Tr

Trr

Compare in Numbers since the plots look so alike

AT_Long_FaecG <- group_by(AT_Long_Faec, Time)
summarise(AT_Long_FaecG, median(relAbund), mean(relAbund))
## Source: local data frame [3 x 3]
## 
##   Time median(relAbund) mean(relAbund)
## 1   1Y      0.004586951    0.005194759
## 2 3MDR      0.002958125    0.003406387
## 3  PRE      0.010969405    0.012693672
ATwoEC_Long_FaecG <- group_by(ATwoEC_Long_Faec, Time)
summarise(ATwoEC_Long_FaecG, median(relAbund), mean(relAbund))
## Source: local data frame [3 x 3]
## 
##   Time median(relAbund) mean(relAbund)
## 1   1Y      0.004664355    0.005278145
## 2 3MDR      0.003116940    0.003560650
## 3  PRE      0.010969960    0.012715401

Show the change in a plot

Before <- summarise(AT_Long_FaecG, mean(relAbund))
After <- summarise(ATwoEC_Long_FaecG, mean(relAbund))
Before$Ecoli <- 'yes'
After$Ecoli <- 'no'

df1 <- rbind(Before,After)
names(df1)[2] <- "Mean_relAbund"

# to relevel
df1$Time <- as.factor(df1$Time)

LevelsWant <- (unique(as.character(df1$Time)))[c(1,2,3)] #you had to reverse
        for (i in 1:length(LevelsWant)) {
                df1$Time <- relevel(df1$Time, ref = LevelsWant[i])
        }

df1$Ecoli <- as.factor(df1$Ecoli)

LevelsWant <- (unique(as.character(df1$Ecoli)))[c(2,1)] #you had to reverse
        for (i in 1:length(LevelsWant)) {
                df1$Ecoli <- relevel(df1$Ecoli, ref = LevelsWant[i])
        }

Tr <- ggplot(df1, aes(x = Time, y =Mean_relAbund, color = Ecoli, shape =Ecoli,
                      size = Ecoli)) +
        geom_point()+
        scale_size_discrete(range = c(7,5)) +
                scale_colour_manual(values = c(cbPalette[1],cbPalette[2])) +
                scale_shape_manual(values = c(16,17)) +
                #scale_x_discrete(expand = c(0.01,.01)) +
                theme_bw(base_size = 18) +
                theme(panel.grid.minor = element_blank()) +
                # removes all minor gridlines
                theme(panel.grid.major = element_line(colour = cbPalette[1],
                                                      size = 0.2)) +
                # sets the grid lines thicker and in a new colour
                theme(panel.background = element_rect(colour = cbPalette[1],
                                                      size = .5)) +
                # adjust colour and size of the surrounding box
                theme(panel.grid.major.y = element_blank()) +
                # removes the horisontal major grid lines +
                #theme(axis.text.x = element_text(angle = 90, vjust = 0.5,
                                                 #hjust = 1, 
                                                 #size = sizetextx)) +
                xlab("") +
                ylab("Mean relative Abundance") 
                #labs(colour = "", size = "", shape = "")
Tr

Tr + scale_y_log10()

Calculate Paired p value and wilcoxon rank somehow if you can

AT_Faec_Test <- filter(AT_Long_Faec, Time != "1Y")

Faec1 <- filter(AT_Long_Faec, Time == "PRE")
Faec2 <- filter(AT_Long_Faec, Time == "3MDR")
Faec1 <- arrange(Faec1, Person)
Faec2 <- arrange(Faec2, Person)

# as you can see X14 dropped out, since I want a pairwise t-test, I remove it from
# Faec1

Faec1 <- filter(Faec1, Person != "X14")

t.test(Faec1$relAbund, Faec2$relAbund, alternative = 'great', paired = TRUE)
## 
##  Paired t-test
## 
## data:  Faec1$relAbund and Faec2$relAbund
## t = 2.7906, df = 11, p-value = 0.008784
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  0.003321885         Inf
## sample estimates:
## mean of the differences 
##             0.009319394
# so there we get a significant difference
# alternatively you could have tested the difference
Diff <- Faec1$relAbund - Faec2$relAbund
t.test(Diff, alternative = 'great')
## 
##  One Sample t-test
## 
## data:  Diff
## t = 2.7906, df = 11, p-value = 0.008784
## alternative hypothesis: true mean is greater than 0
## 95 percent confidence interval:
##  0.003321885         Inf
## sample estimates:
##   mean of x 
## 0.009319394
# can you test directly from the data.frame??
# YES YOU CAN, IMPORTANT was to understand the formula interface,
# see t.test Examples with sleep data
AT_FT <- filter(AT_Faec_Test, Person != "X14")
AT_FT <- arrange(AT_FT, Person)
t.test(relAbund ~ Time, data = AT_FT, paired = TRUE, alternative = 'less')
## 
##  Paired t-test
## 
## data:  relAbund by Time
## t = -2.7906, df = 11, p-value = 0.008784
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##          -Inf -0.003321885
## sample estimates:
## mean of the differences 
##            -0.009319394
### Same for the data without E.coli

AT_Faec_Test <- filter(ATwoEC_Long_Faec, Time != "1Y")
AT_FT <- filter(AT_Faec_Test, Person != "X14")
AT_FT <- arrange(AT_FT, Person)
t.test(relAbund ~ Time, data = AT_FT, paired = TRUE, alternative = 'less')
## 
##  Paired t-test
## 
## data:  relAbund by Time
## t = -2.754, df = 11, p-value = 0.009378
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##          -Inf -0.003196694
## sample estimates:
## mean of the differences 
##            -0.009188619
# so still significant