# Import metadata
metadata <-read.csv("~/Documents/gbru_fy18_rice_methane/march2020/metadata_biomassCH4_noRIL9.csv", header=TRUE)

Explore uni-variate distributions of data

library(tidyverse)
metadata %>%
  keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_histogram(bins=12)

Genotype vs Sum Methane

d <- ggplot(metadata, aes(genotype, sum_ch4))
d + geom_point(aes(color=dev))

Total methane is highly consistent across treatment time point for each genotype.

Genotype vs Daily Methane

d <- ggplot(metadata, aes(genotype, daily_ch4))
d + geom_point(aes(color=dev))

D2 is the high methane producing time-point

Total methane and root biomass

d <- ggplot(metadata[metadata$dev=="D3",], aes(sum_ch4, root_biomass_2017))
d + geom_point(aes(colour = factor(genotype)))

Total methane and shoot biomass

d <- ggplot(metadata[metadata$dev=="D3",], aes((sum_ch4), log10(shoot_root_ratio2017)))
d + geom_point(aes(colour = factor(genotype)))

Load taxonomic count data

# Import OTU counts
otudata<-read.table("~/Documents/gbru_fy18_rice_methane/march2020/otu_table.tsv", header=TRUE)
# Move feature ID to rownames
row.names(otudata)<-otudata$feature.id
otudata<-otudata[,-1]
# Sort columns by name
otudata<- otudata[, order(names(otudata))]
# load Taxonomy annotation 
taxa <- read.table("~/Documents/gbru_fy18_rice_methane/march2020/observation_meta_data.tsv", sep="\t", header=TRUE, quote = "")

Define function to filter taxa with low abundance and many zeros


#Define the geometric mean for all values greater than 0
geomean <- function(x) { exp(mean(log(x[x>0]))) }

# Create a function to filter rows by the proportion of nonzero samples and the geometric mean of the nonzero samples
prefilter <- function(x, zcutoff, mcutoff){
    nzvect <- apply(x, 1, function(y){
      ((sum(y>0)/length(y))> zcutoff) & (geomean(y)> mcutoff)
      })
    lowcounts <- t(as.data.frame(apply(x[!nzvect,],2,sum)))
    rownames(lowcounts)<-c("lowconts")
    df1 <-rbind(lowcounts, x[nzvect,])
    return(df1)
}

Group by Family and Genus

Initial tests of un-grouped and unfiltered data returned no significant taxa. Independent filtering by depth is something which has been done in DESeq2 and EdgeR and is considered a valid preselection technique.

#Filter data to specific taxonomic levels
cdata <-cbind(taxa,otudata)
library(dplyr)
library(tidyr)

fd_long<- gather(cdata, sample, count, colnames(cdata)[10:105], factor_key=TRUE)

#group by family
fd_fam_tally <- fd_long[,c(7,10,11)] %>% group_by(taxonomy_5, sample) %>%  summarise(n = sum(count))
fd_fam_wide <- pivot_wider(fd_fam_tally, names_from= sample, values_from=n)
df_fam_wide<- as.data.frame(fd_fam_wide)
rownames(df_fam_wide)<- df_fam_wide[,1]
df_fam_wide<- df_fam_wide[,-1]
otutaxa_fam<-prefilter(df_fam_wide, 0.5, 100)

#group by genus
fd_genus_tally <- fd_long[,c(8,10,11)] %>% group_by(taxonomy_6, sample) %>%  summarise(n = sum(count))
fd_genus_wide <- fd_genus_tally %>% pivot_wider(names_from= sample, values_from=n)
df_genus_wide<- as.data.frame(fd_genus_wide)
rownames(df_genus_wide)<- df_genus_wide[,1]
df_genus_wide<- df_genus_wide[,-1]
otutaxa_genus<-prefilter(df_genus_wide, 0.5, 100)

ALDEx2 statistical analysis for compositional count data at the family level

library("ALDEx2")

# Construct covariates matrix
covariates1 <- data.frame(genotype=metadata$genotype, sumch4=log10(metadata$sum_ch4), dev=metadata$dev)
# Process 

#select D3 developmental stage 
d3cov<-  covariates1[covariates1$dev=="D3",]
d3cov<-d3cov[1:30,]
d3rows <- as.numeric(rownames(d3cov))
d3otus <- otutaxa_fam[,d3rows]

# create model matrix for log10 of total methane
mm1 <- model.matrix(~ sumch4, d3cov)

Run Centered log ratio transformations and Dirichlet sampling

# run CLR transformations and sampling
d3ch4fam.clr<-  aldex.clr(otutaxa_fam[,d3rows], mm1, mc.samples = 128, denom="all", verbose=TRUE, useMC=TRUE)
multicore environment is is OK -- using the BiocParallel package
removed rows with sums equal to zero
computing center with all features
data format is OK
'package:stats' may not be available when loading'package:stats' may not be available when loadingdirichlet samples complete
'package:stats' may not be available when loading'package:stats' may not be available when loadingtransformation complete

Fit a GLM on total methane

# run CLR transformations and sampling
d3ch4fam.clr.glm<- aldex.glm(d3ch4fam.clr, verbose=TRUE)
running tests for each MC instance:
|------------(25%)----------(50%)----------(75%)----------|
d3ch4fam.clr.glm[d3ch4fam.clr.glm$`model.sumch4 Pr(>|t|).BH`<0.5,]

Results

Because there are no significant taxa after BH correction. The challenge of regressing this data is that each sample does not have its own menthane measurment whic reduces the degrees of freedom

Do a correlation test

corr.test.fam <- aldex.corr(d3ch4fam.clr, d3cov$sumch4)

Results

Eight families have a signifigant statistical association with methane in the correlation test

corr.test.fam[corr.test.fam$BH<0.05,]
hist(corr.test.fam$BH)

ALDEx22 statistical analysis for compositional count data at the Genus Level

Run Centered log ratio transformations and Dirichlet sampling

# run CLR transformations and sampling
d3ch4genus.clr<-  aldex.clr(otutaxa_genus[,d3rows], mm1, mc.samples = 128, denom="all", verbose=TRUE, useMC=TRUE)
multicore environment is is OK -- using the BiocParallel package
removed rows with sums equal to zero
computing center with all features
data format is OK
'package:stats' may not be available when loading'package:stats' may not be available when loadingdirichlet samples complete
'package:stats' may not be available when loading'package:stats' may not be available when loadingtransformation complete

Fit a GLM on total methane

# run CLR transformations and sampling
d3ch4genus.clr.glm<- aldex.glm(d3ch4genus.clr, verbose=TRUE)
running tests for each MC instance:
|------------(25%)----------(50%)----------(75%)----------|

Results

Because there are no significant taxa after BH correction. The challenge of regressing this data is that each sample does not have its own menthane measurment whic reduces the degrees of freedom

d3ch4genus.clr.glm[d3ch4genus.clr.glm$`model.sumch4 Pr(>|t|).BH`<0.5,]

Do a correlation test

corr.test.fam <- aldex.corr(d3ch4genus.clr, d3cov$sumch4)

Results

Eight families have a signifigant statistical association with methane in the correlation test

corr.test.fam[corr.test.fam$BH<0.10,]

# New apprach: Split methane into a categoical variable and do a Welche’s test on high vs low methane

# Construct covariates matrix
covariates2 <- data.frame(genotype=metadata$genotype, sumch4=log10(metadata$sum_ch4), dev=metadata$dev)
# Process 

#select D3 developmental stage 
d3cov<-  covariates2[covariates2$dev=="D3",]
d3cov<-d3cov[1:30,]
d3cov$methfactor<-cut(d3cov$sumch4,breaks=2)
d3rows <- as.numeric(rownames(d3cov))
d3otus <- otutaxa_fam[,d3rows]

# create model matrix for log10 of total methane
mm1 <- model.matrix(~ methfactor, d3cov)

hlmeth.clr <- aldex.clr(d3otus, as.character(d3cov$methfactor), mc.samples=128, denom="iqlr")
operating in serial mode
computing iqlr centering
hlmeth.test <- aldex.ttest(hlmeth.clr, hist.plot=TRUE )

hlmeth.test[hlmeth.test$we.eBH<0.1,]

## Results Spliting into high and low methane at D2 did not highlihght new groups.

# Run the methane GLM anaysis for timepoint D2

## Construct new covariates matrix

covariates1 <- data.frame(genotype=metadata$genotype, sumch4=log10(metadata$sum_ch4), dev=metadata$dev)
# Process 

#select D3 developmental stage 
d2cov<-  covariates1[covariates1$dev=="D2",]
d2cov<-d2cov[1:30,] # Remove soil samples
d2rows <- as.numeric(rownames(d2cov))
d2otus <- otutaxa_fam[,d2rows]

# create model matrix for log10 of total methane
mm2 <- model.matrix(~ sumch4, d2cov)

Run Centered log ratio transformations and Dirichlet sampling


## Run Centered log ratio transformations and Dirichlet sampling

d2ch4fam.clr<-  aldex.clr(otutaxa_fam[,d2rows], mm2, mc.samples = 128, denom="all", verbose=TRUE, useMC=TRUE)
multicore environment is is OK -- using the BiocParallel package
removed rows with sums equal to zero
computing center with all features
data format is OK
'package:stats' may not be available when loading'package:stats' may not be available when loadingdirichlet samples complete
'package:stats' may not be available when loading'package:stats' may not be available when loadingtransformation complete

Fit a GLM on total methane

# run CLR transformations and sampling
d2ch4fam.clr.glm<- aldex.glm(d2ch4fam.clr, verbose=TRUE)
running tests for each MC instance:
|------------(25%)----------(50%)----------(75%)----------|
d2ch4fam.clr.glm[d2ch4fam.clr.glm$`model.sumch4 Pr(>|t|).BH`<0.5,]

Results

There were no signifigant effects at stage D2

#Test RIL6 (lowest emitter) and RIL1 (highest emmitter) at D3


c16<-covariates1[covariates1$genotype %in% c("RIL1", "RIL6") & covariates1$dev=="D3",]
c16rows <- as.numeric(rownames(c16))
c16otus <- otutaxa_fam[,c16rows]

hlgeno.clr <- aldex.clr(c16otus, c16$genotype, mc.samples=128, denom="iqlr")
operating in serial mode
computing iqlr centering
hlgeno.test <- aldex.ttest(hlmeth.clr, hist.plot=TRUE )

# run CLR transformations and sampling
hlgeno.test[hlgeno.test$we.eBH<0.1,]
---
title: "Exploratory analysis of rice methane data set"
output: html_notebook
---


```{r}
# Import metadata
metadata <-read.csv("~/Documents/gbru_fy18_rice_methane/march2020/metadata_biomassCH4_noRIL9.csv", header=TRUE)
```

# Explore uni-variate distributions of data
```{r}
library(tidyverse)
metadata %>%
  keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_histogram(bins=12)
```

# Genotype vs Sum Methane
```{r}
d <- ggplot(metadata, aes(genotype, sum_ch4))
d + geom_point(aes(color=dev))

```
Total methane is highly consistent across treatment time point for each genotype.

# Genotype vs Daily Methane
```{r}
d <- ggplot(metadata, aes(genotype, daily_ch4))
d + geom_point(aes(color=dev))

```
D2 is the high methane producing time-point 

# Total methane and root biomass
```{r}
d <- ggplot(metadata[metadata$dev=="D3",], aes(sum_ch4, root_biomass_2017))
d + geom_point(aes(colour = factor(genotype)))
```

# Total methane and shoot biomass
```{r}
d <- ggplot(metadata[metadata$dev=="D3",], aes((sum_ch4), log10(shoot_root_ratio2017)))
d + geom_point(aes(colour = factor(genotype)))
```

# Load taxonomic count data
```{r}
# Import OTU counts
otudata<-read.table("~/Documents/gbru_fy18_rice_methane/march2020/otu_table.tsv", header=TRUE)
# Move feature ID to rownames
row.names(otudata)<-otudata$feature.id
otudata<-otudata[,-1]
# Sort columns by name
otudata<- otudata[, order(names(otudata))]
# load Taxonomy annotation 
taxa <- read.table("~/Documents/gbru_fy18_rice_methane/march2020/observation_meta_data.tsv", sep="\t", header=TRUE, quote = "")

```

# Define function to filter taxa with low abundance and many zeros 

```{r}

#Define the geometric mean for all values greater than 0
geomean <- function(x) { exp(mean(log(x[x>0]))) }

# Create a function to filter rows by the proportion of nonzero samples and the geometric mean of the nonzero samples
prefilter <- function(x, zcutoff, mcutoff){
    nzvect <- apply(x, 1, function(y){
      ((sum(y>0)/length(y))> zcutoff) & (geomean(y)> mcutoff)
      })
    lowcounts <- t(as.data.frame(apply(x[!nzvect,],2,sum)))
    rownames(lowcounts)<-c("lowconts")
    df1 <-rbind(lowcounts, x[nzvect,])
    return(df1)
}
```


# Group by Family and Genus
Initial tests of un-grouped and unfiltered data returned no significant taxa. Independent filtering by depth is something which has been done in DESeq2 and EdgeR and is considered a valid preselection technique.  
```{r}
#Filter data to specific taxonomic levels
cdata <-cbind(taxa,otudata)
library(dplyr)
library(tidyr)

fd_long<- gather(cdata, sample, count, colnames(cdata)[10:105], factor_key=TRUE)

#group by family
fd_fam_tally <- fd_long[,c(7,10,11)] %>% group_by(taxonomy_5, sample) %>%  summarise(n = sum(count))
fd_fam_wide <- pivot_wider(fd_fam_tally, names_from= sample, values_from=n)
df_fam_wide<- as.data.frame(fd_fam_wide)
rownames(df_fam_wide)<- df_fam_wide[,1]
df_fam_wide<- df_fam_wide[,-1]
otutaxa_fam<-prefilter(df_fam_wide, 0.5, 100)

#group by genus
fd_genus_tally <- fd_long[,c(8,10,11)] %>% group_by(taxonomy_6, sample) %>%  summarise(n = sum(count))
fd_genus_wide <- fd_genus_tally %>% pivot_wider(names_from= sample, values_from=n)
df_genus_wide<- as.data.frame(fd_genus_wide)
rownames(df_genus_wide)<- df_genus_wide[,1]
df_genus_wide<- df_genus_wide[,-1]
otutaxa_genus<-prefilter(df_genus_wide, 0.5, 100)
```


# ALDEx2 statistical analysis for compositional count data at the family level
```{r}
library("ALDEx2")

# Construct covariates matrix
covariates1 <- data.frame(genotype=metadata$genotype, sumch4=log10(metadata$sum_ch4), dev=metadata$dev)
# Process 

#select D3 developmental stage 
d3cov<-  covariates1[covariates1$dev=="D3",]
d3cov<-d3cov[1:30,]
d3rows <- as.numeric(rownames(d3cov))
d3otus <- otutaxa_fam[,d3rows]

# create model matrix for log10 of total methane
mm1 <- model.matrix(~ sumch4, d3cov)


```

## Run Centered log ratio transformations and Dirichlet sampling
```{r}
# run CLR transformations and sampling
d3ch4fam.clr<-  aldex.clr(otutaxa_fam[,d3rows], mm1, mc.samples = 128, denom="all", verbose=TRUE, useMC=TRUE)
```

## Fit a GLM on total methane
```{r}
# run CLR transformations and sampling
d3ch4fam.clr.glm<- aldex.glm(d3ch4fam.clr, verbose=TRUE)
```
```{r}
d3ch4fam.clr.glm[d3ch4fam.clr.glm$`model.sumch4 Pr(>|t|).BH`<0.5,]
```

### Results
Because there are no significant taxa after BH correction. The challenge of regressing this data is that each sample does not have its own menthane measurment whic reduces the degrees of freedom

## Do a correlation test
```{r}
corr.test.fam <- aldex.corr(d3ch4fam.clr, d3cov$sumch4)
```


### Results
Eight families have a signifigant statistical association with methane in the correlation test
```{r}
corr.test.fam[corr.test.fam$BH<0.05,]
```

```{r}
hist(corr.test.fam$BH)
```

# ALDEx22 statistical analysis for compositional count data at the Genus Level

## Run Centered log ratio transformations and Dirichlet sampling
```{r}
# run CLR transformations and sampling
d3ch4genus.clr<-  aldex.clr(otutaxa_genus[,d3rows], mm1, mc.samples = 128, denom="all", verbose=TRUE, useMC=TRUE)
```

## Fit a GLM on total methane
```{r}
# run CLR transformations and sampling
d3ch4genus.clr.glm<- aldex.glm(d3ch4genus.clr, verbose=TRUE)
```

### Results
Because there are no significant taxa after BH correction. The challenge of regressing this data is that each sample does not have its own menthane measurment whic reduces the degrees of freedom
```{r}
d3ch4genus.clr.glm[d3ch4genus.clr.glm$`model.sumch4 Pr(>|t|).BH`<0.5,]
```

## Do a correlation test
```{r}
corr.test.fam <- aldex.corr(d3ch4genus.clr, d3cov$sumch4)
```


### Results
Eight families have a signifigant statistical association with methane in the correlation test
```{r}
corr.test.fam[corr.test.fam$BH<0.10,]
```


 
 # New apprach: Split methane into a categoical variable and do a Welche's test on high vs low methane
  
```{r}
# Construct covariates matrix
covariates2 <- data.frame(genotype=metadata$genotype, sumch4=log10(metadata$sum_ch4), dev=metadata$dev)
# Process 

#select D3 developmental stage 
d3cov<-  covariates2[covariates2$dev=="D3",]
d3cov<-d3cov[1:30,]
d3cov$methfactor<-cut(d3cov$sumch4,breaks=2)
d3rows <- as.numeric(rownames(d3cov))
d3otus <- otutaxa_fam[,d3rows]

# create model matrix for log10 of total methane
mm1 <- model.matrix(~ methfactor, d3cov)

hlmeth.clr <- aldex.clr(d3otus, as.character(d3cov$methfactor), mc.samples=128, denom="iqlr")
hlmeth.test <- aldex.ttest(hlmeth.clr, hist.plot=TRUE )

```
 
```{r}
hlmeth.test[hlmeth.test$we.eBH<0.1,]
```
 ## Results
 Spliting into high and low methane at D2 did not highlihght new groups.
 
 # Run the methane GLM anaysis for timepoint D2
 
 
 ## Construct new covariates matrix
 
```{r}
covariates1 <- data.frame(genotype=metadata$genotype, sumch4=log10(metadata$sum_ch4), dev=metadata$dev)
# Process 

#select D3 developmental stage 
d2cov<-  covariates1[covariates1$dev=="D2",]
d2cov<-d2cov[1:30,] # Remove soil samples
d2rows <- as.numeric(rownames(d2cov))
d2otus <- otutaxa_fam[,d2rows]

# create model matrix for log10 of total methane
mm2 <- model.matrix(~ sumch4, d2cov)
```
 
## Run Centered log ratio transformations and Dirichlet sampling
```{r}
d2ch4fam.clr<-  aldex.clr(otutaxa_fam[,d2rows], mm2, mc.samples = 128, denom="all", verbose=TRUE, useMC=TRUE)
```

## Fit a GLM on total methane
```{r}
# run CLR transformations and sampling
d2ch4fam.clr.glm<- aldex.glm(d2ch4fam.clr, verbose=TRUE)
```

```{r}
d2ch4fam.clr.glm[d2ch4fam.clr.glm$`model.sumch4 Pr(>|t|).BH`<0.5,]
```

## Results
There were no signifigant effects at stage D2


#Test RIL6 (lowest emitter) and RIL1 (highest emmitter) at D3
```{r}

c16<-covariates1[covariates1$genotype %in% c("RIL1", "RIL6") & covariates1$dev=="D3",]
c16rows <- as.numeric(rownames(c16))
c16otus <- otutaxa_fam[,c16rows]

hlgeno.clr <- aldex.clr(c16otus, c16$genotype, mc.samples=128, denom="iqlr")
hlgeno.test <- aldex.ttest(hlmeth.clr, hist.plot=TRUE )
```

```{r}
# run CLR transformations and sampling
hlgeno.test[hlgeno.test$we.eBH<0.1,]
```