---
title: "Wk5 In Class"
author: "Emily Vernon"
date: "2/11/2020"
output: html_document
editor_options: 
  chunk_output_type: inline
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# In Class Activity
### I decided to gain practice with R markdown by inputting the in class work and activity. 

##Load libraries and data

```{r}
library(phyloseq); packageVersion("phyloseq")
library(ggplot2); packageVersion("ggplot2")
library(vegan); packageVersion("vegan")
library(plyr); packageVersion("plyr")
library(dplyr); packageVersion("dplyr")
library(tidyverse); packageVersion("tidyverse")
library(compositions); packageVersion("compositions")
library(rmarkdown); packageVersion("rmarkdown")
library(knitr); packageVersion("knitr")
library(devtools); packageVersion("devtools")
install_github("jfq3/ggordiplots", force=TRUE)
library(ggordiplots) #  may have to check the box to load this package
```
##Load data and visualize
```{r}
data("soilrep")  
ntaxa(soilrep)
```

```{r}
nsamples(soilrep)
```
```{r}
sample_names(soilrep)
```
```{r}
sample_data(soilrep)
```
```{r}
sample_variables(soilrep) 
```
```{r}
sample_sums(soilrep) 
```
##Data Processing in Phlyoseq
```{r}
# make a copy of the ps object
ps_gp_bact <- soilrep
# add new environmental data columns to the sample data file
# get the sample data file from the existing ps object
sam.old <- sample_data(ps_gp_bact)
sam.old[1]
```
```{r}
str(sam.old)
```
```{r}
# get the new data to add
sam.new <- read.csv(url("https://raw.github.ncsu.edu/MicrobiomeAnalysis/ClassData/master/Wk5_add_sampledata.csv?token=AAAEEWQVOWMTMV4KRPO4S6C6IL7KU"), row.names = 1)
sam.new[1,]
str(sam.new)
```
```{r}
#merge the two data frames and set up the resulting file for phyloseq
sam.all<-merge(sam.old, sam.new, by="row.names")
sam.all[1,]
```
```{r}
sam.all <- column_to_rownames(sam.all, var = "Row.names")
sam.all[1,]
str(sam.all)
```
```{r}
#Make a new phlyoseq object with the new sample data
otus <- otu_table(ps_gp_bact)
SAM <- sample_data(sam.all)
OTU <- otu_table(otus)
sample_names(SAM) #check that sample names match!
```
```{r}
sample_names(OTU)
```
```{r}
ps_gp_new <- phyloseq(OTU, SAM)

# remove singletons 
ps_nosing <- prune_taxa(taxa_sums(ps_gp_new) > 1, ps_gp_new)
ntaxa(ps_nosing)
```
```{r}
# file too large for class to use even with singletons removed
# subset dataset to taxa seen more than 2 times in at least 5% of the samples. 
ps_filt <- filter_taxa(ps_gp_new, function(x) sum(x > 2) > (0.05*length(x)), TRUE)
ntaxa
```

##NMDS with Bray-Curtis distances in phyloseq::ordinate
```{r}
# use phyloseq::ordinate and specify file, method, distance

ord1 <- ordinate(ps_filt, "NMDS", "bray")
```
```{r}
ord1
```
```{r}
#evaluate NMDS run with stress plot
stressplot(ord1)
```
```{r}
# plot samples
p1_ord1 <- plot_ordination(ps_nosing, ord1, type="samples", color="Treatment", title="NMDS bray samples") 
p1_ord1
```
```{r}
#draw confidence ellipses around the treatments
# Rmarkdown did not like this - try it in class
#p1_ord1 + 
#  stat_ellipse(type="norm", linetype = 2) +
#  stat_ellipse(type="t") +
# theme_bw()
# plot taxa

p2_ord1 <- plot_ordination(ps_nosing, ord1, 
                           
                           type="taxa", 
                           
                           title="NMDS bray taxa") 
p2_ord1
```

```{r}
# biplot samples + taxa
p3_ord1 <- plot_ordination(ps_nosing, ord1, 
                           
                           type="biplot", 
                           
                           color="Treatment", 
                           
                           title = "NMDS bray biplot")

p3_ord1
```
```{r}
# ggplot doesn't handle many symbols well
# can also split the plot into two
p4_ord1 <- plot_ordination(ps_nosing, ord1, 
                           
                           type="split", color="warmed", 
                           
                           label="Treatment", shape = "clipped", 
                           
                           title="split NMDS plots") 

p4_ord1
```
```{r}
# two ways to download the data for further analyses or graphics
# just the sample names + xy coordinates
write.csv(ord1$points, "NMDS data xy coords.csv")
# alternative for sample names, xy coordinates, and sample data
nmds.bray.xy <- plot_ordination(ps_filt, ord1, justDF = TRUE)
write.csv(nmds.bray.xy, "Wk5 NMDS xycoords 2.csv")
```

##NMDS with Bray-Curtis distances+environmental data in vegan::metaMDS
```{r}
# create otu and sample data files from phyloseq object ps_filt
# make sure otu file has samples as rows and otus as columns
otus_filt <- otu_table(ps_filt)
otus_filt <- as.data.frame(t(otus_filt))
# can check file with str() and head()
sam <- sam.new # limit file to continuous vars
# use vegan::metaMDS and specify community file, distance, and 
# autotransform = FALSE to avoid automated vegan transformations
ord2 <- metaMDS(otus_filt, distance = "bray", autotransform = FALSE)
```
```{r}
ord2
```

```{r}
stressplot(ord2)
```

```{r}
# what to do when you don't have convergence?
# can extend number of random starts by specifying "trymax" > default = 20
# can start a new ordination using the previous run as the start with "previous.best" 
# can increase max iterations with "maxit"
# consider a data transformation 
# add the environmental correlations
# can also specify the variables in the file to use
ord2_env <- envfit(otus_filt, sam.new, permutations = 99, strata = NULL, choices=c(1,2))
ord2_env
```

```{r}
p1_ord2 <- plot(ord2)
```

```{r}
# p1_ord2 + plot(ord2_env, p.max = 0.05) # Rmarkdown did not like this - try it in class
# some nicer figures to explore sample-environment correlations with gg_ordiplot 
# see https://john-quensen.com/wp-content/uploads/2017/12/Ordiplots_with_ggplot.pdf
p2_ord2 <- gg_ordiplot(ord2, groups=sam.old$Treatment, choices = c(1,2), kind = "sd", conf = 0.95, pt.size=4)
```

```{r}
p3_ord2 <- gg_envfit(ord=ord2, env=sam.new, perm=99, pt.size=4, alpha= 0.2)
```

```{r}
p4_ord2 <- gg_ordisurf(ord=ord2, env.var=sam.new$MAT, choices = c(1,2), binwidth=0.05, pt.size=1, var.label="MAT") 
```

```{r}
gg_ordibubble(ord=ord2, env.var=sam.new$soilC, var.label="soilC")
```

```{r}

# can also extract the data for use elsewhere #don't have to do this for class, but nice to know
sampleScores <- ord2$points
otuScores <- ord2$species
envScores <- scores(ord2_env, "vectors")
```

#In-Classs Assignment

##1. repeat NMDS with different distances and data types in vegan
##Plot both of the above and report how presence-absence data changes the outcome
```{r}
#1.a jaccard
ord3 <- metaMDS(otus_filt, distance = "jaccard", autotransform = FALSE)
ord3
```
```{r}
stressplot(ord3)
```


##Plotting ordination
```{r}
p1_ord3 <- plot_ordination(ps_nosing, ord3, type="samples", color="Treatment", title="NMDS Jaccard Samples") 
p1_ord3
```

```{r}

p2_ord3 <- plot_ordination(ps_nosing, ord3, 
                           
                           type="taxa", 
                           
                           title="NMDS Jaccard Taxa") 
p2_ord3
```
```{r}
p3_ord3 <- plot_ordination(ps_nosing, ord3, 
                           
                           type="biplot", 
                           
                           color="Treatment", 
                           
                           title = "NMDS Jaccard Biplot")

p3_ord3
```

```{r}

p4_ord3 <- plot_ordination(ps_nosing, ord3, 
                           
                           type="split", color="warmed", 
                           
                           label="Treatment", shape = "clipped", 
                           
                           title="Split NMDS plots") 

p4_ord3
```

##For the inclusion of environmental data, additional plots can be formulated.
```{r}
p1_ord3 <- plot(ord3)
```

```{r}
p2_ord3 <- gg_ordiplot(ord3, groups=sam.old$Treatment, choices = c(1,2), kind = "sd", conf = 0.95, pt.size=4)
```
```{r}
p3_ord3 <- gg_envfit(ord=ord3, env=sam.new, perm=99, pt.size=4, alpha= 0.2)
```

```{r}
p4_ord3 <- gg_ordisurf(ord=ord3, env.var=sam.new$MAT, choices = c(1,2), binwidth=0.05, pt.size=1, var.label="MAT") 
```

```{r}
gg_ordibubble(ord=ord3, env.var=sam.new$soilC, var.label="soilC")
```

#1.b. jaccard, set binary = TRUE
```{r}
ord4 <- metaMDS(otus_filt, distance = "jaccard", autotransform = TRUE)
ord4
```

```{r}
stressplot(ord4)
```

```{r}
##Plotting ordination
p1_ord4 <- plot_ordination(ps_nosing, ord4, type="samples", color="Treatment", title="NMDS Jaccard (True) Samples") 
p1_ord4
```

```{r}
p2_ord4 <- plot_ordination(ps_nosing, ord4, 
                           
                           type="taxa", 
                           
                           title="NMDS Jaccard (True) Taxa") 
p2_ord4
```
```{r}
p3_ord4 <- plot_ordination(ps_nosing, ord4, 
                           
                           type="biplot", 
                           
                           color="Treatment", 
                           
                           title = "NMDS Jaccard (True) Biplot")

p3_ord4
```

```{r}
p4_ord4 <- plot_ordination(ps_nosing, ord4, 
                           
                           type="split", color="warmed", 
                           
                           label="Treatment", shape = "clipped", 
                           
                           title="Split NMDS (True) plots") 

p4_ord4
```


##For the inclusion of environmental data, additional plots can be formulated. 
```{r}
p1_ord4 <- plot(ord4)
```

```{r}
p2_ord4 <- gg_ordiplot(ord4, groups=sam.old$Treatment, choices = c(1,2), kind = "sd", conf = 0.95, pt.size=4)
```

```{r}
p3_ord4 <- gg_envfit(ord=ord4, env=sam.new, perm=99, pt.size=4, alpha= 0.2)
```
```{r}
p4_ord4 <- gg_ordisurf(ord=ord4, env.var=sam.new$MAT, choices = c(1,2), binwidth=0.05, pt.size=1, var.label="MAT") 
```
```{r}
gg_ordibubble(ord=ord4, env.var=sam.new$soilC, var.label="soilC")
```
##There are differences in the presence/absence distribution of the ordinance plots for the Jaccard distances. 
##Overall, the distribution of the species are similar in that there are not overt significant differences between running the non-metric multidimensional scaling with binary either true or false. 
##For both the jaccard NMDS plots for samples, there are two clusters in the same position on the graph. There are some changes on the distribution of the actual samples, but it is hard to draw conclusions from this data set due to the fact that the treatment is not directly correlated to the data. 
##The distribution of the jaccard NMDS plots for taxa are similar, but do have some differences. For the true jaccard plot, the distribution is clustered around 0 on the y-axis of the graph, which represents the change in distance in the original data. The other jaccard plot is still clustered around 0 on the y-axis, but has distribution on the negative side of the y-axis as well, which is not the case with the true jaccard plot. This trend is also recognized in the NMDS biplots. 
##Therefore, it is clear that there are differences in the presence/abundance of similar species by running the NMDS analysis with binary labeled as true/false. 

#2 repeat NMDS with Hellinger distances in vegan::decostand
```{r}
##Helinger decostand command
otus_filt_hel. <- decostand(otus_filt, "hellinger")
```
```{r}
##Repeat NMDS 
ord5 <- metaMDS(otus_filt_hel., distance = "euclidean", autotransform = FALSE)
ord5
```
```{r}
##Plot the results
p1_ord5 <- plot_ordination(ps_nosing, ord5, type="samples", color="Treatment", title="NMDS Hellinger Samples") 
p1_ord5
```
```{r}
p2_ord5 <- plot_ordination(ps_nosing, ord5, 
                           
                           type="taxa", 
                           
                           title="NMDS Hellinger Taxa") 
p2_ord5
```

```{r}
p3_ord5 <- plot_ordination(ps_nosing, ord5, 
                           
                           type="biplot", 
                           
                           color="Treatment", 
                           
                           title = "NMDS Hellinger Biplot")

p3_ord5
```

```{r}

p4_ord5 <- plot_ordination(ps_nosing, ord5, 
                           
                           type="split", color="warmed", 
                           
                           label="Treatment", shape = "clipped", 
                           
                           title="Split NMDS Hellinger plots") 

p4_ord5
```

#3 Run PcoA with vst transformed otus + environmental data in vegan::wcmdscale
```{r}
##Load libraries
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install("DESeq2", version = "3.10")
library("DESeq2"); packageVersion("DESeq2")
library("ggplot2"); packageVersion("ggplot2")
library("tools"); packageVersion("tools")
```
```{r}
# convert phyloseq to DeSeq object
# after ~, define experimental design using columns in sample data file
# alt - use `~1` as the experimental design if you don't want the actula design to influence your tranformation. #This is a random study so this commmand will be used as the treatments do not really impact the results.
psnosing_ds <- phyloseq_to_deseq2(ps_nosing, ~1)
```

```{r}
# Estimate size factors and dispersion #there was an error:"Error in estimateSizeFactorsForMatrix(counts(object), locfunc = locfunc, : every gene contains at least one zero, cannot compute log geometric means
#Use alternate code
gm_mean = function(psnosing_ds, na.rm=TRUE)
  gm_mean = function(x, na.rm=TRUE){ exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))}
```
```{r}
geoMeans = apply(counts(psnosing_ds), 1, gm_mean)
psnosing_ds = estimateSizeFactors(psnosing_ds, type="ratio", geoMeans = geoMeans) #alt types are "poscounts" and "iterate"
```

```
```{r}
psnosing_ds = estimateDispersions(psnosing_ds, fitType = "local")
```

```{r}
psnosing_ds = DESeq(ps_ds, test="Wald", fitType="parametric") 
```

```{r}
# plot the dispersion estimates
plotDispEsts(psnosing_ds)
```
```{r}
# make a copy of original ps object, run the variance stabilization, and replace otu_table with the transformed table
# then save transformed files as .csv
psnosing_vst <- ps_nosing
vst<-getVarianceStabilizedData(psnosing_ds)
otu_table(psnosing_vst) <- otu_table(vst, taxa_are_rows = TRUE)
psnosing_vst
```

```{r}
# check that taxa names match 
taxa_names(psnosing_vst)
taxa_names(ps_nosing)
```

```{r}
#PCoA Plot
d <- dist(psnosing_vst, "euclidean")
ord <- wcmdscale(d, w = rs, eig = TRUE)
##I could not get this code to work, no matter what I tried and was not sure which code to use. 
## In addition, I had significant trouble running DeSeq on my home computer. I kept getting an error message saying that I did not have "matrixStats" installed. DeSeq would not install and not function. 
```


##4 Run PCA with clr transformed otus + environmental data
```{r}
# For compositional data, use centered log ratio (clr) transformation
# This has been criticized so give some thought to whether or not it is appropriate 
# Load libraries and install packages
install.packages("compositions")
library("compositions"); packageVersion("compositions")
```

```{r}
# Run transformation and save transformed matrix to a new file name. 
psnosing_clr <- otu_table(ps_nosing)
psnosing_clr<- clr(psnosing_clr)
#Change to data frame
psnosing_clr <- as.data.frame(psnosing_clr) 
```

```{r}
# make a copy of original ps object and replace otu_table with the transformed table
psnosing_clr <- ps_nosing
otu_table(psnosing_clr) <- otu_table(psnosing_clr, taxa_are_rows = FALSE)
psnosing_clr
```

```{r}
# check that taxa names match
taxa_names(psnosing_clr)
taxa_names(ps_nosing)
```

```{r}
# Plot ordination
ord_clr <- ordinate(psnosing_clr, method = "PCoA", distance = "euclidean", formula = NULL)
plot(ord_clr)
```

```{r}
#PCoA Plot
p1_ord_clr <- plot_ordination(psnosing_clr, ord_clr, type="samples", color="Treatment", title="PCoA Plot Clr Transformation") 
p1_ord_clr