ggplot2
, gplots
, and VennDiagram
hclust
and pvclust
to put samples into groupsExport
to export a .tsv
or .csv
tablepairs()
or ggpairs()
We can see we have a number of biogeochemcial parameters that we could correlate with our genomic observations: depth (m), temperature, disolved organic carbon (DOC), nitrus oxide (NOx), disolved inorganic phosphorus (DOP), oxygen, and disolved inorganic carbon (DIC).
pairs()
plot to get an overlay of the data.ggpairs()
function of the GGally
package (Warning: can take some time to plot)# set my working directory
setwd("~/Dropbox/stamps2014/downstream")
hot_metadata <- read.table("temp_data/HOT_METADATA.txt", header=T,
row.names=1, sep = "\t")
hot_metadata_scale <- scale(hot_metadata)
require("GGally")
## Loading required package: GGally
## Loading required package: ggplot2
## Loading required package: reshape
require(MASS) # robust linear regression
## Loading required package: MASS
ggpairs(hot_metadata_scale,
upper= list(continuous = "smooth", params = c(method = "rlm")),
lower = list(continuous = "smooth", params = c(method = "rlm")))
heatmap.2()
from the gplots
packagerequire(gplots) # for heatmap.2
## Loading required package: gplots
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
require(RColorBrewer) # for colours
## Loading required package: RColorBrewer
# require(GGally)
my_cols <- brewer.pal(8, "PuBuGn")
heatmap.2(as.matrix(hot_metadata_scale), col=my_cols, trace= "none", margins=c(5,15))
RColorBrewer
packagelibrary(RColorBrewer)
display.brewer.all()
single-linkage
and average-linkage
clusterings being applied to a large numer of current datasetsORFs <- c(9780, 7281, 14501, 14276, 9617, 13831, 11467) # QC-ed ORFs
cog2 <- read.table("data/cog_level2.txt", header=T, row.names=1, sep="\t")
kegg2 <- read.table("data/kegg_level2.txt", header=T, row.names=1, sep="\t")
seed1 <- read.table("data/seed_level1.txt", row.names=1, header=T, sep="\t")
cog2_scale <- scale(cog2, center = FALSE, scale = ORFs) * 100
kegg2_scale <- scale(kegg2, center = FALSE, scale = ORFs) * 100
seed1_scale <- scale(seed1, center = FALSE, scale = ORFs) * 100
cog2_tree <- hclust(dist(t(cog2_scale), method="manhattan"), method = "average")
kegg2_tree <- hclust(dist(t(kegg2_scale), method="manhattan"), method="average")
seed1_tree <- hclust(dist(t(seed1_scale), method="manhattan"), method="average")
par(mfrow=c(1,3))
plot(cog2_tree, main="COG")
plot(kegg2_tree, main="KEGG")
plot(seed1_tree, main="SEED")
par(mfrow=c(1,1))
pvclust
package to bootstrap these trees to get an idea of how stable this clustering.library("pvclust")
?pvclust
cog2_pvtree <- pvclust(cog2_scale, method.hclust="average",
method.dist="manhattan",
nboot=50)
## Bootstrap (r = 0.48)... Done.
## Bootstrap (r = 0.57)... Done.
## Bootstrap (r = 0.67)... Done.
## Bootstrap (r = 0.76)... Done.
## Bootstrap (r = 0.86)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.19)... Done.
## Bootstrap (r = 1.29)... Done.
## Bootstrap (r = 1.38)... Done.
kegg2_pvtree <- pvclust(kegg2_scale, method.hclust="average",
method.dist="manhattan",
nboot=50)
## Bootstrap (r = 0.49)... Done.
## Bootstrap (r = 0.58)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.79)... Done.
## Bootstrap (r = 0.88)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.09)... Done.
## Bootstrap (r = 1.19)... Done.
## Bootstrap (r = 1.28)... Done.
## Bootstrap (r = 1.4)... Done.
seed1_pvtree <- pvclust(seed1_scale, method.hclust="average",
method.dist="manhattan",
nboot=50)
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.9)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.3)... Done.
## Bootstrap (r = 1.4)... Done.
par(mfrow=c(1,3))
plot(cog2_pvtree, main="COG")
plot(kegg2_pvtree, main="KEGG")
plot(seed1_pvtree, main="SEED")
par(mfrow=c(1,1))
heatmaps
library(gplots)
library(RColorBrewer)
cog_col <- brewer.pal(8, "PuBuGn")
kegg_col <- brewer.pal(8, "OrRd")
seed_col <- brewer.pal(8, "PuRd")
# annonymous functions to fit dendrograms
my_dist <- function(x) {
dist(x, method="manhattan")
}
my_hclust <- function(x) {
hclust(x, method="average")
}
heatmap.2(cog2_scale, col=cog_col,
trace="none",
distfun=my_dist,
hclustfun=my_hclust,
margins=c(12,15))
heatmap.2(kegg2_scale, col=kegg_col,
trace="none",
distfun=my_dist,
hclustfun=my_hclust,
margins=c(10,15))
heatmap.2(seed1_scale, col=seed_col,
trace="none",
distfun=my_dist,
hclustfun=my_hclust,
margins=c(10,15))
kegg2_meta <- read.table("data/kegg_level2_metabolism.txt", header=T, row.names=1, sep="\t")
kegg2_meta_scale <- scale(kegg2_meta, center = FALSE, scale = ORFs) * 100
heatmap.2(kegg2_meta_scale, col=kegg_col,
trace="none",
distfun=my_dist,
hclustfun=my_hclust,
margins=c(10,15))
These dendrograms can get kind of large, so we’ll break it down clusters, by cutting the tree at a distance of 1.
map
variable that we created with the tree to isolate the groups and replot the heatmap (quite the indexing trick)heatmap.2(kegg3_carb_scale[rev(names(map)[map==2]),], col=kegg_col,
trace="none",
distfun=my_dist,
hclustfun=my_hclust,
Rowv=label_order[label_order %in% names(map)[map==2]],
Colv=carb_heat$colDendrogram,
dendrogram="none",
density="none",
breaks=carb_heat$breaks,
margins=c(5,15))
# library(ade4)
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.0-10
# library(gclus)
# library(ape)
# fit NMDS
cog2_scale_nmds <- metaMDS(cog2_scale, distance="manhattan")
## Run 0 stress 0.002283
## Run 1 stress 0.009058
## Run 2 stress 0.003178
## Run 3 stress 0.004435
## Run 4 stress 0.003482
## Run 5 stress 0.002966
## Run 6 stress 0.005038
## Run 7 stress 0.003481
## Run 8 stress 0.004212
## Run 9 stress 0.003092
## Run 10 stress 0.002536
## ... procrustes: rmse 0.01996 max resid 0.07306
## Run 11 stress 0.004278
## Run 12 stress 0.005078
## Run 13 stress 0.003823
## Run 14 stress 0.005987
## Run 15 stress 0.006399
## Run 16 stress 0.002688
## ... procrustes: rmse 0.01418 max resid 0.02353
## Run 17 stress 0.003116
## Run 18 stress 0.002895
## Run 19 stress 0.003559
## Run 20 stress 0.006645
kegg2_scale_nmds <- metaMDS(kegg2_scale, distance="manhattan")
## Wisconsin double standardization
## Run 0 stress 0.03781
## Run 1 stress 0.03835
## Run 2 stress 0.0412
## Run 3 stress 0.04946
## Run 4 stress 0.04026
## Run 5 stress 0.03844
## Run 6 stress 0.04887
## Run 7 stress 0.04579
## Run 8 stress 0.03852
## Run 9 stress 0.04071
## Run 10 stress 0.04278
## Run 11 stress 0.04451
## Run 12 stress 0.04059
## Run 13 stress 0.04428
## Run 14 stress 0.04338
## Run 15 stress 0.03818
## ... procrustes: rmse 0.0182 max resid 0.06687
## Run 16 stress 0.04064
## Run 17 stress 0.04041
## Run 18 stress 0.03775
## ... New best solution
## ... procrustes: rmse 0.006084 max resid 0.02125
## Run 19 stress 0.04088
## Run 20 stress 0.03821
## ... procrustes: rmse 0.02477 max resid 0.09087
seed1_scale_nmds <- metaMDS(seed1_scale, distance="manhattan")
## Run 0 stress 0.002488
## Run 1 stress 0.001856
## ... New best solution
## ... procrustes: rmse 0.07578 max resid 0.1925
## Run 2 stress 0.004627
## Run 3 stress 0.0001204
## ... New best solution
## ... procrustes: rmse 0.0366 max resid 0.09912
## Run 4 stress 0.002023
## Run 5 stress 0.001718
## Run 6 stress 0.0002592
## ... procrustes: rmse 0.004575 max resid 0.01246
## Run 7 stress 0.002856
## Run 8 stress 0.001592
## Run 9 stress 0.001161
## Run 10 stress 0.3896
## Run 11 stress 0.002828
## Run 12 stress 0.003044
## Run 13 stress 0.00595
## Run 14 stress 0.0005387
## ... procrustes: rmse 0.01547 max resid 0.04138
## Run 15 stress 0.0001249
## ... procrustes: rmse 0.002461 max resid 0.007508
## *** Solution reached
## Warning: Stress is (nearly) zero - you may have insufficient data
# resulting plots
par(mfrow=c(1,3))
plot(cog2_scale_nmds, type="t")
plot(kegg2_scale_nmds, type="t")
plot(seed1_scale_nmds, type="t")
par(mfrow=c(1,1))
# redundancy analysis
# transpose to samples on the rows
kegg2_scale.t <- t(kegg2_scale)
# make sure a data frame
kegg2_scale.t.df <- as.data.frame(kegg2_scale.t)
hot_metadata_scale.df <- as.data.frame(hot_metadata_scale)
# run the RDA
kegg2_scale_RDA <- rda(kegg2_scale.t.df ~ ., hot_metadata_scale.df)
plot(kegg2_scale_RDA)