This R script was written to take fungal OTU data gathered through ITS amplicon sequencing and metabolomics data gathered through LC-MS/MS. Both analyses were carried out on subsets from the same tissue. This markdown will guide the user through — # Load libraries
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
library(ggplot2)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ✔ purrr 0.3.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
#Filter data to only have the living layer for samples with metabolomics info
fungi_living <- fungi %>%
filter(Decomp_Status == "Living") %>%
filter(Metabolomics == "Y")
#Remove extra columns to aid in creation of distance matrices
fungi_living_OTU <- fungi_living %>%
select("SampleID", contains("OTU"))
Later on, we will need metadata to include in data visualization
fungi_meta <- fungi_living %>%
select(SampleID, Site_Code, HostGenus)
#Remove samples with fewer than 2000 reads
fungi_sums <- fungi_living_OTU %>%
mutate(total = rowSums(fungi_living_OTU[c(2:1741)])) %>%
filter(total > 1300)
#Remove 'total' column
fungi_input <- fungi_sums %>%
mutate(total = NULL)
#Convert to a dataframe that will work with as.matrix
fungi_dataframe <- as.data.frame(fungi_input)
rownames(fungi_dataframe) <- fungi_dataframe$SampleID
fungi_dataframe <- fungi_dataframe[, -1]
fungi_input_meta <- fungi_input %>%
left_join(fungi_meta, fungi_input, by = "SampleID")
#convert to matrix
fungi_mat <- as.matrix(fungi_dataframe)
#calculate distances
fungi.distmat <- vegdist(fungi_mat, method = "bray")
set.seed(2023)
fungi_nmds <- metaMDS(fungi.distmat, distance="bray", k=2, trymax = 100)
## Run 0 stress 0.221533
## Run 1 stress 0.2125934
## ... New best solution
## ... Procrustes: rmse 0.1199497 max resid 0.4216615
## Run 2 stress 0.2181455
## Run 3 stress 0.2510435
## Run 4 stress 0.2446197
## Run 5 stress 0.2181453
## Run 6 stress 0.248204
## Run 7 stress 0.2323909
## Run 8 stress 0.2192584
## Run 9 stress 0.2542132
## Run 10 stress 0.2406459
## Run 11 stress 0.236481
## Run 12 stress 0.2225414
## Run 13 stress 0.2519353
## Run 14 stress 0.2385031
## Run 15 stress 0.2570512
## Run 16 stress 0.2125934
## ... Procrustes: rmse 5.891161e-05 max resid 0.0001910962
## ... Similar to previous best
## Run 17 stress 0.2280101
## Run 18 stress 0.2351187
## Run 19 stress 0.2125934
## ... Procrustes: rmse 5.400257e-05 max resid 0.000169177
## ... Similar to previous best
## Run 20 stress 0.2252341
## *** Best solution repeated 2 times
fungi_nmds$stress
## [1] 0.2125934
stressplot(fungi_nmds) #Shepard plot
scores(fungi_nmds) %>%
as_tibble(rownames = "SampleID") %>%
ggplot(aes(x=NMDS1, y=NMDS2)) +
geom_point()
scores(fungi_nmds) %>%
as_tibble(rownames = "SampleID") %>%
inner_join(., fungi_meta, by = "SampleID") %>%
ggplot(aes(x=NMDS1, y=NMDS2, color=HostGenus, shape=Site_Code)) +
geom_point()
scores(fungi_nmds) %>%
as_tibble(rownames = "SampleID") %>%
inner_join(., fungi_meta, by = "SampleID") %>%
ggplot(aes(x=NMDS1, y=NMDS2, color=HostGenus, shape=Site_Code)) +
geom_point(size=3)+
scale_color_manual(values = c('brown', 'purple', 'blue'))+
theme_bw()
adonis2(vegdist(fungi_mat, method="bray") ~ HostGenus * Site_Code,
data = fungi_input_meta)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = vegdist(fungi_mat, method = "bray") ~ HostGenus * Site_Code, data = fungi_input_meta)
## Df SumOfSqs R2 F Pr(>F)
## HostGenus 2 1.7676 0.16694 2.5668 0.001 ***
## Site_Code 3 1.6402 0.15491 1.5879 0.002 **
## HostGenus:Site_Code 3 1.3269 0.12532 1.2846 0.028 *
## Residual 17 5.8534 0.55283
## Total 25 10.5882 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The end goal of this script is to visualize differences within the datasets and correlate the differences. Here we are looking to correlate metabolites with fungal OTU. Therefore, the next steps are to produce a distance matrix and NMDS for the metabolite data.
metabolite <- read_csv("HILIC_not_gap_filled_abundance_7APR23.csv", show_col_types = FALSE)
metadata <- read_csv("Phyloseq_AK2021_Moss_Metadata-v2.csv", show_col_types = FALSE)
metabolite_w_metadata <- left_join(metadata, metabolite, by = 'SampleID')
#Filter for living samples we have fungal info for
metab_living <- metabolite_w_metadata %>%
filter(Decomp_Status == 'Living') %>%
filter(Metabolomics == 'Y')
metab_meta <- metab_living %>%
select(SampleID, Site_Code, HostGenus)
#Remove extra columns
metab_living_feat <- metab_living %>%
select("SampleID", contains("Feature")) %>%
as.data.frame()
#Create matrix
rownames(metab_living_feat) <- metab_living_feat$SampleID
metab_living_feat <- metab_living_feat[, -1]
metab_mat <- as.matrix(metab_living_feat)
metabolite.distmat <- vegdist(metab_mat, method = "bray")
set.seed(2023)
metab_nmds <- metaMDS(metabolite.distmat, distance = "bray", k=2, trymax = 100)
## Run 0 stress 0.1371134
## Run 1 stress 0.1285196
## ... New best solution
## ... Procrustes: rmse 0.1254962 max resid 0.4975257
## Run 2 stress 0.1500502
## Run 3 stress 0.1298576
## Run 4 stress 0.147465
## Run 5 stress 0.09747497
## ... New best solution
## ... Procrustes: rmse 0.1095758 max resid 0.5231563
## Run 6 stress 0.1500502
## Run 7 stress 0.09745198
## ... New best solution
## ... Procrustes: rmse 0.006277238 max resid 0.02625014
## Run 8 stress 0.1454295
## Run 9 stress 0.1294097
## Run 10 stress 0.1001021
## Run 11 stress 0.136602
## Run 12 stress 0.1294096
## Run 13 stress 0.1248569
## Run 14 stress 0.1068059
## Run 15 stress 0.1320659
## Run 16 stress 0.1372901
## Run 17 stress 0.1068059
## Run 18 stress 0.1294096
## Run 19 stress 0.1591385
## Run 20 stress 0.1519634
## Run 21 stress 0.1295195
## Run 22 stress 0.1311882
## Run 23 stress 0.166473
## Run 24 stress 0.1068059
## Run 25 stress 0.1383764
## Run 26 stress 0.0974766
## ... Procrustes: rmse 0.01017251 max resid 0.03950491
## Run 27 stress 0.1692503
## Run 28 stress 0.1470267
## Run 29 stress 0.1228293
## Run 30 stress 0.09751364
## ... Procrustes: rmse 0.01184083 max resid 0.03884074
## Run 31 stress 0.1002069
## Run 32 stress 0.1632436
## Run 33 stress 0.09753365
## ... Procrustes: rmse 0.01052189 max resid 0.03899888
## Run 34 stress 0.1584984
## Run 35 stress 0.1371134
## Run 36 stress 0.1421129
## Run 37 stress 0.1634359
## Run 38 stress 0.147465
## Run 39 stress 0.1513647
## Run 40 stress 0.09743099
## ... New best solution
## ... Procrustes: rmse 0.002357681 max resid 0.009158309
## ... Similar to previous best
## *** Best solution repeated 1 times
metab_nmds$stress
## [1] 0.09743099
stressplot(metab_nmds) #Shepard plot
scores(metab_nmds) %>%
as_tibble(rownames = "SampleID") %>%
ggplot(aes(x=NMDS1, y=NMDS2)) +
geom_point()
#Add metadata
scores(metab_nmds) %>%
as_tibble(rownames = "SampleID") %>%
inner_join(., metab_meta, by = "SampleID") %>%
ggplot(aes(x=NMDS1, y=NMDS2, color=HostGenus, shape=Site_Code)) +
geom_point()
scores(metab_nmds) %>%
as_tibble(rownames = "SampleID") %>%
inner_join(., metab_meta, by = "SampleID") %>%
ggplot(aes(x=NMDS1, y=NMDS2, color=HostGenus, shape=Site_Code)) +
geom_point(size = 3)+
scale_color_manual(values = c('brown', 'purple', 'blue'))+
theme_bw()
Use PERMANOVA to quantify variation
adonis2(vegdist(metab_mat, method="bray") ~ HostGenus + Site_Code + HostGenus * Site_Code,
data = metab_meta)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = vegdist(metab_mat, method = "bray") ~ HostGenus + Site_Code + HostGenus * Site_Code, data = metab_meta)
## Df SumOfSqs R2 F Pr(>F)
## HostGenus 2 0.22764 0.45093 12.1090 0.001 ***
## Site_Code 3 0.07409 0.14676 2.6273 0.003 **
## HostGenus:Site_Code 3 0.03391 0.06716 1.2024 0.261
## Residual 18 0.16919 0.33515
## Total 26 0.50483 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Make matrices the same size- we have one Aulacomnium sample that gets removed from microbial data bc too few reads
metab_living_filtered <- metab_living %>%
filter(SampleID != "S2.M1.S6.L1")
metab_living_feat_filtered <- metab_living_filtered %>%
select("SampleID", contains("Feature"))
metab_living_df <- as.data.frame(metab_living_feat_filtered)
#Create matrix
rownames(metab_living_df) <- metab_living_df$SampleID
metab_living_df <- metab_living_df[, -1]
metab_mat_corr <- as.matrix(metab_living_df)
metabolite_filtered.distmat <- vegdist(metab_mat_corr, method = "bray")
Use Mantel test to see the overall correlations.
correlation <- mantel(fungi.distmat, metabolite_filtered.distmat, method = "spearman", permutations = 9999, na.rm = TRUE)
correlation
##
## Mantel statistic based on Spearman's rank correlation rho
##
## Call:
## mantel(xdis = fungi.distmat, ydis = metabolite_filtered.distmat, method = "spearman", permutations = 9999, na.rm = TRUE)
##
## Mantel statistic r: 0.5231
## Significance: 1e-04
##
## Upper quantiles of permutations (null model):
## 90% 95% 97.5% 99%
## 0.137 0.174 0.208 0.253
## Permutation: free
## Number of permutations: 9999
I wanted to see if there were significant correlations between specific metabolites and specific OTU.
#Pairwise Correlations
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
#Create pairwise matrix
pairwise <- left_join(metab_living_feat_filtered, fungi_input, by ="SampleID") %>%
as.data.frame()
rownames(pairwise) <- pairwise$SampleID
pairwise <- pairwise[, -1]
pairwise_mat <- as.matrix(pairwise)
pairwise_corr <- rcorr(pairwise_mat)
# cormat : matrix of the correlation coefficients
# pmat : matrix of the correlation p-values
flattenCorrMatrix <- function(cormat, pmat) {
ut <- upper.tri(cormat)
data.frame(
row = rownames(cormat)[row(cormat)[ut]],
column = rownames(cormat)[col(cormat)[ut]],
cor =(cormat)[ut],
p = pmat[ut]
)
}
flat_pairwise <- flattenCorrMatrix(pairwise_corr$r, pairwise_corr$P)
This performed all pairwise correlations and therefore is a large file. I want to visualize only significant correlations between metabolites and OTU. This creates an input file for the app MetScape which is part of Cytoscape. This app has some particular quirks, so the code represents the exact input file format it requires.
##Show only correlations between metabolites
correlation_metab <- flat_pairwise %>%
filter(str_detect(row, "^Feature"))
##Show only correlations between metabolites and OTU
correlation_metab_OTU <- correlation_metab %>%
filter(str_detect(column, "^OTU"))
##Show only significant correlations
correlation_sig <- correlation_metab_OTU %>%
filter(p <= 0.05) %>%
filter(p != 0)
##Replace <0.0001 with 0.0001 because metscape doesn't like "<"
correlation_sig[correlation_sig == '<.0001'] <- "0.0001"
##Add adjusted p val of 1 so metscape will read
correlation_sig$'adj pval' <- 1
##change column names to match metscape example
colnames(correlation_sig)[1] = 'metab1'
colnames(correlation_sig)[2] = 'metab2'
colnames(correlation_sig)[3] = 'pcor'
colnames(correlation_sig)[4] = 'pval'
write_csv(correlation_sig, 'AK_moss2021_correlation_MetscapeInput_StatsWRProj.csv')
#Conclusion By the end of this, you should have ordinations for microbial and metabolite data, statistics on the driving factors behind the variation, statistics on the correlation between the datasets, and an input file to visualize the networks.