library(vegan)
## Warning: package 'vegan' was built under R version 4.1.2
## Loading required package: permute
## Warning: package 'permute' was built under R version 4.1.2
## Loading required package: lattice
## This is vegan 2.6-4
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.5.1 ✔ purrr 1.0.2
## ✔ tibble 3.2.1 ✔ dplyr 1.1.4
## ✔ tidyr 1.1.4 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggfortify)
wd <- list()
# commonly used paths in my working directory
wd$data <- "/Users/rasmanns/Dropbox/Lavoro/Manuscripts/2024/Sarah_decomposition_phytochemistry/analyses_final/data/formatted/"
wd$output <- "/Users/rasmanns/Dropbox/Lavoro/Manuscripts/2024/Sarah_decomposition_phytochemistry/analyses_final/output/"
sirius <- read.csv(paste0(wd$data, "formatted_peak_annotation.csv"), h=T, sep=",", na.strings = "NA", stringsAsFactors=FALSE)
names(sirius)
## [1] "feature_id_full" "feature_id"
## [3] "feature_mz" "feature_rt"
## [5] "npc_pathway_canopus" "npc_pathway_probability_canopus"
## [7] "npc_superclass_canopus" "npc_superclass_probability_canopus"
## [9] "npc_class_canopus" "npc_class_probability_canopus"
## [11] "name_sirius" "chebiasciiname_sirius"
## [13] "confidencescore_sirius" "inchi_sirius"
## [15] "inchikey2d_sirius" "smiles_sirius"
## [17] "adduct_sirius" "molecularformula_sirius"
## [19] "chebiid_sirius" "feature_id_full_annotated"
Analysis at the superclass level
# here we select the annotated featueres at the superclass level with a probability > 70%
superclass_highprob <- dplyr::filter(sirius, npc_superclass_probability_canopus > 0.7)
levels(factor(superclass_highprob$npc_superclass_canopus)) # we have 57 superclasses
## [1] "Alkylresorcinols" "Amino acid glycosides"
## [3] "Aminosugars and aminoglycosides" "Anthranilic acid alkaloids"
## [5] "Apocarotenoids" "Carotenoids (C40)"
## [7] "Chromanes" "Coumarins"
## [9] "Cyclic polyketides" "Diarylheptanoids"
## [11] "Diterpenoids" "Eicosanoids"
## [13] "Fatty Acids and Conjugates" "Fatty acyl glycosides"
## [15] "Fatty acyls" "Fatty amides"
## [17] "Fatty esters" "Flavonoids"
## [19] "Glycerolipids" "Glycerophospholipids"
## [21] "Guanidine alkaloids" "Histidine alkaloids"
## [23] "β-lactams" "Isoflavonoids"
## [25] "Lignans" "Linear polyketides"
## [27] "Lysine alkaloids" "Macrolides"
## [29] "Meroterpenoids" "Monoterpenoids"
## [31] "Mycosporine derivatives" "Naphthalenes"
## [33] "Nicotinic acid alkaloids" "Nucleosides"
## [35] "Octadecanoids" "Oligopeptides"
## [37] "Ornithine alkaloids" "Peptide alkaloids"
## [39] "Phenanthrenoids" "Phenolic acids (C6-C1)"
## [41] "Phenylpropanoids (C6-C3)" "Polycyclic aromatic polyketides"
## [43] "Polyethers" "Polyols"
## [45] "Pseudoalkaloids" "Pseudoalkaloids (transamidation)"
## [47] "Saccharides" "Serine alkaloids"
## [49] "Sesquiterpenoids" "Small peptides"
## [51] "Sphingolipids" "Steroids"
## [53] "Stilbenoids" "Triterpenoids"
## [55] "Tryptophan alkaloids" "Tyrosine alkaloids"
## [57] "Xanthones"
rownames(superclass_highprob) <- superclass_highprob$feature_id ## assign features ID
superclass <- superclass_highprob %>% select(npc_superclass_canopus) ## keep only the superclasss column with the respective features ID
## chemistry - non-annotated dataset
chem <- read.csv(paste0(wd$data, "formatted_data_scaled_sr.csv"), h=T, sep=",", row.names = 1, na.strings = "NA", stringsAsFactors=FALSE)
names(chem)[1:20]
## [1] "filename" "sample_id" "sample_type" "source_taxon" "site"
## [6] "Sites" "abundance" "organ" "region" "alti"
## [11] "expo" "ntot" "corg" "cn" "X1"
## [16] "X10" "X100" "X1000" "X10000" "X10001"
setdiff(chem$filename, rownames(chem))
## character(0)
####
chem_tab <- chem[,c(15:ncol(chem))] ## select only feaures from the whole dataset
chem_tab2 <- chem_tab
colnames(chem_tab2)<- colnames(chem_tab2) %>% str_replace("X*", "") ## remove the self-generated "X" before nubmers on col names
####
col_names_keep <- rownames(superclass) ## these are the feuateres ID to keep based on the high probability score of the superclass above
chem_tab3 <- chem_tab2 %>% select(any_of(col_names_keep)) ## keep only those features associated with high prob superclass
colnames(chem_tab3) <- superclass$npc_superclass_canopus
chem_tab4 <- as.data.frame(t(chem_tab3))
chem_tab5 <- chem_tab4 %>%
dplyr:: group_by(Superclasses = colnames(chem_tab3))%>%
dplyr:: summarise(across(everything(), list(sum))) %>%
remove_rownames %>%
column_to_rownames(var="Superclasses") %>% t
rownames(chem_tab5)<- gsub('.{0,2}$', '', rownames(chem_tab5)) ## remove the self-generated "X" before nubmers on col names
setdiff(rownames(chem_tab5), rownames(chem))
## character(0)
chem_superclass_tab <- cbind(chem[1:14], chem_tab5, chem[15: ncol(chem)])
Generate a clustering dendrogramm heatmap
library(pheatmap)
names(chem_superclass_tab)[1:80]
## [1] "filename" "sample_id"
## [3] "sample_type" "source_taxon"
## [5] "site" "Sites"
## [7] "abundance" "organ"
## [9] "region" "alti"
## [11] "expo" "ntot"
## [13] "corg" "cn"
## [15] "Alkylresorcinols" "Amino acid glycosides"
## [17] "Aminosugars and aminoglycosides" "Anthranilic acid alkaloids"
## [19] "Apocarotenoids" "Carotenoids (C40)"
## [21] "Chromanes" "Coumarins"
## [23] "Cyclic polyketides" "Diarylheptanoids"
## [25] "Diterpenoids" "Eicosanoids"
## [27] "Fatty Acids and Conjugates" "Fatty acyl glycosides"
## [29] "Fatty acyls" "Fatty amides"
## [31] "Fatty esters" "Flavonoids"
## [33] "Glycerolipids" "Glycerophospholipids"
## [35] "Guanidine alkaloids" "Histidine alkaloids"
## [37] "Isoflavonoids" "Lignans"
## [39] "Linear polyketides" "Lysine alkaloids"
## [41] "Macrolides" "Meroterpenoids"
## [43] "Monoterpenoids" "Mycosporine derivatives"
## [45] "Naphthalenes" "Nicotinic acid alkaloids"
## [47] "Nucleosides" "Octadecanoids"
## [49] "Oligopeptides" "Ornithine alkaloids"
## [51] "Peptide alkaloids" "Phenanthrenoids"
## [53] "Phenolic acids (C6-C1)" "Phenylpropanoids (C6-C3)"
## [55] "Polycyclic aromatic polyketides" "Polyethers"
## [57] "Polyols" "Pseudoalkaloids"
## [59] "Pseudoalkaloids (transamidation)" "Saccharides"
## [61] "Serine alkaloids" "Sesquiterpenoids"
## [63] "Small peptides" "Sphingolipids"
## [65] "Steroids" "Stilbenoids"
## [67] "Triterpenoids" "Tryptophan alkaloids"
## [69] "Tyrosine alkaloids" "Xanthones"
## [71] "β-lactams" "X1"
## [73] "X10" "X100"
## [75] "X1000" "X10000"
## [77] "X10001" "X10002"
## [79] "X10003" "X10004"
chem_superclass_tab$Sites <- as.factor(chem_superclass_tab$Sites)
###for leaves
heatmap_tab <- chem_superclass_tab %>%
dplyr:: filter(organ == "leaves") %>%
dplyr::select(Sites, 15:71) %>%
dplyr:: group_by_at(vars(Sites)) %>%
dplyr:: summarise(across(everything(), list(mean)))%>%
remove_rownames %>%
column_to_rownames(var="Sites") %>% t
rownames(heatmap_tab)<- gsub('.{0,2}$', '', rownames(heatmap_tab)) ## remove the self-generated "X" before nubmers on col names
###
pheatmap(heatmap_tab,
scale = "row",
fontsize= 8,
cutree_rows = 1,
cutree_cols = 8,
main = "Leaves")

###for leaves
heatmap_tab <- chem_superclass_tab %>%
dplyr:: filter(organ == "roots") %>%
dplyr::select(Sites, 15:71) %>%
dplyr:: group_by_at(vars(Sites)) %>%
dplyr:: summarise(across(everything(), list(mean)))%>%
remove_rownames %>%
column_to_rownames(var="Sites") %>% t
rownames(heatmap_tab)<- gsub('.{0,2}$', '', rownames(heatmap_tab)) ## remove the self-generated "X" before nubmers on col names
###
pheatmap(heatmap_tab,
scale = "row",
fontsize= 8,
cutree_rows = 1,
cutree_cols = 8,
main = "Roots")
