library(vegan)
## Warning: package 'vegan' was built under R version 4.1.2
## Loading required package: permute
## Warning: package 'permute' was built under R version 4.1.2
## Loading required package: lattice
## This is vegan 2.6-4
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.5.1     ✔ purrr   1.0.2
## ✔ tibble  3.2.1     ✔ dplyr   1.1.4
## ✔ tidyr   1.1.4     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggfortify)
wd <- list()
# commonly used paths in my working directory
wd$data   <- "/Users/rasmanns/Dropbox/Lavoro/Manuscripts/2024/Sarah_decomposition_phytochemistry/analyses_final/data/formatted/"
wd$output <- "/Users/rasmanns/Dropbox/Lavoro/Manuscripts/2024/Sarah_decomposition_phytochemistry/analyses_final/output/"
sirius <- read.csv(paste0(wd$data, "formatted_peak_annotation.csv"), h=T, sep=",", na.strings = "NA",  stringsAsFactors=FALSE)
names(sirius)
##  [1] "feature_id_full"                    "feature_id"                        
##  [3] "feature_mz"                         "feature_rt"                        
##  [5] "npc_pathway_canopus"                "npc_pathway_probability_canopus"   
##  [7] "npc_superclass_canopus"             "npc_superclass_probability_canopus"
##  [9] "npc_class_canopus"                  "npc_class_probability_canopus"     
## [11] "name_sirius"                        "chebiasciiname_sirius"             
## [13] "confidencescore_sirius"             "inchi_sirius"                      
## [15] "inchikey2d_sirius"                  "smiles_sirius"                     
## [17] "adduct_sirius"                      "molecularformula_sirius"           
## [19] "chebiid_sirius"                     "feature_id_full_annotated"

Analysis at the superclass level

# here we select the annotated featueres at the superclass level with a probability > 70%
superclass_highprob <- dplyr::filter(sirius, npc_superclass_probability_canopus > 0.7) 

levels(factor(superclass_highprob$npc_superclass_canopus)) # we have 57 superclasses 
##  [1] "Alkylresorcinols"                 "Amino acid glycosides"           
##  [3] "Aminosugars and aminoglycosides"  "Anthranilic acid alkaloids"      
##  [5] "Apocarotenoids"                   "Carotenoids (C40)"               
##  [7] "Chromanes"                        "Coumarins"                       
##  [9] "Cyclic polyketides"               "Diarylheptanoids"                
## [11] "Diterpenoids"                     "Eicosanoids"                     
## [13] "Fatty Acids and Conjugates"       "Fatty acyl glycosides"           
## [15] "Fatty acyls"                      "Fatty amides"                    
## [17] "Fatty esters"                     "Flavonoids"                      
## [19] "Glycerolipids"                    "Glycerophospholipids"            
## [21] "Guanidine alkaloids"              "Histidine alkaloids"             
## [23] "β-lactams"                       "Isoflavonoids"                   
## [25] "Lignans"                          "Linear polyketides"              
## [27] "Lysine alkaloids"                 "Macrolides"                      
## [29] "Meroterpenoids"                   "Monoterpenoids"                  
## [31] "Mycosporine derivatives"          "Naphthalenes"                    
## [33] "Nicotinic acid alkaloids"         "Nucleosides"                     
## [35] "Octadecanoids"                    "Oligopeptides"                   
## [37] "Ornithine alkaloids"              "Peptide alkaloids"               
## [39] "Phenanthrenoids"                  "Phenolic acids (C6-C1)"          
## [41] "Phenylpropanoids (C6-C3)"         "Polycyclic aromatic polyketides" 
## [43] "Polyethers"                       "Polyols"                         
## [45] "Pseudoalkaloids"                  "Pseudoalkaloids (transamidation)"
## [47] "Saccharides"                      "Serine alkaloids"                
## [49] "Sesquiterpenoids"                 "Small peptides"                  
## [51] "Sphingolipids"                    "Steroids"                        
## [53] "Stilbenoids"                      "Triterpenoids"                   
## [55] "Tryptophan alkaloids"             "Tyrosine alkaloids"              
## [57] "Xanthones"
rownames(superclass_highprob) <- superclass_highprob$feature_id ## assign features ID

superclass <- superclass_highprob %>% select(npc_superclass_canopus) ## keep only the superclasss column with the respective features ID 
## chemistry - non-annotated dataset
chem <- read.csv(paste0(wd$data, "formatted_data_scaled_sr.csv"), h=T, sep=",", row.names = 1, na.strings = "NA",  stringsAsFactors=FALSE)

names(chem)[1:20]
##  [1] "filename"     "sample_id"    "sample_type"  "source_taxon" "site"        
##  [6] "Sites"        "abundance"    "organ"        "region"       "alti"        
## [11] "expo"         "ntot"         "corg"         "cn"           "X1"          
## [16] "X10"          "X100"         "X1000"        "X10000"       "X10001"
setdiff(chem$filename, rownames(chem))
## character(0)
####
chem_tab <- chem[,c(15:ncol(chem))] ## select only feaures from the whole dataset

chem_tab2 <- chem_tab
colnames(chem_tab2)<- colnames(chem_tab2) %>% str_replace("X*", "") ## remove the self-generated "X" before nubmers on col names

####
col_names_keep <- rownames(superclass) ## these are the feuateres ID to keep based on the high probability score of the superclass above

chem_tab3 <- chem_tab2 %>%  select(any_of(col_names_keep)) ## keep only those features associated with high prob superclass

colnames(chem_tab3) <- superclass$npc_superclass_canopus



chem_tab4 <- as.data.frame(t(chem_tab3))

chem_tab5 <- chem_tab4 %>% 
  dplyr:: group_by(Superclasses = colnames(chem_tab3))%>% 
  dplyr:: summarise(across(everything(), list(sum))) %>% 
          remove_rownames %>% 
          column_to_rownames(var="Superclasses") %>% t

rownames(chem_tab5)<- gsub('.{0,2}$', '', rownames(chem_tab5)) ## remove the self-generated "X" before nubmers on col names
setdiff(rownames(chem_tab5), rownames(chem))
## character(0)
chem_superclass_tab <- cbind(chem[1:14], chem_tab5, chem[15: ncol(chem)])

Generate a clustering dendrogramm heatmap

library(pheatmap)

names(chem_superclass_tab)[1:80]
##  [1] "filename"                         "sample_id"                       
##  [3] "sample_type"                      "source_taxon"                    
##  [5] "site"                             "Sites"                           
##  [7] "abundance"                        "organ"                           
##  [9] "region"                           "alti"                            
## [11] "expo"                             "ntot"                            
## [13] "corg"                             "cn"                              
## [15] "Alkylresorcinols"                 "Amino acid glycosides"           
## [17] "Aminosugars and aminoglycosides"  "Anthranilic acid alkaloids"      
## [19] "Apocarotenoids"                   "Carotenoids (C40)"               
## [21] "Chromanes"                        "Coumarins"                       
## [23] "Cyclic polyketides"               "Diarylheptanoids"                
## [25] "Diterpenoids"                     "Eicosanoids"                     
## [27] "Fatty Acids and Conjugates"       "Fatty acyl glycosides"           
## [29] "Fatty acyls"                      "Fatty amides"                    
## [31] "Fatty esters"                     "Flavonoids"                      
## [33] "Glycerolipids"                    "Glycerophospholipids"            
## [35] "Guanidine alkaloids"              "Histidine alkaloids"             
## [37] "Isoflavonoids"                    "Lignans"                         
## [39] "Linear polyketides"               "Lysine alkaloids"                
## [41] "Macrolides"                       "Meroterpenoids"                  
## [43] "Monoterpenoids"                   "Mycosporine derivatives"         
## [45] "Naphthalenes"                     "Nicotinic acid alkaloids"        
## [47] "Nucleosides"                      "Octadecanoids"                   
## [49] "Oligopeptides"                    "Ornithine alkaloids"             
## [51] "Peptide alkaloids"                "Phenanthrenoids"                 
## [53] "Phenolic acids (C6-C1)"           "Phenylpropanoids (C6-C3)"        
## [55] "Polycyclic aromatic polyketides"  "Polyethers"                      
## [57] "Polyols"                          "Pseudoalkaloids"                 
## [59] "Pseudoalkaloids (transamidation)" "Saccharides"                     
## [61] "Serine alkaloids"                 "Sesquiterpenoids"                
## [63] "Small peptides"                   "Sphingolipids"                   
## [65] "Steroids"                         "Stilbenoids"                     
## [67] "Triterpenoids"                    "Tryptophan alkaloids"            
## [69] "Tyrosine alkaloids"               "Xanthones"                       
## [71] "β-lactams"                       "X1"                              
## [73] "X10"                              "X100"                            
## [75] "X1000"                            "X10000"                          
## [77] "X10001"                           "X10002"                          
## [79] "X10003"                           "X10004"
chem_superclass_tab$Sites <- as.factor(chem_superclass_tab$Sites)
###for leaves
heatmap_tab <- chem_superclass_tab %>% 
  dplyr:: filter(organ == "leaves") %>% 
  dplyr::select(Sites, 15:71) %>%
  dplyr:: group_by_at(vars(Sites)) %>% 
  dplyr:: summarise(across(everything(), list(mean)))%>% 
          remove_rownames %>% 
          column_to_rownames(var="Sites") %>% t

  rownames(heatmap_tab)<- gsub('.{0,2}$', '', rownames(heatmap_tab)) ## remove the self-generated "X" before nubmers on col names
     
###
pheatmap(heatmap_tab,
         scale = "row", 
         fontsize= 8,
         cutree_rows = 1,
         cutree_cols = 8, 
         main = "Leaves")

###for leaves
heatmap_tab <- chem_superclass_tab %>% 
  dplyr:: filter(organ == "roots") %>% 
  dplyr::select(Sites, 15:71) %>%
  dplyr:: group_by_at(vars(Sites)) %>% 
  dplyr:: summarise(across(everything(), list(mean)))%>% 
          remove_rownames %>% 
          column_to_rownames(var="Sites") %>% t

  rownames(heatmap_tab)<- gsub('.{0,2}$', '', rownames(heatmap_tab)) ## remove the self-generated "X" before nubmers on col names
     
###
pheatmap(heatmap_tab,
         scale = "row", 
         fontsize= 8,
         cutree_rows = 1,
         cutree_cols = 8, 
         main = "Roots")