1 1. Vegetation: presence/absence, ordination, and diversity

fn_veg <- file.path(DATA_DIR, params$fn_veg)
need_file(fn_veg)
## [1] "/Users/rasmanns/Library/CloudStorage/Dropbox/Lavoro/Manuscripts/2025/Aurora_Urban_Ecology/data_folder/stats_figures/data//Matrice_1_Présence_non-présence plantes.csv"
veg <- read.csv(fn_veg, header = TRUE, sep = ";", na.strings = "NA", check.names = FALSE)
# Minimal checks
stopifnot(all(c("Parc","Layer") %in% names(veg)))
veg$Parc  <- as.factor(veg$Parc)
veg$Layer <- as.factor(veg$Layer)

# Identify community matrix columns (assumes metadata in first 5 cols as in your script)
comm_cols <- 6:ncol(veg)
stopifnot(max(comm_cols) > 6)
comm <- veg[, comm_cols]

# Ensure binary presence/absence
comm_bin <- as.data.frame(comm > 0) * 1
# Jaccard NMDS
d <- vegdist(comm_bin, method = "jaccard", binary = TRUE)
nmds <- metaMDS(d, k = 2, trymax = 100, autotransform = FALSE, trace = 0)

# Site scores
nmds_points <- as.data.frame(scores(nmds, display = "sites"))
colnames(nmds_points) <- c("NMDS1","NMDS2")
nmds_points$Parc    <- veg$Parc
nmds_points$Habitat <- plyr::revalue(
  veg$Layer,
  c("herb" = "Herb", "shrub" = "Shrub", "tree" = "Tree"),
  warn_missing = FALSE
)
nmds_points$Richness <- vegan::specnumber(comm_bin)

# PERMANOVA (strata if available; fallback to Parc)
strata_col <- if ("Parc2" %in% names(veg)) "Parc2" else "Parc"
perm_formula <- as.formula(paste0("comm_bin ~ Layer"))
perm_out <- adonis2(comm_bin ~ Layer, data = veg, permutations = 199, strata = veg[[strata_col]])

# NMDS plot
nmds_plot <- ggplot(nmds_points, aes(NMDS1, NMDS2)) +
  geom_point(aes(size = Richness, color = Parc), alpha = 0.8) +
  geom_text_repel(aes(label = Parc), color = "black", vjust = 3.5, size = 3.5) +
  stat_ellipse(aes(fill = Habitat), geom = "polygon", alpha = 0.2, color = NA,
               type = "norm", level = 0.75) +
  annotate("text", x = -3, y = max(nmds_points$NMDS2, na.rm = TRUE),
           label = paste0("Stress = ", round(nmds$stress, 3)), hjust = 1.05, vjust = 1.5, size = 4) +
  labs(x = "NMDS1", y = "NMDS2", color = "Parc", fill = "Habitat") +
  theme(legend.position = "none")

ggsave(file.path(OUTPUT_DIR, "veg_nmds.png"), nmds_plot, width = 7, height = 6, dpi = 300)
veg$H <- vegan::diversity(comm_bin)
veg$S <- vegan::specnumber(comm_bin)

# Site-level diversity table
site_div <- tibble(
  Site     = if ("Elev" %in% names(veg)) veg$Elev else seq_len(nrow(veg)),
  Parc     = veg$Parc,
  Shannon  = veg$H,
  Richness = veg$S
)
write.csv(site_div, file.path(OUTPUT_DIR, "site_diversity.csv"), row.names = FALSE)

# Parc-pooled presence/absence and diversity
pooled_by_parc <- rowsum(comm_bin, group = veg$Parc)
pooled_by_parc_bin <- (pooled_by_parc > 0) * 1
write.csv(pooled_by_parc_bin, file.path(OUTPUT_DIR, "pooled_by_parc.csv"))

parc_div <- tibble(
  Parc     = rownames(pooled_by_parc_bin),
  Shannon  = as.numeric(vegan::diversity(pooled_by_parc_bin)),
  Richness = as.numeric(vegan::specnumber(pooled_by_parc_bin))
)
write.csv(parc_div, file.path(OUTPUT_DIR, "parc_diversity.csv"), row.names = FALSE)

# Barplot: richness by Parc
diversity_plot2 <- ggplot(parc_div, aes(y = reorder(Parc, Richness), x = Richness, fill = Parc)) +
  geom_col() +
  labs(x = "Species richness", y = "Parcs") +
  theme(legend.position = "none",
        axis.text = element_text(size = 10),
        axis.title = element_text(size = 14))
ggsave(file.path(OUTPUT_DIR, "veg_richness_by_parc.png"), diversity_plot2, width = 7, height = 8, dpi = 300)

# Parc-level covariates and relationships
veg3 <- veg %>%
  dplyr::select(any_of(c("Parc","Area","H","S"))) %>%
  dplyr::group_by(Parc) %>%
  dplyr::summarise(across(c(Area,H,S), mean, na.rm = TRUE), .groups = "drop") %>%
  remove_rownames() %>%
  column_to_rownames("Parc")
## Warning: There was 1 warning in `dplyr::summarise()`.
## ℹ In argument: `across(c(Area, H, S), mean, na.rm = TRUE)`.
## ℹ In group 1: `Parc = Brouette1`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))
veg3$Size <- factor(ifelse(veg3$Area >= 20000, "Big", "Small"))

fit_area_S <- lm(Area ~ S, data = veg3)
area_S_plot <- ggplot(veg3, aes(x = Area, y = S, color = Size)) +
  geom_point() +
  geom_smooth(method = lm, se = TRUE) +
  scale_color_brewer(palette = "Set2") +
  labs(y = "Species richness", x = expression(paste("Area (", m^2, ")")))

ggsave(file.path(OUTPUT_DIR, "area_vs_species.png"), area_S_plot, width = 6.5, height = 5.5, dpi = 300)
## `geom_smooth()` using formula = 'y ~ x'
fig_veg <- ggarrange(
  nmds_plot,
  ggarrange(diversity_plot2, area_S_plot, ncol = 2, labels = c("B)", "C)")),
  ncol = 1, labels = c("A)")
)
## `geom_smooth()` using formula = 'y ~ x'
ggsave(file.path(OUTPUT_DIR, "vegetation_panel.png"), fig_veg, width = 9, height = 12, dpi = 300)
fig_veg

2 2. VOCs (GC–MS/MZmine) processing and ordination

fn_vocs      <- file.path(DATA_DIR, params$fn_vocs)
fn_vocs_meta <- file.path(DATA_DIR, params$fn_vocs_meta)
need_file(fn_vocs); need_file(fn_vocs_meta)
## [1] "/Users/rasmanns/Library/CloudStorage/Dropbox/Lavoro/Manuscripts/2025/Aurora_Urban_Ecology/data_folder/stats_figures/data//fehti_mzmine2_worked.csv"
## [1] "/Users/rasmanns/Library/CloudStorage/Dropbox/Lavoro/Manuscripts/2025/Aurora_Urban_Ecology/data_folder/stats_figures/data//metadata_VOCs.csv"
vocs <- read.csv(fn_vocs,  h=T, sep=";", na.strings = "NA", stringsAsFactors=FALSE)
metadata_vocs <- read.csv(fn_vocs_meta, h=T, sep=";", na.strings = "NA", stringsAsFactors=FALSE, row.names = 1)

# Construct sample x feature matrix
head(vocs)
names(vocs)
##  [1] "row.ID"             "row.m.z"            "row.retention.time"
##  [4] "X02.CDF.Peak.area"  "X01.CDF.Peak.area"  "X11.CDF.Peak.area" 
##  [7] "X22.CDF.Peak.area"  "X23.CDF.Peak.area"  "X63.CDF.Peak.area" 
## [10] "X33.CDF.Peak.area"  "X21.CDF.Peak.area"  "X13.CDF.Peak.area" 
## [13] "X15.CDF.Peak.area"  "X62.CDF.Peak.area"  "X61.CDF.Peak.area" 
## [16] "X12.CDF.Peak.area"  "X73.CDF.Peak.area"  "X31.CDF.Peak.area" 
## [19] "X82.CDF.Peak.area"  "X113.CDF.Peak.area" "X41.CDF.Peak.area" 
## [22] "X103.CDF.Peak.area" "X83.CDF.Peak.area"  "X42.CDF.Peak.area" 
## [25] "X102.CDF.Peak.area" "X43.CDF.Peak.area"  "X81.CDF.Peak.area" 
## [28] "X71.CDF.Peak.area"  "X52.CDF.Peak.area"  "X51.CDF.Peak.area" 
## [31] "X112.CDF.Peak.area" "X93.CDF.Peak.area"  "X123.CDF.Peak.area"
## [34] "X111.CDF.Peak.area" "X92.CDF.Peak.area"  "X101.CDF.Peak.area"
## [37] "X121.CDF.Peak.area" "X122.CDF.Peak.area" "X91.CDF.Peak.area"
vocs2 <- vocs%>%
  select(-c(2,3)) %>% 
  column_to_rownames(var='row.ID') %>% t

# Clean sample names to match metadata
rownames(vocs2)<- rownames(vocs2) %>% str_replace(".CDF.Peak.area*", "")
rownames(vocs2)<- rownames(vocs2) %>% str_replace(".X*", "")
rownames(metadata_vocs)
##  [1] "X02.CDF.Peak.area"  "X15.CDF.Peak.area"  "X11.CDF.Peak.area" 
##  [4] "X12.CDF.Peak.area"  "X13.CDF.Peak.area"  "X21.CDF.Peak.area" 
##  [7] "X22.CDF.Peak.area"  "X23.CDF.Peak.area"  "X31.CDF.Peak.area" 
## [10] "X32.CDF.Peak.area"  "X33.CDF.Peak.area"  "X41.CDF.Peak.area" 
## [13] "X42.CDF.Peak.area"  "X43.CDF.Peak.area"  "X51.CDF.Peak.area" 
## [16] "X52.CDF.Peak.area"  "X53.CDF.Peak.area"  "X61.CDF.Peak.area" 
## [19] "X62.CDF.Peak.area"  "X63.CDF.Peak.area"  "X71.CDF.Peak.area" 
## [22] "X72.CDF.Peak.area"  "X73.CDF.Peak.area"  "X81.CDF.Peak.area" 
## [25] "X82.CDF.Peak.area"  "X83.CDF.Peak.area"  "X91.CDF.Peak.area" 
## [28] "X92.CDF.Peak.area"  "X93.CDF.Peak.area"  "X101.CDF.Peak.area"
## [31] "X102.CDF.Peak.area" "X103.CDF.Peak.area" "X111.CDF.Peak.area"
## [34] "X112.CDF.Peak.area" "X113.CDF.Peak.area" "X121.CDF.Peak.area"
## [37] "X122.CDF.Peak.area" "X123.CDF.Peak.area"
rownames(metadata_vocs)<- rownames(metadata_vocs) %>% str_replace(".CDF.Peak.area*", "")
rownames(metadata_vocs)<- rownames(metadata_vocs) %>% str_replace(".X*", "")


# Merge
vocs3 <- merge(metadata_vocs, vocs2, by = 0)

# Optional row removals kept from original (ensure indices exist)
drop_idx <- intersect(c(1,14), seq_len(nrow(vocs3)))
vocs4 <- if (length(drop_idx)) vocs3[-drop_idx, ] else vocs3

names(vocs4)[2:4] <- c("ID_nb","Parc","Type")
vocs4$Parc <- as.factor(vocs4$Parc)
vocs4$Type <- as.factor(vocs4$Type)
# Internal-standard based scaling (keep structure from original script)
# NOTE: adapt indices if your IS column is elsewhere.
intensity <- vocs4[, 5:44]
is_col    <- 25
scale_fac <- 20
vocscal   <- intensity / vocs4[, is_col] * scale_fac
vocscal   <- vocscal[, -21, drop = FALSE]  # remove IS column from the features

# NMDS on Bray–Curtis
nmds_voc <- metaMDS(vocscal, distance = "bray", trymax = 100, autotransform = FALSE, trace = 0)
scores_voc <- as.data.frame(scores(nmds_voc, display = "sites"))
colnames(scores_voc) <- c("NMDS1","NMDS2")
scores_voc$Parc <- vocs4$Parc
scores_voc$Vegetation_type <- plyr::revalue(
  vocs4$Type, c("Haut" = "Tree", "Milieu" = "Shrub", "Sol" = "Herb"), warn_missing = FALSE
)

nmdsplot_vocs <- ggplot(scores_voc, aes(NMDS1, NMDS2, colour = Parc)) +
  geom_point(size = 3.5, alpha = 0.8) +
  stat_ellipse(aes(fill = Vegetation_type), geom = "polygon", alpha = 0.2, color = NA,
               type = "norm", level = 0.75) +
  annotate("text", x = max(scores_voc$NMDS1, na.rm = TRUE), y = max(scores_voc$NMDS2, na.rm = TRUE),
           label = paste0("Stress = ", round(nmds_voc$stress, 3)), hjust = 1.05, vjust = 1.5, size = 4) +
  labs(color = "Parc", fill = "Habitat") +
  theme(legend.position = "top")

ggsave(file.path(OUTPUT_DIR, "vocs_nmds.png"), nmdsplot_vocs, width = 7, height = 6, dpi = 300)

# PERMANOVA on Type with Parc as strata
perm_voc <- adonis2(vocscal ~ Type, data = vocs4, permutations = 199, strata = vocs4$Parc)

# Derive VOC richness per Parc (count of molecules present in any sample within Parc)
voc_by_parc <- vocscal %>%
  dplyr:: mutate(Parc = vocs4$Parc) %>%
  dplyr:: group_by(Parc) %>%
  dplyr:: summarise(across(where(is.numeric), ~ as.integer(any(. > 0))), .groups = "drop") %>%
  dplyr:: mutate(S_chem = rowSums(across(where(is.numeric))))
write.csv(voc_by_parc, file.path(OUTPUT_DIR, "voc_richness_by_parc.csv"), row.names = FALSE)
# Join VOC chemical richness to vegetation richness per Parc
vocscal2 <- cbind(vocs4[,c(3:4)], vocscal)
vocscal2$Type
##  [1] Sol    Milieu Haut   Sol    Sol    Milieu Haut   Milieu Sol    Milieu
## [11] Haut   Haut   Sol    Milieu Haut   Sol    Haut   Sol    Milieu Haut  
## [21] Sol    Milieu Sol    Milieu Haut   Sol    Haut   Sol    Milieu Haut  
## [31] Sol    Milieu Haut  
## Levels: Haut Milieu Sol
vocscal2$Parc
##  [1] Valency        Valency        Valency        Ouchy          Languedoc     
##  [6] Languedoc      Languedoc      Ouchy          Jeunesse       Jeunesse      
## [11] Jeunesse       Ouchy          Ficelle        Ficelle        Ficelle       
## [16] Milan          Milan          Montbenon      Montbenon      Montbenon     
## [21] Eracom         Eracom         Brouette       Brouette       Brouette      
## [26] Hermitage      Hermitage      Monrepos       Monrepos       Monrepos      
## [31] Derrière-Bourg Derrière-Bourg Derrière-Bourg
## 12 Levels: Brouette Derrière-Bourg Eracom Ficelle Hermitage ... Valency
vocscal2$Parc <- gsub("[0-9]", "", vocscal2$Parc)
vocscal2$Parc <- as.factor(vocscal2$Parc)


veg3_tbl <- vocscal2 %>%   dplyr::select(Parc, 3:33)  %>% 
  dplyr:: group_by(Parc) %>% 
  dplyr:: summarise(across(everything(), ~ sum(., na.rm = TRUE))) %>% 
  column_to_rownames(var='Parc')  

S_chem <- vegan::specnumber(veg3_tbl)

S_veg <-  veg$S

div_table <- cbind(veg3, S_chem, S_veg)

div_table$Parc <- factor(rownames(div_table))
names(div_table)[1] <- c("Area")

vocs_veg_plot <- ggplot(div_table, aes(x = S_veg, y = S_chem)) + 
  geom_point(aes(size = Area, color = Parc )) +
  geom_smooth(method = lm, color = "darkgrey", se = TRUE) +
  ylab("VOCs diversity") +
  xlab("Plant diversity") +
  theme_pubclean(base_size = 14) +
  theme(legend.position = "right")
  
vocs_veg_plot
## `geom_smooth()` using formula = 'y ~ x'

ggsave(file.path(OUTPUT_DIR, "vocs_vs_veg.png"), vocs_veg_plot, width = 7, height = 6, dpi = 300)
## `geom_smooth()` using formula = 'y ~ x'
# Simple model including city section (Parc prefix) as fixed effect
div_table$Parc2 <- sub("[0-9]$", "", div_table$Parc)
fit_vocveg <- lm(S_veg ~ S_chem + Parc2, data = div_table)
summary(fit_vocveg)
## 
## Call:
## lm(formula = S_veg ~ S_chem + Parc2, data = div_table)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.610 -2.396  0.000  1.949 12.976 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)     -5.1429     6.2299  -0.826   0.4176  
## S_chem           0.7071     0.2805   2.521   0.0191 *
## Parc2Eracom     -3.1786     3.8606  -0.823   0.4188  
## Parc2Ficelle    -1.2595     3.9678  -0.317   0.7538  
## Parc2Hermitage  -6.1619     3.9930  -1.543   0.1364  
## Parc2Jeunesse    2.0000     3.8322   0.522   0.6067  
## Parc2Languedoc  -5.5119     3.8606  -1.428   0.1668  
## Parc2Milan       1.0738     3.9678   0.271   0.7891  
## Parc2Monrepos   -3.1619     3.9930  -0.792   0.4365  
## Parc2Montbenon  -1.0000     3.8322  -0.261   0.7965  
## Parc2Ouchy       2.1548     3.8606   0.558   0.5821  
## Parc2Promenade  -0.5929     3.9678  -0.149   0.8825  
## Parc2Valency     0.8381     3.9930   0.210   0.8356  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.693 on 23 degrees of freedom
## Multiple R-squared:  0.4299, Adjusted R-squared:  0.1325 
## F-statistic: 1.445 on 12 and 23 DF,  p-value: 0.2159
anova(fit_vocveg)
write.csv(div_table, file.path(OUTPUT_DIR, "div_table.csv"), row.names = FALSE)

#
fig_vocs <- ggarrange(nmdsplot_vocs + theme(legend.position = "none"),
                      vocs_veg_plot, labels = c("A)", "B)"), ncol = 2, common.legend = TRUE)
## `geom_smooth()` using formula = 'y ~ x'
ggsave(file.path(OUTPUT_DIR, "vocs_panel.png"), fig_vocs, width = 12, height = 6, dpi = 300)
fig_vocs

3 3. HPLC–MS feature table: merge with metadata

fn_hplc_quant <- file.path(DATA_DIR, params$fn_hplc_quant)
fn_hplc_meta  <- file.path(DATA_DIR, params$fn_hplc_meta)
need_file(fn_hplc_quant); need_file(fn_hplc_meta)
## [1] "/Users/rasmanns/Library/CloudStorage/Dropbox/Lavoro/Manuscripts/2025/Aurora_Urban_Ecology/data_folder/stats_figures/data//export_final_quant.csv"
## [1] "/Users/rasmanns/Library/CloudStorage/Dropbox/Lavoro/Manuscripts/2025/Aurora_Urban_Ecology/data_folder/stats_figures/data//metadata_final.csv"
features <- read.csv(fn_hplc_quant, h=T, sep=",", na.strings = "NA", stringsAsFactors=FALSE)

# Transpose samples x features, keep only "Sample" rows
features2 <- as.data.frame(t(features[, 14:(ncol(features) - 1)]))
features_final <- features2 %>%
  rownames_to_column("rowname") %>%
  filter(str_detect(rowname, "Sample")) %>%
  column_to_rownames("rowname")

# Clean rownames (remove leading X and trailing .Peak.area)
features_final_clean <- features_final %>%
  rownames_to_column("rowname") %>%
  mutate(rowname = str_remove(rowname, "^X"),
         rowname = str_remove(rowname, "\\.Peak\\.area$")) %>%
  column_to_rownames("rowname")

# Metadata
metadata <- read.csv(fn_hplc_meta,  h=T, sep=",", na.strings = "NA", stringsAsFactors=FALSE)
metadata_hplc <- metadata %>%
  filter(str_detect(filename, "Sample")) %>%
  column_to_rownames("filename")

# Merge by rownames
hplc_merged <- metadata_hplc %>%
  rownames_to_column("rowname") %>%
  inner_join(features_final_clean %>% rownames_to_column("rowname"), by = "rowname") %>%
  column_to_rownames("rowname")

write.csv(hplc_merged, file = file.path(OUTPUT_DIR, "hplc_merged.csv"))

4 4. Phylogeny and species chemical diversity

# Species vector (de-duplicated)
r <- c(
  "Acer campestre","Acer palmatum","Acer platanoides","Acer pseudoplatanus","Acer saccharum",
  "Achillea millefolium","Aesculus hippocastanum","Anagallis arvensis","Anemone nemorosa",
  "Anthemis tinctoria","Aquilegia vulgaris","Arrhenatherum elatius","Artemisia verlotiorum",
  "Aubrieta deltoidea","Bellis perennis","Betonica officinalis","Betula pendula","Buxus sempervirens",
  "Calystegia sepium","Campanula persicifolia","Campsis grandiflora","Capsella bursa-pastoris",
  "Carex paniculata","Carpinus betulus","Carpinus orientalis","Cedrus atlantica","Cedrus deodara",
  "Centaurea cyanus","Centaurea scabiosa","Centaurea stoebe","Cerastium fontanum",
  "Cercidiphyllum japonicum","Chamaecyparis obtusa","Chimonanthus praecox","Chionanthus retusus",
  "Cichorium intybus","Circaea lutetiana","Cirsium arvense","Convolvulus arvensis","Cornus controversa",
  "Cornus mas","Cornus sanguinea","Cortaderia selloana","Corylus avellana","Corylus cornuta",
  "Cotinus coggygria","Crataegus laevigata","Crepis aurea","Crepis capillaris","Cymbalaria muralis",
  "Dactylis glomerata","Daucus carota","Delphinium elatum","Dryopteris filix-mas","Echium vulgare",
  "Euonymus alatus","Euonymus europaeus","Euonymus latifolius","Euphorbia cyparissias","Fagus sylvatica",
  "Fraxinus excelsior","Galium mollugo","Geranium robertianum","Geranium sanguineum","Geum urbanum",
  "Ginkgo biloba","Glechoma hederacea","Gleditsia triacanthos","Hamamelis virginiana","Hedera helix",
  "Holcus lanatus","Hosta sieboldiana","Hydrangea macrophylla","Hypericum androsaemum","Hypericum palustrum",
  "Hypericum perforatum","Hypochaeris radicata","Iberis sempervirens","Ilex aquifolium","Ilex canariensis",
  "Impatiens parviflora","Juniperus communis","Lactuca virosa","Lathyrus latifolius","Lavandula angustifolia",
  "Leucanthemum vulgare","Leymus arenarius","Ligustrum vulgare","Linaria vulgaris","Liquidambar orientalis",
  "Liriodendron tulipifera","Lolium multiflorum","Lolium perenne","Lonicera japonica","Lonicera xylosteum",
  "Lotus corniculatus","Magnolia × soulangeana","Magnolia kobus","Malva alcea","Malva parviflora",
  "Medicago sativa","Miscanthus sinensis","Nepeta racemosa","Origanum vulgare","Osmanthus heterophyllus",
  "Panicum virgatum","Papaver rhoeas","Parrotia persica","Parthenocissus tricuspidata","Pastinaca sativa",
  "Phleum pratense","Pinus mugo","Pinus nigra","Pinus sylvestris","Plantago lanceolata","Plantago major",
  "Platanus × hispanica","Poa annua","Polygonum aviculare","Polystichum setiferum","Populus nigra",
  "Potentilla reptans","Prunella vulgaris","Prunus × fruticans","Prunus avium","Prunus cerasus",
  "Prunus domestica","Prunus laurocerasus","Prunus lusitanica","Prunus mahaleb","Prunus padus",
  "Prunus serrulata","Prunus yedoensis","Punica granatum","Quercus ilex","Quercus petraea","Quercus robur",
  "Ranunculus acris","Rosa multiflora","Rumex acetosa","Rumex obtusifolius","Salix purpurea","Salvia pratensis",
  "Sambucus canadensis","Scabiosa columbaria","Sequoiadendron giganteum","Silene noctiflora","Sonchus asper",
  "Sonchus oleraceus","Styphnolobium japonicum","Tamarix ramosissima","Taraxacum officinale","Taxus baccata",
  "Taxus brevifolia","Taxus cuspidata","Tilia platyphyllos","Torilis arvensis","Torminalis glaberrima",
  "Tradescantia virginiana","Trifolium arvense","Trifolium pratense","Trifolium repens","Trisetum flavescens",
  "Valeriana officinalis","Verbascum nigrum","Verbascum pulverulentum","Verbena bonariensis","Veronica persica",
  "Viburnum acerifolium","Viburnum henryi","Viburnum lantana","Viburnum opulus","Viburnum trilobum",
  "Vinca major","Vinca minor","Visnaga daucoides"
)
Species <- unique(r)
# Manual corrections retained from source
Species[97]  <- "Magnolia denudata"
Species[124] <- "Prunus spinosa"
Species[117] <- "Platanus orientalis"
Species[158] <- "Sorbus glaberrima"
# TNRS and OpenTree subtree
taxon_search <- tnrs_match_names(names = Species, context_name = "All life")
mu <- tibble(Species = Species,
             ott_name = unique_name(taxon_search),
             ott_id   = taxon_search$ott_id)

ott_in_tree <- ott_id(taxon_search)[is_in_tree(ott_id(taxon_search))]
tr <- tol_induced_subtree(ott_ids = ott_in_tree)
## Progress [--------------------------------] 0/1663 (  0%) ?sProgress [============================] 1663/1663 (100%)  0s                                                            
tr$tip.label <- strip_ott_ids(tr$tip.label, remove_underscores = TRUE)
# Build species x molecule logical matrix from HPLC-merged table (presence if >0)
# Assumes 'Species' column exists in hplc_merged metadata columns.
stopifnot("Species" %in% colnames(hplc_merged))
# Feature columns start after metadata; here we detect numeric-only columns
feature_mat <- hplc_merged %>% select(where(is.numeric))
# Collapse to species presence (any occurrence)
diversity_tbl <- hplc_merged %>%
  dplyr::select(Species, 8:8738)%>%
  dplyr::group_by(Species) %>%
  dplyr::  summarise(across(everything(), ~ as.integer(any(. > 0, na.rm = TRUE))))%>%
  dplyr:: mutate(chem_diversity = rowSums(across(2:ncol(.))))  # counts 1s

write.csv(diversity_tbl, file.path(OUTPUT_DIR, "diversity_tbl.csv"), row.names = FALSE)

# Name normalization for matching to tree tips
normalize_name <- function(x) {
  x %>%
    str_trim() %>%
    str_replace_all("\\s+", " ") %>%
    str_replace_all("×", "x") %>%
    str_replace_all("[._]", " ") %>%
    stringi::stri_trans_general("Latin-ASCII") %>%
    tolower() %>%
    str_replace_all("[^a-z0-9 ]", "") %>%
    str_squish()
}

diversity_tbl <- diversity_tbl %>% mutate(name_norm = normalize_name(Species))
tips      <- tr$tip.label
tips_norm <- normalize_name(tips)

idx <- match(diversity_tbl$name_norm, tips_norm)
diversity_tbl$tip_label <- ifelse(!is.na(idx), tips[idx], NA)

if (anyNA(diversity_tbl$tip_label)) {
  need <- which(is.na(diversity_tbl$tip_label))
  fm <- stringdist::amatch(diversity_tbl$name_norm[need], tips_norm, method = "osa", maxDist = 2)
  diversity_tbl$tip_label[need] <- ifelse(!is.na(fm), tips[fm], NA)
}

unmatched <- diversity_tbl %>% filter(is.na(tip_label)) %>% select(Species)
if (nrow(unmatched) > 0) {
  write.csv(unmatched, file.path(OUTPUT_DIR, "unmatched_species_phylogeny.csv"), row.names = FALSE)
  message("Some species could not be matched to the phylogeny; see unmatched_species_phylogeny.csv")
}
## Some species could not be matched to the phylogeny; see unmatched_species_phylogeny.csv
traits <- diversity_tbl %>% filter(!is.na(tip_label)) %>% select(tip_label, chem_diversity)
# Example correction retained from original
if (length(tr$tip.label) >= 152) tr$tip.label[152] <- "Panicum virgatum"

# Map chemical diversity onto tree
phylo_chem_plot <- ggtree(tr, layout = "circular") %<+% traits +
  geom_tippoint(aes(size = chem_diversity, color = chem_diversity), alpha = 0.6) +
  scale_size_continuous(name = "Chemical diversity", range = c(1, 5)) +
  scale_color_viridis_c(name = "Chemical diversity") +
  theme(legend.position = "right")
## Warning: `aes_()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`
## ℹ The deprecated feature was likely used in the ggtree package.
##   Please report the issue at <https://github.com/YuLab-SMU/ggtree/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in fortify(data, ...): Arguments in `...` must be used.
## ✖ Problematic arguments:
## • as.Date = as.Date
## • yscale_mapping = yscale_mapping
## • branch.length = branch.length
## • hang = hang
## ℹ Did you misspell an argument name?
ggsave(file.path(OUTPUT_DIR, "phylogeny_chem_diversity.png"), phylo_chem_plot, width = 8, height = 8, dpi = 300)
## Warning: Removed 78 rows containing missing values or values outside the scale range
## (`geom_point_g_gtree()`).
phylo_chem_plot
## Warning: Removed 78 rows containing missing values or values outside the scale range
## (`geom_point_g_gtree()`).

5 5. Virtual communities: species vs chemical richness

# Species → primary layer assignment from vegetation matrix
meta_cols <- intersect(c("Parc2","Parc1","Parc","Area","Layer"), names(veg))
species_cols <- setdiff(names(veg), meta_cols)

layer_by_species <- purrr::map_chr(
  species_cols,
  \(sp) {
    rows <- veg[[sp]] > 0
    if (!any(rows)) return(NA_character_)
    lyr <- veg$Layer[rows]
    names(sort(table(lyr), decreasing = TRUE))[1]
  }
) |> setNames(species_cols)

target_layers <- c("herb","shrub","tree")
species_pool  <- names(layer_by_species)[layer_by_species %in% target_layers]

# Prepare chemical matrix from diversity_tbl (molecule columns start with any feature column)
mol_cols <- names(diversity_tbl)[!(names(diversity_tbl) %in% c("Species","name_norm","tip_label","chem_diversity"))]
stopifnot(length(mol_cols) > 0)

norm_name <- function(x) {
  x |>
    stringr::str_to_lower() |>
    stringi::stri_trans_general("Latin-ASCII") |>
    gsub("[^a-z]+", " ", x = _) |>
    stringr::str_squish()
}

chem <- diversity_tbl
chem$species_norm <- norm_name(chem$Species)
pool_norm <- norm_name(species_pool)

chem_index <- match(pool_norm, chem$species_norm)
matched_mask <- !is.na(chem_index)
matched_species <- species_pool[matched_mask]

chem_bool <- as.matrix(chem[mol_cols] > 0)

chemical_richness <- function(species_vec) {
  idx <- match(norm_name(species_vec), chem$species_norm)
  idx <- idx[!is.na(idx)]
  if (length(idx) == 0) return(c(chem_rich = NA_real_, n_with_chem = 0))
  present_any <- matrixStats::colAnys(chem_bool[idx, , drop = FALSE])
  c(chem_rich = sum(present_any), n_with_chem = length(idx))
}

# Observed layer proportions (from veg)
prop <- prop.table(table(veg$Layer[target_layers %in% veg$Layer]))
# If some layers missing, fallback to given vector from your script
if (length(prop) != 3) prop <- c(herb = 0.358, shrub = 0.328, tree = 0.313)

sp_by_layer <- split(names(layer_by_species), layer_by_species)[target_layers]

sizes <- 3:80
reps_per_size <- 100

results <- vector("list", length(sizes) * reps_per_size); k <- 1L
for (s in sizes) {
  for (r in seq_len(reps_per_size)) {
    counts <- round(s * prop / sum(prop))
    while (sum(counts) < s) counts[which.max(prop)] <- counts[which.max(prop)] + 1
    while (sum(counts) > s) counts[which.max(counts)] <- counts[which.max(counts)] - 1

    comm <- character(0)
    for (ly in names(counts)) {
      layer_pool <- intersect(sp_by_layer[[ly]], matched_species)
      n_ly <- counts[[ly]]
      if (n_ly > 0 && length(layer_pool) > 0) {
        pick <- if (n_ly <= length(layer_pool)) sample(layer_pool, n_ly, FALSE) else sample(layer_pool, n_ly, TRUE)
        comm <- c(comm, pick)
      }
    }

    vals <- chemical_richness(comm)
    results[[k]] <- tibble(
      species_richness     = s,
      chemical_richness    = vals[["chem_rich"]],
      n_with_chem_profiles = vals[["n_with_chem"]]
    )
    k <- k + 1L
  }
}
sim_df <- bind_rows(results) |> drop_na(chemical_richness)

# Stats
pear <- cor.test(sim_df$species_richness, sim_df$chemical_richness, method = "pearson")
spear <- cor.test(sim_df$species_richness, sim_df$chemical_richness, method = "spearman")
## Warning in cor.test.default(sim_df$species_richness, sim_df$chemical_richness,
## : Cannot compute exact p-value with ties
lm_fit <- lm(chemical_richness ~ species_richness, data = sim_df)
lm_sum <- summary(lm_fit)

write.csv(sim_df, file.path(OUTPUT_DIR, "virtual_communities_results.csv"), row.names = FALSE)

# Plot
comm_chem_div_plot <- ggplot(sim_df, aes(species_richness, chemical_richness)) +
  geom_point(alpha = 0.25, size = 1) +
  geom_smooth(se = TRUE, color = "darkred", linewidth = 1.2, span = 0.6) +
  labs(x = "Plant species richness (community size)", y = "Chemical richness (unique molecules)")

ggsave(file.path(OUTPUT_DIR, "virtual_communities_plot.png"), comm_chem_div_plot, width = 7, height = 6, dpi = 300)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Inset with phylogeny (optional combined figure)
combo <- ggdraw() +
  draw_plot(comm_chem_div_plot, 0, 0, 1, 1) +
  draw_plot(phylo_chem_plot + theme(legend.position = "right"), 0.55, 0.1, 0.4, 0.4)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 78 rows containing missing values or values outside the scale range
## (`geom_point_g_gtree()`).
ggsave(file.path(OUTPUT_DIR, "communities_plus_phylogeny.png"), combo, width = 10, height = 7, dpi = 300)
combo

6 6. Summary panels

panel_veg_vocs <- ggarrange(
  nmds_plot, nmdsplot_vocs, labels = c("A) Vegetation NMDS","B) VOCs NMDS"), ncol = 2
)
ggsave(file.path(OUTPUT_DIR, "panel_veg_vocs.png"), panel_veg_vocs, width = 12, height = 6, dpi = 300)
panel_veg_vocs

7 7. PERMANOVA summaries

cat("Vegetation PERMANOVA (Jaccard PA, factor Layer; strata =", strata_col, ")\\n")
## Vegetation PERMANOVA (Jaccard PA, factor Layer; strata = Parc2 )\n
print(perm_out)
## Permutation test for adonis under reduced model
## Blocks:  strata 
## Permutation: free
## Number of permutations: 199
## 
## adonis2(formula = comm_bin ~ Layer, data = veg, permutations = 199, strata = veg[[strata_col]])
##          Df SumOfSqs      R2      F Pr(>F)   
## Model     2   3.0999 0.20437 4.2383  0.005 **
## Residual 33  12.0681 0.79563                 
## Total    35  15.1680 1.00000                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\\nVOCs PERMANOVA (Bray–Curtis; factor Type; strata = Parc)\\n")
## \nVOCs PERMANOVA (Bray–Curtis; factor Type; strata = Parc)\n
print(perm_voc)
## Permutation test for adonis under reduced model
## Blocks:  strata 
## Permutation: free
## Number of permutations: 199
## 
## adonis2(formula = vocscal ~ Type, data = vocs4, permutations = 199, strata = vocs4$Parc)
##          Df SumOfSqs      R2      F Pr(>F)   
## Model     2  0.31375 0.11772 2.0015  0.005 **
## Residual 30  2.35138 0.88228                 
## Total    32  2.66513 1.00000                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

8 8. Session info

sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Zurich
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] stringdist_0.9.15  rotl_3.1.0         ggtree_3.16.3      ape_5.8-1         
##  [5] matrixStats_1.5.0  stringi_1.8.7      cowplot_1.2.0      pheatmap_1.0.13   
##  [9] janitor_2.2.1      ggfortify_0.4.19   lubridate_1.9.4    forcats_1.0.0     
## [13] stringr_1.5.2      dplyr_1.1.4        purrr_1.1.0        readr_2.1.5       
## [17] tidyr_1.3.1        tibble_3.3.0       tidyverse_2.0.0    ggrepel_0.9.6     
## [21] ggpubr_0.6.1       plyr_1.8.9         ggplot2_4.0.0      vegan_2.7-1       
## [25] permute_0.9-8      igraph_2.1.4       RColorBrewer_1.1-3
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1   viridisLite_0.4.2  farver_2.1.2       S7_0.2.0          
##  [5] fastmap_1.2.0      lazyeval_0.2.2     XML_3.99-0.19      digest_0.6.37     
##  [9] timechange_0.3.0   lifecycle_1.0.4    cluster_2.1.8.1    tidytree_0.4.6    
## [13] magrittr_2.0.4     compiler_4.5.1     progress_1.2.3     rlang_1.1.6       
## [17] sass_0.4.10        tools_4.5.1        yaml_2.3.10        knitr_1.50        
## [21] ggsignif_0.6.4     labeling_0.4.3     prettyunits_1.2.0  curl_7.0.0        
## [25] aplot_0.2.9        abind_1.4-8        withr_3.0.2        grid_4.5.1        
## [29] scales_1.4.0       MASS_7.3-65        cli_3.6.5          crayon_1.5.3      
## [33] rmarkdown_2.29     ragg_1.5.0         treeio_1.32.0      generics_0.1.4    
## [37] rstudioapi_0.17.1  httr_1.4.7         tzdb_0.5.0         cachem_1.1.0      
## [41] splines_4.5.1      parallel_4.5.1     ggplotify_0.1.3    vctrs_0.6.5       
## [45] yulab.utils_0.2.1  Matrix_1.7-3       jsonlite_2.0.0     carData_3.0-5     
## [49] car_3.1-3          patchwork_1.3.2    gridGraphics_0.5-1 hms_1.1.3         
## [53] rstatix_0.7.2      Formula_1.2-5      systemfonts_1.2.3  jquerylib_0.1.4   
## [57] glue_1.8.0         gtable_0.3.6       pillar_1.11.1      rappdirs_0.3.3    
## [61] htmltools_0.5.8.1  R6_2.6.1           textshaping_1.0.3  evaluate_1.0.5    
## [65] lattice_0.22-7     rentrez_1.2.4      backports_1.5.0    broom_1.0.10      
## [69] snakecase_0.11.1   ggfun_0.2.0        bslib_0.9.0        rncl_0.8.7        
## [73] Rcpp_1.1.0         gridExtra_2.3      nlme_3.1-168       mgcv_1.9-3        
## [77] xfun_0.53          fs_1.6.6           pkgconfig_2.0.3

8.1 Notes for repository upload

  • Keep the data/ directory structure consistent with params.
  • The output/ directory contains:
    • Tables: site_diversity.csv, pooled_by_parc.csv, parc_diversity.csv, voc_richness_by_parc.csv, div_table.csv, hplc_merged.csv, diversity_tbl.csv, virtual_communities_results.csv, and optionally unmatched_species_phylogeny.csv.
    • Figures: veg_nmds.png, veg_richness_by_parc.png, area_vs_species.png, vegetation_panel.png, vocs_nmds.png, vocs_vs_veg.png, vocs_panel.png, phylogeny_chem_diversity.png, virtual_communities_plot.png, communities_plus_phylogeny.png, panel_veg_vocs.png.
  • For exact reproducibility, consider archiving an renv.lock file created with {renv} after running renv::init() in the project root.
  • This document avoids hard-coded absolute paths and provides checks where column names were ambiguous (e.g., Parc2 vs Parc).