library(readxl)
library(dplyr)
library(tidyr)
library(purrr)
library(taxize)

# Data retrieval
# Tropicos
# Specimen records were downloaded from TROPICOS by querying each country in
# the location field. For each accepted name, the associated synonyms were
# retrieved using the tp_synonyms() function from the taxize R package.
# Code
# Load libraries:
# Load source data and retrieve synonyms (French Guiana):
## ----------------------------
## French Guiana
## ----------------------------
# TROPICOS specimen search results for French Guiana.
FRG <- read.csv(
  file = "../Data/TROPICOS/Specimen Search Results 20260127143117_French_Guiana.csv",
  sep = ",",
  fileEncoding = "Latin1"
)

# Unique name identifiers: keep the first occurrence of each Scientific.Name.Id.
ids <- FRG[["Scientific.Name.Id"]]
pp <- !duplicated(ids)
tropicos_id <- ids[pp]

# Synonym retrieval (kept commented out so the API is not queried on each run).
# The API key is read from the TROPICOS_KEY environment variable rather than
# being written in the script.
# NOTE(review): the result is later read from "../Data/TROPICOS/"; the file
# written below has to be placed there (or the path adjusted).
# info_names_acc <- list()
# ll_acc <- list()
# for (i in seq_along(tropicos_id)) {
#   info_names_acc[[i]] <- tp_synonyms(tropicos_id[[i]],
#                                      key = Sys.getenv("TROPICOS_KEY"))
# }
# saveRDS(info_names_acc, file = "FRG_acc_names.RData")

# Load source data and retrieve synonyms (Guyana):
## ----------------------------
## Guyana
## ----------------------------
# TROPICOS specimen search results for Guyana.
GUY <- read.csv(
  file = "../Data/TROPICOS/Specimen Search Results 20260127143117_Guyana.csv",
  sep = ",",
  fileEncoding = "Latin1"
)

# Unique name identifiers: keep the first occurrence of each Scientific.Name.Id.
GUY_pp <- !duplicated(GUY[["Scientific.Name.Id"]])
tropicos_id_GUY <- GUY[["Scientific.Name.Id"]][GUY_pp]

# Synonym retrieval (kept commented out so the API is not queried on each run).
# The API key is read from the TROPICOS_KEY environment variable rather than
# being written in the script.
# NOTE(review): the result is later read from "../Data/TROPICOS/"; the file
# written below has to be placed there (or the path adjusted).
# info_names_acc_GUY <- list()
# ll_acc_GUY <- list()
# for (i in seq_along(tropicos_id_GUY)) {
#   info_names_acc_GUY[[i]] <- tp_synonyms(tropicos_id_GUY[[i]],
#                                          key = Sys.getenv("TROPICOS_KEY"))
# }
# saveRDS(info_names_acc_GUY, file = "GUY_acc_names.RData")

# Load source data and retrieve synonyms (Suriname):
## ----------------------------
## Suriname
## ----------------------------
# TROPICOS specimen search results for Suriname.
SUR <- read.csv(
  file = "../Data/TROPICOS/Specimen Search Results 20260127143117_Suriname.csv",
  sep = ",",
  fileEncoding = "Latin1"
)

# Unique name identifiers: keep the first occurrence of each Scientific.Name.Id.
SUR_pp <- !duplicated(SUR[["Scientific.Name.Id"]])
tropicos_id_SUR <- SUR[["Scientific.Name.Id"]][SUR_pp]

# Synonym retrieval (kept commented out so the API is not queried on each run).
# The API key is read from the TROPICOS_KEY environment variable rather than
# being written in the script.
# NOTE(review): the result is later read from "../Data/TROPICOS/"; the file
# written below has to be placed there (or the path adjusted).
# info_names_acc_SUR <- list()
# ll_acc_SUR <- list()
# for (i in seq_along(tropicos_id_SUR)) {
#   info_names_acc_SUR[[i]] <- tp_synonyms(tropicos_id_SUR[[i]],
#                                          key = Sys.getenv("TROPICOS_KEY"))
# }
# saveRDS(info_names_acc_SUR, file = "SUR_acc_names.RData")

# Read saved files:
# Read the previously saved synonym lists, one per country.
# The files carry an ".RData" extension but are read with readRDS().
FRG_tropicos <- readRDS("../Data/TROPICOS/FRG_acc_names.RData")
GUY_tropicos <- readRDS("../Data/TROPICOS/GUY_acc_names.RData")
SUR_tropicos <- readRDS("../Data/TROPICOS/SUR_acc_names.RData")

# Now, we need to unnest each file:
# Flatten a list of synonym-query results into one row per
# accepted-name / synonym pair.
#
# tp_list: a list whose elements each provide an `accepted` and a `synonyms`
#   entry; both are passed to dplyr::mutate() and must contain a `nameid`
#   column.
# Returns a tibble with `acc_id` (the position of the element in `tp_list`),
#   the accepted-name fields prefixed "accepted_acc" and the synonym fields
#   prefixed "synonyms_syn".
unnest_acc_syn <- function(tp_list) {
  # `nameid` does not have the same type in every element; coercing it to
  # character lets the per-element tables be combined.
  nameid_as_character <- function(df) {
    mutate(df, nameid = as.character(nameid))
  }

  tibble(raw = tp_list) %>%
    mutate(acc_id = row_number()) %>% # preserve association with the query
    # widen top-level list: creates `accepted` and `synonyms`
    unnest_wider(raw) %>%
    mutate(
      accepted = map(accepted, nameid_as_character),
      synonyms = map(synonyms, nameid_as_character)
    ) %>%
    # unnest accepted
    unnest_longer(accepted) %>%
    unnest_wider(accepted, names_sep = "_acc") %>%
    # unnest synonyms
    unnest_longer(synonyms) %>%
    unnest_wider(synonyms, names_sep = "_syn")
}

## FRENCH GUIANA ##
FRG_acc_syn <- unnest_acc_syn(FRG_tropicos)

## GUYANA ##
GUY_acc_syn <- unnest_acc_syn(GUY_tropicos)

## SURINAME ##
SUR_acc_syn <- unnest_acc_syn(SUR_tropicos)

# The next step is to add a running index, 1:length(tropicos_id), in order to
# join the unnested data (above) with the data with information about
# synonyms. This is necessary because the list of synonyms retrieved from
# TROPICOS does not preserve the scientific name ID of names without synonyms.
# Lookup tables pairing each position in the queried ID vector (acc_id) with
# the TROPICOS name ID found at that position (tp_id).
FRG_id <- data.frame(
  acc_id = seq_len(length(tropicos_id)),
  tp_id = tropicos_id
)

GUY_id <- data.frame(
  acc_id = seq_len(length(tropicos_id_GUY)),
  tp_id = tropicos_id_GUY
)

SUR_id <- data.frame(
  acc_id = seq_len(length(tropicos_id_SUR)),
  tp_id = tropicos_id_SUR
)
# Add IDs for names without synonyms: every queried ID is kept (left join) and
# receives its unnested accepted/synonym rows through the shared acc_id.
FRG_acc_syn2 <- FRG_id %>%
  left_join(FRG_acc_syn, by = "acc_id")
GUY_acc_syn2 <- GUY_id %>%
  left_join(GUY_acc_syn, by = "acc_id")
SUR_acc_syn2 <- SUR_id %>%
  left_join(SUR_acc_syn, by = "acc_id")

# Next, we retrieved detailed taxonomic information from TROPICOS for each
# scientific name using its unique identifier (ID). For example:
# Scientific.Name.Id = 2103228; Scientific.Name = Philodendron guttiferum
# Kunth; Philodendron cf. guttiferum Kunth; Philodendron guttiferum s. lat.
# Kunth.
# Retrieval of name summaries (kept commented out so the API is not queried on
# each run). The API key is read from the TROPICOS_KEY environment variable
# rather than being written in the script.
#
# # FRENCH GUIANA
# ids <- FRG$Scientific.Name.Id
# pp <- !duplicated(FRG$Scientific.Name.Id)
# tropicos_id <- FRG[pp, c("Scientific.Name.Id")]
#
# summary_tp_FRG <- list()
#
# for (i in seq_along(tropicos_id)) {
#   summary_tp_FRG[[i]] <- tp_summary(tropicos_id[[i]],
#                                     key = Sys.getenv("TROPICOS_KEY"))
#   # ll_acc[[i]] <- tidy(info_names_acc[[i]])
#   # print(mass)
# }
#
# saveRDS(summary_tp_FRG, file = "../Data/TROPICOS/summary_tp_FRG.RData")
#
# # GUYANA
# GUY_pp <- !duplicated(GUY$Scientific.Name.Id)
# tropicos_id_GUY <- GUY[GUY_pp, c("Scientific.Name.Id")]
# summary_tp_GUY <- list()
#
# for (i in seq_along(tropicos_id_GUY)) {
#   summary_tp_GUY[[i]] <- tp_summary(tropicos_id_GUY[[i]],
#                                     key = Sys.getenv("TROPICOS_KEY"))
#   # ll_acc[[i]] <- tidy(info_names_acc[[i]])
#   # print(mass)
# }
#
# saveRDS(summary_tp_GUY, file = "../Data/TROPICOS/summary_tp_GUY.RData")
#
# # SURINAME
#
# SUR_pp <- !duplicated(SUR$Scientific.Name.Id)
# tropicos_id_SUR <- SUR[SUR_pp, c("Scientific.Name.Id")]
# summary_tp_SUR <- list()
#
# for (i in seq_along(tropicos_id_SUR)) {
#   summary_tp_SUR[[i]] <- tp_summary(tropicos_id_SUR[[i]],
#                                     key = Sys.getenv("TROPICOS_KEY"))
#   # ll_acc[[i]] <- tidy(info_names_acc[[i]])
#   # print(mass)
# }
#
#
# saveRDS(summary_tp_SUR, file = "../Data/TROPICOS/summary_tp_SUR.RData")

# Read results:
summary_tp_FRG <- readRDS("../Data/TROPICOS/summary_tp_FRG.RData")
summary_tp_GUY <- readRDS("../Data/TROPICOS/summary_tp_GUY.RData")
summary_tp_SUR <- readRDS("../Data/TROPICOS/summary_tp_SUR.RData")

# Convert to tibble and bind all files:
# Row-bind a list of name summaries into one tibble tagged with its country.
#
# summaries: a list whose elements can each be converted with as_tibble().
# country_code: value stored in the `country` column of every row.
# Returns a tibble with `country` as the first column and `list_id` holding
#   the name or position of the source element in `summaries`.
summaries_to_tibble <- function(summaries, country_code) {
  map_dfr(summaries, as_tibble, .id = "list_id") %>%
    # `!!` injects the argument's value so that a column which happened to be
    # called `country_code` could not shadow it
    mutate(country = !!country_code) %>%
    relocate(country, .before = everything())
}

df_tp_FRG <- summaries_to_tibble(summary_tp_FRG, "FRG")
df_tp_GUY <- summaries_to_tibble(summary_tp_GUY, "GUY")
df_tp_SUR <- summaries_to_tibble(summary_tp_SUR, "SUR")

# Bind all files
df_tp_all <- bind_rows(df_tp_FRG, df_tp_GUY, df_tp_SUR)

# Select unique accepted names:
# One row per distinct combination of country and name fields.
acc_names_tp <- df_tp_all %>%
  distinct(
    country, nameid, scientificname, family,
    scientificnamewithauthors, rank,
    namepublishedcitation
  )

# Spread the country codes into one column per country; a cell holds the
# country code when the name was recorded there and NA otherwise.
acc_names_tp2 <- acc_names_tp %>%
  mutate(value = country) %>%
  pivot_wider(
    names_from = country,
    values_from = value
  )

# Exclude gymnosperms, ferns, mosses:
# 1. Retrieve a list of families included in the APG
apg_families <- apgFamilies() # https://www.mobot.org/MOBOT/research/APweb/
# write.csv(apg_families, "apg_families.csv")

# Gymnosperm orders to filter out.
# The list of gymnosperm orders that are included in the APG list was selected
# from https://www.conifers.org/zz/gymnosperms.php
# NOTE(review): several entries below (Zamiaceae, Gnetaceae, Ephedraceae,
# Podocarpaceae, Cupressaceae, Taxaceae) are family names; compared against
# `order` they presumably never match. Confirm whether matching on `family`
# was intended.
gymno_orders <- c(
  "Cycadales", "Zamiaceae", "Ginkgoales", "Gnetales", "Gnetaceae",
  "Ephedraceae", "Pinales", "Araucariales", "Podocarpaceae", "Cupressales",
  "Cupressaceae", "Taxaceae"
)
gymno <- apg_families %>%
  filter(accepted == TRUE & order %in% gymno_orders)

# Fern orders to filter out.
# The list of fern orders was extracted from: A classification for extant
# ferns. TAXON 55 (3) August 2006: 705–731
# "Osmundales" was previously misspelled "Osmundale" and could never match.
fern_orders <- c(
  "Polypodiales", "Cyatheales", "Salviniales", "Schizaeales",
  "Gleicheniales", "Hymenophyllales", "Lycopodiales", "Osmundales",
  "Marattiales", "Equisetales", "Psilotales", "Ophioglossales",
  "Polypodiales-eupolypod I"
)
ferns <- apg_families %>%
  filter(accepted == TRUE & order %in% fern_orders)

# bind gymno and ferns
gymno_ferns <- rbind(gymno, ferns)

# Drop every name whose family is among the gymnosperm / fern families.
# NOTE(review): no moss families are listed above although the accompanying
# text mentions mosses. Confirm whether they need to be excluded here.
acc_names_tp3 <- anti_join(acc_names_tp2, gymno_ferns,
                           by = "family")
# Check if there is one unique ID per name
check <- acc_names_tp3 %>%
  group_by(scientificnamewithauthors) %>%
  summarize(n_ids = n_distinct(nameid))

# No! Dieffenbachia seguine (Jacq.) Schott is associated with two publications
# but Wiener Z. Kunst 3: 803 is the correct publication.
# We will exclude Dieffenbachia seguine (Jacq.) Schott, published in
# Melet. Bot. 1: 20 in 1932. nameid: 2102603
acc_names_tp4 <- acc_names_tp3 %>%
  filter(nameid != 2102603)

check <- acc_names_tp4 %>%
  group_by(scientificnamewithauthors) %>%
  summarize(n_ids = n_distinct(nameid)) # OK! One ID for each

# Save file
# Row names are written (write.csv default); they come back as column `X`
# when the file is re-read, and that column is dropped downstream.
write.csv(acc_names_tp4, "../Data/TROPICOS/Guianas_acc_names_tropicos.csv")

# Now that we have all accepted names (with information from TROPICOS), we
# will join the synonyms.
### Add synonyms to accepted names
Guianas_acc_names_tropicos <- read.csv("../Data/TROPICOS/Guianas_acc_names_tropicos.csv")

# Attach the accepted-name information to one country's accepted/synonym table.
#
# acc_syn: a per-country table with `tp_id`, `acc_id` and the unnested
#   "accepted_acc*" / "synonyms_syn*" columns.
# acc_names: the accepted-names table, keyed by `nameid` and carrying the `X`
#   row-name column created by the CSV round trip.
# Returns `acc_syn` with the accepted-name fields placed first; the redundant
#   "accepted_acc*" columns, `acc_id` and `X` are removed.
add_accepted_info <- function(acc_syn, acc_names) {
  acc_syn %>%
    left_join(acc_names, by = c("tp_id" = "nameid")) %>%
    select(
      -accepted_accnameid,
      -accepted_accscientificname,
      -acc_id,
      -accepted_accscientificnamewithauthors,
      -accepted_accfamily,
      -X
    ) %>%
    relocate(
      tp_id, scientificname,
      family, scientificnamewithauthors,
      rank, namepublishedcitation
    )
}

# Join each country's table separately: acc_id is a row number that restarts
# for every country, so it is not unique across countries and the tables
# cannot be row-bound before this join.

### FRENCH GUIANA ###
FRG_acc_syn_tp <- add_accepted_info(FRG_acc_syn2, Guianas_acc_names_tropicos)

### GUYANA ###
GUY_acc_syn_tp <- add_accepted_info(GUY_acc_syn2, Guianas_acc_names_tropicos)

### SURINAME ###
SUR_acc_syn_tp <- add_accepted_info(SUR_acc_syn2, Guianas_acc_names_tropicos)

# We will bind all files and remove duplicate rows to obtain our final file.
### Stack the accepted/synonym tables of the three countries and drop rows
# that are duplicated in their entirety.
df_all <- distinct(
  bind_rows(FRG_acc_syn_tp, GUY_acc_syn_tp, SUR_acc_syn_tp)
)

# Keep only the names at species rank.
spp_Gui <- filter(df_all, rank == "species")

# Number of unique scientific names with taxonomic rank = species.
# Recorded output: [1] 7371
length(unique(spp_Gui[["scientificnamewithauthors"]]))