library(kewr)
library(dplyr)
library(tidyr)
library(purrr)
library(taxize)
library(tidyr)
library(stringr)
Data retrieval
Data retrieval from Plants of the World (POWO)
To obtain the names of flowering plants (angiosperms) occurring in Guyana, Suriname, and French Guiana, we downloaded the World Checklist of Vascular Plants (WCVP) from the Royal Botanic Gardens, Kew.
From this dataset, we excluded all non-angiosperm families and retained only names whose distribution field included Guyana (GUY), Suriname (SUR), or French Guiana (FRG).
The WCVP dataset was downloaded from http://sftp.kew.org/pub/data-repositories/WCVP/. Based on this dataset, we performed a series of queries, as outlined below.
Code
Load libraries:
Load source data:
# Read the two pipe-delimited WCVP tables (names and distribution) from the
# local copy of the WCVP download.
wcvp_names <- read.csv("../Data/wcvp/wcvp_names.csv", sep = "|")
wcvp_distribution <- read.csv("../Data/wcvp/wcvp_distribution.csv", sep = "|")

# Select names that occur in Guyana, Suriname, or French Guiana:
# Standardize data types: coerce the join key to integer in both tables so
# the join below compares values of the same type.
wcvp_names$plant_name_id <- as.integer(wcvp_names$plant_name_id)
wcvp_distribution$plant_name_id <- as.integer(wcvp_distribution$plant_name_id)

## Join the two files: every distribution record keeps its row and gains the
## matching columns from the names table.
df <- wcvp_distribution %>%
  left_join(wcvp_names, by = "plant_name_id")

# Select names that occur in Guyana, Suriname, or French Guiana.
# `%in%` keeps exactly the rows whose level-3 area code is one of the three
# codes; rows with a missing area code are dropped, as before.
df_Guianas <- df %>%
  filter(area_code_l3 %in% c("GUY", "FRG", "SUR"))

# Select only Angiosperms:
# 1. Retrieve the list of families included in the APG
#    (https://www.mobot.org/MOBOT/research/APweb/) and keep a CSV copy of it
#    in the working directory.
apg_families <- taxize::apgFamilies()
write.csv(x = apg_families, file = "apg_families.csv")
# Gymnosperm orders to filter out of the WCVP.
# The orders of gymnosperms that are included in the APG list were taken from
# https://www.conifers.org/zz/gymnosperms.php
# NOTE(review): the entries ending in "-aceae" look like family names rather
# than orders; they are still matched against `order`, exactly as before.
gymno <- apg_families %>%
  filter(
    accepted == TRUE,
    order %in% c(
      "Cycadales", "Zamiaceae", "Ginkgoales", "Gnetales", "Gnetaceae",
      "Ephedraceae", "Pinales", "Araucariales", "Podocarpaceae",
      "Cupressales", "Cupressaceae", "Taxaceae"
    )
  )
# Fern orders to filter out of the WCVP.
# The list of fern orders was extracted from: A classification for extant
# ferns. TAXON 55 (3) August 2006: 705–731
# "Osmundales" was previously misspelled "Osmundale", so that order could
# never match.
# NOTE(review): "Polypodiales-eupolypod I" is kept verbatim; confirm whether
# other "Polypodiales-..." labels occur in the `order` column and should be
# listed too.
ferns <- apg_families %>%
  filter(
    accepted == TRUE,
    order %in% c(
      "Polypodiales", "Cyatheales", "Salviniales", "Schizaeales",
      "Gleicheniales", "Hymenophyllales", "Lycopodiales", "Osmundales",
      "Marattiales", "Equisetales", "Psilotales", "Ophioglossales",
      "Polypodiales-eupolypod I"
    )
  )
# Stack the gymnosperm and fern family tables into one exclusion list.
gymno_ferns <- rbind(gymno, ferns)

# Keep only the records whose family is NOT in the exclusion list, then save
# the filtered checklist.
df_Guianas2 <- df_Guianas %>%
  anti_join(gymno_ferns, by = "family")
write.csv(x = df_Guianas2, file = "../Data/POWO/df_Guianas2.csv")
# Count the number of distinct genera recorded for each family in the
# filtered checklist.
totals <- df_Guianas2 %>%
  group_by(family) %>%
  summarise(N_genus = n_distinct(genus))

# Preview the first rows; the rendered output is kept below as comments.
head(totals)
#> # A tibble: 6 × 2
#> family N_genus
#> <chr> <int>
#> 1 Acanthaceae 11
#> 2 Achariaceae 3
#> 3 Aizoaceae 1
#> 4 Alismataceae 2
#> 5 Alstroemeriaceae 1
#> 6 Amaranthaceae 7
So far, the list contains 8,092 taxon names, including genus, species, variety, subspecies, and form. For each name, we retrieve the corresponding IPNI unique identifier, taxonomic status, publication details, and synonymy using the lookup_powo() function from the kewr package.
# Select unique IDs: keep the first occurrence of every powo_id.
pp <- !duplicated(df_Guianas2$powo_id)
powo_id <- df_Guianas2[pp, c("powo_id")]

# Create empty lists to be populated with the data that will be retrieved
# from POWO
info_names_acc <- list()
ll_acc <- list()

# Loop to retrieve information for each name.
# Running this loop can take a while, so it is left commented out and the
# previously saved result is read from disk instead.
# for (i in seq_along(powo_id)) {
#   info_names_acc[[i]] <- lookup_powo(powo_id[[i]])
#   ll_acc[[i]] <- tidy(info_names_acc[[i]])
# }
# save the list
# saveRDS(ll_acc, file = "Guianas_powo_acc_names.RData")
# NOTE(review): the file is written with saveRDS()/read with readRDS() despite
# its .RData extension; load() will not work on it.
ll_acc <- readRDS("../Data/POWO/Guianas_powo_acc_names.RData")

# The function lookup_powo retrieves a large nested list. The next step is to
# unnest this list to keep only relevant information (synonyms and location).
- Unnest distribution
# Convert to tibble: one row per retrieved record, with each top-level field
# of the record spread into its own column prefixed with "V_".
ll_acc2 <- tibble(V_ = ll_acc) %>%
  unnest_wider(V_, names_sep = "")

# Add a row id
ll_acc2 <- ll_acc2 %>%
  mutate(acc_id = row_number())

## Unnest synonyms: one row per synonym entry of each record.
# NOTE(review): with tidyr's default keep_empty = FALSE, records that have no
# synonyms are presumably dropped here; confirm that this is intended.
df_acc_and_synonyms <- ll_acc2 %>%
  tidyr::unnest_longer(V_synonyms, values_to = "T_synonym") %>%
  tidyr::unnest(T_synonym) %>%
  # Put the record id and the record's name (as accepted_name) first
  select(acc_id, accepted_name = V_name, everything()) %>%
  # Drop the remaining nested columns that are not needed in the flat table
  select(
    -V_basionym, -V_basionymOf, -V_classification,
    -V_childNameUsages, -V_locations
  ) %>%
  # Suffix the synonym's own fields so they are not confused with the V_*
  # fields of the record. The misspelled output names (synoynm, Satus) are
  # kept unchanged because they end up as headers in the exported CSV.
  rename(
    fqId_synonym = fqId,
    name_synoynm = name,
    author_synoynm = author,
    rank_synonym = rank,
    taxonomicSatus_synonym = taxonomicStatus
  ) %>%
  # One row per (record, synonym) pair
  distinct(V_fqId, fqId_synonym, .keep_all = TRUE)

# TO CHECK: The columns V_basionym and V_basionymOf contain information on
# basionyms. We should check if we need this info, or if all we need is
# already included in the column synonyms, which includes homotypic and
# heterotypic synonyms.
Unnest location
# Flatten each record's nested V_locations into a plain vector, then produce
# one row per (record, location value) pair and drop exact duplicates.
df_locations <- ll_acc2 %>%
  mutate(locations = map(V_locations, ~ unlist(.x))) %>%
  unnest_longer(locations, values_to = "locations_all") %>%
  select(V_fqId, V_name, V_authors, V_rank, V_reference, locations_all) %>%
  distinct(V_fqId, V_name, V_authors, V_rank, V_reference, locations_all)

## Select names that occur in Guyana, Suriname, or French Guiana
df_locations_Gui <- df_locations %>%
  filter(
    str_starts(locations_all, "GUY") |
      str_starts(locations_all, "SUR") |
      str_starts(locations_all, "FRG")
  )

# Compare the number of names found via POWO locations with the number of
# WCVP name ids in the filtered checklist.
length(unique(df_locations_Gui$V_name))
#> [1] 4039
length(unique(df_Guianas2$plant_name_id))
#> [1] 11018

# Make one row per name, and locations in individual columns
df_wide <- df_locations_Gui %>%
  arrange(V_fqId, locations_all) %>%
  group_by(V_fqId) %>%
  # Number the locations within each name so they can become column suffixes
  mutate(loc_n = row_number()) %>%
  ungroup() %>%
  pivot_wider(
    names_from = loc_n,
    values_from = locations_all,
    names_prefix = "location_"
  ) %>%
  # NOTE(review): exactly six location columns are assumed to exist; this
  # errors if fewer are produced and silently drops any beyond the sixth.
  select(
    V_fqId, location_1, location_2, location_3,
    location_4, location_5, location_6
  )

# Attach the location columns to the accepted-name/synonym table.
df_all <- df_acc_and_synonyms %>%
  left_join(df_wide, by = "V_fqId")

df_acc_and_synonyms
#> # A tibble: 22,451 × 37
#> acc_id accepted_name V_modified V_bibliographicCitation V_genus
#> <int> <chr> <chr> <chr> <chr>
#> 1 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> 2 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> 3 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> 4 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> 5 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> 6 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> 7 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> 8 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> 9 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> 10 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> # ℹ 22,441 more rows
#> # ℹ 32 more variables: V_taxonomicStatus <chr>, V_kingdom <chr>,
#> #   V_phylum <chr>, V_clazz <chr>, V_subclass <chr>, V_order <chr>,
#> #   V_family <chr>, V_nomenclaturalCode <chr>, V_source <chr>,
#> #   V_namePublishedInYear <int>, V_taxonRemarks <chr>,
#> #   V_nomenclaturalStatus <chr>, V_lifeform <chr>, V_climate <chr>,
#> #   V_hybrid <lgl>, V_plantae <lgl>, V_fungi <lgl>, V_synonym <lgl>, …

# NOTE(review): df_all (with the location columns) is built above, but the
# table exported here is df_acc_and_synonyms; confirm which one is meant.
write.csv(df_acc_and_synonyms, "../Data/POWO/df_acc_and_synonyms.csv")

# Now that we have unnested synonyms and location, we will join the files and
# export it as .csv
# Attach the per-name location columns of df_wide to the accepted-name/synonym
# table; rows without a match in df_wide get NA in the location columns.
df_all <- df_acc_and_synonyms %>%
  left_join(df_wide, by = "V_fqId")

# Print the synonym table; the rendered output is kept below as comments.
df_acc_and_synonyms
#> # A tibble: 22,451 × 37
#> acc_id accepted_name V_modified V_bibliographicCitation V_genus
#> <int> <chr> <chr> <chr> <chr>
#> 1 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> 2 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> 3 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> 4 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> 5 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> 6 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> 7 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> 8 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> 9 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> 10 1 Acalypha 2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
#> # ℹ 22,441 more rows
#> # ℹ 32 more variables: V_taxonomicStatus <chr>, V_kingdom <chr>,
#> #   V_phylum <chr>, V_clazz <chr>, V_subclass <chr>, V_order <chr>,
#> #   V_family <chr>, V_nomenclaturalCode <chr>, V_source <chr>,
#> #   V_namePublishedInYear <int>, V_taxonRemarks <chr>,
#> #   V_nomenclaturalStatus <chr>, V_lifeform <chr>, V_climate <chr>,
#> #   V_hybrid <lgl>, V_plantae <lgl>, V_fungi <lgl>, V_synonym <lgl>, …

# NOTE(review): df_all (with the location columns) is built above, but the
# table exported here is df_acc_and_synonyms; confirm which one is meant.
write.csv(df_acc_and_synonyms, "../Data/POWO/df_acc_and_synonyms.csv")