Data retrieval

Author

Juliana Stropp

Data retrieval from Plants of the World Online (POWO)

To obtain the names of flowering plants (angiosperms) occurring in Guyana, Suriname, and French Guiana, we downloaded the World Checklist of Vascular Plants (WCVP) from the Royal Botanic Gardens, Kew.

From this dataset, we excluded all non-angiosperm families and retained only names whose distribution field included Guyana (GUY), Suriname (SUR), or French Guiana (FRG).

The WCVP dataset was downloaded from http://sftp.kew.org/pub/data-repositories/WCVP/. Based on this dataset, we performed a series of queries, as outlined below.

Code

Load libraries:

library(kewr)
library(dplyr)
library(tidyr)
library(purrr)
library(taxize)
library(tidyr)
library(stringr)

Load source data:

# Read the two pipe-delimited WCVP tables (names and distribution).
wcvp_names <- read.csv(
  file = "../Data/wcvp/wcvp_names.csv",
  sep = "|"
)
wcvp_distribution <- read.csv(
  file = "../Data/wcvp/wcvp_distribution.csv",
  sep = "|"
)

Select names that occur in Guyana, Suriname, or French Guiana:

# Coerce the join key to integer in both tables so the key types agree
wcvp_names[["plant_name_id"]] <- as.integer(wcvp_names[["plant_name_id"]])
wcvp_distribution[["plant_name_id"]] <- as.integer(
  wcvp_distribution[["plant_name_id"]]
)

## Join the two files: distribution records on the left, names attached
df <- left_join(wcvp_distribution, wcvp_names, by = "plant_name_id")

# Keep only records whose level-3 area code is Guyana, French Guiana,
# or Suriname
df_Guianas <- filter(df, area_code_l3 %in% c("GUY", "FRG", "SUR"))

Select only Angiosperms:

# 1. Retrieve the list of families included in the APG
#    (https://www.mobot.org/MOBOT/research/APweb/) and save a local copy
apg_families <- apgFamilies()
write.csv(apg_families, file = "apg_families.csv")

# Select the gymnosperm families to filter out from the WCVP.
# The gymnosperm orders and families below were selected from
# https://www.conifers.org/zz/gymnosperms.php
#
# The original filter compared family names (e.g. "Zamiaceae") against the
# `order` column, where they cannot match an order name. Orders are now
# matched against `order` and family names against `family`, so the listed
# families are excluded whichever order the APG list assigns them to.
gymno_orders <- c(
  "Cycadales",
  "Ginkgoales",
  "Gnetales",
  "Pinales",
  "Araucariales",
  "Cupressales"
)
gymno_family_names <- c(
  "Zamiaceae",
  "Gnetaceae",
  "Ephedraceae",
  "Podocarpaceae",
  "Cupressaceae",
  "Taxaceae"
)

# Only rows flagged as accepted in the APG list are kept
gymno <- apg_families %>%
  filter(
    accepted == TRUE &
      (order %in% gymno_orders | family %in% gymno_family_names)
  )

# Select the fern orders to filter out from the WCVP.
# The list of fern orders was extracted from: A classification for extant
# ferns. TAXON 55 (3) August 2006: 705–731
#
# "Osmundales" was previously misspelled "Osmundale", so that order was
# never matched. "Lycopodiales" is a lycophyte order; it is kept in the
# list because lycophytes are also non-angiosperms.
fern_orders <- c(
  "Polypodiales",
  "Cyatheales",
  "Salviniales",
  "Schizaeales",
  "Gleicheniales",
  "Hymenophyllales",
  "Lycopodiales",
  "Osmundales",
  "Marattiales",
  "Equisetales",
  "Psilotales",
  "Ophioglossales",
  "Polypodiales-eupolypod I"
)

# Only rows flagged as accepted in the APG list are kept
ferns <- apg_families %>%
  filter(accepted == TRUE & order %in% fern_orders)

# Stack the gymnosperm and fern families into one exclusion table
gymno_ferns <- rbind(gymno, ferns)

# Drop every Guianas record whose family appears in the exclusion table
df_Guianas2 <- df_Guianas %>%
  anti_join(gymno_ferns, by = "family")

write.csv(df_Guianas2, file = "../Data/POWO/df_Guianas2.csv")

# Count the distinct genera per family among the retained records
totals <- summarise(
  group_by(df_Guianas2, family),
  N_genus = n_distinct(genus)
)

head(totals)
# A tibble: 6 × 2
  family           N_genus
  <chr>              <int>
1 Acanthaceae           11
2 Achariaceae            3
3 Aizoaceae              1
4 Alismataceae           2
5 Alstroemeriaceae       1
6 Amaranthaceae          7

So far, the list contains 8,092 taxon names, including genus, species, variety, subspecies, and form. For each name, we retrieve the corresponding IPNI unique identifier, taxonomic status, publication details, and synonymy using the lookup_powo() function from the kewr package.

# Select the unique POWO ids, keeping the first occurrence of each
pp <- !duplicated(df_Guianas2$powo_id)
powo_id <- df_Guianas2$powo_id[pp]

# Create empty lists to be populated with the data retrieved from POWO:
# raw lookup results and their tidied versions
info_names_acc <- list()
ll_acc <- list()

# Loop to retrieve information for each name.
# Running this loop can take a while, so it is left commented out and the
# previously saved result is loaded below instead.
# for (i in 1:length(powo_id)){
#   info_names_acc[[i]] <- lookup_powo(powo_id[[i]])
#   ll_acc[[i]]<-tidy(info_names_acc[[i]])
# }

# Save the list
# NOTE(review): this commented call writes to the working directory, while
# the readRDS() below reads from ../Data/POWO/ — confirm the intended path.
#saveRDS(ll_acc, file="Guianas_powo_acc_names.RData")

# Load the saved list (an RDS file, despite the .RData extension)
ll_acc <- readRDS("../Data/POWO/Guianas_powo_acc_names.RData")

The function lookup_powo retrieves a large nested list. The next step is to unnest this list to keep only relevant information (synonyms and location).

  1. Unnest synonyms
# Convert the list of POWO records to a tibble (one row per record, one
# "V_"-prefixed column per top-level field) and add a row id
ll_acc2 <- tibble(V_ = ll_acc) %>%
  unnest_wider(V_, names_sep = "") %>%
  mutate(acc_id = row_number())

## Unnest synonyms: one row per (record, synonym) pair, de-duplicated on
## the record id and the synonym id.
# NOTE(review): unnest_longer() and unnest() default to keep_empty = FALSE,
# so records without synonyms presumably drop out here — confirm that this
# is intended.
# The misspelled output names (name_synoynm, author_synoynm,
# taxonomicSatus_synonym) are kept as-is because they become column headers
# in the exported CSV.
df_acc_and_synonyms <- ll_acc2 %>%
  tidyr::unnest_longer(V_synonyms, values_to = "T_synonym") %>%
  tidyr::unnest(T_synonym) %>%
  select(acc_id, accepted_name = V_name, everything()) %>%
  select(
    -V_basionym,
    -V_basionymOf,
    -V_classification,
    -V_childNameUsages,
    -V_locations
  ) %>%
  rename(
    fqId_synonym = fqId,
    name_synoynm = name,
    author_synoynm = author,
    rank_synonym = rank,
    taxonomicSatus_synonym = taxonomicStatus
  ) %>%
  distinct(V_fqId, fqId_synonym, .keep_all = TRUE)

TO CHECK: The columns V_basionym and V_basionymOf contain information on basionyms. We should check whether we need this information, or whether everything we need is already included in the synonyms column, which includes homotypic and heterotypic synonyms.

  1. Unnest location

    # Flatten each record's locations into a plain vector, then keep one
    # distinct location value per row next to the identifying name fields
    df_locations <- ll_acc2 %>%
      mutate(locations = map(V_locations, unlist)) %>%
      unnest_longer(locations, values_to = "locations_all") %>%
      select(
        V_fqId, V_name, V_authors, V_rank, V_reference,
        locations_all
      ) %>%
      distinct()

    ## Select names that occur in Guyana, Suriname, or French Guiana:
    ## keep location values that start with one of the three area codes
    df_locations_Gui <- filter(
      df_locations,
      str_starts(locations_all, "GUY") |
        str_starts(locations_all, "SUR") |
        str_starts(locations_all, "FRG")
    )
    
    # Compare the number of distinct names with a Guianas location in the
    # POWO records against the number of distinct plant_name_id values in
    # the WCVP subset.
    # NOTE(review): the two counts use different keys (V_name strings vs.
    # plant_name_id values), so they may not be directly comparable —
    # confirm which key the comparison should use.
    length(unique(df_locations_Gui$V_name))
    [1] 4039
    length(unique(df_Guianas2$plant_name_id))
    [1] 11018
    # Make one row per name, with its locations in individual columns.
    # Locations are numbered per V_fqId in sorted order and spread into
    # location_1, location_2, ... columns. any_of() keeps at most the first
    # six of those columns, as before, but no longer errors when the data
    # yield fewer than six location columns.
    df_wide <- df_locations_Gui %>%
      arrange(V_fqId, locations_all) %>%
      group_by(V_fqId) %>%
      mutate(loc_n = row_number()) %>%
      ungroup() %>%
      pivot_wider(
        names_from = loc_n,
        values_from = locations_all,
        names_prefix = "location_"
      ) %>%
      select(V_fqId, any_of(paste0("location_", 1:6)))


    # Attach the wide location columns to the accepted-name/synonym table
    df_all <- df_acc_and_synonyms %>%
      left_join(df_wide, by = "V_fqId")
    
    df_acc_and_synonyms
    # A tibble: 22,451 × 37
       acc_id accepted_name V_modified               V_bibliographicCitation V_genus
        <int> <chr>         <chr>                    <chr>                   <chr>  
     1      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
     2      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
     3      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
     4      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
     5      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
     6      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
     7      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
     8      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
     9      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
    10      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
    # ℹ 22,441 more rows
    # ℹ 32 more variables: V_taxonomicStatus <chr>, V_kingdom <chr>,
    #   V_phylum <chr>, V_clazz <chr>, V_subclass <chr>, V_order <chr>,
    #   V_family <chr>, V_nomenclaturalCode <chr>, V_source <chr>,
    #   V_namePublishedInYear <int>, V_taxonRemarks <chr>,
    #   V_nomenclaturalStatus <chr>, V_lifeform <chr>, V_climate <chr>,
    #   V_hybrid <lgl>, V_plantae <lgl>, V_fungi <lgl>, V_synonym <lgl>, …
    # Export the accepted-name/synonym table.
    # NOTE(review): the joined table df_all is built earlier but is not what
    # gets written here — confirm which table is meant to be exported.
    write.csv(
      df_acc_and_synonyms,
      file = "../Data/POWO/df_acc_and_synonyms.csv"
    )

    Now that we have unnested the synonyms and locations, we join the two tables and export the result as a .csv file.

# Attach the wide location columns to the accepted-name/synonym table
df_all <- left_join(df_acc_and_synonyms, df_wide, by = "V_fqId")

# Print the accepted-name/synonym table
df_acc_and_synonyms
# A tibble: 22,451 × 37
   acc_id accepted_name V_modified               V_bibliographicCitation V_genus
    <int> <chr>         <chr>                    <chr>                   <chr>  
 1      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
 2      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
 3      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
 4      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
 5      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
 6      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
 7      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
 8      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
 9      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
10      1 Acalypha      2020-07-08T17:16:00.000Z IPNI 2026. Published o… Acalyp…
# ℹ 22,441 more rows
# ℹ 32 more variables: V_taxonomicStatus <chr>, V_kingdom <chr>,
#   V_phylum <chr>, V_clazz <chr>, V_subclass <chr>, V_order <chr>,
#   V_family <chr>, V_nomenclaturalCode <chr>, V_source <chr>,
#   V_namePublishedInYear <int>, V_taxonRemarks <chr>,
#   V_nomenclaturalStatus <chr>, V_lifeform <chr>, V_climate <chr>,
#   V_hybrid <lgl>, V_plantae <lgl>, V_fungi <lgl>, V_synonym <lgl>, …
# Export the accepted-name/synonym table.
# NOTE(review): the joined table df_all is built earlier but is not what
# gets written here — confirm which table is meant to be exported.
write.csv(
  df_acc_and_synonyms,
  file = "../Data/POWO/df_acc_and_synonyms.csv"
)