This script transforms long-format data tables used in rocky intertidal surveys of the Marine Biodiversity Observation Network Pole to Pole of the Americas (MBON Pole to Pole) into Darwin Core Archive (DwC-A) files for publishing data in the Ocean Biodiversity Information System (OBIS) following instructions from this manual. It also generates an integrated file ready for data analysis.

The original R Markdown version of this document is available here.

To test this script, you will need to:

  • Install R software and RStudio.

  • Create three folders in your working directory: Analysis, Data, IPT.

  • Set your working directory to the location of these three folders (e.g. setwd(“~/your directory”).

  • Save the The DataSheet_longformat_TEST file in the “Data” folder. This is the data table that you will substitute with your own data.

Now, just copy the code chunks below and paste them into your R console.

1 Basic Setup

You need a few packages to run this code. Check that the “Data”, IPT" and “Analysis” folders are created under your working directory. The DataSheet_longformat_TEST file should be available in the “Data” folder.

library(lutz)
library(countrycode)
library(readxl)
library(reshape2)
library(lubridate)
library(dplyr)
library(ggplot2)
library(kableExtra)
options(dplyr.summarise.inform = FALSE)

# Create a "Data", IPT" and "Analysis" folder under your selected directory
baseDataDir = "Data"
baseIPT = "IPT"
baseAnalysis = "Analysis"

2 Load your data table

Set fileNameto the name of the longformat data table from your survey

# Select your data table
fileName = "DataSheet_longformat_TEST_v2.xlsx"

3 Read file sheets

## Extract information about your sampling site from the SiteInfo tab of your data table file

DF.sites = read_xlsx(file.path(baseDataDir, fileName),sheet = "SiteInfo")
DF.sites = DF.sites[!is.na(DF.sites$COUNTRY),]

3.1 convert time to UTC

## get number of seconds from midnight
secsSTART = (1 -abs(as.numeric(julian(DF.sites$TIME_START)) - (as.integer(julian(DF.sites$TIME_START))))) * (60*60*24)
secsEND = (1 - abs(as.numeric(julian(DF.sites$TIME_END)) - (as.integer(julian(DF.sites$TIME_END))))) * (60*60*24)
dateChar = paste(DF.sites$YEAR, DF.sites$MONTH, DF.sites$DAY, sep="-")

## get timezone and timezone offset
timeZone = tz_lookup_coords(mean(DF.sites$LATITUDE, na.rm=T), mean(DF.sites$LONGITUDE, na.rm=T), method="accurate")
dateOffset = tz_offset(dateChar, timeZone)$utc_offset_h

## create data and time UTC
DF.sites$eventDate = as.POSIXct(dateChar, tz="UTC")
DF.sites$TIME_START = DF.sites$eventDate + seconds(secsSTART) + hours(dateOffset)
DF.sites$TIME_END = DF.sites$eventDate + seconds(secsEND) + hours(dateOffset)
DF.sites$eventTime = paste(format(DF.sites$TIME_START, "%H:%M:%SZ"), format(DF.sites$TIME_END, "%H:%M:%SZ"), sep="/")

print(timeZone)
## [1] "America/Argentina/Catamarca"
kable(DF.sites[1:5, c(3:5, 13:14)]) %>% kable_styling("striped")
COUNTRY LOCALITY SITE TIME_START TIME_END
ARGENTINA PUERTO MADRYN PUNTA ESTE 2018-10-25 07:45:00 2018-10-25 10:45:00
ARGENTINA PUERTO MADRYN PUNTA ESTE 2018-10-25 07:45:00 2018-10-25 10:45:00
ARGENTINA PUERTO MADRYN PUNTA ESTE 2018-10-25 07:45:00 2018-10-25 10:45:00
ARGENTINA PUERTO MADRYN PUNTA CUEVAS 2018-11-05 07:45:00 2018-11-05 10:45:00
ARGENTINA PUERTO MADRYN PUNTA CUEVAS 2018-11-05 07:45:00 2018-11-05 10:45:00

3.2 Extract other fields

# Country code
DF.sites$datasetName = paste0("MBON-P2P-biodiversity-",unique(DF.sites$countryCode))
DF.sites$countryCodeISO = countrycode(DF.sites$COUNTRY, "country.name","iso3c")

# Sampling protocol
DF.sites$samplingProtocol = "MBON-P2P_bestpractices-rockyshores"

# Sampling size value
DF.sites$samplingSizeValue = 0.25

# Sampling unit
DF.sites$samplingSizeUnit = "square meter"

print(DF.sites$countryCodeISO[1])
## [1] "ARG"

3.3 Extrat data, taxa list and codes

## data
DF.data = read_xlsx(file.path(baseDataDir, fileName),sheet = "DATA")
DF.data = DF.data[!is.na(DF.data$LOCALITY),]

## spp list
DF.spp = read_xlsx(file.path(baseDataDir, fileName),sheet = "sppList")

## codes
DF.countryCodes = read_xlsx(file.path(baseDataDir, fileName),sheet = "Countries")
DF.localityCodes = read_xlsx(file.path(baseDataDir, fileName),sheet = "Locality")
DF.siteCodes = read_xlsx(file.path(baseDataDir, fileName),sheet = "Sites")
DF.habitatCodes = read_xlsx(file.path(baseDataDir, fileName),sheet = "Habitat")

kable(DF.data[1:5, 1:7]) %>% kable_styling("striped")
LOCALITY SITE STRATA SAMPLE AREA_quadrat scientificName Variable
PUERTO MADRYN PUNTA CUEVAS HIGH S02 FULL QUADRAT substrate_BAREROCK COVER
PUERTO MADRYN PUNTA CUEVAS MID S01 FULL QUADRAT substrate_BAREROCK COVER
PUERTO MADRYN PUNTA CUEVAS MID S02 FULL QUADRAT substrate_BAREROCK COVER
PUERTO MADRYN PUNTA CUEVAS MID S03 FULL QUADRAT substrate_BAREROCK COVER
PUERTO MADRYN PUNTA CUEVAS MID S04 FULL QUADRAT substrate_BAREROCK COVER
kable(DF.spp[1:5,]) %>% kable_styling("striped")
scientificName AphiaID Authority Rank Catami
Aaptos 132064 Gray, 1867 Genus NA
Abietinaria 117225 Kirchenpauer, 1884 Genus NA
Abietinaria abietina 117870 (Linnaeus, 1758) Species NA
Abra alba 141433 (W. Wood, 1802) Species NA
Acanthais brevidentata 395268 (W. Wood, 1828) Species NA

4 Generate IDs

## add codes: SITES
DF.sites = left_join(DF.sites, DF.countryCodes, by = "COUNTRY")
DF.sites = left_join(DF.sites, DF.localityCodes, by = "LOCALITY")
DF.sites = left_join(DF.sites, DF.siteCodes, by = "SITE")
DF.sites = left_join(DF.sites, DF.habitatCodes, by = "HABITAT")

DF.sites$PARENT_UNIT_ID = paste(DF.sites$countryCode, DF.sites$localityCode, DF.sites$siteCode, DF.sites$habitatCode, 
                                paste0(DF.sites$YEAR, DF.sites$MONTH, DF.sites$DAY), sep="_")
DF.sites$UNIT_ID = paste(DF.sites$PARENT_UNIT_ID, DF.sites$STRATA, sep="_")

print(DF.sites$UNIT_ID[1:6])
## [1] "ARG_PMADRY_PESTE_RS_20181025_HIGH" "ARG_PMADRY_PESTE_RS_20181025_MID" 
## [3] "ARG_PMADRY_PESTE_RS_20181025_LOW"  "ARG_PMADRY_PCUEVA_RS_2018115_HIGH"
## [5] "ARG_PMADRY_PCUEVA_RS_2018115_MID"  "ARG_PMADRY_PCUEVA_RS_2018115_LOW"

5 Assign codes to DATA

## Add Aphia ID and taxa rank
DF.data = left_join(DF.data, DF.spp[,c("scientificName", "AphiaID", "Rank")])
DF.data = left_join(DF.data, DF.sites[,c("UNIT_ID", "LOCALITY", "SITE", "STRATA")])
DF.data = DF.data %>% group_by(LOCALITY, SITE, STRATA, SAMPLE) %>% 
  mutate(sampleOrganismID = 1:n(), scientificName, AphiaID, Rank, Variable, Value)
DF.data$occurrenceID = paste(DF.data$UNIT_ID, DF.data$SAMPLE, sprintf("%03d", DF.data$sampleOrganismID), sep="_")

print(DF.data$occurrenceID[1:20])
##  [1] "ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S02_001"
##  [2] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S01_001" 
##  [3] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S02_001" 
##  [4] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S03_001" 
##  [5] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S04_001" 
##  [6] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S05_001" 
##  [7] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S06_001" 
##  [8] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S07_001" 
##  [9] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S08_001" 
## [10] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S09_001" 
## [11] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S10_001" 
## [12] "ARG_PMADRY_PESTE_RS_20181025_HIGH_S01_001"
## [13] "ARG_PMADRY_PESTE_RS_20181025_HIGH_S02_001"
## [14] "ARG_PMADRY_PESTE_RS_20181025_HIGH_S03_001"
## [15] "ARG_PMADRY_PESTE_RS_20181025_HIGH_S04_001"
## [16] "ARG_PMADRY_PESTE_RS_20181025_HIGH_S05_001"
## [17] "ARG_PMADRY_PESTE_RS_20181025_HIGH_S06_001"
## [18] "ARG_PMADRY_PESTE_RS_20181025_HIGH_S07_001"
## [19] "ARG_PMADRY_PESTE_RS_20181025_HIGH_S08_001"
## [20] "ARG_PMADRY_PESTE_RS_20181025_HIGH_S09_001"

5.1 Convert abundance values

## to count per square meter 
DF.data$Value[DF.data$Variable=="ABUNDANCE"] = DF.data$Value[DF.data$Variable=="ABUNDANCE"] * 4

print(DF.data$Value[DF.data$Variable=="ABUNDANCE"][1:10])
##  [1] 12 16  4  4  4  4  4  4  8  4

5.2 Assign other IPT fields

DF.data$basisOfRecord = "HumanObservation"
DF.data$occurrenceStatus = "present"
DF.data$scientificNameID = paste0("lsid:marinespecies.org:taxname:", DF.data$AphiaID)

## fields for the eMoF
DF.data$measurementTypeID = ifelse(DF.data$Variable=="COVER",
                                   "http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/",  ##Coverage (in assayed sample) of biological entity 
                                   "http://vocab.nerc.ac.uk/collection/P06/current/UPMS/") ## number per square meter

DF.data$measurementUnit = ifelse(DF.data$Variable=="COVER", "percent", "count")
DF.data$measurementUnitID = ifelse(DF.data$Variable=="COVER",
                                   "http://vocab.nerc.ac.uk/collection/P06/current/UPCT/",    ## percentage
                                   "http://vocab.nerc.ac.uk/collection/P06/current/UPMS/")   ## number per square meter

DF.data = DF.data %>% arrange(occurrenceID, scientificName)

kable(DF.data[1:5, ]) %>% kable_styling("striped")
LOCALITY SITE STRATA SAMPLE AREA_quadrat scientificName Variable Value Remarks QC AphiaID Rank UNIT_ID sampleOrganismID occurrenceID basisOfRecord occurrenceStatus scientificNameID measurementTypeID measurementUnit measurementUnitID
PUERTO MADRYN PUNTA CUEVAS HIGH S01 FULL QUADRAT Brachidontes rodriguezii COVER 1 NA NA 367978 Species ARG_PMADRY_PCUEVA_RS_2018115_HIGH 1 ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S01_001 HumanObservation present lsid:marinespecies.org:taxname:367978 http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/ percent http://vocab.nerc.ac.uk/collection/P06/current/UPCT/
PUERTO MADRYN PUNTA CUEVAS HIGH S01 FULL QUADRAT Ulva prolifera COVER 13 NA NA 234476 Species ARG_PMADRY_PCUEVA_RS_2018115_HIGH 2 ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S01_002 HumanObservation present lsid:marinespecies.org:taxname:234476 http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/ percent http://vocab.nerc.ac.uk/collection/P06/current/UPCT/
PUERTO MADRYN PUNTA CUEVAS HIGH S02 FULL QUADRAT substrate_BAREROCK COVER 22 NA NA -99999 NA ARG_PMADRY_PCUEVA_RS_2018115_HIGH 1 ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S02_001 HumanObservation present lsid:marinespecies.org:taxname:-99999 http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/ percent http://vocab.nerc.ac.uk/collection/P06/current/UPCT/
PUERTO MADRYN PUNTA CUEVAS HIGH S02 FULL QUADRAT Balanus glandula COVER 2 NA NA 394848 Species ARG_PMADRY_PCUEVA_RS_2018115_HIGH 2 ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S02_002 HumanObservation present lsid:marinespecies.org:taxname:394848 http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/ percent http://vocab.nerc.ac.uk/collection/P06/current/UPCT/
PUERTO MADRYN PUNTA CUEVAS HIGH S02 FULL QUADRAT Brachidontes rodriguezii COVER 2 NA NA 367978 Species ARG_PMADRY_PCUEVA_RS_2018115_HIGH 3 ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S02_003 HumanObservation present lsid:marinespecies.org:taxname:367978 http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/ percent http://vocab.nerc.ac.uk/collection/P06/current/UPCT/

5.3 Remove substrate type records

## EventCore file
IPT.event = DF.sites %>% 
  select(datasetName,
         parentEventID=PARENT_UNIT_ID,
         eventID = UNIT_ID,
         samplingProtocol,
         samplingSizeValue,
         samplingSizeUnit,
         eventDate,
         eventTime,
         year = YEAR,
         month = MONTH,
         day = DAY,
         habitat = HABITAT,
         eventRemarks = REMARKS,
         country = COUNTRY,
         countryCode = countryCodeISO,
         locality = LOCALITY,
         decimalLatitude = LATITUDE.x,
         decimalLongitude = LONGITUDE.x,
         coordinateUncertaintyInMeters = GPS_ERROR,
         geodeticDatum = DATUM,
         strata=STRATA)

## Remove substrate type records
DF.data.noSubstrate = DF.data %>% 
  filter(! grepl("substrate", scientificName, fixed = T))

## OccurrenceCore file
IPT.occurrence = DF.data.noSubstrate %>% ungroup() %>% 
  select(eventID = UNIT_ID,
         basisOfRecord,
         occurrenceID,
         scientificNameID,
         scientificName, 
         taxonRank = Rank) 

## Event Measurement or Fact (eMOF) file
IPT.mof = data.frame(eventID = DF.data.noSubstrate$UNIT_ID, 
                     occurrenceID = DF.data.noSubstrate$occurrenceID,
                     measurementType = tolower(DF.data.noSubstrate$Variable), 
                     measurmenetTypeID = DF.data.noSubstrate$measurementTypeID,
                     measurementValue = DF.data.noSubstrate$Value,
                     measurementUnit = DF.data.noSubstrate$measurementUnit,
                     measurementUnitID = DF.data.noSubstrate$measurementUnitID
                     )
print("EventCore")
## [1] "EventCore"
kable(IPT.event[1:3, ]) %>% kable_styling("striped")
datasetName parentEventID eventID samplingProtocol samplingSizeValue samplingSizeUnit eventDate eventTime year month day habitat eventRemarks country countryCode locality decimalLatitude decimalLongitude coordinateUncertaintyInMeters geodeticDatum strata
MBON-P2P-biodiversity- ARG_PMADRY_PESTE_RS_20181025 ARG_PMADRY_PESTE_RS_20181025_HIGH MBON-P2P_bestpractices-rockyshores 0.25 square meter 2018-10-25 07:45:00Z/10:45:00Z 2018 10 25 ROCKYSHORE NA ARGENTINA ARG PUERTO MADRYN -42.78511 -64.95264 5 WGS84 HIGH
MBON-P2P-biodiversity- ARG_PMADRY_PESTE_RS_20181025 ARG_PMADRY_PESTE_RS_20181025_MID MBON-P2P_bestpractices-rockyshores 0.25 square meter 2018-10-25 07:45:00Z/10:45:00Z 2018 10 25 ROCKYSHORE NA ARGENTINA ARG PUERTO MADRYN -42.78511 -64.95264 5 WGS84 MID
MBON-P2P-biodiversity- ARG_PMADRY_PESTE_RS_20181025 ARG_PMADRY_PESTE_RS_20181025_LOW MBON-P2P_bestpractices-rockyshores 0.25 square meter 2018-10-25 07:45:00Z/10:45:00Z 2018 10 25 ROCKYSHORE NA ARGENTINA ARG PUERTO MADRYN -42.78511 -64.95264 5 WGS84 LOW
print("OccurrenceCore")
## [1] "OccurrenceCore"
kable(IPT.occurrence[1:3, ]) %>% kable_styling("striped")
eventID basisOfRecord occurrenceID scientificNameID scientificName taxonRank
ARG_PMADRY_PCUEVA_RS_2018115_HIGH HumanObservation ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S01_001 lsid:marinespecies.org:taxname:367978 Brachidontes rodriguezii Species
ARG_PMADRY_PCUEVA_RS_2018115_HIGH HumanObservation ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S01_002 lsid:marinespecies.org:taxname:234476 Ulva prolifera Species
ARG_PMADRY_PCUEVA_RS_2018115_HIGH HumanObservation ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S02_002 lsid:marinespecies.org:taxname:394848 Balanus glandula Species
print("MoF")
## [1] "MoF"
kable(IPT.mof[1:3, ]) %>% kable_styling("striped")
eventID occurrenceID measurementType measurmenetTypeID measurementValue measurementUnit measurementUnitID
ARG_PMADRY_PCUEVA_RS_2018115_HIGH ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S01_001 cover http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/ 1 percent http://vocab.nerc.ac.uk/collection/P06/current/UPCT/
ARG_PMADRY_PCUEVA_RS_2018115_HIGH ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S01_002 cover http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/ 13 percent http://vocab.nerc.ac.uk/collection/P06/current/UPCT/
ARG_PMADRY_PCUEVA_RS_2018115_HIGH ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S02_002 cover http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/ 2 percent http://vocab.nerc.ac.uk/collection/P06/current/UPCT/

5.4 Generate Data Anaylisis files

## reformat to wide
DF.dataWide = dcast(occurrenceID+LOCALITY+SITE+STRATA+SAMPLE+scientificName+AphiaID+Rank~Variable, value.var = "Value", data=DF.data, sum)

5.5 Save files

rootFileName = paste(unique(DF.sites$countryCodeISO), paste0(unique(DF.sites$localityCode, collapse="-")), 
                     unique(DF.sites$HABITAT), gsub("-","", min(DF.sites$eventDate)), sep="_")

## IPT files
readr::write_csv(IPT.event, path = file.path(baseIPT,paste0(rootFileName, "_IPT-event.csv")))
readr::write_csv(IPT.occurrence, path = file.path(baseIPT,paste0(rootFileName, "_IPT-occurrence.csv")))
readr::write_csv(IPT.mof, path = file.path(baseIPT,paste0(rootFileName, "_IPT-mof.csv")))

## Analysis file
readr::write_csv(DF.dataWide, path = file.path(baseAnalysis,paste0(rootFileName, "_analysis.csv")))
readr::write_csv(DF.sites, path = file.path(baseAnalysis,paste0(rootFileName, "_site.csv")))

You are done! You can now upload your data (DwC-A files) to OBIS following this How-To manual.