This script transforms long-format data tables used in rocky intertidal surveys of the Marine Biodiversity Observation Network Pole to Pole of the Americas (MBON Pole to Pole) into Darwin Core Archive (DwC-A) files for publishing data in the Ocean Biodiversity Information System (OBIS) following instructions from this manual. It also generates an integrated file ready for data analysis.

The original R Markdown version of this document is available here.

To test this script, you will need to:

Install R software and RStudio.
Create three folders in your working directory: Analysis, Data, IPT.
Set your working directory to the location of these three folders (e.g. setwd(“~/your directory”).
Save the The DataSheet_longformat_TEST file in the “Data” folder. This is the data table that you will substitute with your own data.

Now, just copy the code chunks below and paste them into your R console.

1 Basic Setup

You need a few packages to run this code. Check that the “Data”, IPT" and “Analysis” folders are created under your working directory. The DataSheet_longformat_TEST file should be available in the “Data” folder.

library(lutz)
library(countrycode)
library(readxl)
library(reshape2)
library(lubridate)
library(dplyr)
library(ggplot2)
library(kableExtra)
options(dplyr.summarise.inform = FALSE)

# Create a "Data", IPT" and "Analysis" folder under your selected directory
baseDataDir = "Data"
baseIPT = "IPT"
baseAnalysis = "Analysis"

2 Load your data table

Set fileNameto the name of the longformat data table from your survey

# Select your data table
fileName = "DataSheet_longformat_TEST_v2.xlsx"

3 Read file sheets

## Extract information about your sampling site from the SiteInfo tab of your data table file

DF.sites = read_xlsx(file.path(baseDataDir, fileName),sheet = "SiteInfo")
DF.sites = DF.sites[!is.na(DF.sites$COUNTRY),]

3.1 convert time to UTC

## get number of seconds from midnight
secsSTART = (1 -abs(as.numeric(julian(DF.sites$TIME_START)) - (as.integer(julian(DF.sites$TIME_START))))) * (60*60*24)
secsEND = (1 - abs(as.numeric(julian(DF.sites$TIME_END)) - (as.integer(julian(DF.sites$TIME_END))))) * (60*60*24)
dateChar = paste(DF.sites$YEAR, DF.sites$MONTH, DF.sites$DAY, sep="-")

## get timezone and timezone offset
timeZone = tz_lookup_coords(mean(DF.sites$LATITUDE, na.rm=T), mean(DF.sites$LONGITUDE, na.rm=T), method="accurate")
dateOffset = tz_offset(dateChar, timeZone)$utc_offset_h

## create data and time UTC
DF.sites$eventDate = as.POSIXct(dateChar, tz="UTC")
DF.sites$TIME_START = DF.sites$eventDate + seconds(secsSTART) + hours(dateOffset)
DF.sites$TIME_END = DF.sites$eventDate + seconds(secsEND) + hours(dateOffset)
DF.sites$eventTime = paste(format(DF.sites$TIME_START, "%H:%M:%SZ"), format(DF.sites$TIME_END, "%H:%M:%SZ"), sep="/")

print(timeZone)

## [1] "America/Argentina/Catamarca"

kable(DF.sites[1:5, c(3:5, 13:14)]) %>% kable_styling("striped")

COUNTRY	LOCALITY	SITE	TIME_START	TIME_END
ARGENTINA	PUERTO MADRYN	PUNTA ESTE	2018-10-25 07:45:00	2018-10-25 10:45:00
ARGENTINA	PUERTO MADRYN	PUNTA ESTE	2018-10-25 07:45:00	2018-10-25 10:45:00
ARGENTINA	PUERTO MADRYN	PUNTA ESTE	2018-10-25 07:45:00	2018-10-25 10:45:00
ARGENTINA	PUERTO MADRYN	PUNTA CUEVAS	2018-11-05 07:45:00	2018-11-05 10:45:00
ARGENTINA	PUERTO MADRYN	PUNTA CUEVAS	2018-11-05 07:45:00	2018-11-05 10:45:00

3.2 Extract other fields

# Country code
DF.sites$datasetName = paste0("MBON-P2P-biodiversity-",unique(DF.sites$countryCode))
DF.sites$countryCodeISO = countrycode(DF.sites$COUNTRY, "country.name","iso3c")

# Sampling protocol
DF.sites$samplingProtocol = "MBON-P2P_bestpractices-rockyshores"

# Sampling size value
DF.sites$samplingSizeValue = 0.25

# Sampling unit
DF.sites$samplingSizeUnit = "square meter"

print(DF.sites$countryCodeISO[1])

## [1] "ARG"

3.3 Extrat data, taxa list and codes

## data
DF.data = read_xlsx(file.path(baseDataDir, fileName),sheet = "DATA")
DF.data = DF.data[!is.na(DF.data$LOCALITY),]

## spp list
DF.spp = read_xlsx(file.path(baseDataDir, fileName),sheet = "sppList")

## codes
DF.countryCodes = read_xlsx(file.path(baseDataDir, fileName),sheet = "Countries")
DF.localityCodes = read_xlsx(file.path(baseDataDir, fileName),sheet = "Locality")
DF.siteCodes = read_xlsx(file.path(baseDataDir, fileName),sheet = "Sites")
DF.habitatCodes = read_xlsx(file.path(baseDataDir, fileName),sheet = "Habitat")

kable(DF.data[1:5, 1:7]) %>% kable_styling("striped")

LOCALITY	SITE	STRATA	SAMPLE	AREA_quadrat	scientificName	Variable
PUERTO MADRYN	PUNTA CUEVAS	HIGH	S02	FULL QUADRAT	substrate_BAREROCK	COVER
PUERTO MADRYN	PUNTA CUEVAS	MID	S01	FULL QUADRAT	substrate_BAREROCK	COVER
PUERTO MADRYN	PUNTA CUEVAS	MID	S02	FULL QUADRAT	substrate_BAREROCK	COVER
PUERTO MADRYN	PUNTA CUEVAS	MID	S03	FULL QUADRAT	substrate_BAREROCK	COVER
PUERTO MADRYN	PUNTA CUEVAS	MID	S04	FULL QUADRAT	substrate_BAREROCK	COVER

kable(DF.spp[1:5,]) %>% kable_styling("striped")

scientificName	AphiaID	Authority	Rank	Catami
Aaptos	132064	Gray, 1867	Genus	NA
Abietinaria	117225	Kirchenpauer, 1884	Genus	NA
Abietinaria abietina	117870	(Linnaeus, 1758)	Species	NA
Abra alba	141433	(W. Wood, 1802)	Species	NA
Acanthais brevidentata	395268	(W. Wood, 1828)	Species	NA

4 Generate IDs

## add codes: SITES
DF.sites = left_join(DF.sites, DF.countryCodes, by = "COUNTRY")
DF.sites = left_join(DF.sites, DF.localityCodes, by = "LOCALITY")
DF.sites = left_join(DF.sites, DF.siteCodes, by = "SITE")
DF.sites = left_join(DF.sites, DF.habitatCodes, by = "HABITAT")

DF.sites$PARENT_UNIT_ID = paste(DF.sites$countryCode, DF.sites$localityCode, DF.sites$siteCode, DF.sites$habitatCode, 
                                paste0(DF.sites$YEAR, DF.sites$MONTH, DF.sites$DAY), sep="_")
DF.sites$UNIT_ID = paste(DF.sites$PARENT_UNIT_ID, DF.sites$STRATA, sep="_")

print(DF.sites$UNIT_ID[1:6])

## [1] "ARG_PMADRY_PESTE_RS_20181025_HIGH" "ARG_PMADRY_PESTE_RS_20181025_MID" 
## [3] "ARG_PMADRY_PESTE_RS_20181025_LOW"  "ARG_PMADRY_PCUEVA_RS_2018115_HIGH"
## [5] "ARG_PMADRY_PCUEVA_RS_2018115_MID"  "ARG_PMADRY_PCUEVA_RS_2018115_LOW"

5 Assign codes to DATA

## Add Aphia ID and taxa rank
DF.data = left_join(DF.data, DF.spp[,c("scientificName", "AphiaID", "Rank")])
DF.data = left_join(DF.data, DF.sites[,c("UNIT_ID", "LOCALITY", "SITE", "STRATA")])
DF.data = DF.data %>% group_by(LOCALITY, SITE, STRATA, SAMPLE) %>% 
  mutate(sampleOrganismID = 1:n(), scientificName, AphiaID, Rank, Variable, Value)
DF.data$occurrenceID = paste(DF.data$UNIT_ID, DF.data$SAMPLE, sprintf("%03d", DF.data$sampleOrganismID), sep="_")

print(DF.data$occurrenceID[1:20])

##  [1] "ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S02_001"
##  [2] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S01_001" 
##  [3] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S02_001" 
##  [4] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S03_001" 
##  [5] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S04_001" 
##  [6] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S05_001" 
##  [7] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S06_001" 
##  [8] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S07_001" 
##  [9] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S08_001" 
## [10] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S09_001" 
## [11] "ARG_PMADRY_PCUEVA_RS_2018115_MID_S10_001" 
## [12] "ARG_PMADRY_PESTE_RS_20181025_HIGH_S01_001"
## [13] "ARG_PMADRY_PESTE_RS_20181025_HIGH_S02_001"
## [14] "ARG_PMADRY_PESTE_RS_20181025_HIGH_S03_001"
## [15] "ARG_PMADRY_PESTE_RS_20181025_HIGH_S04_001"
## [16] "ARG_PMADRY_PESTE_RS_20181025_HIGH_S05_001"
## [17] "ARG_PMADRY_PESTE_RS_20181025_HIGH_S06_001"
## [18] "ARG_PMADRY_PESTE_RS_20181025_HIGH_S07_001"
## [19] "ARG_PMADRY_PESTE_RS_20181025_HIGH_S08_001"
## [20] "ARG_PMADRY_PESTE_RS_20181025_HIGH_S09_001"

5.1 Convert abundance values

## to count per square meter 
DF.data$Value[DF.data$Variable=="ABUNDANCE"] = DF.data$Value[DF.data$Variable=="ABUNDANCE"] * 4

print(DF.data$Value[DF.data$Variable=="ABUNDANCE"][1:10])

##  [1] 12 16  4  4  4  4  4  4  8  4

5.2 Assign other IPT fields

DF.data$basisOfRecord = "HumanObservation"
DF.data$occurrenceStatus = "present"
DF.data$scientificNameID = paste0("lsid:marinespecies.org:taxname:", DF.data$AphiaID)

## fields for the eMoF
DF.data$measurementTypeID = ifelse(DF.data$Variable=="COVER",
                                   "http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/",  ##Coverage (in assayed sample) of biological entity 
                                   "http://vocab.nerc.ac.uk/collection/P06/current/UPMS/") ## number per square meter

DF.data$measurementUnit = ifelse(DF.data$Variable=="COVER", "percent", "count")
DF.data$measurementUnitID = ifelse(DF.data$Variable=="COVER",
                                   "http://vocab.nerc.ac.uk/collection/P06/current/UPCT/",    ## percentage
                                   "http://vocab.nerc.ac.uk/collection/P06/current/UPMS/")   ## number per square meter

DF.data = DF.data %>% arrange(occurrenceID, scientificName)

kable(DF.data[1:5, ]) %>% kable_styling("striped")

LOCALITY	SITE	STRATA	SAMPLE	AREA_quadrat	scientificName	Variable	Value	Remarks	QC	AphiaID	Rank	UNIT_ID	sampleOrganismID	occurrenceID	basisOfRecord	occurrenceStatus	scientificNameID	measurementTypeID	measurementUnit	measurementUnitID
PUERTO MADRYN	PUNTA CUEVAS	HIGH	S01	FULL QUADRAT	Brachidontes rodriguezii	COVER	1	NA	NA	367978	Species	ARG_PMADRY_PCUEVA_RS_2018115_HIGH	1	ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S01_001	HumanObservation	present	lsid:marinespecies.org:taxname:367978	http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/	percent	http://vocab.nerc.ac.uk/collection/P06/current/UPCT/
PUERTO MADRYN	PUNTA CUEVAS	HIGH	S01	FULL QUADRAT	Ulva prolifera	COVER	13	NA	NA	234476	Species	ARG_PMADRY_PCUEVA_RS_2018115_HIGH	2	ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S01_002	HumanObservation	present	lsid:marinespecies.org:taxname:234476	http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/	percent	http://vocab.nerc.ac.uk/collection/P06/current/UPCT/
PUERTO MADRYN	PUNTA CUEVAS	HIGH	S02	FULL QUADRAT	substrate_BAREROCK	COVER	22	NA	NA	-99999	NA	ARG_PMADRY_PCUEVA_RS_2018115_HIGH	1	ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S02_001	HumanObservation	present	lsid:marinespecies.org:taxname:-99999	http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/	percent	http://vocab.nerc.ac.uk/collection/P06/current/UPCT/
PUERTO MADRYN	PUNTA CUEVAS	HIGH	S02	FULL QUADRAT	Balanus glandula	COVER	2	NA	NA	394848	Species	ARG_PMADRY_PCUEVA_RS_2018115_HIGH	2	ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S02_002	HumanObservation	present	lsid:marinespecies.org:taxname:394848	http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/	percent	http://vocab.nerc.ac.uk/collection/P06/current/UPCT/
PUERTO MADRYN	PUNTA CUEVAS	HIGH	S02	FULL QUADRAT	Brachidontes rodriguezii	COVER	2	NA	NA	367978	Species	ARG_PMADRY_PCUEVA_RS_2018115_HIGH	3	ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S02_003	HumanObservation	present	lsid:marinespecies.org:taxname:367978	http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/	percent	http://vocab.nerc.ac.uk/collection/P06/current/UPCT/

5.3 Remove substrate type records

## EventCore file
IPT.event = DF.sites %>% 
  select(datasetName,
         parentEventID=PARENT_UNIT_ID,
         eventID = UNIT_ID,
         samplingProtocol,
         samplingSizeValue,
         samplingSizeUnit,
         eventDate,
         eventTime,
         year = YEAR,
         month = MONTH,
         day = DAY,
         habitat = HABITAT,
         eventRemarks = REMARKS,
         country = COUNTRY,
         countryCode = countryCodeISO,
         locality = LOCALITY,
         decimalLatitude = LATITUDE.x,
         decimalLongitude = LONGITUDE.x,
         coordinateUncertaintyInMeters = GPS_ERROR,
         geodeticDatum = DATUM,
         strata=STRATA)

## Remove substrate type records
DF.data.noSubstrate = DF.data %>% 
  filter(! grepl("substrate", scientificName, fixed = T))

## OccurrenceCore file
IPT.occurrence = DF.data.noSubstrate %>% ungroup() %>% 
  select(eventID = UNIT_ID,
         basisOfRecord,
         occurrenceID,
         scientificNameID,
         scientificName, 
         taxonRank = Rank) 

## Event Measurement or Fact (eMOF) file
IPT.mof = data.frame(eventID = DF.data.noSubstrate$UNIT_ID, 
                     occurrenceID = DF.data.noSubstrate$occurrenceID,
                     measurementType = tolower(DF.data.noSubstrate$Variable), 
                     measurmenetTypeID = DF.data.noSubstrate$measurementTypeID,
                     measurementValue = DF.data.noSubstrate$Value,
                     measurementUnit = DF.data.noSubstrate$measurementUnit,
                     measurementUnitID = DF.data.noSubstrate$measurementUnitID
                     )
print("EventCore")

## [1] "EventCore"

kable(IPT.event[1:3, ]) %>% kable_styling("striped")

datasetName	parentEventID	eventID	samplingProtocol	samplingSizeValue	samplingSizeUnit	eventDate	eventTime	year	month	day	habitat	eventRemarks	country	countryCode	locality	decimalLatitude	decimalLongitude	coordinateUncertaintyInMeters	geodeticDatum	strata
MBON-P2P-biodiversity-	ARG_PMADRY_PESTE_RS_20181025	ARG_PMADRY_PESTE_RS_20181025_HIGH	MBON-P2P_bestpractices-rockyshores	0.25	square meter	2018-10-25	07:45:00Z/10:45:00Z	2018	10	25	ROCKYSHORE	NA	ARGENTINA	ARG	PUERTO MADRYN	-42.78511	-64.95264	5	WGS84	HIGH
MBON-P2P-biodiversity-	ARG_PMADRY_PESTE_RS_20181025	ARG_PMADRY_PESTE_RS_20181025_MID	MBON-P2P_bestpractices-rockyshores	0.25	square meter	2018-10-25	07:45:00Z/10:45:00Z	2018	10	25	ROCKYSHORE	NA	ARGENTINA	ARG	PUERTO MADRYN	-42.78511	-64.95264	5	WGS84	MID
MBON-P2P-biodiversity-	ARG_PMADRY_PESTE_RS_20181025	ARG_PMADRY_PESTE_RS_20181025_LOW	MBON-P2P_bestpractices-rockyshores	0.25	square meter	2018-10-25	07:45:00Z/10:45:00Z	2018	10	25	ROCKYSHORE	NA	ARGENTINA	ARG	PUERTO MADRYN	-42.78511	-64.95264	5	WGS84	LOW

print("OccurrenceCore")

## [1] "OccurrenceCore"

kable(IPT.occurrence[1:3, ]) %>% kable_styling("striped")

eventID	basisOfRecord	occurrenceID	scientificNameID	scientificName	taxonRank
ARG_PMADRY_PCUEVA_RS_2018115_HIGH	HumanObservation	ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S01_001	lsid:marinespecies.org:taxname:367978	Brachidontes rodriguezii	Species
ARG_PMADRY_PCUEVA_RS_2018115_HIGH	HumanObservation	ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S01_002	lsid:marinespecies.org:taxname:234476	Ulva prolifera	Species
ARG_PMADRY_PCUEVA_RS_2018115_HIGH	HumanObservation	ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S02_002	lsid:marinespecies.org:taxname:394848	Balanus glandula	Species

print("MoF")

## [1] "MoF"

kable(IPT.mof[1:3, ]) %>% kable_styling("striped")

eventID	occurrenceID	measurementType	measurmenetTypeID	measurementValue	measurementUnit	measurementUnitID
ARG_PMADRY_PCUEVA_RS_2018115_HIGH	ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S01_001	cover	http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/	1	percent	http://vocab.nerc.ac.uk/collection/P06/current/UPCT/
ARG_PMADRY_PCUEVA_RS_2018115_HIGH	ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S01_002	cover	http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/	13	percent	http://vocab.nerc.ac.uk/collection/P06/current/UPCT/
ARG_PMADRY_PCUEVA_RS_2018115_HIGH	ARG_PMADRY_PCUEVA_RS_2018115_HIGH_S02_002	cover	http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL10/	2	percent	http://vocab.nerc.ac.uk/collection/P06/current/UPCT/

5.4 Generate Data Anaylisis files

## reformat to wide
DF.dataWide = dcast(occurrenceID+LOCALITY+SITE+STRATA+SAMPLE+scientificName+AphiaID+Rank~Variable, value.var = "Value", data=DF.data, sum)

5.5 Save files

rootFileName = paste(unique(DF.sites$countryCodeISO), paste0(unique(DF.sites$localityCode, collapse="-")), 
                     unique(DF.sites$HABITAT), gsub("-","", min(DF.sites$eventDate)), sep="_")

## IPT files
readr::write_csv(IPT.event, path = file.path(baseIPT,paste0(rootFileName, "_IPT-event.csv")))
readr::write_csv(IPT.occurrence, path = file.path(baseIPT,paste0(rootFileName, "_IPT-occurrence.csv")))
readr::write_csv(IPT.mof, path = file.path(baseIPT,paste0(rootFileName, "_IPT-mof.csv")))

## Analysis file
readr::write_csv(DF.dataWide, path = file.path(baseAnalysis,paste0(rootFileName, "_analysis.csv")))
readr::write_csv(DF.sites, path = file.path(baseAnalysis,paste0(rootFileName, "_site.csv")))

You are done! You can now upload your data (DwC-A files) to OBIS following this How-To manual.

DwC-A Writer

Eduardo Klein and Enrique Montes (emontesh@usf.edu)

10/7/2020