Code

R

library(EML)
library(ncdf4)
library(emld)
library(lubridate)
library(tibble)
library(dplyr)
library(tidyr)
library(reshape2)
emld::eml_version("eml-2.2.0")

## [1] "eml-2.2.0"

set.seed(42)

#reticulate::install_miniconda()
reticulate::use_condaenv("r-reticulate")

Python

# Python standard library
import datetime
import tempfile
import xml.etree.ElementTree as ET

# External dependencies; all available on PyPI
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import netCDF4
import pandas as pd
import dicttoxml

rng = np.random.default_rng(42)

A simple, example forecast of population growth of two interacting species

========================================================

Generating the forecast

To illustrate the application of the forecast standard, we’ll consider the classic Lotka-Volterra population growth model. To keep this first example simple, we’ll only consider two species and two uncertainties - an additive process error, which converts this to a stochastic Lotka-Volterra, and an observation error. We’ll propagate uncertainty by running multiple ensembles. To illustrate the ability of the output format to accommodate spatial dimensions, we’ll run the model at multiple depths in a water column (e.g. lake or ocean) but those depths are not interacting – we know this isn’t realistic, but we want to keep things simple. Overall, this gives us the following dimensions

Code

R

reference_datetime = as.Date("2001-03-04")  ## start date of our forecast
n_time <- 30
datetimes = seq(from=reference_datetime,by="1 day",length=n_time)

depths <- c(1, 3, 5)
n_depths <- length(depths)

n_ensembles <- 10
ensembles <- seq_len(n_ensembles)

species <- c("species_1","species_2")
n_species <- length(species)

obs_dim <- 2 ## 1 = latent state
             ## 2 = latent state + observation error

Python

reference_datetime = datetime.date(2001, 3, 4)
n_time = 30
datetimes = [reference_datetime + datetime.timedelta(days = x) for x in range(n_time)]

depths = [1, 3, 5]
n_depths = len(depths)

n_ensembles = 10
ensembles = list(range(n_ensembles))

species = ["species_1", "species_2"]
n_species = len(species)

obs_dim = 2 ## 1 = latent state, 2 = latent_state + observation error

Next we’re going to assume fixed parameters and initial conditions, set up storage for our forecast output (which we see has 5 dimensions), and then forward simulate the populations

Code

R

## parameters for species_1 and species_2
r <- c(1, 2.75)
K <-  c(10, 20)
alpha <-  c(0.2, 0.3)

## process error
process_sd <- c(0.02,0.01)

## observation error
obs_sd <- c(0.5,0.75)

## initial condition uncertainties
n0_mu <-  c(0.5,0.25,1,0.5,2,1)
n0_sd <- n0_mu*c(0.5,0.1)
## initial condition: gamma parameters
n0_r  <- n0_mu/(n0_sd^2)
n0_a  <- n0_mu*n0_r

## forecast output storage
n <- array(NA,dim = c(n_time, n_depths, n_ensembles, obs_dim,2)) ## last dim is species

## initial conditions
for(i in 1:n_ensembles){
  n[1,,i,1,] <- matrix(rgamma(n_depths*2,n0_a,n0_r),nrow = n_depths,ncol = 2,byrow = TRUE)
}
n[1,,,2,] = n[1,,,1,] # assume that there's no additional observation error on the initial condition uncertainty

## data assimilation flag
data_assimilation <- rep(as.integer(0), n_time)  ## this code indicates a 'free-run' that didn't assimilate new data

## forward simulation
for(t in 2:n_time){
  for(depth in 1:n_depths){
    for(ens in 1:n_ensembles){
      ## predict latent state
      n_curr <- n[t-1,depth,ens,1,] ## current state
      n[t, depth, ens,1,1] <-  n_curr[1] + 
          r[1]*n_curr[1]*(1-((n_curr[1] + 
          alpha[1]*n_curr[2])/K[1])) + rnorm(1, 0, process_sd[1])
      n[t, depth, ens,1,2] <-  n_curr[2] + 
          r[2]*n_curr[2]*(1-((n_curr[2] + 
          alpha[2]*n[1])/K[2])) + rnorm(1, 0, process_sd[2])
      ## predict observed values
      n[t,depth,ens,2,] = rnorm(2,n[t,depth,ens,1,],obs_sd)
    }
  }
}

Python

r = [1, 2.75]
K = [10, 20]
alpha = [0.2, 0.3]

process_sd = [0.02, 0.01]
obs_sd = [0.5, 0.75]

n0 = 0.5
n0_mu = [[0.5, 0.25],
         [1, 0.5],
         [2, 1]]
n0_sd = np.multiply(n0_mu,[[0.5,0.1],[0.5,0.1],[0.5,0.1]])
n0_s  = np.divide(np.multiply(n0_sd,n0_sd),n0_mu)
n0_a  = np.divide(n0_mu,n0_s)

n = np.zeros((n_time, n_depths, n_ensembles, obs_dim, n_species))
n[0,:,:,:,:] = n0

for i in range(0,n_ensembles-1):
  n[0,:,i,0,:] = rng.gamma(n0_a,n0_s)

n[0,:,:,1,:] = n[0,:,:,0,:] 

data_assimilation = np.zeros((n_time), dtype=int)

for t in range(1, n_time):
    for depth in range(n_depths):
        for ens in range(n_ensembles):
            # predict latent state
            n_curr = n[t-1,depth,ens,0,:]
            n[t, depth, ens, 0, 0] = n_curr[0] + \
                r[0] * n_curr[0] * \
                (1 - ((n_curr[0] + alpha[0]*n_curr[1]) / K[0])) + \
                np.random.randn(1) * process_sd[0]
            n[t, depth, ens, 0, 1] = n_curr[1] + \
                r[1] * n_curr[1] * \
                (1 - ((n_curr[1] + alpha[1]*n_curr[0]) / K[1])) + \
                np.random.randn(1) * process_sd[1]
            # predict observed values
            n[t,depth,ens,1,:] = n[t,depth,ens,0,:] + np.random.randn(2) * obs_sd

Here’s a simple visualization of our ensemble forecast at one depth

Code

R

plot(datetimes,n[,1,1,1,1],ylim=range(n),ylab="Population Density",type='l')
for(s in seq_along(species)){ ##species
  for(e in ensembles){
    lines(datetimes,n[,1,e,1,s],col=s)                                  ## latent
    points(datetimes+runif(n_time,-0.12,0.12),n[,1,e,2,s],col=s,cex=0.35)  ## pseudo-obs w/ jitter
  }
}
legend("bottomright",legend=species,col = seq_along(species),lty = 1)

Python

# Plot won't show due to pkgdown bug
# matplotlib.use('Agg')
plt.rc('xtick', labelsize=8) 
plt.plot(datetimes, n[:,0,0,0,0], "-")

colors = ["r", "b"]
for s in range(n_species):
    for e in range(n_ensembles):
        plt.plot(datetimes, n[:,0,e,0,s], "-", color=colors[s])
        plt.plot(datetimes, n[:,0,e,1,s], "+", color=colors[s])
leg_lines = [plt.Line2D([0], [0], color=colors[x]) for x in range(n_species)]
leg_labs = [f"Species {x}" for x in range(n_species)]
plt.legend(leg_lines, leg_labs)
plt.xlabel("Time")
plt.ylabel("Population Density")
plt.show()

Saving to a standardized output format

Standard Option 1: netCDF

Forecast identifiers

Because netCDF is a self-documenting format we’re going to start by defining some forecast identifiers (which we’ll also later use in the metadata)

project_id identifies forecasts coming from a single project and is used to link forecasts different model versions. Examples might include a project Github repository or a team name.

model_id is updated each time the forecast code is modified (e.g. model structure, model calibration, forecast workflow). Examples might be a DOI, version number, or Github hash. Results from a single model_id are considered comparable.

iteration_id represents a unique ID for each forecast run. Examples might be a start time or database ID.

For example, if you have a forecast code base on GitHub and launch a forecast from that code that runs daily for 365 days, then there will be one project_id and 365 iteration_ids. A paper analyzing the forecasts would cite the project_id.

Code

R

project_id <- "LogisticDemo"
model_id   <- "v0.5"
iteration_id <- "20010304T060000"   # ISO datetime should make a valid Forecast_id

Python

project_id = "LogisticDemo"
model_id = "v0.5"
iteration_id = "20010304T060000"

netcdf Dimensions

Once we have our ID’s defined, we’re going to use ncdim_def to define the dimensions that our output variables will have

Code

R

## Set dimensions
timedim <- ncdim_def("datetime",   ## dimension name
                     units = paste('days since',reference_datetime),
                               ## size of timestep, with units and start date
                     vals = as.numeric(datetimes - reference_datetime),
                               ## sequence of values that defines the dimension.
                               ## netCDF expresses time dimensions relative to a
                               ## specified start date, in this case reference_datetime
                     longname = 'datetime')
                               ## descriptive name

depthdim <- ncdim_def("depth", 
                      units = "meters",
                      vals = depths, 
                      longname = 'Depth from surface') 

parmdim <- ncdim_def("parameter",             
                    units = "",
                    vals = ensembles,       
                    longname = 'ensemble member') 

obsdim <- ncdim_def("obs_flag",
                    units = "",
                    vals = 1:obs_dim,
                    longname = "observation error flag. 1 = latent state, 2 = w/ obs error")

## quick check that units are valid
udunits2::ud.is.parseable(timedim$units)

## [1] TRUE

udunits2::ud.is.parseable(depthdim$units)

## [1] TRUE

udunits2::ud.is.parseable(parmdim$units)

## [1] TRUE

udunits2::ud.is.parseable(obsdim$units)

## [1] TRUE

Python

Note that unlike R, the Python interface expects us to create the output NetCDF file first before populating it with output.


ncfile = tempfile.NamedTemporaryFile(suffix=".nc")
ncout = netCDF4.Dataset(ncfile.name, mode='w')

timedim = ncout.createDimension("datetime", n_time)
timevar = ncout.createVariable("datetime", "f4", "datetime")
timevar.units = f"days since {reference_datetime}"
timevar[:] = [(t - reference_datetime).days for t in datetimes]

depthdim = ncout.createDimension("depth", n_depths)
depthvar = ncout.createVariable("depth", "f4", "depth")
depthvar.units = "meters"
depthvar[:] = depths

parmdim = ncout.createDimension("parameter", n_ensembles)
parmvar = ncout.createVariable("parameter", "i4", "parameter")
parmvar.units = ""
parmvar[:] = ensembles

obsdim = ncout.createDimension("obs_flag", obs_dim)
obsvar = ncout.createVariable("obs_flag", "i4", "obs_flag")
obsvar.units = ""
obsvar[:] = list(range(obs_dim))

Variable names

Once we’ve defined our dimensions, we can then define the metadata about our variables using ncvar_def

Code

R

fillvalue <- 1e32  ## missing data value

#Define variables
def_list <- list()
def_list[[1]] <- ncvar_def(name =  "species_1",
                           units = "number of individuals",
                           dim = list(timedim, depthdim, parmdim, obsdim),
                           missval = fillvalue,
                           longname = '<scientific name of species 1>',
                           prec="single")
def_list[[2]] <- ncvar_def(name =  "species_2",
                           units = "number of individuals",
                           dim = list(timedim, depthdim, parmdim, obsdim),
                           missval = fillvalue,
                           longname = '<scientific name of species 2>',
                           prec="single")
def_list[[3]] <- ncvar_def(name =  "data_assimilation",
                           units = "integer",
                           dim = list(timedim),
                           missval = fillvalue,
                           longname = 'EFI standard data assimilation code. 0 = no data',
                           prec="single")

Python

spp1nc = ncout.createVariable("species_1", "f4", ("datetime", "depth", "parameter", "obs_flag"))
spp1nc.units = "number of individuals"
spp1nc.longname = "<scientific name of species 1>"

spp2nc = ncout.createVariable("species_2", "f4", ("datetime", "depth", "parameter", "obs_flag"))
spp2nc.units = "number of individuals"
spp2nc.longname = "<scientific name of species 2>"

danc = ncout.createVariable("data_assimilation", "i4", ("datetime"))
danc.units = "integer"
danc.longname = "EFI standard data assimilation code. 0 = no data"

Creating the file

Finally, we’ll now create our netCDF file and add our outputs and global metadata to the file

Code

R

## file name
ncfname <- "logistic-forecast-ensemble-multi-variable-space-long.nc"

## open your netCDF file
ncout <- nc_create(ncfname,def_list,force_v4=T)

## fill in our output data
ncvar_put(ncout,def_list[[1]] , n[, , , ,1 ])        ## species 1
ncvar_put(ncout,def_list[[2]] , n[, , , ,2 ])        ## species 2
ncvar_put(ncout,def_list[[3]] , data_assimilation) ## data_assimilation flag

## Global attributes (metadata)
ncatt_put(ncout,0,"project_id", as.character(project_id), 
          prec =  "text")
ncatt_put(ncout,0,"model_id",as.character(model_id), 
          prec =  "text")
ncatt_put(ncout,0,"iteration_id",as.character(iteration_id), 
          prec =  "text")

nc_close(ncout)   ## make sure to close the file

Python

Recall: We already created the file above.

# Write data to file
spp1nc[:,:,:,:] = n[:,:,:,:,0]
spp2nc[:,:,:,:] = n[:,:,:,:,1]
danc[:] = data_assimilation

# Set global attributes (metadata)
ncout.project_id = project_id
ncout.model_id = model_id
ncout.iteration_id = iteration_id

ncout.close()

Standard Option 2: ensemble CSV

Convert to a flat file format (CSV) with one column for each variable and all ensemble members saved

Code

R

## wrangle data from an array into a long format
df_combined <- reshape2::melt(n,varnames=c("datetime","depth","parameter","obs_flag","species")) %>%
  pivot_wider(id_cols = 1:4,names_from = "species",names_prefix = "species_",values_from = "value") %>% 
  pivot_longer(cols = starts_with("species"),names_to = "variable",values_to = "prediction") %>%
  mutate(datetime = datetimes[datetime]) %>%
  mutate(depth = depths[depth]) %>%
  right_join(tibble(datetime=datetimes,data_assimilation = data_assimilation))

## Joining, by = "datetime"

head(df_combined,n=10)

## # A tibble: 10 × 7
##    datetime   depth parameter obs_flag variable  prediction data_assimilation
##    <date>     <dbl>     <int>    <int> <chr>          <dbl>             <int>
##  1 2001-03-04     1         1        1 species_1      0.817                 0
##  2 2001-03-04     1         1        1 species_2      0.235                 0
##  3 2001-03-05     1         1        1 species_1      1.60                  0
##  4 2001-03-05     1         1        1 species_2      0.855                 0
##  5 2001-03-06     1         1        1 species_1      2.94                  0
##  6 2001-03-06     1         1        1 species_2      3.06                  0
##  7 2001-03-07     1         1        1 species_1      4.84                  0
##  8 2001-03-07     1         1        1 species_2     10.1                   0
##  9 2001-03-08     1         1        1 species_1      6.34                  0
## 10 2001-03-08     1         1        1 species_2     23.5                   0

df_combined %>% filter(parameter < 3,obs_flag == 1,datetime == as.Date("2001-03-05")) %>% arrange(datetime) %>% select(-data_assimilation)

## # A tibble: 12 × 6
##    datetime   depth parameter obs_flag variable  prediction
##    <date>     <dbl>     <int>    <int> <chr>          <dbl>
##  1 2001-03-05     1         1        1 species_1      1.60 
##  2 2001-03-05     1         1        1 species_2      0.855
##  3 2001-03-05     3         1        1 species_1      1.69 
##  4 2001-03-05     3         1        1 species_2      1.62 
##  5 2001-03-05     5         1        1 species_1      2.98 
##  6 2001-03-05     5         1        1 species_2      3.33 
##  7 2001-03-05     1         2        1 species_1      1.98 
##  8 2001-03-05     1         2        1 species_2      0.891
##  9 2001-03-05     3         2        1 species_1      0.875
## 10 2001-03-05     3         2        1 species_2      1.59 
## 11 2001-03-05     5         2        1 species_1      5.33 
## 12 2001-03-05     5         2        1 species_2      3.91

write.csv(df_combined, 
          file = "logistic-forecast-ensemble-multi-variable-multi-depth.csv")

Python

Ensemble format

n_names = ["datetime", "depth", "ensemble", "obs_flag", "species"]
n_index = pd.MultiIndex.from_product([range(s) for s in n.shape], names=n_names)
df = pd.DataFrame({"pop": n.flatten()}, index = n_index)["pop"]

## would like to find a more elegant solution than long -> wide -> long
df = df.unstack("species").reset_index()
df.rename(columns={0:"species 1",1:"species 2"},inplace=True)
df = pd.melt(df,id_vars=['datetime','depth','ensemble','obs_flag'],value_vars=["species 1","species 2"])
df.columns = ['datetime', 'depth', 'ensemble', 'obs_flag', 'variable', 'prediction']
df["datetime"] = [datetimes[x] for x in df["datetime"]]
df["depth"] = [depths[x] for x in df["depth"]]


meta_df = pd.DataFrame({"datetime": datetimes, "data_assimilation": data_assimilation})

df_combined = df.merge(meta_df, on="datetime", how="right")
df_combined.head()

##      datetime  depth  ensemble  ...   variable prediction  data_assimilation
## 0  2001-03-04      1         0  ...  species 1   0.535207                  0
## 1  2001-03-04      1         0  ...  species 1   0.535207                  0
## 2  2001-03-04      1         1  ...  species 1   0.579589                  0
## 3  2001-03-04      1         1  ...  species 1   0.579589                  0
## 4  2001-03-04      1         2  ...  species 1   0.597916                  0
## 
## [5 rows x 7 columns]

out_csv = tempfile.NamedTemporaryFile(suffix=".csv")
df_combined.to_csv(out_csv.name)

Standard Option 3: distributional CSV

Convert to a flat file format (CSV) with forecast distributions saved

Code

R

## calculate summary statistics across ensemble members and output variables
dfs <- df_combined %>% add_column(family="normal") %>%
                   group_by(datetime, depth,family,obs_flag,variable,data_assimilation) %>%
                   summarize(mu = mean(prediction),
                             sigma   = sd(prediction)) %>%
                   pivot_longer(7:8,names_to="parameter",values_to = "predictions") %>%
                   relocate(parameter,.after = family) %>%
                   relocate(predictions,.after=variable)

## `summarise()` has grouped output by 'datetime', 'depth', 'family', 'obs_flag',
## 'variable'. You can override using the `.groups` argument.

head(dfs,n=10)

## # A tibble: 10 × 8
## # Groups:   datetime, depth, family, obs_flag, variable [5]
##    datetime   depth family parameter obs_flag variable  predictions data_assim…¹
##    <date>     <dbl> <chr>  <chr>        <int> <chr>           <dbl>        <int>
##  1 2001-03-04     1 normal mu               1 species_1      0.756             0
##  2 2001-03-04     1 normal sigma            1 species_1      0.174             0
##  3 2001-03-04     1 normal mu               1 species_2      0.250             0
##  4 2001-03-04     1 normal sigma            1 species_2      0.0129            0
##  5 2001-03-04     1 normal mu               2 species_1      0.756             0
##  6 2001-03-04     1 normal sigma            2 species_1      0.174             0
##  7 2001-03-04     1 normal mu               2 species_2      0.250             0
##  8 2001-03-04     1 normal sigma            2 species_2      0.0129            0
##  9 2001-03-04     3 normal mu               1 species_1      0.982             0
## 10 2001-03-04     3 normal sigma            1 species_1      0.347             0
## # … with abbreviated variable name ¹data_assimilation

write.csv(dfs, file = "logistic-forecast-summary-multi-variable-multi-depth.csv")

Python

#keep_cols = [col for col in df_combined.columns if not "Species" in col]
#df_long = df_combined.melt(id_vars=keep_cols, var_name="species")

dfs = df_combined\
    .groupby(["datetime", "depth", "obs_flag", "variable",  "data_assimilation"])\
    .agg(
        mu = ("prediction", np.mean), sigma = ("prediction", np.std)
    )\
    .reset_index()\
    .melt(
        id_vars = ["datetime", "depth", "obs_flag",
                    "variable","data_assimilation"],
        value_vars = ["mu", "sigma"],
        var_name = "parameter",
        value_name = "prediction"
    )

dfs['family'] = 'normal'
dfs = dfs[['datetime','depth','family','parameter','obs_flag','variable','prediction']]

#        Conf_interv_025 = ("value", lambda x: np.quantile(x, 0.025)),
#        Conf_interv_975 = ("value", lambda x: np.quantile(x, 0.975)),


dfs.head()

##      datetime  depth  family parameter  obs_flag   variable  prediction
## 0  2001-03-04      1  normal        mu         0  species 1    0.531146
## 1  2001-03-04      1  normal        mu         0  species 2    0.272927
## 2  2001-03-04      1  normal        mu         1  species 1    0.531146
## 3  2001-03-04      1  normal        mu         1  species 2    0.272927
## 4  2001-03-04      3  normal        mu         0  species 1    0.895926

out_summary = tempfile.NamedTemporaryFile(suffix=".csv")
dfs.to_csv(out_summary.name)

Standardized Metadata

Forecast output entity: dataTable

Let’s begin by documenting the metadata of the forecast output data table itself, which we’ll do using EML’s dataTable entity. In this example we’ll assume we’re working with format 2 (ensemble CSV), though most of the metadata would be identical for all three formats.

To start, the attributes table stores the basic metadata about the variable names and units.

Code

R

## define variable names, units, etc
## in practice, this might be kept in a spreadsheet
attributes <- tibble::tribble(
 ~attributeName,     ~attributeDefinition,                          ~unit,                  ~formatString, ~numberType, ~definition,
 "datetime",          "[dimension]{datetime}",                      "year",                 "YYYY-MM-DD",   NA,          NA,
 "depth",             "[dimension]{depth in reservior}",            "meter",                 NA,           "real",       NA,
 "ensemble",          "[dimension]{index of ensemble member}",      "dimensionless",         NA,           "integer",    NA,
 "obs_flag",          "[dimension]{observation error}",             "dimensionless",         NA,           "integer",    NA,
 "species_1",         "[variable]{Pop. density of species 1}",      "numberPerMeterSquared", NA,           "real",       NA,
 "species_2",         "[variable]{Pop. density of species 2}",      "numberPerMeterSquared", NA,           "real",       NA,
 "data_assimilation", "[flag]{whether time step assimilated data}", "dimensionless",         NA,           "integer",    NA
) 
## note: EML uses a different unit standard than UDUNITS. For now use EML. EFI needs to provide a custom unitList.
attributes

## # A tibble: 7 × 6
##   attributeName     attributeDefinition            unit  forma…¹ numbe…² defin…³
##   <chr>             <chr>                          <chr> <chr>   <chr>   <lgl>  
## 1 datetime          [dimension]{datetime}          year  YYYY-M… <NA>    NA     
## 2 depth             [dimension]{depth in reservio… meter <NA>    real    NA     
## 3 ensemble          [dimension]{index of ensemble… dime… <NA>    integer NA     
## 4 obs_flag          [dimension]{observation error} dime… <NA>    integer NA     
## 5 species_1         [variable]{Pop. density of sp… numb… <NA>    real    NA     
## 6 species_2         [variable]{Pop. density of sp… numb… <NA>    real    NA     
## 7 data_assimilation [flag]{whether time step assi… dime… <NA>    integer NA     
## # … with abbreviated variable names ¹formatString, ²numberType, ³definition

attrList <- set_attributes(attributes, 
                           col_classes = c("Date", "numeric", "numeric","numeric", 
                                           "numeric","numeric", "numeric"))

Python

No easy function for this, so have to do it (tediously) by hand:

# NOTE: List of dicts 
attrList = [
    {
        "attributeName": "datetime",
        "attributeDefinition": "[dimension]{datetime}",
        "storageType": "date",
        "measurementScale": {
            "dateTime": {
                "formatString": "YYYY-MM-DD",
                "dateTimeDomain": []
            }
        }
    },
    {
        "attributeName": "depth",
        "attributeDefinition": "[dimension]{depth in reservior}",
        "storageType": "float",
        "measurementScale": {
            "ratio": {
                "unit": {"standardUnit": "meter"},
                "numericDomain": {"numberType": "real"}
            }
        }
    },
    {
        "attributeName": "ensemble",
        "attributeDefinition": "[dimension]{index of ensemble member}",
        "storageType": "float",
        "measurementScale": {
            "ratio": {
                "unit": {"standardUnit": "dimensionless"},
                "numericDomain": {"numberType": "integer"}
            }
        }
    },
    {
        "attributeName": "obs_flag",
        "attributeDefinition": "[dimension]{observation error}",
        "storageType": "float",
        "measurementScale": {
            "ratio": {
                "unit": {"standardUnit": "dimensionless"},
                "numericDomain": {"numberType": "integer"}
            }
        }
    },
    {
        "attributeName": "species_1",
        "attributeDefinition": "[variable]{Pop. density of species 1}",
        "storageType": "float",
        "measurementScale": {
            "ratio": {
                "unit": {"standardUnit": "numberPerMeterSquared"},
                "numericDomain": {"numberType": "real"}
            }
        }
    },
    {
        "attributeName": "species_2",
        "attributeDefinition": "[variable]{Pop. density of species 2}",
        "storageType": "float",
        "measurementScale": {
            "ratio": {
                "unit": {"standardUnit": "numberPerMeterSquared"},
                "numericDomain": {"numberType": "real"}
            }
        }
    },
    {
        "attributeName": "data_assimilation",
        "attributeDefinition": "[flag]{whether time step assimilated data}",
        "storageType": "float",
        "measurementScale": {
            "ratio": {
                "unit": {"standardUnit": "dimensionless"},
                "numericDomain": {"numberType": "integer"}
            }
        }
    }
]

Note that the EFI standard extends the EML attibuteDefinition to include a [variable_type] and {variable_definition}. The options for [variable_type] include: * dimension * variable = output variable * diagnostic = variable output purely for diagnostic purposes * observation = data that is or could be compared to an output_variable * obs_error * flag * initial_condition * driver * parameter * random_effect * process_error

We then link the attribute information with the info about a particular file

Code

R

## sets metadata about the file itself (name, file type, size, MD5, etc)
physical <- set_physical("logistic-forecast-ensemble-multi-variable-multi-depth.csv",
                         recordDelimiter='\n')

## Automatically calculated file size using file.size("logistic-forecast-ensemble-multi-variable-multi-depth.csv")

## Automatically calculated authentication size using digest::digest("logistic-forecast-ensemble-multi-variable-multi-depth.csv", algo = "md5", file = TRUE)

## set metadata for the file as a whole
dataTable <- eml$dataTable(
                 entityName = "forecast",  ## this is a standard name to allow us to distinguish this entity from 
                 entityDescription = "Forecast of population size using a depth specific model",
                 physical = physical,
                 attributeList = attrList)

Python

physical = {
    "objectName": "logistic-forecast-ensemble-multi-variable-multi-depth.csv",
    "size": {"unit": "bytes", "size": "110780"},
    "authentication": {
        "method": "MD5",
        "authentication": "4dbe687fa1f5fc0ff789096076eebd78"
    },
    "dataFormat": {
        "textFormat": {
            "recordDelimiter": "\n",
            "attributeOrientation": "column",
            "simpleDelimited": {"fieldDelimiter": ","}
        }
    }
}

dataTable = {
    "entityName": "forecast",  ## this is a standard name to allow us to distinguish this entity from 
    "entityDescription": "Forecast of population size using a depth specific model",
    "physical": physical,
    "attributeList": attrList
}

Here entityName = "forecast" is a standard name within the EFI standard to allow us to distinguish this entity from metadata about parameters, drivers, etc.

There’s a lot more optional terminology that could be exploited here – for instance, the specification lets us define different missing value codes (and explanations) for each column, and allows us to indicate precision, minimum and maximum.

Note that physical type can document almost any formats as well, including NetCDF etc. A NetCDF file would still document the variables measured in much the same way regardless of the underlying representation.

Provenance

Now that we’ve documented the actual data.frame itself, we can add additional metadata to the record describing our forecast, which is essential for citing, discovering, and interpreting the result. We start with some authorship information.

Code

R

me <- list(individualName = list(givenName = "Quinn", 
                                 surName = "Thomas"),
           electronicMailAddress = "rqthomas@vt.edu",
           id = "https://orcid.org/0000-0003-1282-7825")

Python

me = {
    "individualName": {"givenName": "Quinn", "surName": "Thomas"},
    "electronicMailAddress": "rqthomas@vt.edu",
    "id": "https://orcid.org/0000-0003-1282-7825"
}

Coverage

Set Taxonomic, Temporal, and Geographic Coverage. (Look, apparently we’re modeling population densities of Gloeotrichia echinulata and Anabaena circinalis cyanobacterium in Lake Sunapee, NH, USA)

Code

R

taxa <- tibble::tribble(
  ~Genus,      ~Species,
"Gloeotrichia", "echinulata",
"Anabaena",     "circinalis")
coverage <- 
  set_coverage(begin = first(datetimes), 
               end = last(datetimes),
               sci_names = taxa,
               geographicDescription = "Lake Sunapee, NH, USA ",
               west = -72.15, east = -72.05, 
               north = 43.48, south = 43.36)

Python

No equivalent utility, so have to (tediously) do this by hand.

coverage = {
    "geographicCoverage": {
        "geographicDescription": "Lake Sunapee, NH, USA",
        "boundingCoordinates": {
            "westBoundingCoordinate": -72.15,
            "eastBoundingCoordinate": -72.05,
            "northBoundingCoordinate": 43.48,
            "southBoundingCoordinate": 43.36
        }
    },
    "temporalCoverage": {"rangeOfDates": {
        "beginDate": {"calendarDate": str(datetimes[0])},
        "endDate": {"calendarDate": str(datetimes[-1])}
    }},
    "taxonomicCoverage": {
        "taxonomicClassification": {
            "taxonRankName": "Genus",
            "taxonRankValue": "Gloeotrichia",
            "taxonomicClassification": {
                "taxonRankName": "Species",
                "taxonRankValue": "echinulata"
            }
        },
        "taxonomicClassification": {
            "taxonRankName": "Genus",
            "taxonRankValue": "Anabaena",
            "taxonomicClassification": {
                "taxonRankName": "Species",
                "taxonRankValue": "circinalis"
            }
        }
    }
}

Keywords

Set key words. We will need to develop a EFI controlled vocabulary

Code

R

keywordSet <- list(
    list(
        keywordThesaurus = "EFI controlled vocabulary",
        keyword = list("forecast",
                    "population",
                    "timeseries")
    ))

Python

# NOTE: List of dicts!
keywordSet = [{
    "keywordThesaurus" : "EFI controlled vocabulary",
    "keyword" : ["forecast", "population", "timeseries"]
}]

Abstract

Our dataset needs an abstract describing what this is all about.

Code

R

abstract_text <- system.file("extdata", "abstract.md", package="EFIstandards", mustWork = TRUE)

Python

with open("../inst/extdata/abstract.md", "r") as f:
    abstract_text = f.read()

Dataset

Next, we’ll combine these various bits to document the output dataset as a whole

Code

R

dataset = eml$dataset(
               title = "A very simple Lotka-Volterra forecast",
               creator = me,
               contact = list(references="https://orcid.org/0000-0003-1282-7825"),
               pubDate = reference_datetime,
               intellectualRights = "http://www.lternet.edu/data/netpolicy.html.",
               abstract =  "An illustration of how we might use EML metadata to describe an ecological forecast",
               dataTable = dataTable,
               keywordSet = keywordSet,
               coverage = coverage
               )

Python

dataset = {
    "title" : "A very simple Lotka-Volterra forecast",
    "creator" : me,
    "contact" : {"references": "https://orcid.org/0000-0003-1282-7825"},
    "pubDate" : reference_datetime,
    "intellectualRights" : "http://www.lternet.edu/data/netpolicy.html.",
    "abstract" :  "An illustration of how we might use EML metadata to describe an ecological forecast",
    "dataTable" : dataTable,
    "keywordSet" : keywordSet,
    "coverage" : coverage
}

Forecast-specific additionalMetadata

The EFI standard is using EML’s additionalMetadata capacity. Section 2.1 of the EFI standard describes both the basic elements (2.1.1) and the info about model structure and uncertainty (2.1.2).

Code

R

additionalMetadata <- eml$additionalMetadata(
  metadata = list(
    forecast = list(
## Basic elements
      timestep = "1 day", ## should be udunits parsable; already in coverage -> temporalCoverage?
      horizon = "30 days",
      reference_datetime = reference_datetime,
      iteration_id = iteration_id,
      project_id = project_id,
      metadata_standard_version = "0.5",
      model_description = list(
        model_id = model_id,
        name = "discrete Lotka–Volterra model",
        type = "process-based",
        repository = "https://github.com/eco4cast/EFIstandards/blob/master/vignettes/logistic-metadata-example.Rmd"
      ),
## MODEL STRUCTURE & UNCERTAINTY CLASSES
      initial_conditions = list(
        # Possible values: absent, present, data_driven, propagates, assimilates
        present = TRUE,
        data_driven = TRUE,
        # Number of parameters / dimensionality
        complexity = 2,  ## [species 1, species 2] per depth
        propagation = list(
          type = "ensemble",
          size = 10
        )
      ),
      drivers = list(
        present = FALSE
      ),
      parameters = list(
        present = TRUE,
        data_driven = TRUE,
        complexity = 6  ## [r, K, alpha] x 2 spp
      ),
      random_effects = list(
        present = FALSE
      ),
      process_error = list(
        present = TRUE,
        data_driven = TRUE,
        status = "propagates",
        propagation = list(
          type = "ensemble", # ensemble vs analytic
          size = 10          # required if ensemble
        ),
        complexity = 2,   
        covariance = FALSE
      ),
      obs_error = list(
        present = TRUE,
        data_driven = TRUE,
        complexity = 2,   
        covariance = FALSE
      )
    ) # forecast
  ) # metadata
) # eml$additionalMetadata

Python

additionalMetadata = {
  "metadata" : { 
    "forecast" : { 
      ## Basic elements
      "timestep" : "1 day", ## should be udunits parsable; already in coverage -> temporalCoverage?
      "horizon" : "30 days",
      "reference_datetime" : reference_datetime,
      "iteration_id" : iteration_id,
      "project_id" : project_id,
      "metadata_standard_version" : "0.5",
      "model_description" : {
        "model_id" : model_id,
        "name" : "discrete Lotka–Volterra model",
        "type" : "process-based",
        "repository" : "https://github.com/eco4cast/EFIstandards/blob/master/vignettes/logistic-metadata-example.Rmd"
        },
        ## MODEL STRUCTURE & UNCERTAINTY CLASSES
      "initial_conditions" : {
        "present" : True,
        "date_driven" : True,
        # Number of parameters / dimensionality
        "complexity" : 2,  ## [species 1, species 2] per depth
        "propagation" : {
          "type" : "ensemble",
          "size" : 10
          }
        },
        "drivers" : {"present" : False},
        "parameters" : {
          "present" : True,
          "date_driven" : True,
          "complexity" : 6  ## [r, K, alpha] x 2 spp
        },
        "random_effects" : {"present" : False},
        "process_error" : {
          "present" : True,
          "date_driven" : True,
          "propagation" : {
            "type" : "ensemble", # ensemble vs analytic
            "size" : 10          # required if ensemble
            },
            "complexity" : 2,   
            "covariance" : False
        },
        "obs_error" : {
            "present" : True,
            "date_driven" : True,
            "complexity" : 2,
            "covariance" : False
        }
      } # forecast
    } # metadata
}

Within the additionalMetadata, we anticipate the model structure info will be the most challenging to document. All six model structural elements / uncertainty classes need to be documented, even if just to record that a specific element is absent in the model. The six classes are:

Class	Description
initial_conditions	Uncertainty in the initialization of state variables (Y)
drivers	Uncertainty in model drivers, covariates, and exogenous scenarios (X)
parameters	Uncertainty in model parameters (\(\theta\))
random_effects	Unexplained variability and heterogeneity in model parameters (\(\alpha\))
obs_error	Uncertainty in the observations of the output variables (g)
process_error	Dynamic uncertainty in the process model (\(\epsilon\))

Each class has a very similar structure and the only required element in each is <present> which is a boolean indicating whether this concept is included in the model (TRUE) or not (FALSE).

If present = TRUE then the <data_driven> tag is also required, and is a boolean indicating whether the quantity being discussed is based on data (TRUE) or not (FALSE). Examples of non-data-driven inputs include scenario-based drivers, spin-up initial conditions, and hand-tuned or theoretical parameters.

Next, if present = TRUE the <complexity> tag is recommended, but not required, and should list the size/dimension of each status/uncertainty class at a single location (per dimensions defined above). For example, the logistic model has two initial conditions (n[1], n[2]), six parameters (r, K, and alpha for each species), and both process and observation error on the two state variables.

The <covariance> tag states whether the errors/uncertainties within an uncertainty class are being treated as independent or not.

Finally, the <propagation> and <assimilation> tags are used to indicate, respectively, which uncertainties are propagated into the forecast and which are updated iteratively. Both include a set of recommended subtags used to provide additional information about the methods used (see EFI standard doc). If subtags are not used, one should just include a empty tag (e.g. <propagation />) to indicate that this concept is present.

Assembling and validating the metadata

All we need now is to combine the dataset metadata with the forecast additionalMetadata

Code

R

my_eml <- eml$eml(dataset = dataset,
           additionalMetadata = additionalMetadata,
           packageId = iteration_id , 
           system = "datetime"  ## system used to generate packageId
           )

Python

No nice utilities, so we have to create the XML by hand. This is significantly simplified by the dicttoxml package, though we still have to do a bit of extra work.

dataset_xml = ET.fromstring(dicttoxml.dicttoxml(
    dataset,
    custom_root="dataset",
    attr_type=False
))

metadata_xml = ET.fromstring(dicttoxml.dicttoxml(
    additionalMetadata,
    custom_root="additionalMetadata",
    attr_type=False
))

my_eml = ET.Element("eml:eml", {
    "xmlns:eml": "https://eml.ecoinformatics.org/eml-2.2.0",
    "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
    "xmlns:stmml": "http://www.xml-cml.org/schema/stmml-1.2",
    "system": "datetime",
    "packageId": iteration_id,
    "xsi:schemaLocation": "https://eml.ecoinformatics.org/eml-2.2.0 https://eml.ecoinformatics.org/eml-2.2.0/eml.xsd"
})

my_eml.append(dataset_xml)
my_eml.append(metadata_xml)

Once we have finished building our EML metadata, we can confirm it is valid.
This will catch any missing elements. (Recall that what is ‘required’ depends on what you include – for example, you don’t have to document a dataTable at all, but if you do, you have to document the “physical” file format it is in
(e.g. csv) and the attributes and units it uses!)

Code

R

## check base EML
emld::eml_validate(my_eml)

## [1] TRUE
## attr(,"errors")
## character(0)

## check that the EML is also a valid EFI forecast
EFIstandards::forecast_validator(my_eml)

## • Checking Validity of EML file...
## ✔ EML is valid
## ✔ additionalMetadata found
## ✔ metadata_standard_version found
## ✔ timestep parsable
## ✔ horizon parsable
## ✔ reference_datetime found
## ✔ iteration_id found
## ✔ project_id found
## ✔ model_description found
## ✔ model_id found
## ✔ name found
## ✔ type found
## ✔ repository found
## ✔ initial_conditions found
## ✔ present found
## ✔ data_driven found
## ✔ complexity valid
## ✔ type found
## ✔ initial_conditions propagation type valid: ensemble
## ✔ size valid
## ✔ parameters found
## ✔ present found
## ✔ data_driven found
## ✔ complexity valid
## ✔ drivers found
## ✔ present found
## ✔ random_effects found
## ✔ present found
## ✔ process_error found
## ✔ present found
## ✔ data_driven found
## ✔ complexity valid
## ✔ type found
## ✔ process_error propagation type valid: ensemble
## ✔ size valid
## ✔ obs_error found
## ✔ present found
## ✔ data_driven found
## ✔ complexity valid

## [1] TRUE
## attr(,"errors")
## character(0)

Python

There is no Python EML validation utility. Therefore, we will have to use the R utility here. First, we write to disk.

with open("forecast-eml-python.xml", "w") as f:
    f.write(ET.tostring(my_eml, encoding="unicode"))

## 6209

Next, we have two options: One is to use the Python standard library to spawn a separate R process that runs the validator.

import subprocess
result = subprocess.run([
    "Rscript",
    "-e", 'emld::eml_validate("forecast-eml-python.xml")',
    "-e", 'EFIstandards::forecast_validator("forecast-eml-python.xml")'
], capture_output=True)

Alternatively, we can use the rpy2 module to run R directly from within Python (analogous to R’s reticulate).

import rpy2
from rpy2.robjects.packages import importr
emld = importr("emld")
EFIstandards = importr("EFIstandards")

emld.eml_validate("forecast-eml-python.xml")
EFIstandards.forecast_validator("forecast-eml-python.xml")

We are now ready to write out a valid EML document:

Code

R

write_eml(my_eml, "forecast-eml.xml")

Python

We already did this above to validate the document. However, for completeness:

with open("forecast-eml-python.xml", "w") as f:
    f.write(ET.tostring(my_eml, encoding="unicode"))

## 6209

At this point, we could easily upload this metadata along with the data itself to a repository (e.g. DataONE) manually or via an API.

We can also generate a JSON-LD version of EML:

Code

R

emld::as_json(as_emld("forecast-eml.xml"), file = "forecast-eml.json")

Python

(Under construction)

Example metadata for a Lotka-Volterra population growth forecast

Code

R

Python

Generating the forecast

Code

R

Python

Code

R

Python

Code

R

Python

Saving to a standardized output format

Standard Option 1: netCDF

Forecast identifiers

Code

R

Python

netcdf Dimensions

Code

R

Python

Variable names

Code

R

Python

Creating the file

Code

R

Python

Standard Option 2: ensemble CSV

Code

R

Python

Standard Option 3: distributional CSV

Code

R

Python

Standardized Metadata

Forecast output entity: dataTable

Code

R

Python

Code

R

Python

Provenance

Code

R

Python

Coverage

Code

R

Python

Keywords

Code

R

Python

Abstract

Code

R

Python

Dataset

Code

R

Python

Forecast-specific additionalMetadata

Code

R

Python

Assembling and validating the metadata

Code

R

Python

Code

R

Python

Code