Making the Iris Dataset FAIR-Compliant

The Problem

The famous Iris dataset exists as a simple CSV everywhere. It’s not FAIR:

  • Findable? No persistent identifiers
  • Accessible? No metadata about provenance
  • Interoperable? No ontology links
  • Reusable? No license, no data dictionary

The Solution

Build a proof-of-concept using Airtable (free) + R to transform it into FAIR-compliant JSON-LD.

Architecture

Three linked tables in Airtable:

  1. Observations - the 150 iris measurements
  2. Taxa - species metadata with NCBI Taxonomy IDs
  3. Property_Mappings - ontology URIs for each measurement (Plant Ontology)

An R script pulls the tables via the Airtable API and generates JSON-LD with:

  • Persistent identifiers for each record
  • Schema.org vocabulary
  • Ontology mappings (PO, NCBI)
  • Full provenance and licensing

Implementation

library( airtabler )
library( dplyr )

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library( tidyr )
library(jsonlite)
library(purrr)

Attaching package: 'purrr'
The following object is masked from 'package:jsonlite':

    flatten
source( "./utils.R" )
# Pull the three linked Airtable tables that feed the JSON-LD conversion.
# The base ID comes from the AIRTABLE_BASE environment variable; fail early
# with a clear message instead of passing an empty ID on to the API client.
airtable_base <- Sys.getenv("AIRTABLE_BASE")
if (!nzchar(airtable_base)) {
  stop("Environment variable AIRTABLE_BASE is not set.", call. = FALSE)
}

df_at <-
  airtable(
    base = airtable_base,
    tables = c(
      "Observations",
      "Property_Mappings (ontology)",
      "Taxa (species metadata)"
    )
  )

# Species and its lookup field are unnested in one call so the two columns
# are expanded in parallel.
df_obs <-
  df_at$Observations$select_all() %>%
  unnest(c(Species, `Scientific_Name (from Species)`))

# Property_Name comes back padded with trailing spaces (e.g. "sepal_width  "),
# which leaks into the exported metadata, so trim it once at load time.
# NOTE(review): the rendered records show null propertyID / description /
# unitCode; the padded names are a plausible cause of a failed lookup inside
# convert_observation_to_jsonld() -- confirm against utils.R.
df_ontol <-
  df_at$`Property_Mappings (ontology)`$select_all() %>%
  mutate(Property_Name = trimws(Property_Name))

df_taxa <- df_at$`Taxa (species metadata)`$select_all()

Convert all observations to JSON-LD

# Config
# BASE_DOI is the prefix used for the dataset "@id" and for URLs derived from
# it. Set the FAIR_IRIS_BASE_URI environment variable to supply a real
# persistent identifier (e.g. a DOI URL) without editing this file.
# NOTE(review): the fallback below is an Airtable invite link, not a DOI. It
# embeds an invite token, and because it ends in a query string, anything
# appended to it ("/record/...", "/<file>.json") lands inside the last query
# parameter. Replace it with a real identifier (and consider revoking the
# invite) before publishing.
BASE_DOI <- Sys.getenv(
  "FAIR_IRIS_BASE_URI",
  unset = "https://airtable.com/invite/l?inviteId=invB41lvSB2IoCwtV&inviteToken=912532bce994100be88dfd49cd69d56d0223915e66d6f5b8b431501350901fac&utm_medium=email&utm_source=product_team&utm_content=transactional-alerts"
)
LICENSE_URL <- "https://creativecommons.org/licenses/by/4.0/"




cat("Converting", nrow(df_obs), "observations to JSON-LD format...\n")
Converting 150 observations to JSON-LD format...
# Convert every observation row into one JSON-LD record (a list per row).
# Each row is handed to convert_observation_to_jsonld() as a named list
# holding the first element of every column, which is the form
# `rowwise() %>% do()` supplied as `.`; `do()` is superseded in dplyr, so the
# rows are iterated explicitly instead.
jsonld_records <- lapply(
  seq_len(nrow(df_obs)),
  function(row_idx) {
    obs_row <- lapply(df_obs[row_idx, , drop = FALSE], `[[`, 1)
    convert_observation_to_jsonld(obs_row, df_taxa, df_ontol)
  }
)

cat("Conversion complete!\n\n")
Conversion complete!

Create dataset-level metadata

# Dataset-level description in schema.org terms. It is serialised with
# jsonlite::toJSON() further down, so every element must be JSON-friendly.
dataset_metadata <- list(
  `@context` = "http://schema.org",
  `@type` = "Dataset",
  `@id` = BASE_DOI,
  name = "FAIR Iris Dataset",
  description = paste(
    "Morphological measurements of 150 Iris flowers from three species",
    "(Iris setosa, Iris versicolor, and Iris virginica).",
    "This dataset demonstrates FAIR data principles with linked ontologies",
    "and comprehensive metadata."
  ),
  creator = list(
    list(
      `@type` = "Person",
      name = "Edgar Anderson",
      affiliation = "Washington University"
    ),
    list(
      `@type` = "Person",
      name = "Ronald A. Fisher",
      affiliation = "University College London"
    )
  ),
  datePublished = "1936",
  # Stamped with the date the script runs, so the value changes per render.
  dateModified = Sys.Date(),
  license = LICENSE_URL,
  keywords = c("Iris", "morphology", "botany", "classification", "machine learning"),
  version = "1.0.0",
  distribution = list(
    list(
      `@type` = "DataDownload",
      encodingFormat = "application/ld+json",
      # Points at the file name the export step actually writes
      # (iris_complete_dataset.json), so the download link and the exported
      # artefact agree.
      contentUrl = paste0(BASE_DOI, "/iris_complete_dataset.json")
    )
  ),
  measurementTechnique = "Manual measurement using calipers",
  spatialCoverage = "Gaspé Peninsula, Quebec, Canada",
  temporalCoverage = "1935",
  numberOfRecords = nrow(df_obs),
  # The Airtable values carry trailing padding ("sepal_length "); strip it so
  # the published variable names are clean.
  variableMeasured = trimws(df_ontol$Property_Name)
)

Print

# Show the first converted record as indented JSON
first_record_json <- toJSON(jsonld_records[[1]], auto_unbox = TRUE)
cat(prettify(first_record_json))
{
    "@context": "http://schema.org",
    "@type": "Dataset",
    "@id": "https://airtable.com/invite/l?inviteId=invB41lvSB2IoCwtV&inviteToken=912532bce994100be88dfd49cd69d56d0223915e66d6f5b8b431501350901fac&utm_medium=email&utm_source=product_team&utm_content=transactional-alerts/record/000111",
    "identifier": 111,
    "name": "Iris specimen 111",
    "description": "Morphological measurements of Iris specimen 111",
    "license": "https://creativecommons.org/licenses/by/4.0/",
    "measurement": [
        {
            "@type": "PropertyValue",
            "propertyID": null,
            "name": "sepal length",
            "description": null,
            "value": 6.5,
            "unitCode": null,
            "unitText": "centimeters"
        },
        {
            "@type": "PropertyValue",
            "propertyID": null,
            "name": "sepal width",
            "description": null,
            "value": 3.2,
            "unitCode": null,
            "unitText": "centimeters"
        },
        {
            "@type": "PropertyValue",
            "propertyID": null,
            "name": "petal length",
            "description": null,
            "value": 5.1,
            "unitCode": null,
            "unitText": "centimeters"
        },
        {
            "@type": "PropertyValue",
            "propertyID": null,
            "name": "petal width",
            "description": null,
            "value": 2,
            "unitCode": null,
            "unitText": "centimeters"
        }
    ],
    "dateCreated": "2026-02-27T08:11:18.000Z",
    "about": {
        "@type": "Taxon",
        "@id": "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=4449",
        "scientificName": "virginica",
        "commonName": "Virginia iris",
        "taxonRank": "species",
        "kingdom": "Plantae",
        "family": "Iridaceae",
        "genus": "Iris"
    },
    "provenance": {
        "@type": "PropertyValue",
        "name": "data provenance",
        "collector": {
            "@type": "Person",
            "name": "Edgar Anderson"
        },
        "collectionDate": "1935-10-03",
        "location": "Gaspé Peninsula, Quebec, Canada"
    }
}
# Show only the measurement list of the first record
first_measurement_json <- toJSON(
  jsonld_records[[1]]$measurement,
  auto_unbox = TRUE
)
cat(prettify(first_measurement_json))
[
    {
        "@type": "PropertyValue",
        "propertyID": null,
        "name": "sepal length",
        "description": null,
        "value": 6.5,
        "unitCode": null,
        "unitText": "centimeters"
    },
    {
        "@type": "PropertyValue",
        "propertyID": null,
        "name": "sepal width",
        "description": null,
        "value": 3.2,
        "unitCode": null,
        "unitText": "centimeters"
    },
    {
        "@type": "PropertyValue",
        "propertyID": null,
        "name": "petal length",
        "description": null,
        "value": 5.1,
        "unitCode": null,
        "unitText": "centimeters"
    },
    {
        "@type": "PropertyValue",
        "propertyID": null,
        "name": "petal width",
        "description": null,
        "value": 2,
        "unitCode": null,
        "unitText": "centimeters"
    }
]
# Show the dataset-level metadata as indented JSON
metadata_json_preview <- toJSON(dataset_metadata, auto_unbox = TRUE)
cat(prettify(metadata_json_preview))
{
    "@context": "http://schema.org",
    "@type": "Dataset",
    "@id": "https://airtable.com/invite/l?inviteId=invB41lvSB2IoCwtV&inviteToken=912532bce994100be88dfd49cd69d56d0223915e66d6f5b8b431501350901fac&utm_medium=email&utm_source=product_team&utm_content=transactional-alerts",
    "name": "FAIR Iris Dataset",
    "description": "Morphological measurements of 150 Iris flowers from three species (Iris setosa, Iris versicolor, and Iris virginica). This dataset demonstrates FAIR data principles with linked ontologies and comprehensive metadata.",
    "creator": [
        {
            "@type": "Person",
            "name": "Edgar Anderson",
            "affiliation": "Washington University"
        },
        {
            "@type": "Person",
            "name": "Ronald A. Fisher",
            "affiliation": "University College London"
        }
    ],
    "datePublished": "1936",
    "dateModified": "2026-02-27",
    "license": "https://creativecommons.org/licenses/by/4.0/",
    "keywords": [
        "Iris",
        "morphology",
        "botany",
        "classification",
        "machine learning"
    ],
    "version": "1.0.0",
    "distribution": [
        {
            "@type": "DataDownload",
            "encodingFormat": "application/ld+json",
            "contentUrl": "https://airtable.com/invite/l?inviteId=invB41lvSB2IoCwtV&inviteToken=912532bce994100be88dfd49cd69d56d0223915e66d6f5b8b431501350901fac&utm_medium=email&utm_source=product_team&utm_content=transactional-alerts/iris_fair_dataset.json"
        }
    ],
    "measurementTechnique": "Manual measurement using calipers",
    "spatialCoverage": "Gaspé Peninsula, Quebec, Canada",
    "temporalCoverage": "1935",
    "numberOfRecords": 150,
    "variableMeasured": [
        "sepal_length ",
        "sepal_width  ",
        "petal_length ",
        "petal_width  "
    ]
}

Extras: export JSON-LD files

# Export individual records
cat("Exporting JSON-LD files...\n")
Exporting JSON-LD files...
# Make sure the output folders exist; recursive = TRUE creates the parent
# directory as well, and an already-existing folder is silently accepted.
dir.create("fair_iris_output/records", recursive = TRUE, showWarnings = FALSE)

# Write one file per record, named after the zero-padded Record_ID of the
# observation row at the same position.
for (idx in seq_along(jsonld_records)) {
  padded_id <- sprintf("%06d", df_obs$Record_ID[idx])
  out_path <- paste0("fair_iris_output/records/iris_record_", padded_id, ".json")

  record_json <- toJSON(
    jsonld_records[[idx]],
    pretty = TRUE,
    auto_unbox = TRUE,
    null = "null"
  )
  write(record_json, file = out_path)
}

cat("  ✓ Exported", length(jsonld_records), "individual record files\n")
  ✓ Exported 150 individual record files
# Bundle every record into a single JSON-LD document: one shared context plus
# an "@graph" array holding all records, then write it to disk.
complete_dataset <- list(
  "@context" = "http://schema.org",
  "@graph" = jsonld_records
)

complete_json <- toJSON(
  complete_dataset,
  pretty = TRUE,
  auto_unbox = TRUE,
  null = "null"
)
write(complete_json, file = "fair_iris_output/iris_complete_dataset.json")

cat("  ✓ Exported complete dataset file\n")
  ✓ Exported complete dataset file
# Serialise the dataset-level metadata and save it next to the records
metadata_json <- toJSON(
  dataset_metadata,
  pretty = TRUE,
  auto_unbox = TRUE,
  null = "null"
)
write(metadata_json, file = "fair_iris_output/iris_dataset_metadata.json")

cat("  ✓ Exported dataset metadata file\n")
  ✓ Exported dataset metadata file