library(tidyjson)
library(magrittr)
library(jsonlite)
library(dplyr)

JSON Parsing

Each classification is an array. Depending on the workflow and how it’s changed, classification arrays may vary in structure within a single project. Also, empty arrays seem to be problematic. Depending on the type of project, you probably want to split the data by workflow and even restrict it to specific workflow versions prior to flattening.


Load example data

# Load the example classification exports. stringsAsFactors is spelled out as
# FALSE (not the reassignable alias F) so text columns, including the JSON in
# `annotations`, are read as character strings.
sas <- read.csv("../data/questions-SAS-1000.csv", stringsAsFactors = FALSE)
kitteh <- read.csv("../data/kitteh-zoo-classifications.csv", stringsAsFactors = FALSE)
wilde <- read.csv("../data/points-wildebeest.csv", stringsAsFactors = FALSE)
chicago <- read.csv("../data/chicago-wildlife-watch-classifications.csv", stringsAsFactors = FALSE)

Simple Yes or No Questions

# Pretty-print the annotation JSON of the first Snapshots at Sea row.
prettify(sas$annotations[1])
[
    {
        "task": "init",
        "task_label": "Are there whales or dolphins in this photo?",
        "value": "No"
    }
]
 

Simple Point Marking

# Pretty-print the annotation JSON of the second wildebeest row.
prettify(wilde$annotations[2])
[
    {
        "task": "T1",
        "task_label": "Mark every wildebeest with the **RED CIRCLE**. Remove unwanted marks with the **X** in the corner of the circle.",
        "value": [
            {
                "x": 3913.710798235543,
                "y": 2106.318257956449,
                "tool": 0,
                "frame": 0,
                "details": [

                ],
                "tool_label": "Wildebeest Marker"
            },
            {
                "x": 2803.6400990996435,
                "y": 2469.2312010502965,
                "tool": 0,
                "frame": 0,
                "details": [

                ],
                "tool_label": "Wildebeest Marker"
            },
            {
                "x": 2867.6826394344066,
                "y": 2163.2457784417584,
                "tool": 0,
                "frame": 0,
                "details": [

                ],
                "tool_label": "Wildebeest Marker"
            },
            {
                "x": 1586.8318327391382,
                "y": 1956.8835166825118,
                "tool": 0,
                "frame": 0,
                "details": [

                ],
                "tool_label": "Wildebeest Marker"
            },
            {
                "x": 1437.3992386246903,
                "y": 1778.9850151659198,
                "tool": 0,
                "frame": 0,
                "details": [

                ],
                "tool_label": "Wildebeest Marker"
            }
        ]
    }
]
 

Combination Question and Marking: Note that the format of the value array varies by task

# Pretty-print the annotation JSON of the first kitteh row.
prettify(kitteh$annotations[1])
[
    {
        "task": "init",
        "task_label": "How many cats are there in this image?",
        "value": "1"
    },
    {
        "task": "T1",
        "task_label": "Mark each cat's face and tail. Draw an ellipse around each cat's face (not including the ears) and mark the tail tip with a point.",
        "value": [

        ]
    }
]
 

Flattening the Files

It’s much easier to parse/flatten the JSON when everything is in a standard format, so you probably want to split out your raw file based on the Workflow and even Task IDs. You also want to limit to only the workflow version(s) with actual data. This is because previous versions, especially those with empty data, may have different structures for the classification data, which is annoying and problematic.

Note: you may need to dig into your raw data a bit to identify which workflow and version you need. Some projects have many workflows and versions, others not so many.

#' Summarise classifications per workflow and workflow version.
#'
#' Groups `data` by `workflow_id` and `workflow_version`, then prints one row
#' per group holding the maximum `created_at` value (`date`) and the number of
#' rows in the group (`count`).
#'
#' @param data A data frame with `workflow_id`, `workflow_version` and
#'   `created_at` columns.
#' @return The summary table, as returned by `print()`.
fun_check_workflow <- function(data) {
  by_workflow <- group_by(data, workflow_id, workflow_version)
  workflow_summary <- summarise(
    by_workflow,
    date = max(created_at),
    count = n()
  )
  print(workflow_summary)
}

For example: This is the Snapshots at Sea classifications by workflow

# Print the per-workflow summary for the Snapshots at Sea data.
fun_check_workflow(sas)

vs. that of the Wildebeest Marking Project

# Print the per-workflow summary for the wildebeest data.
fun_check_workflow(wilde)

Vs. Chicago Wildlife Watch

# Print the per-workflow summary for the Chicago Wildlife Watch data.
fun_check_workflow(chicago)

Basic Flattening

With jsonlite, you can basically flatten all of the JSON data into a series of nested lists. This works really well for simple data, like questions, but marking tasks and more complex workflows get a bit complicated.

library(jsonlite)
#' Basic flattening of the JSON `annotations` column.
#'
#' Parses each row's `annotations` string with `fromJSON()`, prefixes the
#' parsed result with that row's `classification_id` and `subject_ids` values,
#' and row-binds the per-classification pieces.
#'
#' @param jdata A data frame with `classification_id`, `subject_ids` and
#'   `annotations` columns.
#' @return The row-bound result for all classifications; `NULL` when `jdata`
#'   has zero rows.
basic_flattening <- function(jdata) {
  # seq_len() yields an empty sequence for zero rows, whereas 1:0 would
  # iterate over c(1, 0) and try to parse a non-existent annotation.
  row_ids <- seq_len(nrow(jdata))

  # One element per classification, built without growing a list in a loop.
  out <- lapply(row_ids, function(i) {
    classification_id <- jdata$classification_id[i]
    subject_id <- jdata$subject_ids[i]
    split_anno <- fromJSON(
      txt = jdata$annotations[i],
      simplifyDataFrame = TRUE
    )
    cbind(classification_id, subject_id, split_anno)
  })

  do.call(what = rbind, args = out)
}

Single questions flatten alright

# Flatten the Snapshots at Sea classifications and show the result's structure.
flat_sas <- basic_flattening(sas)
str(flat_sas)
'data.frame':   999 obs. of  5 variables:
 $ classification_id: int  2896902 2896910 2896915 2896917 2896923 2896934 2896940 2896945 2896950 2896958 ...
 $ subject_id       : int  920838 920143 920327 920336 920414 919869 920709 920171 920992 920875 ...
 $ task             : chr  "init" "init" "init" "init" ...
 $ task_label       : chr  "Are there whales or dolphins in this photo?" "Are there whales or dolphins in this photo?" "Are there whales or dolphins in this photo?" "Are there whales or dolphins in this photo?" ...
 $ value            : chr  "No" "Yes" "Yes" "No" ...

But more complex questions produce embedded lists inside the “value” column.

# Flatten rows 1 to 10 of the wildebeest data; max.level = 2 limits how deep
# str() descends into the nested `value` column.
flat_wilde <- basic_flattening(wilde[1:10, ])
str(flat_wilde, max.level = 2)
'data.frame':   10 obs. of  5 variables:
 $ classification_id: int  2001677 2001683 2001684 2002042 2004527 3866661 3866666 3867128 4440549 4440554
 $ subject_id       : int  483320 483321 483320 483321 483320 1042992 1042854 1042907 1042916 1042956
 $ task             : chr  "T1" "T1" "T1" "T1" ...
 $ task_label       : chr  "Mark every wildebeest with the *Red circle*. Removed unwanted marks with " "Mark every wildebeest with the **RED CIRCLE**. Remove unwanted marks with the **X** in the corner of the circle." "Mark every wildebeest with the **RED CIRCLE**. Remove unwanted marks with the **X** in the corner of the circle." "Mark every wildebeest with the **RED CIRCLE**. Remove unwanted marks with the **X** in the corner of the circle." ...
 $ value            :List of 10
  ..$ :'data.frame':    30 obs. of  6 variables:
  ..$ :'data.frame':    5 obs. of  6 variables:
  ..$ : list()
  ..$ :'data.frame':    22 obs. of  6 variables:
  ..$ : list()
  ..$ :'data.frame':    6 obs. of  6 variables:
  ..$ :'data.frame':    3 obs. of  6 variables:
  ..$ :'data.frame':    10 obs. of  6 variables:
  ..$ :'data.frame':    2 obs. of  6 variables:
  ..$ : list()
# Flatten the kitteh classifications; max.level = 3 lets str() show the fields
# inside the nested `value` entries.
flat_kitteh <- basic_flattening(kitteh)
str(flat_kitteh, max.level = 3)
'data.frame':   6 obs. of  5 variables:
 $ classification_id: int  2962830 2962830 2962838 2962838 9451458 9451458
 $ subject_id       : int  458021 458021 458022 458022 458041 458041
 $ task             : chr  "init" "T1" "init" "T1" ...
 $ task_label       : chr  "How many cats are there in this image?" "Mark each cat's face and tail. Draw an ellipse around each cat's face (not including the ears) and mark the tail tip with a poi"| __truncated__ "How many cats are there in this image?" "Mark each cat's face and tail. Draw an ellipse around each cat's face (not including the ears) and mark the tail tip with a poi"| __truncated__ ...
 $ value            :List of 6
  ..$ : chr "1"
  ..$ : chr 
  ..$ : chr "1"
  ..$ : chr 
  ..$ : chr "1"
  ..$ :'data.frame':    1 obs. of  9 variables:
  .. ..$ x         : num 326
  .. ..$ y         : int 137
  .. ..$ rx        : num 227
  .. ..$ ry        : num 114
  .. ..$ tool      : int 0
  .. ..$ angle     : num -39.6
  .. ..$ frame     : int 0
  .. ..$ details   :List of 1
  .. ..$ tool_label: chr "Catface"
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmxpYnJhcnkodGlkeWpzb24pCmxpYnJhcnkobWFncml0dHIpCmxpYnJhcnkoanNvbmxpdGUpCmxpYnJhcnkoZHBseXIpCmBgYAoKCiMgSlNPTiBQYXJzaW5nCgpFYWNoIGNsYXNzaWZpY2F0aW9uIGlzIGFuIGFycmF5LiBEZXBlbmRpbmcgb24gdGhlIHdvcmtmbG93IGFuZCBob3cgaXQncyBjaGFuZ2VkLCBjbGFzc2lmaWNhdGlvbiBhcnJheXMgbWF5IHZhcnkgaW4gc3RydWN0dXJlIHdpdGhpbiBhIHNpbmdsZSBwcm9qZWN0LiBBbHNvLCBlbXB0eSBhcnJheXMgc2VlbSB0byBiZSBwcm9ibGVtYXRpYy4gRGVwZW5kaW5nIG9uIHRoZSB0eXBlIG9mIHByb2plY3QsIHlvdSBwcm9iYWJseSB3YW50IHRvIHNwbGl0IHRoZSBkYXRhIGludG8gd29ya2Zsb3dzIGFuZCBldmVuIGxpbWl0IHdvcmtmbG93IHZlcnNpb24gcHJpb3IgdG8gZmxhdHRlbmluZy4gCgotLS0KCiMjIyMgTG9hZCBleGFtcGxlIGRhdGEKYGBge3IgbG9hZCBleGFtcGxlIGRhdGF9CnNhcyA8LSByZWFkLmNzdigiLi4vZGF0YS9xdWVzdGlvbnMtU0FTLTEwMDAuY3N2Iiwgc3RyaW5nc0FzRmFjdG9ycyA9IEYpCmtpdHRlaCA8LSByZWFkLmNzdigiLi4vZGF0YS9raXR0ZWgtem9vLWNsYXNzaWZpY2F0aW9ucy5jc3YiLCBzdHJpbmdzQXNGYWN0b3JzID0gRikKd2lsZGUgPC0gcmVhZC5jc3YoIi4uL2RhdGEvcG9pbnRzLXdpbGRlYmVlc3QuY3N2Iiwgc3RyaW5nc0FzRmFjdG9ycyA9IEYpCmNoaWNhZ28gPC0gcmVhZC5jc3YoIi4uL2RhdGEvY2hpY2Fnby13aWxkbGlmZS13YXRjaC1jbGFzc2lmaWNhdGlvbnMuY3N2Iiwgc3RyaW5nc0FzRmFjdG9ycyA9IEYpCgpgYGAKCiMjIyMgU2ltcGxlIFllcyBvciBObyBRdWVzdGlvbnMKCmBgYHtyIGRpc3BsYXkgZXhhbXBsZSBhbm5vdGF0aW9uIGZvcm1hdHN9CnNhcyRhbm5vdGF0aW9uc1sxXSAlPiUgcHJldHRpZnkKYGBgCgojIyMjIFNpbXBsZSBQb2ludCBNYXJraW5nCmBgYHtyfQp3aWxkZSRhbm5vdGF0aW9uc1syXSAlPiUgcHJldHRpZnkoKQpgYGAKIyMjIyBDb21iaW5hdGlvbiBRdWVzdGlvbiBhbmQgTWFya2luZzogTm90ZSB0aGF0IHRoZSBmb21hdCBvZiB0aGUgdmFsdWUgYXJyYXkgdmFyaWVzIGJ5IHRhc2sKYGBge3J9CmtpdHRlaCRhbm5vdGF0aW9uc1sxXSAlPiUgcHJldHRpZnkKCmBgYAoKIyBGbGF0dGVuaW5nIHRoZSBGaWxlcwoKSXQncyBtdWNoIGVhc2llciB0byBwYXJzZS9mbGF0dGVuIHRoZSBKU09OIHdoZW4gZXZlcnl0aGluZyBpcyBpbiBhIHN0YW5kYXJkIGZvcm1hdCwgc28geW91IHByb2JhYmx5IHdhbnQgdG8gc3BsaXQgb3V0IHlvdXIgcmF3IGZpbGUgYmFzZWQgb24gdGhlIFdvcmtmbG93IGFuZCBldmVuIFRhc2sgSURzLiBZb3UgYWxzbyB3YW50IHRvIGxpbWl0IHRvIG9ubHkgdGhlIHdvcmtmbG93IHZlcnNpb24ocykgd2l0aCBhY3R1YWwgZGF0YS4gVGhpcyBpcyBiZWNhdXNlIHByZXZpb3VzIHZlcnNpb25zLCBlc3Bl
Y2lhbGx5IHRob3NlIHdpdGggZW1wdHkgZGF0YSwgbWF5IGhhdmUgZGlmZmVyZW50IHN0cnVjdHVyZXMgZm9yIHRoZSBjbGFzc2lmaWNhdGlvbiBkYXRhLCB3aGljaCBpcyBhbm5veWluZyBhbmQgcHJvYmxlbWF0aWMuCgpOb3RlOiB5b3UgbWF5IG5lZWQgdG8gZGlnIGludG8geW91ciByYXcgZGF0YSBhIGJpdCB0byBpZGVudGlmeSB3aGljaCB3b3JrZmxvdyBhbmQgdmVyc2lvbiB5b3UgbmVlZC4gU29tZSBwcm9qZWN0cyBoYXZlIG1hbnkgd29ya2Zsb3dzIGFuZCB2ZXJzaW9ucywgb3RoZXJzIG5vdCBzbyBtYW55LgoKYGBge3Igd29ya2Zsb3dfZnVuX2RlZmluaXRpb259CmZ1bl9jaGVja193b3JrZmxvdyA8LSBmdW5jdGlvbihkYXRhKXsKIGRhdGEgJT4lIGdyb3VwX2J5KHdvcmtmbG93X2lkLCB3b3JrZmxvd192ZXJzaW9uKSAlPiUgCiAgICAgICAgICBzdW1tYXJpc2UoZGF0ZSA9IG1heChjcmVhdGVkX2F0KSwgY291bnQgPSBuKCkpICU+JSAKICAgICAgICAgIHByaW50ICAgIAp9CmBgYApGb3IgZXhhbXBsZTogVGhpcyBpcyB0aGUgU25hcHNob3RzIGF0IFNlYSBjbGFzc2lmaWNhdGlvbnMgYnkgd29ya2Zsb3cKCmBgYHtyfQpzYXMgJT4lIGZ1bl9jaGVja193b3JrZmxvdygpCmBgYAoKdnMuIHRoYXQgb2YgV2lsZGViZWVzdCBNYXJraW5nIFByb2plY3QKYGBge3J9CndpbGRlICU+JSBmdW5fY2hlY2tfd29ya2Zsb3coKQpgYGAKVnMuIENoaWNhZ28gV2lsZGxpZmUgV2F0Y2gKYGBge3J9CmNoaWNhZ28gJT4lIGZ1bl9jaGVja193b3JrZmxvdygpCmBgYAoKIyMgQmFzaWMgRmxhdHRlbmluZwoKV2l0aCBqc29ubGl0ZSwgeW91IGNhbiBiYXNpY2FsbHkgZmxhdHRlbiBhbGwgb2YgdGhlIGpzb24gZGF0YSBpbnRvIGEgc2VyaWVzIG9mIG5lc3RlZCBsaXN0cy4gVGhpcyB3b3JrcyByZWFsbHkgd2VsbCBmb3Igc2ltcGxlIGRhdGEsIGxpa2UgcXVlc3Rpb25zLCBidXQgbWFya2luZyB0YXNrcyBhbmQgbW9yZSBjb21wbGV4IHdvcmtmbG93cyBnZXQgYSBiaXQgY29tcGxpY2F0ZWQuCgpgYGB7ciBmbGF0dGVuaW5nIH0KbGlicmFyeShqc29ubGl0ZSkKCiNCYXNpYyBGbGF0dGVuaW5nIEZ1bmN0aW9uCmJhc2ljX2ZsYXR0ZW5pbmcgPC0gZnVuY3Rpb24oamRhdGEpIHsKICAgICBvdXQgPC0gbGlzdCgpICNjcmVhdGUgbGlzdCB0byBob2xkIGV2ZXJ5dGhpbmcKICAgICAKICAgICBmb3IgKGkgaW4gMTpkaW0oamRhdGEpWzFdKSB7ICNsb29wIHRocm91Z2ggZWFjaCByb3cgb2YgdGhlIGRhdGFzZXQgYXQgYSB0aW1lCiAgICAgICAgICBjbGFzc2lmaWNhdGlvbl9pZCAgPC0gamRhdGEkY2xhc3NpZmljYXRpb25faWRbaV0gCiAgICAgICAgICBzdWJqZWN0X2lkIDwtIGpkYXRhJHN1YmplY3RfaWRzW2ldIAogICAgICAgICAgc3BsaXRfYW5ubyA8LSBmcm9tSlNPTih0eHQgPSBqZGF0YSRhbm5vdGF0aW9uc1tpXSwgc2ltcGxpZnlEYXRhRnJhbWUgPSBUKSAKICAgICAgICAgIG91dFtbaV1dIDwtIGNiaW5kKGNsYXNzaWZpY2F0aW9uX2lkLCBzdWJqZWN0X2lk
LCBzcGxpdF9hbm5vKQogICAgIH0KICAgICAKICAgICBkby5jYWxsKHdoYXQgPSByYmluZCwgYXJncyA9IG91dCkgICAKfQoKYGBgCgpTaW5nbGUgcXVlc3Rpb25zIGZsYXR0ZW4gYWxyaWdodApgYGB7ciBmbGF0dGVuIHNhc30KZmxhdF9zYXMgPC0gc2FzICU+JSBiYXNpY19mbGF0dGVuaW5nKCkgCnN0cihmbGF0X3NhcykKYGBgCgpCdXQgbW9yZSBjb21wbGV4IHF1ZXN0aW9ucyBwcm9kdWNlIGVtYmVkZGVkIGxpc3RzIGluc2lkZSB0aGUgInZhbHVlIiBjb2x1bW4uCgoKYGBge3J9CmZsYXRfd2lsZGUgPC0gd2lsZGVbMToxMCxdICU+JSBiYXNpY19mbGF0dGVuaW5nKCkgCnN0cihmbGF0X3dpbGRlLCBtYXgubGV2ZWwgPSAyKQpgYGAKCmBgYHtyfQpmbGF0X2tpdHRlaCA8LSBraXR0ZWggJT4lIGJhc2ljX2ZsYXR0ZW5pbmcoKSAKc3RyKGZsYXRfa2l0dGVoLCBtYXgubGV2ZWwgPSAzKQpgYGA=