This code flattens the Chicago Wildlife Watch data.
library(tidyjson)
library(magrittr)
library(jsonlite)
library(dplyr)
library(stringr)
library(tidyr)
chicago_unfiltered <- read.csv("../data/chicago-wildlife-watch-classifications.csv", stringsAsFactors = F)
First, we need to limit the classification data to the final workflow version and, if necessary, split by task. T0 is clearly the only task we really care about in this dataset (though note the changed format of current site).
# check which workflow version we want:
chicago_unfiltered %>% summarise(., n_distinct(subject_ids), n_distinct(classification_id), n_distinct(workflow_version))
quick_check <- chicago_unfiltered %>%
select(., subject_ids, classification_id, workflow_version, annotations) %>%
as.tbl_json(json.column = "annotations") %>%
gather_array(column.name = "task_index") %>% # really important for joining later
spread_values(task = jstring("task"), task_label = jstring("task_label"), value = jstring("value")) %>%
gather_keys %>%
append_values_string()
quick_check %>% data.frame %>% group_by(., workflow_version, key, task) %>% summarise(., classification_count = n()) %>% print
So filter to the appropriate workflow and get going! Let’s take a quick peek at the data.
chicago <- chicago_unfiltered %>% filter(., workflow_version == 397.41)
chicago$annotations[1] %>% prettify()
[
{
"task": "T0",
"value": [
{
"choice": "RCCN",
"answers": {
"HWMN": "1"
},
"filters": {
}
}
]
}
]
# preliminary flat
basic_flat_with_values <- chicago %>%
select(., subject_ids, classification_id, workflow_version, annotations) %>%
as.tbl_json(json.column = "annotations") %>%
gather_array(column.name = "task_index") %>% # really important for joining later
spread_values(task = jstring("task"), task_label = jstring("task_label"), value = jstring("value"))
basic_flat_with_values %>% data.frame %>% head
chicago_summary <- basic_flat_with_values %>%
gather_keys %>%
append_values_string()
chicago_summary %>% data.frame %>% head # this will have all the classification IDs; if Value is empty, then the field will be null. This will have multiple rows per classification if there are multiple tasks completed
chicago_summary %>% data.frame %>% group_by(., workflow_version, key, task) %>% summarise(., n())
# quick check the filtered original data
chicago %>% summarise(., n_distinct(subject_ids), n_distinct(classification_id), n_distinct(workflow_version))
Now dive into the first nested object, the species choice. Note that if you have different task types that you haven’t filtered out, or if you have null objects, this might break or else drop rows.
# grab choices; append embedded array values just for tracking
# Note that this will break if any of the tasks are simple questions. You would need to split by task before here.
chicago_choices <- basic_flat_with_values %>%
enter_object("value") %>% json_lengths(column.name = "total_species") %>%
gather_array(column.name = "species_index") %>% #each classification is an array. so you need to gather up multiple arrays.
spread_values(choice = jstring("choice"), answers = jstring("answers")) #append the answers as characters just in case
# if there are multiple species ID'd, there will be multiple rows and array.index will be >1
chicago_choices %>% data.frame %>% head
chicago_choices %>% group_by(., classification_id) %>% summarise(., count = n(), max(species_index)) %>% arrange(., -count)
Now dive into the second nested object, which is the sub questions. Since these actually aren’t arrays, it’s okay if they’re empty! This still keeps the rows.
# grab answers - for some reason, this keeps rows even if there are no answers!
# Note that this last bit is the part that would need to be customized per team, I think
chicago_answers <- chicago_choices %>%
enter_object("answers") %>%
spread_values(how_many = jstring("HWMN"), wow = jstring("CLCKWWFTHSSNWSMPHT"), off_leash = jstring("CLCKSFDGSFFLSH"))
chicago_answers %>% data.frame %>% head
#chicago_answers %>% group_by(classification_id) %>% summarise(., n())
Put everything back together, which is important if you’ve dropped rows because of empty arrays and things.
# in theory, you want to tie all of these back together just in case there are missing values
add_choices <- left_join(basic_flat_with_values, chicago_choices)
Joining, by = c("subject_ids", "classification_id", "workflow_version", "task_index", "task", "task_label", "value")
tot <- left_join(add_choices, chicago_answers)
Joining, by = c("subject_ids", "classification_id", "workflow_version", "task_index", "task", "task_label", "value", "total_species", "species_index", "choice", "answers")
flat_data <- tot %>% select(., -task_index, -task_label, -value, -answers)
flat_data %>% data.frame %>% head
Here’s your file out!
write.csv(flat_data, file = "../data/chicago-flattened.csv")
LS0tCnRpdGxlOiAiSlNPTi1QYXJzaW5nIFN1cnZleSBUYXNrczogQ2hpY2FnbyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKVGhpcyBjb2RlIGZsYXR0ZW5zIHRoZSBDaGljYWdvIFdpbGRsaWZlIFdhdGNoIGRhdGEuIAoKYGBge3J9CmxpYnJhcnkodGlkeWpzb24pCmxpYnJhcnkobWFncml0dHIpCmxpYnJhcnkoanNvbmxpdGUpCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkoc3RyaW5ncikKbGlicmFyeSh0aWR5cikKCmNoaWNhZ29fdW5maWx0ZXJlZCA8LSByZWFkLmNzdigiLi4vZGF0YS9jaGljYWdvLXdpbGRsaWZlLXdhdGNoLWNsYXNzaWZpY2F0aW9ucy5jc3YiLCBzdHJpbmdzQXNGYWN0b3JzID0gRikKYGBgCgpGaXJzdCwgd2UgbmVlZCB0byBsaW1pdCB0aGUgY2xhc3NpZmljYXRpb24gZGF0YSB0byB0aGUgZmluYWwgd29ya2Zsb3cgdmVyc2lvbiBhbmQsIGlmIG5lY2Vzc2FyeSwgc3BsaXQgYnkgdGFzay4gVDAgaXMgY2xlYXJseSB0aGUgb25seSB0YXNrIHdlIHJlYWxseSBjYXJlIGFib3V0IGluIHRoaXMgZGF0YXNldCAodGhvdWdoIG5vdGUgdGhlIGNoYW5nZWQgZm9ybWF0IG9mIGN1cnJlbnQgc2l0ZSkuIAoKYGBge3J9CiMgY2hlY2sgd2hpY2ggd29ya2Zsb3cgdmVyc2lvbiB3ZSB3YW50OgpjaGljYWdvX3VuZmlsdGVyZWQgJT4lIHN1bW1hcmlzZSguLCBuX2Rpc3RpbmN0KHN1YmplY3RfaWRzKSwgbl9kaXN0aW5jdChjbGFzc2lmaWNhdGlvbl9pZCksIG5fZGlzdGluY3Qod29ya2Zsb3dfdmVyc2lvbikpCgpxdWlja19jaGVjayA8LSBjaGljYWdvX3VuZmlsdGVyZWQgJT4lIAogICAgIHNlbGVjdCguLCBzdWJqZWN0X2lkcywgY2xhc3NpZmljYXRpb25faWQsIHdvcmtmbG93X3ZlcnNpb24sIGFubm90YXRpb25zKSAlPiUKICAgICBhcy50YmxfanNvbihqc29uLmNvbHVtbiA9ICJhbm5vdGF0aW9ucyIpICU+JQogICAgIGdhdGhlcl9hcnJheShjb2x1bW4ubmFtZSA9ICJ0YXNrX2luZGV4IikgJT4lICMgcmVhbGx5IGltcG9ydGFudCBmb3Igam9pbmluZyBsYXRlcgogICAgIHNwcmVhZF92YWx1ZXModGFzayA9IGpzdHJpbmcoInRhc2siKSwgdGFza19sYWJlbCA9IGpzdHJpbmcoInRhc2tfbGFiZWwiKSwgdmFsdWUgPSBqc3RyaW5nKCJ2YWx1ZSIpKSAgJT4lIAogICAgIGdhdGhlcl9rZXlzICU+JQogICAgIGFwcGVuZF92YWx1ZXNfc3RyaW5nKCkKCnF1aWNrX2NoZWNrICU+JSBkYXRhLmZyYW1lICU+JSBncm91cF9ieSguLCB3b3JrZmxvd192ZXJzaW9uLCBrZXksIHRhc2spICU+JSBzdW1tYXJpc2UoLiwgY2xhc3NpZmljYXRpb25fY291bnQgPSBuKCkpICU+JSBwcmludAoKYGBgCgpTbyBmaWx0ZXIgdG8gdGhlIGFwcHJvcHJpYXRlIHdvcmtmbG93IGFuZCBnZXQgZ29pbmchIExldCdzIHRha2UgYSBxdWljayBwZWVrIGF0IHRoZSBkYXRhLgoKYGBge3J9CmNoaWNhZ28gPC0gY2hpY2Fnb191bmZpbHRlcmVkICU+JSBmaWx0ZXIoLiwgd29ya2Zsb3dfdmVyc2lvbiA9PSAzOTcuNDEpCmNoaWNhZ28kYW5ub3RhdGlvbnNbMV0gJT4lIHByZXR0aWZ5KCkKYGBgCgoKYGBge3J9CiMgcHJlbGltaW5hcnkgZmxhdApiYXNpY19mbGF0X3dpdGhfdmFsdWVzIDwtIGNoaWNhZ28gJT4lIAogICAgIHNlbGVjdCguLCBzdWJqZWN0X2lkcywgY2xhc3NpZmljYXRpb25faWQsIHdvcmtmbG93X3ZlcnNpb24sIGFubm90YXRpb25zKSAlPiUKICAgICBhcy50YmxfanNvbihqc29uLmNvbHVtbiA9ICJhbm5vdGF0aW9ucyIpICU+JQogICAgIGdhdGhlcl9hcnJheShjb2x1bW4ubmFtZSA9ICJ0YXNrX2luZGV4IikgJT4lICMgcmVhbGx5IGltcG9ydGFudCBmb3Igam9pbmluZyBsYXRlcgogICAgIHNwcmVhZF92YWx1ZXModGFzayA9IGpzdHJpbmcoInRhc2siKSwgdGFza19sYWJlbCA9IGpzdHJpbmcoInRhc2tfbGFiZWwiKSwgdmFsdWUgPSBqc3RyaW5nKCJ2YWx1ZSIpKSAKCmJhc2ljX2ZsYXRfd2l0aF92YWx1ZXMgJT4lIGRhdGEuZnJhbWUgJT4lIGhlYWQKCmNoaWNhZ29fc3VtbWFyeSA8LSAgYmFzaWNfZmxhdF93aXRoX3ZhbHVlcyAlPiUgCiAgICAgZ2F0aGVyX2tleXMgJT4lCiAgICAgYXBwZW5kX3ZhbHVlc19zdHJpbmcoKQoKY2hpY2Fnb19zdW1tYXJ5ICU+JSBkYXRhLmZyYW1lICU+JSBoZWFkICMgdGhpcyB3aWxsIGhhdmUgYWxsIHRoZSBjbGFzc2lmaWNhdGlvbiBJRHM7IGlmIFZhbHVlIGlzIGVtcHR5LCB0aGVuIHRoZSBmaWVsZCB3aWxsIGJlIG51bGwuIFRoaXMgd2lsbCBoYXZlIG11bHRpcGxlIHJvd3MgcGVyIGNsYXNzaWZpY2F0aW9uIGlmIHRoZXJlIGFyZSBtdWx0aXBsZSB0YXNrcyBjb21wbGV0ZWQKCmNoaWNhZ29fc3VtbWFyeSAlPiUgZGF0YS5mcmFtZSAlPiUgZ3JvdXBfYnkoLiwgd29ya2Zsb3dfdmVyc2lvbiwga2V5LCB0YXNrKSAlPiUgc3VtbWFyaXNlKC4sIG4oKSkKCiMgcXVpY2sgY2hlY2sgdGhlIGZpbHRlcmVkIG9yaWdpbmFsIGRhdGEKY2hpY2FnbyAlPiUgc3VtbWFyaXNlKC4sIG5fZGlzdGluY3Qoc3ViamVjdF9pZHMpLCBuX2Rpc3RpbmN0KGNsYXNzaWZpY2F0aW9uX2lkKSwgbl9kaXN0aW5jdCh3b3JrZmxvd192ZXJzaW9uKSkKYGBgCgpOb3cgZGl2ZSBpbnRvIHRoZSBmaXJzdCBuZXN0ZWQgb2JqZWN0LCB0aGUgc3BlY2llcyBjaG9pY2UuIE5vdGUgdGhhdCBpZiB5b3UgaGF2ZSBkaWZmZXJlbnQgdGFzayB0eXBlcyB0aGF0IHlvdSBoYXZlbid0IGZpbHRlcmVkIG91dCwgb3IgaWYgeW91IGhhdmUgbnVsbCBvYmplY3RzLCB0aGlzIG1pZ2h0IGJyZWFrIG9yIGVsc2UgZHJvcCByb3dzLgoKYGBge3J9CiMgZ3JhYiBjaG9pY2VzOyBhcHBlbmQgZW1iZWRkZWQgYXJyYXkgdmFsdWVzIGp1c3QgZm9yIHRyYWNraW5nCiMgTm90ZSB0aGF0IHRoaXMgd2lsbCBicmVhayBpZiBhbnkgb2YgdGhlIHRhc2tzIGFyZSBzaW1wbGUgcXVlc3Rpb25zLiBZb3Ugd291bGQgbmVlZCB0byBzcGxpdCBieSB0YXNrIGJlZm9yZSBoZXJlLgpjaGljYWdvX2Nob2ljZXMgPC0gYmFzaWNfZmxhdF93aXRoX3ZhbHVlcyAlPiUKICAgICBlbnRlcl9vYmplY3QoInZhbHVlIikgJT4lIGpzb25fbGVuZ3Rocyhjb2x1bW4ubmFtZSA9ICJ0b3RhbF9zcGVjaWVzIikgJT4lIAogICAgIGdhdGhlcl9hcnJheShjb2x1bW4ubmFtZSA9ICJzcGVjaWVzX2luZGV4IikgJT4lICNlYWNoIGNsYXNzaWZpY2F0aW9uIGlzIGFuIGFycmF5LiBzbyB5b3UgbmVlZCB0byBnYXRoZXIgdXAgbXVsdGlwbGUgYXJyYXlzLgogICAgIHNwcmVhZF92YWx1ZXMoY2hvaWNlID0ganN0cmluZygiY2hvaWNlIiksIGFuc3dlcnMgPSBqc3RyaW5nKCJhbnN3ZXJzIikpICNhcHBlbmQgdGhlIGFuc3dlcnMgYXMgY2hhcmFjdGVycyBqdXN0IGluIGNhc2UKCiMgaWYgdGhlcmUgYXJlIG11bHRpcGxlIHNwZWNpZXMgSUQnZCwgdGhlcmUgd2lsbCBiZSBtdWx0aXBsZSByb3dzIGFuZCBhcnJheS5pbmRleCB3aWxsIGJlID4xCmNoaWNhZ29fY2hvaWNlcyAlPiUgZGF0YS5mcmFtZSAlPiUgaGVhZCAKY2hpY2Fnb19jaG9pY2VzICU+JSBncm91cF9ieSguLCBjbGFzc2lmaWNhdGlvbl9pZCkgJT4lIHN1bW1hcmlzZSguLCBjb3VudCA9IG4oKSwgbWF4KHNwZWNpZXNfaW5kZXgpKSAlPiUgYXJyYW5nZSguLCAtY291bnQpCmBgYAoKTm93IGRpdmUgaW50byB0aGUgc2Vjb25kIG5lc3RlZCBvYmplY3QsIHdoaWNoIGlzIHRoZSBzdWIgcXVlc3Rpb25zLiBTaW5jZSB0aGVzZSBhY3R1YWxseSBhcmVuJ3QgYXJyYXlzLCBpdCdzIG9rYXkgaWYgdGhleSdyZSBlbXB0eSEgVGhpcyBzdGlsbCBrZWVwcyB0aGUgcm93cy4KYGBge3J9CiMgZ3JhYiBhbnN3ZXJzIC0gZm9yIHNvbWUgcmVhc29uLCB0aGlzIGtlZXBzIHJvd3MgZXZlbiBpZiB0aGVyZSBhcmUgbm8gYW5zd2VycyEgCiMgTm90ZSB0aGF0IHRoaXMgbGFzdCBiaXQgaXMgdGhlIHBhcnQgdGhhdCB3b3VsZCBuZWVkIHRvIGJlIGN1c3RvbWl6ZWQgcGVyIHRlYW0sIEkgdGhpbmsKY2hpY2Fnb19hbnN3ZXJzIDwtIGNoaWNhZ29fY2hvaWNlcyAlPiUgCiAgICAgZW50ZXJfb2JqZWN0KCJhbnN3ZXJzIikgJT4lIAogICAgIHNwcmVhZF92YWx1ZXMoaG93X21hbnkgPSBqc3RyaW5nKCJIV01OIiksIHdvdyA9IGpzdHJpbmcoIkNMQ0tXV0ZUSFNTTldTTVBIVCIpLCBvZmZfbGVhc2ggPSBqc3RyaW5nKCJDTENLU0ZER1NGRkxTSCIpKQoKY2hpY2Fnb19hbnN3ZXJzICU+JSBkYXRhLmZyYW1lICU+JSBoZWFkICAgICAgCiNjaGljYWdvX2Fuc3dlcnMgJT4lIGdyb3VwX2J5KGNsYXNzaWZpY2F0aW9uX2lkKSAlPiUgc3VtbWFyaXNlKC4sIG4oKSkgICAgIApgYGAKClB1dCBldmVyeXRoaW5nIGJhY2sgdG9nZXRoZXIsIHdoaWNoIGlzIGltcG9ydGFudCBpZiB5b3UndmUgZHJvcHBlZCByb3dzIGJlY2F1c2Ugb2YgZW1wdHkgYXJyYXlzIGFuZCB0aGluZ3MuCmBgYHtyfQojIGluIHRoZW9yeSwgeW91IHdhbnQgdG8gdGllIGFsbCBvZiB0aGVzZSBiYWNrIHRvZ2V0aGVyIGp1c3QgaW4gY2FzZSB0aGVyZSBhcmUgbWlzc2luZyB2YWx1ZXMKYWRkX2Nob2ljZXMgPC0gbGVmdF9qb2luKGJhc2ljX2ZsYXRfd2l0aF92YWx1ZXMsIGNoaWNhZ29fY2hvaWNlcykKdG90IDwtIGxlZnRfam9pbihhZGRfY2hvaWNlcywgY2hpY2Fnb19hbnN3ZXJzKQpmbGF0X2RhdGEgPC0gdG90ICU+JSBzZWxlY3QoLiwgLXRhc2tfaW5kZXgsIC10YXNrX2xhYmVsLCAtdmFsdWUsIC1hbnN3ZXJzKQoKZmxhdF9kYXRhICU+JSBkYXRhLmZyYW1lICU+JSBoZWFkCmBgYAoKSGVyZSdzIHlvdXIgZmlsZSBvdXQhCmBgYHtyfQp3cml0ZS5jc3YoZmxhdF9kYXRhLCBmaWxlID0gIi4uL2RhdGEvY2hpY2Fnby1mbGF0dGVuZWQuY3N2IikKYGBgCgo=