knitr::opts_chunk$set(echo = TRUE)
library(data.table)
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday,
## week, yday, year
## The following object is masked from 'package:base':
##
## date
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
library(jsonlite)
library(tidyjson)
##
## Attaching package: 'tidyjson'
## The following object is masked from 'package:jsonlite':
##
## read_json
library(stringr)
library(ggplot2)
Pull in the data. Note that fread() doesn't properly parse the quotes around the JSON blobs, so use read.csv() even though it's roughly 5x slower.
#classifications <- fread("../Data/snapshots-at-sea-classifications.csv", select = c(1:8, 12, 14))
classifications <- read.csv("../Data/snapshots-at-sea-classifications.csv", stringsAsFactors = FALSE)
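If the 5x slowdown matters, one alternative worth testing (an assumption on my part; readr isn't used anywhere else in this script) is readr::read_csv(), which copes with the doubled quotes wrapping the JSON blobs and is usually much faster than base read.csv():
# Hypothetical alternative, assuming the readr package is installed.
# read_csv() parses standard CSV quoting and never creates factors; check
# that its guessed column types match read.csv's before swapping it in.
# classifications <- readr::read_csv("../Data/snapshots-at-sea-classifications.csv")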
Explore the data to identify which workflow IDs, workflow versions, and date ranges to keep.
classifications %<>% mutate(., created_at = ymd_hms(created_at)) # parse created_at as a POSIXct date-time
summary(classifications)
## classification_id user_name user_id
## Min. : 2896902 Length:1085163 Min. : 1
## 1st Qu.:11778124 Class :character 1st Qu.: 321256
## Median :19997101 Mode :character Median :1173173
## Mean :18538431 Mean : 927135
## 3rd Qu.:21721566 3rd Qu.:1466965
## Max. :55217257 Max. :1653861
## NA's :146990
## user_ip workflow_id workflow_name workflow_version
## Length:1085163 Min. :427.0 Length:1085163 Min. :2.100
## Class :character 1st Qu.:427.0 Class :character 1st Qu.:4.100
## Mode :character Median :433.0 Mode :character Median :4.160
## Mean :470.9 Mean :4.541
## 3rd Qu.:505.0 3rd Qu.:6.200
## Max. :566.0 Max. :6.200
##
## created_at gold_standard expert
## Min. :2015-09-14 02:56:45 Mode:logical Mode:logical
## 1st Qu.:2016-04-27 23:27:54 NA's:1085163 NA's:1085163
## Median :2016-10-31 17:06:52
## Mean :2016-08-22 15:37:52
## 3rd Qu.:2016-11-19 18:44:51
## Max. :2017-05-18 17:27:18
##
## metadata annotations subject_data
## Length:1085163 Length:1085163 Length:1085163
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## subject_ids
## Min. : 919796
## 1st Qu.:1313143
## Median :4166347
## Mean :3261927
## 3rd Qu.:4179639
## Max. :6621275
##
classification_summary <- classifications %>%
  group_by(., workflow_id, workflow_version) %>%
  summarise(., count = n(), text = first(workflow_name), first = min(created_at), last = max(created_at)) %>%
  arrange(., workflow_version)
print(classification_summary)
## Source: local data frame [30 x 6]
## Groups: workflow_id [6]
##
## workflow_id workflow_version count
## <int> <dbl> <int>
## 1 433 2.10 5
## 2 433 2.40 1
## 3 433 2.80 76
## 4 433 3.10 3
## 5 504 3.10 52
## 6 505 3.10 10
## 7 506 3.10 42
## 8 566 3.10 49
## 9 427 3.12 95
## 10 505 3.13 123
## # ... with 20 more rows, and 3 more variables: text <chr>, first <dttm>,
## # last <dttm>
# filter to relevant workflow versions & workflow ID
workflow_id_set <- 505
workflow_version_set <- 4.16
current_data <- classifications %>%
  filter(., workflow_id == workflow_id_set, workflow_version == workflow_version_set) %>%
  arrange(., created_at) %>%
  mutate(., counter = row_number())
# which classifications do you keep? look at their timing...
ggplot(data = current_data, aes(x = created_at, y = counter)) + geom_line()
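If it helps to see where the cut will fall, you can overlay the cutoff used in the next step (a sketch; as.numeric() sidesteps differences across ggplot2 versions in how geom_vline() handles POSIXct intercepts):
# mark the 2017-01-01 cutoff on the timing plot
ggplot(data = current_data, aes(x = created_at, y = counter)) +
  geom_line() +
  geom_vline(xintercept = as.numeric(ymd_hms("2017-01-01 00:00:00")), linetype = "dashed")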
Limit date range
# filter to data from 2017-01-01 onward
current_data %<>% filter(., created_at > ymd("2017-01-01"))
# rm(classifications) # drop the original file from memory because it's too massive
summary(current_data)
## classification_id user_name user_id
## Min. :52710973 Length:29489 Min. : 6
## 1st Qu.:53465400 Class :character 1st Qu.:1631715
## Median :53839954 Mode :character Median :1647976
## Mean :53854493 Mean :1467033
## 3rd Qu.:54179106 3rd Qu.:1650900
## Max. :55217257 Max. :1653861
## NA's :7909
## user_ip workflow_id workflow_name workflow_version
## Length:29489 Min. :505 Length:29489 Min. :4.16
## Class :character 1st Qu.:505 Class :character 1st Qu.:4.16
## Mode :character Median :505 Mode :character Median :4.16
## Mean :505 Mean :4.16
## 3rd Qu.:505 3rd Qu.:4.16
## Max. :505 Max. :4.16
##
## created_at gold_standard expert
## Min. :2017-05-05 21:12:55 Mode:logical Mode:logical
## 1st Qu.:2017-05-10 19:16:04 NA's:29489 NA's:29489
## Median :2017-05-12 17:29:39
## Mean :2017-05-13 04:24:58
## 3rd Qu.:2017-05-15 12:05:49
## Max. :2017-05-18 17:27:18
##
## metadata annotations subject_data
## Length:29489 Length:29489 Length:29489
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## subject_ids counter
## Min. :6579670 Min. : 76530
## 1st Qu.:6607848 1st Qu.: 83902
## Median :6610303 Median : 91274
## Mean :6608928 Mean : 91274
## 3rd Qu.:6616718 3rd Qu.: 98646
## Max. :6621275 Max. :106018
##
It appears that square brackets ([ and ]) mark the start and end of a JSON array, and both rjson and jsonlite have trouble interpreting a vector of JSON blobs. You can force them to read through the column one row at a time, grabbing the classification ID from the other columns so you can link the results back later. Or you can just use tidyjson, which does this for you. Wish I'd known that 6 hours ago.
Note that tidyjson struggles to navigate the structure of subject_data because the subject ID is stored as the JSON key itself, not as the value in a key:value pair; see the sketches below.
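Both workarounds, sketched but not run here (the "Filename" field is hypothetical, and older tidyjson releases call gather_object() gather_keys()):
# Sketch 1: force jsonlite through the column one row at a time, carrying
# classification_id along so the results can be linked back later.
# ann_list <- lapply(seq_len(nrow(current_data)), function(i) {
#   ann <- jsonlite::fromJSON(current_data$annotations[i], simplifyVector = FALSE)
#   data.frame(classification_id = current_data$classification_id[i],
#              task = vapply(ann, `[[`, character(1), "task"),
#              stringsAsFactors = FALSE)
# })
# annotations_long <- do.call(rbind, ann_list)

# Sketch 2: cope with subject_data's ID-as-key layout by promoting the key
# to an ordinary column; "Filename" stands in for whatever metadata fields
# the project actually uploaded.
# subject_flat <- current_data %>%
#   select(., classification_id, subject_data) %>%
#   as.tbl_json(json.column = "subject_data") %>%
#   gather_object %>%   # the subject ID lands in the "name" column
#   spread_values(filename = jstring("Filename"))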
# jsonlite and rjson only work if you loop through each record, because each
# row is its own JSON array; subject_data trips up every parser because the
# subject ID is stored as the key itself, not as a value in an ID:IDNumber
# key:value pairing.
filtered_dat <- current_data
flattened <- filtered_dat %>%
  select(., -subject_data, -metadata) %>%      # drop the other JSON columns
  as.tbl_json(json.column = "annotations") %>% # treat annotations as the JSON document
  gather_array %>%                             # one row per task in each annotation array
  spread_values(                               # pull the fields we need out as columns
    task = jstring("task"),
    task_label = jstring("task_label"),
    value = jstring("value")
  )
head(flattened)
## classification_id user_name user_id user_ip workflow_id
## 1 52710973 tedcheese 5254 95b009a03859f2e5e0b7 505
## 2 52710983 tedcheese 5254 95b009a03859f2e5e0b7 505
## 3 52710989 tedcheese 5254 95b009a03859f2e5e0b7 505
## 4 52711001 tedcheese 5254 95b009a03859f2e5e0b7 505
## 5 52711029 tedcheese 5254 95b009a03859f2e5e0b7 505
## 6 52711043 tedcheese 5254 95b009a03859f2e5e0b7 505
## workflow_name workflow_version
## 1 3. Is there a fluke (tail) in this photo? 4.16
## 2 3. Is there a fluke (tail) in this photo? 4.16
## 3 3. Is there a fluke (tail) in this photo? 4.16
## 4 3. Is there a fluke (tail) in this photo? 4.16
## 5 3. Is there a fluke (tail) in this photo? 4.16
## 6 3. Is there a fluke (tail) in this photo? 4.16
## created_at gold_standard expert subject_ids counter array.index
## 1 2017-05-05 21:12:55 NA NA 6619470 76530 1
## 2 2017-05-05 21:12:58 NA NA 6609359 76531 1
## 3 2017-05-05 21:13:01 NA NA 6616926 76532 1
## 4 2017-05-05 21:13:04 NA NA 6619715 76533 1
## 5 2017-05-05 21:13:15 NA NA 6613536 76534 1
## 6 2017-05-05 21:13:18 NA NA 6612132 76535 1
## task
## 1 init
## 2 init
## 3 init
## 4 init
## 5 init
## 6 init
## task_label
## 1 4. Is there a fluke (tail) with its underside visible in this photo? (see help for examples)
## 2 4. Is there a fluke (tail) with its underside visible in this photo? (see help for examples)
## 3 4. Is there a fluke (tail) with its underside visible in this photo? (see help for examples)
## 4 4. Is there a fluke (tail) with its underside visible in this photo? (see help for examples)
## 5 4. Is there a fluke (tail) with its underside visible in this photo? (see help for examples)
## 6 4. Is there a fluke (tail) with its underside visible in this photo? (see help for examples)
## value
## 1 Yes
## 2 No
## 3 No
## 4 No
## 5 Yes
## 6 No
dim(flattened)
## [1] 29489 16
aggregated <- flattened %>%
  mutate(., answer = ifelse(value == "No", 0, 1)) %>% # recode the vote: No = 0, anything else = 1
  group_by(., subject_ids, workflow_id, workflow_name, workflow_version) %>%
  summarise(., total_votes = n(), total_yes = sum(answer), prop_yes = sum(answer)/n()) %>%
  mutate(., keep = as.factor(ifelse(prop_yes > 0.45, "T", "F"))) # keep a subject once roughly half its votes are Yes
glimpse(aggregated)
## Observations: 5,643
## Variables: 8
## $ subject_ids <int> 6579670, 6579671, 6579672, 6579673, 6579674, ...
## $ workflow_id <int> 505, 505, 505, 505, 505, 505, 505, 505, 505, ...
## $ workflow_name <chr> "3. Is there a fluke (tail) in this photo?", ...
## $ workflow_version <dbl> 4.16, 4.16, 4.16, 4.16, 4.16, 4.16, 4.16, 4.1...
## $ total_votes <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
## $ total_yes <dbl> 0, 1, 0, 1, 1, 0, 2, 2, 3, 4, 0, 0, 1, 5, 3, ...
## $ prop_yes <dbl> 0.0, 0.2, 0.0, 0.2, 0.2, 0.0, 0.4, 0.4, 0.6, ...
## $ keep <fctr> F, F, F, F, F, F, F, F, T, T, F, F, F, T, T,...
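An optional sanity check on the 0.45 cutoff (a sketch, not run here; with 5 votes per subject it keeps anything with 3 or more Yes votes):
# How many subjects would survive at different cutoffs?
# table(aggregated$keep)
# sapply(c(0.25, 0.45, 0.6, 0.8), function(th) sum(aggregated$prop_yes > th))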
Save the keep set for the next workflow.
subjects_Q4 <- aggregated %>% filter(., keep == "T")
summary(subjects_Q4)
## subject_ids workflow_id workflow_name workflow_version
## Min. :6579678 Min. :505 Length:1408 Min. :4.16
## 1st Qu.:6608126 1st Qu.:505 Class :character 1st Qu.:4.16
## Median :6612302 Median :505 Mode :character Median :4.16
## Mean :6609659 Mean :505 Mean :4.16
## 3rd Qu.:6617639 3rd Qu.:505 3rd Qu.:4.16
## Max. :6621275 Max. :505 Max. :4.16
## total_votes total_yes prop_yes keep
## Min. : 4.000 Min. : 2.000 Min. :0.5000 F: 0
## 1st Qu.: 5.000 1st Qu.: 3.000 1st Qu.:0.6000 T:1408
## Median : 5.000 Median : 4.000 Median :0.8000
## Mean : 5.167 Mean : 4.244 Mean :0.8237
## 3rd Qu.: 5.000 3rd Qu.: 5.000 3rd Qu.:1.0000
## Max. :62.000 Max. :52.000 Max. :1.0000
write.csv(x = subjects_Q4, file = "../Data/subjects_for_Q4.csv")
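One last note: write.csv() includes row names as an extra unnamed first column by default. If whatever consumes subjects_for_Q4.csv doesn't expect that, add row.names = FALSE:
# write.csv(x = subjects_Q4, file = "../Data/subjects_for_Q4.csv", row.names = FALSE)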