knitr::opts_chunk$set(echo = TRUE)

library(data.table)
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday,
##     week, yday, year
## The following object is masked from 'package:base':
## 
##     date
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract
library(jsonlite)
library(tidyjson)
## 
## Attaching package: 'tidyjson'
## The following object is masked from 'package:jsonlite':
## 
##     read_json
library(stringr)
library(ggplot2)

Pull in the data. Note that fread() doesn't properly parse the quotes around the JSON blobs in this export, so you need to use read.csv() even though it's roughly 5x slower.

# fread() would be faster but mangles the quoted JSON columns (see note above):
# classifications <- fread("../Data/snapshots-at-sea-classifications.csv", select = c(1:8, 12, 14))
classifications <- read.csv("../Data/snapshots-at-sea-classifications.csv", stringsAsFactors = FALSE)
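
A quick sanity check (a sketch): if the embedded quotes were parsed correctly, every annotations value should still be a bracketed JSON array.

# hedged check: each annotations blob should start with "[" if the quoting
# survived the parse (startsWith() needs R >= 3.3)
stopifnot(all(startsWith(classifications$annotations, "[")))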

Explore the data to identify which workflows, versions, and date ranges to keep.

classifications %<>% mutate(., created_at = ymd_hms(created_at)) # parse created_at as a proper date-time
summary(classifications)
##  classification_id   user_name            user_id       
##  Min.   : 2896902   Length:1085163     Min.   :      1  
##  1st Qu.:11778124   Class :character   1st Qu.: 321256  
##  Median :19997101   Mode  :character   Median :1173173  
##  Mean   :18538431                      Mean   : 927135  
##  3rd Qu.:21721566                      3rd Qu.:1466965  
##  Max.   :55217257                      Max.   :1653861  
##                                        NA's   :146990   
##    user_ip           workflow_id    workflow_name      workflow_version
##  Length:1085163     Min.   :427.0   Length:1085163     Min.   :2.100   
##  Class :character   1st Qu.:427.0   Class :character   1st Qu.:4.100   
##  Mode  :character   Median :433.0   Mode  :character   Median :4.160   
##                     Mean   :470.9                      Mean   :4.541   
##                     3rd Qu.:505.0                      3rd Qu.:6.200   
##                     Max.   :566.0                      Max.   :6.200   
##                                                                        
##    created_at                  gold_standard   expert       
##  Min.   :2015-09-14 02:56:45   Mode:logical   Mode:logical  
##  1st Qu.:2016-04-27 23:27:54   NA's:1085163   NA's:1085163  
##  Median :2016-10-31 17:06:52                                
##  Mean   :2016-08-22 15:37:52                                
##  3rd Qu.:2016-11-19 18:44:51                                
##  Max.   :2017-05-18 17:27:18                                
##                                                             
##    metadata         annotations        subject_data      
##  Length:1085163     Length:1085163     Length:1085163    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   subject_ids     
##  Min.   : 919796  
##  1st Qu.:1313143  
##  Median :4166347  
##  Mean   :3261927  
##  3rd Qu.:4179639  
##  Max.   :6621275  
## 
# classification volume and date range per workflow/version
classification_summary <- classifications %>%
     group_by(., workflow_id, workflow_version) %>% 
     summarise(., count = n(), text = first(workflow_name), first = min(created_at), last = max(created_at)) %>% 
     arrange(., workflow_version)

print(classification_summary)
## Source: local data frame [30 x 6]
## Groups: workflow_id [6]
## 
##    workflow_id workflow_version count
##          <int>            <dbl> <int>
## 1          433             2.10     5
## 2          433             2.40     1
## 3          433             2.80    76
## 4          433             3.10     3
## 5          504             3.10    52
## 6          505             3.10    10
## 7          506             3.10    42
## 8          566             3.10    49
## 9          427             3.12    95
## 10         505             3.13   123
## # ... with 20 more rows, and 3 more variables: text <chr>, first <dttm>,
## #   last <dttm>
# filter to relevant workflow versions & workflow ID
workflow_id_set <- 505
workflow_version_set <- 4.16

current_data <- classifications %>% 
     filter(., workflow_id == workflow_id_set, workflow_version == workflow_version_set) %>%
     arrange(., created_at) %>%
     mutate(., counter = row_number())

# which classifications do you keep? look at the timing of them...
ggplot(data = current_data, aes(x = created_at, y = counter)) + geom_line()

Limit date range

# keep only classifications from 2017 onward
current_data %<>% filter(., created_at > ymd("2017-01-01"))

#rm(classifications) #drop the original file from memory because it's too massive
summary(current_data)
##  classification_id   user_name            user_id       
##  Min.   :52710973   Length:29489       Min.   :      6  
##  1st Qu.:53465400   Class :character   1st Qu.:1631715  
##  Median :53839954   Mode  :character   Median :1647976  
##  Mean   :53854493                      Mean   :1467033  
##  3rd Qu.:54179106                      3rd Qu.:1650900  
##  Max.   :55217257                      Max.   :1653861  
##                                        NA's   :7909     
##    user_ip           workflow_id  workflow_name      workflow_version
##  Length:29489       Min.   :505   Length:29489       Min.   :4.16    
##  Class :character   1st Qu.:505   Class :character   1st Qu.:4.16    
##  Mode  :character   Median :505   Mode  :character   Median :4.16    
##                     Mean   :505                      Mean   :4.16    
##                     3rd Qu.:505                      3rd Qu.:4.16    
##                     Max.   :505                      Max.   :4.16    
##                                                                      
##    created_at                  gold_standard   expert       
##  Min.   :2017-05-05 21:12:55   Mode:logical   Mode:logical  
##  1st Qu.:2017-05-10 19:16:04   NA's:29489     NA's:29489    
##  Median :2017-05-12 17:29:39                                
##  Mean   :2017-05-13 04:24:58                                
##  3rd Qu.:2017-05-15 12:05:49                                
##  Max.   :2017-05-18 17:27:18                                
##                                                             
##    metadata         annotations        subject_data      
##  Length:29489       Length:29489       Length:29489      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   subject_ids         counter      
##  Min.   :6579670   Min.   : 76530  
##  1st Qu.:6607848   1st Qu.: 83902  
##  Median :6610303   Median : 91274  
##  Mean   :6608928   Mean   : 91274  
##  3rd Qu.:6616718   3rd Qu.: 98646  
##  Max.   :6621275   Max.   :106018  
## 

Parsing/Flattening JSON

Some notes on JSON.

  1. Square brackets ([]) mark the start and end of a JSON array, and both rjson and jsonlite have trouble interpreting a whole column of JSON blobs at once. You can force them to read through the column one row at a time, carrying the classification ID along from the other columns so you can link everything back together later (see the sketch after this list). Or you can just use tidyjson, which does this for you. Wish I'd known that 6 hours ago.

  2. Note that tidyjson struggles to navigate subject_data because the subject ID is given as the key itself, not as the value in a key:value pair (a sketch of one workaround follows the flattening step below).
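
Here is a minimal sketch of the row-at-a-time approach from note 1, assuming the value field is a plain string as it is in this workflow; classification_id is carried along so the parsed rows can be linked back later.

# hedged sketch: parse each annotations blob separately with jsonlite and
# keep the classification ID so the pieces can be linked back afterwards
parsed_list <- lapply(seq_len(nrow(current_data)), function(i) {
     blob <- fromJSON(current_data$annotations[i], simplifyDataFrame = TRUE)
     blob$classification_id <- current_data$classification_id[i]
     blob
})
parsed <- bind_rows(parsed_list)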

# jsonlite and rjson only work if you loop through each record, because the blobs are separate arrays; likewise, every JSON parser struggles with subject_data because the subject ID is stored as the key itself, not as a value in an id:number key:value pairing.

filtered_dat <- current_data # work on a copy of the date-filtered classifications
flattened <- filtered_dat %>% 
     select(., -subject_data, -metadata) %>%       # drop the JSON columns we aren't parsing here
     as.tbl_json(json.column = "annotations") %>%  # treat annotations as the JSON column
     gather_array %>%                              # one row per element of each annotation array
     spread_values(
               task = jstring("task"),
               task_label = jstring("task_label"),
               value = jstring("value")
     )

head(flattened)
##   classification_id user_name user_id              user_ip workflow_id
## 1          52710973 tedcheese    5254 95b009a03859f2e5e0b7         505
## 2          52710983 tedcheese    5254 95b009a03859f2e5e0b7         505
## 3          52710989 tedcheese    5254 95b009a03859f2e5e0b7         505
## 4          52711001 tedcheese    5254 95b009a03859f2e5e0b7         505
## 5          52711029 tedcheese    5254 95b009a03859f2e5e0b7         505
## 6          52711043 tedcheese    5254 95b009a03859f2e5e0b7         505
##                               workflow_name workflow_version
## 1 3. Is there a fluke (tail) in this photo?             4.16
## 2 3. Is there a fluke (tail) in this photo?             4.16
## 3 3. Is there a fluke (tail) in this photo?             4.16
## 4 3. Is there a fluke (tail) in this photo?             4.16
## 5 3. Is there a fluke (tail) in this photo?             4.16
## 6 3. Is there a fluke (tail) in this photo?             4.16
##            created_at gold_standard expert subject_ids counter array.index
## 1 2017-05-05 21:12:55            NA     NA     6619470   76530           1
## 2 2017-05-05 21:12:58            NA     NA     6609359   76531           1
## 3 2017-05-05 21:13:01            NA     NA     6616926   76532           1
## 4 2017-05-05 21:13:04            NA     NA     6619715   76533           1
## 5 2017-05-05 21:13:15            NA     NA     6613536   76534           1
## 6 2017-05-05 21:13:18            NA     NA     6612132   76535           1
##   task
## 1 init
## 2 init
## 3 init
## 4 init
## 5 init
## 6 init
##                                                                                      task_label
## 1 4. Is there a fluke (tail) with its underside visible in this photo?  (see help for examples)
## 2 4. Is there a fluke (tail) with its underside visible in this photo?  (see help for examples)
## 3 4. Is there a fluke (tail) with its underside visible in this photo?  (see help for examples)
## 4 4. Is there a fluke (tail) with its underside visible in this photo?  (see help for examples)
## 5 4. Is there a fluke (tail) with its underside visible in this photo?  (see help for examples)
## 6 4. Is there a fluke (tail) with its underside visible in this photo?  (see help for examples)
##   value
## 1   Yes
## 2    No
## 3    No
## 4    No
## 5   Yes
## 6    No
dim(flattened)
## [1] 29489    16
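
For note 2 above, one workaround (a sketch) is to treat the key itself as data: gather_object() is the current tidyjson verb for this (older releases call it gather_keys()), and the subject_id column name below is just a label chosen here.

# hedged sketch: the subject ID is the object's key, so gathering the object
# lands the key in its own column
subject_flat <- filtered_dat %>%
     select(., classification_id, subject_data) %>%
     as.tbl_json(json.column = "subject_data") %>%
     gather_object("subject_id")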

Aggregation

aggregated <- flattened %>% 
     mutate(., answer = ifelse(value == "No", 0, 1)) %>%  # code No as 0, anything else as 1
     group_by(., subject_ids, workflow_id, workflow_name, workflow_version) %>% 
     summarise(., total_votes = n(), total_yes = sum(answer), prop_yes = sum(answer)/n()) %>% 
     mutate(., keep = as.factor(ifelse(prop_yes > 0.45, "T", "F"))) # with 5 votes, > 0.45 keeps 3+ yes votes
     
glimpse(aggregated)
## Observations: 5,643
## Variables: 8
## $ subject_ids      <int> 6579670, 6579671, 6579672, 6579673, 6579674, ...
## $ workflow_id      <int> 505, 505, 505, 505, 505, 505, 505, 505, 505, ...
## $ workflow_name    <chr> "3. Is there a fluke (tail) in this photo?", ...
## $ workflow_version <dbl> 4.16, 4.16, 4.16, 4.16, 4.16, 4.16, 4.16, 4.1...
## $ total_votes      <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
## $ total_yes        <dbl> 0, 1, 0, 1, 1, 0, 2, 2, 3, 4, 0, 0, 1, 5, 3, ...
## $ prop_yes         <dbl> 0.0, 0.2, 0.0, 0.2, 0.2, 0.0, 0.4, 0.4, 0.6, ...
## $ keep             <fctr> F, F, F, F, F, F, F, F, T, T, F, F, F, T, T,...
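
The 0.45 cutoff deserves a quick sanity check (a sketch): among subjects with the typical five votes, it should keep exactly those with three or more yes votes.

# hedged check: for five-vote subjects, 0-2 yes votes should map to F and 3-5 to T
aggregated %>%
     ungroup(.) %>%
     filter(., total_votes == 5) %>%
     count(total_yes, keep)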

Keep set for next workflow

subjects_Q4 <- aggregated %>% filter(., keep == "T")
summary(subjects_Q4)
##   subject_ids       workflow_id  workflow_name      workflow_version
##  Min.   :6579678   Min.   :505   Length:1408        Min.   :4.16    
##  1st Qu.:6608126   1st Qu.:505   Class :character   1st Qu.:4.16    
##  Median :6612302   Median :505   Mode  :character   Median :4.16    
##  Mean   :6609659   Mean   :505                      Mean   :4.16    
##  3rd Qu.:6617639   3rd Qu.:505                      3rd Qu.:4.16    
##  Max.   :6621275   Max.   :505                      Max.   :4.16    
##   total_votes       total_yes         prop_yes      keep    
##  Min.   : 4.000   Min.   : 2.000   Min.   :0.5000   F:   0  
##  1st Qu.: 5.000   1st Qu.: 3.000   1st Qu.:0.6000   T:1408  
##  Median : 5.000   Median : 4.000   Median :0.8000           
##  Mean   : 5.167   Mean   : 4.244   Mean   :0.8237           
##  3rd Qu.: 5.000   3rd Qu.: 5.000   3rd Qu.:1.0000           
##  Max.   :62.000   Max.   :52.000   Max.   :1.0000
write.csv(x = subjects_Q4, file = "../Data/subjects_for_Q4.csv", row.names = FALSE) # drop row names from the export