# Scientific data paper: https://www.nature.com/articles/sdata201526

# Link to Dryad: https://datadryad.org/stash/dataset/doi:10.5061/dryad.5pt92
# You can download the data from there as doi_10.5061_dryad.5pt92__v1.zip

# Once extracted, use fread from the data.table package to quickly load 1Gb csv file (310Mb compressed)
all_image_data = read_csv("raw_data_for_dryad.csv.zip")
## Parsed with column specification:
## cols(
##   CaptureEventID = col_character(),
##   ClassificationID = col_character(),
##   UserID = col_character(),
##   Species = col_character(),
##   Count = col_double(),
##   Standing = col_double(),
##   Resting = col_double(),
##   Moving = col_double(),
##   Eating = col_double(),
##   Interacting = col_double(),
##   Babies = col_double()
## )
nrow(all_image_data)
## [1] 10530564
head(all_image_data)
## # A tibble: 6 x 11
##   CaptureEventID ClassificationID UserID Species Count Standing Resting Moving
##   <chr>          <chr>            <chr>  <chr>   <dbl>    <dbl>   <dbl>  <dbl>
## 1 ASG0000001     50c79e2c9177d00… not-l… human       1        1       0      0
## 2 ASG0000001     50c7c0f9dea72b4… cb2e6… human       1        1       0      0
## 3 ASG0000001     50c80a57b58fdd4… 5e3c1… human       1        1       0      0
## 4 ASG0000001     50c81492b58fdd1… not-l… human       1        0       0      0
## 5 ASG0000001     50c8a96c56175a5… 0c165… human       1        0       0      0
## 6 ASG0000001     50cce4699a35a62… not-l… human       1        1       0      0
## # … with 3 more variables: Eating <dbl>, Interacting <dbl>, Babies <dbl>
# Let's have a look at the first 'capture event'
subset(all_image_data, CaptureEventID == "ASG0000001")
## # A tibble: 18 x 11
##    CaptureEventID ClassificationID UserID Species Count Standing Resting Moving
##    <chr>          <chr>            <chr>  <chr>   <dbl>    <dbl>   <dbl>  <dbl>
##  1 ASG0000001     50c79e2c9177d00… not-l… human       1        1       0      0
##  2 ASG0000001     50c7c0f9dea72b4… cb2e6… human       1        1       0      0
##  3 ASG0000001     50c80a57b58fdd4… 5e3c1… human       1        1       0      0
##  4 ASG0000001     50c81492b58fdd1… not-l… human       1        0       0      0
##  5 ASG0000001     50c8a96c56175a5… 0c165… human       1        0       0      0
##  6 ASG0000001     50cce4699a35a62… not-l… human       1        1       0      0
##  7 ASG0000001     50cd013157d38e1… 09448… human       1        0       0      0
##  8 ASG0000001     50cd2fc09a35a63… not-l… blank       0        0       0      0
##  9 ASG0000001     50cdc52357d38e3… 73043… human       1        1       0      0
## 10 ASG0000001     50cdd122b58fdd0… not-l… human       1        0       0      0
## 11 ASG0000001     50cdddda57d38e3… 3191c… human       1        0       0      0
## 12 ASG0000001     50d72d34e399563… not-l… human       1        0       0      0
## 13 ASG0000001     50d9f256e399563… 232e3… human       1        1       0      0
## 14 ASG0000001     50db0117e399563… ca2e3… human       1        0       0      1
## 15 ASG0000001     50db43d1e399563… 84272… human       1        0       1      0
## 16 ASG0000001     50dd65d2e399563… fcb4b… human       1        0       0      0
## 17 ASG0000001     50e3264de399563… edd94… blank       0        0       0      0
## 18 ASG0000001     50e58c5ee399563… ec563… human       1        0       0      0
## # … with 3 more variables: Eating <dbl>, Interacting <dbl>, Babies <dbl>

How many images have total consensus?

# Calculate the number of classifications of each species for each capture event and add a column 'perc' for what percentage of all classifications they represent
classifications = all_image_data %>% group_by(CaptureEventID) %>% count(Species) %>% mutate(perc = 100*(n / sum(n)))

# Histogram of the percentage of the most common answer
classifications %>% 
  group_by(CaptureEventID) %>% 
  summarise(max_perc = max(perc)) %>% 
    ggplot(aes(x = max_perc)) + 
    geom_histogram(bins = 50) +
    theme_bw()
## `summarise()` ungrouping output (override with `.groups` argument)

Let’s have a look at the relationship between consensus and number of users making classificaitons

head(classifications, 100000) %>% 
  group_by(CaptureEventID) %>% 
  summarise(n = n, max_perc = max(perc)) %>% 
    ggplot(aes(x = max_perc, y = n)) + 
    geom_point(alpha = 0.05) +
    theme_bw()
## `summarise()` regrouping output by 'CaptureEventID' (override with `.groups` argument)

Lets have a look at the consensus results

consensus_results = subset(classifications, perc == 100)

range(consensus_results$n)
## [1]  5 88
ggplot(consensus_results, aes(x = n)) + 
    geom_histogram(bins = 25) +
    theme_bw()

ggplot(subset(consensus_results, n < 25), aes(x = n)) + 
    geom_histogram(bins = 25) +
    theme_bw()

Let’s have a look at the frequency of identified species in the data

consensus_results %>% group_by(Species) %>% count(Species) %>% arrange(n) %>% knitr::kable()
Species n
honeyBadger 1
aardwolf 2
hyenaStriped 2
wildcat 2
rodents 4
bushbuck 5
civet 6
caracal 9
waterbuck 9
rhinoceros 10
batEaredFox 12
reptiles 15
mongoose 18
leopard 26
serval 30
vervetMonkey 30
jackal 42
reedbuck 42
hare 62
topi 66
koriBustard 74
dikDik 77
secretaryBird 131
porcupine 135
aardvark 168
eland 184
lionMale 317
cheetah 319
otherBird 326
gazelleGrants 356
ostrich 375
impala 440
baboon 482
hartebeest 922
lionFemale 1157
hippopotamus 1345
hyenaSpotted 1799
warthog 2002
guineaFowl 2159
buffalo 2284
human 2956
giraffe 4676
elephant 5560
gazelleThomsons 14518
wildebeest 28597
zebra 32456
blank 808559

There’s about 900k images here, what are they all of…?

consensus_results %>% 
  group_by(Species) %>% 
  count(Species) %>% 
  arrange(n) %>% 
    ggplot(aes(x = reorder(Species, -n), y = n)) + # Order by value
      geom_bar(stat = "identity") +  # Stat 'identity' as we've alrdady worked out the total/percentage we want to plot
      xlab("Species") + 
      ylab("Frequency") + 
      theme_bw() + # Simple theme
      theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) # Rotate x-axis labels

Ah, about 800k blanks, let’s remove them

consensus_no_blanks = subset(consensus_results, Species != "blank")

nrow(consensus_no_blanks)
## [1] 104208
subset(consensus_no_blanks, Species != "blank") %>% 
  group_by(Species) %>% 
  count(Species) %>% 
  arrange(n) %>% 
    ggplot(aes(x = reorder(Species, -n), y = n)) + # Order by value
      geom_bar(stat = "identity") +  # Stat 'identity' as we've alrdady worked out the total/percentage we want to plot
      xlab("Species") + 
      ylab("Frequency") + 
      theme_bw() + # Simple theme
      theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) # Rotate x-axis labels

We can also download some images of interest, given a capture ID

# Download the metadata file (from http://lila.science/datasets/snapshot-serengeti)
# "All metadata (.csv)", about 100Mb zipped
# https://lilablobssc.blob.core.windows.net/snapshotserengeti-v-2-0/SnapshotSerengeti_S1-11_v2_1.csv.zip

# What's in the metadata file
files = unzip("SnapshotSerengeti_S1-11_v2_1.csv.zip", list=TRUE)
files
##                                     Name    Length                Date
## 1 SnapshotSerengeti_v2_1_annotations.csv 271085510 2020-08-02 10:20:00
## 2      SnapshotSerengeti_v2_1_images.csv 467873967 2020-08-02 10:19:00
# Read the image csv
image_csv = read_csv(unz("SnapshotSerengeti_S1-11_v2_1.csv.zip", "SnapshotSerengeti_v2_1_images.csv"))
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   capture_id = col_character(),
##   image_rank_in_capture = col_double(),
##   image_path_rel = col_character()
## )
# Read the annotations csv
annotations_csv = read_csv(unz("SnapshotSerengeti_S1-11_v2_1.csv.zip", "SnapshotSerengeti_v2_1_annotations.csv"))
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   capture_id = col_character(),
##   season = col_character(),
##   site = col_character(),
##   capture_date_local = col_date(format = ""),
##   capture_time_local = col_time(format = ""),
##   subject_id = col_character(),
##   question__species = col_character(),
##   question__count_max = col_character(),
##   question__count_median = col_character(),
##   question__horns_visible = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 20615 parsing failures.
##   row                 col               expected actual         file
##  9836 question__count_min no trailing characters    -50 <connection>
## 12148 question__count_min no trailing characters    -50 <connection>
## 12404 question__count_min no trailing characters    -50 <connection>
## 12406 question__count_min no trailing characters    -50 <connection>
## 12915 question__count_min no trailing characters    -50 <connection>
## ..... ................... ...................... ...... ............
## See problems(...) for more details.
# Let's see how many annotations are of leopards
leopard_sequences = subset(consensus_no_blanks, Species == "leopard")

nrow(leopard_sequences)
## [1] 26
head(leopard_sequences)
## # A tibble: 6 x 4
## # Groups:   CaptureEventID [6]
##   CaptureEventID Species     n  perc
##   <chr>          <chr>   <int> <dbl>
## 1 ASG0001329     leopard    14   100
## 2 ASG0002hur     leopard    19   100
## 3 ASG0004oiu     leopard    10   100
## 4 ASG000566q     leopard    13   100
## 5 ASG0005m6t     leopard    12   100
## 6 ASG0005tch     leopard    10   100

We can also download some images of interest, given a capture ID

# Find a particular sequence (capture ID)
example_capture = subset(annotations_csv, subject_id == "ASG0001329")

# Find images from capture ID
images = subset(image_csv, capture_id == example_capture$capture_id)

images
## # A tibble: 3 x 4
##       X1 capture_id       image_rank_in_captu… image_path_rel                   
##    <dbl> <chr>                           <dbl> <chr>                            
## 1 167177 SER_S1#H06#4#444                    1 S1/H06/H06_R4/S1_H06_R4_PICT1328…
## 2 167178 SER_S1#H06#4#444                    2 S1/H06/H06_R4/S1_H06_R4_PICT1329…
## 3 167179 SER_S1#H06#4#444                    3 S1/H06/H06_R4/S1_H06_R4_PICT1330…
# **Don't actually need all this, base_url is simple enough**
# The SAS URL for Snapshot is from http://lila.science/wp-content/uploads/2020/03/lila_sas_urls.txt
##Snapshot Serengeti,https://lilablobssc.blob.core.windows.net/snapshotserengeti-unzipped?st=2020-01-01T00%3A00%3A00Z&se=2034-01-01T00%3A00%3A00Z&sp=rl&sv=2019-07-07&sr=c&sig=/DGPd%2B9WGFt6HgkemDFpo2n0M1htEXvTq9WoHlaH7L4%3D

#sas_url = 'https://lilablobssc.blob.core.windows.net/snapshotserengeti-unzipped?st=2020-01-01T00%3A00%3A00Z&se=2034-01-01T00%3A00%3A00Z&sp=rl&sv=2019-07-07&sr=c&sig=/DGPd%2B9WGFt6HgkemDFpo2n0M1htEXvTq9WoHlaH7L4%3D'
#
#base_url = strsplit(sas_url, '\\?')[[1]][1]
#sas_token = strsplit(sas_url, '\\?')[[1]][2]

base_url = "https://lilablobssc.blob.core.windows.net/snapshotserengeti-unzipped"

# Let's try and download these images...

# Where to put them
outputdir = ("/Users/freeman.r/Documents/data/snapshot_serengheti_data/images")
# Make sure it exists
if (!dir.exists(outputdir)) dir.create(outputdir)

downloaded_files = list()
# For each image in that capture sequence ('images' is a dataframe where each row is some image information)
for (i in 1:nrow(images)) {
  
  # Get the name of the image ('images' is a dataframe)
  name = images[i, ]$image_path_rel
  
  # Work out the output folder/filename (where to put it)
  outputfile = paste0(outputdir, "/", name)
  
  # If the file hasn't been downloaded already
  if (!file.exists(outputfile)) {
    # Make the directory for the file in case it doesn't exist
    dir.create(dirname(outputfile), recursive = T)
    
    # Print out what we're doing
    print(sprintf("Downloading image %s", images[i, ]$image_path_rel))
    
    # URL to be downloaded
    url = paste0(base_url, '/', name)
    
    # Download the image to the folder/file
    download.file(url, outputfile)
    
  }
  downloaded_files[[i]] = outputfile
  
}
knitr::include_graphics(unlist(downloaded_files))
A caption

A caption

Actually easier to load the provided ‘gold standard’ data

library(data.table)

gold_standard = fread("gold_standard_data.csv")

# How many rows
nrow(gold_standard)
## [1] 4432
# Have a look at data
head(gold_standard)
##    CaptureEventID NumSpecies   Species Count
## 1:     ASG000b3xp          1  reedbuck     1
## 2:     ASG000b45v          1     zebra     1
## 3:     ASG000b48a          1  reedbuck     1
## 4:     ASG000b4c2          1  elephant     1
## 5:     ASG000b4cg          1 otherBird     1
## 6:     ASG000b4k2          1  elephant     1

How many detections of each species

sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] forcats_0.5.0     stringr_1.4.0     dplyr_1.0.2       purrr_0.3.4      
##  [5] tidyr_1.1.2       tibble_3.0.4      ggplot2_3.3.2     tidyverse_1.3.0  
##  [9] readr_1.3.1       data.table_1.12.8
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.1.0 xfun_0.18        haven_2.2.0      colorspace_1.4-1
##  [5] vctrs_0.3.4      generics_0.0.2   htmltools_0.5.0  yaml_2.2.1      
##  [9] utf8_1.1.4       rlang_0.4.8      pillar_1.4.6     glue_1.4.2      
## [13] withr_2.3.0      DBI_1.1.0        dbplyr_1.4.2     modelr_0.1.6    
## [17] readxl_1.3.1     jpeg_0.1-8.1     lifecycle_0.2.0  munsell_0.5.0   
## [21] gtable_0.3.0     cellranger_1.1.0 rvest_0.3.5      evaluate_0.14   
## [25] labeling_0.3     knitr_1.30       fansi_0.4.1      highr_0.8       
## [29] broom_0.7.0      Rcpp_1.0.5       scales_1.1.1     backports_1.1.10
## [33] jsonlite_1.7.1   farver_2.0.3     fs_1.4.1         hms_0.5.3       
## [37] digest_0.6.27    stringi_1.5.3    grid_3.6.2       cli_2.1.0       
## [41] tools_3.6.2      magrittr_1.5     crayon_1.3.4     pkgconfig_2.0.3 
## [45] ellipsis_0.3.1   xml2_1.2.2       reprex_0.3.0     lubridate_1.7.9 
## [49] assertthat_0.2.1 rmarkdown_2.3    httr_1.4.1       rstudioapi_0.11 
## [53] R6_2.4.1         compiler_3.6.2