# Scientific data paper: https://www.nature.com/articles/sdata201526

# Link to Dryad: https://datadryad.org/stash/dataset/doi:10.5061/dryad.5pt92
# You can download the data from there as doi_10.5061_dryad.5pt92__v1.zip

# Once extracted, use fread from the data.table package to quickly load 1Gb csv file (310Mb compressed)
all_image_data = read_csv("raw_data_for_dryad.csv.zip")

## Parsed with column specification:
## cols(
##   CaptureEventID = col_character(),
##   ClassificationID = col_character(),
##   UserID = col_character(),
##   Species = col_character(),
##   Count = col_double(),
##   Standing = col_double(),
##   Resting = col_double(),
##   Moving = col_double(),
##   Eating = col_double(),
##   Interacting = col_double(),
##   Babies = col_double()
## )

nrow(all_image_data)

## [1] 10530564

head(all_image_data)

## # A tibble: 6 x 11
##   CaptureEventID ClassificationID UserID Species Count Standing Resting Moving
##   <chr>          <chr>            <chr>  <chr>   <dbl>    <dbl>   <dbl>  <dbl>
## 1 ASG0000001     50c79e2c9177d00… not-l… human       1        1       0      0
## 2 ASG0000001     50c7c0f9dea72b4… cb2e6… human       1        1       0      0
## 3 ASG0000001     50c80a57b58fdd4… 5e3c1… human       1        1       0      0
## 4 ASG0000001     50c81492b58fdd1… not-l… human       1        0       0      0
## 5 ASG0000001     50c8a96c56175a5… 0c165… human       1        0       0      0
## 6 ASG0000001     50cce4699a35a62… not-l… human       1        1       0      0
## # … with 3 more variables: Eating <dbl>, Interacting <dbl>, Babies <dbl>

# Let's have a look at the first 'capture event'
subset(all_image_data, CaptureEventID == "ASG0000001")

## # A tibble: 18 x 11
##    CaptureEventID ClassificationID UserID Species Count Standing Resting Moving
##    <chr>          <chr>            <chr>  <chr>   <dbl>    <dbl>   <dbl>  <dbl>
##  1 ASG0000001     50c79e2c9177d00… not-l… human       1        1       0      0
##  2 ASG0000001     50c7c0f9dea72b4… cb2e6… human       1        1       0      0
##  3 ASG0000001     50c80a57b58fdd4… 5e3c1… human       1        1       0      0
##  4 ASG0000001     50c81492b58fdd1… not-l… human       1        0       0      0
##  5 ASG0000001     50c8a96c56175a5… 0c165… human       1        0       0      0
##  6 ASG0000001     50cce4699a35a62… not-l… human       1        1       0      0
##  7 ASG0000001     50cd013157d38e1… 09448… human       1        0       0      0
##  8 ASG0000001     50cd2fc09a35a63… not-l… blank       0        0       0      0
##  9 ASG0000001     50cdc52357d38e3… 73043… human       1        1       0      0
## 10 ASG0000001     50cdd122b58fdd0… not-l… human       1        0       0      0
## 11 ASG0000001     50cdddda57d38e3… 3191c… human       1        0       0      0
## 12 ASG0000001     50d72d34e399563… not-l… human       1        0       0      0
## 13 ASG0000001     50d9f256e399563… 232e3… human       1        1       0      0
## 14 ASG0000001     50db0117e399563… ca2e3… human       1        0       0      1
## 15 ASG0000001     50db43d1e399563… 84272… human       1        0       1      0
## 16 ASG0000001     50dd65d2e399563… fcb4b… human       1        0       0      0
## 17 ASG0000001     50e3264de399563… edd94… blank       0        0       0      0
## 18 ASG0000001     50e58c5ee399563… ec563… human       1        0       0      0
## # … with 3 more variables: Eating <dbl>, Interacting <dbl>, Babies <dbl>

How many images have total consensus?

# Calculate the number of classifications of each species for each capture event and add a column 'perc' for what percentage of all classifications they represent
classifications = all_image_data %>% group_by(CaptureEventID) %>% count(Species) %>% mutate(perc = 100*(n / sum(n)))

# Histogram of the percentage of the most common answer
classifications %>% 
  group_by(CaptureEventID) %>% 
  summarise(max_perc = max(perc)) %>% 
    ggplot(aes(x = max_perc)) + 
    geom_histogram(bins = 50) +
    theme_bw()

## `summarise()` ungrouping output (override with `.groups` argument)

Let’s have a look at the relationship between consensus and number of users making classificaitons

head(classifications, 100000) %>% 
  group_by(CaptureEventID) %>% 
  summarise(n = n, max_perc = max(perc)) %>% 
    ggplot(aes(x = max_perc, y = n)) + 
    geom_point(alpha = 0.05) +
    theme_bw()

## `summarise()` regrouping output by 'CaptureEventID' (override with `.groups` argument)

Lets have a look at the consensus results

consensus_results = subset(classifications, perc == 100)

range(consensus_results$n)

## [1]  5 88

ggplot(consensus_results, aes(x = n)) + 
    geom_histogram(bins = 25) +
    theme_bw()

ggplot(subset(consensus_results, n < 25), aes(x = n)) + 
    geom_histogram(bins = 25) +
    theme_bw()

Let’s have a look at the frequency of identified species in the data

consensus_results %>% group_by(Species) %>% count(Species) %>% arrange(n) %>% knitr::kable()

Species	n
honeyBadger	1
aardwolf	2
hyenaStriped	2
wildcat	2
rodents	4
bushbuck	5
civet	6
caracal	9
waterbuck	9
rhinoceros	10
batEaredFox	12
reptiles	15
mongoose	18
leopard	26
serval	30
vervetMonkey	30
jackal	42
reedbuck	42
hare	62
topi	66
koriBustard	74
dikDik	77
secretaryBird	131
porcupine	135
aardvark	168
eland	184
lionMale	317
cheetah	319
otherBird	326
gazelleGrants	356
ostrich	375
impala	440
baboon	482
hartebeest	922
lionFemale	1157
hippopotamus	1345
hyenaSpotted	1799
warthog	2002
guineaFowl	2159
buffalo	2284
human	2956
giraffe	4676
elephant	5560
gazelleThomsons	14518
wildebeest	28597
zebra	32456
blank	808559

There’s about 900k images here, what are they all of…?

consensus_results %>% 
  group_by(Species) %>% 
  count(Species) %>% 
  arrange(n) %>% 
    ggplot(aes(x = reorder(Species, -n), y = n)) + # Order by value
      geom_bar(stat = "identity") +  # Stat 'identity' as we've alrdady worked out the total/percentage we want to plot
      xlab("Species") + 
      ylab("Frequency") + 
      theme_bw() + # Simple theme
      theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) # Rotate x-axis labels

Ah, about 800k blanks, let’s remove them

consensus_no_blanks = subset(consensus_results, Species != "blank")

nrow(consensus_no_blanks)

## [1] 104208

subset(consensus_no_blanks, Species != "blank") %>% 
  group_by(Species) %>% 
  count(Species) %>% 
  arrange(n) %>% 
    ggplot(aes(x = reorder(Species, -n), y = n)) + # Order by value
      geom_bar(stat = "identity") +  # Stat 'identity' as we've alrdady worked out the total/percentage we want to plot
      xlab("Species") + 
      ylab("Frequency") + 
      theme_bw() + # Simple theme
      theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) # Rotate x-axis labels

We can also download some images of interest, given a capture ID

# Download the metadata file (from http://lila.science/datasets/snapshot-serengeti)
# "All metadata (.csv)", about 100Mb zipped
# https://lilablobssc.blob.core.windows.net/snapshotserengeti-v-2-0/SnapshotSerengeti_S1-11_v2_1.csv.zip

# What's in the metadata file
files = unzip("SnapshotSerengeti_S1-11_v2_1.csv.zip", list=TRUE)
files

##                                     Name    Length                Date
## 1 SnapshotSerengeti_v2_1_annotations.csv 271085510 2020-08-02 10:20:00
## 2      SnapshotSerengeti_v2_1_images.csv 467873967 2020-08-02 10:19:00

# Read the image csv
image_csv = read_csv(unz("SnapshotSerengeti_S1-11_v2_1.csv.zip", "SnapshotSerengeti_v2_1_images.csv"))

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   capture_id = col_character(),
##   image_rank_in_capture = col_double(),
##   image_path_rel = col_character()
## )

# Read the annotations csv
annotations_csv = read_csv(unz("SnapshotSerengeti_S1-11_v2_1.csv.zip", "SnapshotSerengeti_v2_1_annotations.csv"))

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   capture_id = col_character(),
##   season = col_character(),
##   site = col_character(),
##   capture_date_local = col_date(format = ""),
##   capture_time_local = col_time(format = ""),
##   subject_id = col_character(),
##   question__species = col_character(),
##   question__count_max = col_character(),
##   question__count_median = col_character(),
##   question__horns_visible = col_logical()
## )

## See spec(...) for full column specifications.

## Warning: 20615 parsing failures.
##   row                 col               expected actual         file
##  9836 question__count_min no trailing characters    -50 <connection>
## 12148 question__count_min no trailing characters    -50 <connection>
## 12404 question__count_min no trailing characters    -50 <connection>
## 12406 question__count_min no trailing characters    -50 <connection>
## 12915 question__count_min no trailing characters    -50 <connection>
## ..... ................... ...................... ...... ............
## See problems(...) for more details.

# Let's see how many annotations are of leopards
leopard_sequences = subset(consensus_no_blanks, Species == "leopard")

nrow(leopard_sequences)

## [1] 26

head(leopard_sequences)

## # A tibble: 6 x 4
## # Groups:   CaptureEventID [6]
##   CaptureEventID Species     n  perc
##   <chr>          <chr>   <int> <dbl>
## 1 ASG0001329     leopard    14   100
## 2 ASG0002hur     leopard    19   100
## 3 ASG0004oiu     leopard    10   100
## 4 ASG000566q     leopard    13   100
## 5 ASG0005m6t     leopard    12   100
## 6 ASG0005tch     leopard    10   100

We can also download some images of interest, given a capture ID

# Find a particular sequence (capture ID)
example_capture = subset(annotations_csv, subject_id == "ASG0001329")

# Find images from capture ID
images = subset(image_csv, capture_id == example_capture$capture_id)

images

## # A tibble: 3 x 4
##       X1 capture_id       image_rank_in_captu… image_path_rel                   
##    <dbl> <chr>                           <dbl> <chr>                            
## 1 167177 SER_S1#H06#4#444                    1 S1/H06/H06_R4/S1_H06_R4_PICT1328…
## 2 167178 SER_S1#H06#4#444                    2 S1/H06/H06_R4/S1_H06_R4_PICT1329…
## 3 167179 SER_S1#H06#4#444                    3 S1/H06/H06_R4/S1_H06_R4_PICT1330…

# **Don't actually need all this, base_url is simple enough**
# The SAS URL for Snapshot is from http://lila.science/wp-content/uploads/2020/03/lila_sas_urls.txt
##Snapshot Serengeti,https://lilablobssc.blob.core.windows.net/snapshotserengeti-unzipped?st=2020-01-01T00%3A00%3A00Z&se=2034-01-01T00%3A00%3A00Z&sp=rl&sv=2019-07-07&sr=c&sig=/DGPd%2B9WGFt6HgkemDFpo2n0M1htEXvTq9WoHlaH7L4%3D

#sas_url = 'https://lilablobssc.blob.core.windows.net/snapshotserengeti-unzipped?st=2020-01-01T00%3A00%3A00Z&se=2034-01-01T00%3A00%3A00Z&sp=rl&sv=2019-07-07&sr=c&sig=/DGPd%2B9WGFt6HgkemDFpo2n0M1htEXvTq9WoHlaH7L4%3D'
#
#base_url = strsplit(sas_url, '\\?')[[1]][1]
#sas_token = strsplit(sas_url, '\\?')[[1]][2]

base_url = "https://lilablobssc.blob.core.windows.net/snapshotserengeti-unzipped"

# Let's try and download these images...

# Where to put them
outputdir = ("/Users/freeman.r/Documents/data/snapshot_serengheti_data/images")
# Make sure it exists
if (!dir.exists(outputdir)) dir.create(outputdir)

downloaded_files = list()
# For each image in that capture sequence ('images' is a dataframe where each row is some image information)
for (i in 1:nrow(images)) {
  
  # Get the name of the image ('images' is a dataframe)
  name = images[i, ]$image_path_rel
  
  # Work out the output folder/filename (where to put it)
  outputfile = paste0(outputdir, "/", name)
  
  # If the file hasn't been downloaded already
  if (!file.exists(outputfile)) {
    # Make the directory for the file in case it doesn't exist
    dir.create(dirname(outputfile), recursive = T)
    
    # Print out what we're doing
    print(sprintf("Downloading image %s", images[i, ]$image_path_rel))
    
    # URL to be downloaded
    url = paste0(base_url, '/', name)
    
    # Download the image to the folder/file
    download.file(url, outputfile)
    
  }
  downloaded_files[[i]] = outputfile
  
}
knitr::include_graphics(unlist(downloaded_files))

$A caption$

A caption

Actually easier to load the provided ‘gold standard’ data

library(data.table)

gold_standard = fread("gold_standard_data.csv")

# How many rows
nrow(gold_standard)

## [1] 4432

# Have a look at data
head(gold_standard)

##    CaptureEventID NumSpecies   Species Count
## 1:     ASG000b3xp          1  reedbuck     1
## 2:     ASG000b45v          1     zebra     1
## 3:     ASG000b48a          1  reedbuck     1
## 4:     ASG000b4c2          1  elephant     1
## 5:     ASG000b4cg          1 otherBird     1
## 6:     ASG000b4k2          1  elephant     1

How many detections of each species

sessionInfo()

## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] forcats_0.5.0     stringr_1.4.0     dplyr_1.0.2       purrr_0.3.4      
##  [5] tidyr_1.1.2       tibble_3.0.4      ggplot2_3.3.2     tidyverse_1.3.0  
##  [9] readr_1.3.1       data.table_1.12.8
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.1.0 xfun_0.18        haven_2.2.0      colorspace_1.4-1
##  [5] vctrs_0.3.4      generics_0.0.2   htmltools_0.5.0  yaml_2.2.1      
##  [9] utf8_1.1.4       rlang_0.4.8      pillar_1.4.6     glue_1.4.2      
## [13] withr_2.3.0      DBI_1.1.0        dbplyr_1.4.2     modelr_0.1.6    
## [17] readxl_1.3.1     jpeg_0.1-8.1     lifecycle_0.2.0  munsell_0.5.0   
## [21] gtable_0.3.0     cellranger_1.1.0 rvest_0.3.5      evaluate_0.14   
## [25] labeling_0.3     knitr_1.30       fansi_0.4.1      highr_0.8       
## [29] broom_0.7.0      Rcpp_1.0.5       scales_1.1.1     backports_1.1.10
## [33] jsonlite_1.7.1   farver_2.0.3     fs_1.4.1         hms_0.5.3       
## [37] digest_0.6.27    stringi_1.5.3    grid_3.6.2       cli_2.1.0       
## [41] tools_3.6.2      magrittr_1.5     crayon_1.3.4     pkgconfig_2.0.3 
## [45] ellipsis_0.3.1   xml2_1.2.2       reprex_0.3.0     lubridate_1.7.9 
## [49] assertthat_0.2.1 rmarkdown_2.3    httr_1.4.1       rstudioapi_0.11 
## [53] R6_2.4.1         compiler_3.6.2

Snapshot_data

Indicators and Assessments Unit (RF)

25/11/2020

How many images have total consensus?

Let’s have a look at the relationship between consensus and number of users making classificaitons

Lets have a look at the consensus results

Let’s have a look at the frequency of identified species in the data

There’s about 900k images here, what are they all of…?

Ah, about 800k blanks, let’s remove them

We can also download some images of interest, given a capture ID

We can also download some images of interest, given a capture ID

Actually easier to load the provided ‘gold standard’ data

How many detections of each species