# Scientific data paper: https://www.nature.com/articles/sdata201526
# Link to Dryad: https://datadryad.org/stash/dataset/doi:10.5061/dryad.5pt92
# You can download the data from there as doi_10.5061_dryad.5pt92__v1.zip
# Once extracted, use fread from the data.table package to quickly load 1Gb csv file (310Mb compressed)
all_image_data = read_csv("raw_data_for_dryad.csv.zip")
## Parsed with column specification:
## cols(
## CaptureEventID = col_character(),
## ClassificationID = col_character(),
## UserID = col_character(),
## Species = col_character(),
## Count = col_double(),
## Standing = col_double(),
## Resting = col_double(),
## Moving = col_double(),
## Eating = col_double(),
## Interacting = col_double(),
## Babies = col_double()
## )
nrow(all_image_data)
## [1] 10530564
head(all_image_data)
## # A tibble: 6 x 11
## CaptureEventID ClassificationID UserID Species Count Standing Resting Moving
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ASG0000001 50c79e2c9177d00… not-l… human 1 1 0 0
## 2 ASG0000001 50c7c0f9dea72b4… cb2e6… human 1 1 0 0
## 3 ASG0000001 50c80a57b58fdd4… 5e3c1… human 1 1 0 0
## 4 ASG0000001 50c81492b58fdd1… not-l… human 1 0 0 0
## 5 ASG0000001 50c8a96c56175a5… 0c165… human 1 0 0 0
## 6 ASG0000001 50cce4699a35a62… not-l… human 1 1 0 0
## # … with 3 more variables: Eating <dbl>, Interacting <dbl>, Babies <dbl>
# Let's have a look at the first 'capture event'
subset(all_image_data, CaptureEventID == "ASG0000001")
## # A tibble: 18 x 11
## CaptureEventID ClassificationID UserID Species Count Standing Resting Moving
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ASG0000001 50c79e2c9177d00… not-l… human 1 1 0 0
## 2 ASG0000001 50c7c0f9dea72b4… cb2e6… human 1 1 0 0
## 3 ASG0000001 50c80a57b58fdd4… 5e3c1… human 1 1 0 0
## 4 ASG0000001 50c81492b58fdd1… not-l… human 1 0 0 0
## 5 ASG0000001 50c8a96c56175a5… 0c165… human 1 0 0 0
## 6 ASG0000001 50cce4699a35a62… not-l… human 1 1 0 0
## 7 ASG0000001 50cd013157d38e1… 09448… human 1 0 0 0
## 8 ASG0000001 50cd2fc09a35a63… not-l… blank 0 0 0 0
## 9 ASG0000001 50cdc52357d38e3… 73043… human 1 1 0 0
## 10 ASG0000001 50cdd122b58fdd0… not-l… human 1 0 0 0
## 11 ASG0000001 50cdddda57d38e3… 3191c… human 1 0 0 0
## 12 ASG0000001 50d72d34e399563… not-l… human 1 0 0 0
## 13 ASG0000001 50d9f256e399563… 232e3… human 1 1 0 0
## 14 ASG0000001 50db0117e399563… ca2e3… human 1 0 0 1
## 15 ASG0000001 50db43d1e399563… 84272… human 1 0 1 0
## 16 ASG0000001 50dd65d2e399563… fcb4b… human 1 0 0 0
## 17 ASG0000001 50e3264de399563… edd94… blank 0 0 0 0
## 18 ASG0000001 50e58c5ee399563… ec563… human 1 0 0 0
## # … with 3 more variables: Eating <dbl>, Interacting <dbl>, Babies <dbl>
# Calculate the number of classifications of each species for each capture event and add a column 'perc' for what percentage of all classifications they represent
classifications = all_image_data %>% group_by(CaptureEventID) %>% count(Species) %>% mutate(perc = 100*(n / sum(n)))
# Histogram of the percentage of the most common answer
classifications %>%
group_by(CaptureEventID) %>%
summarise(max_perc = max(perc)) %>%
ggplot(aes(x = max_perc)) +
geom_histogram(bins = 50) +
theme_bw()
## `summarise()` ungrouping output (override with `.groups` argument)
head(classifications, 100000) %>%
group_by(CaptureEventID) %>%
summarise(n = n, max_perc = max(perc)) %>%
ggplot(aes(x = max_perc, y = n)) +
geom_point(alpha = 0.05) +
theme_bw()
## `summarise()` regrouping output by 'CaptureEventID' (override with `.groups` argument)
consensus_results = subset(classifications, perc == 100)
range(consensus_results$n)
## [1] 5 88
ggplot(consensus_results, aes(x = n)) +
geom_histogram(bins = 25) +
theme_bw()
ggplot(subset(consensus_results, n < 25), aes(x = n)) +
geom_histogram(bins = 25) +
theme_bw()
consensus_results %>% group_by(Species) %>% count(Species) %>% arrange(n) %>% knitr::kable()
| Species | n |
|---|---|
| honeyBadger | 1 |
| aardwolf | 2 |
| hyenaStriped | 2 |
| wildcat | 2 |
| rodents | 4 |
| bushbuck | 5 |
| civet | 6 |
| caracal | 9 |
| waterbuck | 9 |
| rhinoceros | 10 |
| batEaredFox | 12 |
| reptiles | 15 |
| mongoose | 18 |
| leopard | 26 |
| serval | 30 |
| vervetMonkey | 30 |
| jackal | 42 |
| reedbuck | 42 |
| hare | 62 |
| topi | 66 |
| koriBustard | 74 |
| dikDik | 77 |
| secretaryBird | 131 |
| porcupine | 135 |
| aardvark | 168 |
| eland | 184 |
| lionMale | 317 |
| cheetah | 319 |
| otherBird | 326 |
| gazelleGrants | 356 |
| ostrich | 375 |
| impala | 440 |
| baboon | 482 |
| hartebeest | 922 |
| lionFemale | 1157 |
| hippopotamus | 1345 |
| hyenaSpotted | 1799 |
| warthog | 2002 |
| guineaFowl | 2159 |
| buffalo | 2284 |
| human | 2956 |
| giraffe | 4676 |
| elephant | 5560 |
| gazelleThomsons | 14518 |
| wildebeest | 28597 |
| zebra | 32456 |
| blank | 808559 |
consensus_results %>%
group_by(Species) %>%
count(Species) %>%
arrange(n) %>%
ggplot(aes(x = reorder(Species, -n), y = n)) + # Order by value
geom_bar(stat = "identity") + # Stat 'identity' as we've alrdady worked out the total/percentage we want to plot
xlab("Species") +
ylab("Frequency") +
theme_bw() + # Simple theme
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) # Rotate x-axis labels
consensus_no_blanks = subset(consensus_results, Species != "blank")
nrow(consensus_no_blanks)
## [1] 104208
subset(consensus_no_blanks, Species != "blank") %>%
group_by(Species) %>%
count(Species) %>%
arrange(n) %>%
ggplot(aes(x = reorder(Species, -n), y = n)) + # Order by value
geom_bar(stat = "identity") + # Stat 'identity' as we've alrdady worked out the total/percentage we want to plot
xlab("Species") +
ylab("Frequency") +
theme_bw() + # Simple theme
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) # Rotate x-axis labels
# Download the metadata file (from http://lila.science/datasets/snapshot-serengeti)
# "All metadata (.csv)", about 100Mb zipped
# https://lilablobssc.blob.core.windows.net/snapshotserengeti-v-2-0/SnapshotSerengeti_S1-11_v2_1.csv.zip
# What's in the metadata file
files = unzip("SnapshotSerengeti_S1-11_v2_1.csv.zip", list=TRUE)
files
## Name Length Date
## 1 SnapshotSerengeti_v2_1_annotations.csv 271085510 2020-08-02 10:20:00
## 2 SnapshotSerengeti_v2_1_images.csv 467873967 2020-08-02 10:19:00
# Read the image csv
image_csv = read_csv(unz("SnapshotSerengeti_S1-11_v2_1.csv.zip", "SnapshotSerengeti_v2_1_images.csv"))
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## capture_id = col_character(),
## image_rank_in_capture = col_double(),
## image_path_rel = col_character()
## )
# Read the annotations csv
annotations_csv = read_csv(unz("SnapshotSerengeti_S1-11_v2_1.csv.zip", "SnapshotSerengeti_v2_1_annotations.csv"))
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## capture_id = col_character(),
## season = col_character(),
## site = col_character(),
## capture_date_local = col_date(format = ""),
## capture_time_local = col_time(format = ""),
## subject_id = col_character(),
## question__species = col_character(),
## question__count_max = col_character(),
## question__count_median = col_character(),
## question__horns_visible = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 20615 parsing failures.
## row col expected actual file
## 9836 question__count_min no trailing characters -50 <connection>
## 12148 question__count_min no trailing characters -50 <connection>
## 12404 question__count_min no trailing characters -50 <connection>
## 12406 question__count_min no trailing characters -50 <connection>
## 12915 question__count_min no trailing characters -50 <connection>
## ..... ................... ...................... ...... ............
## See problems(...) for more details.
# Let's see how many annotations are of leopards
leopard_sequences = subset(consensus_no_blanks, Species == "leopard")
nrow(leopard_sequences)
## [1] 26
head(leopard_sequences)
## # A tibble: 6 x 4
## # Groups: CaptureEventID [6]
## CaptureEventID Species n perc
## <chr> <chr> <int> <dbl>
## 1 ASG0001329 leopard 14 100
## 2 ASG0002hur leopard 19 100
## 3 ASG0004oiu leopard 10 100
## 4 ASG000566q leopard 13 100
## 5 ASG0005m6t leopard 12 100
## 6 ASG0005tch leopard 10 100
# Find a particular sequence (capture ID)
example_capture = subset(annotations_csv, subject_id == "ASG0001329")
# Find images from capture ID
images = subset(image_csv, capture_id == example_capture$capture_id)
images
## # A tibble: 3 x 4
## X1 capture_id image_rank_in_captu… image_path_rel
## <dbl> <chr> <dbl> <chr>
## 1 167177 SER_S1#H06#4#444 1 S1/H06/H06_R4/S1_H06_R4_PICT1328…
## 2 167178 SER_S1#H06#4#444 2 S1/H06/H06_R4/S1_H06_R4_PICT1329…
## 3 167179 SER_S1#H06#4#444 3 S1/H06/H06_R4/S1_H06_R4_PICT1330…
# **Don't actually need all this, base_url is simple enough**
# The SAS URL for Snapshot is from http://lila.science/wp-content/uploads/2020/03/lila_sas_urls.txt
##Snapshot Serengeti,https://lilablobssc.blob.core.windows.net/snapshotserengeti-unzipped?st=2020-01-01T00%3A00%3A00Z&se=2034-01-01T00%3A00%3A00Z&sp=rl&sv=2019-07-07&sr=c&sig=/DGPd%2B9WGFt6HgkemDFpo2n0M1htEXvTq9WoHlaH7L4%3D
#sas_url = 'https://lilablobssc.blob.core.windows.net/snapshotserengeti-unzipped?st=2020-01-01T00%3A00%3A00Z&se=2034-01-01T00%3A00%3A00Z&sp=rl&sv=2019-07-07&sr=c&sig=/DGPd%2B9WGFt6HgkemDFpo2n0M1htEXvTq9WoHlaH7L4%3D'
#
#base_url = strsplit(sas_url, '\\?')[[1]][1]
#sas_token = strsplit(sas_url, '\\?')[[1]][2]
base_url = "https://lilablobssc.blob.core.windows.net/snapshotserengeti-unzipped"
# Let's try and download these images...
# Where to put them
outputdir = ("/Users/freeman.r/Documents/data/snapshot_serengheti_data/images")
# Make sure it exists
if (!dir.exists(outputdir)) dir.create(outputdir)
downloaded_files = list()
# For each image in that capture sequence ('images' is a dataframe where each row is some image information)
for (i in 1:nrow(images)) {
# Get the name of the image ('images' is a dataframe)
name = images[i, ]$image_path_rel
# Work out the output folder/filename (where to put it)
outputfile = paste0(outputdir, "/", name)
# If the file hasn't been downloaded already
if (!file.exists(outputfile)) {
# Make the directory for the file in case it doesn't exist
dir.create(dirname(outputfile), recursive = T)
# Print out what we're doing
print(sprintf("Downloading image %s", images[i, ]$image_path_rel))
# URL to be downloaded
url = paste0(base_url, '/', name)
# Download the image to the folder/file
download.file(url, outputfile)
}
downloaded_files[[i]] = outputfile
}
knitr::include_graphics(unlist(downloaded_files))
A caption
library(data.table)
gold_standard = fread("gold_standard_data.csv")
# How many rows
nrow(gold_standard)
## [1] 4432
# Have a look at data
head(gold_standard)
## CaptureEventID NumSpecies Species Count
## 1: ASG000b3xp 1 reedbuck 1
## 2: ASG000b45v 1 zebra 1
## 3: ASG000b48a 1 reedbuck 1
## 4: ASG000b4c2 1 elephant 1
## 5: ASG000b4cg 1 otherBird 1
## 6: ASG000b4k2 1 elephant 1
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2 purrr_0.3.4
## [5] tidyr_1.1.2 tibble_3.0.4 ggplot2_3.3.2 tidyverse_1.3.0
## [9] readr_1.3.1 data.table_1.12.8
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.0 xfun_0.18 haven_2.2.0 colorspace_1.4-1
## [5] vctrs_0.3.4 generics_0.0.2 htmltools_0.5.0 yaml_2.2.1
## [9] utf8_1.1.4 rlang_0.4.8 pillar_1.4.6 glue_1.4.2
## [13] withr_2.3.0 DBI_1.1.0 dbplyr_1.4.2 modelr_0.1.6
## [17] readxl_1.3.1 jpeg_0.1-8.1 lifecycle_0.2.0 munsell_0.5.0
## [21] gtable_0.3.0 cellranger_1.1.0 rvest_0.3.5 evaluate_0.14
## [25] labeling_0.3 knitr_1.30 fansi_0.4.1 highr_0.8
## [29] broom_0.7.0 Rcpp_1.0.5 scales_1.1.1 backports_1.1.10
## [33] jsonlite_1.7.1 farver_2.0.3 fs_1.4.1 hms_0.5.3
## [37] digest_0.6.27 stringi_1.5.3 grid_3.6.2 cli_2.1.0
## [41] tools_3.6.2 magrittr_1.5 crayon_1.3.4 pkgconfig_2.0.3
## [45] ellipsis_0.3.1 xml2_1.2.2 reprex_0.3.0 lubridate_1.7.9
## [49] assertthat_0.2.1 rmarkdown_2.3 httr_1.4.1 rstudioapi_0.11
## [53] R6_2.4.1 compiler_3.6.2