---
title: Checking annotation status
subtitle: Bird song evolution
author: <a href="https://marce10.github.io/">Marcelo Araya-Salas, PhD</a>
date: "`r Sys.Date()`"
toc: true
toc-depth: 2
toc-location: left
number-sections: true
highlight-style: pygments
format:
html:
df-print: kable
code-fold: true
code-tools: true
code-copy: true
embed-resources: true
editor_options:
chunk_output_type: console
---
<!-- this code add line numbers to code blocks -->
<!-- only works when code folding is not used in yaml (code_folding: show) -->
```{r set root directory}
#| echo: FALSE
# set working directory
knitr:: opts_knit$ set (root.dir = ".." )
```
```{r add link to github repo}
#| echo: FALSE
#| results: asis
# print link to github repo if any
if (file.exists ("./.git/config" )){
config <- readLines ("./.git/config" )
url <- grep ("url" , config, value = TRUE )
url <- gsub (" \\ turl = |.git$" , "" , url)
cat (" \n Source code, data and annotation protocol found at [" , url, "](" , url, ")" , sep = "" )
}
```
```{r setup style}
#| echo: FALSE
#| message: FALSE
#| warning: FALSE
# options to customize chunk outputs
knitr:: opts_chunk$ set (
tidy.opts = list (width.cutoff = 65 ),
tidy = TRUE ,
message = FALSE ,
warning = FALSE
)
```
<!-- skyblue box -->
< div class = "alert alert-info" >
# Purpose {.unnumbered .unlisted}
- Double-check annotations
</ div >
# Load packages {.unnumbered .unlisted}
```{r load packages}
#| eval: TRUE
# knitr is require for creating html/pdf/word reports
# formatR is used for soft-wrapping code
# install/ load packages
sketchy:: load_packages (packages = c ("knitr" , "formatR" , "rprojroot" , "viridis" , "googlesheets4" , "warbleR" , "maRce10/Rraven" , "googledrive" , "maRce10/ohun" ))
path_sound_files <- "/home/m/OneDrive/bird_song_evolution/bird_song_recordings/consolidated_sound_files/"
# annotation_path <- "/home/m/Insync/marceloa27@gmail.com/Google Drive/bird_song_evolution/raven_selections/"
annotation_path <- "./data/processed/annotations/"
# check annotations
# warbleR_options(wav.path = "run/user/1000/gvfs/smb-share:server=cinnas.local,share=neurobiología/marcelo_araya/bird_song_evolution/audios/consolidated_sound_files")
ann_spec_path <- "/home/m/OneDrive/bird_song_evolution/annotated_spectrograms/"
```
# Consolidate sound files
```{r}
#| eval: FALSE
# cns <- consolidate(path = "/home/m/OneDrive/bird_song_evolution/bird_song_recordings/", dest.path = "/media/m/Expansion/Projects/Ongoing/bird_song_evolution/consolidated_sound_files/", parallel = 10)
```
# Check sound files
```{r}
#| eval: false
csf <- check_sound_files (path = path_sound_files, check.header = TRUE , parallel = 20 )
csf2 <- csf[grep ("corrupted" , csf$ result),]
fix_wavs (path = path_sound_files, files = csf2$ sound.files)
csf3 <- check_sound_files (path = file.path (path_sound_files, "converted_sound_files" ), check.header = TRUE , parallel = 20 )
```
```{r TO RUN MANUALLY}
#| eval: false
### CHANGES IN ANNOTATION FILES #########
prev_snapshot <- readRDS ("./data/processed/prev_raven_annotations_snapshot.RDS" )
new_snapshot <- fileSnapshot (annotation_path, md5sum= TRUE , recursive = TRUE )
changes <- changedFiles (before = prev_snapshot, after = new_snapshot)
# only save in something changed
if ((length (changes$ added > 0 ) | length (changes$ changed > 0 ) | length (changes$ deleted > 0 )) & ! knitr:: is_html_output (excludes = "markdown" )){
saveRDS (new_snapshot, "./data/processed/prev_raven_annotations_snapshot.RDS" )
saveRDS (changes, "./data/processed/changes_in_annotation_files.RDS" )
}
### CHECK ANNOTATIONS #################
# if not readin from google docs
googlesheets4:: gs4_deauth ()
# googlesheets4::gs4_auth()
rec_data <- read_sheet ('https://docs.google.com/spreadsheets/d/16ukhyf37hm13f1FXB2JQ-tCfHoxXy2qj_TX2xWvTVu8/edit#gid=148139271' )
rec_data$ assigned_to[rec_data$ assigned_to == "NA" ] <- NA
# remove data with no family<`
rec_data <- rec_data[rec_data$ family != "NA" , ]
# re run manually
# prev_status <- readRDS("./data/processed/annotation_status_results.RDS")
# names(prev_status) <- paste0("prev_", names(prev_status))
#
# saveRDS(prev_status, "./data/processed/previous_annotation_status_results.RDS")
anns <- imp_raven (path = annotation_path, all.data = TRUE , warbler.format = TRUE , name.from.file = TRUE , ext.case = "lower" , pb = TRUE , recursive = TRUE )
# grep(".Table.1 (1)", anns$sound.files, fixed = TRUE, value = T)
anns$ family <- dirname (anns$ selec.file)
anns$ selec.file <- basename (anns$ selec.file)
anns$ species <- gsub ("_" , " " , sapply (strsplit (anns$ selec.file, "-" ), "[" , 1 ))
# get those with more than 1 annotation file
dup_anns <- table (anns$ selec.file[! duplicated (anns$ ` Begin Path ` )])
dup_anns <- dup_anns[dup_anns > 1 ]
dup_anns <- anns[anns$ selec.file %in% names (dup_anns), c ("sound.files" , "selec.file" , "Begin Path" )]
dup_anns <- dup_anns[! duplicated (dup_anns),]
ann_files <- unique (c (anns$ selec.file, unlist (.Options$ Rraven)))
ann_files <- data.frame (file = ann_files)
ann_files$ species <- gsub ("_" , " " , sapply (strsplit (ann_files$ file, "-" ), "[" , 1 ))
ann_files$ family <- sapply (ann_files$ species, function (x) anns$ family[anns$ species == x][1 ])
undup_anns <- anns[! duplicated (paste (anns$ sound.files, anns$ selec.file)), ]
rownames (undup_anns) <- 1 : nrow (undup_anns)
# get those in which the names inside the text file and in the txt file name don't match
file_name_no_match <- na.omit (undup_anns$ selec.file[! sapply (seq_len (nrow (undup_anns)), function (x) grepl (gsub (".wav" , "" , undup_anns$ sound.files[x]), undup_anns$ selec.file[x]))])
# get those in which the name doesnt match the expected
weird_files <- ann_files[! grepl ("Table.1.selections.txt" , ann_files$ file) | ann_files$ file %in% c (unlist (.Options$ Rraven), file_name_no_match), ]
weird_files$ problem <- if (nrow (weird_files) > 0 ) "file name" else vector ()
weird_files$ problem[weird_files$ file %in% unlist (.Options$ Rraven)] <- "Empty file"
weird_files$ problem[weird_files$ file %in% file_name_no_match] <- "Species name in txt file name and Begin column dont match"
weird_files$ problem[weird_files$ file %in% unlist (.Options$ Rraven)] <- "Empty file"
# keep only those with "good" names
ann_files <- ann_files[! ann_files$ file %in% weird_files$ file, ]
# counts per family
fam_count <- aggregate (species ~ family, data = rec_data, length)
names (fam_count)[2 ] <- "total"
fam_count$ annotated <- sapply (fam_count$ family, function (x)
sum (ann_files$ family == x))
fam_count$ prop.annotated <- round (fam_count$ annotated / fam_count$ total, 2 )
fam_count$ assinged.to <- sapply (fam_count$ family, function (x)
paste (unique (rec_data$ assigned_to[rec_data$ family == x]), collapse = "/" ))
fam_count$ assinged.to[fam_count$ assinged.to == "NA" ] <- "not assigned"
weird_files$ assinged.to <- sapply (weird_files$ family, function (x)
paste (unique (rec_data$ assigned_to[rec_data$ family == x]), collapse = "/" ))
# check_sound_files()
anns <- anns[, c ("sound.files" , "selec" , "start" , "end" , "bottom.freq" , "top.freq" , "selec.file" , "family" , "species" , "element" , "song" )]
anns <- anns[anns$ selec.file %in% ann_files$ file, ]
#
#
# anns$family <- sapply(anns$species, function(x) rec_data$family[rec_data$species == x][1])
anns$ assinged.to <- sapply (anns$ f, function (x)
paste (unique (rec_data$ assigned_to[rec_data$ family == x]), collapse = "/" ))
all (ann_files$ species %in% rec_data$ species)
cs <- check_sels (anns, parallel = 20 , pb= TRUE , fix.selec = TRUE , path = path_sound_files)
file_info <- info_sound_files (parallel = 20 , path = path_sound_files, skip.error = TRUE )
file_info$ species <- gsub ("_" , " " , sapply (strsplit (file_info$ sound.files, "-" ), "[" , 1 ))
file_info$ family <- sapply (file_info$ species, function (x) rec_data$ family[rec_data$ species == x][1 ])
as.data.frame (table (cs$ check.res[cs$ check.res != "OK" ]))
unique (cs$ sound.files[cs$ check.res == "sound file not found" ])
# exp_raven(cs, path = "./data/processed", sound.file.path = "/media/m/Seagate Portable Drive/bird_song_recordings/consolidated_files",file.name = "combined_annotations_22-03-2023")
saveRDS (list (cs = cs, anns = anns, rec_data = rec_data, ann_files = ann_files, weird_files = weird_files, fam_count = fam_count, file_info = file_info, dup_anns = dup_anns), "./data/processed/annotation_status_results.RDS" )
### FIND PROBLEMATIC ANNOTATIONS ###########
# re run manually
prev_prob <- read.csv ("./data/processed/problematic_annotations.csv" )
write.csv (prev_prob, "./data/processed/prev_problematic_annotations.csv" , row.names = FALSE )
problematic_sels <- cs[cs$ check.res != "OK" , ]
# weird duration or frequency range
long_anns <- cs[cs$ end - cs$ start > 10 & ! cs$ family %in% c ("Tinamidae" , "Accipitridae" , "Falconidae" ), ]
if (nrow (long_anns) > 0 )
long_anns$ check.res <- "longer than 10 s"
broad_anns <- cs[cs$ top.freq - cs$ bottom.freq > 10 , ]
broad_anns$ check.res <- "frequency range wider than 10 kHz"
song_na <- cs[is.na (cs$ song), ]
song_na$ check.res <- "NAs in 'song'"
elem_na <- cs[is.na (cs$ element), ]
elem_na$ check.res <- "NAs in 'element'"
problematic_sels <- rbind (problematic_sels, long_anns, broad_anns, song_na, elem_na)
problematic_sels <- problematic_sels[, c ("sound.files" ,"family" , "assinged.to" , "selec" , "check.res" )]
problematic_sels$ spectrograms <- ""
names (problematic_sels) <- c ("sound.file" , "family" , "assinged.to" , "selection" , "problem" , "spectrograms" )
problematic_sels <- problematic_sels[order (problematic_sels$ family, problematic_sels$ sound.file, problematic_sels$ selection), ]
sf <- gsub (" \\ .wav" , "" , problematic_sels$ sound.file)
sf <- gsub ("_" , "%5F" , sf)
sf <- gsub ("-" , "%2D" , sf)
problematic_sels <- problematic_sels[order (problematic_sels$ sound.file, problematic_sels$ selection), ]
# find overlapping songs
song_anns <- song_analysis (X = anns, parallel = 14 )
song_anns <- check_sels (song_anns, parallel = 14 , fix.selec = TRUE , path = path_sound_files)
# song_anns <- song_anns[song_anns$check.res == "OK",]
song_anns <- song_anns[song_anns$ end - song_anns$ start > 0 , ]
ovlp_songs <- overlapping_sels (song_anns[song_anns$ check.res == "OK" & ! is.na (song_anns$ song), ], parallel = 14 )
ovlp_songs <- ovlp_songs[! is.na (ovlp_songs$ ovlp.sels), ]
ovlp_songs$ family <- sapply (ovlp_songs$ sound.files, function (x) anns$ family[anns$ sound.files == x][1 ])
ovlp_songs$ problem <- "overlapping songs"
ovlp_songs$ assinged.to <- sapply (ovlp_songs$ sound.files, function (x) anns$ assinged.to[anns$ sound.files == x][1 ])
ovlp_song_probs <- aggregate (song ~ sound.files + family + assinged.to + problem, data = ovlp_songs, unique)
ovlp_song_probs$ family <- sapply (ovlp_song_probs$ sound.file, function (x) anns$ family[anns$ sound.files == x][1 ])
ovlp_song_probs$ selections <- sapply (seq_len (nrow (ovlp_song_probs)), function (x) paste (na.omit (anns$ selec[anns$ sound.files == ovlp_songs$ sound.files[x] & anns$ song == ovlp_songs$ song[x]]), collapse = ", " ))
agg_prob_sels <- aggregate (selection ~ sound.file + family + assinged.to + problem, data = problematic_sels, unique)
agg_prob_sels$ sel.count <- sapply (agg_prob_sels$ sound.file, function (x) sum (problematic_sels$ sound.file == x))
ovlp_song_probs$ sel.count <- sapply (ovlp_song_probs$ sound.file, function (x) sum (ovlp_songs$ sound.files == x))
names (agg_prob_sels)[5 ] <- "selections"
ovlp_song_probs$ song <- NULL
names (ovlp_song_probs)[1 ] <- "sound.file"
agg_prob_sels <- rbind (agg_prob_sels, ovlp_song_probs)
agg_prob_sels <- data.frame (lapply (agg_prob_sels, as.character), stringsAsFactors= FALSE )
agg_prob_sels$ fixed <- ""
# Copy problematic to folder in google drive
#### MAKE SURE SPECTROGRAMS EXISTS FOR ALL ANNOTATED FILES (if not run spectrogram creator chunk below)
# search the jpeg image files that match the species name and copy those files to a new folder
# out <- warbleR:::pblapply_wrblr_int(unique(agg_prob_sels$sound.file), function(x){
# jpegs <- list.files(path = ann_spec_path, pattern = gsub(".wav", "", x), recursive = TRUE, full.names = TRUE)
#
# file.copy(from = jpegs, file.path("/home/m/Insync/marceloa27@gmail.com/Google Drive/bird_song_evolution/annotated_spectrograms_problematic/", basename(jpegs)), overwrite = TRUE)
#
# }
# )
#
# problm_image_drive <- googledrive::drive_ls(path = "bird_song_evolution/annotated_spectrograms_problematic/")
#
# problm_image_drive
# image file page 1
# agg_prob_sels$jpeg_p1 <- gsub(".wav", "--p1.jpeg",agg_prob_sels$sound.file)
# google drive link
# agg_prob_sels$gd_id <- sapply(agg_prob_sels$sound.file, function(x){
#
# as.vector(problm_image_drive$id[grep(gsub(".wav", "--p", x), problm_image_drive$name)])[1]
# })
# agg_prob_sels$gd_link <- ifelse(!is.na(agg_prob_sels$gd_id), paste0("https://drive.google.com/file/d/", agg_prob_sels$gd_id, "/view?usp=drive_link"), NA)
sptrgm_files <- list.files (path = "~/Dropbox/Projects/bird_song_evolution/data/processed/annotated_spectrograms/pooled/" , pattern = ".jpeg$" , full.names = FALSE )
# agg_prob_sels$spectrograms <- sapply(agg_prob_sels$sound.file, function(x){
#
# img <- grep(gsub(".wav$", "", x), sptrgm_files, value = TRUE)[1]
#
#
# out <- if (length(img) > 0) paste0("file:///home/m/Dropbox/Projects/bird_song_evolution/data/processed/annotated_spectrograms/pooled/", x) else NA
# return(out)
# })
agg_prob_sels$ files <-
kableExtra:: cell_spec ("link" , "html" , link = paste0 ("https://6f33fa7f78ea46e2aaca-my.sharepoint.com/personal/marcelo_araya_ucr_ac_cr/_layouts/15/onedrive.aspx?q=" ,gsub ("_" , "%5F" , sapply (strsplit (agg_prob_sels$ sound.file, "-" ), "[" , 1 )), "&view=7&id=%2Fpersonal%2Fmarcelo%5Faraya%5Fucr%5Fac%5Fcr%2FDocuments&searchScope=folder" ), new_tab = TRUE )
write.csv (agg_prob_sels, "./data/processed/problematic_annotations.csv" , row.names = FALSE )
```
# Changes in annnotation files
```{r}
#| eval: true
changes <- readRDS ("./data/processed/changes_in_annotation_files.RDS" )
```
- `r length(changes$unchanged)` files unchanged
- `r length(changes$added)` files added
- `r length(changes$deleted)` files deleted
- `r length(changes$changed)` files updated
## New files by family
```{r}
#| eval: true
added_df <- data.frame (family = dirname (changes$ added), file = basename (changes$ added))
fam_count_added <- as.data.frame (table (added_df$ family))
names (fam_count_added) <- c ("Family" , "Added files" )
fam_count_added_kbl <- kableExtra:: kbl (
fam_count_added,
row.names = FALSE ,
escape = FALSE ,
format = "html" ,
digits =
)
fam_count_added_kbl <-
kableExtra:: kable_styling (
fam_count_added_kbl,
bootstrap_options = c ("striped" , "hover" , "condensed" , "responsive" ),
full_width = FALSE ,
font_size = 12
)
fam_count_added_kbl
```
# Descriptive stats
```{r}
#| eval: true
attach (readRDS ("./data/processed/annotation_status_results.RDS" ))
```
- `r length(unique(anns$sound.files))` species from `r length(unique(ann_files$family))` families already annotated (`r paste0(round(nrow(ann_files) / nrow(rec_data), 2) * 100, "%")` of all available especies; `r round(sum(file_info$duration[file_info$sound.files %in% anns$sound.files]) / 3600, 2)` recording hours; `r nrow(anns)` annotations)
- `r sum(fam_count$prop.annotated >= 0.9)` families with 90% of all available especies annotated (`r round(sum(fam_count$prop.annotated >= 0.9) / nrow(fam_count), 2) * 100` % of all families)
- `r sum(!file_info$sound.files %in% anns$sound.files)` species have not been annotated (`r round(sum(!file_info$sound.files %in% anns$sound.files) / nrow(rec_data) *100, 2)` %; `r round(sum(file_info$duration[!file_info$sound.files %in% anns$sound.files]) / 3600, 0)` recording hours; `r round(92/ (351 + 92) * 100, 0)` % of the total recording hours)
-
# Species per family
::: {.panel-tabset}
## Done
All recordings in these families have been annotated (`r sum(fam_count$prop.annotated == 1)` families, `r sum(fam_count$total[fam_count$prop.annotated == 1])` recordings, `r round((sum(file_info$duration[file_info$sound.files %in% anns$sound.files[anns$family %in% fam_count$family[fam_count$prop.annotated == 1]]])) / 3600, 2)` recording hours)
```{r}
#| eval: true
sub_fam_count <- fam_count[fam_count$ prop.annotated == 1 ,]
prop_analzyed <- sub_fam_count$ prop.annotated
sub_fam_count$ prop.annotated <-
ifelse (
sub_fam_count$ prop.annotated > 0.9 ,
kableExtra:: cell_spec (
sub_fam_count$ prop.annotated,
"html" ,
color = "white" ,
background = "green" ,
bold = TRUE ,
font_size = 12 ,
new_tab = TRUE
),
ifelse (
sub_fam_count$ prop.annotated > 0.5 ,
kableExtra:: cell_spec (
sub_fam_count$ prop.annotated,
"html" ,
color = "black" ,
background = "yellow" ,
bold = TRUE ,
font_size = 12 ,
new_tab = TRUE
),
kableExtra:: cell_spec (
sub_fam_count$ prop.annotated,
"html" ,
color = "white" ,
background = "red" ,
bold = TRUE ,
font_size = 12 ,
new_tab = TRUE
)
)
)
sub_fam_count$ assinged.to <-
ifelse (
sub_fam_count$ assinged.to == "not assigned" ,
kableExtra:: cell_spec (
sub_fam_count$ assinged.to,
"html" ,
color = "white" ,
background = "red" ,
bold = TRUE ,
font_size = 12 ,
new_tab = TRUE
),
kableExtra:: cell_spec (sub_fam_count$ assinged.to, "html" , new_tab = TRUE )
)
sub_fam_count$ recordings <-
kableExtra:: cell_spec ("link" , "html" , link = paste0 ("https://6f33fa7f78ea46e2aaca-my.sharepoint.com/personal/marcelo_araya_ucr_ac_cr/_layouts/15/onedrive.aspx?ga=1&id=%2Fpersonal%2Fmarcelo%5Faraya%5Fucr%5Fac%5Fcr%2FDocuments%2Fbird%5Fsong%5Fevolution%2Fbird%5Fsong%5Frecordings%2F" , sub_fam_count$ family), new_tab = TRUE )
sub_fam_count$ spectrograms <- ifelse (prop_analzyed > 0 ,
kableExtra:: cell_spec ("link" , "html" , link = paste0 ("https://6f33fa7f78ea46e2aaca-my.sharepoint.com/personal/marcelo_araya_ucr_ac_cr/_layouts/15/onedrive.aspx?ga=1&id=%2Fpersonal%2Fmarcelo%5Faraya%5Fucr%5Fac%5Fcr%2FDocuments%2Fbird%5Fsong%5Fevolution%2Fannotated%5Fspectrograms%2F" , sub_fam_count$ family), new_tab = TRUE ), "" )
sub_fam_count_kbl <- kableExtra:: kbl (
sub_fam_count,
row.names = FALSE ,
escape = FALSE ,
format = "html" ,
digits =
)
#
# sub_fam_count_kbl <-
# kableExtra::row_spec(
# kable_input = sub_fam_count_kbl,
# row = which(prop_analzyed == 1),
# background = grDevices::adjustcolor("#6DCD59FF", alpha.f = 0.3)
# )
sub_fam_count_kbl <-
kableExtra:: kable_styling (
sub_fam_count_kbl,
bootstrap_options = c ("striped" , "hover" , "condensed" , "responsive" ),
full_width = FALSE ,
font_size = 12
)
sub_fam_count_kbl
```
## Almost done
90%-99% of recordings in these families have been annotated (`r sum(fam_count$prop.annotated >= 0.9 & fam_count$prop.annotated < 1)` families, `r length(unique(anns$sound.files[anns$family %in% fam_count$family[fam_count$prop.annotated >= 0.9 & fam_count$prop.annotated < 1]]))` recordings, `r round((sum(file_info$duration[file_info$sound.files %in% anns$sound.files[anns$family %in% fam_count$family[fam_count$prop.annotated >= 0.9 & fam_count$prop.annotated < 1]]])) / 3600, 2)` recording hours)
```{r}
#| eval: true
sub_fam_count <- fam_count[fam_count$ prop.annotated >= 0.9 & fam_count$ prop.annotated < 1 ,]
prop_analzyed <- sub_fam_count$ prop.annotated
sub_fam_count$ prop.annotated <-
ifelse (
sub_fam_count$ prop.annotated > 0.9 ,
kableExtra:: cell_spec (
sub_fam_count$ prop.annotated,
"html" ,
color = "white" ,
background = "green" ,
bold = TRUE ,
font_size = 12 ,
new_tab = TRUE
),
ifelse (
sub_fam_count$ prop.annotated > 0.5 ,
kableExtra:: cell_spec (
sub_fam_count$ prop.annotated,
"html" ,
color = "black" ,
background = "yellow" ,
bold = TRUE ,
font_size = 12 ,
new_tab = TRUE
),
kableExtra:: cell_spec (
sub_fam_count$ prop.annotated,
"html" ,
color = "white" ,
background = "red" ,
bold = TRUE ,
font_size = 12 ,
new_tab = TRUE
)
)
)
sub_fam_count$ assinged.to <-
ifelse (
sub_fam_count$ assinged.to == "not assigned" ,
kableExtra:: cell_spec (
sub_fam_count$ assinged.to,
"html" ,
color = "white" ,
background = "red" ,
bold = TRUE ,
font_size = 12 ,
new_tab = TRUE
),
kableExtra:: cell_spec (sub_fam_count$ assinged.to, "html" , new_tab = TRUE )
)
sub_fam_count$ recordings <-
kableExtra:: cell_spec ("link" , "html" , link = paste0 ("https://6f33fa7f78ea46e2aaca-my.sharepoint.com/personal/marcelo_araya_ucr_ac_cr/_layouts/15/onedrive.aspx?ga=1&id=%2Fpersonal%2Fmarcelo%5Faraya%5Fucr%5Fac%5Fcr%2FDocuments%2Fbird%5Fsong%5Fevolution%2Fbird%5Fsong%5Frecordings%2F" , sub_fam_count$ family), new_tab = TRUE )
sub_fam_count$ spectrograms <- ifelse (prop_analzyed > 0 ,
kableExtra:: cell_spec ("link" , "html" , link = paste0 ("https://6f33fa7f78ea46e2aaca-my.sharepoint.com/personal/marcelo_araya_ucr_ac_cr/_layouts/15/onedrive.aspx?ga=1&id=%2Fpersonal%2Fmarcelo%5Faraya%5Fucr%5Fac%5Fcr%2FDocuments%2Fbird%5Fsong%5Fevolution%2Fannotated%5Fspectrograms%2F" , sub_fam_count$ family), new_tab = TRUE ), "" )
sub_fam_count_kbl <- kableExtra:: kbl (
sub_fam_count,
row.names = FALSE ,
escape = FALSE ,
format = "html" ,
digits =
)
sub_fam_count_kbl <-
kableExtra:: row_spec (
kable_input = sub_fam_count_kbl,
row = which (prop_analzyed == 1 ),
background = grDevices:: adjustcolor ("#6DCD59FF" , alpha.f = 0.3 )
)
sub_fam_count_kbl <-
kableExtra:: kable_styling (
sub_fam_count_kbl,
bootstrap_options = c ("striped" , "hover" , "condensed" , "responsive" ),
full_width = FALSE ,
font_size = 12
)
sub_fam_count_kbl
```
## Halfway
1%-90% of recordings in these families have been annotated (`r sum(fam_count$prop.annotated >= 0.0001 & fam_count$prop.annotated < 0.9)` families, `r length(unique(anns$sound.files[anns$family %in% fam_count$family[fam_count$prop.annotated >= 0.0001 & fam_count$prop.annotated < 0.9]]))` recordings, `r round((sum(file_info$duration[file_info$sound.files %in% anns$sound.files[anns$family %in% fam_count$family[fam_count$prop.annotated >= 0.0001 & fam_count$prop.annotated < 0.9]]])) / 3600, 2)` recording hours)
```{r}
#| eval: true
sub_fam_count <- fam_count[fam_count$ prop.annotated >= 0.0001 & fam_count$ prop.annotated < 0.90 ,]
prop_analzyed <- sub_fam_count$ prop.annotated
sub_fam_count$ prop.annotated <-
ifelse (
sub_fam_count$ prop.annotated > 0.9 ,
kableExtra:: cell_spec (
sub_fam_count$ prop.annotated,
"html" ,
color = "white" ,
background = "green" ,
bold = TRUE ,
font_size = 12 ,
new_tab = TRUE
),
ifelse (
sub_fam_count$ prop.annotated > 0.5 ,
kableExtra:: cell_spec (
sub_fam_count$ prop.annotated,
"html" ,
color = "black" ,
background = "yellow" ,
bold = TRUE ,
font_size = 12 ,
new_tab = TRUE
),
kableExtra:: cell_spec (
sub_fam_count$ prop.annotated,
"html" ,
color = "white" ,
background = "red" ,
bold = TRUE ,
font_size = 12 ,
new_tab = TRUE
)
)
)
sub_fam_count$ assinged.to <-
ifelse (
sub_fam_count$ assinged.to == "not assigned" ,
kableExtra:: cell_spec (
sub_fam_count$ assinged.to,
"html" ,
color = "white" ,
background = "red" ,
bold = TRUE ,
font_size = 12 ,
new_tab = TRUE
),
kableExtra:: cell_spec (sub_fam_count$ assinged.to, "html" , new_tab = TRUE )
)
sub_fam_count$ recordings <-
kableExtra:: cell_spec ("link" , "html" , link = paste0 ("https://6f33fa7f78ea46e2aaca-my.sharepoint.com/personal/marcelo_araya_ucr_ac_cr/_layouts/15/onedrive.aspx?ga=1&id=%2Fpersonal%2Fmarcelo%5Faraya%5Fucr%5Fac%5Fcr%2FDocuments%2Fbird%5Fsong%5Fevolution%2Fbird%5Fsong%5Frecordings%2F" , sub_fam_count$ family), new_tab = TRUE )
sub_fam_count$ spectrograms <- ifelse (prop_analzyed > 0 ,
kableExtra:: cell_spec ("link" , "html" , link = paste0 ("https://6f33fa7f78ea46e2aaca-my.sharepoint.com/personal/marcelo_araya_ucr_ac_cr/_layouts/15/onedrive.aspx?ga=1&id=%2Fpersonal%2Fmarcelo%5Faraya%5Fucr%5Fac%5Fcr%2FDocuments%2Fbird%5Fsong%5Fevolution%2Fannotated%5Fspectrograms%2F" , sub_fam_count$ family), new_tab = TRUE ), "" )
sub_fam_count_kbl <- kableExtra:: kbl (
sub_fam_count,
row.names = FALSE ,
escape = FALSE ,
format = "html" ,
digits =
)
sub_fam_count_kbl <-
kableExtra:: row_spec (
kable_input = sub_fam_count_kbl,
row = which (prop_analzyed == 1 ),
background = grDevices:: adjustcolor ("#6DCD59FF" , alpha.f = 0.3 )
)
sub_fam_count_kbl <-
kableExtra:: kable_styling (
sub_fam_count_kbl,
bootstrap_options = c ("striped" , "hover" , "condensed" , "responsive" ),
full_width = FALSE ,
font_size = 12
)
sub_fam_count_kbl
```
## Missing
0 recordings in these families have been annotated (`r sum(fam_count$prop.annotated == 0)` families)
```{r}
#| eval: true
sub_fam_count <- fam_count[fam_count$ prop.annotated == 0 ,]
prop_analzyed <- sub_fam_count$ prop.annotated
sub_fam_count$ prop.annotated <-
ifelse (
sub_fam_count$ prop.annotated > 0.9 ,
kableExtra:: cell_spec (
sub_fam_count$ prop.annotated,
"html" ,
color = "white" ,
background = "green" ,
bold = TRUE ,
font_size = 12 ,
new_tab = TRUE
),
ifelse (
sub_fam_count$ prop.annotated > 0.5 ,
kableExtra:: cell_spec (
sub_fam_count$ prop.annotated,
"html" ,
color = "black" ,
background = "yellow" ,
bold = TRUE ,
font_size = 12 ,
new_tab = TRUE
),
kableExtra:: cell_spec (
sub_fam_count$ prop.annotated,
"html" ,
color = "white" ,
background = "red" ,
bold = TRUE ,
font_size = 12 ,
new_tab = TRUE
)
)
)
sub_fam_count$ assinged.to <-
ifelse (
sub_fam_count$ assinged.to == "not assigned" ,
kableExtra:: cell_spec (
sub_fam_count$ assinged.to,
"html" ,
color = "white" ,
background = "red" ,
bold = TRUE ,
font_size = 12 ,
new_tab = TRUE
),
kableExtra:: cell_spec (sub_fam_count$ assinged.to, "html" , new_tab = TRUE )
)
sub_fam_count$ recordings <-
kableExtra:: cell_spec ("link" , "html" , link = paste0 ("https://6f33fa7f78ea46e2aaca-my.sharepoint.com/personal/marcelo_araya_ucr_ac_cr/_layouts/15/onedrive.aspx?ga=1&id=%2Fpersonal%2Fmarcelo%5Faraya%5Fucr%5Fac%5Fcr%2FDocuments%2Fbird%5Fsong%5Fevolution%2Fbird%5Fsong%5Frecordings%2F" , sub_fam_count$ family), new_tab = TRUE )
sub_fam_count$ spectrograms <- ifelse (prop_analzyed > 0 ,
kableExtra:: cell_spec ("link" , "html" , link = paste0 ("https://6f33fa7f78ea46e2aaca-my.sharepoint.com/personal/marcelo_araya_ucr_ac_cr/_layouts/15/onedrive.aspx?ga=1&id=%2Fpersonal%2Fmarcelo%5Faraya%5Fucr%5Fac%5Fcr%2FDocuments%2Fbird%5Fsong%5Fevolution%2Fannotated%5Fspectrograms%2F" , sub_fam_count$ family), new_tab = TRUE ), "" )
sub_fam_count_kbl <- kableExtra:: kbl (
sub_fam_count,
row.names = FALSE ,
escape = FALSE ,
format = "html" ,
digits =
)
sub_fam_count_kbl <-
kableExtra:: row_spec (
kable_input = sub_fam_count_kbl,
row = which (prop_analzyed == 1 ),
background = grDevices:: adjustcolor ("#6DCD59FF" , alpha.f = 0.3 )
)
sub_fam_count_kbl <-
kableExtra:: kable_styling (
sub_fam_count_kbl,
bootstrap_options = c ("striped" , "hover" , "condensed" , "responsive" ),
full_width = FALSE ,
font_size = 12
)
sub_fam_count_kbl
```
:::
# Double-checking annotations
- `r sum(weird_files$problem == "file name" | weird_files$problem == "Species name in txt file name and Begin column dont match")` weirdly named file(s) and `r sum(weird_files$problem == "Empty file")` empty file(s):
```{r}
#| eval: true
weird_files
```
- Duplicated annotation files
```{r}
#| eval: true
#|
kableExtra:: kable_styling (
kable (dup_anns),
bootstrap_options = c ("striped" , "hover" , "condensed" , "responsive" ),
full_width = FALSE ,
font_size = 12
)
```
```{r}
#| eval: true
agg_prob_sels <- read.csv ("./data/processed/problematic_annotations.csv" )
agg_prob_sels <- agg_prob_sels[grep ("not found|read" , agg_prob_sels$ problem, invert = TRUE ), ]
agg_prob_sels$ fixed <- ifelse (is.na (agg_prob_sels$ fixed), "" , agg_prob_sels$ fixed)
# agg_prob_sels$files <- ifelse(!is.na(agg_prob_sels$files),
# kableExtra::cell_spec("link", "html", link = agg_prob_sels$files, new_tab = TRUE), "")
```
The following table show selections ('selections' column) within sound files ('sound.file' column) that are problematic (described in 'problem' column). Please check if and fix accordingly. If a Raven selection table is fixed, please upload it to google drive, but make sure the old copy is removed first. After fixing annotations report that into [ this data sheet ](https://docs.google.com/spreadsheets/d/1guDbKEVPB-scVQidTOVoPEn1YbXASBM7_cs6CpEYWUM/edit?usp=sharing) .
- `r length(unique(agg_prob_sels$sound.file))` files
```{r}
#| eval: true
agg_prob_sels <- agg_prob_sels[order (agg_prob_sels$ family, agg_prob_sels$ sound.file, agg_prob_sels$ problem), ]
agg_prob_sels <- agg_prob_sels[agg_prob_sels$ problem != "frequency range wider than 10 kHz" ,]
problematic_sels_kbl <- kableExtra:: kbl (
agg_prob_sels[, c ("sound.file" , "family" , "problem" , "selections" , "files" )],
row.names = FALSE ,
escape = FALSE ,
format = "html" ,
digits = 3
)
problematic_sels_kbl <-
kableExtra:: kable_styling (
problematic_sels_kbl,
bootstrap_options = c ("striped" , "hover" , "condensed" , "responsive" ),
full_width = FALSE ,
font_size = 12
)
problematic_sels_kbl
```
# Created annotated spectrograms
```{r}
#| eval: false
warbleR_options (wav.path = path_sound_files)
source ("~/Dropbox/R_package_testing/warbleR/R/full_spectrograms.R" )
# create folders for each family
for (i in unique (rec_data$ family))
if (! file.exists (file.path (ann_spec_path, i)))
dir.create (file.path (ann_spec_path, i))
new_and_changed_files <- gsub (".Table.1.selections.txt" , ".wav" , basename (c (changes$ added, changes$ changed)))
new_and_changed_files <- unique (c (new_and_changed_files, prev_prob$ sound.file))
# out <- warbleR:::pblapply_wrblr_int(agg_prob_sels$sound.file, pbar = TRUE, cl = 10, function(x)
out <- warbleR::: pblapply_wrblr_int (unique (cs$ sound.files), pbar = TRUE , cl = 1 , function (x)
{
sub_anns <- cs[cs$ sound.files == x, ]
frq_range <- range (c (sub_anns$ bottom.freq, sub_anns$ top.freq))
frq_range[1 ] <- frq_range[1 ] - 1
if (frq_range[1 ] < 0 ) frq_range[1 ] <- 0
frq_range[2 ] <- frq_range[2 ] + ((frq_range[2 ] - frq_range[1 ]) / 3 )
if (frq_range[2 ] < 6 ) frq_range[2 ] <- 6
if (frq_range[2 ] > 22.05 ) frq_range[2 ] <- 22.05
sub_anns$ selec <- paste (sub_anns$ selec, sub_anns$ element, sep = "-" )
# print(file.path(sub_anns$family[1], x))
# print(x)
# unlink(list.files(path = file.path(ann_spec_path, sub_anns$family[1]), pattern = gsub(".wav", "", x)))
a <- NA
# if(!any(file.exists(file.path(ann_spec_path, sub_anns$family[1], paste0(gsub(".wav", "", x), paste0("--p",1:40,".jpeg"))))))
a <- try (
full_spectrograms (
X = sub_anns,
flim = frq_range,
sxrow = 4 ,
rows = 10 ,
ovlp = 25 ,
collevels = seq (- 100 , 0 , 5 ),
parallel = 1 ,
overwrite = TRUE ,
dest.path = file.path (ann_spec_path, sub_anns$ family[1 ]),
# dest.path = "/home/m/Dropbox/Projects/bird_song_evolution/data/processed/annotated_spectrograms/problematic/",
song = "song" ,
fast.spec = TRUE ,
horizontal = TRUE ,
pb = F,
only.annotated = TRUE ,
path = path_sound_files
)
, silent = TRUE )
# if (is(a,"try-error"))
# print(file.path(sub_anns$family[1], x))
}
)
sum (! file.exists (file.path (ann_spec_path, cs$ family[! duplicated (cs$ sound.files)], gsub (".wav" , "--p" ,cs$ sound.files[! duplicated (cs$ sound.files)]))))
# consolidate
cns_imgs <- consolidate (path = "/home/m/Dropbox/Projects/bird_song_evolution/data/processed/annotated_spectrograms/by_family/" , dest.path = "/home/m/Dropbox/Projects/bird_song_evolution/data/processed/annotated_spectrograms/pooled/" , save.csv = FALSE , file.ext = ".jpeg$" )
# check if all annotations have images
ann_spec_files <- list.files (path = "/home/m/Dropbox/Projects/bird_song_evolution/data/processed/annotated_spectrograms/pooled/" , pattern = ".jpeg$" )
# extract species name, removing any thing link p1.jpeg or p2.jpeg and so on
spec_spp <- unique (gsub ("--p[0-9]+ \\ .jpeg" , "" , ann_spec_files))
missing <- setdiff (unique (cs$ sound.files), paste0 (spec_spp, ".wav" ))
```
# Add new recordings
```{r}
#| eval: FALSE
new_data_no_cuts <- readxl:: read_excel ("./data/raw/Marcelo Files - no Production cut.xlsx" )
new_data_cuts <- readxl:: read_excel ("./data/raw/Marcelo Files - Production cuts.xlsx" )
new_data_cuts <- new_data_cuts[! new_data_cuts$ ` Scientific Name ` %in% rec_data$ species,]
nrow (new_data_cuts)
new_data_no_cuts <- new_data_no_cuts[! new_data_no_cuts$ ` Parent Species ` %in% rec_data$ species,]
nrow (new_data_no_cuts)
head (new_data_no_cuts)
new_data_no_cuts$ source <- "not publication cut"
new_data_cuts$ source <- "publication cut"
clm <- read.csv ("./data/raw/NEW_Clements-Checklist-v2022-October-2022.csv" )
clm.sp <- clm[clm$ category == "species" , ]
clm.sp$ genus <- sapply (clm.sp$ scientific.name, function (x) strsplit (x, " " )[[1 ]][1 ])
new_data_cuts$ genus <- sapply (new_data_cuts$ ` Scientific Name ` , function (x) strsplit (x, " " )[[1 ]][1 ])
new_data_cuts$ family <- sapply (seq_len (nrow (new_data_cuts)), function (x){ fam <- clm.sp$ family[clm.sp$ genus == new_data_cuts$ genus[x]][1 ]
fam <- if (length (fam) == 0 ) NA else strsplit (fam, " \\ (" )[[1 ]][1 ]
return (fam)
})
new_data_no_cuts$ genus <- sapply (new_data_no_cuts$ ` Parent Species ` , function (x) strsplit (x, " " )[[1 ]][1 ])
new_data_no_cuts$ family <- sapply (seq_len (nrow (new_data_no_cuts)), function (x){ fam <- clm.sp$ family[clm.sp$ genus == new_data_no_cuts$ genus[x]][1 ]
fam <- if (length (fam) == 0 ) NA else strsplit (fam, " \\ (" )[[1 ]][1 ]
return (fam)
})
new_data_cuts$ Orginal.Scientific.Name <- new_data_cuts$ Scientific.Name <- new_data_cuts$ ` Scientific Name `
new_data_no_cuts$ Orginal.Scientific.Name <- new_data_no_cuts$ ` Parent Species `
new_data_no_cuts$ ` Scientific Name ` <- new_data_cuts$ ` Scientific Name ` <- NULL
new_data_no_cuts$ Scientific.Name <- new_data_no_cuts$ ` Parent Species `
new_data_no_cuts$ file_url <- paste0 ("https://macaulaylibrary.org/asset/" , new_data_no_cuts$ ` ML Catalog Number ` )
new_data_cuts$ file_url <- paste0 ("https://macaulaylibrary.org/asset/" , new_data_cuts$ ` ML Catalog Number ` )
new_data_no_cuts$ species_ebird_url <- paste0 ("https://ebird.org/species/" , new_data_no_cuts$ ` eBird Species Code ` )
new_data_cuts$ species_ebird_url <- paste0 ("https://ebird.org/species/" , new_data_cuts$ SpeciesCode)
new_data_cuts$ ` Common Name ` <- new_data_cuts$ ` English Name `
new_data_cuts$ Behaviors <- NA
names (new_data_no_cuts)
names (new_data_cuts)
new_data_cuts$ Recordist <- paste (new_data_cuts$ FirstName, new_data_cuts$ LastName)
new_data_cuts$ Year <- new_data_cuts$ Month <- new_data_cuts$ Day <- new_data_cuts$ Locality <- new_data_cuts$ ` Loc ID ` <- new_data_cuts$ ` Country State County ` <- new_data_cuts$ ` Taxon Category ` <- NA
common_colums <- intersect (names (new_data_no_cuts), names (new_data_cuts))
second_batch <- rbind (new_data_no_cuts[, common_colums], new_data_cuts[, common_colums])
second_batch$ sound.files <- paste0 (gsub (" " , "_" , second_batch$ Scientific.Name), "-ML" , second_batch$ ` ML Catalog Number ` , ".wav" )
# ohun::feature_acoustic_data(path = "~/Downloads/combined/")
# warbleR::info_sound_files(path = "~/Downloads/combined/")
# this code is not organized (is a mess!)
fls <- list.files (path = "~/Downloads/combined/" )
table (substr (fls, nchar (fls) - 3 , nchar (fls)))
second_batch <- second_batch[second_batch$ ` ML Catalog Number ` %in% gsub (".m4a|.mp3|.wav" , "" , fls),]
nrow (second_batch)
sub_fls <- fls[gsub (".m4a|.mp3|.wav" , "" , fls) %in% second_batch$ ` ML Catalog Number ` ]
table (substr (sub_fls, nchar (sub_fls) - 3 , nchar (sub_fls)))
sum (! second_batch$ ` ML Catalog Number ` %in% gsub (".m4a|.mp3|.wav" , "" , fls))
mp3_2_wav (samp.rate = 44.1 , bit.depth = 16 , path = "~/Downloads/combined/" , overwrite = TRUE , dest.path = "~/Downloads/combined/" )
# delete mp3s
# setwd("~/Downloads/combined/")
for (i in grep ("m4a$" , fls, value = TRUE ))
{
cll <- paste0 ("ffmpeg -i " , i, " " , gsub ("m4a$" , "wav" , i))
system (cll)
}
write.csv (second_batch, "./data/raw/sound_files_and_extended_metadata_second_batch.csv" , row.names = FALSE )
second_batch <- second_batch[, c ("family" , "Scientific.Name" , "species_ebird_url" , "ML Catalog Number" , "sound.files" , "Common Name" , "Orginal.Scientific.Name" , "Behaviors" , "file_url" )]
write.csv (second_batch, "./data/raw/sound_files_metadata_second_batch.csv" , row.names = FALSE )
fix_wavs (samp.rate = 44.1 , bit.depth = 16 , path = "~/Downloads/combined/" )
fr <- file.rename (
from = file.path ("~/Downloads/combined/converted_sound_files" , paste0 (second_batch$ ` ML Catalog Number ` , ".wav" )),
to = file.path ("~/Downloads/combined/converted_sound_files" , second_batch$ sound.files))
nrow (second_batch)
all (fr)
for (i in na.omit (unique (second_batch$ family))){
print (i)
if (! dir.exists (file.path ("/run/user/1000/gvfs/smb-share:server=cinnas.local,share=neurobiología/marcelo_araya/bird_song_evolution/by_family" , i)))
dir.create (file.path ("/run/user/1000/gvfs/smb-share:server=cinnas.local,share=neurobiología/marcelo_araya/bird_song_evolution/by_family" , i))
frm <- file.path ("~/Downloads/combined/converted_sound_files" , na.omit (second_batch$ sound.files[second_batch$ family == i]))
tu <- file.path ("/run/user/1000/gvfs/smb-share:server=cinnas.local,share=neurobiología/marcelo_araya/bird_song_evolution/by_family" , i, na.omit (second_batch$ sound.files[second_batch$ family == i]))
fr <- file.copy (frm, to = tu)
}
```
<!-- light green box -->
< div class = "alert alert-success" >
# Takeaways {.unnumbered .unlisted}
- Doing good progress
</ div >
<!-- '---' adds a gray vertical line -->
---
<!-- add packages used, system details and versions -->
# Session information {.unnumbered .unlisted}
```{r session info}
#| echo: FALSE
sessionInfo ()
```