Optimize detection for 55 kHz single cut with more signals

split_sels <- read.csv("./data/processed/split_manual_annotations_5min.csv")

sels_55 <- split_sels[split_sels$peak.freq > 30, ]

table(sels_55$sound.files)

sels_55 <- split_sels[split_sels$sound.files == "T0000003-7.wav", ]

# merged overlapping selections
sels_55 <- merged_overlaps(sels_55)

oed <- optimize_energy_detector(reference = sels_55, threshold = seq(0.01, 0.02, 0.005), min.duration = c(0.01, 0.001), ssmooth = c(5, 10), hold.time = c(0.02, 0.05, seq(0.08, 0.1, 0.01)), path = .Options$warbleR$path, thinning = c(0.5), parallel = 18, bp = c(35, 90), max.duration = c(0.15, 0.2, 0.22))

saveRDS(oed, "./data/processed/optimization_55kHz.RDS")
optim_ad <- readRDS("./data/processed/optimization_55kHz.RDS")

# print dynamic table
oa_DT <- datatable(optim_ad, editable = list(
  target = 'row'
), rownames = FALSE, style = "bootstrap",  filter = 'top', options = list(
  pageLength = 100, autoWidth = TRUE, dom = 'ft'
), autoHideNavigation = TRUE, escape = FALSE)

formatRound(table = oa_DT, columns = sapply(optim_ad, is.numeric), 3)

Optimize detection for 55 kHz 5 cuts with more signals

split_sels <- read.csv("./data/processed/split_manual_annotations_5min.csv")

sels_55 <- split_sels[split_sels$peak.freq > 30, ]

tab <- table(sels_55$sound.files)
sels_55 <- sels_55[sels_55$sound.files %in% names(tab)[tab > 1000], ]

# merged overlapping selections
sels_55 <- merged_overlaps(sels_55)

# best tuning parameters 
oed5 <- optimize_energy_detector(reference = sels_55, files =  unique(sels_55$sound.files), threshold = 0.01, min.duration = c(0.005 ,0.001, 0.0005), ssmooth = c(5, 10), hold.time = c(0.005, 0.01, 0.02), path = .Options$warbleR$path, thinning = 0.5, parallel = 1, bp = c(35, 90), max.duration = c(0.15, 0.175, 0.2), by.sound.file = TRUE)

saveRDS(oed5, "./data/processed/optimization_55kHz_5_cuts.RDS")
optim_ad <- readRDS("./data/processed/optimization_55kHz_5_cuts.RDS")

optim_ad <- summarize_diagnostic(optim_ad, time.diagnostics = TRUE)

oa_DT <- datatable(optim_ad, editable = list(
  target = 'row'
), rownames = FALSE, style = "bootstrap",  filter = 'top', options = list(
  pageLength = 100, autoWidth = TRUE, dom = 'ft'
), autoHideNavigation = TRUE, escape = FALSE)

formatRound(table = oa_DT, columns = sapply(optim_ad, is.numeric), 3)

Optimal detection diagnostic 55 kHz 5 cuts

split_sels <- read.csv("./data/processed/split_manual_annotations_5min.csv")

sels_55 <- split_sels[split_sels$peak.freq > 30, ]

tab <- table(sels_55$sound.files)
sels_55 <- sels_55[sels_55$sound.files %in% names(tab)[tab > 1000], ]

# merged overlapping selections
sels_55 <- merged_overlaps(sels_55)

length(unique(sels_55$sound.files))

# best tuning parameters 
ed5 <- energy_detector(files =  unique(sels_55$sound.files), threshold = 0.01, min.duration = 0.001, ssmooth = 5, hold.time = 0.005, path = .Options$warbleR$path, thinning = 0.5, parallel = 1, bp = c(35, 90), max.duration = 0.15)

# fed4
saveRDS(ed5, "./data/processed/optimal_detection_55kHz_5_cuts.RDS")
ed5 <- readRDS("./data/processed/optimal_detection_55kHz_5_cuts.RDS")

split_sels <- read.csv("./data/processed/split_manual_annotations_5min.csv")

sels_55 <- split_sels[split_sels$peak.freq > 30, ]

tab <- table(sels_55$sound.files)
sels_55 <- sels_55[sels_55$sound.files %in% names(tab)[tab > 1000], ]

# merged overlapping selections
sels_55 <- merged_overlaps(sels_55)

led5 <- label_detection(reference = sels_55, detection = ed5, parallel = 10, pb = FALSE)

led5 <- filter_detection(led5, parallel = 10, pb = FALSE)

optim_ad_bs <- diagnose_detection(reference = sels_55, detection = led5, by.sound.file = TRUE)

optim_ad <- diagnose_detection(reference = sels_55, detection = led5, by.sound.file = FALSE)

saveRDS(list(call_5 = attributes(ed5)$call, filtered_detect_5 = led5, optim_ad_bs_5 = optim_ad_bs, optim_ad_5 = optim_ad), "./data/processed/filtered_detection_55Khz_5_cuts.RDS")

By sound file

attach(readRDS("./data/processed/filtered_detection_55Khz_5_cuts.RDS"))

call_5
## energy_detector(files = unique(sels_55$sound.files), path = .Options$warbleR$path, 
##     thinning = 0.5, bp = c(35, 90), ssmooth = 5, threshold = 0.01, 
##     hold.time = 0.005, min.duration = 0.001, max.duration = 0.15, 
##     parallel = 1)
oa_DT <- datatable(optim_ad_bs_5, editable = list(
  target = 'row'
), rownames = FALSE, style = "bootstrap",  filter = 'top', options = list(
  pageLength = 100, autoWidth = TRUE, dom = 'ft'
), autoHideNavigation = TRUE, escape = FALSE)

formatRound(table = oa_DT, columns = sapply(optim_ad_bs_5, is.numeric), 3)

Summarized

oa_DT <- datatable(optim_ad_5, editable = list(
  target = 'row'
), rownames = FALSE, style = "bootstrap",  filter = 'top', options = list(
  pageLength = 100, autoWidth = TRUE, dom = 'ft'
), autoHideNavigation = TRUE, escape = FALSE)

formatRound(table = oa_DT, columns = sapply(optim_ad_5, is.numeric), 3)

Detection over all files

Optimize detection

split_sels <- read.csv("./data/processed/split_manual_annotations_5min.csv")

sels_55 <- split_sels[split_sels$peak.freq > 30, ]

# merged overlapping selections
sels_55 <- merged_overlaps(sels_55)

# best tuning parameters 
oed_all <- optimize_energy_detector(reference = sels_55, files =  unique(sels_55$sound.files), threshold = 0.01, min.duration = c(0.005 ,0.001, 0.0005), ssmooth = c(5, 10), hold.time = c(0.005, 0.01), path = .Options$warbleR$path, thinning = 0.5, parallel = 1, bp = c(35, 90), max.duration = c(0.15, 0.175), by.sound.file = TRUE)

saveRDS(oed_all, "./data/processed/optimization_55kHz_all_cuts.RDS")
split_sels <- read.csv("./data/processed/split_manual_annotations_5min.csv")

# best tuning parameters 
ed_all <-  energy_detector(files = unique(split_sels$sound.files), threshold = 0.01, min.duration = 0.001, ssmooth = 5, hold.time = 0.005, path = .Options$warbleR$path, thinning = 0.5, parallel = 1, bp = c(35, 90), max.duration = 0.15)

saveRDS(ed_all, "./data/processed/optimal_detection_55kHz_all_cuts.RDS")
ed_all <- readRDS("./data/processed/optimal_detection_55kHz_all_cuts.RDS")

split_sels <- read.csv("./data/processed/split_manual_annotations_5min.csv")

sels_55 <- split_sels[split_sels$peak.freq > 30, ]

# merged overlapping selections
sels_55 <- merged_overlaps(sels_55)

label_ed_all <- label_detection(reference = sels_55, detection = ed_all, parallel = 10, pb = FALSE)
filter_ed_all <- filter_detection(label_ed_all, parallel = 10, pb = FALSE)

optim_ad_bs_all <- diagnose_detection(reference = sels_55, detection = filter_ed_all, by.sound.file = TRUE, pb = FALSE)

optim_ad_all <- diagnose_detection(reference = sels_55, detection = ed_all, pb = FALSE)

saveRDS(list(call = attributes(ed5)$call, filter_ed_all = filter_ed_all, optim_ad_bs_all = optim_ad_bs_all, optim_ad_all = optim_ad_all), "./data/processed/filtered_detection_55Khz_all_cuts.RDS")

By sound file

attach(readRDS("./data/processed/filtered_detection_55Khz_all_cuts.RDS"))

call
## energy_detector(files = unique(sels_55$sound.files), path = .Options$warbleR$path, 
##     thinning = 0.5, bp = c(35, 90), ssmooth = 5, threshold = 0.01, 
##     hold.time = 0.005, min.duration = 0.001, max.duration = 0.15, 
##     parallel = 1)
oa_DT <- datatable(optim_ad_bs_all, editable = list(
  target = 'row'
), rownames = FALSE, style = "bootstrap",  filter = 'top', options = list(
  pageLength = 100, autoWidth = TRUE, dom = 'ft'
), autoHideNavigation = TRUE, escape = FALSE)

formatRound(table = oa_DT, columns = sapply(optim_ad_bs_all, is.numeric), 3)

Summarized

oa_DT <- datatable(optim_ad_all, editable = list(
  target = 'row'
), rownames = FALSE, style = "bootstrap",  filter = 'top', options = list(
  pageLength = 100, autoWidth = TRUE, dom = 'ft'
), autoHideNavigation = TRUE, escape = FALSE)

formatRound(table = oa_DT, columns = sapply(optim_ad_all, is.numeric), 3)

Random forest classification

# measure spectrographic parameters
spectral_parameters <- spectro_analysis(filter_ed_all, bp = c(35, 85), fast = TRUE, ovlp = 70, parallel = 5)

# mfccs <-  mfcc_stats(X = filter_ed_all, bp = c(30, 90), ovlp = 70, parallel = 10)

# na_rows <- unique(unlist(sapply(mfccs, function(x) which(is.na(x)))))
# 
# filter_ed_all <- filter_ed_all[-na_rows, ]
# spectral_parameters <- spectral_parameters[-na_rows, ]
# mfccs <- mfccs[-na_rows, ]

spectral_parameters$class <- filter_ed_all$detection.class

# spectral_parameters <- data.frame(spectral_parameters, mfccs[, !names(spectral_parameters) %in% c("sound.files", "selec")])

spectral_parameters$class[spectral_parameters$class != "false.positive"] <- "true.positive"

# make it a factor for ranger to work 
spectral_parameters$class <- as.factor(spectral_parameters$class)
  
fp_index <- which(spectral_parameters$class == "false.positive")
tp_index <- which(spectral_parameters$class == "true.positive")

set.seed(10)
sub_fp_index <- sample(fp_index, length(tp_index))
sub_data <- spectral_parameters[c(sub_fp_index, tp_index), ]
sub_filter_ed_all <- filter_ed_all[c(sub_fp_index, tp_index), ]

# run RF model spectral and cepstral parameters
  rfm_sub <-
    ranger(
      class ~ .,
      data = sub_data[, !names(sub_data) %in% c("sound.files", "selec")],
      num.trees = 10000,
      importance = "impurity",
      seed = 10
    )
  

# table(lab_detec$detection.class)
sub_filter_ed_all$pred.class <- rfm_sub$predictions  
  

  # run RF model spectral and cepstral parameters
  rfm <-
    ranger(
      class ~ .,
      data = spectral_parameters[, !names(spectral_parameters) %in% c("sound.files", "selec")],
      num.trees = 10000,
      importance = "impurity",
      seed = 10
    )

  # table(lab_detec$detection.class)
filter_ed_all$pred.class <- rfm$predictions  
  

saveRDS(list(rfm = rfm, rfm_sub = rfm_sub, sub_filter_ed_all = sub_filter_ed_all, spectral_parameters = spectral_parameters, filter_ed_all_rf = filter_ed_all_rf), "./data/processed/data_and_model_random_forest_55kHz_cuts.RDS")  
attach(readRDS("./data/processed/data_and_model_random_forest_55kHz_cuts.RDS")  )

rfm
## Ranger result
## 
## Call:
##  ranger(class ~ ., data = spectral_parameters[, !names(spectral_parameters) %in%      c("sound.files", "selec")], num.trees = 10000, importance = "impurity",      seed = 10) 
## 
## Type:                             Classification 
## Number of trees:                  10000 
## Sample size:                      119057 
## Number of independent variables:  26 
## Mtry:                             5 
## Target node size:                 1 
## Variable importance mode:         impurity 
## Splitrule:                        gini 
## OOB prediction error:             1.36 %

Final results

Diagnostic after random forest classification:

positive_detec <- filter_ed_all_rf[filter_ed_all_rf$pred.class == "true.positive", ]

temp_detec <- positive_detec
temp_detec$detection.class <- "true.positive"

split_sels <- read.csv("./data/processed/split_manual_annotations_5min.csv")

sels_55 <- split_sels[split_sels$peak.freq > 30, ]

# merged overlapping selections
sels_55 <- merged_overlaps(sels_55)
## 5673 selections overlapped
diag <- diagnose_detection(reference = sels_55, detection = temp_detec, pb = FALSE)

# print dynamic table
oa_DT <- datatable(diag, editable = list(
  target = 'row'
), rownames = FALSE, style = "bootstrap",  filter = 'top', options = list(
  pageLength = 100, autoWidth = TRUE, dom = 'ft'
), autoHideNavigation = TRUE, escape = FALSE)

formatRound(table = oa_DT, columns = sapply(diag, is.numeric), 3)

Black line = 1:1 gray line = model slope

obs_count <- tapply(sels_55$sound.files, sels_55$sound.files, length)
pred_count <- tapply(positive_detec$sound.files, positive_detec$sound.files, length)
sound_files <- unique(split_sels$sound.files)


# add those missing in predicted
names_pred_0 <- setdiff(c(names(obs_count),sound_files), names(pred_count))
pred_0 <- rep(0, length(names_pred_0))
names(pred_0) <- names_pred_0
pred_count <- c(pred_count, pred_0)

# add those missing in observed
names_obs_0 <- setdiff(c(names(pred_count), sound_files), names(obs_count))
obs_0 <- rep(0, length(names_obs_0))
names(obs_0) <- names_obs_0
obs_count <- c(obs_count, obs_0)

# order by name
pred_count <- pred_count[order(names(pred_count))]
obs_count <- obs_count[order(names(obs_count))]

# put both in a single data frame
df <- data.frame(sound.files = names(obs_count), observed = obs_count, predicted = pred_count)

# plot
ggplot(df, aes(x = observed, y = predicted)) +
  geom_point(color = viridis(10, alpha = 0.4)[2], size = 3) +
  geom_abline(slope = 1, intercept = 0) +
  annotate("text", x = 150, y = 550, label = paste("r =", round(cor(obs_count, pred_count), 3)), size = 8) + 
  geom_smooth(method = "lm", se = FALSE, col = "gray") +
  theme_classic(base_size = 20)
## `geom_smooth()` using formula 'y ~ x'

# print best fit lm model
(lm(pred_count ~ obs_count))
## 
## Call:
## lm(formula = pred_count ~ obs_count)
## 
## Coefficients:
## (Intercept)    obs_count  
##     -3.5627       0.8822
# split
split_obs <- split_sound_files(path = .Options$warbleR$path, X = sels_55, only.sels = TRUE, sgmt.dur = 60, pb = FALSE, parallel = 10)

split_pred <- split_sound_files(path = .Options$warbleR$path, X = as.data.frame(positive_detec), only.sels = TRUE, sgmt.dur = 60, pb = FALSE, parallel = 10)

obs_count <- tapply(split_obs$sound.files, split_obs$sound.files, length)
pred_count <- tapply(split_pred$sound.files, split_pred$sound.files, length)
sound_files <- unique(split_sels$sound.files)


# add those missing in predicted
names_pred_0 <- setdiff(c(names(obs_count),sound_files), names(pred_count))
pred_0 <- rep(0, length(names_pred_0))
names(pred_0) <- names_pred_0
pred_count <- c(pred_count, pred_0)

# add those missing in observed
names_obs_0 <- setdiff(c(names(pred_count), sound_files), names(obs_count))
obs_0 <- rep(0, length(names_obs_0))
names(obs_0) <- names_obs_0
obs_count <- c(obs_count, obs_0)

# order by name
pred_count <- pred_count[order(names(pred_count))]
obs_count <- obs_count[order(names(obs_count))]

saveRDS(list(pred_count = pred_count, obs_count = obs_count), "observed_and_predicted_1_min_clips.RDS")
attach(readRDS("./data/processed/observed_and_predicted_1_min_clips.RDS"))

# put both in a single data frame
df <- data.frame(sound.files = names(obs_count), observed = obs_count, predicted = pred_count)

df$sum <- df$observed + df$predicted

# plot
ggplot(df[df$sum > 0, ], aes(x = observed, y = predicted)) +
  geom_point(color = viridis(10, alpha = 0.4)[2], size = 3) +
  geom_abline(slope = 1, intercept = 0) +
  annotate("text", x = 20, y = 50, label = paste("r =", round(cor(obs_count, pred_count), 3)), size = 8) + 
  geom_smooth(method = "lm", se = FALSE, col = "gray") +
  theme_classic(base_size = 20)


# print best fit lm model
(lm(pred_count ~ obs_count))

Session information

## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
## 
## locale:
##  [1] LC_CTYPE=pt_BR.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=es_CR.UTF-8        LC_COLLATE=pt_BR.UTF-8    
##  [5] LC_MONETARY=es_CR.UTF-8    LC_MESSAGES=pt_BR.UTF-8   
##  [7] LC_PAPER=es_CR.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=es_CR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggplot2_3.3.5      ohun_0.1.0         DT_0.18            kableExtra_1.3.4  
##  [5] ranger_0.13.1      readxl_1.3.1       svMisc_1.1.4       rfigshare_0.3.7   
##  [9] viridis_0.6.1      viridisLite_0.4.0  Rraven_1.0.13      pbapply_1.4-3     
## [13] bioacoustics_0.2.5 warbleR_1.1.27     NatureSounds_1.0.4 knitr_1.33        
## [17] seewave_2.1.8      tuneR_1.3.3.1      devtools_2.4.2     usethis_2.0.1     
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-152      bitops_1.0-7      fs_1.5.0          webshot_0.5.2    
##  [5] httr_1.4.2        rprojroot_2.0.2   tools_4.1.0       bslib_0.2.5.1    
##  [9] utf8_1.2.1        R6_2.5.0          mgcv_1.8-36       DBI_1.1.1        
## [13] colorspace_2.0-2  withr_2.4.2       tidyselect_1.1.1  gridExtra_2.3    
## [17] prettyunits_1.1.1 processx_3.5.2    moments_0.14      compiler_4.1.0   
## [21] rvest_1.0.1       cli_3.0.1         xml2_1.3.2        desc_1.3.0       
## [25] labeling_0.4.2    sass_0.4.0        scales_1.1.1      callr_3.7.0      
## [29] proxy_0.4-26      dtw_1.22-3        systemfonts_1.0.2 stringr_1.4.0    
## [33] digest_0.6.27     svglite_2.0.0     rmarkdown_2.10    pkgconfig_2.0.3  
## [37] htmltools_0.5.2   sessioninfo_1.1.1 highr_0.9         fastmap_1.1.0    
## [41] htmlwidgets_1.5.3 rlang_0.4.11      rstudioapi_0.13   farver_2.1.0     
## [45] jquerylib_0.1.4   generics_0.1.0    jsonlite_1.7.2    crosstalk_1.1.1  
## [49] dplyr_1.0.7       RCurl_1.98-1.4    magrittr_2.0.1    Matrix_1.3-4     
## [53] Rcpp_1.0.7        munsell_0.5.0     fansi_0.5.0       lifecycle_1.0.0  
## [57] stringi_1.7.4     yaml_2.2.1        MASS_7.3-54       RJSONIO_1.3-1.4  
## [61] pkgbuild_1.2.0    plyr_1.8.6        grid_4.1.0        promises_1.2.0.1 
## [65] crayon_1.4.1      lattice_0.20-44   splines_4.1.0     ps_1.6.0         
## [69] pillar_1.6.1      rjson_0.2.20      fftw_1.0-6        pkgload_1.2.1    
## [73] XML_3.99-0.6      glue_1.4.2        evaluate_0.14     remotes_2.4.0    
## [77] vctrs_0.3.8       httpuv_1.6.1      testthat_3.0.4    cellranger_1.1.0 
## [81] gtable_0.3.0      purrr_0.3.4       assertthat_0.2.1  cachem_1.0.5     
## [85] xfun_0.25         later_1.2.0       signal_0.7-7      tibble_3.1.2     
## [89] memoise_2.0.0     ellipsis_0.3.2