Data analysis

Oilbird vocalizations

Author

Marcelo Araya-Salas

Published

January 9, 2024

Source code and data found at https://github.com/maRce10/oilbird_vocal_repertoire

Load packages

Code

# load function from sketchy
source("https://raw.githubusercontent.com/maRce10/sketchy/main/R/load_packages.R")

# install/ load packages
sketchy::load_packages(packages = c(github = "maRce10/Rraven", github = "maRce10/ohun",
    github = "maRce10/warbleR", "viridis", "caret", "randomForest",
    "umap", "mclust"))

1 Reading annotations

Code

clps <- "./DiegoMejia_Grabaciones/5_min_clips/"
spctrs <- "./DiegoMejia_Grabaciones/spectros/"

pth2 <- "./DiegoMejia_Grabaciones/Oilbird_Anotations Diego Mejía"

anns <- imp_raven(pth2, recursive = TRUE, warbler.format = TRUE, name.from.file = TRUE,
    ext.case = "lower", all.data = TRUE)


empty_recs <- .Options$Rraven$`empty selection table files`

cs <- check_sels(anns, path = clps)

anns <- anns[cs$check.res == "OK", ]

feature_reference(anns, units = c("s", "kHz"))

               min  mean    max
sel.duration  0.02  3.65 300.00
gap.duration -0.20  2.40 174.55
annotations   1.00 46.21 178.00
bottom.freq   0.00  0.26   3.49
top.freq      0.43  9.80  11.03

Code

anns2 <- anns[anns$Adulto == "y" & anns$Sobrelapamiento == "n", ]

write.csv(anns2, "./data/processed/adult_annotations.csv", row.names = FALSE)

2 Measure acoustic features

Code

anns <- read.csv("./data/processed/adult_annotations.csv")

anns$type <- NA
dirs <- list.dirs(path = "./DiegoMejia_Grabaciones/Espectros Categorizados")[-1]

for (x in dirs) anns$type[filter_sels(anns, path = x, index = TRUE)] <- basename(x)

table(anns$type)

check_sels(anns, path = clps)


acoust_feat <- spectro_analysis(anns, parallel = 20, path = clps)
mfcs <- mfcc_stats(anns, parallel = 20, path = clps)

acoust_feat <- cbind(acoust_feat, mfcs[, -c(1, 2)])

acoust_feat$type <- anns$type

write.csv(acoust_feat, "./data/processed/acoustic_features.csv", row.names = FALSE)

3 Random Forest classification

Code

acoust_feat <- read.csv("./data/processed/acoustic_features.csv")

acoust_feat <- acoust_feat[complete.cases(acoust_feat), ]

# acoust_feat <- acoust_feat[!is.na(acoust_feat$type), ]
acoust_feat$type <- make.names(acoust_feat$type)
acoust_feat$type <- factor(acoust_feat$type)

# Create data subsets
partition <- createDataPartition(
    y = acoust_feat$type,
    times = 1,
    p = 0.75,
    list = FALSE
)

trainset <- acoust_feat[partition, -c(1, 2)]
testset <- acoust_feat[-partition, -c(1, 2)]

trcontrol <-
    trainControl(
        method = "repeatedcv",
        number = 100,
        savePredictions = TRUE,
        classProbs = TRUE,
        returnResamp = "all",
        sampling = "down"
    )

pred_model <-
    train(
        type ~ .,
        data = trainset,
        method = "rf",
        trControl = trcontrol,
        metric = "Accuracy",
        preProcess = "scale", 
        proximity = TRUE
    )

# save confusion matrix
conf_mat <-
    confusionMatrix(predict(pred_model, testset), testset$type)

conf_df <- as.data.frame(conf_mat$table)

conf_df$total <-
    sapply(conf_df$Reference, function(x)
        sum(testset$type ==
                x))

conf_df$proportion <- conf_df$Freq / conf_df$total

# fit model on complete data set
best_rf_model <- pred_model$finalModel

all_rf_model <- randomForest(
  type ~ .,
  data = acoust_feat,  # Your entire dataset
  proximity = TRUE,  # Include proximity matrix
  ntree = best_rf_model$ntree,  # Number of trees
  mtry = best_rf_model$mtry,    # Number of variables tried for splitting at each node
  nodesize = best_rf_model$nodesize,  # Minimum size of terminal nodes
  maxnodes = best_rf_model$maxnodes  # Maximum number of terminal nodes
)


saveRDS(
    list(
        pred_model_bb = pred_model,
        conf_mat_bb = conf_mat,
        confusion_df_bb = conf_df,
        testset_bb = testset,
        all_rf_model = all_rf_model,
        data = acoust_feat
    ),
    "./data/processed/random_forest_model_call_types.RDS"
)

Code

rf_model_results <- readRDS("./data/processed/random_forest_model_call_types.RDS")

# print confusion matrix results
rf_model_results$conf_mat_bb$overall

      Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
   3.52239e-01    3.03568e-01    3.16043e-01    3.89743e-01    1.70149e-01 
AccuracyPValue  McnemarPValue 
   6.70249e-30            NaN

Code

confusion_df <- rf_model_results$confusion_df_bb

ggplot(confusion_df, aes(x = Reference, y = Prediction, fill = proportion)) +
    geom_tile() + theme_bw() + coord_equal() + scale_fill_distiller(palette = "Greens",
    direction = 1) + geom_text(aes(label = round(proportion, 2)),
    color = "black", size = 3) + theme_classic() + theme(axis.text.x = element_text(color = "black",
    size = 11, angle = 30, vjust = 0.8, hjust = 0.8))

4 Manual labels

4.1 UMAP visualization

Code

umap_result <- umap(rf_model_results$all_rf_model$proximity, n_neighbors = 15,
    n_components = 2)

# Create a data frame with the UMAP results
umap_df <- data.frame(UMAP1 = umap_result$layout[, 1], UMAP2 = umap_result$layout[,
    2], type = rf_model_results$data$type)

umap_df$pred.type <- predict(rf_model_results$pred_model_bb, rf_model_results$data)
# predict(object = rf_model_results$all_rf_model,
# rf_model_results$data)

mod_umap <- Mclust(umap_df[, 1:2])

summary(mod_umap)

---------------------------------------------------- 
Gaussian finite mixture model fitted by EM algorithm 
---------------------------------------------------- 

Mclust VVV (ellipsoidal, varying volume, shape, and orientation) model with 9
components: 

 log-likelihood    n df      BIC      ICL
         -10309 2703 53 -21036.8 -21782.5

Clustering table:
  1   2   3   4   5   6   7   8   9 
277 569  24 162 268 199 419 651 134

Code

grouping_umap <- as.factor(mod_umap$classification)
umap_df$group <- mod_umap$classification

umap_df$sound.files <- rf_model_results$data$sound.files
umap_df$selec <- rf_model_results$data$selec

write.csv(umap_df, "./data/processed/umap_on_rf_proximity_call_types.csv",
    row.names = FALSE)

Code

umap_df <- read.csv("./data/processed/umap_on_rf_proximity_call_types.csv")

# Create a scatterplot
gg_umap_loc <- ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = type,
    fill = type, shape = type)) + geom_point(size = 4) + ylim(c(-7,
    7)) + scale_color_viridis_d(alpha = 0.3, begin = 0.1, end = 0.8) +
    scale_fill_viridis_d(alpha = 0.2, begin = 0.1, end = 0.8) + theme_classic(base_size = 20) +
    labs(x = "UMAP1", y = "UMAP2", color = "Call type", fill = "Call type",
        shape = "Call type") + scale_shape_manual(values = rep(c(1:2,
    16:21), 3)[1:16])

gg_umap_loc

5 Unsupervised labels

5.1 UMAP visualization

Code

umap_df <- read.csv("./data/processed/umap_on_rf_proximity_call_types.csv")

# Create a scatterplot
ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = as.factor(group),
    fill = as.factor(group), shape = as.factor(group))) + geom_point(size = 4) +
    ylim(c(-7, 7)) + scale_color_viridis_d(alpha = 0.3, begin = 0.1,
    end = 0.8) + scale_fill_viridis_d(alpha = 0.2, begin = 0.1, end = 0.8) +
    theme_classic(base_size = 20) + labs(x = "UMAP1", y = "UMAP2",
    color = "Call type", fill = "Call type", shape = "Call type") +
    scale_shape_manual(values = rep(c(1:2, 16:22), 3)[1:9])

6 Match between manual and unsupervised classifications

All calls

Code

# confusion matrix
conf_all <- aggregate(UMAP2 ~ type + group, umap_df, length)

names(conf_all)[ncol(conf_all)] <- "Freq"

conf_all$total <- sapply(conf_all$group, function(x) sum(umap_df$group ==
    x))

conf_all$proportion <- conf_all$Freq/conf_all$total

conf_all <- conf_all[order(conf_all$proportion), ]

conf_all$group <- factor(conf_all$group)

conf_all$type <- factor(conf_all$type)

ggplot(conf_all, aes(y = group, x = type, fill = proportion)) + geom_tile() +
    theme_bw() + coord_equal() + scale_fill_distiller(palette = "Greens",
    direction = 1) + geom_text(aes(label = round(proportion, 2)),
    color = "black", size = 3) + theme_classic() + theme(axis.text.x = element_text(color = "black",
    size = 11, angle = 30, vjust = 0.8, hjust = 0.8)) + labs(y = "Unsupervised categories",
    x = "Manual categories")

All calls excluding proportions lower than 5%

Code

# confusion matrix
conf_df <- aggregate(UMAP2 ~ type + group, umap_df, length)

names(conf_df)[ncol(conf_df)] <- "Freq"

conf_df$total <- sapply(conf_df$group, function(x) sum(umap_df$group ==
    x))

conf_df$proportion <- conf_df$Freq/conf_df$total

conf_df <- conf_df[order(conf_df$proportion), ]

conf_df$group <- factor(conf_df$group)

conf_df$type <- factor(conf_df$type)

conf_df <- conf_df[conf_df$proportion > 0.05, ]

ggplot(conf_df, aes(y = group, x = type, fill = proportion)) + geom_tile() +
    theme_bw() + coord_equal() + scale_fill_distiller(palette = "Greens",
    direction = 1) + geom_text(aes(label = round(proportion, 2)),
    color = "black", size = 3) + theme_classic() + theme(axis.text.x = element_text(color = "black",
    size = 11, angle = 30, vjust = 0.8, hjust = 0.8)) + labs(y = "Unsupervised categories",
    x = "Manual categories")

Only those that were correctly classified based on manual categories

Code

# confusion matrix
conf_df <- aggregate(UMAP2 ~ type + group, umap_df[umap_df$type ==
    umap_df$pred.type, ], length)

names(conf_df)[ncol(conf_df)] <- "Freq"

conf_df$total <- sapply(conf_df$group, function(x) sum(umap_df$group[umap_df$type ==
    umap_df$pred.type] == x))

conf_df$proportion <- conf_df$Freq/conf_df$total

conf_df <- conf_df[order(conf_df$proportion), ]

conf_df$group <- factor(conf_df$group)

conf_df$type <- factor(conf_df$type)

ggplot(conf_df, aes(y = group, x = type, fill = proportion)) + geom_tile() +
    theme_bw() + coord_equal() + scale_fill_distiller(palette = "Greens",
    direction = 1) + geom_text(aes(label = round(proportion, 2)),
    color = "black", size = 3) + theme_classic() + theme(axis.text.x = element_text(color = "black",
    size = 11, angle = 30, vjust = 0.8, hjust = 0.8)) + labs(y = "Unsupervised categories",
    x = "Manual categories")

Only those that were correctly classified based on manual categories lower than 5%

Code

# confusion matrix
conf_df <- aggregate(UMAP2 ~ type + group, umap_df[umap_df$type ==
    umap_df$pred.type, ], length)

names(conf_df)[ncol(conf_df)] <- "Freq"

conf_df$total <- sapply(conf_df$group, function(x) sum(umap_df$group[umap_df$type ==
    umap_df$pred.type] == x))

conf_df$proportion <- conf_df$Freq/conf_df$total

conf_df <- conf_df[order(conf_df$proportion), ]

conf_df$group <- factor(conf_df$group)

conf_df$type <- factor(conf_df$type)

conf_df <- conf_df[conf_df$proportion > 0.05, ]

ggplot(conf_df, aes(y = group, x = type, fill = proportion)) + geom_tile() +
    theme_bw() + coord_equal() + scale_fill_distiller(palette = "Greens",
    direction = 1) + geom_text(aes(label = round(proportion, 2)),
    color = "black", size = 3) + theme_classic() + theme(axis.text.x = element_text(color = "black",
    size = 11, angle = 30, vjust = 0.8, hjust = 0.8)) + labs(y = "Unsupervised categories",
    x = "Manual categories")

6.1 Chi-square on contingency table

Code

tbl <- xtabs(Freq ~ group + type, conf_all)

chisq.test(tbl)


    Pearson's Chi-squared test

data:  tbl
X-squared = 4356, df = 120, p-value <2e-16

7 Print spectrograms

Code

anns <- read.csv("./data/processed/adult_annotations.csv")

spectrograms(anns, flim = c(0, 11), collevels = seq(-120, 0, 5), pal = viridis::viridis,
    path = clps, dest.path = spctrs, propwidth = TRUE)

head(umap_df)

for (i in unique(umap_df$group)) {

    dir.create(paste0("./data/processed/spectros_umap_groups/", i))
    spectrograms(anns[paste(anns$sound.files, anns$selec) %in% paste(umap_df$sound.files[umap_df$group ==
        i], umap_df$selec[umap_df$group == i]), ], flim = c(0, 11),
        collevels = seq(-120, 0, 5), pal = viridis::viridis, path = clps,
        dest.path = paste0("./data/processed/spectros_umap_groups/",
            i), propwidth = TRUE)


}

Takeaways

Session information

R version 4.3.1 (2023-06-16)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.2 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3 
LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3;  LAPACK version 3.9.0

locale:
 [1] LC_CTYPE=pt_BR.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=es_CR.UTF-8        LC_COLLATE=pt_BR.UTF-8    
 [5] LC_MONETARY=es_CR.UTF-8    LC_MESSAGES=pt_BR.UTF-8   
 [7] LC_PAPER=es_CR.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=es_CR.UTF-8 LC_IDENTIFICATION=C       

time zone: America/Costa_Rica
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] mclust_6.0.0         umap_0.2.10.0        randomForest_4.7-1.1
 [4] caret_6.0-94         lattice_0.22-5       ggplot2_3.4.4       
 [7] viridis_0.6.4        viridisLite_0.4.2    warbleR_1.1.29      
[10] NatureSounds_1.0.4   knitr_1.45           seewave_2.2.3       
[13] tuneR_1.4.6          ohun_1.0.2           Rraven_1.0.14       

loaded via a namespace (and not attached):
  [1] DBI_1.2.0            bitops_1.0-7         pROC_1.18.5         
  [4] pbapply_1.7-2        gridExtra_2.3        formatR_1.14        
  [7] remotes_2.4.2.1      testthat_3.2.1       rlang_1.1.2         
 [10] magrittr_2.0.3       e1071_1.7-14         compiler_4.3.1      
 [13] png_0.1-8            vctrs_0.6.5          reshape2_1.4.4      
 [16] stringr_1.5.1        pkgconfig_2.0.3      crayon_1.5.2        
 [19] fastmap_1.1.1        backports_1.4.1      labeling_0.4.3      
 [22] utf8_1.2.4           rmarkdown_2.25       prodlim_2023.08.28  
 [25] purrr_1.0.2          xfun_0.41            jsonlite_1.8.8      
 [28] recipes_1.0.8        parallel_4.3.1       R6_2.5.1            
 [31] RColorBrewer_1.1-3   stringi_1.8.3        reticulate_1.34.0   
 [34] parallelly_1.36.0    rpart_4.1.21         brio_1.1.4          
 [37] lubridate_1.9.3      Rcpp_1.0.11          iterators_1.0.14    
 [40] future.apply_1.11.0  Matrix_1.6-2         splines_4.3.1       
 [43] nnet_7.3-19          igraph_1.6.0         timechange_0.2.0    
 [46] tidyselect_1.2.0     rstudioapi_0.15.0    yaml_2.3.8          
 [49] timeDate_4022.108    codetools_0.2-19     dtw_1.23-1          
 [52] listenv_0.9.0        tibble_3.2.1         plyr_1.8.9          
 [55] withr_2.5.2          askpass_1.2.0        evaluate_0.23       
 [58] signal_1.8-0         future_1.33.0        survival_3.5-7      
 [61] sf_1.0-15            sketchy_1.0.2        units_0.8-5         
 [64] proxy_0.4-27         pillar_1.9.0         packrat_0.9.2       
 [67] KernSmooth_2.23-22   stats4_4.3.1         checkmate_2.3.1     
 [70] foreach_1.5.2        generics_0.1.3       RCurl_1.98-1.13     
 [73] munsell_0.5.0        scales_1.3.0         globals_0.16.2      
 [76] class_7.3-22         glue_1.6.2           tools_4.3.1         
 [79] xaringanExtra_0.7.0  data.table_1.14.8    RSpectra_0.16-1     
 [82] ModelMetrics_1.2.2.2 gower_1.0.1          grid_4.3.1          
 [85] ipred_0.9-14         colorspace_2.1-0     nlme_3.1-163        
 [88] cli_3.6.2            fansi_1.0.6          lava_1.7.3          
 [91] dplyr_1.1.3          gtable_0.3.4         fftw_1.0-7          
 [94] digest_0.6.33        classInt_0.4-10      farver_2.1.1        
 [97] rjson_0.2.21         htmlwidgets_1.6.2    htmltools_0.5.7     
[100] lifecycle_1.0.4      hardhat_1.3.0        openssl_2.1.1       
[103] MASS_7.3-60