next steps

call rate solo vs group flight gap duration solo vs group flight

Load packages

## add 'developer' to packages to be installed from github
x <- c("data.table", "lubridate", "devtools", "maRce10/warbleR", "readxl", "ranger", "caret", "e1071", "pbapply", "viridis", "ggplot2", "DT", "kableExtra", "rlang", "Sim.DiffProc", "soundgen"#, "markovchain", "igraph", "TraMineR", "spgs"
       )

aa <- lapply(x, function(y) {
  
  # get pakage name
  pkg <- strsplit(y, "/")[[1]]
  pkg <- pkg[length(pkg)]
  
  # check if installed, if not then install 
  if (!pkg %in% installed.packages()[,"Package"])  {

      if (grepl("/", y))  devtools::install_github(y, force = TRUE) else
    install.packages(y) 
    }

  # load package
  a <- try(require(pkg, character.only = T), silent = T)

  if (!a) remove.packages(pkg)
  })

Functions and global parameters

warbleR_options(wl = 300, parallel = 1, bp = "frange", fast = TRUE, threshold = 15, ovlp = 20)

opts_knit$set(root.dir = "..")


theme_set(theme_classic(base_size = 34))

# set evaluation false
opts_chunk$set( fig.width = 7, fig.height = 4, warning = FALSE, message = FALSE, tidy = FALSE)

# number of trees in Random Forest models
num.trees <- 2000

# replicates in Random Forest replication
reps <- 50

# sensitivity cutoff
cutoff <- 0.86

# function to calculate classification random forest models with balanced sample sizes across categories
balanced_rf <- function(X, num.trees = 1000, random = FALSE, seed = 506){
  
    # get smallest n across individuals
    min.n <- min(table(X$indiv)) 
  
    # use seed
    set.seed(seed)
    
    # randomly get rows for equal n across indivs
    sel_rows <-
    sapply(unique(X$indiv), function(x)
      sample(rownames(X)[X$indiv == x], min.n, replace = FALSE))
  
  # subset to those rows  
  X <- X[c(sel_rows), ]
  
  # convert to factor
  if (random){ 
    
    # use seed
    set.seed(seed)

    X$indiv <- sample(X$indiv)
  }
  
   # make it a factor for ranger to work 
  X$indiv <- as.factor(X$indiv)
  
  # run RF model spectral and cepstral parameters
  rfm <-
    ranger(
      indiv ~ .,
      data = X[, !names(X) %in% c("sound.files", "selec")],
      num.trees = num.trees,
      importance = "impurity",
      probability = TRUE,
      seed = seed
    )
  
  # get predicted individual from probs
  pred_indiv <- apply(rfm$predictions, 1, function(x) colnames(rfm$predictions) [which.max(x)])
  
  rfm$predictions <- data.frame(rfm$predictions, # indiv = X$indiv, 
                                pred_indiv, sound.files = X$sound.files)
  
  # remove X from start of names 
  names(rfm$predictions) <- gsub("^X", "", names(rfm$predictions))
  
   return(rfm)
  }


# function to calculate sensitivities at increasing RF class probabilities

sensitivity_fun <- function(X, parameters, thresholds = seq(0,1, by = 0.01)){

# get sensitivities for each group at very threshold
sensitiv_l <- lapply(X, function(x){

  # extract prediction data.frame
  Y <- x$aggregated_predictions
  Y$max <- apply(Y[, sapply(Y, is.numeric)], 1, max)
  
  # get sensitivity at different thresholds
  sensi_l <- lapply(thresholds, function(y) data.frame(sensitivity = sum(Y$pred_indiv[Y$max >= y] == Y$actual_indiv[Y$max >= y])/ sum(Y$max >= y), n = sum(Y$max >= y) / nrow(Y))) 
  
  sensi <- do.call(rbind, sensi_l)
  
  # add metadata
  sensi$group <- x$group 
  sensi$n_indiv <- x$n_indiv
  sensi$min_n <- x$min_n
  sensi$n_calls <- nrow(Y) * sensi$n

  return(sensi)
  })

# put in a data frame
sensitivities <- as.data.frame(lapply(sensitiv_l, "[[", which(names(sensitiv_l[[1]]) == "sensitivity")))

# get minimum sensitivity at each probabilities
sensitivities$min.sensitivity <- apply(sensitivities, 1, min, na.rm = TRUE)

# get minimum sensitivity at each probabilities
sensitivities$mean.sensitivity <- apply(sensitivities, 1, mean, na.rm = TRUE)

# add thresholds to data frame
sensitivities$thresholds <- thresholds


# put in a data frame
sensitivities$n_calls <- rowSums(as.data.frame(lapply(sensitiv_l, "[[", which(names(sensitiv_l[[1]]) == "n_calls"))), na.rm = TRUE)

sensitivities$n_calls_prop <- sensitivities$n_calls / max(sensitivities$n_calls)

sensitivities <- sensitivities[!is.infinite(sensitivities$mean.sensitivity), ]

sensitivities$parameters <- parameters    
return(sensitivities)  
} 

# function to predict group flights, x is the group name, Y the acoustic data and Z the random forest models 
pred_group <- function(x, Y, Z){

  # print(x)
  # Z <- acous_param_l[[x]]
  # Y <- Z[grep("grup",Z$experiment), ]
  Y$indiv <- NULL
  
  # random forest models for this group
  # rfms <- random_forests_l[[x]]
  rfms <- Z
    
  # predict using all random forest models
  rf_preds <- lapply(rfms, function(x) predict(object = x, data = Y)$predictions)

    # add column (individual) if not found 
    rf_preds <- lapply(rf_preds, function(x){
    
    if(ncol(x) < length(unique(Z$indiv[Z$experiment == "vuelo solo"]))){
      # how many columns are missing
      mssng <- length(unique(Z$indiv[Z$experiment == "vuelo solo"])) - ncol(x)
      
      # add missing columns with 0
      for(i in 1:(mssng)) x <- data.frame(x, 0, check.names = FALSE)
    names(x)[(ncol(x) - mssng + 1):ncol(x)] <- setdiff(unique(Z$indiv[Z$experiment == "vuelo solo"]), names(x)) 
    }
    return(x)
  })
  
  # get together predictions from the same individual
  preds_by_indv <- lapply(1:ncol(rf_preds[[1]]), function(y)
    do.call(cbind, lapply(rf_preds, function(e) e[, y])) 
  )
   
  agg_preds <- as.data.frame(lapply(preds_by_indv, rowMeans, na.rm = TRUE))  
  
  # add individual name to columns
  names(agg_preds) <- colnames(rf_preds[[1]])
  
  # add sound file column
  agg_preds$sound.files <- Y$sound.files
  
    # get predicted indiv from aggregated probabilities 
  agg_preds$pred_indiv <- apply(agg_preds[, sapply(agg_preds, is.numeric)], 1, function(x) colnames(agg_preds)[which.max(x)])

  agg_preds$group <- x
  agg_preds$max_prob <- apply(agg_preds[, sapply(agg_preds, is.numeric)], 1, max)
  
  return(agg_preds)
  }

Read detections and prepare data

clls <- readRDS("./data/processed/curated_extended_selection_table_inquiry_calls_2020_&_2021.RDS")

metadat <- read.csv("./data/processed/metadata_inquiry_calls_2020_&_2021.csv", stringsAsFactors = FALSE)

Measure acoustic parameters

grps <- unique(clls$Grupo[grep("grup", clls$Experimento)])

# function to get acoustic parameters
acous_param_fun <- function(x) {
  
  # print(x)
  # print(which(unique(clls$Grupo[grep("grup", clls$Experimento)]) == x) / length(unique(clls$Grupo[grep("grup", clls$Experimento)])))

  indivs <- unique(clls$Individuo[clls$Grupo == x & grepl("grup", clls$Experimento)])
  
  # get individual IDs for the group
  indivs <- strsplit(indivs, split = "\\|")[[1]]  
   indivs <- indivs[indivs != "NA"]
 
  indiv_calls <- clls[clls$Individuo %in% indivs & clls$Experimento == "vuelo solo", , drop = FALSE]
    
  # remove low SNR calls on individual flights
  # indiv_calls <- sig2noise(indiv_calls, mar = 0.025, pb = FALSE)
  # indiv_calls <- indiv_calls[indiv_calls$SNR > 1, , drop = FALSE]
  # indiv_calls$SNR <- NULL 
  
  group_calls <- clls[clls$Grupo == x & grepl("grup", clls$Experimento), , drop = FALSE]
  
  # select most recent group flight  
  group_calls <- group_calls[group_calls$date == max(unique(group_calls$date)), ]
  
  # measure structure only if all individuals are represented
  if (length(indivs) >= length(unique(indiv_calls$Individuo))) {
    
    # put all data together
    grp_test <- rbind(indiv_calls, group_calls)
    
    # measure acoustics parameters
    sp <- specan(grp_test, pb = FALSE, harmonicity = FALSE)
    
    # remove time parameters
    sp <- sp[, grep("time\\.", names(sp), invert = TRUE)]
  
    # measure cepstral coeffs
    cc <- mfcc_stats(grp_test, pb = FALSE)[, -c(1, 2)]
  
    # spectrographic cross correlation
    spxc <- xcorr(grp_test, pb = FALSE, parallel = 1)
    
    # MDS
    spxc <- cmdscale(1 - spxc, k = 10, list. = TRUE)
  
    spxc_mds <- spxc$points
    
    colnames(spxc_mds) <- paste0("spxcMDS", 1:ncol(spxc_mds))
     
    # mfcc cross correlation
    mfccxc <- xcorr(grp_test, pb = FALSE, type = "mfcc")
    
    # MDS
    mfccxc <- cmdscale(1 - mfccxc, k = 10, list. = TRUE)
  
    mfxc_mds <- mfccxc$points
    
    colnames(mfxc_mds) <- paste0("mfxcMDS", 1:ncol(mfxc_mds))

    # put parameters in a list
    all_params <- data.frame(sp, cc, spxc_mds, mfxc_mds)

    # scale for random forest
    all_params[,-c(1, 2)] <- scale(all_params[,-c(1, 2)])
    
    # add individual and experiment
    all_params$indiv <- grp_test$Individuo
    all_params$experiment <- grp_test$Experimento
    
    # remove bottom and top freq
    all_params$top.freq <- all_params$bottom.freq <- NULL
    all_params$group <- x
  
    output <- all_params
    }  else output <- NULL
  
  return(output)
  } 

# loop to measure acoustic parameters on each group
acous_param_l <- warbleR:::pblapply_wrblr_int(grps, FUN = function(x) try(acous_param_fun(x), silent = TRUE))

names(acous_param_l) <- grps

acous_param_l <- acous_param_l[sapply(acous_param_l, class) == "data.frame"]

names(acous_param_l)

# check if all have the same number of columns
all(sapply(acous_param_l, ncol) == max(sapply(acous_param_l, ncol)))

# save as RDS
saveRDS(acous_param_l, "./data/processed/acoustic_parameters_all_groups_specific_warbler_acoustic_measurements_curated_data_2020_&_2021.RDS")

Random forest on solo flights

Run random forest to predict solo flights

  • Calls from solo flights of all individuals in a particular group flight (regular or mixed) where pooled together to train a random forest model that can classified calls from those individuals
  • Those models will be later used to classified calls from group flights
  • P values were calculated as the proportion of randomized data sets that generated an out-of-bag error equals to or lower than that from the actual data
# read acoustic parameter data
acous_param_l <- readRDS("./data/processed/acoustic_parameters_all_groups_specific_warbler_acoustic_measurements_curated_data_2020_&_2021.RDS")

# all should have 2 experiment types
all(sapply(acous_param_l, function(x) length(unique(x$experiment))) == 2)

# acous_param$idgroup <- paste(acous_param$indiv, acous_param$)

# minimum sample size per group
min_n <- sapply(acous_param_l, function(x) min(table(x$indiv)))

# remove groups with less than 5 observations for minimum sample size 
# how many left
sum(!names(acous_param_l) %in% names(min_n)[min_n < 5])
acous_param_l <- acous_param_l[!names(acous_param_l) %in% names(min_n)[min_n < 5]]

# loop over groups
avg_mods <- warbleR:::pblapply_wrblr_int(names(acous_param_l), function(x){
    # for (x in names(acous_param_l)){
      # print(x)
        # extract data
    X <- acous_param_l[[which(names(acous_param_l) == x)]]
    
    # only solo flight
    solo_rf_input <- X[X$experiment == "vuelo solo", ]
  
    # rename rows for sel_rows
    rownames(solo_rf_input) <- 1:nrow(solo_rf_input)
    
    # order by sound file column
    solo_rf_input <- solo_rf_input[order(solo_rf_input$sound.files), ]
    
    # remove experiment column
    solo_rf_input$experiment <- NULL
   
    # subset columns to keep only those from acoustic measurements 
    # solo_rf_input <- solo_rf_input[ , !names(solo_rf_input) %in% c("sound.files", "selec")]
   
    # run random forest, set a seed to make it replicable
    rf_results <- lapply(1:reps, function(x) balanced_rf(X = solo_rf_input, num.trees = num.trees, seed = x))
    
    # merge together predictions by sound files
    rf_preds <- lapply(rf_results, function(x){
      mrg <- merge(data.frame(sound.files = solo_rf_input$sound.files), x$predictions[, grep("indiv$", names(x$predictions), invert = TRUE)], all.x = TRUE)
  
    mrg <- mrg[order(mrg$sound.files), -1]   
    }
   )
    
    # add column (individual) if not found 
    rf_preds <- lapply(rf_preds, function(x){
      
      if(ncol(x) < length(unique(solo_rf_input$indiv))){
        # how many columns are missing
        mssng <- length(unique(solo_rf_input$indiv)) - ncol(x)
        
        # add missing columns
        for(i in 1:(mssng)) x <- data.frame(x, NA, check.names = FALSE)
      names(x)[(ncol(x) - mssng + 1):ncol(x)] <- setdiff(unique(solo_rf_input$indiv), names(x)) 
      }
      return(x)
    })
    
    # get together predictions from the same individual
    preds_by_indv <- lapply(1:ncol(rf_preds[[1]]), function(y)
      do.call(cbind, lapply(rf_preds, "[", y)) 
    )
     
    agg_preds <- as.data.frame(lapply(preds_by_indv, rowMeans, na.rm = TRUE))  
    
    # add individual name to columns
    names(agg_preds) <- names(rf_preds[[1]])
    
    # add sound file column
    agg_preds$sound.files <- solo_rf_input$sound.files
    
      # get predicted indiv from aggregated probabilities 
    agg_preds$pred_indiv <- apply(agg_preds[, sapply(agg_preds, is.numeric)], 1, function(x) colnames(agg_preds)[which.max(x)])
  
    # make it a factor
    pred_indiv <- factor(agg_preds$pred_indiv, levels = unique(solo_rf_input$indiv))
    agg_preds$actual_indiv <- actual_indiv <- factor(solo_rf_input$indiv, levels = unique(solo_rf_input$indiv))
    
    # get confusion matrix
    cm_solo <- confusionMatrix(pred_indiv, reference = actual_indiv)
    
    ### NULL MODEL
    # run null model by randomizing indiv labels
    rf_null_results <- lapply(1:reps, function(x) balanced_rf(X = solo_rf_input, num.trees = num.trees, random = TRUE, seed = x))
    
    # get accuracies form null models  
      # merge together predictions by sound files
    rf_null_preds <- lapply(rf_null_results, function(x){
      mrg <- merge(data.frame(sound.files = solo_rf_input$sound.files), x$predictions[, grep("indiv$", names(x$predictions), invert = TRUE)], all.x = TRUE)
  
    mrg <- mrg[order(mrg$sound.files), -1]   
    }
   )
  
    # add column (individual) if not found 
    rf_null_preds <- lapply(rf_null_preds, function(x){
      
      if(ncol(x) < length(unique(solo_rf_input$indiv))){
        # how many columns are missing
        mssng <- length(unique(solo_rf_input$indiv)) - ncol(x)
        
        # add missing columns
        for(i in 1:(mssng)) x <- data.frame(x, NA, check.names = FALSE)
      
        names(x)[(ncol(x) - mssng + 1):ncol(x)] <- setdiff(unique(solo_rf_input$indiv), names(x)) 
      }
      return(x)
    })
    
    # get together predictions from the same individual
    preds_by_indv_null <- lapply(1:ncol(rf_null_preds[[1]]), function(y)
      do.call(cbind, lapply(rf_null_preds, "[", y)) 
    )
     
    agg_preds_null <- as.data.frame(lapply(preds_by_indv_null, rowMeans, na.rm = TRUE))  
    
    # add individual name to columns
    names(agg_preds_null) <- names(rf_null_preds[[1]])
    
    # add sound file column
    agg_preds_null$sound.files <- solo_rf_input$sound.files
    
      # get predicted indiv from aggregated probabilities 
    agg_preds_null$pred_indiv <- apply(agg_preds_null[, sapply(agg_preds_null, is.numeric)], 1, function(x) colnames(agg_preds_null)[which.max(x)])
  
    # make it a factor
    pred_indiv_null <- factor(agg_preds_null$pred_indiv, levels = unique(solo_rf_input$indiv))
    actual_indiv <- factor(solo_rf_input$indiv, levels = unique(solo_rf_input$indiv))
    
    # get confusion matrix
    cm_solo_null <- confusionMatrix(pred_indiv_null, reference = actual_indiv)
    
  # get pvalue of mean OOB of real data
  random_acc <- sapply(rf_null_preds, function(e){
    
    # add sound file column
    e$sound.files <- solo_rf_input$sound.files
    
      # get predicted indiv from aggregated probabilities 
    e$pred_indiv <- apply(e[, sapply(agg_preds_null, is.numeric)], 1, function(x) if (length(colnames(agg_preds_null)[which.max(x)]) > 0) colnames(agg_preds_null)[which.max(x)] else NA)
  
    # make it a factor
    e$pred_indiv <- factor(e$pred_indiv, levels = unique(solo_rf_input$indiv))
    
    # get confusion matrix
    cm_solo_null <- confusionMatrix(e$pred_indiv[!is.na(e$pred_indiv)], reference = actual_indiv[!is.na(e$pred_indiv)])
  
    return(as.vector(cm_solo_null$overall[1]))
    })
    
    
    ### NOTE: ranger() OOB prediction error and confusionMatrix() Accuracy are the same
    # put all results together
    output <- list(group = x, accuracy = cm_solo$overall[1], null_accuracy = cm_solo_null$overall[1], aggregated_predictions = agg_preds, conf_matrix = cm_solo, random_forests = rf_results, n_indiv = length(unique(solo_rf_input$indiv)), min_n = min(table(solo_rf_input$indiv)), pvalue = sum(random_acc > cm_solo$overall[1]) / length(random_acc))

  return(output)
    }
  )
  
# add group name to list
names(avg_mods) <- names(acous_param_l)

# save as RDS
saveRDS(avg_mods, "./data/processed/average_models_random_forest_all_groups_best_parameter_combination.RDS")

Predicting solo flight performance

A row for each group detailing:

  • The total number of calls in that group flight
  • Number and proportion of calls above the lowest probability of all correctly classified calls in the training data set (for that particular group)
# read data
avg_mods <- readRDS("./data/processed/average_models_random_forest_all_groups_best_parameter_combination.RDS")

# sensitivities <- sensitivity_fun(X = avg_mods, parameters = "best")
  
# calculate threshold at cutoff
# thresh_prob <- min(sensitivities$thresholds[sensitivities$mean.sensitivity >= cutoff])
  
diagnostics <- data.frame(group = names(avg_mods))

diagnostics$avg_accuracy <- sapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "accuracy"))
  
diagnostics$null_accuracy <- sapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "null_accuracy"))

diagnostics$p_values <- sapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "pvalue"))

diagnostics$min_prob_threshold <- sapply(avg_mods, function(x) {
  Y <- x$aggregated_predictions
  Y$max.prob <- apply(Y[, !names(Y) %in%  c("sound.files", "pred_indiv", "actual_indiv")], 1, max)
  Y$true.positive <- Y$pred_indiv == Y$actual_indiv
  min(Y$max.prob[Y$true.positive])
  })

sub_metadat <- metadat[!grepl("refugio|enmascar", metadat$Experimento) & !is.na(metadat$Experimento),]

diagnostics$experiment <- sapply(diagnostics$group, function(x) paste(unique(sub_metadat$Experimento[sub_metadat$Grupo == x & !grepl("solo", sub_metadat$Experimento)]), collapse = "-"))

diagnostics$experiment <- ifelse(grepl("mixto", diagnostics$experiment), "mixed", "regular")

diagnostics$group_flight_files <- sapply(diagnostics$group, function(x){ 
  
  year_audios <- unique(sub_metadat$year.audio[sub_metadat$Grupo == x & !grepl("solo", sub_metadat$Experimento)])

  Y <- acous_param_l[[x]]
  Y$sound.files <- sapply(Y$sound.files, function(x) strsplit(x, ".wav")[[1]][1])
  Y$year.audio <- sapply(Y$sound.files, function(x) clls$year.audio[gsub(".wav", "", clls$org.sound.files) == x][1])
  
  solo_indiv <- unique(Y$indiv[acous_param_l[[x]]$experiment == "vuelo solo"])
  
  year_audios <- year_audios[year_audios %in% unique(Y$year.audio)]
  
  year_audios <- paste(year_audios, collapse = "|")
  
  return(year_audios)
  }
)


diagnostics$no_call_solo_indivs <- sapply(1:nrow(diagnostics), function(x){
  
  Y <- acous_param_l[[diagnostics$group[x]]]
  Y$sound.files <- sapply(Y$sound.files, function(x) strsplit(x, ".wav")[[1]][1])
  Y$year.audio <- sapply(Y$sound.files, function(x) clls$year.audio[gsub(".wav", "", clls$org.sound.files) == x][1])
  
  indivs_in_group <- strsplit(Y$indiv[Y$year.audio == diagnostics$group_flight_files[x]], split = "\\|")[[1]] 
  
  indivs_in_solo <- unique(Y$indiv[grep("solo", Y$experiment)])
  
  return(all(indivs_in_group %in% indivs_in_solo))
    }
)


diagnostics$n_indiv <- sapply(avg_mods, function(x) ncol(x$conf_matrix$table))

# remove groups with no diagnostics
diagnostics <- diagnostics[!is.na(diagnostics$avg_accuracy), ]

saveRDS(diagnostics, "./data/processed/random_forests_diagnostics_solo_flight.RDS")

# save sensitivities
# saveRDS(sensitivities, "./data/processed/random_forests_sensitivity_solo_flight.RDS")

Acccuracy from observed and randomized data:

# read diagnostic
diagnostics <- readRDS("./data/processed/random_forests_diagnostics_solo_flight.RDS")

model_diagnostics <- data.frame(model = rep(c("observed", "null"), each =  nrow(diagnostics)), accuracy = c(diagnostics$avg_accuracy, diagnostics$null_accuracy), experiment = diagnostics$experiment)

# density plots
ggplot(model_diagnostics, aes(x = accuracy, fill = model)) +
  geom_density(alpha=0.4) + 
  theme_classic() + 
  scale_fill_viridis_d(end = 0.8) + 
  labs(x = "Mean accuracy", y = "Frequency") +
  theme(legend.position = c(0.9, 0.8)) +
  facet_wrap(~ experiment)

Table with descriptors for each group’s model:

diagnostics <- diagnostics[order(diagnostics$avg_accuracy, decreasing = TRUE), ]


oa_DT <- datatable(diagnostics, editable = list(
  target = 'row'
), rownames = FALSE, style = "bootstrap",  filter = 'top', options = list(
  pageLength = 100, autoWidth = TRUE, dom = 'ft'
), autoHideNavigation = TRUE, escape = FALSE)

formatRound(table = oa_DT, columns = sapply(diagnostics, is.numeric), 3)

Predict group flight individual ID

# read diagnostics
diagnostics <- readRDS("./data/processed/random_forests_diagnostics_solo_flight.RDS")

# use only those with all individuals in group flight with solo flight calls and p value < 0.05
diagnostics <- diagnostics[diagnostics$no_call_solo_indivs & diagnostics$p_values < 0.05, ]

# read acoustic parameter data
acous_param_l <- readRDS("./data/processed/acoustic_parameters_all_groups_specific_warbler_acoustic_measurements_curated_data_2020_&_2021.RDS")

# read random forest models
avg_mods <- readRDS("./data/processed/average_models_random_forest_all_groups_best_parameter_combination.RDS")

# acous_param_l <- acous_param_l[names(acous_param_l) %in% names(avg_mods)]

# extract random forests from acous_param_l list 
random_forests_l <- lapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "random_forests"))

# predict group flights
agg_preds_l <- warbleR:::pblapply_wrblr_int(1:nrow(diagnostics), function(x){
  
  preds <- try(pred_group(x = diagnostics$group[x], 
                 Y = acous_param_l[[diagnostics$group[x]]], 
                 Z = random_forests_l[[diagnostics$group[x]]]), 
      silent = TRUE)
    
  return(preds)
  }
)

agg_pred <- do.call(rbind, lapply(agg_preds_l, function(x) x[, c("group", "sound.files", "max_prob", "pred_indiv")]))

rownames(agg_pred) <- 1:nrow(agg_pred)

# save as RDS
saveRDS(agg_pred, "./data/processed/predicted_individual_in_group_flights.RDS")
# read as RDS
agg_pred <- readRDS("./data/processed/predicted_individual_in_group_flights.RDS")

# read diagnostics
diagnostics <- readRDS("./data/processed/random_forests_diagnostics_solo_flight.RDS")

# get summary by group
summary_by_group_list <- lapply(unique(agg_pred$group), function(x) {
    
  Y <- agg_pred[agg_pred$group == x, ]

#total number of calls above lowest group threshold for true positives
  above_threshold_calls <- sum(Y$max_prob > diagnostics$min_prob_threshold[diagnostics$group == x])

#proportion of calls
  prop_above_calls <- above_threshold_calls / nrow(Y)

  return(data.frame(group = x, experiment = diagnostics$experiment[diagnostics$group == x], total_calls = nrow(Y), above_threshold_calls = above_threshold_calls, prop_above_calls, n_individuals = diagnostics$n_indiv[diagnostics$group == x]))
  
})

summary_by_group <- do.call(rbind, summary_by_group_list)

oa_DT <- datatable(summary_by_group, editable = list(
  target = 'row'
), rownames = FALSE, style = "bootstrap",  filter = 'top', options = list(
  pageLength = 100, autoWidth = TRUE, dom = 'ft'
), autoHideNavigation = TRUE, escape = FALSE)

formatRound(table = oa_DT, columns = sapply(summary_by_group, is.numeric), 3)
groups_by_cat <- aggregate(group ~ experiment, summary_by_group, FUN = function(x) length(unique(x)))

#print pretty table
df3 <- knitr::kable(groups_by_cat, row.names = FALSE, escape = FALSE, format = "html", digits = 2)

kable_styling(df3, bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE, font_size = 18)
experiment group
mixed 28
regular 25

Group count by number of individuals and experiment:

groups_by_cat_n <- as.data.frame.matrix(table(summary_by_group$experiment, summary_by_group$n_individuals))

#print pretty table
df4 <- knitr::kable(groups_by_cat_n, row.names = TRUE, escape = FALSE, format = "html", digits = 2)

kable_styling(df4, bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE, font_size = 18)
2 3 4 5 6 7
mixed 9 9 6 4 0 0
regular 2 7 8 5 1 2

Proportion of calls above the lowest probability of all correctly classified call in the trainings data set (for that particular group):

ggplot(summary_by_group, aes(x = prop_above_calls)) +
  geom_density(fill = viridis(10, alpha = 0.7)[8]) +
  scale_fill_viridis_d(alpha = 0.4, end = 0.8) +
  labs(x = "Proportion of calls above lowest probability with correct classification", y = "Density") + 
  facet_wrap( ~ experiment, scales = "free_y") +
  theme_classic()

  • This value ranges between 0.81 and 1 across all groups

  • Overall, 0.98 of calls in group flights had a individual belonging probability higher than the lowest probability of a correctly classified call in the training data set (for that particular group).

 

Takeaways


There is high confidence in assingning individual ID to calls in group flights for both regular and mixed flights


Session information

## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
## 
## locale:
##  [1] LC_CTYPE=pt_BR.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=es_CR.UTF-8        LC_COLLATE=pt_BR.UTF-8    
##  [5] LC_MONETARY=es_CR.UTF-8    LC_MESSAGES=pt_BR.UTF-8   
##  [7] LC_PAPER=es_CR.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=es_CR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] soundgen_2.2.0     shinyBS_0.61       Sim.DiffProc_4.8   rlang_0.4.11      
##  [5] kableExtra_1.3.4   DT_0.19            viridis_0.6.1      viridisLite_0.4.0 
##  [9] pbapply_1.4-3      e1071_1.7-8        caret_6.0-88       ggplot2_3.3.5     
## [13] lattice_0.20-44    ranger_0.13.1      readxl_1.3.1       warbleR_1.1.27    
## [17] NatureSounds_1.0.4 knitr_1.33         seewave_2.1.8      tuneR_1.3.3       
## [21] devtools_2.4.2     usethis_2.0.1      lubridate_1.7.10   data.table_1.14.0 
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_2.0-2     rjson_0.2.20         ellipsis_0.3.2      
##   [4] class_7.3-19         rprojroot_2.0.2      fs_1.5.0            
##   [7] rstudioapi_0.13      proxy_0.4-26         farver_2.1.0        
##  [10] Deriv_4.1.3          remotes_2.4.0        prodlim_2019.11.13  
##  [13] fansi_0.5.0          xml2_1.3.2           codetools_0.2-18    
##  [16] splines_4.1.0        cachem_1.0.5         pkgload_1.2.1       
##  [19] jsonlite_1.7.2       pROC_1.17.0.1        shiny_1.6.0         
##  [22] compiler_4.1.0       httr_1.4.2           assertthat_0.2.1    
##  [25] Matrix_1.3-4         fastmap_1.1.0        cli_3.0.1           
##  [28] later_1.3.0          htmltools_0.5.1.1    prettyunits_1.1.1   
##  [31] tools_4.1.0          gtable_0.3.0         glue_1.4.2          
##  [34] reshape2_1.4.4       dplyr_1.0.7          Rcpp_1.0.7          
##  [37] cellranger_1.1.0     jquerylib_0.1.4      vctrs_0.3.8         
##  [40] svglite_2.0.0        nlme_3.1-152         crosstalk_1.1.1     
##  [43] iterators_1.0.13     timeDate_3043.102    gower_0.2.2         
##  [46] xfun_0.24            stringr_1.4.0        ps_1.6.0            
##  [49] testthat_3.0.4       rvest_1.0.1          mime_0.11           
##  [52] lifecycle_1.0.0      MASS_7.3-54          scales_1.1.1        
##  [55] ipred_0.9-11         promises_1.2.0.1     parallel_4.1.0      
##  [58] yaml_2.2.1           memoise_2.0.0        gridExtra_2.3       
##  [61] sass_0.4.0           rpart_4.1-15         stringi_1.7.3       
##  [64] highr_0.9            desc_1.3.0           foreach_1.5.1       
##  [67] pkgbuild_1.2.0       lava_1.6.9           pkgconfig_2.0.3     
##  [70] systemfonts_1.0.2    dtw_1.22-3           bitops_1.0-7        
##  [73] evaluate_0.14        purrr_0.3.4          labeling_0.4.2      
##  [76] recipes_0.1.16       htmlwidgets_1.5.3    processx_3.5.2      
##  [79] tidyselect_1.1.1     plyr_1.8.6           magrittr_2.0.1      
##  [82] R6_2.5.0             fftw_1.0-6           generics_0.1.0      
##  [85] DBI_1.1.1            pillar_1.6.1         withr_2.4.2         
##  [88] survival_3.2-11      RCurl_1.98-1.3       nnet_7.3-16         
##  [91] tibble_3.1.3         crayon_1.4.1         utf8_1.2.2          
##  [94] rmarkdown_2.10       grid_4.1.0           callr_3.7.0         
##  [97] ModelMetrics_1.2.2.2 webshot_0.5.2        digest_0.6.27       
## [100] xtable_1.8-4         httpuv_1.6.2         signal_0.7-7        
## [103] stats4_4.1.0         munsell_0.5.0        bslib_0.2.5.1       
## [106] sessioninfo_1.1.1