Load packages

## add 'developer' to packages to be installed from github
x <- c("devtools", "maRce10/warbleR", "readxl", "ranger", "caret", "e1071", "pbapply", "viridis", "ggplot2", "kableExtra", "rlang", "Sim.DiffProc", "soundgen")

aa <- lapply(x, function(y) {
  
  # get pakage name
  pkg <- strsplit(y, "/")[[1]]
  pkg <- pkg[length(pkg)]
  
  # check if installed, if not then install 
  if (!pkg %in% installed.packages()[,"Package"])  {

      if (grepl("/", y))  devtools::install_github(y, force = TRUE) else
    install.packages(y) 
    }

  # load package
  try(require(pkg, character.only = T), silent = T)
})

Functions and global parameters

warbleR_options(wl = 1100, parallel = parallel::detectCores() - 1, bp = "frange", fast = TRUE, threshold = 15, ovlp = 20)

opts_knit$set(root.dir = "..")

# set evaluation false
opts_chunk$set( fig.width = 6, fig.height = 3, eval = FALSE, warning = FALSE, message = FALSE, tidy = TRUE)

# number of trees in Random Forest models
num.trees <- 2000

# replicates in Random Forest replication
reps <- 50

# sensitivity cutoff
cutoff <- 0.86

# function to calculate classification random forest models with balanced sample sizes across categories
balanced_rf <- function(X, num.trees = 1000, random = FALSE, seed = 506){
  
    # get smallest n across individuals
    min.n <- min(table(X$indiv)) 
  
    # use seed
    set.seed(seed)
    
    
    # randomly get rows for equal n across indivs
    sel_rows <-
    sapply(unique(X$indiv), function(x)
      sample(rownames(X)[X$indiv == x], min.n, replace = FALSE))
  
  # subset to those rows  
  X <- X[c(sel_rows), ]
  
  # convert to factor
  if (random){ 
    
    # use seed
    set.seed(seed)

    X$indiv <- sample(X$indiv)
  }
  
   # make it a factor for ranger to work 
  X$indiv <- as.factor(X$indiv)
  
  # run RF model spectral and cepstral parameters
  rfm <-
    ranger(
      indiv ~ .,
      data = X[, !names(X) %in% c("sound.files", "selec")],
      num.trees = num.trees,
      importance = "impurity",
      probability = TRUE,
      seed = seed
    )
  
  # get predicted individual from probs
  pred_indiv <- apply(rfm$predictions, 1, function(x) colnames(rfm$predictions) [which.max(x)])
  
  rfm$predictions <- data.frame(rfm$predictions, indiv = X$indiv, pred_indiv, sound.files = X$sound.files)
  
  # remove X from start of names 
  names(rfm$predictions) <- gsub("^X", "", names(rfm$predictions))
  
   return(rfm)
  }


# function to calculate sensitivities at increasing RF class probabilities

sensitivity_fun <- function(X, parameters, thresholds = seq(0,1, by = 0.01)){

# get sensitivities for each group at very threshold
sensitiv_l <- lapply(X, function(x){

  # extract prediction data.frame
  Y <- x$aggregated_predictions
  Y$max <- apply(Y[, sapply(Y, is.numeric)], 1, max)
  
  # get sensitivity at different thresholds
  sensi_l <- lapply(thresholds, function(y) data.frame(sensitivity = sum(Y$pred_indiv[Y$max >= y] == Y$actual_indiv[Y$max >= y])/ sum(Y$max >= y), n = sum(Y$max >= y) / nrow(Y))) 
  
  sensi <- do.call(rbind, sensi_l)
  
  # add metadata
  sensi$group <- x$group 
  sensi$n_indiv <- x$n_indiv
  sensi$min_n <- x$min_n
  sensi$n_calls <- nrow(Y) * sensi$n

  return(sensi)
  })

# put in a data frame
sensitivities <- as.data.frame(lapply(sensitiv_l, "[[", which(names(sensitiv_l[[1]]) == "sensitivity")))

# get minimum sensitivity at each probabilities
sensitivities$min.sensitivity <- apply(sensitivities, 1, min, na.rm = TRUE)

# get minimum sensitivity at each probabilities
sensitivities$mean.sensitivity <- apply(sensitivities, 1, mean, na.rm = TRUE)

# add thresholds to data frame
sensitivities$thresholds <- thresholds


# put in a data frame
sensitivities$n_calls <- rowSums(as.data.frame(lapply(sensitiv_l, "[[", which(names(sensitiv_l[[1]]) == "n_calls"))), na.rm = TRUE)

sensitivities$n_calls_prop <- sensitivities$n_calls / max(sensitivities$n_calls)

sensitivities <- sensitivities[!is.infinite(sensitivities$mean.sensitivity), ]

sensitivities$parameters <- parameters    
return(sensitivities)  
} 

Read detections and prepare data

# read ext sel tab calls
clls <- readRDS("./output/CALLS_fixed_detections_inquiry_calls.RDS")

# read metadata
metadat <- as.data.frame(read_excel("./data/raw/Proyecto MPI enero 2020_3.xlsx", sheet = "Experimento video coor vuelo"))
# nrow(metadat)

# remove video calibration
# metadat <- metadat[metadat$`tipo de video` !=  "calibracion de video", ]

metadat <- metadat[!is.na(metadat$Audio), ]

# get audio file name from sound files
clls$Audio <- sapply(clls$sound.files, function(x){
  
  # split by _
  spl <- strsplit(x, split = "_")[[1]]
  
  # take 3rd
  spl <- spl[grep(".wav$", spl)]
  
  # remove .wav
  spl <- gsub(".wav", "", spl)
  
  # make it anumber
  spl <- as.numeric(spl)

  return(spl)
})

# anyNA(clls$Audio)

# nrow(clls)

clls <- clls[clls$Audio %in% metadat$Audio, ]

# add metadata to ext sel table
clls2 <- merge(clls, metadat[!duplicated(metadat$Audio), c("Audio", "tipo de video", "Experimento", "Grupo", "Individuo")])

clls <- fix_extended_selection_table(X = clls2, Y = clls)

# exclude those with obstacles
clls <- clls[clls$Experimento != "Búsqueda refugio con obstaculos", ]


clls$Individuo[clls$Experimento != "vuelo solo"] <- NA

Measure acoustic parameters

# loop to measure acoustic parameters on each group
acous_param_l <- lapply(unique(clls$Grupo), function(x) {
  
  print(x)
  print(which(unique(clls$Grupo) == x) / length(unique(clls$Grupo)))
  
  # get 1 group data
  grp_test <- clls[clls$Grupo == x, ]
  
  # keep only solo flights
  grp_test <-
    grp_test[grp_test$Experimento != "vuelo grupal/enmascarando busqueda", ]
  
  # remove low SNR calls
  grp_test <- sig2noise(grp_test, mar = 0.025, pb = FALSE)
  grp_test <- grp_test[grp_test$SNR > 1, ]
  grp_test$SNR <- NULL
  
tb <- aggregate(Audio ~ Grupo + Experimento, data = grp_test, length)

# if group flight then go, otherwise return NA
if (any(grepl("grupal", tb$Experimento)) & length(unique(grp_test$Individuo[grp_test$Experimento == "vuelo solo"])) > 1){
  
  ######## measure acoustic structure ######
  # measure spectrographic parameters
  sp <- specan(grp_test, pb = FALSE, harmonicity = TRUE)
  
  # remove time parameters
  sp <- sp[, grep("time\\.", names(sp), invert = TRUE)]

  # check if there are NAs
  # anyNA(sp)
  
  # measure cepstral coeffs
  cc <- mfcc_stats(grp_test, pb = FALSE)[, -c(1, 2)]

  # spectrographic cross correlation
  spxc <- xcorr(grp_test, pb = FALSE)
  
  # MDS
  spxc <- cmdscale(1 - spxc, k = 10, list. = TRUE)

  spxc_mds <- spxc$points
  
  colnames(spxc_mds) <- paste0("spxcMDS", 1:ncol(spxc_mds))
   
  # mfcc cross correlation
  mfccxc <- xcorr(grp_test, pb = FALSE, type = "mfcc")
  
  # MDS
  mfccxc <- cmdscale(1 - mfccxc, k = 10, list. = TRUE)

  mfxc_mds <- mfccxc$points
  
  colnames(mfxc_mds) <- paste0("mfxcMDS", 1:ncol(mfxc_mds))

  
  # dynamic time warping
    dtw.dists <- df_DTW(grp_test, img = FALSE, pb = FALSE, threshold.time = 1)

    # MDS
    dtw_mds <- cmdscale(dtw.dists, k = 10, list. = TRUE)$points
  
  # fix colnames
  colnames(dtw_mds) <- paste0("dtwMDS", 1:ncol(dtw_mds))

  # put parameters in a list
  acous_param_l <- list(sp, cc, dtw_mds, spxc_mds, mfxc_mds)

  # remove null ones
  acous_param_l <- acous_param_l[!sapply(acous_param_l, is.null)]
  
  # bind all acoustic structure parameters together
  all_acous_param <- do.call(cbind, acous_param_l)
  
  # scale
  all_acous_param[,-c(1, 2)] <- scale(all_acous_param[,-c(1, 2)])
  
  # add individual and experiment
  all_acous_param$indiv <- grp_test$Individuo
  all_acous_param$experiment <- grp_test$Experimento
  
  # remove bottom and top freq
  all_acous_param$top.freq <- all_acous_param$bottom.freq <- NULL
  
  return(all_acous_param)
  } else
    return(NULL)
})

names(acous_param_l) <- unique(clls$Grupo)


# remove those that did not have group flights
acous_param_l <- acous_param_l[!sapply(acous_param_l, is.null)]

# remove columns with NAs across all groups
# get names of columns that have any NA in at least 1 group (list element)
na_colnms <- unique(unlist(lapply(acous_param_l, function(x) names(x)[sapply(x, anyNA)])))

# keep indiv column
na_colnms <- na_colnms[na_colnms != "indiv"]

# remove NA columns for all groups
acous_param_l <- lapply(acous_param_l, function(x) x[, !names(x) %in%  na_colnms])


# save as RDS
saveRDS(acous_param_l, "./data/processed/acoustic_parameters_all_groups_all_warbler_acoustic_measurements.RDS")

Run random forest all groups

# read acoustic parameter data
acous_param_l <- readRDS("./data/processed/acoustic_parameters_all_groups_all_warbler_acoustic_measurements.RDS")

# minimum sample size per group
min_n <- sapply(acous_param_l, function(x) min(table(x$indiv)))

# remove grpoups with less than 10 observations for minimum sample size 
acous_param_l <- acous_param_l[!names(acous_param_l) %in% names(min_n)[min_n < 10]]

# which parameters would be measured
param_categories <- c("mfcc",  "spxc", "mfxc", "dtw", "sp")

 # get actual parameter names
 col_names <- names(acous_param_l[[1]])
 col_names <- col_names[col_names != "experiment"]
 
 # measurement category for each measruremnt
 clss_col_names <- col_names
 
# name it by measurement function 
 for (i in  1:length(param_categories))
   clss_col_names[if (param_categories[i] != "sp") 
                              grepl(c("cc", "spxc", "mfxc", "dtw")[i], col_names) else
                              !grepl("cc|xc|dtw|indiv|sound.files|selec", col_names)] <- param_categories[i]



# all posible combinations of 2 and 3 parameters
combs4 <- combn(param_categories, 4)
combs3 <- combn(param_categories, 3)
combs2 <- combn(param_categories, 2)

# ake it a list
combs <- c(as.data.frame(combs4), as.data.frame(combs3), as.data.frame(combs2))

# add all 4 parameters as an option to the list
combs <- c(append(combs, list(param_categories)), param_categories)


for (i in 1:length(combs)) {
  
  rds_name <- paste0("./data/processed/averaged_random_forest_models_from_solo_flights_", paste(combs[[i]], collapse = "-"), ".RDS")
  
  # run if file doesn't exist
  if (!file.exists(rds_name)){
    
# avg_mods <- list()
# loop over list of acoustic parameters
avg_mods <- pblapply(names(acous_param_l),
# cl = .Options$warbleR$parallel,
cl = 1,
function(x){
  # for (x in names(acous_param_l)){
    print(x)              
      # extract data
  X <- acous_param_l[[which(names(acous_param_l) == x)]]
  
  # only solo flight
  solo_rf_input <- X[X$experiment == "vuelo solo", ]

  # rename rows for sel_rows
  rownames(solo_rf_input) <- 1:nrow(solo_rf_input)
  
  # order by sound file column
  solo_rf_input <- solo_rf_input[order(solo_rf_input$sound.files), ]
  
  # remove experiment column
  solo_rf_input$experiment <- NULL
 
  # subset columns to keep only those from selected acoustic measurements 
  solo_rf_input <- solo_rf_input[ , col_names[clss_col_names %in% c(combs[[i]], "sound.files", "selec", "indiv")]]
 
  # run random forest, set a seed to make it replicable
  rf_results <- pblapply(1:reps, function(x) balanced_rf(X = solo_rf_input, num.trees = num.trees, seed = x), cl = 1)
  
  # merge together predictions by sound files
  rf_preds <- lapply(rf_results, function(x){
    mrg <- merge(data.frame(sound.files = solo_rf_input$sound.files), x$predictions[, grep("indiv$", names(x$predictions), invert = TRUE)], all.x = TRUE)

  mrg <- mrg[order(mrg$sound.files), -1]   
  }
 )
  
  # add column (individual) if not found 
  rf_preds <- lapply(rf_preds, function(x){
    
    if(ncol(x) < length(unique(solo_rf_input$indiv))){
      # how many columns are missing
      mssng <- length(unique(solo_rf_input$indiv)) - ncol(x)
      
      # add missing columns
      for(i in 1:(mssng)) x <- data.frame(x, NA, check.names = FALSE)
    names(x)[(ncol(x) - mssng + 1):ncol(x)] <- setdiff(unique(solo_rf_input$indiv), names(x)) 
    }
    return(x)
  })
  
  # get together predictions from the same individual
  preds_by_indv <- lapply(1:ncol(rf_preds[[1]]), function(y)
    do.call(cbind, lapply(rf_preds, "[", y)) 
  )
   
  agg_preds <- as.data.frame(lapply(preds_by_indv, rowMeans, na.rm = TRUE))  
  
  # add individual name to columns
  names(agg_preds) <- names(rf_preds[[1]])
  
  # add sound file column
  agg_preds$sound.files <- solo_rf_input$sound.files
  
    # get predicted indiv from aggregated probabilities 
  agg_preds$pred_indiv <- apply(agg_preds[, sapply(agg_preds, is.numeric)], 1, function(x) colnames(agg_preds)[which.max(x)])

  # make it a factor
  pred_indiv <- factor(agg_preds$pred_indiv, levels = unique(solo_rf_input$indiv))
  agg_preds$actual_indiv <- actual_indiv <- factor(solo_rf_input$indiv, levels = unique(solo_rf_input$indiv))
  
  # get confusion matrix
  cm_solo <- confusionMatrix(pred_indiv, reference = actual_indiv)
  
  ### NULL MODEL
  # run null model by randomizing indiv labels
  rf_null_results <- pblapply(1:reps, function(x) balanced_rf(X = solo_rf_input, num.trees = num.trees, random = TRUE, seed = x), cl = 1)
  
  # get accuracies form null models  
    # merge together predictions by sound files
  rf_null_preds <- lapply(rf_null_results, function(x){
    mrg <- merge(data.frame(sound.files = solo_rf_input$sound.files), x$predictions[, grep("indiv$", names(x$predictions), invert = TRUE)], all.x = TRUE)

  mrg <- mrg[order(mrg$sound.files), -1]   
  }
 )

  # add column (individual) if not found 
  rf_null_preds <- lapply(rf_null_preds, function(x){
    
    if(ncol(x) < length(unique(solo_rf_input$indiv))){
      # how many columns are missing
      mssng <- length(unique(solo_rf_input$indiv)) - ncol(x)
      
      # add missing columns
      for(i in 1:(mssng)) x <- data.frame(x, NA, check.names = FALSE)
    
      names(x)[(ncol(x) - mssng + 1):ncol(x)] <- setdiff(unique(solo_rf_input$indiv), names(x)) 
    }
    return(x)
  })
  
  # get together predictions from the same individual
  preds_by_indv_null <- lapply(1:ncol(rf_null_preds[[1]]), function(y)
    do.call(cbind, lapply(rf_null_preds, "[", y)) 
  )
   
  agg_preds_null <- as.data.frame(lapply(preds_by_indv_null, rowMeans, na.rm = TRUE))  
  
  # add individual name to columns
  names(agg_preds_null) <- names(rf_null_preds[[1]])
  
  # add sound file column
  agg_preds_null$sound.files <- solo_rf_input$sound.files
  
    # get predicted indiv from aggregated probabilities 
  agg_preds_null$pred_indiv <- apply(agg_preds_null[, sapply(agg_preds_null, is.numeric)], 1, function(x) colnames(agg_preds_null)[which.max(x)])

  # make it a factor
  pred_indiv_null <- factor(agg_preds_null$pred_indiv, levels = unique(solo_rf_input$indiv))
  actual_indiv <- factor(solo_rf_input$indiv, levels = unique(solo_rf_input$indiv))
  
  # get confusion matrix
  cm_solo_null <- confusionMatrix(pred_indiv_null, reference = actual_indiv)
  
  ### NOTE: ranger() OOB prediction error and confusionMatrix() Accuracy are the same
  # put all results together
  output <- list(group = x, accuracy = cm_solo$overall[1], null_accuracy = cm_solo_null$overall[1], aggregated_predictions = agg_preds, conf_matrix = cm_solo, random_forests = rf_results, n_indiv = length(unique(solo_rf_input$indiv)), min_n = min(table(solo_rf_input$indiv)), parameters = combs[[i]])

  # avg_mods[[length(avg_mods) + 1]] <- output
  # rm(output)
  
return(output)
  }
)

# add group name to list
names(avg_mods) <- names(acous_param_l)

# save as RDS
saveRDS(avg_mods, rds_name)
  }
}

Predict solo flight optimizing sensitivity for different acoustic parameter sets

  • Prediction with all solo flight data with a sensitivy threshold of 0.86 for different acoustic parameters as predictors
# read rf outputs

rds_rf_outputs <- list.files(path = "./data/processed", pattern = "averaged_random_forest_models", full.names = TRUE)

model_diagnostics_l <- pblapply(rds_rf_outputs, cl = 1, function(x) {

  # read data
  avg_mods <- readRDS(x)

sensitivities <- sensitivity_fun(X = avg_mods, parameters = paste(avg_mods[[1]]$parameters, collapse = "-"))

# calculate threshold at cutoff
thresh_prob <- min(sensitivities$thresholds[sensitivities$mean.sensitivity >= cutoff])

avg_accuracy <- sapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "accuracy"))

null_accuracy <- sapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "null_accuracy"))

filtered_accuracy <- unlist(sensitivities[sensitivities$thresholds == thresh_prob, 1:length(null_accuracy)])

diagnostics <- data.frame(group = gsub("X", "", names(filtered_accuracy)), model = rep(c("real data", "null model", "filtered model"), each = length(avg_accuracy)), accuracy = c(avg_accuracy, null_accuracy, filtered_accuracy), acoustic_parametes = gsub("./data/processed/averaged_random_forest_models_from_solo_flights_|.RDS","", x))

diagnostics$model <- factor(diagnostics$model, levels = c("null model", "filtered model", "real data"))

return(list(diagnostics = diagnostics, sensitivities = sensitivities))
})

# put in a data frame
model_diagnostics <- do.call(rbind, lapply(model_diagnostics_l, "[[", 1))


saveRDS(model_diagnostics, "./data/processed/random_forests_diagnostics_several_acoustic_parameter_combinations_solo_flight.RDS")

# save sensitivities

senstivities_l <- lapply(model_diagnostics_l, "[[", 2)

   # determine all column names in all selection tables    
    cnms <- unique(unlist(lapply(senstivities_l, names)))    
    
    # add columns that are missing to each selection table
    senstivities_l <- lapply(senstivities_l, function(X)
    {
      nms <- names(X)
      if (length(nms) != length(cnms))  
        for(i in cnms[!cnms %in% nms]) {
          X <- data.frame(X,  NA, stringsAsFactors = FALSE, check.names = FALSE)
          names(X)[ncol(X)] <- i
        }
      return(X)
    })


model_sensitivities <- do.call(rbind, senstivities_l)


saveRDS(model_sensitivities, "./data/processed/random_forests_sensitivities_several_acoustic_parameter_combinations_solo_flight.RDS")
# read diagnostic
model_diagnostics <- readRDS("./data/processed/random_forests_diagnostics_several_acoustic_parameter_combinations_solo_flight.RDS")

# make factor to order plot x axis ticks
model_diagnostics$acoustic_parametes <- factor(model_diagnostics$acoustic_parametes, levels = sort(unique(model_diagnostics$acoustic_parametes)))

# make it numeric for points/lines
model_diagnostics$model <- factor(model_diagnostics$model)
model_diagnostics$model_num <- as.numeric(model_diagnostics$model)
model_diagnostics$model_num[model_diagnostics$model != "null model"] <- 2


# density plots
ggplot(model_diagnostics[model_diagnostics$model != "filtered model", ], aes(x = accuracy, fill = model)) +
  geom_density(alpha=0.4) + 
  theme_classic() + 
  scale_fill_viridis_d() + 
  ggtitle("Original (non-filtered) accuracies") + 
  labs(x = "Mean accuracy", y = "Frequency") +
    facet_wrap(~  acoustic_parametes, ncol = 4)

ggplot(model_diagnostics[model_diagnostics$model != "real data", ], aes(x = accuracy, fill = model)) +
  geom_density(alpha=0.4) + 
  theme_classic() + 
  scale_fill_viridis_d() + 
  ggtitle("Optimized accuracies") + 
  labs(x = "Mean accuracy", y = "Frequency") +
  facet_wrap(~  acoustic_parametes, ncol = 4)

# 
sensitivities <- readRDS("./data/processed/random_forests_sensitivities_several_acoustic_parameter_combinations_solo_flight.RDS")


# Probability threshold vs Mean sensitivty
# ggplot(data = sensitivities, aes(x = thresholds, y = mean.sensitivity)) + 
#   geom_hline(yintercept = cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) + 
#   # geom_vline(xintercept = thresh_prob, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
#   geom_point(col = magma(10, alpha = 0.7)[2]) +
#   geom_line(col = magma(10, alpha = 0.7)[2]) +
#   labs(y = "Mean sensitivty", x = "Probability threshold") +
#   theme_classic() +
#   # annotate(geom = "text", x = thresh_prob + 0.04, y = 0.9, label = round(thresh_prob, 2)) +
#   facet_wrap(~  parameters, ncol = 4)
# 
# # calculate threshold at
# n_call_cutoff <- max(sensitivities$n_calls_prop[sensitivities$mean.sensitivity >= cutoff])
# 


# #  Probability threshold vs Proportion of calls used
# ggplot(data = sensitivities, aes(x = thresholds, y = n_calls_prop)) +
#   geom_hline(yintercept = n_call_cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
#   # geom_vline(xintercept = thresh_prob, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
#   geom_point(col = magma(10, alpha = 0.7)[2]) +
#   geom_line(col = magma(10, alpha = 0.7)[2]) +
#   labs(y = "Proportion of calls used", x = "Probability threshold") +
#   theme_classic() +
#   # annotate(geom = "text", x = n_call_cutoff + 0.04, y = 0.9, label = round(n_call_cutoff, 2))
#   facet_wrap(~  parameters, ncol = 4)
# 
# ggplot(data = sensitivities, aes(x = mean.sensitivity, y = n_calls_prop)) +
#   geom_hline(yintercept = n_call_cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
#   geom_vline(xintercept = cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
#   geom_point(col = magma(10, alpha = 0.7)[2]) +
#   geom_line(col = magma(10, alpha = 0.7)[2]) +
#   labs(y = "Proportion of calls used", x = "Sensitivity") +
#   theme_classic() +
#   # annotate(geom = "text", y = n_call_cutoff + 0.04, x = cutoff, label = round(n_call_cutoff, 2))
#   facet_wrap(~  parameters, ncol = 4)



maxs <- sapply(split(sensitivities, sensitivities$parameters), function(Z) max(Z$n_calls_prop[Z$mean.sensitivity >= cutoff]))

max_prop_calls <- data.frame(data_set = names(maxs), max_prop_calls = maxs)

max_prop_calls <- max_prop_calls[order(- max_prop_calls$max_prop_calls), ]

df1 <- knitr::kable(max_prop_calls, row.names = FALSE, escape = FALSE, format = "html", digits = 2)

kable_styling(df1, bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE, font_size = 18)
data_set max_prop_calls
mfcc-spxc-mfxc-sp 0.98
mfcc-spxc-mfxc-dtw-sp 0.97
mfcc-mfxc-dtw-sp 0.96
mfcc-mfxc-sp 0.95
mfcc-spxc-sp 0.94
mfcc-spxc-dtw-sp 0.93
mfcc-spxc-mfxc-dtw 0.93
mfcc-sp 0.92
mfcc-dtw-sp 0.92
mfcc-spxc-mfxc 0.92
mfcc-mfxc 0.91
mfcc-mfxc-dtw 0.91
spxc-mfxc 0.91
spxc-mfxc-dtw 0.89
spxc-mfxc-dtw-sp 0.88
spxc-mfxc-sp 0.87
mfcc-spxc-dtw 0.86
mfcc-spxc 0.86
spxc-sp 0.83
spxc-dtw-sp 0.83
mfcc 0.83
mfxc-dtw-sp 0.82
mfcc-dtw 0.81
sp-mfcc-dtw 0.79
spxc 0.79
mfxc-sp 0.78
spxc-dtw 0.73
dtw-sp 0.71
mfxc 0.71
sp 0.69
mfxc-dtw 0.68
dtw 0.01

Predict solo flight optimizing sensitivity

  • Prediction with all solo flight data with a sensitivy threshold of 0.86 for acoustic parameters: mfcc-spxc-mfxc-sp
# read rf outputs
avg_mods <- readRDS(paste0("./data/processed/averaged_random_forest_models_from_solo_flights_", max_prop_calls$data_set[1],".RDS"))

sensitivities <- sensitivity_fun(X = avg_mods, parameters = "mfcc-sp")

# calculate threshold at cutoff
thresh_prob <- min(sensitivities$thresholds[sensitivities$mean.sensitivity >= cutoff])

avg_accuracy <- sapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "accuracy"))

null_accuracy <- sapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "null_accuracy"))

filtered_accuracy <- unlist(sensitivities[sensitivities$thresholds == thresh_prob, 
    1:length(null_accuracy)])

df <- data.frame(group = gsub("X", "", names(filtered_accuracy)), model = rep(c("real data", 
    "null model", "filtered model"), each = length(avg_accuracy)), accuracy = c(avg_accuracy, 
    null_accuracy, filtered_accuracy))

df$model <- factor(df$model, levels = c("null model", "filtered model", "real data"))

# violin plots
ggplot(df[df$model != "filtered model", ], aes(y = accuracy, x = model, fill = model)) + 
    geom_violin() + theme_classic() + scale_fill_viridis_d(alpha = 0.4) + ggtitle("Original accuracies") + 
    labs(y = "Mean accuracy", x = "Model") + geom_point(aes(x = rep(c(2, 1), each = nrow(df)/3)), 
    size = 5, show.legend = FALSE, col = "gray43", alpha = 0.8) + geom_line(aes(x = rep(c(2, 
    1), each = nrow(df)/3), group = group), col = "gray43", alpha = 0.8)

ggplot(df[df$model != "real data", ], aes(y = accuracy, x = model, fill = model)) + 
    geom_violin() + theme_classic() + scale_fill_viridis_d(alpha = 0.4) + ggtitle("Optimized accuracies") + 
    labs(y = "Mean accuracy", x = "Model") + geom_point(aes(x = rep(c(1, 2), each = nrow(df)/3)), 
    size = 3, show.legend = FALSE, col = "gray43", alpha = 0.8) + geom_line(aes(x = rep(c(1, 
    2), each = nrow(df)/3), group = group), col = "gray43", alpha = 0.8)

# density plots
ggplot(df[df$model != "filtered model", ], aes(x = accuracy, fill = model)) + geom_density(alpha = 0.4) + 
    theme_classic() + scale_fill_viridis_d() + ggtitle("Original (non-filtered) accuracies") + 
    labs(x = "Mean accuracy", y = "Frequency")

ggplot(df[df$model != "real data", ], aes(x = accuracy, fill = model)) + geom_density(alpha = 0.4) + 
    theme_classic() + scale_fill_viridis_d() + ggtitle("Optimized accuracies") + 
    labs(x = "Mean accuracy", y = "Frequency")

# Probability threshold vs Mean sensitivty
ggplot(data = sensitivities, aes(x = thresholds, y = mean.sensitivity)) + geom_hline(yintercept = cutoff, 
    col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) + geom_vline(xintercept = thresh_prob, 
    col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) + geom_point(col = magma(10, 
    alpha = 0.7)[2]) + geom_line(col = magma(10, alpha = 0.7)[2]) + labs(y = "Mean sensitivty", 
    x = "Probability threshold") + theme_classic() + annotate(geom = "text", x = thresh_prob + 
    0.04, y = 0.9, label = round(thresh_prob, 2))

# calculate threshold at
n_call_cutoff <- max(sensitivities$n_calls_prop[sensitivities$mean.sensitivity >= 
    cutoff])

# Probability threshold vs Proportion of calls used
ggplot(data = sensitivities, aes(x = thresholds, y = n_calls_prop)) + geom_hline(yintercept = n_call_cutoff, 
    col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) + geom_vline(xintercept = thresh_prob, 
    col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) + geom_point(col = magma(10, 
    alpha = 0.7)[2]) + geom_line(col = magma(10, alpha = 0.7)[2]) + labs(y = "Proportion of calls used", 
    x = "Probability threshold") + theme_classic() + annotate(geom = "text", x = n_call_cutoff + 
    0.04, y = 0.9, label = round(n_call_cutoff, 2))

ggplot(data = sensitivities, aes(x = mean.sensitivity, y = n_calls_prop)) + geom_hline(yintercept = n_call_cutoff, 
    col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) + geom_vline(xintercept = cutoff, 
    col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) + geom_point(col = magma(10, 
    alpha = 0.7)[2]) + geom_line(col = magma(10, alpha = 0.7)[2]) + labs(y = "Proportion of calls used", 
    x = "Sensitivity") + theme_classic() + annotate(geom = "text", y = n_call_cutoff + 
    0.04, x = cutoff, label = round(n_call_cutoff, 2))

# 
# # remove group 42 AND 6
# avg_mods <- avg_mods[!names(avg_mods) %in% c("42", "6", "21", "29")]
# 
# # calculate sensitivities
# sensitivities <- sensitivity_fun(X = avg_mods, parameters = max_prop_calls$data_set[1])
# 
# # calculate threshold at cutoff
# thresh_prob <- min(sensitivities$thresholds[sensitivities$mean.sensitivity >= cutoff])
# 
# avg_accuracy <- sapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "accuracy"))
# 
# null_accuracy <- sapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "null_accuracy"))
# 
# filtered_accuracy <- unlist(sensitivities[sensitivities$thresholds == thresh_prob, 
#     1:length(null_accuracy)])
# 
# df <- data.frame(group = gsub("X", "", names(filtered_accuracy)), model = rep(c("real data", 
#     "null model", "filtered model"), each = length(avg_accuracy)), accuracy = c(avg_accuracy, 
#     null_accuracy, filtered_accuracy))
# 
# df$model <- factor(df$model, levels = c("null model", "filtered model", "real data"))
# 
# # violin plots
# ggplot(df[df$model != "filtered model", ], aes(y = accuracy, x = model, fill = model)) + 
#     geom_violin() + theme_classic() + scale_fill_viridis_d(alpha = 0.4) + ggtitle("Original accuracies") + 
#     labs(y = "Mean accuracy", x = "Model") + geom_point(aes(x = rep(c(2, 1), each = nrow(df)/3)), 
#     size = 5, show.legend = FALSE, col = "gray43", alpha = 0.8) + geom_line(aes(x = rep(c(2, 
#     1), each = nrow(df)/3), group = group), col = "gray43", alpha = 0.8)
# 
# 
# ggplot(df[df$model != "real data", ], aes(y = accuracy, x = model, fill = model)) + 
#     geom_violin() + theme_classic() + scale_fill_viridis_d(alpha = 0.4) + ggtitle("Optimized accuracies") + 
#     labs(y = "Mean accuracy", x = "Model") + geom_point(aes(x = rep(c(1, 2), each = nrow(df)/3)), 
#     size = 3, show.legend = FALSE, col = "gray43", alpha = 0.8) + geom_line(aes(x = rep(c(1, 
#     2), each = nrow(df)/3), group = group), col = "gray43", alpha = 0.8)
# 
# 
# # density plots
# ggplot(df[df$model != "filtered model", ], aes(x = accuracy, fill = model)) + geom_density(alpha = 0.4) + 
#     theme_classic() + scale_fill_viridis_d() + ggtitle("Original (non-filtered) accuracies") + 
#     labs(x = "Mean accuracy", y = "Frequency")
# 
# 
# ggplot(df[df$model != "real data", ], aes(x = accuracy, fill = model)) + geom_density(alpha = 0.4) + 
#     theme_classic() + scale_fill_viridis_d() + ggtitle("Optimized accuracies") + 
#     labs(x = "Mean accuracy", y = "Frequency")
# 
# 
# ggplot(data = sensitivities, aes(x = thresholds, y = mean.sensitivity)) + geom_hline(yintercept = cutoff, 
#     col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) + geom_vline(xintercept = thresh_prob, 
#     col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) + geom_point(col = magma(10, 
#     alpha = 0.7)[2]) + geom_line(col = magma(10, alpha = 0.7)[2]) + labs(y = "Mean sensitivty", 
#     x = "Probability threshold") + theme_classic() + annotate(geom = "text", x = thresh_prob + 
#     0.04, y = 0.9, label = round(thresh_prob, 2))
# 
# 
# # calculate threshold at
# n_call_cutoff <- max(sensitivities$n_calls_prop[sensitivities$mean.sensitivity >= 
#     cutoff])
# 
# 
# ggplot(data = sensitivities, aes(x = thresholds, y = n_calls_prop)) + geom_hline(yintercept = n_call_cutoff, 
#     col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) + geom_vline(xintercept = thresh_prob, 
#     col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) + geom_point(col = magma(10, 
#     alpha = 0.7)[2]) + geom_line(col = magma(10, alpha = 0.7)[2]) + labs(y = "Proportion of calls used", 
#     x = "Probability threshold") + theme_classic() + annotate(geom = "text", x = n_call_cutoff + 
#     0.04, y = 0.9, label = round(n_call_cutoff, 2))
# 
# 
# 
# ggplot(data = sensitivities, aes(x = mean.sensitivity, y = n_calls_prop)) + geom_hline(yintercept = n_call_cutoff, 
#     col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) + geom_vline(xintercept = cutoff, 
#     col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) + geom_point(col = magma(10, 
#     alpha = 0.7)[2]) + geom_line(col = magma(10, alpha = 0.7)[2]) + labs(y = "Proportion of calls used", 
#     x = "Sensitivity") + theme_classic() + annotate(geom = "text", y = n_call_cutoff + 
#     0.04, x = cutoff, label = round(n_call_cutoff, 2))
  • Probability threshold of 0.28

  • Prediction removing groups 42 and 6 (low sensitivity)
  • Sensitivy threshold of 0.86
# remove group 42 AND 6
avg_mods <- avg_mods[!names(avg_mods) %in% c("42", "6", "21", "29")]

# calculate sensitivities
sensitivities <- sensitivity_fun(X = avg_mods, parameters = max_prop_calls$data_set[1])

# calculate threshold at cutoff
thresh_prob <- min(sensitivities$thresholds[sensitivities$mean.sensitivity >= cutoff])

avg_accuracy <- sapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "accuracy"))

null_accuracy <- sapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "null_accuracy"))

filtered_accuracy <- unlist(sensitivities[sensitivities$thresholds == thresh_prob, 1:length(null_accuracy)])

df <- data.frame(group = gsub("X", "", names(filtered_accuracy)), model = rep(c("real data", "null model", "filtered model"), each = length(avg_accuracy)), accuracy = c(avg_accuracy, null_accuracy, filtered_accuracy))

df$model <- factor(df$model, levels = c("null model", "filtered model", "real data"))

# violin plots
ggplot(df[df$model != "filtered model", ], aes(y = accuracy, x = model, fill = model)) +
  geom_violin() +
   theme_classic() + 
  scale_fill_viridis_d(alpha = 0.4) + 
  ggtitle("Original accuracies") + 
  labs(y = "Mean accuracy", x = "Model") +
  geom_point(aes(x = rep(c(2, 1), each = nrow(df) / 3)), size = 5, show.legend = FALSE, col = "gray43", alpha = 0.8) +
  geom_line(aes(x = rep(c(2, 1), each = nrow(df) / 3), group = group), col = "gray43", alpha = 0.8)

ggplot(df[df$model != "real data", ], aes(y = accuracy, x = model, fill = model)) +
  geom_violin() +
   theme_classic() + 
  scale_fill_viridis_d(alpha = 0.4) + 
  ggtitle("Optimized accuracies") + 
  labs(y = "Mean accuracy", x = "Model") +
  geom_point(aes(x = rep(c(1, 2), each = nrow(df) / 3)), size = 3, show.legend = FALSE, col = "gray43", alpha = 0.8) +
  geom_line(aes(x = rep(c(1, 2), each = nrow(df) / 3), group = group), col = "gray43", alpha = 0.8)

# density plots
ggplot(df[df$model != "filtered model", ], aes(x = accuracy, fill = model)) +
  geom_density(alpha=0.4) + 
  theme_classic() + 
  scale_fill_viridis_d() + 
  ggtitle("Original (non-filtered) accuracies") + 
  labs(x = "Mean accuracy", y = "Frequency")

ggplot(df[df$model != "real data", ], aes(x = accuracy, fill = model)) +
  geom_density(alpha=0.4) + 
  theme_classic() + 
  scale_fill_viridis_d() + 
  ggtitle("Optimized accuracies") + 
  labs(x = "Mean accuracy", y = "Frequency")

ggplot(data = sensitivities, aes(x = thresholds, y = mean.sensitivity)) + 
  geom_hline(yintercept = cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) + 
  geom_vline(xintercept = thresh_prob, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
  geom_point(col = magma(10, alpha = 0.7)[2]) +
  geom_line(col = magma(10, alpha = 0.7)[2]) +
  labs(y = "Mean sensitivty", x = "Probability threshold") +
  theme_classic() +
  annotate(geom = "text", x = thresh_prob + 0.04, y = 0.9, label = round(thresh_prob, 2))

# calculate threshold at
n_call_cutoff <- max(sensitivities$n_calls_prop[sensitivities$mean.sensitivity >= cutoff])


ggplot(data = sensitivities, aes(x = thresholds, y = n_calls_prop)) +
  geom_hline(yintercept = n_call_cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
  geom_vline(xintercept = thresh_prob, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
  geom_point(col = magma(10, alpha = 0.7)[2]) +
  geom_line(col = magma(10, alpha = 0.7)[2]) +
  labs(y = "Proportion of calls used", x = "Probability threshold") +
  theme_classic() +
  annotate(geom = "text", x = n_call_cutoff + 0.04, y = 0.9, label = round(n_call_cutoff, 2))

ggplot(data = sensitivities, aes(x = mean.sensitivity, y = n_calls_prop)) +
  geom_hline(yintercept = n_call_cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
  geom_vline(xintercept = cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
  geom_point(col = magma(10, alpha = 0.7)[2]) +
  geom_line(col = magma(10, alpha = 0.7)[2]) +
  labs(y = "Proportion of calls used", x = "Sensitivity") +
  theme_classic() +
  annotate(geom = "text", y = n_call_cutoff + 0.04, x = cutoff, label = round(n_call_cutoff, 2))

  • Probability threshold of 0.21

Predict group flight for experiments without playback

  • Sensitivy threshold of 0.86 derived from solo flight optimization, which is produced by a probability threshold of 0.21
# read acoustic parameter data
acous_param_l <- readRDS("./data/processed/acoustic_parameters_all_groups_all_warbler_acoustic_measurements.RDS")

acous_param_l <- acous_param_l[names(acous_param_l) %in% names(avg_mods)]

# extract random forests from acous_param_l list 
random_forests_l <- lapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "random_forests"))

agg_preds_l <- lapply(names(random_forests_l), function(x){

  # print(x)
  Z <- acous_param_l[[x]]
  Y <- Z[Z$experiment == "vuelo grupal/sin sonido", ]
  Y$indiv <- NULL
  
  # random forest models for this group
  rfms <- random_forests_l[[x]]
  
  # predict using all random forest models
  rf_preds <- lapply(rfms, function(x) predict(object = x, data = Y)$predictions)

    # add column (individual) if not found 
  rf_preds <- lapply(rf_preds, function(x){
    
    if(ncol(x) < length(unique(Z$indiv[Z$experiment == "vuelo solo"]))){
      # how many columns are missing
      mssng <- length(unique(Z$indiv[Z$experiment == "vuelo solo"])) - ncol(x)
      
      # add missing columns with 0
      for(i in 1:(mssng)) x <- data.frame(x, 0, check.names = FALSE)
    names(x)[(ncol(x) - mssng + 1):ncol(x)] <- setdiff(unique(Z$indiv[Z$experiment == "vuelo solo"]), names(x)) 
    }
    return(x)
  })
  
  # get together predictions from the same individual
  preds_by_indv <- lapply(1:ncol(rf_preds[[1]]), function(y)
    do.call(cbind, lapply(rf_preds, function(e) e[, y])) 
  )
   
  agg_preds <- as.data.frame(lapply(preds_by_indv, rowMeans, na.rm = TRUE))  
  
  # add individual name to columns
  names(agg_preds) <- colnames(rf_preds[[1]])
  
  # add sound file column
  agg_preds$sound.files <- Y$sound.files
  
    # get predicted indiv from aggregated probabilities 
  agg_preds$pred_indiv <- apply(agg_preds[, sapply(agg_preds, is.numeric)], 1, function(x) colnames(agg_preds)[which.max(x)])

  agg_preds$group <- x
  agg_preds$max_prob <- apply(agg_preds[, sapply(agg_preds, is.numeric)], 1, max)
  
  return(agg_preds)
  })

# add group name to list
names(agg_preds_l) <- names(acous_param_l)


agg_pred <- do.call(rbind, lapply(agg_preds_l, function(x) x[, c("group", "sound.files", "max_prob", "pred_indiv")]))

# remove those groups with low sensitivity
agg_pred <- agg_pred[!agg_pred$group %in% c("42", "6"), ] 

# save as RDS
saveRDS(agg_pred, "./data/processed/predicted_individual_in_group_flights.RDS")
# read as RDS
agg_pred <- readRDS("./data/processed/predicted_individual_in_group_flights.RDS")

# get summary by group
summ_by_groups <- lapply(unique(agg_pred$group), function(x) {
    
  Y <- agg_pred[agg_pred$group == x, ]

#total number of calls used
  n_used_calls <- sum(Y$max_prob > thresh_prob)

#proportion of calls used
  prop_used_calls <- sum(Y$max_prob > thresh_prob) / nrow(Y)

  return(data.frame(group = x, n_used_calls, total_calls = nrow(Y), prop_used_calls))
  
})

summ_by_group <- do.call(rbind, summ_by_groups)

# add row with total across all groups
total_row <- data.frame(group = "All groups",   n_used_calls = sum(summ_by_group$n_used_calls), total_calls = sum(summ_by_group$total_calls))

total_row$prop_used_calls <- total_row$n_used_calls / total_row$total_calls

# bind totals to summary table
summ_by_group <- rbind(summ_by_group, total_row)

#print pretty table
df1 <- knitr::kable(summ_by_group, row.names = TRUE, escape = FALSE, format = "html", digits = 2)

df1 <-  kable_styling(df1, bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE, font_size = 18)

row_spec(df1, row = nrow(summ_by_group), bold = TRUE, color = "white", background = viridis(10)[8])
group n_used_calls total_calls prop_used_calls
1 4 69 69 1
2 3 176 176 1
3 1 46 46 1
4 40 3 3 1
5 9 39 39 1
6 15 59 59 1
7 13 5 5 1
8 18-A-B 20 20 1
9 6/? 72 72 1
10 12/12/N/N 15 15 1
11 12/NN 58 58 1
12 All groups 562 562 1

Analyze calling activity in solo vs group flight

Using all calls, including those with low class probabilities

# read acoustic parameter data
acous_param_l <- readRDS("./data/processed/acoustic_parameters_all_groups_all_warbler_acoustic_measurements.RDS")

# exclude groups with low sensitivity (42 & 6) or low sample size after removing low prob calls (29)
acous_param_l <- acous_param_l[!names(acous_param_l) %in% c("42", "6", "21", "29")]

# get group and solo call counts per individual 
sl_vs_grps <- lapply(names(acous_param_l), function(x){
  
  acous_param <- acous_param_l[[x]]
  
  group_file <- acous_param$sound.files[acous_param$experiment == "vuelo grupal/sin sonido"][1]
  
  # get sound file number for group flights
  group_file <- strsplit(group_file, ".wav")[[1]][1]
  group_file_num <- as.numeric(substr(group_file, start =  nchar(group_file) - 6, nchar(group_file)))
  
  # get data for solo flights
  solo <- acous_param[acous_param$experiment == "vuelo solo", ]
  
  sl_grp_cll <- lapply(unique(solo$indiv), function(y){
    
    solo_calls <- sum(solo$indiv == y)

    group_calls <- sum(agg_pred$group == x & agg_pred$pred_indiv == y)
    
      # get sound file number for group flights
    solo_file <- solo$sound.files[solo$indiv == y][1]
    solo_file <- strsplit(solo_file, ".wav")[[1]][1]
    solo_file_num <- as.numeric(substr(solo_file, start =  nchar(solo_file) - 6, nchar(solo_file)))
  
    return(data.frame(group = x, indiv = y, experiment = c("solo", "group"), calls = c(solo_calls, group_calls), prop_calls = c(solo_calls / nrow(solo), group_calls / sum(agg_pred$group == x)), sound_file_number = c(solo_file_num, group_file_num)))  
    
  })
  
    sl_grp_clls <- do.call(rbind, sl_grp_cll)
  
    return(sl_grp_clls)
  }
)

sl_vs_grp <- do.call(rbind, sl_vs_grps)


# remove 21 due to no group calls left
sl_vs_grp <- sl_vs_grp[sl_vs_grp$group != "21", ]

# add metadata
caps <- as.data.frame(read_excel("./data/raw/Proyecto MPI enero 2020_3.xlsx", sheet = "Capturas"))

exps_mtdt <- as.data.frame(read_excel("./data/raw/Proyecto MPI enero 2020_3.xlsx", sheet = "Experimento video coor vuelo"))

# time in seconds
exps_mtdt$flight_time <- round(as.numeric(exps_mtdt$`Tiempo de vuelo aprox`) * 60 / 0.04166667)

# remove NA in audio
exps_mtdt <- exps_mtdt[!is.na(exps_mtdt$Audio), ]

## Add flight time for solo and group flight
sl_vs_grp$flight_time <- sapply(1:nrow(sl_vs_grp), function(x){
  
  out <- if (sl_vs_grp$experiment[x] == "solo")
  unique(na.omit(exps_mtdt$flight_time[exps_mtdt$Individuo == sl_vs_grp$indiv[x] & exps_mtdt$Experimento == "vuelo solo" & !is.na(exps_mtdt$Individuo)]))  else
unique(na.omit(exps_mtdt$flight_time[exps_mtdt$Grupo == sl_vs_grp$group[x] & exps_mtdt$Experimento == "vuelo grupal/sin sonido"]))  
  
  return(out)
})


# flight time
sl_vs_grp$flight_time <- sapply(1:nrow(sl_vs_grp), function(x){ 
  
  ft <- unique(na.omit(exps_mtdt$flight_time[which(as.numeric(exps_mtdt$Audio) == sl_vs_grp$sound_file_number[x])]))

  if (length(ft) == 0) ft <- NA
  return(ft)
  }                                                                
)

# add sex
sl_vs_grp$sex <- sapply(sl_vs_grp$indiv, function(x) na.exclude(caps$Sexo[caps$Murci == x])[1], USE.NAMES = FALSE)

sl_vs_grp$sex <- ifelse(sl_vs_grp$sex == "m", "Male", "Female")

# add age
sl_vs_grp$age <- sapply(sl_vs_grp$indiv, function(x) na.exclude(caps$Edad[caps$Murci == x])[1], USE.NAMES = FALSE)

sl_vs_grp$age <- ifelse(sl_vs_grp$age == "sa", "Sub-adult", "Adult")

sl_vs_grp$reprod.stg <- sapply(sl_vs_grp$indiv, function(x) na.exclude(caps$`Estado reproductivo`[caps$Murci == x])[1], USE.NAMES = FALSE)

sl_vs_grp$reprod.stg[sl_vs_grp$reprod.stg == "p?"] <- "Pregnant"
sl_vs_grp$reprod.stg[sl_vs_grp$reprod.stg == "ne"] <- "Inactive"

# order levels
sl_vs_grp$experiment <- factor(sl_vs_grp$experiment, levels = c("solo", "group"))


sl_vs_grp$call_rate <- sl_vs_grp$calls / (sl_vs_grp$flight_time / 60)

# calling rate solo vs group flights
ggplot(sl_vs_grp, aes(y = call_rate, x = experiment, col = indiv, shape = sex)) +
  geom_hline(yintercept = mean(sl_vs_grp$call_rate[sl_vs_grp$experiment == "group"]), lty = 2, alpha = 0.5) + 
  geom_hline(yintercept = mean(sl_vs_grp$call_rate[sl_vs_grp$experiment == "solo"], na.rm = TRUE), lty = 2, alpha = 0.5) + 
  geom_point(size = 5, alpha = 0.8) + 
  theme_classic() + 
  scale_color_viridis_d(alpha = 0.4, guide=FALSE) + 
  ggtitle("Change in calling rate by sex and individual") + 
  labs(x = "Experiment", y = "Calling rate (calls / min)") + 
  geom_line(aes(group = indiv), col = "gray43", alpha = 0.8) + 
  facet_wrap(~ group, nrow = 4)

Using only calls with class probabilities higher than 0.21

# get group and solo call counts per individual 
sl_vs_grps <- lapply(names(acous_param_l), function(x){
  
  acous_param <- acous_param_l[[x]]
  
  group_file <- acous_param$sound.files[acous_param$experiment == "vuelo grupal/sin sonido"][1]
  
  # get sound file number for group flights
  group_file <- strsplit(group_file, ".wav")[[1]][1]
  group_file_num <- as.numeric(substr(group_file, start =  nchar(group_file) - 6, nchar(group_file)))
  
  # get data for solo flights
  solo <- acous_param[acous_param$experiment == "vuelo solo", ]
  
  sl_grp_cll <- lapply(unique(solo$indiv), function(y){
    
    solo_calls <- sum(solo$indiv == y)

    # taking into acount those with class prob higher than 
    group_calls <- sum(agg_pred$group == x & agg_pred$pred_indiv == y & agg_pred$max_prob > thresh_prob)
  
      # get sound file number for group flights
    solo_file <- solo$sound.files[solo$indiv == y][1]
    solo_file <- strsplit(solo_file, ".wav")[[1]][1]
    solo_file_num <- as.numeric(substr(solo_file, start =  nchar(solo_file) - 6, nchar(solo_file)))
  
    return(data.frame(group = x, indiv = y, experiment = c("solo", "group"), calls = c(solo_calls, group_calls), prop_calls = c(solo_calls / nrow(solo), group_calls / sum(agg_pred$group == x)), sound_file_number = c(solo_file_num, group_file_num)))  
    
  })
  
    sl_grp_clls <- do.call(rbind, sl_grp_cll)
  
    return(sl_grp_clls)
  }
)

sl_vs_grp <- do.call(rbind, sl_vs_grps)


# remove 21 due to no group calls left
sl_vs_grp <- sl_vs_grp[sl_vs_grp$group != "21", ]

# add metadata
caps <- as.data.frame(read_excel("./data/raw/Proyecto MPI enero 2020_3.xlsx", sheet = "Capturas"))

exps_mtdt <- as.data.frame(read_excel("./data/raw/Proyecto MPI enero 2020_3.xlsx", sheet = "Experimento video coor vuelo"))

# time in seconds
exps_mtdt$flight_time <- round(as.numeric(exps_mtdt$`Tiempo de vuelo aprox`) * 60 / 0.04166667)

# remove NA in audio
exps_mtdt <- exps_mtdt[!is.na(exps_mtdt$Audio), ]

## Add flight time for solo and group flight
sl_vs_grp$flight_time <- sapply(1:nrow(sl_vs_grp), function(x){
  
  out <- if (sl_vs_grp$experiment[x] == "solo")
  unique(na.omit(exps_mtdt$flight_time[exps_mtdt$Individuo == sl_vs_grp$indiv[x] & exps_mtdt$Experimento == "vuelo solo" & !is.na(exps_mtdt$Individuo)]))  else
unique(na.omit(exps_mtdt$flight_time[exps_mtdt$Grupo == sl_vs_grp$group[x] & exps_mtdt$Experimento == "vuelo grupal/sin sonido"]))  
  
  return(out)
})


# flight time
sl_vs_grp$flight_time <- sapply(1:nrow(sl_vs_grp), function(x){ 
  
  ft <- unique(na.omit(exps_mtdt$flight_time[which(as.numeric(exps_mtdt$Audio) == sl_vs_grp$sound_file_number[x])]))

  if (length(ft) == 0) ft <- NA
  return(ft)
  }                                                                
)

# add sex
sl_vs_grp$sex <- sapply(sl_vs_grp$indiv, function(x) na.exclude(caps$Sexo[caps$Murci == x])[1], USE.NAMES = FALSE)

sl_vs_grp$sex <- ifelse(sl_vs_grp$sex == "m", "Male", "Female")

# add age
sl_vs_grp$age <- sapply(sl_vs_grp$indiv, function(x) na.exclude(caps$Edad[caps$Murci == x])[1], USE.NAMES = FALSE)

sl_vs_grp$age <- ifelse(sl_vs_grp$age == "sa", "Sub-adult", "Adult")

sl_vs_grp$reprod.stg <- sapply(sl_vs_grp$indiv, function(x) na.exclude(caps$`Estado reproductivo`[caps$Murci == x])[1], USE.NAMES = FALSE)

sl_vs_grp$reprod.stg[sl_vs_grp$reprod.stg == "p?"] <- "Pregnant"
sl_vs_grp$reprod.stg[sl_vs_grp$reprod.stg == "ne"] <- "Inactive"

# order levels
sl_vs_grp$experiment <- factor(sl_vs_grp$experiment, levels = c("solo", "group"))


sl_vs_grp$call_rate <- sl_vs_grp$calls / (sl_vs_grp$flight_time / 60)

# calling rate solo vs group flights
ggplot(sl_vs_grp, aes(y = call_rate, x = experiment, col = indiv, shape = sex)) +
  geom_hline(yintercept = mean(sl_vs_grp$call_rate[sl_vs_grp$experiment == "group"]), lty = 2, alpha = 0.5) + 
  geom_hline(yintercept = mean(sl_vs_grp$call_rate[sl_vs_grp$experiment == "solo"], na.rm = TRUE), lty = 2, alpha = 0.5) + 
  geom_point(size = 5, alpha = 0.8) + 
  theme_classic() + 
  scale_color_viridis_d(alpha = 0.4, guide=FALSE) + 
  ggtitle("Change in calling rate by sex and individual") + 
  labs(x = "Experiment", y = "Calling rate (calls / min)") + 
  geom_line(aes(group = indiv), col = "gray43", alpha = 0.8) + 
  facet_wrap(~ group, nrow = 4)

# violin plots
# ggplot(sl_vs_grp, aes(y = prop_calls, x = experiment, col = indiv, shape = sex)) +
#   geom_point(size = 5, alpha = 0.8) + 
#   theme_classic() + 
#   scale_color_viridis_d(alpha = 0.4, guide=FALSE) + 
#   ggtitle("Original accuracies") + 
#   labs(y = "Mean accuracy", x = "Model") + 
#   geom_line(aes(group = indiv), col = "gray43", alpha = 0.8) + 
#   facet_wrap(~ group, nrow = 4) + ggtitle(label = "Change in calling activity by sex and individual")

Using only calls with class probabilities higher than 0.21 and assinging low probability calls evenly across individuals

# get group and solo call counts per individual 
sl_vs_grps <- lapply(names(acous_param_l), function(x){
  
  acous_param <- acous_param_l[[x]]
  
  group_file <- acous_param$sound.files[acous_param$experiment == "vuelo grupal/sin sonido"][1]
  
  # get sound file number for group flights
  group_file <- strsplit(group_file, ".wav")[[1]][1]
  group_file_num <- as.numeric(substr(group_file, start =  nchar(group_file) - 6, nchar(group_file)))
  
  # get data for solo flights
  solo <- acous_param[acous_param$experiment == "vuelo solo", ]
  
  sl_grp_cll <- lapply(unique(solo$indiv), function(y){
    
    solo_calls <- sum(solo$indiv == y)

    # taking into acount those with class prob higher than 
    group_calls <- sum(agg_pred$group == x & agg_pred$pred_indiv == y & agg_pred$max_prob > thresh_prob)
  
    # add low probability calls to group calls
   group_calls <- group_calls + sum(agg_pred$group == x & agg_pred$max_prob <= thresh_prob) / length(unique(agg_pred$pred_indiv[agg_pred$group == x]))
    
      # get sound file number for group flights
    solo_file <- solo$sound.files[solo$indiv == y][1]
    solo_file <- strsplit(solo_file, ".wav")[[1]][1]
    solo_file_num <- as.numeric(substr(solo_file, start =  nchar(solo_file) - 6, nchar(solo_file)))
  
    return(data.frame(group = x, indiv = y, experiment = c("solo", "group"), calls = c(solo_calls, group_calls), prop_calls = c(solo_calls / nrow(solo), group_calls / sum(agg_pred$group == x)), sound_file_number = c(solo_file_num, group_file_num), sound_files = c(solo_file, group_file)))  
    
  })
  
    sl_grp_clls <- do.call(rbind, sl_grp_cll)
  
    return(sl_grp_clls)
  }
)

sl_vs_grp <- do.call(rbind, sl_vs_grps)

# remove 21 due to no group calls left
sl_vs_grp <- sl_vs_grp[sl_vs_grp$group != "21", ]

# add metadata
caps <- as.data.frame(read_excel("./data/raw/Proyecto MPI enero 2020_3.xlsx", sheet = "Capturas"))

exps_mtdt <- as.data.frame(read_excel("./data/raw/Proyecto MPI enero 2020_3.xlsx", sheet = "Experimento video coor vuelo"))

# time in seconds
exps_mtdt$flight_time <- round(as.numeric(exps_mtdt$`Tiempo de vuelo aprox`) * 60 / 0.04166667)

# remove NA in audio
exps_mtdt <- exps_mtdt[!is.na(exps_mtdt$Audio), ]

## Add flight time for solo and group flight
sl_vs_grp$flight_time <- sapply(1:nrow(sl_vs_grp), function(x){
  
  out <- if (sl_vs_grp$experiment[x] == "solo")
  unique(na.omit(exps_mtdt$flight_time[exps_mtdt$Individuo == sl_vs_grp$indiv[x] & exps_mtdt$Experimento == "vuelo solo" & !is.na(exps_mtdt$Individuo)]))  else
unique(na.omit(exps_mtdt$flight_time[exps_mtdt$Grupo == sl_vs_grp$group[x] & exps_mtdt$Experimento == "vuelo grupal/sin sonido"]))  
  
  return(out)
})


# flight time
sl_vs_grp$flight_time <- sapply(1:nrow(sl_vs_grp), function(x){ 
  
  ft <- unique(na.omit(exps_mtdt$flight_time[which(as.numeric(exps_mtdt$Audio) == sl_vs_grp$sound_file_number[x])]))

  if (length(ft) == 0) ft <- NA
  return(ft)
  }                                                                
)

# add sex
sl_vs_grp$sex <- sapply(sl_vs_grp$indiv, function(x) na.exclude(caps$Sexo[caps$Murci == x])[1], USE.NAMES = FALSE)

sl_vs_grp$sex <- ifelse(sl_vs_grp$sex == "m", "Male", "Female")

# add age
sl_vs_grp$age <- sapply(sl_vs_grp$indiv, function(x) na.exclude(caps$Edad[caps$Murci == x])[1], USE.NAMES = FALSE)

sl_vs_grp$age <- ifelse(sl_vs_grp$age == "sa", "Sub-adult", "Adult")

sl_vs_grp$reprod.stg <- sapply(sl_vs_grp$indiv, function(x) na.exclude(caps$`Estado reproductivo`[caps$Murci == x])[1], USE.NAMES = FALSE)

sl_vs_grp$reprod.stg[sl_vs_grp$reprod.stg == "p?"] <- "Pregnant"
sl_vs_grp$reprod.stg[sl_vs_grp$reprod.stg == "ne"] <- "Inactive"

# order levels
sl_vs_grp$experiment <- factor(sl_vs_grp$experiment, levels = c("solo", "group"))


sl_vs_grp$call_rate <- sl_vs_grp$calls / (sl_vs_grp$flight_time / 60)

# calling rate solo vs group flights
ggplot(sl_vs_grp, aes(y = call_rate, x = experiment, col = indiv, shape = sex)) +
  geom_hline(yintercept = mean(sl_vs_grp$call_rate[sl_vs_grp$experiment == "group"]), lty = 2, alpha = 0.5) + 
  geom_hline(yintercept = mean(sl_vs_grp$call_rate[sl_vs_grp$experiment == "solo"], na.rm = TRUE), lty = 2, alpha = 0.5) + 
  geom_point(size = 5, alpha = 0.8) + 
  theme_classic() + 
  scale_color_viridis_d(alpha = 0.4, guide=FALSE) + 
  ggtitle("Change in calling rate by sex and individual") + 
  labs(x = "Experiment", y = "Calling rate (calls / min)") + 
  geom_line(aes(group = indiv), col = "gray43", alpha = 0.8) + 
  facet_wrap(~ group, nrow = 4)

Gaps between calls for solo and group flights

clls <- readRDS("./output/CALLS_fixed_detections_inquiry_calls.RDS")

sl_vs_grp$sound_files <- paste0(sl_vs_grp$sound_files, ".wav")


seltab <- attributes(clls)$check.results

gap_l <- lapply(1:nrow(sl_vs_grp), function(x){

  Y <- seltab[seltab$orig.sound.files == sl_vs_grp$sound_files[x], ]
  Y <- Y[order(Y$orig.start), ]
  gaps <- Y$orig.start[-1] - Y$orig.end[-nrow(Y)] 

  return(data.frame(group = sl_vs_grp$group[x], sound.files = sl_vs_grp$sound_files[x] ,experiment = sl_vs_grp$experiment[x], gaps = ifelse(length(gaps) == 0, NA, gaps)))
  
})

gaps <- do.call(rbind, gap_l)



ggplot(gaps, aes(x = gaps, fill = experiment, group = experiment)) +  
  geom_density() + 
  scale_fill_viridis_d(alpha = 0.4) + 
  ggtitle("Gaps sec by group") +
  theme_classic() + 
  facet_wrap(~group, ncol = 4, scales = "free_y")

ggplot(gaps, aes(x = gaps, fill = experiment, group = experiment)) +  
  geom_density() + 
  ggtitle("Gaps sec groups combined") +
  scale_fill_viridis_d(alpha = 0.4) + 
  theme_classic() 


Session information

## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=pt_BR.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=pt_BR.UTF-8        LC_COLLATE=pt_BR.UTF-8    
##  [5] LC_MONETARY=pt_BR.UTF-8    LC_MESSAGES=pt_BR.UTF-8   
##  [7] LC_PAPER=pt_BR.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=pt_BR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] soundgen_1.8.2     shinyBS_0.61       Sim.DiffProc_4.7   rlang_0.4.8       
##  [5] kableExtra_1.2.1   viridis_0.5.1      viridisLite_0.3.0  pbapply_1.4-2     
##  [9] e1071_1.7-4        caret_6.0-86       ggplot2_3.3.0      lattice_0.20-41   
## [13] ranger_0.12.1      readxl_1.3.1       warbleR_1.1.24     NatureSounds_1.0.3
## [17] knitr_1.28         seewave_2.1.5      tuneR_1.3.3        devtools_2.3.2    
## [21] usethis_1.6.3     
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.4-1     rjson_0.2.20         ellipsis_0.3.1      
##  [4] class_7.3-17         rprojroot_1.3-2      fs_1.5.0            
##  [7] rstudioapi_0.11      proxy_0.4-24         farver_2.0.3        
## [10] Deriv_4.1.0          remotes_2.2.0        prodlim_2019.11.13  
## [13] fansi_0.4.1          lubridate_1.7.9      xml2_1.3.2          
## [16] codetools_0.2-16     splines_4.0.3        pkgload_1.1.0       
## [19] pROC_1.16.2          shiny_1.5.0          compiler_4.0.3      
## [22] httr_1.4.2           backports_1.1.7      fastmap_1.0.1       
## [25] assertthat_0.2.1     Matrix_1.2-18        cli_2.0.2           
## [28] later_1.0.0          htmltools_0.5.0      prettyunits_1.1.1   
## [31] tools_4.0.3          gtable_0.3.0         glue_1.4.0          
## [34] reshape2_1.4.4       dplyr_1.0.2          Rcpp_1.0.4.6        
## [37] cellranger_1.1.0     vctrs_0.3.4          nlme_3.1-149        
## [40] iterators_1.0.13     timeDate_3043.102    gower_0.2.2         
## [43] xfun_0.13            stringr_1.4.0        ps_1.3.3            
## [46] testthat_2.3.2       rvest_0.3.6          mime_0.9            
## [49] lifecycle_0.2.0      MASS_7.3-53          scales_1.1.1        
## [52] ipred_0.9-9          promises_1.1.0       parallel_4.0.3      
## [55] yaml_2.2.1           memoise_1.1.0        gridExtra_2.3       
## [58] rpart_4.1-15         stringi_1.4.6        desc_1.2.0          
## [61] foreach_1.5.1        pkgbuild_1.1.0       lava_1.6.8          
## [64] pkgconfig_2.0.3      dtw_1.21-3           bitops_1.0-6        
## [67] evaluate_0.14        purrr_0.3.4          labeling_0.3        
## [70] recipes_0.1.14       processx_3.4.4       tidyselect_1.1.0    
## [73] plyr_1.8.6           magrittr_1.5         R6_2.4.1            
## [76] fftw_1.0-6           generics_0.0.2       pillar_1.4.4        
## [79] withr_2.2.0          survival_3.2-7       RCurl_1.98-1.2      
## [82] nnet_7.3-14          tibble_3.0.1         crayon_1.3.4        
## [85] rmarkdown_2.1        grid_4.0.3           data.table_1.13.0   
## [88] callr_3.5.1          ModelMetrics_1.2.2.2 digest_0.6.25       
## [91] webshot_0.5.2        xtable_1.8-4         httpuv_1.5.2        
## [94] signal_0.7-6         stats4_4.0.3         munsell_0.5.0       
## [97] sessioninfo_1.1.1