## packages to install/load; prefix packages to be installed from GitHub with the developer name (e.g. 'maRce10/warbleR')
x <- c("devtools", "maRce10/warbleR", "readxl", "ranger", "caret", "e1071", "pbapply", "viridis", "ggplot2", "kableExtra", "rlang", "Sim.DiffProc", "soundgen")
aa <- lapply(x, function(y) {
# get package name
pkg <- strsplit(y, "/")[[1]]
pkg <- pkg[length(pkg)]
# check if installed, if not then install
if (!pkg %in% installed.packages()[,"Package"]) {
if (grepl("/", y)) devtools::install_github(y, force = TRUE) else
install.packages(y)
}
# load package
try(require(pkg, character.only = T), silent = T)
})
warbleR_options(wl = 1100, parallel = parallel::detectCores() - 1, bp = "frange", fast = TRUE, threshold = 15, ovlp = 20)
opts_knit$set(root.dir = "..")
# set evaluation false
opts_chunk$set( fig.width = 6, fig.height = 3, eval = FALSE, warning = FALSE, message = FALSE, tidy = TRUE)
# number of trees in Random Forest models
num.trees <- 2000
# number of replicates for random forest model averaging
reps <- 50
# sensitivity cutoff
cutoff <- 0.86
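# the cutoff is applied later to the mean sensitivity curve to select the class probability threshold (thresh_prob) used to filter calls from group flights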
# function to calculate classification random forest models with balanced sample sizes across categories
balanced_rf <- function(X, num.trees = 1000, random = FALSE, seed = 506){
# get smallest n across individuals
min.n <- min(table(X$indiv))
# use seed
set.seed(seed)
# randomly get rows for equal n across indivs
sel_rows <-
sapply(unique(X$indiv), function(x)
sample(rownames(X)[X$indiv == x], min.n, replace = FALSE))
# subset to those rows
X <- X[c(sel_rows), ]
# randomize individual labels if requested (null model)
if (random){
# use seed
set.seed(seed)
X$indiv <- sample(X$indiv)
}
# make it a factor for ranger to work
X$indiv <- as.factor(X$indiv)
# run RF model spectral and cepstral parameters
rfm <-
ranger(
indiv ~ .,
data = X[, !names(X) %in% c("sound.files", "selec")],
num.trees = num.trees,
importance = "impurity",
probability = TRUE,
seed = seed
)
# get predicted individual from probs
pred_indiv <- apply(rfm$predictions, 1, function(x) colnames(rfm$predictions) [which.max(x)])
rfm$predictions <- data.frame(rfm$predictions, indiv = X$indiv, pred_indiv, sound.files = X$sound.files)
# remove X from start of names
names(rfm$predictions) <- gsub("^X", "", names(rfm$predictions))
return(rfm)
}
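# minimal usage sketch of balanced_rf() on simulated data (hypothetical toy example, not the call data used below); it only assumes the column layout the function expects: an 'indiv' label, 'sound.files' and 'selec' id columns, and numeric acoustic predictors
toy_dat <- data.frame(indiv = rep(c("a", "b", "c"), times = c(20, 30, 25)), sound.files = "toy.wav", selec = 1, feat1 = rnorm(75), feat2 = rnorm(75))
toy_rf <- balanced_rf(X = toy_dat, num.trees = 100, seed = 1)
# per-call class probabilities plus observed and predicted individual
head(toy_rf$predictions)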
# function to calculate sensitivities at increasing RF class probabilities
sensitivity_fun <- function(X, parameters, thresholds = seq(0,1, by = 0.01)){
# get sensitivities for each group at every threshold
sensitiv_l <- lapply(X, function(x){
# extract prediction data.frame
Y <- x$aggregated_predictions
Y$max <- apply(Y[, sapply(Y, is.numeric)], 1, max)
# get sensitivity at different thresholds
sensi_l <- lapply(thresholds, function(y) data.frame(sensitivity = sum(Y$pred_indiv[Y$max >= y] == Y$actual_indiv[Y$max >= y])/ sum(Y$max >= y), n = sum(Y$max >= y) / nrow(Y)))
sensi <- do.call(rbind, sensi_l)
# add metadata
sensi$group <- x$group
sensi$n_indiv <- x$n_indiv
sensi$min_n <- x$min_n
sensi$n_calls <- nrow(Y) * sensi$n
return(sensi)
})
# put in a data frame
sensitivities <- as.data.frame(lapply(sensitiv_l, "[[", which(names(sensitiv_l[[1]]) == "sensitivity")))
# get minimum sensitivity at each probability threshold
sensitivities$min.sensitivity <- apply(sensitivities, 1, min, na.rm = TRUE)
# get mean sensitivity at each probability threshold
sensitivities$mean.sensitivity <- apply(sensitivities, 1, mean, na.rm = TRUE)
# add thresholds to data frame
sensitivities$thresholds <- thresholds
# add number of calls retained at each threshold
sensitivities$n_calls <- rowSums(as.data.frame(lapply(sensitiv_l, "[[", which(names(sensitiv_l[[1]]) == "n_calls"))), na.rm = TRUE)
sensitivities$n_calls_prop <- sensitivities$n_calls / max(sensitivities$n_calls)
sensitivities <- sensitivities[!is.infinite(sensitivities$mean.sensitivity), ]
sensitivities$parameters <- parameters
return(sensitivities)
}
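# minimal sketch of the input that sensitivity_fun() expects (hypothetical toy values): a list in which each element holds the aggregated prediction data frame of one group (numeric class probability columns plus 'pred_indiv' and 'actual_indiv') and its metadata, as produced by the model averaging loop below
toy_preds <- data.frame(a = c(1, 0.4, 0.7), b = c(0, 0.6, 0.3), pred_indiv = c("a", "b", "a"), actual_indiv = c("a", "a", "a"))
toy_mods <- list(g1 = list(aggregated_predictions = toy_preds, group = "g1", n_indiv = 2, min_n = 3))
head(sensitivity_fun(X = toy_mods, parameters = "toy"))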
# read extended selection table with inquiry calls
clls <- readRDS("./output/CALLS_fixed_detections_inquiry_calls.RDS")
# read metadata
metadat <- as.data.frame(read_excel("./data/raw/Proyecto MPI enero 2020_3.xlsx", sheet = "Experimento video coor vuelo"))
# nrow(metadat)
# remove video calibration
# metadat <- metadat[metadat$`tipo de video` != "calibracion de video", ]
metadat <- metadat[!is.na(metadat$Audio), ]
# get audio file number from sound file names
clls$Audio <- sapply(clls$sound.files, function(x){
# split by _
spl <- strsplit(x, split = "_")[[1]]
# keep the element with the .wav extension
spl <- spl[grep(".wav$", spl)]
# remove .wav
spl <- gsub(".wav", "", spl)
# make it a number
spl <- as.numeric(spl)
return(spl)
})
# anyNA(clls$Audio)
# nrow(clls)
clls <- clls[clls$Audio %in% metadat$Audio, ]
# add metadata to ext sel table
clls2 <- merge(clls, metadat[!duplicated(metadat$Audio), c("Audio", "tipo de video", "Experimento", "Grupo", "Individuo")])
clls <- fix_extended_selection_table(X = clls2, Y = clls)
# exclude those with obstacles
clls <- clls[clls$Experimento != "Búsqueda refugio con obstaculos", ]
clls$Individuo[clls$Experimento != "vuelo solo"] <- NA
# loop to measure acoustic parameters on each group
acous_param_l <- lapply(unique(clls$Grupo), function(x) {
print(x)
print(which(unique(clls$Grupo) == x) / length(unique(clls$Grupo)))
# get 1 group data
grp_test <- clls[clls$Grupo == x, ]
# keep only solo flights
grp_test <-
grp_test[grp_test$Experimento != "vuelo grupal/enmascarando busqueda", ]
# remove low SNR calls
grp_test <- sig2noise(grp_test, mar = 0.025, pb = FALSE)
grp_test <- grp_test[grp_test$SNR > 1, ]
grp_test$SNR <- NULL
tb <- aggregate(Audio ~ Grupo + Experimento, data = grp_test, length)
# if there was a group flight and more than 1 individual flew solo, proceed; otherwise return NULL
if (any(grepl("grupal", tb$Experimento)) & length(unique(grp_test$Individuo[grp_test$Experimento == "vuelo solo"])) > 1){
######## measure acoustic structure ######
# measure spectrographic parameters
sp <- specan(grp_test, pb = FALSE, harmonicity = TRUE)
# remove time parameters
sp <- sp[, grep("time\\.", names(sp), invert = TRUE)]
# check if there are NAs
# anyNA(sp)
# measure cepstral coeffs
cc <- mfcc_stats(grp_test, pb = FALSE)[, -c(1, 2)]
# spectrographic cross correlation
spxc <- xcorr(grp_test, pb = FALSE)
# MDS
spxc <- cmdscale(1 - spxc, k = 10, list. = TRUE)
spxc_mds <- spxc$points
colnames(spxc_mds) <- paste0("spxcMDS", 1:ncol(spxc_mds))
# mfcc cross correlation
mfccxc <- xcorr(grp_test, pb = FALSE, type = "mfcc")
# MDS
mfccxc <- cmdscale(1 - mfccxc, k = 10, list. = TRUE)
mfxc_mds <- mfccxc$points
colnames(mfxc_mds) <- paste0("mfxcMDS", 1:ncol(mfxc_mds))
# dynamic time warping
dtw.dists <- df_DTW(grp_test, img = FALSE, pb = FALSE, threshold.time = 1)
# MDS
dtw_mds <- cmdscale(dtw.dists, k = 10, list. = TRUE)$points
# fix colnames
colnames(dtw_mds) <- paste0("dtwMDS", 1:ncol(dtw_mds))
# put parameters in a list
acous_param_l <- list(sp, cc, dtw_mds, spxc_mds, mfxc_mds)
# remove null ones
acous_param_l <- acous_param_l[!sapply(acous_param_l, is.null)]
# bind all acoustic structure parameters together
all_acous_param <- do.call(cbind, acous_param_l)
# scale
all_acous_param[,-c(1, 2)] <- scale(all_acous_param[,-c(1, 2)])
# add individual and experiment
all_acous_param$indiv <- grp_test$Individuo
all_acous_param$experiment <- grp_test$Experimento
# remove bottom and top freq
all_acous_param$top.freq <- all_acous_param$bottom.freq <- NULL
return(all_acous_param)
} else
return(NULL)
})
names(acous_param_l) <- unique(clls$Grupo)
# remove those that did not have group flights
acous_param_l <- acous_param_l[!sapply(acous_param_l, is.null)]
# remove columns with NAs across all groups
# get names of columns that have any NA in at least 1 group (list element)
na_colnms <- unique(unlist(lapply(acous_param_l, function(x) names(x)[sapply(x, anyNA)])))
# keep indiv column
na_colnms <- na_colnms[na_colnms != "indiv"]
# remove NA columns for all groups
acous_param_l <- lapply(acous_param_l, function(x) x[, !names(x) %in% na_colnms])
# save as RDS
saveRDS(acous_param_l, "./data/processed/acoustic_parameters_all_groups_all_warbler_acoustic_measurements.RDS")
# read acoustic parameter data
acous_param_l <- readRDS("./data/processed/acoustic_parameters_all_groups_all_warbler_acoustic_measurements.RDS")
# minimum sample size per group
min_n <- sapply(acous_param_l, function(x) min(table(x$indiv)))
# remove groups in which the minimum sample size per individual is less than 10
acous_param_l <- acous_param_l[!names(acous_param_l) %in% names(min_n)[min_n < 10]]
# acoustic parameter categories to be combined
param_categories <- c("mfcc", "spxc", "mfxc", "dtw", "sp")
# get actual parameter names
col_names <- names(acous_param_l[[1]])
col_names <- col_names[col_names != "experiment"]
# measurement category for each measurement
clss_col_names <- col_names
# name it by measurement function
for (i in 1:length(param_categories))
clss_col_names[if (param_categories[i] != "sp")
grepl(c("cc", "spxc", "mfxc", "dtw")[i], col_names) else
!grepl("cc|xc|dtw|indiv|sound.files|selec", col_names)] <- param_categories[i]
# all possible combinations of 4, 3 and 2 parameter categories
combs4 <- combn(param_categories, 4)
combs3 <- combn(param_categories, 3)
combs2 <- combn(param_categories, 2)
# make it a list
combs <- c(as.data.frame(combs4), as.data.frame(combs3), as.data.frame(combs2))
# add the full set of parameter categories and each single category as options
combs <- c(append(combs, list(param_categories)), param_categories)
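# each element of 'combs' is a character vector naming the parameter categories to be used in one candidate model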
for (i in 1:length(combs)) {
rds_name <- paste0("./data/processed/averaged_random_forest_models_from_solo_flights_", paste(combs[[i]], collapse = "-"), ".RDS")
# run if file doesn't exist
if (!file.exists(rds_name)){
# avg_mods <- list()
# loop over list of acoustic parameters
avg_mods <- pblapply(names(acous_param_l),
# cl = .Options$warbleR$parallel,
cl = 1,
function(x){
# for (x in names(acous_param_l)){
print(x)
# extract data
X <- acous_param_l[[which(names(acous_param_l) == x)]]
# only solo flight
solo_rf_input <- X[X$experiment == "vuelo solo", ]
# rename rows for sel_rows
rownames(solo_rf_input) <- 1:nrow(solo_rf_input)
# order by sound file column
solo_rf_input <- solo_rf_input[order(solo_rf_input$sound.files), ]
# remove experiment column
solo_rf_input$experiment <- NULL
# subset columns to keep only those from selected acoustic measurements
solo_rf_input <- solo_rf_input[ , col_names[clss_col_names %in% c(combs[[i]], "sound.files", "selec", "indiv")]]
# run random forest, set a seed to make it replicable
rf_results <- pblapply(1:reps, function(x) balanced_rf(X = solo_rf_input, num.trees = num.trees, seed = x), cl = 1)
# merge together predictions by sound files
rf_preds <- lapply(rf_results, function(x){
mrg <- merge(data.frame(sound.files = solo_rf_input$sound.files), x$predictions[, grep("indiv$", names(x$predictions), invert = TRUE)], all.x = TRUE)
mrg <- mrg[order(mrg$sound.files), -1]
}
)
# add column (individual) if not found
rf_preds <- lapply(rf_preds, function(x){
if(ncol(x) < length(unique(solo_rf_input$indiv))){
# how many columns are missing
mssng <- length(unique(solo_rf_input$indiv)) - ncol(x)
# add missing columns
for(i in 1:(mssng)) x <- data.frame(x, NA, check.names = FALSE)
names(x)[(ncol(x) - mssng + 1):ncol(x)] <- setdiff(unique(solo_rf_input$indiv), names(x))
}
return(x)
})
# get together predictions from the same individual
preds_by_indv <- lapply(1:ncol(rf_preds[[1]]), function(y)
do.call(cbind, lapply(rf_preds, "[", y))
)
agg_preds <- as.data.frame(lapply(preds_by_indv, rowMeans, na.rm = TRUE))
# add individual name to columns
names(agg_preds) <- names(rf_preds[[1]])
# add sound file column
agg_preds$sound.files <- solo_rf_input$sound.files
# get predicted indiv from aggregated probabilities
agg_preds$pred_indiv <- apply(agg_preds[, sapply(agg_preds, is.numeric)], 1, function(x) colnames(agg_preds)[which.max(x)])
# make it a factor
pred_indiv <- factor(agg_preds$pred_indiv, levels = unique(solo_rf_input$indiv))
agg_preds$actual_indiv <- actual_indiv <- factor(solo_rf_input$indiv, levels = unique(solo_rf_input$indiv))
# get confusion matrix
cm_solo <- confusionMatrix(pred_indiv, reference = actual_indiv)
### NULL MODEL
# run null model by randomizing indiv labels
rf_null_results <- pblapply(1:reps, function(x) balanced_rf(X = solo_rf_input, num.trees = num.trees, random = TRUE, seed = x), cl = 1)
# get accuracies from null models
# merge together predictions by sound files
rf_null_preds <- lapply(rf_null_results, function(x){
mrg <- merge(data.frame(sound.files = solo_rf_input$sound.files), x$predictions[, grep("indiv$", names(x$predictions), invert = TRUE)], all.x = TRUE)
mrg <- mrg[order(mrg$sound.files), -1]
}
)
# add column (individual) if not found
rf_null_preds <- lapply(rf_null_preds, function(x){
if(ncol(x) < length(unique(solo_rf_input$indiv))){
# how many columns are missing
mssng <- length(unique(solo_rf_input$indiv)) - ncol(x)
# add missing columns
for(i in 1:(mssng)) x <- data.frame(x, NA, check.names = FALSE)
names(x)[(ncol(x) - mssng + 1):ncol(x)] <- setdiff(unique(solo_rf_input$indiv), names(x))
}
return(x)
})
# get together predictions from the same individual
preds_by_indv_null <- lapply(1:ncol(rf_null_preds[[1]]), function(y)
do.call(cbind, lapply(rf_null_preds, "[", y))
)
agg_preds_null <- as.data.frame(lapply(preds_by_indv_null, rowMeans, na.rm = TRUE))
# add individual name to columns
names(agg_preds_null) <- names(rf_null_preds[[1]])
# add sound file column
agg_preds_null$sound.files <- solo_rf_input$sound.files
# get predicted indiv from aggregated probabilities
agg_preds_null$pred_indiv <- apply(agg_preds_null[, sapply(agg_preds_null, is.numeric)], 1, function(x) colnames(agg_preds_null)[which.max(x)])
# make it a factor
pred_indiv_null <- factor(agg_preds_null$pred_indiv, levels = unique(solo_rf_input$indiv))
actual_indiv <- factor(solo_rf_input$indiv, levels = unique(solo_rf_input$indiv))
# get confusion matrix
cm_solo_null <- confusionMatrix(pred_indiv_null, reference = actual_indiv)
### NOTE: ranger() OOB prediction error and confusionMatrix() Accuracy carry the same information (see the quick check after this loop)
# put all results together
output <- list(group = x, accuracy = cm_solo$overall[1], null_accuracy = cm_solo_null$overall[1], aggregated_predictions = agg_preds, conf_matrix = cm_solo, random_forests = rf_results, n_indiv = length(unique(solo_rf_input$indiv)), min_n = min(table(solo_rf_input$indiv)), parameters = combs[[i]])
# avg_mods[[length(avg_mods) + 1]] <- output
# rm(output)
return(output)
}
)
# add group name to list
names(avg_mods) <- names(acous_param_l)
# save as RDS
saveRDS(avg_mods, rds_name)
}
}
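# quick self-contained check of the NOTE inside the loop above, using the built-in iris data and a standard classification forest (hypothetical example; the models above are probability forests): ranger()'s OOB prediction error equals 1 minus the accuracy that confusionMatrix() reports on the OOB predictions
check_rf <- ranger(Species ~ ., data = iris, num.trees = 100, seed = 1)
check_cm <- confusionMatrix(check_rf$predictions, reference = iris$Species)
# these two values should match
1 - check_rf$prediction.error
check_cm$overall["Accuracy"]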
# read rf outputs
rds_rf_outputs <- list.files(path = "./data/processed", pattern = "averaged_random_forest_models", full.names = TRUE)
model_diagnostics_l <- pblapply(rds_rf_outputs, cl = 1, function(x) {
# read data
avg_mods <- readRDS(x)
sensitivities <- sensitivity_fun(X = avg_mods, parameters = paste(avg_mods[[1]]$parameters, collapse = "-"))
# calculate threshold at cutoff
thresh_prob <- min(sensitivities$thresholds[sensitivities$mean.sensitivity >= cutoff])
avg_accuracy <- sapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "accuracy"))
null_accuracy <- sapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "null_accuracy"))
filtered_accuracy <- unlist(sensitivities[sensitivities$thresholds == thresh_prob, 1:length(null_accuracy)])
diagnostics <- data.frame(group = gsub("X", "", names(filtered_accuracy)), model = rep(c("real data", "null model", "filtered model"), each = length(avg_accuracy)), accuracy = c(avg_accuracy, null_accuracy, filtered_accuracy), acoustic_parametes = gsub("./data/processed/averaged_random_forest_models_from_solo_flights_|.RDS","", x))
diagnostics$model <- factor(diagnostics$model, levels = c("null model", "filtered model", "real data"))
return(list(diagnostics = diagnostics, sensitivities = sensitivities))
})
# put in a data frame
model_diagnostics <- do.call(rbind, lapply(model_diagnostics_l, "[[", 1))
saveRDS(model_diagnostics, "./data/processed/random_forests_diagnostics_several_acoustic_parameter_combinations_solo_flight.RDS")
# save sensitivities
senstivities_l <- lapply(model_diagnostics_l, "[[", 2)
# determine all column names across the sensitivity data frames
cnms <- unique(unlist(lapply(senstivities_l, names)))
# add columns that are missing to each data frame
senstivities_l <- lapply(senstivities_l, function(X)
{
nms <- names(X)
if (length(nms) != length(cnms))
for(i in cnms[!cnms %in% nms]) {
X <- data.frame(X, NA, stringsAsFactors = FALSE, check.names = FALSE)
names(X)[ncol(X)] <- i
}
return(X)
})
model_sensitivities <- do.call(rbind, senstivities_l)
saveRDS(model_sensitivities, "./data/processed/random_forests_sensitivities_several_acoustic_parameter_combinations_solo_flight.RDS")
# read diagnostic
model_diagnostics <- readRDS("./data/processed/random_forests_diagnostics_several_acoustic_parameter_combinations_solo_flight.RDS")
# make factor to order plot x axis ticks
model_diagnostics$acoustic_parametes <- factor(model_diagnostics$acoustic_parametes, levels = sort(unique(model_diagnostics$acoustic_parametes)))
# make it numeric for points/lines
model_diagnostics$model <- factor(model_diagnostics$model)
model_diagnostics$model_num <- as.numeric(model_diagnostics$model)
model_diagnostics$model_num[model_diagnostics$model != "null model"] <- 2
# density plots
ggplot(model_diagnostics[model_diagnostics$model != "filtered model", ], aes(x = accuracy, fill = model)) +
geom_density(alpha=0.4) +
theme_classic() +
scale_fill_viridis_d() +
ggtitle("Original (non-filtered) accuracies") +
labs(x = "Mean accuracy", y = "Frequency") +
facet_wrap(~ acoustic_parametes, ncol = 4)
ggplot(model_diagnostics[model_diagnostics$model != "real data", ], aes(x = accuracy, fill = model)) +
geom_density(alpha=0.4) +
theme_classic() +
scale_fill_viridis_d() +
ggtitle("Optimized accuracies") +
labs(x = "Mean accuracy", y = "Frequency") +
facet_wrap(~ acoustic_parametes, ncol = 4)
#
sensitivities <- readRDS("./data/processed/random_forests_sensitivities_several_acoustic_parameter_combinations_solo_flight.RDS")
# Probability threshold vs Mean sensitivity
# ggplot(data = sensitivities, aes(x = thresholds, y = mean.sensitivity)) +
# geom_hline(yintercept = cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
# # geom_vline(xintercept = thresh_prob, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
# geom_point(col = magma(10, alpha = 0.7)[2]) +
# geom_line(col = magma(10, alpha = 0.7)[2]) +
# labs(y = "Mean sensitivty", x = "Probability threshold") +
# theme_classic() +
# # annotate(geom = "text", x = thresh_prob + 0.04, y = 0.9, label = round(thresh_prob, 2)) +
# facet_wrap(~ parameters, ncol = 4)
#
# # calculate threshold at
# n_call_cutoff <- max(sensitivities$n_calls_prop[sensitivities$mean.sensitivity >= cutoff])
#
# # Probability threshold vs Proportion of calls used
# ggplot(data = sensitivities, aes(x = thresholds, y = n_calls_prop)) +
# geom_hline(yintercept = n_call_cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
# # geom_vline(xintercept = thresh_prob, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
# geom_point(col = magma(10, alpha = 0.7)[2]) +
# geom_line(col = magma(10, alpha = 0.7)[2]) +
# labs(y = "Proportion of calls used", x = "Probability threshold") +
# theme_classic() +
# # annotate(geom = "text", x = n_call_cutoff + 0.04, y = 0.9, label = round(n_call_cutoff, 2))
# facet_wrap(~ parameters, ncol = 4)
#
# ggplot(data = sensitivities, aes(x = mean.sensitivity, y = n_calls_prop)) +
# geom_hline(yintercept = n_call_cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
# geom_vline(xintercept = cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
# geom_point(col = magma(10, alpha = 0.7)[2]) +
# geom_line(col = magma(10, alpha = 0.7)[2]) +
# labs(y = "Proportion of calls used", x = "Sensitivity") +
# theme_classic() +
# # annotate(geom = "text", y = n_call_cutoff + 0.04, x = cutoff, label = round(n_call_cutoff, 2))
# facet_wrap(~ parameters, ncol = 4)
maxs <- sapply(split(sensitivities, sensitivities$parameters), function(Z) max(Z$n_calls_prop[Z$mean.sensitivity >= cutoff]))
max_prop_calls <- data.frame(data_set = names(maxs), max_prop_calls = maxs)
max_prop_calls <- max_prop_calls[order(- max_prop_calls$max_prop_calls), ]
df1 <- knitr::kable(max_prop_calls, row.names = FALSE, escape = FALSE, format = "html", digits = 2)
kable_styling(df1, bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE, font_size = 18)
data_set | max_prop_calls |
---|---|
mfcc-spxc-mfxc-sp | 0.98 |
mfcc-spxc-mfxc-dtw-sp | 0.97 |
mfcc-mfxc-dtw-sp | 0.96 |
mfcc-mfxc-sp | 0.95 |
mfcc-spxc-sp | 0.94 |
mfcc-spxc-dtw-sp | 0.93 |
mfcc-spxc-mfxc-dtw | 0.93 |
mfcc-sp | 0.92 |
mfcc-dtw-sp | 0.92 |
mfcc-spxc-mfxc | 0.92 |
mfcc-mfxc | 0.91 |
mfcc-mfxc-dtw | 0.91 |
spxc-mfxc | 0.91 |
spxc-mfxc-dtw | 0.89 |
spxc-mfxc-dtw-sp | 0.88 |
spxc-mfxc-sp | 0.87 |
mfcc-spxc-dtw | 0.86 |
mfcc-spxc | 0.86 |
spxc-sp | 0.83 |
spxc-dtw-sp | 0.83 |
mfcc | 0.83 |
mfxc-dtw-sp | 0.82 |
mfcc-dtw | 0.81 |
sp-mfcc-dtw | 0.79 |
spxc | 0.79 |
mfxc-sp | 0.78 |
spxc-dtw | 0.73 |
dtw-sp | 0.71 |
mfxc | 0.71 |
sp | 0.69 |
mfxc-dtw | 0.68 |
dtw | 0.01 |
# read rf output for the best acoustic parameter combination
avg_mods <- readRDS(paste0("./data/processed/averaged_random_forest_models_from_solo_flights_", max_prop_calls$data_set[1],".RDS"))
sensitivities <- sensitivity_fun(X = avg_mods, parameters = max_prop_calls$data_set[1])
# calculate threshold at cutoff
thresh_prob <- min(sensitivities$thresholds[sensitivities$mean.sensitivity >= cutoff])
avg_accuracy <- sapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "accuracy"))
null_accuracy <- sapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "null_accuracy"))
filtered_accuracy <- unlist(sensitivities[sensitivities$thresholds == thresh_prob, 1:length(null_accuracy)])
df <- data.frame(group = gsub("X", "", names(filtered_accuracy)), model = rep(c("real data", "null model", "filtered model"), each = length(avg_accuracy)), accuracy = c(avg_accuracy, null_accuracy, filtered_accuracy))
df$model <- factor(df$model, levels = c("null model", "filtered model", "real data"))
# violin plots
ggplot(df[df$model != "filtered model", ], aes(y = accuracy, x = model, fill = model)) +
geom_violin() + theme_classic() + scale_fill_viridis_d(alpha = 0.4) + ggtitle("Original accuracies") +
labs(y = "Mean accuracy", x = "Model") + geom_point(aes(x = rep(c(2, 1), each = nrow(df)/3)),
size = 5, show.legend = FALSE, col = "gray43", alpha = 0.8) + geom_line(aes(x = rep(c(2,
1), each = nrow(df)/3), group = group), col = "gray43", alpha = 0.8)
ggplot(df[df$model != "real data", ], aes(y = accuracy, x = model, fill = model)) +
geom_violin() + theme_classic() + scale_fill_viridis_d(alpha = 0.4) + ggtitle("Optimized accuracies") +
labs(y = "Mean accuracy", x = "Model") + geom_point(aes(x = rep(c(1, 2), each = nrow(df)/3)),
size = 3, show.legend = FALSE, col = "gray43", alpha = 0.8) + geom_line(aes(x = rep(c(1,
2), each = nrow(df)/3), group = group), col = "gray43", alpha = 0.8)
# density plots
ggplot(df[df$model != "filtered model", ], aes(x = accuracy, fill = model)) +
geom_density(alpha = 0.4) +
theme_classic() +
scale_fill_viridis_d() +
ggtitle("Original (non-filtered) accuracies") +
labs(x = "Mean accuracy", y = "Frequency")
ggplot(df[df$model != "real data", ], aes(x = accuracy, fill = model)) +
geom_density(alpha = 0.4) +
theme_classic() +
scale_fill_viridis_d() +
ggtitle("Optimized accuracies") +
labs(x = "Mean accuracy", y = "Frequency")
# Probability threshold vs Mean sensitivity
ggplot(data = sensitivities, aes(x = thresholds, y = mean.sensitivity)) +
geom_hline(yintercept = cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
geom_vline(xintercept = thresh_prob, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
geom_point(col = magma(10, alpha = 0.7)[2]) +
geom_line(col = magma(10, alpha = 0.7)[2]) +
labs(y = "Mean sensitivity", x = "Probability threshold") +
theme_classic() +
annotate(geom = "text", x = thresh_prob + 0.04, y = 0.9, label = round(thresh_prob, 2))
# calculate the proportion of calls retained at the sensitivity cutoff
n_call_cutoff <- max(sensitivities$n_calls_prop[sensitivities$mean.sensitivity >= cutoff])
# Probability threshold vs Proportion of calls used
ggplot(data = sensitivities, aes(x = thresholds, y = n_calls_prop)) +
geom_hline(yintercept = n_call_cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
geom_vline(xintercept = thresh_prob, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
geom_point(col = magma(10, alpha = 0.7)[2]) +
geom_line(col = magma(10, alpha = 0.7)[2]) +
labs(y = "Proportion of calls used", x = "Probability threshold") +
theme_classic() +
annotate(geom = "text", x = n_call_cutoff + 0.04, y = 0.9, label = round(n_call_cutoff, 2))
# Mean sensitivity vs Proportion of calls used
ggplot(data = sensitivities, aes(x = mean.sensitivity, y = n_calls_prop)) +
geom_hline(yintercept = n_call_cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
geom_vline(xintercept = cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
geom_point(col = magma(10, alpha = 0.7)[2]) +
geom_line(col = magma(10, alpha = 0.7)[2]) +
labs(y = "Proportion of calls used", x = "Sensitivity") +
theme_classic() +
annotate(geom = "text", y = n_call_cutoff + 0.04, x = cutoff, label = round(n_call_cutoff, 2))
# remove groups with low sensitivity (42, 6), low sample size (29) or no group calls left (21)
avg_mods <- avg_mods[!names(avg_mods) %in% c("42", "6", "21", "29")]
# calculate sensitivities
sensitivities <- sensitivity_fun(X = avg_mods, parameters = max_prop_calls$data_set[1])
# calculate threshold at cutoff
thresh_prob <- min(sensitivities$thresholds[sensitivities$mean.sensitivity >= cutoff])
avg_accuracy <- sapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "accuracy"))
null_accuracy <- sapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "null_accuracy"))
filtered_accuracy <- unlist(sensitivities[sensitivities$thresholds == thresh_prob, 1:length(null_accuracy)])
df <- data.frame(group = gsub("X", "", names(filtered_accuracy)), model = rep(c("real data", "null model", "filtered model"), each = length(avg_accuracy)), accuracy = c(avg_accuracy, null_accuracy, filtered_accuracy))
df$model <- factor(df$model, levels = c("null model", "filtered model", "real data"))
# violin plots
ggplot(df[df$model != "filtered model", ], aes(y = accuracy, x = model, fill = model)) +
geom_violin() +
theme_classic() +
scale_fill_viridis_d(alpha = 0.4) +
ggtitle("Original accuracies") +
labs(y = "Mean accuracy", x = "Model") +
geom_point(aes(x = rep(c(2, 1), each = nrow(df) / 3)), size = 5, show.legend = FALSE, col = "gray43", alpha = 0.8) +
geom_line(aes(x = rep(c(2, 1), each = nrow(df) / 3), group = group), col = "gray43", alpha = 0.8)
ggplot(df[df$model != "real data", ], aes(y = accuracy, x = model, fill = model)) +
geom_violin() +
theme_classic() +
scale_fill_viridis_d(alpha = 0.4) +
ggtitle("Optimized accuracies") +
labs(y = "Mean accuracy", x = "Model") +
geom_point(aes(x = rep(c(1, 2), each = nrow(df) / 3)), size = 3, show.legend = FALSE, col = "gray43", alpha = 0.8) +
geom_line(aes(x = rep(c(1, 2), each = nrow(df) / 3), group = group), col = "gray43", alpha = 0.8)
# density plots
ggplot(df[df$model != "filtered model", ], aes(x = accuracy, fill = model)) +
geom_density(alpha=0.4) +
theme_classic() +
scale_fill_viridis_d() +
ggtitle("Original (non-filtered) accuracies") +
labs(x = "Mean accuracy", y = "Frequency")
ggplot(df[df$model != "real data", ], aes(x = accuracy, fill = model)) +
geom_density(alpha=0.4) +
theme_classic() +
scale_fill_viridis_d() +
ggtitle("Optimized accuracies") +
labs(x = "Mean accuracy", y = "Frequency")
ggplot(data = sensitivities, aes(x = thresholds, y = mean.sensitivity)) +
geom_hline(yintercept = cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
geom_vline(xintercept = thresh_prob, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
geom_point(col = magma(10, alpha = 0.7)[2]) +
geom_line(col = magma(10, alpha = 0.7)[2]) +
labs(y = "Mean sensitivty", x = "Probability threshold") +
theme_classic() +
annotate(geom = "text", x = thresh_prob + 0.04, y = 0.9, label = round(thresh_prob, 2))
# calculate threshold at
n_call_cutoff <- max(sensitivities$n_calls_prop[sensitivities$mean.sensitivity >= cutoff])
ggplot(data = sensitivities, aes(x = thresholds, y = n_calls_prop)) +
geom_hline(yintercept = n_call_cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
geom_vline(xintercept = thresh_prob, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
geom_point(col = magma(10, alpha = 0.7)[2]) +
geom_line(col = magma(10, alpha = 0.7)[2]) +
labs(y = "Proportion of calls used", x = "Probability threshold") +
theme_classic() +
annotate(geom = "text", x = n_call_cutoff + 0.04, y = 0.9, label = round(n_call_cutoff, 2))
ggplot(data = sensitivities, aes(x = mean.sensitivity, y = n_calls_prop)) +
geom_hline(yintercept = n_call_cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
geom_vline(xintercept = cutoff, col = magma(10, alpha = 0.7)[8], size = 1.6, lty = 3) +
geom_point(col = magma(10, alpha = 0.7)[2]) +
geom_line(col = magma(10, alpha = 0.7)[2]) +
labs(y = "Proportion of calls used", x = "Sensitivity") +
theme_classic() +
annotate(geom = "text", y = n_call_cutoff + 0.04, x = cutoff, label = round(n_call_cutoff, 2))
# read acoustic parameter data
acous_param_l <- readRDS("./data/processed/acoustic_parameters_all_groups_all_warbler_acoustic_measurements.RDS")
acous_param_l <- acous_param_l[names(acous_param_l) %in% names(avg_mods)]
# extract random forests from the avg_mods list
random_forests_l <- lapply(avg_mods, "[[", which(names(avg_mods[[1]]) == "random_forests"))
agg_preds_l <- lapply(names(random_forests_l), function(x){
# print(x)
Z <- acous_param_l[[x]]
Y <- Z[Z$experiment == "vuelo grupal/sin sonido", ]
Y$indiv <- NULL
# random forest models for this group
rfms <- random_forests_l[[x]]
# predict using all random forest models
rf_preds <- lapply(rfms, function(x) predict(object = x, data = Y)$predictions)
# add column (individual) if not found
rf_preds <- lapply(rf_preds, function(x){
if(ncol(x) < length(unique(Z$indiv[Z$experiment == "vuelo solo"]))){
# how many columns are missing
mssng <- length(unique(Z$indiv[Z$experiment == "vuelo solo"])) - ncol(x)
# add missing columns with 0
for(i in 1:(mssng)) x <- data.frame(x, 0, check.names = FALSE)
names(x)[(ncol(x) - mssng + 1):ncol(x)] <- setdiff(unique(Z$indiv[Z$experiment == "vuelo solo"]), names(x))
}
return(x)
})
# get together predictions from the same individual
preds_by_indv <- lapply(1:ncol(rf_preds[[1]]), function(y)
do.call(cbind, lapply(rf_preds, function(e) e[, y]))
)
agg_preds <- as.data.frame(lapply(preds_by_indv, rowMeans, na.rm = TRUE))
# add individual name to columns
names(agg_preds) <- colnames(rf_preds[[1]])
# add sound file column
agg_preds$sound.files <- Y$sound.files
# get predicted indiv from aggregated probabilities
agg_preds$pred_indiv <- apply(agg_preds[, sapply(agg_preds, is.numeric)], 1, function(x) colnames(agg_preds)[which.max(x)])
agg_preds$group <- x
agg_preds$max_prob <- apply(agg_preds[, sapply(agg_preds, is.numeric)], 1, max)
return(agg_preds)
})
# add group name to list
names(agg_preds_l) <- names(acous_param_l)
agg_pred <- do.call(rbind, lapply(agg_preds_l, function(x) x[, c("group", "sound.files", "max_prob", "pred_indiv")]))
# remove those groups with low sensitivity
agg_pred <- agg_pred[!agg_pred$group %in% c("42", "6"), ]
# save as RDS
saveRDS(agg_pred, "./data/processed/predicted_individual_in_group_flights.RDS")
# read as RDS
agg_pred <- readRDS("./data/processed/predicted_individual_in_group_flights.RDS")
# get summary by group
summ_by_groups <- lapply(unique(agg_pred$group), function(x) {
Y <- agg_pred[agg_pred$group == x, ]
#total number of calls used
n_used_calls <- sum(Y$max_prob > thresh_prob)
#proportion of calls used
prop_used_calls <- sum(Y$max_prob > thresh_prob) / nrow(Y)
return(data.frame(group = x, n_used_calls, total_calls = nrow(Y), prop_used_calls))
})
summ_by_group <- do.call(rbind, summ_by_groups)
# add row with total across all groups
total_row <- data.frame(group = "All groups", n_used_calls = sum(summ_by_group$n_used_calls), total_calls = sum(summ_by_group$total_calls))
total_row$prop_used_calls <- total_row$n_used_calls / total_row$total_calls
# bind totals to summary table
summ_by_group <- rbind(summ_by_group, total_row)
#print pretty table
df1 <- knitr::kable(summ_by_group, row.names = TRUE, escape = FALSE, format = "html", digits = 2)
df1 <- kable_styling(df1, bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE, font_size = 18)
row_spec(df1, row = nrow(summ_by_group), bold = TRUE, color = "white", background = viridis(10)[8])
 | group | n_used_calls | total_calls | prop_used_calls |
---|---|---|---|---|
1 | 4 | 69 | 69 | 1 |
2 | 3 | 176 | 176 | 1 |
3 | 1 | 46 | 46 | 1 |
4 | 40 | 3 | 3 | 1 |
5 | 9 | 39 | 39 | 1 |
6 | 15 | 59 | 59 | 1 |
7 | 13 | 5 | 5 | 1 |
8 | 18-A-B | 20 | 20 | 1 |
9 | 6/? | 72 | 72 | 1 |
10 | 12/12/N/N | 15 | 15 | 1 |
11 | 12/NN | 58 | 58 | 1 |
12 | All groups | 562 | 562 | 1 |
Using all calls, including those with low class probabilities
# read acoustic parameter data
acous_param_l <- readRDS("./data/processed/acoustic_parameters_all_groups_all_warbler_acoustic_measurements.RDS")
# exclude groups with low sensitivity (42 & 6), low sample size after removing low prob calls (29), or no group calls left (21)
acous_param_l <- acous_param_l[!names(acous_param_l) %in% c("42", "6", "21", "29")]
# get group and solo call counts per individual
sl_vs_grps <- lapply(names(acous_param_l), function(x){
acous_param <- acous_param_l[[x]]
group_file <- acous_param$sound.files[acous_param$experiment == "vuelo grupal/sin sonido"][1]
# get sound file number for group flights
group_file <- strsplit(group_file, ".wav")[[1]][1]
group_file_num <- as.numeric(substr(group_file, start = nchar(group_file) - 6, nchar(group_file)))
# get data for solo flights
solo <- acous_param[acous_param$experiment == "vuelo solo", ]
sl_grp_cll <- lapply(unique(solo$indiv), function(y){
solo_calls <- sum(solo$indiv == y)
group_calls <- sum(agg_pred$group == x & agg_pred$pred_indiv == y)
# get sound file number for solo flights
solo_file <- solo$sound.files[solo$indiv == y][1]
solo_file <- strsplit(solo_file, ".wav")[[1]][1]
solo_file_num <- as.numeric(substr(solo_file, start = nchar(solo_file) - 6, nchar(solo_file)))
return(data.frame(group = x, indiv = y, experiment = c("solo", "group"), calls = c(solo_calls, group_calls), prop_calls = c(solo_calls / nrow(solo), group_calls / sum(agg_pred$group == x)), sound_file_number = c(solo_file_num, group_file_num)))
})
sl_grp_clls <- do.call(rbind, sl_grp_cll)
return(sl_grp_clls)
}
)
sl_vs_grp <- do.call(rbind, sl_vs_grps)
# remove 21 due to no group calls left
sl_vs_grp <- sl_vs_grp[sl_vs_grp$group != "21", ]
# add metadata
caps <- as.data.frame(read_excel("./data/raw/Proyecto MPI enero 2020_3.xlsx", sheet = "Capturas"))
exps_mtdt <- as.data.frame(read_excel("./data/raw/Proyecto MPI enero 2020_3.xlsx", sheet = "Experimento video coor vuelo"))
# time in seconds
exps_mtdt$flight_time <- round(as.numeric(exps_mtdt$`Tiempo de vuelo aprox`) * 60 / 0.04166667)
# remove NA in audio
exps_mtdt <- exps_mtdt[!is.na(exps_mtdt$Audio), ]
## Add flight time for solo and group flight
sl_vs_grp$flight_time <- sapply(1:nrow(sl_vs_grp), function(x){
out <- if (sl_vs_grp$experiment[x] == "solo")
unique(na.omit(exps_mtdt$flight_time[exps_mtdt$Individuo == sl_vs_grp$indiv[x] & exps_mtdt$Experimento == "vuelo solo" & !is.na(exps_mtdt$Individuo)])) else
unique(na.omit(exps_mtdt$flight_time[exps_mtdt$Grupo == sl_vs_grp$group[x] & exps_mtdt$Experimento == "vuelo grupal/sin sonido"]))
return(out)
})
# flight time
sl_vs_grp$flight_time <- sapply(1:nrow(sl_vs_grp), function(x){
ft <- unique(na.omit(exps_mtdt$flight_time[which(as.numeric(exps_mtdt$Audio) == sl_vs_grp$sound_file_number[x])]))
if (length(ft) == 0) ft <- NA
return(ft)
}
)
# add sex
sl_vs_grp$sex <- sapply(sl_vs_grp$indiv, function(x) na.exclude(caps$Sexo[caps$Murci == x])[1], USE.NAMES = FALSE)
sl_vs_grp$sex <- ifelse(sl_vs_grp$sex == "m", "Male", "Female")
# add age
sl_vs_grp$age <- sapply(sl_vs_grp$indiv, function(x) na.exclude(caps$Edad[caps$Murci == x])[1], USE.NAMES = FALSE)
sl_vs_grp$age <- ifelse(sl_vs_grp$age == "sa", "Sub-adult", "Adult")
sl_vs_grp$reprod.stg <- sapply(sl_vs_grp$indiv, function(x) na.exclude(caps$`Estado reproductivo`[caps$Murci == x])[1], USE.NAMES = FALSE)
sl_vs_grp$reprod.stg[sl_vs_grp$reprod.stg == "p?"] <- "Pregnant"
sl_vs_grp$reprod.stg[sl_vs_grp$reprod.stg == "ne"] <- "Inactive"
# order levels
sl_vs_grp$experiment <- factor(sl_vs_grp$experiment, levels = c("solo", "group"))
sl_vs_grp$call_rate <- sl_vs_grp$calls / (sl_vs_grp$flight_time / 60)
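# sanity check on units (assuming flight_time is in seconds, as computed above): e.g. 30 calls in a 120 s flight give 30 / (120 / 60) = 15 calls per minute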
# calling rate solo vs group flights
ggplot(sl_vs_grp, aes(y = call_rate, x = experiment, col = indiv, shape = sex)) +
geom_hline(yintercept = mean(sl_vs_grp$call_rate[sl_vs_grp$experiment == "group"]), lty = 2, alpha = 0.5) +
geom_hline(yintercept = mean(sl_vs_grp$call_rate[sl_vs_grp$experiment == "solo"], na.rm = TRUE), lty = 2, alpha = 0.5) +
geom_point(size = 5, alpha = 0.8) +
theme_classic() +
scale_color_viridis_d(alpha = 0.4, guide=FALSE) +
ggtitle("Change in calling rate by sex and individual") +
labs(x = "Experiment", y = "Calling rate (calls / min)") +
geom_line(aes(group = indiv), col = "gray43", alpha = 0.8) +
facet_wrap(~ group, nrow = 4)
Using only calls with class probabilities higher than 0.21
# get group and solo call counts per individual
sl_vs_grps <- lapply(names(acous_param_l), function(x){
acous_param <- acous_param_l[[x]]
group_file <- acous_param$sound.files[acous_param$experiment == "vuelo grupal/sin sonido"][1]
# get sound file number for group flights
group_file <- strsplit(group_file, ".wav")[[1]][1]
group_file_num <- as.numeric(substr(group_file, start = nchar(group_file) - 6, nchar(group_file)))
# get data for solo flights
solo <- acous_param[acous_param$experiment == "vuelo solo", ]
sl_grp_cll <- lapply(unique(solo$indiv), function(y){
solo_calls <- sum(solo$indiv == y)
# only count calls with class probability higher than the threshold
group_calls <- sum(agg_pred$group == x & agg_pred$pred_indiv == y & agg_pred$max_prob > thresh_prob)
# get sound file number for solo flights
solo_file <- solo$sound.files[solo$indiv == y][1]
solo_file <- strsplit(solo_file, ".wav")[[1]][1]
solo_file_num <- as.numeric(substr(solo_file, start = nchar(solo_file) - 6, nchar(solo_file)))
return(data.frame(group = x, indiv = y, experiment = c("solo", "group"), calls = c(solo_calls, group_calls), prop_calls = c(solo_calls / nrow(solo), group_calls / sum(agg_pred$group == x)), sound_file_number = c(solo_file_num, group_file_num)))
})
sl_grp_clls <- do.call(rbind, sl_grp_cll)
return(sl_grp_clls)
}
)
sl_vs_grp <- do.call(rbind, sl_vs_grps)
# remove 21 due to no group calls left
sl_vs_grp <- sl_vs_grp[sl_vs_grp$group != "21", ]
# add metadata
caps <- as.data.frame(read_excel("./data/raw/Proyecto MPI enero 2020_3.xlsx", sheet = "Capturas"))
exps_mtdt <- as.data.frame(read_excel("./data/raw/Proyecto MPI enero 2020_3.xlsx", sheet = "Experimento video coor vuelo"))
# time in seconds
exps_mtdt$flight_time <- round(as.numeric(exps_mtdt$`Tiempo de vuelo aprox`) * 60 / 0.04166667)
# remove NA in audio
exps_mtdt <- exps_mtdt[!is.na(exps_mtdt$Audio), ]
## Add flight time for solo and group flight
sl_vs_grp$flight_time <- sapply(1:nrow(sl_vs_grp), function(x){
out <- if (sl_vs_grp$experiment[x] == "solo")
unique(na.omit(exps_mtdt$flight_time[exps_mtdt$Individuo == sl_vs_grp$indiv[x] & exps_mtdt$Experimento == "vuelo solo" & !is.na(exps_mtdt$Individuo)])) else
unique(na.omit(exps_mtdt$flight_time[exps_mtdt$Grupo == sl_vs_grp$group[x] & exps_mtdt$Experimento == "vuelo grupal/sin sonido"]))
return(out)
})
# flight time
sl_vs_grp$flight_time <- sapply(1:nrow(sl_vs_grp), function(x){
ft <- unique(na.omit(exps_mtdt$flight_time[which(as.numeric(exps_mtdt$Audio) == sl_vs_grp$sound_file_number[x])]))
if (length(ft) == 0) ft <- NA
return(ft)
}
)
# add sex
sl_vs_grp$sex <- sapply(sl_vs_grp$indiv, function(x) na.exclude(caps$Sexo[caps$Murci == x])[1], USE.NAMES = FALSE)
sl_vs_grp$sex <- ifelse(sl_vs_grp$sex == "m", "Male", "Female")
# add age
sl_vs_grp$age <- sapply(sl_vs_grp$indiv, function(x) na.exclude(caps$Edad[caps$Murci == x])[1], USE.NAMES = FALSE)
sl_vs_grp$age <- ifelse(sl_vs_grp$age == "sa", "Sub-adult", "Adult")
sl_vs_grp$reprod.stg <- sapply(sl_vs_grp$indiv, function(x) na.exclude(caps$`Estado reproductivo`[caps$Murci == x])[1], USE.NAMES = FALSE)
sl_vs_grp$reprod.stg[sl_vs_grp$reprod.stg == "p?"] <- "Pregnant"
sl_vs_grp$reprod.stg[sl_vs_grp$reprod.stg == "ne"] <- "Inactive"
# order levels
sl_vs_grp$experiment <- factor(sl_vs_grp$experiment, levels = c("solo", "group"))
sl_vs_grp$call_rate <- sl_vs_grp$calls / (sl_vs_grp$flight_time / 60)
# calling rate solo vs group flights
ggplot(sl_vs_grp, aes(y = call_rate, x = experiment, col = indiv, shape = sex)) +
geom_hline(yintercept = mean(sl_vs_grp$call_rate[sl_vs_grp$experiment == "group"]), lty = 2, alpha = 0.5) +
geom_hline(yintercept = mean(sl_vs_grp$call_rate[sl_vs_grp$experiment == "solo"], na.rm = TRUE), lty = 2, alpha = 0.5) +
geom_point(size = 5, alpha = 0.8) +
theme_classic() +
scale_color_viridis_d(alpha = 0.4, guide=FALSE) +
ggtitle("Change in calling rate by sex and individual") +
labs(x = "Experiment", y = "Calling rate (calls / min)") +
geom_line(aes(group = indiv), col = "gray43", alpha = 0.8) +
facet_wrap(~ group, nrow = 4)
# violin plots
# ggplot(sl_vs_grp, aes(y = prop_calls, x = experiment, col = indiv, shape = sex)) +
# geom_point(size = 5, alpha = 0.8) +
# theme_classic() +
# scale_color_viridis_d(alpha = 0.4, guide=FALSE) +
# ggtitle("Original accuracies") +
# labs(y = "Mean accuracy", x = "Model") +
# geom_line(aes(group = indiv), col = "gray43", alpha = 0.8) +
# facet_wrap(~ group, nrow = 4) + ggtitle(label = "Change in calling activity by sex and individual")
Using only calls with class probabilities higher than 0.21 and assigning low-probability calls evenly across individuals
# get group and solo call counts per individual
sl_vs_grps <- lapply(names(acous_param_l), function(x){
acous_param <- acous_param_l[[x]]
group_file <- acous_param$sound.files[acous_param$experiment == "vuelo grupal/sin sonido"][1]
# get sound file number for group flights
group_file <- strsplit(group_file, ".wav")[[1]][1]
group_file_num <- as.numeric(substr(group_file, start = nchar(group_file) - 6, nchar(group_file)))
# get data for solo flights
solo <- acous_param[acous_param$experiment == "vuelo solo", ]
sl_grp_cll <- lapply(unique(solo$indiv), function(y){
solo_calls <- sum(solo$indiv == y)
# only count calls with class probability higher than the threshold
group_calls <- sum(agg_pred$group == x & agg_pred$pred_indiv == y & agg_pred$max_prob > thresh_prob)
# distribute low-probability calls evenly across individuals (e.g. 10 unassigned calls split among 4 predicted individuals add 2.5 calls to each)
group_calls <- group_calls + sum(agg_pred$group == x & agg_pred$max_prob <= thresh_prob) / length(unique(agg_pred$pred_indiv[agg_pred$group == x]))
# get sound file number for solo flights
solo_file <- solo$sound.files[solo$indiv == y][1]
solo_file <- strsplit(solo_file, ".wav")[[1]][1]
solo_file_num <- as.numeric(substr(solo_file, start = nchar(solo_file) - 6, nchar(solo_file)))
return(data.frame(group = x, indiv = y, experiment = c("solo", "group"), calls = c(solo_calls, group_calls), prop_calls = c(solo_calls / nrow(solo), group_calls / sum(agg_pred$group == x)), sound_file_number = c(solo_file_num, group_file_num), sound_files = c(solo_file, group_file)))
})
sl_grp_clls <- do.call(rbind, sl_grp_cll)
return(sl_grp_clls)
}
)
sl_vs_grp <- do.call(rbind, sl_vs_grps)
# remove 21 due to no group calls left
sl_vs_grp <- sl_vs_grp[sl_vs_grp$group != "21", ]
# add metadata
caps <- as.data.frame(read_excel("./data/raw/Proyecto MPI enero 2020_3.xlsx", sheet = "Capturas"))
exps_mtdt <- as.data.frame(read_excel("./data/raw/Proyecto MPI enero 2020_3.xlsx", sheet = "Experimento video coor vuelo"))
# time in seconds
exps_mtdt$flight_time <- round(as.numeric(exps_mtdt$`Tiempo de vuelo aprox`) * 60 / 0.04166667)
# remove NA in audio
exps_mtdt <- exps_mtdt[!is.na(exps_mtdt$Audio), ]
## Add flight time for solo and group flight
sl_vs_grp$flight_time <- sapply(1:nrow(sl_vs_grp), function(x){
out <- if (sl_vs_grp$experiment[x] == "solo")
unique(na.omit(exps_mtdt$flight_time[exps_mtdt$Individuo == sl_vs_grp$indiv[x] & exps_mtdt$Experimento == "vuelo solo" & !is.na(exps_mtdt$Individuo)])) else
unique(na.omit(exps_mtdt$flight_time[exps_mtdt$Grupo == sl_vs_grp$group[x] & exps_mtdt$Experimento == "vuelo grupal/sin sonido"]))
return(out)
})
# flight time
sl_vs_grp$flight_time <- sapply(1:nrow(sl_vs_grp), function(x){
ft <- unique(na.omit(exps_mtdt$flight_time[which(as.numeric(exps_mtdt$Audio) == sl_vs_grp$sound_file_number[x])]))
if (length(ft) == 0) ft <- NA
return(ft)
}
)
# add sex
sl_vs_grp$sex <- sapply(sl_vs_grp$indiv, function(x) na.exclude(caps$Sexo[caps$Murci == x])[1], USE.NAMES = FALSE)
sl_vs_grp$sex <- ifelse(sl_vs_grp$sex == "m", "Male", "Female")
# add age
sl_vs_grp$age <- sapply(sl_vs_grp$indiv, function(x) na.exclude(caps$Edad[caps$Murci == x])[1], USE.NAMES = FALSE)
sl_vs_grp$age <- ifelse(sl_vs_grp$age == "sa", "Sub-adult", "Adult")
sl_vs_grp$reprod.stg <- sapply(sl_vs_grp$indiv, function(x) na.exclude(caps$`Estado reproductivo`[caps$Murci == x])[1], USE.NAMES = FALSE)
sl_vs_grp$reprod.stg[sl_vs_grp$reprod.stg == "p?"] <- "Pregnant"
sl_vs_grp$reprod.stg[sl_vs_grp$reprod.stg == "ne"] <- "Inactive"
# order levels
sl_vs_grp$experiment <- factor(sl_vs_grp$experiment, levels = c("solo", "group"))
sl_vs_grp$call_rate <- sl_vs_grp$calls / (sl_vs_grp$flight_time / 60)
# calling rate solo vs group flights
ggplot(sl_vs_grp, aes(y = call_rate, x = experiment, col = indiv, shape = sex)) +
geom_hline(yintercept = mean(sl_vs_grp$call_rate[sl_vs_grp$experiment == "group"]), lty = 2, alpha = 0.5) +
geom_hline(yintercept = mean(sl_vs_grp$call_rate[sl_vs_grp$experiment == "solo"], na.rm = TRUE), lty = 2, alpha = 0.5) +
geom_point(size = 5, alpha = 0.8) +
theme_classic() +
scale_color_viridis_d(alpha = 0.4, guide=FALSE) +
ggtitle("Change in calling rate by sex and individual") +
labs(x = "Experiment", y = "Calling rate (calls / min)") +
geom_line(aes(group = indiv), col = "gray43", alpha = 0.8) +
facet_wrap(~ group, nrow = 4)
clls <- readRDS("./output/CALLS_fixed_detections_inquiry_calls.RDS")
sl_vs_grp$sound_files <- paste0(sl_vs_grp$sound_files, ".wav")
seltab <- attributes(clls)$check.results
gap_l <- lapply(1:nrow(sl_vs_grp), function(x){
Y <- seltab[seltab$orig.sound.files == sl_vs_grp$sound_files[x], ]
Y <- Y[order(Y$orig.start), ]
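# inter-call gaps: onset of each call minus offset of the previous call (in seconds), within the original sound file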
gaps <- Y$orig.start[-1] - Y$orig.end[-nrow(Y)]
return(data.frame(group = sl_vs_grp$group[x], sound.files = sl_vs_grp$sound_files[x] ,experiment = sl_vs_grp$experiment[x], gaps = ifelse(length(gaps) == 0, NA, gaps)))
})
gaps <- do.call(rbind, gap_l)
ggplot(gaps, aes(x = gaps, fill = experiment, group = experiment)) +
geom_density() +
scale_fill_viridis_d(alpha = 0.4) +
ggtitle("Gaps sec by group") +
theme_classic() +
facet_wrap(~group, ncol = 4, scales = "free_y")
ggplot(gaps, aes(x = gaps, fill = experiment, group = experiment)) +
geom_density() +
ggtitle("Gaps sec groups combined") +
scale_fill_viridis_d(alpha = 0.4) +
theme_classic()
Session information
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=pt_BR.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=pt_BR.UTF-8 LC_COLLATE=pt_BR.UTF-8
## [5] LC_MONETARY=pt_BR.UTF-8 LC_MESSAGES=pt_BR.UTF-8
## [7] LC_PAPER=pt_BR.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=pt_BR.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] soundgen_1.8.2 shinyBS_0.61 Sim.DiffProc_4.7 rlang_0.4.8
## [5] kableExtra_1.2.1 viridis_0.5.1 viridisLite_0.3.0 pbapply_1.4-2
## [9] e1071_1.7-4 caret_6.0-86 ggplot2_3.3.0 lattice_0.20-41
## [13] ranger_0.12.1 readxl_1.3.1 warbleR_1.1.24 NatureSounds_1.0.3
## [17] knitr_1.28 seewave_2.1.5 tuneR_1.3.3 devtools_2.3.2
## [21] usethis_1.6.3
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 rjson_0.2.20 ellipsis_0.3.1
## [4] class_7.3-17 rprojroot_1.3-2 fs_1.5.0
## [7] rstudioapi_0.11 proxy_0.4-24 farver_2.0.3
## [10] Deriv_4.1.0 remotes_2.2.0 prodlim_2019.11.13
## [13] fansi_0.4.1 lubridate_1.7.9 xml2_1.3.2
## [16] codetools_0.2-16 splines_4.0.3 pkgload_1.1.0
## [19] pROC_1.16.2 shiny_1.5.0 compiler_4.0.3
## [22] httr_1.4.2 backports_1.1.7 fastmap_1.0.1
## [25] assertthat_0.2.1 Matrix_1.2-18 cli_2.0.2
## [28] later_1.0.0 htmltools_0.5.0 prettyunits_1.1.1
## [31] tools_4.0.3 gtable_0.3.0 glue_1.4.0
## [34] reshape2_1.4.4 dplyr_1.0.2 Rcpp_1.0.4.6
## [37] cellranger_1.1.0 vctrs_0.3.4 nlme_3.1-149
## [40] iterators_1.0.13 timeDate_3043.102 gower_0.2.2
## [43] xfun_0.13 stringr_1.4.0 ps_1.3.3
## [46] testthat_2.3.2 rvest_0.3.6 mime_0.9
## [49] lifecycle_0.2.0 MASS_7.3-53 scales_1.1.1
## [52] ipred_0.9-9 promises_1.1.0 parallel_4.0.3
## [55] yaml_2.2.1 memoise_1.1.0 gridExtra_2.3
## [58] rpart_4.1-15 stringi_1.4.6 desc_1.2.0
## [61] foreach_1.5.1 pkgbuild_1.1.0 lava_1.6.8
## [64] pkgconfig_2.0.3 dtw_1.21-3 bitops_1.0-6
## [67] evaluate_0.14 purrr_0.3.4 labeling_0.3
## [70] recipes_0.1.14 processx_3.4.4 tidyselect_1.1.0
## [73] plyr_1.8.6 magrittr_1.5 R6_2.4.1
## [76] fftw_1.0-6 generics_0.0.2 pillar_1.4.4
## [79] withr_2.2.0 survival_3.2-7 RCurl_1.98-1.2
## [82] nnet_7.3-14 tibble_3.0.1 crayon_1.3.4
## [85] rmarkdown_2.1 grid_4.0.3 data.table_1.13.0
## [88] callr_3.5.1 ModelMetrics_1.2.2.2 digest_0.6.25
## [91] webshot_0.5.2 xtable_1.8-4 httpuv_1.5.2
## [94] signal_0.7-6 stats4_4.0.3 munsell_0.5.0
## [97] sessioninfo_1.1.1