Classifying USV subtypes

Rat vocalization and alcohol

Author

Marcelo Araya-Salas

Published

March 21, 2025

Source code and data found at https://github.com/maRce10/rat_vocalization_alcohol

1 Purpose

Optimizing USV subtype classification based on acoustic features

Load packages and set custom parameters

Code

# knitr is require for creating html/pdf/word reports formatR is
# used for soft-wrapping code

# install/ load packages
sketchy::load_packages(packages = c("knitr", "formatR", "viridis",
    "warbleR", github = "maRce10/Rraven", github = "maRce10/warbleR",
    github = "maRce10/ohun", "caret", "randomForest", "DiagrammeR",
    "beepr", "pbapply", "ggridges", "doParallel"))

# recording hours isf <- info_sound_files() rh <-
# sum(isf$duration) / 60
rh <- 1455

# Define the custom colors
custom_colors <- rev(c("#38AAAC66", "#54C9AD66", "#A0DFB966", "#DEF5E566",
    "#ffffff"))

# Create a color palette function
custom_palette <- colorRampPalette(custom_colors)

# Generate a gradient of colors from the custom palette
gradient_colors <- custom_palette(100)  # 100 colors for a smooth gradient

Prepare data

Code

anns <- read.csv("./data/processed/curated_final_data_set_single_unit_and_composed_annotations_and_spectrogram_and_mel_features.csv")

anns$modulation.index <- anns$modindx * anns$dfrange/(anns$end - anns$start)

2 Describe annotations (single element subtypes)

Single units were manually annotated in Raven
Composed calls (2 or more single units) were determined automatically as those in which single units were 5 ms apart or less
The data set anns contains all single units for the composed calls as single annotations (so several annotations per composed subtype) as well as the single unit calls (the column anns$composed)
The column call_id contains a unique ID for each call (either single or multielement)
The data comes from an acoustic data set with 1455 recording hours
78 sound files were analyzed
16506 single units were manually annotated
10147 single unit calls
3030 composed calls

3 Summarize features for composed calls

Acoustic features are summarized as the average value weighted by the duration of each composing single unit

Code

sel_feats <- names(anns)[c(16:219, 226)]

# features at the composed level
comp_feats <- song_analysis(X = anns, song_colm = "call_id", mean_colm = sel_feats,
    weight = "duration", parallel = 20)

comp_feats$subtype <- sapply(comp_feats$call_id, function(x) anns$overall_type[anns$call_id ==
    x][1])

comp_feats$subunit_type <- sapply(comp_feats$call_id, function(x) anns$subunit_type[anns$call_id ==
    x][1])

comp_feats$ordered_subunit_type <- sapply(comp_feats$call_id, function(x) anns$ordered_subunit_type[anns$call_id ==
    x][1])


write.csv(comp_feats, "./data/processed/call_level_features.csv",
    row.names = FALSE)

Code

# read
comp_feats <- read.csv("./data/processed/call_level_features.csv")

comp_feats$composed <- !is.na(comp_feats$gap.duration)
comp_feats$gap.duration <- ifelse(is.na(comp_feats$gap.duration),
    0, comp_feats$gap.duration)

# order levels by mean
agg <- aggregate(modulation.index ~ subtype, data = comp_feats, mean)

comp_feats$subtype <- factor(comp_feats$subtype, levels = agg[order(agg$modulation.index),
    "subtype"])

4 Classify subtypes

Random forest is used for supervised classification (30 interations and 100 repeats)
The data set is split into 75% training and 25% testing
The model is trained using the training set and tested on the testing set

4.1 Classification using manual single unit classification

It uses the manually classified single unit categories and their combination for composed calls as predictors
This approach represents the base case scenario in which there is no error in single unit classification
It shows how distinguishable subtypes are

flowchart LR
  A(Manual single\nunit classification) --> B(Measure\nacoustic\nstructure) 
  B --> C(Build\ncomposed\ntypes)
  C --> D(Average\nvalues for\ncomposed calls)
  D --> E(Supervised\nsubtype classification\nof calls)

style A fill:#0B04054D
style B fill:#3E356B4D
style C fill:#357BA24D
style D fill:#49C1AD4D
style E fill:#DEF5E54D

Code

# remove collinear
target_features <- names(comp_feats)[!names(comp_feats) %in% c("sound.files",
    "selec", "start", "end", "bottom.freq", "top.freq", "subtype",
    names(comp_feats)[sapply(comp_feats, anyNA)])]


num_target_features <- target_features[sapply(comp_feats[, target_features],
    is.numeric)]
cat_target_features <- target_features[!sapply(comp_feats[, target_features],
    is.numeric)]

cormat <- cor(comp_feats[, num_target_features], use = "pairwise.complete.obs")
hc <- findCorrelation(cormat, cutoff = 0.95)
hc <- sort(hc)

num_target_features <- num_target_features[-hc]
num_target_features <- unique(c(num_target_features, "gap.duration",
    "elm.duration"))

target_features <- c(num_target_features, cat_target_features)

# Create data subsets
partition <- createDataPartition(y = comp_feats$subtype, times = 1,
    p = 0.75, list = FALSE)

comp_feats$subtype <- make.names(comp_feats$subtype, unique = FALSE,
    allow_ = TRUE)

comp_feats$subtype <- as.factor(comp_feats$subtype)

trainset <- comp_feats[partition, c(target_features, "subtype")]
testset <- comp_feats[-partition, c(target_features, "subtype")]
testset <- testset[testset$subunit_type %in% unique(trainset$subunit_type),
    ]

trcontrol <- trainControl(method = "repeatedcv", number = 100, repeats = 30,
    savePredictions = TRUE, classProbs = TRUE)

# use parallelization
registerDoParallel(cores = 20)

pred_model <- train(subtype ~ ., data = trainset, method = "rf", trControl = trcontrol,
    metric = "Accuracy")

beepr::beep(2)

ggplot(pred_model) + theme_bw()

# save confusion matrix
conf_mat <- confusionMatrix(predict(pred_model, testset), testset$subtype)

conf_df <- as.data.frame(conf_mat$table)

conf_df$total <- sapply(conf_df$Reference, function(x) sum(testset$subtype ==
    x))

conf_df$proportion <- conf_df$Freq/conf_df$total


rf_model_results <- list(pred_model_bb = pred_model, conf_mat_bb = conf_mat,
    confusion_df_bb = conf_df, testset_bb = testset)

saveRDS(rf_model_results, "./data/processed/random_forest_model_results_perfect_subunit_classification.RDS")

4.1.1 Checking performance on test data

Code

rf_model_results <-
    readRDS("./data/processed/random_forest_model_results_perfect_subunit_classification.RDS")

# print confusion matrix results
#rf_model_results$conf_mat_bb

confusion_df <- rf_model_results$confusion_df_bb

confusion_df$Prediction <- as.character(confusion_df$Prediction)
confusion_df$Prediction <- gsub("step.", "step-", confusion_df$Prediction)
confusion_df$Prediction <- gsub("complex.", "complex-", confusion_df$Prediction)
confusion_df$Prediction <- gsub(".", " ", confusion_df$Prediction, fixed = TRUE)

confusion_df$Reference <- as.character(confusion_df$Reference)
confusion_df$Reference <- gsub("step.", "step-", confusion_df$Reference)
confusion_df$Reference <- gsub("complex.", "complex-", confusion_df$Reference)
confusion_df$Reference <- gsub(".", " ", confusion_df$Reference, fixed = TRUE)

# Use the custom palette in the ggplot
ggplot(confusion_df, aes(x = Reference, y = Prediction, fill = proportion)) +
    geom_tile() +
    coord_equal() +
    scale_fill_gradientn(colors = gradient_colors) +  # Use custom gradient
    geom_text(aes(label = round(proportion, 2)), color = "black", size = 3) +
    theme_classic() +
    labs(x = "Observed", y = "Predicted", title = paste("Overall accuracy:", round(max(rf_model_results$pred_model_bb$results$Accuracy), 2))) + 
    theme(axis.text.x = element_text(
        color = "black",
        size = 11,
        angle = 30,
        vjust = 0.8,
        hjust = 0.8
    ))

4.1.1.1 Feature importance

Which predictor features contribute the most to the classification

Code

# feature importance
imp <- varImp(rf_model_results$pred_model_bb)
dat <- ggplot(imp, top = 10) + theme_classic()

ggplot(data = dat$data, aes(x = Importance, y = Feature, fill = after_stat(x))) +
    scale_fill_viridis_c(name = "Importance", option = "G", guide = "none",
        begin = 0.4) + geom_bar(stat = "identity") + theme_classic()

4.2 Classification without single unit classification

It only uses the numeric features of single and composed calls
Skips any classification of single units

flowchart LR
  A(Measure\nacoustic\nstructure) --> B(Build\ncomposed\ntypes)
  B --> C(Average\nvalues for\ncomposed calls)
  C --> D(Supervised\nsubtype classification\nof calls)

style A fill:#0B04054D
style B fill:#3E356B4D
style C fill:#357BA24D
style D fill:#DEF5E54D

Code

comp_feats$composed <- !is.na(comp_feats$gap.duration)
comp_feats$gap.duration <- ifelse(is.na(comp_feats$gap.duration),
    0, comp_feats$gap.duration)

# remove collinear
target_features <- names(comp_feats)[!names(comp_feats) %in% c("sound.files",
    "selec", "start", "end", "bottom.freq", "top.freq", "subtype",
    names(comp_feats)[sapply(comp_feats, anyNA)])]

num_target_features <- target_features[sapply(comp_feats[, target_features],
    is.numeric)]
cat_target_features <- "composed"

cormat <- cor(comp_feats[, num_target_features], use = "pairwise.complete.obs")
hc <- findCorrelation(cormat, cutoff = 0.95)
hc <- sort(hc)

num_target_features <- num_target_features[-hc]
num_target_features <- unique(c(num_target_features, "gap.duration",
    "elm.duration"))

target_features <- c(num_target_features, cat_target_features)

set.seed(123)

# Create data subsets
partition <- createDataPartition(y = comp_feats$subtype, times = 1,
    p = 0.75, list = FALSE)

comp_feats$subtype <- make.names(comp_feats$subtype, unique = FALSE,
    allow_ = TRUE)

comp_feats$subtype <- as.factor(comp_feats$subtype)

trainset <- comp_feats[partition, c(target_features, "subtype")]
testset <- comp_feats[-partition, c(target_features, "subtype")]

trcontrol <- trainControl(method = "repeatedcv", number = 100, repeats = 30,
    savePredictions = TRUE, classProbs = TRUE)

# use parallelization
registerDoParallel(cores = 20)

pred_model <- train(subtype ~ ., data = trainset, method = "rf", trControl = trcontrol,
    metric = "Accuracy")

beepr::beep(2)

ggplot(pred_model) + theme_bw()

# save confusion matrix
conf_mat <- confusionMatrix(predict(pred_model, testset), testset$subtype)

conf_df <- as.data.frame(conf_mat$table)

conf_df$total <- sapply(conf_df$Reference, function(x) sum(testset$subtype ==
    x))

conf_df$proportion <- conf_df$Freq/conf_df$total


rf_model_results <- list(pred_model_bb = pred_model, conf_mat_bb = conf_mat,
    confusion_df_bb = conf_df, testset_bb = testset)

saveRDS(rf_model_results, "./data/processed/random_forest_model_results_no_subunit_classification.RDS")

4.2.1 Checking performance on test data

Code

rf_model_results <-
    readRDS("./data/processed/random_forest_model_results_no_subunit_classification.RDS")

# print confusion matrix results
#rf_model_results$conf_mat_bb

confusion_df <- rf_model_results$confusion_df_bb

confusion_df$Prediction <- as.character(confusion_df$Prediction)
confusion_df$Prediction <- gsub("step.", "step-", confusion_df$Prediction)
confusion_df$Prediction <- gsub("complex.", "complex-", confusion_df$Prediction)
confusion_df$Prediction <- gsub(".", " ", confusion_df$Prediction, fixed = TRUE)

confusion_df$Reference <- as.character(confusion_df$Reference)
confusion_df$Reference <- gsub("step.", "step-", confusion_df$Reference)
confusion_df$Reference <- gsub("complex.", "complex-", confusion_df$Reference)
confusion_df$Reference <- gsub(".", " ", confusion_df$Reference, fixed = TRUE)

ggplot(confusion_df, aes(x = Reference, y = Prediction, fill = proportion)) +
    geom_tile() +
    coord_equal() +
      scale_fill_gradientn(colors = gradient_colors) +  # Use custom gradient  
    geom_text(aes(label = round(proportion, 2)), color = "black", size = 3) +
    theme_classic() +
    labs(x = "Observed", y = "Predicted", title = paste("Overall accuracy:", round(max(rf_model_results$pred_model_bb$results$Accuracy), 2))) +
    theme(axis.text.x = element_text(
        color = "black",
        size = 11,
        angle = 30,
        vjust = 0.8,
        hjust = 0.8
    ))

4.2.1.1 Feature importance

Which predictor features contribute the most to the classification

Code

# feature importance
imp <- varImp(rf_model_results$pred_model_bb)
dat <- ggplot(imp, top = 10) + theme_classic()

ggplot(data = dat$data, aes(x = Importance, y = Feature, fill = after_stat(x))) +
    scale_fill_viridis_c(name = "Importance", option = "G", guide = "none",
        begin = 0.4) + geom_bar(stat = "identity") + theme_classic()

4.3 2 step random forest

The first step classifies single units in a supervised manner
The second step classifies composed calls using the single units categories from the previous step along with the numeric features as predictors

flowchart LR
  A(Measure\nacoustic\nstructure) --> B(Supervised single\nunit classification)
  B --> C(Build\ncomposed\ntypes)
  C --> D(Average\nvalues for\ncomposed calls)
  D --> E(Supervised\nsubtype classification\nof calls\nusing single unit\nclassification)

style A fill:#0B04054D
style B fill:#3E356B4D
style C fill:#357BA24D
style D fill:#49C1AD4D
style E fill:#DEF5E54D

4.3.1 1. Classified single units

Code

single_anns <- anns[!anns$composed, ]

# remove collinear
target_features <- names(single_anns)[!names(single_anns) %in% c("sound.files",
    "selec", "start", "end", "bottom.freq", "top.freq", "subtype",
    "Delta.Time..s.", "composed.subtype", "ovlp.sels", "call_id",
    "Channel", "subtipos", names(comp_feats)[sapply(single_anns, anyNA)])]

target_features <- target_features[sapply(single_anns[, target_features],
    is.numeric)]

cormat <- cor(single_anns[, target_features], use = "pairwise.complete.obs")
hc <- findCorrelation(cormat, cutoff = 0.95)
hc <- sort(hc)

target_features <- target_features[-hc]
target_features <- unique(c(target_features, "duration"))

set.seed(123)

# Create data subsets
partition <- createDataPartition(y = single_anns$subtype, times = 1,
    p = 0.75, list = FALSE)

# single_anns$subtype <- make.names(single_anns$subtype, unique
# = FALSE, allow_ = TRUE)

single_anns$subtype <- as.factor(single_anns$subtype)

trainset <- single_anns[partition, c(target_features, "subtype")]
testset <- single_anns[-partition, c(target_features, "subtype")]

trcontrol <- trainControl(method = "repeatedcv", number = 100, repeats = 30,
    savePredictions = TRUE, classProbs = TRUE)

# use parallelization
registerDoParallel(cores = 20)

pred_model <- train(subtype ~ ., data = trainset, method = "rf", trControl = trcontrol,
    metric = "Accuracy")

beepr::beep(2)

ggplot(pred_model) + theme_bw()

# save confusion matrix
conf_mat <- confusionMatrix(predict(pred_model, testset), testset$subtype)

conf_df <- as.data.frame(conf_mat$table)

conf_df$total <- sapply(conf_df$Reference, function(x) sum(testset$subtype ==
    x))

conf_df$proportion <- conf_df$Freq/conf_df$total


rf_model_results_step_1 <- list(pred_model_bb = pred_model, conf_mat_bb = conf_mat,
    confusion_df_bb = conf_df, testset_bb = testset)

saveRDS(rf_model_results_step_1, "./data/processed/random_forest_subunit_classification.RDS")

4.3.1.1 Checking performance on test data

Code

rf_model_results_step_1 <-
    readRDS("./data/processed/random_forest_subunit_classification.RDS")

# print confusion matrix results
#rf_model_results_step_1$conf_mat_bb

confusion_df <- rf_model_results_step_1$confusion_df_bb

ggplot(confusion_df, aes(x = Reference, y = Prediction, fill = proportion)) +
    geom_tile() +
    coord_equal() +
      scale_fill_gradientn(colors = gradient_colors) +  # Use custom gradient  
    geom_text(aes(label = round(proportion, 2)), color = "black", size = 3) +
    theme_classic() +
    labs(x = "Observed", y = "Predicted", title = paste("Overall accuracy:", round(max(rf_model_results_step_1$pred_model_bb$results$Accuracy), 2))) +
    theme(axis.text.x = element_text(
        color = "black",
        size = 11,
        angle = 30,
        vjust = 0.8,
        hjust = 0.8
    ))

4.3.1.1.1 Feature importance

Which predictor features contribute the most to the classification

Code

# feature importance
imp <- varImp(rf_model_results_step_1$pred_model_bb)
dat <- ggplot(imp, top = 10) + theme_classic()

ggplot(data = dat$data, aes(x = Importance, y = Feature, fill = after_stat(x))) +
    scale_fill_viridis_c(name = "Importance", option = "G", guide = "none",
        begin = 0.4) + geom_bar(stat = "identity") + theme_classic()

4.3.2 2. Random forest with unsupervised-classified sub-units

Code

anns$superv_subunit_type <- NA

anns$superv_subunit_type[!anns$composed] <- as.character(predict(rf_model_results_step_1$pred_model_bb,
    anns[!anns$composed, names(anns) != "subtype"]))


comp_feats$superv_subunit_type <- sapply(comp_feats$call_id, function(x) {

    subtypes <- anns$subtype.label[anns$call_id == x]
    paste(subtypes, collapse = "-")
})

comp_feats$superv_ordered_subunit_type <- sapply(comp_feats$call_id,
    function(x) {

        subtypes <- anns$subtype.label[anns$call_id == x]
        order_subtypes <- sort(subtypes)
        paste(order_subtypes, collapse = "-")

    })

comp_feats$subtype <- make.names(comp_feats$subtype, unique = FALSE,
    allow_ = TRUE)

comp_feats$subtype <- as.factor(comp_feats$subtype)

Code

# remove collinear
target_features <- names(comp_feats)[!names(comp_feats) %in% c("call_id",
    "sound.files", "selec", "start", "end", "bottom.freq", "top.freq",
    "subtype", names(comp_feats)[sapply(comp_feats, anyNA)])]


num_target_features <- target_features[sapply(comp_feats[, target_features],
    is.numeric)]

cat_target_features <- c("composed", "superv_subunit_type", "superv_ordered_subunit_type")

cormat <- cor(comp_feats[, num_target_features], use = "pairwise.complete.obs")
hc <- findCorrelation(cormat, cutoff = 0.95)
hc <- sort(hc)

num_target_features <- num_target_features[-hc]
num_target_features <- unique(c(num_target_features, "gap.duration",
    "elm.duration"))

target_features <- c(num_target_features, cat_target_features)

# factor superv_subunit_type has new levels
# complex-trill-complex, flat-complex-flat-trill,
# flat-flat-trill-complex, trill-flat-flat-complex-trill

# Create data subsets
partition <- createDataPartition(y = comp_feats$subtype, times = 1,
    p = 0.75, list = FALSE)

trainset <- comp_feats[partition, c(target_features, "subtype")]
testset <- comp_feats[-partition, c(target_features, "subtype")]
testset <- testset[testset$superv_subunit_type %in% unique(trainset$superv_subunit_type),
    ]

trcontrol <- trainControl(method = "repeatedcv", number = 100, repeats = 30,
    savePredictions = TRUE, classProbs = TRUE)

# use parallelization
registerDoParallel(cores = 20)

pred_model <- train(subtype ~ ., data = trainset, method = "rf", trControl = trcontrol,
    metric = "Accuracy")

beepr::beep(2)

ggplot(pred_model) + theme_bw()

# save confusion matrix
conf_mat <- confusionMatrix(predict(pred_model, testset), testset$subtype)

conf_df <- as.data.frame(conf_mat$table)

conf_df$total <- sapply(conf_df$Reference, function(x) sum(testset$subtype ==
    x))

conf_df$proportion <- conf_df$Freq/conf_df$total


rf_model_results_step_2 <- list(pred_model_bb = pred_model, conf_mat_bb = conf_mat,
    confusion_df_bb = conf_df, testset_bb = testset)

saveRDS(rf_model_results_step_2, "./data/processed/random_forest_model_results_2_step_classification.RDS")

4.3.2.1 Checking performance on test data

Code

rf_model_results_step_2 <-
    readRDS("./data/processed/random_forest_model_results_2_step_classification.RDS")

# print confusion matrix results
#rf_model_results_step_2$conf_mat_bb

confusion_df <- rf_model_results_step_2$confusion_df_bb

confusion_df$Prediction <- as.character(confusion_df$Prediction)
confusion_df$Prediction <- gsub("step.", "step-", confusion_df$Prediction)
confusion_df$Prediction <- gsub("complex.", "complex-", confusion_df$Prediction)
confusion_df$Prediction <- gsub(".", " ", confusion_df$Prediction, fixed = TRUE)

confusion_df$Reference <- as.character(confusion_df$Reference)
confusion_df$Reference <- gsub("step.", "step-", confusion_df$Reference)
confusion_df$Reference <- gsub("complex.", "complex-", confusion_df$Reference)
confusion_df$Reference <- gsub(".", " ", confusion_df$Reference, fixed = TRUE)

ggplot(confusion_df, aes(x = Reference, y = Prediction, fill = proportion)) +
    geom_tile() +
    coord_equal() +
      scale_fill_gradientn(colors = gradient_colors) +  # Use custom gradient  
    geom_text(aes(label = round(proportion, 2)), color = "black", size = 3) +
    theme_classic() +
    labs(x = "Observed", y = "Predicted", title = paste("Overall accuracy:", round(max(rf_model_results_step_2$pred_model_bb$results$Accuracy), 2))) +
    theme(axis.text.x = element_text(
        color = "black",
        size = 11,
        angle = 30,
        vjust = 0.8,
        hjust = 0.8
    ))

4.3.2.1.1 Feature importance

Which predictor features contribute the most to the classification

Code

# feature importance
imp <- varImp(rf_model_results_step_2$pred_model_bb)
dat <- ggplot(imp, top = 10) + theme_classic()

ggplot(data = dat$data, aes(x = Importance, y = Feature, fill = after_stat(x))) +
    scale_fill_viridis_c(name = "Importance", option = "G", guide = "none",
        begin = 0.4) + geom_bar(stat = "identity") + theme_classic()

Code

## XG boosting

trcontrol <- trainControl(method = "repeatedcv", number = 100, repeats = 30,
    savePredictions = TRUE, classProbs = TRUE)

# use parallelization
registerDoParallel(cores = 20)

xgb_model <- train(subtype ~ ., data = trainset, method = "xgbTree",
    trControl = trcontrol, metric = "Accuracy")


saveRDS(xbg_model, "./data/processed/xgb_model_results.RDS")

5 Compare manual vs supervised classification

Comparing features from both approaches helps to understand the putative effect on downstream analyses
Modulation index measures the cumulative change in frequency of the dominant frequency contour over the duration of the call

Code

# read comp_feats <-
# read.csv('./data/processed/call_level_features.csv')
# comp_feats$composed <- !is.na(comp_feats$gap.duration)
# comp_feats$gap.duration <-
# ifelse(is.na(comp_feats$gap.duration), 0,
# comp_feats$gap.duration)

# order levels by mean
agg <- aggregate(modulation.index ~ subtype, data = comp_feats, mean)

# comp_feats$subtype <- factor(comp_feats$subtype, levels =
# agg[order(agg$modulation.index), 'subtype'])

comp_feats2 <- comp_feats

comp_feats2 <- comp_feats2[comp_feats2$superv_subunit_type %in% unique(rf_model_results_step_2$pred_model_bb$trainingData$superv_subunit_type),
    ]

comp_feats2$subtype <- predict(rf_model_results_step_2$pred_model_bb,
    comp_feats2[, names(comp_feats2) != "subtype"])

comp_feats$method <- "manual"
comp_feats2$method <- "2 step supervised"

comp_pooled <- rbind(comp_feats, comp_feats2)

# ggplot(comp_feats, aes(x = modulation.index, y = subtype, fill
# = after_stat(x))) + geom_density_ridges_gradient(scale = 3,
# rel_min_height = 0.01) + scale_fill_viridis_c(name =
# 'Modulation', option = 'G', guide = 'none', begin = 0.4) +
# xlim(c(-100, 10000)) + theme_minimal() + labs(x = 'Modulation
# index', y = 'Density')

comp_pooled$subtype <- factor(comp_pooled$subtype, levels = agg[order(agg$modulation.index),
    "subtype"])

ggplot(comp_pooled, aes(y = modulation.index, x = subtype, fill = after_stat(x))) +
    geom_boxplot(outliers = FALSE) + scale_fill_viridis_c(name = "Modulation",
    option = "G", guide = "none", begin = 0.4) + theme_minimal() +
    facet_wrap(~method, nrow = 2) + labs(y = "Modulation index", x = "Subtype")

Takeaways

Good classification performance with 2 step supervised classification

Session information

─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.4.2 (2024-10-31)
 os       Ubuntu 22.04.5 LTS
 system   x86_64, linux-gnu
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/Costa_Rica
 date     2025-03-21
 pandoc   3.1.1 @ /usr/lib/rstudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package       * version    date (UTC) lib source
 audio           0.1-11     2023-08-18 [1] CRAN (R 4.4.1)
 backports       1.5.0      2024-05-23 [1] CRAN (R 4.4.1)
 beepr         * 2.0        2024-07-06 [1] CRAN (R 4.4.1)
 bitops          1.0-9      2024-10-03 [1] CRAN (R 4.4.1)
 brio            1.1.5      2024-04-24 [1] CRAN (R 4.4.1)
 cachem          1.1.0      2024-05-16 [1] CRAN (R 4.4.1)
 caret         * 6.0-94     2023-03-21 [1] CRAN (R 4.4.1)
 checkmate       2.3.2      2024-07-29 [1] CRAN (R 4.4.1)
 class           7.3-22     2023-05-03 [1] CRAN (R 4.4.1)
 classInt        0.4-11     2025-01-08 [1] CRAN (R 4.4.2)
 cli             3.6.4      2025-02-13 [1] CRAN (R 4.4.2)
 codetools       0.2-20     2024-03-31 [1] CRAN (R 4.4.1)
 colorspace      2.1-1      2024-07-26 [1] CRAN (R 4.4.1)
 crayon          1.5.3      2024-06-20 [1] CRAN (R 4.4.1)
 curl            6.2.1      2025-02-19 [1] CRAN (R 4.4.2)
 data.table      1.15.4     2024-03-30 [1] CRAN (R 4.4.1)
 DBI             1.2.3      2024-06-02 [1] CRAN (R 4.4.1)
 devtools        2.4.5      2022-10-11 [1] CRAN (R 4.4.1)
 DiagrammeR    * 1.0.11     2024-02-02 [1] CRAN (R 4.4.1)
 digest          0.6.37     2024-08-19 [1] CRAN (R 4.4.1)
 doParallel    * 1.0.17     2022-02-07 [1] CRAN (R 4.4.1)
 dplyr           1.1.4      2023-11-17 [1] CRAN (R 4.4.1)
 dtw             1.23-1     2022-09-19 [1] CRAN (R 4.4.1)
 e1071           1.7-16     2024-09-16 [1] CRAN (R 4.4.1)
 ellipsis        0.3.2      2021-04-29 [1] CRAN (R 4.4.1)
 evaluate        1.0.3      2025-01-10 [1] CRAN (R 4.4.2)
 farver          2.1.2      2024-05-13 [1] CRAN (R 4.4.1)
 fastmap         1.2.0      2024-05-15 [1] CRAN (R 4.4.1)
 fftw            1.0-9      2024-09-20 [1] CRAN (R 4.4.1)
 foreach       * 1.5.2      2022-02-02 [1] CRAN (R 4.4.1)
 formatR       * 1.14       2023-01-17 [1] CRAN (R 4.4.1)
 fs              1.6.5      2024-10-30 [1] CRAN (R 4.4.1)
 future          1.34.0     2024-07-29 [1] CRAN (R 4.4.1)
 future.apply    1.11.2     2024-03-28 [1] CRAN (R 4.4.1)
 generics        0.1.3      2022-07-05 [1] CRAN (R 4.4.1)
 ggplot2       * 3.5.1      2024-04-23 [1] CRAN (R 4.4.1)
 ggridges      * 0.5.6      2024-01-23 [1] CRAN (R 4.4.1)
 globals         0.16.3     2024-03-08 [1] CRAN (R 4.4.1)
 glue            1.8.0      2024-09-30 [1] CRAN (R 4.4.1)
 gower           1.0.1      2022-12-22 [1] CRAN (R 4.4.1)
 gridExtra       2.3        2017-09-09 [1] CRAN (R 4.4.1)
 gtable          0.3.6      2024-10-25 [1] CRAN (R 4.4.1)
 hardhat         1.4.0      2024-06-02 [1] CRAN (R 4.4.1)
 htmltools       0.5.8.1    2024-04-04 [1] CRAN (R 4.4.1)
 htmlwidgets     1.6.4      2023-12-06 [1] CRAN (R 4.4.1)
 httpuv          1.6.15     2024-03-26 [1] CRAN (R 4.4.1)
 httr            1.4.7      2023-08-15 [1] CRAN (R 4.4.1)
 igraph          2.1.4      2025-01-23 [1] CRAN (R 4.4.2)
 ipred           0.9-14     2023-03-09 [1] CRAN (R 4.4.1)
 iterators     * 1.0.14     2022-02-05 [1] CRAN (R 4.4.1)
 jsonlite        1.9.0      2025-02-19 [1] CRAN (R 4.4.2)
 KernSmooth      2.23-24    2024-05-17 [1] CRAN (R 4.4.1)
 knitr         * 1.49       2024-11-08 [1] CRAN (R 4.4.1)
 labeling        0.4.3      2023-08-29 [1] CRAN (R 4.4.1)
 later           1.3.2      2023-12-06 [1] CRAN (R 4.4.1)
 lattice       * 0.22-6     2024-03-20 [1] CRAN (R 4.4.1)
 lava            1.8.0      2024-03-05 [1] CRAN (R 4.4.1)
 lifecycle       1.0.4      2023-11-07 [1] CRAN (R 4.4.1)
 listenv         0.9.1      2024-01-29 [1] CRAN (R 4.4.1)
 lubridate       1.9.3      2023-09-27 [1] CRAN (R 4.4.1)
 magrittr        2.0.3      2022-03-30 [1] CRAN (R 4.4.1)
 MASS            7.3-61     2024-06-13 [1] CRAN (R 4.4.1)
 Matrix          1.7-0      2024-04-26 [1] CRAN (R 4.4.1)
 memoise         2.0.1      2021-11-26 [1] CRAN (R 4.4.1)
 mime            0.12       2021-09-28 [1] CRAN (R 4.4.1)
 miniUI          0.1.1.1    2018-05-18 [1] CRAN (R 4.4.1)
 ModelMetrics    1.2.2.2    2020-03-17 [1] CRAN (R 4.4.1)
 munsell         0.5.1      2024-04-01 [1] CRAN (R 4.4.1)
 NatureSounds  * 1.0.5      2025-01-17 [1] CRAN (R 4.4.2)
 nlme            3.1-165    2024-06-06 [1] CRAN (R 4.4.1)
 nnet            7.3-19     2023-05-03 [1] CRAN (R 4.4.1)
 ohun          * 1.0.2      2025-02-17 [1] local
 packrat         0.9.2      2023-09-05 [1] CRAN (R 4.4.1)
 parallelly      1.38.0     2024-07-27 [1] CRAN (R 4.4.1)
 pbapply       * 1.7-2      2023-06-27 [1] CRAN (R 4.4.1)
 pillar          1.10.1     2025-01-07 [1] CRAN (R 4.4.2)
 pkgbuild        1.4.6      2025-01-16 [1] CRAN (R 4.4.2)
 pkgconfig       2.0.3      2019-09-22 [1] CRAN (R 4.4.1)
 pkgload         1.4.0      2024-06-28 [1] CRAN (R 4.4.1)
 plyr            1.8.9      2023-10-02 [1] CRAN (R 4.4.1)
 pROC            1.18.5     2023-11-01 [1] CRAN (R 4.4.1)
 prodlim         2024.06.25 2024-06-24 [1] CRAN (R 4.4.1)
 profvis         0.3.8      2023-05-02 [1] CRAN (R 4.4.1)
 promises        1.3.0      2024-04-05 [1] CRAN (R 4.4.1)
 proxy           0.4-27     2022-06-09 [1] CRAN (R 4.4.1)
 purrr           1.0.4      2025-02-05 [1] CRAN (R 4.4.2)
 R6              2.6.1      2025-02-15 [1] CRAN (R 4.4.2)
 randomForest  * 4.7-1.1    2022-05-23 [1] CRAN (R 4.4.1)
 RColorBrewer    1.1-3      2022-04-03 [1] CRAN (R 4.4.1)
 Rcpp            1.0.14     2025-01-12 [1] CRAN (R 4.4.2)
 RCurl           1.98-1.16  2024-07-11 [1] CRAN (R 4.4.1)
 recipes         1.0.10     2024-02-18 [1] CRAN (R 4.4.1)
 remotes         2.5.0      2024-03-17 [1] CRAN (R 4.4.1)
 reshape2        1.4.4      2020-04-09 [1] CRAN (R 4.4.1)
 rjson           0.2.23     2024-09-16 [1] CRAN (R 4.4.1)
 rlang           1.1.5      2025-01-17 [1] CRAN (R 4.4.2)
 rmarkdown       2.29       2024-11-04 [1] CRAN (R 4.4.2)
 rpart           4.1.23     2023-12-05 [1] CRAN (R 4.4.1)
 Rraven        * 1.0.14     2017-11-17 [1] CRAN (R 4.4.1)
 rstudioapi      0.16.0     2024-03-24 [1] CRAN (R 4.4.1)
 scales          1.3.0      2023-11-28 [1] CRAN (R 4.4.1)
 seewave       * 2.2.3      2023-10-19 [1] CRAN (R 4.4.1)
 sessioninfo     1.2.2      2021-12-06 [1] CRAN (R 4.4.1)
 sf              1.0-19     2024-11-05 [1] CRAN (R 4.4.1)
 shiny           1.9.1      2024-08-01 [1] CRAN (R 4.4.1)
 signal          1.8-1      2024-06-26 [1] CRAN (R 4.4.1)
 sketchy         1.0.5      2025-01-16 [1] CRANs (R 4.4.2)
 stringi         1.8.4      2024-05-06 [1] CRAN (R 4.4.1)
 stringr         1.5.1      2023-11-14 [1] CRAN (R 4.4.1)
 survival        3.7-0      2024-06-05 [1] CRAN (R 4.4.1)
 testthat        3.2.3      2025-01-13 [1] CRAN (R 4.4.2)
 tibble          3.2.1      2023-03-20 [1] CRAN (R 4.4.1)
 tidyselect      1.2.1      2024-03-11 [1] CRAN (R 4.4.1)
 timechange      0.3.0      2024-01-18 [1] CRAN (R 4.4.1)
 timeDate        4032.109   2023-12-14 [1] CRAN (R 4.4.1)
 tuneR         * 1.4.7      2024-04-17 [1] CRAN (R 4.4.1)
 units           0.8-5      2023-11-28 [1] CRAN (R 4.4.1)
 urlchecker      1.0.1      2021-11-30 [1] CRAN (R 4.4.1)
 usethis         2.2.3      2024-02-19 [1] CRAN (R 4.4.1)
 vctrs           0.6.5      2023-12-01 [1] CRAN (R 4.4.1)
 viridis       * 0.6.5      2024-01-29 [1] CRAN (R 4.4.1)
 viridisLite   * 0.4.2      2023-05-02 [1] CRAN (R 4.4.1)
 visNetwork      2.1.2      2022-09-29 [1] CRAN (R 4.4.1)
 warbleR       * 1.1.34     2025-01-17 [1] CRAN (R 4.4.2)
 withr           3.0.2      2024-10-28 [1] CRAN (R 4.4.1)
 xaringanExtra   0.8.0      2024-05-19 [1] CRAN (R 4.4.1)
 xfun            0.51       2025-02-19 [1] CRAN (R 4.4.2)
 xtable          1.8-4      2019-04-21 [1] CRAN (R 4.4.1)
 yaml            2.3.10     2024-07-26 [1] CRAN (R 4.4.1)

 [1] /home/m/R/x86_64-pc-linux-gnu-library/4.4
 [2] /usr/local/lib/R/site-library
 [3] /usr/lib/R/site-library
 [4] /usr/lib/R/library

──────────────────────────────────────────────────────────────────────────────