Classifying USV subtypes

Rat vocalization and alcohol

Author

Marcelo Araya-Salas

Published

November 17, 2023

Source code and data found at https://github.com/maRce10/rat_vocalization_alcohol

1 Purpose

Classify subtypes based on structure
Determine multi-element subtypes based on composing single subtypes

Load packages

Code

# knitr is require for creating html/pdf/word reports formatR is
# used for soft-wrapping code

# install/ load packages
sketchy::load_packages(packages = c("knitr", "formatR", "viridis",
    "warbleR", github = "maRce10/Rraven", github = "maRce10/warbleR",
    github = "maRce10/ohun", "caret", "randomForest", "DiagrammeR"))

warbleR_options(wav.path = "/media/m/Expansion/audios_cin_alcohol_2023")

Read data

Code

anns <- imp_raven("./data/processed/annotations", warbler.format = TRUE,
    all.data = TRUE)

anns$subc <- anns$subtipos_completo <- NULL

anns <- anns[!is.na(anns$subtipos), ]

anns$subtipos[anns$subtipos %in% c(5, 44)] <- 4

anns <- anns[order(anns$sound.files, anns$start), ]

nrow(anns)

[1] 17147

Code

anns <- anns[anns$start < anns$end, ]

anns$subtype.label <- NA

anns$subtype.label[anns$subtipos == 22] <- "22 kHz"
anns$subtype.label[anns$subtipos == 1] <- "flat"
anns$subtype.label[anns$subtipos == 4] <- "trill"
anns$subtype.label[anns$subtipos == 6] <- "complex"

2 Describe annotations (single element subtypes)

Proportion of calls per subtype
This is the annotation data without any modification

Code

count_types <- round(table(anns$subtype.label)/nrow(anns), 2)

count_types <- as.data.frame(count_types)

count_types$Count <- as.vector(table(anns$subtype.label))
names(count_types) <- c("Subtype", "Proportion", "Count")

count_types

Subtype	Proportion	Count
22 kHz	0.00	13
complex	0.14	2328
flat	0.04	681
trill	0.82	14123

3 Data analysis

flowchart LR
  A[Annotations] --> C(Measure\nsingle-note\nsubtype\nstructure)
  B[recordings] --> C
  C --> D(Build\ncomposed\nsubtypes)
  C --> E(Classify\nsimple\nsubtypes) 
  D --> F(Classify\ncomposed\nsubtypes)
  F --> G(Evaluate classification)
  E --> G
  
style A fill:#44015466
style B fill:#3E4A894D
style C fill:#26828E4D
style D fill:#31688E4D
style E fill:#6DCD594D
style G fill:#FDE7254D

4 Measure acoustic structure

Acoustic structure measured as spectrographic/spectrum/envelope features as well as statistical descriptors of Mel frequency cepstral coefficients

Code

cs <- check_sels(anns, parallel = 1)

sp <- spectro_analysis(anns, parallel = 5, ovlp = 70)
mfccs <- mfcc_stats(anns, parallel = 5, ovlp = 70)

anns_sp <- merge(anns, sp)

anns_sp <- merge(anns_sp, mfccs)
colnames(anns_sp)

write.csv(anns_sp, "./data/processed/annotations_and_spectrogram_and_mel_features.csv",
    row.names = FALSE)

5 Multi-element composed subtypes

Define as those in which single elements are within a 5 ms range

Code

anns <- read.csv("./data/processed/annotations_and_spectrogram_and_mel_features.csv")

anns$true.end <- anns$end

anns$end <- anns$end + 0.005

anns <- overlapping_sels(anns)

anns$end <- anns$true.end

anns$true.end <- NULL

# add song label to elements
anns$composed.subtype <- anns$ovlp.sels

for (i in seq_len(nrow(anns))) if (is.na(anns$composed.subtype[i])) anns$composed.subtype[i] <- max(anns$composed.subtype,
    na.rm = TRUE) + 1

write.csv(anns, "./data/processed/annotations_groups_spectrogram_and_mel_features.csv",
    row.names = FALSE)

5.1 Rules to determine composed subtypes

If any of the composing subtypes is complex then composed complex
If it contains 3 or more single subtypes then composed complex
If ‘flat-trill’ and ‘trill-flat’ then step flat trill
If it has at least two adjacent trills (‘trill-trill’) and the frequency change is equal or higher than 5 kHz then step trill
If it has at least two adjacent flats (‘flat-flat’) and the frequency change is equal or higher than 5 kHz then step flat

Code

anns <- read.csv("./data/processed/annotations_groups_spectrogram_and_mel_features.csv")


composed_anns_list <- lapply(unique(na.omit(anns$ovlp.sels)), function(x) {

    data.frame(sound.files = anns$sound.files[anns$ovlp.sels == x &
        !is.na(anns$ovlp.sels)][1], selec = x, start = min(anns$start[anns$ovlp.sels ==
        x & !is.na(anns$ovlp.sels)]), end = max(anns$end[anns$ovlp.sels ==
        x & !is.na(anns$ovlp.sels)]), bottom.freq = min(anns$bottom.freq[anns$ovlp.sels ==
        x & !is.na(anns$ovlp.sels)]), top.freq = max(anns$top.freq[anns$ovlp.sels ==
        x & !is.na(anns$ovlp.sels)]), subtypes = paste(anns$subtipos[anns$ovlp.sels ==
        x & !is.na(anns$ovlp.sels)], collapse = "-"), composed.subtype = x,
        count.subtypes = sum(anns$ovlp.sels == x & !is.na(anns$ovlp.sels)))
})

composed_anns <- do.call(rbind, composed_anns_list)

unique(composed_anns$subtypes)


# subtypes <- c(1 = 'flat', 4 = 'trill', 6 = 'complejas', 22 =
# '22 kHz')

# add labels
composed_anns$composed.subtype.label <- NA

# of it contain complex subtype then is complex
composed_anns$composed.subtype.label <- ifelse(grepl("6", composed_anns$subtypes),
    "composed.complex", composed_anns$composed.subtype.label)

# of it contain 3 or more elements it is complex
composed_anns$composed.subtype.label <- ifelse(composed_anns$count.subtypes >=
    3, "composed.complex", composed_anns$composed.subtype.label)

# 1-4 and 4-1 is a 3 (trill flat steps)
composed_anns$composed.subtype.label <- ifelse(grepl("^1-4$|^4-1$",
    composed_anns$subtypes), "step.flat.trill", composed_anns$composed.subtype.label)

# if it has at least two adjacent 4 ('4-4' two trills) and the
# frequency change is equal or higher than 5 kHz it is a step
composed_anns$composed.subtype.label <- sapply(seq_len(nrow(composed_anns)),
    function(x) {

        if (sum(grepl("4-4", composed_anns$subtypes[x])) & composed_anns$count.subtypes[x] ==
            2) {

            peakfs <- anns$meanpeakf[anns$composed.subtype == composed_anns$composed.subtype[x]]

            diff_freq <- abs(peakfs[1] - peakfs[2])

            out <- if (diff_freq >= 5)
                "step.trill" else "composed.trill"
        } else out <- composed_anns$composed.subtype.label[x]

        return(out)
    })

# if it has at least two adjacent 1 ('1-1' two flats) and the
# frequency change is equal or higher than 5 kHz it is a step
composed_anns$composed.subtype.label <- sapply(seq_len(nrow(composed_anns)),
    function(x) {

        if (sum(grepl("1-1", composed_anns$subtypes[x])) & composed_anns$count.subtypes[x] ==
            2) {

            peakfs <- anns$meanpeakf[anns$composed.subtype == composed_anns$composed.subtype[x]]

            diff_freq <- abs(peakfs[1] - peakfs[2])

            out <- if (diff_freq >= 5)
                "step.flat" else "composed.flat"
        } else out <- composed_anns$composed.subtype.label[x]

        return(out)
    })

unique(composed_anns$subtypes[composed_anns$count.subtypes == 2 &
    is.na(composed_anns$composed.subtype.label)])

round(table(composed_anns$composed.subtype.label)/nrow(composed_anns),
    2)

composed_anns$subtype.label <- ifelse(grepl("step", composed_anns$composed.subtype.label),
    "step", composed_anns$composed.subtype.label)

composed_anns$subtype.label <- ifelse(grepl("complex", composed_anns$composed.subtype.label),
    "complex", composed_anns$composed.subtype.label)


composed_anns$subtype.label <- ifelse(grepl("composed.trill", composed_anns$composed.subtype.label),
    "trill", composed_anns$subtype.label)

write.csv(composed_anns, "./data/processed/annotations_composed_subtypes_5ms.csv",
    row.names = FALSE)

anns$type <- "single"
anns$composed.subtype.label <- anns$subtype.label

composed_anns$type <- "composed"

cols <- intersect(colnames(anns), colnames(composed_anns))

combined_anns <- rbind(anns[is.na(anns$ovlp.sels), cols], composed_anns[,
    cols])

write.csv(combined_anns, "./data/processed/annotations_simple_and_composed_subtypes_5ms.csv",
    row.names = FALSE)

6 Summary

Proportion of calls per subtypes for composed (multi-element) subtypes

Code

composed_anns <- read.csv("./data/processed/annotations_composed_subtypes_5ms.csv")

count_types <- round(table(composed_anns$composed.subtype.label)/nrow(composed_anns),
    2)

count_types <- as.data.frame(count_types)

count_types$Count <- as.vector(table(composed_anns$composed.subtype.label))
names(count_types) <- c("Subtype", "Proportion", "Count")

count_types

Subtype	Proportion	Count
composed.complex	0.35	1156
composed.trill	0.01	28
step.flat	0.00	13
step.flat.trill	0.04	120
step.trill	0.60	1951

Combination of single element calls on each composed subtype

Code

subtypes <- c(`1` = "flat", `4` = "trill", `6` = "complex", `22` = "22 kHz")

composed_anns$single.type.combs <- composed_anns$subtypes

for (i in seq_along(subtypes)) composed_anns$single.type.combs <- gsub(names(subtypes)[i],
    subtypes[i], composed_anns$single.type.combs)

count_types <- round(table(composed_anns$single.type.combs)/nrow(composed_anns),
    2)

count_types <- as.data.frame(count_types)

count_types$Count <- as.vector(table(composed_anns$single.type.combs))

count_types$comp <- sapply(count_types$Var1, function(x) composed_anns$composed.subtype.label[composed_anns$single.type.combs ==
    x][1])

names(count_types) <- c("Single subtype combination", "Proportion",
    "Count", "Composed subtype")

count_types <- count_types[order(count_types$`Composed subtype`),
    ]

count_types[, c(4, 1:3)]

	Composed subtype	Single subtype combination	Proportion	Count
1	composed.complex	complex-complex	0.01	23
2	composed.complex	complex-complex-complex	0.00	3
3	composed.complex	complex-flat	0.00	8
4	composed.complex	complex-trill	0.03	91
5	composed.complex	complex-trill-complex	0.00	1
6	composed.complex	complex-trill-trill	0.00	8
7	composed.complex	flat-complex	0.00	6
9	composed.complex	flat-flat-trill	0.00	3
10	composed.complex	flat-flat-trill-flat	0.00	1
11	composed.complex	flat-flat-trill-trill	0.00	1
13	composed.complex	flat-trill-flat	0.00	4
14	composed.complex	flat-trill-flat-flat	0.00	1
15	composed.complex	flat-trill-flat-trill	0.00	1
16	composed.complex	flat-trill-trill	0.00	4
17	composed.complex	flat-trill-trill-trill	0.00	4
18	composed.complex	trill-complex	0.23	744
19	composed.complex	trill-complex-complex	0.00	4
20	composed.complex	trill-complex-trill	0.00	16
21	composed.complex	trill-complex-trill-complex	0.00	1
22	composed.complex	trill-complex-trill-trill	0.00	4
24	composed.complex	trill-flat-complex	0.00	2
25	composed.complex	trill-flat-flat	0.00	3
26	composed.complex	trill-flat-flat-trill	0.00	5
27	composed.complex	trill-flat-trill	0.01	17
28	composed.complex	trill-flat-trill-complex	0.00	2
29	composed.complex	trill-flat-trill-trill	0.00	9
30	composed.complex	trill-flat-trill-trill-complex	0.00	1
32	composed.complex	trill-trill-complex	0.01	29
33	composed.complex	trill-trill-complex-complex	0.00	1
34	composed.complex	trill-trill-complex-trill	0.00	1
35	composed.complex	trill-trill-flat	0.00	4
36	composed.complex	trill-trill-flat-trill	0.00	5
37	composed.complex	trill-trill-trill	0.04	134
38	composed.complex	trill-trill-trill-complex-trill	0.00	1
39	composed.complex	trill-trill-trill-trill	0.00	13
40	composed.complex	trill-trill-trill-trill-trill-trill	0.00	1
8	step.flat	flat-flat	0.00	13
12	step.flat.trill	flat-trill	0.02	56
23	step.flat.trill	trill-flat	0.02	64
31	step.trill	trill-trill	0.61	1979

Proportion of calls per subtypes for both single and composed (multi-element) subtypes

Code

combined_anns <- read.csv("./data/processed/annotations_simple_and_composed_subtypes_5ms.csv")

count_types <- round(table(combined_anns$subtype.label)/nrow(combined_anns),
    2)

count_types <- as.data.frame(count_types)

count_types$Count <- as.vector(table(combined_anns$subtype.label))
names(count_types) <- c("Subtype", "Proportion", "Count")

count_types$type <- sapply(count_types$Subtype, function(x) unique(combined_anns$type[combined_anns$subtype.label ==
    x]))

count_types

Subtype	Proportion	Count	type
22 kHz	0.00	13	single
complex	0.18	2502	single , composed
flat	0.03	433	single
step.flat	0.00	13	composed
step.flat.trill	0.01	120	composed
step.trill	0.14	1951	composed
trill	0.63	8505	single , composed

Proportion of calls per subtypes for both single and composed by type (single vs composed)

Code

combined_anns <- read.csv("./data/processed/annotations_simple_and_composed_subtypes_5ms.csv")

count_types <- round(table(combined_anns$composed.subtype.label)/nrow(combined_anns),
    2)

count_types <- as.data.frame(count_types)

count_types$Count <- as.vector(table(combined_anns$composed.subtype.label))
names(count_types) <- c("Subtype", "Proportion", "Count")

count_types$type <- sapply(count_types$Subtype, function(x) unique(combined_anns$type[combined_anns$composed.subtype.label ==
    x]))

count_types

Subtype	Proportion	Count	type
22 kHz	0.00	13	single
complex	0.10	1346	single
composed.complex	0.09	1156	composed
composed.trill	0.00	28	composed
flat	0.03	433	single
step.flat	0.00	13	composed
step.flat.trill	0.01	120	composed
step.trill	0.14	1951	composed
trill	0.63	8477	single

7 Create spectrograms

Code

types <- unique(combined_anns$composed.subtype.label)

for (i in types[7:1]) {
    print(i)
    X <- combined_anns[combined_anns$composed.subtype.label == i,
        ]

    # subset
    set.seed(123)
    if (nrow(X) > 64)
        X <- X[sample(seq_len(nrow(X)), 64), ]

    nrw <- ncl <- 2
    n <- nrow(X)
    if (n > 4) {
        nrw <- ncl <- ceiling(sqrt(n))
        if (((nrw - 1) * ncl) >= n)
            nrw <- nrw - 1
    }

    # print catalog
    catalog(X = X, nrow = nrw, ncol = ncl, same.time.scale = T, mar = 0.001,
        res = 100, pb = FALSE, spec.mar = 0.001, max.group.cols = 5,
        title = i, ovlp = 90, wl = 512, width = 15, height = 9, hatching = 0,
        cex = 1.3, fast.spec = FALSE, pal = viridis, img.prefix = i,
        rm.axes = TRUE, flim = c(min(X$bottom.freq), max(X$top.freq)) +
            c(-1, 1), collevels = seq(-120, 0, 5), box = FALSE, lab.mar = 1e-05)

    move_images(from = .Options$warbleR$path, to = "./scripts", overwrite = TRUE,
        cut = TRUE, pb = FALSE)
}

7.1 Single element subtypes

22 kHz

Flats

Trills Complex

7.2 Composed subtypes

Step flat Step trill

Step flat

Composed complex

8 Random forest classification

Models trained for 100 iterations on 75% of the data and tested on the remaining 25%.

8.1 On single element subtypes

Code

elm_anns <- read.csv("./data/processed/annotations_groups_spectrogram_and_mel_features.csv")

elm_anns <- elm_anns[elm_anns$subtype.label != "22", ]


# remove collinear
target_features <- names(elm_anns)[!names(elm_anns) %in%  c("sound.files", "selec", "View", "Channel", "start", "end", "bottom.freq", "top.freq", "Delta.Time..s.", "Begin.File", "subtipos", "selec.file", "true.end", "ovlp.sels", "composed.subtype", "subtype.label", names(elm_anns)[sapply(elm_anns, anyNA)])]

cormat <- cor(elm_anns[, target_features], use = "pairwise.complete.obs")
hc <- findCorrelation(cormat, cutoff= 0.9) # putt any value as a "cutoff" 
hc <- sort(hc)

target_features <- target_features[hc]

# Create data subsets
partition <- createDataPartition(
    y = elm_anns$subtype.label,
    times = 1,
    p = 0.75,
    list = FALSE
)



elm_anns$subtype.label <- as.factor(elm_anns$subtype.label)

trainset <- elm_anns[partition, c(target_features, "subtype.label")]
testset <- elm_anns[-partition, c(target_features, "subtype.label")]

trcontrol <-
    trainControl(
        method = "repeatedcv",
        number = 100,
        repeats = 100,
        savePredictions = TRUE,
        sampling = "down",
        classProbs = TRUE,
        returnResamp = "all"
    )

pred_model <- train(
    subtype.label ~ .,
    data = trainset,
    method = "rf",
    trControl = trcontrol,
    metric = "Accuracy",
    preProcess = "scale"
)

ggplot(pred_model) + theme_bw()

# save confusion matrix
conf_mat <-
    confusionMatrix(predict(pred_model, testset), testset$subtype.label)

conf_df <- as.data.frame(conf_mat$table)

conf_df$total <-
    sapply(conf_df$Reference, function(x)
        sum(testset$subtype.label ==
                x))

conf_df$proportion <- conf_df$Freq / conf_df$total

# fit model on complete data set
best_rf_model <- pred_model$finalModel

# all_rf_model <- randomForest(
#   subtype.label ~ .,
#   data = elm_anns[, c(target_features, "subtype.label")],  # Your entire dataset
#   proximity = TRUE,  # Include proximity matrix
#   ntree = best_rf_model$ntree,  # Number of trees
#   mtry = best_rf_model$mtry,    # Number of variables tried for splitting at each node
#   nodesize = best_rf_model$nodesize,  # Minimum size of terminal nodes
#   maxnodes = best_rf_model$maxnodes  # Maximum number of terminal nodes
# )

rf_model_results <-
    list(
        pred_model_bb = pred_model,
        conf_mat_bb = conf_mat,
        confusion_df_bb = conf_df,
        testset_bb = testset
        #all_rf_model = all_rf_model
    )


saveRDS(
    rf_model_results,
    "./data/processed/random_forest_model_results.RDS"
)

8.1.1 Checking performance on test data

Code

rf_model_results <- readRDS("./data/processed/random_forest_model_results.RDS")


# print confusion matrix results
rf_model_results$conf_mat_bb

Confusion Matrix and Statistics

          Reference
Prediction complex flat trill
   complex     550    0   357
   flat          2  138   604
   trill        30   32  2569

Overall Statistics
                                        
               Accuracy : 0.761         
                 95% CI : (0.748, 0.773)
    No Information Rate : 0.824         
    P-Value [Acc > NIR] : 1             
                                        
                  Kappa : 0.477         
                                        
 Mcnemar's Test P-Value : <2e-16        

Statistics by Class:

                     Class: complex Class: flat Class: trill
Sensitivity                   0.945      0.8118        0.728
Specificity                   0.904      0.8526        0.918
Pos Pred Value                0.606      0.1855        0.976
Neg Pred Value                0.991      0.9910        0.418
Prevalence                    0.136      0.0397        0.824
Detection Rate                0.128      0.0322        0.600
Detection Prevalence          0.212      0.1738        0.614
Balanced Accuracy             0.924      0.8322        0.823

Code

confusion_df <- rf_model_results$confusion_df_bb

ggplot(confusion_df, aes(x = Reference, y = Prediction, fill = proportion)) +
    geom_tile() + coord_equal() + scale_fill_distiller(palette = "Greens",
    direction = 1) + geom_text(aes(label = round(proportion, 2)),
    color = "black", size = 3) + theme_classic() + labs(x = "Observed",
    y = "Predicted") + theme(axis.text.x = element_text(color = "black",
    size = 11, angle = 30, vjust = 0.8, hjust = 0.8))

8.2 On composed (multi-element) subtypes

Summarize structure at the multi-element subtype level

Code

elm_anns <- read.csv("./data/processed/annotations_groups_spectrogram_and_mel_features.csv")

mean_colms <- c("duration", "meanfreq", "sd", "freq.median", "freq.Q25",
    "freq.Q75", "freq.IQR", "time.median", "time.Q25", "time.Q75",
    "time.IQR", "skew", "kurt", "sp.ent", "time.ent", "entropy", "sfm",
    "meandom", "mindom", "maxdom", "dfrange", "modindx", "startdom",
    "enddom", "dfslope", "meanpeakf", "min.cc1", "min.cc2", "min.cc3",
    "min.cc4", "min.cc5", "min.cc6", "min.cc7", "min.cc8", "min.cc9",
    "min.cc10", "min.cc11", "min.cc12", "min.cc13", "min.cc14", "min.cc15",
    "min.cc16", "min.cc17", "min.cc18", "min.cc19", "min.cc20", "min.cc21",
    "min.cc22", "min.cc23", "min.cc24", "min.cc25", "max.cc1", "max.cc2",
    "max.cc3", "max.cc4", "max.cc5", "max.cc6", "max.cc7", "max.cc8",
    "max.cc9", "max.cc10", "max.cc11", "max.cc12", "max.cc13", "max.cc14",
    "max.cc15", "max.cc16", "max.cc17", "max.cc18", "max.cc19", "max.cc20",
    "max.cc21", "max.cc22", "max.cc23", "max.cc24", "max.cc25", "median.cc1",
    "median.cc2", "median.cc3", "median.cc4", "median.cc5", "median.cc6",
    "median.cc7", "median.cc8", "median.cc9", "median.cc10", "median.cc11",
    "median.cc12", "median.cc13", "median.cc14", "median.cc15", "median.cc16",
    "median.cc17", "median.cc18", "median.cc19", "median.cc20", "median.cc21",
    "median.cc22", "median.cc23", "median.cc24", "median.cc25", "mean.cc1",
    "mean.cc2", "mean.cc3", "mean.cc4", "mean.cc5", "mean.cc6", "mean.cc7",
    "mean.cc8", "mean.cc9", "mean.cc10", "mean.cc11", "mean.cc12",
    "mean.cc13", "mean.cc14", "mean.cc15", "mean.cc16", "mean.cc17",
    "mean.cc18", "mean.cc19", "mean.cc20", "mean.cc21", "mean.cc22",
    "mean.cc23", "mean.cc24", "mean.cc25", "var.cc1", "var.cc2", "var.cc3",
    "var.cc4", "var.cc5", "var.cc6", "var.cc7", "var.cc8", "var.cc9",
    "var.cc10", "var.cc11", "var.cc12", "var.cc13", "var.cc14", "var.cc15",
    "var.cc16", "var.cc17", "var.cc18", "var.cc19", "var.cc20", "var.cc21",
    "var.cc22", "var.cc23", "var.cc24", "var.cc25", "skew.cc1", "skew.cc2",
    "skew.cc3", "skew.cc4", "skew.cc5", "skew.cc6", "skew.cc7", "skew.cc8",
    "skew.cc9", "skew.cc10", "skew.cc11", "skew.cc12", "skew.cc13",
    "skew.cc14", "skew.cc15", "skew.cc16", "skew.cc17", "skew.cc18",
    "skew.cc19", "skew.cc20", "skew.cc21", "skew.cc22", "skew.cc23",
    "skew.cc24", "skew.cc25", "kurt.cc1", "kurt.cc2", "kurt.cc3",
    "kurt.cc4", "kurt.cc5", "kurt.cc6", "kurt.cc7", "kurt.cc8", "kurt.cc9",
    "kurt.cc10", "kurt.cc11", "kurt.cc12", "kurt.cc13", "kurt.cc14",
    "kurt.cc15", "kurt.cc16", "kurt.cc17", "kurt.cc18", "kurt.cc19",
    "kurt.cc20", "kurt.cc21", "kurt.cc22", "kurt.cc23", "kurt.cc24",
    "kurt.cc25", "mean.d1.cc", "var.d1.cc", "mean.d2.cc", "var.d2.cc")

composed_param <- song_param(X = elm_anns, song_colm = "composed.subtype",
    mean_colm = mean_colms)

combined_anns <- read.csv("./data/processed/annotations_simple_and_composed_subtypes_5ms.csv")


composed_param$composed.subtype.label <- sapply(composed_param$composed.subtype,
    function(x) unique(combined_anns$composed.subtype.label[combined_anns$composed.subtype ==
        x]))


write.csv(composed_param, "./data/processed/accoustic_features_simple_and_composed_subtypes.csv",
    row.names = FALSE)

8.2.1 Train model

Code

composed_param <- read.csv("./data/processed/accoustic_features_simple_and_composed_subtypes.csv")

# keep only multi-element
composed_param <- composed_param[composed_param$num.elms > 1, ]

composed_param$song.rate[is.infinite(composed_param$song.rate)] <- mean(composed_param$song.rate[!is.infinite(composed_param$song.rate)])

# remove collinear
target_features <- names(composed_param)[!names(composed_param) %in%  c("sound.files", "selec", "View", "Channel", "start", "end", "bottom.freq", "top.freq", "Delta.Time..s.", "Begin.File", "subtipos", "selec.file", "true.end", "ovlp.sels", "composed.subtype", "subtype.label", "composed.subtype.label", names(composed_param)[sapply(composed_param, anyNA)])]

cormat <- cor(composed_param[, target_features], use = "pairwise.complete.obs")
hc <- findCorrelation(cormat, cutoff= 0.9) # putt any value as a "cutoff" 
hc <- sort(hc)

target_features <- target_features[hc]

# Create data subsets
partition <- createDataPartition(
    y = composed_param$composed.subtype.label,
    times = 1,
    p = 0.75,
    list = FALSE
)



composed_param$composed_param$composed.subtype.label <- as.factor(composed_param$composed.subtype.label)

trainset <- composed_param[partition, c(target_features, "composed.subtype.label")]
testset <- composed_param[-partition, c(target_features, "composed.subtype.label")]

trcontrol <-
    trainControl(
        method = "repeatedcv",
        number = 100,
        repeats = 100,
        savePredictions = TRUE,
        sampling = "down",
        classProbs = TRUE,
        returnResamp = "all"
    )

pred_model <- train(
    composed.subtype.label ~ .,
    data = trainset,
    method = "rf",
    trControl = trcontrol,
    metric = "Accuracy",
    preProcess = "scale"
)

# ggplot(pred_model) + theme_bw()

# save confusion matrix
conf_mat <-
    confusionMatrix(predict(pred_model, testset), as.factor(testset$composed.subtype.label))

conf_df <- as.data.frame(conf_mat$table)

conf_df$total <-
    sapply(conf_df$Reference, function(x)
        sum(testset$composed.subtype.label ==
                x))

conf_df$proportion <- conf_df$Freq / conf_df$total

# fit model on complete data set
best_rf_model <- pred_model$finalModel

# all_rf_model <- randomForest(
#   subtype.label ~ .,
#   data = elm_anns[, c(target_features, "subtype.label")],  # Your entire dataset
#   proximity = TRUE,  # Include proximity matrix
#   ntree = best_rf_model$ntree,  # Number of trees
#   mtry = best_rf_model$mtry,    # Number of variables tried for splitting at each node
#   nodesize = best_rf_model$nodesize,  # Minimum size of terminal nodes
#   maxnodes = best_rf_model$maxnodes  # Maximum number of terminal nodes
# )

rf_model_results <-
    list(
        pred_model_bb = pred_model,
        conf_mat_bb = conf_mat,
        confusion_df_bb = conf_df,
        testset_bb = testset
        #all_rf_model = all_rf_model
    )

saveRDS(
    rf_model_results,
    "./data/processed/random_forest_model_results_composed_subtypes.RDS"
)

8.2.2 Checking performance on test data

Code

rf_model_results <- readRDS("./data/processed/random_forest_model_results_composed_subtypes.RDS")


# print confusion matrix results
rf_model_results$conf_mat_bb

Confusion Matrix and Statistics

                  Reference
Prediction         composed.complex composed.trill step.flat step.flat.trill
  composed.complex              140              1         0               1
  composed.trill                 25              2         0               7
  step.flat                       4              0         2               5
  step.flat.trill                22              2         1              12
  step.trill                     98              2         0               5
                  Reference
Prediction         step.trill
  composed.complex         62
  composed.trill           90
  step.flat                14
  step.flat.trill          50
  step.trill              271

Overall Statistics
                                        
               Accuracy : 0.523         
                 95% CI : (0.488, 0.558)
    No Information Rate : 0.597         
    P-Value [Acc > NIR] : 1             
                                        
                  Kappa : 0.245         
                                        
 Mcnemar's Test P-Value : NA            

Statistics by Class:

                     Class: composed.complex Class: composed.trill
Sensitivity                            0.484               0.28571
Specificity                            0.879               0.84920
Pos Pred Value                         0.686               0.01613
Neg Pred Value                         0.757               0.99277
Prevalence                             0.354               0.00858
Detection Rate                         0.172               0.00245
Detection Prevalence                   0.250               0.15196
Balanced Accuracy                      0.681               0.56746
                     Class: step.flat Class: step.flat.trill Class: step.trill
Sensitivity                   0.66667                 0.4000             0.556
Specificity                   0.97171                 0.9046             0.681
Pos Pred Value                0.08000                 0.1379             0.721
Neg Pred Value                0.99874                 0.9753             0.509
Prevalence                    0.00368                 0.0368             0.597
Detection Rate                0.00245                 0.0147             0.332
Detection Prevalence          0.03064                 0.1066             0.461
Balanced Accuracy             0.81919                 0.6523             0.619

Code

confusion_df <- rf_model_results$confusion_df_bb

ggplot(confusion_df, aes(x = Reference, y = Prediction, fill = proportion)) +
    geom_tile() + coord_equal() + scale_fill_distiller(palette = "Greens",
    direction = 1) + geom_text(aes(label = round(proportion, 2)),
    color = "black", size = 3) + theme_classic() + labs(x = "Observed",
    y = "Predicted") + theme(axis.text.x = element_text(color = "black",
    size = 11, angle = 30, vjust = 0.8, hjust = 0.8))

8.3 On single and composed (multi-element) subtypes

8.3.1 Train model

Code

composed_param <- read.csv("./data/processed/accoustic_features_simple_and_composed_subtypes.csv")

composed_param <- composed_param[composed_param$composed.subtype.label != "22 kHz", ]

composed_param$song.rate[is.infinite(composed_param$song.rate)] <- mean(composed_param$song.rate[!is.infinite(composed_param$song.rate)])

# remove collinear
target_features <- names(composed_param)[!names(composed_param) %in%  c("sound.files", "selec", "View", "Channel", "start", "end", "bottom.freq", "top.freq", "Delta.Time..s.", "Begin.File", "subtipos", "selec.file", "true.end", "ovlp.sels", "composed.subtype", "subtype.label", "composed.subtype.label" , "song.rate", names(composed_param)[sapply(composed_param, anyNA)])]

cormat <- cor(composed_param[, target_features], use = "pairwise.complete.obs")
hc <- findCorrelation(cormat, cutoff= 0.9) # putt any value as a "cutoff" 
hc <- sort(hc)

target_features <- target_features[hc]

# Create data subsets
partition <- createDataPartition(
    y = composed_param$composed.subtype.label,
    times = 1,
    p = 0.75,
    list = FALSE
)



composed_param$composed_param$composed.subtype.label <- as.factor(composed_param$composed.subtype.label)

trainset <- composed_param[partition, c(target_features, "composed.subtype.label")]
testset <- composed_param[-partition, c(target_features, "composed.subtype.label")]

trcontrol <-
    trainControl(
        method = "repeatedcv",
        number = 100,
        repeats = 100,
        savePredictions = TRUE,
        sampling = "down",
        classProbs = TRUE,
        returnResamp = "all"
    )

pred_model <- train(
    composed.subtype.label ~ .,
    data = trainset,
    method = "rf",
    trControl = trcontrol,
    metric = "Accuracy",
    preProcess = "scale"
)

ggplot(pred_model) + theme_bw()

# save confusion matrix
conf_mat <-
    confusionMatrix(predict(pred_model, testset), as.factor(testset$composed.subtype.label))

conf_df <- as.data.frame(conf_mat$table)

conf_df$total <-
    sapply(conf_df$Reference, function(x)
        sum(testset$composed.subtype.label ==
                x))

conf_df$proportion <- conf_df$Freq / conf_df$total

# fit model on complete data set
best_rf_model <- pred_model$finalModel

# all_rf_model <- randomForest(
#   subtype.label ~ .,
#   data = elm_anns[, c(target_features, "subtype.label")],  # Your entire dataset
#   proximity = TRUE,  # Include proximity matrix
#   ntree = best_rf_model$ntree,  # Number of trees
#   mtry = best_rf_model$mtry,    # Number of variables tried for splitting at each node
#   nodesize = best_rf_model$nodesize,  # Minimum size of terminal nodes
#   maxnodes = best_rf_model$maxnodes  # Maximum number of terminal nodes
# )

rf_model_results <-
    list(
        pred_model_bb = pred_model,
        conf_mat_bb = conf_mat,
        confusion_df_bb = conf_df,
        testset_bb = testset
        #all_rf_model = all_rf_model
    )

saveRDS(
    rf_model_results,
    "./data/processed/random_forest_model_results_single_and_composed_subtypes.RDS"
)

8.3.2 Checking performance on test data

Code

rf_model_results <- readRDS("./data/processed/random_forest_model_results_single_and_composed_subtypes.RDS")

# print confusion matrix results
rf_model_results$conf_mat_bb

Confusion Matrix and Statistics

                  Reference
Prediction         complex composed.complex composed.trill flat step.flat
  complex              293               63              0    0         0
  composed.complex      16              121              2    0         0
  composed.trill         3               18              4   10         1
  flat                   0                0              0   46         0
  step.flat             16               10              1   39         2
  step.flat.trill        0               14              0    3         0
  step.trill             0               57              0    0         0
  trill                  8                6              0   10         0
                  Reference
Prediction         step.flat.trill step.trill trill
  complex                        0          5   120
  composed.complex               1        107   197
  composed.trill                 4         67   389
  flat                           0          3   385
  step.flat                     11         33   251
  step.flat.trill                7         49    97
  step.trill                     7        215    99
  trill                          0          8   581

Overall Statistics
                                        
               Accuracy : 0.376         
                 95% CI : (0.359, 0.392)
    No Information Rate : 0.627         
    P-Value [Acc > NIR] : 1             
                                        
                  Kappa : 0.256         
                                        
 Mcnemar's Test P-Value : NA            

Statistics by Class:

                     Class: complex Class: composed.complex
Sensitivity                  0.8720                  0.4187
Specificity                  0.9382                  0.8955
Pos Pred Value               0.6091                  0.2725
Neg Pred Value               0.9852                  0.9428
Prevalence                   0.0994                  0.0855
Detection Rate               0.0867                  0.0358
Detection Prevalence         0.1423                  0.1314
Balanced Accuracy            0.9051                  0.6571
                     Class: composed.trill Class: flat Class: step.flat
Sensitivity                        0.57143      0.4259         0.666667
Specificity                        0.85409      0.8814         0.893069
Pos Pred Value                     0.00806      0.1060         0.005510
Neg Pred Value                     0.99896      0.9789         0.999668
Prevalence                         0.00207      0.0320         0.000888
Detection Rate                     0.00118      0.0136         0.000592
Detection Prevalence               0.14679      0.1284         0.107428
Balanced Accuracy                  0.71276      0.6537         0.779868
                     Class: step.flat.trill Class: step.trill Class: trill
Sensitivity                         0.23333            0.4415        0.274
Specificity                         0.95133            0.9436        0.975
Pos Pred Value                      0.04118            0.5688        0.948
Neg Pred Value                      0.99283            0.9094        0.444
Prevalence                          0.00888            0.1441        0.627
Detection Rate                      0.00207            0.0636        0.172
Detection Prevalence                0.05031            0.1119        0.181
Balanced Accuracy                   0.59233            0.6926        0.624

Code

confusion_df <- rf_model_results$confusion_df_bb

ggplot(confusion_df, aes(x = Reference, y = Prediction, fill = proportion)) +
    geom_tile() + coord_equal() + scale_fill_distiller(palette = "Greens",
    direction = 1) + geom_text(aes(label = round(proportion, 2)),
    color = "black", size = 3) + theme_classic() + labs(x = "Observed",
    y = "Predicted") + theme(axis.text.x = element_text(color = "black",
    size = 11, angle = 30, vjust = 0.8, hjust = 0.8))

Takeaways

Decent classification of single element subtypes (accuracy: 0.76)
Poor classification of composed subtypes (accuracy: 0.52 and 0.37 when including single-element subtypes)

9 To-do list

Try other classification methods (VGGish, BirdNet, google model)
arreglar subtipos vacíos (NAs) y con etiquetas 5 y 44
buscar ejemplos específicos de subtipos raros para aumentar n de entrenamiento
buscar grabaciones completas (o cortes) con subtipos raros para n de prueba

Session information

R version 4.3.1 (2023-06-16)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.2 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3 
LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3;  LAPACK version 3.9.0

locale:
 [1] LC_CTYPE=pt_BR.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=es_CR.UTF-8        LC_COLLATE=pt_BR.UTF-8    
 [5] LC_MONETARY=es_CR.UTF-8    LC_MESSAGES=pt_BR.UTF-8   
 [7] LC_PAPER=es_CR.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=es_CR.UTF-8 LC_IDENTIFICATION=C       

time zone: America/Costa_Rica
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] DiagrammeR_1.0.10    randomForest_4.7-1.1 caret_6.0-94        
 [4] lattice_0.22-5       ggplot2_3.4.4        ohun_1.0.0          
 [7] Rraven_1.0.14        warbleR_1.1.29       NatureSounds_1.0.4  
[10] seewave_2.2.3        tuneR_1.4.5          viridis_0.6.4       
[13] viridisLite_0.4.2    formatR_1.14         knitr_1.45          

loaded via a namespace (and not attached):
 [1] DBI_1.1.3            bitops_1.0-7         pROC_1.18.5         
 [4] pbapply_1.7-2        gridExtra_2.3        remotes_2.4.2.1     
 [7] testthat_3.2.0       rlang_1.1.2          magrittr_2.0.3      
[10] e1071_1.7-13         compiler_4.3.1       vctrs_0.6.4         
[13] reshape2_1.4.4       stringr_1.5.0        pkgconfig_2.0.3     
[16] crayon_1.5.2         fastmap_1.1.1        backports_1.4.1     
[19] labeling_0.4.3       utf8_1.2.4           rmarkdown_2.25      
[22] prodlim_2023.08.28   purrr_1.0.2          xfun_0.41           
[25] jsonlite_1.8.7       recipes_1.0.8        parallel_4.3.1      
[28] R6_2.5.1             RColorBrewer_1.1-3   stringi_1.7.12      
[31] parallelly_1.36.0    rpart_4.1.21         brio_1.1.3          
[34] lubridate_1.9.3      Rcpp_1.0.11          iterators_1.0.14    
[37] future.apply_1.11.0  Matrix_1.6-2         splines_4.3.1       
[40] nnet_7.3-19          igraph_1.5.1         timechange_0.2.0    
[43] tidyselect_1.2.0     rstudioapi_0.15.0    yaml_2.3.7          
[46] timeDate_4022.108    codetools_0.2-19     dtw_1.23-1          
[49] listenv_0.9.0        tibble_3.2.1         plyr_1.8.9          
[52] withr_2.5.2          evaluate_0.23        signal_0.7-7        
[55] future_1.33.0        survival_3.5-7       sf_1.0-14           
[58] sketchy_1.0.2        units_0.8-4          proxy_0.4-27        
[61] pillar_1.9.0         packrat_0.9.2        KernSmooth_2.23-22  
[64] stats4_4.3.1         checkmate_2.3.0      foreach_1.5.2       
[67] generics_0.1.3       RCurl_1.98-1.13      munsell_0.5.0       
[70] scales_1.2.1         globals_0.16.2       class_7.3-22        
[73] glue_1.6.2           tools_4.3.1          xaringanExtra_0.7.0 
[76] data.table_1.14.8    ModelMetrics_1.2.2.2 gower_1.0.1         
[79] visNetwork_2.1.2     grid_4.3.1           ipred_0.9-14        
[82] colorspace_2.1-0     nlme_3.1-163         cli_3.6.1           
[85] fansi_1.0.5          lava_1.7.3           dplyr_1.1.3         
[88] gtable_0.3.4         fftw_1.0-7           digest_0.6.33       
[91] classInt_0.4-10      farver_2.1.1         rjson_0.2.21        
[94] htmlwidgets_1.6.2    htmltools_0.5.7      lifecycle_1.0.4     
[97] hardhat_1.3.0        MASS_7.3-60