Cryptic species in Pseudasthenes

Author
Published

October 30, 2024

Next steps

  • Add new recordings to EST (“Pc_monticola7.wav” “Pc_monticola8.wav”)

  • Fix importance plot for songs

  • Redo stats

1 Purpose

  • Measure acoustic structure of cape parrot contact calls

  • Compare acoustic dissimilarity between individuals from different localities

 

2 Report overview

 

Load packages

Code
# knitr is require for creating html/pdf/word reports
# formatR is used for soft-wrapping code

# install/ load packages
sketchy::load_packages(
  packages = c(
    "knitr",
    "formatR",
    "viridis",
    "warbleR",
    # github = "maRce10/PhenotypeSpace",
    "ggplot2",
    "randomForest",
    "mlbench",
    "caret",
    "pbapply",
    "vegan",
    "umap",
    "maRce10/ohun",
    "corrplot",
    "Rraven"
  )
)

source("./scripts/MRM2.R")

2.1 Incorporate new data

Code
new_anns <- imp_raven(path = "./data/raw/", name.from.file = TRUE,
    ext.case = "lower", warbler.format = TRUE, all.data = TRUE)

check_sels(new_anns, path = "./data/raw/")


new_est <- selection_table(new_anns, path = "./data/raw/", extended = TRUE,
    by.song = "Senal")
new_est$`Begin File` <- new_est$`Begin Path` <- new_est$`End Path` <- new_est$`End File` <- new_est$`Delta Time (s)` <- new_est$`Peak Freq (Hz)` <- new_est$org.file <- NA

new_est$Poblacion <- "Pseudasthenes cactorum monticola"
new_est$Localidad <- "Arequipa"
new_est$`End File` <- new_est$Individuo <- substr(new_est$sound.files,
    0, 13)
new_est$`Subespecie correct` <- "Pseudasthenes cactorum monticola"

2.2 Format data

Code
coors_dat <- readxl::read_xlsx("./data/raw/Base_datos_Pseudasthenes_coordenadas.xlsx")

# dat <-
# readRDS('./data/processed/pool_params_Pseudasthenes.RDS')

est <- readRDS("./data/processed/tabla_extendida_por_Senal_Pseudasthenes_UnidoExcel_v02.RDS")


est$`Subespecie correct`[est$`Subespecie correct` == "Pseudasthenes cactorum cactorum / lachayensis"] <- "Pseudasthenes cactorum lachayensis"

est <- rbind(est, new_est)

est$subspecies <- gsub("Pseudasthenes cactorum ", "", est$`Subespecie correct`)

call_est <- est[grep("Canto", est$Senal, invert = TRUE), ]

song_est <- est[grep("Canto", est$Senal), ]

nrow(call_est)
[1] 276
Code
nrow(song_est)
[1] 100
Code
nrow(est)
[1] 376
Code
# dat <- dat[dat$Localidad != 'Ica', c('Localidad', 'M.peakf',
# 'M.mindom', 'M.maxdom', 'M.elm.duration', 'M.duration',
# 'I.mindom', 'I.maxdom', 'I.peakf', 'I.elm.duration',
# 'F.peakf', 'F.mindom', 'F.maxdom', 'F.elm.duration',
# 'F.duration', 'COMPLETO.peakf', 'COMPLETO.dfrange',
# 'COMPLETO.mindom', 'COMPLETO.maxdom', 'COMPLETO.elm.duration',
# 'COMPLETO.duration' , 'Nnotas')] dat$Localidad <-
# factor(dat$Localidad)

3 Descriptive stats

3.1 Calls

  • Recordings per subspecies
Code
as.data.frame(table(call_est$subspecies[!duplicated(call_est$`End File`)]))
Var1 Freq
cactorum 3
lachayensis 15
monticola 4
  • Calls per subspecies
Code
as.data.frame(table(call_est$subspecies[call_est$Nota == "COMPLETO"]))
Var1 Freq
cactorum 6
lachayensis 45
monticola 18
  • Recordings per locality
Code
as.data.frame(table(call_est$Localidad[!duplicated(call_est$`End File`)]))
Var1 Freq
Arequipa 4
Ica 3
Lima 15
  • Calls per localities
Code
as.data.frame(table(call_est$Localidad[call_est$Nota == "COMPLETO"]))
Var1 Freq
Arequipa 18
Ica 6
Lima 45

3.2 Songs

  • Recordings per subspecies
Code
as.data.frame(table(song_est$subspecies[!duplicated(song_est$`End File`)]))
Var1 Freq
cactorum 5
lachayensis 4
monticola 5
  • Songs per subspecies
Code
as.data.frame(table(song_est$subspecies[song_est$Nota == "COMPLETO"]))
Var1 Freq
cactorum 7
lachayensis 12
monticola 6
  • Songs per localities
Code
as.data.frame(table(song_est$Localidad[song_est$Nota == "COMPLETO"]))
Var1 Freq
Arequipa 8
Ica 5
Lima 12

4 Acoustic and statistical analyses

4.0.1 Measure acoustic structure

Code
call_acous_feat <- lapply(unique(call_est$Nota), function(x) {

    X <- spectro_analysis(call_est[call_est$Nota == x, ], wl = 200,
        ovlp = 70, parallel = 2)
    if (x == "I")
        names(X)[-1:-2] <- paste(names(X)[-1:-2], x, sep = "-") else {

        X <- X[, -1:-2]
        names(X) <- paste(names(X), x, sep = "-")
    }

    return(X)
})



call_acous_feat_df <- do.call(cbind, call_acous_feat)

call_acous_feat_df$locality <- call_est$Localidad[call_est$Nota ==
    "I"]
call_acous_feat_df$subspecies <- call_est$subspecies[call_est$Nota ==
    "I"]


head(call_acous_feat_df)

saveRDS(call_acous_feat_df, "./data/processed/acoustic_features_calls.RDS")

4.0.2 Random forest

Code
call_acous_feat_df <- readRDS("./data/processed/acoustic_features_calls.RDS")

# remove collinear
cor_dat <- call_acous_feat_df[, !names(call_acous_feat_df) %in% c("sound.files",
    "selec", "locality", "subspecies")]
cor_dat <- cor_dat[, !sapply(cor_dat, anyNA)]
cormat <- cor(cor_dat, use = "pairwise.complete.obs")
hc <- findCorrelation(cormat, cutoff = 0.9)  # putt any value as a 'cutoff' 
hc <- sort(hc)

cor_dat <- cor_dat[, -c(hc)]

names(cor_dat) <- gsub("-", ".", names(cor_dat))

cor_dat$subspecies <- as.factor(call_acous_feat_df$subspecies)

# Create data subsets
partition <- createDataPartition(y = cor_dat$subspecies, times = 1,
    p = 0.75, list = FALSE)

trainset <- cor_dat[partition, ]
testset <- cor_dat[-partition, ]
Code
trcontrol <-
    trainControl(
        method = "repeatedcv",
        number = 2000,
        repeats = 100,
        savePredictions = TRUE,
        sampling = "down",
        classProbs = TRUE,
        returnResamp = "all"
    )

pred_model <- train(
    subspecies ~ .,
    data = trainset,
    method = "rf",
    trControl = trcontrol,
    metric = "Accuracy",
    preProcess = "scale"
)

# save confusion matrix
conf_mat <- confusionMatrix(predict(pred_model, testset), factor(testset$subspecies))

conf_df <- as.data.frame(conf_mat$table)

conf_df$total <-
    sapply(conf_df$Reference, function(x)
        sum(testset$subspecies ==
                x))

conf_df$proportion <- conf_df$Freq / conf_df$total

# fit model on complete data set
best_rf_model <- pred_model$finalModel

all_rf_model <- randomForest(
  as.factor(subspecies) ~ .,
  data = cor_dat,  # Your entire dataset
  proximity = TRUE,  # Include proximity matrix
  ntree = best_rf_model$ntree,  # Number of trees
  mtry = best_rf_model$mtry    # Number of variables tried for splitting at each node
  # nodesize = best_rf_model$,  # Minimum size of terminal nodes
  # maxnodes = best_rf_model$maxnodes  # Maximum number of terminal nodes
)

rf_model_results <-
    list(
        pred_model_bb = pred_model,
        conf_mat_bb = conf_mat,
        confusion_df_bb = conf_df,
        testset_bb = testset,
        cor_dat = cor_dat,
        all_rf_model = all_rf_model
    )


saveRDS(
    rf_model_results,
    "./data/processed/random_forest_model_results_calls.RDS"
)

4.0.3 Checking performance on test data

Code
rf_model_results <- readRDS("./data/processed/random_forest_model_results_calls.RDS")


# print confusion matrix results
rf_model_results$conf_mat_bb$overall
      Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
      0.687500       0.432624       0.413379       0.889830       0.687500 
AccuracyPValue  McnemarPValue 
      0.618019       0.391625 
Code
confusion_df <- rf_model_results$confusion_df_bb
# confusion_df$Prediction <- gsub('.model|P.tricarunculatus.',
# '', confusion_df$Prediction) confusion_df$Reference <-
# gsub('.model|P.tricarunculatus.', '', confusion_df$Reference)

ggplot(confusion_df, aes(x = Reference, y = Prediction, fill = proportion)) +
    geom_tile() + coord_equal() + scale_fill_distiller(palette = "Greens",
    direction = 1) + geom_text(aes(label = round(proportion, 2)),
    color = "black", size = 3) + theme_classic() + theme(axis.text.x = element_text(color = "black",
    size = 11, angle = 30, vjust = 0.8, hjust = 0.8))

4.0.4 Checking performance on the entire data

Code
cor_dat <- rf_model_results$cor_dat
pred_model <- rf_model_results$pred_model_bb

# USING ALL DATA
conf_mat <- confusionMatrix(predict(pred_model, cor_dat), cor_dat$subspecies)

conf_df <- as.data.frame(conf_mat$table)

conf_df$total <- sapply(conf_df$Reference, function(x) sum(cor_dat$subspecies ==
    x))

conf_df$proportion <- conf_df$Freq/conf_df$total

# saveRDS(list(pred_model_bb = pred_model, conf_mat_bb =
# conf_mat, confusion_df_bb = conf_df, testset_bb = testset),
# './data/processed/random_forest_model_results.RDS')

rf_model_results_test <- list(pred_model_bb = pred_model, conf_mat_bb = conf_mat,
    confusion_df_bb = conf_df, testset_bb = testset)

# print confusion matrix results
rf_model_results_test$conf_mat_bb$overall
      Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
    0.72463768     0.54855372     0.60384919     0.82537405     0.65217391 
AccuracyPValue  McnemarPValue 
    0.12672396     0.00153085 
Code
confusion_df <- rf_model_results_test$confusion_df_bb
# confusion_df$Prediction <- gsub('.model|P.tricarunculatus.',
# '', confusion_df$Prediction) confusion_df$Reference <-
# gsub('.model|P.tricarunculatus.', '', confusion_df$Reference)

ggplot(confusion_df, aes(x = Reference, y = Prediction, fill = proportion)) +
    geom_tile() + coord_equal() + scale_fill_distiller(palette = "Greens",
    direction = 1) + geom_text(aes(label = round(proportion, 2)),
    color = "black", size = 3) + theme_classic() + theme(axis.text.x = element_text(color = "black",
    size = 11, angle = 30, vjust = 0.8, hjust = 0.8))

4.0.5 Variable importance

Code
# importance
gbmImp <- varImp(pred_model, scale = FALSE)
gbmImp
rf variable importance

  only 20 most important variables shown (out of 83)

                     Overall
meanfreq.COMPLETO      0.683
time.median.F          0.655
duration.M             0.485
time.ent.M             0.447
modindx.M              0.424
duration.F             0.391
entropy.M              0.387
freq.Q25.COMPLETO      0.354
modindx.COMPLETO       0.297
freq.median.COMPLETO   0.290
freq.Q75.F             0.273
mindom.COMPLETO        0.263
time.Q75.F             0.224
time.Q75.M             0.210
meanpeakf.COMPLETO     0.203
time.median.M          0.190
startdom.M             0.166
freq.IQR.M             0.162
dfrange.COMPLETO       0.156
skew.M                 0.152
Code
plot(gbmImp, top = 20)

4.0.6 Projecting random forest proximity with UMAP

Code
umap_result <- umap(rf_model_results$all_rf_model$proximity, n_neighbors = 15,
    n_components = 2)

# Create a data frame with the UMAP results
umap_df <- data.frame(UMAP1 = umap_result$layout[, 1], UMAP2 = umap_result$layout[,
    2], location = cor_dat$subspecies)

umap_df$pred.subspecies <- predict(object = rf_model_results$all_rf_model,
    cor_dat)

write.csv(umap_df, "./data/processed/umap_on_rf_proximity_localities_calls.csv",
    row.names = FALSE)
Code
umap_loc_df <- read.csv("./data/processed/umap_on_rf_proximity_localities_calls.csv")

# Create a scatterplot
gg_umap_loc <- ggplot(umap_loc_df, aes(x = UMAP1, y = UMAP2, color = location,
    fill = location)) + geom_point(size = 4) + scale_color_viridis_d(alpha = 0.3,
    begin = 0.1, end = 0.8) + scale_fill_viridis_d(alpha = 0.2, begin = 0.1,
    end = 0.8) + theme_classic(base_size = 20) + labs(x = "UMAP1",
    y = "UMAP2", color = "subspecies", fill = "subspecies")

gg_umap_loc

4.0.6.1 Projecting random forest proximity with PCA

Code
pca_result <- prcomp(rf_model_results$all_rf_model$proximity)

# Create a data frame with the UMAP results
pca_df <- data.frame(PC1 = pca_result$x[, 1], PC2 = pca_result$x[,
    2], PC3 = pca_result$x[, 3], location = cor_dat$subspecies)

pca_df$pred.subspecies <- predict(object = rf_model_results$all_rf_model,
    cor_dat)

# Create a scatterplot
gg_pca_loc <- ggplot(pca_df, aes(x = PC1, y = PC2, color = location,
    fill = location)) + geom_point(size = 4) + scale_color_viridis_d(alpha = 0.3,
    begin = 0.1, end = 0.8) + scale_fill_viridis_d(alpha = 0.2, begin = 0.1,
    end = 0.8) + theme_classic(base_size = 20) + labs(x = "PC1", y = "PC2",
    color = "subspecies", fill = "subspecies")

gg_pca_loc

Code
gg_pca_loc2 <- ggplot(pca_df, aes(x = PC1, y = PC3, color = location,
    fill = location)) + geom_point(size = 4) + scale_color_viridis_d(alpha = 0.3,
    begin = 0.1, end = 0.8) + scale_fill_viridis_d(alpha = 0.2, begin = 0.1,
    end = 0.8) + theme_classic(base_size = 20) + labs(x = "PC1", y = "PC3",
    color = "subspecies", fill = "subspecies")

gg_pca_loc2

4.0.7 Measure acoustic structure

Code
song_acous_feat <- lapply(unique(song_est$Nota), function(x) {

    X <- spectro_analysis(song_est[song_est$Nota == x, ], wl = 200,
        ovlp = 70, parallel = 2)
    if (x == "I")
        names(X)[-1:-2] <- paste(names(X)[-1:-2], x, sep = "-") else {

        X <- X[, -1:-2]
        names(X) <- paste(names(X), x, sep = "-")
    }

    return(X)
})

song_acous_feat_df <- do.call(cbind, song_acous_feat)

song_acous_feat_df$locality <- song_est$Localidad[song_est$Nota ==
    "I"]
song_acous_feat_df$subspecies <- song_est$subspecies[song_est$Nota ==
    "I"]


head(song_acous_feat_df)

saveRDS(song_acous_feat_df, "./data/processed/acoustic_features_songs.RDS")

4.0.8 Random forest

Code
song_acous_feat_df <- readRDS("./data/processed/acoustic_features_songs.RDS")

# remove collinear
cor_dat <- song_acous_feat_df[, !names(song_acous_feat_df) %in% c("sound.files",
    "selec", "locality", "subspecies")]

names(cor_dat) <- gsub("-", ".", names(cor_dat))


cor_dat <- cor_dat[, !sapply(cor_dat, anyNA)]
cormat <- cor(cor_dat, use = "pairwise.complete.obs")
hc <- findCorrelation(cormat, cutoff = 0.9)  # putt any value as a 'cutoff' 
hc <- sort(hc)

cor_dat <- cor_dat[, -c(hc)]

head(cor_dat)
sd.I freq.median.I freq.Q25.I freq.Q75.I freq.IQR.I time.Q25.I kurt.I time.ent.I sfm.I meandom.I mindom.I maxdom.I dfrange.I startdom.I enddom.I dfslope.I meanpeakf.I duration.M freq.median.M freq.Q25.M freq.IQR.M time.median.M time.Q25.M skew.M time.ent.M sfm.M mindom.M maxdom.M dfrange.M modindx.M startdom.M enddom.M dfslope.M meanpeakf.M duration.F freq.median.F freq.Q25.F freq.IQR.F kurt.F sp.ent.F time.ent.F entropy.F sfm.F meandom.F mindom.F maxdom.F dfrange.F modindx.F startdom.F enddom.F dfslope.F meanpeakf.F meanfreq.COMPLETO freq.Q25.COMPLETO freq.IQR.COMPLETO time.IQR.COMPLETO kurt.COMPLETO time.ent.COMPLETO sfm.COMPLETO meandom.COMPLETO mindom.COMPLETO maxdom.COMPLETO modindx.COMPLETO startdom.COMPLETO enddom.COMPLETO dfslope.COMPLETO meanpeakf.COMPLETO
1.451843 5.33136 4.71238 6.38362 1.671240 0.006071 4.69996 0.762849 0.503807 5.29200 4.74075 6.06375 1.323 6.06375 4.74075 -81.6997 5.40225 0.012651 4.35755 4.19976 0.473352 0.007592 0.005061 3.176594 0.740863 0.379431 4.07925 5.40225 1.323 1.00000 5.40225 4.07925 -104.5756 4.29975 0.020242 4.41674 3.82382 1.581118 14.39336 0.894639 0.857600 0.767242 0.511963 4.70400 4.29975 5.84325 1.5435 2.71429 5.18175 4.52025 -32.6799 4.52025 4.61947 3.96704 1.14713 2.51661 25.87454 0.901155 0.445954 4.36940 1.43325 8.93025 91.0882 6.06375 4.52025 -0.291744 4.29975
1.413290 4.97235 3.98066 6.34546 2.364797 0.003762 2.39394 0.780811 0.797688 4.95000 3.72000 5.88000 2.160 5.64000 5.40000 -18.2410 5.64000 0.018607 4.67104 3.97249 1.773230 0.010148 0.006765 1.006360 0.805626 0.597710 4.20000 6.36000 2.160 2.33333 4.44000 4.20000 -12.8983 4.44000 0.020754 4.15965 3.67831 1.203341 6.32277 0.928581 0.827942 0.768811 0.658997 4.12000 3.00000 4.68000 1.6800 2.71429 4.68000 3.96000 -34.6921 4.20000 4.75432 3.98602 1.61328 2.57086 9.69421 0.902993 0.547594 4.44784 2.28000 7.08000 216.4000 5.16000 2.76000 -0.415964 4.44000
1.232368 5.25811 4.87599 6.08602 1.210026 0.005243 6.68887 0.815834 0.553800 5.18400 4.68000 6.12000 1.440 4.92000 4.92000 0.0000 4.92000 0.014671 4.85142 3.82598 2.050883 0.009167 0.003667 0.388957 0.819423 0.708228 3.24000 5.88000 2.640 2.09091 4.44000 4.20000 -16.3589 4.44000 0.026479 6.11108 5.20403 1.738503 5.82817 0.926448 0.916417 0.849013 0.613309 6.10667 5.16000 7.32000 2.1600 3.00000 5.88000 5.16000 -27.1911 6.36000 5.14902 4.23968 1.83649 4.81603 7.87507 0.908760 0.521551 4.76462 1.56000 9.72000 224.3529 4.92000 5.40000 0.046501 4.44000
0.356876 4.42406 4.30945 4.65329 0.343844 0.001771 6.58879 0.805504 0.249210 4.64727 4.56000 5.04000 0.480 5.04000 5.04000 0.0000 4.56000 0.014169 3.96940 3.26115 1.274850 0.006715 0.002985 1.898133 0.892998 0.612120 2.64000 4.56000 1.920 3.25000 4.08000 4.56000 33.8761 4.08000 0.018607 4.43378 3.78647 1.564330 11.87179 0.937028 0.925921 0.867615 0.720422 4.72000 4.08000 5.52000 1.4400 2.66667 5.52000 4.56000 -51.5933 4.56000 4.19800 3.21353 1.76863 2.76715 13.98839 0.910768 0.603869 3.45788 1.68000 7.92000 659.3846 4.56000 4.56000 0.000000 4.56000
0.298200 4.73430 4.56956 4.89905 0.329483 0.004551 4.15210 0.835565 0.237394 5.09647 4.56000 6.00000 1.440 6.00000 5.04000 -79.0443 5.04000 0.016193 4.31981 3.94475 0.562589 0.005151 0.002207 2.358714 0.889456 0.549513 3.12000 5.52000 2.400 3.00000 5.04000 4.56000 -29.6416 4.56000 0.034411 3.30212 2.89317 0.701056 4.33306 0.876227 0.969471 0.849477 0.376832 3.25846 1.68000 3.60000 1.9200 4.25000 3.12000 3.60000 13.9490 3.60000 3.86812 3.31257 1.35822 4.32634 9.69180 0.912347 0.568584 2.44741 1.20000 6.00000 941.8000 1.20000 4.08000 0.375037 4.56000
1.190522 5.45789 4.75876 6.23928 1.480520 0.013676 2.92154 0.851060 0.712503 5.06118 2.52000 6.84000 4.320 5.64000 5.88000 9.8706 5.64000 0.020424 4.90725 4.56354 0.982018 0.009423 0.006282 2.282513 0.890094 0.645359 4.68000 7.32000 2.640 2.27273 4.68000 4.92000 11.7507 4.92000 0.017507 5.06532 4.94967 2.024016 10.72274 0.852082 0.862406 0.734841 0.526528 4.89818 4.44000 5.16000 0.7200 1.33333 4.92000 4.44000 -27.4183 4.92000 4.93364 4.27040 1.37461 1.15550 19.08989 0.897020 0.627395 3.77114 1.80000 8.04000 295.5385 1.80000 2.76000 0.285813 4.92000
Code
cor_dat$subspecies <- as.factor(song_acous_feat_df$subspecies)

table(cor_dat$subspecies)

   cactorum lachayensis   monticola 
          7          12           6 
Code
# Create data subsets
partition <- createDataPartition(y = cor_dat$subspecies, times = 1,
    p = 0.75, list = FALSE)

trainset <- cor_dat[partition, ]
testset <- cor_dat[-partition, ]
Code
trcontrol <-
    trainControl(
        method = "repeatedcv",
        number = 2000,
        repeats = 100,
        savePredictions = TRUE,
        sampling = "down",
        classProbs = TRUE,
        returnResamp = "all"
    )

pred_model <- train(
    subspecies ~ .,
    data = trainset,
    method = "rf",
    trControl = trcontrol,
    metric = "Accuracy",
    preProcess = "scale"
)

# save confusion matrix
conf_mat <- confusionMatrix(predict(pred_model, testset), factor(testset$subspecies, levels = unique(testset$subspecies)))

conf_df <- as.data.frame(conf_mat$table)

conf_df$total <-
    sapply(conf_df$Reference, function(x)
        sum(testset$subspecies ==
                x))

conf_df$proportion <- conf_df$Freq / conf_df$total

# fit model on complete data set
best_rf_model <- pred_model$finalModel

all_rf_model <- randomForest(
  as.factor(subspecies) ~ .,
  data = cor_dat,  # Your entire dataset
  proximity = TRUE,  # Include proximity matrix
  ntree = best_rf_model$ntree,  # Number of trees
  mtry = best_rf_model$mtry    # Number of variables tried for splitting at each node
  # nodesize = best_rf_model$,  # Minimum size of terminal nodes
  # maxnodes = best_rf_model$maxnodes  # Maximum number of terminal nodes
)

rf_model_results <-
    list(
        pred_model_bb = pred_model,
        conf_mat_bb = conf_mat,
        confusion_df_bb = conf_df,
        testset_bb = testset,
        all_rf_model = all_rf_model
    )


saveRDS(
    rf_model_results,
    "./data/processed/random_forest_model_results_songs.RDS"
)

4.0.9 Checking performance on test data

Code
rf_model_results <- readRDS("./data/processed/random_forest_model_results_songs.RDS")


# print confusion matrix results
rf_model_results$conf_mat_bb$overall
      Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
      1.000000       1.000000       0.478176       1.000000       0.600000 
AccuracyPValue  McnemarPValue 
      0.077760            NaN 
Code
confusion_df <- rf_model_results$confusion_df_bb
# confusion_df$Prediction <- gsub('.model|P.tricarunculatus.',
# '', confusion_df$Prediction) confusion_df$Reference <-
# gsub('.model|P.tricarunculatus.', '', confusion_df$Reference)

ggplot(confusion_df, aes(x = Reference, y = Prediction, fill = proportion)) +
    geom_tile() + coord_equal() + scale_fill_distiller(palette = "Greens",
    direction = 1) + geom_text(aes(label = round(proportion, 2)),
    color = "black", size = 3) + theme_classic() + theme(axis.text.x = element_text(color = "black",
    size = 11, angle = 30, vjust = 0.8, hjust = 0.8))

4.0.10 Checking performance on the entire data

Code
pred_model <- rf_model_results$all_rf_model

# USING ALL DATA
conf_mat <- confusionMatrix(predict(pred_model, cor_dat), cor_dat$subspecies)

conf_df <- as.data.frame(conf_mat$table)

conf_df$total <- sapply(conf_df$Reference, function(x) sum(cor_dat$subspecies ==
    x))

conf_df$proportion <- conf_df$Freq/conf_df$total

# saveRDS(list(pred_model_bb = pred_model, conf_mat_bb =
# conf_mat, confusion_df_bb = conf_df, testset_bb = testset),
# './data/processed/random_forest_model_results.RDS')

rf_model_results_test <- list(pred_model_bb = pred_model, conf_mat_bb = conf_mat,
    confusion_df_bb = conf_df, testset_bb = testset)

# print confusion matrix results
rf_model_results_test$conf_mat_bb$overall
      Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
   1.00000e+00    1.00000e+00    8.62815e-01    1.00000e+00    4.80000e-01 
AccuracyPValue  McnemarPValue 
   1.07407e-08            NaN 
Code
confusion_df <- rf_model_results_test$confusion_df_bb
# confusion_df$Prediction <- gsub('.model|P.tricarunculatus.',
# '', confusion_df$Prediction) confusion_df$Reference <-
# gsub('.model|P.tricarunculatus.', '', confusion_df$Reference)

ggplot(confusion_df, aes(x = Reference, y = Prediction, fill = proportion)) +
    geom_tile() + coord_equal() + scale_fill_distiller(palette = "Greens",
    direction = 1) + geom_text(aes(label = round(proportion, 2)),
    color = "black", size = 3) + theme_classic() + theme(axis.text.x = element_text(color = "black",
    size = 11, angle = 30, vjust = 0.8, hjust = 0.8))

4.0.11 Variable importance

Code
# importance
gbmImp <- varImp(pred_model, scale = FALSE)
gbmImp
Overall
sd.I 0.183035
freq.median.I 0.092123
freq.Q25.I 0.266699
freq.Q75.I 0.172164
freq.IQR.I 0.262791
time.Q25.I 0.176199
kurt.I 0.166116
time.ent.I 0.236473
sfm.I 0.141004
meandom.I 0.114293
mindom.I 0.197284
maxdom.I 0.161212
dfrange.I 0.222982
startdom.I 0.154037
enddom.I 0.140063
dfslope.I 0.166555
meanpeakf.I 0.134893
duration.M 0.281443
freq.median.M 0.331528
freq.Q25.M 0.356906
freq.IQR.M 0.265460
time.median.M 0.317432
time.Q25.M 0.137375
skew.M 0.095538
time.ent.M 0.429136
sfm.M 0.264533
mindom.M 0.197773
maxdom.M 0.550380
dfrange.M 0.365828
modindx.M 0.401957
startdom.M 0.371078
enddom.M 0.265095
dfslope.M 0.459403
meanpeakf.M 0.242443
duration.F 0.188369
freq.median.F 0.241376
freq.Q25.F 0.165176
freq.IQR.F 0.259426
kurt.F 0.196289
sp.ent.F 0.156933
time.ent.F 0.206783
entropy.F 0.112629
sfm.F 0.131789
meandom.F 0.403378
mindom.F 0.106897
maxdom.F 0.168016
dfrange.F 0.266894
modindx.F 0.121617
startdom.F 0.213325
enddom.F 0.127532
dfslope.F 0.154502
meanpeakf.F 0.358626
meanfreq.COMPLETO 0.315679
freq.Q25.COMPLETO 0.249872
freq.IQR.COMPLETO 0.268306
time.IQR.COMPLETO 0.297259
kurt.COMPLETO 0.097717
time.ent.COMPLETO 0.229971
sfm.COMPLETO 0.221108
meandom.COMPLETO 0.238710
mindom.COMPLETO 0.152013
maxdom.COMPLETO 0.272782
modindx.COMPLETO 0.346063
startdom.COMPLETO 0.131592
enddom.COMPLETO 0.118977
dfslope.COMPLETO 0.121959
meanpeakf.COMPLETO 0.263124
Code
plot(gbmImp, top = 20)
Warning in plot.window(xlim, ylim, log, ...): "top" is not a graphical
parameter
Warning in axis(side = side, at = at, labels = labels, ...): "top" is not a
graphical parameter
Warning in title(xlab = xlab, ylab = ylab, ...): "top" is not a graphical
parameter
Warning in plot.xy(xy.coords(x, y), type = type, ...): "top" is not a graphical
parameter

4.0.12 Projecting random forest proximity with UMAP

Code
umap_result <- umap(rf_model_results$all_rf_model$proximity, n_neighbors = 15,
    n_components = 2)

# Create a data frame with the UMAP results
umap_df <- data.frame(UMAP1 = umap_result$layout[, 1], UMAP2 = umap_result$layout[,
    2], location = cor_dat$subspecies)

umap_df$pred.subspecies <- predict(object = rf_model_results$all_rf_model,
    cor_dat)

write.csv(umap_df, "./data/processed/umap_on_rf_proximity_localities_songs.csv",
    row.names = FALSE)
Code
umap_loc_df <- read.csv("./data/processed/umap_on_rf_proximity_localities_songs.csv")

# Create a scatterplot
gg_umap_loc <- ggplot(umap_loc_df, aes(x = UMAP1, y = UMAP2, color = location,
    fill = location)) + geom_point(size = 4) + scale_color_viridis_d(alpha = 0.3,
    begin = 0.1, end = 0.8) + scale_fill_viridis_d(alpha = 0.2, begin = 0.1,
    end = 0.8) + theme_classic(base_size = 20) + labs(x = "UMAP1",
    y = "UMAP2", color = "subspecies", fill = "subspecies")

gg_umap_loc

4.0.13 Projecting random forest proximity with PCA

Code
pca_result <- prcomp(rf_model_results$all_rf_model$proximity)

# Create a data frame with the UMAP results
pca_df <- data.frame(PC1 = pca_result$x[, 1], PC2 = pca_result$x[,
    2], location = cor_dat$subspecies)

pca_df$pred.subspecies <- predict(object = rf_model_results$all_rf_model,
    cor_dat)

# Create a scatterplot
gg_pca_loc <- ggplot(pca_df, aes(x = PC1, y = PC2, color = location,
    fill = location)) + geom_point(size = 4) + scale_color_viridis_d(alpha = 0.3,
    begin = 0.1, end = 0.8) + scale_fill_viridis_d(alpha = 0.2, begin = 0.1,
    end = 0.8) + theme_classic(base_size = 20) + labs(x = "PC1", y = "PC2",
    color = "subspecies", fill = "subspecies")

gg_pca_loc

Code
est <- readRDS("./data/processed/tabla_extendida_por_Senal_Pseudasthenes_UnidoExcel.RDS")

est <- resample_est(est, pb = FALSE)
call_est <- est[grep("llamado", est$Senal, ignore.case = TRUE), ]

# spectrograms(call_est, ovlp = 90, flim = c(1, 10), collevels =
# seq(-110, 0, 5), pal = reverse.gray.colors.1, dest.path =
# './data/processed/spectrograms/calls', by.song =
# 'sound.files', xl = 3, wl = 200, parallel = 20, sel.labels =
# 'Nota')

song_est <- est[grep("Canto", est$Senal, ignore.case = TRUE), ]

# spectrograms(song_est, ovlp = 90, flim = c(1, 18), collevels =
# seq(-110, 0, 5), pal = reverse.gray.colors.1, dest.path =
# './data/processed/spectrograms/songs', by.song =
# 'sound.files', xl = 5, wl = 200, parallel = 20, sel.labels =
# 'Nota')

# sample size call
table(call_est$Localidad[!duplicated(call_est$org.file)])

# calls per individual and site
table(call_est$Localidad[!duplicated(call_est$sound.files)], call_est$org.file[!duplicated(call_est$sound.files)])

# song
table(song_est$Localidad[!duplicated(song_est$org.file)])


# songs per individual and site
table(song_est$Localidad[!duplicated(song_est$sound.files)], song_est$org.file[!duplicated(song_est$sound.files)])

# get med element
m_call_est <- call_est[call_est$Nota == "M", ]


xc_m_call <- cross_correlation(m_call_est, wl = 200, ovlp = 70)

mds_m_call <- cmdscale(1 - xc_m_call, k = 2)


# medidas - cross-correlation between M notes, F notes and I
# notes, - difference in peak frequency between first and last
# note - number of notes per call/song - duration - mean element
# values for acoustic features divided by note type (I, M, F)
Code
# coordinates
coors <- readxl::read_excel("./data/processed/Base_datos_Pseudasthenes_coordenadas.xlsx")

cll_elms_est <- call_est[call_est$Nota != "COMPLETO", ]
cll_i_est <- call_est[call_est$Nota == "I", ]


out <- lapply(c("I", "M", "F", "COMPLETO"), function(x) {

    cll_elms_sp <- spectro_analysis(call_est[call_est$Nota == x, ])
    names(cll_elms_sp)[-c(1, 2)] <- paste0(names(cll_elms_sp)[-c(1,
        2)], "-", x)
    return(cll_elms_sp)
})


cll_sp <- cbind(out[[1]], out[[2]][, -c(1, 2)], out[[3]][, -c(1, 2)],
    dur_complete = out[[4]]$`duration-COMPLETO`)
cll_sp$diff_IF <- cll_sp$`meanpeakf-I` - cll_sp$`meanpeakf-F`

# remove 0 var parameters
cll_sp <- cll_sp[, !names(cll_sp) %in% names(which(sapply(cll_sp[,
    -c(1, 2)], sd) == 0))]

pca <- prcomp(cll_sp[, -c(1, 2)], scale. = TRUE)
cll_sp$pc1 <- pca$x[, 1]
cll_sp$pc2 <- pca$x[, 2]

cll_sp$org.sound.file <- sapply(strsplit(cll_sp$sound.files, ".wav"),
    "[[", 1)


coors$file <- gsub(".wav$", "", coors$`Begin File`)


cll_sp$lat <- sapply(cll_sp$org.sound.file, function(x) coors$Latitud[coors$file ==
    x][1])
cll_sp$lon <- sapply(cll_sp$org.sound.file, function(x) coors$Longitud[coors$file ==
    x][1])

cll_sp$pob <- sapply(cll_sp$org.sound.file, function(x) coors$Poblacion[coors$file ==
    x][1])
cll_sp$locality <- sapply(cll_sp$org.sound.file, function(x) coors$Localidad[coors$file ==
    x][1])


subsp_member_binary <- as.dist(PhenotypeSpace::binary_triangular_matrix(group = cll_sp$pob))

loc_member_binary <- as.dist(PhenotypeSpace::binary_triangular_matrix(group = cll_sp$locality))
acu_dist <- dist(cll_sp[, c("pc1", "pc2")])
geo_dist <- dist(cll_sp[, c("lat", "lon")])


# regression models set data format
rect_var <- cbind(acu_dist = as.dist(scale(acu_dist)), geo_dist = as.dist(scale(geo_dist)),
    loc_member_binary = loc_member_binary, subsp_member_binary = subsp_member_binary)

rect_var <- cbind(rect_var, res = residuals(lm(rect_var[, "loc_member_binary"] ~
    rect_var[, "geo_dist"])))

head(rect_var)
apply(rect_var, 2, var)


nperm <- 1000

# (mod_dialect <- MRM2(formula = acu_dist ~ loc_member_binary +
# geo_dist + subsp_member_binary, nperm = nperm, mat =
# rect_var[, c('acu_dist','loc_member_binary', 'geo_dist',
# 'subsp_member_binary')]))

# (mod_dialect <- MRM2(formula = acu_dist ~ loc_member_binary +
# geo_dist, nperm = nperm, mat = rect_var[,
# c('acu_dist','loc_member_binary', 'geo_dist')]))


(mod_dialect <- MRM2(formula = acu_dist ~ loc_member_binary, nperm = nperm,
    mat = rect_var[, c("acu_dist", "loc_member_binary")]))

# (mod_dialect <- MRM2(formula = acu_dist ~ subsp_member_binary
# , nperm = nperm, mat = rect_var[, c('acu_dist',
# 'subsp_member_binary')]))


# (mod_dialect <- MRM2(formula = acu_dist ~ geo_dist, nperm =
# nperm, mat = rect_var[, c('acu_dist', 'geo_dist')]))

# cor(rect_var)

# corrplot.mixed(cor(rect_var), upper= 'ellipse')

# (mod_dialect <- MRM2(formula = acu_dist ~ geo_dist, nperm =
# nperm, mat = rect_var[, c('acu_dist', 'geo_dist')]))

# (mod_dialect <- MRM2(formula = acu_dist ~ res, nperm = nperm,
# mat = rect_var[, c('acu_dist', 'res')]))

5 Statistical analysis

Two approaches:

5.1 Multiple Regression on distance Matrices

  • Model:
    \[\begin{align*} Acoustic\ dissimilarity &\sim locality + sampling\ site + geographic\ distance \end{align*}\]
  • Response values scaled to make effect sizes comparable across models
  • Locality was coded as pairwise binary matrix in which 0 means that calls in a dyad belong to the same locality and 1 means calls belong to different locality
Code
xcorr <- readRDS("./data/processed/cross_correlation_matrix.RDS")

xcorr_mds <- readRDS("./data/processed/cross_correlation_MDS.RDS")


dat <- read.csv("./data/raw/consolidated_sound_files_CPV_contact_calls_CURATED.csv")

# add data from second location
dat_loc2 <- read.csv("./data/raw/consolidated_sound_files_CPV_contact_calls _UPDATED.csv")

dat <- dat[grepl("cape", dat$species, ignore.case = T) & !is.na(dat$Location.for.cluster) &
    !is.na(dat$Longitude.for.cluster) & !is.na(dat$Latittude.for.cluster),
    ]


dat$sampling.site <- sapply(dat$new_name, function(x) dat_loc2$Sampling.sites[dat_loc2$New_Name ==
    x])
dat$species <- "Cape parrot"

sub_xcorr <- xcorr[rownames(xcorr) %in% dat$new_name, colnames(xcorr) %in%
    dat$new_name]
sub_xcorr_mds <- xcorr_mds[rownames(xcorr_mds) %in% dat$new_name,
    ]

sub_dat <- dat[dat$new_name %in% rownames(sub_xcorr_mds), ]
sub_dat <- sub_dat[match(rownames(sub_xcorr), sub_dat$new_name), ]

location <- sapply(rownames(sub_xcorr), function(x) sub_dat$Location.for.cluster[sub_dat$new_name ==
    x])
sampling.site <- sapply(rownames(sub_xcorr), function(x) sub_dat$sampling.site[sub_dat$new_name ==
    x])
Code
loc_bi_tri <- as.dist(binary_triangular_matrix(group = location))

samp_bi_tri <- as.dist(binary_triangular_matrix(group = sampling.site))

geo_dist <- dist(sub_dat[, c("Latittude.for.cluster", "Longitude.for.cluster")])

rect_var <- cbind(as.dist(1 - sub_xcorr), geo_dist, loc_bi_tri, samp_bi_tri)

colnames(rect_var) <- c("fourier_xc", "geo_distance", "location",
    "sampling.site")

rect_var <- rect_var[!is.infinite(rect_var[, 1]), ]

xc_mod <- MRM2(formula = fourier_xc ~ geo_distance + location + sampling.site,
    nperm = 10000, mat = rect_var)

saveRDS(xc_mod, "./data/processed/matrix_correlation_fourier_cross_correlation.RDS")
Code
readRDS("./data/processed/matrix_correlation_fourier_cross_correlation.RDS")

6 Simulation

6.1 Simulate data with different patterns of geographic variation in call structure

  • 26 localities (10 individuals each) along a longitudinal range
  • Simulated patterns following Podos & Warren (2007)

Figure from Podos & Warren (2007)
Code
# seed to make it reproducible
set.seed(123)

# number of groups
n_locality <- length(unique(sub_dat$Location.for.cluster))

# number of individuals per group
n_indiv <- table(sub_dat$Location.for.cluster)

# create locality labels
localities <- sample(LETTERS[1:n_locality])

# simulated individuals per group
locality_label <- unlist(sapply(1:length(localities), function(x) rep(localities[x],
    each = n_indiv[x])))

# simulate geographic coordinates along a gradient
lon <- unlist(sapply(1:n_locality, function(x) rep(x, each = n_indiv[x]))) +
    rnorm(n = length(locality_label), sd = 0.1)
lat <- rnorm(n = length(locality_label), sd = 0.1)


# put all together in a data frame
data <- data.frame(locality = locality_label, lat, lon, color = viridis(n_locality)[as.numeric(as.factor(locality_label))])

# add sampling site (broader geographic mozaic)
data <- data[order(data$lon), ]

data$site_num <- 2

for (i in 2:nrow(data)) {

    data$site_num[i] <- if (data$locality[i - 1] == data$locality[i])
        data$site_num[i - 1] else data$site_num[i - 1] + 1

}

# make last location the same as th previous one
data$site_num[data$locality == data$locality[which.max(data$lon)]] <- data$site_num[which.max(data$lon)] -
    1

data$sampling.site <- sample(letters)[floor(data$site_num/2)]

# plot localities along longitude
plot(data[, c("lon", "lat")], col = data$color, cex = 1.8, ylim = range(data$lat) +
    c(-0.2, 0.2), pch = as.numeric(as.factor(data$sampling.site)))

lon_loc <- tapply(data$lon, data$locality, mean)
text(x = lon_loc, y = rep(0.3, n_locality), labels = names(lon_loc),
    cex = 2.5)

abline(v = 1:30 - 0.5)

6.2 Simulate acoustic variation:

  • Clinal: acoustic structure vector increases with longitude
  • Dialect-type: acoustic structure vector similar within a locality but varies randomly between neighnoring localities
    • 2 levels: locality and sampling site
    • Sampling site create by grouping 2 adjacent localities with the same label
  • Random: acoustic structure vector varies randomly between individuals regardless of locality or longitude
Code
# seed to make it reproducible
set.seed(123)

# simulate acoustic structure vector clinal variation
data$clinal <- data$lon + rnorm(n = nrow(data), sd = 0.2)

# dialect type variation
data$dialect <- as.numeric(as.factor(data$locality)) + rnorm(n = nrow(data),
    sd = 0.2)

data$dialect_site <- as.numeric(as.factor(data$sampling.site)) + rnorm(n = nrow(data),
    sd = 0.2)

# random variation
data$random <- rnorm(n = nrow(data), sd = 0.2)

# sort so lines look good in plot
data <- data[order(data$lon), ]

lng_data <- stack(data[, c("clinal", "dialect", "dialect_site", "random")])

lng_data$locality <- data$locality
lng_data$lat <- data$lat
lng_data$lat <- data$lon
lng_data$lon_seq <- 1:nrow(data)

ggplot(lng_data, aes(x = lon_seq, y = values)) + geom_line(color = "#3E4A89FF",
    linewidth = 1.6) + facet_wrap(~ind, scales = "free_y", nrow = 3) +
    theme_classic(base_size = 25) + labs(x = "Longitude", y = "Acoustic feature")

6.3 Stats on simulated data

  • Model:
    \[\begin{align*} Acoustic\ dissimilarity &\sim locality + sampling\ site + geographic\ distance \end{align*}\]

Predict acoustic structure based on geographic distance and locality membership using multiple Regression on distance matrices

Data is z-transformed so all predictors have similar variance and effect sizes are comparable.

Variance for each predictor:

Code
# create distance matrices
loc_member_binary <- as.dist(PhenotypeSpace::binary_triangular_matrix(group = data$locality))
site_member_binary <- as.dist(PhenotypeSpace::binary_triangular_matrix(group = data$sampling.site))
clinal_dist <- dist(data$clinal)
dialect_loc_dist <- dist(data$dialect)
dialect_site_dist <- dist(data$dialect_site)
random_dist <- dist(data$random)
geo_dist <- dist(data[, c("lat", "lon")])

# regression models set data format
rect_var <- cbind(clinal_dist = as.dist(scale(clinal_dist)), dialect_loc_dist = as.dist(scale(dialect_loc_dist)),
    dialect_site_dist = as.dist(scale(dialect_site_dist)), random_dist = as.dist(scale(random_dist)),
    geo_dist = as.dist(scale(geo_dist)), loc_member_binary = loc_member_binary,
    site_member_binary = site_member_binary)

apply(rect_var, 2, var)

6.3.1 Dialect-type variation model at locality level

Code
nperm <- 100

# model predicting dialect variation
(mod_dialect <- MRM2(formula = dialect_loc_dist ~ geo_dist + loc_member_binary +
    site_member_binary, nperm = nperm, mat = rect_var[, c("dialect_loc_dist",
    "geo_dist", "loc_member_binary", "site_member_binary")]))

6.3.2 Dialect-type variation model at sampling site level

Code
nperm <- 100

# model predicting dialect variation
(mod_dialect_site <- MRM2(formula = dialect_site_dist ~ geo_dist +
    loc_member_binary + site_member_binary, nperm = nperm, mat = rect_var[,
    c("dialect_site_dist", "geo_dist", "loc_member_binary", "site_member_binary")]))

6.3.3 Clinal variation model

Code
# model predicting clinal variation
(mod_clinal <- MRM2(formula = clinal_dist ~ geo_dist + loc_member_binary +
    site_member_binary, nperm = nperm, mat = rect_var[, c("clinal_dist",
    "geo_dist", "loc_member_binary", "site_member_binary")]))

6.3.4 Random variation model

Code
# model predicting random variation
(mod_random <- MRM2(formula = random_dist ~ geo_dist + loc_member_binary +
    site_member_binary, nperm = nperm, mat = rect_var[, c("random_dist",
    "geo_dist", "loc_member_binary", "site_member_binary")]))

6.3.5 Combined results

Code
mods <- list(mod_clinal = mod_clinal, mod_dialect = mod_dialect, mod_dialect_site = mod_dialect_site,
    mod_random = mod_random)

names(mods) <- c("Clinal", "Dialect locality", "Dialect site", "Random")

estimates <- do.call(rbind, lapply(seq_along(mods), function(x) {
    Y <- data.frame(mods[[x]]$coef[-1, ])
    Y$rel_coef <- Y[, 1]/max(Y[, 1])
    Y$mod <- names(mods)[x]
    Y$predictor <- rownames(Y)
    names(Y) <- c("coef", "p", "rel_coef", "model", "predictor")
    return(Y)
}))

estimates$rel_coef <- ifelse(estimates$p < 0.05, estimates$rel_coef,
    0)
estimates$signif <- ifelse(estimates$p < 0.05, "p < 0.05", "p >= 0.05")

ggplot(estimates, aes(x = predictor, y = model, fill = rel_coef)) +
    geom_tile() + coord_equal() + scale_fill_gradient2(low = viridis(10)[3],
    high = viridis(10)[7], guide = "none") + geom_text(aes(label = round(coef,
    3), color = signif)) + scale_color_manual(values = c("black",
    "gray")) + labs(x = "", y = "", color = "P value") + theme_classic() +
    theme(axis.text.x = element_text(color = "black", size = 11, angle = 30,
        vjust = 0.8, hjust = 0.8))

6.4 Replicate simulation 100 times

Code
nperm <- 100
reps <- 100
rep_models <- pbreplicate(reps, cl = 1, expr = {
    # number of groups
    n_locality <- length(unique(sub_dat$Location.for.cluster))

    # number of individuals per group
    n_indiv <- table(sub_dat$Location.for.cluster)

    # create locality labels
    localities <- sample(LETTERS[1:n_locality])

    # simulated individuals per group
    locality_label <- unlist(sapply(1:length(localities), function(x) rep(localities[x],
        each = n_indiv[x])))

    # simulate geographic coordinates along a gradient
    lon <- unlist(sapply(1:n_locality, function(x) rep(x, each = n_indiv[x]))) +
        rnorm(n = length(locality_label), sd = 0.1)
    lat <- rnorm(n = length(locality_label), sd = 0.1)

    # locality_label <- localities[n_indivs]

    # put all together in a data frame
    data <- data.frame(locality = locality_label, lat, lon)

    # add sampling site (broader geographic mozaic)
    data <- data[order(data$lon), ]

    data$site_num <- 2

    for (i in 2:nrow(data)) {

        data$site_num[i] <- if (data$locality[i - 1] == data$locality[i])
            data$site_num[i - 1] else data$site_num[i - 1] + 1

    }

    # make last location the same as the previous one
    data$site_num[data$locality == data$locality[which.max(data$lon)]] <- data$site_num[which.max(data$lon)] -
        1

    data$sampling.site <- sample(letters)[floor(data$site_num/2)]

    # simulate acoustic structure vector clinal variation
    data$clinal <- data$lon + rnorm(n = nrow(data), sd = 0.2)

    # dialect type variation locality level
    data$dialect <- as.numeric(as.factor(data$locality)) + rnorm(n = nrow(data),
        sd = 0.2)

    # dialect type variation
    data$dialect_site <- as.numeric(as.factor(data$sampling.site)) +
        rnorm(n = nrow(data), sd = 0.2)

    # random variation
    data$random <- rnorm(n = nrow(data), sd = 0.2)

    loc_member_binary <- as.dist(PhenotypeSpace::binary_triangular_matrix(group = data$locality))
    site_member_binary <- as.dist(PhenotypeSpace::binary_triangular_matrix(group = data$sampling.site))
    clinal_dist <- dist(data$clinal)
    dialect_loc_dist <- dist(data$dialect)
    dialect_site_dist <- dist(data$dialect_site)
    random_dist <- dist(data$random)
    geo_dist <- dist(data[, c("lat", "lon")])

    # regression models set data format
    rect_var <- cbind(clinal_dist = as.dist(scale(clinal_dist)), dialect_loc_dist = as.dist(scale(dialect_loc_dist)),
        dialect_site_dist = as.dist(scale(dialect_site_dist)), random_dist = as.dist(scale(random_dist)),
        geo_dist = as.dist(scale(geo_dist)), loc_member_binary = loc_member_binary,
        site_member_binary = site_member_binary)

    # model predicting dialect variation
    mod_dialect <- MRM2(formula = dialect_loc_dist ~ geo_dist + loc_member_binary +
        site_member_binary, nperm = nperm, mat = rect_var[, c("dialect_loc_dist",
        "geo_dist", "loc_member_binary", "site_member_binary")])

    # model predicting clinal variation
    mod_clinal <- MRM2(formula = clinal_dist ~ geo_dist + loc_member_binary +
        site_member_binary, nperm = nperm, mat = rect_var[, c("clinal_dist",
        "geo_dist", "loc_member_binary", "site_member_binary")])

    # # model predicting dialect site variation
    dialect_site <- MRM2(formula = dialect_site_dist ~ geo_dist +
        loc_member_binary + site_member_binary, nperm = nperm, mat = rect_var[,
        c("dialect_site_dist", "geo_dist", "loc_member_binary", "site_member_binary")])

    # model predicting random variation
    mod_random <- MRM2(formula = random_dist ~ geo_dist + loc_member_binary +
        site_member_binary, nperm = nperm, mat = rect_var[, c("random_dist",
        "geo_dist", "loc_member_binary", "site_member_binary")])

    mods <- list(mod_clinal = mod_clinal, mod_dialect = mod_dialect,
        dialect_site = dialect_site, mod_random = mod_random)

    names(mods) <- c("Clinal", "Dialect locality", "Dialect site",
        "Random")

    estimates <- do.call(rbind, lapply(seq_along(mods), function(x) {
        Y <- data.frame(mods[[x]]$coef[-1, ])
        Y$rel_coef <- Y[, 1]/max(Y[, 1])
        Y$mod <- names(mods)[x]
        Y$predictor <- rownames(Y)
        names(Y) <- c("coef", "p", "rel_coef", "model", "predictor")
        return(Y)
    }))

    estimates
}, simplify = FALSE)

coeffs <- do.call(cbind, lapply(rep_models, function(x) x[, 1]))
ps <- do.call(cbind, lapply(rep_models, function(x) x[, 2]))

estimates <- rep_models[[1]]
estimates$coef <- rowMeans(coeffs)
estimates$sd <- apply(coeffs, 1, sd)
estimates$p <- 1 - (apply(ps, 1, function(x) sum(x < 0.05))/reps)
estimates$rel_coef <- estimates[, 1]/max(estimates[, 1])
estimates$rel_coef <- ifelse(estimates$p < 0.05, estimates$rel_coef,
    0)
estimates$signif <- ifelse(estimates$p < 0.05, "p < 0.05", "p >= 0.05")

estimates$coef_sd <- paste0(round(estimates$coef, 3), "\n(", round(estimates$sd,
    4), ")")

saveRDS(estimates, "./data/processed/estimates_for_replicated_simulation_dialects.RDS")
Code
estimates <- readRDS("./data/processed/estimates_for_replicated_simulation_dialects.RDS")

ggplot(estimates, aes(x = predictor, y = model, fill = rel_coef)) +
    geom_tile() + coord_equal() + scale_fill_gradient2(low = viridis(10)[3],
    high = viridis(10)[7], guide = "none") + geom_text(aes(label = coef_sd,
    color = signif)) + scale_color_manual(values = c("black", "gray")) +
    labs(x = "", y = "", color = "P value") + theme_classic() + theme(axis.text.x = element_text(color = "black",
    size = 11, angle = 30, vjust = 0.8, hjust = 0.8))

6.5 Partial Mantel test

Evaluate association between acoustic dissimilarity and locality membership accounting for the effect of geographic distance

6.5.1 By location

Code
xc_dist <- 1 - sub_xcorr
xc_dist[which(is.infinite(xc_dist))] <- mean(xc_dist[-which(is.infinite(xc_dist))],
    na.rm = TRUE)

mantel_part <- mantel.partial(ydis = xc_dist, xdis = loc_bi_tri, zdis = geo_dist)

saveRDS(mantel_part, "./data/processed/partial_mantel_correlation_cross_correlation.RDS")
Code
readRDS("./data/processed/partial_mantel_correlation_cross_correlation.RDS")

6.5.2 By sampling site

Code
xc_dist <- 1 - sub_xcorr
xc_dist[which(is.infinite(xc_dist))] <- mean(xc_dist[-which(is.infinite(xc_dist))],
    na.rm = TRUE)

mantel_part <- mantel.partial(ydis = xc_dist, xdis = samp_bi_tri,
    zdis = geo_dist)

saveRDS(mantel_part, "./data/processed/partial_mantel_correlation_cross_correlation_by_sampling_site.RDS")
Code
readRDS("./data/processed/partial_mantel_correlation_cross_correlation_by_sampling_site.RDS")

6.6 Mantel correlogram at different distances

Code
geo_vect <- geo_dist[lower.tri(geo_dist)]
geo_vect <- geo_vect[!is.na(geo_vect)]

xc_dist <- 1 - sub_xcorr
xc_dist[which(is.infinite(xc_dist))] <- mean(xc_dist[-which(is.infinite(xc_dist))],
    na.rm = TRUE)


dists <- 1:10

mantelcorrlg <- function(i) {

    classes <- seq(0, max(geo_vect), i)
    # length(classes)

    # Run a mantel correlation on the data
    correl_SPCC <- vegan::mantel.correlog(D.eco = xc_dist, D.geo = geo_dist,
        break.pts = classes, cutoff = FALSE, r.type = "pearson", nperm = 1,
        mult = "holm", progressive = FALSE)

    mantel.res <- correl_SPCC$mantel.res[, 1:3]
    mantel.res <- cbind(mantel.res, break.size = i)

    return(mantel.res)

}

mantel_list <- warbleR:::pblapply_wrblr_int(dists, cl = 1, function(x) try(mantelcorrlg(x),
    silent = TRUE))

mantel_list <- mantel_list[sapply(mantel_list, class) != "try-error"]


mantel_list <- lapply(mantel_list, as.data.frame)

# # Save the correlation as an .RDS file so you don't have to
# run it multiple times in the future
saveRDS(mantel_list, paste0("./data/processed/correl_SPCC_several_distances.RDS"))
Code
mantel_list <- readRDS(paste0("./data/processed/correl_SPCC_several_distances.RDS"))

mantels_df <- as.data.frame(do.call(rbind, mantel_list))

combined_dists <- sort(unique(mantels_df$class.index))

# interpolate
interpol_mantel_list <- lapply(mantel_list, function(x) {

    appx <- approx(x = x$class.index[x$n.dist > 0], y = x$Mantel.cor[x$n.dist >
        0], xout = combined_dists, method = "linear")

    return(appx$y)
})


interpol_mantel_mat <- do.call(cbind, interpol_mantel_list)


interpol_mantel_df <- data.frame(combined_dists, mean.cor = apply(interpol_mantel_mat,
    1, mean, na.rm = TRUE), sd.cor = apply(interpol_mantel_mat, 1,
    sd, na.rm = TRUE))


ggplot(data = interpol_mantel_df, mapping = aes(x = combined_dists,
    y = mean.cor)) + geom_ribbon(data = interpol_mantel_df, aes(ymin = mean.cor -
    sd.cor, ymax = mean.cor + sd.cor), fill = "gray", alpha = 0.3) +
    geom_point(col = viridis(10, alpha = 0.5)[7], size = 2.5) + geom_line(col = viridis(10,
    alpha = 0.5)[7], size = 2) + xlim(c(0, 4)) + ylim(c(-0.025, 0.2)) +
    geom_point(size = 3, color = "transparent") + theme_classic(base_size = 20) +
    labs(x = "Pairwise geographic distance (Degrees)", y = "Correlation coefficient")

6.7 Dialect discrimination with Random Forest

  • Data split in training and testing subsets (75% and 25% respectively)
  • Accuracy measured on test subset

6.7.1 Discriminating locations

Code
xc_dist <- 1 - sub_xcorr
xc_dist[which(is.infinite(xc_dist))] <-
    mean(xc_dist[-which(is.infinite(xc_dist))], na.rm = TRUE)

xcorr_mds <- cmdscale(d = as.dist(xc_dist), k = 10)

xcorr_mds <-
    data.frame(sound.files = rownames(xcorr_mds), xcorr_mds)
xcorr_mds$location <-
    sapply(xcorr_mds$sound.files, function(x)
        sub_dat$Location.for.cluster[sub_dat$new_name == x])

xcorr_mds$location <- as.factor(make.names(xcorr_mds$location,
                                           unique = FALSE, allow_ = TRUE))


# Create data subsets
partition <- createDataPartition(
    y = xcorr_mds$location,
    times = 1,
    p = 0.75,
    list = FALSE
)

trainset <- xcorr_mds[partition,-1]
testset <- xcorr_mds[-partition,-1]

trcontrol <-
    trainControl(
        method = "repeatedcv",
        number = 20,
        savePredictions = TRUE,
        sampling = "down",
        classProbs = TRUE,
        returnResamp = "all"
    )

pred_model <- train(
    location ~ .,
    data = trainset,
    method = "rf",
    trControl = trcontrol,
    metric = "Accuracy",
    preProcess = "BoxCox",
    proximity = TRUE    
)

# save confusion matrix
conf_mat <-
    confusionMatrix(predict(pred_model, testset), testset$location)

conf_df <- as.data.frame(conf_mat$table)

conf_df$total <-
    sapply(conf_df$Reference, function(x)
        sum(testset$location ==
                x))

conf_df$proportion <- conf_df$Freq / conf_df$total

# fit model on complete data set
best_rf_model <- pred_model$finalModel

all_rf_model <- randomForest(
  location ~ .,
  data = xcorr_mds[,-1],  # Your entire dataset
  proximity = TRUE,  # Include proximity matrix
  ntree = best_rf_model$ntree,  # Number of trees
  mtry = best_rf_model$mtry,    # Number of variables tried for splitting at each node
  nodesize = best_rf_model$nodesize,  # Minimum size of terminal nodes
  maxnodes = best_rf_model$maxnodes  # Maximum number of terminal nodes
)

saveRDS(
    list(
        pred_model_bb = pred_model,
        conf_mat_bb = conf_mat,
        confusion_df_bb = conf_df,
        testset_bb = testset,
        all_rf_model = all_rf_model
    ),
    "./data/processed/random_forest_model_results_locations.RDS"
)
Code
rf_model_results <- readRDS("./data/processed/random_forest_model_results_locations.RDS")

# print confusion matrix results
rf_model_results$conf_mat_bb$overall

confusion_df <- rf_model_results$confusion_df_bb

ggplot(confusion_df, aes(x = Reference, y = Prediction, fill = proportion)) +
    geom_tile() + theme_bw() + coord_equal() + scale_fill_distiller(palette = "Greens",
    direction = 1) + geom_text(aes(label = round(proportion, 2)),
    color = "black", size = 3) + theme_classic() + theme(axis.text.x = element_text(color = "black",
    size = 11, angle = 30, vjust = 0.8, hjust = 0.8))

6.7.2 Projecting random forest proximity with UMAP

Code
umap_result <- umap(rf_model_results$all_rf_model$proximity, n_neighbors = 15,
    n_components = 2)

# Create a data frame with the UMAP results
umap_df <- data.frame(UMAP1 = umap_result$layout[, 1], UMAP2 = umap_result$layout[,
    2], location = xcorr_mds$location, sound.files = xcorr_mds$sound.files)

umap_df$pred.locality <- predict(object = rf_model_results$all_rf_model,
    xcorr_mds[, grep("X", names(xcorr_mds))])

write.csv(umap_df, "./data/processed/umap_on_rf_proximity_on_xcorr_dists_localities.csv",
    row.names = FALSE)
Code
umap_loc_df <- read.csv("./data/processed/umap_on_rf_proximity_on_xcorr_dists_localities.csv")

# Create a scatterplot
gg_umap_loc <- ggplot(umap_loc_df, aes(x = UMAP1, y = UMAP2, color = location,
    fill = location)) + geom_point(size = 4) + ylim(c(-7, 7)) + scale_color_viridis_d(alpha = 0.3,
    begin = 0.1, end = 0.8) + scale_fill_viridis_d(alpha = 0.2, begin = 0.1,
    end = 0.8) + theme_classic(base_size = 20) + labs(x = "UMAP1",
    y = "UMAP2", color = "Locality", fill = "Locality")

gg_umap_loc

6.7.3 Projecting random forest proximity with UMAP

Code
umap_result <- umap(rf_model_results$all_rf_model$proximity, n_neighbors = 15,
    n_components = 2)


# Create a data frame with the UMAP results
umap_df <- data.frame(UMAP1 = umap_result$layout[, 1], UMAP2 = umap_result$layout[,
    2], sampling.site = xcorr_mds$sampling.site, sound.files = xcorr_mds$sound.files)

umap_df$pred.site <- predict(object = rf_model_results$all_rf_model,
    xcorr_mds[, grep("X", names(xcorr_mds))])

write.csv(umap_df, "./data/processed/umap_on_rf_proximity_on_xcorr_dists.csv",
    row.names = FALSE)
Code
umap_df <- read.csv("./data/processed/umap_on_rf_proximity_on_xcorr_dists.csv")


umap_df$sampling.site <- factor(umap_df$sampling.site, levels = c("central.coast",
    "southern", "northern", "central.mountains"))
# Create a scatterplot
gg_umap_site <- ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = sampling.site,
    fill = sampling.site, shape = sampling.site)) + geom_point(size = 4) +
    ylim(c(-7, 7)) + scale_color_viridis_d(alpha = 0, begin = 0.1,
    end = 0.8, labels = c("Central Coast", "Central Mountains", "Northern",
        "Southern")) + scale_fill_viridis_d(alpha = 0.25, begin = 0.1,
    end = 0.8, labels = c("Central Coast", "Central Mountains", "Northern",
        "Southern")) + scale_shape_manual(values = 21:24, labels = c("Central Coast",
    "Central Mountains", "Northern", "Southern")) + guides(shape = guide_legend(override.aes = list(size = 5))) +
    theme_classic(base_size = 20) + labs(x = "UMAP1", y = "UMAP2",
    color = "Region", shape = "Region", fill = "Region")


gg_umap_site

6.8 Combined plots

Code
# cowplot::plot_grid(gg_umap_loc, gg_umap_site, nrow = 1,
# rel_widths = c(1.1, 1))
# cowplot::ggsave2('./output/UMAP_scatterplots.png', dpi = 300,
# width = 22, height = 7)

cowplot::plot_grid(gg_umap_loc, gg_umap_site, nrow = 2)

cowplot::ggsave2("./output/UMAP_scatterplots2.png", dpi = 300, width = 17,
    height = 17)

7 Spectrograms of close-to-centroid calls for each sample site

Code
# keep only those correctly predicted
umap_df <- umap_df[umap_df$sampling.site == umap_df$pred.site, ]

# find prepresentative
y <- 96  #37, 50, 57, 47

# for ( i in 1:100){ y <- y + 1
repren_calls_list <- lapply(unique(umap_df$sampling.site), function(x) {
    X <- umap_df[umap_df$sampling.site == x, ]
    centroid <- sapply(X[, c("UMAP1", "UMAP2")], mean)
    X$centroid_dist <- sapply(seq_len(nrow(X)), function(y) dist(rbind(centroid,
        X[y, c("UMAP1", "UMAP2")])))

    set.seed(y)
    out <- X[order(X$centroid_dist, decreasing = FALSE), ][sample(1:30,
        2), ]
})


repren_calls <- do.call(rbind, repren_calls_list)
repren_calls$selec <- 1
repren_calls$start <- rep(0, nrow(repren_calls))
repren_calls$end <- rep(0.5, nrow(repren_calls))

repren_calls <- repren_calls[c(1, 3, 5, 7, 2, 4, 6, 8), ]

lf <- rep(c(0.06, 0.5), each = 4)
rg <- rep(c(0.5, 0.95), each = 4)
horiz <- seq(0.95, 0.075, length.out = 5)
btm <- rep(horiz[-1], 2)
tp <- rep(horiz[-length(horiz)], 2)

m <- cbind(lf, rg, btm, tp)
m <- m[(1:8), ]
lf <- c(rep(0.95, each = 4), 0.06, 0.5, 0, 0)
rg <- c(rep(1, each = 4), 0.5, 0.95, 0.05, 1)
horiz <- seq(0.95, 0.075, length.out = 5)
btm <- c(horiz[-1], 0.95, 0.95, 0.075, 0)
tp <- c(horiz[-length(horiz)], 1, 1, 0.95, 0.075)

m2 <- cbind(lf, rg, btm, tp)
m <- rbind(m, m2)



# png(filename = paste0('./output/spectrograms_by_site', y,
# '.png'), res = 300, width = 4000, height = 3000)

ss <- split.screen(figs = m)

# # testing layout screens for(i in 1:nrow(m)) {screen(i) par(
# mar = rep(0, 4)) plot(0.5, xlim = c(0,1), ylim = c(0,1), type
# = 'n', axes = FALSE, xlab = '', ylab = '', xaxt = 'n', yaxt =
# 'n') box() text(x = 0.5, y = 0.5, labels = i) }
# close.screen(all.screens = T)


ovlp <- 95
colls <- seq(-110, 0, 5)
wl <- 200

lab_bg <- viridis(10, alpha = 0.25)[8]

lab_bg <- "#3A3B39"



# frequency label
screen(15)
par(mar = c(0, 0, 0, 0), new = TRUE)
plot(1, frame.plot = FALSE, type = "n", yaxt = "n", xaxt = "n")
text(x = 0.9, y = 1, "Frequency (kHz)", srt = 90, cex = 1.6)

# time label
screen(16)
par(mar = c(0, 0, 0, 0), new = TRUE)
plot(1, frame.plot = FALSE, type = "n", yaxt = "n", xaxt = "n")
text(x = 1, y = 0.75, "Time (s)", cex = 1.6)

for (i in seq_len(nrow(repren_calls))) {
    # print(i)
    screen(i)
    par(mar = c(0, 0, 0, 0))

    warbleR:::spectro_wrblr_int2(wave = read_wave(X = repren_calls,
        index = i, path = "./data/raw/consolidated_files/"), collevels = colls,
        ovlp = ovlp, wl = wl, flim = c(1, 9), palette = viridis, axisX = FALSE,
        axisY = FALSE, grid = FALSE)

    # add frequency axis
    if (i %in% 1:4)
        axis(2, at = c(seq(2, 10, 2)))

    # add time axis
    if (i %in% c(4, 8))
        axis(1)
}

vlabs <- c("Central\nMountains", "Southern", "Northern", "Central\nCoast")

par(mar = c(0, 0, 0, 0), bg = lab_bg, new = TRUE)
# add vertical labels
for (i in 9:12) {
    screen(i)
    # par(mar = c(0, 0, 0, 0))
    par(mar = c(0, 0, 0, 0), bg = lab_bg, new = TRUE)
    plot(1, frame.plot = FALSE, type = "n", yaxt = "n", xaxt = "n")
    text(x = 1, y = 1, vlabs[i - 8], srt = 270, cex = 1.6, col = "white",
        font = 1)
    box()
}

# dev.off() }

Takeaways

 


 

Session information

R version 4.4.1 (2024-06-14)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 22.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=es_CR.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=es_CR.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=es_CR.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=es_CR.UTF-8 LC_IDENTIFICATION=C       

time zone: America/Costa_Rica
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] Rraven_1.0.13        corrplot_0.92        ohun_1.0.2          
 [4] umap_0.2.10.0        vegan_2.6-8          permute_0.9-7       
 [7] pbapply_1.7-2        caret_6.0-94         lattice_0.22-6      
[10] mlbench_2.1-5        randomForest_4.7-1.1 ggplot2_3.5.1       
[13] warbleR_1.1.32       NatureSounds_1.0.4   seewave_2.2.3       
[16] tuneR_1.4.7          viridis_0.6.5        viridisLite_0.4.2   
[19] formatR_1.14         knitr_1.48          

loaded via a namespace (and not attached):
  [1] DBI_1.2.3            bitops_1.0-9         pROC_1.18.5         
  [4] gridExtra_2.3        remotes_2.5.0        readxl_1.4.3        
  [7] testthat_3.2.1.1     rlang_1.1.4          magrittr_2.0.3      
 [10] e1071_1.7-16         compiler_4.4.1       mgcv_1.9-1          
 [13] png_0.1-8            vctrs_0.6.5          reshape2_1.4.4      
 [16] stringr_1.5.1        pkgconfig_2.0.3      crayon_1.5.3        
 [19] fastmap_1.2.0        backports_1.5.0      labeling_0.4.3      
 [22] utf8_1.2.4           rmarkdown_2.28       prodlim_2024.06.25  
 [25] purrr_1.0.2          xfun_0.48            jsonlite_1.8.9      
 [28] recipes_1.0.10       parallel_4.4.1       cluster_2.1.6       
 [31] R6_2.5.1             RColorBrewer_1.1-3   stringi_1.8.4       
 [34] reticulate_1.39.0    parallelly_1.38.0    rpart_4.1.23        
 [37] cellranger_1.1.0     brio_1.1.5           lubridate_1.9.3     
 [40] Rcpp_1.0.13          iterators_1.0.14     future.apply_1.11.2 
 [43] igraph_2.0.3         Matrix_1.7-0         splines_4.4.1       
 [46] nnet_7.3-19          timechange_0.3.0     tidyselect_1.2.1    
 [49] rstudioapi_0.16.0    yaml_2.3.10          timeDate_4032.109   
 [52] codetools_0.2-20     dtw_1.23-1           listenv_0.9.1       
 [55] tibble_3.2.1         plyr_1.8.9           withr_3.0.1         
 [58] askpass_1.2.1        evaluate_1.0.1       signal_1.8-1        
 [61] sf_1.0-17            future_1.34.0        survival_3.7-0      
 [64] units_0.8-5          sketchy_1.0.3        proxy_0.4-27        
 [67] pillar_1.9.0         KernSmooth_2.23-24   packrat_0.9.2       
 [70] checkmate_2.3.2      foreach_1.5.2        stats4_4.4.1        
 [73] generics_0.1.3       RCurl_1.98-1.16      munsell_0.5.1       
 [76] scales_1.3.0         globals_0.16.3       class_7.3-22        
 [79] glue_1.8.0           tools_4.4.1          xaringanExtra_0.8.0 
 [82] data.table_1.15.4    RSpectra_0.16-1      ModelMetrics_1.2.2.2
 [85] gower_1.0.1          grid_4.4.1           ipred_0.9-14        
 [88] colorspace_2.1-1     nlme_3.1-165         cli_3.6.3           
 [91] fansi_1.0.6          lava_1.8.0           dplyr_1.1.4         
 [94] gtable_0.3.5         fftw_1.0-9           digest_0.6.37       
 [97] classInt_0.4-10      farver_2.1.2         rjson_0.2.23        
[100] htmlwidgets_1.6.4    htmltools_0.5.8.1    lifecycle_1.0.4     
[103] hardhat_1.4.0        openssl_2.2.2        MASS_7.3-61