---
title: Acoustic analysis
subtitle: Cryptic species in Pseudasthenes
author: <a href="https://marce10.github.io/">Marcelo Araya-Salas</a>
date: "`r Sys.Date()`"
toc: true
toc-depth: 2
toc-location: left
number-sections: true
highlight-style: pygments
format:
html:
df-print: kable
code-fold: true
code-tools: true
css: qmd.css
editor_options:
chunk_output_type: console
---
```{r set root directory, echo = FALSE}
knitr::opts_knit$set(root.dir = "..")
```
```{r add link to github repo, echo = FALSE, results='asis'}
# print link to github repo if any
if (file.exists("./.git/config")){
config <- readLines("./.git/config")
url <- grep("url", config, value = TRUE)
url <- gsub("\\turl = |.git$", "", url)
cat("\nSource code and data found at [", url, "](", url, ")", sep = "")
}
```
```{r setup style, echo = FALSE, message = FALSE, warning=FALSE}
# options to customize chunk outputs
knitr::opts_chunk$set(
tidy.opts = list(width.cutoff = 65),
tidy = TRUE,
message = FALSE
)
```
::: {.alert .alert-info}
# Next steps {.unnumbered .unlisted}
- Add new recordings to EST ("Pc_monticola7.wav" "Pc_monticola8.wav")
- Fix importance plot for songs
- Redo stats
:::
<!-- skyblue box -->
<div class="alert alert-info">
# Purpose
- Measure acoustic structure of cape parrot contact calls
- Compare acoustic dissimilarity between individuals from different localities
</div>
<!-- light brown box -->
<div class="alert alert-warning">
# Report overview
- [ Acoustic analysis ](#acoustic-analysis)
- [ Statistical analysis ](#statistical-analysis)
</div>
# Load packages {.unnumbered .unlisted}
```{r load packages}
# knitr is require for creating html/pdf/word reports
# formatR is used for soft-wrapping code
# install/ load packages
sketchy::load_packages(
packages = c(
"knitr",
"formatR",
"viridis",
"warbleR",
# github = "maRce10/PhenotypeSpace",
"ggplot2",
"randomForest",
"mlbench",
"caret",
"pbapply",
"vegan",
"umap",
"maRce10/ohun",
"corrplot",
"Rraven"
)
)
source("./scripts/MRM2.R")
```
## Incorporate new data
```{r}
new_anns <- imp_raven (path = "./data/raw/" , name.from.file = TRUE , ext.case = "lower" , warbler.format = TRUE , all.data = TRUE )
check_sels (new_anns, path = "./data/raw/" )
new_est <- selection_table (new_anns, path = "./data/raw/" , extended = TRUE , by.song = "Senal" )
new_est$ ` Begin File ` <- new_est$ ` Begin Path ` <- new_est$ ` End Path ` <- new_est$ ` End File ` <- new_est$ ` Delta Time (s) ` <- new_est$ ` Peak Freq (Hz) ` <- new_est$ org.file <- NA
new_est$ Poblacion <- "Pseudasthenes cactorum monticola"
new_est$ Localidad <- "Arequipa"
new_est$ ` End File ` <- new_est$ Individuo <- substr (new_est$ sound.files, 0 , 13 )
new_est$ ` Subespecie correct ` <- "Pseudasthenes cactorum monticola"
```
## Format data
```{r, eval = TRUE}
coors_dat <- readxl::read_xlsx("./data/raw/Base_datos_Pseudasthenes_coordenadas.xlsx")
# dat <- readRDS("./data/processed/pool_params_Pseudasthenes.RDS")
est <- readRDS("./data/processed/tabla_extendida_por_Senal_Pseudasthenes_UnidoExcel_v02.RDS")
est$`Subespecie correct`[est$`Subespecie correct` == "Pseudasthenes cactorum cactorum / lachayensis"] <- "Pseudasthenes cactorum lachayensis"
est <- rbind(est, new_est)
est$subspecies <- gsub("Pseudasthenes cactorum ", "", est$`Subespecie correct`)
call_est <- est[grep("Canto", est$Senal, invert = TRUE), ]
song_est <- est[grep("Canto", est$Senal), ]
nrow(call_est)
nrow(song_est)
nrow(est)
# dat <- dat[dat$Localidad != "Ica", c("Localidad", "M.peakf", "M.mindom", "M.maxdom", "M.elm.duration", "M.duration", "I.mindom", "I.maxdom", "I.peakf", "I.elm.duration", "F.peakf", "F.mindom", "F.maxdom", "F.elm.duration", "F.duration", "COMPLETO.peakf", "COMPLETO.dfrange", "COMPLETO.mindom", "COMPLETO.maxdom", "COMPLETO.elm.duration", "COMPLETO.duration" , "Nnotas")]
#
#
# dat$Localidad <- factor(dat$Localidad)
```
# Descriptive stats
## Calls
- Recordings per subspecies
```{r}
as.data.frame (table (call_est$ subspecies[! duplicated (call_est$ ` End File ` )])
)
```
- Calls per subspecies
```{r}
as.data.frame (table (call_est$ subspecies[call_est$ Nota == "COMPLETO" ]))
```
- Recordings per locality
```{r}
as.data.frame (table (call_est$ Localidad[! duplicated (call_est$ ` End File ` )])
)
```
- Calls per localities
```{r}
as.data.frame (table (call_est$ Localidad[call_est$ Nota == "COMPLETO" ]))
```
## Songs
- Recordings per subspecies
```{r}
as.data.frame (table (song_est$ subspecies[! duplicated (song_est$ ` End File ` )])
)
```
- Songs per subspecies
```{r}
as.data.frame (table (song_est$ subspecies[song_est$ Nota == "COMPLETO" ]))
```
- Songs per localities
```{r}
as.data.frame (table (song_est$ Localidad[song_est$ Nota == "COMPLETO" ]))
```
# Acoustic and statistical analyses
::: {.panel-tabset}
## Calls
### Measure acoustic structure
```{r, eval = FALSE}
call_acous_feat <- lapply(unique(call_est$Nota), function(x){
X <- spectro_analysis(call_est[call_est$Nota == x, ], wl = 200, ovlp = 70, parallel = 2)
if (x == "I")
names(X)[-1:-2] <- paste(names(X)[-1:-2], x, sep = "-") else {
X <- X[,-1:-2]
names(X)<- paste(names(X), x, sep = "-")
}
return(X)
})
call_acous_feat_df <- do.call(cbind, call_acous_feat)
call_acous_feat_df$locality <- call_est$Localidad[call_est$Nota == "I"]
call_acous_feat_df$subspecies <- call_est$subspecies[call_est$Nota == "I"]
head(call_acous_feat_df)
saveRDS(call_acous_feat_df, "./data/processed/acoustic_features_calls.RDS")
```
### Random forest
```{r, eval = TRUE}
call_acous_feat_df <- readRDS("./data/processed/acoustic_features_calls.RDS")
# remove collinear
cor_dat <- call_acous_feat_df[,!names(call_acous_feat_df) %in% c("sound.files", "selec", "locality", "subspecies")]
cor_dat <- cor_dat[, !sapply(cor_dat, anyNA)]
cormat <- cor(cor_dat, use = "pairwise.complete.obs")
hc <- findCorrelation(cormat, cutoff=0.9) # putt any value as a "cutoff"
hc <- sort(hc)
cor_dat <- cor_dat[,-c(hc)]
names(cor_dat) <- gsub("-", ".", names(cor_dat))
cor_dat$subspecies <- as.factor(call_acous_feat_df$subspecies)
# Create data subsets
partition <- createDataPartition(
y = cor_dat$subspecies,
times = 1,
p = 0.75,
list = FALSE
)
trainset <- cor_dat[partition,]
testset <- cor_dat[-partition,]
```
```{r, eval = FALSE}
trcontrol <-
trainControl(
method = "repeatedcv",
number = 2000,
repeats = 100,
savePredictions = TRUE,
sampling = "down",
classProbs = TRUE,
returnResamp = "all"
)
pred_model <- train(
subspecies ~ .,
data = trainset,
method = "rf",
trControl = trcontrol,
metric = "Accuracy",
preProcess = "scale"
)
# save confusion matrix
conf_mat <- confusionMatrix(predict(pred_model, testset), factor(testset$subspecies))
conf_df <- as.data.frame(conf_mat$table)
conf_df$total <-
sapply(conf_df$Reference, function(x)
sum(testset$subspecies ==
x))
conf_df$proportion <- conf_df$Freq / conf_df$total
# fit model on complete data set
best_rf_model <- pred_model$finalModel
all_rf_model <- randomForest(
as.factor(subspecies) ~ .,
data = cor_dat, # Your entire dataset
proximity = TRUE, # Include proximity matrix
ntree = best_rf_model$ntree, # Number of trees
mtry = best_rf_model$mtry # Number of variables tried for splitting at each node
# nodesize = best_rf_model$, # Minimum size of terminal nodes
# maxnodes = best_rf_model$maxnodes # Maximum number of terminal nodes
)
rf_model_results <-
list(
pred_model_bb = pred_model,
conf_mat_bb = conf_mat,
confusion_df_bb = conf_df,
testset_bb = testset,
cor_dat = cor_dat,
all_rf_model = all_rf_model
)
saveRDS(
rf_model_results,
"./data/processed/random_forest_model_results_calls.RDS"
)
```
### Checking performance on test data
```{r, eval = TRUE}
rf_model_results <-
readRDS("./data/processed/random_forest_model_results_calls.RDS")
# print confusion matrix results
rf_model_results$conf_mat_bb$overall
confusion_df <- rf_model_results$confusion_df_bb
# confusion_df$Prediction <- gsub(".model|P.tricarunculatus.", "", confusion_df$Prediction)
# confusion_df$Reference <- gsub(".model|P.tricarunculatus.", "", confusion_df$Reference)
ggplot(confusion_df, aes(x = Reference, y = Prediction, fill = proportion)) +
geom_tile() +
coord_equal() +
scale_fill_distiller(palette = "Greens", direction = 1) +
geom_text(aes(label = round(proportion, 2)), color = "black", size = 3) +
theme_classic() +
theme(axis.text.x = element_text(
color = "black",
size = 11,
angle = 30,
vjust = 0.8,
hjust = 0.8
))
```
### Checking performance on the entire data
```{r}
cor_dat <- rf_model_results$ cor_dat
pred_model <- rf_model_results$ pred_model_bb
# USING ALL DATA
conf_mat <- confusionMatrix (predict (pred_model, cor_dat), cor_dat$ subspecies)
conf_df <- as.data.frame (conf_mat$ table)
conf_df$ total <- sapply (conf_df$ Reference, function (x) sum (cor_dat$ subspecies ==
x))
conf_df$ proportion <- conf_df$ Freq/ conf_df$ total
# saveRDS(list(pred_model_bb = pred_model, conf_mat_bb = conf_mat, confusion_df_bb = conf_df,
# testset_bb = testset), "./data/processed/random_forest_model_results.RDS")
rf_model_results_test <- list (pred_model_bb = pred_model, conf_mat_bb = conf_mat, confusion_df_bb = conf_df,
testset_bb = testset)
# print confusion matrix results
rf_model_results_test$ conf_mat_bb$ overall
confusion_df <- rf_model_results_test$ confusion_df_bb
# confusion_df$Prediction <- gsub(".model|P.tricarunculatus.", "", confusion_df$Prediction)
# confusion_df$Reference <- gsub(".model|P.tricarunculatus.", "", confusion_df$Reference)
ggplot (confusion_df, aes (x = Reference, y = Prediction, fill = proportion)) +
geom_tile () +
coord_equal () +
scale_fill_distiller (palette = "Greens" , direction = 1 ) +
geom_text (aes (label = round (proportion, 2 )), color = "black" , size = 3 ) +
theme_classic () +
theme (axis.text.x = element_text (color = "black" ,size = 11 , angle = 30 , vjust = 0.8 , hjust = 0.8 ))
```
### Variable importance
```{r}
# importance
gbmImp <- varImp (pred_model, scale = FALSE )
gbmImp
plot (gbmImp, top = 20 )
```
### Projecting random forest proximity with UMAP
```{r, warning=FALSE, eval = FALSE}
umap_result <- umap(rf_model_results$all_rf_model$proximity, n_neighbors = 15, n_components = 2)
# Create a data frame with the UMAP results
umap_df <- data.frame(UMAP1 = umap_result$layout[,1], UMAP2 = umap_result$layout[,2], location = cor_dat$subspecies)
umap_df$pred.subspecies <- predict(object = rf_model_results$all_rf_model, cor_dat)
write.csv(umap_df, "./data/processed/umap_on_rf_proximity_localities_calls.csv", row.names = FALSE)
```
```{r, warning=FALSE}
umap_loc_df <- read.csv("./data/processed/umap_on_rf_proximity_localities_calls.csv")
# Create a scatterplot
gg_umap_loc <- ggplot(umap_loc_df, aes(x = UMAP1, y = UMAP2, color = location, fill = location)) +
geom_point(size = 4) +
scale_color_viridis_d(alpha = 0.3, begin = 0.1, end = 0.8) +
scale_fill_viridis_d(alpha = 0.2, begin = 0.1, end = 0.8) +
theme_classic(base_size = 20) +
labs(x ="UMAP1", y = "UMAP2", color = "subspecies", fill = "subspecies")
gg_umap_loc
```
#### Projecting random forest proximity with PCA
```{r, warning=FALSE, eval = TRUE}
pca_result <- prcomp(rf_model_results$all_rf_model$proximity)
# Create a data frame with the UMAP results
pca_df <- data.frame(PC1 = pca_result$x[,1], PC2 = pca_result$x[,2] , PC3 = pca_result$x[,3], location = cor_dat$subspecies)
pca_df$pred.subspecies <- predict(object = rf_model_results$all_rf_model, cor_dat)
# Create a scatterplot
gg_pca_loc <- ggplot(pca_df, aes(x = PC1, y = PC2, color = location, fill = location)) +
geom_point(size = 4) +
scale_color_viridis_d(alpha = 0.3, begin = 0.1, end = 0.8) +
scale_fill_viridis_d(alpha = 0.2, begin = 0.1, end = 0.8) +
theme_classic(base_size = 20) +
labs(x ="PC1", y = "PC2", color = "subspecies", fill = "subspecies")
gg_pca_loc
gg_pca_loc2 <- ggplot(pca_df, aes(x = PC1, y = PC3, color = location, fill = location)) +
geom_point(size = 4) +
scale_color_viridis_d(alpha = 0.3, begin = 0.1, end = 0.8) +
scale_fill_viridis_d(alpha = 0.2, begin = 0.1, end = 0.8) +
theme_classic(base_size = 20) +
labs(x ="PC1", y = "PC3", color = "subspecies", fill = "subspecies")
gg_pca_loc2
```
## Songs
### Measure acoustic structure
```{r, eval = FALSE}
song_acous_feat <- lapply(unique(song_est$Nota), function(x){
X <- spectro_analysis(song_est[song_est$Nota == x, ], wl = 200, ovlp = 70, parallel = 2)
if (x == "I")
names(X)[-1:-2] <- paste(names(X)[-1:-2], x, sep = "-") else {
X <- X[,-1:-2]
names(X)<- paste(names(X), x, sep = "-")
}
return(X)
})
song_acous_feat_df <- do.call(cbind, song_acous_feat)
song_acous_feat_df$locality <- song_est$Localidad[song_est$Nota == "I"]
song_acous_feat_df$subspecies <- song_est$subspecies[song_est$Nota == "I"]
head(song_acous_feat_df)
saveRDS(song_acous_feat_df, "./data/processed/acoustic_features_songs.RDS")
```
### Random forest
```{r, eval = TRUE}
song_acous_feat_df <- readRDS("./data/processed/acoustic_features_songs.RDS")
# remove collinear
cor_dat <- song_acous_feat_df[,!names(song_acous_feat_df) %in% c("sound.files", "selec", "locality", "subspecies")]
names(cor_dat) <- gsub("-", ".", names(cor_dat))
cor_dat <- cor_dat[, !sapply(cor_dat, anyNA)]
cormat <- cor(cor_dat, use = "pairwise.complete.obs")
hc <- findCorrelation(cormat, cutoff=0.9) # putt any value as a "cutoff"
hc <- sort(hc)
cor_dat <- cor_dat[,-c(hc)]
head(cor_dat)
cor_dat$subspecies <- as.factor(song_acous_feat_df$subspecies)
table(cor_dat$subspecies)
# Create data subsets
partition <- createDataPartition(
y = cor_dat$subspecies,
times = 1,
p = 0.75,
list = FALSE
)
trainset <- cor_dat[partition,]
testset <- cor_dat[-partition,]
```
```{r, eval = FALSE}
trcontrol <-
trainControl(
method = "repeatedcv",
number = 2000,
repeats = 100,
savePredictions = TRUE,
sampling = "down",
classProbs = TRUE,
returnResamp = "all"
)
pred_model <- train(
subspecies ~ .,
data = trainset,
method = "rf",
trControl = trcontrol,
metric = "Accuracy",
preProcess = "scale"
)
# save confusion matrix
conf_mat <- confusionMatrix(predict(pred_model, testset), factor(testset$subspecies, levels = unique(testset$subspecies)))
conf_df <- as.data.frame(conf_mat$table)
conf_df$total <-
sapply(conf_df$Reference, function(x)
sum(testset$subspecies ==
x))
conf_df$proportion <- conf_df$Freq / conf_df$total
# fit model on complete data set
best_rf_model <- pred_model$finalModel
all_rf_model <- randomForest(
as.factor(subspecies) ~ .,
data = cor_dat, # Your entire dataset
proximity = TRUE, # Include proximity matrix
ntree = best_rf_model$ntree, # Number of trees
mtry = best_rf_model$mtry # Number of variables tried for splitting at each node
# nodesize = best_rf_model$, # Minimum size of terminal nodes
# maxnodes = best_rf_model$maxnodes # Maximum number of terminal nodes
)
rf_model_results <-
list(
pred_model_bb = pred_model,
conf_mat_bb = conf_mat,
confusion_df_bb = conf_df,
testset_bb = testset,
all_rf_model = all_rf_model
)
saveRDS(
rf_model_results,
"./data/processed/random_forest_model_results_songs.RDS"
)
```
### Checking performance on test data
```{r, eval = TRUE}
rf_model_results <-
readRDS("./data/processed/random_forest_model_results_songs.RDS")
# print confusion matrix results
rf_model_results$conf_mat_bb$overall
confusion_df <- rf_model_results$confusion_df_bb
# confusion_df$Prediction <- gsub(".model|P.tricarunculatus.", "", confusion_df$Prediction)
# confusion_df$Reference <- gsub(".model|P.tricarunculatus.", "", confusion_df$Reference)
ggplot(confusion_df, aes(x = Reference, y = Prediction, fill = proportion)) +
geom_tile() +
coord_equal() +
scale_fill_distiller(palette = "Greens", direction = 1) +
geom_text(aes(label = round(proportion, 2)), color = "black", size = 3) +
theme_classic() +
theme(axis.text.x = element_text(
color = "black",
size = 11,
angle = 30,
vjust = 0.8,
hjust = 0.8
))
```
### Checking performance on the entire data
```{r}
pred_model <- rf_model_results$ all_rf_model
# USING ALL DATA
conf_mat <- confusionMatrix (predict (pred_model, cor_dat), cor_dat$ subspecies)
conf_df <- as.data.frame (conf_mat$ table)
conf_df$ total <- sapply (conf_df$ Reference, function (x) sum (cor_dat$ subspecies ==
x))
conf_df$ proportion <- conf_df$ Freq/ conf_df$ total
# saveRDS(list(pred_model_bb = pred_model, conf_mat_bb = conf_mat, confusion_df_bb = conf_df,
# testset_bb = testset), "./data/processed/random_forest_model_results.RDS")
rf_model_results_test <- list (pred_model_bb = pred_model, conf_mat_bb = conf_mat, confusion_df_bb = conf_df,
testset_bb = testset)
# print confusion matrix results
rf_model_results_test$ conf_mat_bb$ overall
confusion_df <- rf_model_results_test$ confusion_df_bb
# confusion_df$Prediction <- gsub(".model|P.tricarunculatus.", "", confusion_df$Prediction)
# confusion_df$Reference <- gsub(".model|P.tricarunculatus.", "", confusion_df$Reference)
ggplot (confusion_df, aes (x = Reference, y = Prediction, fill = proportion)) +
geom_tile () +
coord_equal () +
scale_fill_distiller (palette = "Greens" , direction = 1 ) +
geom_text (aes (label = round (proportion, 2 )), color = "black" , size = 3 ) +
theme_classic () +
theme (axis.text.x = element_text (color = "black" ,size = 11 , angle = 30 , vjust = 0.8 , hjust = 0.8 ))
```
### Variable importance
```{r}
# importance
gbmImp <- varImp (pred_model, scale = FALSE )
gbmImp
plot (gbmImp, top = 20 )
```
### Projecting random forest proximity with UMAP
```{r song umap, warning=FALSE, eval = FALSE}
umap_result <- umap(rf_model_results$all_rf_model$proximity, n_neighbors = 15, n_components = 2)
# Create a data frame with the UMAP results
umap_df <- data.frame(UMAP1 = umap_result$layout[,1], UMAP2 = umap_result$layout[,2], location = cor_dat$subspecies)
umap_df$pred.subspecies <- predict(object = rf_model_results$all_rf_model, cor_dat)
write.csv(umap_df, "./data/processed/umap_on_rf_proximity_localities_songs.csv", row.names = FALSE)
```
```{r, warning=FALSE}
umap_loc_df <- read.csv("./data/processed/umap_on_rf_proximity_localities_songs.csv")
# Create a scatterplot
gg_umap_loc <- ggplot(umap_loc_df, aes(x = UMAP1, y = UMAP2, color = location, fill = location)) +
geom_point(size = 4) +
scale_color_viridis_d(alpha = 0.3, begin = 0.1, end = 0.8) +
scale_fill_viridis_d(alpha = 0.2, begin = 0.1, end = 0.8) +
theme_classic(base_size = 20) +
labs(x ="UMAP1", y = "UMAP2", color = "subspecies", fill = "subspecies")
gg_umap_loc
```
### Projecting random forest proximity with PCA
```{r, warning=FALSE, eval = TRUE}
pca_result <- prcomp(rf_model_results$all_rf_model$proximity)
# Create a data frame with the UMAP results
pca_df <- data.frame(PC1 = pca_result$x[,1], PC2 = pca_result$x[,2], location = cor_dat$subspecies)
pca_df$pred.subspecies <- predict(object = rf_model_results$all_rf_model, cor_dat)
# Create a scatterplot
gg_pca_loc <- ggplot(pca_df, aes(x = PC1, y = PC2, color = location, fill = location)) +
geom_point(size = 4) +
scale_color_viridis_d(alpha = 0.3, begin = 0.1, end = 0.8) +
scale_fill_viridis_d(alpha = 0.2, begin = 0.1, end = 0.8) +
theme_classic(base_size = 20) +
labs(x ="PC1", y = "PC2", color = "subspecies", fill = "subspecies")
gg_pca_loc
```
:::
```{r, eval = FALSE}
est <- readRDS("./data/processed/tabla_extendida_por_Senal_Pseudasthenes_UnidoExcel.RDS")
est <- resample_est(est, pb = FALSE)
call_est <- est[grep("llamado", est$Senal, ignore.case = TRUE), ]
# spectrograms(call_est, ovlp = 90, flim = c(1, 10), collevels = seq(-110, 0, 5), pal = reverse.gray.colors.1, dest.path = "./data/processed/spectrograms/calls", by.song = "sound.files", xl = 3, wl = 200, parallel = 20, sel.labels = "Nota")
song_est <- est[grep("Canto", est$Senal, ignore.case = TRUE), ]
# spectrograms(song_est, ovlp = 90, flim = c(1, 18), collevels = seq(-110, 0, 5), pal = reverse.gray.colors.1, dest.path = "./data/processed/spectrograms/songs", by.song = "sound.files", xl = 5, wl = 200, parallel = 20, sel.labels = "Nota")
#sample size
#call
table(call_est$Localidad[!duplicated(call_est$org.file)])
# calls per individual and site
table(call_est$Localidad[!duplicated(call_est$sound.files)], call_est$org.file[!duplicated(call_est$sound.files)])
# song
table(song_est$Localidad[!duplicated(song_est$org.file)])
# songs per individual and site
table(song_est$Localidad[!duplicated(song_est$sound.files)], song_est$org.file[!duplicated(song_est$sound.files)])
# get med element
m_call_est <- call_est[call_est$Nota == "M", ]
xc_m_call <- cross_correlation(m_call_est, wl = 200, ovlp = 70)
mds_m_call <- cmdscale(1 - xc_m_call, k = 2)
# medidas
# - cross-correlation between M notes, F notes and I notes,
# - difference in peak frequency between first and last note
# - number of notes per call/song
# - duration
# - mean element values for acoustic features divided by note type (I, M, F)
```
```{r, eval = FALSE}
# coordinates
coors <- readxl::read_excel("./data/processed/Base_datos_Pseudasthenes_coordenadas.xlsx")
cll_elms_est <- call_est[call_est$Nota != "COMPLETO", ]
cll_i_est <- call_est[call_est$Nota == "I", ]
out <- lapply(c("I", "M", "F", "COMPLETO"), function(x){
cll_elms_sp <- spectro_analysis(call_est[call_est$Nota == x, ] )
names(cll_elms_sp)[-c(1, 2)] <- paste0(names(cll_elms_sp)[-c(1, 2)],"-", x)
return(cll_elms_sp)
})
cll_sp <- cbind(out[[1]], out[[2]][,-c(1, 2)], out[[3]][,-c(1, 2)], dur_complete = out[[4]]$`duration-COMPLETO`)
cll_sp$diff_IF <- cll_sp$`meanpeakf-I` - cll_sp$`meanpeakf-F`
# remove 0 var parameters
cll_sp <- cll_sp[,!names(cll_sp) %in% names(which(sapply(cll_sp[, -c(1, 2)], sd) == 0))]
pca <- prcomp(cll_sp[, -c(1, 2)], scale. = TRUE)
cll_sp$pc1 <- pca$x[,1]
cll_sp$pc2 <- pca$x[,2]
cll_sp$org.sound.file <- sapply(strsplit(cll_sp$sound.files, ".wav"), "[[", 1)
coors$file <- gsub(".wav$", "", coors$`Begin File`)
cll_sp$lat <- sapply(cll_sp$org.sound.file, function(x) coors$Latitud[coors$file == x][1])
cll_sp$lon <- sapply(cll_sp$org.sound.file, function(x) coors$Longitud[coors$file == x][1])
cll_sp$pob <- sapply(cll_sp$org.sound.file, function(x) coors$Poblacion[coors$file == x][1])
cll_sp$locality <- sapply(cll_sp$org.sound.file, function(x) coors$Localidad[coors$file == x][1])
subsp_member_binary <- as.dist(PhenotypeSpace::binary_triangular_matrix(group = cll_sp$pob))
loc_member_binary <- as.dist(PhenotypeSpace::binary_triangular_matrix(group = cll_sp$locality))
acu_dist <- dist(cll_sp[, c("pc1", "pc2")])
geo_dist <- dist(cll_sp[, c("lat", "lon")])
# regression models set data format
rect_var <- cbind(acu_dist = as.dist(scale(acu_dist)), geo_dist = as.dist(scale(geo_dist)), loc_member_binary = loc_member_binary, subsp_member_binary = subsp_member_binary)
rect_var <- cbind(rect_var, res = residuals(lm(rect_var[, "loc_member_binary"] ~ rect_var[, "geo_dist"]))
)
head(rect_var)
apply(rect_var, 2, var)
nperm <- 1000
# (mod_dialect <- MRM2(formula = acu_dist ~ loc_member_binary + geo_dist + subsp_member_binary, nperm = nperm, mat = rect_var[, c("acu_dist","loc_member_binary", "geo_dist", "subsp_member_binary")]))
# (mod_dialect <- MRM2(formula = acu_dist ~ loc_member_binary + geo_dist, nperm = nperm, mat = rect_var[, c("acu_dist","loc_member_binary", "geo_dist")]))
(mod_dialect <- MRM2(formula = acu_dist ~ loc_member_binary , nperm = nperm, mat = rect_var[, c("acu_dist", "loc_member_binary")]))
# (mod_dialect <- MRM2(formula = acu_dist ~ subsp_member_binary , nperm = nperm, mat = rect_var[, c("acu_dist", "subsp_member_binary")]))
# (mod_dialect <- MRM2(formula = acu_dist ~ geo_dist, nperm = nperm, mat = rect_var[, c("acu_dist", "geo_dist")]))
# cor(rect_var)
# corrplot.mixed(cor(rect_var), upper= "ellipse")
# (mod_dialect <- MRM2(formula = acu_dist ~ geo_dist, nperm = nperm, mat = rect_var[, c("acu_dist", "geo_dist")]))
# (mod_dialect <- MRM2(formula = acu_dist ~ res, nperm = nperm, mat = rect_var[, c("acu_dist", "res")]))
```
# Statistical analysis
Two approaches:
- [ Multiple Regression on distance Matrices ](https://search.r-project.org/CRAN/refmans/ecodist/html/MRM.html)
- Partial Mantel test
## Multiple Regression on distance Matrices
- Model:
\begin{align*}
Acoustic\ dissimilarity &\sim locality + sampling\ site + geographic\ distance
\end{align*}
- Response values scaled to make effect sizes comparable across models
- Locality was coded as pairwise binary matrix in which 0 means that calls in a dyad belong to the same locality and 1 means calls belong to different locality
```{r, eval = FALSE}
xcorr <- readRDS("./data/processed/cross_correlation_matrix.RDS")
xcorr_mds <- readRDS("./data/processed/cross_correlation_MDS.RDS")
dat <- read.csv("./data/raw/consolidated_sound_files_CPV_contact_calls_CURATED.csv")
# add data from second location
dat_loc2 <- read.csv("./data/raw/consolidated_sound_files_CPV_contact_calls _UPDATED.csv")
dat <- dat[grepl("cape", dat$species, ignore.case = T) & !is.na(dat$Location.for.cluster) & !is.na(dat$Longitude.for.cluster) & !is.na(dat$Latittude.for.cluster),]
dat$sampling.site <- sapply(dat$new_name, function(x) dat_loc2$Sampling.sites[dat_loc2$New_Name == x])
dat$species <- "Cape parrot"
sub_xcorr <- xcorr[rownames(xcorr) %in% dat$new_name, colnames(xcorr) %in% dat$new_name]
sub_xcorr_mds <- xcorr_mds[rownames(xcorr_mds) %in% dat$new_name, ]
sub_dat <- dat[dat$new_name %in% rownames(sub_xcorr_mds), ]
sub_dat <- sub_dat[match(rownames(sub_xcorr), sub_dat$new_name), ]
location <- sapply(rownames(sub_xcorr), function(x) sub_dat$Location.for.cluster[sub_dat$new_name == x])
sampling.site <- sapply(rownames(sub_xcorr), function(x) sub_dat$sampling.site[sub_dat$new_name == x])
```
```{r, eval = FALSE}
loc_bi_tri <- as.dist(binary_triangular_matrix(group = location))
samp_bi_tri <- as.dist(binary_triangular_matrix(group = sampling.site))
geo_dist <- dist(sub_dat[ , c("Latittude.for.cluster","Longitude.for.cluster")])
rect_var <- cbind(as.dist(1 - sub_xcorr), geo_dist, loc_bi_tri, samp_bi_tri)
colnames(rect_var) <- c("fourier_xc", "geo_distance", "location", "sampling.site")
rect_var <- rect_var[!is.infinite(rect_var[, 1]), ]
xc_mod <- MRM2(formula = fourier_xc ~ geo_distance + location + sampling.site, nperm = 10000, mat = rect_var)
saveRDS(xc_mod, "./data/processed/matrix_correlation_fourier_cross_correlation.RDS")
```
```{r, eval = FALSE}
readRDS("./data/processed/matrix_correlation_fourier_cross_correlation.RDS")
```
# Simulation
## Simulate data with different patterns of geographic variation in call structure
- 26 localities (10 individuals each) along a longitudinal range
- Simulated patterns following [ Podos & Warren (2007) ](https://d1wqtxts1xzle7.cloudfront.net/53993414/The_Evolution_of_Geographic_Variation_in20170727-7403-7je2az-libre.pdf?1501183774=&response-content-disposition=inline%3B+filename%3DThe_Evolution_of_Geographic_Variation_in.pdf&Expires=1689718560&Signature=c9HxG2TqaqonkjDKlotzx1jrLJl25sX1V2Jj0DpDFTtDNsSi7ykNv~Ii4XFFe-yEh4fUF6HX0MYzT9ImbUJykmwIJXumf8VELyHp5tj5QYjR9FmpCftHXf1JdwBh-vCm2zaQ2VOh~iaHcMqTqjnDQR1W3FMQDU1dVeduLpkSma~8eLQ15e-SAHPYVwvSu-R1a2Q9KDibQyq84MbEFbLgbz9T3eBxZpsUAnxrGUrlHWo9CtbBEMtORbh9id4KG-RaAVyA38v4FtS8qofXX9zpcZNQILVGkhIwXeCbYkRVBbC9w5OieTVvIUQBXxEwZltgQ~ZWzbkzI1~wmHRnkbhYzQ__&Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA)

```{r, fig.cap="Geographic location of individuals/localities in the simulated data set. Letters highlight locality ids", out.width="%100", eval = FALSE}
# seed to make it reproducible
set.seed(123)
# number of groups
n_locality <- length(unique(sub_dat$Location.for.cluster))
# number of individuals per group
n_indiv <- table(sub_dat$Location.for.cluster)
# create locality labels
localities <- sample(LETTERS[1:n_locality])
# simulated individuals per group
locality_label <- unlist(sapply(1:length(localities), function(x) rep(localities[x], each = n_indiv[x])))
# simulate geographic coordinates along a gradient
lon <- unlist(sapply(1:n_locality, function(x) rep(x, each = n_indiv[x]))) + rnorm(n = length(locality_label), sd = 0.1)
lat <- rnorm(n = length(locality_label), sd = 0.1)
# put all together in a data frame
data <- data.frame(locality = locality_label, lat, lon, color = viridis(n_locality)[as.numeric(as.factor(locality_label))])
# add sampling site (broader geographic mozaic)
data <- data[order(data$lon), ]
data$site_num <- 2
for(i in 2:nrow(data)){
data$site_num[i] <- if (data$locality[i-1] == data$locality[i]) data$site_num[i - 1] else data$site_num[i - 1] + 1
}
# make last location the same as th previous one
data$site_num[data$locality == data$locality[which.max(data$lon)]] <- data$site_num[which.max(data$lon)] - 1
data$sampling.site <- sample(letters)[floor(data$site_num / 2)]
# plot localities along longitude
plot(data[, c("lon", "lat")], col = data$color, cex = 1.8, ylim = range(data$lat) + c(-.2, .2), pch = as.numeric(as.factor(data$sampling.site)))
lon_loc <- tapply(data$lon, data$locality, mean)
text(x = lon_loc, y = rep(0.3, n_locality), labels = names(lon_loc), cex = 2.5)
abline(v= 1:30 - 0.5)
```
## Simulate acoustic variation:
- Clinal: acoustic structure vector increases with longitude
- Dialect-type: acoustic structure vector similar within a locality but varies randomly between neighnoring localities
- 2 levels: locality and sampling site
- Sampling site create by grouping 2 adjacent localities with the same label
- Random: acoustic structure vector varies randomly between individuals regardless of locality or longitude
```{r, fig.cap="Types of vocal geographic variation that were simulated", out.height="100%", eval = FALSE}
# seed to make it reproducible
set.seed(123)
# simulate acoustic structure vector
# clinal variation
data$clinal <- data$lon + rnorm(n = nrow(data), sd = 0.2)
# dialect type variation
data$dialect <- as.numeric(as.factor(data$locality)) + rnorm(n = nrow(data), sd = 0.2)
data$dialect_site <- as.numeric(as.factor(data$sampling.site)) + rnorm(n = nrow(data), sd = 0.2)
# random variation
data$random <- rnorm(n = nrow(data), sd = 0.2)
# sort so lines look good in plot
data <- data[order(data$lon), ]
lng_data <- stack(data[, c("clinal", "dialect", "dialect_site", "random")])
lng_data$locality <- data$locality
lng_data$lat <- data$lat
lng_data$lat <- data$lon
lng_data$lon_seq <- 1:nrow(data)
ggplot(lng_data, aes(x = lon_seq, y = values)) +
geom_line(color = "#3E4A89FF", linewidth = 1.6) +
facet_wrap(~ ind, scales = "free_y", nrow = 3) +
theme_classic(base_size = 25) +
labs(x = "Longitude", y = "Acoustic feature")
```
## Stats on simulated data
- Model:
\begin{align*}
Acoustic\ dissimilarity &\sim locality + sampling\ site + geographic\ distance
\end{align*}
Predict acoustic structure based on geographic distance and locality membership using multiple Regression on distance matrices
Data is z-transformed so all predictors have similar variance and effect sizes are comparable.
Variance for each predictor:
```{r, eval = FALSE}
# create distance matrices
loc_member_binary <- as.dist(PhenotypeSpace::binary_triangular_matrix(group = data$locality))
site_member_binary <- as.dist(PhenotypeSpace::binary_triangular_matrix(group = data$sampling.site))
clinal_dist <- dist(data$clinal)
dialect_loc_dist <- dist(data$dialect)
dialect_site_dist <- dist(data$dialect_site)
random_dist <- dist(data$random)
geo_dist <- dist(data[, c("lat", "lon")])
# regression models
# set data format
rect_var <- cbind(clinal_dist = as.dist(scale(clinal_dist)), dialect_loc_dist = as.dist(scale(dialect_loc_dist)), dialect_site_dist = as.dist(scale(dialect_site_dist)), random_dist = as.dist(scale(random_dist)), geo_dist = as.dist(scale(geo_dist)), loc_member_binary = loc_member_binary, site_member_binary = site_member_binary)
apply(rect_var,2, var)
```
### Dialect-type variation model at locality level
```{r, eval = FALSE}
nperm <- 100
# model predicting dialect variation
(mod_dialect <- MRM2(formula = dialect_loc_dist ~ geo_dist + loc_member_binary + site_member_binary, nperm = nperm, mat = rect_var[, c("dialect_loc_dist", "geo_dist", "loc_member_binary", "site_member_binary")]))
```
### Dialect-type variation model at sampling site level
```{r, eval = FALSE}
nperm <- 100
# model predicting dialect variation
(mod_dialect_site <- MRM2(formula = dialect_site_dist ~ geo_dist + loc_member_binary + site_member_binary, nperm = nperm, mat = rect_var[, c("dialect_site_dist", "geo_dist", "loc_member_binary", "site_member_binary")]))
```
### Clinal variation model
```{r, eval = FALSE}
# model predicting clinal variation
(mod_clinal <- MRM2(formula = clinal_dist ~ geo_dist + loc_member_binary + site_member_binary, nperm = nperm, mat = rect_var[, c("clinal_dist", "geo_dist", "loc_member_binary", "site_member_binary")]))
```
### Random variation model
```{r, eval = FALSE}
# model predicting random variation
(mod_random <- MRM2(formula = random_dist ~ geo_dist + loc_member_binary + site_member_binary, nperm = nperm, mat = rect_var[, c("random_dist", "geo_dist", "loc_member_binary", "site_member_binary")]))
```
### Combined results
```{r, eval = FALSE}
mods <- list(mod_clinal = mod_clinal, mod_dialect = mod_dialect, mod_dialect_site = mod_dialect_site, mod_random = mod_random)
names(mods) <- c("Clinal", "Dialect locality", "Dialect site", "Random")
estimates <- do.call(rbind, lapply(seq_along(mods), function(x) {
Y <- data.frame(mods[[x]]$coef[-1, ])
Y$rel_coef <- Y[, 1]/max(Y[, 1])
Y$mod <- names(mods)[x]
Y$predictor <- rownames(Y)
names(Y) <- c("coef", "p", "rel_coef", "model", "predictor")
return(Y)
}))
estimates$rel_coef <- ifelse(estimates$p < 0.05, estimates$rel_coef,
0)
estimates$signif <- ifelse(estimates$p < 0.05, "p < 0.05", "p >= 0.05")
ggplot(estimates, aes(x = predictor, y = model, fill = rel_coef)) +
geom_tile() + coord_equal() + scale_fill_gradient2(low = viridis(10)[3],
high = viridis(10)[7], guide = "none") + geom_text(aes(label = round(coef,
3), color = signif)) + scale_color_manual(values = c("black",
"gray")) + labs(x = "", y = "", color = "P value") + theme_classic() +
theme(axis.text.x = element_text(color = "black", size = 11, angle = 30,
vjust = 0.8, hjust = 0.8))
```
## Replicate simulation 100 times
```{r, eval = FALSE}
nperm <- 100
reps <- 100
rep_models <- pbreplicate(reps, cl = 1, expr = {
# number of groups
n_locality <- length(unique(sub_dat$Location.for.cluster))
# number of individuals per group
n_indiv <- table(sub_dat$Location.for.cluster)
# create locality labels
localities <- sample(LETTERS[1:n_locality])
# simulated individuals per group
locality_label <-
unlist(sapply(1:length(localities), function(x)
rep(localities[x], each = n_indiv[x])))
# simulate geographic coordinates along a gradient
lon <-
unlist(sapply(1:n_locality, function(x)
rep(x, each = n_indiv[x]))) + rnorm(n = length(locality_label), sd = 0.1)
lat <- rnorm(n = length(locality_label), sd = 0.1)
# locality_label <- localities[n_indivs]
# put all together in a data frame
data <- data.frame(locality = locality_label, lat, lon)
# add sampling site (broader geographic mozaic)
data <- data[order(data$lon), ]
data$site_num <- 2
for(i in 2:nrow(data)){
data$site_num[i] <- if (data$locality[i-1] == data$locality[i]) data$site_num[i - 1] else data$site_num[i - 1] + 1
}
# make last location the same as the previous one
data$site_num[data$locality == data$locality[which.max(data$lon)]] <- data$site_num[which.max(data$lon)] - 1
data$sampling.site <- sample(letters)[floor(data$site_num / 2)]
# simulate acoustic structure vector
# clinal variation
data$clinal <- data$lon + rnorm(n = nrow(data), sd = 0.2)
# dialect type variation locality level
data$dialect <-
as.numeric(as.factor(data$locality)) + rnorm(n = nrow(data), sd = 0.2)
# dialect type variation
data$dialect_site <-
as.numeric(as.factor(data$sampling.site)) + rnorm(n = nrow(data), sd = 0.2)
# random variation
data$random <- rnorm(n = nrow(data), sd = 0.2)
loc_member_binary <- as.dist(PhenotypeSpace::binary_triangular_matrix(group = data$locality))
site_member_binary <- as.dist(PhenotypeSpace::binary_triangular_matrix(group = data$sampling.site))
clinal_dist <- dist(data$clinal)
dialect_loc_dist <- dist(data$dialect)
dialect_site_dist <- dist(data$dialect_site)
random_dist <- dist(data$random)
geo_dist <- dist(data[, c("lat", "lon")])
# regression models
# set data format
rect_var <- cbind(clinal_dist = as.dist(scale(clinal_dist)), dialect_loc_dist = as.dist(scale(dialect_loc_dist)), dialect_site_dist = as.dist(scale(dialect_site_dist)), random_dist = as.dist(scale(random_dist)), geo_dist = as.dist(scale(geo_dist)), loc_member_binary = loc_member_binary, site_member_binary = site_member_binary)
# model predicting dialect variation
mod_dialect <-
MRM2(
formula = dialect_loc_dist ~ geo_dist + loc_member_binary + site_member_binary,
nperm = nperm,
mat = rect_var[, c("dialect_loc_dist", "geo_dist", "loc_member_binary", "site_member_binary")]
)
# model predicting clinal variation
mod_clinal <-
MRM2(
formula = clinal_dist ~ geo_dist + loc_member_binary + site_member_binary,
nperm = nperm,
mat = rect_var[, c("clinal_dist", "geo_dist", "loc_member_binary", "site_member_binary")]
)
# # model predicting dialect site variation
dialect_site <-
MRM2(
formula = dialect_site_dist ~ geo_dist + loc_member_binary + site_member_binary,
nperm = nperm,
mat = rect_var[, c("dialect_site_dist", "geo_dist", "loc_member_binary", "site_member_binary")]
)
# model predicting random variation
mod_random <-
MRM2(
formula = random_dist ~ geo_dist + loc_member_binary + site_member_binary,
nperm = nperm,
mat = rect_var[, c("random_dist", "geo_dist", "loc_member_binary", "site_member_binary")]
)
mods <-
list(
mod_clinal = mod_clinal,
mod_dialect = mod_dialect,
dialect_site = dialect_site,
mod_random = mod_random
)
names(mods) <-
c("Clinal", "Dialect locality", "Dialect site", "Random")
estimates <-
do.call(rbind, lapply(seq_along(mods), function(x) {
Y <- data.frame(mods[[x]]$coef[-1, ])
Y$rel_coef <- Y[, 1] / max(Y[, 1])
Y$mod <- names(mods)[x]
Y$predictor <- rownames(Y)
names(Y) <- c("coef", "p", "rel_coef", "model", "predictor")
return(Y)
}))
estimates
}, simplify = FALSE)
coeffs <- do.call(cbind, lapply(rep_models, function(x)
x[, 1]))
ps <- do.call(cbind, lapply(rep_models, function(x)
x[, 2]))
estimates <- rep_models[[1]]
estimates$coef <- rowMeans(coeffs)
estimates$sd <- apply(coeffs, 1, sd)
estimates$p <- 1 - (apply(ps, 1, function(x) sum(x < 0.05)) / reps)
estimates$rel_coef <- estimates[, 1] / max(estimates[, 1])
estimates$rel_coef <- ifelse(estimates$p < 0.05, estimates$rel_coef,
0)
estimates$signif <-
ifelse(estimates$p < 0.05, "p < 0.05", "p >= 0.05")
estimates$coef_sd <- paste0(round(estimates$coef, 3), "\n(", round(estimates$sd, 4), ")")
saveRDS(estimates,
"./data/processed/estimates_for_replicated_simulation_dialects.RDS")
```
```{r, fig.cap="Mean estimates (and SD) of replicated simulation regression results. P values ", eval = FALSE}
estimates <- readRDS("./data/processed/estimates_for_replicated_simulation_dialects.RDS")
ggplot(estimates, aes(x = predictor, y = model, fill = rel_coef)) +
geom_tile() + coord_equal() + scale_fill_gradient2(low = viridis(10)[3],
high = viridis(10)[7], guide = "none") + geom_text(aes(label = coef_sd, color = signif)) + scale_color_manual(values = c("black",
"gray")) + labs(x = "", y = "", color = "P value") + theme_classic() +
theme(axis.text.x = element_text(color = "black", size = 11, angle = 30,
vjust = 0.8, hjust = 0.8))
```
## Partial Mantel test
Evaluate association between acoustic dissimilarity and locality membership accounting for the effect of geographic distance
### By location
```{r, eval = FALSE}
xc_dist <- 1 - sub_xcorr
xc_dist[which(is.infinite(xc_dist))] <- mean(xc_dist[-which(is.infinite(xc_dist))], na.rm = TRUE)
mantel_part <- mantel.partial(ydis = xc_dist, xdis = loc_bi_tri, zdis = geo_dist)
saveRDS(mantel_part, "./data/processed/partial_mantel_correlation_cross_correlation.RDS")
```
```{r, eval = FALSE}
readRDS("./data/processed/partial_mantel_correlation_cross_correlation.RDS")
```
### By sampling site
```{r, eval = FALSE}
xc_dist <- 1 - sub_xcorr
xc_dist[which(is.infinite(xc_dist))] <- mean(xc_dist[-which(is.infinite(xc_dist))], na.rm = TRUE)
mantel_part <- mantel.partial(ydis = xc_dist, xdis = samp_bi_tri, zdis = geo_dist)
saveRDS(mantel_part, "./data/processed/partial_mantel_correlation_cross_correlation_by_sampling_site.RDS")
```
```{r, eval = FALSE}
readRDS("./data/processed/partial_mantel_correlation_cross_correlation_by_sampling_site.RDS")
```
## Mantel correlogram at different distances
```{r, eval = FALSE}
geo_vect <- geo_dist[lower.tri(geo_dist)]
geo_vect <- geo_vect[!is.na(geo_vect)]
xc_dist <- 1 - sub_xcorr
xc_dist[which(is.infinite(xc_dist))] <- mean(xc_dist[-which(is.infinite(xc_dist))], na.rm = TRUE)
dists <- 1:10
mantelcorrlg <- function(i) {
classes <- seq(0, max(geo_vect), i)
# length(classes)
# Run a mantel correlation on the data
correl_SPCC <- vegan::mantel.correlog(D.eco = xc_dist, D.geo = geo_dist,
break.pts = classes, cutoff = FALSE, r.type = "pearson", nperm = 1,
mult = "holm", progressive = FALSE)
mantel.res <- correl_SPCC$mantel.res[, 1:3]
mantel.res <- cbind(mantel.res, break.size = i)
return(mantel.res)
}
mantel_list <- warbleR:::pblapply_wrblr_int(dists, cl = 1, function(x) try(mantelcorrlg(x), silent = TRUE))
mantel_list <- mantel_list[sapply(mantel_list, class) != "try-error"]
mantel_list <- lapply(mantel_list, as.data.frame)
# # Save the correlation as an .RDS file so you don't have to
# run it multiple times in the future
saveRDS(mantel_list, paste0("./data/processed/correl_SPCC_several_distances.RDS"))
```
```{r, warning=FALSE, eval = FALSE}
mantel_list <- readRDS(paste0("./data/processed/correl_SPCC_several_distances.RDS"))
mantels_df <- as.data.frame(do.call(rbind, mantel_list))
combined_dists <- sort(unique(mantels_df$class.index))
# interpolate
interpol_mantel_list <- lapply(mantel_list, function(x) {
appx <- approx(x = x$class.index[x$n.dist > 0], y = x$Mantel.cor[x$n.dist >
0], xout = combined_dists, method = "linear")
return(appx$y)
})
interpol_mantel_mat <- do.call(cbind, interpol_mantel_list)
interpol_mantel_df <- data.frame(combined_dists, mean.cor = apply(interpol_mantel_mat,
1, mean, na.rm = TRUE), sd.cor = apply(interpol_mantel_mat, 1,
sd, na.rm = TRUE))
ggplot(data = interpol_mantel_df, mapping = aes(x = combined_dists,
y = mean.cor)) + geom_ribbon(data = interpol_mantel_df, aes(ymin = mean.cor -
sd.cor, ymax = mean.cor + sd.cor), fill = "gray", alpha = 0.3) +
geom_point(col = viridis(10, alpha = 0.5)[7], size = 2.5) + geom_line(col = viridis(10,
alpha = 0.5)[7], size = 2) + xlim(c(0, 4)) + ylim(c(-0.025,
0.2)) + geom_point(size = 3, color = "transparent") + theme_classic(base_size = 20) +
labs(x = "Pairwise geographic distance (Degrees)", y = "Correlation coefficient")
```
## Dialect discrimination with Random Forest
- Data split in training and testing subsets (75% and 25% respectively)
- Accuracy measured on test subset
### Discriminating locations
```{r, eval = FALSE, fig.width=10}
xc_dist <- 1 - sub_xcorr
xc_dist[which(is.infinite(xc_dist))] <-
mean(xc_dist[-which(is.infinite(xc_dist))], na.rm = TRUE)
xcorr_mds <- cmdscale(d = as.dist(xc_dist), k = 10)
xcorr_mds <-
data.frame(sound.files = rownames(xcorr_mds), xcorr_mds)
xcorr_mds$location <-
sapply(xcorr_mds$sound.files, function(x)
sub_dat$Location.for.cluster[sub_dat$new_name == x])
xcorr_mds$location <- as.factor(make.names(xcorr_mds$location,
unique = FALSE, allow_ = TRUE))
# Create data subsets
partition <- createDataPartition(
y = xcorr_mds$location,
times = 1,
p = 0.75,
list = FALSE
)
trainset <- xcorr_mds[partition,-1]
testset <- xcorr_mds[-partition,-1]
trcontrol <-
trainControl(
method = "repeatedcv",
number = 20,
savePredictions = TRUE,
sampling = "down",
classProbs = TRUE,
returnResamp = "all"
)
pred_model <- train(
location ~ .,
data = trainset,
method = "rf",
trControl = trcontrol,
metric = "Accuracy",
preProcess = "BoxCox",
proximity = TRUE
)
# save confusion matrix
conf_mat <-
confusionMatrix(predict(pred_model, testset), testset$location)
conf_df <- as.data.frame(conf_mat$table)
conf_df$total <-
sapply(conf_df$Reference, function(x)
sum(testset$location ==
x))
conf_df$proportion <- conf_df$Freq / conf_df$total
# fit model on complete data set
best_rf_model <- pred_model$finalModel
all_rf_model <- randomForest(
location ~ .,
data = xcorr_mds[,-1], # Your entire dataset
proximity = TRUE, # Include proximity matrix
ntree = best_rf_model$ntree, # Number of trees
mtry = best_rf_model$mtry, # Number of variables tried for splitting at each node
nodesize = best_rf_model$nodesize, # Minimum size of terminal nodes
maxnodes = best_rf_model$maxnodes # Maximum number of terminal nodes
)
saveRDS(
list(
pred_model_bb = pred_model,
conf_mat_bb = conf_mat,
confusion_df_bb = conf_df,
testset_bb = testset,
all_rf_model = all_rf_model
),
"./data/processed/random_forest_model_results_locations.RDS"
)
```
```{r, eval = FALSE}
rf_model_results <- readRDS("./data/processed/random_forest_model_results_locations.RDS")
# print confusion matrix results
rf_model_results$conf_mat_bb$overall
confusion_df <- rf_model_results$confusion_df_bb
ggplot(confusion_df, aes(x = Reference, y = Prediction, fill = proportion)) +
geom_tile() + theme_bw() + coord_equal() + scale_fill_distiller(palette = "Greens",
direction = 1) + geom_text(aes(label = round(proportion, 2)),
color = "black", size = 3) + theme_classic() + theme(axis.text.x = element_text(color = "black",
size = 11, angle = 30, vjust = 0.8, hjust = 0.8))
```
### Projecting random forest proximity with UMAP
```{r, warning=FALSE, eval = FALSE}
umap_result <- umap(rf_model_results$all_rf_model$proximity, n_neighbors = 15, n_components = 2)
# Create a data frame with the UMAP results
umap_df <- data.frame(UMAP1 = umap_result$layout[,1], UMAP2 = umap_result$layout[,2], location = xcorr_mds$location, sound.files = xcorr_mds$sound.files)
umap_df$pred.locality <- predict(object = rf_model_results$all_rf_model, xcorr_mds[,grep("X", names(xcorr_mds))])
write.csv(umap_df, "./data/processed/umap_on_rf_proximity_on_xcorr_dists_localities.csv", row.names = FALSE)
```
```{r, warning=FALSE, eval = FALSE}
umap_loc_df <- read.csv("./data/processed/umap_on_rf_proximity_on_xcorr_dists_localities.csv")
# Create a scatterplot
gg_umap_loc <- ggplot(umap_loc_df, aes(x = UMAP1, y = UMAP2, color = location, fill = location)) +
geom_point(size = 4) +
ylim(c(-7, 7)) +
scale_color_viridis_d(alpha = 0.3, begin = 0.1, end = 0.8) +
scale_fill_viridis_d(alpha = 0.2, begin = 0.1, end = 0.8) +
theme_classic(base_size = 20) +
labs(x ="UMAP1", y = "UMAP2", color = "Locality", fill = "Locality")
gg_umap_loc
```
### Projecting random forest proximity with UMAP
```{r, warning=FALSE, eval = FALSE}
umap_result <- umap(rf_model_results$all_rf_model$proximity, n_neighbors = 15, n_components = 2)
# Create a data frame with the UMAP results
umap_df <- data.frame(UMAP1 = umap_result$layout[,1], UMAP2 = umap_result$layout[,2], sampling.site = xcorr_mds$sampling.site, sound.files = xcorr_mds$sound.files)
umap_df$pred.site <- predict(object = rf_model_results$all_rf_model, xcorr_mds[,grep("X", names(xcorr_mds))])
write.csv(umap_df, "./data/processed/umap_on_rf_proximity_on_xcorr_dists.csv", row.names = FALSE)
```
```{r, warning=FALSE, eval = FALSE}
umap_df <- read.csv("./data/processed/umap_on_rf_proximity_on_xcorr_dists.csv")
umap_df$sampling.site <- factor(umap_df$sampling.site, levels = c("central.coast", "southern", "northern", "central.mountains"))
# Create a scatterplot
gg_umap_site <- ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = sampling.site, fill = sampling.site, shape = sampling.site)) +
geom_point(size = 4) +
ylim(c(-7, 7)) +
scale_color_viridis_d(alpha = 0, begin = 0.1, end = 0.8, labels = c("Central Coast", "Central Mountains", "Northern", "Southern")) +
scale_fill_viridis_d(alpha = 0.25, begin = 0.1, end = 0.8, labels = c("Central Coast", "Central Mountains", "Northern", "Southern")) +
scale_shape_manual(values = 21:24, labels = c("Central Coast", "Central Mountains", "Northern", "Southern")) +
guides(shape = guide_legend(override.aes = list(size = 5))) +
theme_classic(base_size = 20) +
labs(x ="UMAP1", y = "UMAP2", color = "Region", shape = "Region", fill = "Region")
gg_umap_site
```
## Combined plots
```{r, warning=FALSE, eval = FALSE}
# cowplot::plot_grid(gg_umap_loc, gg_umap_site, nrow = 1, rel_widths = c(1.1, 1))
#
# cowplot::ggsave2("./output/UMAP_scatterplots.png", dpi = 300, width = 22, height = 7)
cowplot::plot_grid(gg_umap_loc, gg_umap_site, nrow = 2)
cowplot::ggsave2("./output/UMAP_scatterplots2.png", dpi = 300, width = 17, height = 17)
```
# Spectrograms of close-to-centroid calls for each sample site
```{r, eval = FALSE, out.width="100%", fig.height=8, fig.width=6, fig.dpi=100}
# keep only those correctly predicted
umap_df <- umap_df[umap_df$sampling.site == umap_df$pred.site, ]
# find prepresentative
y <- 96 #37, 50, 57, 47
# for ( i in 1:100){
# y <- y + 1
repren_calls_list <- lapply(unique(umap_df$sampling.site), function(x){
X <- umap_df[umap_df$sampling.site == x, ]
centroid <- sapply(X[,c("UMAP1", "UMAP2")], mean)
X$centroid_dist <- sapply(seq_len(nrow(X)), function(y) dist(rbind(centroid, X[y, c("UMAP1", "UMAP2")])))
set.seed(y)
out <- X[order(X$centroid_dist, decreasing = FALSE), ][sample(1:30, 2), ]
})
repren_calls <- do.call(rbind, repren_calls_list)
repren_calls$selec <- 1
repren_calls$start <- rep(0, nrow(repren_calls))
repren_calls$end <- rep(0.5, nrow(repren_calls))
repren_calls <- repren_calls[c(1, 3, 5, 7, 2, 4, 6, 8), ]
lf <- rep(c(0.06, 0.5), each = 4)
rg <- rep(c(0.5, 0.95), each = 4)
horiz <- seq(0.95, 0.075, length.out = 5)
btm <- rep(horiz[-1], 2)
tp <- rep(horiz[-length(horiz)], 2)
m <- cbind(lf, rg, btm, tp)
m <- m[(1:8),]
lf <- c(rep(0.95, each = 4), 0.06, 0.5, 0, 0)
rg <- c(rep(1, each = 4), 0.5, 0.95, 0.05, 1)
horiz <- seq(0.95, 0.075, length.out = 5)
btm <- c(horiz[-1], 0.95, 0.95, 0.075, 0)
tp <- c(horiz[-length(horiz)], 1, 1, 0.95, 0.075)
m2 <- cbind(lf, rg, btm, tp)
m <- rbind(m, m2)
# png(filename = paste0("./output/spectrograms_by_site", y, ".png"), res = 300, width = 4000, height = 3000)
ss <- split.screen(figs = m)
# # testing layout screens
# for(i in 1:nrow(m))
# {screen(i)
# par( mar = rep(0, 4))
# plot(0.5, xlim = c(0,1), ylim = c(0,1), type = "n", axes = FALSE, xlab = "", ylab = "", xaxt = "n", yaxt = "n")
# box()
# text(x = 0.5, y = 0.5, labels = i)
# }
# close.screen(all.screens = T)
ovlp <- 95
colls <- seq(-110, 0, 5)
wl <- 200
lab_bg <- viridis(10, alpha = 0.25)[8]
lab_bg <- "#3A3B39"
# frequency label
screen(15)
par(mar = c(0, 0, 0, 0), new = TRUE)
plot(
1,
frame.plot = FALSE,
type = "n",
yaxt = "n",
xaxt = "n"
)
text(
x = 0.9,
y = 1,
"Frequency (kHz)",
srt = 90,
cex = 1.6
)
# time label
screen(16)
par(mar = c(0, 0, 0, 0), new = TRUE)
plot(
1,
frame.plot = FALSE,
type = "n",
yaxt = "n",
xaxt = "n"
)
text(x = 1,
y = 0.75,
"Time (s)",
cex = 1.6)
for (i in seq_len(nrow(repren_calls))) {
# print(i)
screen(i)
par(mar = c(0, 0, 0, 0))
warbleR:::spectro_wrblr_int2(
wave = read_wave(X = repren_calls,
index = i,
path = "./data/raw/consolidated_files/"
),
collevels = colls,
ovlp = ovlp,
wl = wl,
flim = c(1, 9),
palette = viridis,
axisX = FALSE,
axisY = FALSE,
grid = FALSE
)
# add frequency axis
if (i %in% 1:4)
axis(2, at = c(seq(2, 10, 2)))
# add time axis
if (i %in% c(4, 8))
axis(1)
}
vlabs <- c("Central\nMountains", "Southern", "Northern", "Central\nCoast")
par(mar = c(0, 0, 0, 0),
bg = lab_bg,
new = TRUE)
# add vertical labels
for (i in 9:12) {
screen(i)
# par(mar = c(0, 0, 0, 0))
par(mar = c(0, 0, 0, 0),
bg = lab_bg,
new = TRUE)
plot(
1,
frame.plot = FALSE,
type = "n",
yaxt = "n",
xaxt = "n"
)
text(
x = 1,
y = 1,
vlabs[i - 8],
srt = 270,
cex = 1.6,
col = "white",
font = 1
)
box()
}
# dev.off()
# }
```
<div class="alert alert-success">
# Takeaways {.unnumbered .unlisted}
</div>
<!-- '---' adds a gray vertical line -->
---
<!-- add packages used, system details and versions -->
# Session information {.unnumbered .unlisted}
```{r session info, echo=F}
sessionInfo()
```