---
title: Measuring segment similarity
subtitle: LBH song type prevalence and degradation
author: <a href="http://researcher.website.com/">Researcher name</a>
date: "`r Sys.Date()`"
toc: true
toc-depth: 2
toc-location: left
number-sections: true
highlight-style: pygments
format:
html:
df-print: kable
code-fold: show
code-tools: true
css: qmd.css
editor_options:
chunk_output_type: console
---
<!-- this code add line numbers to code blocks -->
<!-- only works when code folding is not used in yaml (code_folding: show) -->
```{=html}
<style>
body
{ counter-reset: source-line 0; }
pre.numberSource code
{ counter-reset: none; }
</style>
```
```{r set root directory, echo = FALSE}
# set working directory as project directory or one directory above,
knitr::opts_knit$set(root.dir = "..")
```
```{r add link to github repo, echo = FALSE, results='asis'}
# print link to github repo if any
if (file.exists("./.git/config")){
config <- readLines("./.git/config")
url <- grep("url", config, value = TRUE)
url <- gsub("\\turl = |.git$", "", url)
cat("\nSource code and data found at [", url, "](", url, ")", sep = "")
}
```
<!-- skyblue box -->
<div class="alert alert-info">
# Purpose
- Format manual annotations for segmenting song types
- Analyze segmented song tpyes
</div>
# Load packages
```{r load packages, message=FALSE, warning=FALSE}
# knitr is require for creating html/pdf/word reports
# formatR is used for soft-wrapping code
# install/ load packages
sketchy::load_packages(packages = c("knitr", "formatR", "rprojroot", "warbleR", "baRulho", "readxl", "Rraven", "dplyr", "dtw", github = "maRce10/PhenotypeSpace", "umap", "mclust", "viridis", "caret"))
```
# Split data in segments
```{r read data, eval = TRUE}
contours <- read.csv(file = "./data/processed/seltailor_output.csv", header = T)
contours$tailored <- NULL
rav_dat_raw <- imp_raven(path = "./data/processed", files = "FrameworkContourDegradationv1.4.txt", all.data = TRUE, warbler.format = TRUE)
rav_dat_raw$segment.id <- paste(rav_dat_raw$indxF, rav_dat_raw$selec, sep = "-")
```
```{r split segments, eval = FALSE}
# check all have 1 sample
all(table(rav_dat_raw$segment.id) == 1)
# number of fundamental freq values
ff_n <- length(grep("ff", names(contours)))
segment_list <- pblapply_wrblr_int(X = unique(rav_dat_raw$segment.id), cl = 10, function(x){
segment_data <- rav_dat_raw[rav_dat_raw$segment.id == x, ]
segment_data[,c("start", "end")]
contour <- contours[contours$selec == segment_data$indxF[1], ]
times <- seq(contour$start, contour$end, length.out = ff_n)
freq_time <- data.frame(freq = t(contour[, grep("ff", names(contours))]), times)
names(freq_time)[1] <- "freq"
seg_freq_time <- freq_time$freq[freq_time$times >= segment_data$start & freq_time$times <= segment_data$end]
return(seg_freq_time)
})
names(segment_list) <- unique(rav_dat_raw$segment.id)
saveRDS(segment_list, "./data/processed/raw_segment_ff_list.RDS")
```
# DTW distances
## Padding with mean freq values
All have 143 values (which is the maximum number of ff values in a segment)
```{r padd with mean ff, eval= FALSE}
segment_list <- readRDS("./data/processed/raw_segment_ff_list.RDS")
# temporarily remove "280" id segments
segment_list <- segment_list[!grepl("^280", names(segment_list))]
max_length <- max(sapply(segment_list, length))
pad_segment_list <- lapply(segment_list, function(x) {
pad_x <- c(x, rep(mean(x), max_length - length(x)))
return(pad_x)
})
names(pad_segment_list) <- names(segment_list)
range(sapply(pad_segment_list, length))
pad_segment_df <- do.call(rbind, pad_segment_list)
# no NAs
which(apply(pad_segment_df, 1, anyNA))
dtw_dist_pad_segments <- dtwDist(mx = pad_segment_df)
saveRDS(dtw_dist_pad_segments, "./data/processed/dtw_padded_segment_ff_distances.RDS")
```
## Interpolate to force same length
```{r Interpolate mean ff, eval= FALSE}
segment_list <- readRDS("./data/processed/raw_segment_ff_list.RDS")
# temporarily remove "280" id segments
segment_list <- segment_list[!grepl("^280", names(segment_list))]
max_length <- max(sapply(segment_list, length))
intrp_segment_list <- lapply(segment_list, function(x) {
if (length(x) > 1)
intrp_x <- stats::approx(x = x, n = max_length)$y
if (length(x) == 1)
intrp_x <- rep(x, max_length)
return(intrp_x)
})
names(intrp_segment_list) <- names(segment_list)
range(sapply(intrp_segment_list, length))
intrp_segment_df <- do.call(rbind, intrp_segment_list)
# no NAs
which(apply(intrp_segment_df, 1, anyNA))
dtw_dist_intrp_segments <- dtwDist(mx = intrp_segment_df)
saveRDS(dtw_dist_intrp_segments, "./data/processed/dtw_interpolated_segment_ff_distances.RDS")
```
## Sliding window minimum DTW distance
```{r sliding window minimum DTW distance, eval= FALSE}
segment_list <- readRDS("./data/processed/raw_segment_ff_list.RDS")
# temporarily remove "280" id segments
segment_list <- segment_list[!grepl("^280", names(segment_list))]
max_length <- max(sapply(segment_list, length))
combs <- combn(x = names(segment_list), m = 2)
slided_segment_list <- pblapply_wrblr_int(seq_len(ncol(combs)), cl = 20, function(x) {
xs <- segment_list[combs[, x]]
min_x <- which.min(shrt_length <- sapply(xs, length))
min_length <- shrt_length[min_x]
lg_x <- xs[[-min_x]]
shrt_x <- xs[[min_x]]
stps <- abs(diff(shrt_length))
if (stps <= 1)
stps <- 1 else
stps <- 1:stps
dtw_dists <- sapply(stps, function(e, cor.method = cm) {
warbleR::try_na(dtw(lg_x[e:(e + min_length - 1)],
shrt_x)$distance)
})
return(min(dtw_dists, na.rm = TRUE))
})
slided_segment_df <- cbind(t(combs), unlist(slided_segment_list))
rownames(slided_segment_df) <- 1:nrow(slided_segment_df)
colnames(slided_segment_df) <- c("segment.id1", "segment.id2", "dtw.dist")
head(slided_segment_df)
dim(slided_segment_df)
slided_segment_df <- as.data.frame(slided_segment_df)
slided_segment_df$dtw.dist <- as.numeric(slided_segment_df$dtw.dist)
saveRDS(slided_segment_df, "./data/processed/dtw_slided_segment_ff_tabular.RDS")
slided_segment_dist <- rectangular_to_triangular(slided_segment_df)
# no NAs
which(apply(slided_segment_dist, 1, anyNA))
saveRDS(slided_segment_dist, "./data/processed/dtw_slided_segment_ff_distances.RDS")
```
## Combine distances
```{r, eval = FALSE}
dtw_dist_pad_segments <- readRDS("./data/processed/dtw_padded_segment_ff_distances.RDS")
dtw_dist_intrp_segments <- readRDS("./data/processed/dtw_interpolated_segment_ff_distances.RDS")
dtw_dist_slided_segments <- readRDS("./data/processed/dtw_slided_segment_ff_distances.RDS")
mds_pad_segments <- cmdscale(slided_segment_dist, k = 10)
# mds_intrp_segments <- cmdscale(dtw_dist_intrp_segments, k = 10)
mds_slided_segments <- cmdscale(dtw_dist_slided_segments, k = 10)
mds_segments <- cbind(mds_pad_segments, mds_slided_segments, mds_intrp_segments)
mds_segments_df <- data.frame(segment.id = rownames(mds_pad_segments), mds_segments)
# head(mds_segments_df)
mds_segments_df$segment.cat <- sapply(mds_segments_df$segment.id, function(x) rav_dat_raw$Category[rav_dat_raw$segment.id == x])
# table(mds_segments_df$segment.cat)
# temporarily remove 199-681
mds_segments_df <- mds_segments_df[mds_segments_df$segment.id != "199-681",]
mds_segments_df$X1 <- scale(mds_segments_df$X1) * 1000
mds_segments_df$X11 <- scale(mds_segments_df$X11) * 1000
category_similarity <- space_similarity(
formula = segment.cat ~ X1 + X11,
data = mds_segments_df,
cores = 10,
method = "mean.density.overlap"
)
# set method parameters
ctrl <- caret::trainControl(method = "repeatedcv", repeats = 5)
seg.id <- mds_segments_df$segment.id
mds_segments_df$segment.id <- NULL
# get similarities ("overlap")
category_similarity <- space_similarity(
formula = segment.cat ~ .,
data = mds_segments_df,
cores = 4,
method = "rf",
trControl = ctrl,
tuneLength = 4,
seed = 123
)
```
# UMAP unsupervised segment classification
```{r umap, eval = FALSE}
set.seed(123)
umap_result <- umap(mds_segments_df[, grep("^X", names(mds_segments_df))], n_components = 20)
mds_segments_df$UMAP1 <- umap_result$layout[, 1]
mds_segments_df$UMAP2 <- umap_result$layout[, 2]
mod_umap <- Mclust(umap_result$layout)
mds_segments_df$umap.segment.cat <- as.character(mod_umap$classification)
table(mds_segments_df$umap.segment.cat)
write.csv(mds_segments_df, "./data/processed/dtw_tabular_dimensions_umap_vectors_and_unsupervised_categories_segments.csv", row.names = FALSE)
```
```{r plot umap, eval = TRUE}
mds_segments_df <- read.csv("./data/processed/dtw_tabular_dimensions_umap_vectors_and_unsupervised_categories_segments.csv")
conf_mat <- confusionMatrix(data = factor(mds_segments_df$segment.cat),
reference = factor(as.factor(mds_segments_df$umap.segment.cat), levels = c(unique(mds_segments_df$umap.segment.cat), unique(mds_segments_df$segment.cat))))
conf_df <- as.data.frame(conf_mat$table)
conf_df$total <- sapply(conf_df$Reference, function(x) sum(mds_segments_df$umap.segment.cat ==
x))
conf_df$proportion <- conf_df$Freq/conf_df$total
conf_df <- conf_df[conf_df$Reference %in% unique(mds_segments_df$umap.segment.cat),
]
plot(mds_segments_df$UMAP1, mds_segments_df$UMAP2, col = viridis(length(unique(mds_segments_df$umap.segment.cat)))[as.numeric(mds_segments_df$umap.segment.cat)], cex = 4, pch = as.numeric(as.factor(mds_segments_df$umap.segment.cat)) + 10, main= paste("UMAP: 2 dimensions;", length(unique(mds_segments_df$umap.segment.cat)), "groups"), xlab = "UMAP1", ylab = "UMAP2")
conf_df <- conf_df[!conf_df$Prediction %in% 1:9, ]
ggplot(conf_df, aes(x = Reference, y = Prediction, fill = proportion)) +
geom_tile() + theme_bw() + coord_equal() + scale_fill_distiller(palette = "Greens",
direction = 1) + geom_text(aes(label = round(proportion, 2)),
color = "black") + labs(x = "Manually labeled category", y = "Unsupervised category") +
theme_classic() + theme(axis.text.x = element_text(color = "black",
size = 11, angle = 30, vjust = 0.8, hjust = 0.8))
```
<!-- light green box -->
<div class="alert alert-success">
# Takeaways {.unnumbered .unlisted}
</div>
<!-- '---' adds a gray vertical line -->
---
<!-- add packages used, system details and versions -->
<font size="4">Session information</font>
```{r session info, echo=F}
sessionInfo()
```