knitr::opts_knit$set(root.dir = normalizePath(".."))

knitr::opts_chunk$set(dpi = 58, fig.width = 12) 

# ggplot2 theme
theme_set(theme_classic(base_size = 20))

split_seqs <- function(x) {
 
  # from start to end
  y <-  sapply(1:20, function(y) 
     if (y == 1)
    ifelse(substr(x, start = y + 1, stop = y + 1) == substr(x, start = y, stop = y), 0 , 1) else
          ifelse(substr(x, start = y - 1, stop = y - 1) == substr(x, start = y, stop = y), 0 , 1)
    )

  x2 <- strsplit(x, "")[[1]]
  
  w1 <- which(y == 1)
  
  # to add single elements at the end
    w1 <- c(w1, 21)
  if (w1[1] == 1 & w1[2] == 2) w1 <- w1[2:length(w1)]
    
  sgmts <- lapply(1:length(w1), function(w){
    if (w1[w] == w1[1]) paste(x2[1:(w1[w] - 1)], collapse = "")  else
  paste(x2[w1[w - 1]:(w1[w] - 1)], collapse = "")  
  })

  return(sgmts)
}


write_lek_fas <- function(x){
  
 lk <- unlist(strsplit(x[seq(1, length(x), by = 2)], split = "_"))[seq(1, length(x), by = 2)]
  
 lk <- gsub(">", "", paste0(lk, "_"))
 
 out <- sapply(unique(lk), USE.NAMES = FALSE, function(y){
   
   
   wch.lk <- grep(pattern = y, x = x)
   
   fl.nms <-tempfile(tmpdir = getwd(), fileext =  paste0("song.seq_",y, "SongsSeq.fa"))
   
   writeLines(x[min(wch.lk):(max(wch.lk)+ 1)], con = fl.nms)
   
   return(basename(fl.nms))
 })

 return(out)  
}

Optimization of alignment costs

different category: fast_trill- flat_pure_tone
same segment: fast_trill-fast_trill
same category: fast_trill- slow_trill
Costs were always increasing: different category < same category < same segment
Same segment costs were all positive
Horizontal dotted line shows the highest gap difference between real and simulated data

Same category costs aggregated

*Same segment costs in facet label

cost.res <- read.csv("./data/processed/costs_simulation_results.csv", stringsAsFactors = FALSE)

cost.res <- cost.res[cost.res$convg != "converged and non-converged" ,]

cost.res <- cost.res[cost.res$cost.vals %in% cost.res$cost.vals[cost.res$data.type != "real"], ]


cost.res <- tidyr::separate(cost.res, "cost.vals", into = c("different.category", "same.category", "same.segment"), FALSE, sep = "_")

cost.res <- cost.res[cost.res$same.segment > 0, ]

cost.res$different.category <- as.numeric(cost.res$different.category)
cost.res$same.category <- as.numeric(cost.res$same.category)
cost.res$same.segment <- as.numeric(cost.res$same.segment)

# sapply(cost.res, function(x) length(unique(x)))

cost.diff <- sapply(unique(cost.res$cost.vals), USE.NAMES = FALSE, function(x){
  
mean(cost.res$gaps[cost.res$cost.vals == x & cost.res$data.type != "real"] - 
  cost.res$gaps[cost.res$cost.vals == x & cost.res$data.type == "real"])
})


cost.diff.df <- cost.res[1:length(cost.diff), ]

cost.diff.df$gaps <- cost.diff
cost.diff.df$data.type <- "difference"

cost.res <- rbind(cost.res, cost.diff.df)

agg.same.category <- aggregate(cbind(gaps, sd) ~ same.segment + different.category + data.type, cost.res, mean)


ggplot(agg.same.category, aes(x = different.category, y = gaps, col = data.type)) +
   geom_hline(linetype="dotted", yintercept = max(agg.same.category$gaps[agg.same.category$data.type == "difference"], na.rm = TRUE)) +
  geom_pointrange(aes(ymin = gaps - sd, ymax = gaps + sd)) +
  geom_line() +
  scale_color_viridis_d(begin = 0.3, end = 0.7) +
  facet_wrap(~ same.segment, nrow = 3) +
  labs(y = "# of gaps", x = "Costs for same category")

No clear effect of same category cost

Different category costs aggregated

*Same category costs in facet label

agg.diff <- aggregate(cbind(gaps, sd) ~ same.category + same.segment + data.type, cost.res, mean)

ggplot(agg.diff, aes(x = same.category, y = gaps, col = data.type)) +
     geom_hline(linetype="dotted", yintercept = max(agg.diff$gaps[agg.diff$data.type == "difference"], na.rm = TRUE)) +
  geom_pointrange(aes(ymin = gaps - sd, ymax = gaps + sd)) +
  geom_line() +
  scale_color_viridis_d(begin = 0.3, end = 0.7) +
  facet_wrap(~ same.segment, nrow = 3) +
  labs(y = "# of gaps", x = "Costs for same segments")

The higher the same segment cost the better
No clear effect of different category cost

Same segment costs aggregated

*Same category costs in facet label

agg.same.sgmt <- aggregate(cbind(gaps, sd) ~ same.category + different.category + data.type, cost.res, mean)

ggplot(agg.same.sgmt, aes(x = different.category, y = gaps, col = data.type)) +
  geom_pointrange(aes(ymin = gaps - sd, ymax = gaps + sd)) +
     geom_hline(linetype="dotted", yintercept = max(agg.same.sgmt$gaps[agg.same.sgmt$data.type == "difference"], na.rm = TRUE)) +
  geom_line() +
  scale_color_viridis_d(begin = 0.3, end = 0.7) +
  facet_wrap(~ same.category, nrow = 3) +
  labs(y = "# of gaps", x = "Costs for different category")

High same category costs produce the highest gap difference regardless of different category costs

Difference between same segment and different category costs

Horizontal dotted line shows the maximum gap difference between real and simulated data

combs <- t(combn(c("different.category", "same.category", "same.segment"), 2))

cost.diff.l <- lapply(1:nrow(combs), function(y){
  
  X <- cost.res[cost.res$data.type == "difference",]
  
  names(X)[names(X) == "gaps"] <- "gap.diff"
  
  X$cost.diff <- X[, combs[y, 2]] - X[, combs[y, 1]]
  
  X$diff.cats <- paste(combs[y,], collapse = "-")
  
  if (X$diff.cats[1] == "different.category-same.segment") {X$cost.diff <- X$cost.diff / 2 
  
  X$diff.cats <- "different.category-same.segment / 2"
  }
  return(X)
}
)

cost.diff <- do.call(rbind, cost.diff.l)

agg.same.cat <- aggregate(gap.diff ~ cost.diff + diff.cats, data = cost.diff, FUN = mean)

agg.same.cat$sd <- aggregate(gap.diff ~ cost.diff + diff.cats, data = cost.diff, FUN = sd)$gap.diff

# position of points in ggplot2 
pos <- position_dodge(width= 0.5)

ggplot(agg.same.cat, aes(x = cost.diff, y = gap.diff, col = diff.cats)) +
  geom_hline(linetype="dotted",yintercept = max(agg.same.cat$gap.diff, na.rm = TRUE)) +
    geom_pointrange(aes(ymin = gap.diff - sd, ymax = gap.diff + sd), position = pos) +
  geom_line(position = pos) +
  scale_color_viridis_d(begin = 0.2, end = 0.8,alpha = 0.6) +
  labs(y = "Gaps in real - gaps in simulated", x = "Difference in cost between segment categories")

Difference in cost between same segment and same category should not be higher than ~1.5 nor lower than ~0.7
Difference in cost between same segment and different category doesn’t really matter

Average across the adjacent categories

agg.same.cat2 <- aggregate(gap.diff ~ cost.diff, data = cost.diff[cost.diff$diff.cats != "different.category-same.segment / 2",], FUN = mean)

agg.same.cat2$sd <- aggregate(gap.diff ~ cost.diff, data = cost.diff[cost.diff$diff.cats != "different.category-same.segment / 2",], FUN = sd)$gap.diff

# position of points in ggplot2 
pos <- position_dodge(width= 0.1)

ggplot(agg.same.cat2, aes(x = cost.diff, y = gap.diff)) +
  geom_hline(linetype="dotted",yintercept = max(agg.same.cat2$gap.diff, na.rm = TRUE)) +
    geom_pointrange(aes(ymin = gap.diff - sd, ymax = gap.diff + sd),  col = viridis::viridis(10)[6]) +
  geom_line( col = viridis::viridis(10)[6]) +
  labs(y = "Gaps in real - gaps in simulated", x = "Difference in cost between segment categories")

A cost difference between adjacent categories of 0.7 produces the highest gap difference between real and simulated data sets

R session information

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
## 
## locale:
##  [1] LC_CTYPE=es_ES.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=es_CR.UTF-8        LC_COLLATE=es_ES.UTF-8    
##  [5] LC_MONETARY=es_CR.UTF-8    LC_MESSAGES=es_ES.UTF-8   
##  [7] LC_PAPER=es_CR.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=es_CR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] ape_5.3       ggplot2_3.3.0 pbapply_1.4-2
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.3        pillar_1.4.2      compiler_3.6.1    viridis_0.5.1    
##  [5] tools_3.6.1       zeallot_0.1.0     digest_0.6.22     viridisLite_0.3.0
##  [9] lifecycle_0.1.0   evaluate_0.14     tibble_2.1.3      gtable_0.3.0     
## [13] nlme_3.1-142      lattice_0.20-41   pkgconfig_2.0.3   rlang_0.4.1      
## [17] yaml_2.2.1        xfun_0.12         gridExtra_2.3     withr_2.1.2      
## [21] dplyr_0.8.3       stringr_1.4.0     knitr_1.28        vctrs_0.2.0      
## [25] grid_3.6.1        tidyselect_0.2.5  glue_1.3.1        R6_2.4.1         
## [29] rmarkdown_1.17    purrr_0.3.3       tidyr_1.0.0       magrittr_1.5     
## [33] ellipsis_0.3.0    backports_1.1.5   scales_1.0.0      htmltools_0.4.0  
## [37] assertthat_0.2.1  colorspace_1.4-1  labeling_0.3      stringi_1.4.6    
## [41] munsell_0.5.0     crayon_1.3.4

Optimizing MAFFT alignments

Long-billed hermit song cultural evolution

Marcelo Araya-Salas, PhD

18-04-2020

Optimization of alignment costs

Same category costs aggregated

Different category costs aggregated

Same segment costs aggregated

Difference between same segment and different category costs

Average across the adjacent categories