knitr::opts_knit$set(root.dir = normalizePath(".."))

knitr::opts_chunk$set(dpi = 58, fig.width = 12) 

# ggplot2 theme
theme_set(theme_classic(base_size = 20))

split_seqs <- function(x) {
 
  # from start to end
  y <-  sapply(1:20, function(y) 
     if (y == 1)
    ifelse(substr(x, start = y + 1, stop = y + 1) == substr(x, start = y, stop = y), 0 , 1) else
          ifelse(substr(x, start = y - 1, stop = y - 1) == substr(x, start = y, stop = y), 0 , 1)
    )

  x2 <- strsplit(x, "")[[1]]
  
  w1 <- which(y == 1)
  
  # to add single elements at the end
    w1 <- c(w1, 21)
  if (w1[1] == 1 & w1[2] == 2) w1 <- w1[2:length(w1)]
    
  sgmts <- lapply(1:length(w1), function(w){
    if (w1[w] == w1[1]) paste(x2[1:(w1[w] - 1)], collapse = "")  else
  paste(x2[w1[w - 1]:(w1[w] - 1)], collapse = "")  
  })

  return(sgmts)
}


write_lek_fas <- function(x){
  
 lk <- unlist(strsplit(x[seq(1, length(x), by = 2)], split = "_"))[seq(1, length(x), by = 2)]
  
 lk <- gsub(">", "", paste0(lk, "_"))
 
 out <- sapply(unique(lk), USE.NAMES = FALSE, function(y){
   
   
   wch.lk <- grep(pattern = y, x = x)
   
   fl.nms <-tempfile(tmpdir = getwd(), fileext =  paste0("song.seq_",y, "SongsSeq.fa"))
   
   writeLines(x[min(wch.lk):(max(wch.lk)+ 1)], con = fl.nms)
   
   return(basename(fl.nms))
 })

 return(out)  
}

Optimization of alignment costs

  • different category: fast_trill- flat_pure_tone
  • same segment: fast_trill-fast_trill
  • same category: fast_trill- slow_trill
  • Costs were always increasing: different category < same category < same segment
  • Same segment costs were all positive
  • Horizontal dotted line shows the highest gap difference between real and simulated data

Same category costs aggregated

*Same segment costs in facet label

cost.res <- read.csv("./data/processed/costs_simulation_results.csv", stringsAsFactors = FALSE)

cost.res <- cost.res[cost.res$convg != "converged and non-converged" ,]

cost.res <- cost.res[cost.res$cost.vals %in% cost.res$cost.vals[cost.res$data.type != "real"], ]


cost.res <- tidyr::separate(cost.res, "cost.vals", into = c("different.category", "same.category", "same.segment"), FALSE, sep = "_")

cost.res <- cost.res[cost.res$same.segment > 0, ]

cost.res$different.category <- as.numeric(cost.res$different.category)
cost.res$same.category <- as.numeric(cost.res$same.category)
cost.res$same.segment <- as.numeric(cost.res$same.segment)

# sapply(cost.res, function(x) length(unique(x)))

cost.diff <- sapply(unique(cost.res$cost.vals), USE.NAMES = FALSE, function(x){
  
mean(cost.res$gaps[cost.res$cost.vals == x & cost.res$data.type != "real"] - 
  cost.res$gaps[cost.res$cost.vals == x & cost.res$data.type == "real"])
})


cost.diff.df <- cost.res[1:length(cost.diff), ]

cost.diff.df$gaps <- cost.diff
cost.diff.df$data.type <- "difference"

cost.res <- rbind(cost.res, cost.diff.df)

agg.same.category <- aggregate(cbind(gaps, sd) ~ same.segment + different.category + data.type, cost.res, mean)


ggplot(agg.same.category, aes(x = different.category, y = gaps, col = data.type)) +
   geom_hline(linetype="dotted", yintercept = max(agg.same.category$gaps[agg.same.category$data.type == "difference"], na.rm = TRUE)) +
  geom_pointrange(aes(ymin = gaps - sd, ymax = gaps + sd)) +
  geom_line() +
  scale_color_viridis_d(begin = 0.3, end = 0.7) +
  facet_wrap(~ same.segment, nrow = 3) +
  labs(y = "# of gaps", x = "Costs for same category") 

No clear effect of same category cost

Different category costs aggregated

*Same category costs in facet label

agg.diff <- aggregate(cbind(gaps, sd) ~ same.category + same.segment + data.type, cost.res, mean)

ggplot(agg.diff, aes(x = same.category, y = gaps, col = data.type)) +
     geom_hline(linetype="dotted", yintercept = max(agg.diff$gaps[agg.diff$data.type == "difference"], na.rm = TRUE)) +
  geom_pointrange(aes(ymin = gaps - sd, ymax = gaps + sd)) +
  geom_line() +
  scale_color_viridis_d(begin = 0.3, end = 0.7) +
  facet_wrap(~ same.segment, nrow = 3) +
  labs(y = "# of gaps", x = "Costs for same segments") 

  • The higher the same segment cost the better
  • No clear effect of different category cost

Same segment costs aggregated

*Same category costs in facet label

agg.same.sgmt <- aggregate(cbind(gaps, sd) ~ same.category + different.category + data.type, cost.res, mean)

ggplot(agg.same.sgmt, aes(x = different.category, y = gaps, col = data.type)) +
  geom_pointrange(aes(ymin = gaps - sd, ymax = gaps + sd)) +
     geom_hline(linetype="dotted", yintercept = max(agg.same.sgmt$gaps[agg.same.sgmt$data.type == "difference"], na.rm = TRUE)) +
  geom_line() +
  scale_color_viridis_d(begin = 0.3, end = 0.7) +
  facet_wrap(~ same.category, nrow = 3) +
  labs(y = "# of gaps", x = "Costs for different category") 

  • High same category costs produce the highest gap difference regardless of different category costs

Difference between same segment and different category costs

  • Horizontal dotted line shows the maximum gap difference between real and simulated data
combs <- t(combn(c("different.category", "same.category", "same.segment"), 2))

cost.diff.l <- lapply(1:nrow(combs), function(y){
  
  X <- cost.res[cost.res$data.type == "difference",]
  
  names(X)[names(X) == "gaps"] <- "gap.diff"
  
  X$cost.diff <- X[, combs[y, 2]] - X[, combs[y, 1]]
  
  X$diff.cats <- paste(combs[y,], collapse = "-")
  
  if (X$diff.cats[1] == "different.category-same.segment") {X$cost.diff <- X$cost.diff / 2 
  
  X$diff.cats <- "different.category-same.segment / 2"
  }
  return(X)
}
)

cost.diff <- do.call(rbind, cost.diff.l)

agg.same.cat <- aggregate(gap.diff ~ cost.diff + diff.cats, data = cost.diff, FUN = mean)

agg.same.cat$sd <- aggregate(gap.diff ~ cost.diff + diff.cats, data = cost.diff, FUN = sd)$gap.diff

# position of points in ggplot2 
pos <- position_dodge(width= 0.5)

ggplot(agg.same.cat, aes(x = cost.diff, y = gap.diff, col = diff.cats)) +
  geom_hline(linetype="dotted",yintercept = max(agg.same.cat$gap.diff, na.rm = TRUE)) +
    geom_pointrange(aes(ymin = gap.diff - sd, ymax = gap.diff + sd), position = pos) +
  geom_line(position = pos) +
  scale_color_viridis_d(begin = 0.2, end = 0.8,alpha = 0.6) +
  labs(y = "Gaps in real - gaps in simulated", x = "Difference in cost between segment categories") 

  • Difference in cost between same segment and same category should not be higher than ~1.5 nor lower than ~0.7

  • Difference in cost between same segment and different category doesn’t really matter

Average across the adjacent categories

agg.same.cat2 <- aggregate(gap.diff ~ cost.diff, data = cost.diff[cost.diff$diff.cats != "different.category-same.segment / 2",], FUN = mean)

agg.same.cat2$sd <- aggregate(gap.diff ~ cost.diff, data = cost.diff[cost.diff$diff.cats != "different.category-same.segment / 2",], FUN = sd)$gap.diff

# position of points in ggplot2 
pos <- position_dodge(width= 0.1)

ggplot(agg.same.cat2, aes(x = cost.diff, y = gap.diff)) +
  geom_hline(linetype="dotted",yintercept = max(agg.same.cat2$gap.diff, na.rm = TRUE)) +
    geom_pointrange(aes(ymin = gap.diff - sd, ymax = gap.diff + sd),  col = viridis::viridis(10)[6]) +
  geom_line( col = viridis::viridis(10)[6]) +
  labs(y = "Gaps in real - gaps in simulated", x = "Difference in cost between segment categories")

  • A cost difference between adjacent categories of 0.7 produces the highest gap difference between real and simulated data sets

R session information

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
## 
## locale:
##  [1] LC_CTYPE=es_ES.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=es_CR.UTF-8        LC_COLLATE=es_ES.UTF-8    
##  [5] LC_MONETARY=es_CR.UTF-8    LC_MESSAGES=es_ES.UTF-8   
##  [7] LC_PAPER=es_CR.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=es_CR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] ape_5.3       ggplot2_3.3.0 pbapply_1.4-2
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.3        pillar_1.4.2      compiler_3.6.1    viridis_0.5.1    
##  [5] tools_3.6.1       zeallot_0.1.0     digest_0.6.22     viridisLite_0.3.0
##  [9] lifecycle_0.1.0   evaluate_0.14     tibble_2.1.3      gtable_0.3.0     
## [13] nlme_3.1-142      lattice_0.20-41   pkgconfig_2.0.3   rlang_0.4.1      
## [17] yaml_2.2.1        xfun_0.12         gridExtra_2.3     withr_2.1.2      
## [21] dplyr_0.8.3       stringr_1.4.0     knitr_1.28        vctrs_0.2.0      
## [25] grid_3.6.1        tidyselect_0.2.5  glue_1.3.1        R6_2.4.1         
## [29] rmarkdown_1.17    purrr_0.3.3       tidyr_1.0.0       magrittr_1.5     
## [33] ellipsis_0.3.0    backports_1.1.5   scales_1.0.0      htmltools_0.4.0  
## [37] assertthat_0.2.1  colorspace_1.4-1  labeling_0.3      stringi_1.4.6    
## [41] munsell_0.5.0     crayon_1.3.4