knitr::opts_knit$set(root.dir = normalizePath(".."))
knitr::opts_chunk$set(dpi = 58, fig.width = 12)
# ggplot2 theme
theme_set(theme_classic(base_size = 20))
split_seqs <- function(x) {
# from start to end
y <- sapply(1:20, function(y)
if (y == 1)
ifelse(substr(x, start = y + 1, stop = y + 1) == substr(x, start = y, stop = y), 0 , 1) else
ifelse(substr(x, start = y - 1, stop = y - 1) == substr(x, start = y, stop = y), 0 , 1)
)
x2 <- strsplit(x, "")[[1]]
w1 <- which(y == 1)
# to add single elements at the end
w1 <- c(w1, 21)
if (w1[1] == 1 & w1[2] == 2) w1 <- w1[2:length(w1)]
sgmts <- lapply(1:length(w1), function(w){
if (w1[w] == w1[1]) paste(x2[1:(w1[w] - 1)], collapse = "") else
paste(x2[w1[w - 1]:(w1[w] - 1)], collapse = "")
})
return(sgmts)
}
write_lek_fas <- function(x){
lk <- unlist(strsplit(x[seq(1, length(x), by = 2)], split = "_"))[seq(1, length(x), by = 2)]
lk <- gsub(">", "", paste0(lk, "_"))
out <- sapply(unique(lk), USE.NAMES = FALSE, function(y){
wch.lk <- grep(pattern = y, x = x)
fl.nms <-tempfile(tmpdir = getwd(), fileext = paste0("song.seq_",y, "SongsSeq.fa"))
writeLines(x[min(wch.lk):(max(wch.lk)+ 1)], con = fl.nms)
return(basename(fl.nms))
})
return(out)
}
*Same segment costs in facet label
cost.res <- read.csv("./data/processed/costs_simulation_results.csv", stringsAsFactors = FALSE)
cost.res <- cost.res[cost.res$convg != "converged and non-converged" ,]
cost.res <- cost.res[cost.res$cost.vals %in% cost.res$cost.vals[cost.res$data.type != "real"], ]
cost.res <- tidyr::separate(cost.res, "cost.vals", into = c("different.category", "same.category", "same.segment"), FALSE, sep = "_")
cost.res <- cost.res[cost.res$same.segment > 0, ]
cost.res$different.category <- as.numeric(cost.res$different.category)
cost.res$same.category <- as.numeric(cost.res$same.category)
cost.res$same.segment <- as.numeric(cost.res$same.segment)
# sapply(cost.res, function(x) length(unique(x)))
cost.diff <- sapply(unique(cost.res$cost.vals), USE.NAMES = FALSE, function(x){
mean(cost.res$gaps[cost.res$cost.vals == x & cost.res$data.type != "real"] -
cost.res$gaps[cost.res$cost.vals == x & cost.res$data.type == "real"])
})
cost.diff.df <- cost.res[1:length(cost.diff), ]
cost.diff.df$gaps <- cost.diff
cost.diff.df$data.type <- "difference"
cost.res <- rbind(cost.res, cost.diff.df)
agg.same.category <- aggregate(cbind(gaps, sd) ~ same.segment + different.category + data.type, cost.res, mean)
ggplot(agg.same.category, aes(x = different.category, y = gaps, col = data.type)) +
geom_hline(linetype="dotted", yintercept = max(agg.same.category$gaps[agg.same.category$data.type == "difference"], na.rm = TRUE)) +
geom_pointrange(aes(ymin = gaps - sd, ymax = gaps + sd)) +
geom_line() +
scale_color_viridis_d(begin = 0.3, end = 0.7) +
facet_wrap(~ same.segment, nrow = 3) +
labs(y = "# of gaps", x = "Costs for same category")
No clear effect of same category cost
*Same category costs in facet label
agg.diff <- aggregate(cbind(gaps, sd) ~ same.category + same.segment + data.type, cost.res, mean)
ggplot(agg.diff, aes(x = same.category, y = gaps, col = data.type)) +
geom_hline(linetype="dotted", yintercept = max(agg.diff$gaps[agg.diff$data.type == "difference"], na.rm = TRUE)) +
geom_pointrange(aes(ymin = gaps - sd, ymax = gaps + sd)) +
geom_line() +
scale_color_viridis_d(begin = 0.3, end = 0.7) +
facet_wrap(~ same.segment, nrow = 3) +
labs(y = "# of gaps", x = "Costs for same segments")
*Same category costs in facet label
agg.same.sgmt <- aggregate(cbind(gaps, sd) ~ same.category + different.category + data.type, cost.res, mean)
ggplot(agg.same.sgmt, aes(x = different.category, y = gaps, col = data.type)) +
geom_pointrange(aes(ymin = gaps - sd, ymax = gaps + sd)) +
geom_hline(linetype="dotted", yintercept = max(agg.same.sgmt$gaps[agg.same.sgmt$data.type == "difference"], na.rm = TRUE)) +
geom_line() +
scale_color_viridis_d(begin = 0.3, end = 0.7) +
facet_wrap(~ same.category, nrow = 3) +
labs(y = "# of gaps", x = "Costs for different category")
combs <- t(combn(c("different.category", "same.category", "same.segment"), 2))
cost.diff.l <- lapply(1:nrow(combs), function(y){
X <- cost.res[cost.res$data.type == "difference",]
names(X)[names(X) == "gaps"] <- "gap.diff"
X$cost.diff <- X[, combs[y, 2]] - X[, combs[y, 1]]
X$diff.cats <- paste(combs[y,], collapse = "-")
if (X$diff.cats[1] == "different.category-same.segment") {X$cost.diff <- X$cost.diff / 2
X$diff.cats <- "different.category-same.segment / 2"
}
return(X)
}
)
cost.diff <- do.call(rbind, cost.diff.l)
agg.same.cat <- aggregate(gap.diff ~ cost.diff + diff.cats, data = cost.diff, FUN = mean)
agg.same.cat$sd <- aggregate(gap.diff ~ cost.diff + diff.cats, data = cost.diff, FUN = sd)$gap.diff
# position of points in ggplot2
pos <- position_dodge(width= 0.5)
ggplot(agg.same.cat, aes(x = cost.diff, y = gap.diff, col = diff.cats)) +
geom_hline(linetype="dotted",yintercept = max(agg.same.cat$gap.diff, na.rm = TRUE)) +
geom_pointrange(aes(ymin = gap.diff - sd, ymax = gap.diff + sd), position = pos) +
geom_line(position = pos) +
scale_color_viridis_d(begin = 0.2, end = 0.8,alpha = 0.6) +
labs(y = "Gaps in real - gaps in simulated", x = "Difference in cost between segment categories")
Difference in cost between same segment and same category should not be higher than ~1.5 nor lower than ~0.7
Difference in cost between same segment and different category doesn’t really matter
agg.same.cat2 <- aggregate(gap.diff ~ cost.diff, data = cost.diff[cost.diff$diff.cats != "different.category-same.segment / 2",], FUN = mean)
agg.same.cat2$sd <- aggregate(gap.diff ~ cost.diff, data = cost.diff[cost.diff$diff.cats != "different.category-same.segment / 2",], FUN = sd)$gap.diff
# position of points in ggplot2
pos <- position_dodge(width= 0.1)
ggplot(agg.same.cat2, aes(x = cost.diff, y = gap.diff)) +
geom_hline(linetype="dotted",yintercept = max(agg.same.cat2$gap.diff, na.rm = TRUE)) +
geom_pointrange(aes(ymin = gap.diff - sd, ymax = gap.diff + sd), col = viridis::viridis(10)[6]) +
geom_line( col = viridis::viridis(10)[6]) +
labs(y = "Gaps in real - gaps in simulated", x = "Difference in cost between segment categories")
R session information
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
##
## locale:
## [1] LC_CTYPE=es_ES.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=es_CR.UTF-8 LC_COLLATE=es_ES.UTF-8
## [5] LC_MONETARY=es_CR.UTF-8 LC_MESSAGES=es_ES.UTF-8
## [7] LC_PAPER=es_CR.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=es_CR.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ape_5.3 ggplot2_3.3.0 pbapply_1.4-2
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.3 pillar_1.4.2 compiler_3.6.1 viridis_0.5.1
## [5] tools_3.6.1 zeallot_0.1.0 digest_0.6.22 viridisLite_0.3.0
## [9] lifecycle_0.1.0 evaluate_0.14 tibble_2.1.3 gtable_0.3.0
## [13] nlme_3.1-142 lattice_0.20-41 pkgconfig_2.0.3 rlang_0.4.1
## [17] yaml_2.2.1 xfun_0.12 gridExtra_2.3 withr_2.1.2
## [21] dplyr_0.8.3 stringr_1.4.0 knitr_1.28 vctrs_0.2.0
## [25] grid_3.6.1 tidyselect_0.2.5 glue_1.3.1 R6_2.4.1
## [29] rmarkdown_1.17 purrr_0.3.3 tidyr_1.0.0 magrittr_1.5
## [33] ellipsis_0.3.0 backports_1.1.5 scales_1.0.0 htmltools_0.4.0
## [37] assertthat_0.2.1 colorspace_1.4-1 labeling_0.3 stringi_1.4.6
## [41] munsell_0.5.0 crayon_1.3.4