Model-diagnostics.utf8

## vector with package names
x <- c("pbapply", "parallel", "coda", "DT", "stringr", "rwty", "viridis", "wesanderson", "tidyr", "kableExtra")

aa <- lapply(x, function(y) {
  
  # check if installed, if not then install 
  if (!y %in% installed.packages()[,"Package"]) 
    install.packages(y) 

  # load package
  try(require(y, character.only = T), silent = T)
})

knitr::opts_knit$set(root.dir = normalizePath(".."))

# knitr::opts_chunk$set(dpi = 58, fig.width = 12, echo = TRUE) 

# options(knitr.kable.NA = '')

# function to extract diagnostic stats
diag_stats <- function(x, thin = 1, age.extant = TRUE) {
  
  mods <- grep(x, model_runs, value = TRUE)
  
  out <- lapply(mods, function(y){
    
    temp <- read.table(file.path("output/revbayes", y), header = TRUE)
    chain <- mcmc(data = temp , start = 1, end = length(temp[,1]), thin = thin)

  return(chain)  
    })

  # make it a mcmc list object
  mh_list <- mcmc.list(out)
  
     #minimum effective sample size from combined traces
          min_ess <- min(effectiveSize(mh_list)[-1])
          
          #autocorrelations between draws in run 1
          autocorr_post <- diag(autocorr(mh_list[[1]])[2, , ])["Posterior"]
          autocorr_age_extant <- diag(autocorr(mh_list[[1]])[2, , ])["age_extant"] 
          autocorr_origin_time <- diag(autocorr(mh_list[[1]])[2, , ])["origin_time"]
          autocorr_diversification <- diag(autocorr(mh_list[[1]])[2, , ])["diversification"]
          autocorr_max <- max(abs(diag(autocorr(mh_list[[1]])[2, , ])[-1]))
          autocorr_bad <- paste0(names(which(diag(autocorr(mh_list[[1]])[2, , ])[-1] > 0.1)),
                                              collapse= ",")
        
          # Gelman- Rubin diagnostic of convergence between runs
          gr <- try(gelman.diag(mh_list,confidence = 0.95, transform=TRUE, 
                          autoburnin=FALSE, multivariate = TRUE), silent = TRUE)
        
          # catch error
          if (!is(gr, "try-error")){
          gr_post <- gr$psrf["Posterior", 1]
          gr_age_extant <- if(age.extant) gr$psrf["age_extant", 1] else NA
          gr_origin_time <- gr$psrf["origin_time",1]
          gr_diversification <- gr$psrf["diversification",1]
          gr_max <- max(gr$psrf[-1,1])
          gr_bad <- paste0(names(which(gr$psrf[-1,1] > 1.05)), collapse = ",")
          gr_mpsrf <- gr$mpsrf
          } else {
          gr_post <- NA
          gr_age_extant <- NA
          gr_origin_time <- NA
          gr_diversification <- NA
          gr_max <- NA
          gr_bad <- NA
          gr_mpsrf <- NA            
          }
        
          # date it was run
          updated <- substr(x = file.info(file.path("output/revbayes", mods[1]))$mtime, 0 , 10)
          
          # put results in a data frame
    out_df <- data.frame(model = x, min_ess, autocorr_post, autocorr_age_extant, autocorr_origin_time, autocorr_diversification, autocorr_max, autocorr_bad, gr_post, gr_age_extant, gr_origin_time, gr_diversification, gr_max, gr_bad, gr_mpsrf, updated)
}

# function for MCMCglmm diagnostics
# plot diagonostic stuff for mcmcglmmm models
# X == mcmc.list
plot_traces <- function(X, clms = 4, cex = 1, cols = adjustcolor(c("yellow","blue"), alpha.f = 0.6)) {

  # reset par when done
  opar <- par()
  on.exit(par(opar))
  
  # trace and autocorrelation  
  if (clms > 1) par(mfrow = c(1, clms))
  for(y in colnames(X[[1]])){

    Z <- lapply(X, function(x) x[, y, drop = FALSE])
    
    traceplot(Z, col = cols, cex.lab=cex, cex.axis=cex, main = "")
    
    title(y, cex.main = cex + 0.5)
    
    if (grepl("age_extant", y))
    autocorr.plot2(Z[[1]], col = "gray", lwd = 4, ask = FALSE, auto.layout = FALSE, cex.axis = cex, cex.lab = cex, cex = cex + 0.5, main = y)
    } 
}

# modified internal from coda
autocorr.plot2 <- function(x, lag.max, auto.layout = TRUE, ask, main, cex = 1, ...) {
  if (missing(ask)) {
    ask <- if (is.R()) {
      dev.interactive()
    }
    else {
      interactive()
    }
  }
  oldpar <- NULL
  on.exit(par(oldpar))
  if (auto.layout) 
    oldpar <- par(mfrow = set.mfrow(Nchains = nchain(x), 
      Nparms = nvar(x)))
  if (!is.mcmc.list(x)) 
    x <- mcmc.list(as.mcmc(x))
  for (i in 1:nchain(x)) {
    xacf <- if (missing(lag.max)) 
      acf(as.ts.mcmc2(x[[i]]), plot = FALSE)
    else acf(as.ts.mcmc2(x[[i]]), lag.max = lag.max, plot = FALSE)
    for (j in 1:nvar(x)) {
      plot(xacf$lag[, j, j], xacf$acf[, j, j], type = "h", 
        ylab = "Autocorrelation", xlab = "Lag", ylim = c(-1, 
          1), ...)
      title(main, cex.main = cex)
      if (i == 1 && j == 1) 
        oldpar <- c(oldpar, par(ask = ask))
    }
  }
  invisible(x)
}

# modified internal from coda
as.ts.mcmc2 <- function (x, ...) 
{
  x <- as.mcmc(x)
  y <- ts(x, start = start(x), end = end(x), deltat = thin(x))
  attr(y, "mcpar") <- NULL
  return(y)
}

## RUN TO UPDATE OUTPUT ###

# get model run log file names
model_runs <- list.files("output/most_recent_revbayes_models", pattern = "run_([[:digit:]]+)\\.log$")

# get base model name (removing run_#) 
base_models <- sapply(strsplit(model_runs, "_run", fixed = TRUE), "[", 1)

# extract diagnostic stats for each model 
# wrapped in try() to catch errors
runs <- pblapply(unique(base_models), cl = parallel::detectCores() - 1, function(x) try(diag_stats(x, thin = if (substr(x, 0, 3) %in% c("CCE", "SUR")) 2 else 1, age.extant = if (grepl("no.fossils", x)) FALSE else TRUE), silent = TRUE))

# remove errors
runs <- runs[sapply(runs, is.data.frame)]

# put in a single data frame
model_diags <- do.call(rbind, runs)

# save
write.csv(model_diags, "output/diagnostics_revbayes_output.csv", row.names = FALSE)

rb.trees <- load.multi("~/Dropbox/Projects/lbh_cultural_evolution/output/most_recent_revbayes_models/", format = "revbayes")

saveRDS(rb.trees, "./output/revbayes_output_in_single_R_object.RDS")

MCMC diagnostics: Gelman-Rubin, ESS, and autocorrelation

model_diags <- read.csv("output/diagnostics_revbayes_output.csv", stringsAsFactors = FALSE)

# get lek name
model_diags$lek <- sapply(strsplit(model_diags$model, "_", fixed = TRUE), "[", 1)

# substitution model
model_diags$subs <- sapply(strsplit(model_diags$model, "_", fixed = TRUE), "[", 2)

# period
model_diags$period <- sapply(strsplit(model_diags$model, "_", fixed = TRUE), "[", 3)

# and which fossils were used
model_diags$fossils <- sapply(strsplit(model_diags$model, "_", fixed = TRUE), "[", 4)

model_diags$align <- sapply(strsplit(model_diags$model, "_", fixed = TRUE), "[", 5)


# order columns
model_diags <- model_diags[, c(c("model", "lek", "subs", "period", "fossils", "align", "autocorr_bad", "gr_bad")   , names(which(sapply(model_diags, is.numeric))), "updated")]

# round
model_diags[, sapply(model_diags, is.numeric)] <- round(model_diags[, sapply(model_diags, is.numeric)], 4)

#count "bad" branch rates per row
brc <- paste0("branch_rates_count:",(str_count(pattern = "branch_rates", model_diags$autocorr_bad)))

model_diags$autocorr_bad <- gsub("branch_rates\\.([[:digit:]]+)\\.|\\,branch_rates\\.([[:digit:]]+)\\.","", model_diags$autocorr_bad)

model_diags$autocorr_bad <- paste(model_diags$autocorr_bad, brc)

brc2 <- paste0("branch_rates_count:",(str_count(pattern = "branch_rates", model_diags$gr_bad)))

model_diags$gr_bad <- gsub("branch_rates\\.([[:digit:]]+)\\.|\\,branch_rates\\.([[:digit:]]+)\\.","", model_diags$gr_bad)

model_diags$gr_bad <- paste(model_diags$gr_bad, brc)


rb.trees <- readRDS("./output/revbayes_output_in_single_R_object.RDS")


names(rb.trees) <- gsub("\\.trees", "", names(rb.trees))

model_diags$iterations <- sapply(model_diags$model, function(x) max(rb.trees[[x]]$ptable$Iteration))


# Number of models by iterations and lek
kbl <- knitr::kable(as.matrix(table(model_diags$iterations, model_diags$lek)), caption = "Number of models by iterations and lek")

kableExtra::kable_styling(kbl)

Number of models by iterations and lek
	BR1	CCE	HC1	SUR	TR1
3001	18	0	24	6	18
6001	0	24	0	24	0

=======

The table below can be read like this: read.csv("https://raw.githubusercontent.com/maRce10/lbh_cultural_evolution/master/output/diagnostics_revbayes_output.csv")

model_diags <- read.csv("output/diagnostics_revbayes_output.csv", stringsAsFactors = FALSE)

# get lek name
model_diags$lek <- sapply(strsplit(model_diags$model, "_", fixed = TRUE), "[", 1)

# substitution model
model_diags$subs <- sapply(strsplit(model_diags$model, "_", fixed = TRUE), "[", 2)

# period
model_diags$period <- sapply(strsplit(model_diags$model, "_", fixed = TRUE), "[", 3)

# and which fossils were used
model_diags$fossils <- sapply(strsplit(model_diags$model, "_", fixed = TRUE), "[", 4)

model_diags$align <- sapply(strsplit(model_diags$model, "_", fixed = TRUE), "[", 5)


# order columns
model_diags <- model_diags[, c(c("model", "lek", "subs", "period", "fossils", "align", "autocorr_bad", "gr_bad")   , names(which(sapply(model_diags, is.numeric))), "updated")]

# round
model_diags[, sapply(model_diags, is.numeric)] <- round(model_diags[, sapply(model_diags, is.numeric)], 4)

#count "bad" branch rates per row
brc <- paste0("branch_rates_count:",(str_count(pattern = "branch_rates", model_diags$autocorr_bad)))

model_diags$autocorr_bad <- gsub("branch_rates\\.([[:digit:]]+)\\.|\\,branch_rates\\.([[:digit:]]+)\\.","", model_diags$autocorr_bad)

model_diags$autocorr_bad <- paste(model_diags$autocorr_bad, brc)

brc2 <- paste0("branch_rates_count:",(str_count(pattern = "branch_rates", model_diags$gr_bad)))

model_diags$gr_bad <- gsub("branch_rates\\.([[:digit:]]+)\\.|\\,branch_rates\\.([[:digit:]]+)\\.","", model_diags$gr_bad)

model_diags$gr_bad <- paste(model_diags$gr_bad, brc)


rb.trees <- readRDS("./output/revbayes_output_in_single_R_object.RDS")


names(rb.trees) <- gsub("\\.trees", "", names(rb.trees))

model_diags$iterations <- sapply(model_diags$model, function(x) max(rb.trees[[x]]$ptable$Iteration))


# Number of models by iterations and lek
kbl <- knitr::kable(as.matrix(table(model_diags$iterations, model_diags$lek)), caption = "Number of models by iterations and lek")

kableExtra::kable_styling(kbl)

Number of models by iterations and lek
	BR1	CCE	HC1	SUR	TR1
3001	18	0	24	6	18
6001	0	24	0	24	0

# model_diags$model[model_diags$lek == "CCE" & model_diags$iterations == 3001]

# add link to plots
plts <- list.files(path = "./output/MCMC_diagnostic_plots", pattern = "\\.jpg$")
  
jpgs <- lapply(model_diags$model, function(x)
grep(paste(gsub("\\_([[:digit:]]+)", "", x), collapse = "|"), plts, value = TRUE))

model_diags$diag_plots <- sapply(model_diags$model, function(x)

paste(paste0('<a href=\"https://raw.githubusercontent.com/maRce10/lbh_cultural_evolution/master/output/MCMC_diagnostic_plots/', grep(paste(paste0(gsub("\\_([[:digit:]]+)", "", x), "_0"), collapse = "|"), plts, value = TRUE),'">', grep(paste(paste0(gsub("\\_([[:digit:]]+)", "", x), "_0"), collapse = "|"), plts, value = TRUE), '</a>'), collapse = " ")
)

# print dynamic table
datatable(model_diags, editable = list(
  target = 'row'
), rownames = FALSE, style = "bootstrap",  filter = 'top', options = list(
  pageLength = 100, autoWidth = TRUE, dom = 'ft'
), autoHideNavigation = TRUE, escape = FALSE)

Gelman-Rubin

Record.labs <- c("Recent records only", "Including oldest records")
names(Record.labs) <- c("new", "old")

model_diags$period[model_diags$period == "no.fossils"] <- "new"

# order levels
model_diags$align <- factor(model_diags$align, levels = c("all.equal", "optimal", "prank"))

model_diags$lek <- factor(model_diags$lek, levels = c("BR1", "TR1", "CCE", "HC1", "SUR"))

model_diags$fossils[model_diags$fossils == "full.process"] <- "no.fossils"
  

# parameters to plot
params <- c("gr_post", "gr_age_extant", "gr_origin_time", "gr_diversification", "gr_max", "gr_mpsrf")


# create graphs
ggs <- lapply(params, function(x){

  dat <- model_diags[!is.na(model_diags[,x]), ]
  
  if (x == "gr_age_extant") dat <- dat[grep("no.fossils", dat$model, invert = TRUE), ]
  
  dat$signif <- ifelse(dat[,x] <= 1.1, "good", "bad")

  dat$signif <- factor(dat$signif, levels = c("good", "bad"))


  # dat$alpha <- ifelse(dat[,x] > 1.1, 1, 0.5)

  
  ggplot(data = dat, aes(
    x = fossils,
    y = get(x),
    colour = subs,
    fill = subs,
    shape = signif,
    alpha = signif,
  )) +
  geom_jitter(size = 3,
              # alpha = 0.7,
              width = 0.12) +
  facet_grid(period ~ lek,
             scales = "fixed",
             labeller = labeller(period = Record.labs)) +
  theme_light(base_size = 16) +
  labs(y = x, shape = "GR",  colour = "Alignment", x = "Historical records per song") +
   scale_shape_manual(values = c(20, 21), labels = c("< 1.1", "> 1.1")) + 
    scale_alpha_manual(values = c(0.5, 1), guide = FALSE) +
    scale_colour_manual(
    values = viridis(3),
    labels = c("MAFFT-agnostic", "MAFFT-optimal", "PRANK")
  ) +
    scale_fill_manual(
    values = viridis(3),
    guide = FALSE) +
  scale_x_discrete(labels = c("All", "Earliest", "No fossils")) + 
    geom_hline(yintercept = 1.1, lty = 2, col = "gray42")  + 
    theme(axis.text.x = element_text(angle = 35,  hjust = 1))
})

names(ggs) <- unique(params)

ggs

## $gr_post

## 
## $gr_age_extant

## 
## $gr_origin_time

## 
## $gr_diversification

## 
## $gr_max

## 
## $gr_mpsrf

## RUN TO UPDATE OUTPUT ###

# these plots will be linked to in the table of this Rmarkdown's output

# get model run log file names
model_runs <- list.files("output/most_recent_revbayes_models", pattern = "run_([[:digit:]]+)\\.log$")

# get base model name (removing run_#) 
base_models <- sapply(strsplit(model_runs, "_run", fixed = TRUE), "[", 1)

# loop to make plots
out <- pblapply(base_models, function(x){
  
  # info on base model    
  # print(gsub("_([[:digit:]]+)", "", x))
    
  # extract run names
  mods <- grep(x, model_runs, value = TRUE)
  
  # loop over each pair of runs
  out <- lapply(mods, function(z){
    
    temp <- read.table(file.path("output/most_recent_revbayes_models", z), header = TRUE)
    
    temp <- temp[,c("Posterior", "Likelihood", "Prior", "age_extant", "origin_time", "num_samp_anc", "speciation_rate", "extinction_rate", "diversification", "turnover", "psi", "er.1.", "er.2.", "er.3.", "er.4.", "er.5.", "er.6.", "er.7.", "er.8.", "er.9.", "er.10.", "er.11.","er.12.", "er.13.", "er.14.", "er.15.", "sf.1.", "sf.2.", "sf.3.", "sf.4.", "sf.5.", "sf.6.","alpha_song", "rates_song.1.", "rates_song.2.", "rates_song.3.", "rates_song.4.", grep("branch_rates", colnames(temp), value = TRUE))]
    
    chain <- mcmc(data = temp, start = 1, end = length(temp[,1]), thin = if (substr(x, 0, 3) %in% c("CCE", "SUR")) 2 else 1)

  return(chain)  
    }
  )
  
  # create if it doesn't exist
  if (!file.exists(file.path("./output/MCMC_diagnostic_plots", paste0(gsub("_([[:digit:]]+)", "", x), "_01.jpg")))){
    # plot 
  jpeg(filename = file.path("./output/MCMC_diagnostic_plots", paste0(gsub("_([[:digit:]]+)", "", x), "_%02d.jpg")), width = 1500, height = 1200, pointsize = 6)  
  
  ncol <- ncol(out[[1]])
  
  par(mfrow = c(10, 10))
  
  out2 <- try(plot_traces(X = out, clms = 1, cex = 2.5), silent = TRUE)
  
    # close device
    dev.off()
    }
  }
)

P3 diagnostics

Effect sizes

p3_output_files <- list.files(path = "./output/revbayes/P3_Results", recursive = TRUE, pattern = "effectsizes", full.names = TRUE)


p3_results_l <- lapply(p3_output_files, function(x){
  
  Y <- read.csv(file.path(getwd(), x))
  Y$model <- sapply(strsplit(x, "/", fixed = TRUE), "[", 5)

  return(Y)
})

p3_results <- do.call(rbind, p3_results_l)

p3_results$Lek <- sapply(p3_results$model, function(x)
strsplit(x, "_", fixed = TRUE, "[[", 1)[[1]][2], USE.NAMES = FALSE)

p3_results$Lek <- factor(p3_results$Lek, levels = c("BR1", "TR1", "CCE", "HC1", "SUR"))

p3_results$Align <- sapply(p3_results$model, function(x)
strsplit(x, "_", fixed = TRUE, "[[", 1)[[1]][3], USE.NAMES = FALSE)

p3_results$Align <- factor(p3_results$Align, levels = c("all.equal", "optimal", "prank"))


p3_results$Fossils <- sapply(p3_results$model, function(x)
strsplit(x, "_", fixed = TRUE, "[[", 1)[[1]][5], USE.NAMES = FALSE)

p3_results$Fossils[p3_results$Fossils == "full.process"] <- "no.fossils"

p3_results$Fossils <- factor(p3_results$Fossils, levels = c("all", "early", "no.fossils"))

p3_results$Record <- sapply(p3_results$model, function(x)
strsplit(x, "_", fixed = TRUE, "[[", 1)[[1]][4], USE.NAMES = FALSE)

p3_results$Record[p3_results$Record == "no.fossils"] <- "new"

Record.labs <- c("Recent records only", "Including oldest records")
names(Record.labs) <- c("new", "old")

ggs <- lapply(unique(p3_results$Statistic), function(x){
ggplot(data = p3_results[p3_results$Statistic == x, ], aes(
    x = Fossils,
    y = Effect.Size,
    colour = Align,
  )) +
  geom_jitter(size = 3,
              alpha = 0.7,
              width = 0.12) +
  facet_grid(Record ~ Lek,
             scales = "fixed",
             labeller = labeller(Record = Record.labs)) +
  theme_light(base_size = 16) +
  labs(y = "Effect size", colour = "Alignment", x = "Historical records per song") +
  scale_shape_discrete(guide = FALSE) +
  scale_colour_manual(
    values = viridis(4, alpha = 0.6),
    labels = c("MAFFT-agnostic", "MAFFT-optimal", "PRANK")
  ) +
  scale_x_discrete(labels = c("All", "Earliest", "No fossils")) + 
    theme(axis.text.x = element_text(angle = 35,  hjust = 1))
})

names(ggs) <- unique(p3_results$Statistic)

ggs

## $`Number Invariant Sites`

## 
## $`Number Invariant Sites Excluding Ambiguous`

## 
## $`Max Invariant Block Length`

## 
## $`Max Invariant Block Length Excluding Ambiguous`

## 
## $`Max Pairwise Difference`

## 
## $`Max Pairwise Difference Excluding Ambiguous`

## 
## $`Max Variable Block Length`

## 
## $`Max Variable Block Length Excluding Ambiguous`

## 
## $`Min Pairwise Difference`

## 
## $`Min Pairwise Difference Excluding Ambiguous`

## 
## $`Number Invariable Block`

## 
## $`Number Invariable Block Excluding Ambiguous`

## 
## $`Segregating-Sites`

## 
## $Theta

## 
## $`Tajima-D`

## 
## $`Tajima-Pi`

P values

p3_results$signif <- ifelse(p3_results$Two.tailed < 0.05, " < 0.05", " >= 0.05")

p3_results$signif <- factor(p3_results$signif, levels = c(" >= 0.05", " < 0.05"))

p3_results$alpha <- ifelse(p3_results$Two.tailed < 0.05, 1, 0.95)

ggs2 <- lapply(unique(p3_results$Statistic), function(x){
ggplot(data = p3_results[p3_results$Statistic == x, ], aes(
    x = Fossils,
    y = Two.tailed,
    colour = Align,
    fill = Align,
    shape = signif,
    alpha = signif,
  )) +
  geom_jitter(size = 5,
              # alpha = 0.7,
              width = 0.12) +
  facet_grid(Record ~ Lek,
             scales = "fixed",
             labeller = labeller(Record = Record.labs)) +
  theme_light(base_size = 16) +
  labs(y = "pvalue", colour = "Alignment", x = "Historical records per song", shape = "P value") +
  # scale_shape_discrete(guide = FALSE) +
  scale_shape_manual(values = c(20, 21)) + 
    scale_alpha_manual(values = c(0.44, 1), guide = FALSE) +
    scale_colour_manual(
    values = viridis(4),
    labels = c("MAFFT-agnostic", "MAFFT-optimal", "PRANK")
  ) +
    scale_fill_manual(
    values = viridis(4), guide = FALSE,
    labels = c("MAFFT-agnostic", "MAFFT-optimal", "PRANK")
  ) +
  scale_x_discrete(labels = c("All", "Earliest", "No fossils")) + 
    geom_hline(yintercept = 0.05, lty = 2, col = "gray42") + 
    theme(axis.text.x = element_text(angle = 35,  hjust = 1))
})

names(ggs2) <- unique(p3_results$Statistic)

ggs2

## $`Number Invariant Sites`

## 
## $`Number Invariant Sites Excluding Ambiguous`

## 
## $`Max Invariant Block Length`

## 
## $`Max Invariant Block Length Excluding Ambiguous`

## 
## $`Max Pairwise Difference`

## 
## $`Max Pairwise Difference Excluding Ambiguous`

## 
## $`Max Variable Block Length`

## 
## $`Max Variable Block Length Excluding Ambiguous`

## 
## $`Min Pairwise Difference`

## 
## $`Min Pairwise Difference Excluding Ambiguous`

## 
## $`Number Invariable Block`

## 
## $`Number Invariable Block Excluding Ambiguous`

## 
## $`Segregating-Sites`

## 
## $Theta

## 
## $`Tajima-D`

## 
## $`Tajima-Pi`

ML

# Read model names
m <- read.table(file = "./output/revbayes/stdout/dataset.txt", header = F)

# Read two replicates of ML estimates under the stepping stone algorithm
ss.r1 <- read.table(file ="./output/revbayes/stdout/SteppingStone.txt", header = F, sep = ",")
ss.r2 <- read.table(file = "./output/revbayes/stdout/SteppingStoneR2.txt", header = F, sep = ",")
ss.r3 <- read.table(file = "./output/revbayes/stdout/SteppingStoneR3.txt", header = F, sep = ",")

# missing models to NA and failed estimations to "Repeat"
n = 3 # currently two replicates

# make df
m.ml <- m
for (i in 1:n) {
  m.ml <- cbind(m.ml, eval(parse(text = paste0("ss.r", i))))
}
colnames(m.ml) <- c("Model", paste0("ss",c(1:n)))


for(i in 1:n) {
  levels( m.ml[, i + 1]) <- c(levels( m.ml[, i + 1]),"Repeat")
  for (j in 1:nrow(m.ml)) {
        if (grepl(pattern = "This typically refers",
              x =  as.character(m.ml[j, i + 1]),
              fixed =  TRUE)) {
      m.ml[j, i + 1] <- "Repeat"
    }
    if (grepl(pattern = "Problem processing",
              x =  m.ml[j, i + 1],
              fixed =  TRUE)) {
      m.ml[j, i + 1] <- "Repeat"
    }
    if (grepl(pattern = "Could not open",
              x =  m.ml[j, i + 1],
              fixed =  TRUE)) {
      m.ml[j, i + 1] <- NA
    }
  }
  # m.ml[,i+1] <- droplevels(m.ml[,i+1])
}

# remove CCE that I accidentally repeated
#m.ml <- m.ml[-(49:60),]

m.rep<- read.table(file = "./output/revbayes/stdout/datasetRepeats.txt", header = F)

ss.rep <- read.table(file = "./output/revbayes/stdout/SteppingStoneRepeat.txt", header = F, sep = ",")

m.ml.rep <-cbind(m.rep, ss.rep)
colnames(m.ml.rep) <- c("Model", "ss")

# match name format
m.ml.rep$Model <- gsub(pattern = "_", replacement = ".", x = m.ml.rep$Model)
m.ml.rep$Model <- gsub(pattern = ".global", replacement = "._global", x = m.ml.rep$Model)
m.ml.rep$Model <- gsub(pattern = ".Uexp", replacement = "._Uexp", x = m.ml.rep$Model)

#Replace repeat for ML value
m.ml$ss1 <- as.character(m.ml$ss1)
m.ml$ss2 <- as.character(m.ml$ss2)
m.ml$ss3 <- as.character(m.ml$ss3)

for (i in 1:nrow(m.ml)) {
  for (j in 1:nrow(m.ml.rep)) {
    if (m.ml$Model[i] == m.ml.rep$Model[j] & m.ml$ss1[i] == "Repeat") {
      m.ml$ss1[i] <-  m.ml.rep$ss[j]
    }
    if (m.ml$Model[i] == m.ml.rep$Model[j] & m.ml$ss2[i] == "Repeat") {
      m.ml$ss2[i] <-  m.ml.rep$ss[j]
    }
    if (m.ml$Model[i] == m.ml.rep$Model[j] & m.ml$ss3[i] == "Repeat") {
      m.ml$ss3[i] <-  m.ml.rep$ss[j]
    }
  }
}

#m.ml$ss1 <- as.numeric(m.ml$ss1)
#m.ml$ss2 <- as.numeric(m.ml$ss2)

# remove "." from alignment name
m.ml$Model <- gsub(pattern = "all.equal", replacement = "allequal", x = m.ml$Model )

# split name into columns
m.ml <- m.ml %>% separate(Model, c("Lek", "Align", "Record", "Fossils", "Clock"))

# long to wide and rearrange
m.ml <-
  pivot_wider(
    data = m.ml,
    names_from = Clock,
    values_from =  paste0("ss", c(1:n))
  )

m.ml <-
  m.ml[, c(
    "Lek",
    "Align",
    "Record",
    "Fossils",
    paste0("ss", c(1:n), "_global"),
    paste0("ss", c(1:n), "_Uexp")
  )]
# m.ml  %>%
#   kbl() %>%
#   kable_minimal()


# print dynamic table
datatable(m.ml, editable = list(
  target = 'row'
), rownames = FALSE, style = "bootstrap",  filter = 'top', options = list(
  pageLength = 100, autoWidth = TRUE, dom = 'ft'
), autoHideNavigation = TRUE, escape = FALSE)

# while missing data make numeric
m.ml$ss1_diff <- as.numeric(m.ml$ss1_Uexp) - as.numeric(m.ml$ss1_global)
m.ml$ss2_diff <- as.numeric(m.ml$ss2_Uexp) - as.numeric(m.ml$ss2_global)
m.ml$ss3_diff <- as.numeric(m.ml$ss3_Uexp) - as.numeric(m.ml$ss3_global)

# make long data frame
ml.long <- m.ml[,c(1:4,11:13)]
ml.long <-
  pivot_longer(data = ml.long,
               cols = paste0("ss", c(1:n), "_diff"),
               names_to = "run")

ml.long$Lek <-  factor(ml.long$Lek, levels=c('BR1','TR1','CCE','HC1', 'SUR'))

# new facet label names for fossil record
Record.labs <- c("Recent records only", "Including oldest records")
names(Record.labs) <- c("new", "old")

# plot
ML.plot <-
  ggplot(data = ml.long, aes(
    x = Fossils,
    y = value,
    colour = Align,
    shape = Record
  )) +
  geom_jitter(size = 3,
              alpha = 0.7,
              width = 0.2) +
  facet_grid(Record ~ Lek,
             scales = "fixed",
             labeller = labeller(Record = Record.labs)) +
  theme_light(base_size = 24) +
  labs(y = "Differencce in marginal likelihood \nbetween clock models (relaxed - fixed)", colour = "Alignment", x = "Historical records per song") +
  scale_shape_discrete(guide = FALSE) +
  scale_colour_manual(
    values = wes_palette("IsleofDogs1"),
    labels = c("MAFFT-agnostic", "MAFFT-optimal", "PRANK")
  ) +
  scale_x_discrete(labels = c("all", "earliest"))  + 
    theme(axis.text.x = element_text(angle = 35,  hjust = 1))

ML.plot

R session information

## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
## 
## locale:
##  [1] LC_CTYPE=pt_PT.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=es_CR.UTF-8        LC_COLLATE=pt_PT.UTF-8    
##  [5] LC_MONETARY=es_CR.UTF-8    LC_MESSAGES=pt_PT.UTF-8   
##  [7] LC_PAPER=es_CR.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=es_CR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] kableExtra_1.3.1  tidyr_1.1.2       wesanderson_0.3.6 viridis_0.5.1    
##  [5] viridisLite_0.3.0 rwty_1.0.2        ggplot2_3.3.2     ape_5.4-1        
##  [9] stringr_1.4.0     DT_0.16           coda_0.19-4       pbapply_1.4-3    
## 
## loaded via a namespace (and not attached):
##  [1] phangorn_2.5.5     tidyselect_1.1.0   xfun_0.20          purrr_0.3.4       
##  [5] reshape2_1.4.4     lattice_0.20-41    colorspace_1.4-1   vctrs_0.3.6       
##  [9] generics_0.1.0     htmltools_0.5.0    yaml_2.2.1         rlang_0.4.10      
## [13] pillar_1.4.6       glue_1.4.2         withr_2.3.0        RColorBrewer_1.1-2
## [17] lifecycle_0.2.0    plyr_1.8.6         munsell_0.5.0      gtable_0.3.0      
## [21] rvest_0.3.6        htmlwidgets_1.5.2  evaluate_0.14      labeling_0.3      
## [25] knitr_1.30         GGally_2.1.0       crosstalk_1.1.0.1  highr_0.8         
## [29] Rcpp_1.0.5         scales_1.1.1       jsonlite_1.7.1     webshot_0.5.2     
## [33] farver_2.0.3       gridExtra_2.3      fastmatch_1.1-0    digest_0.6.27     
## [37] stringi_1.5.3      dplyr_1.0.2        grid_4.0.3         quadprog_1.5-8    
## [41] tools_4.0.3        magrittr_2.0.1     tibble_3.0.4       ggdendro_0.1.22   
## [45] crayon_1.3.4       pkgconfig_2.0.3    ellipsis_0.3.1     MASS_7.3-53       
## [49] Matrix_1.2-18      xml2_1.3.2         httr_1.4.2         rstudioapi_0.11   
## [53] rmarkdown_2.4      reshape_0.8.8      R6_2.5.0           igraph_1.2.6      
## [57] nlme_3.1-149       compiler_4.0.3

Model diagnostics

Long-billed hermit song cultural evolution

Marcelo Araya-Salas PhD & Beatriz Willink PhD

08-03-2021

MCMC diagnostics: Gelman-Rubin, ESS, and autocorrelation

Gelman-Rubin

P3 diagnostics

Effect sizes

P values

ML