data = read.csv("https://data.broadinstitute.org/Trinity/TMP/bhaas/assembly_metrics.csv", header=T, com="#")
data <- data %>% gather(metric, score, 3:ncol(data))
data <- data %>% filter(! is.na(score))
There are certain metrics where lowest values are best, and so here I rerank them so that smallest values become largest values, and vice-versa.
metric_reverse_rankings = c("Misassemblies", "Mismatches.per.transcript", "Percentage.bases.uncovered", "Number.of.ambiguous.bases", "Missing.BUSCOs")
data <- data %>% mutate(adj_score = ifelse(metric %in% metric_reverse_rankings, -1 * score, score))
Here we rank all programs across each metrics for each data set.
data <- data %>% group_by(Organism, metric) %>%
mutate(ranking = rank(adj_score, na.last = FALSE, ties.method="max"))
Below we analyze the mean rankings for each program across all metrics
mean_rankings_df <- data %>% group_by(Program) %>%
summarize(mean_rank = mean(ranking)) %>% arrange(desc(mean_rank))
mean_rankings_df = mean_rankings_df %>% mutate(Program = factor(Program, Program))
mean_rankings_df %>% ggplot(aes(Program, mean_rank)) + geom_bar(stat='identity') + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + ggtitle("Program Mean Rankings Across All Results")
Below is the distribution of ranking values for each program, ordered according to the mean value as above.
data %>%
mutate(Program = factor(Program, levels(mean_rankings_df$Program))) %>%
group_by(Program) %>%
ggplot(aes(Program, ranking)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
ggtitle("Distribution of Metric Rankings Across Programs")
## still ordered by mean ranking as above.
For a broad view, the heatmap below shows the metric ranking for each program for each organism data set.
data %>% ggplot(aes(Program, metric)) + geom_tile(aes(fill = ranking)) + facet_wrap(~Organism) + theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
ggtitle("Rankings of Metrics per Program on each Organism")
The leaderboard entries for each metric correspond to the MIN_RANK value set below. I explored a MIN_RANK = 10 (meaning the program had to have the top score to be included), MIN_RANK=9 (so top two programs), and MIN_RANK=8 (top three programs, as used in the paper). The cumulative scores are simply summing ‘counts of occurring on metric leaderboards’.
## some utility functions
# MIN_RANK defines the minimum rank for a program
# to show up on the leaderboard for a given metric.
# 10 programs, so max ranking would be 10 (best), ranking from worst to best.
get_top_ranking_progs_by_org <- function(data, MIN_RANK) {
# absolute top ranking:
top_rank_counts_by_organism = data %>% filter(ranking >= MIN_RANK) %>%
group_by(Organism) %>%
count(Program)
cumulative_scores = top_rank_counts_by_organism %>%
group_by(Program) %>%
summarize(sum(n)) %>%
arrange(desc(`sum(n)`)) %>%
mutate(Program=factor(Program, Program))
p <- ggplot(top_rank_counts_by_organism, aes(Program, n)) +
geom_bar(stat='identity',fill="darkseagreen") +
facet_wrap(~Organism) +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
ggtitle(sprintf("Sum Count of Metric Leaderboard Occurrence Requiring min_rank=%d", MIN_RANK)) +
geom_text(aes(label=n), size = 2, position = position_stack(vjust = 0.5))
plot(p)
# heatmap showing cumulative rankings
p2 <- top_rank_counts_by_organism %>% ggplot(aes(Organism, Program)) +
geom_tile(aes(fill = n)) +
ggtitle("Cumulative metric leaderboard occurrence for Prog on Org") +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
geom_text(aes(label=n), size = 2, color='white')
plot(p2)
p3 <- ggplot(cumulative_scores, aes(Program, `sum(n)`)) +
geom_bar(stat='identity', fill='plum1') +
ggtitle(sprintf("Cumulative Metric Leaderboard Occurrence, requiring min_rank=%d", MIN_RANK)) +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
geom_text(aes(label=`sum(n)`), size = 3, position = position_stack(vjust = 0.5))
plot(p3)
ret_list = list(top_rank_counts_by_organism=top_rank_counts_by_organism,
cumulative_scores=cumulative_scores)
return(ret_list)
}
ret = get_top_ranking_progs_by_org(data, 10)
ret = get_top_ranking_progs_by_org(data, 9)
ret = get_top_ranking_progs_by_org(data, 8)
ret = get_top_ranking_progs_by_org(data, 7)