library(tidyverse)   # dplyr, tidyr, and ggplot2 are used throughout

data = read.csv("https://data.broadinstitute.org/Trinity/TMP/bhaas/assembly_metrics.csv", header=TRUE, comment.char="#")

Examine rankings by organism

data <- data %>% gather(metric, score, 3:ncol(data))
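(gather() is superseded in current tidyr; a minimal equivalent using pivot_longer, assuming the first two columns are Organism and Program, would be the following, left commented out so the reshape runs only once:)

# data <- data %>% pivot_longer(cols = -(1:2), names_to = "metric", values_to = "score")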

Filter NA values

data <- data %>% filter(! is.na(score))

Reverse relative ordering of values for relevant metrics

For certain metrics, the lowest values are best, so here I negate those scores: after negation, the smallest original values become the largest adjusted values, and vice versa, so that every metric can be ranked as "higher is better".

metric_reverse_rankings = c("Misassemblies", "Mismatches.per.transcript", "Percentage.bases.uncovered", "Number.of.ambiguous.bases", "Missing.BUSCOs")

data <- data %>% mutate(adj_score = ifelse(metric %in% metric_reverse_rankings, -1 * score, score))

Set rankings

Here we rank all programs on each metric for each data set.

data <- data %>% group_by(Organism, metric) %>% 
  mutate(ranking = rank(adj_score, na.last = FALSE, ties.method="max"))
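A toy illustration (hypothetical scores, not from the data set): ties.method = "max" gives every member of a tied block the highest rank in that block, and na.last = FALSE would sort any NAs first (none remain after the filter above). With 10 programs, the single best score therefore receives rank 10.

rank(c(5, 7, 7, 2), na.last = FALSE, ties.method = "max")
# [1] 2 4 4 1   (the tied 7s both take the max rank, 4)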

Examine mean rankings

Below we analyze the mean rankings for each program across all metrics.

mean_rankings_df <- data %>% group_by(Program) %>% 
  summarize(mean_rank = mean(ranking)) %>% arrange(desc(mean_rank))
mean_rankings_df = mean_rankings_df %>% mutate(Program = factor(Program, Program))
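The factor(Program, Program) idiom pins the factor levels to the current (descending mean-rank) row order, so ggplot draws the bars in that order rather than alphabetically. A toy illustration:

# levels follow the vector's own order, not sort order:
x <- c("b", "c", "a")
levels(factor(x, x))   # "b" "c" "a"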

mean_rankings_df %>% ggplot(aes(Program, mean_rank)) + geom_bar(stat='identity') +  theme(axis.text.x = element_text(angle = 60, hjust = 1)) + ggtitle("Program Mean Rankings Across All Results")

Examine distribution of rankings across metrics

Below is the distribution of ranking values for each program, ordered according to the mean value as above.

data %>%
  mutate(Program = factor(Program, levels(mean_rankings_df$Program))) %>% 
  ggplot(aes(Program, ranking)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  ggtitle("Distribution of Metric Rankings Across Programs")

## still ordered by mean ranking as above.

Individual metric rankings across organisms for each program

For a broad view, the heatmap below shows the metric ranking for each program for each organism data set.

data %>% ggplot(aes(Program, metric)) + geom_tile(aes(fill = ranking)) + facet_wrap(~Organism) + theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  ggtitle("Rankings of Metrics per Program on each Organism")

Define best by top metric rankings in the leaderboard:

The leaderboard entries for each metric correspond to the MIN_RANK value set below. I explored MIN_RANK = 10 (meaning a program had to have the top score to be included), MIN_RANK = 9 (the top two programs), MIN_RANK = 8 (the top three programs, as used in the paper), and MIN_RANK = 7 (the top four programs). The cumulative scores simply sum the counts of appearances on the per-metric leaderboards.

## some utility functions

# MIN_RANK defines the minimum rank a program must reach
# to appear on the leaderboard for a given metric.
# There are 10 programs, so ranks run from 1 (worst) to 10 (best).

get_top_ranking_progs_by_org <- function(data, MIN_RANK) {
  # absolute top ranking:
  top_rank_counts_by_organism = data %>% filter(ranking >= MIN_RANK) %>% 
    group_by(Organism) %>% 
    count(Program)
  
  cumulative_scores = top_rank_counts_by_organism  %>% 
    group_by(Program) %>% 
    summarize(total = sum(n)) %>% 
    arrange(desc(total)) %>%
    mutate(Program=factor(Program, Program))
  
   p <- ggplot(top_rank_counts_by_organism, aes(Program, n)) + 
    geom_bar(stat='identity',fill="darkseagreen") + 
    facet_wrap(~Organism)  + 
    theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
    ggtitle(sprintf("Sum Count of Metric Leaderboard Occurrence Requiring min_rank=%d", MIN_RANK)) +
    geom_text(aes(label=n), size = 2, position = position_stack(vjust = 0.5))

   plot(p)

   # heatmap showing cumulative rankings
   p2 <- top_rank_counts_by_organism %>% ggplot(aes(Organism, Program)) +
     geom_tile(aes(fill = n)) + 
     ggtitle("Cumulative metric leaderboard occurrence for Prog on Org") + 
     theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
     geom_text(aes(label=n), size = 2, color='white')
   
   plot(p2)
   
  
  p3 <- ggplot(cumulative_scores, aes(Program, total)) +
    geom_bar(stat='identity', fill='plum1') + 
    ggtitle(sprintf("Cumulative Metric Leaderboard Occurrence, requiring min_rank=%d", MIN_RANK)) +
    theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
    geom_text(aes(label=total), size = 3, position = position_stack(vjust = 0.5))

  plot(p3)
  
  ret_list = list(top_rank_counts_by_organism=top_rank_counts_by_organism,
                  cumulative_scores=cumulative_scores)
  
  return(ret_list)
}

Scoring considering only the single best program per metric

ret = get_top_ranking_progs_by_org(data, 10)

Scoring considering the top two best-scoring programs per metric

ret = get_top_ranking_progs_by_org(data, 9)

Scoring considering the top three programs per metric

ret = get_top_ranking_progs_by_org(data, 8)

Scoring considering the top four programs per metric

ret = get_top_ranking_progs_by_org(data, 7)

Filter out certain highly correlated metrics to reduce bias

Certain metrics were found to be highly correlated with other metrics, potentially biasing the scores toward programs that tend to score well on any metric within such a correlated group.

The metrics below were removed from the metric pool to reduce this effect (a sketch of how such correlations can be inspected follows the filtering code).

The mean rankings and distribution of rankings are shown after removing these ‘superfluous metrics’.

filtered_data <- data %>% filter(! metric %in% c("Percentage.good.mappings", 
                                                 "Percentage.bases.uncovered", 
                                                 "Number.of.ambiguous.bases", 
                                                 "Database.coverage", 
                                                 "Complete.single.copy.BUSCOs")) 

# re-rank within each Organism x metric group (note: rank() here uses its default ties.method = "average")
filtered_data <- filtered_data %>% group_by(Organism, metric) %>% mutate(ranking = rank(ranking))
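Since only metrics (not programs) were removed, each Organism x metric group still contains every program, so re-ranking the existing ranks is nearly an identity transform; the one difference is that ties previously assigned the max rank now receive the average, e.g.:

rank(c(10, 4, 4, 1))
# [1] 4.0 2.5 2.5 1.0   (the tied max-ranks of 4 become 2.5 under ties.method = "average")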

filtered_mean_rankings_df <- filtered_data %>% group_by(Program) %>% summarize(mean_rank = mean(ranking)) %>% arrange(desc(mean_rank))

filtered_mean_rankings_df <- filtered_mean_rankings_df %>% 
  mutate(Program = factor(Program, Program)) 

filtered_mean_rankings_df %>% ggplot(aes(Program, mean_rank)) + 
  geom_bar(stat='identity') +  
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  ggtitle("Mean rankings after removing superfluous metrics")

filtered_data %>%
  mutate(Program = factor(Program, levels(filtered_mean_rankings_df$Program))) %>% 
  ggplot(aes(Program, ranking)) + 
  geom_boxplot() + theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  ggtitle("Distribution of rankings after removing superfluous metrics")

Removing the ‘extra’ human entries.

Since I’m not averaging across the different human data sets (as was done in the paper), below I simply remove the virus-infected and simulated human data sets, to investigate how the scores look when each study contributes a single organism (including human just once).

Mean rankings and rank distributions are provided below, along with versions of the plots above rendered with these filtered data.

HumanExtraDataSets = c("HumanEboV3h", "HumanEbo7h", "HumanEbo23h", "HumanSim")

filtered_data <- filtered_data %>% filter(! Organism %in% HumanExtraDataSets) 

# re-rank within each Organism x metric group after dropping the extra human data sets
filtered_data <- filtered_data %>% group_by(Organism, metric) %>% mutate(ranking = rank(ranking))

filtered_mean_rankings_df <- filtered_data %>% group_by(Program) %>% 
  summarize(mean_rank = mean(ranking)) %>% arrange(desc(mean_rank))

filtered_mean_rankings_df <- filtered_mean_rankings_df %>% 
  mutate(Program = factor(Program, Program))

filtered_mean_rankings_df %>% ggplot(aes(Program, mean_rank)) + geom_bar(stat='identity') +  
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  ggtitle("mean rankings post removal of superfluous metrics and xtra human")

filtered_data %>%
  mutate(Program = factor(Program, levels(filtered_mean_rankings_df$Program))) %>% 
  ggplot(aes(Program, ranking)) + geom_boxplot() + 
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  ggtitle("Distribution of rankings after removing superfluous metrics and extra human data sets")

filtered_data %>% ggplot(aes(Program, metric)) + geom_tile(aes(fill = ranking)) + 
  facet_wrap(~Organism) + theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  ggtitle("Metric rankings post removal of superfluous metrics & xtra human")

Rank analysis using the filtered data

Below, the leaderboard analysis is repeated on the filtered data for MIN_RANK values of 10 through 7 (i.e., counting the top one through the top four programs per metric).

ret = get_top_ranking_progs_by_org(filtered_data, 10)

ret = get_top_ranking_progs_by_org(filtered_data, 9)

ret = get_top_ranking_progs_by_org(filtered_data, 8)

ret = get_top_ranking_progs_by_org(filtered_data, 7)