Null threshold for Precision@R and mAP

library(tidyverse)

Set up experiment

m <- 4 # the number of white balls in the urn (replicates)
n <- 32 # the number of black balls in the urn (negative controls)
k <- m # the number of balls drawn from the urn, hence must be in 0,1,…, m+n (top-k)

nn <- 10000 # number of iterations

Create indicator vector (top-k are TRUE, i.e. white)

y <- as.factor(seq(m + n) <= k)
y_rank <- 1 - (seq(m + n) / (m + n))

Generate many samples of precision at k (by random sampling of labels)

retrieval_metrics_empirical <-
  map_df(seq(nn), function(i) {
    x <- as.factor(sample(c(rep(FALSE, n), rep(TRUE, m))))

    pr_at_k_l <-
      yardstick::precision_vec(x, y, event_level = "second")

    average_precision_l <-
      yardstick::average_precision_vec(x, y_rank, event_level = "second")

    tibble(pr_at_k_l, average_precision_l)
  })
retrieval_metrics_empirical %>%
  ggplot(aes(pr_at_k_l, average_precision_l)) +
  geom_point()

Precision@K (under the null hypothesis) follows the hypergeometric distribution

Now sample from the hypergeometric distribution

pr_at_k_l <- rhyper(
  nn = nn,
  m = m,
  n = n,
  k = k
) / k

dist_simulation_pr_at_k <-
  data.frame(pr_at_k_l)

Compute thresholds

thresholds <-
  tribble(
    ~name,
    ~value,
    "m / (m + n)",
    m / (m + n),
    "empirical mean",
    mean(retrieval_metrics_empirical$pr_at_k_l),
    "hyper 95pct",
    qhyper(
      p = 0.95,
      m = m,
      n = n,
      k = k
    ) / k
  )
thresholds

Plot both samples; they should look very similar.

bind_rows(
  retrieval_metrics_empirical %>% select(pr_at_k_l) %>% mutate(type = "empirical"),
  dist_simulation_pr_at_k %>% mutate(type = "simulation")
) %>%
  ggplot(aes(pr_at_k_l)) +
  geom_histogram(binwidth = .01) +
  facet_wrap( ~ type, ncol = 1) +
  geom_vline(
    mapping = aes(xintercept = value,
                  colour = name),
    data = thresholds,
    show.legend = TRUE
  ) +
  theme_bw()

We are using \frac{m}{m+n}, i.e., the mean of the hypergeometric distribution as the null threshold. But we could instead use some higher quantile e.g. the 95th percentile as the null threshold.

This is easy to do for Pr@K but the story is a little more complicated for Average Precision, because there is likely no closed form for the distribution that AP follows.

(Bestgen 2015) provides an iterative process to compute the mean but it’s not clear how to extend this to a given percentile

RandomAPExact <-
  function(m = 0, n = 0) {
    ap <- 0
    for (x in 1:m) {
      for (k in x:(n + x)) {
        ap <- ap + dhyper(x = x, m = m, n = n, k = k) * (x / k) * (x / k)
      }
    }
    ap <- ap / m
    ap
  }

However, we could instead use an empirical estimate for the the 95th percentile:

thresholds_ap_pr_at_k <-
  tribble(
    ~name,
    ~value,
    "pr_at_k_l",
    quantile(retrieval_metrics_empirical$pr_at_k_l, .95, names = FALSE),
    "average_precision_l",
    quantile(retrieval_metrics_empirical$average_precision_l, .95, names = FALSE),
    "pr_at_k_l",
    mean(retrieval_metrics_empirical$pr_at_k_l),
    "average_precision_l",
    mean(retrieval_metrics_empirical$average_precision_l)
  )
thresholds_ap_pr_at_k
retrieval_metrics_empirical %>%
  pivot_longer(everything()) %>%
  ggplot(aes(value)) +
  geom_histogram(bins = 50) +
  geom_vline(
    mapping = aes(xintercept = value,
                  colour = name),
    data = thresholds_ap_pr_at_k,
    show.legend = TRUE
  ) +
  facet_wrap( ~ name, ncol = 1) +
  theme_bw() + 
  labs(caption = "Vertical lines indicate mean and 95th percentile")

References

Bestgen, Yves. 2015. “Exact Expected Average Precision of the Random Baseline for System Evaluation.” The Prague Bulletin of Mathematical Linguistics 103 (1): 131–38. https://doi.org/10.1515/pralin-2015-0007.