Unown simulations

February 21, 2017

Run one simulation of encountering all 26 Unown forms, assuming each form is equally likely on every encounter.

# Count how many encounters it takes to see all 26 Unown forms when every
# encounter is an independent, uniformly random form (one of `letters`).
# Takes no arguments and returns the encounter count as a single integer.
simulate_unown <- function() {
  n_forms <- 26L
  seen <- character(0)
  n_encounters <- 0L

  repeat {
    n_missing <- n_forms - length(seen)
    if (n_missing <= 0L) {
      break
    }

    # Draw one encounter per missing form. The set can only be completed by a
    # batch in which every draw is a new form, so stopping at a batch boundary
    # never counts surplus encounters.
    draws <- sample(letters, size = n_missing, replace = TRUE)
    n_encounters <- n_encounters + n_missing
    seen <- union(seen, draws)
  }

  n_encounters
}
# Each call is an independent random run, so the count differs between calls.
simulate_unown()
#> [1] 93
simulate_unown()
#> [1] 148
simulate_unown()
#> [1] 70
simulate_unown()
#> [1] 107

Run 10,000 simulations.

# Repeat the simulation 10,000 times, keeping one integer encounter count per run.
simulations <- vapply(
  seq_len(10000),
  function(i) simulate_unown(),
  FUN.VALUE = integer(1)
)

Expected value:

# Mean number of encounters per simulation, rounded to a whole number.
round(mean(simulations))
#> [1] 100

The good news is that the median is less than the mean: at least half of the simulated runs finish in fewer encounters than the average.

# Middle value of the simulated encounter counts.
median(simulations)
#> [1] 95

Quantiles (the 5%–95% range covers the middle 90% of simulations):

# Selected percentiles (5th through 95th) of the simulated encounter counts.
quantile(simulations, probs = c(.05, .10, .25, .75, .90, .95))
#>  5% 10% 25% 75% 90% 95% 
#>  60  66  78 117 141 158

A long tail for unlucky trainers.

library(ggplot2)
# Histogram of the simulated encounter counts, using bins 5 encounters wide
# with one bin centered on 102.5.
ggplot(data.frame(x = simulations), aes(x = x)) +
  geom_histogram(binwidth = 5, center = 102.5, color = "white") +
  labs(
    x = "Num. unowns encountered until 26 unique unowns encountered",
    y = "Num. simulations"
  )

Now assume a 50% catch rate.

# Count how many encounters it takes to catch all 26 Unown forms.
#
# Every encounter is an independent, uniformly random form (one of `letters`)
# and is caught independently with probability `p_catch`.
#
# Arguments:
#   p_catch  Probability of catching an encountered Unown. Must be a single
#            number in (0, 1]; 0 is rejected because the collection could
#            never be completed and the loop would not terminate.
#
# Returns:
#   A single integer: the total number of encounters (caught or not) made
#   before all 26 forms were caught.
simulate_unown_catches <- function(p_catch) {
  if (!is.numeric(p_catch) || length(p_catch) != 1L || is.na(p_catch) ||
      p_catch <= 0 || p_catch > 1) {
    stop("`p_catch` must be a single number in (0, 1].", call. = FALSE)
  }

  target <- 26L
  catch_probs <- c(p_catch, 1 - p_catch)
  caught_forms <- character(0)
  n_encountered <- 0L

  # Draw one encounter per form still missing. The set can only be completed
  # by a batch in which every encounter is caught and is a new form, so
  # stopping at a batch boundary never counts surplus encounters.
  while (length(caught_forms) < target) {
    n_missing <- target - length(caught_forms)

    batch <- sample(letters, size = n_missing, replace = TRUE)
    catches <- sample(c(TRUE, FALSE), size = n_missing,
                      replace = TRUE, prob = catch_probs)

    # Only the running total and the set of caught forms are needed, so the
    # full encounter history is not stored.
    n_encountered <- n_encountered + n_missing
    caught_forms <- union(caught_forms, batch[catches])
  }

  n_encountered
}

# Four independent random runs at a 50% catch rate; the count differs between calls.
simulate_unown_catches(.5)
#> [1] 243
simulate_unown_catches(.5)
#> [1] 223
simulate_unown_catches(.5)
#> [1] 221
simulate_unown_catches(.5)
#> [1] 303

Run 10,000 simulations. Not surprisingly, it takes about twice as many encounters with a 50% catch rate.

# Run 10,000 simulations at a 50% catch rate. This reassigns `simulations`,
# so the summaries below describe these runs.
simulations <- replicate(10000, simulate_unown_catches(.5))
# Mean number of encounters per simulation, rounded to a whole number.
round(mean(simulations))
#> [1] 200
# Middle value of the simulated encounter counts.
median(simulations)
#> [1] 189
# Selected percentiles (5th through 95th) of the simulated encounter counts.
quantile(simulations, probs = c(.05, .10, .25, .75, .90, .95))
#>  5% 10% 25% 75% 90% 95% 
#> 118 131 155 233 284 319

# Histogram of the encounter counts from the 50% catch-rate simulations,
# using bins 5 encounters wide with one bin centered on 102.5.
ggplot(data.frame(x = simulations), aes(x = x)) +
  geom_histogram(binwidth = 5, center = 102.5, color = "white") +
  labs(
    x = "Num. unowns encountered until 26 unique unowns caught",
    y = "Num. simulations",
    caption = "50% catch rate"
  )