HYPOTHESIS: As the variability in word length decreases, the complexity bias of the vocabulary necessarily decreases as well.

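These simulations are consistent with that hypothesis. They show the maximum possible correlation in a sample of words as a function of the standard deviations of the length distribution and the complexity distribution. The maximum is obtained by sorting both samples before correlating them, so that the shortest word is paired with the least complex meaning, and so on. This pattern only emerges if word length is discrete (i.e., the sampled lengths are rounded to whole numbers). Under this condition, only the variability of the length distribution affects the overall correlation.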

#' Build a simulator for the maximum length-complexity correlation
#'
#' @param comp.sd Standard deviation of the normal complexity distribution.
#' @param length.sd Standard deviation of the normal length distribution.
#' @param length.mean Mean of the normal length distribution.
#' @param comp.mean Mean of the normal complexity distribution.
#' @param n Number of words drawn per simulation (default 1000).
#' @return A function of one argument. The argument is ignored; it exists so
#'   the function can be mapped over a vector of replicate indices. Each call
#'   draws a fresh sample and returns the Pearson correlation between the
#'   sorted complexities and the sorted, rounded lengths, or `NA_real_` when
#'   either sample is constant (the correlation is then undefined).
corrs <- function(comp.sd, length.sd, length.mean, comp.mean, n = 1000) {
  function(k) {
    # Lengths are drawn before complexities; word length is made discrete by
    # rounding to whole numbers.
    word_length <- round(rnorm(n, mean = length.mean, sd = length.sd))
    complexity <- rnorm(n, mean = comp.mean, sd = comp.sd)

    # With a small length.sd every length rounds to the same integer. cor()
    # would return NA with a zero-variance warning on every replicate, so
    # return NA explicitly instead.
    if (length(unique(word_length)) < 2 || length(unique(complexity)) < 2) {
      return(NA_real_)
    }

    # Sorting both samples pairs them rank-for-rank, which gives the largest
    # correlation attainable with these two sets of values.
    cor(sort(complexity), sort(word_length))
  }
}
#' Average the maximum correlation over repeated simulations
#'
#' Builds one simulator with `corrs()` for the given distribution parameters,
#' runs it 100 times, and returns the mean of the resulting correlations.
#' The mean is taken without `na.rm`, so a single `NA` replicate makes the
#' result `NA`.
#'
#' @param comp.sd,length.sd,length.mean,comp.mean Passed unchanged to
#'   `corrs()`.
#' @return A single numeric value: the mean correlation over 100 replicates.
get_corr_2 <- function(comp.sd, length.sd, length.mean, comp.mean) {
  simulate_once <- corrs(comp.sd, length.sd, length.mean, comp.mean)
  # The replicate index is passed to the simulator but not used by it.
  replicate_corrs <- vapply(1:100, simulate_once, numeric(1))
  mean(replicate_corrs)
}

# Parameter grid: every combination of the complexity and length standard
# deviations, with both distribution means held fixed at 1.
comp.sd <- seq(from = .1, to = 2, by = .5)
length.sd <- seq(from = .1, to = 2, by = .1)
length.mean <- 1
comp.mean <- 1
params <- expand.grid(
  comp.sd = comp.sd,
  length.sd = length.sd,
  length.mean = length.mean,
  comp.mean = comp.mean
)
# Run the simulation for each row of the parameter grid. Grouping by all four
# grid columns puts each parameter combination in its own group, so
# get_corr_2() is called once per combination.
sim <- params %>%
  group_by(comp.sd, length.sd, length.mean, comp.mean) %>%
  summarise(
    observed.complexity.bias = get_corr_2(
      comp.sd, length.sd, length.mean, comp.mean
    )
  ) %>%
  ungroup() %>%
  # comp.sd becomes a factor so it can serve as a discrete grouping variable.
  mutate(comp.sd = as.factor(comp.sd))

# Plot the simulated correlation against the standard deviation of the length
# distribution, with one panel and one colour per value of comp.sd.
ggplot(
  data = sim,
  mapping = aes(
    x = length.sd,
    y = observed.complexity.bias,
    colour = comp.sd
  )
) +
  geom_point() +
  geom_line() +
  facet_grid(. ~ comp.sd) +
  ggtitle("Maximum correlation between length and complexity") +
  theme_bw()