Hi stats Twitter, can any kind soul answer this question for me?@michael_nielsen @seanjtaylor @johnmyleswhite pic.twitter.com/zlydULHs03

— Chris Said (@Chris_Said) April 9, 2017
library(tidyverse)
theme_set(theme_minimal())

N <- 100
replications = 500

# hyperparameters
mu0 <- 0
sigma0 <- 1

# N groups with means drawn from Normal()
# and sample sizes that are log normally distributed (plus 1)
set.seed(2017)
sim_data <- crossing(replication = seq_len(replications),
                     group = seq_len(N),
                     theta = c(1, 3, 10, 30, 100, 300)) %>%
  mutate(mu_i = rnorm(n(), mu0, sigma0),
         n_i = 1 + round(rlnorm(n(), 2, 2))) %>%
  unnest(observation = map(n_i, seq_len)) %>%
  mutate(x = rnorm(n(), mu_i, theta))
group_means <- sim_data %>%
  group_by(theta, replication, group, n_i) %>%
  summarize(mu_i_hat = mean(x))

estimates <- group_means %>%
  group_by(theta, replication) %>%
  summarize(Direct = mean(mu_i_hat),
            Weighted = sum(n_i * mu_i_hat) / sum(n_i))
# the true value of mu_0 is 0
estimates %>%
  gather(metric, estimate, -theta, -replication) %>%
  group_by(theta, metric) %>%
  summarize(mse = mean(estimate ^ 2)) %>%
  ggplot(aes(theta, mse, color = metric)) +
  geom_line() +
  scale_x_log10() +
  scale_y_log10() +
  labs(x = expression(paste(theta, " (within-group standard deviation)")),
       y = expression(paste("Mean-squared error of estimate of ", mu[0])),
       title = "Weighting mean by sample size improves MSE for high standard deviation",
       subtitle = "100 groups, log-normal sample sizes, mu_i are normally distributed",
       color = "Method")