Hi stats Twitter, can any kind soul answer this question for me?@michael_nielsen @seanjtaylor @johnmyleswhite pic.twitter.com/zlydULHs03
— Chris Said (@Chris_Said) April 9, 2017
library(tidyverse)
theme_set(theme_minimal())
N <- 100
replications = 500
# hyperparameters
mu0 <- 0
sigma0 <- 1
# N groups with means drawn from Normal()
# and sample sizes that are log normally distributed (plus 1)
set.seed(2017)
sim_data <- crossing(replication = seq_len(replications),
group = seq_len(N),
theta = c(1, 3, 10, 30, 100, 300)) %>%
mutate(mu_i = rnorm(n(), mu0, sigma0),
n_i = 1 + round(rlnorm(n(), 2, 2))) %>%
unnest(observation = map(n_i, seq_len)) %>%
mutate(x = rnorm(n(), mu_i, theta))
group_means <- sim_data %>%
group_by(theta, replication, group, n_i) %>%
summarize(mu_i_hat = mean(x))
estimates <- group_means %>%
group_by(theta, replication) %>%
summarize(Direct = mean(mu_i_hat),
Weighted = sum(n_i * mu_i_hat) / sum(n_i))
# the true value of mu_0 is 0
estimates %>%
gather(metric, estimate, -theta, -replication) %>%
group_by(theta, metric) %>%
summarize(mse = mean(estimate ^ 2)) %>%
ggplot(aes(theta, mse, color = metric)) +
geom_line() +
scale_x_log10() +
scale_y_log10() +
labs(x = expression(paste(theta, " (within-group standard deviation)")),
y = expression(paste("Mean-squared error of estimate of ", mu[0])),
title = "Weighting mean by sample size improves MSE for high standard deviation",
subtitle = "100 groups, log-normal sample sizes, mu_i are normally distributed",
color = "Method")