We simulate random data from two distributions- a normal centered at 0, and a normal centered at 4.
library(tidyverse)
theme_set(theme_bw())
set.seed(2016)
# setup: simulate data
observations <- data_frame(observation = seq_len(1000)) %>%
mutate(oracle = sample(c("A", "B"), n(), TRUE)) %>%
mutate(mu = ifelse(oracle == "A", 0, 4),
x = rnorm(n(), mu, sd = 1))
ggplot(observations, aes(x)) +
geom_histogram() +
geom_vline(xintercept = c(0, 4), color = "red", lty = 2)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

How could we estimate assignments for this data, assuming we no longer knew them?
We perform expectation maximization:
set.seed(1337)
# we set up by assigning clusters randomly
maximized <- observations %>%
mutate(assignment = sample(c("A", "B"), n(), TRUE))
for (i in 1:10) {
# expectation: find the mean/sd of current clusters
estimates <- maximized %>%
group_by(assignment) %>%
summarize(mu_hat = mean(x), sigma_hat = sd(x))
# maximization: pick the best for each
last <- maximized
maximized <- observations %>%
crossing(estimates) %>%
mutate(log_likelihood = dnorm(x, mu_hat, sigma_hat, log = TRUE)) %>%
group_by(observation) %>%
top_n(1, log_likelihood) %>%
ungroup()
# have we converged yet?
if (all(maximized$assignment == last$assignment)) {
break
}
}
ggplot(maximized, aes(x, fill = assignment)) +
geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

LS0tCnRpdGxlOiAiRml0dGluZyBhIG1peHR1cmUgbW9kZWwgd2l0aCBhIHRpZHkgRU0iCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCldlIHNpbXVsYXRlIHJhbmRvbSBkYXRhIGZyb20gdHdvIGRpc3RyaWJ1dGlvbnMtIGEgbm9ybWFsIGNlbnRlcmVkIGF0IDAsIGFuZCBhIG5vcm1hbCBjZW50ZXJlZCBhdCA0LgoKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQp0aGVtZV9zZXQodGhlbWVfYncoKSkKCnNldC5zZWVkKDIwMTYpCgojIHNldHVwOiBzaW11bGF0ZSBkYXRhCm9ic2VydmF0aW9ucyA8LSBkYXRhX2ZyYW1lKG9ic2VydmF0aW9uID0gc2VxX2xlbigxMDAwKSkgJT4lCiAgbXV0YXRlKG9yYWNsZSA9IHNhbXBsZShjKCJBIiwgIkIiKSwgbigpLCBUUlVFKSkgJT4lCiAgbXV0YXRlKG11ID0gaWZlbHNlKG9yYWNsZSA9PSAiQSIsIDAsIDQpLAogICAgICAgICB4ID0gcm5vcm0obigpLCBtdSwgc2QgPSAxKSkKCmdncGxvdChvYnNlcnZhdGlvbnMsIGFlcyh4KSkgKwogIGdlb21faGlzdG9ncmFtKCkgKwogIGdlb21fdmxpbmUoeGludGVyY2VwdCA9IGMoMCwgNCksIGNvbG9yID0gInJlZCIsIGx0eSA9IDIpCmBgYAoKSG93IGNvdWxkIHdlIGVzdGltYXRlIGFzc2lnbm1lbnRzIGZvciB0aGlzIGRhdGEsIGFzc3VtaW5nIHdlIG5vIGxvbmdlciBrbmV3IHRoZW0/CgpXZSBwZXJmb3JtIGV4cGVjdGF0aW9uIG1heGltaXphdGlvbjoKCmBgYHtyfQpzZXQuc2VlZCgxMzM3KQoKIyB3ZSBzZXQgdXAgYnkgYXNzaWduaW5nIGNsdXN0ZXJzIHJhbmRvbWx5Cm1heGltaXplZCA8LSBvYnNlcnZhdGlvbnMgJT4lCiAgbXV0YXRlKGFzc2lnbm1lbnQgPSBzYW1wbGUoYygiQSIsICJCIiksIG4oKSwgVFJVRSkpCgpmb3IgKGkgaW4gMToxMCkgewogICMgZXhwZWN0YXRpb246IGZpbmQgdGhlIG1lYW4vc2Qgb2YgY3VycmVudCBjbHVzdGVycwogIGVzdGltYXRlcyA8LSBtYXhpbWl6ZWQgJT4lCiAgICBncm91cF9ieShhc3NpZ25tZW50KSAlPiUKICAgIHN1bW1hcml6ZShtdV9oYXQgPSBtZWFuKHgpLCBzaWdtYV9oYXQgPSBzZCh4KSkKICAKICAjIG1heGltaXphdGlvbjogcGljayB0aGUgYmVzdCBmb3IgZWFjaAogIGxhc3QgPC0gbWF4aW1pemVkCiAgbWF4aW1pemVkIDwtIG9ic2VydmF0aW9ucyAlPiUKICAgIGNyb3NzaW5nKGVzdGltYXRlcykgJT4lCiAgICBtdXRhdGUobG9nX2xpa2VsaWhvb2QgPSBkbm9ybSh4LCBtdV9oYXQsIHNpZ21hX2hhdCwgbG9nID0gVFJVRSkpICU+JQogICAgZ3JvdXBfYnkob2JzZXJ2YXRpb24pICU+JQogICAgdG9wX24oMSwgbG9nX2xpa2VsaWhvb2QpICU+JQogICAgdW5ncm91cCgpCiAgCiAgIyBoYXZlIHdlIGNvbnZlcmdlZCB5ZXQ/CiAgaWYgKGFsbChtYXhpbWl6ZWQkYXNzaWdubWVudCA9PSBsYXN0JGFzc2lnbm1lbnQpKSB7CiAgICBicmVhawogIH0KfQoKZ2dwbG90KG1heGltaXplZCwgYWVzKHgsIGZpbGwgPSBhc3NpZ25tZW50KSkgKwogIGdlb21faGlzdG9ncmFtKCkKYGBgCg==