Load packages

library(tidyverse)
library(rstan)
## devtools::install_github('jburos/biostan', build_vignettes = TRUE, dependencies = TRUE)
## library(biostan)
set.seed(732565397)

Prepare data

We generate a dataset with two clearly separated modes.

## z in {1, 2}: cluster membership with P(z = 2) = 0.3
data1 <- data_frame(z = 1 + rbinom(n = 82, size = 1, prob = 0.3),
                    ## Component densities: N(-10, 1^2) and N(+8, 2^2)
                    x1 = rnorm(n = length(z), mean = -10, sd = 1),
                    x2 = rnorm(n = length(z), mean = +8, sd = 2),
                    ## Observed value comes from the component indicated by z
                    x = ifelse(z == 1, x1, x2))
ggplot(data = data1, mapping = aes(x = x)) +
    geom_point(y = 0.5) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
          legend.key = element_blank(),
          plot.title = element_text(hjust = 0.5),
          strip.background = element_blank())

Set the range of grid values over which to evaluate the estimated density.

grid_max <- 20
grid_min <- -20
grid_length <- 100
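
The generated quantities block of the Stan code below evaluates the log density log_f at the points grid_min + grid_step * (g - 1) for g = 1, ..., grid_length. Equivalently, in R (grid_points is just an illustrative name):

grid_points <- seq(from = grid_min, to = grid_max, length.out = grid_length)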

Define some helper functions

print_relevant_pars <- function(fit) {
    print(fit, pars = c("mu","sigma_squared","Pi","sigma","lp__"))
}

plot_draws <- function(stan_fit) {
    ## Note: reads grid_min, grid_max, grid_length, and data1 directly
    ## from the global environment.
    draw_data <- tidybayes::tidy_draws(stan_fit) %>%
        select(.chain, .iteration, .draw, starts_with("log_f")) %>%
        gather(key = key, value = value, starts_with("log_f")) %>%
        ## Recover the grid index from names like "log_f[3]", map it to the
        ## corresponding grid value, and exponentiate to the density scale.
        mutate(key = gsub("log_f|\\[|\\]", "", key) %>% as.integer(),
               x = factor(key, labels = seq(from = grid_min, to = grid_max, length.out = grid_length)) %>%
                   as.character() %>%
                   as.numeric(),
               value = exp(value))

    summary_density <- draw_data %>%
        group_by(.chain, x) %>%
        summarize(value = mean(value))

    ggplot(data = summary_density, mapping = aes(x = x, y = value, group = .chain)) +
    geom_line(size = 0.5, color = "gray") +
    geom_point(data = data1, mapping = aes(x = x, group = NULL), y = 0) +
    facet_wrap(~ .chain) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
          legend.key = element_blank(),
          plot.title = element_text(hjust = 0.5),
          strip.background = element_blank())
}

traceplot_all <- function(fit) {
    print(traceplot(fit, inc_warmup = TRUE, pars = "mu"))
    print(traceplot(fit, inc_warmup = TRUE, pars = "sigma"))
    print(traceplot(fit, inc_warmup = TRUE, pars = "Pi"))
    print(traceplot(fit, inc_warmup = FALSE, pars = "lp__"))
}

pairs_plot_all <- function(fit) {
    pairs(fit, pars = "mu")
    pairs(fit, pars = "sigma_squared")
    pairs(fit, pars = "Pi")
}

Finite Mixture of Normals

Model

Let \(z_{i} \in \left\{ 1, \dots, H \right\}\) be the cluster membership latent variable and \(\mathbf{z}_{i} = (I(z_{i}=1),\dots,I(z_{i}=H))^{T}\) be the corresponding indicator vector. We have the following model.

\[\begin{align*} y_{i} | (z_{i}, \mu_{1},\dots,\mu_{H}, \sigma^{2}_{1},\dots,\sigma^{2}_{H}) &\sim N(\mu_{z_{i}}, \sigma^{2}_{z_{i}})\\ \mathbf{z}_{i} | \boldsymbol{\pi} &\sim Multinomial(1, \boldsymbol{\pi})\\ \\ p(\mu_{h},\sigma^{2}_{h}) &= p(\mu_{h}|\sigma^{2}_{h})p(\sigma^{2}_{h})\\ &= N(\mu_{h} | m, s^{2}) \, Inverse\text{-}Gamma(\sigma^{2}_{h} | \alpha, \beta)\\ &= \left[ \frac{1}{\sqrt{2\pi s^{2}}} \exp \left( - \frac{(\mu_{h} - m)^{2}}{2 s^{2}} \right) \right] \left[ \frac{\beta^{\alpha}}{\Gamma(\alpha)} (\sigma^{2}_{h})^{-\alpha - 1} \exp \left( - \frac{\beta}{\sigma^{2}_{h}} \right) \right]\\ &\text{where}\\ m &= 0\\ s^{2} &= 10^{3}\\ \alpha &= 10^{-3}\\ \beta &= 10^{-3}\\ \\ \boldsymbol{\pi} &\sim Dirichlet \left( \frac{\alpha_{\pi}}{H}, \dots, \frac{\alpha_{\pi}}{H} \right)\\ \end{align*} \]

Here \(\alpha_{\pi}\) denotes the total Dirichlet mass (dirichlet_alpha in the Stan code below); it is distinct from the inverse-gamma shape \(\alpha\).

Summing out the latent categorical variable \(z_{i}\) yields the following marginal density (conditioning on the parameters is suppressed for simplicity). Because the cluster membership \(z_i\) is not measured, it is a discrete parameter. Stan’s Hamiltonian Monte Carlo (HMC) cannot handle discrete parameters, so this marginalization step is required for Stan. JAGS, by contrast, allows discrete parameters and accepts the original model above.

\[\begin{align*} p(y) &= \sum^{H}_{h=1} p(y|z=h) p(z=h)\\ &= \sum^{H}_{h=1} \pi_{h} p(y|z=h)\\ &= \sum^{H}_{h=1} \pi_{h} N(y | \mu_{h}, \sigma^{2}_{h})\\ \end{align*} \]
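
To make the marginalization concrete, here is a minimal R sketch of the same computation on the log scale. The function name log_marginal is mine, and matrixStats::logSumExp plays the role of Stan's log_sum_exp():

## Stable evaluation of log p(y) = log(sum_h Pi[h] * N(y | mu[h], sigma[h])).
log_marginal <- function(y, Pi, mu, sigma) {
    contributions <- log(Pi) + dnorm(y, mean = mu, sd = sigma, log = TRUE)
    matrixStats::logSumExp(contributions)
}
## Evaluated at the true data-generating values of our dataset.
log_marginal(y = -10, Pi = c(0.7, 0.3), mu = c(-10, 8), sigma = c(1, 2))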

Fitting

2 latent cluster model

normal_fixed_mixture_unordered_stan_code <- biostan::read_stan_file("./bayesianideas_density_normal_fixed_mixture_unordered.stan")
## Warning: replacing previous import 'rstan::loo' by 'rstanarm::loo' when loading 'biostan'
biostan::print_stan_code(normal_fixed_mixture_unordered_stan_code, section = NULL)
## data { 
##     // Hyperparameters 
##     real alpha; 
##     real beta; 
##     real m; 
##     real s_squared; 
##     real<lower=0> dirichlet_alpha; 
##  
##     // Define variables in data 
##     // Number of observations (an integer) 
##     int<lower=0> n; 
##     // Outcome (a real vector of length n) 
##     real y[n]; 
##     // Number of latent clusters 
##     int<lower=1> H; 
##  
##     // Grid evaluation 
##     real grid_max; 
##     real grid_min; 
##     int<lower=1> grid_length; 
## } 
##  
## transformed data { 
##     real s; 
##     real grid_step; 
##  
##     s = sqrt(s_squared); 
##     grid_step = (grid_max - grid_min) / (grid_length - 1); 
## } 
##  
## parameters { 
##     // Define parameters to estimate 
##     // Population mean (a real number) 
##     vector[H] mu; 
##     // Population variance (a positive real number) 
##     real<lower=0> sigma_squared[H]; 
##     // Cluster probability 
##     simplex[H] Pi; 
## } 
##  
## transformed parameters { 
##     // Population standard deviation (a positive real number) 
##     real<lower=0> sigma[H]; 
##     // Standard deviation (derived from variance) 
##     sigma = sqrt(sigma_squared); 
## } 
##  
## model { 
##     // Temporary array for loop use. Must be declared at the top of the 
##     // model block, before the sampling statements. 
##     real contributions[H]; 
##  
##     // Prior part of Bayesian inference 
##     // All vectorized 
##     // Mean 
##     mu ~ normal(m, s); 
##     // sigma^2 has an inverse gamma (alpha, beta) prior; alpha and beta are data 
##     sigma_squared ~ inv_gamma(alpha, beta); 
##     // cluster probability vector 
##     Pi ~ dirichlet(rep_vector(dirichlet_alpha / H, H)); 
##  
##     // Likelihood part of Bayesian inference 
##     // Outcome model N(mu, sigma^2) (use SD rather than Var) 
##     for (i in 1:n) { 
##         // Loop over individuals 
##  
##           for (h in 1:H) { 
##               // Loop over clusters within each individual 
##               // Log likelihood contributions log(Pi[h] * N(y[i] | mu[h],sigma[h])) 
##               contributions[h] = log(Pi[h]) + normal_lpdf(y[i] | mu[h], sigma[h]); 
##           } 
##  
##           // log(sum(exp(contribution element))) 
##           target += log_sum_exp(contributions); 
##  
##     } 
## } 
##  
## generated quantities { 
##  
##     real log_f[grid_length]; 
##  
##     for (g in 1:grid_length) { 
##         // Defining these locally avoids reporting of these intermediates. 
##         real contributions[H]; 
##         real grid_value; 
##  
##         grid_value = grid_min + grid_step * (g - 1); 
##         for (h in 1:H) { 
##             contributions[h] = log(Pi[h]) + normal_lpdf(grid_value | mu[h], sigma[h]); 
##         } 
##  
##         log_f[g] = log_sum_exp(contributions); 
##     } 
##  
## } 
## 
normal_fixed_mixture_unordered_stan_fit <-
    rstan::stan(model_code = normal_fixed_mixture_unordered_stan_code,
                data = list(alpha = 10^(-3), beta = 10^(-3),
                            m = 0, s_squared = 10^(3),
                            n = nrow(data1),
                            y = data1$x,
                            dirichlet_alpha = 1,
                            H = 2,
                            grid_max = grid_max, grid_min = grid_min, grid_length = grid_length),
                chains = 12)
## Warning: There were 2303 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: Examine the pairs() plot to diagnose sampling problems
print_relevant_pars(normal_fixed_mixture_unordered_stan_fit)
## Inference for Stan model: 68985fd05a725a6fbd3d2ee07905d347.
## 12 chains, each with iter=2000; warmup=1000; thin=1; 
## post-warmup draws per chain=1000, total post-warmup draws=12000.
## 
##                            mean se_mean            sd    2.5%     25%     50%     75%          97.5% n_eff  Rhat
## mu[1]             -1.430000e+00    2.60  1.463000e+01  -33.71  -10.16   -1.29    7.73   3.040000e+01    32  1.17
## mu[2]             -2.590000e+00    2.97  1.198000e+01  -16.61  -10.19   -5.48    7.57   1.241000e+01    16  1.33
## sigma_squared[1]  3.298667e+303     NaN           Inf    0.71    1.04    6.57   87.77  3.125404e+248   NaN   NaN
## sigma_squared[2]  6.415190e+304     NaN           Inf    0.66    0.94    5.34   14.27  2.130830e+171   NaN   NaN
## Pi[1]              4.500000e-01    0.12  3.000000e-01    0.00    0.26    0.42    0.71   1.000000e+00     6  7.39
## Pi[2]              5.500000e-01    0.12  3.000000e-01    0.00    0.29    0.58    0.74   1.000000e+00     6  7.39
## sigma[1]          1.017418e+150     NaN 5.742741e+151    0.84    1.02    2.56    9.37  1.767710e+124   NaN  1.00
## sigma[2]          5.698957e+150     NaN 2.532287e+152    0.81    0.97    2.31    3.78   4.565878e+85   NaN  1.00
## lp__              -2.171300e+02   18.73  4.592000e+01 -297.49 -237.03 -188.72 -187.13  -1.858900e+02     6 31.17
## 
## Samples were drawn using NUTS(diag_e) at Sat Dec  1 21:58:36 2018.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).
traceplot_all(normal_fixed_mixture_unordered_stan_fit)

pairs_plot_all(normal_fixed_mixture_unordered_stan_fit)

## Warning in plot.window(...): Internal(pretty()): very large range.. corrected
## Warning in KernSmooth::bkde2D(x, bandwidth = bandwidth, gridsize = nbin, : Binning grid too coarse for current
## (small) bandwidth: consider increasing 'gridsize'
## Warning in (function (x, y = NULL, nbin = 128, bandwidth, colramp = colorRampPalette(c("white", : NAs introduced by
## coercion to integer range

## Warning in (function (x, y = NULL, nbin = 128, bandwidth, colramp = colorRampPalette(c("white", : NAs introduced by
## coercion to integer range
## Warning in plot.window(...): Internal(pretty()): very large range.. corrected
## Warning in KernSmooth::bkde2D(x, bandwidth = bandwidth, gridsize = nbin, : Binning grid too coarse for current
## (small) bandwidth: consider increasing 'gridsize'
## Warning in plot.window(...): Internal(pretty()): very large range.. corrected

## Warning in plot.window(...): Internal(pretty()): very large range.. corrected
## Warning in par(usr = c(usr[1:2], 0, 1.5)): Internal(pretty()): very large range.. corrected

plot_draws(normal_fixed_mixture_unordered_stan_fit)

shinystan::launch_shinystan(normal_fixed_mixture_unordered_stan_fit)

Many divergent transitions occurred, and the Rhat statistics are all large, including the one for the log posterior lp__. In chains 3, 8, 9, and 11, only one of the two clusters remained, resulting in an essentially single-normal fit. Once a cluster probability is zero, the corresponding mean parameter mu can take any value and give the same likelihood. This is probably the reason for the widespread cross-shaped appearance of the pairs plot for the mu parameters.
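
A quick per-chain summary (my own addition) makes the collapsed chains visible: their posterior mean of Pi[1] sits near 0 or 1.

tidybayes::tidy_draws(normal_fixed_mixture_unordered_stan_fit) %>%
    group_by(.chain) %>%
    summarize(Pi1 = mean(`Pi[1]`), Pi2 = mean(`Pi[2]`))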

I realized the prior for the cluster probability vector is Dirichlet(dirichlet_alpha / H, ..., dirichlet_alpha / H), that is, Dirichlet(0.5, 0.5) in this case, which places a lot of mass on regions that collapse one of the two clusters.

data_frame(x = seq(from = 0, to = 1, by = 0.01),
           y = dbeta(x = x, shape1 = 0.5, shape2 = 0.5)) %>%
    filter(y < Inf) %>%
ggplot(mapping = aes(x = x, y = y)) +
    geom_path() +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
          legend.key = element_blank(),
          plot.title = element_text(hjust = 0.5),
          strip.background = element_blank())
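
Random draws tell the same story. Base R has no Dirichlet sampler, so this sketch builds one from independent gamma draws (a standard construction; rdirichlet is my own helper name):

rdirichlet <- function(n, alpha) {
    ## Normalized independent gammas yield one Dirichlet(alpha) draw per row.
    g <- matrix(rgamma(n * length(alpha), shape = alpha), nrow = n, byrow = TRUE)
    g / rowSums(g)
}
draws <- rdirichlet(10000, c(0.5, 0.5))
## Proportion of draws in which one cluster is nearly collapsed.
mean(pmin(draws[, 1], draws[, 2]) < 0.01)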

2 latent cluster model (Dirichlet mass 3)

normal_fixed_mixture_unordered_stan_fit2 <-
    rstan::stan(model_code = normal_fixed_mixture_unordered_stan_code,
                data = list(alpha = 10^(-3), beta = 10^(-3),
                            m = 0, s_squared = 10^(3),
                            n = nrow(data1),
                            y = data1$x,
                            dirichlet_alpha = 3,
                            H = 2,
                            grid_max = grid_max, grid_min = grid_min, grid_length = grid_length),
                chains = 12)
## Warning: There were 625 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: Examine the pairs() plot to diagnose sampling problems
print_relevant_pars(normal_fixed_mixture_unordered_stan_fit2)
## Inference for Stan model: 68985fd05a725a6fbd3d2ee07905d347.
## 12 chains, each with iter=2000; warmup=1000; thin=1; 
## post-warmup draws per chain=1000, total post-warmup draws=12000.
## 
##                            mean se_mean            sd    2.5%     25%     50%     75%          97.5% n_eff  Rhat
## mu[1]             -1.760000e+00    3.36  1.236000e+01  -15.21  -10.20   -9.99    7.77   1.695000e+01    14  1.41
## mu[2]             -7.700000e-01    3.53  8.670000e+00  -10.38  -10.15    1.71    7.77   8.620000e+00     6 19.05
## sigma_squared[1]  2.596583e+304     NaN           Inf    0.69    0.92    1.85    7.16  2.680406e+196   NaN   NaN
## sigma_squared[2]   9.880000e+00    7.74  1.940000e+01    0.70    0.98    5.00    7.47   7.751000e+01     6  4.90
## Pi[1]              4.800000e-01    0.10  2.400000e-01    0.01    0.29    0.48    0.70   7.800000e-01     6  5.16
## Pi[2]              5.200000e-01    0.10  2.400000e-01    0.22    0.30    0.52    0.71   9.900000e-01     6  5.16
## sigma[1]          4.765329e+150     NaN 1.610754e+152    0.83    0.96    1.36    2.68   1.619936e+98   NaN  1.00
## sigma[2]           2.410000e+00    0.81  2.020000e+00    0.84    0.99    2.24    2.73   8.800000e+00     6  5.70
## lp__              -1.987000e+02   12.49  3.065000e+01 -300.65 -190.81 -189.32 -188.39  -1.873800e+02     6 19.81
## 
## Samples were drawn using NUTS(diag_e) at Sat Dec  1 21:59:02 2018.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).
traceplot_all(normal_fixed_mixture_unordered_stan_fit2)

pairs_plot_all(normal_fixed_mixture_unordered_stan_fit2)

## Warning in KernSmooth::bkde2D(x, bandwidth = bandwidth, gridsize = nbin, : Binning grid too coarse for current
## (small) bandwidth: consider increasing 'gridsize'
## Warning in (function (x, y = NULL, nbin = 128, bandwidth, colramp = colorRampPalette(c("white", : NAs introduced by
## coercion to integer range
## Warning in KernSmooth::bkde2D(x, bandwidth = bandwidth, gridsize = nbin, : Binning grid too coarse for current
## (small) bandwidth: consider increasing 'gridsize'

plot_draws(normal_fixed_mixture_unordered_stan_fit2)

shinystan::launch_shinystan(normal_fixed_mixture_unordered_stan_fit2)

I changed the prior to Dirichlet(1.5, 1.5) to encode the prior knowledge that we likely have two clusters. Only chain 12 collapsed to one cluster (cluster 2 collapsed). Chain 1 assigned label 1 to the cluster with mean 8 and probability 30%, whereas chain 11(?) assigned label 2 to that same cluster. That is label switching. These are effectively the same model up to a label permutation; thus, every chain except chain 12 converged to the same log posterior density trajectory, and the resulting density estimates have the same bimodal shape. (A post-hoc relabeling sketch follows the prior density plot below.)

data_frame(x = seq(from = 0, to = 1, by = 0.01),
           y = dbeta(x = x, shape1 = 1.5, shape2 = 1.5)) %>%
    filter(y < Inf) %>%
ggplot(mapping = aes(x = x, y = y)) +
    geom_path() +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
          legend.key = element_blank(),
          plot.title = element_text(hjust = 0.5),
          strip.background = element_blank())
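
As an alternative to the ordering constraint used later, label switching can also be undone after sampling. This is a minimal relabeling sketch (relabel_by_mu is my own helper, not part of the original workflow): within each draw, sort the components by mu so that component 1 is always the one with the smaller mean.

relabel_by_mu <- function(fit) {
    draws <- rstan::extract(fit, pars = c("mu", "sigma", "Pi"))
    for (i in seq_len(nrow(draws$mu))) {
        ## Permute all component-indexed parameters consistently.
        o <- order(draws$mu[i, ])
        draws$mu[i, ]    <- draws$mu[i, o]
        draws$sigma[i, ] <- draws$sigma[i, o]
        draws$Pi[i, ]    <- draws$Pi[i, o]
    }
    draws
}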

2 latent cluster model (Dirichlet mass 10)

normal_fixed_mixture_unordered_stan_fit3 <-
    rstan::stan(model_code = normal_fixed_mixture_unordered_stan_code,
                data = list(alpha = 10^(-3), beta = 10^(-3),
                            m = 0, s_squared = 10^(3),
                            n = nrow(data1),
                            y = data1$x,
                            dirichlet_alpha = 10,
                            H = 2,
                            grid_max = grid_max, grid_min = grid_min, grid_length = grid_length),
                chains = 12)
print_relevant_pars(normal_fixed_mixture_unordered_stan_fit3)
## Inference for Stan model: 68985fd05a725a6fbd3d2ee07905d347.
## 12 chains, each with iter=2000; warmup=1000; thin=1; 
## post-warmup draws per chain=1000, total post-warmup draws=12000.
## 
##                     mean se_mean   sd    2.5%     25%     50%     75%   97.5% n_eff  Rhat
## mu[1]              -5.69    3.18 7.79  -10.42  -10.24  -10.13   -5.76    8.46     6 27.92
## mu[2]               3.28    3.17 7.79  -10.34    1.85    7.55    8.00    8.77     6 16.78
## sigma_squared[1]    2.46    1.07 2.86    0.66    0.85    1.00    2.10    9.93     7  2.57
## sigma_squared[2]    5.46    1.07 3.26    0.73    2.52    5.74    7.46   12.02     9  1.69
## Pi[1]               0.59    0.07 0.17    0.25    0.50    0.66    0.70    0.77     7  3.53
## Pi[2]               0.41    0.07 0.17    0.23    0.30    0.34    0.50    0.75     7  3.53
## sigma[1]            1.38    0.29 0.75    0.81    0.92    1.00    1.45    3.15     7  3.51
## sigma[2]            2.20    0.29 0.79    0.86    1.58    2.40    2.73    3.47     7  2.28
## lp__             -194.91    0.02 1.62 -198.98 -195.73 -194.59 -193.72 -192.76  5362  1.00
## 
## Samples were drawn using NUTS(diag_e) at Sat Dec  1 21:59:20 2018.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).
traceplot_all(normal_fixed_mixture_unordered_stan_fit3)

pairs_plot_all(normal_fixed_mixture_unordered_stan_fit3)

plot_draws(normal_fixed_mixture_unordered_stan_fit3)

shinystan::launch_shinystan(normal_fixed_mixture_unordered_stan_fit3)

We can encode a stronger belief in the existence of both clusters with Dirichlet(5, 5). No divergent transitions occurred. The Rhat statistics still look large, but the one for the log posterior density is 1.00. For the cluster probability parameter Pi, we clearly see that the only remaining problem is label switching. As a result, the density estimates all have the same bimodal shape.

data_frame(x = seq(from = 0, to = 1, by = 0.01),
           y = dbeta(x = x, shape1 = 5, shape2 = 5)) %>%
    filter(y < Inf) %>%
ggplot(mapping = aes(x = x, y = y)) +
    geom_path() +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
          legend.key = element_blank(),
          plot.title = element_text(hjust = 0.5),
          strip.background = element_blank())

2 latent cluster model (Dirichlet mass 10, order constraint)

normal_fixed_mixture_stan_code <- biostan::read_stan_file("./bayesianideas_density_normal_fixed_mixture.stan")
biostan::print_stan_code(normal_fixed_mixture_stan_code, section = NULL)
## data { 
##     // Hyperparameters 
##     real alpha; 
##     real beta; 
##     real m; 
##     real s_squared; 
##     real<lower=0> dirichlet_alpha; 
##  
##     // Define variables in data 
##     // Number of observations (an integer) 
##     int<lower=0> n; 
##     // Outcome (a real vector of length n) 
##     real y[n]; 
##     // Number of latent clusters 
##     int<lower=1> H; 
##  
##     // Grid evaluation 
##     real grid_max; 
##     real grid_min; 
##     int<lower=1> grid_length; 
## } 
##  
## transformed data { 
##     real s; 
##     real grid_step; 
##  
##     s = sqrt(s_squared); 
##     grid_step = (grid_max - grid_min) / (grid_length - 1); 
## } 
##  
## parameters { 
##     // Define parameters to estimate 
##     // Population mean (a real number) 
##     ordered[H] mu; 
##     // Population variance (a positive real number) 
##     real<lower=0> sigma_squared[H]; 
##     // Cluster probability 
##     simplex[H] Pi; 
## } 
##  
## transformed parameters { 
##     // Population standard deviation (a positive real number) 
##     real<lower=0> sigma[H]; 
##     // Standard deviation (derived from variance) 
##     sigma = sqrt(sigma_squared); 
## } 
##  
## model { 
##     // Temporary array for loop use. Must be declared at the top of the 
##     // model block, before the sampling statements. 
##     real contributions[H]; 
##  
##     // Prior part of Bayesian inference 
##     // All vectorized 
##     // Mean 
##     mu ~ normal(m, s); 
##     // sigma^2 has an inverse gamma (alpha, beta) prior; alpha and beta are data 
##     sigma_squared ~ inv_gamma(alpha, beta); 
##     // cluster probability vector 
##     Pi ~ dirichlet(rep_vector(dirichlet_alpha / H, H)); 
##  
##     // Likelihood part of Bayesian inference 
##     // Outcome model N(mu, sigma^2) (use SD rather than Var) 
##     for (i in 1:n) { 
##         // Loop over individuals 
##  
##           for (h in 1:H) { 
##               // Loop over clusters within each individual 
##               // Log likelihood contributions log(Pi[h] * N(y[i] | mu[h],sigma[h])) 
##               contributions[h] = log(Pi[h]) + normal_lpdf(y[i] | mu[h], sigma[h]); 
##           } 
##  
##           // log(sum(exp(contribution element))) 
##           target += log_sum_exp(contributions); 
##  
##     } 
## } 
##  
## generated quantities { 
##  
##     real log_f[grid_length]; 
##  
##     for (g in 1:grid_length) { 
##         // Defining these locally avoids reporting of these intermediates. 
##         real contributions[H]; 
##         real grid_value; 
##  
##         grid_value = grid_min + grid_step * (g - 1); 
##         for (h in 1:H) { 
##             contributions[h] = log(Pi[h]) + normal_lpdf(grid_value | mu[h], sigma[h]); 
##         } 
##  
##         log_f[g] = log_sum_exp(contributions); 
##     } 
##  
## } 
## 
normal_fixed_mixture_ordered_stan_fit <-
    rstan::stan(model_code = normal_fixed_mixture_stan_code,
                data = list(alpha = 10^(-3), beta = 10^(-3),
                            m = 0, s_squared = 10^(3),
                            n = nrow(data1),
                            y = data1$x,
                            dirichlet_alpha = 10,
                            H = 2,
                            grid_max = grid_max, grid_min = grid_min, grid_length = grid_length),
                chains = 12)
print_relevant_pars(normal_fixed_mixture_ordered_stan_fit)
## Inference for Stan model: 1a37d3991c5cd122dafa78b99a6fc985.
## 12 chains, each with iter=2000; warmup=1000; thin=1; 
## post-warmup draws per chain=1000, total post-warmup draws=12000.
## 
##                     mean se_mean   sd    2.5%     25%     50%     75%   97.5% n_eff Rhat
## mu[1]             -10.18    0.00 0.13  -10.44  -10.27  -10.19  -10.10   -9.93 13363    1
## mu[2]               7.78    0.00 0.53    6.74    7.43    7.77    8.13    8.85 12966    1
## sigma_squared[1]    0.95    0.00 0.18    0.66    0.82    0.92    1.05    1.37 12262    1
## sigma_squared[2]    6.98    0.02 2.26    3.84    5.42    6.58    8.09   12.56 11352    1
## Pi[1]               0.69    0.00 0.05    0.59    0.65    0.69    0.72    0.78 15323    1
## Pi[2]               0.31    0.00 0.05    0.22    0.28    0.31    0.35    0.41 15323    1
## sigma[1]            0.97    0.00 0.09    0.81    0.90    0.96    1.03    1.17 12726    1
## sigma[2]            2.61    0.00 0.40    1.96    2.33    2.56    2.84    3.54 12483    1
## lp__             -191.98    0.02 1.62 -196.06 -192.81 -191.62 -190.80 -189.88  5965    1
## 
## Samples were drawn using NUTS(diag_e) at Sat Dec  1 22:00:18 2018.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).
traceplot_all(normal_fixed_mixture_ordered_stan_fit)

pairs_plot_all(normal_fixed_mixture_ordered_stan_fit)

plot_draws(normal_fixed_mixture_ordered_stan_fit)

shinystan::launch_shinystan(normal_fixed_mixture_ordered_stan_fit)

If the only remaining problem is label switching, we can solve it by imposing an ordering constraint on mu (the ordered[H] declaration in the Stan code above), which breaks the label permutation symmetry with probability 1. All Rhat statistics are now 1, and the pairs plots look better. The density estimates themselves do not change.
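
As a quick sanity check (my own addition), the constraint can be verified to hold in every posterior draw:

mu_draws <- rstan::extract(normal_fixed_mixture_ordered_stan_fit, pars = "mu")$mu
stopifnot(all(mu_draws[, 1] < mu_draws[, 2]))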


print(sessionInfo())
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.1
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] bindrcpp_0.2.2     rstan_2.18.1       StanHeaders_2.18.0 forcats_0.3.0      stringr_1.3.1     
##  [6] dplyr_0.7.7        purrr_0.2.5        readr_1.1.1        tidyr_0.8.2        tibble_1.4.2      
## [11] ggplot2_3.1.0      tidyverse_1.2.1    doRNG_1.7.1        rngtools_1.3.1     pkgmaker_0.27     
## [16] registry_0.5       doParallel_1.0.14  iterators_1.0.10   foreach_1.4.4      knitr_1.20        
## 
## loaded via a namespace (and not attached):
##   [1] minqa_1.2.4               colorspace_1.3-2          ggridges_0.5.1            rsconnect_0.8.8          
##   [5] rprojroot_1.3-2           ggstance_0.3.1            markdown_0.8              base64enc_0.1-3          
##   [9] rstudioapi_0.8            svUnit_0.7-12             DT_0.4                    lubridate_1.7.4          
##  [13] xml2_1.2.0                codetools_0.2-15          splines_3.5.1             shinythemes_1.1.1        
##  [17] bayesplot_1.6.0           jsonlite_1.5              nloptr_1.2.1              broom_0.5.0              
##  [21] shiny_1.1.0               compiler_3.5.1            httr_1.3.1                backports_1.1.2          
##  [25] assertthat_0.2.0          Matrix_1.2-14             lazyeval_0.2.1            cli_1.0.1                
##  [29] later_0.7.5               htmltools_0.3.6           prettyunits_1.0.2         tools_3.5.1              
##  [33] igraph_1.2.2              coda_0.19-2               gtable_0.2.0              glue_1.3.0               
##  [37] reshape2_1.4.3            Rcpp_0.12.19              cellranger_1.1.0          nlme_3.1-137             
##  [41] crosstalk_1.0.0           ps_1.2.0                  lme4_1.1-18-1             rvest_0.3.2              
##  [45] mime_0.6                  miniUI_0.1.1.1            tidybayes_1.0.3           gtools_3.8.1             
##  [49] MASS_7.3-51               zoo_1.8-4                 scales_1.0.0              rstanarm_2.18.1          
##  [53] colourpicker_1.0          hms_0.4.2                 promises_1.0.1            inline_0.3.15            
##  [57] shinystan_2.5.0           yaml_2.2.0                gridExtra_2.3             loo_2.0.0                
##  [61] stringi_1.2.4             dygraphs_1.1.1.6          pkgbuild_1.0.2            bibtex_0.4.2             
##  [65] rlang_0.3.0.1             pkgconfig_2.0.2           matrixStats_0.54.0        evaluate_0.12            
##  [69] lattice_0.20-35           bindr_0.1.1               splines2_0.2.8            rstantools_1.5.1         
##  [73] htmlwidgets_1.3           labeling_0.3              processx_3.2.0            tidyselect_0.2.5         
##  [77] biostan_0.1.0             plyr_1.8.4                magrittr_1.5              R6_2.3.0                 
##  [81] pillar_1.3.0              haven_1.1.2               withr_2.1.2               xts_0.11-1               
##  [85] survival_2.43-1           modelr_0.1.2              crayon_1.3.4              arrayhelpers_1.0-20160527
##  [89] KernSmooth_2.23-15        rmarkdown_1.10            grid_3.5.1                readxl_1.1.0             
##  [93] callr_3.0.0               threejs_0.3.1             digest_0.6.18             xtable_1.8-3             
##  [97] httpuv_1.4.5              stats4_3.5.1              munsell_0.5.0             shinyjs_1.0
## Record execution time and multicore use
end_time <- Sys.time()
diff_time <- difftime(end_time, start_time, units = "auto")
cat("Started  ", as.character(start_time), "\n",
    "Finished ", as.character(end_time), "\n",
    "Time difference of ", diff_time, " ", attr(diff_time, "units"), "\n",
    "Used ", foreach::getDoParWorkers(), " cores\n",
    "Used ", foreach::getDoParName(), " as backend\n",
    sep = "")
## Started  2018-12-01 21:57:52
## Finished 2018-12-01 22:00:32
## Time difference of 2.657673 mins
## Used 12 cores
## Used doParallelMC as backend