Giving a Dirichlet Process Prior to a Random Probability Measure G

```

References

BST 249 Materials

Load packages

set.seed(20161106)
library(tidyverse)
library(MCMCpack)

Aims

Here we aim to define a Dirichlet Process prior for a random probability measure \(G\) by various equivalent methods.

Definition 1: Dirichelet random variable

The very crude partitions are \((-\infty, -3], (-3,-2], (-2,-1], (-1,0], (0,1], (1,2], (2,3],\) and \((3,\infty)\). The initial distribution \(G_0\) is a standard normal.

## Define cutoffs
cutoffs <- c(-Inf,-3,-2,-1,0,+1,+2,+3,Inf)
## Probabilities of abovestated partitions by N(0,1)
G0Ai <- tail(pnorm(cutoffs), -1) - head(pnorm(cutoffs), -1)
G0Ai

## [1] 0.001349898 0.021400234 0.135905122 0.341344746 0.341344746 0.135905122 0.021400234 0.001349898

## Sum to 1
sum(G0Ai)

## [1] 1

## Set M
M <- 1
## Sample one Dirichlet vector (notice it sum to 1)
dir1 <- rdirichlet(n = 1, alpha = M * G0Ai)
dir1

##              [,1]      [,2]        [,3]      [,4]       [,5]         [,6]          [,7]          [,8]
## [1,] 2.32692e-251 0.2382863 0.002632018 0.6628468 0.09619033 7.983091e-10 0.00004454045 4.720157e-142

sum(dir1)

## [1] 1

## Sample many
dir_n <- rdirichlet(n = 10^4, alpha = M * G0Ai)
## Construct df for visualization
df1 <- as.data.frame(dir_n)
colnames(df1) <- paste0("G_A", seq_len(ncol(df1)))
df1$iter <- seq_len(nrow(df1))
df1long <- gather(data = df1, key = key, value = value, -iter)
## Marginal distribution of G_Ai
ggplot(data = df1long, mapping = aes(x = key, y = value)) +
    geom_boxplot() +
    theme_bw() + theme(legend.key = element_blank())

## Observe several realized vectors (line connects values from a single vector)
ggplot(data = subset(df1long, iter <= 5),
       mapping = aes(x = key, y = value, group = iter, color = factor(iter))) +
    geom_line() +
    geom_point() +
    theme_bw() + theme(legend.key = element_blank())

## Any two elements are inversely associated by the sum-is-one constraint
ggplot(data = df1,
       mapping = aes(x = G_A2, y = G_A4, color = G_A3)) +
    geom_point() +
    theme_bw() + theme(legend.key = element_blank())

## Repeat with a greater M
M <- 100
## Sample one Dirichlet vector (notice it sum to 1)
dir1 <- rdirichlet(n = 1, alpha = M * G0Ai)
dir1

##                  [,1]       [,2]    [,3]      [,4]      [,5]      [,6]       [,7]          [,8]
## [1,] 0.00000005005948 0.01482998 0.19269 0.3317011 0.3570582 0.0846562 0.01899657 0.00006798478

sum(dir1)

## [1] 1

## Sample many
dir_n <- rdirichlet(n = 10^4, alpha = M * G0Ai)
## Construct df for visualization
df1 <- as.data.frame(dir_n)
colnames(df1) <- paste0("G_A", seq_len(ncol(df1)))
df1$iter <- seq_len(nrow(df1))
df1long <- gather(data = df1, key = key, value = value, -iter)
## Marginal distribution of G_Ai
ggplot(data = df1long, mapping = aes(x = key, y = value)) +
    geom_boxplot() +
    theme_bw() + theme(legend.key = element_blank())

## Observe several realized vectors (line connects values from a single vector)
ggplot(data = subset(df1long, iter <= 5),
       mapping = aes(x = key, y = value, group = iter, color = factor(iter))) +
    geom_line() +
    geom_point() +
    theme_bw() + theme(legend.key = element_blank())

## Any two elements are inversely associated by the sum-is-one constraint
ggplot(data = df1,
       mapping = aes(x = G_A2, y = G_A4, color = G_A3)) +
    geom_point() +
    theme_bw() + theme(legend.key = element_blank())

With greater \(M\), the random probability measure \(G_{A_i}\) is usually very close to \(G_0(Ai)\), whereas with a small \(M\) there is a lot more random variability.

Definition 2: Normalized Gamma random variables

Here we define

## Define a normalized gamma vector sampler
NormalizedGammaVector <- function(n, M, G0Ai) {
    lapply(seq_len(n), function(i) {
        v <- rgamma(n = length(G0Ai), scale = 1, shape = M * G0Ai)
        v / sum(v)
    }) %>% do.call(rbind, .)
}
## Sample 1 vector
M <- 1
gamma1 <- NormalizedGammaVector(n = 1, M = M, G0Ai = G0Ai)
gamma1

##               [,1]             [,2]      [,3]       [,4]      [,5]         [,6]        [,7]         [,8]
## [1,] 3.657087e-210 0.00000003546997 0.6774649 0.01614417 0.3056665 0.0007244079 8.26635e-25 7.415763e-40

## sum to 1
sum(gamma1)

## [1] 1

## Sample many
gamma_n <- NormalizedGammaVector(n = 10^4, M = M, G0Ai = G0Ai)
## Construct df for visualization
df1 <- as.data.frame(gamma_n)
colnames(df1) <- paste0("G_A", seq_len(ncol(df1)))
df1$iter <- seq_len(nrow(df1))
df1long <- gather(data = df1, key = key, value = value, -iter)
## Marginal distribution of G_Ai
ggplot(data = df1long, mapping = aes(x = key, y = value)) +
    geom_boxplot() +
    theme_bw() + theme(legend.key = element_blank())

## Observe several realized vectors (line connects values from a single vector)
ggplot(data = subset(df1long, iter <= 5),
       mapping = aes(x = key, y = value, group = iter, color = factor(iter))) +
    geom_line() +
    geom_point() +
    theme_bw() + theme(legend.key = element_blank())

## Any two elements are inversely associated by the sum-is-one constraint
ggplot(data = df1,
       mapping = aes(x = G_A2, y = G_A4, color = G_A3)) +
    geom_point() +
    theme_bw() + theme(legend.key = element_blank())

## Try with a greater M
M <- 100
## Sample many
gamma_n <- NormalizedGammaVector(n = 10^4, M = M, G0Ai = G0Ai)
## Construct df for visualization
df1 <- as.data.frame(gamma_n)
colnames(df1) <- paste0("G_A", seq_len(ncol(df1)))
df1$iter <- seq_len(nrow(df1))
df1long <- gather(data = df1, key = key, value = value, -iter)
## Marginal distribution of G_Ai
ggplot(data = df1long, mapping = aes(x = key, y = value)) +
    geom_boxplot() +
    theme_bw() + theme(legend.key = element_blank())

## Observe several realized vectors (line connects values from a single vector)
ggplot(data = subset(df1long, iter <= 5),
       mapping = aes(x = key, y = value, group = iter, color = factor(iter))) +
    geom_line() +
    geom_point() +
    theme_bw() + theme(legend.key = element_blank())

## Any two elements are inversely associated by the sum-is-one constraint
ggplot(data = df1,
       mapping = aes(x = G_A2, y = G_A4, color = G_A3)) +
    geom_point() +
    theme_bw() + theme(legend.key = element_blank())

Top Page: http://rpubs.com/kaz_yos/
github: https://github.com/kaz-yos