```

References

BST 249 Materials

Load packages

set.seed(20161106)
library(tidyverse)
library(MCMCpack)

Aims

Here we aim to define a Dirichlet Process prior for a random probability measure \(G\) by various equivalent methods.

Definition 1: Dirichelet random variable

The very crude partitions are \((-\infty, -3], (-3,-2], (-2,-1], (-1,0], (0,1], (1,2], (2,3],\) and \((3,\infty)\). The initial distribution \(G_0\) is a standard normal.

## Define cutoffs
cutoffs <- c(-Inf,-3,-2,-1,0,+1,+2,+3,Inf)
## Probabilities of abovestated partitions by N(0,1)
G0Ai <- tail(pnorm(cutoffs), -1) - head(pnorm(cutoffs), -1)
G0Ai
## [1] 0.001349898 0.021400234 0.135905122 0.341344746 0.341344746 0.135905122 0.021400234 0.001349898
## Sum to 1
sum(G0Ai)
## [1] 1
## Set M
M <- 1
## Sample one Dirichlet vector (notice it sum to 1)
dir1 <- rdirichlet(n = 1, alpha = M * G0Ai)
dir1
##              [,1]      [,2]        [,3]      [,4]       [,5]         [,6]          [,7]          [,8]
## [1,] 2.32692e-251 0.2382863 0.002632018 0.6628468 0.09619033 7.983091e-10 0.00004454045 4.720157e-142
sum(dir1)
## [1] 1
## Sample many
dir_n <- rdirichlet(n = 10^4, alpha = M * G0Ai)
## Construct df for visualization
df1 <- as.data.frame(dir_n)
colnames(df1) <- paste0("G_A", seq_len(ncol(df1)))
df1$iter <- seq_len(nrow(df1))
df1long <- gather(data = df1, key = key, value = value, -iter)
## Marginal distribution of G_Ai
ggplot(data = df1long, mapping = aes(x = key, y = value)) +
    geom_boxplot() +
    theme_bw() + theme(legend.key = element_blank())

## Observe several realized vectors (line connects values from a single vector)
ggplot(data = subset(df1long, iter <= 5),
       mapping = aes(x = key, y = value, group = iter, color = factor(iter))) +
    geom_line() +
    geom_point() +
    theme_bw() + theme(legend.key = element_blank())

## Any two elements are inversely associated by the sum-is-one constraint
ggplot(data = df1,
       mapping = aes(x = G_A2, y = G_A4, color = G_A3)) +
    geom_point() +
    theme_bw() + theme(legend.key = element_blank())

## Repeat with a greater M
M <- 100
## Sample one Dirichlet vector (notice it sum to 1)
dir1 <- rdirichlet(n = 1, alpha = M * G0Ai)
dir1
##                  [,1]       [,2]    [,3]      [,4]      [,5]      [,6]       [,7]          [,8]
## [1,] 0.00000005005948 0.01482998 0.19269 0.3317011 0.3570582 0.0846562 0.01899657 0.00006798478
sum(dir1)
## [1] 1
## Sample many
dir_n <- rdirichlet(n = 10^4, alpha = M * G0Ai)
## Construct df for visualization
df1 <- as.data.frame(dir_n)
colnames(df1) <- paste0("G_A", seq_len(ncol(df1)))
df1$iter <- seq_len(nrow(df1))
df1long <- gather(data = df1, key = key, value = value, -iter)
## Marginal distribution of G_Ai
ggplot(data = df1long, mapping = aes(x = key, y = value)) +
    geom_boxplot() +
    theme_bw() + theme(legend.key = element_blank())

## Observe several realized vectors (line connects values from a single vector)
ggplot(data = subset(df1long, iter <= 5),
       mapping = aes(x = key, y = value, group = iter, color = factor(iter))) +
    geom_line() +
    geom_point() +
    theme_bw() + theme(legend.key = element_blank())

## Any two elements are inversely associated by the sum-is-one constraint
ggplot(data = df1,
       mapping = aes(x = G_A2, y = G_A4, color = G_A3)) +
    geom_point() +
    theme_bw() + theme(legend.key = element_blank())

With greater \(M\), the random probability measure \(G_{A_i}\) is usually very close to \(G_0(Ai)\), whereas with a small \(M\) there is a lot more random variability.

Definition 2: Normalized Gamma random variables

Here we define

## Define a normalized gamma vector sampler
NormalizedGammaVector <- function(n, M, G0Ai) {
    lapply(seq_len(n), function(i) {
        v <- rgamma(n = length(G0Ai), scale = 1, shape = M * G0Ai)
        v / sum(v)
    }) %>% do.call(rbind, .)
}
## Sample 1 vector
M <- 1
gamma1 <- NormalizedGammaVector(n = 1, M = M, G0Ai = G0Ai)
gamma1
##               [,1]             [,2]      [,3]       [,4]      [,5]         [,6]        [,7]         [,8]
## [1,] 3.657087e-210 0.00000003546997 0.6774649 0.01614417 0.3056665 0.0007244079 8.26635e-25 7.415763e-40
## sum to 1
sum(gamma1)
## [1] 1
## Sample many
gamma_n <- NormalizedGammaVector(n = 10^4, M = M, G0Ai = G0Ai)
## Construct df for visualization
df2 <- as.data.frame(gamma_n)
colnames(df2) <- paste0("G_A", seq_len(ncol(df2)))
df2$iter <- seq_len(nrow(df2))
df2long <- gather(data = df2, key = key, value = value, -iter)
## Marginal distribution of G_Ai
ggplot(data = df2long, mapping = aes(x = key, y = value)) +
    geom_boxplot() +
    theme_bw() + theme(legend.key = element_blank())

## Observe several realized vectors (line connects values from a single vector)
ggplot(data = subset(df2long, iter <= 5),
       mapping = aes(x = key, y = value, group = iter, color = factor(iter))) +
    geom_line() +
    geom_point() +
    theme_bw() + theme(legend.key = element_blank())

## Any two elements are inversely associated by the sum-is-one constraint
ggplot(data = df2,
       mapping = aes(x = G_A2, y = G_A4, color = G_A3)) +
    geom_point() +
    theme_bw() + theme(legend.key = element_blank())

## Try with a greater M
M <- 100
## Sample many
gamma_n <- NormalizedGammaVector(n = 10^4, M = M, G0Ai = G0Ai)
## Construct df for visualization
df2 <- as.data.frame(gamma_n)
colnames(df2) <- paste0("G_A", seq_len(ncol(df2)))
df2$iter <- seq_len(nrow(df2))
df2long <- gather(data = df2, key = key, value = value, -iter)
## Marginal distribution of G_Ai
ggplot(data = df2long, mapping = aes(x = key, y = value)) +
    geom_boxplot() +
    theme_bw() + theme(legend.key = element_blank())

## Observe several realized vectors (line connects values from a single vector)
ggplot(data = subset(df2long, iter <= 5),
       mapping = aes(x = key, y = value, group = iter, color = factor(iter))) +
    geom_line() +
    geom_point() +
    theme_bw() + theme(legend.key = element_blank())

## Any two elements are inversely associated by the sum-is-one constraint
ggplot(data = df2,
       mapping = aes(x = G_A2, y = G_A4, color = G_A3)) +
    geom_point() +
    theme_bw() + theme(legend.key = element_blank())

Definition 3 (Stick-breaking)

Advantage: we can calculate the realization G(A) for any \(A \in \mathcal{B}\), and it is faster than Definition 5 (Urn scheme).

## Define a one-time stick-breaking generator
StickBreakingGenerator <- function(num, M, G0.generator) {
  theta.vector <- G0.generator(n = num)
  v.vector <- rbeta(num, 1, M)
  w.vector <- rep(NA,num)
  remaining <- 1
  for (i in 1:num){
    w.vector[i] <- remaining * v.vector[i]
    remaining <- remaining-w.vector[i]
  }
  return(list(theta.vector, w.vector))
}

## Define a stick-breaking sampler
StickBreakingSampler <- function(n, M, cutoff, num=1000, G0.generator=rnorm){
  output <- matrix(NA, nrow = n, ncol = length(cutoff) + 1)
  cutoff <- c(-Inf, cutoff, Inf)
  for(i in 1:n){
    SB.sample <- StickBreakingGenerator(num=num, M=M, G0.generator=G0.generator)
    for(j in 1:ncol(output)){
      output[i,j] <- sum(SB.sample[[2]][intersect(which(SB.sample[[1]] > cutoff[j]), which(SB.sample[[1]] <= cutoff[j+1]))])
    }
  }
  return(output)
}

## Sample 1 vector
M <- 1
StickBreaking1 <- StickBreakingSampler(n = 1, M = M, cutoff = c(-3,-2,-1,0,1,2,3))
StickBreaking1
##               [,1]         [,2]      [,3]      [,4]       [,5]         [,6]         [,7]          [,8]
## [1,] 1.037119e-163 1.966526e-12 0.6358957 0.3205887 0.04348805 0.0000275536 2.988335e-18 9.529396e-262
## sum to 1
sum(StickBreaking1)
## [1] 1
## Sample many
StickBreaking_n <- StickBreakingSampler(n = 10^4, M = M, cutoff = c(-3,-2,-1,0,1,2,3))
## Construct df for visualization
df3 <- as.data.frame(StickBreaking_n)
colnames(df3) <- paste0("G_A", seq_len(ncol(df3)))
df3$iter <- seq_len(nrow(df3))
df3long <- gather(data = df3, key = key, value = value, -iter)
## Marginal distribution of G_Ai
ggplot(data = df3long, mapping = aes(x = key, y = value)) +
  geom_boxplot() +
  theme_bw() + theme(legend.key = element_blank())

## Observe several realized vectors (line connects values from a single vector)
ggplot(data = subset(df3long, iter <= 5),
       mapping = aes(x = key, y = value, group = iter, color = factor(iter))) +
  geom_line() +
  geom_point() +
  theme_bw() + theme(legend.key = element_blank())

## Any two elements are inversely associated by the sum-is-one constraint
ggplot(data = df3,
       mapping = aes(x = G_A2, y = G_A4, color = G_A3)) +
  geom_point() +
  theme_bw() + theme(legend.key = element_blank())

## Try with a greater M
M <- 100
## Sample many
StickBreaking_n <- StickBreakingSampler(n = 10^4, M = M, cutoff = c(-3,-2,-1,0,1,2,3))
## Construct df for visualization
df3 <- as.data.frame(StickBreaking_n)
colnames(df3) <- paste0("G_A", seq_len(ncol(df3)))
df3$iter <- seq_len(nrow(df3))
df3long <- gather(data = df3, key = key, value = value, -iter)
## Marginal distribution of G_Ai
ggplot(data = df3long, mapping = aes(x = key, y = value)) +
  geom_boxplot() +
  theme_bw() + theme(legend.key = element_blank())

## Observe several realized vectors (line connects values from a single vector)
ggplot(data = subset(df3long, iter <= 5),
       mapping = aes(x = key, y = value, group = iter, color = factor(iter))) +
  geom_line() +
  geom_point() +
  theme_bw() + theme(legend.key = element_blank())

## Any two elements are inversely associated by the sum-is-one constraint
ggplot(data = df3,
       mapping = aes(x = G_A2, y = G_A4, color = G_A3)) +
  geom_point() +
  theme_bw() + theme(legend.key = element_blank())

Definition 4 (Non-homogeneous Poisson-Process)

The Non-homogeneous Poisson-Process (NHPP) representation of Dirichlet Process is directly related to the Gamma Process (i.e. Definition 2). Note that it is not easy to generate the interarrival times for this NHPP with intensity \[\lambda(t) = M\exp(-t)/t\]

Definition 5 (Blackwell-MacQueen Urn Scheme)

Here we can also calculate the realization G(A) for any \(A \in \mathcal{B}\), but it takes longer to get the same accuracy as Stick-Breaking approach (Definition 3).

## Define a one-time Polya Urn generator
PolyaUrnGenerator <- function(num, M, G0.generator) {
  X.vector <- G0.generator(n = 1)
  count.vector <- 1
  for (i in 2:num){
    prob <- c(M/(length(count.vector)+M), count.vector/(length(count.vector)+M))
    category <- which(rmultinom(n=1, size=1,prob=prob)==1) - 1
    if (category == 0){
      X.vector <- c(X.vector, G0.generator(n = 1))
      count.vector <- c(count.vector, 1)
    }else{
      count.vector[category] = count.vector[category] + 1
    }
  }
  return(list(X.vector, count.vector))
}

## Define a Polya Urn sampler
PolyaUrnSampler <- function(n, M, cutoff, num = 10000, G0.generator=rnorm){
  output <- matrix(NA, nrow = n, ncol = length(cutoff) + 1)
  cutoff <- c(-Inf, cutoff, Inf)
  for(i in 1:n){
    PU.sample <- PolyaUrnGenerator(num=num, M=M, G0.generator=G0.generator)
    for(j in 1:ncol(output)){
      output[i,j] <- sum(PU.sample[[2]][intersect(which(PU.sample[[1]] > cutoff[j]), which(PU.sample[[1]] <= cutoff[j+1]))])/num
    }
  }
  return(output)
}

## Sample 1 vector
M <- 1
PolyaUrn1 <- PolyaUrnSampler(n = 1, M = M, cutoff = c(-3,-2,-1,0,1,2,3))
PolyaUrn1
##      [,1] [,2]   [,3]   [,4]  [,5]   [,6] [,7] [,8]
## [1,]    0    0 0.0031 0.7714 0.168 0.0575    0    0
## sum to 1
sum(PolyaUrn1)
## [1] 1
## Sample many
PolyaUrn_n <- PolyaUrnSampler(n = 10^4, M = M, cutoff = c(-3,-2,-1,0,1,2,3))
## Construct df for visualization
df5 <- as.data.frame(PolyaUrn_n)
colnames(df5) <- paste0("G_A", seq_len(ncol(df5)))
df5$iter <- seq_len(nrow(df5))
df5long <- gather(data = df5, key = key, value = value, -iter)
## Marginal distribution of G_Ai
ggplot(data = df5long, mapping = aes(x = key, y = value)) +
  geom_boxplot() +
  theme_bw() + theme(legend.key = element_blank())

## Observe several realized vectors (line connects values from a single vector)
ggplot(data = subset(df5long, iter <= 5),
       mapping = aes(x = key, y = value, group = iter, color = factor(iter))) +
  geom_line() +
  geom_point() +
  theme_bw() + theme(legend.key = element_blank())

## Any two elements are inversely associated by the sum-is-one constraint
ggplot(data = df5,
       mapping = aes(x = G_A2, y = G_A4, color = G_A3)) +
  geom_point() +
  theme_bw() + theme(legend.key = element_blank())

## Try with a greater M
M <- 100
## Sample many
PolyaUrn_n <- PolyaUrnSampler(n = 10^4, M = M, cutoff = c(-3,-2,-1,0,1,2,3))
## Construct df for visualization
df5 <- as.data.frame(PolyaUrn_n)
colnames(df5) <- paste0("G_A", seq_len(ncol(df5)))
df5$iter <- seq_len(nrow(df5))
df5long <- gather(data = df5, key = key, value = value, -iter)
## Marginal distribution of G_Ai
ggplot(data = df5long, mapping = aes(x = key, y = value)) +
  geom_boxplot() +
  theme_bw() + theme(legend.key = element_blank())

## Observe several realized vectors (line connects values from a single vector)
ggplot(data = subset(df5long, iter <= 5),
       mapping = aes(x = key, y = value, group = iter, color = factor(iter))) +
  geom_line() +
  geom_point() +
  theme_bw() + theme(legend.key = element_blank())

## Any two elements are inversely associated by the sum-is-one constraint
ggplot(data = df5,
       mapping = aes(x = G_A2, y = G_A4, color = G_A3)) +
  geom_point() +
  theme_bw() + theme(legend.key = element_blank())

Conclusion

From the results above, we can see that all the definitions turn out to have the same pattern of realizations of \(DP(M=1, G_0 = N(0,1))\). Thus, these are equivalent definitions of Dirichlet Process.