```
BST 249 Materials
set.seed(20161106)
library(tidyverse)
library(MCMCpack)
Here we aim to define a Dirichlet Process prior for a random probability measure \(G\) by various equivalent methods.
The very crude partitions are \((-\infty, -3], (-3,-2], (-2,-1], (-1,0], (0,1], (1,2], (2,3],\) and \((3,\infty)\). The initial distribution \(G_0\) is a standard normal.
## Define cutoffs
cutoffs <- c(-Inf,-3,-2,-1,0,+1,+2,+3,Inf)
## Probabilities of abovestated partitions by N(0,1)
G0Ai <- tail(pnorm(cutoffs), -1) - head(pnorm(cutoffs), -1)
G0Ai
## [1] 0.001349898 0.021400234 0.135905122 0.341344746 0.341344746 0.135905122 0.021400234 0.001349898
## Sum to 1
sum(G0Ai)
## [1] 1
## Set M
M <- 1
## Sample one Dirichlet vector (notice it sum to 1)
dir1 <- rdirichlet(n = 1, alpha = M * G0Ai)
dir1
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 2.32692e-251 0.2382863 0.002632018 0.6628468 0.09619033 7.983091e-10 0.00004454045 4.720157e-142
sum(dir1)
## [1] 1
## Sample many
dir_n <- rdirichlet(n = 10^4, alpha = M * G0Ai)
## Construct df for visualization
df1 <- as.data.frame(dir_n)
colnames(df1) <- paste0("G_A", seq_len(ncol(df1)))
df1$iter <- seq_len(nrow(df1))
df1long <- gather(data = df1, key = key, value = value, -iter)
## Marginal distribution of G_Ai
ggplot(data = df1long, mapping = aes(x = key, y = value)) +
geom_boxplot() +
theme_bw() + theme(legend.key = element_blank())
## Observe several realized vectors (line connects values from a single vector)
ggplot(data = subset(df1long, iter <= 5),
mapping = aes(x = key, y = value, group = iter, color = factor(iter))) +
geom_line() +
geom_point() +
theme_bw() + theme(legend.key = element_blank())
## Any two elements are inversely associated by the sum-is-one constraint
ggplot(data = df1,
mapping = aes(x = G_A2, y = G_A4, color = G_A3)) +
geom_point() +
theme_bw() + theme(legend.key = element_blank())
## Repeat with a greater M
M <- 100
## Sample one Dirichlet vector (notice it sum to 1)
dir1 <- rdirichlet(n = 1, alpha = M * G0Ai)
dir1
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 0.00000005005948 0.01482998 0.19269 0.3317011 0.3570582 0.0846562 0.01899657 0.00006798478
sum(dir1)
## [1] 1
## Sample many
dir_n <- rdirichlet(n = 10^4, alpha = M * G0Ai)
## Construct df for visualization
df1 <- as.data.frame(dir_n)
colnames(df1) <- paste0("G_A", seq_len(ncol(df1)))
df1$iter <- seq_len(nrow(df1))
df1long <- gather(data = df1, key = key, value = value, -iter)
## Marginal distribution of G_Ai
ggplot(data = df1long, mapping = aes(x = key, y = value)) +
geom_boxplot() +
theme_bw() + theme(legend.key = element_blank())
## Observe several realized vectors (line connects values from a single vector)
ggplot(data = subset(df1long, iter <= 5),
mapping = aes(x = key, y = value, group = iter, color = factor(iter))) +
geom_line() +
geom_point() +
theme_bw() + theme(legend.key = element_blank())
## Any two elements are inversely associated by the sum-is-one constraint
ggplot(data = df1,
mapping = aes(x = G_A2, y = G_A4, color = G_A3)) +
geom_point() +
theme_bw() + theme(legend.key = element_blank())
With greater \(M\), the random probability measure \(G_{A_i}\) is usually very close to \(G_0(Ai)\), whereas with a small \(M\) there is a lot more random variability.
Here we define
## Define a normalized gamma vector sampler
NormalizedGammaVector <- function(n, M, G0Ai) {
lapply(seq_len(n), function(i) {
v <- rgamma(n = length(G0Ai), scale = 1, shape = M * G0Ai)
v / sum(v)
}) %>% do.call(rbind, .)
}
## Sample 1 vector
M <- 1
gamma1 <- NormalizedGammaVector(n = 1, M = M, G0Ai = G0Ai)
gamma1
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 3.657087e-210 0.00000003546997 0.6774649 0.01614417 0.3056665 0.0007244079 8.26635e-25 7.415763e-40
## sum to 1
sum(gamma1)
## [1] 1
## Sample many
gamma_n <- NormalizedGammaVector(n = 10^4, M = M, G0Ai = G0Ai)
## Construct df for visualization
df1 <- as.data.frame(gamma_n)
colnames(df1) <- paste0("G_A", seq_len(ncol(df1)))
df1$iter <- seq_len(nrow(df1))
df1long <- gather(data = df1, key = key, value = value, -iter)
## Marginal distribution of G_Ai
ggplot(data = df1long, mapping = aes(x = key, y = value)) +
geom_boxplot() +
theme_bw() + theme(legend.key = element_blank())
## Observe several realized vectors (line connects values from a single vector)
ggplot(data = subset(df1long, iter <= 5),
mapping = aes(x = key, y = value, group = iter, color = factor(iter))) +
geom_line() +
geom_point() +
theme_bw() + theme(legend.key = element_blank())
## Any two elements are inversely associated by the sum-is-one constraint
ggplot(data = df1,
mapping = aes(x = G_A2, y = G_A4, color = G_A3)) +
geom_point() +
theme_bw() + theme(legend.key = element_blank())
## Try with a greater M
M <- 100
## Sample many
gamma_n <- NormalizedGammaVector(n = 10^4, M = M, G0Ai = G0Ai)
## Construct df for visualization
df1 <- as.data.frame(gamma_n)
colnames(df1) <- paste0("G_A", seq_len(ncol(df1)))
df1$iter <- seq_len(nrow(df1))
df1long <- gather(data = df1, key = key, value = value, -iter)
## Marginal distribution of G_Ai
ggplot(data = df1long, mapping = aes(x = key, y = value)) +
geom_boxplot() +
theme_bw() + theme(legend.key = element_blank())
## Observe several realized vectors (line connects values from a single vector)
ggplot(data = subset(df1long, iter <= 5),
mapping = aes(x = key, y = value, group = iter, color = factor(iter))) +
geom_line() +
geom_point() +
theme_bw() + theme(legend.key = element_blank())
## Any two elements are inversely associated by the sum-is-one constraint
ggplot(data = df1,
mapping = aes(x = G_A2, y = G_A4, color = G_A3)) +
geom_point() +
theme_bw() + theme(legend.key = element_blank())