require(UsingR)
require(ggplot2)
require(tidyr)
require(dplyr)
require(plotly)
require(coefplot)
require(DPpackage)

1 Chinese restaurant process

Chinese restaurant process (CRP) / Hoppe’s urn modeと呼ばれる過程は次の様な物である。
1. 壺の中に黒玉が一つ入っている。
2. 黒玉の重みを\(\alpha\), 色玉の重みを\(1\)とし、重みに比例した確率で取り出す。
3. 黒玉が出たら新しい色の玉を壺の中に入れる。色玉が出たら同じ色の色玉と共に壺の中に戻す。

CRP <- function(alpha, n){
  total_weight <- alpha
  weight_c <- c(alpha)
  n_colour <- 0
  ball_c <- c()
  n_colour_c <- c(0)
  for (i in 1:n ){
    ball <- sample(1:(n_colour + 1), size=1, prob = weight_c / total_weight)
    if (ball == 1) {
      weight_c <- c(weight_c, 1)
      total_weight <- total_weight + 1
      n_colour <- n_colour + 1
      ball_c <- c(ball_c, 1)
    } else{
      weight_c[ball] <- weight_c[ball] + 1
      total_weight <- total_weight + 1
      ball_c <- c(ball_c, ball)
    }
    n_colour_c <- c(n_colour_c, n_colour)
  }
  return(list(balltaken=ball_c, colour=n_colour_c))
} 

b10 <- CRP(alpha = 10,1000)
b2 <- CRP(alpha = 2,1000)
df <- data.frame(num_colour=c(b10$colour,b2$colour), step=rep(0:1000,2), alpha=c(rep(10,1001), rep(2,1001)))
df$alpha <- factor(df$alpha)
g <- ggplot(df, aes(x=step,y=num_colour, group=alpha, colour=alpha)) + geom_line()
ggplotly(g)
b10_2 <- CRP(alpha = 10, n = 100000)
a <- table(b10_2$balltaken)[-1]
plot(1:length(a),sort(a, decreasing = T), log="y", xlab = "ball colour", ylab = "number of balls")

個数分布は指数分布になる。一方Pitman-Yor processではべき分布になることが知られている。

2 Dirichlet process

\[P(\theta^n | \theta^1, \cdots, \theta^{n-1}) = \frac{\alpha}{\alpha + n -1}G_0 (\theta^n) + \frac{n-1}{\alpha + n -1}\left(\frac{1}{n-1}\sum_{i=1}^c n_i\delta_{\theta_i} \right)\] ここで\(n_i\)\(\theta^k~~(k=1,\cdots, n-1)\)のうち\(\theta_k\)と等しい\(\theta^k\)の個数である。この式を用いて\(\theta^1,\cdots, \theta^{1000}\)を発生させた。基底分布\(G_0(\theta)\)はガウシアンに選んだ。

ballcolour <- function(balltaken){
  n_colour <- 1
  colour <- c()
  for (i in 1:length(balltaken)){
    if (balltaken[i] == 1){
      colour[i] <- n_colour + 1
      n_colour <- n_colour + 1
    } else{
      colour[i] <- balltaken[i]
    }
  }
  return(colour - 1)
}

plot_dp <- function(alpha){
  crp <- CRP(alpha = alpha,n = 1000)
  theta_l <- c()
  for (i in 1:crp$colour[length(crp$colour)]){
    theta <- rnorm(1)
    theta_l <- c(theta_l, theta)
  }
  
  n <- as.vector( table(ballcolour(crp$balltaken)) )
  df <- data.frame(x= theta_l, count= n)
  ggplot(df) + geom_bar(aes(x=x, y=count), stat="identity")
}
plot_dp(2)

plot_dp(alpha = 10)

plot_dp(1)

3 Gibbs sampling

\[ P(x_1 | x_2) = \frac{1}{\sqrt{2}}\exp\left[ -\frac{1}{2} (x_1 - a x_2)^2 \right], \\ P(x_2 | x_1) = \frac{1}{\sqrt{2}}\exp\left[ -\frac{1}{2} (x_2 - a x_1)^2 \right] \]

N <- 300
a <- 0.6
x1 <- 5
x2 <- -5
x1_l <- c(x1)
x2_l <- c(x2)
for (i in 1:N){
  x1 <- rnorm(1,  mean=a * x2)
  x1_l <- c(x1_l, x1)
  x2_l <- c(x2_l, x2)
  x2 <- rnorm(1, mean=a * x1)
  x1_l <- c(x1_l, x1)
  x2_l <- c(x2_l, x2)
}
df <- data.frame(n =1:length(x1_l),x= x1_l, y=x2_l)
g <- ggplot(df, aes(x=x, y=y)) + geom_path() + geom_point() 
v <- eigen(matrix(c(1,-a,-a,1), 2,2))
g <- g + geom_path(data = df, aes(x=x, y=y))
ggplotly(g)

4 DPによるクラスタリング

set.seed(5)
S <- list()
mu <- list()
for (i in 1:5) {
  a <- matrix( rnorm(4,mean=0,sd=1), 2,2) 
  a <- a + t(a)
  b <- eigen(a)
  a <- t( b$vectors ) %*% diag(abs(b$values)) %*% b$vectors
  S[[i]] <- a
  a <- matrix( rnorm(2,mean=0,sd=10), 1,2) 
  mu[[i]] <- a
}
N <- 100
data <- c()
for (i in 1:5) {
  a <- mvrnorm(n = N, Sigma = S[[i]], mu = mu[[i]])
  data <- rbind(data, a)
}
plot(data)

s2 <- matrix(c(10000,0,0,1),ncol=2)
m2 <- c(180,3)
psiinv2 <- solve(matrix(c(10000,0,0,1),ncol=2))

prior <- list(a0=1,b0=1/10,nu1=4,nu2=4,s2=s2,
              m2=m2,psiinv2=psiinv2,tau1=0.01,tau2=0.01)

# Initial state
state <- NULL

# MCMC parameters

nburn <- 5000
nsave <- 10000
nskip <- 10
ndisplay <- 1000
mcmc <- list(nburn=nburn,nsave=nsave,nskip=nskip,ndisplay=ndisplay)

# Fit the model
fit1 <- DPdensity(y=data,prior=prior,mcmc=mcmc,
                  state=state,status=TRUE,na.action=na.omit)
## 
## MCMC scan 1000 of 10000 (CPU time: 26.534 s)
## MCMC scan 2000 of 10000 (CPU time: 49.163 s)
## MCMC scan 3000 of 10000 (CPU time: 74.276 s)
## MCMC scan 4000 of 10000 (CPU time: 95.578 s)
## MCMC scan 5000 of 10000 (CPU time: 116.116 s)
## MCMC scan 6000 of 10000 (CPU time: 135.841 s)
## MCMC scan 7000 of 10000 (CPU time: 162.035 s)
## MCMC scan 8000 of 10000 (CPU time: 186.418 s)
## MCMC scan 9000 of 10000 (CPU time: 206.993 s)
## MCMC scan 10000 of 10000 (CPU time: 230.047 s)
summary(fit1)
## 
## DPM of normals model for density estimation
## 
## Call:
## DPdensity.default(y = data, prior = prior, mcmc = mcmc, state = state, 
##     status = TRUE, na.action = na.omit)
## 
## Posterior Predictive Distributions (log):
##    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.  
## -10.030   -4.909   -4.234   -4.422   -3.667   -2.862  
## 
## Baseline distribution:
##                  Mean        Median      Std. Dev.   Naive Std.Error
## m1-data           1.579e+01   1.506e+01   1.250e+01   1.250e-01     
## m1-prior          2.773e+00   2.771e+00   1.014e+00   1.014e-02     
## k0                2.064e-03   1.707e-03   1.497e-03   1.497e-05     
## psi1-data         3.353e-01   3.146e-01   1.125e-01   1.125e-03     
## psi1-data-prior   3.487e-02   3.071e-02   1.010e-01   1.010e-03     
## psi1-prior        5.807e-01   5.435e-01   1.957e-01   1.957e-03     
##                  95%HPD-Low  95%HPD-Upp
## m1-data          -7.182e+00   3.877e+01
## m1-prior          7.751e-01   4.738e+00
## k0                2.384e-05   4.940e-03
## psi1-data         1.625e-01   5.593e-01
## psi1-data-prior  -1.627e-01   2.391e-01
## psi1-prior        2.753e-01   9.671e-01
## 
## Precision parameter:
##           Mean      Median    Std. Dev.  Naive Std.Error  95%HPD-Low
## ncluster  5.087200  5.000000  0.298337   0.002983         5.000000  
## alpha     0.860629  0.796109  0.407122   0.004071         0.191977  
##           95%HPD-Upp
## ncluster  6.000000  
## alpha     1.670826  
## 
## Number of Observations: 500
## Number of Variables: 2
df <- data.frame(x=data[,1], y=data[,2], pred = fit1$state$ss)
df$pred <- factor(df$pred)
ggplot(df, aes(x=x,y=y,group=pred, colour=pred)) + geom_point()

正しくクラスタリング出来ていることが分かる。