Conjugate Distributions 2. Categolical and Dirichlet

Categolical distribution

The categorical distribution is a discrete distribution that determines the probability of observing one of K possible outcomes. If K = 2, it may equal to Berunoulli distribution.

We will concern about the world having four possible parameters (K = 4), where 0 =< λ k <= 1 ,sum(λ k) = 1 (k = 1, 2, …, K).

set.seed(126)
n<-1000
lambda<-c(0.1,0.2,0.4,0.3)
k<-length(lambda)
catk<-rmultinom(n,1 ,prob=lambda)
row.names(catk)<-as.character(1:k)
barplot(apply(catk,1,sum)/n,col=3,main="Categolical distribution n=1000")

#
cat30<-rmultinom(30,1,prob=lambda)
row.names(cat30)<-as.character(1:k)
barplot(apply(cat30,1,sum)/30,col=4,main="Categolical distribution n=30")

Its probability for data xki = 1 in whole x is λ k.
We can make function named as PrCat(x, lambda), where x is K sets of vectors x = (x1,x2,…,xK) having series of 1 or 0. xk has mk numbers of 1 (xk = c(1,0,1,0,…,0),sum(xki) = mk).
This function will back the value of λ k^sum(x[k,])/nK (k = 1 to K, nK = sum(x[1:K,])).
PrCat has not constant value;
gamma(n)/(gamma(n1)…gamma(nk))*Π λ k^nK.

PrCat<-function(x,lambda){
  k<-nrow(x)
  n<-ncol(x)
  p<-1
    for(ki in 1:k){ p<- p*(lambda[ki])^(sum(x[ki,])/n)}
  p}
#
PrCat(x=matrix(c(1,0,0,0,1,0,0,0,1),nrow=3),lambda=c(.333,.333,.333))

## [1] 0.333

PrCat(x=catk,lambda=lambda)

## [1] 0.278675

PrCat(x=cat30,lambda=lambda)

## [1] 0.2665648

Likelifood distribution & Maximum likelifood (ML)

Distribution of λin the world may be estimated and visualized as a graph from the data x. Maximum likelihood of world λ may be gained also form this distribution.
Four categolical parameters as the axies of the world are generated by the function seek4lambda.
Where 0 =< λ < =1 ,sum(λ )=1.

seek4lambda<-function(bigin=c(0,0,0,0),end=c(1,1,1,1),step=c(0.1,0.1,0.1,0.1)){
  la<-NULL
  for(l4 in seq(bigin[4],end[4], step[4])){
    for(l3 in seq(bigin[3] ,min(1-l4,end[3]),step[3])){
      for(l2 in seq(bigin[2], min(1-l4-l3,end[2]),step[2])){
        l1=1-l4-l3-l2
        if (l1>=bigin[1] && l1<=end[1]) la<-rbind(la,c(l1,l2,l3,l4))
  }}}
  la}
#####
la<-seek4lambda(step=c(0.025,0.025,0.025,0.025))
head(la)

##       [,1]  [,2] [,3] [,4]
## [1,] 1.000 0.000    0    0
## [2,] 0.975 0.025    0    0
## [3,] 0.950 0.050    0    0
## [4,] 0.925 0.075    0    0
## [5,] 0.900 0.100    0    0
## [6,] 0.875 0.125    0    0

Along this axies probabilities of the likelifood will be estimated.

Likelifood.Cat<-function(x,lam){ 
  Lk<-NULL
  for (i in 1:nrow(lam)){
    p<-PrCat(x=x,lambda=lam[i,])
    if( p==Inf) p<-0  # 
    Lk<-rbind(Lk,c(lam[i,],p))
  }
  Lk<-data.frame(Lk)
  sp<-sum(Lk[,5]) # 1/sp = CONSTANT
  Lk[,5]<-Lk[,5]/sp
  names(Lk)<-c("L1","L2","L3","L4","Likelihood")
  Lk
}
#####
library(scatterplot3d)
col.n<-100
ch.col<-topo.colors(col.n+1,alpha=0.4)
#####
#n=1000
LkCat<-Likelifood.Cat(x=catk,lam=la)
ch.max<-max(LkCat$Likelihood)
col<-ch.col[(1-LkCat$Likelihood/ch.max)*col.n+1]
scatterplot3d(LkCat[,1:3],color=col,angle=140, pch = 16,main="Likelihood distribution of &lambda;  n=1000")

#ML
LkCat[LkCat$Likelihood==max(LkCat$Likelihood),]

##       L1  L2  L3  L4   Likelihood
## 8199 0.1 0.2 0.4 0.3 0.0001633475

#n=30
LkCat<-Likelifood.Cat(x=cat30,lam=la)
ch.max<-max(LkCat$Likelihood)
col<-ch.col[(1-LkCat$Likelihood/ch.max)*col.n+1]
scatterplot3d(LkCat[,1:3],color=col,angle=140,pch=16,main="Likelihood distribution of &lambda;  n=30")

#ML
LkCat[LkCat$Likelihood==max(LkCat$Likelihood),]

##       L1    L2  L3    L4   Likelihood
## 9321 0.1 0.225 0.3 0.375 0.0001608031

Dirichlet Distribution

Dirichlet function could represent distribution of the value lambda.

Drch<-function(a,lambda){
  PrD<-NULL
  for(n in 1:nrow(lambda)){
    lam<-lambda[n,]
    p<-1
    for(ki in 1:length(a)){
        p<-p*lam[ki]^(a[ki]-1)}
    PrD<-rbind(PrD,c(lam,p))}
  sp<-sum(PrD[,5])
  PrD[,5]<-PrD[,5]/sp
  PrD<-data.frame(PrD)
  names(PrD)<-c("L1","L2","L3","L4","prior")
  PrD
}   
#
a<-c(1,2,4,3)*2
prior<-Drch(a=a,lambda=la)
ch.max<-max(prior$prior)
col<-ch.col[(1-prior$prior/ch.max)*col.n+1]
scatterplot3d(prior[,1:3],color=col,angle=140,pch=16,main="Dirichlet distribution of &lambda; ")

Conjugate Distributions

As you see Categolical is conjugate to Dirichlet.
It means both have the same form
λ i^xi
So
PrCat(x,lambda)*Drch(a,lambda)=k(x,a)Drch((x/n+a),lambda).
where k(x,a) is constant.
Previously we called PrCat(x,λ) as likelihood. This time we may call Drch(x,a,lambda) as prior distribution of λ.

Bayes’ rule

According to Bayes’ rule
Posterior = Liklihood * Prior / Evidence.
In upper case
Liklihood = PrCat(x,λ); Pr(x|λ)
Prior = Drch(a,λ); Pr(λ)
Evidence = ∫ PrCat(x,λ)dλ

Posterior distribution & Maximum a posteriori estimation (MAP)

Posterior distribution of λin the world may be estimated and visualized as a graph from the data x.

Posterior.Cat<-function(x,a,lam){
  k<-length(a)
  n<-ncol(x)
  for(ki in 1:k){ a[ki]<- a[ki]+sum(x[ki,])/n-1}
  p<-Drch(a=a,lambda=lam)
  names(p)[5]<- "posterior"
  p}
#n=1000
Post<-Posterior.Cat(x=catk,a=a,lam=la)
ch.max<-max(Post$posterior)
col<-ch.col[(1-Post$posterior/ch.max)*col.n+1]
scatterplot3d(Post[,1:3],color=col,angle=140,pch=16,main="Posterior distribution of &lambda;  n=1000")

#ML
Post[Post$posterior==max(Post$posterior),]

##         L1    L2    L3    L4   posterior
## 8650 0.025 0.175 0.475 0.325 0.002874661

#
Post<-Posterior.Cat(x=cat30,a=a,lam=la)
ch.max<-max(Post$posterior)
col<-ch.col[(1-Post$posterior/ch.max)*col.n+1]
scatterplot3d(Post[,1:3],color=col,angle=140,pch=16,main="Posterior distribution of &lambda;  n=30")

#ML
Post[Post$posterior==max(Post$posterior),]

##         L1    L2    L3    L4   posterior
## 8650 0.025 0.175 0.475 0.325 0.002867657

If your data is not large, then the shape of the distribution will be dull also in the case of posterior estimate as in the case of the likelifood.

Monte Carlo Simulation

You may use MCmultinomdirichlet {MCMCpack}.

library(MCMCpack)

## Loading required package: coda

## Loading required package: MASS

## ##
## ## Markov Chain Monte Carlo Package (MCMCpack)

## ## Copyright (C) 2003-2016 Andrew D. Martin, Kevin M. Quinn, and Jong Hee Park

## ##
## ## Support provided by the U.S. National Science Foundation

## ## (Grants SES-0350646 and SES-0350613)
## ##

posterior <- MCmultinomdirichlet(apply(catk,1,sum), a, mc=10000)
summary(posterior)

## 
## Iterations = 1:10000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##        Mean       SD  Naive SE Time-series SE
## pi.1 0.1030 0.009591 9.591e-05      9.591e-05
## pi.2 0.1885 0.012310 1.231e-04      1.210e-04
## pi.3 0.4016 0.015352 1.535e-04      1.574e-04
## pi.4 0.3068 0.014458 1.446e-04      1.414e-04
## 
## 2. Quantiles for each variable:
## 
##         2.5%     25%    50%    75%  97.5%
## pi.1 0.08487 0.09649 0.1027 0.1092 0.1230
## pi.2 0.16477 0.18006 0.1883 0.1967 0.2134
## pi.3 0.37192 0.39135 0.4014 0.4120 0.4317
## pi.4 0.27901 0.29704 0.3067 0.3165 0.3357

plot(posterior)

Summary of functions

#Estimate categolical probability at the given lambda
PrCat<-function(x,lambda){
  k<-nrow(x)
  n<-ncol(x)
  p<-1
    for(ki in 1:k){ p<- p*(lambda[ki])^(sum(x[ki,])/n)}
  p
}
#Categolical Likelifood along lambda sequence
Likelifood.Cat<-function(x,lam){ 
  Lk<-NULL
  for (i in 1:nrow(lam)){
    p<-PrCat(x=x,lambda=lam[i,])
    if( p==Inf) p<-0  # 
    Lk<-rbind(Lk,c(lam[i,],p))
  }
  Lk<-data.frame(Lk)
  sp<-sum(Lk[,5]) # 1/sp = CONSTANT
  Lk[,5]<-Lk[,5]/sp
  names(Lk)<-c("L1","L2","L3","L4","Likelihood")
  Lk
}   
#Dirichlet distribution as prior
Drch<-function(a,lambda){
  PrD<-NULL
  for(n in 1:nrow(lambda)){
    lam<-lambda[n,]
    p<-1
    for(ki in 1:length(a)){
        p<-p*lam[ki]^(a[ki]-1)}
    PrD<-rbind(PrD,c(lam,p))}
  sp<-sum(PrD[,5])
  PrD[,5]<-PrD[,5]/sp
  PrD<-data.frame(PrD)
  names(PrD)<-c("L1","L2","L3","L4","prior")
  PrD
}

#Distribution of the posterior probability along lambda   
Posterior.Cat<-function(x,a,lam){
  k<-length(a)
  n<-ncol(x)
  for(ki in 1:k){ a[ki]<- a[ki]+sum(x[ki,])/n-1}
  p<-Drch(a=a,lambda=lam)
  names(p)[5]<- "posterior"
  p
}