Natural (non-) informative priors for the skew-normal distribution

The probability density function of the skew-normal distribution is given by [1]:

\[s(x \,\vert \, \mu,\sigma,\lambda) = \dfrac{2}{\sigma}\phi\left(\dfrac{x-\mu}{\sigma}\right)\Phi\left(\lambda\dfrac{x-\mu}{\sigma}\right),\] where \(\phi\) and \(\Phi\) are the standard normal density and distribution functions, respectively, \(\lambda \in {\mathbb R}\) is a shape parameter that induces skewness, \(\mu\in {\mathbb R}\) is a location parameter, and \(\sigma>0\) is a scale parameter. Priors for the shape (skewness) parameter \(\lambda\) in the family of skew-symmetric models [3], which contains the skew-normal distribution, have been studied in [1,2]. In particular, [2,4] studied the Jeffreys (reference) prior of \(\lambda\) and found that it can be approximated by a Student-t distribution with \(1/2\) degrees of freedom. [1] proposed a novel approach to constructing informative and non-informative priors for \(\lambda\) based on the Total Variation distance. This strategy leads to closed-form, interpretable priors with tails similar to those of the Jeffreys prior. As a joint prior for the parameters \((\mu,\sigma,\lambda)\), [1,2] consider the structure \[\pi(\mu,\sigma,\lambda) \propto \dfrac{1}{\sigma}\pi(\lambda),\] for different choices of \(\pi(\lambda)\).
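
As a quick check of the formula above, the density can be coded directly from its definition using base R and compared against dsn from the sn package (loaded later in the script). This is only a minimal sketch; the function name dskewnormal is illustrative.

# Skew-normal density written directly from the formula above
dskewnormal <- function(x, mu = 0, sigma = 1, lambda = 0) {
  z <- (x - mu) / sigma
  (2 / sigma) * dnorm(z) * pnorm(lambda * z)
}

# Positive lambda skews to the right, negative lambda to the left,
# and lambda = 0 recovers the normal density
curve(dskewnormal(x, 0, 1, 5), -3, 3, lwd = 2, ylab = "density")
curve(dskewnormal(x, 0, 1, 0), -3, 3, add = TRUE, lty = 2)
curve(dskewnormal(x, 0, 1, -5), -3, 3, add = TRUE, lty = 3)

# Check against sn::dsn, e.g.
# all.equal(dskewnormal(0:3, 1, 2, 3), sn::dsn(0:3, 1, 2, 3))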

The following R code presents an example with real data that illustrates the use of the Jeffreys prior [2] and the Total Variation priors [1]. It implements the posterior distribution under (i) the BTV(1,1) prior, (ii) the BTV(0.5,0.5) prior, (iii) the BTV(3,0.5) prior, and (iv) the Jeffreys prior; a short sketch comparing these prior densities on \(\lambda\) is included after the reference list below. The fit is illustrated by comparing the histogram of the data with the posterior predictive density function associated with each prior.

  1. Natural (non-) informative priors for skew-symmetric distributions

  2. On the independence Jeffreys prior for skew-symmetric models

  3. A class of distributions which includes the normal ones

  4. A note on reference priors for the scalar skew-normal distribution
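
Before fitting the model, it may help to visualise these priors on \(\lambda\). The short, self-contained sketch below reconstructs them from the expressions used further down in the code: the BTV(a,b) prior is a Beta(a,b) density on \(\arctan(\lambda)/\pi + 1/2\) multiplied by the Cauchy Jacobian of that transformation (so BTV(1,1) reduces to the standard Cauchy), and the Jeffreys prior is approximated by a Student-t distribution with \(1/2\) degrees of freedom and scale \(\pi/2\). The function names dbtv and djeff are only illustrative.

# BTV(a, b) prior density for lambda: a Beta(a, b) density on the arctan scale
# times the Cauchy Jacobian (BTV(1, 1) is the standard Cauchy)
dbtv <- function(lambda, a, b) {
  dbeta(atan(lambda)/pi + 0.5, a, b) * dcauchy(lambda)
}

# Student-t (1/2 degrees of freedom, scale pi/2) approximation of the Jeffreys prior
djeff <- function(lambda) dt(lambda/(pi/2), df = 1/2)/(pi/2)

curve(dbtv(x, 1, 1), -10, 10, lwd = 2, lty = 1, ylab = "prior density", xlab = "lambda")
curve(dbtv(x, 0.5, 0.5), -10, 10, add = TRUE, lwd = 2, lty = 2)
curve(dbtv(x, 3, 0.5), -10, 10, add = TRUE, lwd = 2, lty = 3)
curve(djeff(x), -10, 10, add = TRUE, lwd = 2, lty = 4)
legend("topright", c("BTV(1,1)", "BTV(0.5,0.5)", "BTV(3,0.5)", "Jeffreys"),
       lty = 1:4, lwd = 2, bg = "gray90")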

rm(list=ls())

# Required packages
library(sn)
## Loading required package: stats4
## 
## Attaching package: 'sn'
## The following object is masked from 'package:stats':
## 
##     sd
library(Rtwalk)
## Package: Rtwalk, Version: 1.8.0
## The R Implementation of 't-walk' MCMC Sampler
## http://www.cimat.mx/~jac/twalk/
## For citations, please use:
## Christen JA and Fox C (2010). A general purpose sampling algorithm for
## continuous distributions (the t-walk). Bayesian Analysis, 5(2),
## pp. 263-282. <URL:
## http://ba.stat.cmu.edu/journal/2010/vol05/issue02/christen.pdf>.
library(TeachingDemos)

# Data
data(ais)

bmi = ais$BMI[1:100]

hist(bmi)

# Negative log-likelihood (minimised by optim to obtain the MLE; note that the
# MLE of the skewness parameter can be infinite for the skew-normal model)
ll <- function(par){
  if(par[2] > 0) return(-sum(dsn(bmi, par[1], par[2], par[3], log = TRUE)))
  else return(Inf)
}

optim(c(20,1,2),ll)
## $par
## [1] 19.229308  3.810285  2.233281
## 
## $value
## [1] 235.504
## 
## $counts
## function gradient 
##       76       NA 
## 
## $convergence
## [1] 0
## 
## $message
## NULL
##########################################################################################################################
# Log-posterior with BTV(1,1) prior
##########################################################################################################################
# Negative log-posterior (under the BTV(1,1) prior, the prior on lambda reduces
# to the standard Cauchy, i.e. a Student-t with 1 degree of freedom)
lptv <- function(par){
  return(-sum(dsn(bmi, par[1], par[2], par[3], log = TRUE)) + log(par[2]) - dt(par[3], df = 1, log = TRUE))
}

# Number of iterations 
NMH = 110000

# Support function: the only constraint is a positive scale parameter, sigma > 0
Support <- function(x) { (x[2] > 0) }

# Function to generate the initial points in the sampler
X0 <- function(x) { c( runif(1, min=-0.1, max=0.1), 1+runif(1,min=-0.25,max=0.25),7+runif(1, min=-0.5, max=0.5)) }

# Posterior samples
set.seed(1234)
outtv <- Runtwalk( dim=3,  Tr=NMH,  Obj=lptv, Supp=Support, x0=X0(), xp0=X0(),PlotLogPost = FALSE) 
## This is the twalk for R.
## Evaluating the objective density at initial values.
## Opening 8 X 110001 matrix to save output.  Sampling (no graphics mode).
# burn-in and thinning
burn = 10000
thin = 100
ind = seq(burn,NMH,thin)

muptv = outtv$output[ , 1][ind]
sigmaptv = outtv$output[ , 2][ind]
lambdaptv = outtv$output[ , 3][ind]
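
# Optional (not part of the original analysis): a quick look at the mixing of
# the thinned chains using base-R trace and autocorrelation plots
oldpar <- par(mfrow = c(3, 2))
plot(muptv, type = "l", main = "Trace: mu")
acf(muptv, main = "ACF: mu")
plot(sigmaptv, type = "l", main = "Trace: sigma")
acf(sigmaptv, main = "ACF: sigma")
plot(lambdaptv, type = "l", main = "Trace: lambda")
acf(lambdaptv, main = "ACF: lambda")
par(oldpar)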

# Some histograms and summaries
hist(muptv)

hist(sigmaptv)

hist(lambdaptv)

median(muptv)
## [1] 19.56176
median(sigmaptv)
## [1] 3.604874
median(lambdaptv)
## [1] 1.728608
emp.hpd(muptv)
## [1] 18.17581 23.07018
emp.hpd(sigmaptv)
## [1] 2.421726 4.515195
emp.hpd(lambdaptv)
## [1] -0.7979746  3.8141542
##########################################################################################################################
# Log-posterior with BTV(0.5,0.5) prior
##########################################################################################################################

# Log prior: BTV(0.5,0.5), i.e. a Beta(1/2,1/2) density on atan(lambda)/pi + 1/2
# plus the log Cauchy Jacobian of the arctan transformation
lpriorbtv = Vectorize(function(lambda){
  val = dbeta(atan(lambda)/pi + 0.5, 1/2, 1/2, log = TRUE) + dt(lambda, df = 1, log = TRUE)
  return(val)
})

# Negative log-posterior
lpbtv <- function(par){
  if(par[2] > 0) return(-sum(dsn(bmi, par[1], par[2], par[3], log = TRUE)) + log(par[2]) - lpriorbtv(par[3]))
  else return(Inf)
}

# Posterior samples
set.seed(1234)
outbtv <- Runtwalk( dim=3,  Tr=NMH,  Obj=lpbtv, Supp=Support, x0=X0(), xp0=X0(),PlotLogPost = FALSE) 
## This is the twalk for R.
## Evaluating the objective density at initial values.
## Opening 8 X 110001 matrix to save output.  Sampling (no graphics mode).
ind = seq(burn,NMH,thin)

mupbtv = outbtv$output[ , 1][ind]
sigmapbtv = outbtv$output[ , 2][ind]
lambdapbtv = outbtv$output[ , 3][ind]

# Some histograms and summaries
hist(mupbtv)

hist(sigmapbtv)

hist(lambdapbtv)

median(mupbtv)
## [1] 19.51081
median(sigmapbtv)
## [1] 3.629451
median(lambdapbtv)
## [1] 1.777081
emp.hpd(mupbtv)
## [1] 18.20072 23.12962
emp.hpd(sigmapbtv)
## [1] 2.646438 4.603110
emp.hpd(lambdapbtv)
## [1] -0.652033  4.321629
##########################################################################################################################
# Log-posterior with BTV(3,0.5) informative prior
##########################################################################################################################

# Log prior: BTV(3,0.5), i.e. a Beta(3,1/2) density on atan(lambda)/pi + 1/2
# plus the log Cauchy Jacobian of the arctan transformation
lpriorbtvi = Vectorize(function(lambda){
  val = dbeta(atan(lambda)/pi + 0.5, 3, 1/2, log = TRUE) + dt(lambda, df = 1, log = TRUE)
  return(val)
})

# Negative log-posterior
lpbtvi <- function(par){
  if(par[2] > 0) return(-sum(dsn(bmi, par[1], par[2], par[3], log = TRUE)) + log(par[2]) - lpriorbtvi(par[3]))
  else return(Inf)
}

# Posterior samples
set.seed(1234)
outbtvi <- Runtwalk( dim=3,  Tr=NMH,  Obj=lpbtvi, Supp=Support, x0=X0(), xp0=X0(),PlotLogPost = FALSE) 
## This is the twalk for R.
## Evaluating the objective density at initial values.
## Opening 8 X 110001 matrix to save output.  Sampling (no graphics mode).
ind = seq(burn,NMH,thin)

mupbtvi = outbtvi$output[ , 1][ind]
sigmapbtvi = outbtvi$output[ , 2][ind]
lambdapbtvi = outbtvi$output[ , 3][ind]

# Some histograms and summaries
hist(mupbtvi)

hist(sigmapbtvi)

hist(lambdapbtvi)

median(mupbtvi)
## [1] 19.26803
median(sigmapbtvi)
## [1] 3.796684
median(lambdapbtvi)
## [1] 2.143555
emp.hpd(mupbtvi)
## [1] 18.26624 20.91671
emp.hpd(sigmapbtvi)
## [1] 2.669896 4.676249
emp.hpd(lambdapbtvi)
## [1] 0.3194122 4.0209780
##########################################################################################################################
# Log-posterior with the Jeffreys (reference) prior
##########################################################################################################################

# Negative log-posterior (the Jeffreys prior on lambda is approximated by a
# Student-t with 1/2 degrees of freedom and scale pi/2)
c = pi/2
lpr <- function(par){
  return(-sum(dsn(bmi, par[1], par[2], par[3], log = TRUE)) + log(par[2]) - dt(par[3]/c, df = 1/2, log = TRUE))
}

# Posterior samples
set.seed(1234)
outr <- Runtwalk( dim=3,  Tr=NMH,  Obj=lpr, Supp=Support, x0=X0(), xp0=X0(),PlotLogPost = FALSE) 
## This is the twalk for R.
## Evaluating the objective density at initial values.
## Opening 8 X 110001 matrix to save output.  Sampling (no graphics mode).
mupr = outr$output[ , 1][ind]
sigmapr = outr$output[ , 2][ind]
lambdapr = outr$output[ , 3][ind]

# Some histograms and summaries
hist(mupr)

hist(sigmapr)

hist(lambdapr)

median(mupr)
## [1] 19.41992
median(sigmapr)
## [1] 3.731819
median(lambdapr)
## [1] 1.912515
emp.hpd(mupr)
## [1] 18.24755 23.63246
emp.hpd(sigmapr)
## [1] 2.735461 4.767317
emp.hpd(lambdapr)
## [1] -1.223716  4.161240
##########################################################################################################################
# Predictive densities
##########################################################################################################################

# Predictive densities for the different priors
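# Each predictive density below is a Monte Carlo approximation of the posterior
# predictive: the sampling density dsn(x | mu, sigma, lambda) averaged over the
# retained (thinned) posterior draws of (mu, sigma, lambda).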
predtv<- Vectorize(function(x) {
  tempf <- function(par) dsn(x,par[1],par[2],par[3])
  var <- mean(apply(cbind(muptv,sigmaptv,lambdaptv),1,tempf))
  return(var)
})

predbtv<- Vectorize(function(x) {
  tempf <- function(par) dsn(x,par[1],par[2],par[3])
  var <- mean(apply(cbind(mupbtv,sigmapbtv,lambdapbtv),1,tempf))
  return(var)
})

predbtvi<- Vectorize(function(x) {
  tempf <- function(par) dsn(x,par[1],par[2],par[3])
  var <- mean(apply(cbind(mupbtvi,sigmapbtvi,lambdapbtvi),1,tempf))
  return(var)
})

predr <- Vectorize(function(x) {
  tempf <- function(par) dsn(x, par[1], par[2], par[3])
  var <- mean(apply(cbind(mupr, sigmapr, lambdapr), 1, tempf))
  return(var)
})

hist(bmi,breaks=20,probability = TRUE, xlim=c(15,35))
curve(predtv,15,35,add=T,lwd=2,lty=1)
curve(predbtv,15,35,add=T,lwd=2,lty=2)
curve(predbtvi,15,35,add=T,lwd=2,lty=3)
curve(predr,15,35,add=T,lwd=2,lty=4)
box()
legend(30, 0.15, c("BTV(1,1)","BTV(0.5,0.5)","BTV(3,0.5)","Jeffreys"),
       text.col = "black", lty = c(1, 2, 3, 4),
       merge = TRUE, bg = "gray90")