Example

In this post, we present four methods for fitting the two-piece normal (see The family of two-piece distributions and Bayesian inference on the two-piece normal distribution using rstan) to a popular data set containing the heights of 219 volcanoes:

– Method I: using the command metrop() from the R package mcmc

– Method II: using the command adaptMetropGibbs() from the R package spBayes

– Method III: using the command Runtwalk() from the R package Rtwalk

– Method IV: Maximum Likelihood Estimation

The log posterior and log likelihood functions are implemented using the R package twopiece. We use a flat prior on the skewness parameter \(\gamma \in (0,1)\), which is based on a weakly informative prior proposed in Inference in Two-Piece Location-Scale Models with Jeffreys Priors.

R code

Data preparation

## --------------------------------------------------------------------------------------------------------------------------------------
rm(list=ls())

# Required packages
library(mcmc)
library(normalp)
#library(devtools)
#install_github("FJRubio67/twopiece")
library(twopiece)
library(spBayes)
library(Rtwalk)

######################################################################################################
#  Example with real data from Mudholkar and Hutson (2000). AG ~ Uniform(-1,1)
######################################################################################################

dat = 10*c(0.9,0.9,0.7,0.6,0.6,0.6,0.5,0.2,
           1.9,1.9,1.7,1.6,1.6,1.3,1.1,1.0,1.0,1.0,
           2.9,2.9,2.9,2.8,2.8,2.7,2.7,2.6,2.6,2.6,2.5,2.5,2.4,2.4,2.4,2.2,2.2,2.2,2.1,2.1,2.0,2.0,
           3.9,3.9,3.8,3.7,3.6,3.6,3.6,3.5,3.5,3.5,3.5,3.4,3.4,3.2,3.2,3.1,3.1,3.0,
           4.9,4.9,4.9,4.9,4.9,4.8,4.8,4.7,4.6,4.4,4.4,4.4,4.3,4.3,4.3,4.3,4.3,4.2,4.1,4.1,4.1,4.0,
           5.9,5.9,5.7,5.7,5.7,5.6,5.6,5.6,5.6,5.6,5.6,5.5,5.5,5.4,5.4,5.3,5.2,5.2,5.2,5.1,5.1,5.0,5.0,
           6.9,6.8,6.8,6.7,6.7,6.7,6.6,6.6,6.6,6.6,6.5,6.5,6.4,6.4,6.1,6.1,6.0,6.0,
           7.9,7.8,7.8,7.8,7.7,7.6,7.5,7.5,7.5,7.4,7.3,7.3,7.2,7.1,7.1,7.1,7.0,7.0,7.0,7.0,
           8.9,8.7,8.6,8.5,8.3,8.3,8.3,8.2,8.2,8.2,8.2,8.1,
           9.9,9.7,9.7,9.6,9.5,9.5,9.4,9.4,9.3,9.3,9.2,9.1,9.0,9.0,9.0,
           10.9,10.8,10.6,10.5,10.4,10.4,10.3,10.3,10.2,10.2,10.1,10.1,10.0,
           11.9,11.6,11.6,11.4,11.3,11.3,11.2,11.1,11.1,11.0,
           12.6,12.5,12.4,12.4,12.4,12.2,12.1,12.1,
           13.8,13.7,13.4,13.3,13.0,
           14.0,14.0,
           15.7,15.6,15.6,
           16.5,16.2,
           17.9,17.2,
           18.5,
           19.9,19.7,19.3,19.3,19.0)

hist(dat,probability=TRUE,xlab="Height", main="Volcanoes' data")

Method I

#---------------------------------------------------------------------------------------------------------
# Method I: Using the command metrop() from the mcmc package
#---------------------------------------------------------------------------------------------------------


# alpha0 :   Shape parameter in the proposal prior AG ~ Beta(alpha0,beta0).
# beta0  :   Shape parameter in the proposal prior AG ~ Beta(alpha0,beta0).
alpha0 <- 1
beta0 <- 1

# log-posterior par=(mu,sigma,gamma)
lp <- function(par){
  if(par[2]>0 & par[3]>-1 & par[3]<1){
    log.lik <- sum(dtp3(dat,par[1],par[2],par[3],FUN = dnorm,param="eps",log=T))
    log.prior <- -log(par[2]) + (beta0-1)*log(1-par[3]) + (alpha0-1)*log(1+par[3])
    return( log.lik + log.prior )
  }
  else return(-Inf)
}

# sc     :   Controls the proposal step size in the Metropolis sampling algorithm.
sc <- c(0.5,0.5,0.1)

# burnin :   Burn-in period.
burnin = 50000

# thin   :   Thinning period.
thin = 100

# NMH    :   Length of the Markov Chain.
NMH = 550000

# ini    :   Starting point for the MCMC sampler.
ini = c(30,40,-0.65)

# Metropolis sampler
set.seed(1234)
out1 <- metrop(lp, scale = sc, initial = ini, nbatch = NMH)

# Posterior sample after burning and thinning
ind = seq(burnin,NMH,thin)
sim1 = t(rbind( out1$batch[ , 1][ind], out1$batch[ , 2][ind],out1$batch[ , 3][ind]))
colnames(sim1) <- c("mu","sigma","gamma")
acc = out1$accept
paste('Acceptance rate = ', acc)

## [1] "Acceptance rate =  0.324330909090909"

# Summaries of the posterior sample
apply(sim1,2,summary)

##                mu    sigma      gamma
## Min.     5.403094 33.69770 -0.9596201
## 1st Qu. 24.068989 38.41945 -0.7198117
## Median  28.169203 39.65490 -0.6608443
## Mean    28.197307 39.73274 -0.6581991
## 3rd Qu. 32.328596 40.96602 -0.5992905
## Max.    50.388510 48.06078 -0.2945577

hist(sim1[,1],probability=TRUE,xlab=~mu,main=~mu)

hist(sim1[,2],probability=TRUE,xlab=~sigma,main=~sigma)

hist(sim1[,3],probability=TRUE,xlab=~gamma,main=~gamma)

bayes.est1 <- apply(sim1,2,mean) # Posterior mean

# Fitted density using the posterior mean estimate
fit.den1 <- Vectorize(function(x) dtp3(x,bayes.est1[1],bayes.est1[2],bayes.est1[3],param = "eps", FUN = dnorm))

Method II

#---------------------------------------------------------------------------------------------------------
# Method II: Using the command adaptMetropGibbs from spBayes
#---------------------------------------------------------------------------------------------------------

# Number of batches and batch length
n.batch <- 5500
batch.length <- 10

# Running an adaptive Metropolis within Gibbs sampler
set.seed(1234)
chain <- adaptMetropGibbs(ltd=lp, starting=ini, accept.rate=0.44, batch=n.batch, 
                          batch.length=batch.length, report=100, verbose=FALSE)$p.theta.samples

# # Burning and thinning the chain 
burn <- 5000
thin <- 50
NS <- n.batch*batch.length
index <- seq(burn,NS,thin)

# Chain after burn in and thinning
sim2 <- chain[index,]

# Posterior samples (after thinning and burn-in period)
apply(sim2,2,summary)

##              [,1]     [,2]       [,3]
## Min.     5.098531 34.27265 -0.9623174
## 1st Qu. 23.748454 38.44647 -0.7223949
## Median  28.200724 39.72434 -0.6613747
## Mean    27.850681 39.79547 -0.6626145
## 3rd Qu. 32.843916 41.03242 -0.5977527
## Max.    49.339685 48.70396 -0.3727193

hist(sim2[,1],probability=TRUE,xlab=~mu,main=~mu)

hist(sim2[,2],probability=TRUE,xlab=~sigma,main=~sigma)

hist(sim2[,3],probability=TRUE,xlab=~gamma,main=~gamma)

bayes.est2 <- apply(sim2,2,mean) # posterior mean
# Fitted density using the posterior mean estimate
fit.den2 <- Vectorize(function(x) dtp3(x,bayes.est2[1],bayes.est2[2],bayes.est2[3],param = "eps", FUN = dnorm))

Method III

#---------------------------------------------------------------------------------------------------------
# Method III: The twalk
#---------------------------------------------------------------------------------------------------------

# minus log-posterior par=(mu,sigma,gamma)
mlp <- function(par){
    log.lik <- sum(dtp3(dat,par[1],par[2],par[3],FUN = dnorm,param="eps",log=T))
    log.prior <- -log(par[2]) + (beta0-1)*log(1-par[3]) + (alpha0-1)*log(1+par[3])
    return( -log.lik - log.prior )
}

# Number of iterations 
NMH = 55000

# Support function 
Support <- function(x) {((0 < x[2]) & (abs(x[3]) < 1))}

# Function to generate the initial points in the sampler
X0 <- function(x) { ini + runif(3,-0.05,0.05) }

# Posterior samples
set.seed(1234)
out <- Runtwalk( dim=3,  Tr=NMH,  Obj=mlp, Supp=Support, x0=X0(), xp0=X0(),PlotLogPost = FALSE)

## This is the twalk for R.
## Evaluating the objective density at initial values.
## Opening 8 X 55001 matrix to save output.  Sampling (no graphics mode).

# Chain after burn in and thinning
sim3 <- out$output[index,]

# Posterior samples (after thinning and burn-in period)
apply(sim3,2,summary)

##              [,1]     [,2]       [,3]
## Min.     4.277433 33.91098 -0.9605187
## 1st Qu. 23.793774 38.32488 -0.7233402
## Median  27.913715 39.68946 -0.6694697
## Mean    27.626037 39.71578 -0.6677498
## 3rd Qu. 31.947417 40.80058 -0.6070168
## Max.    50.434340 47.53877 -0.3740797

hist(sim3[,1],probability=TRUE,xlab=~mu,main=~mu)

hist(sim3[,2],probability=TRUE,xlab=~sigma,main=~sigma)

hist(sim3[,3],probability=TRUE,xlab=~gamma,main=~gamma)

bayes.est3 <- apply(sim3,2,mean) # posterior mean
# Fitted density using the posterior mean estimate
fit.den3 <- Vectorize(function(x) dtp3(x,bayes.est3[1],bayes.est3[2],bayes.est3[3],param = "eps", FUN = dnorm))

Method IV

#---------------------------------------------------------------------------------------------------------
# Method IV: Maximum Likelihood Estimator
#---------------------------------------------------------------------------------------------------------

# minus log-likelihood par=(mu,sigma,gamma,delta)
ll <- function(par){
  if(par[2]>0 & par[3]>-1 & par[3]<1){
    tempval <- dtp3(dat,par[1],par[2],par[3],FUN = dnorm,param="eps",log=T)
    return( -sum(tempval) )
  }
  else return(Inf)
}

# Optimization step
OPT <- optim(c(30,40,-0.65),ll)
MLE <- OPT$par # Maximum Likelihood Estimator
# Fitted density using the MLE
fit.den4 <- Vectorize(function(x) dtp3(x,MLE[1],MLE[2],MLE[3],param = "eps", FUN = dnorm))

Comparison

#**************************************************************************************************
# Comparison of the fitted densities
#**************************************************************************************************

hist(dat,probability=TRUE,xlab="Height", main="Volcanoes data")
curve(fit.den1,0,200, add=T, lwd = 2)
curve(fit.den2,0,200, add=T, lwd = 2, lty = 2)
curve(fit.den3,0,200, add=T, lwd = 2, lty = 3)
curve(fit.den4,0,200, add=T, lwd = 2, lty = 4)
legend("topright",  legend = c("Method I", "Method II", "Method III", "Method IV"), 
       col = c("black","black","black","black"), lwd = c(2,2,2,2), lty = c(1,2,3,4) )
box()

Fitting the two-piece normal distribution to the Volcanoes dataset

F. Javier Rubio and Mark F.J. Steel

August 25, 2021