Example 2.5

Here is R code that can be used to generate Poisson random variables for large values of lambda. The sequence t contains the integer values in a range of roughly three standard deviations around the mean.

The last line of the loop checks which interval the uniform random variable fell in and assigns the corresponding Poisson value to X. See Exercise 2.14 for other distributions.

Nsim=10^4
lambda=100
spread=3*sqrt(lambda)
t=round(seq(max(0,lambda-spread),lambda+spread,1))
prob=ppois(t, lambda)
X=rep(0,Nsim)
for (i in 1:Nsim){
u=runif(1)
X[i]=t[1]+sum(prob<u) }
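
A quick sanity check (not part of the original program) compares the sample moments with the Poisson mean and variance, both equal to lambda:

mean(X)  # should be close to lambda=100
var(X)   # should also be close to lambda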

Accept–Reject algorithm

These so-called Accept–Reject methods only require us to know the functional form of the density f of interest (called the target density) up to a multiplicative constant. We use a simpler (to simulate) density g, called the instrumental or candidate density, to generate the random variable for which the simulation is actually done. The only constraints we impose on this candidate density g are that (i) f and g have compatible supports (i.e., g(x) > 0 when f(x) > 0) and (ii) there is a constant M with f(x)/g(x) ≤ M for all x.

M=1            # placeholder: M must satisfy f(x)/g(x) <= M for all x
u=runif(1)*M   # u ~ U(0,M)
y=randg(1)     # randg stands for a generator from the candidate g
while (u>f(y)/g(y)){
 u=runif(1)*M
 y=randg(1)}   # upon exit, y is accepted as a draw from f

In the implementation of the Accept–Reject algorithm above, if the total number of attempts Nsim were fixed, the number of accepted values would be a binomial random variable with success probability 1/M. In most cases, however, it is the number of accepted values that is fixed; the algorithm can nonetheless be implemented block-wise, as in

x=NULL
while (length(x)<Nsim){
  y=runif(Nsim*M)
  x=c(x,y[runif(Nsim*M)*M<dbeta(y,a,b)])}
x=x[1:Nsim]

(Note that using y=u=runif(Nsim*M) in the program would produce a bias, as y and u would then take the same values.) Simulating Nsim*M proposals from the start reduces the number of iterations of the while loop, since Nsim*M is the expected number of proposals required (Exercise 2.5).
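
The binomial argument can be checked empirically (an illustrative addition, reusing the Be(2.7, 6.3) target with its uniform candidate and bound M):

a=2.7; b=6.3
M=optimize(f=function(x){dbeta(x,a,b)},maximum=TRUE,interval=c(0,1))$objective
y=runif(10^4)
mean(runif(10^4)*M<dbeta(y,a,b))  # empirical acceptance frequency
1/M                               # theoretical acceptance probability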

Exercise 2.19

In an Accept–Reject algorithm that generates a N (0, 1) random variable from a double-exponential distribution with density g(x|α) = (α/2) exp(−α|x|), compute the upper bound M over f /g and show that the choice α = 1 optimizes the corresponding acceptance rate.

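Since f(x)/g(x) = sqrt(2/π) α^(-1) exp(α|x| − x²/2), the exponent is maximal at |x| = α, so that M = M(α) = sqrt(2/π) α^(-1) exp(α²/2). Minimizing log M(α) = −log(α) + α²/2 (up to a constant) in α gives α² = 1, hence α = 1 is optimal, with M = sqrt(2e/π) ≈ 1.3155 and acceptance rate 1/M ≈ 0.76. A numerical check of this derivation (not in the original notes):

Mbound=function(al){sqrt(2/pi)*exp(al^2/2)/al}
optimize(Mbound,interval=c(.1,10))  # minimum attained at alpha=1
sqrt(2*exp(1)/pi)                   # the corresponding bound M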

Exercise 2.20

In each of the following cases, construct an Accept–Reject algorithm, generate a sample of the corresponding random variables, and draw the density function on top of the histogram.

a. Generate normal random variables using a Cauchy candidate in Accept–Reject.

b. Generate gamma G(4.3, 6.2) random variables using a gamma G(4, 7) candidate.

a

f <- function(x,m,v) dnorm(x,m,sqrt(v))

g <- function(x,x0,lambda) dcauchy(x,x0,lambda)

genSamp <- function(n,m,v) {
  stProbe <- rep(0,n)
  # numerical bound M on f/g over a grid of +/- 10 standard deviations
  interval <- seq(m - 10 * sqrt(v), m + 10 * sqrt(v), by = 0.01)
  M <- max(f(interval,m,v)/g(interval,m,v))
  for (i in 1:n) {
    repeat{
      x <- rcauchy(1,m,v)  # Cauchy candidate with location m and scale v
      u <- runif(1)
      if(u < f(x,m,v)/(M*g(x,m,v))) break
      }
    stProbe[i] <- x
    }
  return(stProbe)
  }

set.seed(0)
test <- genSamp(1000, 2, 0.5)
shapiro.test(test)$p.value
## [1] 0.1563038

ks.test(test, rnorm(1000, 2, sqrt(0.5)))$p.value
## [1] 0.7590978
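The exercise also asks for the target density drawn on top of the histogram, which the solution above omits; a minimal addition:

hist(test,freq=FALSE,main="N(2,0.5) by Accept-Reject",xlab="x")
curve(dnorm(x,2,sqrt(0.5)),add=TRUE,lwd=2)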

b

n=1000
alpha = 4.3 #shape
beta = 6.2 #rate
k = 2.8 #constant
x <- seq(0,10,length.out=n)
f <- dgamma(x,shape=alpha, rate = beta)
plot(x,k*dgamma(x,shape=4,rate=7),type="l",ylim=c(0,5),xlab="x",ylab="Density") # plots k times the G(4,7) candidate density
lines(x,f,lty=4) # plots the target G(4.3,6.2) density
xc <- rep(0,n)
yc <- rep(0,n)
for(i in 1:n){
  reject <- TRUE
  while(reject){
    p <- runif(1)
    xc[i] <- qgamma(p,shape=4,rate=7) # candidate drawn by inverting the G(4,7) cdf
    p <- runif(1)
    yc[i] <- p*k*dgamma(xc[i],shape=4,rate=7)
    if (yc[i] < dgamma(xc[i],shape=alpha,rate=beta)){ # the point falls under the G(alpha,beta) density
      reject <- FALSE
    }
  }
}
points(xc,yc,type='p') # the accepted points

hist(xc,probability=T,xlim=c(0,10),ylim=c(0,2),xlab="x",col=16,breaks=(n/16)) # plots the histogram of the accepted values
lines(x,dgamma(x,shape=alpha,rate=beta)) # plots the target density for comparison

Example 3.1


ch=function(la){
  integrate(function(x){x^(la-1)*exp(-x)},0,Inf)$val}
plot(lgamma(seq(.01,10,le=100)),log(apply(as.matrix(
 seq(.01,10,le=100)),1,ch)),xlab=expression(log(Gamma(lambda))),
 ylab="log(integrate(f))",pch=19,cex=.6)

We obtain the sequence represented in Figure 3.1, which does not show any discrepancy between the two evaluations, even for very small values of λ.

Exercise 3.1


a

The plot of the integrands follows from a simple R program:
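
A sketch of such a plot (taking x = 4 for illustration; the grid and plotting range are arbitrary choices, not from the original):

x=4
th=seq(-5,10,le=500)
plot(th,th*dnorm(th,mean=x)*dcauchy(th),type="l",xlab=expression(theta),ylab="integrands")
lines(th,dnorm(th,mean=x)*dcauchy(th),lty=2)

The estimator δ(x) is then approximated by simulating from the Cauchy prior: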

Niter=10^4
co=rcauchy(Niter)

# delta(x) is then estimated by mean(co*dnorm(co,mean=x))/mean(dnorm(co,mean=x))

x=0
mean(co*dnorm(co,mean=x))/mean(dnorm(co,mean=x))
## [1] 0.0007173104
x=2
mean(co*dnorm(co,mean=x))/mean(dnorm(co,mean=x))
## [1] 1.268048
x=4
mean(co*dnorm(co,mean=x))/mean(dnorm(co,mean=x))
## [1] 3.462791

b

x1=dnorm(co,mean=x)
estint2=cumsum(x1)/(1:Niter)
esterr2=sqrt(cumsum((x1-estint2)^2))/(1:Niter)

x1=co*x1
estint1=cumsum(x1)/(1:Niter)
esterr1=sqrt(cumsum((x1-estint1)^2))/(1:Niter)
par(mfrow=c(1,2))
plot(estint1,type="l",xlab="iteration",ylab="",col="gold")
lines(estint1-2*esterr1,lty=2,lwd=2)
lines(estint1+2*esterr1,lty=2,lwd=2)
plot(estint2,type="l",xlab="iteration",ylab="",col="gold")
lines(estint2-2*esterr2,lty=2,lwd=2)
lines(estint2+2*esterr2,lty=2,lwd=2)


x=0
max(4*var(dnorm(co,m=x))*10^6,4*var(co*dnorm(co,m=x))*10^6)
## [1] 98034.7
x=2
4*10^6*max(var(dnorm(co,m=x)),var(co*dnorm(co,m=x)))
## [1] 210945.7
x=4
10^6*4*max(var(dnorm(co,m=x)),var(co*dnorm(co,m=x)))
## [1] 313441.2

c

A similar implementation applies for the normal simulation, replacing dnorm with dcauchy in the above. The comparison is clear in that the required number of normal simulations when x = 4 is 1398.22, to be compared with the 313441.2 obtained above.
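
For instance, a sketch of the swapped computation (no denotes the normal sample, a name added here):

x=4
no=rnorm(Niter,mean=x)
mean(no*dcauchy(no))/mean(dcauchy(no))  # same estimator, now sampling from the normal
10^6*4*max(var(dcauchy(no)),var(no*dcauchy(no)))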

Exercise 3.3


h=function(x){1/(x^2*sqrt(2*pi)*exp(1/(2*x^2)))} # change-of-variable integrand: its integral over (0,1/20) equals P(Z>20)

par(mfrow=c(2,1))
curve(h,from=0,to=1/20,xlab="x",ylab="h(x)",lwd=2)

I=1/20*h(runif(10^4)/20)

estint=cumsum(I)/(1:10^4)

esterr=sqrt(cumsum((I-estint)^2))/(1:10^4)

plot(estint,xlab="Iterations",ty="l",lwd=2,ylim=mean(I)+20*c(-esterr[10^4],esterr[10^4]),ylab="")
lines(estint+2*esterr,col="gold",lwd=2)
lines(estint-2*esterr,col="gold",lwd=2)

The estimated probability is 2.505e-89 with an error of ±3.61e-90, compared with

integrate(h,0,1/20)
## 2.759158e-89 with absolute error < 5.4e-89
pnorm(-20)
## [1] 2.753624e-89

Exercise 3.13

For the three choices of candidate density, the importance weights are easily computed:

x1=sample(c(-1,1),10^4,rep=T)*rexp(10^4)
w1=exp(-sqrt(abs(x1)))*sin(x1)^2*(x1>0)/(.5*dexp(abs(x1)))
x2=rcauchy(10^4)*2
w2=exp(-sqrt(abs(x2)))*sin(x2)^2*(x2>0)/(dcauchy(x2/2)/2)
x3=rnorm(10^4)
w3=exp(-sqrt(abs(x3)))*sin(x3)^2*(x3>0)/dnorm(x3)

boxplot(as.data.frame(cbind(w1,w2,w3)))

Computing the effective sample size 1/sum((w1/sum(w1))^2) introduced in Example 3.10 shows that the preferable choice is g1. The estimated sizes are given by

4*10^6*var(x1*w1/sum(w1))/mean(x1*w1/sum(w1))^2
## [1] 10273593
4*10^6*var(x2*w2/sum(w2))/mean(x2*w2/sum(w2))^2
## [1] 44129525
4*10^6*var(x3*w3/sum(w3))/mean(x3*w3/sum(w3))^2
## [1] 77744811
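
The effective sample sizes themselves can be computed with a one-line helper (an addition to the original code):

ess=function(w){1/sum((w/sum(w))^2)}
ess(w1); ess(w2); ess(w3)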

This again shows the appeal of using the double-exponential proposal. (Note that efficiency could be doubled by considering the absolute values of the simulations.)

Chapter 6

The working principle of Markov chain Monte Carlo methods is quite straightforward to describe. Given a target density f, we build a Markov kernel K with stationary distribution f and then generate a Markov chain (X(t)) using this kernel, so that the limiting distribution of (X(t)) is f and integrals can be approximated according to the Ergodic Theorem (6.2). The difficulty should thus be in constructing a kernel K that is associated with an arbitrary density f. But, quite miraculously, there exist methods for deriving such kernels that are universal, in that they are theoretically valid for any density f! The Metropolis–Hastings algorithm is an example of such methods. (Gibbs sampling, described in Chapter 7, is another example with equally universal potential.) Given the target density f, the Metropolis–Hastings algorithm is associated with a working conditional density q(y|x) that, in practice, is easy to simulate. In addition, q can be almost arbitrary in that the only theoretical requirements are that the ratio f(y)/q(y|x) is known up to a constant independent of x and that q(·|x) has enough dispersion to lead to an exploration of the entire support of f. Once again, we stress the incredible feature of the Metropolis–Hastings algorithm: for every given q, we can construct a Metropolis–Hastings kernel such that f is its stationary distribution.
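
A minimal sketch of a single Metropolis–Hastings transition, for a generic target f and a candidate with generator rq and density dq (all four names are placeholders introduced here, not from the book):

MHstep=function(x,f,rq,dq){
  y=rq(x)                                 # propose y ~ q(.|x)
  rho=min(1,f(y)*dq(x,y)/(f(x)*dq(y,x)))  # acceptance probability, dq(u,v)=q(u|v)
  if (runif(1)<rho) y else x              # accept the move or repeat the current value
  }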

Example 6.2

Nsim=10^4
X=c(rt(1,1)) # initialize the chain with a draw from the stationary (Cauchy) distribution
for (t in 2:Nsim){
 Y=rnorm(1) # candidate normal
 rho=dt(Y,1)*dnorm(X[t-1])/(dt(X[t-1],1)*dnorm(Y))
 X[t]=X[t-1] + (Y-X[t-1])*(runif(1)<rho)
 }

When executing this code, you may sometimes start with a large value for X(0), 12.788 say. In this case, dnorm(X[t-1]) is numerically negligible because, while 12.788 can formally be a realization from a normal N(0, 1), it induces computational underflow problems, as shown by

pnorm(12.78,log=T,low=F)/log(10)
## [1] -36.97455
plot(cumsum(X<3)/(1:Nsim),lwd=2,ty="l",ylim=c(.85,1))

Example 6.4

Exercise 6.7


a

We generate a Metropolis–Hastings sample from the Be(2.7, 6.3) density using uniform simulations.

Nsim=10^4
a=2.7;
b=6.3
X=runif(Nsim)
last=X[1]
for (i in 1:Nsim) {
  cand=rbeta(1,1,1)
  alpha=(dbeta(cand,a,b)/dbeta(last,a,b))/(dbeta(cand,1,1)/dbeta(last,1,1))
  if (runif(1)<alpha)
    last=cand
  X[i]=last
  }
hist(X,pro=TRUE,col="wheat2",xlab="",ylab="",main="Beta(2.7,6.3) simulation")
curve(dbeta(x,a,b),add=T,lwd=2,col="sienna2")

The acceptance rate is estimated by

length(unique(X))/Nsim
## [1] 0.9002

If instead we use a Be(20,60) proposal, the modified lines in the R program are

cand=rbeta(1,20,60)
alpha=(dbeta(cand,a,b)/dbeta(last,a,b))/(dbeta(cand,20,60)/dbeta(last,20,60))

b

In the case of a truncated beta, the following R program

Nsim=5000
a=2.7;
b=6.3;
c=0.25;
d=0.75
X=rep(runif(1),Nsim)
test2=function(){
  last=X[1]
  for (i in 1:Nsim){
    cand=rbeta(1,2,6)
    alpha=(dbeta(cand,a,b)/dbeta(last,a,b))/(dbeta(cand,2,6)/dbeta(last,2,6))
    if ((runif(1)<alpha)&&(cand<d)&&(c<cand))
      last=cand
    X[i]=last
    }
  }
test1=function(){
  last=X[1]
  for (i in 1:Nsim){
    cand=runif(1,c,d)
    alpha=(dbeta(cand,a,b)/dbeta(last,a,b))
    if ((runif(1)<alpha)&&(cand<d)&&(c<cand))
      last=cand
    X[i]=last
    }
  }
system.time(test1());system.time(test2())
##    user  system elapsed 
##    0.06    0.00    0.07
##    user  system elapsed 
##    0.07    0.00    0.08

shows very similar running times but more efficiency for the beta proposal, since the acceptance rates are approximated by 0.51 and 0.72 for test1 and test2, respectively. When changing to c = 0.25, d = 0.75, test1 is more efficient than test2, with acceptance rates of approximately 0.58 and 0.41, respectively.
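
The acceptance rates quoted above can be approximated by making test1 and test2 return X (a small modification of the functions as written) and computing, for instance:

length(unique(X))/Nsim  # proportion of accepted proposals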

Example 6.5


scale=1
the=matrix(runif(2,-2,5),ncol=2)
curlike=hval=like(x) # like() is the log-likelihood of the target, defined earlier in the book
Niter=10^4
for (iter in (1:Niter)){
 prop=the[iter,]+rnorm(2)*scale
 if ((max(-prop)>2)||(max(prop)>5)||
 (log(runif(1))>like(prop)-curlike)) prop=the[iter,]
 curlike=like(prop)
 hval=c(hval,curlike)
 the=rbind(the,prop)
 }

Since the main problem of this target is the existence of two modes, one of which is smaller than the other, we can compare the impact of different choices of scale on the behavior of the chain in terms of exploration of both modes and the attraction therein. When the scale is 1, the modes are highly attractive and, out of 10^4 iterations, it is not uncommon to explore only one mode neighborhood, as shown in Figure 6.8 (left and center) for both modes. If the scale increases to 2, the proposal is diverse enough to reach both modes, but at a cost: out of 10^4 iterations, the chain changes values only 23 times! For the smaller scale 1, the number of changes is closer to 100, still a very low acceptance rate.

An issue that often arises when using random walks on constrained domains is whether or not the random walk should be constrained as well. The answer is no: using constraints in the proposal modifies the function g and thus jeopardizes the validity of the ratio of the targets found in Algorithm 6. When values y_t outside the support of f are proposed (that is, when f(y_t) = 0), the proposed value is rejected and the current value X(t) is duplicated. Obviously, picking a random walk density that often ends up outside the domain of f is a poor idea, in that the chain will be stuck most of the time, but it is formally correct.
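
The number of changes mentioned above can be monitored directly from the stored chain (an ad hoc diagnostic added here):

sum(the[-1,1]!=the[-nrow(the),1])  # number of iterations where the chain moved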

Exercise 6.9

a

g47=rgamma(5000,4,7)
u=runif(5000,max=dgamma(g47,4,7))
x=g47[u<dgamma(g47,4.3,6.2)]
par(mfrow=c(1,3),mar=c(4,4,1,1))
hist(x,freq=FALSE,xlab="",ylab="",col="wheat2",main="Accept-Reject with Ga(4,7) proposal")
curve(dgamma(x,4.3,6.2),lwd=2,col="sienna",add=T)

length(x)/5000
## [1] 0.842

Efficiency: since the accepted draws of an Accept–Reject algorithm are almost surely distinct, this coincides with the acceptance rate computed above.

length(unique(x))/5000
## [1] 0.842

b

X=rep(0,5000)
X[1]=rgamma(1,4.3,6.2)
for (t in 2:5000){
  rho=(dgamma(X[t-1],4,7)*dgamma(g47[t],4.3,6.2))/(dgamma(g47[t],4,7)*dgamma(X[t-1],4.3,6.2))
  X[t]=X[t-1]+(g47[t]-X[t-1])*(runif(1)<rho)
  }
hist(X,freq=FALSE,xlab="",ylab="",col="wheat2",main="Metropolis-Hastings with Ga(4,7) proposal")
curve(dgamma(x,4.3,6.2),lwd=2,col="sienna",add=T)


Efficiency:

length(unique(X))/5000
## [1] 0.783

c

g56=rgamma(5000,5,6)
X[1]=rgamma(1,4.3,6.2)
for (t in 2:5000){
  rho=(dgamma(X[t-1],5,6)*dgamma(g56[t],4.3,6.2))/(dgamma(g56[t],5,6)*dgamma(X[t-1],4.3,6.2))
  X[t]=X[t-1]+(g56[t]-X[t-1])*(runif(1)<rho)
  }
hist(X,freq=FALSE,xlab="",ylab="",col="wheat2",main="Metropolis-Hastings with Ga(5,6) proposal")
curve(dgamma(x,4.3,6.2),lwd=2,col="sienna",add=T)

Efficiency:

length(unique(X))/5000
## [1] 0.7722

Example 6.9

Example 6.10


Exercise 6.15


a

Nsim=5000
A=B=runif(Nsim)
alpha=1;
alpha2=3
last=A[1]
a=0;
b=1
cand=ifelse(runif(Nsim)>0.5,1,-1) * rexp(Nsim)/alpha
for (i in 1:Nsim){
rate=(dnorm(cand[i],a,b)/dnorm(last,a,b))/(exp(-alpha*abs(cand[i]))/exp(-alpha*abs(last))) # dnorm's third argument is the sd b, not the variance b^2
  if (runif(1)<rate) 
    last=cand[i]
  A[i]=last
  }
cand=ifelse(runif(Nsim)>0.5,1,-1) * rexp(Nsim)/alpha2
for (i in 1:Nsim) {
rate=(dnorm(cand[i],a,b)/dnorm(last,a,b))/(exp(-alpha2*abs(cand[i]))/exp(-alpha2*abs(last)))
  if (runif(1)<rate) 
    last=cand[i]
  B[i]=last
  }
par (mfrow=c(1,3),mar=c(4,4,2,1))
est1=cumsum(A)/(1:Nsim)
est2=cumsum(B)/(1:Nsim)
plot(est1,type="l",xlab="iterations",ylab="",lwd=2)
lines(est2,lwd=2,col="gold2")
acf(A)
acf(B)

b

alf=seq(1,10,le=50)
cand0=ifelse(runif(Nsim)>0.5,1,-1) * rexp(Nsim)
acce=rep(0,50)
for (j in 1:50){
  cand=cand0/alf[j]
  last=A[1]
  for (i in 2:Nsim){
rate=(dnorm(cand[i],a,b)/dnorm(last,a,b))/(exp(-alf[j]*abs(cand[i]))/exp(-alf[j]*abs(last)))
    if (runif(1)<rate) 
      last=cand[i]
    A[i]=last}
  acce[j]=length(unique(A))/Nsim}
par(mfrow=c(1,3),mar=c(4,4,2,1))
plot(alf,acce,xlab="",ylab="",type="l",main="Laplace iid")

c

ome=sqrt(seq(.01,10,le=50))
cand0=rnorm(Nsim)
acce=rep(0,50)
for (j in 1:50){
  cand=cand0*ome[j]
  last=A[1]
  for (i in 2:Nsim){
rate=(dnorm(cand[i],a,b)/dnorm(last,a,b))/(dnorm(cand[i],sd=ome[j])/dnorm(last,sd=ome[j]))
    if (runif(1)<rate) 
      last=cand[i]
    A[i]=last
    }
  acce[j]=length(unique(A))/Nsim
  }
plot(ome^2,acce,xlab="",ylab="",type="l",main="Normal iid")

d

alf=seq(.1,10,le=50)
cand0=ifelse(runif(Nsim)>0.5,1,-1) * rexp(Nsim)
acce=rep(0,50)
for (j in 1:50){
  eps=cand0/alf[j]
  last=A[1]
  for (i in 2:Nsim){
    cand[i]=last+eps[i]
rate=dnorm(cand[i],a,b)/dnorm(last,a,b)
    if (runif(1)<rate) 
      last=cand[i]
    A[i]=last
    }
  acce[j]=length(unique(A))/Nsim
  }
plot(alf,acce,xlab="",ylab="",type="l",main="Laplace random walk")

Test 2019

1

a

ex 2.19

teta=integrate(function(x){ifelse(x>0 & x<2, x^5*(1/4), 0)},0,2)
teta
## 2.666667 with absolute error < 3e-14
h=function(x){ifelse(x>0 & x<2, x^5*(1/4), 0)}
x=runif(10^4,0,2)
teta_chapeu=2*mean(h(x))
teta_chapeu
## [1] 2.673592
y=2*h(x)
estint=cumsum(y)/(1:10^4)
esterr=sqrt(cumsum((y-estint)^2))/(1:10^4)
plot(estint, xlab="Mean and error range",main="Convergence monitoring",type="l",lwd=2,ylim=mean(y)+40*c(-esterr[10^4],esterr[10^4]),ylab="")
lines(estint+2*esterr,col="gold",lwd=2)
lines(estint-2*esterr,col="gold",lwd=2)
abline(h=teta$value,col="red")

Error:

teta_chapeu-teta$value
## [1] 0.006925579

b

ex 2.19

hf=function(x){ifelse(x>0 & x<1,4*(x^5)/(2-2*x),0)}
x2=rbeta(10^5,1,2)
teta_est=(1/16)*mean(hf(x2))
teta_est
## [1] 0.0414873
var1=sd(hf(x2))^2/10^5; var1
## [1] 0.0002916141
hf2=function(x){ifelse(x>0 & x<1,4*(x^5)/(6*(x^5)),0)}
x3=rbeta(10^5,6,1)
teta_est1=(1/16)*mean(hf2(x3))
teta_est1
## [1] 0.04166667
var2=sd(hf2(x3))^2/10^5; var2
## [1] 4.108281e-38

2


First candidate: N(0,1)

f=function(x){dcauchy(x)}
q=function(x){dnorm(x,0,1)}

n=10^4
X=runif(n,0,1)
x=X[1]
for (i in 1:n){
  y=rnorm(1,0,1)
  prob=(f(y)*q(x))/(f(x)*q(y))
  if (runif(1)<prob) {x=y}
  X[i]=x
}

hist(X,pro=TRUE)
curve(dcauchy(x),add=T,col='red')

ar1=length(unique(X))/length(X)
ar1
## [1] 0.8378
plot(X,type="l",xlab="Iterations",ylab="Values of X")

plot(cumsum(X)/length(X),type="l",xlab="Iterations",ylab="mean")
abline(h=0,col="red")

acf(X)

y.cauchy <- qcauchy(ppoints(length(X)))
qqplot(X, y.cauchy)

Second candidate: t(10)

f=function(x){dcauchy(x)}
q=function(x){dt(x,10)}

n=10^4
X=runif(n,0,1)
x=X[1]
for (i in 1:n){
  y=rt(1,10)
  prob=(f(y)*q(x))/(f(x)*q(y))
  if (runif(1)<prob) {x=y}
  X[i]=x
}

hist(X,pro=TRUE)
curve(dcauchy(x),add=T,col='red')

ar2=length(unique(X))/length(X)
ar2
## [1] 0.7571
plot(X,type="l",xlab="Iterations",ylab="Values of X")

plot(cumsum(X)/length(X),type="l",xlab="Iterations",ylab="mean")
abline(h=0,col="red")

acf(X)

y.cauchy <- qcauchy(ppoints(length(X)))
qqplot(X, y.cauchy)