MCMC Assignment

Thomas — Feb 25, 2013, 5:50 AM

# Bayesian statistics, MCMC assignments, Chapter 6
# Note that I used the lambda.r library for functional programming
# (and sacrificed performance for practice, clarity and safety)
suppressPackageStartupMessages({
  require("lambda.r")
  require("geoR")
})

set.seed(100000)
setwd("F:/SkyDrive/Statistics/Bayesian")

data <- read.table("osteoporosis study.txt", header=TRUE)
data_centered <- data
data_centered$BMI <- data_centered$BMI - mean(data_centered$BMI)
n <- nrow(data)

# Apply a step function n times, threading the state through (a functional
# replacement for an explicit chain loop). The type declaration below trips a
# bug/limitation in lambda.r, so it is left commented out:
# iterate(func, init, n, retain_as_df=TRUE) %::% Function, t, numeric, bool, t
# retain=TRUE -> retain all the states in the chain, dropping the burn-in
iterate(func, init, n, TRUE, burn_in, ...) %as% {
  l <- data.frame(init)   # row 1 holds the initial state
  x <- init
  for(i in 1:n) {
    x <- func(x, ...)
    l <- rbind(l, x)      # quadratic in n, but simple; performance was traded away deliberately
  }
  l[(burn_in+1):(n+1),]   # n+1 rows including the initial state; drop the burn-in
}
# retain=FALSE -> only return the final state in the chain (not used in final solution)
iterate(func, init, n, FALSE, ...) %as% {
  x <- init
  for(i in 1:n) 
    x <- func(x, ...)
  x
}
seal(iterate)
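# Quick sanity check of iterate (my own addition, not part of the assignment):
# with retain=FALSE, halving 1 four times should return only the final state, 0.0625.
iterate(function(x) x * 0.5, 1, 4, FALSE)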


# Define a state in our MCMC-chain (optional log-likelihood)
State(b0=0.1, b1=0.1, s2=0.1, ll=-Inf) %when% {
  s2 >= 0
} %as% {
  list(b0=b0, b1=b1, s2=s2, ll=ll)
}
print.State <- function(s) {
  if(s$ll==-Inf)
    cat(sprintf("(b0=%f, b1=%f, s2=%f)\n", s$b0, s$b1, s$s2))
  else
    cat(sprintf("(b0=%f, b1=%f, s2=%f), ll: %f\n", s$b0, s$b1, s$s2, s$ll))
}
seal(State)
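# Illustration (my own addition): the custom printer omits the log-likelihood
# field while it still holds its -Inf default.
print(State())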


# Define a likelihood-function on states:
# note: write partial function support for lambda.r
loglikelihood(x, data) %::% State : data.frame : numeric
loglikelihood(x, data) %as% {
  # dnorm expects the standard deviation (not the variance s2), and the residual
  # must not be squared before it goes in; log=TRUE keeps the long sum stable
  sum(dnorm(data$TBBMC - x$b0 - x$b1*data$BMI, 0, sqrt(x$s2), log=TRUE))
}
seal(loglikelihood)


# Now define our Gibbs step function: (gibbs(State_i) -> State_i+1)
gibbs(x, data) %::% State : data.frame : State
gibbs(x, data) %as% {
  # Draw each parameter from its full conditional, conditioning on the most
  # recently drawn values of the other parameters (a proper Gibbs sweep)
  b0 <- rnorm(1, mean(data$TBBMC - x$b1*data$BMI), sqrt(x$s2/n))
  b1 <- rnorm(1, sum((data$TBBMC - b0)*data$BMI)/sum(data$BMI^2),
              sqrt(x$s2/sum(data$BMI^2)))
  s2 <- rinvchisq(1, n, mean((data$TBBMC - b0 - b1*data$BMI)^2))

  new_state <- State(b0=b0, b1=b1, s2=s2)
  new_state$ll <- loglikelihood(new_state, data)
  new_state
}
seal(gibbs)


# Define our random-walk Metropolis step function: (metropolis(State_i) -> State_i+1)
metropolis(current, data, c) %::% State : data.frame : numeric : State
metropolis(current, data, c=0.033) %when% {
  c >= 0
} %as% {
  # Propose a new state (random walk on b0, b1 and on log(s2)), calculate log likelihood
  proposed <- State(
    s2 = exp(rnorm(1, log(current$s2), c^2)),
    b0 = rnorm(1, current$b0, c^2),
    b1 = rnorm(1, current$b1, c^2)
  )
  proposed$ll <- loglikelihood(proposed, data)

  # Calculate the move probability. The proposal is symmetric in log(s2) but not
  # in s2 itself, so the ratio carries the Jacobian correction s2_proposed/s2_current.
  ratio <- min(exp(proposed$ll - current$ll) * proposed$s2/current$s2, 1, na.rm=TRUE)
  if(runif(1) < ratio)
    return(proposed)
  return(current)
}
seal(metropolis)
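# Convenience helper (my own addition): in a retained chain, a rejected proposal
# repeats the previous log-likelihood exactly, so the acceptance rate can be read
# off as the fraction of consecutive ll values that differ.
acceptance_rate <- function(chain) mean(head(chain$ll, -1) != tail(chain$ll, -1))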


# ===============
# Exercise 1 & 2:
# ===============

# Run the code
g <- iterate(gibbs, State(), 10000, TRUE, burn_in=3000, data=data)
m <- iterate(metropolis, State(), 10000, TRUE, burn_in=3000, data=data, c=0.033)
g_centered <- iterate(gibbs, State(b0=1.8, b1=0.04, s2=0.15), 10000, TRUE, burn_in=3000, data=data_centered)
m_centered <- iterate(metropolis, State(b0=1.8, b1=0.04, s2=0.15), 10000, TRUE, burn_in=3000, data=data_centered, c=0.1)
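# Sanity check (my own addition): the posterior means from the Gibbs chain should
# land close to the ordinary least-squares fit of the same model.
coef(lm(TBBMC ~ BMI, data=data))
colMeans(g[, c("b0", "b1")])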


# "Acceptance rate for metropolis algorithm:"
mean(m$ll[1:7000]!=m$ll[2:7001])
[1] 0.4339
# "Acceptance rate for centered metropolis algorithm:"
mean(m_centered$ll[1:7000]!=m_centered$ll[2:7001])
[1] 0.3676
# "Gibbs correlation between b0 and b1:"
cor(g$b0, g$b1)
[1] -0.09743
# "Metropolis correlation between b0 and b1:"
cor(m$b0, m$b1)
[1] -0.8189
# "Centered Gibbs correlation between b0 and b1:"
cor(m_centered$b0, m_centered$b1)
[1] -0.3124
# "Centered Metropolis correlation between b0 and b1:"
cor(m_centered$b0, m_centered$b1)
[1] -0.3124
# The |correlation| goes down from ~0.8 to ~0.3, which is an improvement: we can set our
#      c value much higher for the same ~40% acceptance rate, making our chain mix much faster.
# We could expect this: since mean(BMI) lies far from zero, any increase in the slope b1
#      must be offset by a decrease in the intercept b0 to keep the fitted line through
#      the data, so the posterior for (b0, b1) is strongly negatively correlated (the
#      hint-code supplied with the exercise already pointed at this). Centering BMI
#      around x=0 removes most of this trade-off, since b0 then measures the mean
#      response at the mean BMI.
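# For reference (my own addition): with a flat prior, the posterior correlation
# between intercept and slope in simple linear regression is approximately
# -mean(x)/sqrt(mean(x^2)), which vanishes when the covariate is centered.
with(data, -mean(BMI) / sqrt(mean(BMI^2)))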

par(mfrow=c(4,3))
hist(g$b0)
hist(g$b1)
hist(g$s2)

hist(m$b0)
hist(m$b1)
hist(m$s2)

hist(g_centered$b0)
hist(g_centered$b1)
hist(g_centered$s2)

hist(m_centered$b0)
hist(m_centered$b1)
hist(m_centered$s2)

[Figure: histograms of the b0, b1 and s2 draws for each of the four chains]


par(mfrow=c(2,2))
plot(g$b0, g$b1, type="l")
plot(m$b0, m$b1, type="l")
plot(g_centered$b0, g_centered$b1, type="l")
plot(m_centered$b0, m_centered$b1, type="l")

[Figure: b0 vs b1 trace plots for the four chains]



# ===========
# Exercise 3:
# ===========
lr_data <- read.table("linear_reg_without_intercept.csv", header=TRUE, sep=",")
#lr_data$x1 <- lr_data$x1 - mean(lr_data$x1)
#lr_data$x2 <- lr_data$x2 - mean(lr_data$x2)
lr_n <- length(lr_data$y)

# First we define the likelihood:
loglikelihood(x, data) %::% State : data.frame : numeric
loglikelihood(x, data) %as% {
  # as before: unsquared residuals, sd = sqrt(s2), and log=TRUE for stability
  sum(dnorm(data$y - x$b0*data$x1 - x$b1*data$x2, 0, sqrt(x$s2), log=TRUE))
}
seal(loglikelihood)

# We can deduce the full conditionals from the likelihood. I have a more extensive
# version on paper, but here's the summary:
# We start with the likelihood being a product of normal densities, each with
# exponent:
#    -(b0*x1 + b1*x2 - y)^2
#    -----------------------
#             2s^2
# To find the conditional of b0, divide top and bottom by x1^2, giving:
#    -(b0 - (y - b1*x2)/x1)^2
#    ------------------------
#           2s^2/x1^2
# Remember that this is a product over all observations; completing the square
# across all terms gives, for b0, a normal distribution with mean
# sum((y - b1*x2)*x1)/sum(x1^2) and variance s^2/sum(x1^2).
# By symmetry, we do the same for b1.
#
# For s^2, with knowledge of the relevant distribution family, we quickly see that
# the conditional is a scaled inverse chi-square distribution with n degrees of
# freedom and the scale as filled in below.

gibbs2(current, data) %::% State : data.frame : State
gibbs2(current, data) %as% {
  # As in gibbs(): draw each parameter from its full conditional, conditioning
  # on the most recently drawn values of the others
  b0 <- rnorm(1, sum((data$y - current$b1*data$x2)*data$x1) / sum(data$x1^2),
              sqrt(current$s2/sum(data$x1^2)))
  b1 <- rnorm(1, sum((data$y - b0*data$x1)*data$x2) / sum(data$x2^2),
              sqrt(current$s2/sum(data$x2^2)))
  s2 <- rinvchisq(1, lr_n, mean((data$y - b0*data$x1 - b1*data$x2)^2))

  new_state <- State(b0=b0, b1=b1, s2=s2)
  new_state$ll <- loglikelihood(new_state, data)
  new_state
}
seal(gibbs2)

# Run the samplers
g2 <- iterate(gibbs2, State(b0=0.9, b1=1.9, s2=0.1), 10000, TRUE, burn_in=3000, data=lr_data)
m2 <- iterate(metropolis, State(b0=0.9, b1=1.9, s2=0.1), 10000, TRUE, burn_in=3000, c=0.35, data=lr_data)
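# Sanity check (my own addition): the posterior means should be close to the
# least-squares fit of the same no-intercept model.
coef(lm(y ~ 0 + x1 + x2, data=lr_data))
c(b0=mean(g2$b0), b1=mean(g2$b1))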
par(mfrow=c(2,1))
plot(g2$b0, g2$b1, type="l")
plot(m2$b0, m2$b1, type="l")

[Figure: b0 vs b1 trace plots for the gibbs2 and metropolis chains]

# "Acceptance rate for metropolis algorithm:"
mean(m2$ll[1:7000]!=m2$ll[2:7001])
[1] 0.3193
# Correlations:
cor(g2$b0, g2$b1)
[1] -0.00284
cor(m2$b0, m2$b1)
[1] 0.6707
# Which is better? Let's look at the qq-plots:
qqnorm(m2$b0)
qqnorm(g2$b0)

[Figure: normal Q-Q plots of the b0 draws from m2 and g2]

# It is clear from the Q-Q plots that the Gibbs sampler does much better here.

# =============
# Exercise 4.1:
# =============
# Question 4 is formulated in such a way that I'm not sure what exactly it is asking.
# My attempt at answering it anyway:
#
# Given:
#   (x, y, z) ~ p(x, y, z)
# draw x' by a single Gibbs step; the claim is that the target distribution is
# preserved (see the integral form after this list).
#  - We had  (x, y, z) ~ p(x, y, z)
#  - Now we have  (x', x, y, z) ~ p(x' | x, y, z) * p(x, y, z)
#  - The Gibbs kernel p(x' | x, y, z) = p(x' | y, z) does not depend on x, so
#    marginalizing x out gives  (x', y, z) ~ p(x' | y, z) * p(y, z)
#     + Note that p(x' | y, z) is by definition the conditional p(x | y, z)
#  - So  (x', y, z) ~ p(x | y, z) * p(y, z) = p(x, y, z), i.e. the stationary
#    distribution is unchanged
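# The same argument written as a single integral (my own addition; p denotes the
# target density throughout):
#   \int p(x' | y, z) p(x, y, z) dx = p(x' | y, z) \int p(x, y, z) dx
#                                   = p(x' | y, z) p(y, z)
#                                   = p(x', y, z)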

# =============
# Exercise 4.2:
# =============
# Unless I misunderstood 4.1, this follows directly from it: apply the 4.1 argument
# three times in succession, once for each variable. Each single-variable step leaves
# p(x, y, z) invariant, so their composition (a full Gibbs sweep) does too.