Thomas — Feb 25, 2013, 5:50 AM
# Bayesian statistics, MCMC-assignments, Chapter 6
# Note that I used the lambda.r-library for functional programming
# (and sacrificed performance for practice, clarity and safety)
suppressPackageStartupMessages({
require("lambda.r")
require("geoR")
})
set.seed(100000)
setwd("F:/SkyDrive/Statistics/Bayesian")
data <- read.table("osteoporosis study.txt", head=TRUE)
data_centered <- data
data_centered$BMI = data_centered$BMI - mean(data_centered$BMI)
n <- length(data$AGE)
# Run a tail-recursive function n times. Bug/limitation in lambda.r in next line:
# iterate(func, init, n, retain_as_df=TRUE) %::% Function, t, numeric, bool, t
# retain=TRUE -> here we retain all the states in the chain
iterate(func, init, n, TRUE, burn_in, ...) %as% {
l <- data.frame(init)
x <- init
for(i in 1:n) {
x <- func(x, ...)
l <- rbind(l, x)
}
l[(burn_in+1):(n+1),]
}
# retain=FALSE -> only return the final state in the chain (not used in final solution)
iterate(func, init, n, FALSE, ...) %as% {
x <- init
for(i in 1:n)
x <- func(x, ...)
x
}
seal(iterate)
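# A tiny usage illustration (not part of the original assignment): a trivial +1 step
# function shows what the retain=TRUE and retain=FALSE clauses return.
iterate(function(x) x + 1, 0, 5, TRUE, burn_in=2)  # post-burn-in part of the chain 0,1,...,5: 2 3 4 5
iterate(function(x) x + 1, 0, 5, FALSE)            # only the final state: 5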
# Define a state in our MCMC-chain (optional log-likelihood)
State(b0=0.1, b1=0.1, s2=0.1, ll=-Inf) %when% {
s2 >= 0
} %as% {
list(b0=b0, b1=b1, s2=s2, ll=ll)
}
print.State <- function(s) {
if(s$ll==-Inf)
cat(sprintf("(b0=%f, b1=%f, s2=%f)\n", s$b0, s$b1, s$s2))
else
cat(sprintf("(b0=%f, b1=%f, s2=%f), ll: %f\n", s$b0, s$b1, s$s2, s$ll))
}
seal(State)
# Define a likelihood-function on states:
# note: write partial function support for lambda.r
loglikelihood(x, data) %::% State : data.frame : numeric
loglikelihood(x, data) %as% {
# Gaussian log-likelihood of the residuals: TBBMC = b0 + b1*BMI + e, e ~ N(0, s2)
sum(dnorm(data$TBBMC - x$b0 - x$b1*data$BMI, 0, sqrt(x$s2), log=TRUE))
}
seal(loglikelihood)
# Now define our gibbs step function: (gibbs(State_i) -> State_i+1)
gibbs(x, data) %::% State : data.frame : State
gibbs(x, data) %as% {
# Scale for the s2 conditional, and the conditional means of b0 and b1
s_s2 <- mean((data$TBBMC - x$b0 - x$b1*data$BMI)^2)
b0_mean <- mean(data$TBBMC - x$b1*data$BMI)
b1_mean <- sum((data$TBBMC - x$b0)*data$BMI) / sum(data$BMI^2)
new_state <- State(
b0 = rnorm(1, b0_mean, sqrt(x$s2/n)),
b1 = rnorm(1, b1_mean, sqrt(x$s2/sum(data$BMI^2))),
s2 = rinvchisq(1, n, s_s2)
)
new_state$ll <- loglikelihood(new_state, data)
new_state
}
seal(gibbs)
# Define our Random-walk metropolis step function: (metropolis(State_i) -> State_i+1)
metropolis(current, data, c) %::% State : data.frame : numeric : State
metropolis(current, data, c=0.033) %when% {
c >= 0
} %as% {
# Propose a new state, calculate log likelihood
proposed <- State(
s2 = exp(rnorm(1, log(current$s2), c^2)),
b0 = rnorm(1, current$b0, c^2),
b1 = rnorm(1, current$b1, c^2)
)
proposed$ll <- loglikelihood(proposed, data)
# Calculate move probability, and use it
ratio <- min(exp(proposed$ll-current$ll), 1, na.rm=TRUE)
if(runif(1)<ratio)
return(proposed)
return(current)
}
seal(metropolis)
# ===============
# Exercise 1 & 2:
# ===============
# Run the code
g <- iterate(gibbs, State(), 10000, TRUE, burn_in=3000, data=data)
m <- iterate(metropolis, State(), 10000, TRUE, burn_in=3000, data=data, c=0.033)
g_centered <- iterate(gibbs, State(b0=1.8, b1=0.04, s2=0.15), 10000, TRUE, burn_in=3000, data=data_centered)
m_centered <- iterate(metropolis, State(b0=1.8, b1=0.04, s2=0.15), 10000, TRUE, burn_in=3000, data=data_centered, c=0.1)
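# Quick sanity check (illustrative, not part of the exercise): with the flat priors used
# above, the posterior means of b0 and b1 from the Gibbs chain should be close to the
# ordinary least-squares fit.
colMeans(g[, c("b0", "b1")])
coef(lm(TBBMC ~ BMI, data=data))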
# "Acceptance rate for metropolis algorithm:"
mean(m$ll[1:7000]!=m$ll[2:7001])
[1] 0.4339
# "Acceptance rate for centered metropolis algorithm:"
mean(m_centered$ll[1:7000]!=m_centered$ll[2:7001])
[1] 0.3676
# "Gibbs correlation between b0 and b1:"
cor(g$b0, g$b1)
[1] -0.09743
# "Metropolis correlation between b0 and b1:"
cor(m$b0, m$b1)
[1] -0.8189
# "Centered Gibbs correlation between b0 and b1:"
cor(m_centered$b0, m_centered$b1)
[1] -0.3124
# "Centered Metropolis correlation between b0 and b1:"
cor(m_centered$b0, m_centered$b1)
[1] -0.3124
# The |correlation| goes down from ~0.8 to ~0.3, which is an improvement: we can set our
# c value much higher for the same ~40% acceptance rate, making our chain mix much faster.
# We could expect this: the BMI values lie far from zero, which makes the intercept b0 and
# the slope b1 strongly (negatively) correlated, as the hint-code supplied in the exercise
# already suggested. Centering BMI around its mean is a way to reduce this, since it
# decorrelates b0 and b1 (see the quick check right below).
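# A quick check of the centering argument (illustrative): the correlation between the
# intercept and slope estimates follows from the design matrix (X'X)^-1 alone.
beta_cor <- function(x) {
  X <- cbind(1, x)
  V <- solve(t(X) %*% X)  # proportional to the covariance of (b0, b1)
  V[1, 2] / sqrt(V[1, 1] * V[2, 2])
}
beta_cor(data$BMI)                   # strongly negative for the raw BMI values
beta_cor(data$BMI - mean(data$BMI))  # essentially 0 after centering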
par(mfrow=c(4,3))
hist(g$b0)
hist(g$b1)
hist(g$s2)
hist(m$b0)
hist(m$b1)
hist(m$s2)
hist(g_centered$b0)
hist(g_centered$b1)
hist(g_centered$s2)
hist(m_centered$b0)
hist(m_centered$b1)
hist(m_centered$s2)
par(mfrow=c(2,2))
plot(g$b0, g$b1, type="l")
plot(m$b0, m$b1, type="l")
plot(g_centered$b0, g_centered$b1, type="l")
plot(m_centered$b0, m_centered$b1, type="l")
# ===========
# Exercise 3:
# ===========
lr_data <- read.table("linear_reg_without_intercept.csv", head=TRUE, sep=",")
#lr_data$x1 <- lr_data$x1 - mean(lr_data$x1)
#lr_data$x2 <- lr_data$x2 - mean(lr_data$x2)
lr_n <- length(lr_data$y)
# First we define the likelihood:
loglikelihood(x, data) %::% State : data.frame : numeric
loglikelihood(x, data) %as% {
# Gaussian log-likelihood of the residuals: y = b0*x1 + b1*x2 + e, e ~ N(0, s2)
sum(dnorm(data$y - x$b0*data$x1 - x$b1*data$x2, 0, sqrt(x$s2), log=TRUE))
}
seal(loglikelihood)
# We can deduce the marginals from the likelihood. I have a more extensive version on paper, but
# here's the summary:
# We start with the likelihood being the product of normal distributions, with as
# exponential power:
# -(b0*x1 + b1*x2 - y)^2
# -----------------------
# 2s^2
# To find the conditional of b0, we divide top and bottom by x1^2, giving:
# -(b0 - (y - b1*x2)/x1)^2
# -----------------------
# 2s^2/x1^2
# Remember that this is a product over all observations, so combining the terms gives:
# b0 | b1, s2 ~ Normal with mean sum((y - b1*x2)*x1)/sum(x1^2) and variance s^2/sum(x1^2)
# We do the same for b1.
#
# For s^2, with knowledge of the relevant distribution, we quickly see that it's
# a scaled inverse chi-square distribution, with n df and the scale as filled in below.
gibbs2(current, data) %::% State : data.frame : State
gibbs2(current, data) %as% {
b0_mean <- sum( (data$y-current$b1*data$x2)*data$x1 ) / sum(data$x1^2)
b0_sd <- sqrt(current$s2/sum(data$x1^2))
b1_mean <- sum((data$y-current$b0*data$x1)*(data$x2)) / sum(data$x2^2)
b1_sd <- sqrt(current$s2/sum(data$x2^2))
new_state <- State(
b0 = rnorm(1, b0_mean, b0_sd),
b1 = rnorm(1, b1_mean, b1_sd),
s2 = rinvchisq(1, lr_n, mean((data$y-current$b0*data$x1-current$b1*data$x2)^2) )
)
new_state$ll <- loglikelihood(new_state, data)
new_state
}
seal(gibbs2)
# Run the samplers
g2 <- iterate(gibbs2, State(b0=0.9, b1=1.9, s2=0.1), 10000, TRUE, burn_in=3000, data=lr_data)
m2 <- iterate(metropolis, State(b0=0.9, b1=1.9, s2=0.1), 10000, TRUE, burn_in=3000, c=0.35, data=lr_data)
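# Sanity check (illustrative): posterior means from the Gibbs chain vs. the least-squares
# fit of the no-intercept model y = b0*x1 + b1*x2 + e.
colMeans(g2[, c("b0", "b1")])
coef(lm(y ~ 0 + x1 + x2, data=lr_data))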
par(mfrow=c(2,1))
plot(g2$b0, g2$b1, type="l")
plot(m2$b0, m2$b1, type="l")
# "Acceptance rate for metropolis algorithm:"
mean(m2$ll[1:7000]!=m2$ll[2:7001])
[1] 0.3193
# Correlations:
cor(g2$b0, g2$b1)
[1] -0.00284
cor(m2$b0, m2$b1)
[1] 0.6707
# Which sampler is better? Let's look at the qq-plots of the b0 samples:
qqnorm(m2$b0, main="Metropolis b0")
qqnorm(g2$b0, main="Gibbs b0")
# It is clear that the Gibbs sampler is much better.
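# Another quick comparison (illustrative): autocorrelation of the b0 samples; the Gibbs
# chain mixes much faster than the random-walk Metropolis chain here.
par(mfrow=c(2,1))
acf(g2$b0, main="Gibbs b0")
acf(m2$b0, main="Metropolis b0")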
# =============
# Exercise 4.1:
# =============
# Question 4 is formulated in such a way that I'm not sure what exactly it is asking. My
# attempt at answering it anyway:
#
# Given:
# x, y, z ~ p(x, y, z)
# Draw x' by Gibbs sampling and show that the joint distribution is preserved:
# - We had x, y, z ~ p(x, y, z)
# - Now we have x', x, y, z ~ p(x' | x, y, z) * p(x, y, z)
# - By construction the Gibbs proposal p(x' | x, y, z) is the conditional of the target,
#   p(x' | y, z), which does not depend on x
# - Discarding (marginalizing out) the old x therefore gives x', y, z ~ p(x' | y, z) * p(y, z)
# - So, x', y, z ~ p(x' | y, z) * p(y, z) = p(x', y, z): the same joint we started from
# (A small numerical check of this argument follows below.)
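# A small numerical check of the 4.1 argument (illustrative, with made-up numbers): start
# from exact draws of a standard bivariate normal with correlation rho, apply one Gibbs
# update of x from its conditional p(x | y), and check that the joint is unchanged.
rho <- 0.7
N <- 100000
y_sim <- rnorm(N)
x_sim <- rnorm(N, rho*y_sim, sqrt(1 - rho^2))  # exact draws from p(x, y)
x_new <- rnorm(N, rho*y_sim, sqrt(1 - rho^2))  # one Gibbs step: x' ~ p(x | y)
c(cor(x_sim, y_sim), cor(x_new, y_sim))        # both approximately rho
c(sd(x_sim), sd(x_new))                        # both approximately 1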
# =============
# Exercise 4.2:
# =============
# Unless I misunderstood 4.1, this follows directly from 4.1: just repeat the argument of
# 4.1 three times, once for each of x, y and z. Each single-variable update leaves the
# joint p(x, y, z) invariant, so a full sweep of the three updates does as well.