Here is a summary of the code I've written so far.
Following the procedure of Jones, Schonlau, and Welch, I first fit a Gaussian process model to a simple, deterministic function with one input \( x \), meant to act as a simple “simulator”. For ease, I evenly spaced the points in \( x \) at which the function is evaluated.
# Generate some data from a function
xs <- seq(-2, 3, length = 10)
fs <- dgamma(xs + 2.2, shape = 1.4, scale = 3) +
    0.1 * dgamma(xs + 2.2, shape = 1.4, scale = 3) * sin(2 * pi * xs/1.5)
I next define the covariance function to be the simple non-isotropic Gaussian, or squared exponential, covariance. Strictly speaking it is trivially isotropic in this one-input case, but it is written to allow a separate scale parameter in each dimension for higher-dimensional inputs. Conditional on the GP parameters, the likelihood function is multivariate normal.
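For reference, the correlation between two inputs \( x \) and \( x' \) implemented below is
\[
R(x, x') = \exp\left( -\sum_{k=1}^{p} \theta_k \, \lvert x_k - x'_k \rvert^{\alpha} \right),
\]
where the \( \theta_k \) are the scale parameters and \( \alpha \) is the roughness parameter, fixed at 2 in the code that follows.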
########################
# Correlation Function for GP Model
#
# Input:
# x1,x2: two px1 vectors defined on the input space
# scale: px1 vector dictating the correlation length in each input dimension
# rough: scalar. Often will be set to fixed value of 2 to make the
# GP response infinitely differentiable
#
# Output:
# scalar, correlation between the two parameter values
#
corrF <- function(x1, x2, scale, rough){
    # Non-isotropic, squared exponential correlation function
    exp(-sum(scale * abs(x1 - x2)^rough))
}
########################
# Build Correlation Matrix for GP model
#
# Input:
# Xmat: nxp matrix of input values
# scale: px1 vector dictating the correlation length in each input dimension
# rough: scalar. Often will be set to fixed value of 2 to make the
# GP response infinitely differentiable
#
# Output:
# corrMat: nxn correlation matrix for the design points
#
# Requires:
# - Correlation Function - corrF()
# - See above
#
getCorrMat <- function(Xmat, scale, rough = 2){
    Xmat <- as.matrix(Xmat)
    n <- dim(Xmat)[1]
    corrMat <- matrix(0, nrow = n, ncol = n)
    # Fill the lower triangle and mirror it to the upper triangle
    for (i in 1:n){
        for (j in 1:i){
            corrMat[i,j] <- corrF(Xmat[i,], Xmat[j,], scale, rough)
            corrMat[j,i] <- corrMat[i,j]
        }
    }
    # Symmetrize to guard against numerical asymmetry
    corrMat <- (corrMat + t(corrMat))/2
    return(corrMat)
}
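As a quick check (not part of the analysis itself), we can build the correlation matrix for the design points at an arbitrary trial scale and confirm it has the expected structure:
# Quick sanity check at an arbitrary trial scale of 1 (illustration only)
R <- getCorrMat(xs, scale = 1)
dim(R)            # 10 x 10, one row/column per design point
isSymmetric(R)    # TRUE
range(R)          # entries lie in (0, 1], with 1s on the diagonal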
########################
# Get vector of correlations between new inputs and the current design points
#
# Input:
# predXvec: px1 vector giving a single new input location to predict at
# Xmat: nxp matrix of input values
# scale: px1 vector dictating the correlation length in each input dimension
# rough: scalar. Often will be set to fixed value of 2 to make the
# GP response infinitely differentiable
#
# Output:
# corrVec: nx1 vector of correlations between the new input and the design points
#
# Requires:
# - Correlation Function - corrF()
# - See above
#
getCorrVec <- function(predXvec, Xmat, scale, rough = 2){
    Xmat <- as.matrix(Xmat); predXvec <- as.matrix(predXvec)
    n <- dim(Xmat)[1]
    corrVec <- matrix(0, nrow = n, ncol = 1)
    for (j in 1:n){
        corrVec[j,1] <- corrF(predXvec, Xmat[j,], scale, rough)
    }
    # Returned as a matrix for ease of later calculations
    return(corrVec)
}
########################
# Calculate the likelihood function
#
# Input:
# scale: px1 vector of the scale parameters for the GP
# x: nxp matrix of input values
# y: nx1 vector of observed response values
#
# Output:
# Scalar value of the likelihood function
#
# Requires:
# - getCorrMat()
# - See above
# - mvtnorm R package
#
likelihoodFunction <- function(scale, x, y){
    y <- as.matrix(y)
    corrMat <- getCorrMat(x, scale)
    invcMat <- solve(corrMat)
    # Plug-in estimate of the GP variance given the scale (zero-mean GP)
    sigsq <- as.numeric(t(y) %*% invcMat %*% y / length(y))
    y <- c(y)
    # Use dmvnorm presuming it is optimized well
    return(dmvnorm(y, mean = rep(0, length(y)), sigma = sigsq * corrMat))
}
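Since dmvnorm() comes from the mvtnorm package, that package must be loaded before the likelihood is evaluated. As a small illustration (the trial scale values here are arbitrary), we can compare the likelihood at two candidate scales:
library(mvtnorm)
# Evaluate the likelihood at two arbitrary trial values of the scale parameter
likelihoodFunction(1, x = xs, y = fs)
likelihoodFunction(5, x = xs, y = fs)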
With the functions defined above, we can find the MLE for the scale parameter of the GP model with this single input and use that value to obtain predicted values for the “simulator” response at locations in the input space where we have not evaluated the function.
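For the zero-mean formulation used in the likelihood above, the quantities computed below are the usual kriging predictor and variance,
\[
\hat{f}(x^*) = r(x^*)^{T} R^{-1} y, \qquad
s^2(x^*) = \hat{\sigma}^2 \left[ 1 - r(x^*)^{T} R^{-1} r(x^*) + \frac{\left(1 - \mathbf{1}^{T} R^{-1} r(x^*)\right)^2}{\mathbf{1}^{T} R^{-1} \mathbf{1}} \right],
\]
where \( R \) is the correlation matrix of the design points, \( r(x^*) \) is the vector of correlations between the new input \( x^* \) and the design points, and \( \hat{\sigma}^2 = y^{T} R^{-1} y / n \).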
# Obtain the MLE for the GP scale parameter.
# Using R's optimize() function for now as it's quick.
scale <- optimize(f = likelihoodFunction, interval = c(1e-05, 10), maximum = TRUE,
    x = xs, y = fs)$maximum
# Obtain correlation matrix for the GP using the MLE for the scale
corrMat <- getCorrMat(xs, scale)
invCorrMat <- solve(corrMat)
# Obtain the expected value of the predicted response across the input space
predX <- seq(-2, 4, length = 100)
predCorr <- sapply(predX, getCorrVec, Xmat = xs, scale = scale)
predf <- t(predCorr) %*% (invCorrMat %*% fs)
# Estimating the variance as in Jones, Schonlau, and Welch
estVar <- t(fs) %*% invCorrMat %*% fs/length(xs)
ones <- matrix(1, nrow = length(xs), ncol = 1)
predVar <- estVar[1] * (1 - as.matrix(diag(t(predCorr) %*% invCorrMat %*% predCorr)) +
    (1 - t(t(ones) %*% invCorrMat %*% predCorr))^2/as.numeric(t(ones) %*% invCorrMat %*% ones))
# Truncate any small negative variances arising from numerical error
predSE <- sqrt(apply(cbind(predVar, rep(0, length(predX))), 1, max))
We can plot the resulting comparison between the true “simulator” values and the predictions from the expectation of the GP model, including a pointwise 95\% interval at each prediction location.
plot(xs, fs, xlim = c(-2, 4), ylim = c(0, 0.2))
points(predX, dgamma(predX + 2.2, shape = 1.4, scale = 3) + 0.1 * dgamma(predX +
2.2, shape = 1.4, scale = 3) * sin(2 * pi * predX/1.5), lwd = 2, type = "l")
points(predX, predf, type = "l", col = rgb(1, 0, 0, 1))
points(predX, predf - 1.96 * predSE, type = "l", col = rgb(1, 0, 0, 1), lty = 2)
points(predX, predf + 1.96 * predSE, type = "l", col = rgb(1, 0, 0, 1), lty = 2)
As Jones, Schonlau, and Welch note, their approach does not account for uncertainty in the GP model parameters. Following Higdon, and Kennedy and O'Hagan, the approach above can be extended to a Bayesian framework.
To do this we sample from the posterior using the Metropolis-Hastings (MH) MCMC algorithm. Defined below are the prior for the scale parameter, the proposal distribution (with its transition densities) for the MH algorithm, and the MCMC function itself.
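For reference, a proposal \( \theta' \) drawn from \( q(\theta' \mid \theta) \) is accepted with probability
\[
\alpha = \min\left\{ 1, \; \frac{\pi(\theta' \mid y)\, q(\theta \mid \theta')}{\pi(\theta \mid y)\, q(\theta' \mid \theta)} \right\},
\]
which is what the acceptance step in metropolisMCMC() below computes on the log scale; the two transition densities come from proposalFunction().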
########################
# Obtain the value of the prior for a given parameter value
#
# Input:
# param: given value for the GP scale parameter
#
# Output:
# Scalar value of the log of the prior density at this point
#
# Requires:
#
priorFunction <- function(param){
    # Log-normal prior on the scale parameter, from Higdon 2004
    return(dlnorm(param[1], sdlog = 2, log = TRUE))
}
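To get a feel for how diffuse this prior is (a quick illustration, not part of the original code), we can plot its density over a range of scale values:
# Visualize the lognormal(0, 2) prior density for the scale parameter
curve(dlnorm(x, sdlog = 2), from = 0.001, to = 10,
      xlab = "scale", ylab = "prior density")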
########################
# Obtain new proposal values for Metropolis-Hastings algorithm
#
# Input:
# location: current value for the GP scale parameter
#
# Output:
# list with the new proposed value for the GP scale parameter
# and the transition probability in each direction
#
# Requires:
#
proposalFunction <- function(location){
    scale <- 0.35
    # Log-normal random walk centered at the current value (on the log scale)
    prop <- rlnorm(length(location), meanlog = log(location), sdlog = scale)
    # Transition densities: q(current | proposal) and q(proposal | current)
    pps <- c(dlnorm(location, log(prop), scale), dlnorm(prop, log(location), scale))
    return(list(proposal = prop, propProbs = pps))
}
########################
# Simple Metropolis Hastings Algorithm
#
# Input:
# startvalue: parameter vector to initialize the Markov Chain
# iterations: number of total iterations to run for the chain
# inputData: inputs or predictors for the probabilistic model
# outputData: outputs or responses for the probabilistic model
# diagnostics: Logical dictating if diagnostic output should be displayed
#
# Output:
# chain: samples from the Markov chain generated by the MH algorithm
#
# Requires:
# - proposalFunction(),priorFunction(),likelihoodFunction()
# - Proposal function - proposalFunction()
# - Inputs: current chain parameter vector
# - Outputs: List with proposed parameter vector ("proposal")
# and the two transition probabilities ("propProbs")
#
# - Prior function - priorFunction()
# - Inputs: proposed parameter vector
# - Outputs: log of the prior probability of the proposed parameter vector
#
# - Likelihood function - likelihoodFunction()
# - Inputs: Proposed parameter vector, model input (predictor) data, model output (response) data
# - Output: value of the likelihood function for the observed data at the proposed parameter values
#
metropolisMCMC <- function(startvalue, iterations, inputData, outputData, diagnostics = F){
    # Initialize arrays to hold the chain and its unnormalized log posterior values
    chain = array(dim = c(iterations + 1, length(startvalue)))
    oldprob = array(dim = c(iterations + 1))
    chain[1,] = startvalue
    oldprob[1] = -Inf
    i = 1
    # initialize onscreen progress bar
    tb <- txtProgressBar(min = 1, max = iterations, initial = 1)
    while (i <= iterations){
        # Obtain proposed parameter values and the transition densities
        temp = proposalFunction(chain[i,])
        proposal = temp$proposal
        proposalProbs = temp$propProbs
        # Obtain the log prior probability of the proposed parameter
        priors = priorFunction(proposal)
        # Combine log prior and log likelihood to obtain the unnormalized log posterior
        probab = priors + log(likelihoodFunction(proposal, inputData, outputData))
        # Accept or reject the proposal according to the M-H ratio (on the log scale)
        if (probab - oldprob[i] + log(proposalProbs[1]) - log(proposalProbs[2]) > log(runif(1))){
            chain[i+1,] = proposal
            oldprob[i+1] = probab
        } else {
            chain[i+1,] = chain[i,]
            oldprob[i+1] = oldprob[i]
        }
        i = i + 1
        setTxtProgressBar(tb, i)
    }
    close(tb)
    # Print the fraction of unique chain values as an estimate of the acceptance rate
    print(length(unique(as.matrix(chain[,1])))/length(as.matrix(chain[,1])))
    if (diagnostics){
        plot(oldprob, main = "Unnormalized Posterior Probability at Each Step", type = "l", lwd = 2)
    }
    return(chain)
}
Before carrying out the inference to obtain samples from the posterior distribution, we can first look at a sample of GP paths from the prior.
### Plot 4 sample paths from the Prior
samp4 <- rlnorm(4, sdlog = 2)
plot(xs, fs, xlim = c(-2, 4), ylim = c(0, 0.2))
points(predX, dgamma(predX + 2.2, shape = 1.4, scale = 3) + 0.1 * dgamma(predX +
2.2, shape = 1.4, scale = 3) * sin(2 * pi * predX/1.5), lwd = 2, type = "l")
for (i in 1:4) {
    corrMat <- getCorrMat(xs, samp4[i])
    invCorrMat <- solve(corrMat)
    predCorr <- sapply(predX, getCorrVec, Xmat = xs, scale = samp4[i])
    predf <- t(predCorr) %*% (invCorrMat %*% fs)
    estVar <- t(fs) %*% invCorrMat %*% fs/length(xs)
    predVar <- estVar[1] * (1 - as.matrix(diag(t(predCorr) %*% invCorrMat %*% predCorr)) +
        (1 - t(t(ones) %*% invCorrMat %*% predCorr))^2/as.numeric(t(ones) %*% invCorrMat %*% ones))
    predSE <- sqrt(apply(cbind(predVar, rep(0, length(predX))), 1, max))
    pathCol <- rgb((i - 1)/3, 0, 1 - (i - 1)/3, 0.75)
    points(predX, predf, type = "l", col = pathCol)
    points(predX, predf - 1.96 * predSE, type = "l", col = pathCol, lty = 2)
    points(predX, predf + 1.96 * predSE, type = "l", col = pathCol, lty = 2)
}
We can see a great deal of variability in these potential fits. We can condition on the observed data to obtain posterior samples using the MH algorithm above.
# Obtain samples simple Bayesian 1D GP model:
mcmcSamples <- metropolisMCMC(startvalue = c(scale), iterations = 5000, inputData = xs,
outputData = fs)
## [1] 0.5203
The printed value above is the fraction of accepted proposals, roughly 52\% here. Below we plot 4 sample paths using draws from the posterior distribution of the GP scale parameter, as well as the model fit at the posterior median of the GP parameter.
# Plot 4 sample paths from posterior draws
samp4 <- sample(mcmcSamples, size = 4)
plot(xs, fs, xlim = c(-2, 4), ylim = c(0, 0.2))
points(predX, dgamma(predX + 2.2, shape = 1.4, scale = 3) + 0.1 * dgamma(predX +
2.2, shape = 1.4, scale = 3) * sin(2 * pi * predX/1.5), lwd = 2, type = "l")
for (i in 1:4) {
    corrMat <- getCorrMat(xs, samp4[i])
    invCorrMat <- solve(corrMat)
    predCorr <- sapply(predX, getCorrVec, Xmat = xs, scale = samp4[i])
    predf <- t(predCorr) %*% (invCorrMat %*% fs)
    estVar <- t(fs) %*% invCorrMat %*% fs/length(xs)
    predVar <- estVar[1] * (1 - as.matrix(diag(t(predCorr) %*% invCorrMat %*% predCorr)) +
        (1 - t(t(ones) %*% invCorrMat %*% predCorr))^2/as.numeric(t(ones) %*% invCorrMat %*% ones))
    predSE <- sqrt(apply(cbind(predVar, rep(0, length(predX))), 1, max))
    pathCol <- rgb((i - 1)/3, 0, 1 - (i - 1)/3, 0.75)
    points(predX, predf, type = "l", col = pathCol)
    points(predX, predf - 1.96 * predSE, type = "l", col = pathCol, lty = 2)
    points(predX, predf + 1.96 * predSE, type = "l", col = pathCol, lty = 2)
}
# Plot sample path from posterior median:
corrMat <- getCorrMat(xs, median(mcmcSamples))
invCorrMat <- solve(corrMat)
predCorr <- sapply(predX, getCorrVec, Xmat = xs, scale = median(mcmcSamples))
predf <- t(predCorr) %*% (invCorrMat %*% fs)
estVar <- t(fs) %*% invCorrMat %*% fs/length(xs)
predVar <- estVar[1] * (1 - as.matrix(diag(t(predCorr) %*% invCorrMat %*% predCorr)) +
    (1 - t(t(ones) %*% invCorrMat %*% predCorr))^2/as.numeric(t(ones) %*% invCorrMat %*% ones))
predSE <- sqrt(apply(cbind(predVar, rep(0, length(predX))), 1, max))
plot(xs, fs, xlim = c(-2, 4), ylim = c(0, 0.2))
points(predX, dgamma(predX + 2.2, shape = 1.4, scale = 3) + 0.1 * dgamma(predX +
    2.2, shape = 1.4, scale = 3) * sin(2 * pi * predX/1.5), lwd = 2, type = "l")
# Use a fixed color here rather than the leftover loop index i from the plot above
points(predX, predf, type = "l", col = rgb(1, 0, 0, 1))
points(predX, predf - 1.96 * predSE, type = "l", col = rgb(1, 0, 0, 1), lty = 2)
points(predX, predf + 1.96 * predSE, type = "l", col = rgb(1, 0, 0, 1), lty = 2)
Lastly, a couple of plots show the distribution of the posterior draws and the MCMC trace plot.
par(mfrow = c(1, 2))
hist(mcmcSamples, breaks = 20)
plot(mcmcSamples, type = "l")
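Beyond these plots, a few standard MCMC diagnostics could be run on the chain; a minimal sketch, assuming the coda package is installed, might look like:
library(coda)
# Convert the chain to an mcmc object and summarize mixing
scaleChain <- as.mcmc(mcmcSamples[, 1])
effectiveSize(scaleChain)   # effective sample size of the 5001 draws
autocorr.plot(scaleChain)   # autocorrelation at increasing lags
summary(scaleChain)         # posterior mean, SD, and quantiles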