Hidden Markov Models

Many movement models proposed in recent years belong to the class of hidden Markov models (HMMs). An HMM is a time series model that comprises two components, an observable series and an underlying, non-observable state sequence (just as we saw with the mixture model). The observed data are taken to be conditionally independent given the states and are generated by so-called state-dependent distributions.

The state sequence is modeled as a Markov process usually assumed to be of first order, which means that the probability of state occurrences at time \(t+1\) depends only on which state the chain is in at time \(t\). That is, for an HMM with \(K\) states there is a \(K\) by \(K\) transition probability matrix \(\Gamma\) with \(\gamma_{i,j} = p(z_{t+1}=j \mid z_t=i)\). For the matrix to properly define a probability distribution, each row must lie on the simplex (i.e. its components must be non-negative and add up to \(1\)).

Using our latent indicator variable approach for the case of two states, this can be written as:

\[ \begin{equation} z_t \sim \begin{cases} \text{Bern}(p_1) &\mbox{if } z_{t-1} = 1 \\ \text{Bern}(p_2) &\mbox{if } z_{t-1} = 0 \end{cases}\; . \end{equation} \]

A consequence of this formulation is that the amount of time \(D_n\) spent in a given state \(n\) (before switching to another state) is a random variable that follows a geometric distribution with parameter \(1 - \gamma_{n,n}\).

We also have to define the initial state distribution \(\boldsymbol\delta_n = p(z_1 = n)\).

Let’s simulate a trajectory

library(CircStats)

# Colours used to draw state 1 and state 2
pal <- c("#f03b20", "#2ca25f")

set.seed(123)
T <- 300           # number of time steps (name kept: later chunks refer to T)
a <- c(1, 5)       # scale for Weibull (state 1, state 2)
b <- c(1, 2)       # shape for Weibull
m <- c(0, 0)       # circular mean for turns
rho <- c(0.1, 0.8) # rho of the wrapped Cauchy used for turns
p <- c(0.08, 0.86) # prob of z_t = 2 given z_t-1 = 1 and 2

MU <- matrix(0, T, 2) # simulated locations, one row per time step
z <- numeric(T)       # simulated states, coded 1 / 2

phi <- runif(1, 0, 2 * pi) # initial movement direction

# Initial state. The original passed the whole vector `p` as `prob`; with a
# single draw only its first element is used, so `p[1]` is what was always
# applied. Stating it explicitly leaves the random stream unchanged.
z[1] <- rbinom(1, size = 1, prob = p[1]) + 1

for (t in 2:T) {
  # State at time t depends only on the state at t - 1
  z[t] <- rbinom(1, size = 1, prob = p[z[t - 1]]) + 1

  # Turning angle for the current state, mapped into (-pi, pi]
  tmp <- rwrpcauchy(1, location = m[z[t]], rho = rho[z[t]])
  if (tmp > pi) {
    tmp <- tmp - 2 * pi
  }
  if (tmp < -pi) {
    tmp <- tmp + 2 * pi
  }
  phi <- phi + tmp # the heading accumulates the turns

  # Step length for the current state, then the displacement along phi
  s <- rweibull(1, shape = b[z[t]], scale = a[z[t]])
  move <- s * c(Re(exp((0+1i) * phi)), Im(exp((0+1i) * phi)))
  MU[t, ] <- MU[t - 1, ] + move
}

# Draw the whole path once to set up the axes, then overlay every step
# coloured by the state that generated it.
plot(MU, type = "l", asp = 1, xlab = "", ylab = "", lwd = 1,
     cex.lab = 1.2, bty = "n", las = 1)
for (k in seq_len(T - 1)) {
  # Step k joins locations k and k + 1 and carries the state of time k + 1
  lines(MU[c(k, k + 1), ], col = pal[z[k + 1]], lwd = 2)
}

Let’s look at the time series of states. We can see that there is some “structure” and that switches between states are not too frequent.

# Time series of the simulated states, shown as 0 / 1.
# `mar` can only be set through par(); passed to plot() it is not applied and
# only raises a warning. Set it with par() and restore the previous settings
# afterwards, as the covariate example further down does.
op <- par(mar = c(5, 4, 8, 1))
plot(z - 1, type = "s", xlab = "time", ylab = "z", yaxt = "n",
     cex.lab = 1.2, bty = "n", las = 1)
axis(side = 2, at = c(0, 1))
par(op)

Now we fit an HMM (see hmm.stan). For more on HMMs in Stan see Leos-Barajas and Michelot (2018).

library(momentuHMM)
library(rstan)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

# Turn the simulated locations into steps and turning angles
datos <- momentuHMM::prepData(data.frame(MU),
                              type = "UTM",
                              coordNames = c("X1", "X2"))

# Stan data cannot contain NA: flag missing steps / angles with -100
datos$step <- replace(datos$step, is.na(datos$step), -100)
datos$angle <- replace(datos$angle, is.na(datos$angle), -100)
datos$ID <- as.numeric(datos$ID)

# Data list handed to the Stan program
stan.data <- list(
  T = nrow(datos),
  N = 2,
  ID = datos$ID,
  steps = datos$step,
  turns = datos$angle,
  lb = 0.85
)

fit <- stan(
  file = "Stan_code/hmm.stan",
  data = stan.data,
  iter = 1000,
  thin = 1,
  chains = 3
)

As usual, we need to check that the chains have converged (Rhat \(< 1.1\)), and that we have a decent sample size of the posteriors (n_eff). Then we can look at the posteriors for the parameters governing steps and turns

# Posterior summaries (including n_eff and Rhat) for the movement parameters
# and the transition probabilities
fit.pars <- c("mu", "rho", "shape", "scale", "g")
print(fit, pars = fit.pars)

## Inference for Stan model: hmm.
## 3 chains, each with iter=1000; warmup=500; thin=1; 
## post-warmup draws per chain=500, total post-warmup draws=1500.
## 
##          mean se_mean   sd  2.5%   25%  50%  75%  98% n_eff Rhat
## mu[1]    0.41    0.02 0.61 -0.80  0.03 0.39 0.76 1.79  1062    1
## mu[2]    0.01    0.00 0.04 -0.07 -0.01 0.02 0.04 0.10  1724    1
## rho[1]   0.11    0.00 0.04  0.03  0.08 0.11 0.14 0.19   897    1
## rho[2]   0.76    0.00 0.03  0.70  0.75 0.77 0.79 0.82  1407    1
## shape[1] 0.95    0.00 0.05  0.86  0.92 0.95 0.99 1.06   852    1
## shape[2] 1.89    0.01 0.18  1.56  1.77 1.89 2.01 2.26  1183    1
## scale[1] 0.87    0.00 0.07  0.73  0.82 0.86 0.91 1.01  1214    1
## scale[2] 5.76    0.01 0.37  5.05  5.52 5.76 6.01 6.47  1360    1
## g[1,1]   0.93    0.00 0.02  0.89  0.92 0.93 0.95 0.96  1865    1
## g[1,2]   0.07    0.00 0.02  0.04  0.05 0.07 0.08 0.11  1865    1
## g[2,1]   0.14    0.00 0.04  0.07  0.11 0.14 0.17 0.22  1542    1
## g[2,2]   0.86    0.00 0.04  0.78  0.83 0.86 0.89 0.93  1542    1
## 
## Samples were drawn using NUTS(diag_e) at Thu Oct  8 08:10:03 2020.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

Let’s plot the posteriors

# Posterior densities with 50% intervals shaded.
# `pars` is documented as a character vector of parameter names, so use c()
# rather than list(), matching the print() call above.
plot(fit, pars = c("mu", "rho", "shape", "scale", "g"),
     show_density = TRUE, ci_level = 0.5, fill_color = "gray")

We can compare posteriors to true parameter values

m.pars <- rstan::extract(fit, pars = c("mu", "rho", "scale", "shape", "g"))

# Posterior density of one parameter with the simulation value as a vertical line
compare_to_truth <- function(draws, truth, label) {
  plot(density(draws), main = "", xlab = label)
  abline(v = truth, lwd = 2)
}

op <- par(mar = c(5, 4, 1, 2) + 0.1, cex.lab = 1.2, bty = "n", las = 1)
layout(matrix(1:10, 2, 5, byrow = TRUE))

# Turning angles: mean and rho per state
compare_to_truth(m.pars$mu[, 1], m[1], expression(mu[1]))
compare_to_truth(m.pars$mu[, 2], m[2], expression(mu[2]))
compare_to_truth(m.pars$rho[, 1], rho[1], expression(rho[1]))
compare_to_truth(m.pars$rho[, 2], rho[2], expression(rho[2]))

# Step lengths: Weibull scale and shape per state
compare_to_truth(m.pars$scale[, 1], a[1], "scale")
compare_to_truth(m.pars$scale[, 2], a[2], "scale")
compare_to_truth(m.pars$shape[, 1], b[1], "shape")
compare_to_truth(m.pars$shape[, 2], b[2], "shape")

# Transitions into state 2; p holds P(z_t = 2 | z_t-1 = 1) and P(z_t = 2 | z_t-1 = 2)
compare_to_truth(m.pars$g[, 1, 2], p[1], "g 1 to 2")
compare_to_truth(m.pars$g[, 2, 2], p[2], "g 2 to 2")

# Restore the single-panel layout and the previous graphical parameters so
# later base plots are not drawn into the 2 x 5 grid
layout(1)
par(op)

The classified trajectory

# ggplot() is used below but ggplot2 is never attached explicitly in this
# document; load it here rather than relying on another package attaching it.
library(ggplot2)

psam <- rstan::extract(fit, pars = c("mu", "rho", "scale", "shape", "viterbi"))

# Average the decoded state over posterior draws, one value per row of datos
states <- colMeans(psam$viterbi)

# Track coloured by the mean decoded state
ggplot(datos, aes(x, y, group = ID, col = states)) +
  geom_point(size = 0.5) +
  geom_path(size = 0.5) +
  coord_equal()

Compare true states to estimated ones

# z[-1] has T - 1 elements, so exactly one element of `states` must be dropped
# to pair them. The hard-coded -200 was the last index in the T = 200 covariate
# example; with T = 300 it removed an interior element and shifted every pair
# from index 200 onwards. Drop the last element regardless of T instead.
plot(jitter(z[-1]), head(states, -1),
     col = gray(0.5, 0.5), pch = 16,
     xlab = "jittered z", ylab = "mean posterior state",
     cex.lab = 1.2, bty = "n", las = 1)

Diagnostics

Let’s start by looking at the density of step and turn distributions. As we know the true parameters for these, we can add them to the plots.

# restore NAs (the -100 flags written before fitting)
datos$step[datos$step < 0] <- NA
datos$angle[datos$angle < (-pi)] <- NA

# unpack posterior draws (one row per draw, one column per state)
shape.pos <- rstan::extract(fit, pars = "shape")$shape
scale.pos <- rstan::extract(fit, pars = "scale")$scale
mu.pos <- rstan::extract(fit, pars = "mu")$mu
rho.pos <- rstan::extract(fit, pars = "rho")$rho

# indices of posterior draws to plot (thinned for visualisation purposes)
ind <- seq(1, nrow(shape.pos), by = 5)

# plot step length densities
# (`length.out` spelled out instead of relying on partial matching of `length`)
stepgrid <- seq(min(datos$step, na.rm = TRUE),
                max(datos$step, na.rm = TRUE), length.out = 100)

plot(NA, xlim = c(0, 10), ylim = c(0, 1.1),
     xlab = "step length", ylab = "density")

# one faint curve per retained draw and state
for (i in ind) {
  lines(stepgrid, dweibull(stepgrid, shape = shape.pos[i, 1], scale = scale.pos[i, 1]),
        lwd = 0.2, col = adjustcolor(pal[1], alpha.f = 0.1))
  lines(stepgrid, dweibull(stepgrid, shape = shape.pos[i, 2], scale = scale.pos[i, 2]),
        lwd = 0.2, col = adjustcolor(pal[2], alpha.f = 0.1))
}

# true step length densities used in the simulation
lines(stepgrid, dweibull(stepgrid, shape = b[1], scale = a[1]), lwd = 2)
lines(stepgrid, dweibull(stepgrid, shape = b[2], scale = a[2]), lwd = 2)

# plot turning angle densities
anglegrid <- seq(-pi, pi, length.out = 100)
plot(NA, xlim = c(-pi, pi), ylim = c(0, 1.6),
     xlab = "turning angle", ylab = "density")

# same set of draws as for the step lengths (the original skipped the first
# retained draw here only)
for (i in ind) {
  lines(anglegrid, dwrpcauchy(anglegrid, mu = mu.pos[i, 1], rho = rho.pos[i, 1]),
        lwd = 0.2, col = adjustcolor(pal[1], alpha.f = 0.1))
  lines(anglegrid, dwrpcauchy(anglegrid, mu = mu.pos[i, 2], rho = rho.pos[i, 2]),
        lwd = 0.2, col = adjustcolor(pal[2], alpha.f = 0.1))
}

# true turning angle densities used in the simulation
lines(anglegrid, dwrpcauchy(anglegrid, mu = m[1], rho = rho[1]), lwd = 2)
lines(anglegrid, dwrpcauchy(anglegrid, mu = m[2], rho = rho[2]), lwd = 2)

Now a posterior predictive check on the autocorrelation in step lengths

library(coda)
# ggplot() is used below but ggplot2 is never attached explicitly in this document
library(ggplot2)

psam <- rstan::extract(fit, pars = c("mu", "rho", "scale", "shape", "viterbi"))

steps <- datos$step

n.sims <- nrow(psam$mu)

# Simulated step lengths: one row per posterior draw, one column per time step
ppsteps <- matrix(NA, n.sims, T)
#ppturns <- matrix(NA,n.sims,T)

# The draws are indexed straight from `psam`. The original copied them into
# globals named z, rho, mu, scale and shape, which overwrote the simulated
# states and true parameters (and masked base::scale).
for (i in seq_len(n.sims)) {
  z.draw <- psam$viterbi[i, ] # decoded state sequence for draw i
  ppsteps[i, ] <- rweibull(T,
                           shape = psam$shape[i, z.draw],
                           scale = psam$scale[i, z.draw])
#  ppturns[i,] = rwrpcauchy(T, location  = (mu[i,z]), rho = rho[i,z])
}

# Autocorrelation
nlags <- 61 # lags 0 to 60
# observed ACF; the first and last entries of `steps` are left out, as before
oac <- acf(steps[2:(T - 1)], lag.max = (nlags - 1), plot = FALSE)

# ACF of every simulated series
ppac <- matrix(NA, n.sims, nlags)
for (i in seq_len(n.sims)) {
  ppac[i, ] <- acf(ppsteps[i, ], lag.max = (nlags - 1), plot = FALSE)$acf
}

hpd <- HPDinterval(as.mcmc(ppac), prob = 0.9)

# Plot against the actual lag (0 to nlags - 1). The original used the index
# 1:61 on an axis labelled "Lag", so every point sat one unit to the right.
# Lag 0 is always 1 and is kept out of view; lags 1 to nlags - 1 are shown.
dat <- data.frame(lag = seq_len(nlags) - 1, acf = as.numeric(oac$acf),
                  lb = hpd[, 1], ub = hpd[, 2])
ggplot(dat, aes(lag, acf)) +
  geom_ribbon(aes(ymin = lb, ymax = ub), fill = "grey70", alpha = 0.5) +
  geom_point(col = "black", size = 1) +
  geom_line() +
  coord_cartesian(xlim = c(1, nlags - 1), ylim = c(-0.2, 0.7)) +
  xlab("Lag") + ylab("ACF") +
  ggtitle("Observed Autocorrelation
          with 90% CI for ACF of Predicted Quantities")

HMM with covariates

library(raster)
set.seed(12)

# Square raster of side x side cells filled with uniform noise
side <- 300
E <- raster(nrows = side, ncols = side,
            xmn = 0, xmx = side, ymn = 0, ymx = side)
E[] <- runif(side * side, -8, 16)

# Two passes of a 7 x 7 moving-window mean
for (pass in 1:2) {
  E <- focal(E, w = matrix(1, 7, 7), fun = mean)
}

# Standardise with the mean and sd over all cells
E.m <- cellStats(E, "mean")
E.sd <- cellStats(E, "sd")
E.scale <- (E - E.m) / E.sd
plot(E.scale)

T <- 200            # number of time steps (name kept: later chunks refer to T)
a <- c(1, 5)        # scale for Weibull (state 1, state 2)
b <- c(1, 2)        # shape for Weibull
m <- c(pi, 0)       # circular mean for turns
rho <- c(0.5, 0.85) # rho of the wrapped Cauchy used for turns
p <- 0.2            # prob of state 2: initial value, then updated from env
b0 <- c(-2, -0.5)   # logit-scale intercepts, indexed by the previous state
b1 <- c(0, 2)       # logit-scale slopes on env, indexed by the previous state

MU <- matrix(0, T, 2) # simulated locations
z <- numeric(T)       # simulated states, coded 1 / 2
env <- numeric(T)     # covariate value at each location

phi <- runif(1, 0, 2 * pi) # initial movement direction
z[1] <- rbinom(1, size = 1, prob = p) + 1
MU[1, ] <- side / 2 # start at the centre of the raster

set.seed(12)
for (t in 2:T) {
  # Covariate at the current location drives the probability of state 2
  env[t - 1] <- raster::extract(E.scale, matrix(MU[t - 1, ], 1, 2))
  if (!is.na(env[t - 1])) {
    p <- plogis(b0[z[t - 1]] + b1[z[t - 1]] * env[t - 1])
  } # when the value is NA, the previous p is reused
  z[t] <- rbinom(1, size = 1, prob = p) + 1

  # Turning angle for the current state, mapped into (-pi, pi]
  tmp <- rwrpcauchy(1, location = m[z[t]], rho = rho[z[t]])
  if (tmp > pi) {
    tmp <- tmp - 2 * pi
  }
  if (tmp < -pi) {
    tmp <- tmp + 2 * pi
  }
  phi <- phi + tmp

  # Step length for the current state, then the displacement along phi
  s <- rweibull(1, shape = b[z[t]], scale = a[z[t]])
  move <- s * c(Re(exp((0+1i) * phi)), Im(exp((0+1i) * phi)))
  MU[t, ] <- MU[t - 1, ] + move
}

# The loop only fills env[1:(T - 1)]; without this line env[T] keeps its
# initial 0 and is handed to the model as if it were a real covariate value.
# No random numbers are drawn here, so the simulated track is unchanged.
env[T] <- raster::extract(E.scale, matrix(MU[T, ], 1, 2))
# Track over the standardised covariate surface, each step coloured by state
op <- par(cex.lab = 1.2, font.lab = 1, cex.axis = 1, bty = "n", las = 1)
plot(E.scale)
#lines(MU, type = "l", asp = 1, xlab="", ylab="", lwd=1) #, xaxt='n', yaxt='n')
for (k in seq_len(T - 1)) {
  # Step k joins locations k and k + 1 and carries the state of time k + 1
  lines(MU[c(k, k + 1), ], col = pal[z[k + 1]], lwd = 2)
}

# put the previous graphical parameters back
par(op)

Plot the time series of states

# Step plot of the simulated states (shown as 0 / 1) with a larger top margin
op <- par(
  cex.lab = 1.2, font.lab = 2, cex.axis = 1,
  bty = "n", las = 1,
  mar = c(5, 4, 8, 1)
)
plot(x = seq_along(z), y = z - 1, type = "s",
     xlab = "time", ylab = "z", yaxt = "n")
axis(side = 2, at = c(0, 1)) # only the two state values on the y axis

# put the previous graphical parameters back
par(op)

Fit the model

library(rstan)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

# Turn the simulated locations into steps and turning angles
datos <- momentuHMM::prepData(data.frame(MU),
                              type = "UTM",
                              coordNames = c("X1", "X2"))

# Stan data cannot contain NA: flag missing steps / angles with -100
datos$step <- replace(datos$step, is.na(datos$step), -100)
datos$angle <- replace(datos$angle, is.na(datos$angle), -100)
datos$ID <- as.numeric(datos$ID)

# Data list handed to the Stan program; covs is a column of ones next to env
stan.data <- list(
  T = nrow(datos),
  N = 2,
  ID = datos$ID,
  steps = datos$step,
  turns = datos$angle,
  nCovs = 1,
  covs = cbind(rep(1, nrow(datos)), env),
  lb = 0.8
)

fit <- stan(
  file = "Stan_code/sw.stan",
  data = stan.data,
  iter = 1000,
  chains = 3,
  seed = 1234,
  control = list(adapt_delta = 0.95)
)

Before anything, we check that the chains have converged (r-hat < 1.1) and look at the posteriors for the steps and turns

# Rhat for all parameters
plot(fit, plotfun = "rhat")

# Posterior densities with 50% intervals shaded.
# `pars` is documented as a character vector of parameter names, so use c()
# rather than list().
plot(fit, pars = c("mu", "rho", "shape", "scale", "beta"),
     show_density = TRUE, ci_level = 0.5, fill_color = "gray")

The classified trajectory

# ggplot() is used below but ggplot2 is never attached explicitly in this
# document; load it here rather than relying on another package attaching it.
library(ggplot2)

psam <- rstan::extract(fit, pars = c("mu", "rho", "scale", "shape", "viterbi"))

# Average the decoded state over posterior draws, one value per row of datos
states <- colMeans(psam$viterbi)

# Track coloured by the mean decoded state
ggplot(datos, aes(x, y, group = ID, col = states)) +
  geom_point(size = 0.5) +
  geom_path(size = 0.5) +
  coord_equal()

Compare true states to estimated ones

op <- par(cex.lab = 1.2, font.lab = 1, cex.axis = 1, bty = "n", las = 1)

# z[-1] has T - 1 elements, so the last element of `states` is dropped to pair
# them. head(states, -1) equals the former states[-200] when `states` has 200
# elements (T = 200 here) and keeps working if T is changed.
plot(jitter(z[-1]), head(states, -1),
     col = gray(0.5, 0.5), pch = 16,
     xlab = "jittered z", ylab = "mean posterior state")

# put the previous graphical parameters back
par(op)

Exercise

Fit a 3-state HMM to the simulated 2-state CRW.
Plot the distributions fitted to the different states
Do a posterior predictive check on the step length autocorrelation

References

Leos-Barajas, V., and T. Michelot. 2018. An introduction to animal movement modeling with hidden Markov models using Stan for Bayesian inference.

HMMs

jmm

10/27/2020

Hidden Markov Models

Diagnostics

HMM with covariates

Exercise

References