Introduction

The MFIT trial aims to identify the treatment regimen that best reduces fatigue in people with kidney failure. There are 4 treatment arms, each a different exercise regimen. Participants are allocated to a treatment arm and then followed up over 9 months. The outcome is the FACIT-F score (0-52), which we treat as continuous; higher scores indicate less fatigue.

Data simulation

We will use the following libraries and settings.

library(data.table)
library(ggplot2)
library(rstan)
rstan::rstan_options(auto_write = TRUE)
Sys.setenv(LOCAL_CPPFLAGS = '-march=native')
options(mc.cores = 1)

The data model includes the grand mean at baseline, a time effect, and a treatment-by-time interaction for each arm. When analysing the data, we will apply a sum-to-zero constraint on the treatment-by-time interactions at each timepoint. It is not necessary to enforce the sum-to-zero constraint in the data generation process. The following simulates data arising from the trial.
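In symbols, the generative model that getdata implements below is roughly (a sketch; i indexes participants, j timepoints, k arms):

$$y_{ij} = \alpha + u_i + b_j + b_{j k(i)} + e_{ij}, \qquad u_i \sim N(0, \sigma_u^2), \quad e_{ij} \sim N(0, \sigma_e^2)$$

with $b_0 = 0$ and $b_{j1} = 0$, so at baseline only the grand mean and the participant offset apply, and SOC (arm 1) is the referent at every timepoint.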

#' Simulate data from MFIT trial
#'
#' @description 
#'  We have baseline (0) and 1, 2, 3 and 9 months follow-up.
#'  The primary analysis looks at the 3-month follow-up.
#'  There are potentially time and treatment effects.
#'  Naturally, we would assume that without intervention, things get worse
#'  over time. However, being involved in a trial will likely improve things
#'  purely as a result of being in the trial.
#'  Missingness is likely to be material.
#'
#' @param N number of participants
#' @param alpha baseline fatigue score
#' @param locbj time effects (relative to time zero) default NULL implies no change
#' @param locbjk time x trt interactions (rel to soc at baseline) default NULL implies no effects
#' @param sdu participant variance component (as sd)
#' @param sde residual variance component (as sd)
#' @param pmiss probability of missingness
#'
#' @return data.table
#' @export
#'
#' @examples
getdata <- function(N = 400,
                    alpha = 35,
                    locbj = NULL, 
                    locbjk = NULL, 
                    sdu = 2, 
                    sde = 5,
                    pmiss = 0.0){
  
  # relative to baseline - time effect at points 1, 2, 3, and 9
  if(is.null(locbj)) locbj <- c(0, 0, 0, 0)

  # **NOTE**: higher FACIT-F scores are assumed to imply less fatigue!
  
  # Cols correspond to the trt effects relative to SOC at each timepoint (1, 2, 3, 9)
  if(is.null(locbjk)) {
    locbjk <- rbind(c(0, 0, 0, 0),
                    c(0, 0, 0, 0),
                    c(0, 0, 0, 0))  
  }
  
  # times, trts
  J <- length(locbj) + 1
  K <- nrow(locbjk) + 1
  
  # subject and residual variability
  # equal variance on all arms - probably unrealistic??
  if(is.null(sdu)) sdu <- 2
  if(is.null(sde)) sde <- 5
  
  # Time period index. 
  # Zero is the baseline measure, then:
  # Index 1 corresponds to month 1
  # Index 2 corresponds to month 2
  # Index 3 corresponds to month 3
  # Index 4 corresponds to month 9
  loctidx <- c(0, 1, 2, 3, 4) # index
  loctt <- c(0, 1, 2, 3, 9)   # actual time point (in months)
  
  # allocation randomisation probs
  w <- rep(1/K, K)
  cw <- cumsum(w)
  loctrt <- findInterval(runif(N), cw) + 1
  
  # missing indicator
  locmiss <- rbinom(N*length(loctidx), 1, pmiss)
  
  # data. each participant has 5 observations (baseline, months 1, 2, 3, 9) 
  d <- data.table(id = rep(1:N, each = length(loctidx)))
  
  # Design: y = alpha + u + time effect + trt x time effect + e
  d[, trt := rep(loctrt, each = length(loctidx))]
  # time index
  d[, tidx := rep(loctidx, times = N)]
  # month fu
  d[, tt := loctt[tidx + 1]]
  
  # variance components
  d[, e := rnorm(N*length(loctidx), 0, sde)]
  d[, u := rep(rnorm(N, 0, sdu), each = length(loctidx))]
  
  # missingness indicator (generated but not applied to y here;
  # with pmiss > 0, downstream analyses could drop rows where miss == 1)
  d[, miss := locmiss]
  
  d[, y := alpha + u + e ] 
  # time effects on all arms, relative to baseline
  d[tidx != 0, y := y + locbj[tidx]]
  # time x trt effects on arms 2+, relative to SOC
  for(j in 1:(J-1)){
    d[trt != 1 & tidx == j, y := y + locbjk[trt - 1, j]]  
  }

  return(d)
}
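A quick look at the long format returned (a minimal sketch; the seed is arbitrary and just for reproducibility):

set.seed(1)
head(getdata(N = 2), 10)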

Simulate a null scenario (no time or treatment effects) and summarise the data, showing means and standard deviations by trt x time:

N = 400
alpha = 35
# Null implies no time or treatment effects
locbj = NULL
locbjk = NULL
sdu = 2
sde = 5
# No missingness
pmiss = 0.0
d <- getdata(N = N, alpha = alpha, locbj = locbj, locbjk = locbjk,
             sdu = sdu, sde = sde, pmiss = pmiss)

# No change over time
dcast(d[, .(trt, tt, y = y - e - u)], trt ~ tt, fun = mean, value.var = "y")
##    trt  0  1  2  3  9
## 1:   1 35 35 35 35 35
## 2:   2 35 35 35 35 35
## 3:   3 35 35 35 35 35
## 4:   4 35 35 35 35 35
# Observed SDs: approx sqrt(sdu^2 + sde^2) = sqrt(29) ~ 5.4
dcast(d[, .(trt, tt, y)], trt ~ tt, fun = sd, value.var = "y")
##    trt        0        1        2        3        9
## 1:   1 5.500985 5.021543 5.380365 5.980276 5.193275
## 2:   2 4.816226 5.064966 5.231430 5.143437 5.546034
## 3:   3 5.967663 5.073005 5.173301 5.618985 5.874484
## 4:   4 5.507672 5.299795 5.009616 5.519428 5.326000
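As an aside, allocation uses equal randomisation weights, so arm sizes should be roughly balanced. A quick check, counting one baseline row per participant:

# participants per arm (approx N/4 each under equal weights)
d[tidx == 0, .N, keyby = trt]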

Now simulate a temporal change across all arms with no treatment effects. Scores decline over time, i.e. fatigue worsens.

# Scenario - time trend in all arms 1, 2, 3, 9
locbj <- c(-1, -2, -3, -4)

d <- getdata(locbj = locbj)
dcast(d[, .(trt, tt, y = y - u - e)], trt ~ tt, fun = mean, value.var = "y")
##    trt  0  1  2  3  9
## 1:   1 35 34 33 32 31
## 2:   2 35 34 33 32 31
## 3:   3 35 34 33 32 31
## 4:   4 35 34 33 32 31

Now assume a common declining temporal trend shared across all arms, plus a treatment effect in one arm. In the treatment x time effects matrix, rows correspond to arms 2-4 (relative to SOC) and columns to the follow-up timepoints (months 1, 2, 3 and 9).

# Time trend
locbj <- c(-1, -2, -3, -4)
# Scenario - sustained treatment effect in arm 2 only
# Rows are arms 2-4 (relative to SOC); cols are timepoints 1, 2, 3, 9
locbjk <- rbind(c(7, 8, 9, 8),
                c(0, 0, 0, 0),
                c(0, 0, 0, 0))

d <- getdata(locbj = locbj, locbjk = locbjk)
# Subtract the noise (u and e) to recover the underlying means
dcast(d[, .(trt, tt, y = y - u - e)], trt ~ tt, fun = mean, value.var = "y")
##    trt  0  1  2  3  9
## 1:   1 35 34 33 32 31
## 2:   2 35 41 41 41 39
## 3:   3 35 34 33 32 31
## 4:   4 35 34 33 32 31

And visualise.

dtmp <- d[, .(mu = mean(y)), by = .(trt, tt)]
ggplot(d, aes(x = tt, y = y)) +
  geom_line(aes(group = id, col = paste0(trt)), alpha = 0.15) + 
  geom_line(data = dtmp, aes(x = tt, y = mu, group = trt), 
             inherit.aes = FALSE) +
  geom_point(data = dtmp, aes(x = tt, y = mu), 
             inherit.aes = FALSE, size = 2) +
  scale_color_discrete("Treatment") +
  scale_x_continuous("Months", breaks = 1:9) +
  # scale_y_continuous("FACIT-F", limits = c(10, 60)) +
  geom_point(data = dtmp, aes(x = tt, y = mu, col = paste0(trt)), 
             inherit.aes = FALSE, size = 1) +
  theme_bw()

Modelling

The following Stan model is used to analyse the data. It is best to standardise the response before fitting. Note that we generally will not recover the data means exactly from a variance components model (the random intercepts shrink estimates towards the grand mean), but the approximation should be close in this situation.

functions{
  // see https://discourse.mc-stan.org/t/test-soft-vs-hard-sum-to-zero-constrain-choosing-the-right-prior-for-soft-constrain/3884/21
  vector Q_sum_to_zero_QR(int N) {
    vector [2*N] Q_r;
    for(i in 1:N) {
      Q_r[i] = -sqrt((N-i)/(N-i+1.0));
      Q_r[i+N] = inv_sqrt((N-i) * (N-i+1));
    }
    return Q_r;
  }
  vector sum_to_zero_QR(vector x_raw, vector Q_r) {
    int N = num_elements(x_raw) + 1;
    vector [N] x;
    real x_aux = 0;
    for(i in 1:N-1){
      x[i] = x_aux + x_raw[i] * Q_r[i];
      x_aux = x_aux + x_raw[i] * Q_r[i+N];
    }
    x[N] = x_aux;
    return x;
  }
  real to_real(int x) { return x; }
}
data { 
  int N; // observations
  int J; // number of fu times excl baseline
  int K; // number of treatments incl referent
  int U; // number of distinct individuals
  int id[N]; // participant id
  int time[N]; // time id
  int trt[N];  // treatment id
  real y[N];   // response
}        
transformed data {
  vector[2*K] Q_r[J] = rep_array(Q_sum_to_zero_QR(K), J);
  real x_raw_sigma[J] = rep_array(inv_sqrt(1 - inv(K)), J);
}
parameters {        
  // common baseline fatigue
  real alpha;
  // time effects (excl baseline)
  real bj[J];
  // treatment x time interactions constrained to sum
  // to zero at each time point
  // array of size J containing vectors with K-1 elements
  vector[K-1] bjk_raw[J];
  // variance components
  real<lower=0> ee;
  real<lower=0> uu;
  // subject level offset
  real u[U];
}        
transformed parameters {        
  vector[K] bjk[J];
  for(i in 1:J){
    bjk[i] = sum_to_zero_QR(bjk_raw[i], Q_r[i]);
  }
}        
model {        
  real mu[N];
  target += normal_lpdf(alpha | 0, 10);    
  for(j in 1:J){
    // this implies that bjk_raw[j] ~ N(0, 1/sqrt(1 - 1/K))
    target += normal_lpdf(bjk_raw[j] | 0, x_raw_sigma[j]); 
  }
  target += normal_lpdf(bj | 0, 1);
  target += inv_gamma_lpdf(ee | 0.01, 0.01);
  target += inv_gamma_lpdf(uu | 0.01, 0.01);
  target += normal_lpdf(u | 0, sqrt(uu));
  for(i in 1:N){
    // for baseline all we have is the grand mean and rand intercept
    mu[i] = alpha + u[id[i]];
    // for all other timepoints.
    if(time[i] != 0) mu[i] = mu[i] + bj[time[i]] + bjk[time[i], trt[i]]; 
  }
  target += normal_lpdf(y | mu, sqrt(ee));
}        
generated quantities{    
}
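The QR sum-to-zero trick can be sanity-checked outside Stan. Here is a minimal R translation of the two helper functions above (a sketch for verification only), confirming that the transformed vector always sums to zero up to floating point:

# R translation of the Stan helpers, for checking the sum-to-zero property
Q_sum_to_zero_QR <- function(N) {
  i <- 1:N
  c(-sqrt((N - i) / (N - i + 1)), 1 / sqrt((N - i) * (N - i + 1)))
}
sum_to_zero_QR <- function(x_raw, Q_r) {
  N <- length(x_raw) + 1
  x <- numeric(N)
  x_aux <- 0
  for (i in 1:(N - 1)) {
    x[i] <- x_aux + x_raw[i] * Q_r[i]
    x_aux <- x_aux + x_raw[i] * Q_r[i + N]
  }
  x[N] <- x_aux
  x
}
Q_r <- Q_sum_to_zero_QR(4)
sum(sum_to_zero_QR(rnorm(3), Q_r))  # effectively zero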

Compile the model and fit it to the data generated previously.

mod1 <- suppressWarnings(rstan::stan_model("mod1.stan", verbose = FALSE))

ld <- list(N = nrow(d), # observations
           J = length(unique(d$tt))-1, # number of fu times excl baseline
           K = length(unique(d$trt)),  # number of treatments incl referent
           U = length(unique(d$id)),   # number of distinct individuals
           id = d$id,     # participant ids
           time = d$tidx, # time ids
           trt = d$trt,   # treatment ids
           y = (d$y - mean(d$y))/sd(d$y)   # response (centered and scaled)
           )

f1 <- rstan::sampling(object  = mod1,
                        data    = ld,
                        chains  = 1,
                        thin    = 3,
                        iter    = 5000,
                        warmup  = 1000,
                        refresh = 1000)
## 
## SAMPLING FOR MODEL 'mod1' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.000146 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 1.46 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 5000 [  0%]  (Warmup)
## Chain 1: Iteration: 1000 / 5000 [ 20%]  (Warmup)
## Chain 1: Iteration: 1001 / 5000 [ 20%]  (Sampling)
## Chain 1: Iteration: 2000 / 5000 [ 40%]  (Sampling)
## Chain 1: Iteration: 3000 / 5000 [ 60%]  (Sampling)
## Chain 1: Iteration: 4000 / 5000 [ 80%]  (Sampling)
## Chain 1: Iteration: 5000 / 5000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 3.17749 seconds (Warm-up)
## Chain 1:                9.40694 seconds (Sampling)
## Chain 1:                12.5844 seconds (Total)
## Chain 1:

Summarise the parameter estimates. Note that ee and uu are variance (not SD) estimates. Looking at the estimates named bjk, within each timepoint the elements of bjk[j,] sum to zero, as desired. The estimates do not align with the simulation parameters, but we would not expect them to: the response was standardised before fitting.

print(f1, pars = c("alpha", "bj", "bjk", "ee", "uu"), digits = 3, probs = c(0.025, 0.5, 0.975))
## Inference for Stan model: mod1.
## 1 chains, each with iter=5000; warmup=1000; thin=3; 
## post-warmup draws per chain=1334, total post-warmup draws=1334.
## 
##            mean se_mean    sd   2.5%    50%  97.5% n_eff  Rhat
## alpha     0.047   0.001 0.043 -0.039  0.049  0.129  1146 1.000
## bj[1]     0.125   0.002 0.055  0.018  0.124  0.237  1016 1.000
## bj[2]     0.064   0.002 0.054 -0.047  0.063  0.168  1273 1.000
## bj[3]    -0.149   0.002 0.056 -0.261 -0.149 -0.039  1272 0.999
## bj[4]    -0.277   0.002 0.055 -0.384 -0.279 -0.167  1313 0.999
## bjk[1,1] -0.278   0.002 0.073 -0.421 -0.278 -0.132  1265 0.999
## bjk[1,2]  0.817   0.002 0.072  0.677  0.817  0.959  1296 1.000
## bjk[1,3] -0.234   0.002 0.075 -0.381 -0.235 -0.089  1267 0.999
## bjk[1,4] -0.306   0.002 0.072 -0.451 -0.307 -0.158  1465 1.000
## bjk[2,1] -0.378   0.002 0.076 -0.532 -0.379 -0.228  1322 0.999
## bjk[2,2]  0.883   0.002 0.075  0.741  0.883  1.023  1252 0.999
## bjk[2,3] -0.326   0.002 0.074 -0.468 -0.325 -0.175  1221 0.999
## bjk[2,4] -0.179   0.002 0.074 -0.324 -0.179 -0.038  1474 0.999
## bjk[3,1] -0.430   0.002 0.074 -0.573 -0.429 -0.283  1270 1.006
## bjk[3,2]  1.098   0.002 0.076  0.946  1.098  1.246  1439 0.999
## bjk[3,3] -0.246   0.002 0.076 -0.396 -0.245 -0.103  1272 1.001
## bjk[3,4] -0.422   0.002 0.074 -0.559 -0.423 -0.278  1200 1.000
## bjk[4,1] -0.253   0.002 0.075 -0.396 -0.253 -0.103  1305 1.000
## bjk[4,2]  0.859   0.002 0.075  0.710  0.862  1.003  1303 0.999
## bjk[4,3] -0.309   0.002 0.073 -0.451 -0.309 -0.168  1504 0.999
## bjk[4,4] -0.297   0.002 0.073 -0.435 -0.299 -0.151  1469 1.000
## ee        0.641   0.001 0.023  0.598  0.639  0.687  1010 0.999
## uu        0.111   0.001 0.018  0.078  0.110  0.150   735 0.999
## 
## Samples were drawn using NUTS(diag_e) at Mon Feb 22 12:49:21 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).
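We can also verify the sum-to-zero constraint directly on the posterior draws. With rstan::extract, bjk comes back as an iterations x J x K array, so summing over arms within each draw and timepoint should give (numerically) zero:

post_bjk <- rstan::extract(f1, pars = "bjk")$bjk
# range of per-draw, per-timepoint sums across arms
range(apply(post_bjk, c(1, 2), sum))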

Extract the posterior samples and collect into a data.table.

post <- rstan::extract(f1, pars = c("alpha", "bj", "bjk"))

ttidx <- 0:4
trtidx <- 1:4
times <- c(0, 1, 2, 3, 9)

# For each arm, build posterior draws of the cell mean at each timepoint
dpost <- do.call(rbind, lapply(trtidx, function(i){
  yy <- sapply(ttidx, function(j){
    if(j == 0) post$alpha                              # baseline: grand mean only
    else post$alpha + post$bj[, j] + post$bjk[, j, i]  # add time and trt x time effects
  }) 
  dp <- data.table(cbind(trt = i, yy))
  setnames(dp, paste0("V", 2:6), paste0("ttidx", 1:5))
  dp
}))

dpost <- melt(dpost, 
            id.vars = c("trt"),
            measure.vars = paste0("ttidx",1:5),
            value.name = "y")

dpost[, ttidx := as.integer(substr(variable, 6, 6))]
dpost[, variable := NULL]
dpost[, tt := times[ttidx]]

Visualise the posterior mean by treatment and time. Dashed lines show means from the simulated data; solid lines are posterior means. Both are on the standardised scale.

dtmp1 <- dpost[, .(mu = mean(y), 
                   lb = quantile(y, probs = 0.05),
                   ub = quantile(y, probs = 0.95)), by = .(trt, tt)][order(trt, tt)]
d[, ybar := (y - mean(y))/sd(y)]
dtmp2 <- d[, .(mu = mean(ybar)), by = .(trt, tt)][order(trt, tt)]

ggplot(dtmp1, aes(x = tt, y = mu)) +
  geom_ribbon(aes(x = tt, ymin = lb, ymax = ub, 
                  group = trt, fill = paste0(trt)), 
              alpha = 0.1, show.legend = FALSE) +
  geom_line(aes(col = paste0(trt))) + 
  geom_line(data = dtmp2, aes(x = tt, y = mu, col = paste0(trt)),
           inherit.aes = FALSE, lty = 2) + 
  geom_point(data = dtmp2, aes(x = tt, y = mu, col = paste0(trt)),
           inherit.aes = FALSE) + 
  scale_color_discrete("Treatment") +
  scale_x_continuous("Months", breaks = 1:9) +
  scale_y_continuous("FACIT-F (standardised)") +
  theme_bw()

Now transform back to the original scale and check whether we approximately recover the data means. The approximation looks reasonable.

# back-transform to the original scale
dpost[, yy := sd(d$y) * y + mean(d$y)]
dtmp1 <- dpost[, .(mu = mean(yy),
                   lb = quantile(yy, probs = 0.05),
                   ub = quantile(yy, probs = 0.95)
                   ), by = .(trt, tt)][order(trt, tt)]
dtmp2 <- d[, .(mu = mean(y)), by = .(trt, tt)][order(trt, tt)]

ggplot(dtmp1, aes(x = tt, y = mu)) +
  geom_ribbon(aes(x = tt, ymin = lb, ymax = ub, 
                  group = trt, fill = paste0(trt)), 
              alpha = 0.1, show.legend = FALSE) +
  geom_line(aes(col = paste0(trt))) + 
  geom_line(data = dtmp2, aes(x = tt, y = mu, col = paste0(trt)),
           inherit.aes = FALSE, lty = 2) + 
  geom_point(data = dtmp2, aes(x = tt, y = mu, col = paste0(trt)),
           inherit.aes = FALSE) + 
  scale_color_discrete("Treatment") +
  scale_x_continuous("Months", breaks = 1:9) +
  scale_y_continuous("FACIT-F") +
  theme_bw()
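For a numeric check to accompany the plot, merge the posterior and observed cell means computed above (a quick sketch using the dtmp1 and dtmp2 tables):

dcmp <- merge(dtmp1[, .(trt, tt, post = round(mu, 1))],
              dtmp2[, .(trt, tt, obs = round(mu, 1))],
              by = c("trt", "tt"))
dcmp[, gap := post - obs]
dcmp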

Remove the shared time effect and just look at the trt x time effect in a single arm.

# No time trend
locbj <- rep(0, 4)
# Scenario - sustained treatment effect in arm 2 only
# Rows are arms 2-4 (relative to SOC); cols are timepoints 1, 2, 3, 9
locbjk <- rbind(c(7, 8, 9, 8),
                c(0, 0, 0, 0),
                c(0, 0, 0, 0))

d <- getdata(locbj = locbj, locbjk = locbjk)
# Subtract the noise (u and e) to recover the underlying means
dcast(d[, .(trt, tt, y = y - u - e)], trt ~ tt, fun = mean, value.var = "y")
##    trt  0  1  2  3  9
## 1:   1 35 35 35 35 35
## 2:   2 35 42 43 44 43
## 3:   3 35 35 35 35 35
## 4:   4 35 35 35 35 35

Fit the model to the newly simulated data.

ld <- list(N = nrow(d), # observations
           J = length(unique(d$tt))-1, # number of fu times excl baseline
           K = length(unique(d$trt)),  # number of treatments incl referent
           U = length(unique(d$id)),   # number of distinct individuals
           id = d$id,     # participant ids
           time = d$tidx, # time ids
           trt = d$trt,   # treatment ids
           y = (d$y - mean(d$y))/sd(d$y)   # response (centered and scaled)
           )

f2 <- rstan::sampling(object  = mod1,
                        data    = ld,
                        chains  = 1,
                        thin    = 3,
                        iter    = 5000,
                        warmup  = 1000,
                        refresh = 1000)
## 
## SAMPLING FOR MODEL 'mod1' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.000147 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 1.47 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 5000 [  0%]  (Warmup)
## Chain 1: Iteration: 1000 / 5000 [ 20%]  (Warmup)
## Chain 1: Iteration: 1001 / 5000 [ 20%]  (Sampling)
## Chain 1: Iteration: 2000 / 5000 [ 40%]  (Sampling)
## Chain 1: Iteration: 3000 / 5000 [ 60%]  (Sampling)
## Chain 1: Iteration: 4000 / 5000 [ 80%]  (Sampling)
## Chain 1: Iteration: 5000 / 5000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 3.24748 seconds (Warm-up)
## Chain 1:                9.16407 seconds (Sampling)
## Chain 1:                12.4115 seconds (Total)
## Chain 1:
post <- rstan::extract(f2, pars = c("alpha", "bj", "bjk"))

ttidx <- 0:4
trtidx <- 1:4
times <- c(0, 1, 2, 3, 9)

# For each arm, build posterior draws of the cell mean at each timepoint
dpost <- do.call(rbind, lapply(trtidx, function(i){
  yy <- sapply(ttidx, function(j){
    if(j == 0) post$alpha                              # baseline: grand mean only
    else post$alpha + post$bj[, j] + post$bjk[, j, i]  # add time and trt x time effects
  }) 
  dp <- data.table(cbind(trt = i, yy))
  setnames(dp, paste0("V", 2:6), paste0("ttidx", 1:5))
  dp
}))

dpost <- melt(dpost, 
            id.vars = c("trt"),
            measure.vars = paste0("ttidx",1:5),
            value.name = "y")

dpost[, ttidx := as.integer(substr(variable, 6, 6))]
dpost[, variable := NULL]
dpost[, tt := times[ttidx]]
# back-transform to the original scale
dpost[, yy := sd(d$y) * y + mean(d$y)]
dtmp1 <- dpost[, .(mu = mean(yy),
                   lb = quantile(yy, probs = 0.05),
                   ub = quantile(yy, probs = 0.95)
                   ), by = .(trt, tt)][order(trt, tt)]
dtmp2 <- d[, .(mu = mean(y)), by = .(trt, tt)][order(trt, tt)]

ggplot(dtmp1, aes(x = tt, y = mu)) +
  geom_ribbon(aes(x = tt, ymin = lb, ymax = ub, 
                  group = trt, fill = paste0(trt)), 
              alpha = 0.1, show.legend = FALSE) +
  geom_line(aes(col = paste0(trt))) + 
  geom_line(data = dtmp2, aes(x = tt, y = mu, col = paste0(trt)),
           inherit.aes = FALSE, lty = 2) + 
  geom_point(data = dtmp2, aes(x = tt, y = mu, col = paste0(trt)),
           inherit.aes = FALSE) + 
  scale_color_discrete("Treatment") +
  scale_x_continuous("Months", breaks = 1:9) +
  scale_y_continuous("FACIT-F") +
  theme_bw()
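As before, a quick numeric comparison of posterior versus observed means (using the dtmp1 and dtmp2 tables from this fit):

merge(dtmp1[, .(trt, tt, post = round(mu, 1))],
      dtmp2[, .(trt, tt, obs = round(mu, 1))],
      by = c("trt", "tt"))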