Ordinal regression with stan

knitr::opts_chunk$set(echo = TRUE)
library(data.table)
library(ggplot2)
library(mvtnorm)
library(cmdstanr)

## This is cmdstanr version 0.5.3

## - CmdStanR documentation and vignettes: mc-stan.org/cmdstanr

## - CmdStan path: /Users/mark/.cmdstan/cmdstan-2.33.1

## - CmdStan version: 2.33.1

library(kableExtra)
library(matrixcalc)

tok <- unlist(tstrsplit(getwd(), "/"))
if(tok[length(tok)] == 'rpubs'){path_pref = "."} else{path_pref = ".."}

The previous ordinals suffice for learning but are slow to run.

Once again, consider a hypothetical setting. This time we have an ordinal outcome with three levels. We observe the outcome under experimental with three treatment arms in four strata.

Start with simulating the data; first set up the intercepts:

# Number of categories
K <- 3
# Set up the intercepts:
# Define probability of being in each category
p0 <- c(0.20, 0.50, 0.3)
stopifnot(length(p0) == K)
# covers off on rounding errors that you will sometimes see in R
p0[K] = 1 - sum(p0[1:(K-1)])
# cumulative probabilities
pc0 <- cumsum(p0)
# cumulative logits - these are the cutpoints for the control group
# note that the lower and upper are not required
l0 <- qlogis(pc0[-K])
# shoud be the same 
knitr::kable(
  rbind(
    p0_orig = p0, 
    p0_constructed = c(1, plogis(-l0[-K])) - c(plogis(-l0[-K]), 0)),
  format = "simple"
)

p0_orig	0.2	0.5	0.3
p0_constructed	0.2	0.5	0.3

Now set up the design matrix based on the three treatment arms and four strata.

# Set up the linear predictors (the first row is the reference)
dX <- CJ(trt = factor(1:3), strata = factor(1:4))
XX <- model.matrix(~ trt * strata, data = dX)[, -1]

Note the use of the model.matrix to create a design matrix based on two factor variables and their interaction. In the ordinal model likelihood, the intercepts are handled separately and are therefore removed from the design matrix. That is the reason for the [, -1].

Next, I define some arbitrary parameters and compute the value of the linear predictor for each unique covariate combination.

# Simulation parameters for each column in the design matrix
b <- c(-1.5, 0.5, 
       0.8, 0.4, 0.2, 
       0.05, 0.1, 
       -0.3, -0.5, 
       -0.3, -0.8)

names(b) <- colnames(XX)

# include the linear predictor
eta <- as.numeric(XX %*% b)

knitr::kable(
  cbind(data.table(eta = eta), XX),
  format = "simple"
)

eta	trt2	trt3	strata2	strata3	strata4	trt2:strata2	trt3:strata2	trt2:strata3	trt3:strata3	trt2:strata4	trt3:strata4
0.00	0	0	0	0	0	0	0	0	0	0	0
0.80	0	0	1	0	0	0	0	0	0	0	0
0.40	0	0	0	1	0	0	0	0	0	0	0
0.20	0	0	0	0	1	0	0	0	0	0	0
-1.50	1	0	0	0	0	0	0	0	0	0	0
-0.65	1	0	1	0	0	1	0	0	0	0	0
-1.40	1	0	0	1	0	0	0	1	0	0	0
-1.60	1	0	0	0	1	0	0	0	0	1	0
0.50	0	1	0	0	0	0	0	0	0	0	0
1.40	0	1	1	0	0	0	1	0	0	0	0
0.40	0	1	0	1	0	0	0	0	1	0	0
-0.10	0	1	0	0	1	0	0	0	0	0	1

The linear predictors are used to construct the probability of each level occurring under each of the covariate groups.

# Get probabilities for each of the covariate combinations 
lp <- list()
for(i in 1:nrow(XX)){
  # cumulative logits for treatment group
  l1 <- eta[i] - l0
  # cumulative probs
  pc1 <- plogis(l1)
  lp[[i+1]] <- c(1, pc1) - c(pc1, 0)
}
# first row should be l0
p_tru <- do.call(rbind, lp)
colnames(p_tru) <- paste0("Cat", 1:ncol(p_tru))
rownames(p_tru) <- paste0("Cov", 1:nrow(p_tru))

knitr::kable(
  cbind(round(p_tru, 2), XX), 
  format = "simple"
)

	Cat1	Cat2	Cat3	trt2	trt3	strata2	strata3	strata4	trt2:strata2	trt3:strata2	trt2:strata3	trt3:strata3	trt2:strata4	trt3:strata4
Cov1	0.20	0.50	0.30	0	0	0	0	0	0	0	0	0	0	0
Cov2	0.10	0.41	0.49	0	0	1	0	0	0	0	0	0	0	0
Cov3	0.14	0.47	0.39	0	0	0	1	0	0	0	0	0	0	0
Cov4	0.17	0.49	0.34	0	0	0	0	1	0	0	0	0	0	0
Cov5	0.53	0.38	0.09	1	0	0	0	0	0	0	0	0	0	0
Cov6	0.32	0.49	0.18	1	0	1	0	0	1	0	0	0	0	0
Cov7	0.50	0.40	0.10	1	0	0	1	0	0	0	1	0	0	0
Cov8	0.55	0.37	0.08	1	0	0	0	1	0	0	0	0	1	0
Cov9	0.13	0.45	0.41	0	1	0	0	0	0	0	0	0	0	0
Cov10	0.06	0.31	0.63	0	1	1	0	0	0	1	0	0	0	0
Cov11	0.14	0.47	0.39	0	1	0	1	0	0	0	0	1	0	0
Cov12	0.22	0.50	0.28	0	1	0	0	1	0	0	0	0	0	1

And now based on these probabilities, simulate a large dataset. The observed proportions can be seen to be approximately equal to the simulation probabilities.

# Covariate combinations
N <- 5000
d_obs <- data.table(id = 1:N)
d_obs[, id_cov := sample(1:nrow(XX), size = .N, replace = T)]
d_obs[, y := unlist(lapply(1:N, function(i) {
  sample(1:K, size = 1, replace = F, prob = p_tru[d_obs$id_cov[i], ])
}))]
# Should resemble each other
p_obs <- prop.table(table(d_obs$id_cov, d_obs$y), 1)
colnames(p_obs) <- paste0("Cat", 1:K, " (obs)")
knitr::kable(
  round(cbind(p_tru, p_obs), 2), 
  format = "simple"
)

	Cat1	Cat2	Cat3	Cat1 (obs)	Cat2 (obs)	Cat3 (obs)
Cov1	0.20	0.50	0.30	0.16	0.50	0.34
Cov2	0.10	0.41	0.49	0.11	0.40	0.49
Cov3	0.14	0.47	0.39	0.12	0.48	0.39
Cov4	0.17	0.49	0.34	0.17	0.50	0.33
Cov5	0.53	0.38	0.09	0.53	0.38	0.09
Cov6	0.32	0.49	0.18	0.31	0.51	0.18
Cov7	0.50	0.40	0.10	0.52	0.39	0.10
Cov8	0.55	0.37	0.08	0.58	0.35	0.07
Cov9	0.13	0.45	0.41	0.15	0.45	0.40
Cov10	0.06	0.31	0.63	0.06	0.28	0.66
Cov11	0.14	0.47	0.39	0.17	0.44	0.39
Cov12	0.22	0.50	0.28	0.22	0.52	0.26

The first stan implementation leaves the parameters on their original scale.

m1 <- cmdstanr::cmdstan_model(paste0(path_pref, "/stan/cumulative_logit6.stan"))
m1

## data{
##   int N; // num observations
##   int K; // num categories for y
##   int P; // num covariates
##   int Q; // num covariate patterns
##  
##   array[N] int y; 
##   // model matrix created with intercept 
##   // but then drop the intercept so that
##   // the first row is zeros, assuming 
##   // dummy coding. 
##   matrix[N,P] X; 
##  
##   // for prediction at each of the unique 
##   // covariate combinations 
##   matrix[Q, P] XX;
## 
##   int prior_only;
##   real pri_b_s;
##   real pri_c_s;
## }
## parameters {
##   ordered[K - 1] c;
##   vector[P] b;
## }
## transformed parameters{
## }
## model {
##   target += normal_lpdf(c | 0, pri_c_s);
##   target += normal_lpdf(b | 0, 1);
##   if(!prior_only){
##     // the effect of each state can change at each time point
##     target += ordered_logistic_lpmf(y | X * b,  c);
##   }
## }
## generated quantities{
##   matrix[Q, K] p;
## 
##   // for each covariate pattern there is a probability
##   // of occupying each category of y
##   for(j in 1:Q){
##     for(k in 1:K){
##         if(k == 1){
##           p[j,k] = 1 - inv_logit(XX[j,] * b - c[1]);
##         } else if (k == K) {
##           p[j,k] = inv_logit(XX[j,] * b - c[K-1]);
##         } else {
##           p[j,k] = inv_logit(XX[j,] * b - c[k-1]) -
##             inv_logit(XX[j,] * b - c[k]);
##         }
##     }
##   }
## }

The posterior means appear to approximate the parameters used to simulate the data. Based on an earlier test (not shown), in contrast to a model constructed with an indexing approach, there is about a 10-fold decrease in the duration to fit the data.

lsd <- list(
  N = N,
  K = K,
  P = ncol(XX),
  y = d_obs$y,
  X = XX[d_obs$id_cov,],
  XX = XX,
  Q = nrow(XX),
  prior_only = 0, pri_b_s = 1, pri_c_s = 1.8
)

f1 <- m1$sample(lsd,
    iter_warmup = 1000,
    iter_sampling = 2000,
    parallel_chains = 2,
    chains = 2,
    refresh = 2000, show_exceptions = F)

## Running MCMC with 2 parallel chains...
## 
## Chain 1 Iteration:    1 / 3000 [  0%]  (Warmup) 
## Chain 2 Iteration:    1 / 3000 [  0%]  (Warmup) 
## Chain 1 Iteration: 1001 / 3000 [ 33%]  (Sampling) 
## Chain 2 Iteration: 1001 / 3000 [ 33%]  (Sampling) 
## Chain 1 Iteration: 3000 / 3000 [100%]  (Sampling) 
## Chain 1 finished in 19.9 seconds.
## Chain 2 Iteration: 3000 / 3000 [100%]  (Sampling) 
## Chain 2 finished in 20.0 seconds.
## 
## Both chains finished successfully.
## Mean chain execution time: 19.9 seconds.
## Total execution time: 20.1 seconds.

smry <- f1$summary(variables = c("b"))

knitr::kable(
  cbind(b, posterior_means = smry$mean, rhat = smry$rhat), 
  format = "simple", digits = 3
)

	b	posterior_means	rhat
trt2	-1.50	-1.716	1.000
trt3	0.50	0.209	1.000
strata2	0.80	0.572	1.000
strata3	0.40	0.231	1.001
strata4	0.20	-0.062	1.000
trt2:strata2	0.05	0.326	1.000
trt3:strata2	0.10	0.477	1.000
trt2:strata3	-0.30	-0.182	1.000
trt3:strata3	-0.50	-0.320	1.000
trt2:strata4	-0.30	-0.155	1.000
trt3:strata4	-0.80	-0.519	1.000

And the probabilities of observing each outcome in each covariate pattern are also reasonable.

d_p <- data.table(f1$draws(variables = c("p"), format = "matrix"))

d_tbl <- melt(d_p, measure.vars = names(d_p)) 
d_tbl[, id_cov := gsub("p[", "", variable, fixed = T)]
d_tbl[, id_cov := as.integer(gsub(",.]", "", id_cov))]
d_tbl[, k := gsub("^p\\[.*,", "", variable)]
d_tbl[, k := as.integer(gsub("]", "", fixed = T, k))]
d_tbl <- d_tbl[, .(p_mu = mean(value)), keyby = .(id_cov, k)]
d_tbl[, p_tru := as.numeric(t(p_tru))]

d_res <- melt(d_tbl, id.vars = c("id_cov", "k"))
knitr::kable(
  dcast(d_res, id_cov + variable ~ k, value.var = "value"),
  digits = 3, format = "simple"
)

id_cov	variable	1	2	3
1	p_mu	0.167	0.484	0.349
1	p_tru	0.200	0.500	0.300
2	p_mu	0.102	0.412	0.487
2	p_tru	0.101	0.411	0.488
3	p_mu	0.137	0.460	0.402
3	p_tru	0.144	0.466	0.390
4	p_mu	0.176	0.490	0.335
4	p_tru	0.170	0.486	0.344
5	p_mu	0.527	0.385	0.088
5	p_tru	0.528	0.384	0.087
6	p_mu	0.312	0.497	0.191
6	p_tru	0.324	0.493	0.183
7	p_mu	0.515	0.394	0.092
7	p_tru	0.503	0.401	0.096
8	p_mu	0.580	0.348	0.072
8	p_tru	0.553	0.367	0.080
9	p_mu	0.140	0.463	0.397
9	p_tru	0.132	0.454	0.414
10	p_mu	0.054	0.293	0.653
10	p_tru	0.058	0.307	0.635
11	p_mu	0.151	0.473	0.376
11	p_tru	0.144	0.466	0.390
12	p_mu	0.225	0.505	0.270
12	p_tru	0.216	0.504	0.279

A slight modification to the model is to decompose the design matrix using QR decomposition:

\[ \begin{aligned} X = Q R \end{aligned} \]

where \(Q\) is an \(N \times M\) orthogonal matrix with \(M\) non-zero rows and \(R\) is a \(M \times M\) upper-triangle matrix. The linear predictor is then restructured to

\[ \begin{aligned} \eta &= X^\top \beta \\ &= Q R \beta \\ &= Q \beta^\prime \end{aligned} \]

and because \(Q\) is orthogonal, the columns are independent and the posterior over the \(\beta^\prime = R \beta\) should be less correlated. For computational efficiency, \(Q\) and \(R\) are converted to the same scale via

\[ \begin{aligned} Q^\prime &= Q \times \sqrt{N - 1} \\ R^\prime &= \frac{1}{\sqrt{N - 1}} R \end{aligned} \]

so that

\[ \begin{aligned} \eta &= Q R \beta = Q^\prime R^\prime \beta = Q^\prime \gamma \end{aligned} \]

where \(\gamma = R^\prime \beta\) and thus \(\beta = (R^\prime)^{-1} \gamma\) (product of the inverse of R and gamma). The revised implementation is shown below.

m2 <- cmdstanr::cmdstan_model(paste0(path_pref, "/stan/cumulative_logit7.stan"))
m2

## data{
##   int N; // num observations
##   int K; // num categories for y
##   int P; // num covariates
##   int Q; // num covariate patterns
##  
##   array[N] int y; 
##   // model matrix created with intercept 
##   // but then drop the intercept so that
##   // the first row is zeros, assuming 
##   // dummy coding. 
##   matrix[N,P] X; 
##   
##   // unique entries in design matrix 
##   matrix[Q, P] XX;
## 
##   int prior_only;
##   real pri_b_s;
##   real pri_c_s;
## }
## transformed data{
##   matrix[N, P] Q_ast;
##   matrix[P, P] R_ast;
##   matrix[P, P] R_ast_inverse;
##   // thin and scale the QR decomposition
##   Q_ast = qr_thin_Q(X) * sqrt(N - 1);
##   R_ast = qr_thin_R(X) / sqrt(N - 1);
##   R_ast_inverse = inverse(R_ast);
## }
## parameters {
##   ordered[K - 1] c;
##   vector[P] g;
## }
## transformed parameters{
## }
## model {
##   target += normal_lpdf(c | 0, pri_c_s);
##   target += normal_lpdf(g | 0, 1);
##   if(!prior_only){
##     // the effect of each state can change at each time point
##     target += ordered_logistic_lpmf(y | Q_ast * g,  c);
##   }
## }
## generated quantities{
##   matrix[Q, K] p;
##   vector[P] b;
## 
##   b = R_ast_inverse * g;
## 
##   // for each covariate pattern there is a probability
##   // of occupying each category of y
##   for(j in 1:Q){
##     for(k in 1:K){
##         if(k == 1){
##           p[j,k] = 1 - inv_logit(XX[j,] * b - c[1]);
##         } else if (k == K) {
##           p[j,k] = inv_logit(XX[j,] * b - c[K-1]);
##         } else {
##           p[j,k] = inv_logit(XX[j,] * b - c[k-1]) -
##             inv_logit(XX[j,] * b - c[k]);
##         }
##     }
##   }
## }

There is a slight improvement in the duration taken to sampling, which will vary dependent on the data. Again, the parameters appear to have been estimated reasonably well.

f2 <- m2$sample(lsd,
    iter_warmup = 1000,
    iter_sampling = 2000,
    parallel_chains = 2,
    chains = 2,
    refresh = 2000, show_exceptions = F)

## Running MCMC with 2 parallel chains...
## 
## Chain 1 Iteration:    1 / 3000 [  0%]  (Warmup) 
## Chain 2 Iteration:    1 / 3000 [  0%]  (Warmup) 
## Chain 1 Iteration: 1001 / 3000 [ 33%]  (Sampling) 
## Chain 2 Iteration: 1001 / 3000 [ 33%]  (Sampling) 
## Chain 2 Iteration: 3000 / 3000 [100%]  (Sampling) 
## Chain 2 finished in 13.5 seconds.
## Chain 1 Iteration: 3000 / 3000 [100%]  (Sampling) 
## Chain 1 finished in 14.3 seconds.
## 
## Both chains finished successfully.
## Mean chain execution time: 13.9 seconds.
## Total execution time: 14.4 seconds.

smry <- f2$summary(variables = c("b"))

knitr::kable(
  cbind(b, posterior_means = smry$mean, rhat = smry$rhat), 
  format = "simple", digits = 3
)

	b	posterior_means	rhat
trt2	-1.50	-1.734	1.000
trt3	0.50	0.203	1.001
strata2	0.80	0.562	1.001
strata3	0.40	0.228	1.000
strata4	0.20	-0.062	1.001
trt2:strata2	0.05	0.352	1.000
trt3:strata2	0.10	0.494	1.000
trt2:strata3	-0.30	-0.169	1.000
trt3:strata3	-0.50	-0.319	1.000
trt2:strata4	-0.30	-0.144	1.001
trt3:strata4	-0.80	-0.525	1.001

The probabilities of observing each outcome in each covariate pattern are also reasonable.

d_p <- data.table(f2$draws(variables = c("p"), format = "matrix"))

d_tbl <- melt(d_p, measure.vars = names(d_p)) 
d_tbl[, id_cov := gsub("p[", "", variable, fixed = T)]
d_tbl[, id_cov := as.integer(gsub(",.]", "", id_cov))]
d_tbl[, k := gsub("^p\\[.*,", "", variable)]
d_tbl[, k := as.integer(gsub("]", "", fixed = T, k))]
d_tbl <- d_tbl[, .(p_mu = mean(value)), keyby = .(id_cov, k)]
d_tbl[, p_tru := as.numeric(t(p_tru))]

d_res <- melt(d_tbl, id.vars = c("id_cov", "k"))
knitr::kable(
  dcast(d_res, id_cov + variable ~ k, value.var = "value"),
  digits = 2, format = "simple"
)

id_cov	variable	1	2	3
1	p_mu	0.17	0.48	0.35
1	p_tru	0.20	0.50	0.30
2	p_mu	0.10	0.41	0.49
2	p_tru	0.10	0.41	0.49
3	p_mu	0.14	0.46	0.40
3	p_tru	0.14	0.47	0.39
4	p_mu	0.17	0.49	0.34
4	p_tru	0.17	0.49	0.34
5	p_mu	0.53	0.38	0.09
5	p_tru	0.53	0.38	0.09
6	p_mu	0.31	0.50	0.19
6	p_tru	0.32	0.49	0.18
7	p_mu	0.51	0.39	0.09
7	p_tru	0.50	0.40	0.10
8	p_mu	0.58	0.35	0.07
8	p_tru	0.55	0.37	0.08
9	p_mu	0.14	0.46	0.40
9	p_tru	0.13	0.45	0.41
10	p_mu	0.05	0.29	0.65
10	p_tru	0.06	0.31	0.63
11	p_mu	0.15	0.47	0.38
11	p_tru	0.14	0.47	0.39
12	p_mu	0.23	0.51	0.27
12	p_tru	0.22	0.50	0.28

This model will be reconfigured to support an ordinal Markov model in a subsequent post.

Ordinal regression with stan

Using design matrix and QR decomposition

2023-09-22