knitr::opts_chunk$set(echo = TRUE)
library(data.table)
library(ggplot2)
library(mvtnorm)
library(cmdstanr)
## This is cmdstanr version 0.5.3
## - CmdStanR documentation and vignettes: mc-stan.org/cmdstanr
## - CmdStan path: /Users/mark/.cmdstan/cmdstan-2.33.1
## - CmdStan version: 2.33.1
library(kableExtra)
library(matrixcalc)
tok <- unlist(tstrsplit(getwd(), "/"))
if(tok[length(tok)] == 'rpubs'){path_pref = "."} else{path_pref = ".."}
The previous ordinal model implementations suffice for learning purposes but are slow to run.
Once again, consider a hypothetical setting. This time we have an ordinal outcome with three levels, observed under an experimental design with three treatment arms and four strata.
Start by simulating the data; first set up the intercepts:
# Number of categories
K <- 3
# Set up the intercepts:
# Define probability of being in each category
p0 <- c(0.20, 0.50, 0.30)
stopifnot(length(p0) == K)
# guard against floating-point rounding so the probabilities sum to one
p0[K] <- 1 - sum(p0[1:(K-1)])
# cumulative probabilities
pc0 <- cumsum(p0)
# cumulative logits - these are the cutpoints for the control group
# (the boundary cutpoints at -Inf and +Inf are implicit and not required)
l0 <- qlogis(pc0[-K])
# the probabilities reconstructed from the cutpoints should match the originals
knitr::kable(
  rbind(
    p0_orig = p0,
    p0_constructed = c(1, plogis(-l0[-K])) - c(plogis(-l0[-K]), 0)),
  format = "simple"
)
p0_orig | 0.2 | 0.5 | 0.3 |
p0_constructed | 0.2 | 0.5 | 0.3 |
Now set up the design matrix based on the three treatment arms and four strata.
# Set up the linear predictors (the first row is the reference)
dX <- CJ(trt = factor(1:3), strata = factor(1:4))
XX <- model.matrix(~ trt * strata, data = dX)[, -1]
Note the use of `model.matrix` to create a design matrix based on two factor variables and their interaction. In the ordinal model likelihood, the intercepts are handled separately and are therefore removed from the design matrix; that is the reason for the `[, -1]`.
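Since dummy coding is used, the reference combination (the first level of each factor) contributes nothing to the linear predictor; a quick sanity check (illustrative only):

# the first covariate pattern (trt = 1, strata = 1) is the reference,
# so its design matrix row is all zeros
stopifnot(all(XX[1, ] == 0))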
Next, I define some arbitrary parameters and compute the value of the linear predictor for each unique covariate combination.
# Simulation parameters for each column in the design matrix
b <- c(-1.5, 0.5,      # trt2, trt3
       0.8, 0.4, 0.2,  # strata2, strata3, strata4
       0.05, 0.1,      # trt2:strata2, trt3:strata2
       -0.3, -0.5,     # trt2:strata3, trt3:strata3
       -0.3, -0.8)     # trt2:strata4, trt3:strata4
names(b) <- colnames(XX)
# compute the linear predictor for each covariate pattern
eta <- as.numeric(XX %*% b)
knitr::kable(
  cbind(data.table(eta = eta), XX),
  format = "simple"
)
eta | trt2 | trt3 | strata2 | strata3 | strata4 | trt2:strata2 | trt3:strata2 | trt2:strata3 | trt3:strata3 | trt2:strata4 | trt3:strata4 |
---|---|---|---|---|---|---|---|---|---|---|---|
0.00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0.80 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0.40 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0.20 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
-1.50 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
-0.65 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
-1.40 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
-1.60 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
0.50 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1.40 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
0.40 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
-0.10 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
The linear predictors are used to construct the probability of each level occurring under each of the covariate groups.
# Get probabilities for each of the covariate combinations
lp <- list()
for(i in 1:nrow(XX)){
  # logits of P(Y > k) for covariate pattern i
  l1 <- eta[i] - l0
  # exceedance probabilities P(Y > k)
  pc1 <- plogis(l1)
  # difference adjacent exceedance probabilities to get P(Y = k)
  lp[[i]] <- c(1, pc1) - c(pc1, 0)
}
# the first row (the reference group) should equal p0
p_tru <- do.call(rbind, lp)
colnames(p_tru) <- paste0("Cat", 1:ncol(p_tru))
rownames(p_tru) <- paste0("Cov", 1:nrow(p_tru))
knitr::kable(
  cbind(round(p_tru, 2), XX),
  format = "simple"
)
Cat1 | Cat2 | Cat3 | trt2 | trt3 | strata2 | strata3 | strata4 | trt2:strata2 | trt3:strata2 | trt2:strata3 | trt3:strata3 | trt2:strata4 | trt3:strata4 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Cov1 | 0.20 | 0.50 | 0.30 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Cov2 | 0.10 | 0.41 | 0.49 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Cov3 | 0.14 | 0.47 | 0.39 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Cov4 | 0.17 | 0.49 | 0.34 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
Cov5 | 0.53 | 0.38 | 0.09 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Cov6 | 0.32 | 0.49 | 0.18 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
Cov7 | 0.50 | 0.40 | 0.10 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
Cov8 | 0.55 | 0.37 | 0.08 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
Cov9 | 0.13 | 0.45 | 0.41 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Cov10 | 0.06 | 0.31 | 0.63 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
Cov11 | 0.14 | 0.47 | 0.39 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
Cov12 | 0.22 | 0.50 | 0.28 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
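The loop used to construct p_tru could equivalently be vectorised with `outer`; a minimal sketch (the names `pc` and `p_alt` are illustrative):

# exceedance probabilities P(Y > k) for all covariate patterns at once
pc <- plogis(outer(eta, l0, "-"))
# difference adjacent exceedance probabilities to recover P(Y = k)
p_alt <- cbind(1, pc) - cbind(pc, 0)
stopifnot(all.equal(p_alt, p_tru, check.attributes = FALSE))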
Now, based on these probabilities, simulate a large dataset. The observed proportions turn out to be approximately equal to the simulation probabilities.
# Assign each unit a covariate pattern and sample an ordinal outcome
N <- 5000
d_obs <- data.table(id = 1:N)
d_obs[, id_cov := sample(1:nrow(XX), size = .N, replace = T)]
d_obs[, y := unlist(lapply(1:N, function(i) {
  sample(1:K, size = 1, prob = p_tru[d_obs$id_cov[i], ])
}))]
# the observed proportions should resemble the simulation probabilities
p_obs <- prop.table(table(d_obs$id_cov, d_obs$y), 1)
colnames(p_obs) <- paste0("Cat", 1:K, " (obs)")
knitr::kable(
  round(cbind(p_tru, p_obs), 2),
  format = "simple"
)
Cat1 | Cat2 | Cat3 | Cat1 (obs) | Cat2 (obs) | Cat3 (obs) | |
---|---|---|---|---|---|---|
Cov1 | 0.20 | 0.50 | 0.30 | 0.16 | 0.50 | 0.34 |
Cov2 | 0.10 | 0.41 | 0.49 | 0.11 | 0.40 | 0.49 |
Cov3 | 0.14 | 0.47 | 0.39 | 0.12 | 0.48 | 0.39 |
Cov4 | 0.17 | 0.49 | 0.34 | 0.17 | 0.50 | 0.33 |
Cov5 | 0.53 | 0.38 | 0.09 | 0.53 | 0.38 | 0.09 |
Cov6 | 0.32 | 0.49 | 0.18 | 0.31 | 0.51 | 0.18 |
Cov7 | 0.50 | 0.40 | 0.10 | 0.52 | 0.39 | 0.10 |
Cov8 | 0.55 | 0.37 | 0.08 | 0.58 | 0.35 | 0.07 |
Cov9 | 0.13 | 0.45 | 0.41 | 0.15 | 0.45 | 0.40 |
Cov10 | 0.06 | 0.31 | 0.63 | 0.06 | 0.28 | 0.66 |
Cov11 | 0.14 | 0.47 | 0.39 | 0.17 | 0.44 | 0.39 |
Cov12 | 0.22 | 0.50 | 0.28 | 0.22 | 0.52 | 0.26 |
The first Stan implementation leaves the parameters on their original scale.
m1 <- cmdstanr::cmdstan_model(paste0(path_pref, "/stan/cumulative_logit6.stan"))
m1
## data{
##   int N; // num observations
##   int K; // num categories for y
##   int P; // num covariates
##   int Q; // num covariate patterns
## 
##   array[N] int y;
##   // model matrix created with intercept
##   // but then drop the intercept so that
##   // the first row is zeros, assuming
##   // dummy coding.
##   matrix[N, P] X;
## 
##   // for prediction at each of the unique
##   // covariate combinations
##   matrix[Q, P] XX;
## 
##   int prior_only;
##   real pri_b_s;
##   real pri_c_s;
## }
## parameters {
##   ordered[K - 1] c;
##   vector[P] b;
## }
## transformed parameters{
## }
## model {
##   target += normal_lpdf(c | 0, pri_c_s);
##   target += normal_lpdf(b | 0, pri_b_s);
##   if(!prior_only){
##     // cumulative logit likelihood
##     target += ordered_logistic_lpmf(y | X * b, c);
##   }
## }
## generated quantities{
##   matrix[Q, K] p;
## 
##   // for each covariate pattern there is a probability
##   // of occupying each category of y
##   for(j in 1:Q){
##     for(k in 1:K){
##       if(k == 1){
##         p[j,k] = 1 - inv_logit(XX[j,] * b - c[1]);
##       } else if (k == K) {
##         p[j,k] = inv_logit(XX[j,] * b - c[K-1]);
##       } else {
##         p[j,k] = inv_logit(XX[j,] * b - c[k-1]) -
##                  inv_logit(XX[j,] * b - c[k]);
##       }
##     }
##   }
## }
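The category probabilities in the generated quantities block follow Stan's `ordered_logistic` parameterization, \(P(Y = k) = \operatorname{logit}^{-1}(\eta - c_{k-1}) - \operatorname{logit}^{-1}(\eta - c_k)\), with the boundary terms fixed at one and zero. A minimal R sketch of the same calculation (the helper `p_ordlogit` is illustrative, not part of the model code):

# probability of each of the K categories given a linear predictor eta
# and K - 1 ordered cutpoints c, mirroring Stan's ordered_logistic
p_ordlogit <- function(eta, c) {
  pc <- plogis(eta - c) # P(Y > k) at each cutpoint
  c(1, pc) - c(pc, 0)
}
# the reference group (eta = 0) recovers the simulation probabilities
stopifnot(all.equal(p_ordlogit(0, l0), p0))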
The posterior means appear to approximate the parameters used to simulate the data. Based on an earlier test (not shown), this implementation fits the data roughly 10 times faster than a model constructed with an indexing approach.
lsd <- list(
  N = N,
  K = K,
  P = ncol(XX),
  y = d_obs$y,
  X = XX[d_obs$id_cov, ],
  XX = XX,
  Q = nrow(XX),
  prior_only = 0, pri_b_s = 1, pri_c_s = 1.8
)
f1 <- m1$sample(lsd,
  iter_warmup = 1000,
  iter_sampling = 2000,
  parallel_chains = 2,
  chains = 2,
  refresh = 2000, show_exceptions = F)
## Running MCMC with 2 parallel chains...
##
## Chain 1 Iteration: 1 / 3000 [ 0%] (Warmup)
## Chain 2 Iteration: 1 / 3000 [ 0%] (Warmup)
## Chain 1 Iteration: 1001 / 3000 [ 33%] (Sampling)
## Chain 2 Iteration: 1001 / 3000 [ 33%] (Sampling)
## Chain 1 Iteration: 3000 / 3000 [100%] (Sampling)
## Chain 1 finished in 19.9 seconds.
## Chain 2 Iteration: 3000 / 3000 [100%] (Sampling)
## Chain 2 finished in 20.0 seconds.
##
## Both chains finished successfully.
## Mean chain execution time: 19.9 seconds.
## Total execution time: 20.1 seconds.
smry <- f1$summary(variables = c("b"))
knitr::kable(
  cbind(b, posterior_means = smry$mean, rhat = smry$rhat),
  format = "simple", digits = 3
)
b | posterior_means | rhat | |
---|---|---|---|
trt2 | -1.50 | -1.716 | 1.000 |
trt3 | 0.50 | 0.209 | 1.000 |
strata2 | 0.80 | 0.572 | 1.000 |
strata3 | 0.40 | 0.231 | 1.001 |
strata4 | 0.20 | -0.062 | 1.000 |
trt2:strata2 | 0.05 | 0.326 | 1.000 |
trt3:strata2 | 0.10 | 0.477 | 1.000 |
trt2:strata3 | -0.30 | -0.182 | 1.000 |
trt3:strata3 | -0.50 | -0.320 | 1.000 |
trt2:strata4 | -0.30 | -0.155 | 1.000 |
trt3:strata4 | -0.80 | -0.519 | 1.000 |
And the probabilities of observing each outcome in each covariate pattern are also reasonable.
d_p <- data.table(f1$draws(variables = c("p"), format = "matrix"))
d_tbl <- melt(d_p, measure.vars = names(d_p))
# extract the covariate pattern index and category index from the
# draw names, which have the form p[id_cov,k]
d_tbl[, id_cov := gsub("p[", "", variable, fixed = T)]
d_tbl[, id_cov := as.integer(gsub(",.]", "", id_cov))]
d_tbl[, k := gsub("^p\\[.*,", "", variable)]
d_tbl[, k := as.integer(gsub("]", "", fixed = T, k))]
# posterior mean probability for each pattern and category
d_tbl <- d_tbl[, .(p_mu = mean(value)), keyby = .(id_cov, k)]
d_tbl[, p_tru := as.numeric(t(p_tru))]
d_res <- melt(d_tbl, id.vars = c("id_cov", "k"))
knitr::kable(
  dcast(d_res, id_cov + variable ~ k, value.var = "value"),
  digits = 3, format = "simple"
)
id_cov | variable | 1 | 2 | 3 |
---|---|---|---|---|
1 | p_mu | 0.167 | 0.484 | 0.349 |
1 | p_tru | 0.200 | 0.500 | 0.300 |
2 | p_mu | 0.102 | 0.412 | 0.487 |
2 | p_tru | 0.101 | 0.411 | 0.488 |
3 | p_mu | 0.137 | 0.460 | 0.402 |
3 | p_tru | 0.144 | 0.466 | 0.390 |
4 | p_mu | 0.176 | 0.490 | 0.335 |
4 | p_tru | 0.170 | 0.486 | 0.344 |
5 | p_mu | 0.527 | 0.385 | 0.088 |
5 | p_tru | 0.528 | 0.384 | 0.087 |
6 | p_mu | 0.312 | 0.497 | 0.191 |
6 | p_tru | 0.324 | 0.493 | 0.183 |
7 | p_mu | 0.515 | 0.394 | 0.092 |
7 | p_tru | 0.503 | 0.401 | 0.096 |
8 | p_mu | 0.580 | 0.348 | 0.072 |
8 | p_tru | 0.553 | 0.367 | 0.080 |
9 | p_mu | 0.140 | 0.463 | 0.397 |
9 | p_tru | 0.132 | 0.454 | 0.414 |
10 | p_mu | 0.054 | 0.293 | 0.653 |
10 | p_tru | 0.058 | 0.307 | 0.635 |
11 | p_mu | 0.151 | 0.473 | 0.376 |
11 | p_tru | 0.144 | 0.466 | 0.390 |
12 | p_mu | 0.225 | 0.505 | 0.270 |
12 | p_tru | 0.216 | 0.504 | 0.279 |
A slight modification to the model is to apply a QR decomposition to the design matrix:
\[ \begin{aligned} X = Q R \end{aligned} \]
where, in the thin decomposition, \(Q\) is an \(N \times P\) matrix with orthonormal columns and \(R\) is a \(P \times P\) upper-triangular matrix. The linear predictor is then restructured to
\[ \begin{aligned} \eta &= X \beta \\ &= Q R \beta \\ &= Q \beta^\prime \end{aligned} \]
and because the columns of \(Q\) are orthonormal, the posterior over \(\beta^\prime = R \beta\) should be less correlated than that over \(\beta\). For computational efficiency, \(Q\) and \(R\) are rescaled via
\[ \begin{aligned} Q^\prime &= Q \times \sqrt{N - 1} \\ R^\prime &= \frac{1}{\sqrt{N - 1}} R \end{aligned} \]
so that
\[ \begin{aligned} \eta &= Q R \beta = Q^\prime R^\prime \beta = Q^\prime \gamma \end{aligned} \]
where \(\gamma = R^\prime \beta\) and thus \(\beta = (R^\prime)^{-1} \gamma\). The revised implementation is shown below.
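As a quick numerical check of the algebra, a sketch using base R's `qr()` on the matrix of unique covariate patterns (the names `qr_x`, `Q_ast`, `R_ast` and `g_tru` are illustrative; this assumes no column pivoting occurs, which holds here since XX is full rank):

# thin QR decomposition of the unique covariate patterns, scaled as above
qr_x <- qr(XX)
Q_ast <- qr.Q(qr_x) * sqrt(nrow(XX) - 1)
R_ast <- qr.R(qr_x) / sqrt(nrow(XX) - 1)
# Q' gamma reproduces the original linear predictor X beta
g_tru <- as.numeric(R_ast %*% b)
stopifnot(all.equal(as.numeric(Q_ast %*% g_tru), eta))
# and beta is recovered from gamma via the inverse of R'
stopifnot(all.equal(as.numeric(solve(R_ast) %*% g_tru), unname(b)))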
m2 <- cmdstanr::cmdstan_model(paste0(path_pref, "/stan/cumulative_logit7.stan"))
m2
## data{
##   int N; // num observations
##   int K; // num categories for y
##   int P; // num covariates
##   int Q; // num covariate patterns
## 
##   array[N] int y;
##   // model matrix created with intercept
##   // but then drop the intercept so that
##   // the first row is zeros, assuming
##   // dummy coding.
##   matrix[N, P] X;
## 
##   // unique entries in design matrix
##   matrix[Q, P] XX;
## 
##   int prior_only;
##   real pri_b_s;
##   real pri_c_s;
## }
## transformed data{
##   matrix[N, P] Q_ast;
##   matrix[P, P] R_ast;
##   matrix[P, P] R_ast_inverse;
##   // thin and scale the QR decomposition
##   Q_ast = qr_thin_Q(X) * sqrt(N - 1);
##   R_ast = qr_thin_R(X) / sqrt(N - 1);
##   R_ast_inverse = inverse(R_ast);
## }
## parameters {
##   ordered[K - 1] c;
##   vector[P] g;
## }
## transformed parameters{
## }
## model {
##   target += normal_lpdf(c | 0, pri_c_s);
##   target += normal_lpdf(g | 0, pri_b_s);
##   if(!prior_only){
##     // cumulative logit likelihood
##     target += ordered_logistic_lpmf(y | Q_ast * g, c);
##   }
## }
## generated quantities{
##   matrix[Q, K] p;
##   vector[P] b;
## 
##   // recover the coefficients on the original scale
##   b = R_ast_inverse * g;
## 
##   // for each covariate pattern there is a probability
##   // of occupying each category of y
##   for(j in 1:Q){
##     for(k in 1:K){
##       if(k == 1){
##         p[j,k] = 1 - inv_logit(XX[j,] * b - c[1]);
##       } else if (k == K) {
##         p[j,k] = inv_logit(XX[j,] * b - c[K-1]);
##       } else {
##         p[j,k] = inv_logit(XX[j,] * b - c[k-1]) -
##                  inv_logit(XX[j,] * b - c[k]);
##       }
##     }
##   }
## }
There is a slight improvement in sampling time, although this will vary with the data. Again, the parameters appear to have been estimated reasonably well.
f2 <- m2$sample(lsd,
  iter_warmup = 1000,
  iter_sampling = 2000,
  parallel_chains = 2,
  chains = 2,
  refresh = 2000, show_exceptions = F)
## Running MCMC with 2 parallel chains...
##
## Chain 1 Iteration: 1 / 3000 [ 0%] (Warmup)
## Chain 2 Iteration: 1 / 3000 [ 0%] (Warmup)
## Chain 1 Iteration: 1001 / 3000 [ 33%] (Sampling)
## Chain 2 Iteration: 1001 / 3000 [ 33%] (Sampling)
## Chain 2 Iteration: 3000 / 3000 [100%] (Sampling)
## Chain 2 finished in 13.5 seconds.
## Chain 1 Iteration: 3000 / 3000 [100%] (Sampling)
## Chain 1 finished in 14.3 seconds.
##
## Both chains finished successfully.
## Mean chain execution time: 13.9 seconds.
## Total execution time: 14.4 seconds.
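The run times can also be compared programmatically via the fits' `$time()` method (a brief sketch; output not shown):

# total wall-clock time in seconds for the original and QR models
c(original = f1$time()$total, qr = f2$time()$total)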
smry <- f2$summary(variables = c("b"))
knitr::kable(
  cbind(b, posterior_means = smry$mean, rhat = smry$rhat),
  format = "simple", digits = 3
)
b | posterior_means | rhat | |
---|---|---|---|
trt2 | -1.50 | -1.734 | 1.000 |
trt3 | 0.50 | 0.203 | 1.001 |
strata2 | 0.80 | 0.562 | 1.001 |
strata3 | 0.40 | 0.228 | 1.000 |
strata4 | 0.20 | -0.062 | 1.001 |
trt2:strata2 | 0.05 | 0.352 | 1.000 |
trt3:strata2 | 0.10 | 0.494 | 1.000 |
trt2:strata3 | -0.30 | -0.169 | 1.000 |
trt3:strata3 | -0.50 | -0.319 | 1.000 |
trt2:strata4 | -0.30 | -0.144 | 1.001 |
trt3:strata4 | -0.80 | -0.525 | 1.001 |
The probabilities of observing each outcome in each covariate pattern are also reasonable.
d_p <- data.table(f2$draws(variables = c("p"), format = "matrix"))
d_tbl <- melt(d_p, measure.vars = names(d_p))
d_tbl[, id_cov := gsub("p[", "", variable, fixed = T)]
d_tbl[, id_cov := as.integer(gsub(",.]", "", id_cov))]
d_tbl[, k := gsub("^p\\[.*,", "", variable)]
d_tbl[, k := as.integer(gsub("]", "", fixed = T, k))]
d_tbl <- d_tbl[, .(p_mu = mean(value)), keyby = .(id_cov, k)]
d_tbl[, p_tru := as.numeric(t(p_tru))]
d_res <- melt(d_tbl, id.vars = c("id_cov", "k"))
knitr::kable(
  dcast(d_res, id_cov + variable ~ k, value.var = "value"),
  digits = 2, format = "simple"
)
id_cov | variable | 1 | 2 | 3 |
---|---|---|---|---|
1 | p_mu | 0.17 | 0.48 | 0.35 |
1 | p_tru | 0.20 | 0.50 | 0.30 |
2 | p_mu | 0.10 | 0.41 | 0.49 |
2 | p_tru | 0.10 | 0.41 | 0.49 |
3 | p_mu | 0.14 | 0.46 | 0.40 |
3 | p_tru | 0.14 | 0.47 | 0.39 |
4 | p_mu | 0.17 | 0.49 | 0.34 |
4 | p_tru | 0.17 | 0.49 | 0.34 |
5 | p_mu | 0.53 | 0.38 | 0.09 |
5 | p_tru | 0.53 | 0.38 | 0.09 |
6 | p_mu | 0.31 | 0.50 | 0.19 |
6 | p_tru | 0.32 | 0.49 | 0.18 |
7 | p_mu | 0.51 | 0.39 | 0.09 |
7 | p_tru | 0.50 | 0.40 | 0.10 |
8 | p_mu | 0.58 | 0.35 | 0.07 |
8 | p_tru | 0.55 | 0.37 | 0.08 |
9 | p_mu | 0.14 | 0.46 | 0.40 |
9 | p_tru | 0.13 | 0.45 | 0.41 |
10 | p_mu | 0.05 | 0.29 | 0.65 |
10 | p_tru | 0.06 | 0.31 | 0.63 |
11 | p_mu | 0.15 | 0.47 | 0.38 |
11 | p_tru | 0.14 | 0.47 | 0.39 |
12 | p_mu | 0.23 | 0.51 | 0.27 |
12 | p_tru | 0.22 | 0.50 | 0.28 |
This model will be reconfigured to support an ordinal Markov model in a subsequent post.