Strange marginal variance covariance matrix Cov(Yi) in nlme

Prepare data

## Load dataset from Applied Longitudinal Analysis
dental <- read.table("http://www.hsph.harvard.edu/fitzmaur/ala2e/dental.dat",
                     col.names = c("id","gender","age8","age10","age12","age14"))
dental

   id gender age8 age10 age12 age14
1   1      F 21.0  20.0  21.5  23.0
2   2      F 21.0  21.5  24.0  25.5
3   3      F 20.5  24.0  24.5  26.0
4   4      F 23.5  24.5  25.0  26.5
5   5      F 21.5  23.0  22.5  23.5
6   6      F 20.0  21.0  21.0  22.5
7   7      F 21.5  22.5  23.0  25.0
8   8      F 23.0  23.0  23.5  24.0
9   9      F 20.0  21.0  22.0  21.5
10 10      F 16.5  19.0  19.0  19.5
11 11      F 24.5  25.0  28.0  28.0
12 12      M 26.0  25.0  29.0  31.0
13 13      M 21.5  22.5  23.0  26.5
14 14      M 23.0  22.5  24.0  27.5
15 15      M 25.5  27.5  26.5  27.0
16 16      M 20.0  23.5  22.5  26.0
17 17      M 24.5  25.5  27.0  28.5
18 18      M 22.0  22.0  24.5  26.5
19 19      M 24.0  21.5  24.5  25.5
20 20      M 23.0  20.5  31.0  26.0
21 21      M 27.5  28.0  31.0  31.5
22 22      M 23.0  23.0  23.5  25.0
23 23      M 21.5  23.5  24.0  28.0
24 24      M 17.0  24.5  26.0  29.5
25 25      M 22.5  25.5  25.5  26.0
26 26      M 23.0  24.5  26.0  30.0
27 27      M 22.0  21.5  23.5  25.0


## Subset to females only
dental <- subset(dental, gender == "F")

## Load reshape2
library(reshape2)

## Change to long format
dental.long <- melt(dental, id.vars = c("id","gender"),
                    value.name = "distance", variable.name = "age")

## Delete age from age variable values
dental.long$age <- as.numeric(gsub("age", "", dental.long$age))

## Make ID to factor
dental.long$id <- factor(dental.long$id)

## Create categorical age
dental.long$age.cat <- factor(dental.long$age)

## Dataset ordered by age (within-cluster ordering), then id (cluster id)
head(dental.long, 10)

   id gender age distance age.cat
1   1      F   8     21.0       8
2   2      F   8     21.0       8
3   3      F   8     20.5       8
4   4      F   8     23.5       8
5   5      F   8     21.5       8
6   6      F   8     20.0       8
7   7      F   8     21.5       8
8   8      F   8     23.0       8
9   9      F   8     20.0       8
10 10      F   8     16.5       8

gls() on data ordered in different ways

The result becomes disorganized when the dataset is ordered by the within-cluster ordering, and then the cluster ID.

Cluster ID variable: id
Within-cluster occasion variable: age.cat (age as categorical)
The saturated model explains four mean distances by four different ages.
The variance-covariance matrix is dictated by data.
Different variances are allowed at different age.cat values

## Load nlme
library(nlme)

## Cluster ID, then within-cluster ordering
dental.long.by.id.age <- dental.long[with(dental.long, order(id, age)),]
head(dental.long.by.id.age)

   id gender age distance age.cat
1   1      F   8     21.0       8
12  1      F  10     20.0      10
23  1      F  12     21.5      12
34  1      F  14     23.0      14
2   2      F   8     21.0       8
13  2      F  10     21.5      10


gls.by.id.age <- gls(model       = distance ~ -1 + age.cat,
                     data        = dental.long.by.id.age,
                     correlation = corSymm(form = ~ as.numeric(age.cat) | id),
                     weights     = varIdent(form = ~ 1 | as.numeric(age.cat)),
                     method      = "REML"
                     )


## Cluster ID, then reversed within-cluster ordering
dental.long.by.id.rev_age <- dental.long[with(dental.long, order(id,rev(age))),]
head(dental.long.by.id.rev_age)

   id gender age distance age.cat
34  1      F  14     23.0      14
23  1      F  12     21.5      12
12  1      F  10     20.0      10
1   1      F   8     21.0       8
35  2      F  14     25.5      14
24  2      F  12     24.0      12


gls.by.id.rev_age <- gls(model       = distance ~ -1 + age.cat,
                         data        = dental.long.by.id.rev_age,
                         correlation = corSymm(form = ~ as.numeric(age.cat) | id),
                         weights     = varIdent(form = ~ 1 | as.numeric(age.cat)),
                         method      = "REML"
                         )


## within-cluster ordering, then cluster id
dental.long.by.age.id <- dental.long[with(dental.long, order(age, id)),]
head(dental.long.by.age.id)

  id gender age distance age.cat
1  1      F   8     21.0       8
2  2      F   8     21.0       8
3  3      F   8     20.5       8
4  4      F   8     23.5       8
5  5      F   8     21.5       8
6  6      F   8     20.0       8


gls.by.age.id <- gls(model       = distance ~ -1 + age.cat,
                     data        = dental.long.by.age.id,
                     correlation = corSymm(form = ~ as.numeric(age.cat) | id),
                     weights     = varIdent(form = ~ 1 | as.numeric(age.cat)),
                     method      = "REML"
                     )

Variance-covariance matrices (See the different variance-covariance matrices depending data configuration)

Demonstration of effects of data configuration on the variance-covariance matrix returned by getVarCov()

ID: Cluster ID
age: Within-cluster ordering

Variance-covariance matrix from raw data

cov(dental[,c("age8","age10","age12","age14")])

          age8    age10    age12    age14
age8  4.513636 3.354545 4.331818 4.356818
age10 3.354545 3.618182 4.027273 4.077273
age12 4.331818 4.027273 5.590909 5.465909
age14 4.356818 4.077273 5.465909 5.940909

Variance-covariance matrix from data sorted by ID, then age (CORRECT RESULT)

getVarCov(gls.by.id.age)

Marginal variance covariance matrix
       [,1]   [,2]   [,3]   [,4]
[1,] 4.5136 3.3545 4.3318 4.3568
[2,] 3.3545 3.6182 4.0273 4.0773
[3,] 4.3318 4.0273 5.5909 5.4659
[4,] 4.3568 4.0773 5.4659 5.9409
  Standard Deviations: 2.1245 1.9022 2.3645 2.4374

When the data are ordered by the patient id (cluster ID) and then age (within-cluster ordering), the matrix is correct, and is the same as the raw data matrix. This is an expected result as the data is balanced and complete.

Variance-covariance matrix from data sorted by ID, then reversed age (EXPECTED REVERSE RESULT)

getVarCov(gls.by.id.rev_age)

Marginal variance covariance matrix
       [,1]   [,2]   [,3]   [,4]
[1,] 5.9409 5.4659 4.0773 4.3568
[2,] 5.4659 5.5909 4.0273 4.3318
[3,] 4.0773 4.0273 3.6182 3.3545
[4,] 4.3568 4.3318 3.3545 4.5136
  Standard Deviations: 2.4374 2.3645 1.9022 2.1245

When the data are ordered by the patient id (cluster ID) and then reversed age (within-cluster ordering), the matrix has reversed rows and columns. This is an expected behavior, I think.

Variance-covariance matrix from data sorted by age, then ID (UNEXPECTED RESULT)

getVarCov(gls.by.age.id)

Marginal variance covariance matrix
       [,1]   [,2]   [,3]   [,4]
[1,] 4.5136 4.2985 4.3318 3.4001
[2,] 4.2985 5.9409 5.1605 4.0773
[3,] 4.3318 5.1605 5.5909 4.2656
[4,] 3.4001 4.0773 4.2656 3.6182
  Standard Deviations: 2.1245 2.4374 2.3645 1.9022

However, when the data are ordered by the age (within-cluster ordering) and then the patient id (cluster ID), the matrix has an unexpected configuration (the diagonal elements are ordered age8, age14, age12, age10). This appears to be due to problem with the nlme:::getVarCov.gls function.

The problem is shown below by step-by-step execution of nlme:::getVarCov.gls()

## Show the code
nlme:::getVarCov.gls

function (obj, individual = 1, ...) 
{
    S <- corMatrix(obj$modelStruct$corStruct)[[individual]]
    if (!is.null(obj$modelStruct$varStruct)) {
        ind <- obj$groups == individual
        vw <- 1/varWeights(obj$modelStruct$varStruct)[ind]
    }
    else vw <- rep(1, nrow(S))
    vars <- (obj$sigma * vw)^2
    result <- t(S * sqrt(vars)) * sqrt(vars)
    class(result) <- c("marginal", "VarCov")
    attr(result, "group.levels") <- names(obj$groups)
    result
}
<bytecode: 0x104df0600>
<environment: namespace:nlme>


## Show data that causes problem (individual 1 at rows 1, 12, 23, and 34)
dental.long.by.age.id[dental.long.by.age.id$id == 1,]

   id gender age distance age.cat
1   1      F   8     21.0       8
12  1      F  10     20.0      10
23  1      F  12     21.5      12
34  1      F  14     23.0      14



## Put gls object from the dataset that causes the unexpected behavior to obj
obj <- gls.by.age.id

## Default 1 for individual
individual <- 1

## Extract Correlation Matrix
S <- corMatrix(obj$modelStruct$corStruct)[[individual]]
S

          [,1]      [,2]      [,3]      [,4]
[1,] 1.0000000 0.8300900 0.8623146 0.8413558
[2,] 0.8300900 1.0000000 0.8954155 0.8794235
[3,] 0.8623146 0.8954155 1.0000000 0.9484069
[4,] 0.8413558 0.8794235 0.9484069 1.0000000


## The above matrix is the same as the correlation matrix from the raw data, thus correct.
cor(dental[,c("age8","age10","age12","age14")])

           age8     age10     age12     age14
age8  1.0000000 0.8300900 0.8623146 0.8413558
age10 0.8300900 1.0000000 0.8954156 0.8794236
age12 0.8623146 0.8954156 1.0000000 0.9484070
age14 0.8413558 0.8794236 0.9484070 1.0000000



## Extract indexes for individual 1 from obj$groups, which corresponds to data (1, 12, 23, and 34)
ind <- obj$groups == individual
which(ind)

[1]  1 12 23 34

Problem exist in mismatch between varWeights (positions 1, 2, 3, and 4) and indexes (positions 1, 12, 23, and 34).

Extract Variance Function Weights. THIS APPEARS TO BE ORDERED BY PATIENT ID(CLUSTER ID), THEN AGE (WITHIN-CLUSTER ORDERING), REGARDLESS OF DATA ORDERING. Individual 1 is at positions 1, 2, 3, and 4. (NOT 1, 12, 23, and 34, as in index)

## PROBLEM
##
## Extract Variance Function Weights.
## THIS APPEARS TO BE  ORDERED BY CLUSTER ID, THEN WITHIN-CLUSTER ORDERING, REGARDLESS OF DATA ORDERING.
## Individual 1 is at positions 1, 2, 3, and 4. (NOT 1, 12, 23, and 34)
varWeights(obj$modelStruct$varStruct)

        1         2         3         4         1         2         3         4         1         2         3 
1.0000000 1.1169098 0.8985081 0.8716393 1.0000000 1.1169098 0.8985081 0.8716393 1.0000000 1.1169098 0.8985081 
        4         1         2         3         4         1         2         3         4         1         2 
0.8716393 1.0000000 1.1169098 0.8985081 0.8716393 1.0000000 1.1169098 0.8985081 0.8716393 1.0000000 1.1169098 
        3         4         1         2         3         4         1         2         3         4         1 
0.8985081 0.8716393 1.0000000 1.1169098 0.8985081 0.8716393 1.0000000 1.1169098 0.8985081 0.8716393 1.0000000 
        2         3         4         1         2         3         4         1         2         3         4 
1.1169098 0.8985081 0.8716393 1.0000000 1.1169098 0.8985081 0.8716393 1.0000000 1.1169098 0.8985081 0.8716393

This part extracts positions 1, 12, 23, and 34 (ind), although what we want is at positions 1, 2, 3, and 4. Thus, the element obtained are wrong (named 1, 4, 3, and 2, coming from positions 1, 12, 23, and 34). These are occasion 1 from subject 1, occasion 4 from subject 3, occasion 3 from subject 6, occasion 2 from subject 9.

## PROBLEM
##
## This part extracts positions 1, 12, 23, and 34 (ind), although what we want is at positions 1, 2, 3, and 4.
## Thus, the element obtained are wrong (named 1, 4, 3, and 2, coming from positions 1, 12, 23, and 34).
## These are occasion 1 from subject 1, occasion 4 from subject 3, occasion 3 from subject 6,
## occasion 2 from subject 9.
vw <- 1/varWeights(obj$modelStruct$varStruct)[ind]
vw

        1         4         3         2 
1.0000000 1.1472635 1.1129560 0.8953274

## The rest has no problem.
vars <- (obj$sigma * vw)^2
result <- t(S * sqrt(vars)) * sqrt(vars)

## Show result
result

         [,1]     [,2]     [,3]     [,4]
[1,] 4.513634 4.298478 4.331817 3.400071
[2,] 4.298478 5.940907 5.160505 4.077271
[3,] 4.331817 5.160505 5.590908 4.265608
[4,] 3.400071 4.077271 4.265608 3.618180


## The result matrix is different from the matrix from the raw data.
cov(dental[,c("age8","age10","age12","age14")])

          age8    age10    age12    age14
age8  4.513636 3.354545 4.331818 4.356818
age10 3.354545 3.618182 4.027273 4.077273
age12 4.331818 4.027273 5.590909 5.465909
age14 4.356818 4.077273 5.465909 5.940909