Prepare data
## Load dataset from Applied Longitudinal Analysis
dental <- read.table("http://www.hsph.harvard.edu/fitzmaur/ala2e/dental.dat",
col.names = c("id","gender","age8","age10","age12","age14"))
dental
id gender age8 age10 age12 age14
1 1 F 21.0 20.0 21.5 23.0
2 2 F 21.0 21.5 24.0 25.5
3 3 F 20.5 24.0 24.5 26.0
4 4 F 23.5 24.5 25.0 26.5
5 5 F 21.5 23.0 22.5 23.5
6 6 F 20.0 21.0 21.0 22.5
7 7 F 21.5 22.5 23.0 25.0
8 8 F 23.0 23.0 23.5 24.0
9 9 F 20.0 21.0 22.0 21.5
10 10 F 16.5 19.0 19.0 19.5
11 11 F 24.5 25.0 28.0 28.0
12 12 M 26.0 25.0 29.0 31.0
13 13 M 21.5 22.5 23.0 26.5
14 14 M 23.0 22.5 24.0 27.5
15 15 M 25.5 27.5 26.5 27.0
16 16 M 20.0 23.5 22.5 26.0
17 17 M 24.5 25.5 27.0 28.5
18 18 M 22.0 22.0 24.5 26.5
19 19 M 24.0 21.5 24.5 25.5
20 20 M 23.0 20.5 31.0 26.0
21 21 M 27.5 28.0 31.0 31.5
22 22 M 23.0 23.0 23.5 25.0
23 23 M 21.5 23.5 24.0 28.0
24 24 M 17.0 24.5 26.0 29.5
25 25 M 22.5 25.5 25.5 26.0
26 26 M 23.0 24.5 26.0 30.0
27 27 M 22.0 21.5 23.5 25.0
## Subset to females only
dental <- subset(dental, gender == "F")
## Load reshape2
library(reshape2)
## Change to long format
dental.long <- melt(dental, id.vars = c("id","gender"),
value.name = "distance", variable.name = "age")
## Delete age from age variable values
dental.long$age <- as.numeric(gsub("age", "", dental.long$age))
## Make ID to factor
dental.long$id <- factor(dental.long$id)
## Create categorical age
dental.long$age.cat <- factor(dental.long$age)
## Dataset ordered by age (within-cluster ordering), then id (cluster id)
dental.long
id gender age distance age.cat
1 1 F 8 21.0 8
2 2 F 8 21.0 8
3 3 F 8 20.5 8
4 4 F 8 23.5 8
5 5 F 8 21.5 8
6 6 F 8 20.0 8
7 7 F 8 21.5 8
8 8 F 8 23.0 8
9 9 F 8 20.0 8
10 10 F 8 16.5 8
11 11 F 8 24.5 8
12 1 F 10 20.0 10
13 2 F 10 21.5 10
14 3 F 10 24.0 10
15 4 F 10 24.5 10
16 5 F 10 23.0 10
17 6 F 10 21.0 10
18 7 F 10 22.5 10
19 8 F 10 23.0 10
20 9 F 10 21.0 10
21 10 F 10 19.0 10
22 11 F 10 25.0 10
23 1 F 12 21.5 12
24 2 F 12 24.0 12
25 3 F 12 24.5 12
26 4 F 12 25.0 12
27 5 F 12 22.5 12
28 6 F 12 21.0 12
29 7 F 12 23.0 12
30 8 F 12 23.5 12
31 9 F 12 22.0 12
32 10 F 12 19.0 12
33 11 F 12 28.0 12
34 1 F 14 23.0 14
35 2 F 14 25.5 14
36 3 F 14 26.0 14
37 4 F 14 26.5 14
38 5 F 14 23.5 14
39 6 F 14 22.5 14
40 7 F 14 25.0 14
41 8 F 14 24.0 14
42 9 F 14 21.5 14
43 10 F 14 19.5 14
44 11 F 14 28.0 14
cov(dental[,3:6])
age8 age10 age12 age14
age8 4.513636 3.354545 4.331818 4.356818
age10 3.354545 3.618182 4.027273 4.077273
age12 4.331818 4.027273 5.590909 5.465909
age14 4.356818 4.077273 5.465909 5.940909
This is a completely balanced dataset without missing values, thus, the variance-covariance matrix should match that from the dataset shown above (to my understanding). However, each individual’s variance-covariance matrix is different from one another.
library(nlme)
## Within-cluster ordering first, then cluster ID second
dental.long.by.age.id <- dental.long
gls.by.age.id <- gls(model = distance ~ -1 + age.cat,
data = dental.long.by.age.id,
correlation = corSymm(form = ~ as.numeric(age.cat) | id),
weights = varIdent(form = ~ 1 | as.numeric(age.cat)),
method = "REML"
)
getVarCov(gls.by.age.id, individual = 1)
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 4.5136 4.2985 4.3318 3.4001
[2,] 4.2985 5.9409 5.1605 4.0773
[3,] 4.3318 5.1605 5.5909 4.2656
[4,] 3.4001 4.0773 4.2656 3.6182
Standard Deviations: 2.1245 2.4374 2.3645 1.9022
getVarCov(gls.by.age.id, individual = 2)
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 3.6182 3.3546 3.9980 3.7842
[2,] 3.3546 4.5136 4.6368 4.4178
[3,] 3.9980 4.6368 5.9409 5.4659
[4,] 3.7842 4.4178 5.4659 5.5909
Standard Deviations: 1.9022 2.1245 2.4374 2.3645
getVarCov(gls.by.age.id, individual = 3)
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 5.5909 3.7335 4.3318 4.8490
[2,] 3.7335 3.6182 3.6186 4.0773
[3,] 4.3318 3.6186 4.5136 4.9112
[4,] 4.8490 4.0773 4.9112 5.9409
Standard Deviations: 2.3645 1.9022 2.1245 2.4374
getVarCov(gls.by.age.id, individual = 4)
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 5.9409 4.7840 3.9980 4.3568
[2,] 4.7840 5.5909 4.0273 4.4178
[3,] 3.9980 4.0273 3.6182 3.8327
[4,] 4.3568 4.4178 3.8327 4.5136
Standard Deviations: 2.4374 2.3645 1.9022 2.1245
getVarCov(gls.by.age.id, individual = 5) # This is the same as individual 1
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 4.5136 4.2985 4.3318 3.4001
[2,] 4.2985 5.9409 5.1605 4.0773
[3,] 4.3318 5.1605 5.5909 4.2656
[4,] 3.4001 4.0773 4.2656 3.6182
Standard Deviations: 2.1245 2.4374 2.3645 1.9022
This problem occurs due to mishandling of data by getVarCov.gls().
## Cluster ID first, then within-cluster ordering second
dental.long.by.id.age <- dental.long[with(dental.long, order(id, age)),]
head(dental.long.by.id.age)
id gender age distance age.cat
1 1 F 8 21.0 8
12 1 F 10 20.0 10
23 1 F 12 21.5 12
34 1 F 14 23.0 14
2 2 F 8 21.0 8
13 2 F 10 21.5 10
gls.by.id.age <- gls(model = distance ~ -1 + age.cat,
data = dental.long.by.id.age,
correlation = corSymm(form = ~ as.numeric(age.cat) | id),
weights = varIdent(form = ~ 1 | as.numeric(age.cat)),
method = "REML"
)
getVarCov(gls.by.id.age, individual = 1) # Correct
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 4.5136 3.3546 4.3318 4.3568
[2,] 3.3546 3.6182 4.0273 4.0773
[3,] 4.3318 4.0273 5.5909 5.4659
[4,] 4.3568 4.0773 5.4659 5.9409
Standard Deviations: 2.1245 1.9022 2.3645 2.4374
getVarCov(gls.by.id.age, individual = 2) # Correct
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 4.5136 3.3546 4.3318 4.3568
[2,] 3.3546 3.6182 4.0273 4.0773
[3,] 4.3318 4.0273 5.5909 5.4659
[4,] 4.3568 4.0773 5.4659 5.9409
Standard Deviations: 2.1245 1.9022 2.3645 2.4374
getVarCov(gls.by.id.age, individual = 3) # Correct
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 4.5136 3.3546 4.3318 4.3568
[2,] 3.3546 3.6182 4.0273 4.0773
[3,] 4.3318 4.0273 5.5909 5.4659
[4,] 4.3568 4.0773 5.4659 5.9409
Standard Deviations: 2.1245 1.9022 2.3645 2.4374
getVarCov(gls.by.id.age, individual = 4) # Correct
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 4.5136 3.3546 4.3318 4.3568
[2,] 3.3546 3.6182 4.0273 4.0773
[3,] 4.3318 4.0273 5.5909 5.4659
[4,] 4.3568 4.0773 5.4659 5.9409
Standard Deviations: 2.1245 1.9022 2.3645 2.4374
This can be fixed by creating an index from correlation matrices (corMatrix()), rather than from the dataset.
getVarCov.fix <- function (obj, individual = 1, ...) {
S <- corMatrix(obj$modelStruct$corStruct)[[individual]]
if (!is.null(obj$modelStruct$varStruct)) {
## ind <- obj$groups == individual # index from dataset
dimensions.Ri <- sapply(corMatrix(obj$modelStruct$corStruct), nrow) # dimentions from corMatrix()
ind <- rep(seq_along(dimensions.Ri), dimensions.Ri) == individual # index from dimentions
vw <- 1/varWeights(obj$modelStruct$varStruct)[ind]
}
else vw <- rep(1, nrow(S))
vars <- (obj$sigma * vw)^2
result <- t(S * sqrt(vars)) * sqrt(vars)
class(result) <- c("marginal", "VarCov")
attr(result, "group.levels") <- names(obj$groups)
result
}
## Now the result is not dependent on the dataset structure.
getVarCov.fix(gls.by.age.id, 1)
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 4.5136 3.3546 4.3318 4.3568
[2,] 3.3546 3.6182 4.0273 4.0773
[3,] 4.3318 4.0273 5.5909 5.4659
[4,] 4.3568 4.0773 5.4659 5.9409
Standard Deviations: 2.1245 1.9022 2.3645 2.4374
getVarCov.fix(gls.by.id.age, 1)
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 4.5136 3.3546 4.3318 4.3568
[2,] 3.3546 3.6182 4.0273 4.0773
[3,] 4.3318 4.0273 5.5909 5.4659
[4,] 4.3568 4.0773 5.4659 5.9409
Standard Deviations: 2.1245 1.9022 2.3645 2.4374
getVarCov.fix(gls.by.age.id, 2)
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 4.5136 3.3546 4.3318 4.3568
[2,] 3.3546 3.6182 4.0273 4.0773
[3,] 4.3318 4.0273 5.5909 5.4659
[4,] 4.3568 4.0773 5.4659 5.9409
Standard Deviations: 2.1245 1.9022 2.3645 2.4374
getVarCov.fix(gls.by.id.age, 2)
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 4.5136 3.3546 4.3318 4.3568
[2,] 3.3546 3.6182 4.0273 4.0773
[3,] 4.3318 4.0273 5.5909 5.4659
[4,] 4.3568 4.0773 5.4659 5.9409
Standard Deviations: 2.1245 1.9022 2.3645 2.4374
getVarCov.fix(gls.by.age.id, 3)
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 4.5136 3.3546 4.3318 4.3568
[2,] 3.3546 3.6182 4.0273 4.0773
[3,] 4.3318 4.0273 5.5909 5.4659
[4,] 4.3568 4.0773 5.4659 5.9409
Standard Deviations: 2.1245 1.9022 2.3645 2.4374
getVarCov.fix(gls.by.id.age, 3)
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 4.5136 3.3546 4.3318 4.3568
[2,] 3.3546 3.6182 4.0273 4.0773
[3,] 4.3318 4.0273 5.5909 5.4659
[4,] 4.3568 4.0773 5.4659 5.9409
Standard Deviations: 2.1245 1.9022 2.3645 2.4374
getVarCov.fix(gls.by.age.id, 4)
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 4.5136 3.3546 4.3318 4.3568
[2,] 3.3546 3.6182 4.0273 4.0773
[3,] 4.3318 4.0273 5.5909 5.4659
[4,] 4.3568 4.0773 5.4659 5.9409
Standard Deviations: 2.1245 1.9022 2.3645 2.4374
getVarCov.fix(gls.by.id.age, 4)
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 4.5136 3.3546 4.3318 4.3568
[2,] 3.3546 3.6182 4.0273 4.0773
[3,] 4.3318 4.0273 5.5909 5.4659
[4,] 4.3568 4.0773 5.4659 5.9409
Standard Deviations: 2.1245 1.9022 2.3645 2.4374
## This can handle unbalanced data.
dental.long.imbalance <- dental.long[c(-1,-2,-13),] # 3 observations for #1, 2 observations for #2
gls.by.id.age.imbalance <- gls(model = distance ~ -1 + age.cat,
data = dental.long.imbalance,
correlation = corSymm(form = ~ as.numeric(age.cat) | id),
weights = varIdent(form = ~ 1 | as.numeric(age.cat)),
method = "REML"
)
getVarCov.fix(gls.by.id.age.imbalance, individual = 1)
Marginal variance covariance matrix
[,1] [,2] [,3]
[1,] 3.6923 4.1883 4.3269
[2,] 4.1883 5.5909 5.4659
[3,] 4.3269 5.4659 5.9409
Standard Deviations: 1.9215 2.3645 2.4374
getVarCov.fix(gls.by.id.age.imbalance, individual = 2)
Marginal variance covariance matrix
[,1] [,2]
[1,] 5.5909 5.4659
[2,] 5.4659 5.9409
Standard Deviations: 2.3645 2.4374
getVarCov.fix(gls.by.id.age.imbalance, individual = 3)
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 5.5202 4.0564 4.8195 4.7579
[2,] 4.0564 3.6923 4.1883 4.3269
[3,] 4.8195 4.1883 5.5909 5.4659
[4,] 4.7579 4.3269 5.4659 5.9409
Standard Deviations: 2.3495 1.9215 2.3645 2.4374
getVarCov.fix(gls.by.id.age.imbalance, individual = 4)
Marginal variance covariance matrix
[,1] [,2] [,3] [,4]
[1,] 5.5202 4.0564 4.8195 4.7579
[2,] 4.0564 3.6923 4.1883 4.3269
[3,] 4.8195 4.1883 5.5909 5.4659
[4,] 4.7579 4.3269 5.4659 5.9409
Standard Deviations: 2.3495 1.9215 2.3645 2.4374
## Show the code
nlme:::getVarCov.gls
function (obj, individual = 1, ...)
{
S <- corMatrix(obj$modelStruct$corStruct)[[individual]]
if (!is.null(obj$modelStruct$varStruct)) {
ind <- obj$groups == individual
vw <- 1/varWeights(obj$modelStruct$varStruct)[ind]
}
else vw <- rep(1, nrow(S))
vars <- (obj$sigma * vw)^2
result <- t(S * sqrt(vars)) * sqrt(vars)
class(result) <- c("marginal", "VarCov")
attr(result, "group.levels") <- names(obj$groups)
result
}
<bytecode: 0x7fbe0d912bf8>
<environment: namespace:nlme>
## Show data that causes problem (individual 1 at rows 1, 12, 23, and 34)
dental.long.by.age.id[dental.long.by.age.id$id == 1,]
id gender age distance age.cat
1 1 F 8 21.0 8
12 1 F 10 20.0 10
23 1 F 12 21.5 12
34 1 F 14 23.0 14
## Put gls object from the dataset that causes the unexpected behavior to obj
obj <- gls.by.age.id
## Default 1 for individual
individual <- 1
## Extract Correlation Matrix
S <- corMatrix(obj$modelStruct$corStruct)[[individual]]
S
[,1] [,2] [,3] [,4]
[1,] 1.0000000 0.8300904 0.8623144 0.8413560
[2,] 0.8300904 1.0000000 0.8954160 0.8794243
[3,] 0.8623144 0.8954160 1.0000000 0.9484076
[4,] 0.8413560 0.8794243 0.9484076 1.0000000
## Extract indexes for individual 1 from obj$groups, which corresponds to data (1, 12, 23, and 34)
ind <- obj$groups == individual
which(ind)
[1] 1 12 23 34
## Fix by the alternative indexing method
dimensions.Ri <- sapply(corMatrix(obj$modelStruct$corStruct), nrow)
ind2 <- rep(seq_along(dimensions.Ri), dimensions.Ri) == individual
which(ind2)
[1] 1 2 3 4
Extract Variance Function Weights. THIS IS ORDERED BY PATIENT ID(CLUSTER ID), THEN AGE (WITHIN-CLUSTER ORDERING), REGARDLESS OF DATA ORDERING. Individual 1 is at positions 1, 2, 3, and 4. (NOT 1, 12, 23, and 34, as in index)
## PROBLEM
##
## Extract Variance Function Weights.
## THIS APPEARS TO BE ORDERED BY CLUSTER ID, THEN WITHIN-CLUSTER ORDERING, REGARDLESS OF DATA ORDERING.
## Individual 1 is at positions 1, 2, 3, and 4. (NOT 1, 12, 23, and 34)
varWeights(obj$modelStruct$varStruct)
1 2 3 4 1 2 3 4 1 2 3
1.0000000 1.1169068 0.8985075 0.8716394 1.0000000 1.1169068 0.8985075 0.8716394 1.0000000 1.1169068 0.8985075
4 1 2 3 4 1 2 3 4 1 2
0.8716394 1.0000000 1.1169068 0.8985075 0.8716394 1.0000000 1.1169068 0.8985075 0.8716394 1.0000000 1.1169068
3 4 1 2 3 4 1 2 3 4 1
0.8985075 0.8716394 1.0000000 1.1169068 0.8985075 0.8716394 1.0000000 1.1169068 0.8985075 0.8716394 1.0000000
2 3 4 1 2 3 4 1 2 3 4
1.1169068 0.8985075 0.8716394 1.0000000 1.1169068 0.8985075 0.8716394 1.0000000 1.1169068 0.8985075 0.8716394
## PROBLEM
##
## This part extracts positions 1, 12, 23, and 34 (ind), although what we want is at positions 1, 2, 3, and 4.
## Thus, the element obtained are wrong (named 1, 4, 3, and 2, coming from positions 1, 12, 23, and 34).
## These are occasion 1 from subject 1, occasion 4 from subject 3, occasion 3 from subject 6,
## occasion 2 from subject 9.
vw <- 1/varWeights(obj$modelStruct$varStruct)[ind]
vw
1 4 3 2
1.0000000 1.1472634 1.1129568 0.8953299
## The rest has no problem.
vars <- (obj$sigma * vw)^2
result <- t(S * sqrt(vars)) * sqrt(vars)
## Show result
result
[,1] [,2] [,3] [,4]
[1,] 4.513645 4.298490 4.331830 3.400089
[2,] 4.298490 5.940920 5.160523 4.077295
[3,] 4.331830 5.160523 5.590930 4.265636
[4,] 3.400089 4.077295 4.265636 3.618208
## The result matrix is different from the matrix from the raw data.
cov(dental[,c("age8","age10","age12","age14")])
age8 age10 age12 age14
age8 4.513636 3.354545 4.331818 4.356818
age10 3.354545 3.618182 4.027273 4.077273
age12 4.331818 4.027273 5.590909 5.465909
age14 4.356818 4.077273 5.465909 5.940909