stests is an useful package in which are implemented several statistical tests for multivariate analysis. The current version of the package is hosted in github and any user could download the package using the next code.
if (!require('devtools')) install.packages('devtools')
devtools::install_github('fhernanb/stests', force=TRUE)To use the package we can use the usual way.
For more information about stests package please consult the webpage.
This test was proposed by Gamage, Mathew, and Weerahandi (2004). The statistical test to perform is
\[T^2 = \left( \overline{\mathbf{X}}_1 - \overline{\mathbf{X}}_2 \right) ^{\top} \widetilde{\mathbf{S}} ^{-1} \left( \overline{\mathbf{X}}_1 - \overline{\mathbf{X}}_2 \right).\]
where variance-covariance matrix \(\widetilde{\mathbf{S}}\) is given by:
\[\widetilde{\mathbf{S}} = \widetilde{\mathbf{S}}_1 + \widetilde{\mathbf{S}}_2 = \dfrac{\mathbf{S}_1}{n_1} + \dfrac{\mathbf{S}_2}{n_2} .\]
If \(t^2\) represents the observed value of \(T^2\), the generalized \(p-\)value for the test is \(P(T_1 \geq t^2 | H_0)\) where the \(T_1\) is a random variable given by
\[\begin{equation} \label{T1} T_1 = \frac{1}{Q_1} \sum_{i=1}^{p} d_i Z_{0i}^2 + \frac{n_2-1}{Q_2} \sum_{i=1}^{p} \left( 1 - \frac{d_i}{n_1-1} \right) d_i Z_{0i}^2. \end{equation}\]
To obtain the generalized \(p-\)value \(P(T_1 \geq t^2 | H_0)\) we should simulate at least 1000 values from each distribution \(Z_{0i}^2 \sim \chi^2_1\), \(Q_1 \sim \chi^2_{n_1-p}\) and \(Q_2 \sim \chi^2_{n_2-p}\) independently, then those values are replaced in expression to obtain several \(t_1\), finally, the generalized \(p-\)value is obtained as the percentage rate \(t_1 \geq t^2\).
In expression the values denoted by \(d_i\) correspond to the eigenvalues of \(\boldsymbol{V}_1\) defined as
\[ \boldsymbol{V}_1 = \frac{n_1-1}{n_1} \left( \frac{\boldsymbol{S}_1}{n_1} + \frac{\boldsymbol{S}_2}{n_2} \right)^{-\frac{1}{2}} \boldsymbol{S}_1 \left( \frac{\boldsymbol{S}_1}{n_1} + \frac{\boldsymbol{S}_2}{n_2} \right)^{-\frac{1}{2}}. \]
For more details about this to test please consult Gamage, Mathew, and Weerahandi (2004).
Next function can be used to generate random samples from multivariate normal populations.
gen_data <- function(mu1, sigma1, n1, mu2, sigma2, n2) {
dt1 <- MASS::mvrnorm(n1, mu=mu1, Sigma=sigma1)
dt1 <- cbind(dt1, 1)
dt2 <- MASS::mvrnorm(n2, mu=mu2, Sigma=sigma2)
dt2 <- cbind(dt2, 2)
datos <- as.data.frame(rbind(dt1, dt2))
colnames(datos) <- c('x', 'y', 'pop')
datos$pop <- as.factor(datos$pop)
return(datos)
}The gen_data function can be used as follows.
n1 <- 45
mu1 <- c(200, 500)
sigma1 <- matrix(c(13825.3, 23823.4,
23823.4, 73107.4), ncol=2)
n2 <- 55
mu2 <- c(300, 700)
sigma2 <- matrix(c(8632.0, 19616.7,
19616.7, 55964.5), ncol=2)
datos <- gen_data(mu1, sigma1, n1, mu2, sigma2, n2)
library(ggplot2)
ggplot(data=datos, aes(x=x, y=y, color=pop)) +
geom_point() +
annotate("point", x=mu1[1], y=mu1[2], shape=8, size=5) +
annotate("point", x=mu2[1], y=mu2[2], shape=8, size=5)The \(\delta\) parameter corresponds to the distance between centroids.
Next function is an auxiliar function to generate data using an specific delta and n.
auxiliar <- function(delta, n) {
mu1 <- c(200, 500)
sigma1 <- matrix(c(13825.3, 23823.4,
23823.4, 73107.4), ncol=2)
mu2 <- mu1 + delta
sigma2 <- matrix(c(8632.0, 19616.7,
19616.7, 55964.5), ncol=2)
dat <- gen_data(mu1, sigma1, n1=n, mu2, sigma2, n2=n)
return(dat)
}Next function is used to …
one_simul <- function(delta, n) {
datos <- auxiliar(delta, n)
medias <- lapply(split(x=datos[, 1:2], f=datos$pop, drop=TRUE), colMeans)
varia <- lapply(split(x=datos[, 1:2], f=datos$pop, drop=TRUE), var)
N <- lapply(split(x=datos[, 1:2], f=datos$pop, drop=TRUE), nrow)
res <- two_mean_vector_test(xbar1=medias$`1`, s1=varia$`1`, n1=N$`1`,
xbar2=medias$`2`, s2=varia$`2`, n2=N$`2`,
method = 'gamage')
res$p.value
}Next function is used to …
mult_simul <- function(x) {
delta <- x[1]
n <- x[2]
nrep <- x[3]
res <- replicate(n=nrep, expr=one_simul(delta=delta, n=n))
mean(res < 0.05)
}The parameters for the simulation are shown below.
delta <- seq(from=0, to=150, by=10)
n <- seq(from=10, to=150, by=10)
nrep <- 3
casos <- expand.grid(delta=delta, n=n, nrep=nrep)To obtain the rejection rate.
tasa <- apply(X=casos, MARGIN=1, FUN=mult_simul)
res <- cbind(casos, tasa)
res$size <- as.factor(res$n)To plot the results.
ggplot(data=res, aes(x=delta, y=tasa, group=size)) +
geom_line(aes(color=size)) +
labs(x=expression(delta), y="Tasa de rechazo H0")