Abstract
This is the final project for the STAT4443 course, focused on developing an R package for performing Multivariate Analysis of Variance (MANOVA). The package aims to simplify complex statistical modeling using matrix operations and advanced hypothesis testing.\[\begin{align*} \textsf{\textcopyright} \end{align*}\] ***
This project involves designing and implementing an R package prototype to perform Multivariate Analysis of Variance (MANOVA). The package aims to provide easy-to-use tools for hypothesis testing with multiple dependent variables while leveraging complex matrix operations and statistical tests. Reference materials include lecture notes [@wesonga2024apr] and resources such as [@roff2002comparing; @as2018multivariate; @friedrich2018analysis, @chatgpt].
The one-way MANOVA examines the effect of a single categorical independent variable with g groups on p dependent variables.
\[B = \sum_i n_i * (\bar{x}_i - \bar{x}) (\bar{x}_i - \bar{x})^{'}\] \[W =\sum_i\sum_j(x_ij - \bar{x_i}) (x_ij - \bar{x_i})^{'}\]
\[T=B+W\] \[Lambda = |W|/ |W + B|\] \[V = trace((B (B + W)^{-1})\] \[HL = trace(W^{-1} B)\]
The two-way MANOVA examines the effects of two independent variables (A and B) and their interaction (𝐴×𝐵)on p dependent variables. The total variance is partitioned as:
\[T = W + BA + BB + BAB\] \[β^{ˆ}= (X^{T}X)^{-1} X^{T} y\]
In a One-Way MANOVA, we examine if there are significant differences in multiple dependent variables based on a single categorical independent variable (group labels).
Data matrix X, group labels Y, and the statistical test method.
Within-group matrix 𝑊: Measures within-group variation. Between-group matrix 𝐵: Measures between-group variation.
Compute 𝑇=𝑊+𝐵 Calculate test statistics such as Wilks’ Lambda, Pillai’s Trace, Hotelling-Lawley Trace, and Roy’s Largest Root.
Compute approximate F-statistic and p-value. Reject the null hypothesis (H0) if p-value < α.
Test statistic, p-value, and decision.
In a Two-Way MANOVA, we examine the influence of two independent variables (factors) and their interaction on multiple dependent variables.
Data matrix X, factors YA and YB, and the statistical test method.
-Within-group variation 𝑊 -Between-group variation for each factor A & B -Interaction effect matrix 𝐴×𝐵
Calculate test statistics for each factor using methods like Wilks’ Lambda.
Compute F-statistic and p-value for each factor. Reject the null hypothesis for factors A or B if p-value<α.
Test statistic, p-values, and decision for each factor.
compute_W_B <- function(data, groups) {
# Within-group variation (W)
W <- matrix(0, ncol = ncol(data), nrow = ncol(data))
group_means <- aggregate(data, by = list(groups), FUN = mean)
group_means <- group_means[, -1] # Remove grouping column
for (group in unique(groups)) {
group_data <- data[groups == group, ]
deviations <- sweep(group_data, 2, as.numeric(group_means[group, ]))
W <- W + t(deviations) %*% deviations
}
# Between-group variation (B)
overall_mean <- colMeans(data)
B <- matrix(0, ncol = ncol(data), nrow = ncol(data))
for (group in unique(groups)) {
group_mean <- as.numeric(group_means[group, ])
n_i <- sum(groups == group)
diff <- group_mean - overall_mean
B <- B + n_i * (diff %*% t(diff))
}
return(list(W = W, B = B))
}
compute_W_B_TwoWay <- function(data, factor_A, factor_B) {
# Compute overall mean
overall_mean <- colMeans(data)
# Initialize matrices
W <- matrix(0, ncol = ncol(data), nrow = ncol(data))
B_A <- matrix(0, ncol = ncol(data), nrow = ncol(data))
B_B <- matrix(0, ncol = ncol(data), nrow = ncol(data))
# Compute W (Within-group variation)
for (level_A in unique(factor_A)) {
for (level_B in unique(factor_B)) {
group_data <- data[factor_A == level_A & factor_B == level_B, ]
group_mean <- colMeans(group_data)
deviations <- sweep(group_data, 2, group_mean)
W <- W + t(deviations) %*% deviations
}
}
# Compute B_A and B_B (Between-group variations)
for (level_A in unique(factor_A)) {
group_data <- data[factor_A == level_A, ]
group_mean <- colMeans(group_data)
n_i <- nrow(group_data)
diff <- group_mean - overall_mean
B_A <- B_A + n_i * (diff %*% t(diff))
}
for (level_B in unique(factor_B)) {
group_data <- data[factor_B == level_B, ]
group_mean <- colMeans(group_data)
n_i <- nrow(group_data)
diff <- group_mean - overall_mean
B_B <- B_B + n_i * (diff %*% t(diff))
}
return(list(W = W, B_A = B_A, B_B = B_B))
}
# One-Way MANOVA Function
one_way_manova <- function(dependent_vars, factor_A) {
matrices <- compute_W_B(dependent_vars, factor_A)
W <- matrices$W
B <- matrices$B
T <- W + B
Lambda <- det(W) / det(T)
p <- ncol(dependent_vars)
g <- length(unique(factor_A))
n <- nrow(dependent_vars)
df1 <- p * (g - 1)
df2 <- n - g - p + 1
approx_F <- ((1 - Lambda) / Lambda) * (df2 / df1)
p_value <- pf(approx_F, df1, df2, lower.tail = FALSE)
manova_table <- data.frame(
Statistic = c("Wilks' Lambda", "F-Statistic", "p-value"),
Value = c(Lambda, approx_F, p_value)
)
return(list(
type = "One-Way MANOVA",
table = manova_table,
decision = ifelse(p_value < 0.05, "Reject H0", "Fail to Reject H0")
))
}
# Two-Way MANOVA Function
two_way_manova <- function(dependent_vars, factor_A, factor_B) {
matrices <- compute_W_B_TwoWay(dependent_vars, factor_A, factor_B)
W <- matrices$W
B_A <- matrices$B_A
B_B <- matrices$B_B
Lambda_A <- det(W) / det(W + B_A)
Lambda_B <- det(W) / det(W + B_B)
p <- ncol(dependent_vars)
n <- nrow(dependent_vars)
df1_A <- p * (length(unique(factor_A)) - 1)
df2_A <- n - length(unique(factor_A)) - p + 1
approx_F_A <- ((1 - Lambda_A) / Lambda_A) * (df2_A / df1_A)
p_value_A <- pf(approx_F_A, df1_A, df2_A, lower.tail = FALSE)
df1_B <- p * (length(unique(factor_B)) - 1)
df2_B <- n - length(unique(factor_B)) - p + 1
approx_F_B <- ((1 - Lambda_B) / Lambda_B) * (df2_B / df1_B)
p_value_B <- pf(approx_F_B, df1_B, df2_B, lower.tail = FALSE)
manova_table <- data.frame(
Factor = c("Factor A", "Factor B"),
Wilks_Lambda = c(Lambda_A, Lambda_B),
F_Statistic = c(approx_F_A, approx_F_B),
p_value = c(p_value_A, p_value_B)
)
return(list(
type = "Two-Way MANOVA",
table = manova_table,
decisions = c(
ifelse(p_value_A < 0.05, "Reject H0 for Factor A", "Fail to Reject H0 for Factor A"),
ifelse(p_value_B < 0.05, "Reject H0 for Factor B", "Fail to Reject H0 for Factor B")
)
))
}
# Simulated Data Validation for One-Way MANOVA
set.seed(123)
dependent_vars <- cbind(rnorm(60, mean = 3), rnorm(60, mean = 2))
factor_A <- rep(1:3, each = 20)
library(ReemManovaPackage)
##
## Attaching package: 'ReemManovaPackage'
## The following objects are masked _by_ '.GlobalEnv':
##
## compute_W_B, compute_W_B_TwoWay, one_way_manova, two_way_manova
result_one_way <- one_way_manova(dependent_vars, factor_A)
print(result_one_way$table)
## Statistic Value
## 1 Wilks' Lambda 0.86147392
## 2 F-Statistic 2.25121744
## 3 p-value 0.07500437
cat("Decision:", result_one_way$decision, "\n")
## Decision: Fail to Reject H0
# Simulated Data Validation for Two-Way MANOVA
factor_B <- rep(rep(1:2, each = 10), 3)
library(ReemManovaPackage)
result_two_way <- two_way_manova(dependent_vars, factor_A, factor_B)
print(result_two_way$table)
## Factor Wilks_Lambda F_Statistic p_value
## 1 Factor A 0.8569842 2.336357 0.06646528
## 2 Factor B 0.9486777 1.541813 0.22278367
cat("Decisions:", paste(result_two_way$decisions, collapse = "; "), "\n")
## Decisions: Fail to Reject H0 for Factor A; Fail to Reject H0 for Factor B
Based on the validation results of the one-way and two-way MANOVA functions, the following key findings were observed:
Interpretation: - The p-value of 0.0750044 indicates that we fail to reject the null hypothesis. This suggests that the group means differ significantly across the dependent variables.
Interpretation: - For Factor A, the p-value of 0.0664653 suggests that we fail to reject the null hypothesis, indicating significant differences between levels of Factor A. - For Factor B, the p-value of 0.2227837 suggests that we fail to reject the null hypothesis, indicating significant differences between levels of Factor B.
set.seed(123)
dependent_vars <- cbind(rnorm(60, mean = 3), rnorm(60, mean = 2))
factor_A <- rep(1:3, each = 20)
factor_B <- rep(rep(1:2, each = 10), 3)
kableExtra::kable(result_one_way$table)
Statistic | Value |
---|---|
Wilks’ Lambda | 0.8614739 |
F-Statistic | 2.2512174 |
p-value | 0.0750044 |
kableExtra::kable(result_two_way$table)
Factor | Wilks_Lambda | F_Statistic | p_value |
---|---|---|---|
Factor A | 0.8569842 | 2.336357 | 0.0664653 |
Factor B | 0.9486777 | 1.541813 | 0.2227837 |