data(iris)
setosa <- subset(iris, Species == "setosa")[,1:4]
versicolor <- subset(iris, Species == "versicolor")[,1:4]
analyze_species <- function(data, name){
pca <- prcomp(data, scale.=TRUE)
scores <- pca$x[,1:2]
mu <- colMeans(scores)
S <- cov(scores)
d2 <- mahalanobis(scores, mu, S)
d2_sorted <- sort(d2)
n <- length(d2_sorted)
chi_quant <- qchisq((1:n - 0.5)/n, df=2)
plot(chi_quant, d2_sorted,
main=paste("Chi-square Q-Q Plot:", name),
xlab="Chi-square Quantiles (df=2)",
ylab="Ordered Mahalanobis Distances")
abline(0,1,col="red")
}
analyze_species(setosa, "Setosa")
analyze_species(versicolor, "Versicolor")
Looking at the Setosa Q-Q plot, we see that the points are more dense and follow the line on the left. This is the majority of the data points. However, it deviates a bit. Yes, there is really only one large outlier. I would say that this indicates the data is approximately multivariate normal.
For the Versicolor, most of the points stick to the plotted line, and there are really only two points that deviate far. I would, once again, say that this is still approximately multivariate normal.
library(rstatix)
## Warning: package 'rstatix' was built under R version 4.5.3
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
data_subset <- subset(iris, Species %in% c("setosa", "versicolor"))
data_subset$Species <- factor(data_subset$Species) # Drop unused levels
res <- box_m(data_subset[, 1:4], data_subset$Species)
res
## # A tibble: 1 × 4
## statistic p.value parameter method
## <dbl> <dbl> <dbl> <chr>
## 1 66.8 1.82e-10 10 Box's M-test for Homogeneity of Covariance Matri…
sub_iris <- subset(iris, Species %in% c("setosa", "versicolor"))
sub_iris$Species <- factor(sub_iris$Species)
S1 <- cov(subset(sub_iris, Species == "setosa")[, 1:4])
S2 <- cov(subset(sub_iris, Species == "versicolor")[, 1:4])
n1 <- 50; n2 <- 50; N <- 100; k <- 2
Sp <- ((n1-1)*S1 + (n2-1)*S2) / (N-k)
M <- (N-k)*log(det(Sp)) - ((n1-1)*log(det(S1)) + (n2-1)*log(det(S2)))
cat("M Statistic:", M, "\n")
## M Statistic: 69.87649
Therefore, M = 69.87649, the \(\chi^2\) = 66.81048, and the p-value = \(1.823 \times 10^{-3}\). Therefore, we reject the null hypothesis that the covariance matrices are equal. We should not use Hotelling’s \(T^2\) or LDA, as it would be unreliable.
calc_mahalanobis <- function(data) {
X <- as.matrix(data[, 1:4])
mahalanobis(X, colMeans(X), cov(X))
}
d2_setosa <- calc_mahalanobis(subset(sub_iris, Species == "setosa"))
d2_versicolor <- calc_mahalanobis(subset(sub_iris, Species == "versicolor"))
threshold <- qchisq(0.975, df = 4)
indices_s <- which(iris$Species == "setosa")
indices_v <- which(iris$Species == "versicolor")
plot_data <- data.frame(
Index = c(indices_s, indices_v),
Distance = c(d2_setosa, d2_versicolor),
Species = factor(rep(c("Setosa", "Versicolor"), each = 50))
)
library(ggplot2)
ggplot(plot_data, aes(x = Index, y = Distance, color = Species)) +
geom_point(size = 2) +
geom_hline(yintercept = threshold, linetype = "dashed", color = "red", size = 1) +
geom_text(aes(label = ifelse(Distance > threshold, as.character(Index), "")),
vjust = -1, hjust = 1, show.legend = FALSE) +
labs(title = "Outlier Detection: Squared Mahalanobis Distance",
subtitle = paste("Threshold (Chi-sq, df=4, 0.975) =", round(threshold, 3)),
y = "Squared Mahalanobis Distance (d2)",
x = "Observation Index") +
theme_minimal() +
scale_color_manual(values = c("Setosa" = "#00AFBB", "Versicolor" = "#E7B800"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
flagged_setosa <- indices_s[d2_setosa > threshold]
flagged_versicolor <- indices_v[d2_versicolor > threshold]
cat("Flagged Setosa indices:", flagged_setosa, "\n")
## Flagged Setosa indices: 42 44
cat("Flagged Versicolor indices:", flagged_versicolor, "\n")
## Flagged Versicolor indices: 69
x1 <- as.matrix(subset(sub_iris, Species == "setosa")[, 1:4])
x2 <- as.matrix(subset(sub_iris, Species == "versicolor")[, 1:4])
n1 <- nrow(x1)
n2 <- nrow(x2)
p <- ncol(x1)
mean1 <- colMeans(x1)
mean2 <- colMeans(x2)
S1 <- cov(x1)
S2 <- cov(x2)
Sp <- ((n1 - 1) * S1 + (n2 - 1) * S2) / (n1 + n2 - 2)
mean_diff <- mean1 - mean2
T2 <- (n1 * n2 / (n1 + n2)) * t(mean_diff) %*% solve(Sp) %*% mean_diff
F_stat <- ((n1 + n2 - p - 1) / (p * (n1 + n2 - 2))) * T2
df1 <- p
df2 <- n1 + n2 - p - 1
p_val <- pf(F_stat, df1, df2, lower.tail = FALSE)
cat("Hotelling T2:", T2, "\n")
## Hotelling T2: 2580.839
cat("F-statistic:", F_stat, "\n")
## F-statistic: 625.4583
cat("Degrees of Freedom:", df1, ",", df2, "\n")
## Degrees of Freedom: 4 , 95
cat("P-value:", p_val, "\n")
## P-value: 2.664857e-67
Due to the p-value being less than 0.05, we would reject the null hypothesis. This is to say that the population mean vectors for Setosa and Versicolor are significantly different.
n1 <- 50
n2 <- 50
p <- 4
alpha <- 0.05
df_pool <- n1 + n2 - 2
m <- 4
se_pool <- sqrt((1/n1 + 1/n2) * diag(Sp))
mean_diffs <- mean1 - mean2
crit_simul <- sqrt(((df_pool * p) / (n1 + n2 - p - 1)) * qf(1 - alpha, p, n1 + n2 - p - 1))
crit_bonf <- qt(1 - (alpha / (2 * m)), df_pool)
simul_lower <- mean_diffs - crit_simul * se_pool
simul_upper <- mean_diffs + crit_simul * se_pool
bonf_lower <- mean_diffs - crit_bonf * se_pool
bonf_upper <- mean_diffs + crit_bonf * se_pool
results <- data.frame(
Variable = colnames(x1),
Diff = mean_diffs,
Simul_Lower = simul_lower, Simul_Upper = simul_upper,
Bonf_Lower = bonf_lower, Bonf_Upper = bonf_upper
)
print(results)
## Variable Diff Simul_Lower Simul_Upper Bonf_Lower Bonf_Upper
## Sepal.Length Sepal.Length -0.930 -1.2120563 -0.6479437 -1.1549332 -0.7050668
## Sepal.Width Sepal.Width 0.658 0.4359378 0.8800622 0.4809106 0.8350894
## Petal.Length Petal.Length -2.798 -3.0240684 -2.5719316 -2.9782842 -2.6177158
## Petal.Width Petal.Width -1.080 -1.1811181 -0.9788819 -1.1606393 -0.9993607
The Simultaneous confidence intervals are more conservative as they are wider than the Bonferroni ones. The only exception to this is the confidence intervals for Sepal Width.
data(iris)
dependent_vars <- cbind(iris$Sepal.Length, iris$Sepal.Width,
iris$Petal.Length, iris$Petal.Width)
manova_model <- manova(dependent_vars ~ Species, data = iris)
sscp_matrices <- summary(manova_model)$SS
B <- sscp_matrices$Species
W <- sscp_matrices$Residuals
summary(manova_model, test = "Wilks")
## Df Wilks approx F num Df den Df Pr(>F)
## Species 2 0.023439 199.15 8 288 < 2.2e-16 ***
## Residuals 147
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit <- manova(cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) ~ Species, data = iris)
summary.aov(fit)
## Response Sepal.Length :
## Df Sum Sq Mean Sq F value Pr(>F)
## Species 2 63.212 31.606 119.26 < 2.2e-16 ***
## Residuals 147 38.956 0.265
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Sepal.Width :
## Df Sum Sq Mean Sq F value Pr(>F)
## Species 2 11.345 5.6725 49.16 < 2.2e-16 ***
## Residuals 147 16.962 0.1154
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Petal.Length :
## Df Sum Sq Mean Sq F value Pr(>F)
## Species 2 437.10 218.551 1180.2 < 2.2e-16 ***
## Residuals 147 27.22 0.185
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Petal.Width :
## Df Sum Sq Mean Sq F value Pr(>F)
## Species 2 80.413 40.207 960.01 < 2.2e-16 ***
## Residuals 147 6.157 0.042
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We find that F=199.15, p-value > 0.0001, and df = (8, 288). Therefore, we reject the null hypothesis. Further, we see that Petal Length and Petal Width have the largest F values, meaning that they contribute most to the separation among the three groups.
iris_data <- iris[, 1:4]
pca_out <- prcomp(iris_data, scale. = TRUE)
eigenvalues <- (pca_out$sdev)^2
prop_variance <- eigenvalues / sum(eigenvalues)
cum_variance <- cumsum(prop_variance)
pca_summary <- data.frame(
PC = 1:4,
Eigenvalue = eigenvalues,
Proportion = prop_variance,
Cumulative = cum_variance
)
print(pca_summary)
## PC Eigenvalue Proportion Cumulative
## 1 1 2.91849782 0.729624454 0.7296245
## 2 2 0.91403047 0.228507618 0.9581321
## 3 3 0.14675688 0.036689219 0.9948213
## 4 4 0.02071484 0.005178709 1.0000000
min_pcs <- which(cum_variance >= 0.95)[1]
cat("Minimum PCs required for 95% variation:", min_pcs, "\n")
## Minimum PCs required for 95% variation: 2
plot(eigenvalues, type = "b", pch = 19, col = "blue",
xlab = "Principal Component",
ylab = "Eigenvalue (Variance explained)",
main = "Scree Plot of Iris PCA")
abline(h = 1, col = "red", lty = 2)
pc_scores <- as.data.frame(pca_out$x)
pc_scores$Species <- iris$Species
plot(pc_scores$PC1, pc_scores$PC2,
col = c("#FB6150", "#0093B0", "#6DBF68")[as.numeric(pc_scores$Species)],
pch = c(16, 17, 18)[as.numeric(pc_scores$Species)],
xlab = "Principal Component 1",
ylab = "Principal Component 2",
main = "Iris PCA: PC1 vs PC2 Projection")
legend("top", legend = levels(iris$Species),
col = c("#FB6150", "#0093B0", "#6DBF68"),
pch = c(16, 17, 18), horiz = TRUE, bty = "n")