Homework 3

(a)

data(iris)
setosa <- subset(iris, Species == "setosa")[,1:4]
versicolor <- subset(iris, Species == "versicolor")[,1:4]

analyze_species <- function(data, name){
  
  pca <- prcomp(data, scale.=TRUE)
  
  scores <- pca$x[,1:2]
  
  mu <- colMeans(scores)
  S <- cov(scores)
  
  d2 <- mahalanobis(scores, mu, S)
  
  d2_sorted <- sort(d2)
  
  n <- length(d2_sorted)
  
  chi_quant <- qchisq((1:n - 0.5)/n, df=2)
  
  plot(chi_quant, d2_sorted,
       main=paste("Chi-square Q-Q Plot:", name),
       xlab="Chi-square Quantiles (df=2)",
       ylab="Ordered Mahalanobis Distances")
  
  abline(0,1,col="red")
}

analyze_species(setosa, "Setosa")

analyze_species(versicolor, "Versicolor")

Looking at the Setosa Q-Q plot, we see that the points are more dense and follow the line on the left. This is the majority of the data points. However, it deviates a bit. Yes, there is really only one large outlier. I would say that this indicates the data is approximately multivariate normal.

For the Versicolor, most of the points stick to the plotted line, and there are really only two points that deviate far. I would, once again, say that this is still approximately multivariate normal.

(b)

library(rstatix)

## Warning: package 'rstatix' was built under R version 4.5.3

## 
## Attaching package: 'rstatix'

## The following object is masked from 'package:stats':
## 
##     filter

data_subset <- subset(iris, Species %in% c("setosa", "versicolor"))
data_subset$Species <- factor(data_subset$Species) # Drop unused levels
res <- box_m(data_subset[, 1:4], data_subset$Species)
res

## # A tibble: 1 × 4
##   statistic  p.value parameter method                                           
##       <dbl>    <dbl>     <dbl> <chr>                                            
## 1      66.8 1.82e-10        10 Box's M-test for Homogeneity of Covariance Matri…

sub_iris <- subset(iris, Species %in% c("setosa", "versicolor"))
sub_iris$Species <- factor(sub_iris$Species)


S1 <- cov(subset(sub_iris, Species == "setosa")[, 1:4])
S2 <- cov(subset(sub_iris, Species == "versicolor")[, 1:4])


n1 <- 50; n2 <- 50; N <- 100; k <- 2
Sp <- ((n1-1)*S1 + (n2-1)*S2) / (N-k)


M <- (N-k)*log(det(Sp)) - ((n1-1)*log(det(S1)) + (n2-1)*log(det(S2)))


cat("M Statistic:", M, "\n")

## M Statistic: 69.87649

Therefore, M = 69.87649, the \(\chi^2\) = 66.81048, and the p-value = \(1.823 \times 10^{-3}\). Therefore, we reject the null hypothesis that the covariance matrices are equal. We should not use Hotelling’s \(T^2\) or LDA, as it would be unreliable.

(c)

calc_mahalanobis <- function(data) {
  X <- as.matrix(data[, 1:4])
  
  mahalanobis(X, colMeans(X), cov(X))
}


d2_setosa <- calc_mahalanobis(subset(sub_iris, Species == "setosa"))
d2_versicolor <- calc_mahalanobis(subset(sub_iris, Species == "versicolor"))


threshold <- qchisq(0.975, df = 4)

indices_s <- which(iris$Species == "setosa")
indices_v <- which(iris$Species == "versicolor")

plot_data <- data.frame(
  Index = c(indices_s, indices_v),
  Distance = c(d2_setosa, d2_versicolor),
  Species = factor(rep(c("Setosa", "Versicolor"), each = 50))
)


library(ggplot2)
ggplot(plot_data, aes(x = Index, y = Distance, color = Species)) +
  geom_point(size = 2) +
  geom_hline(yintercept = threshold, linetype = "dashed", color = "red", size = 1) +
  geom_text(aes(label = ifelse(Distance > threshold, as.character(Index), "")), 
            vjust = -1, hjust = 1, show.legend = FALSE) +
  labs(title = "Outlier Detection: Squared Mahalanobis Distance",
       subtitle = paste("Threshold (Chi-sq, df=4, 0.975) =", round(threshold, 3)),
       y = "Squared Mahalanobis Distance (d2)",
       x = "Observation Index") +
  theme_minimal() +
  scale_color_manual(values = c("Setosa" = "#00AFBB", "Versicolor" = "#E7B800"))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

flagged_setosa <- indices_s[d2_setosa > threshold]
flagged_versicolor <- indices_v[d2_versicolor > threshold]

cat("Flagged Setosa indices:", flagged_setosa, "\n")

## Flagged Setosa indices: 42 44

cat("Flagged Versicolor indices:", flagged_versicolor, "\n")

## Flagged Versicolor indices: 69

(d)

x1 <- as.matrix(subset(sub_iris, Species == "setosa")[, 1:4])
x2 <- as.matrix(subset(sub_iris, Species == "versicolor")[, 1:4])

n1 <- nrow(x1)
n2 <- nrow(x2)
p <- ncol(x1)

mean1 <- colMeans(x1)
mean2 <- colMeans(x2)
S1 <- cov(x1)
S2 <- cov(x2)

Sp <- ((n1 - 1) * S1 + (n2 - 1) * S2) / (n1 + n2 - 2)

mean_diff <- mean1 - mean2
T2 <- (n1 * n2 / (n1 + n2)) * t(mean_diff) %*% solve(Sp) %*% mean_diff

F_stat <- ((n1 + n2 - p - 1) / (p * (n1 + n2 - 2))) * T2
df1 <- p
df2 <- n1 + n2 - p - 1
p_val <- pf(F_stat, df1, df2, lower.tail = FALSE)

cat("Hotelling T2:", T2, "\n")

## Hotelling T2: 2580.839

cat("F-statistic:", F_stat, "\n")

## F-statistic: 625.4583

cat("Degrees of Freedom:", df1, ",", df2, "\n")

## Degrees of Freedom: 4 , 95

cat("P-value:", p_val, "\n")

## P-value: 2.664857e-67

Due to the p-value being less than 0.05, we would reject the null hypothesis. This is to say that the population mean vectors for Setosa and Versicolor are significantly different.

(e)

n1 <- 50
n2 <- 50
p <- 4
alpha <- 0.05
df_pool <- n1 + n2 - 2
m <- 4

se_pool <- sqrt((1/n1 + 1/n2) * diag(Sp))
mean_diffs <- mean1 - mean2

crit_simul <- sqrt(((df_pool * p) / (n1 + n2 - p - 1)) * qf(1 - alpha, p, n1 + n2 - p - 1))
crit_bonf  <- qt(1 - (alpha / (2 * m)), df_pool)

simul_lower <- mean_diffs - crit_simul * se_pool
simul_upper <- mean_diffs + crit_simul * se_pool

bonf_lower <- mean_diffs - crit_bonf * se_pool
bonf_upper <- mean_diffs + crit_bonf * se_pool

results <- data.frame(
  Variable = colnames(x1),
  Diff = mean_diffs,
  Simul_Lower = simul_lower, Simul_Upper = simul_upper,
  Bonf_Lower = bonf_lower, Bonf_Upper = bonf_upper
)
print(results)

##                  Variable   Diff Simul_Lower Simul_Upper Bonf_Lower Bonf_Upper
## Sepal.Length Sepal.Length -0.930  -1.2120563  -0.6479437 -1.1549332 -0.7050668
## Sepal.Width   Sepal.Width  0.658   0.4359378   0.8800622  0.4809106  0.8350894
## Petal.Length Petal.Length -2.798  -3.0240684  -2.5719316 -2.9782842 -2.6177158
## Petal.Width   Petal.Width -1.080  -1.1811181  -0.9788819 -1.1606393 -0.9993607

The Simultaneous confidence intervals are more conservative as they are wider than the Bonferroni ones. The only exception to this is the confidence intervals for Sepal Width.

(f)

data(iris)
dependent_vars <- cbind(iris$Sepal.Length, iris$Sepal.Width, 
                        iris$Petal.Length, iris$Petal.Width)
manova_model <- manova(dependent_vars ~ Species, data = iris)

sscp_matrices <- summary(manova_model)$SS
B <- sscp_matrices$Species
W <- sscp_matrices$Residuals

summary(manova_model, test = "Wilks")

##            Df    Wilks approx F num Df den Df    Pr(>F)    
## Species     2 0.023439   199.15      8    288 < 2.2e-16 ***
## Residuals 147                                              
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

fit <- manova(cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) ~ Species, data = iris)

summary.aov(fit)

##  Response Sepal.Length :
##              Df Sum Sq Mean Sq F value    Pr(>F)    
## Species       2 63.212  31.606  119.26 < 2.2e-16 ***
## Residuals   147 38.956   0.265                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Sepal.Width :
##              Df Sum Sq Mean Sq F value    Pr(>F)    
## Species       2 11.345  5.6725   49.16 < 2.2e-16 ***
## Residuals   147 16.962  0.1154                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Petal.Length :
##              Df Sum Sq Mean Sq F value    Pr(>F)    
## Species       2 437.10 218.551  1180.2 < 2.2e-16 ***
## Residuals   147  27.22   0.185                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Petal.Width :
##              Df Sum Sq Mean Sq F value    Pr(>F)    
## Species       2 80.413  40.207  960.01 < 2.2e-16 ***
## Residuals   147  6.157   0.042                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We find that F=199.15, p-value > 0.0001, and df = (8, 288). Therefore, we reject the null hypothesis. Further, we see that Petal Length and Petal Width have the largest F values, meaning that they contribute most to the separation among the three groups.

(g)

iris_data <- iris[, 1:4]

pca_out <- prcomp(iris_data, scale. = TRUE)


eigenvalues <- (pca_out$sdev)^2
prop_variance <- eigenvalues / sum(eigenvalues)
cum_variance <- cumsum(prop_variance)


pca_summary <- data.frame(
  PC = 1:4,
  Eigenvalue = eigenvalues,
  Proportion = prop_variance,
  Cumulative = cum_variance
)
print(pca_summary)

##   PC Eigenvalue  Proportion Cumulative
## 1  1 2.91849782 0.729624454  0.7296245
## 2  2 0.91403047 0.228507618  0.9581321
## 3  3 0.14675688 0.036689219  0.9948213
## 4  4 0.02071484 0.005178709  1.0000000

min_pcs <- which(cum_variance >= 0.95)[1]
cat("Minimum PCs required for 95% variation:", min_pcs, "\n")

## Minimum PCs required for 95% variation: 2

plot(eigenvalues, type = "b", pch = 19, col = "blue",
     xlab = "Principal Component", 
     ylab = "Eigenvalue (Variance explained)",
     main = "Scree Plot of Iris PCA")
abline(h = 1, col = "red", lty = 2)

(h)

pc_scores <- as.data.frame(pca_out$x)
pc_scores$Species <- iris$Species


plot(pc_scores$PC1, pc_scores$PC2, 
     col = c("#FB6150", "#0093B0", "#6DBF68")[as.numeric(pc_scores$Species)],
     pch = c(16, 17, 18)[as.numeric(pc_scores$Species)],
     xlab = "Principal Component 1", 
     ylab = "Principal Component 2",
     main = "Iris PCA: PC1 vs PC2 Projection")

legend("top", legend = levels(iris$Species), 
       col = c("#FB6150", "#0093B0", "#6DBF68"), 
       pch = c(16, 17, 18), horiz = TRUE, bty = "n")

Homework 3

Evan Hardinger

2026-04-08

(a)

(b)

(c)

(d)

(e)

(f)

(g)

(h)