Exercises from Units 1-8

Example 1-5: Women’s Health Survey (Descriptive Statistics)

library(readr)

Warning: package 'readr' was built under R version 4.3.3

nutrient<-read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/nutrient.csv")

library(dplyr)
N <-count(nutrient)
mean_cal <- mean(nutrient$calcium)
std_cal <- sd(nutrient$calcium)
min_cal <- min(nutrient$calcium)
max_cal <- max(nutrient$calcium)
summary_df <- data.frame(Variable = "calcium",
                         N = N,
                         Mean = mean_cal,
                         StdDev = std_cal,
                         Minimum = min_cal,
                         Maximum = max_cal)
mean_ir <- mean(nutrient$iron)
std_ir <- sd(nutrient$iron)
min_ir <- min(nutrient$iron)
max_ir <- max(nutrient$iron)
summary_df2<- data.frame(Variable = "iron",
                         N = N,
                         Mean = mean_ir,
                         StdDev = std_ir,
                         Minimum = min_ir,
                         Maximum = max_ir)
mean_pr <- mean(nutrient$protein)
std_pr <- sd(nutrient$protein)
min_pr <- min(nutrient$protein)
max_pr <- max(nutrient$protein)
summary_df3<- data.frame(Variable = "protein",
                         N = N,
                         Mean = mean_pr,
                         StdDev = std_pr,
                         Minimum = min_pr,
                         Maximum = max_pr)
mean_a <- mean(nutrient$vitamin.A)
std_a <- sd(nutrient$vitamin.A)
min_a <- min(nutrient$vitamin.A)
max_a <- max(nutrient$vitamin.A)
summary_df4<- data.frame(Variable = "a",
                         N = N,
                         Mean = mean_a,
                         StdDev = std_a,
                         Minimum = min_a,
                         Maximum = max_a)
mean_c <- mean(nutrient$vitamin.C)
std_c <- sd(nutrient$vitamin.C)
min_c <- min(nutrient$vitamin.C)
max_c <- max(nutrient$vitamin.C)
summary_df5<- data.frame(Variable = "c",
                         N = N,
                         Mean = mean_c,
                         StdDev = std_c,
                         Minimum = min_c,
                         Maximum = max_c)

rbind(summary_df,summary_df2,summary_df3,summary_df4,summary_df5)

  Variable   n      Mean     StdDev Minimum   Maximum
1  calcium 737 624.04925  397.27754    7.44  2866.440
2     iron 737  11.12990    5.98419    0.00    58.668
3  protein 737  65.80344   30.57576    0.00   251.012
4        a 737 839.63535 1633.53983    0.00 34434.270
5        c 737  78.92845   73.59527    0.00   433.339

nut <- select(nutrient, calcium, iron, protein, vitamin.A, vitamin.C)
round(cov(nut),1)

           calcium   iron protein vitamin.A vitamin.C
calcium   157829.4  940.1  6075.8  102411.1    6701.6
iron         940.1   35.8   114.1    2383.2     137.7
protein     6075.8  114.1   934.9    7330.1     477.2
vitamin.A 102411.1 2383.2  7330.1 2668452.4   22063.2
vitamin.C   6701.6  137.7   477.2   22063.2    5416.3

round(cor(nut),3)

          calcium  iron protein vitamin.A vitamin.C
calcium     1.000 0.395   0.500     0.158     0.229
iron        0.395 1.000   0.623     0.244     0.313
protein     0.500 0.623   1.000     0.147     0.212
vitamin.A   0.158 0.244   0.147     1.000     0.184
vitamin.C   0.229 0.313   0.212     0.184     1.000

Example 1-6: Woman’s Health Survey (Variance)

df1<-data.frame(Variable = "calcium",
                Variance = var(nutrient$calcium))
df2<-data.frame(Variable = "iron",
                Variance = var(nutrient$iron))
df3<-data.frame(Variable = "protein",
                Variance = var(nutrient$protein))
df4<-data.frame(Variable = "Vitamin A",
                Variance = var(nutrient$vitamin.A))
df5<-data.frame(Variable = "Vitamin C",
                Variance = var(nutrient$vitamin.C))
df6<-data.frame(Variable = "Total",
                Variance = (var(nutrient$calcium)+var(nutrient$iron)+var(nutrient$protein)+var(nutrient$vitamin.A)+var(nutrient$vitamin.C)))
rbind(df1,df2,df3,df4,df5,df6)

   Variable     Variance
1   calcium 1.578294e+05
2      iron 3.581054e+01
3   protein 9.348769e+02
4 Vitamin A 2.668452e+06
5 Vitamin C 5.416264e+03
6     Total 2.832669e+06

Example 1-7: Woman’s Health Survey (Generalized Variance)

gen_variance <- det(cov(nut))
gen_variance

[1] 2.831042e+19

Example 2-3: Women’s Health Survey (Population Mean)

df1<-data.frame(Variable = "calcium",
                Mean = mean(nutrient$calcium))
df2<-data.frame(Variable = "iron",
                Mean = mean(nutrient$iron))
df3<-data.frame(Variable = "protein",
                Mean = mean(nutrient$protein))
df4<-data.frame(Variable = "Vitamin A",
                Mean = mean(nutrient$vitamin.A))
df5<-data.frame(Variable = "Vitamin C",
                Mean = mean(nutrient$vitamin.C))
rbind(df1,df2,df3,df4,df5)

   Variable      Mean
1   calcium 624.04925
2      iron  11.12990
3   protein  65.80344
4 Vitamin A 839.63535
5 Vitamin C  78.92845

Example 2-4: Women’s Health Survey (Population Variance)

round(cov(nut),1)

           calcium   iron protein vitamin.A vitamin.C
calcium   157829.4  940.1  6075.8  102411.1    6701.6
iron         940.1   35.8   114.1    2383.2     137.7
protein     6075.8  114.1   934.9    7330.1     477.2
vitamin.A 102411.1 2383.2  7330.1 2668452.4   22063.2
vitamin.C   6701.6  137.7   477.2   22063.2    5416.3

\(s_y^2=\)

round(((0.001)^2)*var(nutrient$vitamin.A) + var(nutrient$vitamin.C)+ 2*0.001*cov(nutrient$vitamin.A,nutrient$vitamin.C),1)

[1] 5463.1

Example 3-1: Women’s Health Survey (Graphing)

L_calciu <-log(nutrient$calcium)
S_calciu <- nutrient$calcium**.5
S_S_calc <- nutrient$calcium**.25
L_iron <- log(nutrient$iron)
S_S_iron <- nutrient$iron**.25
L_prot <- log(nutrient$protein);
S_S_prot <- nutrient$protein**.25;
S_S_a = nutrient$vitamin.A**.25;
S_S_c = nutrient$vitamin.C**.25;

Univariate Cases

hist(nutrient$calcium, breaks = 20, col = "skyblue", xlab = "calcium", ylab = "Percent", main = "Distribution of Calcium")

hist(L_calciu, breaks = 20, col = "skyblue", xlab = "calcium", ylab = "Percent", main = "Distribution of L_calciu")

hist(S_calciu, breaks = 20, col = "skyblue", xlab = "calcium", ylab = "Percent", main = "Distribution of S_calciu")

hist(S_S_calc, breaks = 20, col = "skyblue", xlab = "calcium", ylab = "Percent", main = "Distribution of S_S_calc")

Bivariate Cases

plot(nutrient$iron, nutrient$calcium, main = "Scatter Plot of calcium vs iron", xlab = "iron", ylab = "calcium", col = "blue", pch = 16, xlim = c(0, 60), ylim = c(0, 3000))

plot(S_S_iron,S_S_calc, main = "Scatter Plot of S_S_calc vs S_S_iron", xlab = "S_S_iron", ylab = "S_S_calc", col = "blue", pch = 16, xlim = c(0, 3), ylim = c(1, 8))

Trivariate Cases

library(scatterplot3d)
scatterplot3d(nutrient$iron,nutrient$protein,nutrient$calcium,
              main = "3D Scatter Plot of calcium by iron and protein",
              xlab = "iron",
              ylab = "protein",
              zlab = "calcium",
              color = "blue",
              type = "h",
              pch = 16,
              xlim = c(0, 60), ylim = c(0, 250), zlim=c(7, 2900)
              )

Multivariate Cases

pairs(~S_S_calc + S_S_iron + S_S_prot + S_S_a + S_S_c, data = nutrient,
      main = "Scatterplot Matrix",
       pch = 19, 
      col = "blue",
      cex = 0.1)

Example 4-2: Q-Q Plot for Board Stiffness Data

boards <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/boardstiffness.csv", skip = 1, header = FALSE)
names(boards) <- c("x1", "x2", "x3", "x4")

pc_result <- prcomp(boards, scale. = TRUE)
scores <- pc_result$x  

mahal_distances <- mahalanobis(scores, colMeans(scores), cov(scores))

n <- nrow(boards) 
plot_data <- data.frame(
  dist2 = mahal_distances,
  prb = (seq_len(n) - 0.5) / n
)
plot_data$chiquant <- qchisq(plot_data$prb, df = 4)

# Using base R
plot(plot_data$chiquant, sort(plot_data$dist2), xlab = "chiquant", ylab = "dist2", main = "SAS Plot of the Mahbalanobis Distance", pch = 3)

## Example 4-4: Wechsler Adult Intelligence Scale

wechsler <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/wechsler.csv")

df1<-data.frame(Variable = "Information",
                Mean = mean(wechsler$info))
df2<-data.frame(Variable = "Similarities",
                Mean = mean(wechsler$sim))
df3<-data.frame(Variable = "Arithmetic",
                Mean = mean(wechsler$arith))
df4<-data.frame(Variable = "Picture Completion",
                Mean = mean(wechsler$pict))
rbind(df1,df2,df3,df4)

            Variable      Mean
1        Information 12.567568
2       Similarities  9.567568
3         Arithmetic 11.486486
4 Picture Completion  7.972973

library(dplyr)
wech <- select(wechsler, "info", "sim", "arith", "pict")
covariance_matrix <- round(cov(wech), 3)
covariance_matrix

        info    sim  arith  pict
info  11.474  9.086  6.383 2.071
sim    9.086 12.086  5.938 0.544
arith  6.383  5.938 11.090 1.791
pict   2.071  0.544  1.791 3.694

pca_result <- prcomp(~ info + sim + arith + pict, data = wechsler)

eigenvalues <- pca_result$sdev^2

round(eigenvalues,3)

[1] 26.245  6.255  3.932  1.912

pca_result <- prcomp(~ info + sim + arith + pict, data = wechsler)

eigenvectors <- pca_result$rotation

round(eigenvectors,3)

        PC1    PC2    PC3    PC4
info  0.606 -0.218 -0.461  0.611
sim   0.605 -0.496  0.320 -0.535
arith 0.505  0.795  0.335  0.035
pict  0.110  0.274 -0.757 -0.582

conf_level <- 0.95
alpha <- qchisq(conf_level, df = 4)
ellipse_lengths <- sqrt(alpha * eigenvalues)
ellipse_lengths

[1] 15.779990  7.703845  6.107496  4.258778

n <- length(eigenvalues)
volume_hyperellipse <- ((2*pi^2) /(4*gamma(2))) *(9.49^2)* sqrt(26.245*6.255*3.932*1.912)
print(volume_hyperellipse)

[1] 15613.12

Example 5-3: Simultaneous Confidence Region Multiplier

alpha <- 0.05
p <- 3
n <- 25

f_quantile <- qf(1 - alpha, df1 = p, df2 = n - p)
print(f_quantile)

[1] 3.049125

multiplier<-sqrt((p*(n-1))/(n-p) * f_quantile)
multiplier

[1] 3.158948

Example 5-4

mineral <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/mineral.csv")
df1<-data.frame(Variable = "domradius",
                Mean = mean(mineral$domradius),
                StdDev = sd(mineral$domradius))
df2<-data.frame(Variable = "domhumerus",
                Mean = mean(mineral$domhumerus),
                StdDev = sd(mineral$domhumerus))
df3<-data.frame(Variable = "domulna",
                Mean = mean(mineral$domulna),
                StdDev = sd(mineral$domulna))
rbind(df1,df2,df3)

    Variable    Mean    StdDev
1  domradius 0.84380 0.1140245
2 domhumerus 1.79268 0.2834735
3    domulna 0.70440 0.1075566

One at a time intervals

Variable<-noquote(c('domradius','domhumerus','domulna'))
means <- c(mean(mineral$domradius), mean(mineral$domhumerus),mean(mineral$domulna))
ses <- c(sd(mineral$domradius), sd(mineral$domhumerus), sd(mineral$domulna)) 
confidence_level <- 0.95
df <- n-1
t_crit <- qt((1 - confidence_level) / 2 + confidence_level, df)
ci_widths <- t_crit * (ses / sqrt(n))
lower_bounds <- means - ci_widths
upper_bounds <- means + ci_widths
cis <- data.frame(Variable,lower_bounds, upper_bounds)
cis

    Variable lower_bounds upper_bounds
1  domradius    0.7967330    0.8908670
2 domhumerus    1.6756679    1.9096921
3    domulna    0.6600028    0.7487972

Bonferroni Method

m <- length(means)
confidence_level <- 0.95
alpha_adj <- (1 - confidence_level)/m
df <- df
t_crit <- qt(1-alpha_adj/2, df)
ci_widths <- t_crit * (ses / sqrt(n))
lower_bounds <- means - ci_widths
upper_bounds <- means + ci_widths
cis <- data.frame(Variable,lower_bounds, upper_bounds)
cis

    Variable lower_bounds upper_bounds
1  domradius    0.7851084    0.9024916
2 domhumerus    1.6467682    1.9385918
3    domulna    0.6490376    0.7597624

Simultaneous Confidence Region Method

ci_widths <- multiplier * ses/sqrt(n)
lower_bounds <- means - ci_widths
upper_bounds <- means + ci_widths
cis <- data.frame(Variable,lower_bounds, upper_bounds)
cis

    Variable lower_bounds upper_bounds
1  domradius    0.7717605    0.9158395
2 domhumerus    1.6135844    1.9717756
3    domulna    0.6364469    0.7723531

Variable<-noquote(c('domhumerus', 'domradius','domulna'))
var<- c( var(mineral$domhumerus),var(mineral$domradius), var(mineral$domulna))
means <- c( mean(mineral$domhumerus),mean(mineral$domradius),mean(mineral$domulna))
t1 <- qt(1 - 0.025, n - 1)
tb <- qt(1 - 0.025 / p, n - 1)
f <- qf(0.95, p, n - p)
loone <- means - t1 * sqrt(var / n)
upone <- means + t1 * sqrt(var / n)
lobon <- means - tb * sqrt(var / n)
upbon <- means + tb * sqrt(var / n)
losim <- means - sqrt(p * (n - 1) * f * var / (n - p) / n)
upsim <- means + sqrt(p * (n - 1) * f * var / (n - p) / n)
tab <- data.frame(Variable,n,means,var,t1,tb,f,loone,upone,lobon,upbon,losim,upsim)
tab

    Variable  n   means        var       t1       tb        f     loone
1 domhumerus 25 1.79268 0.08035723 2.063899 2.573641 3.049125 1.6756679
2  domradius 25 0.84380 0.01300158 2.063899 2.573641 3.049125 0.7967330
3    domulna 25 0.70440 0.01156842 2.063899 2.573641 3.049125 0.6600028
      upone     lobon     upbon     losim     upsim
1 1.9096921 1.6467682 1.9385918 1.6135844 1.9717756
2 0.8908670 0.7851084 0.9024916 0.7717605 0.9158395
3 0.7487972 0.6490376 0.7597624 0.6364469 0.7723531

Example 5-5: Wechsler Adult Intelligence Scale

cor_matrix <- cor(wechsler[, c("info", "sim", "arith", "pict")])
cor_matrix

           info        sim     arith       pict
info  1.0000000 0.77152974 0.5658260 0.31816370
sim   0.7715297 1.00000000 0.5129451 0.08135234
arith 0.5658260 0.51294508 1.0000000 0.27987766
pict  0.3181637 0.08135234 0.2798777 1.00000000

n<-nrow(wechsler)
test_statistic <- cor(wechsler$info,wechsler$sim) * sqrt(n - 2) / sqrt(1 - cor(wechsler$info,wechsler$sim)^2)
test_statistic

[1] 7.174599

Example 5-6: Wechsler Adult Intelligence Data

fisher_transformation <- 0.5 * log((1 + cor(wechsler$info,wechsler$sim)) / (1 - cor(wechsler$info,wechsler$sim)))
fisher_transformation

[1] 1.024096

standard_error <- 1 / sqrt(n - 3)
df <- n - 3
t_crit <- qnorm(1-0.025)
margin_of_error <- t_crit * standard_error
lower_bound <- fisher_transformation - margin_of_error
upper_bound <- fisher_transformation + margin_of_error
ci_lower <- (exp(2 * lower_bound) - 1) / (exp(2 * lower_bound) + 1)
ci_upper <- (exp(2 * upper_bound) - 1) / (exp(2 * upper_bound) + 1)
print(paste("95% Confidence Interval:", ci_lower, "to", ci_upper))

[1] "95% Confidence Interval: 0.596673274426047 to 0.876445796058012"

Example 6-2: Wechsler Adult Intelligence Scale

library(ppcor)

Warning: package 'ppcor' was built under R version 4.3.3

fit_lm <- lm(cbind(info, sim) ~ arith + pict, data = wechsler)
residuals <- residuals(fit_lm)
partial_cor <- pcor(residuals)
r<-partial_cor$estimate[1,2]
r

[1] 0.7118787

Example 6-3: Wechsler Adult Intelligence Data

n<-nrow(wechsler)
test_statistic <- r * sqrt(n - 2-2) / sqrt(1 - r^2)
test_statistic

[1] 5.822892

fisher_transformation <- 0.5 * log((1 + r) / (1 - r))
fisher_transformation

[1] 0.8909825

standard_error <- 1 / sqrt(n - 3 -2)
df <- n - 3-2
t_crit <- qnorm(1-0.025)
margin_of_error <- t_crit * standard_error
lower_bound <- fisher_transformation - margin_of_error
upper_bound <- fisher_transformation + margin_of_error
ci_lower <- (exp(2 * lower_bound) - 1) / (exp(2 * lower_bound) + 1)
ci_upper <- (exp(2 * upper_bound) - 1) / (exp(2 * upper_bound) + 1)
print(paste("95% Confidence Interval:", ci_lower, "to", ci_upper))

[1] "95% Confidence Interval: 0.496391667333479 to 0.844729175475622"

Example 7-1: Woman’s Survey Data (Hotelling’s \(T^2\) Test)

library(MASS)


hotel <- function(x, mu0) {
  one <- matrix(1, nrow = nrow(x), ncol = 1)
  ident <- diag(nrow(x))
  ybar <- t(x) %*% one / nrow(x)
  s <- t(x) %*% (ident - one %*% t(one) / nrow(x)) %*% x / (nrow(x) - 1)
  t2 <- nrow(x) * t(ybar - mu0) %*% solve(s) %*% (ybar - mu0)
  f <- (nrow(x) - ncol(x)) * t2 / ncol(x) / (nrow(x) - 1)
  df1 <- ncol(x)
  df2 <- nrow(x) - ncol(x)
  p <- 1 - pf(f, df1, df2)
  result <- list(t2 = t2, f = f, df1 = df1, df2 = df2, p = p)
  return(result)
}
mu0 <- c(1000, 15, 60, 800, 75)

x <- as.matrix(nutrient[, c("calcium", "iron", "protein", "vitamin.A", "vitamin.C")])
result <- hotel(x, mu0)

rbind(result$t2, result$f, result$df1, result$df2, result$p)

          [,1]
[1,] 1758.5413
[2,]  349.7968
[3,]    5.0000
[4,]  732.0000
[5,]    0.0000

Example 7-2: Women’s Health Survey:

calcium <- nutrient$calcium
conf_interval_cal <- t.test(calcium)$conf.int
conf_interval_cal

[1] 595.3201 652.7784
attr(,"conf.level")
[1] 0.95

variable<-noquote(c("Calcium","Iron","Protein","Vitamin A","Vitamin C"))
calcium <- nutrient$calcium
iron<- nutrient$iron
protein<-nutrient$protein
vitamin.A<-nutrient$vitamin.A
vitamin.C<-nutrient$vitamin.C
conf_interval_cal <- t.test(calcium)$conf.int
conf_interval_ir<-t.test(iron)$conf.int
conf_interval_pr<-t.test(protein)$conf.int
conf_interval_a<-t.test(vitamin.A)$conf.int
conf_interval_c<-t.test(vitamin.C)$conf.int
tab<-data.frame(conf_interval_cal,conf_interval_ir,conf_interval_pr,conf_interval_a,conf_interval_c)
tab

  conf_interval_cal conf_interval_ir conf_interval_pr conf_interval_a
1          595.3201         10.69715         63.59235        721.5057
2          652.7784         11.56265         68.01453        957.7650
  conf_interval_c
1         73.6064
2         84.2505

Example 7-3 and 7-4: Women’s Health Survey

Simultaneous confidence intervals are given by the columns for “losim” and “upsim” while for Bonferroni intervals are given by “lobon” and “upbon”

Variable<-noquote(c("Calcium","Iron","Protein","Vitamin A","Vitamin C"))
var<-c(var(calcium),var(iron),var(protein),var(vitamin.A),var(vitamin.C))
means <-c(mean(calcium),mean(iron),mean(protein),mean(vitamin.A),mean(vitamin.C))
p<-5
n<-nrow(nutrient)
t1 <- qt(1 - 0.025, n - 1)
tb <- qt(1 - 0.025 / p, n - 1)
f <- qf(0.95, p, n - p)
loone <- means - t1 * sqrt(var / n)
upone <- means + t1 * sqrt(var / n)
lobon <- means - tb * sqrt(var / n)
upbon <- means + tb * sqrt(var / n)
losim <- means - sqrt(p * (n - 1) * f * var / (n - p) / n)
upsim <- means + sqrt(p * (n - 1) * f * var / (n - p) / n)
tab <- data.frame(Variable,n,means,var,t1,tb,f,loone,upone,lobon,upbon,losim,upsim)
tab

   Variable   n     means          var       t1       tb       f     loone
1   Calcium 737 624.04925 1.578294e+05 1.963192 2.582526 2.22634 595.32008
2      Iron 737  11.12990 3.581054e+01 1.963192 2.582526 2.22634  10.69715
3   Protein 737  65.80344 9.348769e+02 1.963192 2.582526 2.22634  63.59235
4 Vitamin A 737 839.63535 2.668452e+06 1.963192 2.582526 2.22634 721.50572
5 Vitamin C 737  78.92845 5.416264e+03 1.963192 2.582526 2.22634  73.60640
      upone     lobon     upbon     losim      upsim
1 652.77843 586.25681 661.84169 575.09118  673.00733
2  11.56265  10.56063  11.69917  10.39244   11.86735
3  68.01453  62.89481  68.71207  62.03547   69.57141
4 957.76498 684.23906 995.03163 638.32780 1040.94289
5  84.25050  71.92743  85.92946  69.85901   87.99788

Example 7-9: Spouse Data (Paired Hotelling’s)

library(MASS)
hotel <- function(x) {
  mu0 <- rep(0, ncol(x))
  one <- rep(1, nrow(x))
  ident <- diag(nrow(x))
  ybar <- t(x) %*% one / nrow(x)
  s <- t(x) %*% ((ident - one %*% t(one) / nrow(x)) %*% x) / (nrow(x) - 1)
  
  print(mu0)
  print(ybar)
  print(s)
  
  t2 <- nrow(x) * t(ybar - mu0) %*% solve(s) %*% (ybar - mu0)
  f <- (nrow(x) - ncol(x)) * t2 / ncol(x) / (nrow(x) - 1)
  df1 <- ncol(x)
  df2 <- nrow(x) - ncol(x)
  p <- 1 - pf(f, df1, df2)
  
  print(t2)
  print(f)
  print(df1)
  print(df2)
  print(p)
}
spouse <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/spouse.csv")
d1<- spouse$h1 - spouse$w1
d2<- spouse$h2 - spouse$w2
d3<- spouse$h3 - spouse$w3
d4<- spouse$h4 - spouse$w4
diffs <- data.frame(d1, d2, d3, d4)
x <- as.matrix(diffs)
hotel(x)

[1] 0 0 0 0
          [,1]
d1  0.06666667
d2 -0.13333333
d3 -0.30000000
d4 -0.13333333
            d1          d2         d3          d4
d1  0.82298851  0.07816092 -0.0137931 -0.05977011
d2  0.07816092  0.80919540 -0.2137931 -0.15632184
d3 -0.01379310 -0.21379310  0.5620690  0.51034483
d4 -0.05977011 -0.15632184  0.5103448  0.60229885
         [,1]
[1,] 13.12784
         [,1]
[1,] 2.942447
[1] 4
[1] 26
           [,1]
[1,] 0.03936914

Example 7-10: Spouse Data (Bonferroni CI)

In this output losim and upsim give the lower and upper bounds for the simultaneous intervals, and lobon and upbon give the lower and upper bounds for the Bonferroni interval

Variable<-noquote(c("d1","d2","d3","d4"))
var<-c(var(d1),var(d2),var(d3),var(d4))
means <-c(mean(d1),mean(d2),mean(d3),mean(d4))
p<-4
n<-nrow(diffs)
t1 <- qt(1 - 0.025, n - 1)
tb <- qt(1 - 0.025 / p, n - 1)
f <- qf(0.95, p, n - p)
loone <- means - t1 * sqrt(var / n)
upone <- means + t1 * sqrt(var / n)
lobon <- means - tb * sqrt(var / n)
upbon <- means + tb * sqrt(var / n)
losim <- means - sqrt(p * (n - 1) * f * var / (n - p) / n)
upsim <- means + sqrt(p * (n - 1) * f * var / (n - p) / n)
tab <- data.frame(Variable,n,means,var,t1,tb,f,loone,upone,lobon,upbon,losim,upsim)
tab

  Variable  n       means       var      t1       tb        f      loone
1       d1 30  0.06666667 0.8229885 2.04523 2.663196 2.742594 -0.2720826
2       d2 30 -0.13333333 0.8091954 2.04523 2.663196 2.742594 -0.4692319
3       d3 30 -0.30000000 0.5620690 2.04523 2.663196 2.742594 -0.5799473
4       d4 30 -0.13333333 0.6022989 2.04523 2.663196 2.742594 -0.4231261
        upone      lobon      upbon      losim     upsim
1  0.40541591 -0.3744357 0.50776899 -0.5127078 0.6460411
2  0.20256524 -0.5707236 0.30405698 -0.7078322 0.4411655
3 -0.02005272 -0.6645333 0.06453334 -0.7788034 0.1788034
4  0.15645938 -0.5106869 0.24402025 -0.6289758 0.3623091

Example 7-11: Spouse Data (Question 2)

library(MASS)
hotel <- function(x) {
  mu0 <- rep(0, ncol(x))
  one <- rep(1, nrow(x))
  ident <- diag(nrow(x))
  ybar <- t(x) %*% one / nrow(x)
  s <- t(x) %*% ((ident - one %*% t(one) / nrow(x)) %*% x) / (nrow(x) - 1)
  
  print(mu0)
  print(ybar)

  print(s)
  
  t2 <- nrow(x) * t(ybar - mu0) %*% solve(s) %*% (ybar - mu0)
  f <- (nrow(x) - ncol(x)) * t2 / ncol(x) / (nrow(x) - 1)
  df1 <- ncol(x)
  df2 <- nrow(x) - ncol(x)
  p <- 1 - pf(f, df1, df2)

  print(t2)
  print(f)
  print(df1)
  print(df2)
  print(p)
}
spouse <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/spouse.csv")
d1<- spouse$h1 - spouse$w2
d2<- spouse$h2 - spouse$w1
d3<- spouse$h3 - spouse$w4
d4<- spouse$h4 - spouse$w3
diffs <- data.frame(d1, d2, d3, d4)
x <- as.matrix(diffs)
hotel(x)

[1] 0 0 0 0
         [,1]
d1 -0.2000000
d2  0.1333333
d3 -0.2000000
d4 -0.2333333
           d1          d2          d3          d4
d1  0.7862069  0.13103448 -0.14482759 -0.18620690
d2  0.1310345  0.74022989 -0.04137931 -0.07126437
d3 -0.1448276 -0.04137931  0.51034483  0.50344828
d4 -0.1862069 -0.07126437  0.50344828  0.66781609
         [,1]
[1,] 6.426621
        [,1]
[1,] 1.44045
[1] 4
[1] 26
          [,1]
[1,] 0.2488886

Example 7-13: Swiss Banknotes (Two-Sample Hotelling’s)

swiss <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/swiss3.csv")
swissgenuine<-swiss%>% 
  filter(type =="real")
swisscounterfeit<-swiss%>% 
  filter(type =="fake")

df1<-data.frame(Variable = "Length",
                Genuine = mean(swissgenuine$length),
                Counterfeit=mean(swisscounterfeit$length))
df2<-data.frame(Variable = "Left Width",
                Genuine = mean(swissgenuine$left),
                Counterfeit=mean(swisscounterfeit$left)) 
df3<-data.frame(Variable = "Right Width",
                Genuine = mean(swissgenuine$right),
                Counterfeit=mean(swisscounterfeit$right)) 
df4<-data.frame(Variable = "Bottom Margin",
                Genuine = mean(swissgenuine$bottom),
                Counterfeit=mean(swisscounterfeit$bottom)) 
df5<-data.frame(Variable = "Top Margin",
                Genuine = mean(swissgenuine$top),
                Counterfeit=mean(swisscounterfeit$top)) 
df6<-data.frame(Variable = "Diagonal",
                Genuine = mean(swissgenuine$diag),
                Counterfeit= mean(swisscounterfeit$diag)) 

rbind(df1,df2,df3,df4,df5,df6)

       Variable Genuine Counterfeit
1        Length 214.969     214.823
2    Left Width 129.943     130.300
3   Right Width 129.720     130.193
4 Bottom Margin   8.305      10.530
5    Top Margin  10.168      11.133
6      Diagonal 141.517     139.450

library(MASS)

hotel2 <- function(x1, x2) {
  n1 <- nrow(x1)
  n2 <- nrow(x2)
  k <- ncol(x1)
  one1 <- matrix(1, nrow = n1, ncol = 1)
  one2 <- matrix(1, nrow = n2, ncol = 1)
  ident1 <- diag(n1)
  ident2 <- diag(n2)
  ybar1 <- t(x1) %*% one1 / n1
  s1 <- t(x1) %*% ((ident1 - one1 %*% t(one1) / n1) %*% x1) / (n1 - 1.0)
  print(round(s1,4)) ##sample variance-covariance matrix for the real or genuine notes
  
  ybar2 <- t(x2) %*% one2 / n2
  s2 <- t(x2) %*% ((ident2 - one2 %*% t(one2) / n2) %*% x2) / (n2 - 1.0)
  print(round(s2,4)) ##sample variance-covariance for the second sample of notes
  
  spool <- ((n1 - 1.0) * s1 + (n2 - 1.0) * s2) / (n1 + n2 - 2.0)
  print(round(spool,4)) ##pooled variance-covariance matrix for the two samples.
  
  t2 <- t(ybar1 - ybar2) %*% solve(spool * (1 / n1 + 1 / n2)) %*% (ybar1 - ybar2)
  f <- (n1 + n2 - k - 1) * t2 / k / (n1 + n2 - 2)
  df1 <- k
  df2 <- n1 + n2 - k - 1
  p <- 1 - pf(f, df1, df2)
  print(c("t2" = t2, "f" = f, "df1" = df1, "df2" = df2, "p" = p))
}

x1 <- as.matrix(swiss[swiss$type == "real", c("length", "left", "right", "bottom", "top", "diag")])
x2 <- as.matrix(swiss[swiss$type == "fake", c("length", "left", "right", "bottom", "top", "diag")])

hotel2(x1, x2)

       length    left   right  bottom     top    diag
length 0.1502  0.0580  0.0573  0.0571  0.0145  0.0055
left   0.0580  0.1326  0.0859  0.0567  0.0491 -0.0431
right  0.0573  0.0859  0.1263  0.0582  0.0306 -0.0238
bottom 0.0571  0.0567  0.0582  0.4132 -0.2635 -0.0002
top    0.0145  0.0491  0.0306 -0.2635  0.4212 -0.0753
diag   0.0055 -0.0431 -0.0238 -0.0002 -0.0753  0.1998
        length    left   right  bottom     top    diag
length  0.1240  0.0315  0.0240 -0.1006  0.0194  0.0116
left    0.0315  0.0651  0.0468 -0.0240 -0.0119 -0.0051
right   0.0240  0.0468  0.0889 -0.0186  0.0001  0.0342
bottom -0.1006 -0.0240 -0.0186  1.2813 -0.4902  0.2385
top     0.0194 -0.0119  0.0001 -0.4902  0.4045 -0.0221
diag    0.0116 -0.0051  0.0342  0.2385 -0.0221  0.3112
        length    left  right  bottom     top    diag
length  0.1371  0.0448 0.0406 -0.0217  0.0169  0.0085
left    0.0448  0.0988 0.0663  0.0163  0.0186 -0.0241
right   0.0406  0.0663 0.1076  0.0198  0.0154  0.0052
bottom -0.0217  0.0163 0.0198  0.8473 -0.3768  0.1191
top     0.0169  0.0186 0.0154 -0.3768  0.4128 -0.0487
diag    0.0085 -0.0241 0.0052  0.1191 -0.0487  0.2555
       t2         f       df1       df2         p 
2412.4507  391.9217    6.0000  193.0000    0.0000

7-14 7.2.3

Example 7-15: Swiss Banknotes

The bounds of the simultaneous confidence intervals are given in columns losim and upsim. The Bonferroni Corrected (1 - α) x 100% Confidence Intervals are given in columns lobon and upbon

library(dplyr)
library(tidyverse)
swiss <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/swiss3.csv")
pop1 <- swiss %>%
  filter(type == "real") %>%
  pivot_longer(cols = c(length, left, right, bottom, top, diag),
               names_to = "variable",
               values_to = "x")%>%
  group_by(variable) %>%
  summarize(n1 = n(),
            xbar1 = mean(x),
            s21 = var(x))
fake <- swiss %>%
  filter(type == "fake") %>%
  pivot_longer(cols = c(length, left, right, bottom, top, diag),
               names_to = "variable",
               values_to = "x")
pop2 <- fake %>%
  group_by(variable) %>%
  summarize(n2 = n(),
            xbar2 = mean(x),
            s22 = var(x))
combine <- merge(pop1, pop2, by = "variable")
p <- 6
f <- qf(0.95, p, combine$n1 + combine$n2 - p - 1)
t <- qt(1 - 0.025 / p, combine$n1 + combine$n2 - 2)
sp <- ((combine$n1 - 1) * combine$s21 + (combine$n2 - 1) * combine$s22) / (combine$n1 + combine$n2 - 2)
combine$losim <- combine$xbar1 - combine$xbar2 - sqrt(p * (combine$n1 + combine$n2 - 2) * f * (1/combine$n1 + 1/combine$n2) * sp / (combine$n1 + combine$n2 - p - 1))
combine$upsim <- combine$xbar1 - combine$xbar2 + sqrt(p * (combine$n1 + combine$n2 - 2) * f * (1/combine$n1 + 1/combine$n2) * sp / (combine$n1 + combine$n2 - p - 1))
combine$lobon <- combine$xbar1 - combine$xbar2 - t * sqrt((1/combine$n1 + 1/combine$n2) * sp)
combine$upbon <- combine$xbar1 - combine$xbar2 + t * sqrt((1/combine$n1 + 1/combine$n2) * sp)
cbind(combine, f,t,sp)

  variable  n1   xbar1       s21  n2   xbar2        s22       losim      upsim
1   bottom 100   8.305 0.4132071 100  10.530 1.28131313 -2.69809429 -1.7519057
2     diag 100 141.517 0.1998091 100 139.450 0.31121212  1.80719722  2.3268028
3     left 100 129.943 0.1325768 100 130.300 0.06505051 -0.51856518 -0.1954348
4   length 100 214.969 0.1502414 100 214.823 0.12401111 -0.04432667  0.3363267
5    right 100 129.720 0.1262626 100 130.193 0.08894040 -0.64159649 -0.3044035
6      top 100  10.168 0.4211879 100  11.133 0.40445556 -1.29523310 -0.6347669
         lobon      upbon        f        t         sp
1 -2.571916424 -1.8780836 2.145801 2.665026 0.84726010
2  1.876488608  2.2575114 2.145801 2.665026 0.25551061
3 -0.475474511 -0.2385255 2.145801 2.665026 0.09881364
4  0.006434909  0.2855651 2.145801 2.665026 0.13712626
5 -0.596630515 -0.3493695 2.145801 2.665026 0.10760152
6 -1.207157404 -0.7228426 2.145801 2.665026 0.41282172

7.2.5 - Profile Plots

library(dplyr)

swiss <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/swiss3.csv")  # Skip the first row

swiss <- swiss %>%
  mutate(
    length = length - 215.0,
    left = left - 130.0,
    right = right - 130.0,
    bottom = bottom - 9.0,
    top = top - 10.0,
    diag = diag - 141.354
  ) %>%
  pivot_longer(
    cols = c(length, left, right, bottom, top, diag),
    names_to = "variable",
    values_to = "x"
  )

swiss <- swiss %>%
  arrange(type, variable)
a <- swiss %>%
  group_by(type, variable) %>%
  summarize(xbar = mean(x))

library(ggplot2)
# Create the line plot
ggplot(a, aes(x = variable, y = xbar, group = type, color = type)) +
  geom_line() +
  labs(title = "Profile Plots", x = "Variable", y = "Mean") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

7.2.6 - Model Assumptions and Diagnostics Assumptions

  library(biotools)

Warning: package 'biotools' was built under R version 4.3.3

  swiss <- read_csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/swiss3.csv")
  result <- boxM(data = swiss[, -1], grouping = swiss$type)
  result


    Box's M-test for Homogeneity of Covariance Matrices

data:  swiss[, -1]
Chi-Sq (approx.) = 121.9, df = 21, p-value = 3.198e-16

7.2.7 - Testing for Equality of Mean Vectors when \(\Sigma_1 \neq \Sigma_2\)

library(Hotelling)
swiss <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/swiss3.csv")

x1 <- subset(swiss, type == "real", select = c(length, left, right, bottom, top, diag))
x2 <- subset(swiss, type == "fake", select = c(length, left, right, bottom, top, diag))

n1 <- nrow(x1)
n2 <- nrow(x2)
k <- ncol(x1)
ybar1 <- colMeans(x1)
ybar2 <- colMeans(x2)
s1 <- cov(x1)
s2 <- cov(x2)
st <- s1 / n1 + s2 / n2
t2 <- t(ybar1 - ybar2) %*% solve(st) %*% (ybar1 - ybar2)
df1 <- k
df2 <- n1 + n2 - k - 1

p_value <- 1 - pf(((n1 + n2 - k - 1) * t2) / (k * (n1 + n2 - 2)), df1, df2)

print(paste("Test statistic:", t2))

[1] "Test statistic: 2412.45068552327"

print(paste("Degrees of freedom:", df1, df2))

[1] "Degrees of freedom: 6 193"

print(paste("p-value:", p_value))

[1] "p-value: 0"

Example 8-1 Pottery Data (MANOVA)

pottery<- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/pottery.csv")
models <- list()
resids <- data.frame(site = pottery$site)

for(response in c("al", "fe", "mg", "ca", "na")) {

  formula <- as.formula(paste(response, "~ site"))
  
  # Fit the model
  model <- lm(formula, data = pottery)
  
  models[[response]] <- model
  
  resids[[paste0("r", response)]] <- residuals(model)
}

print(head(resids))

  site       ral          rfe        rmg          rca         rna
1    L  1.835714  0.627857143 -0.5264286 -0.052142857  0.25928571
2    L  1.235714  0.707857143 -1.3964286 -0.082142857 -0.08071429
3    L  2.035714  0.717857143 -0.9464286 -0.072142857 -0.05071429
4    L -1.064286 -0.002142857  0.8135714 -0.042142857 -0.11071429
5    L  1.235714  0.687857143  0.5135714 -0.002142857 -0.05071429
6    L -1.664286 -0.112142857 -1.3564286 -0.032142857 -0.03071429

for(variable in c("al", "fe", "mg", "ca", "na")) {

  hist(pottery[[variable]], 
       main = paste("Histogram of", variable), 
       xlab = variable, 
       col = "lightblue", 
       border = "darkblue")
}

pairs(~al + fe + mg + ca + na, data = pottery,
      main = "Scatterplot Matrix",
       pch = 19, 
      col = "blue",
      cex = 1)

Example 8-2: Pottery Data

p <- 5  
g <- length(unique(pottery$site))  
df <- 0.5 * p * (p + 1) * (g - 1)

cov_matrices <- list()
for (site in unique(pottery$site)) {
  cov_matrices[[site]] <- cov(subset(pottery, site == site)[, c("al", "fe", "mg", "ca", "na")])
}
l_statistic <- 0
for (i in 1:(g - 1)) {
  for (j in (i + 1):g) {
    l_statistic <- l_statistic + (sum(diag((cov_matrices[[i]] + cov_matrices[[j]]) %*% solve(cov_matrices[[i]] + cov_matrices[[j]]))) - p)
  }
}
l_statistic <- l_statistic * 2


alpha <- 0.05  
critical_value <- qchisq(1 - alpha, df)

cat("Test statistic:", l_statistic, "\n")

Test statistic: -2.131628e-14

cat("Degrees of freedom:", df, "\n")

Degrees of freedom: 45

cat("Critical value (", 1 - alpha, "):", critical_value, "\n")

Critical value ( 0.95 ): 61.65623

if (l_statistic > critical_value) {
  cat("Reject the null hypothesis: Covariance matrices are not homogeneous.\n")
} else {
  cat("Fail to reject the null hypothesis: Covariance matrices are homogeneous.\n")
}

Fail to reject the null hypothesis: Covariance matrices are homogeneous.

Example 8-3: Pottery Data (MANOVA)

outcome <- cbind(pottery$al, pottery$fe, pottery$mg, pottery$ca, pottery$na)
model<- manova(outcome ~ site, data = pottery)
summary(model, intercept = TRUE, test = "Wilks")

            Df     Wilks approx F num Df den Df    Pr(>F)    
(Intercept)  1 0.0068354   523.07      5 18.000 < 2.2e-16 ***
site         3 0.0123009    13.09     15 50.091  1.84e-12 ***
Residuals   22                                               
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

pottery <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/pottery.csv", stringsAsFactors = TRUE)
library(tidyverse)

pottery_long <- pottery %>%
  gather(key = "chemical", value = "amount", al:na)
means <- pottery_long %>%
  group_by(site, chemical) %>%
  summarize(mean = mean(amount))
library(ggplot2)

ggplot(means, aes(x = chemical, y = mean, group = site, color = site)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  labs(title = "Profile Plot for Pottery Data", x = "Chemical", y = "Mean Amount") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Example 8-4: Pottery Data (ANOVA)

# For specific univariate ANOVA results
al<-summary(lm(al ~ site, data = pottery))
fe<-summary(lm(fe ~ site, data = pottery))
mg<-summary(lm(mg ~ site, data = pottery))
ca<-summary(lm(ca ~ site, data = pottery))
na<-summary(lm(na ~ site, data = pottery))

df1<-data.frame(Variable = "Al",
                F_value = al$fstatistic[1])
df2<-data.frame(Variable = "Fe",
                F_value  = fe$fstatistic[1])
df3<-data.frame(Variable = "Mg",
                F_value  = mg$fstatistic[1])
df4<-data.frame(Variable = "Ca",
                F_value  = ca$fstatistic[1])
df5<-data.frame(Variable = "Na",
                F_value  = na$fstatistic[1])
rbind(df1,df2,df3,df4,df5)

       Variable   F_value
value        Al 26.669259
value1       Fe 89.882725
value2       Mg 49.120088
value3       Ca 29.156699
value4       Na  9.502604

Example 8-8: Pottery Data

library(car)

pottery <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/pottery.csv")
pottery$site <- factor(pottery$site)

site_contrasts <- matrix(c(8, -2, 8, -14,    # C+L-A-I
                            1, 0, -1, 0,      # A vs I
                            0, 1, 0, -1),     # C vs L
                          nrow = 3, byrow = TRUE)

rownames(site_contrasts) <- c("C+L-A-I", "A vs I", "C vs L")
colnames(site_contrasts) <- c("C", "L", "A", "I")

print(site_contrasts)

        C  L  A   I
C+L-A-I 8 -2  8 -14
A vs I  1  0 -1   0
C vs L  0  1  0  -1