Exercises from Units 1-8

knitr::opts_chunk$set(echo = TRUE, comment = "", message = FALSE)

Lesson 1: Measures of Central Tendency, Dispersion and Association

Warning: package 'readxl' was built under R version 4.3.3

Warning: package 'dplyr' was built under R version 4.3.3

# A tibble: 737 × 5
   calcium  iron protein `vitamin A` `vitamin C`
     <dbl> <dbl>   <dbl>       <dbl>       <dbl>
 1    522. 10.2     42.6       349.        54.1 
 2    343.  4.11    67.8       267.        24.8 
 3    858. 13.7     59.9       668.       155.  
 4    576. 13.2     42.2       792.       225.  
 5   1928. 18.9    111.        740.        81.0 
 6    608.  6.8     45.8       166.        13.0 
 7   1046. 18.4    116.       1120.       159.  
 8    181. 12.8     64.2        78.5       26.9 
 9    327.  8.69    49.2       568.        50.0 
10    383. 13.7    104.       1029.         8.40
# ℹ 727 more rows

1.4

colnames(nutrient) <- c("calcium", "iron", "protein", "vitamin A", "vitamin C")
calculate_summary <- function(x) {
  data.frame(
    N = sum(!is.na(x)),
    Mean = mean(x, na.rm = TRUE),
    SD = sd(x, na.rm = TRUE),
    Minimum = min(x, na.rm = TRUE),
    Maximum = max(x, na.rm = TRUE)
  )
}

summary_stats <- lapply(nutrient, calculate_summary)
summary_table <- do.call(rbind, summary_stats)
kable(summary_table, caption = "Summary Statistics", format = "html", col.names = c("Variable", "N", "Mean", "SD", "Minimum", "Maximum")) %>%
  kable_styling()

Summary Statistics
Variable	N	Mean	SD	Minimum	Maximum
calcium	737	624.04925	397.277540	7.44	2866.440
iron	737	11.12990	5.984191	0.00	58.668
protein	737	65.80344	30.575756	0.00	251.012
vitamin A	737	839.63535	1633.539828	0.00	34434.270
vitamin C	737	78.92845	73.595272	0.00	433.339

colnames(nutrient) <- c("calcium", "iron", "protein", "vitamin A", "vitamin C")


calculate_summary <- function(x) {
  data.frame(
    Mean = sprintf("%.1f", mean(x, na.rm = TRUE)),
    Standard_Deviation = sprintf("%.1f", sd(x, na.rm = TRUE))
  )
}

summary_stats <- lapply(nutrient, calculate_summary)

summary_table <- do.call(rbind, summary_stats)

kable(summary_table, caption = "Summary Statistics", format = "html", col.names = c("Variable", "Mean", "Standard Deviation")) %>%
  kable_styling()

Summary Statistics
Variable	Mean	Standard Deviation
calcium	624.0	397.3
iron	11.1	6.0
protein	65.8	30.6
vitamin A	839.6	1633.5
vitamin C	78.9	73.6

colnames(nutrient) <- c("calcium", "iron", "protein", "vitamin A", "vitamin C")

summary_table <- summary(nutrient[, c("calcium", "iron", "protein", "vitamin A", "vitamin C")])
summary_table <- as.data.frame(t(summary_table))  
kable(summary_table, caption = "Descriptive Statistics", format = "html", col.names = NA) %>%
  kable_styling()

Descriptive Statistics
Var1	Var2	Freq
calcium		Min. : 7.44
iron		Min. : 0.00
protein		Min. : 0.00
vitamin A		Min. : 0.0
vitamin C		Min. : 0.00
calcium		1st Qu.: 326.54
iron		1st Qu.: 7.46
protein		1st Qu.: 45.52
vitamin A		1st Qu.: 276.3
vitamin C		1st Qu.: 24.94
calcium		Median : 548.29
iron		Median :10.03
protein		Median : 61.07
vitamin A		Median : 524.0
vitamin C		Median : 53.59
calcium		Mean : 624.05
iron		Mean :11.13
protein		Mean : 65.80
vitamin A		Mean : 839.6
vitamin C		Mean : 78.93
calcium		3rd Qu.: 826.51
iron		3rd Qu.:13.71
protein		3rd Qu.: 80.77
vitamin A		3rd Qu.: 943.2
vitamin C		3rd Qu.:108.67
calcium		Max. :2866.44
iron		Max. :58.67
protein		Max. :251.01
vitamin A		Max. :34434.3
vitamin C		Max. :433.34

cor_matrix <- cor(nutrient[, c("calcium", "iron", "protein", "vitamin A", "vitamin C")], method = "pearson")
kable(cor_matrix, caption = "Pearson Correlation Matrix", format = "html", col.names = NA) %>%
  kable_styling()

Pearson Correlation Matrix
	calcium	iron	protein	vitamin A	vitamin C
calcium	1.0000000	0.3954301	0.5001882	0.1578060	0.2292111
iron	0.3954301	1.0000000	0.6233662	0.2437905	0.3126009
protein	0.5001882	0.6233662	1.0000000	0.1467574	0.2120670
vitamin A	0.1578060	0.2437905	0.1467574	1.0000000	0.1835227
vitamin C	0.2292111	0.3126009	0.2120670	0.1835227	1.0000000

cov_matrix <- cov(nutrient[, c("calcium", "iron", "protein", "vitamin A", "vitamin C")])
kable(cov_matrix, caption = "The variance-covariance matrix is also copied into the matrix below.", format = "html", col.names = NA) %>%
  kable_styling()

The variance-covariance matrix is also copied into the matrix below.
	calcium	iron	protein	vitamin A	vitamin C
calcium	157829.4439	940.08944	6075.8163	102411.127	6701.6160
iron	940.0894	35.81054	114.0580	2383.153	137.6720
protein	6075.8163	114.05803	934.8769	7330.052	477.1998
vitamin A	102411.1266	2383.15341	7330.0515	2668452.371	22063.2486
vitamin C	6701.6160	137.67199	477.1998	22063.249	5416.2641

sample_cor_matrix <- cor(nutrient[, c("calcium", "iron", "protein", "vitamin A", "vitamin C")], method = "pearson", use = "complete.obs")
sample_cor_matrix <- round(sample_cor_matrix, 3)  # Limit to 3 decimal places
sample_cor_matrix <- data.frame(sample_cor_matrix)


kable(sample_cor_matrix, caption = "Sample Correlation Matrix", format = "html", col.names = NA) %>%
  kable_styling()

Sample Correlation Matrix
	calcium	iron	protein	vitamin.A	vitamin.C
calcium	1.000	0.395	0.500	0.158	0.229
iron	0.395	1.000	0.623	0.244	0.313
protein	0.500	0.623	1.000	0.147	0.212
vitamin A	0.158	0.244	0.147	1.000	0.184
vitamin C	0.229	0.313	0.212	0.184	1.000

1.5

variances <- apply(nutrient[, c("calcium", "iron", "protein", "vitamin A", "vitamin C")], 2, var)

total_variance <- sum(variances)

variance_table <- data.frame(
  Variable = names(variances),
  Variance = round(variances, 3)
)

table <- rbind(variance_table, c("Total", round(total_variance, 3)))
table

           Variable    Variance
calcium     calcium  157829.444
iron           iron      35.811
protein     protein     934.877
vitamin A vitamin A 2668452.371
vitamin C vitamin C    5416.264
6             Total 2832668.766

1.6

cov_matrix <- cov(nutrient)
generalized_variance <- det(cov_matrix)
cat("Sample Variance-Covariance Matrix:\n")

Sample Variance-Covariance Matrix:

print(round(cov_matrix, 5))

              calcium       iron   protein   vitamin A  vitamin C
calcium   157829.4439  940.08944 6075.8162  102411.127  6701.6160
iron         940.0894   35.81054  114.0580    2383.153   137.6720
protein     6075.8162  114.05803  934.8769    7330.052   477.1998
vitamin A 102411.1266 2383.15341 7330.0515 2668452.371 22063.2486
vitamin C   6701.6160  137.67199  477.1998   22063.249  5416.2641

cat("\nGeneralized Variance:\n")


Generalized Variance:

print(round(generalized_variance, 3))

[1] 2.831042e+19

Lesson 2: Linear Combinations of Random Variables

Section 2.2

colnames(nutrient) <- c("calcium", "iron", "protein", "vitamin A", "vitamin C")


calculate_summary <- function(x) {
  data.frame(
    Mean = sprintf("%.1f", mean(x, na.rm = TRUE))
  )
}

summary_stats <- lapply(nutrient, calculate_summary)

summary_table <- do.call(rbind, summary_stats)

kable(summary_table, caption = "Population Mean", format = "html", col.names = c("Variable", "Mean")) %>%
  kable_styling()

Population Mean
Variable	Mean
calcium	624.0
iron	11.1
protein	65.8
vitamin A	839.6
vitamin C	78.9

Example 2.3

colnames(nutrient) <- c("calcium", "iron", "protein", "vitamin A", "vitamin C")

summary_table <- summary(nutrient[, c("calcium", "iron", "protein", "vitamin A", "vitamin C")])
summary_table <- as.data.frame(t(summary_table))  
kable(summary_table, caption = "Descriptive Statistics", format = "html", col.names = NA) %>%
  kable_styling()

Descriptive Statistics
Var1	Var2	Freq
calcium		Min. : 7.44
iron		Min. : 0.00
protein		Min. : 0.00
vitamin A		Min. : 0.0
vitamin C		Min. : 0.00
calcium		1st Qu.: 326.54
iron		1st Qu.: 7.46
protein		1st Qu.: 45.52
vitamin A		1st Qu.: 276.3
vitamin C		1st Qu.: 24.94
calcium		Median : 548.29
iron		Median :10.03
protein		Median : 61.07
vitamin A		Median : 524.0
vitamin C		Median : 53.59
calcium		Mean : 624.05
iron		Mean :11.13
protein		Mean : 65.80
vitamin A		Mean : 839.6
vitamin C		Mean : 78.93
calcium		3rd Qu.: 826.51
iron		3rd Qu.:13.71
protein		3rd Qu.: 80.77
vitamin A		3rd Qu.: 943.2
vitamin C		3rd Qu.:108.67
calcium		Max. :2866.44
iron		Max. :58.67
protein		Max. :251.01
vitamin A		Max. :34434.3
vitamin C		Max. :433.34

cor_matrix <- cor(nutrient[, c("calcium", "iron", "protein", "vitamin A", "vitamin C")], method = "pearson")
kable(cor_matrix, caption = "Pearson Correlation Matrix", format = "html", col.names = NA) %>%
  kable_styling()

Pearson Correlation Matrix
	calcium	iron	protein	vitamin A	vitamin C
calcium	1.0000000	0.3954301	0.5001882	0.1578060	0.2292111
iron	0.3954301	1.0000000	0.6233662	0.2437905	0.3126009
protein	0.5001882	0.6233662	1.0000000	0.1467574	0.2120670
vitamin A	0.1578060	0.2437905	0.1467574	1.0000000	0.1835227
vitamin C	0.2292111	0.3126009	0.2120670	0.1835227	1.0000000

cov_matrix <- cov(nutrient[, c("calcium", "iron", "protein", "vitamin A", "vitamin C")])
kable(cov_matrix, caption = "The variance-covariance matrix is also copied into the matrix below.", format = "html", col.names = NA) %>%
  kable_styling()

The variance-covariance matrix is also copied into the matrix below.
	calcium	iron	protein	vitamin A	vitamin C
calcium	157829.4439	940.08944	6075.8163	102411.127	6701.6160
iron	940.0894	35.81054	114.0580	2383.153	137.6720
protein	6075.8163	114.05803	934.8769	7330.052	477.1998
vitamin A	102411.1266	2383.15341	7330.0515	2668452.371	22063.2486
vitamin C	6701.6160	137.67199	477.1998	22063.249	5416.2641

Lesson 3: Graphical Display of Multivariate Data

Section 3.1

options(width = 78)

nutrient$L_calciu <- log(nutrient$calcium)
nutrient$S_calciu <- sqrt(nutrient$calcium)
nutrient$S_S_calc <- nutrient$calcium^0.25
nutrient$L_iron <- log(nutrient$iron)
nutrient$S_S_iron <- nutrient$iron^0.25
nutrient$L_prot <- log(nutrient$protein)
nutrient$S_S_prot <- nutrient$protein^0.25
nutrient$S_S_a <- nutrient$`vitamin A`^0.25
nutrient$S_S_c <- nutrient$`vitamin C`^0.25

summary_stats <- summary(nutrient[, c("calcium", "iron", "protein", "vitamin A", "vitamin C", "L_calciu", "S_calciu", "S_S_calc", "L_iron", "S_S_iron", "L_prot", "S_S_prot", "S_S_a", "S_S_c")])
print(summary_stats)

    calcium             iron          protein         vitamin A      
 Min.   :   7.44   Min.   : 0.00   Min.   :  0.00   Min.   :    0.0  
 1st Qu.: 326.54   1st Qu.: 7.46   1st Qu.: 45.52   1st Qu.:  276.3  
 Median : 548.29   Median :10.03   Median : 61.07   Median :  524.0  
 Mean   : 624.05   Mean   :11.13   Mean   : 65.80   Mean   :  839.6  
 3rd Qu.: 826.51   3rd Qu.:13.71   3rd Qu.: 80.77   3rd Qu.:  943.2  
 Max.   :2866.44   Max.   :58.67   Max.   :251.01   Max.   :34434.3  
   vitamin C         L_calciu        S_calciu         S_S_calc    
 Min.   :  0.00   Min.   :2.007   Min.   : 2.728   Min.   :1.652  
 1st Qu.: 24.94   1st Qu.:5.789   1st Qu.:18.070   1st Qu.:4.251  
 Median : 53.59   Median :6.307   Median :23.416   Median :4.839  
 Mean   : 78.93   Mean   :6.222   Mean   :23.766   Mean   :4.809  
 3rd Qu.:108.67   3rd Qu.:6.717   3rd Qu.:28.749   3rd Qu.:5.362  
 Max.   :433.34   Max.   :7.961   Max.   :53.539   Max.   :7.317  
     L_iron         S_S_iron         L_prot         S_S_prot    
 Min.   : -Inf   Min.   :0.000   Min.   : -Inf   Min.   :0.000  
 1st Qu.:2.010   1st Qu.:1.653   1st Qu.:3.818   1st Qu.:2.597  
 Median :2.306   Median :1.780   Median :4.112   Median :2.796  
 Mean   : -Inf   Mean   :1.781   Mean   : -Inf   Mean   :2.786  
 3rd Qu.:2.618   3rd Qu.:1.924   3rd Qu.:4.392   3rd Qu.:2.998  
 Max.   :4.072   Max.   :2.768   Max.   :5.526   Max.   :3.980  
     S_S_a            S_S_c      
 Min.   : 0.000   Min.   :0.000  
 1st Qu.: 4.077   1st Qu.:2.235  
 Median : 4.785   Median :2.706  
 Mean   : 4.858   Mean   :2.735  
 3rd Qu.: 5.542   3rd Qu.:3.229  
 Max.   :13.622   Max.   :4.563

par(mfrow = c(1, 1), mar = c(5, 5, 4, 2) + 0.1)
hist(nutrient$calcium, main = "Histogram for Calcium", col = "skyblue", xlab = "Calcium", ylab = "Frequency", cex.lab = 1.2, cex.axis = 1.2)

diff_x <- nutrient$iron - 500
diff_y <- nutrient$calcium - 10

plot(nutrient$iron, nutrient$calcium, main = "Scatterplot of calcium vs iron", 
     xlab = "iron", ylab = "calcium", col = "darkblue", pch = 16)

diff_x <- nutrient$S_S_iron - 0.5
diff_y <- nutrient$S_S_calc - 1

plot(diff_x, diff_y, main = "Scatterplot of Difference in S_S_calc vs Difference in S_S_iron", 
     xlab = "S_S_iron", ylab = "S_S_calc", 
     col = "darkblue", pch = 16)

if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)

pairs(
  nutrient[, c("S_S_calc", "S_S_iron", "S_S_prot", "S_S_a", "S_S_c")],
  main = "Scatterplot Matrix for Nutrition Data",
  col= "darkblue")

Lesson 4: Multivariate Normal Distribution

Section 4.2

options(width = 78)

r <- 0.9

x1 <- seq(-4, 4, by = 0.1)
x2 <- seq(-4, 4, by = 0.1)
grid <- expand.grid(x1 = x1, x2 = x2)


grid$phi <- exp(-(grid$x1^2 - 2 * r * grid$x1 * grid$x2 + grid$x2^2) / (2 * (1 - r^2))) / (2 * pi * sqrt(1 - r^2))
matrix_data <- matrix(grid$phi, nrow = length(x1), ncol = length(x2), byrow = TRUE)

persp(
  x = x1, y = x2, z = matrix_data,
  main = "Bivariate Normal Density",
  xlab = "x1", ylab = "x2", zlab = "Density",
  theta = -20,
  col = "lightgreen"
)

#### Section 4.3

boards <- read.csv("D:/stat/boardstiffness.csv", skip = 1, header = TRUE, sep = ",")
pca_result <- princomp(boards)
mahal <- data.frame(dist2 = rowSums(pca_result$scores^2))
mahal

         dist2
1   893935.032
2   391392.446
3   278375.446
4    68326.618
5    75628.273
6    64510.308
7   110144.273
8  3796596.860
9   111029.722
10   68173.480
11   88038.722
12   20206.136
13    8433.101
14   27164.963
15  461749.480
16 1232904.756
17  621252.998
18   30311.446
19  101990.377
20  478555.791
21   87228.584
22  222617.825
23  154558.067
24  129191.273
25  285655.067
26  138544.549
27   94044.963
28 1048160.101
29  590113.411

Section 4.7

library(MASS)

wechsler <- read.csv("D:/stat/wechsler.csv")
print(wechsler)

   id info sim arith pict
1   1    7   5     9    8
2   2    8   8     5    6
3   3   16  18    11    9
4   4    8   3     7    9
5   5    6   3    13    9
6   6   11   8    10   10
7   7   12   7     9    8
8   8    8  11     9    3
9   9   14  12    11    4
10 10   13  13    13    6
11 11   13   9     9    9
12 12   13  10    15    7
13 13   14  11    12    8
14 14   15  11    11   10
15 15   13  10    15    9
16 16   10   5     8    6
17 17   10   3     7    7
18 18   17  13    13    7
19 19   10   6    10    7
20 20   10  10    15    8
21 21   14   7    11    5
22 22   16  11    12   11
23 23   10   7    14    6
24 24   10  10     9    6
25 25   10   7    10   10
26 26    7   6     5    9
27 27   15  12    10    6
28 28   17  15    15    8
29 29   16  13    16    9
30 30   13  10    17    8
31 31   13  10    17   10
32 32   19  12    16   10
33 33   19  15    17   11
34 34   13  10     7    8
35 35   15  11    12    8
36 36   16   9    11   11
37 37   14  13    14    9

pca_result <- princomp(wechsler[, -1], cor = FALSE)
summary(pca_result)

Importance of components:
                          Comp.1    Comp.2    Comp.3     Comp.4
Standard deviation     5.0533104 2.4670432 1.9558360 1.36381119
Proportion of Variance 0.6844717 0.1631387 0.1025341 0.04985539
Cumulative Proportion  0.6844717 0.8476105 0.9501446 1.00000000

variable_means <- colMeans(wechsler[, -1])
cat("Variable Means:\n")

Variable Means:

print(variable_means)

     info       sim     arith      pict 
12.567568  9.567568 11.486486  7.972973

variable_means_df <- data.frame(Variable = names(variable_means), Mean = variable_means)
variable_means_table <- knitr::kable(variable_means_df, align = c("l", "r"), col.names = c("Variable", "Mean"), digits = 2)
kableExtra::kable_styling(variable_means_table, "striped", full_width = FALSE)

	Variable	Mean
info	info	12.57
sim	sim	9.57
arith	arith	11.49
pict	pict	7.97

cov_matrix <- cov(wechsler[, -1])
cov_table <- knitr::kable(cov_matrix, digits = 4)
kableExtra::kable_styling(cov_table, "striped", full_width = FALSE)

	info	sim	arith	pict
info	11.4745	9.0856	6.3829	2.0713
sim	9.0856	12.0856	5.9384	0.5435
arith	6.3829	5.9384	11.0901	1.7913
pict	2.0713	0.5435	1.7913	3.6937

Section 4.8

library(MASS)
mu <- c(0, 0)  # Mean vector
sigma <- matrix(c(1.0000, 0.5000, 0.5000, 2.0000), nrow = 2)  # Covariance matrix


lambda <- eigen(sigma)$values
e <- eigen(sigma)$vectors
theta <- seq(0, 2 * pi, length.out = 201)
z <- matrix(c(cos(theta), sin(theta)), ncol = 2)
d <- diag(sqrt(lambda))
ellipse_coords <- t(z %*% t(e) %*% d * sqrt(5.99))
x <- ellipse_coords[, 1] + mu[1]
y <- ellipse_coords[, 2] + mu[2]

plot(x, y, type = "l", xlab = "X", ylab = "Y", main = "95% prediction ellipse", xlim = c(-5, 5), ylim = c(-5, 5))

Lesson 5: Sample Mean Vector and Sample Correlation and Related Inference Problems

Section 5.2

library(tibble)
library(dplyr)
library(tidyr)
library(broom)
library(readxl)
mineral<-read_xlsx("D:/stat//mineral.xlsx")
mineral

# A tibble: 25 × 6
   domradius radius domhumerus humerus domulna  ulna
       <dbl>  <dbl>      <dbl>   <dbl>   <dbl> <dbl>
 1     1.10   1.05        2.14    2.24   0.873 0.872
 2     0.842  0.859       1.87    1.74   0.59  0.744
 3     0.925  0.873       1.89    1.81   0.767 0.713
 4     0.857  0.744       1.74    1.55   0.706 0.674
 5     0.795  0.809       1.73    1.72   0.549 0.654
 6     0.787  0.779       1.51    1.47   0.782 0.571
 7     0.933  0.88        1.70    1.66   0.737 0.803
 8     0.799  0.851       1.74    1.78   0.618 0.682
 9     0.945  0.876       1.81    1.76   0.853 0.777
10     0.921  0.906       1.95    2.01   0.823 0.765
# ℹ 15 more rows

variables_of_interest <- c("domradius", "domhumerus", "domulna")
data <- mineral[, variables_of_interest, drop = FALSE]
data2 <- data.frame(
  N = sapply(data, function(x) length(na.omit(x))),
  Mean = sapply(data, function(x) round(mean(x, na.rm = TRUE), 5)),
  SD = sapply(data, function(x) round(sd(x, na.rm = TRUE), 5))
)

print(data2)

            N    Mean      SD
domradius  25 0.84380 0.11402
domhumerus 25 1.79268 0.28347
domulna    25 0.70440 0.10756

Section 5.3

library(readxl)
library(tibble)
library(knitr)
library(kableExtra)
wechsler<-read_xlsx("D:/stat//wechsler.xlsx")
wechsler

# A tibble: 37 × 5
      id Information Similarities Arithmetic Picture
   <dbl>       <dbl>        <dbl>      <dbl>   <dbl>
 1     1           7            5          9       8
 2     2           8            8          5       6
 3     3          16           18         11       9
 4     4           8            3          7       9
 5     5           6            3         13       9
 6     6          11            8         10      10
 7     7          12            7          9       8
 8     8           8           11          9       3
 9     9          14           12         11       4
10    10          13           13         13       6
# ℹ 27 more rows

cor_matrix <- cor(wechsler[, c("Information", "Similarities", "Arithmetic", "Picture")])


kable(cor_matrix, digits=5, caption = "Correlation Matrix for Wechsler Data", format = "html") %>%
  kable_styling(full_width = FALSE)

Correlation Matrix for Wechsler Data
	Information	Similarities	Arithmetic	Picture
Information	1.00000	0.77153	0.56583	0.31816
Similarities	0.77153	1.00000	0.51295	0.08135
Arithmetic	0.56583	0.51295	1.00000	0.27988
Picture	0.31816	0.08135	0.27988	1.00000

Lesson 6: Multivariate Conditional Distribution and Partial Correlation

Section 6.2

options(width = 78)

title <- "Partial Correlations - Wechsler Data"
wechsler <- read.csv("D:/stat/wechsler.csv", header = TRUE)
model <- lm(cbind(info, sim) ~ arith + pict, data = wechsler)
anova_table <- anova(model)

print(anova_table)

Analysis of Variance Table

            Df  Pillai approx F num Df den Df    Pr(>F)    
(Intercept)  1 0.95595   358.05      2     33 < 2.2e-16 ***
arith        1 0.33836     8.44      2     33  0.001097 ** 
pict         1 0.12496     2.36      2     33  0.110531    
Residuals   34                                             
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Lesson 7: Inferences Regarding Multivariate Population Mean

Section 7.1.4

library(MASS)
library(dplyr)


x <- as.matrix(nutrient[, c("calcium", "iron", "protein", "vitamin A", "vitamin C")])


hotel <- function(x, mu0) {
  one <- rep(1, nrow(x))
  ident <- diag(nrow(x))
  ybar <- t(x) %*% one / nrow(x)
  s <- t(x) %*% ((ident - one %*% t(one) / nrow(x)) %*% x) / (nrow(x) - 1)
  
  t2 <- nrow(x) * t(ybar - mu0) %*% solve(s) %*% (ybar - mu0)
  f <- (nrow(x) - ncol(x)) * t2 / ncol(x) / (nrow(x) - 1)
  df1 <- ncol(x)
  df2 <- nrow(x) - ncol(x)
  p <- 1 - pf(f, df1, df2)
  
  result <- data.frame(
    T2 = t2,
    F = f,
    df1 = df1,
    df2 = df2,
    p_value = p
  )
  
  return(result)
}


mu0 <- c(1000, 15, 60, 800, 75)
result <- hotel(x, mu0)


print(result)

        T2        F df1 df2 p_value
1 1758.541 349.7968   5 732       0

arranged_result <- arrange(result, T2)
print(arranged_result)

        T2        F df1 df2 p_value
1 1758.541 349.7968   5 732       0

Section 7.1.9

library(MASS)

hotel <- function(x) {
  mu0 <- rep(0, ncol(x))
  one <- rep(1, nrow(x))
  ident <- diag(nrow(x))
  ybar <- t(x) %*% one / nrow(x)
  s <- t(x) %*% ((ident - one %*% t(one) / nrow(x)) %*% x) / (nrow(x) - 1)
  
  cat("mu0:\n")
  print(mu0)
  cat("\nSample Means (ybar):\n")
  print(ybar)
  cat("\nSample Covariance Matrix (S):\n")
  print(s)
  
  t2 <- nrow(x) * t(ybar - mu0) %*% solve(s) %*% (ybar - mu0)
  f <- (nrow(x) - ncol(x)) * t2 / ncol(x) / (nrow(x) - 1)
  df1 <- ncol(x)
  df2 <- nrow(x) - ncol(x)
  p <- 1 - pf(f, df1, df2)
  
  # Print test statistics
  cat("\nHotelling T2 Statistic:\n")
  print(t2)
  
  cat("\nF-Statistic:\n")
  print(f)
  
  cat("\nDegrees of Freedom:\n")
  cat("df1:", df1, "\n")
  cat("df2:", df2, "\n")
  
  cat("\nP-Value:\n")
  print(p)
}

spouse <- read.csv("D:/stat/spouse.csv", header = TRUE)
d1 <- spouse$h1 - spouse$w1
d2 <- spouse$h2 - spouse$w2
d3 <- spouse$h3 - spouse$w3
d4 <- spouse$h4 - spouse$w4
diffs <- data.frame(d1, d2, d3, d4)

# Extract the columns d1, d2, d3, and d4 into a matrix x
x <- as.matrix(diffs)

# Apply the Hotelling function to the data
hotel(x)

mu0:
[1] 0 0 0 0

Sample Means (ybar):
          [,1]
d1  0.06666667
d2 -0.13333333
d3 -0.30000000
d4 -0.13333333

Sample Covariance Matrix (S):
            d1          d2         d3          d4
d1  0.82298851  0.07816092 -0.0137931 -0.05977011
d2  0.07816092  0.80919540 -0.2137931 -0.15632184
d3 -0.01379310 -0.21379310  0.5620690  0.51034483
d4 -0.05977011 -0.15632184  0.5103448  0.60229885

Hotelling T2 Statistic:
         [,1]
[1,] 13.12784

F-Statistic:
         [,1]
[1,] 2.942447

Degrees of Freedom:
df1: 4 
df2: 26 

P-Value:
           [,1]
[1,] 0.03936914

Error in (ident - one %% t(one)/nrow(x)) %% x : requires numeric/complex matrix/vector arguments

Section 7.1.10

hotel <- function(x) {
  mu0 <- rep(0, ncol(x))
  one <- rep(1, nrow(x))
  ident <- diag(nrow(x))
  ybar <- t(x) %*% one / nrow(x)
  s <- t(x) %*% ((ident - one %*% t(one) / nrow(x)) %*% x) / (nrow(x) - 1)

  means <- colMeans(x)
  variances <- apply(x, 2, var)

  print(mu0)
  print(ybar)
  print(s)
  
  t2 <- nrow(x) * t(ybar - mu0) %*% solve(s) %*% (ybar - mu0)
  f <- (nrow(x) - ncol(x)) * t2 / ncol(x) / (nrow(x) - 1)
  df1 <- ncol(x)
  df2 <- nrow(x) - ncol(x)
  p <- 1 - pf(f, df1, df2)

  print(t2)
  print(f)
  print(df1)
  print(df2)
  print(p)

  return(list(means = means, variances = variances, t2 = t2, f = f, df1 = df1, df2 = df2, p = p))
}

d1 <- spouse$h1 - spouse$w1
d2 <- spouse$h2 - spouse$w2
d3 <- spouse$h3 - spouse$w3
d4 <- spouse$h4 - spouse$w4
diffs <- data.frame(d1, d2, d3, d4)

x <- as.matrix(diffs)
result <- hotel(x)

[1] 0 0 0 0
          [,1]
d1  0.06666667
d2 -0.13333333
d3 -0.30000000
d4 -0.13333333
            d1          d2         d3          d4
d1  0.82298851  0.07816092 -0.0137931 -0.05977011
d2  0.07816092  0.80919540 -0.2137931 -0.15632184
d3 -0.01379310 -0.21379310  0.5620690  0.51034483
d4 -0.05977011 -0.15632184  0.5103448  0.60229885
         [,1]
[1,] 13.12784
         [,1]
[1,] 2.942447
[1] 4
[1] 26
           [,1]
[1,] 0.03936914

print("Means:")

[1] "Means:"

print(result$means)

         d1          d2          d3          d4 
 0.06666667 -0.13333333 -0.30000000 -0.13333333

print("Variances:")

[1] "Variances:"

print(result$variances)

       d1        d2        d3        d4 
0.8229885 0.8091954 0.5620690 0.6022989

Section 7.1.11

spouse$d1 <- spouse$h1 - spouse$w2
spouse$d2 <- spouse$h2 - spouse$w1
spouse$d3 <- spouse$h3 - spouse$w4
spouse$d4 <- spouse$h4 - spouse$w3


print(spouse)

   h1 h2 h3 h4 w1 w2 w3 w4 d1 d2 d3 d4
1   2  3  5  5  4  4  5  5 -2 -1  0  0
2   5  5  4  4  4  5  5  5  0  1 -1 -1
3   4  5  5  5  4  4  5  5  0  1  0  0
4   4  3  4  4  4  5  5  5 -1 -1 -1 -1
5   3  3  5  5  4  4  5  5 -1 -1  0  0
6   3  3  4  5  3  3  4  4  0  0  0  1
7   3  4  4  4  4  3  5  4  0  0  0 -1
8   4  4  5  5  3  4  5  5  0  1  0  0
9   4  5  5  5  4  4  5  4  0  1  1  0
10  4  4  3  3  3  4  4  4  0  1 -1 -1
11  4  4  5  5  4  5  5  5 -1  0  0  0
12  5  5  4  4  5  5  5  5  0  0 -1 -1
13  4  4  4  4  4  4  5  5  0  0 -1 -1
14  4  3  5  5  4  4  4  4  0 -1  1  1
15  4  4  5  5  4  4  5  5  0  0  0  0
16  3  3  4  5  3  4  4  4 -1  0  0  1
17  4  5  4  4  5  5  5  5 -1  0 -1 -1
18  5  5  5  5  4  5  4  4  0  1  1  1
19  5  5  4  4  3  4  4  4  1  2  0  0
20  4  4  4  4  5  3  4  4  1 -1  0  0
21  4  4  4  4  5  3  4  4  1 -1  0  0
22  4  4  4  4  4  5  4  4 -1  0  0  0
23  3  4  5  5  2  5  5  5 -2  2  0  0
24  5  3  5  5  3  4  5  5  1  0  0  0
25  5  5  3  3  4  3  5  5  2  1 -2 -2
26  3  3  4  4  4  4  4  4 -1 -1  0  0
27  4  4  4  4  4  4  5  5  0  0 -1 -1
28  3  3  5  5  3  4  4  4 -1  0  1  1
29  4  4  3  3  4  4  5  4  0  0 -1 -2
30  4  4  5  5  4  4  5  5  0  0  0  0

hotel <- function(x) {
  mu0 <- c(0, 0, 0, 0)
  one <- rep(1, nrow(x))
  ident <- diag(nrow(x))
  
  ybar <- t(x) %*% one / nrow(x)
  s <- t(x) %*% (ident - one %*% t(one) / nrow(x)) %*% x / (nrow(x) - 1)
  
  print(mu0)
  print(ybar)
  print(s)
  
  t2 <- nrow(x) * t(ybar - mu0) %*% solve(s) %*% (ybar - mu0)
  f <- (nrow(x) - ncol(x)) * t2 / ncol(x) / (nrow(x) - 1)
  df1 <- ncol(x)
  df2 <- nrow(x) - ncol(x)
  p_value <- 1 - pf(f, df1, df2)
  
  print(t2)
  print(f)
  print(df1)
  print(df2)
  print(p_value)
}

x <- as.matrix(spouse[, c("d1", "d2", "d3", "d4")])
hotel(x)

[1] 0 0 0 0
         [,1]
d1 -0.2000000
d2  0.1333333
d3 -0.2000000
d4 -0.2333333
           d1          d2          d3          d4
d1  0.7862069  0.13103448 -0.14482759 -0.18620690
d2  0.1310345  0.74022989 -0.04137931 -0.07126437
d3 -0.1448276 -0.04137931  0.51034483  0.50344828
d4 -0.1862069 -0.07126437  0.50344828  0.66781609
         [,1]
[1,] 6.426621
        [,1]
[1,] 1.44045
[1] 4
[1] 26
          [,1]
[1,] 0.2488886

Section 7.1.15

library(MASS)
hotel2 <- function(x1, x2) {
  n1 <- nrow(x1)
  n2 <- nrow(x2)
  k <- ncol(x1)
  one1 <- matrix(1, n1, 1)
  one2 <- matrix(1, n2, 1)
  ident1 <- diag(n1)
  ident2 <- diag(n2)
  
  ybar1 <- t(x1) %*% one1 / n1
  s1 <- t(x1) %*% ((ident1 - one1 %*% t(one1) / n1) %*% x1) / (n1 - 1.0)
  
  
  print(round(n1, 3))
  print(round(ybar1, 3))
  print(round(s1, 3))
  
  ybar2 <- t(x2) %*% one2 / n2
  s2 <- t(x2) %*% ((ident2 - one2 %*% t(one2) / n2) %*% x2) / (n2 - 1.0)
  
  print(round(n2, 3))
  print(round(ybar2, 3))
  print(round(s2, 3))
  
  spool <- ((n1 - 1.0) * s1 + (n2 - 1.0) * s2) / (n1 + n2 - 2.0)
  
  print(round(spool, 3))
  
  t2 <- t(ybar1 - ybar2) %*% solve(spool * (1/n1 + 1/n2)) %*% (ybar1 - ybar2)
  f <- (n1 + n2 - k - 1) * t2 / k / (n1 + n2 - 2)
  df1 <- k
  df2 <- n1 + n2 - k - 1
  p <- 1 - pf(f, df1, df2)
  

  print(round(t2, 3))
  print(round(f, 3))
  print(round(df1, 3))
  print(round(df2, 3))
  print(round(p, 3))
}

swiss <- read.csv("D:/stat/swiss3.csv", header = TRUE)
x1 <- subset(swiss, type == "real")[, c("length", "left", "right", "bottom", "top", "diag")]
x2 <- subset(swiss, type == "fake")[, c("length", "left", "right", "bottom", "top", "diag")]

hotel2(as.matrix(x1), as.matrix(x2))

[1] 100
          [,1]
length 214.969
left   129.943
right  129.720
bottom   8.305
top     10.168
diag   141.517
       length   left  right bottom    top   diag
length  0.150  0.058  0.057  0.057  0.014  0.005
left    0.058  0.133  0.086  0.057  0.049 -0.043
right   0.057  0.086  0.126  0.058  0.031 -0.024
bottom  0.057  0.057  0.058  0.413 -0.263  0.000
top     0.014  0.049  0.031 -0.263  0.421 -0.075
diag    0.005 -0.043 -0.024  0.000 -0.075  0.200
[1] 100
          [,1]
length 214.823
left   130.300
right  130.193
bottom  10.530
top     11.133
diag   139.450
       length   left  right bottom    top   diag
length  0.124  0.032  0.024 -0.101  0.019  0.012
left    0.032  0.065  0.047 -0.024 -0.012 -0.005
right   0.024  0.047  0.089 -0.019  0.000  0.034
bottom -0.101 -0.024 -0.019  1.281 -0.490  0.238
top     0.019 -0.012  0.000 -0.490  0.404 -0.022
diag    0.012 -0.005  0.034  0.238 -0.022  0.311
       length   left right bottom    top   diag
length  0.137  0.045 0.041 -0.022  0.017  0.009
left    0.045  0.099 0.066  0.016  0.019 -0.024
right   0.041  0.066 0.108  0.020  0.015  0.005
bottom -0.022  0.016 0.020  0.847 -0.377  0.119
top     0.017  0.019 0.015 -0.377  0.413 -0.049
diag    0.009 -0.024 0.005  0.119 -0.049  0.256
         [,1]
[1,] 2412.451
        [,1]
[1,] 391.922
[1] 6
[1] 193
     [,1]
[1,]    0

Section 7.2.1

library(MASS)

hotel <- function(x) {
  mu0 <- rep(0, ncol(x))
  one <- rep(1, nrow(x))
  ident <- diag(nrow(x))
  ybar <- t(x) %*% one / nrow(x)
  s <- t(x) %*% ((ident - one %*% t(one) / nrow(x)) %*% x) / (nrow(x) - 1.0)

  print(mu0)
  print(ybar)
  print(s)
  
  t2 <- nrow(x) * t(ybar - mu0) %*% solve(s) %*% (ybar - mu0)
  f <- (nrow(x) - ncol(x)) * t2 / ncol(x) / (nrow(x) - 1)
  df1 <- ncol(x)
  df2 <- nrow(x) - ncol(x)
  p <- 1 - pf(f, df1, df2)
  
  print(t2)
  print(f)
  print(df1)
  print(df2)
  print(p)
}

nutrient <- read.csv("D:/Stat/nutrient1.csv", header = TRUE)

nutrient$calcium <- nutrient$calcium / 1000
nutrient$iron <- nutrient$iron / 15
nutrient$protein <- nutrient$protein / 60
nutrient$a <- nutrient$vitamin.A / 800
nutrient$c <- nutrient$vitamin.C / 75

nutrient$diff1 <- nutrient$iron - nutrient$calcium
nutrient$diff2 <- nutrient$protein - nutrient$iron
nutrient$diff3 <- nutrient$a - nutrient$protein
nutrient$diff4 <- nutrient$c - nutrient$a

x <- as.matrix(nutrient[, c("diff1", "diff2", "diff3", "diff4")])

hotel(x)

[1] 0 0 0 0
              [,1]
diff1  0.117944052
diff2  0.354730710
diff3 -0.047179834
diff4  0.002835103
            diff1       diff2       diff3       diff4
diff1  0.19164212 -0.07101776  0.04511467 -0.03756199
diff2 -0.07101776  0.16538367 -0.17884359  0.02955601
diff3  0.04511467 -0.17884359  4.12372604 -3.75507101
diff4 -0.03756199  0.02955601 -3.75507101  4.39690660
         [,1]
[1,] 1030.795
         [,1]
[1,] 256.6484
[1] 4
[1] 733
     [,1]
[1,]    0

library(MASS)

hotel <- function(x) {
  mu0 <- rep(0, ncol(x))
  one <- rep(1, nrow(x))
  ident <- diag(nrow(x))
  ybar <- t(x) %*% one / nrow(x)
  s <- t(x) %*% ((ident - one %*% t(one) / nrow(x)) %*% x) / (nrow(x) - 1.0)
  
  t2 <- nrow(x) * t(ybar - mu0) %*% solve(s) %*% (ybar - mu0)
  f <- (nrow(x) - ncol(x)) * t2 / ncol(x) / (nrow(x) - 1)
  df1 <- ncol(x)
  df2 <- nrow(x) - ncol(x)
  p <- 1 - pf(f, df1, df2)
  
  results <- data.frame(
    mu0 = mu0,
    ybar = ybar)
  
  cat("S matrix:\n")
  print(s)
  
  cat("Test Statistics:\n")
  print(data.frame(
    t2 = t2,
    f = f,
    df1 = df1,
    df2 = df2,
    p = p
  ))
  
  return(results)
}

nutrient <- read.csv("D:/Stat/nutrient1.csv", header = TRUE)

nutrient$calcium <- nutrient$calcium / 1000
nutrient$iron <- nutrient$iron / 15
nutrient$protein <- nutrient$protein / 60
nutrient$a <- nutrient$vitamin.A / 800
nutrient$c <- nutrient$vitamin.C / 75

nutrient$diff1 <- nutrient$iron - nutrient$calcium
nutrient$diff2 <- nutrient$protein - nutrient$iron
nutrient$diff3 <- nutrient$a - nutrient$protein
nutrient$diff4 <- nutrient$c - nutrient$a

x <- as.matrix(nutrient[, c("diff1", "diff2", "diff3", "diff4")])

results <- hotel(x)

S matrix:
            diff1       diff2       diff3       diff4
diff1  0.19164212 -0.07101776  0.04511467 -0.03756199
diff2 -0.07101776  0.16538367 -0.17884359  0.02955601
diff3  0.04511467 -0.17884359  4.12372604 -3.75507101
diff4 -0.03756199  0.02955601 -3.75507101  4.39690660
Test Statistics:
        t2        f df1 df2 p
1 1030.795 256.6484   4 733 0

print(results)

      mu0         ybar
diff1   0  0.117944052
diff2   0  0.354730710
diff3   0 -0.047179834
diff4   0  0.002835103

Section 7.2.3

swiss <- read.csv("D:/stat/swiss3.csv")

real <- subset(swiss, type == "real")
fake <- subset(swiss, type == "fake")

compute_stats <- function(data) {
  data_summary <- lapply(data, function(x) c(length(x), mean(x), var(x)))
  data_summary <- do.call(rbind, data_summary)
  colnames(data_summary) <- c("n", "mean", "var")
  return(data_summary)}

pop1 <- compute_stats(real[, -1])
pop2 <- compute_stats(fake[, -1])

combine <- merge(pop1, pop2, by = "row.names", suffixes = c(".real", ".fake"))
rownames(combine) <- combine$Row.names
combine$Row.names <- NULL

p <- 6
df1 <- ncol(pop1) - 1
df2 <- sum(pop1[, 1]) + sum(pop2[, 1]) - ncol(pop1) - 1

combine$F <- qf(0.95, df1, df2)
combine$T <- qt(1 - 0.025/6, df2)
combine$sp <- ((combine$n.real - 1) * combine$var.real + (combine$n.fake - 1) * combine$var.fake) / (combine$n.real + combine$n.fake - 2)
combine$losim <- (combine$mean.real - combine$mean.fake) - sqrt(p * (combine$n.real + combine$n.fake - 2) * combine$F * (1/combine$n.real + 1/combine$n.fake) * combine$sp / (combine$n.real + combine$n.fake - p - 1))
combine$upsim <- (combine$mean.real - combine$mean.fake) + sqrt(p * (combine$n.real + combine$n.fake - 2) * combine$F * (1/combine$n.real + 1/combine$n.fake) * combine$sp / (combine$n.real + combine$n.fake - p - 1))
combine$lobon <- (combine$mean.real - combine$mean.fake) - combine$T * sqrt((1/combine$n.real + 1/combine$n.fake) * combine$sp)
combine$upbon <- (combine$mean.real - combine$mean.fake) + combine$T * sqrt((1/combine$n.real + 1/combine$n.fake) * combine$sp)

print(combine)

       n.real mean.real  var.real n.fake mean.fake   var.fake        F
bottom    100     8.305 0.4132071    100    10.530 1.28131313 3.003249
diag      100   141.517 0.1998091    100   139.450 0.31121212 3.003249
left      100   129.943 0.1325768    100   130.300 0.06505051 3.003249
length    100   214.969 0.1502414    100   214.823 0.12401111 3.003249
right     100   129.720 0.1262626    100   130.193 0.08894040 3.003249
top       100    10.168 0.4211879    100    11.133 0.40445556 3.003249
              T         sp       losim      upsim        lobon      upbon
bottom 2.642654 0.84726010 -2.78469133 -1.6653087 -2.569004162 -1.8809958
diag   2.642654 0.25551061  1.75964190  2.3743581  1.878087896  2.2559121
left   2.642654 0.09881364 -0.54813871 -0.1658613 -0.474479952 -0.2395200
length 2.642654 0.13712626 -0.07916481  0.3711648  0.007606517  0.2843935
right  2.642654 0.10760152 -0.67245705 -0.2735429 -0.595592672 -0.3504073
top    2.642654 0.41282172 -1.35568026 -0.5743197 -1.205124563 -0.7248754

Section 7.2.6

library(biotools)

Warning: package 'biotools' was built under R version 4.3.3

result <- boxM(data = swiss[, -1], grouping = swiss$type)
result


    Box's M-test for Homogeneity of Covariance Matrices

data:  swiss[, -1]
Chi-Sq (approx.) = 121.9, df = 21, p-value = 3.198e-16

LESSON 8

Section 8.4

           ral          rfe        rmg          rca         rna
1   1.83571429  0.627857143 -0.5264286 -0.052142857  0.25928571
2   1.23571429  0.707857143 -1.3964286 -0.082142857 -0.08071429
3   2.03571429  0.717857143 -0.9464286 -0.072142857 -0.05071429
4  -1.06428571 -0.002142857  0.8135714 -0.042142857 -0.11071429
5   1.23571429  0.687857143  0.5135714 -0.002142857 -0.05071429
6  -1.66428571 -0.112142857 -1.3564286 -0.032142857 -0.03071429
7  -2.46428571 -2.112142857 -0.5664286 -0.002142857 -0.07071429
8  -0.96428571 -0.592142857  1.0835714 -0.022142857 -0.09071429
9  -1.46428571 -0.882142857 -0.3064286  0.087857143  0.04928571
10  0.83571429  0.547857143  2.4035714  0.077857143 -0.05071429
11 -0.16428571 -0.242142857  0.8635714  0.017857143  0.28928571
12  0.53571429  0.267857143  0.6835714  0.107857143 -0.01071429
13  0.13571429  0.317857143 -0.3764286 -0.002142857 -0.03071429
14 -0.06428571  0.067857143 -0.8864286  0.017857143 -0.02071429
15  0.10000000  0.025000000  0.0850000  0.005000000 -0.01000000
16 -0.10000000 -0.025000000 -0.0850000 -0.005000000  0.01000000
17  0.12000000 -0.432000000 -0.0040000  0.004000000 -0.02400000
18 -2.38000000  0.678000000 -0.0440000 -0.016000000 -0.01400000
19 -0.18000000 -0.212000000 -0.0040000 -0.016000000  0.00600000
20 -0.18000000  0.168000000  0.0060000 -0.016000000 -0.01400000
21  2.62000000 -0.202000000  0.0460000  0.044000000  0.04600000
22  0.38000000 -0.392000000 -0.0460000  0.008000000  0.01200000
23  0.98000000 -0.372000000  0.0640000  0.008000000  0.00200000
24 -0.62000000 -0.592000000 -0.0760000 -0.042000000  0.00200000
25 -2.52000000  1.228000000  0.0640000 -0.022000000  0.00200000
26  1.78000000  0.128000000 -0.0060000  0.048000000 -0.01800000

for(variable in c("al", "fe", "mg", "ca", "na")) {

  hist(pottery[[variable]], 
       main = paste("Histogram of", variable), 
       xlab = variable, 
       col = "yellow", 
       border = "black")
}

p <- 5  
g <- length(unique(pottery$site))  
df <- 0.5 * p * (p + 1) * (g - 1)

cov_matrices <- list()
for (site in unique(pottery$site)) {
  cov_matrices[[site]] <- cov(subset(pottery, site == site)[, c("al", "fe", "mg", "ca", "na")])
}
l_statistic <- 0
for (i in 1:(g - 1)) {
  for (j in (i + 1):g) {
    l_statistic <- l_statistic + (sum(diag((cov_matrices[[i]] + cov_matrices[[j]]) %*% solve(cov_matrices[[i]] + cov_matrices[[j]]))) - p)
  }
}
l_statistic <- l_statistic * 2


alpha <- 0.05  
critical_value <- qchisq(1 - alpha, df)

cat("Test statistic:", l_statistic, "\n")

Test statistic: -2.131628e-14

cat("Degrees of freedom:", df, "\n")

Degrees of freedom: 45

cat("Critical value (", 1 - alpha, "):", critical_value, "\n")

Critical value ( 0.95 ): 61.65623

if (l_statistic > critical_value) {
  cat("Reject the null hypothesis: Covariance matrices are not homogeneous.\n")
} else {
  cat("Fail to reject the null hypothesis: Covariance matrices are homogeneous.\n")
}

Fail to reject the null hypothesis: Covariance matrices are homogeneous.

Section 8.5

outcome <- cbind(pottery$al, pottery$fe, pottery$mg, pottery$ca, pottery$na)
model<- manova(outcome ~ site, data = pottery)
summary(model, intercept = TRUE, test = "Wilks")

            Df     Wilks approx F num Df den Df    Pr(>F)    
(Intercept)  1 0.0068354   523.07      5 18.000 < 2.2e-16 ***
site         3 0.0123009    13.09     15 50.091  1.84e-12 ***
Residuals   22                                               
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

print(pottery)

# A tibble: 26 × 6
   site     al    fe    mg    ca    na
   <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 L      14.4  7     4.3   0.15  0.51
 2 L      13.8  7.08  3.43  0.12  0.17
 3 L      14.6  7.09  3.88  0.13  0.2 
 4 L      11.5  6.37  5.64  0.16  0.14
 5 L      13.8  7.06  5.34  0.2   0.2 
 6 L      10.9  6.26  3.47  0.17  0.22
 7 L      10.1  4.26  4.26  0.2   0.18
 8 L      11.6  5.78  5.91  0.18  0.16
 9 L      11.1  5.49  4.52  0.29  0.3 
10 L      13.4  6.92  7.23  0.28  0.2 
# ℹ 16 more rows

model <- manova(cbind(al, ca, fe, mg, na) ~ site, data = pottery)
summary(model)

          Df Pillai approx F num Df den Df    Pr(>F)    
site       3 1.5539   4.2984     15     60 2.413e-05 ***
Residuals 22                                            
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

library(ggplot2)
library(reshape2)

Warning: package 'reshape2' was built under R version 4.3.3

pottery_melted <- melt(pottery, id.vars = "site", variable.name = "chemical", value.name = "amount")


mean_data <- aggregate(amount ~ site + chemical, data = pottery_melted, FUN = mean)

# Plot
ggplot(mean_data, aes(x = chemical, y = amount, group = site, color = site)) +
  geom_line() +
  geom_point() +
  labs(title = "Profile Plot for Pottery Data", x = "Chemical", y = "Mean Amount") +
  theme_minimal()

manova_result <- manova(cbind(al, fe, mg, ca, na) ~ site, data = pottery)
summary(manova_result)

          Df Pillai approx F num Df den Df    Pr(>F)    
site       3 1.5539   4.2984     15     60 2.413e-05 ***
Residuals 22                                            
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Section 8.7

library(car)

pottery$site <- factor(pottery$site)

site_contrasts <- matrix(c(8, -2, 8, -14,    # C+L-A-I
                            1, 0, -1, 0,      # A vs I
                            0, 1, 0, -1),     # C vs L
                          nrow = 3, byrow = TRUE)

rownames(site_contrasts) <- c("C+L-A-I", "A vs I", "C vs L")
colnames(site_contrasts) <- c("C", "L", "A", "I")

print(site_contrasts)

        C  L  A   I
C+L-A-I 8 -2  8 -14
A vs I  1  0 -1   0
C vs L  0  1  0  -1

Section 8.11

library(readr)
library(car)
library(emmeans)

Warning: package 'emmeans' was built under R version 4.3.3

rice <- read_csv("D:/stat/rice.csv")

rice$block <- factor(rice$block)
rice$variety <- factor(rice$variety)
manova_result <- manova(cbind(height, tillers) ~ block + variety, data = rice)
summary(manova_result)

          Df  Pillai approx F num Df den Df  Pr(>F)  
block      4 0.23044  0.39067      8     24 0.91486  
variety    3 0.76192  2.46162      6     24 0.05349 .
Residuals 12                                         
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

lsmeans_result <- emmeans(manova_result, "variety")
print(lsmeans_result)

 variety emmean   SE df lower.CL upper.CL
 a         32.1 0.78 12     30.4     33.8
 b         28.1 0.78 12     26.4     29.8
 c         30.6 0.78 12     28.9     32.3
 d         28.8 0.78 12     27.1     30.5

Results are averaged over the levels of: block, rep.meas 
Confidence level used: 0.95

print(summary.aov(manova_result))

 Response height :
            Df Sum Sq Mean Sq F value Pr(>F)  
block        4   31.7   7.925  0.6038 0.6673  
variety      3  165.0  55.000  4.1905 0.0303 *
Residuals   12  157.5  13.125                 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

 Response tillers :
            Df Sum Sq Mean Sq F value Pr(>F)
block        4    1.0  0.2500  0.1648 0.9522
variety      3    5.8  1.9333  1.2747 0.3273
Residuals   12   18.2  1.5167

summary(aov(height ~ block + variety, data = rice))

            Df Sum Sq Mean Sq F value Pr(>F)  
block        4   31.7    7.93   0.604 0.6673  
variety      3  165.0   55.00   4.190 0.0303 *
Residuals   12  157.5   13.12                 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(aov(tillers ~ block + variety, data = rice))

            Df Sum Sq Mean Sq F value Pr(>F)
block        4    1.0   0.250   0.165  0.952
variety      3    5.8   1.933   1.275  0.327
Residuals   12   18.2   1.517

Exercises from Units 1-8

DEVINE FATHY MAE S. GRINO

2024-03-10

Lesson 1: Measures of Central Tendency, Dispersion and Association

1.4

1.5

1.6

Lesson 2: Linear Combinations of Random Variables

Section 2.2

Lesson 3: Graphical Display of Multivariate Data

Section 3.1

Lesson 4: Multivariate Normal Distribution

Section 4.2

Section 4.7

Section 4.8

Lesson 6: Multivariate Conditional Distribution and Partial Correlation

Section 6.2

Lesson 7: Inferences Regarding Multivariate Population Mean

Section 7.1.4

Section 7.1.9

Section 7.1.10

Section 7.1.11

Section 7.1.15

Section 7.2.1

Section 7.2.3

Section 7.2.6

LESSON 8

Section 8.4

Section 8.5

Section 8.7

Section 8.11