MANOVA

Fouille de données avec R Léa Lê Dinh : Esiee Paris 2022-2023


1. Importation des données

rm(list = ls())       # initialisation

library(kableExtra) 


library(readr)
library(dplyr)
## 
## Attachement du package : 'dplyr'
## L'objet suivant est masqué depuis 'package:kableExtra':
## 
##     group_rows
## Les objets suivants sont masqués depuis 'package:stats':
## 
##     filter, lag
## Les objets suivants sont masqués depuis 'package:base':
## 
##     intersect, setdiff, setequal, union
load("EX_MANOVA.Rda")

View(df)

Pour simplifier notre code, nous créons : - Une variable N : nombre total d’individus

N = nrow(df)
print(N)
## [1] 26
#Sélection colonnes classe numérique
var_pred = df %>% select_if(is.numeric)

#Nombre de colonnes sélectionnées
P = ncol(var_pred)
print(P)
## [1] 5
X <- df[,-6]
View(X)

X %>% kbl(digits=3) %>%    
  kable_styling(bootstrap_options = "striped", full_width = F, position = "center", latex_options = 'stripped') %>% 
  scroll_box( height = "250px")
Al Fe Mg Ca Na
14.4 7.00 4.30 0.15 0.51
13.8 7.08 3.43 0.12 0.17
14.6 7.09 3.88 0.13 0.20
11.5 6.37 5.64 0.16 0.14
13.8 7.06 5.34 0.20 0.20
10.9 6.26 3.47 0.17 0.22
10.1 4.26 4.26 0.20 0.18
11.6 5.78 5.91 0.18 0.16
11.1 5.49 4.52 0.29 0.30
13.4 6.92 7.23 0.28 0.20
12.4 6.13 5.69 0.22 0.54
13.1 6.64 5.51 0.31 0.24
12.7 6.69 4.45 0.20 0.22
12.5 6.44 3.94 0.22 0.23
11.8 5.44 3.94 0.30 0.04
11.6 5.39 3.77 0.29 0.06
18.3 1.28 0.67 0.03 0.03
15.8 2.39 0.63 0.01 0.04
18.0 1.50 0.67 0.01 0.06
18.0 1.88 0.68 0.01 0.04
20.8 1.51 0.72 0.07 0.10
17.7 1.12 0.56 0.06 0.06
18.3 1.14 0.67 0.06 0.05
16.7 0.92 0.53 0.01 0.05
14.8 2.74 0.67 0.03 0.05
19.1 1.64 0.60 0.10 0.03
Y <- as.vector(df[,c(6)])
print(Y)
## $Class
##  [1] "Ll" "Ll" "Ll" "Ll" "Ll" "Ll" "Ll" "Ll" "Ll" "Ll" "Ll" "Ll" "Ll" "Ll" "Ca"
## [16] "Ca" "Is" "Is" "Is" "Is" "Is" "As" "As" "As" "As" "As"
#Nous vérifions que Y est bien un vecteur
is.vector(Y)
## [1] TRUE
K = sapply(Y, function(x) sum(!duplicated(x)))
print(K)
## Class 
##     4
# Y %>% summarise_all(funs(n_distinct))
XK <- split(x = df, f = df$Class)
print(XK)
## $As
## # A tibble: 5 × 6
##      Al    Fe    Mg    Ca    Na Class
##   <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1  17.7  1.12  0.56  0.06  0.06 As   
## 2  18.3  1.14  0.67  0.06  0.05 As   
## 3  16.7  0.92  0.53  0.01  0.05 As   
## 4  14.8  2.74  0.67  0.03  0.05 As   
## 5  19.1  1.64  0.6   0.1   0.03 As   
## 
## $Ca
## # A tibble: 2 × 6
##      Al    Fe    Mg    Ca    Na Class
##   <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1  11.8  5.44  3.94  0.3   0.04 Ca   
## 2  11.6  5.39  3.77  0.29  0.06 Ca   
## 
## $Is
## # A tibble: 5 × 6
##      Al    Fe    Mg    Ca    Na Class
##   <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1  18.3  1.28  0.67  0.03  0.03 Is   
## 2  15.8  2.39  0.63  0.01  0.04 Is   
## 3  18    1.5   0.67  0.01  0.06 Is   
## 4  18    1.88  0.68  0.01  0.04 Is   
## 5  20.8  1.51  0.72  0.07  0.1  Is   
## 
## $Ll
## # A tibble: 14 × 6
##       Al    Fe    Mg    Ca    Na Class
##    <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
##  1  14.4  7     4.3   0.15  0.51 Ll   
##  2  13.8  7.08  3.43  0.12  0.17 Ll   
##  3  14.6  7.09  3.88  0.13  0.2  Ll   
##  4  11.5  6.37  5.64  0.16  0.14 Ll   
##  5  13.8  7.06  5.34  0.2   0.2  Ll   
##  6  10.9  6.26  3.47  0.17  0.22 Ll   
##  7  10.1  4.26  4.26  0.2   0.18 Ll   
##  8  11.6  5.78  5.91  0.18  0.16 Ll   
##  9  11.1  5.49  4.52  0.29  0.3  Ll   
## 10  13.4  6.92  7.23  0.28  0.2  Ll   
## 11  12.4  6.13  5.69  0.22  0.54 Ll   
## 12  13.1  6.64  5.51  0.31  0.24 Ll   
## 13  12.7  6.69  4.45  0.2   0.22 Ll   
## 14  12.5  6.44  3.94  0.22  0.23 Ll
NK <- sapply(XK, nrow)
print(NK)
## As Ca Is Ll 
##  5  2  5 14
#On recupere les numéros de colonnes 
nb_col <- which(sapply(df, is.numeric))
GK <- lapply(XK, function(x) colMeans(x[, nb_col]))
print(GK)
## $As
##     Al     Fe     Mg     Ca     Na 
## 17.320  1.512  0.606  0.052  0.048 
## 
## $Ca
##     Al     Fe     Mg     Ca     Na 
## 11.700  5.415  3.855  0.295  0.050 
## 
## $Is
##     Al     Fe     Mg     Ca     Na 
## 18.180  1.712  0.674  0.026  0.054 
## 
## $Ll
##         Al         Fe         Mg         Ca         Na 
## 12.5642857  6.3721429  4.8264286  0.2021429  0.2507143
G <- sapply(X,mean)
#On vérifie que G est bien un vecteur numérique
is.vector(G)
## [1] TRUE
is.numeric(G)
## [1] TRUE
#C'est le cas. Nous l'affichons :
print(G)
##         Al         Fe         Mg         Ca         Na 
## 14.4923077  4.4676923  3.1415385  0.1465385  0.1584615

2. Calcul des inerties

2.1 Intertie totale
  • Transformer le dataframe en matrice (NxP)
# Transformation du dataframe en matrice
matrice <- as.matrix(X)
print(matrice)
##         Al   Fe   Mg   Ca   Na
##  [1,] 14.4 7.00 4.30 0.15 0.51
##  [2,] 13.8 7.08 3.43 0.12 0.17
##  [3,] 14.6 7.09 3.88 0.13 0.20
##  [4,] 11.5 6.37 5.64 0.16 0.14
##  [5,] 13.8 7.06 5.34 0.20 0.20
##  [6,] 10.9 6.26 3.47 0.17 0.22
##  [7,] 10.1 4.26 4.26 0.20 0.18
##  [8,] 11.6 5.78 5.91 0.18 0.16
##  [9,] 11.1 5.49 4.52 0.29 0.30
## [10,] 13.4 6.92 7.23 0.28 0.20
## [11,] 12.4 6.13 5.69 0.22 0.54
## [12,] 13.1 6.64 5.51 0.31 0.24
## [13,] 12.7 6.69 4.45 0.20 0.22
## [14,] 12.5 6.44 3.94 0.22 0.23
## [15,] 11.8 5.44 3.94 0.30 0.04
## [16,] 11.6 5.39 3.77 0.29 0.06
## [17,] 18.3 1.28 0.67 0.03 0.03
## [18,] 15.8 2.39 0.63 0.01 0.04
## [19,] 18.0 1.50 0.67 0.01 0.06
## [20,] 18.0 1.88 0.68 0.01 0.04
## [21,] 20.8 1.51 0.72 0.07 0.10
## [22,] 17.7 1.12 0.56 0.06 0.06
## [23,] 18.3 1.14 0.67 0.06 0.05
## [24,] 16.7 0.92 0.53 0.01 0.05
## [25,] 14.8 2.74 0.67 0.03 0.05
## [26,] 19.1 1.64 0.60 0.10 0.03
  • Créer une matrice de même taille (NxP) dont chaque ligne correspond au vecteur G
# Création de la liste de vecteurs
matrice2 <- matrix(rep(G, each = nrow(matrice)), nrow = nrow(matrice), ncol = ncol(matrice))
#matrice2 <- replicate(nrow(matrice), G, simplify = FALSE)
print(matrice2)
##           [,1]     [,2]     [,3]      [,4]      [,5]
##  [1,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
##  [2,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
##  [3,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
##  [4,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
##  [5,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
##  [6,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
##  [7,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
##  [8,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
##  [9,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [10,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [11,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [12,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [13,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [14,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [15,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [16,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [17,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [18,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [19,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [20,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [21,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [22,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [23,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [24,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [25,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [26,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
  • Calculer la différence entre les 2 matrices
difference = matrice - matrice2
print(difference)
##                Al         Fe         Mg           Ca           Na
##  [1,] -0.09230769  2.5323077  1.1584615  0.003461538  0.351538462
##  [2,] -0.69230769  2.6123077  0.2884615 -0.026538462  0.011538462
##  [3,]  0.10769231  2.6223077  0.7384615 -0.016538462  0.041538462
##  [4,] -2.99230769  1.9023077  2.4984615  0.013461538 -0.018461538
##  [5,] -0.69230769  2.5923077  2.1984615  0.053461538  0.041538462
##  [6,] -3.59230769  1.7923077  0.3284615  0.023461538  0.061538462
##  [7,] -4.39230769 -0.2076923  1.1184615  0.053461538  0.021538462
##  [8,] -2.89230769  1.3123077  2.7684615  0.033461538  0.001538462
##  [9,] -3.39230769  1.0223077  1.3784615  0.143461538  0.141538462
## [10,] -1.09230769  2.4523077  4.0884615  0.133461538  0.041538462
## [11,] -2.09230769  1.6623077  2.5484615  0.073461538  0.381538462
## [12,] -1.39230769  2.1723077  2.3684615  0.163461538  0.081538462
## [13,] -1.79230769  2.2223077  1.3084615  0.053461538  0.061538462
## [14,] -1.99230769  1.9723077  0.7984615  0.073461538  0.071538462
## [15,] -2.69230769  0.9723077  0.7984615  0.153461538 -0.118461538
## [16,] -2.89230769  0.9223077  0.6284615  0.143461538 -0.098461538
## [17,]  3.80769231 -3.1876923 -2.4715385 -0.116538462 -0.128461538
## [18,]  1.30769231 -2.0776923 -2.5115385 -0.136538462 -0.118461538
## [19,]  3.50769231 -2.9676923 -2.4715385 -0.136538462 -0.098461538
## [20,]  3.50769231 -2.5876923 -2.4615385 -0.136538462 -0.118461538
## [21,]  6.30769231 -2.9576923 -2.4215385 -0.076538462 -0.058461538
## [22,]  3.20769231 -3.3476923 -2.5815385 -0.086538462 -0.098461538
## [23,]  3.80769231 -3.3276923 -2.4715385 -0.086538462 -0.108461538
## [24,]  2.20769231 -3.5476923 -2.6115385 -0.136538462 -0.108461538
## [25,]  0.30769231 -1.7276923 -2.4715385 -0.116538462 -0.108461538
## [26,]  4.60769231 -2.8276923 -2.5415385 -0.046538462 -0.128461538
  • Effectuer la multiplication avec transposition du premier élément
SS_Tot =  t(difference)%*% difference
print(SS_Tot)
##             Al          Fe          Mg         Ca         Na
## Al  223.898462 -142.215462 -130.201692 -5.7826923 -4.7833077
## Fe -142.215462  145.172462  118.272092  4.6665923  5.3927077
## Mg -130.201692  118.272092  118.780138  4.6445385  4.7381615
## Ca   -5.782692    4.666592    4.644538  0.2561885  0.1648615
## Na   -4.783308    5.392708    4.738162  0.1648615  0.4575385
2.2 Intertie intra classe
Y<-df$Class

#Création de la liste 
SS_partiel_Intra <- list() 
for (group in unique(Y)){ #Pour chaque dataframe de XK
  df_group <- df[Y == group,]
  Gk <- colMeans(df_group[, nb_col])
  mat_group <- as.matrix(df_group[, nb_col])
  mat_Gk <- matrix(rep(Gk, each = nrow(mat_group)), nrow = nrow(mat_group), ncol = ncol(mat_group))
  difference <- mat_group - mat_Gk
  #Matrice difference x sa transposée
  SSk_intra <- t(difference) %*% difference
  print(SSk_intra)
  SS_partiel_Intra <- c(SS_partiel_Intra, list(SSk_intra))
}
##            Al          Fe          Mg           Ca          Na
## Al 24.6521429 12.27507143  0.44621429 -0.231928571 0.470357143
## Fe 12.2750714  8.02243571  0.44130714 -0.132564286 0.085778571
## Mg  0.4462143  0.44130714 15.39492143  0.430207143 0.026935714
## Ca -0.2319286 -0.13256429  0.43020714  0.044035714 0.008778571
## Na  0.4703571  0.08577857  0.02693571  0.008778571 0.195492857
##        Al       Fe       Mg       Ca      Na
## Al  0.020  0.00500  0.01700  0.00100 -0.0020
## Fe  0.005  0.00125  0.00425  0.00025 -0.0005
## Mg  0.017  0.00425  0.01445  0.00085 -0.0017
## Ca  0.001  0.00025  0.00085  0.00005 -0.0001
## Na -0.002 -0.00050 -0.00170 -0.00010  0.0002
##         Al       Fe       Mg       Ca       Na
## Al 12.6080 -2.18680  0.22440  0.15960  0.15240
## Fe -2.1868  0.76028 -0.03554 -0.02076 -0.01204
## Mg  0.2244 -0.03554  0.00412  0.00268  0.00272
## Ca  0.1596 -0.02076  0.00268  0.00272  0.00228
## Na  0.1524 -0.01204  0.00272  0.00228  0.00312
##         Al       Fe       Mg       Ca       Na
## Al 11.0080 -3.01320 -0.07960  0.17780 -0.03180
## Fe -3.0132  2.16688  0.11704 -0.00212 -0.00648
## Mg -0.0796  0.11704  0.01612  0.00164 -0.00034
## Ca  0.1778 -0.00212  0.00164  0.00468 -0.00088
## Na -0.0318 -0.00648 -0.00034 -0.00088  0.00048

Les K matrices sont ensuite additionnées pour obtenir la SS Intra (SS_Intra).

SS_Intra = Reduce("+", SS_partiel_Intra)

print(SS_Intra)
##            Al          Fe          Mg          Ca         Na
## Al 48.2881429  7.08007143  0.60801429  0.10647143 0.58895714
## Fe  7.0800714 10.95084571  0.52705714 -0.15519429 0.06675857
## Mg  0.6080143  0.52705714 15.42961143  0.43537714 0.02761571
## Ca  0.1064714 -0.15519429  0.43537714  0.05148571 0.01007857
## Na  0.5889571  0.06675857  0.02761571  0.01007857 0.19929286
2.3 Inertie inter classe

Pour obtenir l’inertie inter classe, il suffit de soustraire l’inertie intra classe à l’inertie totale :

SS_Inter = SS_Tot - SS_Intra

print(SS_Inter)
##             Al          Fe          Mg         Ca         Na
## Al  175.610319 -149.295533 -130.809707 -5.8891637 -5.3722648
## Fe -149.295533  134.221616  117.745035  4.8217866  5.3259491
## Mg -130.809707  117.745035  103.350527  4.2091613  4.7105458
## Ca   -5.889164    4.821787    4.209161  0.2047027  0.1547830
## Na   -5.372265    5.325949    4.710546  0.1547830  0.2582456

3. Inférence statistique

3.1 Calcul du Lambda
lambda <- (det(SS_Intra))/(det(SS_Inter + SS_Intra))
print(lambda)
## [1] 0.01230091
3.2 Correction
n = nrow(df) 
correction = -(n-1-(P+K)/2)*log(lambda)
print(correction)
##    Class 
## 90.16069
3.3 Valeur critique
critique = qchisq(p = 0.05, P*(K-1))
print(critique)
## [1] 7.260944

4. Validation

model <- manova( as.matrix(X) ~ Y )
summary(model, test = 'Wilks')
##           Df    Wilks approx F num Df den Df   Pr(>F)    
## Y          3 0.012301   13.088     15 50.091 1.84e-12 ***
## Residuals 22                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

5. Fonctions

MANOVA <- function(X,Y){
  
  N = nrow(X)
  
  var_pred = X %>% select_if(is.numeric)
  P = ncol(var_pred)
  
  K = sapply(Y, function(x) sum(!duplicated(x)))
  
  XK <- split(x = X, f = Y)
  
  NK <- sapply(XK, nrow)
  
  nb_col <- which(sapply(X, is.numeric))
  GK <- lapply(XK, function(x) colMeans(x[, nb_col]))
  
  G <- sapply(X,mean)
  
  
  #SS_Tot
  matrice <- as.matrix(X)
  matrice2 <- matrix(rep(G, each = nrow(matrice)), nrow = nrow(matrice), ncol = ncol(matrice))
  difference = matrice - matrice2
  SS_Tot =  t(difference)%*% difference
  
  
  #SS_Intra
  SS_partiel_Intra <- list() 
  for (group in unique(Y)){ 
  df_group <- X[Y == group,]
  Gk <- colMeans(df_group[, nb_col])
  mat_group <- as.matrix(df_group[, nb_col])
  mat_Gk <- matrix(rep(Gk, each = nrow(mat_group)), nrow = nrow(mat_group), ncol = ncol(mat_group))
  difference <- mat_group - mat_Gk
  #Matrice difference x sa transposée
  SSk_intra <- t(difference) %*% difference
  print(SSk_intra)
  SS_partiel_Intra <- c(SS_partiel_Intra, list(SSk_intra))
  }
  SS_Intra = Reduce("+", SS_partiel_Intra)
  
  
  #SS_Inter
  SS_Inter = SS_Tot - SS_Intra
  
  
  #Lambda
  lambda = (det(SS_Intra))/(det(SS_Inter + SS_Intra))
  
  
  #Liste qui contient tous les résultats 
  listfunc <- list("SS_Tot" = SS_Tot , "SS_Intra" = SS_Intra, "SS_Inter" = SS_Inter, "GK" = GK, "G" = G, "NK" = NK, "P" = P, "N" = N, "Lambda" = lambda)
  
  return(listfunc)
  
  
  
}

Nous utilisons le datafram iris, fourni par défaut dans R, afin de tester la fonction MANOVA que nous venons d’écrire.

X <- iris[,1:4]
Y <- iris$Species
MANOVA(X,Y)
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length       6.0882      4.8616       0.8014      0.5062
## Sepal.Width        4.8616      7.0408       0.5732      0.4556
## Petal.Length       0.8014      0.5732       1.4778      0.2974
## Petal.Width        0.5062      0.4556       0.2974      0.5442
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length      13.0552       4.174        8.962      2.7332
## Sepal.Width        4.1740       4.825        4.050      2.0190
## Petal.Length       8.9620       4.050       10.820      3.5820
## Petal.Width        2.7332       2.019        3.582      1.9162
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length      19.8128      4.5944      14.8612      2.4056
## Sepal.Width        4.5944      5.0962       3.4976      2.3338
## Petal.Length      14.8612      3.4976      14.9248      2.3924
## Petal.Width        2.4056      2.3338       2.3924      3.6962
## $SS_Tot
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length   102.168333   -6.322667     189.8730    76.92433
## Sepal.Width     -6.322667   28.306933     -49.1188   -18.12427
## Petal.Length   189.873000  -49.118800     464.3254   193.04580
## Petal.Width     76.924333  -18.124267     193.0458    86.56993
## 
## $SS_Intra
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length      38.9562     13.6300      24.6246      5.6450
## Sepal.Width       13.6300     16.9620       8.1208      4.8084
## Petal.Length      24.6246      8.1208      27.2226      6.2718
## Petal.Width        5.6450      4.8084       6.2718      6.1566
## 
## $SS_Inter
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length     63.21213   -19.95267     165.2484    71.27933
## Sepal.Width     -19.95267    11.34493     -57.2396   -22.93267
## Petal.Length    165.24840   -57.23960     437.1028   186.77400
## Petal.Width      71.27933   -22.93267     186.7740    80.41333
## 
## $GK
## $GK$setosa
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##        5.006        3.428        1.462        0.246 
## 
## $GK$versicolor
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##        5.936        2.770        4.260        1.326 
## 
## $GK$virginica
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##        6.588        2.974        5.552        2.026 
## 
## 
## $G
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##     5.843333     3.057333     3.758000     1.199333 
## 
## $NK
##     setosa versicolor  virginica 
##         50         50         50 
## 
## $P
## [1] 4
## 
## $N
## [1] 150
## 
## $Lambda
## [1] 0.02343863

Cette fois-ci, nous utilisons directement la fonction Manova de RStudio :

model <- manova(as.matrix(X)~Y)
summary(model,test= 'Wilks')
##            Df    Wilks approx F num Df den Df    Pr(>F)    
## Y           2 0.023439   199.15      8    288 < 2.2e-16 ***
## Residuals 147                                              
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Nous trouvons la même valeur de Lambda qu’avec notre fonction MANOVA().