MANOVA
Fouille de données avec R Léa Lê Dinh : Esiee Paris 2022-2023
rm(list = ls()) # initialisation
library(kableExtra)
library(readr)
library(dplyr)
##
## Attachement du package : 'dplyr'
## L'objet suivant est masqué depuis 'package:kableExtra':
##
## group_rows
## Les objets suivants sont masqués depuis 'package:stats':
##
## filter, lag
## Les objets suivants sont masqués depuis 'package:base':
##
## intersect, setdiff, setequal, union
load("EX_MANOVA.Rda")
View(df)
Pour simplifier notre code, nous créons : - Une variable N : nombre total d’individus
N = nrow(df)
print(N)
## [1] 26
#Sélection colonnes classe numérique
var_pred = df %>% select_if(is.numeric)
#Nombre de colonnes sélectionnées
P = ncol(var_pred)
print(P)
## [1] 5
X <- df[,-6]
View(X)
X %>% kbl(digits=3) %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "center", latex_options = 'stripped') %>%
scroll_box( height = "250px")
| Al | Fe | Mg | Ca | Na |
|---|---|---|---|---|
| 14.4 | 7.00 | 4.30 | 0.15 | 0.51 |
| 13.8 | 7.08 | 3.43 | 0.12 | 0.17 |
| 14.6 | 7.09 | 3.88 | 0.13 | 0.20 |
| 11.5 | 6.37 | 5.64 | 0.16 | 0.14 |
| 13.8 | 7.06 | 5.34 | 0.20 | 0.20 |
| 10.9 | 6.26 | 3.47 | 0.17 | 0.22 |
| 10.1 | 4.26 | 4.26 | 0.20 | 0.18 |
| 11.6 | 5.78 | 5.91 | 0.18 | 0.16 |
| 11.1 | 5.49 | 4.52 | 0.29 | 0.30 |
| 13.4 | 6.92 | 7.23 | 0.28 | 0.20 |
| 12.4 | 6.13 | 5.69 | 0.22 | 0.54 |
| 13.1 | 6.64 | 5.51 | 0.31 | 0.24 |
| 12.7 | 6.69 | 4.45 | 0.20 | 0.22 |
| 12.5 | 6.44 | 3.94 | 0.22 | 0.23 |
| 11.8 | 5.44 | 3.94 | 0.30 | 0.04 |
| 11.6 | 5.39 | 3.77 | 0.29 | 0.06 |
| 18.3 | 1.28 | 0.67 | 0.03 | 0.03 |
| 15.8 | 2.39 | 0.63 | 0.01 | 0.04 |
| 18.0 | 1.50 | 0.67 | 0.01 | 0.06 |
| 18.0 | 1.88 | 0.68 | 0.01 | 0.04 |
| 20.8 | 1.51 | 0.72 | 0.07 | 0.10 |
| 17.7 | 1.12 | 0.56 | 0.06 | 0.06 |
| 18.3 | 1.14 | 0.67 | 0.06 | 0.05 |
| 16.7 | 0.92 | 0.53 | 0.01 | 0.05 |
| 14.8 | 2.74 | 0.67 | 0.03 | 0.05 |
| 19.1 | 1.64 | 0.60 | 0.10 | 0.03 |
Y <- as.vector(df[,c(6)])
print(Y)
## $Class
## [1] "Ll" "Ll" "Ll" "Ll" "Ll" "Ll" "Ll" "Ll" "Ll" "Ll" "Ll" "Ll" "Ll" "Ll" "Ca"
## [16] "Ca" "Is" "Is" "Is" "Is" "Is" "As" "As" "As" "As" "As"
#Nous vérifions que Y est bien un vecteur
is.vector(Y)
## [1] TRUE
K = sapply(Y, function(x) sum(!duplicated(x)))
print(K)
## Class
## 4
# Y %>% summarise_all(funs(n_distinct))
XK <- split(x = df, f = df$Class)
print(XK)
## $As
## # A tibble: 5 × 6
## Al Fe Mg Ca Na Class
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 17.7 1.12 0.56 0.06 0.06 As
## 2 18.3 1.14 0.67 0.06 0.05 As
## 3 16.7 0.92 0.53 0.01 0.05 As
## 4 14.8 2.74 0.67 0.03 0.05 As
## 5 19.1 1.64 0.6 0.1 0.03 As
##
## $Ca
## # A tibble: 2 × 6
## Al Fe Mg Ca Na Class
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 11.8 5.44 3.94 0.3 0.04 Ca
## 2 11.6 5.39 3.77 0.29 0.06 Ca
##
## $Is
## # A tibble: 5 × 6
## Al Fe Mg Ca Na Class
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 18.3 1.28 0.67 0.03 0.03 Is
## 2 15.8 2.39 0.63 0.01 0.04 Is
## 3 18 1.5 0.67 0.01 0.06 Is
## 4 18 1.88 0.68 0.01 0.04 Is
## 5 20.8 1.51 0.72 0.07 0.1 Is
##
## $Ll
## # A tibble: 14 × 6
## Al Fe Mg Ca Na Class
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 14.4 7 4.3 0.15 0.51 Ll
## 2 13.8 7.08 3.43 0.12 0.17 Ll
## 3 14.6 7.09 3.88 0.13 0.2 Ll
## 4 11.5 6.37 5.64 0.16 0.14 Ll
## 5 13.8 7.06 5.34 0.2 0.2 Ll
## 6 10.9 6.26 3.47 0.17 0.22 Ll
## 7 10.1 4.26 4.26 0.2 0.18 Ll
## 8 11.6 5.78 5.91 0.18 0.16 Ll
## 9 11.1 5.49 4.52 0.29 0.3 Ll
## 10 13.4 6.92 7.23 0.28 0.2 Ll
## 11 12.4 6.13 5.69 0.22 0.54 Ll
## 12 13.1 6.64 5.51 0.31 0.24 Ll
## 13 12.7 6.69 4.45 0.2 0.22 Ll
## 14 12.5 6.44 3.94 0.22 0.23 Ll
NK <- sapply(XK, nrow)
print(NK)
## As Ca Is Ll
## 5 2 5 14
#On recupere les numéros de colonnes
nb_col <- which(sapply(df, is.numeric))
GK <- lapply(XK, function(x) colMeans(x[, nb_col]))
print(GK)
## $As
## Al Fe Mg Ca Na
## 17.320 1.512 0.606 0.052 0.048
##
## $Ca
## Al Fe Mg Ca Na
## 11.700 5.415 3.855 0.295 0.050
##
## $Is
## Al Fe Mg Ca Na
## 18.180 1.712 0.674 0.026 0.054
##
## $Ll
## Al Fe Mg Ca Na
## 12.5642857 6.3721429 4.8264286 0.2021429 0.2507143
G <- sapply(X,mean)
#On vérifie que G est bien un vecteur numérique
is.vector(G)
## [1] TRUE
is.numeric(G)
## [1] TRUE
#C'est le cas. Nous l'affichons :
print(G)
## Al Fe Mg Ca Na
## 14.4923077 4.4676923 3.1415385 0.1465385 0.1584615
# Transformation du dataframe en matrice
matrice <- as.matrix(X)
print(matrice)
## Al Fe Mg Ca Na
## [1,] 14.4 7.00 4.30 0.15 0.51
## [2,] 13.8 7.08 3.43 0.12 0.17
## [3,] 14.6 7.09 3.88 0.13 0.20
## [4,] 11.5 6.37 5.64 0.16 0.14
## [5,] 13.8 7.06 5.34 0.20 0.20
## [6,] 10.9 6.26 3.47 0.17 0.22
## [7,] 10.1 4.26 4.26 0.20 0.18
## [8,] 11.6 5.78 5.91 0.18 0.16
## [9,] 11.1 5.49 4.52 0.29 0.30
## [10,] 13.4 6.92 7.23 0.28 0.20
## [11,] 12.4 6.13 5.69 0.22 0.54
## [12,] 13.1 6.64 5.51 0.31 0.24
## [13,] 12.7 6.69 4.45 0.20 0.22
## [14,] 12.5 6.44 3.94 0.22 0.23
## [15,] 11.8 5.44 3.94 0.30 0.04
## [16,] 11.6 5.39 3.77 0.29 0.06
## [17,] 18.3 1.28 0.67 0.03 0.03
## [18,] 15.8 2.39 0.63 0.01 0.04
## [19,] 18.0 1.50 0.67 0.01 0.06
## [20,] 18.0 1.88 0.68 0.01 0.04
## [21,] 20.8 1.51 0.72 0.07 0.10
## [22,] 17.7 1.12 0.56 0.06 0.06
## [23,] 18.3 1.14 0.67 0.06 0.05
## [24,] 16.7 0.92 0.53 0.01 0.05
## [25,] 14.8 2.74 0.67 0.03 0.05
## [26,] 19.1 1.64 0.60 0.10 0.03
# Création de la liste de vecteurs
matrice2 <- matrix(rep(G, each = nrow(matrice)), nrow = nrow(matrice), ncol = ncol(matrice))
#matrice2 <- replicate(nrow(matrice), G, simplify = FALSE)
print(matrice2)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [2,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [3,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [4,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [5,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [6,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [7,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [8,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [9,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [10,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [11,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [12,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [13,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [14,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [15,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [16,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [17,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [18,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [19,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [20,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [21,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [22,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [23,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [24,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [25,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
## [26,] 14.49231 4.467692 3.141538 0.1465385 0.1584615
difference = matrice - matrice2
print(difference)
## Al Fe Mg Ca Na
## [1,] -0.09230769 2.5323077 1.1584615 0.003461538 0.351538462
## [2,] -0.69230769 2.6123077 0.2884615 -0.026538462 0.011538462
## [3,] 0.10769231 2.6223077 0.7384615 -0.016538462 0.041538462
## [4,] -2.99230769 1.9023077 2.4984615 0.013461538 -0.018461538
## [5,] -0.69230769 2.5923077 2.1984615 0.053461538 0.041538462
## [6,] -3.59230769 1.7923077 0.3284615 0.023461538 0.061538462
## [7,] -4.39230769 -0.2076923 1.1184615 0.053461538 0.021538462
## [8,] -2.89230769 1.3123077 2.7684615 0.033461538 0.001538462
## [9,] -3.39230769 1.0223077 1.3784615 0.143461538 0.141538462
## [10,] -1.09230769 2.4523077 4.0884615 0.133461538 0.041538462
## [11,] -2.09230769 1.6623077 2.5484615 0.073461538 0.381538462
## [12,] -1.39230769 2.1723077 2.3684615 0.163461538 0.081538462
## [13,] -1.79230769 2.2223077 1.3084615 0.053461538 0.061538462
## [14,] -1.99230769 1.9723077 0.7984615 0.073461538 0.071538462
## [15,] -2.69230769 0.9723077 0.7984615 0.153461538 -0.118461538
## [16,] -2.89230769 0.9223077 0.6284615 0.143461538 -0.098461538
## [17,] 3.80769231 -3.1876923 -2.4715385 -0.116538462 -0.128461538
## [18,] 1.30769231 -2.0776923 -2.5115385 -0.136538462 -0.118461538
## [19,] 3.50769231 -2.9676923 -2.4715385 -0.136538462 -0.098461538
## [20,] 3.50769231 -2.5876923 -2.4615385 -0.136538462 -0.118461538
## [21,] 6.30769231 -2.9576923 -2.4215385 -0.076538462 -0.058461538
## [22,] 3.20769231 -3.3476923 -2.5815385 -0.086538462 -0.098461538
## [23,] 3.80769231 -3.3276923 -2.4715385 -0.086538462 -0.108461538
## [24,] 2.20769231 -3.5476923 -2.6115385 -0.136538462 -0.108461538
## [25,] 0.30769231 -1.7276923 -2.4715385 -0.116538462 -0.108461538
## [26,] 4.60769231 -2.8276923 -2.5415385 -0.046538462 -0.128461538
SS_Tot = t(difference)%*% difference
print(SS_Tot)
## Al Fe Mg Ca Na
## Al 223.898462 -142.215462 -130.201692 -5.7826923 -4.7833077
## Fe -142.215462 145.172462 118.272092 4.6665923 5.3927077
## Mg -130.201692 118.272092 118.780138 4.6445385 4.7381615
## Ca -5.782692 4.666592 4.644538 0.2561885 0.1648615
## Na -4.783308 5.392708 4.738162 0.1648615 0.4575385
Y<-df$Class
#Création de la liste
SS_partiel_Intra <- list()
for (group in unique(Y)){ #Pour chaque dataframe de XK
df_group <- df[Y == group,]
Gk <- colMeans(df_group[, nb_col])
mat_group <- as.matrix(df_group[, nb_col])
mat_Gk <- matrix(rep(Gk, each = nrow(mat_group)), nrow = nrow(mat_group), ncol = ncol(mat_group))
difference <- mat_group - mat_Gk
#Matrice difference x sa transposée
SSk_intra <- t(difference) %*% difference
print(SSk_intra)
SS_partiel_Intra <- c(SS_partiel_Intra, list(SSk_intra))
}
## Al Fe Mg Ca Na
## Al 24.6521429 12.27507143 0.44621429 -0.231928571 0.470357143
## Fe 12.2750714 8.02243571 0.44130714 -0.132564286 0.085778571
## Mg 0.4462143 0.44130714 15.39492143 0.430207143 0.026935714
## Ca -0.2319286 -0.13256429 0.43020714 0.044035714 0.008778571
## Na 0.4703571 0.08577857 0.02693571 0.008778571 0.195492857
## Al Fe Mg Ca Na
## Al 0.020 0.00500 0.01700 0.00100 -0.0020
## Fe 0.005 0.00125 0.00425 0.00025 -0.0005
## Mg 0.017 0.00425 0.01445 0.00085 -0.0017
## Ca 0.001 0.00025 0.00085 0.00005 -0.0001
## Na -0.002 -0.00050 -0.00170 -0.00010 0.0002
## Al Fe Mg Ca Na
## Al 12.6080 -2.18680 0.22440 0.15960 0.15240
## Fe -2.1868 0.76028 -0.03554 -0.02076 -0.01204
## Mg 0.2244 -0.03554 0.00412 0.00268 0.00272
## Ca 0.1596 -0.02076 0.00268 0.00272 0.00228
## Na 0.1524 -0.01204 0.00272 0.00228 0.00312
## Al Fe Mg Ca Na
## Al 11.0080 -3.01320 -0.07960 0.17780 -0.03180
## Fe -3.0132 2.16688 0.11704 -0.00212 -0.00648
## Mg -0.0796 0.11704 0.01612 0.00164 -0.00034
## Ca 0.1778 -0.00212 0.00164 0.00468 -0.00088
## Na -0.0318 -0.00648 -0.00034 -0.00088 0.00048
Les K matrices sont ensuite additionnées pour obtenir la SS Intra (SS_Intra).
SS_Intra = Reduce("+", SS_partiel_Intra)
print(SS_Intra)
## Al Fe Mg Ca Na
## Al 48.2881429 7.08007143 0.60801429 0.10647143 0.58895714
## Fe 7.0800714 10.95084571 0.52705714 -0.15519429 0.06675857
## Mg 0.6080143 0.52705714 15.42961143 0.43537714 0.02761571
## Ca 0.1064714 -0.15519429 0.43537714 0.05148571 0.01007857
## Na 0.5889571 0.06675857 0.02761571 0.01007857 0.19929286
Pour obtenir l’inertie inter classe, il suffit de soustraire l’inertie intra classe à l’inertie totale :
SS_Inter = SS_Tot - SS_Intra
print(SS_Inter)
## Al Fe Mg Ca Na
## Al 175.610319 -149.295533 -130.809707 -5.8891637 -5.3722648
## Fe -149.295533 134.221616 117.745035 4.8217866 5.3259491
## Mg -130.809707 117.745035 103.350527 4.2091613 4.7105458
## Ca -5.889164 4.821787 4.209161 0.2047027 0.1547830
## Na -5.372265 5.325949 4.710546 0.1547830 0.2582456
lambda <- (det(SS_Intra))/(det(SS_Inter + SS_Intra))
print(lambda)
## [1] 0.01230091
n = nrow(df)
correction = -(n-1-(P+K)/2)*log(lambda)
print(correction)
## Class
## 90.16069
critique = qchisq(p = 0.05, P*(K-1))
print(critique)
## [1] 7.260944
model <- manova( as.matrix(X) ~ Y )
summary(model, test = 'Wilks')
## Df Wilks approx F num Df den Df Pr(>F)
## Y 3 0.012301 13.088 15 50.091 1.84e-12 ***
## Residuals 22
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
MANOVA <- function(X,Y){
N = nrow(X)
var_pred = X %>% select_if(is.numeric)
P = ncol(var_pred)
K = sapply(Y, function(x) sum(!duplicated(x)))
XK <- split(x = X, f = Y)
NK <- sapply(XK, nrow)
nb_col <- which(sapply(X, is.numeric))
GK <- lapply(XK, function(x) colMeans(x[, nb_col]))
G <- sapply(X,mean)
#SS_Tot
matrice <- as.matrix(X)
matrice2 <- matrix(rep(G, each = nrow(matrice)), nrow = nrow(matrice), ncol = ncol(matrice))
difference = matrice - matrice2
SS_Tot = t(difference)%*% difference
#SS_Intra
SS_partiel_Intra <- list()
for (group in unique(Y)){
df_group <- X[Y == group,]
Gk <- colMeans(df_group[, nb_col])
mat_group <- as.matrix(df_group[, nb_col])
mat_Gk <- matrix(rep(Gk, each = nrow(mat_group)), nrow = nrow(mat_group), ncol = ncol(mat_group))
difference <- mat_group - mat_Gk
#Matrice difference x sa transposée
SSk_intra <- t(difference) %*% difference
print(SSk_intra)
SS_partiel_Intra <- c(SS_partiel_Intra, list(SSk_intra))
}
SS_Intra = Reduce("+", SS_partiel_Intra)
#SS_Inter
SS_Inter = SS_Tot - SS_Intra
#Lambda
lambda = (det(SS_Intra))/(det(SS_Inter + SS_Intra))
#Liste qui contient tous les résultats
listfunc <- list("SS_Tot" = SS_Tot , "SS_Intra" = SS_Intra, "SS_Inter" = SS_Inter, "GK" = GK, "G" = G, "NK" = NK, "P" = P, "N" = N, "Lambda" = lambda)
return(listfunc)
}
Nous utilisons le datafram iris, fourni par défaut dans R, afin de tester la fonction MANOVA que nous venons d’écrire.
X <- iris[,1:4]
Y <- iris$Species
MANOVA(X,Y)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 6.0882 4.8616 0.8014 0.5062
## Sepal.Width 4.8616 7.0408 0.5732 0.4556
## Petal.Length 0.8014 0.5732 1.4778 0.2974
## Petal.Width 0.5062 0.4556 0.2974 0.5442
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 13.0552 4.174 8.962 2.7332
## Sepal.Width 4.1740 4.825 4.050 2.0190
## Petal.Length 8.9620 4.050 10.820 3.5820
## Petal.Width 2.7332 2.019 3.582 1.9162
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 19.8128 4.5944 14.8612 2.4056
## Sepal.Width 4.5944 5.0962 3.4976 2.3338
## Petal.Length 14.8612 3.4976 14.9248 2.3924
## Petal.Width 2.4056 2.3338 2.3924 3.6962
## $SS_Tot
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 102.168333 -6.322667 189.8730 76.92433
## Sepal.Width -6.322667 28.306933 -49.1188 -18.12427
## Petal.Length 189.873000 -49.118800 464.3254 193.04580
## Petal.Width 76.924333 -18.124267 193.0458 86.56993
##
## $SS_Intra
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 38.9562 13.6300 24.6246 5.6450
## Sepal.Width 13.6300 16.9620 8.1208 4.8084
## Petal.Length 24.6246 8.1208 27.2226 6.2718
## Petal.Width 5.6450 4.8084 6.2718 6.1566
##
## $SS_Inter
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 63.21213 -19.95267 165.2484 71.27933
## Sepal.Width -19.95267 11.34493 -57.2396 -22.93267
## Petal.Length 165.24840 -57.23960 437.1028 186.77400
## Petal.Width 71.27933 -22.93267 186.7740 80.41333
##
## $GK
## $GK$setosa
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 5.006 3.428 1.462 0.246
##
## $GK$versicolor
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 5.936 2.770 4.260 1.326
##
## $GK$virginica
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 6.588 2.974 5.552 2.026
##
##
## $G
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 5.843333 3.057333 3.758000 1.199333
##
## $NK
## setosa versicolor virginica
## 50 50 50
##
## $P
## [1] 4
##
## $N
## [1] 150
##
## $Lambda
## [1] 0.02343863
Cette fois-ci, nous utilisons directement la fonction Manova de RStudio :
model <- manova(as.matrix(X)~Y)
summary(model,test= 'Wilks')
## Df Wilks approx F num Df den Df Pr(>F)
## Y 2 0.023439 199.15 8 288 < 2.2e-16 ***
## Residuals 147
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Nous trouvons la même valeur de Lambda qu’avec notre fonction MANOVA().