library(ade4)
library(adegraphics)
library(microbenchmark)
library(benchmarkme)
library(profvis)
library("parallel")
library("ggplot2movies")
library(factoextra)
library(readr)
library(dplyr)
library(cluster)
library(NbClust)

Introduction to Clustering (TD1)

The goal is always to group individuals according to their similarities, so that the individuals within each group are homogeneous. A further objective is to recover a certain hierarchy.

1 and 2

# create a sequence of factor levels
pop=gl(n = 4,k = 5,labels = c("rouge","vert","bleu","jaune"))
pop
##  [1] rouge rouge rouge rouge rouge vert  vert  vert  vert  vert  bleu 
## [12] bleu  bleu  bleu  bleu  jaune jaune jaune jaune jaune
## Levels: rouge vert bleu jaune
# sample() shuffles the observations
w1 = sample(pop)
w1
##  [1] bleu  jaune vert  jaune vert  bleu  jaune vert  vert  bleu  rouge
## [12] bleu  rouge rouge rouge jaune vert  bleu  rouge jaune
## Levels: rouge vert bleu jaune
# split() returns a list of the indices belonging to each level
s=split(1:20, w1)
str(s)
## List of 4
##  $ rouge: int [1:5] 11 13 14 15 19
##  $ vert : int [1:5] 3 5 8 9 17
##  $ bleu : int [1:5] 1 6 10 12 18
##  $ jaune: int [1:5] 2 4 7 16 20
# load the mortality data
me <- read.table("http://pbil.univ-lyon1.fr/R/donnees/mortality_Europe.txt", h=TRUE)
names(me)
##  [1] "State" "Code"  "CS0"   "CS1"   "CS1A"  "CS1B"  "CS1C"  "CS1D" 
##  [9] "CS1E"  "CS2"   "CS2A"  "CS2B"  "CS2C"  "CS2D"  "CS2E"  "CS2F" 
## [17] "CS2H"  "CS2I"  "CS2J"  "CS2K"  "CS2L"  "CS2M"  "CS2N"  "CS2P" 
## [25] "CS3"   "CS3A"  "CS3B"
# extract selected variables
 mame <- me[,c(5:9,11:24)]
 # use the Code variable as the row names (individuals)
 rownames(mame) <- me$Code
 head(mame)
 # summary
 summary(mame)
##       CS1A             CS1B             CS1C               CS1D       
##  Min.   : 0.000   Min.   : 0.100   Min.   :0.000000   Min.   :0.0000  
##  1st Qu.: 0.275   1st Qu.: 0.550   1st Qu.:0.000000   1st Qu.:0.1000  
##  Median : 0.550   Median : 1.550   Median :0.000000   Median :0.1000  
##  Mean   : 2.221   Mean   : 5.121   Mean   :0.007143   Mean   :0.3107  
##  3rd Qu.: 2.050   3rd Qu.: 5.600   3rd Qu.:0.000000   3rd Qu.:0.3000  
##  Max.   :14.100   Max.   :32.200   Max.   :0.100000   Max.   :1.5000  
##       CS1E             CS2A              CS2B            CS2C       
##  Min.   :0.0000   Min.   :  0.600   Min.   :0.000   Min.   : 0.000  
##  1st Qu.:0.0000   1st Qu.:  6.375   1st Qu.:0.100   1st Qu.: 0.475  
##  Median :0.0500   Median : 15.250   Median :0.300   Median : 1.350  
##  Mean   :0.3821   Mean   : 43.521   Mean   :1.475   Mean   : 3.711  
##  3rd Qu.:0.3000   3rd Qu.: 35.925   3rd Qu.:1.200   3rd Qu.: 3.025  
##  Max.   :4.4000   Max.   :233.300   Max.   :8.800   Max.   :24.100  
##       CS2D            CS2E            CS2F             CS2H       
##  Min.   :0.000   Min.   :0.000   Min.   : 0.200   Min.   :  0.70  
##  1st Qu.:0.075   1st Qu.:0.300   1st Qu.: 0.475   1st Qu.: 12.03  
##  Median :0.250   Median :0.550   Median : 4.250   Median : 24.75  
##  Mean   :1.361   Mean   :1.082   Mean   :14.386   Mean   : 56.04  
##  3rd Qu.:1.025   3rd Qu.:0.925   3rd Qu.:10.775   3rd Qu.: 53.10  
##  Max.   :9.200   Max.   :7.100   Max.   :87.600   Max.   :329.60  
##       CS2I            CS2J             CS2K             CS2L      
##  Min.   : 0.10   Min.   : 0.100   Min.   : 0.000   Min.   :0.000  
##  1st Qu.: 0.90   1st Qu.: 1.175   1st Qu.: 0.475   1st Qu.:0.000  
##  Median : 3.25   Median : 2.850   Median : 1.050   Median :0.100  
##  Mean   :10.23   Mean   : 7.404   Mean   : 3.768   Mean   :0.325  
##  3rd Qu.: 8.90   3rd Qu.: 5.450   3rd Qu.: 3.225   3rd Qu.:0.225  
##  Max.   :54.30   Max.   :40.000   Max.   :26.700   Max.   :1.900  
##       CS2M             CS2N             CS2P        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0750   1st Qu.:0.1000   1st Qu.:0.00000  
##  Median :0.2000   Median :0.2000   Median :0.00000  
##  Mean   :0.7643   Mean   :0.4357   Mean   :0.01786  
##  3rd Qu.:0.4250   3rd Qu.:0.3500   3rd Qu.:0.00000  
##  Max.   :4.2000   Max.   :2.0000   Max.   :0.20000
# boxplot (note the very different scales of the variables)
boxplot(mame)

# Euclidean distance between AUT and BEL
sqrt(sum((mame[1,]-mame[2,])^2))
## [1] 13.49074
# Euclidean distances between all pairs of individuals
dmame <- dist(mame)
 as.matrix(dmame)[1,2]
## [1] 13.49074
head(dmame)
## [1] 13.49074 10.43743 19.46767 21.23629 30.94608 17.76485
# Euclidean distance between France (row 8) and Germany (row 9)
sqrt(sum((mame[8,]-mame[9,])^2))
## [1] 198.2435
as.matrix(dmame)[8,9]
## [1] 198.2435
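
A side sketch (our addition): dist() supports other metrics, for instance the Manhattan (L1) distance, with the same manual cross-check as above.

dmame_man <- dist(mame, method = "manhattan")
as.matrix(dmame_man)[1, 2]
sum(abs(mame[1, ] - mame[2, ]))  # manual computation, should match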

3 Dendrograms

# vector of values
w <- c(0,1,2.1,3.3)

# as a data frame
w <- data.frame(w)
# distance matrix (lower triangle printed)
(dw <- dist(w))
##     1   2   3
## 2 1.0        
## 3 2.1 1.1    
## 4 3.3 2.3 1.2
 as.matrix(dw)
##     1   2   3   4
## 1 0.0 1.0 2.1 3.3
## 2 1.0 0.0 1.1 2.3
## 3 2.1 1.1 0.0 1.2
## 4 3.3 2.3 1.2 0.0

Single linkage

# hierarchical clustering with single linkage (minimum jump)
h_single=hclust(dw,"single")

Merge steps table (merge)

h_single$merge
##      [,1] [,2]
## [1,]   -1   -2
## [2,]   -3    1
## [3,]   -4    2

Branch lengths

h_single$height
## [1] 1.0 1.1 1.2

Dendrogram

 plot(h_single,hang=-1)

Complete linkage

# hierarchical clustering with complete linkage (maximum jump)
h_complete=hclust(dw,"complete")

Merge steps table (merge)

h_complete$merge
##      [,1] [,2]
## [1,]   -1   -2
## [2,]   -3   -4
## [3,]    1    2

Branch lengths

h_complete$height
## [1] 1.0 1.2 3.3

Dendrogram

plot(h_complete,hang=-1)

For average linkage, use "average"; a minimal sketch on the same toy distances:
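# average linkage: merge heights are mean inter-cluster distances
h_average <- hclust(dw, "average")
h_average$height
plot(h_average, hang = -1)

Exercise 3.3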

Looking at the previous boxplot, it is better to center and scale the data.

nmame <- scalewt(mame, center = TRUE, scale = TRUE)
dnmame <- dist(nmame)
h_mame <- hclust(dnmame, "complete")
plot(h_mame)
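
As an aside (our addition): base R's scale() gives almost the same standardization, except that it uses the n - 1 denominator for the standard deviation whereas scalewt() uses uniform 1/n weights, so the two differ by a constant factor.

# base-R standardization; the elementwise ratio is the constant sqrt(n/(n - 1))
nmame_base <- scale(mame)
head(nmame / nmame_base)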

With Ward's criterion

# same standardized distances, now with Ward's criterion
h_mame <- hclust(dnmame, "ward.D2")
plot(h_mame)

4 Ward's criterion

Exercise 1 (Euclidean case)

dmame_sq=dmame^2
dmame_sqrt=sqrt(dmame)

h_dmame=hclust(dmame,method = "complete")
h_dmame_sq=hclust(dmame_sq,method = "complete")
h_dmame_sqrt=hclust(dmame_sqrt,method = "complete")

par(mfrow=c(1,3))
# original scale
plot(h_dmame)
# squared distances
plot(h_dmame_sq)
# square-root distances
plot(h_dmame_sqrt)

Exercise 2 (Ward's criterion)

h_dmame_ward=hclust(dmame,method = "ward.D2")
plot(h_dmame_ward)

We naturally observe that ITA and DEU now join the other cluster.

5 Finding a partition

 plot(h_dmamew<-hclust(dmame,method = "ward.D2"), main="Dendrogram of the countries \n Ward's method", xlab = "Countries", sub = "")
 abline(h=250, col="red", lwd=1.5)

 # specify the number of groups
 parti <- cutree(h_dmamew,k=3)
 parti
## AUT BEL HRV CZE DNK EST FIN FRA DEU HUN ISL IRL ITA LVA LTU LUX MLT NLD 
##   1   1   1   1   1   1   1   2   3   1   1   1   2   1   1   1   1   1 
## NOR MDA ROU SVK SVN ESP SWE CHE MKD GBR 
##   1   1   2   1   1   2   1   1   1   2
 Partih <- cutree(h_dmamew, h = 250)  # cut at a given height instead
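
A quick consistency check (our addition): cutting at k = 3 and cutting at height 250 should give the same partition here.

table(parti, Partih)  # all counts on the diagonal = identical partitions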

Putting it all together

library(mvtnorm)
# simulate four Gaussian clusters and compare the linkage methods at k = 4
fc <- function(sd) {
  x1 <- rmvnorm(10, mean = c(-1, -1), sig = diag(.25, 2))
  x2 <- rmvnorm(10, mean = c(1, -1), sig = diag(sd, 2))
  x3 <- rmvnorm(10, mean = c(-1, 1), sig = diag(sd, 2))
  x4 <- rmvnorm(10, mean = c(1, 1), sig = diag(sd, 2))
  x <- rbind(x1, x2, x3, x4)
  init <- factor(rep(1:4, rep(10, 4)))
  # true (initial) classes
  gfi <- s.class(x, init, psub.text = "initial", psub.cex = 2, col = TRUE, plot = FALSE)
  # single linkage
  h0 <- hclust(dist(x), "single")
  parti <- as.factor(cutree(h0, k = 4))
  gfs <- s.class(x, parti, psub.text = "single", psub.cex = 2, col = TRUE, plot = FALSE)
  # average linkage
  h0 <- hclust(dist(x), "average")
  parti <- as.factor(cutree(h0, k = 4))
  gfa <- s.class(x, parti, psub.text = "average", psub.cex = 2, col = TRUE, plot = FALSE)
  # complete linkage
  h0 <- hclust(dist(x), "complete")
  parti <- as.factor(cutree(h0, k = 4))
  gfc <- s.class(x, parti, psub.text = "complete", psub.cex = 2, col = TRUE, plot = FALSE)
  ADEgS(list(gfi, gfs, gfa, gfc), layout = c(2, 2))
}
 
 # sample standard deviation 0.25
 fc(sd=0.25)

 # sample standard deviation 0.5
 fc(sd=0.5)

Advanced Clustering (TD2)

Criteria for measuring partition quality

me <- read.table("http://pbil.univ-lyon1.fr/R/donnees/mortality_Europe.txt", h=TRUE)
 mame <- me[,c(5:9,11:24)]
 rownames(mame) <- me$Code
 dmame <- dist(mame)
 classif <- hclust(d = dmame, method = "ward.D2")
 plot(classif, main="Dendrogram of the countries \n Ward's method", xlab = "Countries",
         sub = "")
 abline(h = 250, col = "red", lwd = 1.5)

 parti <- cutree(classif, h = 250)
 parti
## AUT BEL HRV CZE DNK EST FIN FRA DEU HUN ISL IRL ITA LVA LTU LUX MLT NLD 
##   1   1   1   1   1   1   1   2   3   1   1   1   2   1   1   1   1   1 
## NOR MDA ROU SVK SVN ESP SWE CHE MKD GBR 
##   1   1   2   1   1   2   1   1   1   2

Elbow criterion

 names(classif)
## [1] "merge"       "height"      "order"       "labels"      "method"     
## [6] "call"        "dist.method"
plot(rev(classif$height), type = 'l', main = "Decreasing dendrogram heights", ylab = "classif$height", xlab = "", las = 1)
 points(1:length(classif$height), rev(classif$height), pch = 20)

Inertia ratio

RSQ <- rep(0, nrow(mame))
SQTot <- sum(scale(mame, scale = FALSE)^2)
for (i in 1:nrow(mame)) {
  Cla <- as.factor(cutree(classif, i))
  # between-class sum of squares over the total sum of squares
  RSQ[i] <- sum(t((t(sapply(1:ncol(mame), function(j) tapply(mame[, j], Cla, mean))) -
                   apply(mame, 2, mean))^2) * as.vector(table(Cla))) / SQTot
}

# the idea is to get close to 1 with as few clusters as possible
plot(RSQ)

Pseudo-F

\[F_{pseudo}=\frac{R^2/(K-1)}{(1-R^2)/(N-K)}\sim F(K-1,N-K)\]

\(R^2\) \(\rightarrow\) between-class (Between) share of the variance; \(1-R^2\) \(\rightarrow\) within-class (Within) share.

pseudoF <- NULL
N <- length(RSQ)
# mind the parentheses: in R, 2:N-1 means (2:N)-1, not 2:(N-1)
for (i in 2:(N - 1)) {
  pseudoF <- c(pseudoF, (RSQ[i] / (i - 1)) / ((1 - RSQ[i]) / (N - i)))
}
plot(pseudoF)
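
To read off the number of classes (our addition): pseudoF[1] corresponds to K = 2, so shift the index of the maximum by one.

which.max(pseudoF) + 1  # K maximizing the pseudo-F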

Limits of the linkage methods used in TD1

Serpentine clusters

Exercise 1

DataS=tbl_df(read_csv("http://195.220.111.226/R/serp.csv"))
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   V1 = col_double(),
##   V2 = col_double()
## )
rownames(DataS)=DataS$X1
## Warning: Setting row names on a tibble is deprecated.
DataS=DataS[,-1]
# boxplot of the two coordinates (X1 was already dropped above)
boxplot(DataS)

# compute the Euclidean distance matrix
ddatas=dist(x = DataS)

# hierarchical clustering with Ward (ward.D2)
h_datas=hclust(ddatas,method= "ward.D2")

# plot 
plot(h_datas)

# elbow
plot(rev(h_datas$height), type = 'l', main = "Decreasing dendrogram heights", ylab = "h_datas$height", xlab = "", las = 1)
 points(1:length(h_datas$height), rev(h_datas$height), pch = 20)

# Gap statistic, reusing the same Euclidean + Ward clustering as above
# (a correlation-based distance is degenerate with only two variables)
mycluster <- function(x, k) list(cluster = cutree(hclust(dist(x), method = "ward.D2"), k = k))
myclusGap <- clusGap(DataS, FUN = mycluster,
                     K.max = 30,
                     B = 100)

plot(myclusGap, frame = FALSE, xlab = "Number of clusters k")
abline(v = 3, lty = 2)

fviz_dend(h_datas, cex = 0.5, k = 3, palette = "jco") 

Exercise 2

  • Do the same with the other methods.

Spherical clusters

set.seed(11101)
 library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
 C1 <- mvrnorm(120, c(0,0), matrix(c(.5,0,0,.5),2,2))
 C2 <- mvrnorm(40, c(4,2), matrix(c(.15,0,0,.15),2,2))
 C3 <- mvrnorm(40, c(4,-2), matrix(c(.15,0,0,.15),2,2))
 C4 <- mvrnorm(40, c(-4,3), matrix(c(.15,0,0,.15),2,2))
 P <- rbind(C1, C2, C3, C4)
 plot(P, pch = 19, cex = 0.75, col = 'navy', main = 'Spherical clusters',
         xlab = "", ylab = "", las = 1)

Exercise

  • Do the same (a sketch follows).
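
A minimal sketch (our addition) applying the same Ward pipeline to the spherical data P:

dP <- dist(P)
h_P <- hclust(dP, method = "ward.D2")
# elbow on the first dendrogram heights
plot(rev(h_P$height)[1:20], type = "b", pch = 20, las = 1,
     main = "Decreasing dendrogram heights", xlab = "", ylab = "")
# the k = 4 partition recovered by Ward
plot(P, pch = 19, cex = 0.75, col = cutree(h_P, k = 4), las = 1,
     main = "Ward partition, k = 4", xlab = "", ylab = "")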

Ovoid clusters

set.seed(11102)
 t = pi/6
 C1 <- mvrnorm(150, c(0,0), 1.4*matrix(c(cos(t),sin(t),-sin(t),cos(t)),2,2))
 C2 <- mvrnorm(40, c(4,2), matrix(c(.25,0,0,.25),2,2));
 C3 <- mvrnorm(40, c(4,-2), matrix(c(.25,0,0,.25),2,2))
 C4 <- mvrnorm(40, c(-4,3), matrix(c(.25,0,0,.25),2,2))
 P <- rbind(C1, C2, C3, C4)
 plot(P, pch = 19, cex = 0.75, col = 'navy', main = 'Ovoid clusters',
         xlab = "", ylab = "", las = 1)

Complexity of hclust

big=read.table("http://195.220.111.226/R/P200901_50000.csv",sep=";",header = T)
 T <- sapply(seq(from = 1000, to = 15000, by = 1000), function(n) system.time({
  cat(n,'\n');
  ind <- sort(sample(1:nrow(big), n));
  hclust(dist(big[ind,-1]), 'ward.D2');
 }))
## 1000 
## 2000 
## 3000 
## 4000 
## 5000 
## 6000 
## 7000 
## 8000 
## 9000 
## 10000 
## 11000 
## 12000 
## 13000 
## 14000 
## 15000
 plot(seq(from = 1000, to = 15000, by = 1000), T[3,], pch = 20, las = 1,
         main = "Complexité en temps" )

x <- seq(from = 1000, to = 15000, by = 1000)
M <- lm(T[3,] ~ I(x)+I(x^2)+I(x^3))
summary(M)
## 
## Call:
## lm(formula = T[3, ] ~ I(x) + I(x^2) + I(x^3))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.3555  -3.1903  -0.7896   4.0261  14.0115 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -1.394e+01  1.022e+01  -1.364  0.19993   
## I(x)         1.122e-02  5.356e-03   2.095  0.06011 . 
## I(x^2)      -1.997e-06  7.650e-07  -2.611  0.02423 * 
## I(x^3)       1.087e-10  3.150e-11   3.451  0.00542 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.538 on 11 degrees of freedom
## Multiple R-squared:  0.9083, Adjusted R-squared:  0.8833 
## F-statistic: 36.31 on 3 and 11 DF,  p-value: 5.33e-06

Exercise

# predicted time (in seconds) for n = 1,000,000 individuals
n <- 1e6
pred <- M$coefficients[[1]] + n*M$coefficients[[2]] +
    (n^2)*M$coefficients[[3]] + (n^3)*M$coefficients[[4]]
pred
## [1] 106698049
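
Equivalently (our addition), predict() evaluates the fitted polynomial directly; about 1.07e8 seconds is roughly 3.4 years, so hclust is not an option at this scale.

predict(M, newdata = data.frame(x = 1e6))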

Intro to k-means

Exercise 1

D <- matrix(c(0,0,0,1,0,2,2,1,3,1,3,2,0,5,1,4,-1,4), ncol = 2, byrow = TRUE)

# initial centers
C <- matrix(c(0,4,3/2,2.4,1,4/5), ncol = 2, byrow = TRUE)

# k-means started from these centers
km.CD <- kmeans(D, centers = C)
km.CD
## K-means clustering with 3 clusters of sizes 3, 3, 3
## 
## Cluster means:
##       [,1]     [,2]
## 1 0.000000 4.333333
## 2 2.666667 1.333333
## 3 0.000000 1.000000
## 
## Clustering vector:
## [1] 3 3 3 2 2 2 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 2.666667 1.333333 2.000000
##  (between_SS / total_SS =  85.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# final plot
 couleur <- c("blue", "red", "orange2")
 plot(D, asp = 1, pch = 19, col = couleur[km.CD$cluster])
 points(km.CD$centers, pch = 8, col = couleur, cex = 2)

Exercise 2

R <- NULL
for (i in 1:50) {
  km.p <- kmeans(P, i)
  R <- c(R, km.p$betweenss / km.p$totss)
}

plot(R, type = 'l', main = "between_SS / total_SS vs. number of clusters", ylab = "R2", xlab = "Number of clusters", las = 1)
points(1:50, R, pch = 20)
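
As a cross-check (our addition): fviz_nbclust() from the factoextra package loaded above automates this within-cluster sum-of-squares elbow plot.

fviz_nbclust(as.data.frame(P), kmeans, method = "wss", k.max = 10)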

# one k-means run per (k, nstart) combination of the grid
gri <- expand.grid(3:6, seq(1, 50, 5))
R <- NULL
for (i in 1:nrow(gri)) {
  km.p <- kmeans(P, gri$Var1[i], nstart = gri$Var2[i])
  R <- c(R, km.p$betweenss / km.p$totss)
}
data=data.frame(cbind(R,gri))
data$Var2=as.factor(data$Var2)
ggplot(data = data,aes(y=R,x=Var1,col=Var2 ))+
  geom_line()

Variants of k-means

Risks of the initial centers

Exercise 1 (Lloyd)

set.seed(11103)
 C1 <- mvrnorm(120, c(0,0), matrix(c(0.5,0,0,0.5),2,2))
 C2 <- mvrnorm(40, c(4,2), matrix(c(0.15,0,0,0.15),2,2))
 C3 <- mvrnorm(40, c(4,-2), matrix(c(0.15,0,0,0.15),2,2))
 C4 <- mvrnorm(40, c(-4,3), matrix(c(0.15,0,0,0.15),2,2))
 P <- data.frame(rbind(C1, C2, C3, C4))
 # initial centers; the last one is far from every point
 CK <- matrix(c(-6,0,2,0,4,4,10,-1), ncol = 2, byrow = TRUE)
 
 # k-means with Lloyd's algorithm; note that one cluster comes up empty
 km.Pfloyd=kmeans(P,CK,algorithm = "Lloyd")
## Warning: empty cluster: try a better set of initial centers

## Warning: empty cluster: try a better set of initial centers
 # plot the classes
 fviz_cluster(km.Pfloyd, data = P, 
             ellipse.type = "convex",
             palette = "jco",
             repel = TRUE,
             ggtheme = theme_minimal(),geom = "point",main = "Lloyd")

 # with a single iteration
 km.Pfloyd1=kmeans(P,CK,algorithm = "Lloyd",iter.max = 1) 
## Warning: empty cluster: try a better set of initial centers
## Warning: did not converge in 1 iteration
## Warning: empty cluster: try a better set of initial centers
 # note: the algorithm did not converge
fviz_cluster(km.Pfloyd1, data = P, 
             ellipse.type = "convex",
             palette = "jco",
             repel = TRUE,
             ggtheme = theme_minimal(),geom = "point",main = "Lloyd, one iteration")

Exercise 2 (Forgy)

km.Pforgy= kmeans(x = P,centers = 4,algorithm = "Forgy")
km.Pforgy
## K-means clustering with 4 clusters of sizes 40, 55, 65, 80
## 
## Cluster means:
##           X1         X2
## 1 -4.0291091  2.9649813
## 2  0.6985004 -0.2107204
## 3 -0.4093240  0.1973451
## 4  3.9956840 -0.0827769
## 
## Clustering vector:
##   [1] 3 3 2 2 3 3 2 2 3 3 3 3 2 3 3 2 2 3 3 2 3 3 2 3 2 3 3 3 2 2 3 2 3 2 2
##  [36] 3 2 3 3 2 2 3 3 3 3 3 3 2 2 2 3 2 2 2 2 2 3 3 3 3 3 2 3 2 2 2 2 3 2 3
##  [71] 2 3 2 2 3 2 3 3 3 3 2 3 3 3 2 3 2 3 3 2 2 2 2 2 3 3 2 2 3 3 3 3 3 2 3
## [106] 2 2 2 3 2 2 2 3 3 3 3 2 3 2 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [141] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [176] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1]  12.94407  43.34795  43.52353 348.33175
##  (between_SS / total_SS =  82.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
 # the separation is much more distinct here;
# however, this method keeps the number of classes fixed at the start
fviz_cluster(km.Pforgy, data = P, 
             ellipse.type = "convex",
             palette = "jco",
             repel = TRUE,
             ggtheme = theme_minimal(),geom = "point",main = "Forgy")

Cluster stability

Forgy trial

km.Pforgy1= kmeans(x = P,centers = 6,algorithm = "Forgy",nstart = 20)
## Warning: did not converge in 10 iterations
## (warning repeated for several of the 20 random starts)
km.Pforgy2= kmeans(x = P,centers = 4,algorithm = "Forgy")
km.Pforgy3= kmeans(x = P,centers = 4,algorithm = "Forgy")
p1=fviz_cluster(km.Pforgy1, data = P, 
             ellipse.type = "convex",
             palette = "jco",
             repel = TRUE,
             ggtheme = theme_minimal(),geom = "point",main = "Forgy")
p2=fviz_cluster(km.Pforgy2, data = P, 
             ellipse.type = "convex",
             palette = "jco",
             repel = TRUE,
             ggtheme = theme_minimal(),geom = "point",main = "Forgy")
p3=fviz_cluster(km.Pforgy3, data = P, 
             ellipse.type = "convex",
             palette = "jco",
             repel = TRUE,
             ggtheme = theme_minimal(),geom = "point",main = "Forgy")
# par(mfrow) does not apply to ggplot objects, so print them one by one
p1

p2

p3

Hartigan trial

  • Do the same.

MacQueen trial

  • Do the same; a minimal sketch covering both variants follows.
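
A minimal sketch (our addition) of the two remaining variants on the same data P, plotted as above:

km.Phart <- kmeans(P, centers = 4, algorithm = "Hartigan-Wong", nstart = 20)
km.Pmacq <- kmeans(P, centers = 4, algorithm = "MacQueen", nstart = 20)
fviz_cluster(km.Phart, data = P, ellipse.type = "convex", palette = "jco",
             ggtheme = theme_minimal(), geom = "point", main = "Hartigan-Wong")
fviz_cluster(km.Pmacq, data = P, ellipse.type = "convex", palette = "jco",
             ggtheme = theme_minimal(), geom = "point", main = "MacQueen")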

Exercise (with a larger nstart)

km.Pforgy=kmeans(P,4,nstart = 1000,algorithm = "Forgy")
## Warning: did not converge in 10 iterations
## (warning repeated many times over the 1000 random starts)
fviz_cluster(km.Pforgy, data = P, 
             ellipse.type = "convex",
             palette = "jco",
             repel = TRUE,
             ggtheme = theme_minimal(),geom = "point",main = "Forgy")

km.Pforgy
## K-means clustering with 4 clusters of sizes 40, 40, 119, 41
## 
## Cluster means:
##           X1           X2
## 1 -4.0291091  2.964981292
## 2  3.9926048 -2.095184186
## 3  0.0832006 -0.007283337
## 4  3.9478322  1.933896163
## 
## Clustering vector:
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 4 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [141] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [176] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1]  12.94407  13.91159 120.66077  14.72071
##  (between_SS / total_SS =  93.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

Performance of the variants

Exercise

Lloyd

bigger <- na.omit(big[,-56])
T <- sapply(seq(from=1000,to=nrow(bigger),by=5000), function(n) system.time({
  cat(n,'\n');
  ind <- sort(sample(1:nrow(bigger),n));
  kmeans(bigger[ind,],3,algorithm = "Lloyd");
}))
plot(seq(from=1000,to=nrow(bigger),by=5000),T[3,],type="l")

Forgy

T <- sapply(seq(from=1000,to=nrow(bigger),by=5000), function(n) system.time({
  cat(n,'\n');
  ind <- sort(sample(1:nrow(bigger),n));
  kmeans(bigger[ind,],3,algorithm = "Forgy");
}))
plot(seq(from=1000,to=nrow(bigger),by=5000),T[3,],type="l")

MacQueen

T <- sapply(seq(from=1000,to=nrow(bigger),by=5000), function(n) system.time({
  cat(n,'\n');
  ind <- sort(sample(1:nrow(bigger),n));
  kmeans(bigger[ind,],3,algorithm = "MacQueen");
}))
plot(seq(from=1000,to=nrow(bigger),by=5000),T[3,],type="l")
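
As a complement (our addition, using the microbenchmark package loaded at the top): compare the three variants head to head at a fixed sample size.

ind <- sort(sample(1:nrow(bigger), 5000))
microbenchmark(
  Lloyd    = kmeans(bigger[ind,], 3, algorithm = "Lloyd"),
  Forgy    = kmeans(bigger[ind,], 3, algorithm = "Forgy"),
  MacQueen = kmeans(bigger[ind,], 3, algorithm = "MacQueen"),
  times = 10
)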

Parallelizing the code (very important, but for later)
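
A minimal preview (our addition, using the parallel package loaded at the top; mclapply() relies on forking, which is unavailable on Windows, so use parLapply() there instead):

# the hclust timing benchmark, with sample sizes dispatched across cores
sizes <- seq(from = 1000, to = 15000, by = 1000)
times <- mclapply(sizes, function(n) {
  ind <- sort(sample(1:nrow(big), n))
  system.time(hclust(dist(big[ind, -1]), "ward.D2"))[3]
}, mc.cores = max(1, detectCores() - 1))
plot(sizes, unlist(times), pch = 20, las = 1, main = "Timings, parallel runs")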