library(ade4)
## Warning: package 'ade4' was built under R version 3.4.4
library(adegraphics)
## Warning: package 'adegraphics' was built under R version 3.4.4
library(microbenchmark)
## Warning: package 'microbenchmark' was built under R version 3.4.4
library(benchmarkme)
library(profvis)
library("parallel")
library("ggplot2movies")
library(factoextra)
library(readr)
library(dplyr)
library(cluster)
## Warning: package 'cluster' was built under R version 3.4.4
library(NbClust)
L’objectif est de toujours regrouper des individus par leurs similarités. En sachant que les individus de chaque groupes doivent être homogénes. L’objectif est aussi de retrouver une certaine hierarchie.
# créer une séquence de factor
pop=gl(n = 4,k = 5,labels = c("rouge","vert","bleu","jaune"))
pop
## [1] rouge rouge rouge rouge rouge vert vert vert vert vert bleu
## [12] bleu bleu bleu bleu jaune jaune jaune jaune jaune
## Levels: rouge vert bleu jaune
# sample pour mélanger l'échantillon
w1 = sample(pop)
w1
## [1] bleu jaune vert jaune vert bleu jaune vert vert bleu rouge
## [12] bleu rouge rouge rouge jaune vert bleu rouge jaune
## Levels: rouge vert bleu jaune
# renvoie une liste les éléments de chaque indices
s=split(1:20, w1)
str(s)
## List of 4
## $ rouge: int [1:5] 11 13 14 15 19
## $ vert : int [1:5] 3 5 8 9 17
## $ bleu : int [1:5] 1 6 10 12 18
## $ jaune: int [1:5] 2 4 7 16 20
# Chargment donnée de mortalité
me <- read.table("http://pbil.univ-lyon1.fr/R/donnees/mortality_Europe.txt", h=TRUE)
names(me)
## [1] "State" "Code" "CS0" "CS1" "CS1A" "CS1B" "CS1C" "CS1D"
## [9] "CS1E" "CS2" "CS2A" "CS2B" "CS2C" "CS2D" "CS2E" "CS2F"
## [17] "CS2H" "CS2I" "CS2J" "CS2K" "CS2L" "CS2M" "CS2N" "CS2P"
## [25] "CS3" "CS3A" "CS3B"
# extration de certaines variables
mame <- me[,c(5:9,11:24)]
# la variable code comme individu
rownames(mame) <- me$Code
head(mame)
# summary
summary(mame)
## CS1A CS1B CS1C CS1D
## Min. : 0.000 Min. : 0.100 Min. :0.000000 Min. :0.0000
## 1st Qu.: 0.275 1st Qu.: 0.550 1st Qu.:0.000000 1st Qu.:0.1000
## Median : 0.550 Median : 1.550 Median :0.000000 Median :0.1000
## Mean : 2.221 Mean : 5.121 Mean :0.007143 Mean :0.3107
## 3rd Qu.: 2.050 3rd Qu.: 5.600 3rd Qu.:0.000000 3rd Qu.:0.3000
## Max. :14.100 Max. :32.200 Max. :0.100000 Max. :1.5000
## CS1E CS2A CS2B CS2C
## Min. :0.0000 Min. : 0.600 Min. :0.000 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.: 6.375 1st Qu.:0.100 1st Qu.: 0.475
## Median :0.0500 Median : 15.250 Median :0.300 Median : 1.350
## Mean :0.3821 Mean : 43.521 Mean :1.475 Mean : 3.711
## 3rd Qu.:0.3000 3rd Qu.: 35.925 3rd Qu.:1.200 3rd Qu.: 3.025
## Max. :4.4000 Max. :233.300 Max. :8.800 Max. :24.100
## CS2D CS2E CS2F CS2H
## Min. :0.000 Min. :0.000 Min. : 0.200 Min. : 0.70
## 1st Qu.:0.075 1st Qu.:0.300 1st Qu.: 0.475 1st Qu.: 12.03
## Median :0.250 Median :0.550 Median : 4.250 Median : 24.75
## Mean :1.361 Mean :1.082 Mean :14.386 Mean : 56.04
## 3rd Qu.:1.025 3rd Qu.:0.925 3rd Qu.:10.775 3rd Qu.: 53.10
## Max. :9.200 Max. :7.100 Max. :87.600 Max. :329.60
## CS2I CS2J CS2K CS2L
## Min. : 0.10 Min. : 0.100 Min. : 0.000 Min. :0.000
## 1st Qu.: 0.90 1st Qu.: 1.175 1st Qu.: 0.475 1st Qu.:0.000
## Median : 3.25 Median : 2.850 Median : 1.050 Median :0.100
## Mean :10.23 Mean : 7.404 Mean : 3.768 Mean :0.325
## 3rd Qu.: 8.90 3rd Qu.: 5.450 3rd Qu.: 3.225 3rd Qu.:0.225
## Max. :54.30 Max. :40.000 Max. :26.700 Max. :1.900
## CS2M CS2N CS2P
## Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0750 1st Qu.:0.1000 1st Qu.:0.00000
## Median :0.2000 Median :0.2000 Median :0.00000
## Mean :0.7643 Mean :0.4357 Mean :0.01786
## 3rd Qu.:0.4250 3rd Qu.:0.3500 3rd Qu.:0.00000
## Max. :4.2000 Max. :2.0000 Max. :0.20000
#boxplot
boxplot(mame)
# pour calculer la distance euclidien entre Aut et Bel
sqrt(sum((mame[1,]-mame[2,])^2))
## [1] 13.49074
#calcul de distance euclidienne pour tout les individus
dmame <- dist(mame)
as.matrix(dmame)[1,2]
## [1] 13.49074
# pour calculer la distance euclidien entre Aut et Bel
sqrt(sum((mame[1,]-mame[2,])^2))
## [1] 13.49074
head(dmame)
## [1] 13.49074 10.43743 19.46767 21.23629 30.94608 17.76485
# distance euclidienne pour allemagne et france
sqrt(sum((mame[8,]-mame[9,])^2))
## [1] 198.2435
as.matrix(dmame)[8,9]
## [1] 198.2435
# vecteur de valeur
w <- c(0,1,2.1,3.3)
# sous forme
w <- data.frame(w)
# Sous
(dw <- dist(w))
## 1 2 3
## 2 1.0
## 3 2.1 1.1
## 4 3.3 2.3 1.2
as.matrix(dw)
## 1 2 3 4
## 1 0.0 1.0 2.1 3.3
## 2 1.0 0.0 1.1 2.3
## 3 2.1 1.1 0.0 1.2
## 4 3.3 2.3 1.2 0.0
# cluster avec méthode à saut minimal
h_single=hclust(dw,"single")
Table des étapes de regroupement (merge)
h_single$merge
## [,1] [,2]
## [1,] -1 -2
## [2,] -3 1
## [3,] -4 2
Longueur des branches
h_single$height
## [1] 1.0 1.1 1.2
Dendograme
plot(h_single,hang=-1)
# cluster avec méthode à saut maximal ou complete
h_complete=hclust(dw,"complete")
Table des étapes de regroupement (merge)
h_complete$merge
## [,1] [,2]
## [1,] -1 -2
## [2,] -3 -4
## [3,] 1 2
Longueur des branches
h_complete$height
## [1] 1.0 1.2 3.3
dendograme
plot(h_complete,han=-1)
Pour les sauts moyen c’est average. ### Exercire 3.3
En considérant le boxplot precedent il vaut mieux centre et réduire les données.
nmame=scalewt(mame,scale = T,center = T)
dnmame=dist(nmame)
h_mame =hclust(dnmame,"complete")
plot(h_mame)
Avec le critére de WARD
nmame=scalewt(mame,scale = T,center = T)
dnmame=dist(nmame)
h_mame =hclust(dnmame,"ward.D2")
plote=plot(h_mame)
dmame_sq=dmame^2
dmame_sqrt=sqrt(dmame)
h_dmame=hclust(dmame,method = "complete")
h_dmame_sq=hclust(dmame_sq,method = "complete")
h_dmame_sqrt=hclust(dmame_sqrt,method = "complete")
par(mfrow=c(1,2))
# echelle normal
plot(h_dmame)
# le carré des distance
plot(h_dmame_sq)
# la racine des distances
plot(h_dmame_sqrt)
h_dmame_ward=hclust(dmame,method = "ward.D2")
plot(h_dmame_ward)
Nous constatons naturellement que ITA DEU, on rejoint l’autre classe.
plot(h_dmamew<-hclust(dmame,method = "ward.D2"), main="Dendrogramme des pays \n methode de Ward", xlab = "Les pays", sub = "")
abline(h=250, col="red", lwd=1.5)
# spécification du nombre de groupe
parti <- cutree(h_dmamew,k=3)
parti
## AUT BEL HRV CZE DNK EST FIN FRA DEU HUN ISL IRL ITA LVA LTU LUX MLT NLD
## 1 1 1 1 1 1 1 2 3 1 1 1 2 1 1 1 1 1
## NOR MDA ROU SVK SVN ESP SWE CHE MKD GBR
## 1 1 2 1 1 2 1 1 1 2
Partih<- cutree(h_dmamew,h=250)
library(mvtnorm)
fc <- function(sd) {
x1 <- rmvnorm(10, mean = c(-1, -1), sig=diag(.25, 2))
x2 <- rmvnorm(10, mean = c(1, -1), sig=diag(sd, 2))
x3 <- rmvnorm(10, mean = c(-1, 1), sig=diag(sd, 2))
x4 <- rmvnorm(10, mean = c(1, 1), sig=diag(sd, 2))
x <- rbind(x1,x2,x3,x4)
init <- factor(rep(1:4,rep(10,4)))
#
gfi <- s.class(x, init, psub.text="initial",psub.cex=2, col=TRUE, plot=FALSE)
#
h0 <- hclust(dist(x),"single")
parti <- as.factor(cutree(h0,k=4))
gfs <- s.class(x, parti, psub.text="single", psub.cex=2, col=TRUE, plot=FALSE)
#
h0 <- hclust(dist(x),"average")
parti <- as.factor(cutree(h0,k=4))
gfa <- s.class(x, parti, psub.text="average", psub.cex=2, col=TRUE, plot=FALSE)
#
h0 <- hclust(dist(x),"complete")
parti <- as.factor(cutree(h0,k=4))
gfc <- s.class(x, parti, psub.text="complete", psub.cex=2, col=TRUE, plot=FALSE)
#
ADEgS(list(gfi,gfs,gfa,gfc), layout=c(2,2))
}
# Ecart type échantillon à 0.25
fc(sd=0.25)
#Ecart type echantillon à 0.5
fc(sd=0.25)
me <- read.table("http://pbil.univ-lyon1.fr/R/donnees/mortality_Europe.txt", h=TRUE)
mame <- me[,c(5:9,11:24)]
rownames(mame) <- me$Code
dmame <- dist(mame)
classif <- hclust(d = dmame, method = "ward.D2")
plot(classif, main="Dendrogramme des pays \n methode de Ward", xlab = "Les pays",
sub = "")
abline(h = 250, col = "red", lwd = 1.5)
parti <- cutree(classif, h = 250)
parti
## AUT BEL HRV CZE DNK EST FIN FRA DEU HUN ISL IRL ITA LVA LTU LUX MLT NLD
## 1 1 1 1 1 1 1 2 3 1 1 1 2 1 1 1 1 1
## NOR MDA ROU SVK SVN ESP SWE CHE MKD GBR
## 1 1 2 1 1 2 1 1 1 2
names(classif)
## [1] "merge" "height" "order" "labels" "method"
## [6] "call" "dist.method"
plot(rev(classif$height), type = 'l', main = "hauteurs du dendrogramme décroissantes", ylab = "classif$height", xlab = "", las = 1)
points(1:length(classif$height), rev(classif$height), pch = 20)
RSQ <- rep(0, nrow(mame))
sum(scale(mame, scale = FALSE)^2) -> SQTot
for (i in 1:nrow(mame)) {
for(j in 1:ncol(mame)) {
Cla <- as.factor(cutree(h_dmame, i))
sum(t((t(sapply(1:ncol(mame), function(j) tapply(mame[,j], Cla, mean)))-
apply(mame, 2, mean))^2) * as.vector(table(Cla)))/SQTot -> RSQ[i]
}
}
# L'idée étant d'essayer de s'approcher de 1 avec le moins de découpage
plot(RSQ)
### pseudo F (reprendre la lecture ici ) \[F_{pseudo}=\frac{\frac{R^2}{K-1}}{\frac{1-R^2}{n-K}}\sim F(K-1,N-K)\] \(R^2\) $$ coefficient de corrélation inter-classe (Between) \(1-R^2\) \(\rightarrow\) coefficient de correlation intra-classe (Within )
pseudoF=NULL
N=length(RSQ)
for(i in 2:N-1){
pseudoF=c(pseudoF,(RSQ[i]/i-1)/((1-RSQ[i])/(N-i) ))
}
plot(pseudoF)
DataS=tbl_df(read_csv("http://195.220.111.226/R/serp.csv"))
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## V1 = col_double(),
## V2 = col_double()
## )
rownames(DataS)=DataS$X1
## Warning: Setting row names on a tibble is deprecated.
DataS=DataS[,-1]
#boxplot
boxplot(DataS[,-1])
# calcule de distance
ddatas=dist(x = DataS)
# cluster avec ward2
h_datas=hclust(ddatas,method= "ward.D2")
# plot
plot(h_datas)
# coude
plot(rev(h_datas$height), type = 'l', main = "hauteurs du dendrogramme décroissantes", ylab = "h_datas$height", xlab = "", las = 1)
points(1:length(h_datas$height), rev(h_datas$height), pch = 20)
# Gap stat
mydist <- function(DataS) as.dist((1-cor(t(DataS)))/2)
mycluster <- function(DataS, k) list(cluster=cutree(hclust(mydist(DataS), method = "ward.D2"),k=k))
myclusGap <- clusGap(DataS,FUN = mycluster,
K.max = 30,
B = 100)
plot(myclusGap, frame = FALSE, xlab = "Number of clusters k")
abline(v = 3, lty = 2)
fviz_dend(h_datas, cex = 0.5, k = 3, palette = "jco")
####Exo 2 * faire pareil avec les autres methodes ### Des hamas sphérique
set.seed(11101)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
C1 <- mvrnorm(120, c(0,0), matrix(c(.5,0,0,.5),2,2))
C2 <- mvrnorm(40, c(4,2), matrix(c(.15,0,0,.15),2,2))
C3 <- mvrnorm(40, c(4,-2), matrix(c(.15,0,0,.15),2,2))
C4 <- mvrnorm(40, c(-4,3), matrix(c(.15,0,0,.15),2,2))
P <- rbind(C1, C2, C3, C4)
plot(P, pch = 19, cex = 0.75, col = 'navy', main = 'Amas Spheriques',
xlab = "", ylab = "", las = 1)
set.seed(11102)
t = pi/6
C1 <- mvrnorm(150, c(0,0), 1.4*matrix(c(cos(t),sin(t),-sin(t),cos(t)),2,2))
C2 <- mvrnorm(40, c(4,2), matrix(c(.25,0,0,.25),2,2));
C3 <- mvrnorm(40, c(4,-2), matrix(c(.25,0,0,.25),2,2))
C4 <- mvrnorm(40, c(-4,3), matrix(c(.25,0,0,.25),2,2))
P <- rbind(C1, C2, C3, C4)
plot(P, pch = 19, cex = 0.75, col = 'navy', main = 'Amas ovoïdes',
xlab = "", ylab = "", las = 1)
## compléxité de hclust
big=read.table("http://195.220.111.226/R/P200901_50000.csv",sep=";",header = T)
T <- sapply(seq(from = 1000, to = 15000, by = 1000), function(n) system.time({
cat(n,'\n');
ind <- sort(sample(1:nrow(big), n));
hclust(dist(big[ind,-1]), 'ward.D2');
}))
## 1000
## 2000
## 3000
## 4000
## 5000
## 6000
## 7000
## 8000
## 9000
## 10000
## 11000
## 12000
## 13000
## 14000
## 15000
plot(seq(from = 1000, to = 15000, by = 1000), T[3,], pch = 20, las = 1,
main = "Complexité en temps" )
x <- seq(from = 1000, to = 15000, by = 1000)
M <- lm(T[3,] ~ I(x)+I(x^2)+I(x^3))
summary(M)
##
## Call:
## lm(formula = T[3, ] ~ I(x) + I(x^2) + I(x^3))
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.3555 -3.1903 -0.7896 4.0261 14.0115
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.394e+01 1.022e+01 -1.364 0.19993
## I(x) 1.122e-02 5.356e-03 2.095 0.06011 .
## I(x^2) -1.997e-06 7.650e-07 -2.611 0.02423 *
## I(x^3) 1.087e-10 3.150e-11 3.451 0.00542 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.538 on 11 degrees of freedom
## Multiple R-squared: 0.9083, Adjusted R-squared: 0.8833
## F-statistic: 36.31 on 3 and 11 DF, p-value: 5.33e-06
I=I(1000000)
pred=M$coefficients[[1]]+ I*M$coefficients[[2]]+
(I^2)*M$coefficients[[3]]+(I^3)*M$coefficients[[4]]
# temps
pred
## [1] 106698049
D <- matrix(c(0,0,0,1,0,2,2,1,3,1,3,2,0,5,1,4,-1,4), ncol = 2, byrow = TRUE)
# centre initial
C <- matrix(c(0,4,3/2,2.4,1,4/5), ncol = 2, byrow = TRUE)
# fonctions kmeans
km.CD=kmeans(D,centers = C)
km.CD
## K-means clustering with 3 clusters of sizes 3, 3, 3
##
## Cluster means:
## [,1] [,2]
## 1 0.000000 4.333333
## 2 2.666667 1.333333
## 3 0.000000 1.000000
##
## Clustering vector:
## [1] 3 3 3 2 2 2 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 2.666667 1.333333 2.000000
## (between_SS / total_SS = 85.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# plot final
couleur <- c("blue", "red", "orange2")
plot(D, asp = 1, pch = 19, col = couleur[km.CD$cluster])
points(km.CD$centers, pch = 8, col = couleur, cex = 2)
###Exo 2
R=NULL
for(i in 1:50){
km.p=kmeans(P,i)
r=km.p$betweenss/km.p$totss
R=c(R,r)
}
plot(R, type = 'l', main = "hauteurs du dendrogramme décroissantes", ylab = "R2", xlab = "Class", las = 1)
points(1:50, pch = 20)
gri=expand.grid(3:6,seq(1,50,5))
R=NULL
for(i in 1:nrow(gri)){
for(j in 1:nrow(gri)){
km.p=kmeans(P,i,nstart = j)
r=km.p$betweenss/km.p$totss
R=c(R,r)
}
print(i)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
## [1] 37
## [1] 38
## [1] 39
## [1] 40
data=data.frame(cbind(R,gri))
data$Var2=as.factor(data$Var2)
ggplot(data = data,aes(y=R,x=Var1,col=Var2 ))+
geom_line()
## variante du k-means
set.seed(11103)
C1 <- mvrnorm(120, c(0,0), matrix(c(0.5,0,0,0.5),2,2))
C2 <- mvrnorm(40, c(4,2), matrix(c(0.15,0,0,0.15),2,2))
C3 <- mvrnorm(40, c(4,-2), matrix(c(0.15,0,0,0.15),2,2))
C4 <- mvrnorm(40, c(-4,3), matrix(c(0.15,0,0,0.15),2,2))
P <- data.frame(rbind(C1, C2, C3, C4))
# centre inital
CK <- matrix(c(-6,0,2,0,4,4,10,-1), ncol = 2, byrow = TRUE)
# kmeans avec l'algo de Floyd à noter qu'il y a une case de vide
km.Pfloyd=kmeans(P,CK,algorithm = "Lloyd")
## Warning: empty cluster: try a better set of initial centers
## Warning: empty cluster: try a better set of initial centers
# representation Classe
fviz_cluster(km.Pfloyd, data = P,
ellipse.type = "convex",
palette = "jco",
repel = TRUE,
ggtheme = theme_minimal(),geom = "point",main = "Lloyd ")
# Avec une seule itération
km.Pfloyd1=kmeans(P,CK,algorithm = "Lloyd",iter.max = 1)
## Warning: empty cluster: try a better set of initial centers
## Warning: did not converge in 1 iteration
## Warning: empty cluster: try a better set of initial centers
#rpZ
# Remarque nous n'avons pas eu de convergence
fviz_cluster(km.Pfloyd1, data = P,
ellipse.type = "convex",
palette = "jco",
repel = TRUE,
ggtheme = theme_minimal(),geom = "point",main = "Lloyd Avec une Iter")
#### Exo 2 (Forgy)
km.Pforgy= kmeans(x = P,centers = 4,algorithm = "Forgy")
km.Pforgy
## K-means clustering with 4 clusters of sizes 40, 55, 65, 80
##
## Cluster means:
## X1 X2
## 1 -4.0291091 2.9649813
## 2 0.6985004 -0.2107204
## 3 -0.4093240 0.1973451
## 4 3.9956840 -0.0827769
##
## Clustering vector:
## [1] 3 3 2 2 3 3 2 2 3 3 3 3 2 3 3 2 2 3 3 2 3 3 2 3 2 3 3 3 2 2 3 2 3 2 2
## [36] 3 2 3 3 2 2 3 3 3 3 3 3 2 2 2 3 2 2 2 2 2 3 3 3 3 3 2 3 2 2 2 2 3 2 3
## [71] 2 3 2 2 3 2 3 3 3 3 2 3 3 3 2 3 2 3 3 2 2 2 2 2 3 3 2 2 3 3 3 3 3 2 3
## [106] 2 2 2 3 2 2 2 3 3 3 3 2 3 2 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [141] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [176] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 12.94407 43.34795 43.52353 348.33175
## (between_SS / total_SS = 82.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# La séparation est beaucoup plus distinct ici
# cependant cette méthode respecte le nbre de classe fixée au départ
fviz_cluster(km.Pforgy, data = P,
ellipse.type = "convex",
palette = "jco",
repel = TRUE,
ggtheme = theme_minimal(),geom = "point",main = "Forgy")
km.Pforgy1= kmeans(x = P,centers = 6,algorithm = "Forgy",nstart = 20)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
km.Pforgy2= kmeans(x = P,centers = 4,algorithm = "Forgy")
km.Pforgy3= kmeans(x = P,centers = 4,algorithm = "Forgy")
p1=fviz_cluster(km.Pforgy1, data = P,
ellipse.type = "convex",
palette = "jco",
repel = TRUE,
ggtheme = theme_minimal(),geom = "point",main = "Forgy")
p2=fviz_cluster(km.Pforgy2, data = P,
ellipse.type = "convex",
palette = "jco",
repel = TRUE,
ggtheme = theme_minimal(),geom = "point",main = "Forgy")
p3=fviz_cluster(km.Pforgy3, data = P,
ellipse.type = "convex",
palette = "jco",
repel = TRUE,
ggtheme = theme_minimal(),geom = "point",main = "Forgy")
par(mfrow=c(2,2))
p1
p2
p3
#### Essai Hartigan Faire de même #### Essai Mac Queen
km.Pforgy=kmeans(P,4,nstart = 1000,algorithm = "Forgy")
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
fviz_cluster(km.Pforgy, data = P,
ellipse.type = "convex",
palette = "jco",
repel = TRUE,
ggtheme = theme_minimal(),geom = "point",main = "Forgy")
km.Pforgy
## K-means clustering with 4 clusters of sizes 40, 40, 119, 41
##
## Cluster means:
## X1 X2
## 1 -4.0291091 2.964981292
## 2 3.9926048 -2.095184186
## 3 0.0832006 -0.007283337
## 4 3.9478322 1.933896163
##
## Clustering vector:
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 4 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [141] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [176] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 12.94407 13.91159 120.66077 14.72071
## (between_SS / total_SS = 93.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
bigger=na.omit(big[,-56])
T <- sapply(seq(from=1000,to=nrow(big),by=5000),function(n) system.time({
cat(n,'\n');
ind <- sort(sample(1:nrow(big),n));
kmeans(bigger[ind,],3,algorithm = "Lloyd");
}))
plot(seq(from=100,to=50000,by=5000),T[3,],type="l")
bigger=na.omit(big[,-56])
T <- sapply(seq(from=1000,to=nrow(big),by=5000),function(n) system.time({
cat(n,'\n');
ind <- sort(sample(1:nrow(big),n));
kmeans(bigger[ind,],3,algorithm = "Forgy");
}))
plot(seq(from=100,to=50000,by=5000),T[3,],type="l")
bigger=na.omit(big[,-56])
T <- sapply(seq(from=1000,to=nrow(big),by=5000),function(n) system.time({
cat(n,'\n');
ind <- sort(sample(1:nrow(big),n));
kmeans(bigger[ind,],3,algorithm = "MacQueen");
}))
plot(seq(from=100,to=50000,by=5000),T[3,],type="l")