wss <- numeric(10)
for(k in 1:10){
clusters_k <- cutree(hc, k = k)
wss[k] <- sum(sapply(unique(clusters_k), function(cl){
points <- data_option1[clusters_k == cl, ]
if(nrow(points) <= 1) return(0)
sum(rowSums((points - matrix(colMeans(points), nrow(points), ncol(points), byrow = TRUE))^2))
}))
}
wss_df <- data.frame(k = 1:10, WSS = wss)
ggplot(wss_df, aes(x = k, y = WSS)) +
geom_point(size = 3) +
geom_line() +
labs(title = "Méthode du coude",
x = "Nombre de clusters", y = "WSS") +
theme_minimal()
set.seed(123)
stab_hclust1 <- clusterboot(
data_option1,
B = 100,
clustermethod = hclustCBI,
k = 3,
method = "ward.D2"
)
## boot 1
## boot 2
## boot 3
## boot 4
## boot 5
## boot 6
## boot 7
## boot 8
## boot 9
## boot 10
## boot 11
## boot 12
## boot 13
## boot 14
## boot 15
## boot 16
## boot 17
## boot 18
## boot 19
## boot 20
## boot 21
## boot 22
## boot 23
## boot 24
## boot 25
## boot 26
## boot 27
## boot 28
## boot 29
## boot 30
## boot 31
## boot 32
## boot 33
## boot 34
## boot 35
## boot 36
## boot 37
## boot 38
## boot 39
## boot 40
## boot 41
## boot 42
## boot 43
## boot 44
## boot 45
## boot 46
## boot 47
## boot 48
## boot 49
## boot 50
## boot 51
## boot 52
## boot 53
## boot 54
## boot 55
## boot 56
## boot 57
## boot 58
## boot 59
## boot 60
## boot 61
## boot 62
## boot 63
## boot 64
## boot 65
## boot 66
## boot 67
## boot 68
## boot 69
## boot 70
## boot 71
## boot 72
## boot 73
## boot 74
## boot 75
## boot 76
## boot 77
## boot 78
## boot 79
## boot 80
## boot 81
## boot 82
## boot 83
## boot 84
## boot 85
## boot 86
## boot 87
## boot 88
## boot 89
## boot 90
## boot 91
## boot 92
## boot 93
## boot 94
## boot 95
## boot 96
## boot 97
## boot 98
## boot 99
## boot 100
## Warning in validateCoords(lng, lat, funcName): Data contains 2 rows with either
## missing or invalid lat/lon values and will be ignored
Bien que cela puisse regrouper par défaut uniquement en fonction de l’envergure de la commune, il est intéressant d’un point de vue statistique de donner un poids aux communes pour mieux étaler le nuage de points. De plus, cela permettra de donner une ‘importance’ aux communes : deux communes avec des fragilités identiques n’obtiendront pas forcément les mêmes aides pour privilégier au plus grand nombre (c’est aussi plus fiable d’aider les grandes communes selon les indicateurs).
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## === INTERPRÉTATION DES DIMENSIONS ===
##
## Dim1 :
## Variables structurantes : partPopEt, partPopImmi, partEmpDurLim, partOuvr, txChom, partPopSansDip, partMenProp, partResPrincApp, DISP_MED_A21, partEmpSal
## Pôle positif : partPopEt, partPopImmi, partEmpDurLim, partOuvr, txChom, partPopSansDip, partResPrincApp, partEmpSal
## Pôle négatif : partMenProp, DISP_MED_A21
##
## Dim2 :
## Variables structurantes : partPop25_59, partPop75p, indiceJeunesse, partPopBepCap, partMen1p, nbPersResPrinc
## Pôle positif : partPop25_59, indiceJeunesse, partPopBepCap, nbPersResPrinc
## Pôle négatif : partPop75p, partMen1p
FactoInvestigate::description(res.PCA_option2)
##
## * * *
##
## The **dimension 1** opposes individuals characterized by a strongly positive coordinate on the axis (to the right of the graph)
## to individuals such as *Venasque*, *Saint-Didier* and *Bédoin* (to the left of the graph, characterized by a strongly negative coordinate on the axis).
##
## The group 1 (characterized by a positive coordinate on the axis) is sharing :
##
## - variables whose values do not differ significantly from the mean.
##
## The group in which the individuals *Venasque*, *Saint-Didier* and *Bédoin* stand (characterized by a negative coordinate on the axis) is sharing :
##
## - high values for the variables *partResPrincSurocc*, *partEmpDurLim*, *txChom*, *partFamMono*, *partPopEt* and *partPopImmi* (variables are sorted from the strongest).
## - low values for the variables *txEmp*, *partPopBac*, *txScol15_24* and *DISP_MED_A21* (variables are sorted from the weakest).
##
## Note that the variables *partPopSansDip*, *partMenProp*, *partResPrincApp* and *DISP_MED_A21* are highly correlated with this dimension (respective correlation of 0.94, 0.95, 0.93, 0.91). These variables could therefore summarize themselve the dimension 1.
##
## * * *
##
## The **dimension 2** opposes individuals such as *Carpentras*, *Loriol-du-Comtat*, *Aubignan*, *Modène* and *Mazan* (to the top of the graph, characterized by a strongly positive coordinate on the axis)
## to individuals such as *Venasque*, *Saint-Didier* and *Bédoin* (to the bottom of the graph, characterized by a strongly negative coordinate on the axis).
##
## The group in which the individuals *Carpentras*, *Loriol-du-Comtat*, *Aubignan*, *Modène* and *Mazan* stand (characterized by a positive coordinate on the axis) is sharing :
##
## - variables whose values do not differ significantly from the mean.
##
## The group in which the individuals *Venasque*, *Saint-Didier* and *Bédoin* stand (characterized by a negative coordinate on the axis) is sharing :
##
## - high values for the variables *partResPrincSurocc*, *partEmpDurLim*, *txChom*, *partFamMono*, *partPopEt* and *partPopImmi* (variables are sorted from the strongest).
## - low values for the variables *txEmp*, *partPopBac*, *txScol15_24* and *DISP_MED_A21* (variables are sorted from the weakest).
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 3 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 4 proposed 7 as the best number of clusters
## * 5 proposed 8 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
##
## === CLASSIFICATION ===
## Nombre optimal de clusters : 3
#Factoshiny::HCPCshiny(res.PCA_option2)
plot_cluster_profils_CAH(
data_cluster = data_clean[-ind_sup_option2, ],
res.HCPC = res.HCPC_option2,
main_title = "Profils moyens des clusters CAH avec ACP pondérée par population (Option 2)"
)
## Warning in validateCoords(lng, lat, funcName): Data contains 2 rows with either
## missing or invalid lat/lon values and will be ignored
FactoInvestigate::classif(res.HCPC_option2)
##
## ```{r, echo = FALSE, fig.align = 'center', fig.height = 5.5, fig.width = 5.5}
## drawn <-
## c("Carpentras", "Venasque", "Aubignan", "Bédoin", "Loriol-du-Comtat",
## "Saint-Didier", "Sarrians", "Mazan", "Modène")
## par(mar = c(4.1, 4.1, 1.1, 2.1))
## plot.HCPC(res, choice = 'tree', title = '')
## ```
##
## **Figure.1 - Hierarchical tree.**
##
## The classification made on individuals reveals 3 clusters.
##
##
## ```{r, echo = FALSE, fig.align = 'center', fig.height = 5.5, fig.width = 5.5}
## par(mar = c(4.1, 4.1, 1.1, 2.1))
## plot.HCPC(res, choice = 'map', draw.tree = FALSE, title = '')
## ```
##
## **Figure.2 - Ascending Hierarchical Classification of the individuals.**
##
## The **cluster 1** is made of individuals such as *Bédoin*, *Saint-Didier* and *Venasque*. This group is characterized by :
##
## - high values for variables like *partPop60_74*, *partArtCadr*, *partPopBacSup*, *partResSec*, *partPop75p*, *partMenProp*, *DISP_MED_A21*, *partTpsPart*, *partMenVoit* and *partPopBac* (variables are sorted from the strongest).
## - low values for variables like *indiceJeunesse*, *partPop0_14*, *partPop15_24*, *partResVac*, *partEmpl*, *nbPersResPrinc*, *partPopSansDip*, *partOuvr*, *partEmpSal* and *partPopBepCap* (variables are sorted from the weakest).
##
## The **cluster 2** is made of individuals such as *Aubignan*, *Loriol-du-Comtat*, *Mazan*, *Modène* and *Sarrians*. This group is characterized by :
##
## - high values for variables like *txEmp*, *partMenVoit*, *partPop25_59*, *partPopBepCap*, *DISP_MED_A21*, *partMenProp*, *nbPersResPrinc*, *partPopBac*, *partPI* and *txScol15_24* (variables are sorted from the strongest).
## - low values for variables like *partPopEt*, *partMen1p*, *partPopImmi*, *txChom*, *partResPrincApp*, *partResPrincSurocc*, *partEmpDurLim*, *partPopSansDip*, *partEmpSal* and *partOuvr* (variables are sorted from the weakest).
##
## The **cluster 3** is made of individuals such as *Carpentras*. This group is characterized by :
##
## - high values for variables like *partResPrincApp*, *partPopImmi*, *partPopEt*, *txChom*, *partPopSansDip*, *partOuvr*, *partEmpDurLim*, *partEmpSal*, *partResPrincSurocc* and *partMen1p* (variables are sorted from the strongest).
## - low values for variables like *partMenProp*, *partMenVoit*, *DISP_MED_A21*, *txEmp*, *partPopBac*, *partArtCadr*, *partResSec*, *partPopBacSup*, *partPop60_74* and *partPop25_59* (variables are sorted from the weakest).
##
## ```{r, echo = FALSE, fig.align = 'center', fig.height = 5.5, fig.width = 5.5}
## par(mar = c(4.1, 4.1, 1.1, 2.1))
## plot.HCPC(res, choice = '3D.map', ind.names=FALSE, title = '')
## ```
##
## **Figure.3 - Hierarchical tree on the factorial map.**
##
## The hierarchical tree can be drawn on the factorial map with the individuals colored according to their clusters.
##
## === CLASSIFICATION K-MEANS ===
## Nombre optimal de clusters : 3
plot_cluster_profils_KMEANS(
data_cluster = data_clean[-ind_sup_option2,],
res.kmeans = res.km_option2,
main_title = "Profils moyens des clusters k-means (Option 2)"
)
## Warning in validateCoords(lng, lat, funcName): Data contains 2 rows with either
## missing or invalid lat/lon values and will be ignored
X_pca2 <- res.PCA_option2$ind$coord[, 1:2]
set.seed(123)
stab_hclust2 <- clusterboot(
X_pca2,
B = 100,
clustermethod = hclustCBI,
k = nb_optimal_option2,
method = "ward.D2"
)
## boot 1
## boot 2
## boot 3
## boot 4
## boot 5
## boot 6
## boot 7
## boot 8
## boot 9
## boot 10
## boot 11
## boot 12
## boot 13
## boot 14
## boot 15
## boot 16
## boot 17
## boot 18
## boot 19
## boot 20
## boot 21
## boot 22
## boot 23
## boot 24
## boot 25
## boot 26
## boot 27
## boot 28
## boot 29
## boot 30
## boot 31
## boot 32
## boot 33
## boot 34
## boot 35
## boot 36
## boot 37
## boot 38
## boot 39
## boot 40
## boot 41
## boot 42
## boot 43
## boot 44
## boot 45
## boot 46
## boot 47
## boot 48
## boot 49
## boot 50
## boot 51
## boot 52
## boot 53
## boot 54
## boot 55
## boot 56
## boot 57
## boot 58
## boot 59
## boot 60
## boot 61
## boot 62
## boot 63
## boot 64
## boot 65
## boot 66
## boot 67
## boot 68
## boot 69
## boot 70
## boot 71
## boot 72
## boot 73
## boot 74
## boot 75
## boot 76
## boot 77
## boot 78
## boot 79
## boot 80
## boot 81
## boot 82
## boot 83
## boot 84
## boot 85
## boot 86
## boot 87
## boot 88
## boot 89
## boot 90
## boot 91
## boot 92
## boot 93
## boot 94
## boot 95
## boot 96
## boot 97
## boot 98
## boot 99
## boot 100
stab_km2 <- clusterboot(
X_pca2,
B = 100,
clustermethod = kmeansCBI,
k = nb_optimal_km_option2,
seed = 123
)
## boot 1
## boot 2
## boot 3
## boot 4
## boot 5
## boot 6
## boot 7
## boot 8
## boot 9
## boot 10
## boot 11
## boot 12
## boot 13
## boot 14
## boot 15
## boot 16
## boot 17
## boot 18
## boot 19
## boot 20
## boot 21
## boot 22
## boot 23
## boot 24
## boot 25
## boot 26
## boot 27
## boot 28
## boot 29
## boot 30
## boot 31
## boot 32
## boot 33
## boot 34
## boot 35
## boot 36
## boot 37
## boot 38
## boot 39
## boot 40
## boot 41
## boot 42
## boot 43
## boot 44
## boot 45
## boot 46
## boot 47
## boot 48
## boot 49
## boot 50
## boot 51
## boot 52
## boot 53
## boot 54
## boot 55
## boot 56
## boot 57
## boot 58
## boot 59
## boot 60
## boot 61
## boot 62
## boot 63
## boot 64
## boot 65
## boot 66
## boot 67
## boot 68
## boot 69
## boot 70
## boot 71
## boot 72
## boot 73
## boot 74
## boot 75
## boot 76
## boot 77
## boot 78
## boot 79
## boot 80
## boot 81
## boot 82
## boot 83
## boot 84
## boot 85
## boot 86
## boot 87
## boot 88
## boot 89
## boot 90
## boot 91
## boot 92
## boot 93
## boot 94
## boot 95
## boot 96
## boot 97
## boot 98
## boot 99
## boot 100
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## === INTERPRÉTATION DES DIMENSIONS ===
##
## Dim1 :
## Variables structurantes : partPopImmi, partEmpDurLim, partOuvr, txChom, partPopSansDip, partMenProp, partResPrincApp, DISP_MED_A21, partEmpSal
## Pôle positif : partPopImmi, partEmpDurLim, partOuvr, txChom, partPopSansDip, partResPrincApp, partEmpSal
## Pôle négatif : partMenProp, DISP_MED_A21
##
## Dim2 :
## Variables structurantes : partPop25_59, partPop75p, indiceJeunesse, partMen1p, nbPersResPrinc
## Pôle positif : partPop25_59, indiceJeunesse, nbPersResPrinc
## Pôle négatif : partPop75p, partMen1p
FactoInvestigate::description(res.PCA_option3)
##
## * * *
##
## The **dimension 1** opposes individuals characterized by a strongly positive coordinate on the axis (to the right of the graph)
## to individuals such as *Venasque*, *Saint-Didier*, *Crillon-le-Brave*, *Le Barroux* and *Bédoin* (to the left of the graph, characterized by a strongly negative coordinate on the axis).
##
## The group 1 (characterized by a positive coordinate on the axis) is sharing :
##
## - variables whose values do not differ significantly from the mean.
##
## The group in which the individuals *Venasque*, *Saint-Didier*, *Crillon-le-Brave*, *Le Barroux* and *Bédoin* stand (characterized by a negative coordinate on the axis) is sharing :
##
## - variables whose values do not differ significantly from the mean.
##
##
## * * *
##
## The **dimension 2** opposes individuals such as *Loriol-du-Comtat* and *Modène* (to the top of the graph, characterized by a strongly positive coordinate on the axis)
## to individuals characterized by a strongly negative coordinate on the axis (to the bottom of the graph).
##
## The group in which the individuals *Loriol-du-Comtat* and *Modène* stand (characterized by a positive coordinate on the axis) is sharing :
##
## - variables whose values do not differ significantly from the mean.
##
## The group 2 (characterized by a negative coordinate on the axis) is sharing :
##
## - variables whose values do not differ significantly from the mean.
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 3 as the best number of clusters
## * 6 proposed 4 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 7 proposed 8 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 8
##
##
## *******************************************************************
##
## === CLASSIFICATION ===
## Nombre optimal de clusters : 8
plot_cluster_profils_CAH(
data_cluster = data_clean[-ind_sup_option3, ],
res.HCPC = res.HCPC_option3,
main_title = "Profils moyens des clusters CAH avec ACP pondérée par sqrt(population) (Option 3)"
)
## Warning in validateCoords(lng, lat, funcName): Data contains 2 rows with either
## missing or invalid lat/lon values and will be ignored
FactoInvestigate::classif(res.HCPC_option3)
##
## ```{r, echo = FALSE, fig.align = 'center', fig.height = 5.5, fig.width = 5.5}
## drawn <-
## c("Carpentras", "Venasque", "Modène", "Loriol-du-Comtat", "Aubignan",
## "Bédoin", "La Roque-sur-Pernes", "Saint-Didier", "Sarrians",
## "Crillon-le-Brave", "Le Barroux")
## par(mar = c(4.1, 4.1, 1.1, 2.1))
## plot.HCPC(res, choice = 'tree', title = '')
## ```
##
## **Figure.1 - Hierarchical tree.**
##
## The classification made on individuals reveals 8 clusters.
##
##
## ```{r, echo = FALSE, fig.align = 'center', fig.height = 5.5, fig.width = 5.5}
## par(mar = c(4.1, 4.1, 1.1, 2.1))
## plot.HCPC(res, choice = 'map', draw.tree = FALSE, title = '')
## ```
##
## **Figure.2 - Ascending Hierarchical Classification of the individuals.**
##
## The **cluster 1** is made of individuals such as *Le Barroux*, *Crillon-le-Brave* and *La Roque-sur-Pernes*. This group is characterized by :
##
## - high values for variables like *partArtCadr*, *partResSec*, *partPopBacSup*, *partPop60_74*, *txScol15_24*, *DISP_MED_A21*, *partPopBac*, *partMenProp*, *partMenVoit* and *txEmp* (variables are sorted from the strongest).
## - low values for variables like *partPopSansDip*, *partPopBepCap*, *partResVac*, *partEmpl*, *partOuvr*, *txChom*, *partEmpDurLim*, *partPop15_24*, *indiceJeunesse* and *partResPrincApp* (variables are sorted from the weakest).
##
## The **cluster 2** is made of individuals such as *Venasque*. This group is characterized by :
##
## - high values for variables like *partPop75p*, *partMen1p*, *partPopBacSup*, *partArtCadr*, *partPop60_74*, *txEmp*, *partResSec*, *DISP_MED_A21*, *partTpsPart* and *partMenProp* (variables are sorted from the strongest).
## - low values for variables like *nbPersResPrinc*, *partPop15_24*, *indiceJeunesse*, *partPop0_14*, *partMenVoit*, *partPop25_59*, *partPopBepCap*, *partOuvr*, *partPopSansDip* and *partEmpDurLim* (variables are sorted from the weakest).
##
## The **cluster 3** is made of individuals such as *Loriol-du-Comtat* and *Modène*. This group is characterized by :
##
## - high values for variables like *partPop25_59*, *partMenVoit*, *partPopBepCap*, *partMenProp*, *partPopBac*, *nbPersResPrinc*, *DISP_MED_A21*, *partPI*, *txEmp* and *indiceJeunesse* (variables are sorted from the strongest).
## - low values for variables like *partEmpDurLim*, *partEmpSal*, *partTpsPart*, *partFamMono*, *partPop75p*, *partPopSansDip*, *partMen1p*, *partPopImmi*, *partPopEt* and *partResPrincApp* (variables are sorted from the weakest).
##
## The **cluster 4** is made of individuals such as *Bédoin* and *Saint-Didier*. This group is characterized by :
##
## - high values for the variables *partPop60_74*, *partPop75p*, *partTpsPart*, *partMenProp*, *partResSec*, *partPopBacSup*, *partArtCadr*, *partMenVoit*, *partResPrincSurocc* and *partMen1p* (variables are sorted from the strongest).
## - low values for variables like *indiceJeunesse*, *partPop0_14*, *partPop25_59*, *nbPersResPrinc*, *partResVac*, *partEmpSal*, *partResPrincApp*, *partPopBepCap*, *partEmpl* and *txScol15_24* (variables are sorted from the weakest).
##
## The **cluster 5** is made of individuals sharing :
##
## - high values for variables like *partResVac*, *partPI*, *DISP_MED_A21*, *partPopBepCap*, *partMenVoit*, *txScol15_24*, *txEmp*, *nbPersResPrinc*, *partTpsPart* and *partPop0_14* (variables are sorted from the strongest).
## - low values for variables like *partPopImmi*, *partPopEt*, *partMen1p*, *partResPrincSurocc*, *partEmpl*, *txChom*, *partPop75p*, *partResSec*, *partResPrincApp* and *partEmpDurLim* (variables are sorted from the weakest).
##
## The **cluster 6** is made of individuals sharing :
##
## - high values for the variables *partPop75p*, *partResVac*, *partEmpl*, *partResSec*, *partTpsPart*, *partEmpDurLim*, *partMen1p*, *partPopSansDip*, *partPopBepCap* and *partFamMono* (variables are sorted from the strongest).
## - low values for variables like *nbPersResPrinc*, *indiceJeunesse*, *txScol15_24*, *partPopEt*, *partPI*, *partPop15_24*, *DISP_MED_A21*, *partPop0_14*, *partPopBacSup* and *partPopImmi* (variables are sorted from the weakest).
##
## The **cluster 7** is made of individuals such as *Aubignan* and *Sarrians*. This group is characterized by :
##
## - high values for variables like *nbPersResPrinc*, *indiceJeunesse*, *partPop25_59*, *partPop0_14*, *partPop15_24*, *partPopBepCap*, *partEmpl*, *partEmpSal*, *partOuvr* and *partPopSansDip* (variables are sorted from the strongest).
## - low values for variables like *partPop60_74*, *partResSec*, *partPop75p*, *partTpsPart*, *partPopBacSup*, *partMen1p*, *txScol15_24*, *partArtCadr*, *partPI* and *txChom* (variables are sorted from the weakest).
##
## The **cluster 8** is made of individuals such as *Carpentras*. This group is characterized by :
##
## - high values for variables like *partResPrincApp*, *partPopEt*, *partPopImmi*, *txChom*, *partPopSansDip*, *partEmpSal*, *partOuvr*, *partEmpDurLim*, *partPop0_14* and *partMen1p* (variables are sorted from the strongest).
## - low values for variables like *partMenProp*, *partMenVoit*, *DISP_MED_A21*, *txEmp*, *partResSec*, *partPopBacSup*, *partArtCadr*, *partPopBac*, *partPop60_74* and *partPop25_59* (variables are sorted from the weakest).
##
## ```{r, echo = FALSE, fig.align = 'center', fig.height = 5.5, fig.width = 5.5}
## par(mar = c(4.1, 4.1, 1.1, 2.1))
## plot.HCPC(res, choice = '3D.map', ind.names=FALSE, title = '')
## ```
##
## **Figure.3 - Hierarchical tree on the factorial map.**
##
## The hierarchical tree can be drawn on the factorial map with the individuals colored according to their clusters.
##
## === CLASSIFICATION K-MEANS ===
## Nombre optimal de clusters : 3
plot_cluster_profils_KMEANS(
data_cluster = data_clean[-ind_sup_option3,],
res.kmeans = res.km_option3,
main_title = "Profils moyens des clusters en k-means (Option 3)"
)
## Warning in validateCoords(lng, lat, funcName): Data contains 2 rows with either
## missing or invalid lat/lon values and will be ignored
X_pca3 <- res.PCA_option3$ind$coord[, 1:2]
set.seed(123)
stab_hclust3 <- clusterboot(
X_pca3,
B = 100,
clustermethod = hclustCBI,
k = nb_optimal_option3,
method = "ward.D2"
)
## boot 1
## boot 2
## boot 3
## boot 4
## boot 5
## boot 6
## boot 7
## boot 8
## boot 9
## boot 10
## boot 11
## boot 12
## boot 13
## boot 14
## boot 15
## boot 16
## boot 17
## boot 18
## boot 19
## boot 20
## boot 21
## boot 22
## boot 23
## boot 24
## boot 25
## boot 26
## boot 27
## boot 28
## boot 29
## boot 30
## boot 31
## boot 32
## boot 33
## boot 34
## boot 35
## boot 36
## boot 37
## boot 38
## boot 39
## boot 40
## boot 41
## boot 42
## boot 43
## boot 44
## boot 45
## boot 46
## boot 47
## boot 48
## boot 49
## boot 50
## boot 51
## boot 52
## boot 53
## boot 54
## boot 55
## boot 56
## boot 57
## boot 58
## boot 59
## boot 60
## boot 61
## boot 62
## boot 63
## boot 64
## boot 65
## boot 66
## boot 67
## boot 68
## boot 69
## boot 70
## boot 71
## boot 72
## boot 73
## boot 74
## boot 75
## boot 76
## boot 77
## boot 78
## boot 79
## boot 80
## boot 81
## boot 82
## boot 83
## boot 84
## boot 85
## boot 86
## boot 87
## boot 88
## boot 89
## boot 90
## boot 91
## boot 92
## boot 93
## boot 94
## boot 95
## boot 96
## boot 97
## boot 98
## boot 99
## boot 100
stab_km3 <- clusterboot(
X_pca3,
B = 100,
clustermethod = kmeansCBI,
k = nb_optimal_km_option3,
seed = 123
)
## boot 1
## boot 2
## boot 3
## boot 4
## boot 5
## boot 6
## boot 7
## boot 8
## boot 9
## boot 10
## boot 11
## boot 12
## boot 13
## boot 14
## boot 15
## boot 16
## boot 17
## boot 18
## boot 19
## boot 20
## boot 21
## boot 22
## boot 23
## boot 24
## boot 25
## boot 26
## boot 27
## boot 28
## boot 29
## boot 30
## boot 31
## boot 32
## boot 33
## boot 34
## boot 35
## boot 36
## boot 37
## boot 38
## boot 39
## boot 40
## boot 41
## boot 42
## boot 43
## boot 44
## boot 45
## boot 46
## boot 47
## boot 48
## boot 49
## boot 50
## boot 51
## boot 52
## boot 53
## boot 54
## boot 55
## boot 56
## boot 57
## boot 58
## boot 59
## boot 60
## boot 61
## boot 62
## boot 63
## boot 64
## boot 65
## boot 66
## boot 67
## boot 68
## boot 69
## boot 70
## boot 71
## boot 72
## boot 73
## boot 74
## boot 75
## boot 76
## boot 77
## boot 78
## boot 79
## boot 80
## boot 81
## boot 82
## boot 83
## boot 84
## boot 85
## boot 86
## boot 87
## boot 88
## boot 89
## boot 90
## boot 91
## boot 92
## boot 93
## boot 94
## boot 95
## boot 96
## boot 97
## boot 98
## boot 99
## boot 100
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## === INTERPRÉTATION DES DIMENSIONS ===
##
## Dim1 :
## Variables structurantes : partPop60_74, partArtCadr, partOuvr, partPopBacSup, partPopSansDip, partMenProp, partResSec, partResPrincApp, DISP_MED_A21
## Pôle positif : partOuvr, partPopSansDip, partResPrincApp
## Pôle négatif : partPop60_74, partArtCadr, partPopBacSup, partMenProp, partResSec, DISP_MED_A21
##
## Dim2 :
## Variables structurantes : partPopEt, partPopImmi, partMenVoit, partMen1p, nbPersResPrinc
## Pôle positif : partPopEt, partPopImmi, partMen1p
## Pôle négatif : partMenVoit, nbPersResPrinc
FactoInvestigate::description(res.PCA_option4)
##
## * * *
##
## The **dimension 1** opposes individuals characterized by a strongly positive coordinate on the axis (to the right of the graph)
## to individuals such as *Aubignan*, *Loriol-du-Comtat*, *Sarrians*, *La Roque-Alric* and *Vacqueyras* (to the left of the graph, characterized by a strongly negative coordinate on the axis).
##
## The group 1 (characterized by a positive coordinate on the axis) is sharing :
##
## - variables whose values do not differ significantly from the mean.
##
## The group in which the individual *La Roque-Alric* stands (characterized by a negative coordinate on the axis) is sharing :
##
## - high values for the variable *partResVac*.
##
## The group in which the individuals *Aubignan*, *Loriol-du-Comtat*, *Sarrians* and *Vacqueyras* stand (characterized by a negative coordinate on the axis) is sharing :
##
## - variables whose values do not differ significantly from the mean.
##
##
## * * *
##
## The **dimension 2** opposes individuals characterized by a strongly positive coordinate on the axis (to the top of the graph)
## to individuals such as *Aubignan*, *Loriol-du-Comtat*, *Sarrians* and *Vacqueyras* (to the bottom of the graph, characterized by a strongly negative coordinate on the axis).
##
## The group 1 (characterized by a positive coordinate on the axis) is sharing :
##
## - variables whose values do not differ significantly from the mean.
##
## The group in which the individuals *Aubignan*, *Loriol-du-Comtat*, *Sarrians* and *Vacqueyras* stand (characterized by a negative coordinate on the axis) is sharing :
##
## - variables whose values do not differ significantly from the mean.
## Warning in pf(beale, pp, df2): NaNs produced
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 3 proposed 3 as the best number of clusters
## * 7 proposed 4 as the best number of clusters
## * 8 proposed 6 as the best number of clusters
## * 5 proposed 8 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 6
##
##
## *******************************************************************
##
## === CLASSIFICATION ===
## Nombre optimal de clusters : 6
plot_cluster_profils_CAH(
data_cluster = data_clean[-ind_sup_option4, ],
res.HCPC = res.HCPC_option4,
main_title = "Profils moyens des clusters CAH avec ACP sans pondération (Option 4)"
)
## Warning in validateCoords(lng, lat, funcName): Data contains 2 rows with either
## missing or invalid lat/lon values and will be ignored
FactoInvestigate::classif(res.HCPC_option4)
##
## ```{r, echo = FALSE, fig.align = 'center', fig.height = 5.5, fig.width = 5.5}
## drawn <-
## c("La Roque-Alric", "Carpentras", "Modène", "Venasque", "La Roque-sur-Pernes",
## "Loriol-du-Comtat", "Suzette", "Aubignan", "Vacqueyras", "Sarrians",
## "Flassan", "Saint-Hippolyte-le-Graveyron", "Crillon-le-Brave"
## )
## par(mar = c(4.1, 4.1, 1.1, 2.1))
## plot.HCPC(res, choice = 'tree', title = '')
## ```
##
## **Figure.1 - Hierarchical tree.**
##
## The classification made on individuals reveals 6 clusters.
##
##
## ```{r, echo = FALSE, fig.align = 'center', fig.height = 5.5, fig.width = 5.5}
## par(mar = c(4.1, 4.1, 1.1, 2.1))
## plot.HCPC(res, choice = 'map', draw.tree = FALSE, title = '')
## ```
##
## **Figure.2 - Ascending Hierarchical Classification of the individuals.**
##
## The **cluster 1** is made of individuals such as *La Roque-Alric* and *Venasque*. This group is characterized by :
##
## - high values for the variables *txScol15_24*, *partMen1p*, *partPopBacSup*, *partPopImmi*, *txEmp* and *partResSec* (variables are sorted from the strongest).
## - low values for the variables *partPop15_24*, *nbPersResPrinc*, *partMenVoit* and *partPopBepCap* (variables are sorted from the weakest).
##
## The **cluster 2** is made of individuals such as *Crillon-le-Brave*, *La Roque-sur-Pernes*, *Saint-Hippolyte-le-Graveyron* and *Suzette*. This group is characterized by :
##
## - high values for the variables *partPop60_74*, *DISP_MED_A21*, *partArtCadr*, *partPopBacSup*, *partMenVoit*, *partResSec* and *partMenProp* (variables are sorted from the strongest).
## - low values for the variables *partEmpl*, *partOuvr*, *partResVac*, *partPopSansDip*, *partEmpDurLim* and *partPopBepCap* (variables are sorted from the weakest).
##
## The **cluster 3** is made of individuals sharing :
##
## - variables whose values do not differ significantly from the mean.
##
## The **cluster 4** is made of individuals such as *Flassan*, *Loriol-du-Comtat* and *Modène*. This group is characterized by :
##
## - high values for the variable *partResVac*.
## - low values for the variables *partMen1p*, *partPopImmi*, *partPopEt*, *partEmpDurLim* and *partTpsPart* (variables are sorted from the weakest).
##
## The **cluster 5** is made of individuals such as *Aubignan*, *Sarrians* and *Vacqueyras*. This group is characterized by :
##
## - high values for the variables *partPopSansDip*, *partOuvr* and *partEmpSal* (variables are sorted from the strongest).
## - low values for the variables *partResSec*, *partPopBacSup*, *partMenProp* and *partPop60_74* (variables are sorted from the weakest).
##
## The **cluster 6** is made of individuals such as *Carpentras*. This group is characterized by :
##
## - high values for the variables *partResPrincApp*, *partPopEt*, *txChom*, *partPopImmi*, *partPopSansDip* and *partEmpSal* (variables are sorted from the strongest).
## - low values for the variables *partMenProp*, *partMenVoit*, *txEmp* and *DISP_MED_A21* (variables are sorted from the weakest).
##
## ```{r, echo = FALSE, fig.align = 'center', fig.height = 5.5, fig.width = 5.5}
## par(mar = c(4.1, 4.1, 1.1, 2.1))
## plot.HCPC(res, choice = '3D.map', ind.names=FALSE, title = '')
## ```
##
## **Figure.3 - Hierarchical tree on the factorial map.**
##
## The hierarchical tree can be drawn on the factorial map with the individuals colored according to their clusters.
##
## === CLASSIFICATION K-MEANS ===
## Nombre optimal de clusters : 6
plot_cluster_profils_KMEANS(
data_cluster = data_clean[-ind_sup_option4,],
res.kmeans = res.km_option4,
main_title = "Profils moyens des clusters en k-means (Option 4)"
)
## Warning in validateCoords(lng, lat, funcName): Data contains 2 rows with either
## missing or invalid lat/lon values and will be ignored
X_pca4 <- res.PCA_option4$ind$coord[, 1:2]
set.seed(123)
stab_hclust4 <- clusterboot(
X_pca4,
B = 100,
clustermethod = hclustCBI,
k = nb_optimal_option4,
method = "ward.D2"
)
## boot 1
## boot 2
## boot 3
## boot 4
## boot 5
## boot 6
## boot 7
## boot 8
## boot 9
## boot 10
## boot 11
## boot 12
## boot 13
## boot 14
## boot 15
## boot 16
## boot 17
## boot 18
## boot 19
## boot 20
## boot 21
## boot 22
## boot 23
## boot 24
## boot 25
## boot 26
## boot 27
## boot 28
## boot 29
## boot 30
## boot 31
## boot 32
## boot 33
## boot 34
## boot 35
## boot 36
## boot 37
## boot 38
## boot 39
## boot 40
## boot 41
## boot 42
## boot 43
## boot 44
## boot 45
## boot 46
## boot 47
## boot 48
## boot 49
## boot 50
## boot 51
## boot 52
## boot 53
## boot 54
## boot 55
## boot 56
## boot 57
## boot 58
## boot 59
## boot 60
## boot 61
## boot 62
## boot 63
## boot 64
## boot 65
## boot 66
## boot 67
## boot 68
## boot 69
## boot 70
## boot 71
## boot 72
## boot 73
## boot 74
## boot 75
## boot 76
## boot 77
## boot 78
## boot 79
## boot 80
## boot 81
## boot 82
## boot 83
## boot 84
## boot 85
## boot 86
## boot 87
## boot 88
## boot 89
## boot 90
## boot 91
## boot 92
## boot 93
## boot 94
## boot 95
## boot 96
## boot 97
## boot 98
## boot 99
## boot 100
stab_km4 <- clusterboot(
X_pca4,
B = 100,
clustermethod = kmeansCBI,
k = nb_optimal_km_option4,
seed = 123
)
## boot 1
## boot 2
## boot 3
## boot 4
## boot 5
## boot 6
## boot 7
## boot 8
## boot 9
## boot 10
## boot 11
## boot 12
## boot 13
## boot 14
## boot 15
## boot 16
## boot 17
## boot 18
## boot 19
## boot 20
## boot 21
## boot 22
## boot 23
## boot 24
## boot 25
## boot 26
## boot 27
## boot 28
## boot 29
## boot 30
## boot 31
## boot 32
## boot 33
## boot 34
## boot 35
## boot 36
## boot 37
## boot 38
## boot 39
## boot 40
## boot 41
## boot 42
## boot 43
## boot 44
## boot 45
## boot 46
## boot 47
## boot 48
## boot 49
## boot 50
## boot 51
## boot 52
## boot 53
## boot 54
## boot 55
## boot 56
## boot 57
## boot 58
## boot 59
## boot 60
## boot 61
## boot 62
## boot 63
## boot 64
## boot 65
## boot 66
## boot 67
## boot 68
## boot 69
## boot 70
## boot 71
## boot 72
## boot 73
## boot 74
## boot 75
## boot 76
## boot 77
## boot 78
## boot 79
## boot 80
## boot 81
## boot 82
## boot 83
## boot 84
## boot 85
## boot 86
## boot 87
## boot 88
## boot 89
## boot 90
## boot 91
## boot 92
## boot 93
## boot 94
## boot 95
## boot 96
## boot 97
## boot 98
## boot 99
## boot 100
test = df_numeric_copy[-c(1:6),c(87:92)]
pheatmap(test)
#hclust
stab_hclust1$bootmean
## [1] 0.7250033 0.7188883 0.6900000
stab_hclust2$bootmean
## [1] 0.6001466 0.8479253 0.6135808
stab_hclust3$bootmean
## [1] 0.8528333 0.7423333 0.9608333 0.6538095 0.7200000 0.8050000 0.6200000
## [8] 0.6800000
stab_hclust4$bootmean
## [1] 0.7120595 0.9018333 0.7628571 0.7200000 0.8753333 0.8500000
#km
stab_km2$bootmean
## [1] 0.7499325 0.8851627 0.5774087
stab_km3$bootmean
## [1] 0.7351310 0.9083254 0.5950130
stab_km4$bootmean
## [1] 0.7372857 0.8595000 0.7025000 0.7350476 0.8167143 0.8957857