K-means

Carregamento dos dados

# Load the clone measurements from the Excel workbook and reshape them into a
# plain data frame whose row names are the clone identifiers.
# NOTE(review): the path below is absolute and machine-specific; the script
# only runs where this exact file exists.
dados <- read_excel("C:/Users/cided/Desktop/Vanusa/dados/dados.xlsx")
dados <- as.data.frame(dados)

# Fail early if the identifier column is missing: assigning NULL row names
# would otherwise silently reset them to 1..n.
stopifnot("CLONES" %in% names(dados))

# Use exact-name extraction (`[[`) instead of `$`, which partial-matches.
rownames(dados) <- dados[["CLONES"]]

# Drop the identifier column by name rather than by position, so the right
# column is removed even if the workbook's column order changes.
dados <- dados[setdiff(names(dados), "CLONES")]

# Exactly four trait columns are expected; renaming a different number of
# columns would either fail or leave NA column names.
stopifnot(ncol(dados) == 4L)
names(dados) <- c("Leaf (E1)","Leaf (E2)", "Pod Lab.", "Pod Field")

# Work on the first trait only ("Leaf (E1)"), kept as a one-column data frame
# so the clone names stay attached as row names.
dat <- dados[, 1, drop = FALSE]
kable(dat)

	Leaf (E1)
AMAZON 15.15	1.6020600
AMAZON 2.1	1.0211893
APA 5	0.9542425
CAB 4	1.3222193
CAB 5003.23	1.3802112
CAB 5046.140	1.2174839
CCN 51	1.6532125
CEPEC 1008	1.3521825
CEPEC 38	1.3979400
CEPEC 40	1.5740313
CEPEC 42	1.0413927
CEPEC 44	1.7817554
CEPEC 75	1.4623980
CEPEC 82	1.6901961
CEPEC 84	1.5118834
CEPEC 89	1.3010300
CEPEC 92	1.3010300
CEPEC 93	1.4623980
CHUAO 120	0.9777236
CJ 10	1.6020600
CSUL 4	1.2304489
CSUL 5	1.4313638
EET 272	1.3324385
EET 390	1.1760913
EET 392	1.3117539
EET 62	1.3117539
ICS 100	1.5563025
ICS 78	1.5051500
ICS 95	1.4771213
IMC 2	1.6020600
IMC 23	1.5563025
IMC 51	1.5682017
MA 12	1.5250448
MOCORONGO 1	1.0211893
MOCORONGO 2	1.1461280
MOQ 417	1.3891661
MOQ 647	1.2041200
NA 312	1.3424227
NA 33	1.5440680
OC 77	1.7242759
PA 120	1.4232459
PA 148	1.3710679
PA 15	1.3617278
PA 150	1.3324385
PA 169	0.9542425
PA 285	0.3979400
PA 294	1.2174839
PA 30	1.1613680
PA 44	1.3710679
PA 51	1.3324385
PA 70	1.3010300
PA 88	1.2787536
RB 31	1.3222193
RB 32	1.5563025
RB 33	1.2304489
RB 39	1.3324385
RIM 117	1.4471580
SCAVINA 6	0.7403627
SIAL 164	1.3010300
SIAL 505	1.2552725
SIAL 542	1.4913617
SIC 19	1.6283889
SIC 23	1.8195439
SIC 842	1.2552725
SIC 864	1.3710679
SIC 891	1.3710679
SPA 12	1.4393327
SPA 5	1.4149733
TSA 516	1.5440680
TSA 644	1.6074550
TSA 654	1.3324385
UF 36	1.4065402
UF 667	1.5250448

Dimensionamento e padronização

# Standardize the selected trait (center on the mean, divide by the standard
# deviation). The defaults of scale() are spelled out for clarity.
df <- scale(dat, center = TRUE, scale = TRUE)

kable(df)

	Leaf (E1)
AMAZON 15.15	1.0339809
AMAZON 2.1	-1.4723134
APA 5	-1.7611700
CAB 4	-0.1734533
CAB 5003.23	0.0767657
CAB 5046.140	-0.6253570
CCN 51	1.2546897
CEPEC 1008	-0.0441704
CEPEC 38	0.1532604
CEPEC 40	0.9130448
CEPEC 42	-1.3851414
CEPEC 44	1.8093162
CEPEC 75	0.4313785
CEPEC 82	1.4142635
CEPEC 84	0.6448940
CEPEC 89	-0.2648792
CEPEC 92	-0.2648792
CEPEC 93	0.4313785
CHUAO 120	-1.6598557
CJ 10	1.0339809
CSUL 4	-0.5694167
CSUL 5	0.2974745
EET 272	-0.1293605
EET 390	-0.8039548
EET 392	-0.2186087
EET 62	-0.2186087
ICS 100	0.8365502
ICS 78	0.6158414
ICS 95	0.4949053
IMC 2	1.0339809
IMC 23	0.8365502
IMC 51	0.8878920
MA 12	0.7016820
MOCORONGO 1	-1.4723134
MOCORONGO 2	-0.9332377
MOQ 417	0.1154034
MOQ 647	-0.6830187
NA 312	-0.0862813
NA 33	0.7837619
OC 77	1.5613082
PA 120	0.2624481
PA 148	0.0373146
PA 15	-0.0029850
PA 150	-0.1293605
PA 169	-1.7611700
PA 285	-4.1614595
PA 294	-0.6253570
PA 30	-0.8674815
PA 44	0.0373146
PA 51	-0.1293605
PA 70	-0.2648792
PA 88	-0.3609956
RB 31	-0.1734533
RB 32	0.8365502
RB 33	-0.5694167
RB 39	-0.1293605
RIM 117	0.3656224
SCAVINA 6	-2.6840015
SIAL 164	-0.2648792
SIAL 505	-0.4623099
SIAL 542	0.5563488
SIC 19	1.1475829
SIC 23	1.9723632
SIC 842	-0.4623099
SIC 864	0.0373146
SIC 891	0.0373146
SPA 12	0.3318582
SPA 5	0.2267545
TSA 516	0.7837619
TSA 644	1.0572590
TSA 654	-0.1293605
UF 36	0.1903677
UF 667	0.7016820

Número ótimo de clusters

# Let the NbClust indices vote on the number of clusters (searched from 2 to
# 10) for the standardized trait, using Euclidean distance and "ward.D" linkage.
nbclust_out <- NbClust(df, distance = "euclidean", method = "ward.D",
                       min.nc = 2, max.nc = 10)

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 2 proposed 3 as the best number of clusters 
## * 1 proposed 6 as the best number of clusters 
## * 3 proposed 9 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  9 
##  
##  
## *******************************************************************

# Collect the number of clusters chosen by each index (first row of Best.nc).
nbclust_plot <- data.frame(clusters = nbclust_out[["Best.nc"]][1, ])
# Keep only the indices whose choice lies between 2 and 10 clusters.
nbclust_plot <- subset(nbclust_plot, clusters >= 2 & clusters <= 10)

# Histogram of how many indices voted for each number of clusters.
ggplot(nbclust_plot, aes(x = clusters)) +
  geom_histogram(bins = 30L, fill = "#0c4c8a") +
  labs(
    x = "Number of clusters",
    y = "Frequency among all indices",
    title = "Optimal number of clusters"
  ) +
  theme_minimal()

Dendrograma

# Hierarchical clustering of the standardized trait: Euclidean distances,
# "ward.D" linkage, tree cut into the 9 groups chosen by the NbClust majority
# rule.
dista <- dist(df, method = "euclidean")
dista.hc <- hclust(d = dista, method = "ward.D")

# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
fviz_dend(dista.hc, cex = 0.5, k = 9, color_labels_by_k = TRUE, horiz = TRUE)

# Same cut via hcut(), which standardizes the raw trait itself (stand = TRUE).
# NOTE(review): hcut() is called without `hc_method`, so it falls back to its
# own default linkage (documented as "ward.D2") rather than the "ward.D" used
# above -- confirm this difference is intended.
res <- hcut(dat, k = 9, stand = TRUE)
fviz_dend(res, rect = TRUE, cex = 0.5,
          k_colors = "Dark2", horiz = TRUE)

# Hierarchical k-means with 9 clusters.
# NOTE(review): "euclid" is presumably partial-matched to "euclidean" when the
# distances are computed -- confirm.
km.res1 <- hkmeans(df, 9, hc.metric = "euclid", hc.method = "ward.D")

fviz_dend(km.res1, cex = 0.6, palette = "Dark2",
          rect = TRUE, rect_border = "Dark2", rect_fill = TRUE, horiz = TRUE)

# Pairwise distance matrix drawn on a low/mid/high colour gradient.
fviz_dist(dista, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

# Work on the second trait only ("Leaf (E2)"), kept as a one-column data frame
# so the clone names stay attached as row names.
dat <- dados[, 2, drop = FALSE]
kable(dat)

	Leaf (E2)
AMAZON 15.15	1.1139434
AMAZON 2.1	1.0000000
APA 5	1.3222193
CAB 4	1.3891661
CAB 5003.23	1.4149733
CAB 5046.140	1.4232459
CCN 51	1.0791812
CEPEC 1008	1.6483600
CEPEC 38	1.2174839
CEPEC 40	1.3222193
CEPEC 42	1.2041200
CEPEC 44	1.2900346
CEPEC 75	1.3710679
CEPEC 82	1.2787536
CEPEC 84	1.3617278
CEPEC 89	1.3222193
CEPEC 92	1.2671717
CEPEC 93	1.3617278
CHUAO 120	1.1461280
CJ 10	1.2671717
CSUL 4	1.2671717
CSUL 5	1.4623980
EET 272	1.2900346
EET 390	1.3979400
EET 392	1.4393327
EET 62	1.3324385
ICS 100	1.4623980
ICS 78	1.3617278
ICS 95	1.5682017
IMC 2	1.2430380
IMC 23	0.8750613
IMC 51	1.0413927
MA 12	1.3222193
MOCORONGO 1	1.2900346
MOCORONGO 2	1.3222193
MOQ 417	1.1461280
MOQ 647	0.7403627
NA 312	1.2787536
NA 33	1.5118834
OC 77	1.4232459
PA 120	0.9777236
PA 148	1.1760913
PA 15	1.4698220
PA 150	1.4623980
PA 169	1.4698220
PA 285	0.3979400
PA 294	0.9030900
PA 30	0.9777236
PA 44	0.9030900
PA 51	1.2041200
PA 70	0.9777236
PA 88	1.0000000
RB 31	1.1461280
RB 32	1.4232459
RB 33	1.4623980
RB 39	1.3617278
RIM 117	1.3979400
SCAVINA 6	0.7781513
SIAL 164	1.4065402
SIAL 505	1.4393327
SIAL 542	1.2900346
SIC 19	1.4065402
SIC 23	1.6857417
SIC 842	1.2671717
SIC 864	1.1903317
SIC 891	1.3710679
SPA 12	1.5250448
SPA 5	1.3710679
TSA 516	1.5051500
TSA 644	1.4393327
TSA 654	1.2552725
UF 36	1.2787536
UF 667	1.3010300

Dimensionamento e padronização

# Standardize the selected trait (center on the mean, divide by the standard
# deviation). The defaults of scale() are spelled out for clarity.
df <- scale(dat, center = TRUE, scale = TRUE)

kable(df)

	Leaf (E2)
AMAZON 15.15	-0.7179530
AMAZON 2.1	-1.2381215
APA 5	0.2328580
CAB 4	0.5384801
CAB 5003.23	0.6562941
CAB 5046.140	0.6940595
CCN 51	-0.8766472
CEPEC 1008	1.7217393
CEPEC 38	-0.2452746
CEPEC 40	0.2328580
CEPEC 42	-0.3062831
CEPEC 44	0.0859300
CEPEC 75	0.4558590
CEPEC 82	0.0344305
CEPEC 84	0.4132204
CEPEC 89	0.2328580
CEPEC 92	-0.0184424
CEPEC 93	0.4132204
CHUAO 120	-0.5710251
CJ 10	-0.0184424
CSUL 4	-0.0184424
CSUL 5	0.8727948
EET 272	0.0859300
EET 390	0.5785344
EET 392	0.7674982
EET 62	0.2795100
ICS 100	0.8727948
ICS 78	0.4132204
ICS 95	1.3558047
IMC 2	-0.1286163
IMC 23	-1.8084856
IMC 51	-1.0491577
MA 12	0.2328580
MOCORONGO 1	0.0859300
MOCORONGO 2	0.2328580
MOQ 417	-0.5710251
MOQ 647	-2.4234048
NA 312	0.0344305
NA 33	1.0987029
OC 77	0.6940595
PA 120	-1.3398166
PA 148	-0.4342385
PA 15	0.9066865
PA 150	0.8727948
PA 169	0.9066865
PA 285	-3.9866158
PA 294	-1.6805303
PA 30	-1.3398166
PA 44	-1.6805303
PA 51	-0.3062831
PA 70	-1.3398166
PA 88	-1.2381215
RB 31	-0.5710251
RB 32	0.6940595
RB 33	0.8727948
RB 39	0.4132204
RIM 117	0.5785344
SCAVINA 6	-2.2508944
SIAL 164	0.6177955
SIAL 505	0.7674982
SIAL 542	0.0859300
SIC 19	0.6177955
SIC 23	1.8923925
SIC 842	-0.0184424
SIC 864	-0.3692287
SIC 891	0.4558590
SPA 12	1.1587869
SPA 5	0.4558590
TSA 516	1.0679640
TSA 644	0.7674982
TSA 654	-0.0727642
UF 36	0.0344305
UF 667	0.1361256

Número ótimo de clusters

# Let the NbClust indices vote on the number of clusters (searched from 2 to
# 10) for the standardized trait, using Euclidean distance and "ward.D" linkage.
nbclust_out <- NbClust(df, distance = "euclidean", method = "ward.D",
                       min.nc = 2, max.nc = 10)

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 2 proposed 3 as the best number of clusters 
## * 1 proposed 6 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## * 1 proposed 9 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

# Collect the number of clusters chosen by each index (first row of Best.nc).
nbclust_plot <- data.frame(clusters = nbclust_out$Best.nc[1, ])
# Keep every index whose choice lies in the searched range (min.nc = 2 to
# max.nc = 10). The previous upper bound of 6 silently dropped the indices
# that proposed 8, 9 and 10 clusters, so the histogram did not reflect all
# the votes reported by NbClust.
nbclust_plot <- subset(nbclust_plot, clusters >= 2 & clusters <= 10)

# Histogram of how many indices voted for each number of clusters.
ggplot(nbclust_plot) +
  aes(x = clusters) +
  geom_histogram(bins = 30L, fill = "#0c4c8a") +
  labs(x = "Number of clusters", y = "Frequency among all indices", title = "Optimal number of clusters") +
  theme_minimal()

Dendrograma

# Hierarchical clustering of the standardized trait: Euclidean distances,
# "ward.D" linkage, tree cut into the 3 groups chosen by the NbClust majority
# rule.
dista <- dist(df, method = "euclidean")
dista.hc <- hclust(d = dista, method = "ward.D")

# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
fviz_dend(dista.hc, cex = 0.5, k = 3, color_labels_by_k = TRUE, horiz = TRUE)

# Same cut via hcut(), which standardizes the raw trait itself (stand = TRUE).
# NOTE(review): hcut() is called without `hc_method`, so it falls back to its
# own default linkage (documented as "ward.D2") rather than the "ward.D" used
# above -- confirm this difference is intended.
res <- hcut(dat, k = 3, stand = TRUE)
fviz_dend(res, rect = TRUE, cex = 0.5,
          k_colors = "Dark2", horiz = TRUE)

# Hierarchical k-means with 3 clusters.
# NOTE(review): "euclid" is presumably partial-matched to "euclidean" when the
# distances are computed -- confirm.
km.res1 <- hkmeans(df, 3, hc.metric = "euclid", hc.method = "ward.D")

fviz_dend(km.res1, cex = 0.6, palette = "Dark2",
          rect = TRUE, rect_border = "Dark2", rect_fill = TRUE, horiz = TRUE)

# Pairwise distance matrix drawn on a low/mid/high colour gradient.
fviz_dist(dista, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

# Work on the third trait only ("Pod Lab."), kept as a one-column data frame
# so the clone names stay attached as row names.
dat <- dados[, 3, drop = FALSE]
kable(dat)

	Pod Lab.
AMAZON 15.15	0.8571114
AMAZON 2.1	1.5354184
APA 5	0.5068532
CAB 4	0.4960531
CAB 5003.23	1.3177060
CAB 5046.140	0.9791707
CCN 51	0.2809029
CEPEC 1008	0.9891409
CEPEC 38	1.3666029
CEPEC 40	0.7961165
CEPEC 42	1.1048756
CEPEC 44	1.2825478
CEPEC 75	0.8729614
CEPEC 82	0.9510319
CEPEC 84	0.7590695
CEPEC 89	0.2587726
CEPEC 92	0.2639505
CEPEC 93	0.3156066
CHUAO 120	1.7819526
CJ 10	1.8245065
CSUL 4	0.9232208
CSUL 5	1.1350635
EET 272	0.6117698
EET 390	0.5764116
EET 392	0.5464359
EET 62	1.1572554
ICS 100	1.1430697
ICS 78	1.2239901
ICS 95	0.6780611
IMC 2	0.5063949
IMC 23	1.9398831
IMC 51	1.9315410
MA 12	0.7386612
MOCORONGO 1	1.2753404
MOCORONGO 2	0.7311415
MOQ 417	1.0109978
MOQ 647	0.7115607
NA 312	1.4783115
NA 33	1.5895142
OC 77	1.0502791
PA 120	0.9652000
PA 148	1.0276270
PA 15	1.2356085
PA 150	1.2646627
PA 169	0.5385116
PA 285	0.6545942
PA 294	0.4485199
PA 30	0.0000000
PA 44	0.7755202
PA 51	0.7822918
PA 70	1.0344874
PA 88	0.5565108
RB 31	0.6663900
RB 32	0.3983577
RB 33	0.6417879
RB 39	0.9754240
RIM 117	2.1415689
SCAVINA 6	0.0778750
SIAL 164	1.2432902
SIAL 505	1.5233441
SIAL 542	1.2504389
SIC 19	1.2911151
SIC 23	1.4537780
SIC 842	1.7287604
SIC 864	1.7068808
SIC 891	1.2539790
SPA 12	1.7871112
SPA 5	1.3093953
TSA 516	0.1945788
TSA 644	0.3985943
TSA 654	0.7623967
UF 36	1.9960110
UF 667	2.0585615

Dimensionamento e padronização

# Standardize the selected trait (center on the mean, divide by the standard
# deviation). The defaults of scale() are spelled out for clarity.
df <- scale(dat, center = TRUE, scale = TRUE)

kable(df)

	Pod Lab.
AMAZON 15.15	-0.2973635
AMAZON 2.1	1.0323556
APA 5	-0.9839921
CAB 4	-1.0051639
CAB 5003.23	0.6055630
CAB 5046.140	-0.0580844
CCN 51	-1.4269337
CEPEC 1008	-0.0385395
CEPEC 38	0.7014181
CEPEC 40	-0.4169349
CEPEC 42	0.1883411
CEPEC 44	0.5366408
CEPEC 75	-0.2662919
CEPEC 82	-0.1132464
CEPEC 84	-0.4895599
CEPEC 89	-1.4703169
CEPEC 92	-1.4601664
CEPEC 93	-1.3589023
CHUAO 120	1.5156488
CJ 10	1.5990695
CSUL 4	-0.1677659
CSUL 5	0.2475199
EET 272	-0.7783188
EET 390	-0.8476333
EET 392	-0.9063960
EET 62	0.2910238
ICS 100	0.2632149
ICS 78	0.4218472
ICS 95	-0.6483647
IMC 2	-0.9848905
IMC 23	1.8252479
IMC 51	1.8088944
MA 12	-0.5295673
MOCORONGO 1	0.5225118
MOCORONGO 2	-0.5443085
MOQ 417	0.0043078
MOQ 647	-0.5826938
NA 312	0.9204060
NA 33	1.1384022
OC 77	0.0813128
PA 120	-0.0854719
PA 148	0.0369068
PA 15	0.4446232
PA 150	0.5015796
PA 169	-0.9219304
PA 285	-0.6943680
PA 294	-1.0983457
PA 30	-1.9776016
PA 44	-0.4573107
PA 51	-0.4440361
PA 70	0.0503555
PA 88	-0.8866458
RB 31	-0.6712442
RB 32	-1.1966812
RB 33	-0.7194729
RB 39	-0.0654294
RIM 117	2.2206225
SCAVINA 6	-1.8249394
SIAL 164	0.4596822
SIAL 505	1.0086858
SIAL 542	0.4736960
SIC 19	0.5534357
SIC 23	0.8723117
SIC 842	1.4113735
SIC 864	1.3684819
SIC 891	0.4806359
SPA 12	1.5257616
SPA 5	0.5892712
TSA 516	-1.5961591
TSA 644	-1.1962174
TSA 654	-0.4830375
UF 36	1.9352783
UF 667	2.0578990

Número ótimo de clusters

# Let the NbClust indices vote on the number of clusters (searched from 2 to
# 10) for the standardized trait, using Euclidean distance and "ward.D" linkage.
nbclust_out <- NbClust(df, distance = "euclidean", method = "ward.D",
                       min.nc = 2, max.nc = 10)

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 1 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  4 
##  
##  
## *******************************************************************

# Collect the number of clusters chosen by each index (first row of Best.nc).
nbclust_plot <- data.frame(clusters = nbclust_out$Best.nc[1, ])
# Keep every index whose choice lies in the searched range (min.nc = 2 to
# max.nc = 10). The previous upper bound of 6 silently dropped the indices
# that proposed 7, 8 and 10 clusters, so the histogram did not reflect all
# the votes reported by NbClust.
nbclust_plot <- subset(nbclust_plot, clusters >= 2 & clusters <= 10)

# Histogram of how many indices voted for each number of clusters.
ggplot(nbclust_plot) +
  aes(x = clusters) +
  geom_histogram(bins = 30L, fill = "#0c4c8a") +
  labs(x = "Number of clusters", y = "Frequency among all indices", title = "Optimal number of clusters") +
  theme_minimal()

Dendrograma

# Hierarchical clustering of the standardized trait: Euclidean distances,
# "ward.D" linkage, tree cut into the 4 groups chosen by the NbClust majority
# rule.
dista <- dist(df, method = "euclidean")
dista.hc <- hclust(d = dista, method = "ward.D")

# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
fviz_dend(dista.hc, cex = 0.5, k = 4, color_labels_by_k = TRUE, horiz = TRUE)

# Same cut via hcut(), which standardizes the raw trait itself (stand = TRUE).
# NOTE(review): hcut() is called without `hc_method`, so it falls back to its
# own default linkage (documented as "ward.D2") rather than the "ward.D" used
# above -- confirm this difference is intended.
res <- hcut(dat, k = 4, stand = TRUE)
fviz_dend(res, rect = TRUE, cex = 0.5,
          k_colors = "Dark2", horiz = TRUE)

# Hierarchical k-means with 4 clusters.
# NOTE(review): "euclid" is presumably partial-matched to "euclidean" when the
# distances are computed -- confirm.
km.res1 <- hkmeans(df, 4, hc.metric = "euclid", hc.method = "ward.D")

fviz_dend(km.res1, cex = 0.6, palette = "Dark2",
          rect = TRUE, rect_border = "Dark2", rect_fill = TRUE, horiz = TRUE)

# Pairwise distance matrix drawn on a low/mid/high colour gradient.
fviz_dist(dista, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

# Work on the fourth trait only ("Pod Field"), kept as a one-column data frame
# so the clone names stay attached as row names.
dat <- dados[, 4, drop = FALSE]
kable(dat)

	Pod Field
AMAZON 15.15	1.723369
AMAZON 2.1	1.900000
APA 5	2.918904
CAB 4	2.104757
CAB 5003.23	1.466288
CAB 5046.140	6.892750
CCN 51	5.754129
CEPEC 1008	4.869292
CEPEC 38	4.202380
CEPEC 40	1.445683
CEPEC 42	3.495712
CEPEC 44	1.539480
CEPEC 75	1.624808
CEPEC 82	2.424871
CEPEC 84	1.876166
CEPEC 89	2.391652
CEPEC 92	3.224903
CEPEC 93	1.288410
CHUAO 120	6.527634
CJ 10	1.606238
CSUL 4	2.922328
CSUL 5	5.498182
EET 272	1.926136
EET 390	2.083267
EET 392	4.435087
EET 62	3.896152
ICS 100	5.324472
ICS 78	7.615773
ICS 95	4.369211
IMC 2	3.018278
IMC 23	2.118962
IMC 51	1.407125
MA 12	2.670206
MOCORONGO 1	2.875761
MOCORONGO 2	2.600000
MOQ 417	1.260952
MOQ 647	1.268858
NA 312	3.917908
NA 33	2.090454
OC 77	3.917908
PA 120	1.228821
PA 148	3.938274
PA 15	4.701064
PA 150	4.282523
PA 169	2.477902
PA 285	3.056141
PA 294	2.477902
PA 30	3.364521
PA 44	2.100000
PA 51	1.720465
PA 70	2.095233
PA 88	2.306513
RB 31	2.170253
RB 32	2.233831
RB 33	2.034699
RB 39	3.430743
RIM 117	7.009280
SCAVINA 6	3.424909
SIAL 164	4.846648
SIAL 505	7.345747
SIAL 542	4.316248
SIC 19	4.771792
SIC 23	4.941660
SIC 842	5.336666
SIC 864	5.502727
SIC 891	5.582114
SPA 12	2.760435
SPA 5	4.760252
TSA 516	3.856164
TSA 644	2.092845
TSA 654	1.989975
UF 36	1.311488
UF 667	5.486347

Dimensionamento e padronização

# Standardize the selected trait (center on the mean, divide by the standard
# deviation). The defaults of scale() are spelled out for clarity.
df <- scale(dat, center = TRUE, scale = TRUE)

kable(df)

	Pod Field
AMAZON 15.15	-0.9692307
AMAZON 2.1	-0.8629994
APA 5	-0.2502001
CAB 4	-0.7398527
CAB 5003.23	-1.1238469
CAB 5046.140	2.1397897
CCN 51	1.4549891
CEPEC 1008	0.9228214
CEPEC 38	0.5217210
CEPEC 40	-1.1362391
CEPEC 42	0.0967094
CEPEC 44	-1.0798266
CEPEC 75	-1.0285083
CEPEC 82	-0.5473262
CEPEC 84	-0.8773337
CEPEC 89	-0.5673051
CEPEC 92	-0.0661630
CEPEC 93	-1.2308280
CHUAO 120	1.9201979
CJ 10	-1.0396767
CSUL 4	-0.2481408
CSUL 5	1.3010546
EET 272	-0.8472804
EET 390	-0.7527773
EET 392	0.6616780
EET 62	0.3375461
ICS 100	1.1965804
ICS 78	2.5746376
ICS 95	0.6220577
IMC 2	-0.1904337
IMC 23	-0.7313091
IMC 51	-1.1594293
MA 12	-0.3997745
MOCORONGO 1	-0.2761477
MOCORONGO 2	-0.4419984
MOQ 417	-1.2473420
MOQ 647	-1.2425872
NA 312	0.3506306
NA 33	-0.7484543
OC 77	0.3506306
PA 120	-1.2666668
PA 148	0.3628794
PA 15	0.8216441
PA 150	0.5699210
PA 169	-0.5154316
PA 285	-0.1676614
PA 294	-0.5154316
PA 30	0.0178072
PA 44	-0.7427134
PA 51	-0.9709771
PA 70	-0.7455806
PA 88	-0.6185106
RB 31	-0.7004609
RB 32	-0.6622236
RB 33	-0.7819874
RB 39	0.0576355
RIM 117	2.2098742
SCAVINA 6	0.0541264
SIAL 164	0.9092031
SIAL 505	2.4122358
SIAL 542	0.5902047
SIC 19	0.8641823
SIC 23	0.9663457
SIC 842	1.2039141
SIC 864	1.3037882
SIC 891	1.3515343
SPA 12	-0.3455082
SPA 5	0.8572418
TSA 516	0.3134961
TSA 644	-0.7470167
TSA 654	-0.8088858
UF 36	-1.2169483
UF 667	1.2939368

Número ótimo de clusters

# Let the NbClust indices vote on the number of clusters (searched from 2 to
# 10) for the standardized trait, using Euclidean distance and "ward.D" linkage.
nbclust_out <- NbClust(df, distance = "euclidean", method = "ward.D",
                       min.nc = 2, max.nc = 10)

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 1 proposed 3 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 2 proposed 6 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 9 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  6 
##  
##  
## *******************************************************************

# Collect the number of clusters chosen by each index (first row of Best.nc).
nbclust_plot <- data.frame(clusters = nbclust_out[["Best.nc"]][1, ])
# Keep only the indices whose choice lies between 2 and 10 clusters.
nbclust_plot <- subset(nbclust_plot, clusters >= 2 & clusters <= 10)

# Histogram of how many indices voted for each number of clusters.
ggplot(nbclust_plot, aes(x = clusters)) +
  geom_histogram(bins = 30L, fill = "#0c4c8a") +
  labs(
    x = "Number of clusters",
    y = "Frequency among all indices",
    title = "Optimal number of clusters"
  ) +
  theme_minimal()

Dendrograma

# Hierarchical clustering of the standardized trait: Euclidean distances,
# "ward.D" linkage, tree cut into the 6 groups chosen by the NbClust majority
# rule.
dista <- dist(df, method = "euclidean")
dista.hc <- hclust(d = dista, method = "ward.D")

# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
fviz_dend(dista.hc, cex = 0.5, k = 6, color_labels_by_k = TRUE, horiz = TRUE)

# Same cut via hcut(), which standardizes the raw trait itself (stand = TRUE).
# NOTE(review): hcut() is called without `hc_method`, so it falls back to its
# own default linkage (documented as "ward.D2") rather than the "ward.D" used
# above -- confirm this difference is intended.
res <- hcut(dat, k = 6, stand = TRUE)
fviz_dend(res, rect = TRUE, cex = 0.5,
          k_colors = "Dark2", horiz = TRUE)

# Hierarchical k-means with 6 clusters.
# NOTE(review): "euclid" is presumably partial-matched to "euclidean" when the
# distances are computed -- confirm.
km.res1 <- hkmeans(df, 6, hc.metric = "euclid", hc.method = "ward.D")

fviz_dend(km.res1, cex = 0.6, palette = "Dark2",
          rect = TRUE, rect_border = "Dark2", rect_fill = TRUE, horiz = TRUE)

# Pairwise distance matrix drawn on a low/mid/high colour gradient.
fviz_dist(dista, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

K-means

Cid Póvoas

2020-07-09