Carregamento dos dados
# Load the clone-by-trait table from the Excel workbook.
# NOTE(review): absolute, user-specific path -- the script only runs on this
# machine; consider a project-relative path.
dados <- read_excel("C:/Users/cided/Desktop/Vanusa/dados/dados.xlsx")
# Convert to a plain data.frame so that row names can be assigned.
dados <- as.data.frame(dados)
# Fail early if the identifier column is missing instead of silently
# resetting the row names and dropping an unrelated column.
stopifnot("CLONES" %in% names(dados))
# Clone identifiers become the row names (exact match, no partial `$` lookup).
rownames(dados) <- dados[["CLONES"]]
# Drop the identifier column by name rather than by position, so the right
# column is removed even if the workbook's column order changes.
dados <- dados[setdiff(names(dados), "CLONES")]
# Exactly four trait columns must remain for the display names below.
stopifnot(ncol(dados) == 4L)
names(dados) <- c("Leaf (E1)","Leaf (E2)", "Pod Lab.", "Pod Field")
# First trait, kept as a one-column data.frame so the row names survive.
dat <- dados[1]
dat %>% kable()
| | Leaf (E1) |
|---|---|
| AMAZON 15.15 | 1.6020600 |
| AMAZON 2.1 | 1.0211893 |
| APA 5 | 0.9542425 |
| CAB 4 | 1.3222193 |
| CAB 5003.23 | 1.3802112 |
| CAB 5046.140 | 1.2174839 |
| CCN 51 | 1.6532125 |
| CEPEC 1008 | 1.3521825 |
| CEPEC 38 | 1.3979400 |
| CEPEC 40 | 1.5740313 |
| CEPEC 42 | 1.0413927 |
| CEPEC 44 | 1.7817554 |
| CEPEC 75 | 1.4623980 |
| CEPEC 82 | 1.6901961 |
| CEPEC 84 | 1.5118834 |
| CEPEC 89 | 1.3010300 |
| CEPEC 92 | 1.3010300 |
| CEPEC 93 | 1.4623980 |
| CHUAO 120 | 0.9777236 |
| CJ 10 | 1.6020600 |
| CSUL 4 | 1.2304489 |
| CSUL 5 | 1.4313638 |
| EET 272 | 1.3324385 |
| EET 390 | 1.1760913 |
| EET 392 | 1.3117539 |
| EET 62 | 1.3117539 |
| ICS 100 | 1.5563025 |
| ICS 78 | 1.5051500 |
| ICS 95 | 1.4771213 |
| IMC 2 | 1.6020600 |
| IMC 23 | 1.5563025 |
| IMC 51 | 1.5682017 |
| MA 12 | 1.5250448 |
| MOCORONGO 1 | 1.0211893 |
| MOCORONGO 2 | 1.1461280 |
| MOQ 417 | 1.3891661 |
| MOQ 647 | 1.2041200 |
| NA 312 | 1.3424227 |
| NA 33 | 1.5440680 |
| OC 77 | 1.7242759 |
| PA 120 | 1.4232459 |
| PA 148 | 1.3710679 |
| PA 15 | 1.3617278 |
| PA 150 | 1.3324385 |
| PA 169 | 0.9542425 |
| PA 285 | 0.3979400 |
| PA 294 | 1.2174839 |
| PA 30 | 1.1613680 |
| PA 44 | 1.3710679 |
| PA 51 | 1.3324385 |
| PA 70 | 1.3010300 |
| PA 88 | 1.2787536 |
| RB 31 | 1.3222193 |
| RB 32 | 1.5563025 |
| RB 33 | 1.2304489 |
| RB 39 | 1.3324385 |
| RIM 117 | 1.4471580 |
| SCAVINA 6 | 0.7403627 |
| SIAL 164 | 1.3010300 |
| SIAL 505 | 1.2552725 |
| SIAL 542 | 1.4913617 |
| SIC 19 | 1.6283889 |
| SIC 23 | 1.8195439 |
| SIC 842 | 1.2552725 |
| SIC 864 | 1.3710679 |
| SIC 891 | 1.3710679 |
| SPA 12 | 1.4393327 |
| SPA 5 | 1.4149733 |
| TSA 516 | 1.5440680 |
| TSA 644 | 1.6074550 |
| TSA 654 | 1.3324385 |
| UF 36 | 1.4065402 |
| UF 667 | 1.5250448 |
Dimensionamento e padronização
# Standardize the selected trait (centered and scaled, the scale() defaults
# written out explicitly) and print the result as a table.
df <- scale(dat, center = TRUE, scale = TRUE)
kable(df)
| | Leaf (E1) |
|---|---|
| AMAZON 15.15 | 1.0339809 |
| AMAZON 2.1 | -1.4723134 |
| APA 5 | -1.7611700 |
| CAB 4 | -0.1734533 |
| CAB 5003.23 | 0.0767657 |
| CAB 5046.140 | -0.6253570 |
| CCN 51 | 1.2546897 |
| CEPEC 1008 | -0.0441704 |
| CEPEC 38 | 0.1532604 |
| CEPEC 40 | 0.9130448 |
| CEPEC 42 | -1.3851414 |
| CEPEC 44 | 1.8093162 |
| CEPEC 75 | 0.4313785 |
| CEPEC 82 | 1.4142635 |
| CEPEC 84 | 0.6448940 |
| CEPEC 89 | -0.2648792 |
| CEPEC 92 | -0.2648792 |
| CEPEC 93 | 0.4313785 |
| CHUAO 120 | -1.6598557 |
| CJ 10 | 1.0339809 |
| CSUL 4 | -0.5694167 |
| CSUL 5 | 0.2974745 |
| EET 272 | -0.1293605 |
| EET 390 | -0.8039548 |
| EET 392 | -0.2186087 |
| EET 62 | -0.2186087 |
| ICS 100 | 0.8365502 |
| ICS 78 | 0.6158414 |
| ICS 95 | 0.4949053 |
| IMC 2 | 1.0339809 |
| IMC 23 | 0.8365502 |
| IMC 51 | 0.8878920 |
| MA 12 | 0.7016820 |
| MOCORONGO 1 | -1.4723134 |
| MOCORONGO 2 | -0.9332377 |
| MOQ 417 | 0.1154034 |
| MOQ 647 | -0.6830187 |
| NA 312 | -0.0862813 |
| NA 33 | 0.7837619 |
| OC 77 | 1.5613082 |
| PA 120 | 0.2624481 |
| PA 148 | 0.0373146 |
| PA 15 | -0.0029850 |
| PA 150 | -0.1293605 |
| PA 169 | -1.7611700 |
| PA 285 | -4.1614595 |
| PA 294 | -0.6253570 |
| PA 30 | -0.8674815 |
| PA 44 | 0.0373146 |
| PA 51 | -0.1293605 |
| PA 70 | -0.2648792 |
| PA 88 | -0.3609956 |
| RB 31 | -0.1734533 |
| RB 32 | 0.8365502 |
| RB 33 | -0.5694167 |
| RB 39 | -0.1293605 |
| RIM 117 | 0.3656224 |
| SCAVINA 6 | -2.6840015 |
| SIAL 164 | -0.2648792 |
| SIAL 505 | -0.4623099 |
| SIAL 542 | 0.5563488 |
| SIC 19 | 1.1475829 |
| SIC 23 | 1.9723632 |
| SIC 842 | -0.4623099 |
| SIC 864 | 0.0373146 |
| SIC 891 | 0.0373146 |
| SPA 12 | 0.3318582 |
| SPA 5 | 0.2267545 |
| TSA 516 | 0.7837619 |
| TSA 644 | 1.0572590 |
| TSA 654 | -0.1293605 |
| UF 36 | 0.1903677 |
| UF 667 | 0.7016820 |
Número ótimo de clusters
# Ask NbClust for the best number of clusters, searching 2 to 10, on the
# standardized trait with Euclidean distances and the "ward.D" method.
nbclust_out <- NbClust(
  data = df,
  distance = "euclidean",
  method = "ward.D",
  min.nc = 2,
  max.nc = 10
)
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 2 proposed 3 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 3 proposed 9 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 9
##
##
## *******************************************************************
# Collect the number of clusters proposed by each NbClust index
# (first row of Best.nc).
nbclust_plot <- data.frame(clusters = nbclust_out$Best.nc[1, ])
# Keep only proposals between 2 and 10 clusters.
nbclust_plot <- nbclust_plot[
  which(nbclust_plot$clusters >= 2 & nbclust_plot$clusters <= 10), ,
  drop = FALSE
]
# Histogram of how many indices proposed each number of clusters.
ggplot(nbclust_plot, aes(x = clusters)) +
  geom_histogram(bins = 30L, fill = "#0c4c8a") +
  labs(
    x = "Number of clusters",
    y = "Frequency among all indices",
    title = "Optimal number of clusters"
  ) +
  theme_minimal()
Dendrograma
# Hierarchical clustering ("ward.D") on Euclidean distances of the
# standardized trait.
dista <- dist(df, method = "euclidean")
dista.hc <- hclust(d = dista, method = "ward.D")
# Dendrogram cut into the 9 clusters chosen by the NbClust majority rule.
fviz_dend(dista.hc, cex = 0.5, k = 9, color_labels_by_k = TRUE, horiz = TRUE)
# Same cut via hcut() on the unscaled trait; stand = TRUE standardizes it.
# NOTE(review): hcut() is left on its default linkage here, while the other
# fits request "ward.D" explicitly -- confirm this difference is intended.
res <- hcut(dat, k = 9, stand = TRUE)
fviz_dend(
  res,
  rect = TRUE, cex = 0.5,
  k_colors = "Dark2", horiz = TRUE
)
# Hierarchical k-means with 9 clusters; the metric name is spelled out in
# full instead of relying on partial matching of "euclid".
km.res1 <- hkmeans(df, 9, hc.metric = "euclidean", hc.method = "ward.D")
fviz_dend(
  km.res1,
  cex = 0.6, palette = "Dark2",
  rect = TRUE, rect_border = "Dark2", rect_fill = TRUE, horiz = TRUE
)
# Visualize the pairwise distance matrix.
fviz_dist(
  dista,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")
)
# Switch to the second trait column ("Leaf (E2)"), kept as a one-column
# data.frame so the clone row names are preserved, then print it.
dat <- dados[2]
kable(dat)
| | Leaf (E2) |
|---|---|
| AMAZON 15.15 | 1.1139434 |
| AMAZON 2.1 | 1.0000000 |
| APA 5 | 1.3222193 |
| CAB 4 | 1.3891661 |
| CAB 5003.23 | 1.4149733 |
| CAB 5046.140 | 1.4232459 |
| CCN 51 | 1.0791812 |
| CEPEC 1008 | 1.6483600 |
| CEPEC 38 | 1.2174839 |
| CEPEC 40 | 1.3222193 |
| CEPEC 42 | 1.2041200 |
| CEPEC 44 | 1.2900346 |
| CEPEC 75 | 1.3710679 |
| CEPEC 82 | 1.2787536 |
| CEPEC 84 | 1.3617278 |
| CEPEC 89 | 1.3222193 |
| CEPEC 92 | 1.2671717 |
| CEPEC 93 | 1.3617278 |
| CHUAO 120 | 1.1461280 |
| CJ 10 | 1.2671717 |
| CSUL 4 | 1.2671717 |
| CSUL 5 | 1.4623980 |
| EET 272 | 1.2900346 |
| EET 390 | 1.3979400 |
| EET 392 | 1.4393327 |
| EET 62 | 1.3324385 |
| ICS 100 | 1.4623980 |
| ICS 78 | 1.3617278 |
| ICS 95 | 1.5682017 |
| IMC 2 | 1.2430380 |
| IMC 23 | 0.8750613 |
| IMC 51 | 1.0413927 |
| MA 12 | 1.3222193 |
| MOCORONGO 1 | 1.2900346 |
| MOCORONGO 2 | 1.3222193 |
| MOQ 417 | 1.1461280 |
| MOQ 647 | 0.7403627 |
| NA 312 | 1.2787536 |
| NA 33 | 1.5118834 |
| OC 77 | 1.4232459 |
| PA 120 | 0.9777236 |
| PA 148 | 1.1760913 |
| PA 15 | 1.4698220 |
| PA 150 | 1.4623980 |
| PA 169 | 1.4698220 |
| PA 285 | 0.3979400 |
| PA 294 | 0.9030900 |
| PA 30 | 0.9777236 |
| PA 44 | 0.9030900 |
| PA 51 | 1.2041200 |
| PA 70 | 0.9777236 |
| PA 88 | 1.0000000 |
| RB 31 | 1.1461280 |
| RB 32 | 1.4232459 |
| RB 33 | 1.4623980 |
| RB 39 | 1.3617278 |
| RIM 117 | 1.3979400 |
| SCAVINA 6 | 0.7781513 |
| SIAL 164 | 1.4065402 |
| SIAL 505 | 1.4393327 |
| SIAL 542 | 1.2900346 |
| SIC 19 | 1.4065402 |
| SIC 23 | 1.6857417 |
| SIC 842 | 1.2671717 |
| SIC 864 | 1.1903317 |
| SIC 891 | 1.3710679 |
| SPA 12 | 1.5250448 |
| SPA 5 | 1.3710679 |
| TSA 516 | 1.5051500 |
| TSA 644 | 1.4393327 |
| TSA 654 | 1.2552725 |
| UF 36 | 1.2787536 |
| UF 667 | 1.3010300 |
Dimensionamento e padronização
# Standardize the selected trait (centered and scaled, the scale() defaults
# written out explicitly) and print the result as a table.
df <- scale(dat, center = TRUE, scale = TRUE)
kable(df)
| | Leaf (E2) |
|---|---|
| AMAZON 15.15 | -0.7179530 |
| AMAZON 2.1 | -1.2381215 |
| APA 5 | 0.2328580 |
| CAB 4 | 0.5384801 |
| CAB 5003.23 | 0.6562941 |
| CAB 5046.140 | 0.6940595 |
| CCN 51 | -0.8766472 |
| CEPEC 1008 | 1.7217393 |
| CEPEC 38 | -0.2452746 |
| CEPEC 40 | 0.2328580 |
| CEPEC 42 | -0.3062831 |
| CEPEC 44 | 0.0859300 |
| CEPEC 75 | 0.4558590 |
| CEPEC 82 | 0.0344305 |
| CEPEC 84 | 0.4132204 |
| CEPEC 89 | 0.2328580 |
| CEPEC 92 | -0.0184424 |
| CEPEC 93 | 0.4132204 |
| CHUAO 120 | -0.5710251 |
| CJ 10 | -0.0184424 |
| CSUL 4 | -0.0184424 |
| CSUL 5 | 0.8727948 |
| EET 272 | 0.0859300 |
| EET 390 | 0.5785344 |
| EET 392 | 0.7674982 |
| EET 62 | 0.2795100 |
| ICS 100 | 0.8727948 |
| ICS 78 | 0.4132204 |
| ICS 95 | 1.3558047 |
| IMC 2 | -0.1286163 |
| IMC 23 | -1.8084856 |
| IMC 51 | -1.0491577 |
| MA 12 | 0.2328580 |
| MOCORONGO 1 | 0.0859300 |
| MOCORONGO 2 | 0.2328580 |
| MOQ 417 | -0.5710251 |
| MOQ 647 | -2.4234048 |
| NA 312 | 0.0344305 |
| NA 33 | 1.0987029 |
| OC 77 | 0.6940595 |
| PA 120 | -1.3398166 |
| PA 148 | -0.4342385 |
| PA 15 | 0.9066865 |
| PA 150 | 0.8727948 |
| PA 169 | 0.9066865 |
| PA 285 | -3.9866158 |
| PA 294 | -1.6805303 |
| PA 30 | -1.3398166 |
| PA 44 | -1.6805303 |
| PA 51 | -0.3062831 |
| PA 70 | -1.3398166 |
| PA 88 | -1.2381215 |
| RB 31 | -0.5710251 |
| RB 32 | 0.6940595 |
| RB 33 | 0.8727948 |
| RB 39 | 0.4132204 |
| RIM 117 | 0.5785344 |
| SCAVINA 6 | -2.2508944 |
| SIAL 164 | 0.6177955 |
| SIAL 505 | 0.7674982 |
| SIAL 542 | 0.0859300 |
| SIC 19 | 0.6177955 |
| SIC 23 | 1.8923925 |
| SIC 842 | -0.0184424 |
| SIC 864 | -0.3692287 |
| SIC 891 | 0.4558590 |
| SPA 12 | 1.1587869 |
| SPA 5 | 0.4558590 |
| TSA 516 | 1.0679640 |
| TSA 644 | 0.7674982 |
| TSA 654 | -0.0727642 |
| UF 36 | 0.0344305 |
| UF 667 | 0.1361256 |
Número ótimo de clusters
# Ask NbClust for the best number of clusters, searching 2 to 10, on the
# standardized trait with Euclidean distances and the "ward.D" method.
nbclust_out <- NbClust(
  data = df,
  distance = "euclidean",
  method = "ward.D",
  min.nc = 2,
  max.nc = 10
)
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 2 proposed 3 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
# Collect the number of clusters proposed by each NbClust index
# (first row of Best.nc).
nbclust_plot <- data.frame(clusters = nbclust_out$Best.nc[1, ])
# Keep every proposal inside the range searched by NbClust (min.nc = 2,
# max.nc = 10). The previous upper bound of 6 silently dropped the indices
# that proposed 8, 9 or 10 clusters from the frequency plot.
nbclust_plot <- subset(nbclust_plot, clusters >= 2 & clusters <= 10)
# Histogram of how many indices proposed each number of clusters.
ggplot(nbclust_plot) +
  aes(x = clusters) +
  geom_histogram(bins = 30L, fill = "#0c4c8a") +
  labs(
    x = "Number of clusters",
    y = "Frequency among all indices",
    title = "Optimal number of clusters"
  ) +
  theme_minimal()
Dendrograma
# Hierarchical clustering ("ward.D") on Euclidean distances of the
# standardized trait.
dista <- dist(df, method = "euclidean")
dista.hc <- hclust(d = dista, method = "ward.D")
# Dendrogram cut into the 3 clusters chosen by the NbClust majority rule.
fviz_dend(dista.hc, cex = 0.5, k = 3, color_labels_by_k = TRUE, horiz = TRUE)
# Same cut via hcut() on the unscaled trait; stand = TRUE standardizes it.
# NOTE(review): hcut() is left on its default linkage here, while the other
# fits request "ward.D" explicitly -- confirm this difference is intended.
res <- hcut(dat, k = 3, stand = TRUE)
fviz_dend(
  res,
  rect = TRUE, cex = 0.5,
  k_colors = "Dark2", horiz = TRUE
)
# Hierarchical k-means with 3 clusters; the metric name is spelled out in
# full instead of relying on partial matching of "euclid".
km.res1 <- hkmeans(df, 3, hc.metric = "euclidean", hc.method = "ward.D")
fviz_dend(
  km.res1,
  cex = 0.6, palette = "Dark2",
  rect = TRUE, rect_border = "Dark2", rect_fill = TRUE, horiz = TRUE
)
# Visualize the pairwise distance matrix.
fviz_dist(
  dista,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")
)
# Switch to the third trait column ("Pod Lab."), kept as a one-column
# data.frame so the clone row names are preserved, then print it.
dat <- dados[3]
kable(dat)
| | Pod Lab. |
|---|---|
| AMAZON 15.15 | 0.8571114 |
| AMAZON 2.1 | 1.5354184 |
| APA 5 | 0.5068532 |
| CAB 4 | 0.4960531 |
| CAB 5003.23 | 1.3177060 |
| CAB 5046.140 | 0.9791707 |
| CCN 51 | 0.2809029 |
| CEPEC 1008 | 0.9891409 |
| CEPEC 38 | 1.3666029 |
| CEPEC 40 | 0.7961165 |
| CEPEC 42 | 1.1048756 |
| CEPEC 44 | 1.2825478 |
| CEPEC 75 | 0.8729614 |
| CEPEC 82 | 0.9510319 |
| CEPEC 84 | 0.7590695 |
| CEPEC 89 | 0.2587726 |
| CEPEC 92 | 0.2639505 |
| CEPEC 93 | 0.3156066 |
| CHUAO 120 | 1.7819526 |
| CJ 10 | 1.8245065 |
| CSUL 4 | 0.9232208 |
| CSUL 5 | 1.1350635 |
| EET 272 | 0.6117698 |
| EET 390 | 0.5764116 |
| EET 392 | 0.5464359 |
| EET 62 | 1.1572554 |
| ICS 100 | 1.1430697 |
| ICS 78 | 1.2239901 |
| ICS 95 | 0.6780611 |
| IMC 2 | 0.5063949 |
| IMC 23 | 1.9398831 |
| IMC 51 | 1.9315410 |
| MA 12 | 0.7386612 |
| MOCORONGO 1 | 1.2753404 |
| MOCORONGO 2 | 0.7311415 |
| MOQ 417 | 1.0109978 |
| MOQ 647 | 0.7115607 |
| NA 312 | 1.4783115 |
| NA 33 | 1.5895142 |
| OC 77 | 1.0502791 |
| PA 120 | 0.9652000 |
| PA 148 | 1.0276270 |
| PA 15 | 1.2356085 |
| PA 150 | 1.2646627 |
| PA 169 | 0.5385116 |
| PA 285 | 0.6545942 |
| PA 294 | 0.4485199 |
| PA 30 | 0.0000000 |
| PA 44 | 0.7755202 |
| PA 51 | 0.7822918 |
| PA 70 | 1.0344874 |
| PA 88 | 0.5565108 |
| RB 31 | 0.6663900 |
| RB 32 | 0.3983577 |
| RB 33 | 0.6417879 |
| RB 39 | 0.9754240 |
| RIM 117 | 2.1415689 |
| SCAVINA 6 | 0.0778750 |
| SIAL 164 | 1.2432902 |
| SIAL 505 | 1.5233441 |
| SIAL 542 | 1.2504389 |
| SIC 19 | 1.2911151 |
| SIC 23 | 1.4537780 |
| SIC 842 | 1.7287604 |
| SIC 864 | 1.7068808 |
| SIC 891 | 1.2539790 |
| SPA 12 | 1.7871112 |
| SPA 5 | 1.3093953 |
| TSA 516 | 0.1945788 |
| TSA 644 | 0.3985943 |
| TSA 654 | 0.7623967 |
| UF 36 | 1.9960110 |
| UF 667 | 2.0585615 |
Dimensionamento e padronização
# Standardize the selected trait (centered and scaled, the scale() defaults
# written out explicitly) and print the result as a table.
df <- scale(dat, center = TRUE, scale = TRUE)
kable(df)
| | Pod Lab. |
|---|---|
| AMAZON 15.15 | -0.2973635 |
| AMAZON 2.1 | 1.0323556 |
| APA 5 | -0.9839921 |
| CAB 4 | -1.0051639 |
| CAB 5003.23 | 0.6055630 |
| CAB 5046.140 | -0.0580844 |
| CCN 51 | -1.4269337 |
| CEPEC 1008 | -0.0385395 |
| CEPEC 38 | 0.7014181 |
| CEPEC 40 | -0.4169349 |
| CEPEC 42 | 0.1883411 |
| CEPEC 44 | 0.5366408 |
| CEPEC 75 | -0.2662919 |
| CEPEC 82 | -0.1132464 |
| CEPEC 84 | -0.4895599 |
| CEPEC 89 | -1.4703169 |
| CEPEC 92 | -1.4601664 |
| CEPEC 93 | -1.3589023 |
| CHUAO 120 | 1.5156488 |
| CJ 10 | 1.5990695 |
| CSUL 4 | -0.1677659 |
| CSUL 5 | 0.2475199 |
| EET 272 | -0.7783188 |
| EET 390 | -0.8476333 |
| EET 392 | -0.9063960 |
| EET 62 | 0.2910238 |
| ICS 100 | 0.2632149 |
| ICS 78 | 0.4218472 |
| ICS 95 | -0.6483647 |
| IMC 2 | -0.9848905 |
| IMC 23 | 1.8252479 |
| IMC 51 | 1.8088944 |
| MA 12 | -0.5295673 |
| MOCORONGO 1 | 0.5225118 |
| MOCORONGO 2 | -0.5443085 |
| MOQ 417 | 0.0043078 |
| MOQ 647 | -0.5826938 |
| NA 312 | 0.9204060 |
| NA 33 | 1.1384022 |
| OC 77 | 0.0813128 |
| PA 120 | -0.0854719 |
| PA 148 | 0.0369068 |
| PA 15 | 0.4446232 |
| PA 150 | 0.5015796 |
| PA 169 | -0.9219304 |
| PA 285 | -0.6943680 |
| PA 294 | -1.0983457 |
| PA 30 | -1.9776016 |
| PA 44 | -0.4573107 |
| PA 51 | -0.4440361 |
| PA 70 | 0.0503555 |
| PA 88 | -0.8866458 |
| RB 31 | -0.6712442 |
| RB 32 | -1.1966812 |
| RB 33 | -0.7194729 |
| RB 39 | -0.0654294 |
| RIM 117 | 2.2206225 |
| SCAVINA 6 | -1.8249394 |
| SIAL 164 | 0.4596822 |
| SIAL 505 | 1.0086858 |
| SIAL 542 | 0.4736960 |
| SIC 19 | 0.5534357 |
| SIC 23 | 0.8723117 |
| SIC 842 | 1.4113735 |
| SIC 864 | 1.3684819 |
| SIC 891 | 0.4806359 |
| SPA 12 | 1.5257616 |
| SPA 5 | 0.5892712 |
| TSA 516 | -1.5961591 |
| TSA 644 | -1.1962174 |
| TSA 654 | -0.4830375 |
| UF 36 | 1.9352783 |
| UF 667 | 2.0578990 |
Número ótimo de clusters
# Ask NbClust for the best number of clusters, searching 2 to 10, on the
# standardized trait with Euclidean distances and the "ward.D" method.
nbclust_out <- NbClust(
  data = df,
  distance = "euclidean",
  method = "ward.D",
  min.nc = 2,
  max.nc = 10
)
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 1 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 4
##
##
## *******************************************************************
# Collect the number of clusters proposed by each NbClust index
# (first row of Best.nc).
nbclust_plot <- data.frame(clusters = nbclust_out$Best.nc[1, ])
# Keep every proposal inside the range searched by NbClust (min.nc = 2,
# max.nc = 10). The previous upper bound of 6 silently dropped the indices
# that proposed 7, 8 or 10 clusters from the frequency plot.
nbclust_plot <- subset(nbclust_plot, clusters >= 2 & clusters <= 10)
# Histogram of how many indices proposed each number of clusters.
ggplot(nbclust_plot) +
  aes(x = clusters) +
  geom_histogram(bins = 30L, fill = "#0c4c8a") +
  labs(
    x = "Number of clusters",
    y = "Frequency among all indices",
    title = "Optimal number of clusters"
  ) +
  theme_minimal()
Dendrograma
# Hierarchical clustering ("ward.D") on Euclidean distances of the
# standardized trait.
dista <- dist(df, method = "euclidean")
dista.hc <- hclust(d = dista, method = "ward.D")
# Dendrogram cut into the 4 clusters chosen by the NbClust majority rule.
fviz_dend(dista.hc, cex = 0.5, k = 4, color_labels_by_k = TRUE, horiz = TRUE)
# Same cut via hcut() on the unscaled trait; stand = TRUE standardizes it.
# NOTE(review): hcut() is left on its default linkage here, while the other
# fits request "ward.D" explicitly -- confirm this difference is intended.
res <- hcut(dat, k = 4, stand = TRUE)
fviz_dend(
  res,
  rect = TRUE, cex = 0.5,
  k_colors = "Dark2", horiz = TRUE
)
# Hierarchical k-means with 4 clusters; the metric name is spelled out in
# full instead of relying on partial matching of "euclid".
km.res1 <- hkmeans(df, 4, hc.metric = "euclidean", hc.method = "ward.D")
fviz_dend(
  km.res1,
  cex = 0.6, palette = "Dark2",
  rect = TRUE, rect_border = "Dark2", rect_fill = TRUE, horiz = TRUE
)
# Visualize the pairwise distance matrix.
fviz_dist(
  dista,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")
)
# Switch to the fourth trait column ("Pod Field"), kept as a one-column
# data.frame so the clone row names are preserved, then print it.
dat <- dados[4]
kable(dat)
| | Pod Field |
|---|---|
| AMAZON 15.15 | 1.723369 |
| AMAZON 2.1 | 1.900000 |
| APA 5 | 2.918904 |
| CAB 4 | 2.104757 |
| CAB 5003.23 | 1.466288 |
| CAB 5046.140 | 6.892750 |
| CCN 51 | 5.754129 |
| CEPEC 1008 | 4.869292 |
| CEPEC 38 | 4.202380 |
| CEPEC 40 | 1.445683 |
| CEPEC 42 | 3.495712 |
| CEPEC 44 | 1.539480 |
| CEPEC 75 | 1.624808 |
| CEPEC 82 | 2.424871 |
| CEPEC 84 | 1.876166 |
| CEPEC 89 | 2.391652 |
| CEPEC 92 | 3.224903 |
| CEPEC 93 | 1.288410 |
| CHUAO 120 | 6.527634 |
| CJ 10 | 1.606238 |
| CSUL 4 | 2.922328 |
| CSUL 5 | 5.498182 |
| EET 272 | 1.926136 |
| EET 390 | 2.083267 |
| EET 392 | 4.435087 |
| EET 62 | 3.896152 |
| ICS 100 | 5.324472 |
| ICS 78 | 7.615773 |
| ICS 95 | 4.369211 |
| IMC 2 | 3.018278 |
| IMC 23 | 2.118962 |
| IMC 51 | 1.407125 |
| MA 12 | 2.670206 |
| MOCORONGO 1 | 2.875761 |
| MOCORONGO 2 | 2.600000 |
| MOQ 417 | 1.260952 |
| MOQ 647 | 1.268858 |
| NA 312 | 3.917908 |
| NA 33 | 2.090454 |
| OC 77 | 3.917908 |
| PA 120 | 1.228821 |
| PA 148 | 3.938274 |
| PA 15 | 4.701064 |
| PA 150 | 4.282523 |
| PA 169 | 2.477902 |
| PA 285 | 3.056141 |
| PA 294 | 2.477902 |
| PA 30 | 3.364521 |
| PA 44 | 2.100000 |
| PA 51 | 1.720465 |
| PA 70 | 2.095233 |
| PA 88 | 2.306513 |
| RB 31 | 2.170253 |
| RB 32 | 2.233831 |
| RB 33 | 2.034699 |
| RB 39 | 3.430743 |
| RIM 117 | 7.009280 |
| SCAVINA 6 | 3.424909 |
| SIAL 164 | 4.846648 |
| SIAL 505 | 7.345747 |
| SIAL 542 | 4.316248 |
| SIC 19 | 4.771792 |
| SIC 23 | 4.941660 |
| SIC 842 | 5.336666 |
| SIC 864 | 5.502727 |
| SIC 891 | 5.582114 |
| SPA 12 | 2.760435 |
| SPA 5 | 4.760252 |
| TSA 516 | 3.856164 |
| TSA 644 | 2.092845 |
| TSA 654 | 1.989975 |
| UF 36 | 1.311488 |
| UF 667 | 5.486347 |
Dimensionamento e padronização
# Standardize the selected trait (centered and scaled, the scale() defaults
# written out explicitly) and print the result as a table.
df <- scale(dat, center = TRUE, scale = TRUE)
kable(df)
| | Pod Field |
|---|---|
| AMAZON 15.15 | -0.9692307 |
| AMAZON 2.1 | -0.8629994 |
| APA 5 | -0.2502001 |
| CAB 4 | -0.7398527 |
| CAB 5003.23 | -1.1238469 |
| CAB 5046.140 | 2.1397897 |
| CCN 51 | 1.4549891 |
| CEPEC 1008 | 0.9228214 |
| CEPEC 38 | 0.5217210 |
| CEPEC 40 | -1.1362391 |
| CEPEC 42 | 0.0967094 |
| CEPEC 44 | -1.0798266 |
| CEPEC 75 | -1.0285083 |
| CEPEC 82 | -0.5473262 |
| CEPEC 84 | -0.8773337 |
| CEPEC 89 | -0.5673051 |
| CEPEC 92 | -0.0661630 |
| CEPEC 93 | -1.2308280 |
| CHUAO 120 | 1.9201979 |
| CJ 10 | -1.0396767 |
| CSUL 4 | -0.2481408 |
| CSUL 5 | 1.3010546 |
| EET 272 | -0.8472804 |
| EET 390 | -0.7527773 |
| EET 392 | 0.6616780 |
| EET 62 | 0.3375461 |
| ICS 100 | 1.1965804 |
| ICS 78 | 2.5746376 |
| ICS 95 | 0.6220577 |
| IMC 2 | -0.1904337 |
| IMC 23 | -0.7313091 |
| IMC 51 | -1.1594293 |
| MA 12 | -0.3997745 |
| MOCORONGO 1 | -0.2761477 |
| MOCORONGO 2 | -0.4419984 |
| MOQ 417 | -1.2473420 |
| MOQ 647 | -1.2425872 |
| NA 312 | 0.3506306 |
| NA 33 | -0.7484543 |
| OC 77 | 0.3506306 |
| PA 120 | -1.2666668 |
| PA 148 | 0.3628794 |
| PA 15 | 0.8216441 |
| PA 150 | 0.5699210 |
| PA 169 | -0.5154316 |
| PA 285 | -0.1676614 |
| PA 294 | -0.5154316 |
| PA 30 | 0.0178072 |
| PA 44 | -0.7427134 |
| PA 51 | -0.9709771 |
| PA 70 | -0.7455806 |
| PA 88 | -0.6185106 |
| RB 31 | -0.7004609 |
| RB 32 | -0.6622236 |
| RB 33 | -0.7819874 |
| RB 39 | 0.0576355 |
| RIM 117 | 2.2098742 |
| SCAVINA 6 | 0.0541264 |
| SIAL 164 | 0.9092031 |
| SIAL 505 | 2.4122358 |
| SIAL 542 | 0.5902047 |
| SIC 19 | 0.8641823 |
| SIC 23 | 0.9663457 |
| SIC 842 | 1.2039141 |
| SIC 864 | 1.3037882 |
| SIC 891 | 1.3515343 |
| SPA 12 | -0.3455082 |
| SPA 5 | 0.8572418 |
| TSA 516 | 0.3134961 |
| TSA 644 | -0.7470167 |
| TSA 654 | -0.8088858 |
| UF 36 | -1.2169483 |
| UF 667 | 1.2939368 |
Número ótimo de clusters
# Ask NbClust for the best number of clusters, searching 2 to 10, on the
# standardized trait with Euclidean distances and the "ward.D" method.
nbclust_out <- NbClust(
  data = df,
  distance = "euclidean",
  method = "ward.D",
  min.nc = 2,
  max.nc = 10
)
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 1 proposed 3 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 2 proposed 6 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 6
##
##
## *******************************************************************
# Collect the number of clusters proposed by each NbClust index
# (first row of Best.nc).
nbclust_plot <- data.frame(clusters = nbclust_out$Best.nc[1, ])
# Keep only proposals between 2 and 10 clusters.
nbclust_plot <- nbclust_plot[
  which(nbclust_plot$clusters >= 2 & nbclust_plot$clusters <= 10), ,
  drop = FALSE
]
# Histogram of how many indices proposed each number of clusters.
ggplot(nbclust_plot, aes(x = clusters)) +
  geom_histogram(bins = 30L, fill = "#0c4c8a") +
  labs(
    x = "Number of clusters",
    y = "Frequency among all indices",
    title = "Optimal number of clusters"
  ) +
  theme_minimal()
Dendrograma
# Hierarchical clustering ("ward.D") on Euclidean distances of the
# standardized trait.
dista <- dist(df, method = "euclidean")
dista.hc <- hclust(d = dista, method = "ward.D")
# Dendrogram cut into the 6 clusters chosen by the NbClust majority rule.
fviz_dend(dista.hc, cex = 0.5, k = 6, color_labels_by_k = TRUE, horiz = TRUE)
# Same cut via hcut() on the unscaled trait; stand = TRUE standardizes it.
# NOTE(review): hcut() is left on its default linkage here, while the other
# fits request "ward.D" explicitly -- confirm this difference is intended.
res <- hcut(dat, k = 6, stand = TRUE)
fviz_dend(
  res,
  rect = TRUE, cex = 0.5,
  k_colors = "Dark2", horiz = TRUE
)
# Hierarchical k-means with 6 clusters; the metric name is spelled out in
# full instead of relying on partial matching of "euclid".
km.res1 <- hkmeans(df, 6, hc.metric = "euclidean", hc.method = "ward.D")
fviz_dend(
  km.res1,
  cex = 0.6, palette = "Dark2",
  rect = TRUE, rect_border = "Dark2", rect_fill = TRUE, horiz = TRUE
)
# Visualize the pairwise distance matrix.
fviz_dist(
  dista,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")
)