1 Análise de cluster

2 Pacotes

library(FactoMineR)
library(factoextra)
library(cluster)
library(clustertend)
library(seriation)
library(fpc)
library(caret)
library(rgl)
library(knitr)
library(tidyverse)
library(phylogram)
library(dendextend)

3 Resumo estatístico

data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
plot(iris, col = iris$Species)

Species <- as.integer(iris$Species)
plot3d(iris,col=Species, type = "s",radius=0.2, main="Iris Dataset")

4 K-Means

4.1 Verificação de distribuição aleatória

hopkins(iris[,-5], n = nrow(iris)-1)
## $H
## [1] 0.1716086
iris2 <- iris[,-5]

# Hopkins Statisc = chance das observações estarem distribuidas de forma uniforme
# Valores baixos representam distribuições não-randômicos

4.2 Exploração inicial

iris2 <- as.data.frame(scale(iris2))

# Mapa de Calor
Iris_d_euclidian <- dist(iris2, method = "euclidean")
Iris_d_euclidian <- dist(iris2, method = "euclidean")
heatmap(as.matrix(Iris_d_euclidian), Rowv = NA, Colv=NA)

5 Cálculo da quantidade de clusters

fviz_nbclust(iris2, kmeans, method = "gap_stat")

fviz_nbclust(iris2, kmeans, method = "silhouette")

fviz_nbclust(iris2, kmeans, method = "wss")

5.1 Geração do Kmeans

set.seed(1, sample.kind = "Rounding")
kmeans <- kmeans(iris2, center = 3)
kmeans
## K-means clustering with 3 clusters of sizes 50, 53, 47
## 
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1  -1.01119138  0.85041372   -1.3006301  -1.2507035
## 2  -0.05005221 -0.88042696    0.3465767   0.2805873
## 3   1.13217737  0.08812645    0.9928284   1.0141287
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2 2 3 2 2 2
##  [75] 2 3 3 3 2 2 2 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3
## [112] 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 3 3 3 3 3 3 2 2 3 3 3 2 3 3 3 2 3 3 3 2 3
## [149] 3 2
## 
## Within cluster sum of squares by cluster:
## [1] 47.35062 44.08754 47.45019
##  (between_SS / total_SS =  76.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
kmeans$cluster
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2 2 3 2 2 2
##  [75] 2 3 3 3 2 2 2 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3
## [112] 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 3 3 3 3 3 3 2 2 3 3 3 2 3 3 3 2 3 3 3 2 3
## [149] 3 2
kmeans$centers
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1  -1.01119138  0.85041372   -1.3006301  -1.2507035
## 2  -0.05005221 -0.88042696    0.3465767   0.2805873
## 3   1.13217737  0.08812645    0.9928284   1.0141287
kmeans$size
## [1] 50 53 47
iris2$Cluster <- kmeans$cluster
plot(iris2, col = kmeans$cluster)

5.1.1 Visualização do Kmeans

fviz_cluster(kmeans, geom = c("point"), ellipse.type = "norm", ggtheme = theme_bw(), data = iris2)

iris$Cluster <- kmeans$cluster

5.1.2 Verificação da precisão

tab <- table(iris$Species, iris$Cluster)
tab
##             
##               1  2  3
##   setosa     50  0  0
##   versicolor  0 39 11
##   virginica   0 14 36
plotcluster(iris[1:4], iris$Species)

clusplot(iris[1:4], iris$Cluster)

plot(iris[,3:4], col = iris$Species, pch = iris$Cluster)

iris$Cluster <- factor(iris$Cluster, levels = c(1,2,3),
                       labels = c("setosa", "versicolor", "virginica"))

confusionMatrix(iris$Species, iris$Cluster)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         39        11
##   virginica       0         14        36
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8333          
##                  95% CI : (0.7639, 0.8891)
##     No Information Rate : 0.3533          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.75            
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.7358           0.7660
## Specificity                 1.0000            0.8866           0.8641
## Pos Pred Value              1.0000            0.7800           0.7200
## Neg Pred Value              1.0000            0.8600           0.8900
## Prevalence                  0.3333            0.3533           0.3133
## Detection Rate              0.3333            0.2600           0.2400
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.8112           0.8150

6 Cluster Hierárquico

6.1 Verificação da quantidade de clusters

data(iris)
fviz_nbclust(iris[,1:4], hcut ,method = "gap_stat")

fviz_nbclust(iris, hcut, method = "silhouette")

fviz_nbclust(iris, hcut, method = "wss")

6.2 Cluster Hierárquico WardD2

iris3 <- as.data.frame(scale(iris[,-5]))

grupo <- hclust(dist(iris3)^2, method = "ward.D") # cluster hierárquico com distância euclidiana² e método ward

6.2.1 Visualização

plot(grupo, labels = FALSE) # plotar o dendograma
grps <- cutree(grupo, k=3) # Dividir a base de acordo com a classificação do cluster
grps
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 2 1 1 1 1 1 1 1 1 3 3 3 2 3 2 3 2 3 2 2 3 2 3 2 3 2 2 2 2 3 3 3 3
##  [75] 3 3 3 3 3 2 2 2 2 3 2 3 3 2 2 2 2 3 2 2 2 2 2 3 2 2 3 3 3 3 3 3 2 3 3 3 3
## [112] 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [149] 3 3
rect.hclust(grupo, k = 3, border = 2:5) #colorir o dendograma

colors = c("red", "blue", "green")
clust3 = cutree(grupo, 3)
plot(as.phylo(grupo), type = "fan", tip.color = colors[clust3], label.offset = 1, cex = 1) # Filograma colorido

plot(as.phylo(grupo), type = "fan", label.offset = 0.1, no.margin = TRUE, cex = 0.5, show.tip.label = TRUE)

iris3$grps <- grps # Agrupar a variável no dataframe
plot(as.phylo(grupo), type = "unrooted", cex = 0.6, no.margin = TRUE) # Plotar como árvore

6.2.2 Extendendo as funções do dendograma

g1 <- fviz_dend(grupo, show_labels = F, rect = T, ggtheme = theme_bw(), main = "Cluster Hierárquico")
g1

g2 <- fviz_dend(grupo, show_labels = F, rect = T,
                k = 3,
                ggtheme = theme_bw(),
                main = "Cluster Hierárquico")
g2

6.2.3 Cálculo da precisão

table(iris$Species, iris3$grps)
##             
##               1  2  3
##   setosa     49  1  0
##   versicolor  0 27 23
##   virginica   0  2 48
iris3$grps <- factor(iris3$grps, levels = c(1,2,3),
                    labels = c("setosa", "versicolor", "virginica"))

tab <- table(iris$Species, iris3$grps)
confusionMatrix(iris$Species, iris3$grps)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         49          1         0
##   versicolor      0         27        23
##   virginica       0          2        48
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8267          
##                  95% CI : (0.7564, 0.8835)
##     No Information Rate : 0.4733          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.74            
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9000           0.6761
## Specificity                 0.9901            0.8083           0.9747
## Pos Pred Value              0.9800            0.5400           0.9600
## Neg Pred Value              1.0000            0.9700           0.7700
## Prevalence                  0.3267            0.2000           0.4733
## Detection Rate              0.3267            0.1800           0.3200
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           0.9950            0.8542           0.8254

6.3 Mudar o método de ligação para “complete” e o cálculo de distância para “euclidiana”

iris4 <- as.data.frame(scale(iris[,-5]))
complete <- hclust(dist(iris4, method = "euclidean"), method = "complete")
completeCluster <- cutree(complete, k=3)
iris4$completeCluster <- completeCluster
iris4$completeCluster <- factor(iris4$completeCluster, levels = c(1,2,3),
                             labels = c("setosa", "versicolor", "virginica"))
confusionMatrix(iris$Species, iris4$completeCluster)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         49          1         0
##   versicolor      0         21        29
##   virginica       0          2        48
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7867          
##                  95% CI : (0.7124, 0.8493)
##     No Information Rate : 0.5133          
##     P-Value [Acc > NIR] : 4.236e-12       
##                                           
##                   Kappa : 0.68            
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.8750           0.6234
## Specificity                 0.9901            0.7698           0.9726
## Pos Pred Value              0.9800            0.4200           0.9600
## Neg Pred Value              1.0000            0.9700           0.7100
## Prevalence                  0.3267            0.1600           0.5133
## Detection Rate              0.3267            0.1400           0.3200
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           0.9950            0.8224           0.7980

6.4 Mudar o método de ligação para “Single” e o cálculo de distância para “euclidiana”

iris5 <- as.data.frame(scale(iris[,-5]))
single <- hclust(dist(iris5, method = "euclidean"), method = "single")
singleCluster <- cutree(single, k=3)
iris5$singleCluster <- singleCluster
iris5$singleCluster <- factor(iris5$singleCluster, levels = c(1,2,3),
                    labels = c("setosa", "versicolor", "virginica"))
confusionMatrix(iris$Species, iris5$singleCluster)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         49          1         0
##   versicolor      0          0        50
##   virginica       0          0        50
## 
## Overall Statistics
##                                           
##                Accuracy : 0.66            
##                  95% CI : (0.5783, 0.7353)
##     No Information Rate : 0.6667          
##     P-Value [Acc > NIR] : 0.6059          
##                                           
##                   Kappa : 0.49            
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000          0.000000           0.5000
## Specificity                 0.9901          0.664430           1.0000
## Pos Pred Value              0.9800          0.000000           1.0000
## Neg Pred Value              1.0000          0.990000           0.5000
## Prevalence                  0.3267          0.006667           0.6667
## Detection Rate              0.3267          0.000000           0.3333
## Detection Prevalence        0.3333          0.333333           0.3333
## Balanced Accuracy           0.9950          0.332215           0.7500

6.5 Comparação Final

Species <- as.integer(iris$Species)

mfrow3d(nr = 2, nc = 2, sharedMouse =T) 
plot3d((iris),col=Species, type = "s",radius=0.1, main="Original")  
plot3d((iris3),col=grps, type = "s",radius=0.1, main="WardD2")  
plot3d((iris4),col=completeCluster, type = "s",radius=0.1, main="Complete")  
plot3d((iris5),col=singleCluster, type = "s",radius=0.1, main="Single")