Análise de cluster
Pacotes
library(FactoMineR)
library(factoextra)
library(cluster)
library(clustertend)
library(seriation)
library(fpc)
library(caret)
library(rgl)
library(knitr)
library(tidyverse)
library(phylogram)
library(dendextend)
Resumo estatístico
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
plot(iris, col = iris$Species)

Species <- as.integer(iris$Species)
plot3d(iris,col=Species, type = "s",radius=0.2, main="Iris Dataset")
K-Means
Verificação de distribuição aleatória
hopkins(iris[,-5], n = nrow(iris)-1)
## $H
## [1] 0.1716086
iris2 <- iris[,-5]
# Hopkins Statisc = chance das observações estarem distribuidas de forma uniforme
# Valores baixos representam distribuições não-randômicos
Exploração inicial
iris2 <- as.data.frame(scale(iris2))
# Mapa de Calor
Iris_d_euclidian <- dist(iris2, method = "euclidean")
Iris_d_euclidian <- dist(iris2, method = "euclidean")
heatmap(as.matrix(Iris_d_euclidian), Rowv = NA, Colv=NA)

Cálculo da quantidade de clusters
fviz_nbclust(iris2, kmeans, method = "gap_stat")

fviz_nbclust(iris2, kmeans, method = "silhouette")

fviz_nbclust(iris2, kmeans, method = "wss")

Geração do Kmeans
set.seed(1, sample.kind = "Rounding")
kmeans <- kmeans(iris2, center = 3)
kmeans
## K-means clustering with 3 clusters of sizes 50, 53, 47
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 -1.01119138 0.85041372 -1.3006301 -1.2507035
## 2 -0.05005221 -0.88042696 0.3465767 0.2805873
## 3 1.13217737 0.08812645 0.9928284 1.0141287
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2 2 3 2 2 2
## [75] 2 3 3 3 2 2 2 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3
## [112] 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 3 3 3 3 3 3 2 2 3 3 3 2 3 3 3 2 3 3 3 2 3
## [149] 3 2
##
## Within cluster sum of squares by cluster:
## [1] 47.35062 44.08754 47.45019
## (between_SS / total_SS = 76.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
kmeans$cluster
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2 2 3 2 2 2
## [75] 2 3 3 3 2 2 2 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3
## [112] 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 3 3 3 3 3 3 2 2 3 3 3 2 3 3 3 2 3 3 3 2 3
## [149] 3 2
kmeans$centers
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 -1.01119138 0.85041372 -1.3006301 -1.2507035
## 2 -0.05005221 -0.88042696 0.3465767 0.2805873
## 3 1.13217737 0.08812645 0.9928284 1.0141287
kmeans$size
## [1] 50 53 47
iris2$Cluster <- kmeans$cluster
plot(iris2, col = kmeans$cluster)

Visualização do Kmeans
fviz_cluster(kmeans, geom = c("point"), ellipse.type = "norm", ggtheme = theme_bw(), data = iris2)

iris$Cluster <- kmeans$cluster
Verificação da precisão
tab <- table(iris$Species, iris$Cluster)
tab
##
## 1 2 3
## setosa 50 0 0
## versicolor 0 39 11
## virginica 0 14 36
plotcluster(iris[1:4], iris$Species)

clusplot(iris[1:4], iris$Cluster)

plot(iris[,3:4], col = iris$Species, pch = iris$Cluster)

iris$Cluster <- factor(iris$Cluster, levels = c(1,2,3),
labels = c("setosa", "versicolor", "virginica"))
confusionMatrix(iris$Species, iris$Cluster)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 39 11
## virginica 0 14 36
##
## Overall Statistics
##
## Accuracy : 0.8333
## 95% CI : (0.7639, 0.8891)
## No Information Rate : 0.3533
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.75
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.7358 0.7660
## Specificity 1.0000 0.8866 0.8641
## Pos Pred Value 1.0000 0.7800 0.7200
## Neg Pred Value 1.0000 0.8600 0.8900
## Prevalence 0.3333 0.3533 0.3133
## Detection Rate 0.3333 0.2600 0.2400
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 0.8112 0.8150
Cluster Hierárquico
Verificação da quantidade de clusters
data(iris)
fviz_nbclust(iris[,1:4], hcut ,method = "gap_stat")

fviz_nbclust(iris, hcut, method = "silhouette")

fviz_nbclust(iris, hcut, method = "wss")

Cluster Hierárquico WardD2
iris3 <- as.data.frame(scale(iris[,-5]))
grupo <- hclust(dist(iris3)^2, method = "ward.D") # cluster hierárquico com distância euclidiana² e método ward
Visualização
plot(grupo, labels = FALSE) # plotar o dendograma
grps <- cutree(grupo, k=3) # Dividir a base de acordo com a classificação do cluster
grps
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 2 1 1 1 1 1 1 1 1 3 3 3 2 3 2 3 2 3 2 2 3 2 3 2 3 2 2 2 2 3 3 3 3
## [75] 3 3 3 3 3 2 2 2 2 3 2 3 3 2 2 2 2 3 2 2 2 2 2 3 2 2 3 3 3 3 3 3 2 3 3 3 3
## [112] 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [149] 3 3
rect.hclust(grupo, k = 3, border = 2:5) #colorir o dendograma

colors = c("red", "blue", "green")
clust3 = cutree(grupo, 3)
plot(as.phylo(grupo), type = "fan", tip.color = colors[clust3], label.offset = 1, cex = 1) # Filograma colorido

plot(as.phylo(grupo), type = "fan", label.offset = 0.1, no.margin = TRUE, cex = 0.5, show.tip.label = TRUE)

iris3$grps <- grps # Agrupar a variável no dataframe
plot(as.phylo(grupo), type = "unrooted", cex = 0.6, no.margin = TRUE) # Plotar como árvore

Extendendo as funções do dendograma
g1 <- fviz_dend(grupo, show_labels = F, rect = T, ggtheme = theme_bw(), main = "Cluster Hierárquico")
g1

g2 <- fviz_dend(grupo, show_labels = F, rect = T,
k = 3,
ggtheme = theme_bw(),
main = "Cluster Hierárquico")
g2

Cálculo da precisão
table(iris$Species, iris3$grps)
##
## 1 2 3
## setosa 49 1 0
## versicolor 0 27 23
## virginica 0 2 48
iris3$grps <- factor(iris3$grps, levels = c(1,2,3),
labels = c("setosa", "versicolor", "virginica"))
tab <- table(iris$Species, iris3$grps)
confusionMatrix(iris$Species, iris3$grps)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 49 1 0
## versicolor 0 27 23
## virginica 0 2 48
##
## Overall Statistics
##
## Accuracy : 0.8267
## 95% CI : (0.7564, 0.8835)
## No Information Rate : 0.4733
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.74
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 0.6761
## Specificity 0.9901 0.8083 0.9747
## Pos Pred Value 0.9800 0.5400 0.9600
## Neg Pred Value 1.0000 0.9700 0.7700
## Prevalence 0.3267 0.2000 0.4733
## Detection Rate 0.3267 0.1800 0.3200
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 0.9950 0.8542 0.8254
Mudar o método de ligação para “complete” e o cálculo de distância para “euclidiana”
iris4 <- as.data.frame(scale(iris[,-5]))
complete <- hclust(dist(iris4, method = "euclidean"), method = "complete")
completeCluster <- cutree(complete, k=3)
iris4$completeCluster <- completeCluster
iris4$completeCluster <- factor(iris4$completeCluster, levels = c(1,2,3),
labels = c("setosa", "versicolor", "virginica"))
confusionMatrix(iris$Species, iris4$completeCluster)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 49 1 0
## versicolor 0 21 29
## virginica 0 2 48
##
## Overall Statistics
##
## Accuracy : 0.7867
## 95% CI : (0.7124, 0.8493)
## No Information Rate : 0.5133
## P-Value [Acc > NIR] : 4.236e-12
##
## Kappa : 0.68
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.8750 0.6234
## Specificity 0.9901 0.7698 0.9726
## Pos Pred Value 0.9800 0.4200 0.9600
## Neg Pred Value 1.0000 0.9700 0.7100
## Prevalence 0.3267 0.1600 0.5133
## Detection Rate 0.3267 0.1400 0.3200
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 0.9950 0.8224 0.7980
Mudar o método de ligação para “Single” e o cálculo de distância para “euclidiana”
iris5 <- as.data.frame(scale(iris[,-5]))
single <- hclust(dist(iris5, method = "euclidean"), method = "single")
singleCluster <- cutree(single, k=3)
iris5$singleCluster <- singleCluster
iris5$singleCluster <- factor(iris5$singleCluster, levels = c(1,2,3),
labels = c("setosa", "versicolor", "virginica"))
confusionMatrix(iris$Species, iris5$singleCluster)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 49 1 0
## versicolor 0 0 50
## virginica 0 0 50
##
## Overall Statistics
##
## Accuracy : 0.66
## 95% CI : (0.5783, 0.7353)
## No Information Rate : 0.6667
## P-Value [Acc > NIR] : 0.6059
##
## Kappa : 0.49
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.000000 0.5000
## Specificity 0.9901 0.664430 1.0000
## Pos Pred Value 0.9800 0.000000 1.0000
## Neg Pred Value 1.0000 0.990000 0.5000
## Prevalence 0.3267 0.006667 0.6667
## Detection Rate 0.3267 0.000000 0.3333
## Detection Prevalence 0.3333 0.333333 0.3333
## Balanced Accuracy 0.9950 0.332215 0.7500
Comparação Final
Species <- as.integer(iris$Species)
mfrow3d(nr = 2, nc = 2, sharedMouse =T)
plot3d((iris),col=Species, type = "s",radius=0.1, main="Original")
plot3d((iris3),col=grps, type = "s",radius=0.1, main="WardD2")
plot3d((iris4),col=completeCluster, type = "s",radius=0.1, main="Complete")
plot3d((iris5),col=singleCluster, type = "s",radius=0.1, main="Single")