# Kmeans Clustering of Wine Dataset
library(rattle)
## Warning: package 'rattle' was built under R version 3.5.3
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(NbClust)
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
# import the data set
data(wine, package = "rattle")
head(wine) #display the first few obs
## Type Alcohol Malic Ash Alcalinity Magnesium Phenols Flavanoids
## 1 1 14.23 1.71 2.43 15.6 127 2.80 3.06
## 2 1 13.20 1.78 2.14 11.2 100 2.65 2.76
## 3 1 13.16 2.36 2.67 18.6 101 2.80 3.24
## 4 1 14.37 1.95 2.50 16.8 113 3.85 3.49
## 5 1 13.24 2.59 2.87 21.0 118 2.80 2.69
## 6 1 14.20 1.76 2.45 15.2 112 3.27 3.39
## Nonflavanoids Proanthocyanins Color Hue Dilution Proline
## 1 0.28 2.29 5.64 1.04 3.92 1065
## 2 0.26 1.28 4.38 1.05 3.40 1050
## 3 0.30 2.81 5.68 1.03 3.17 1185
## 4 0.24 2.18 7.80 0.86 3.45 1480
## 5 0.39 1.82 4.32 1.04 2.93 735
## 6 0.34 1.97 6.75 1.05 2.85 1450
View(wine)
str(wine)
## 'data.frame': 178 obs. of 14 variables:
## $ Type : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Alcalinity : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoids : num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins: num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ Dilution : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
# Finding the optimum no of clusters
df <- wine[-1]
df <- scale(df)
library(NbClust)
NbClust(data = df,
min.nc = 2,
max.nc = 10,
method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 2 as the best number of clusters
## * 15 proposed 3 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 2 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW
## 2 1.4106 69.5233 52.4491 0.4177 295.3215 5.971798e+25 26438.125
## 3 6.4010 70.9400 14.4149 4.1548 700.1066 1.382516e+25 13582.745
## 4 1.9126 55.6739 9.4839 3.4985 787.9212 1.500696e+25 10243.388
## 5 0.5185 46.1356 11.6586 2.8745 894.8302 1.286086e+25 9571.880
## 6 6.1979 41.4863 5.2105 3.3736 999.2386 1.030127e+25 8216.166
## 7 0.1842 36.2755 9.5754 2.4787 1088.4634 8.493539e+24 7677.550
## 8 0.6580 34.0023 12.6941 2.9948 1196.8913 6.032872e+24 7172.766
## 9 3.0410 33.3630 5.9473 4.5444 1310.6234 4.030316e+24 5784.726
## 10 44.2547 31.1749 2.5530 4.3861 1418.6451 2.712043e+24 5636.409
## TraceW Friedman Rubin Cindex DB Silhouette Duda Pseudot2
## 2 1649.4400 13.7691 1.3950 0.3390 1.5893 0.2593 1.2343 -23.7320
## 3 1270.7491 24.0761 1.8107 0.3373 1.4681 0.2849 1.6385 -41.3059
## 4 1174.0421 24.5845 1.9599 0.3191 1.8694 0.2442 1.1891 -6.2015
## 5 1113.3583 26.4921 2.0667 0.3032 1.7512 0.2178 1.3683 -20.4566
## 6 1043.0655 29.3567 2.2060 0.2948 1.9512 0.1671 1.6770 -27.8544
## 7 1012.3963 32.1381 2.2728 0.2917 1.7819 0.1655 1.4032 -16.6660
## 8 958.7118 34.4077 2.4001 0.3101 1.8307 0.1548 1.6495 -22.8377
## 9 892.0978 35.8595 2.5793 0.3631 1.6558 0.1683 1.0402 -1.8151
## 10 861.7709 38.9197 2.6701 0.3590 1.7214 0.1447 1.7951 -16.3888
## Beale Ratkowsky Ball Ptbiserial Frey McClain Dunn Hubert
## 2 -1.6675 0.3393 824.7200 0.5231 0.2481 0.7411 0.1357 0.0011
## 3 -3.4061 0.3748 423.5830 0.6154 0.6290 1.3219 0.2323 0.0014
## 4 -1.3675 0.3440 293.5105 0.5934 0.2848 1.8345 0.1635 0.0014
## 5 -2.3492 0.3166 222.6717 0.5961 1.8234 2.0591 0.1730 0.0015
## 6 -3.5076 0.2981 173.8442 0.5321 0.4705 2.7949 0.1635 0.0016
## 7 -2.4954 0.2797 144.6280 0.5278 0.3552 2.9053 0.1635 0.0016
## 8 -3.4116 0.2677 119.8390 0.5121 0.0041 3.4582 0.1813 0.0016
## 9 -0.3348 0.2596 99.1220 0.5204 1.8778 3.5093 0.2121 0.0017
## 10 -3.7904 0.2486 86.1771 0.4733 -0.0416 4.3888 0.2143 0.0018
## SDindex Dindex SDbw
## 2 1.3503 2.9230 0.7915
## 3 1.1492 2.5256 0.6173
## 4 1.3466 2.4355 0.5913
## 5 1.2478 2.3879 0.6136
## 6 1.2872 2.3032 0.5403
## 7 1.2491 2.2661 0.4701
## 8 1.2758 2.2098 0.4831
## 9 1.2969 2.1502 0.4553
## 10 1.5124 2.1077 0.4462
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.8207 27.3041 1
## 3 0.7967 27.0529 1
## 4 0.7362 13.9763 1
## 5 0.7900 20.1967 1
## 6 0.7718 20.3978 1
## 7 0.7699 17.3393 1
## 8 0.7612 18.1937 1
## 9 0.7635 14.5593 1
## 10 0.7213 14.2932 1
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot
## Number_clusters 10.0000 3.00 3.0000 9.0000 3.0000 3.000000e+00
## Value_Index 44.2547 70.94 38.0341 4.5444 404.7851 4.707462e+25
## TrCovW TraceW Friedman Rubin Cindex DB
## Number_clusters 3.00 3.0000 3.000 3.0000 7.0000 3.0000
## Value_Index 12855.38 281.9839 10.307 -0.2666 0.2917 1.4681
## Silhouette Duda PseudoT2 Beale Ratkowsky Ball
## Number_clusters 3.0000 2.0000 2.000 2.0000 3.0000 3.000
## Value_Index 0.2849 1.2343 -23.732 -1.6675 0.3748 401.137
## PtBiserial Frey McClain Dunn Hubert SDindex Dindex
## Number_clusters 3.0000 1 2.0000 3.0000 0 3.0000 0
## Value_Index 0.6154 NA 0.7411 0.2323 0 1.1492 0
## SDbw
## Number_clusters 10.0000
## Value_Index 0.4462
##
## $Best.partition
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 2 3 3 3 3 3 3 3 3
## [71] 3 3 3 1 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 1 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [176] 2 2 2
library(factoextra)
fviz_nbclust(x = df, FUNcluster = kmeans,method = "wss")

fviz_nbclust(x = df, FUNcluster = kmeans, method = "silhouette")

model <- kmeans(x = df,centers = 3,nstart = 25)
fviz_cluster(object = model, data = df)

aggregate(df,by=list(cluster = model$cluster),mean)
## cluster Alcohol Malic Ash Alcalinity Magnesium
## 1 1 -0.9234669 -0.3929331 -0.4931257 0.1701220 -0.49032869
## 2 2 0.8328826 -0.3029551 0.3636801 -0.6084749 0.57596208
## 3 3 0.1644436 0.8690954 0.1863726 0.5228924 -0.07526047
## Phenols Flavanoids Nonflavanoids Proanthocyanins Color
## 1 -0.07576891 0.02075402 -0.03343924 0.05810161 -0.8993770
## 2 0.88274724 0.97506900 -0.56050853 0.57865427 0.1705823
## 3 -0.97657548 -1.21182921 0.72402116 -0.77751312 0.9388902
## Hue Dilution Proline
## 1 0.4605046 0.2700025 -0.7517257
## 2 0.4726504 0.7770551 1.1220202
## 3 -1.1615122 -1.2887761 -0.4059428
wine$cluster <- model$cluster
t <- table(wine$Type, model$cluster)
library(flexclust)
## Loading required package: grid
## Loading required package: lattice
## Loading required package: modeltools
## Loading required package: stats4
randIndex(t)
## ARI
## 0.897495
model$cluster == 1
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [67] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [78] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [89] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [100] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [111] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
## [122] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [144] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [155] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [166] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [177] FALSE FALSE
cluster1 <- as.data.frame(wine[wine$cluster == 1,])
model$size
## [1] 65 62 51