Kmeans.R

# Kmeans Clustering of Wine Dataset


library(rattle)

## Warning: package 'rattle' was built under R version 3.5.3

## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

library(NbClust)
library(factoextra)

## Warning: package 'factoextra' was built under R version 3.5.3

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.5.3

## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ

# import the data set 

data(wine, package = "rattle")
head(wine) #display the first few obs

##   Type Alcohol Malic  Ash Alcalinity Magnesium Phenols Flavanoids
## 1    1   14.23  1.71 2.43       15.6       127    2.80       3.06
## 2    1   13.20  1.78 2.14       11.2       100    2.65       2.76
## 3    1   13.16  2.36 2.67       18.6       101    2.80       3.24
## 4    1   14.37  1.95 2.50       16.8       113    3.85       3.49
## 5    1   13.24  2.59 2.87       21.0       118    2.80       2.69
## 6    1   14.20  1.76 2.45       15.2       112    3.27       3.39
##   Nonflavanoids Proanthocyanins Color  Hue Dilution Proline
## 1          0.28            2.29  5.64 1.04     3.92    1065
## 2          0.26            1.28  4.38 1.05     3.40    1050
## 3          0.30            2.81  5.68 1.03     3.17    1185
## 4          0.24            2.18  7.80 0.86     3.45    1480
## 5          0.39            1.82  4.32 1.04     2.93     735
## 6          0.34            1.97  6.75 1.05     2.85    1450

View(wine)
str(wine)

## 'data.frame':    178 obs. of  14 variables:
##  $ Type           : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol        : num  14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic          : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash            : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Alcalinity     : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Magnesium      : int  127 100 101 113 118 112 96 121 97 98 ...
##  $ Phenols        : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids     : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoids  : num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanthocyanins: num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color          : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue            : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ Dilution       : num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline        : int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...

# Finding the optimum no of clusters 

df <- wine[-1]

df <- scale(df)

library(NbClust)
NbClust(data = df,
              min.nc = 2,
              max.nc = 10,
              method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 4 proposed 2 as the best number of clusters 
## * 15 proposed 3 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 9 as the best number of clusters 
## * 2 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

## $All.index
##         KL      CH Hartigan    CCC     Scott      Marriot    TrCovW
## 2   1.4106 69.5233  52.4491 0.4177  295.3215 5.971798e+25 26438.125
## 3   6.4010 70.9400  14.4149 4.1548  700.1066 1.382516e+25 13582.745
## 4   1.9126 55.6739   9.4839 3.4985  787.9212 1.500696e+25 10243.388
## 5   0.5185 46.1356  11.6586 2.8745  894.8302 1.286086e+25  9571.880
## 6   6.1979 41.4863   5.2105 3.3736  999.2386 1.030127e+25  8216.166
## 7   0.1842 36.2755   9.5754 2.4787 1088.4634 8.493539e+24  7677.550
## 8   0.6580 34.0023  12.6941 2.9948 1196.8913 6.032872e+24  7172.766
## 9   3.0410 33.3630   5.9473 4.5444 1310.6234 4.030316e+24  5784.726
## 10 44.2547 31.1749   2.5530 4.3861 1418.6451 2.712043e+24  5636.409
##       TraceW Friedman  Rubin Cindex     DB Silhouette   Duda Pseudot2
## 2  1649.4400  13.7691 1.3950 0.3390 1.5893     0.2593 1.2343 -23.7320
## 3  1270.7491  24.0761 1.8107 0.3373 1.4681     0.2849 1.6385 -41.3059
## 4  1174.0421  24.5845 1.9599 0.3191 1.8694     0.2442 1.1891  -6.2015
## 5  1113.3583  26.4921 2.0667 0.3032 1.7512     0.2178 1.3683 -20.4566
## 6  1043.0655  29.3567 2.2060 0.2948 1.9512     0.1671 1.6770 -27.8544
## 7  1012.3963  32.1381 2.2728 0.2917 1.7819     0.1655 1.4032 -16.6660
## 8   958.7118  34.4077 2.4001 0.3101 1.8307     0.1548 1.6495 -22.8377
## 9   892.0978  35.8595 2.5793 0.3631 1.6558     0.1683 1.0402  -1.8151
## 10  861.7709  38.9197 2.6701 0.3590 1.7214     0.1447 1.7951 -16.3888
##      Beale Ratkowsky     Ball Ptbiserial    Frey McClain   Dunn Hubert
## 2  -1.6675    0.3393 824.7200     0.5231  0.2481  0.7411 0.1357 0.0011
## 3  -3.4061    0.3748 423.5830     0.6154  0.6290  1.3219 0.2323 0.0014
## 4  -1.3675    0.3440 293.5105     0.5934  0.2848  1.8345 0.1635 0.0014
## 5  -2.3492    0.3166 222.6717     0.5961  1.8234  2.0591 0.1730 0.0015
## 6  -3.5076    0.2981 173.8442     0.5321  0.4705  2.7949 0.1635 0.0016
## 7  -2.4954    0.2797 144.6280     0.5278  0.3552  2.9053 0.1635 0.0016
## 8  -3.4116    0.2677 119.8390     0.5121  0.0041  3.4582 0.1813 0.0016
## 9  -0.3348    0.2596  99.1220     0.5204  1.8778  3.5093 0.2121 0.0017
## 10 -3.7904    0.2486  86.1771     0.4733 -0.0416  4.3888 0.2143 0.0018
##    SDindex Dindex   SDbw
## 2   1.3503 2.9230 0.7915
## 3   1.1492 2.5256 0.6173
## 4   1.3466 2.4355 0.5913
## 5   1.2478 2.3879 0.6136
## 6   1.2872 2.3032 0.5403
## 7   1.2491 2.2661 0.4701
## 8   1.2758 2.2098 0.4831
## 9   1.2969 2.1502 0.4553
## 10  1.5124 2.1077 0.4462
## 
## $All.CriticalValues
##    CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2          0.8207            27.3041            1
## 3          0.7967            27.0529            1
## 4          0.7362            13.9763            1
## 5          0.7900            20.1967            1
## 6          0.7718            20.3978            1
## 7          0.7699            17.3393            1
## 8          0.7612            18.1937            1
## 9          0.7635            14.5593            1
## 10         0.7213            14.2932            1
## 
## $Best.nc
##                      KL    CH Hartigan    CCC    Scott      Marriot
## Number_clusters 10.0000  3.00   3.0000 9.0000   3.0000 3.000000e+00
## Value_Index     44.2547 70.94  38.0341 4.5444 404.7851 4.707462e+25
##                   TrCovW   TraceW Friedman   Rubin Cindex     DB
## Number_clusters     3.00   3.0000    3.000  3.0000 7.0000 3.0000
## Value_Index     12855.38 281.9839   10.307 -0.2666 0.2917 1.4681
##                 Silhouette   Duda PseudoT2   Beale Ratkowsky    Ball
## Number_clusters     3.0000 2.0000    2.000  2.0000    3.0000   3.000
## Value_Index         0.2849 1.2343  -23.732 -1.6675    0.3748 401.137
##                 PtBiserial Frey McClain   Dunn Hubert SDindex Dindex
## Number_clusters     3.0000    1  2.0000 3.0000      0  3.0000      0
## Value_Index         0.6154   NA  0.7411 0.2323      0  1.1492      0
##                    SDbw
## Number_clusters 10.0000
## Value_Index      0.4462
## 
## $Best.partition
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 2 3 3 3 3 3 3 3 3
##  [71] 3 3 3 1 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 1 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [176] 2 2 2

library(factoextra)
fviz_nbclust(x = df, FUNcluster = kmeans,method = "wss")

fviz_nbclust(x = df, FUNcluster = kmeans, method = "silhouette")

model <- kmeans(x = df,centers = 3,nstart = 25)

fviz_cluster(object = model, data = df)

aggregate(df,by=list(cluster = model$cluster),mean)

##   cluster    Alcohol      Malic        Ash Alcalinity   Magnesium
## 1       1 -0.9234669 -0.3929331 -0.4931257  0.1701220 -0.49032869
## 2       2  0.8328826 -0.3029551  0.3636801 -0.6084749  0.57596208
## 3       3  0.1644436  0.8690954  0.1863726  0.5228924 -0.07526047
##       Phenols  Flavanoids Nonflavanoids Proanthocyanins      Color
## 1 -0.07576891  0.02075402   -0.03343924      0.05810161 -0.8993770
## 2  0.88274724  0.97506900   -0.56050853      0.57865427  0.1705823
## 3 -0.97657548 -1.21182921    0.72402116     -0.77751312  0.9388902
##          Hue   Dilution    Proline
## 1  0.4605046  0.2700025 -0.7517257
## 2  0.4726504  0.7770551  1.1220202
## 3 -1.1615122 -1.2887761 -0.4059428

wine$cluster <- model$cluster

t <- table(wine$Type, model$cluster)

library(flexclust)

## Loading required package: grid

## Loading required package: lattice

## Loading required package: modeltools

## Loading required package: stats4

randIndex(t)

##      ARI 
## 0.897495

model$cluster == 1

##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [56] FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
##  [67]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
##  [78]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
##  [89]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
## [100]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [111]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE
## [122] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [144] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [155] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [166] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [177] FALSE FALSE

cluster1 <- as.data.frame(wine[wine$cluster == 1,])
model$size

## [1] 65 62 51

Kmeans.R

adarsh

2019-10-11