# Uncomment and install below packages if not already available
# install.packages("funModeling")
# install.packages("cluster")
# Use libraries
library(cluster)
library(funModeling)

Check the structure of “ruspini” data.

# Compactly display the structure of ruspini (class, dimensions, column types).
str(ruspini)
## 'data.frame':    75 obs. of  2 variables:
##  $ x: int  4 5 10 9 13 13 12 15 18 19 ...
##  $ y: int  53 63 59 77 49 69 88 75 61 65 ...

Perform various explorations of the ruspini data.

# Per-variable summary: counts, missing/distinct values, mean and quantiles.
describe(ruspini)
## ruspini 
## 
##  2  Variables      75  Observations
## ---------------------------------------------------------------------------
## x 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       75        0       56        1    54.88     35.1     11.4     16.2 
##      .25      .50      .75      .90      .95 
##     31.5     52.0     76.5     99.0    108.6 
## 
## lowest :   4   5   9  10  12, highest: 108 110 111 115 117
## ---------------------------------------------------------------------------
## y 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       75        0       58        1    92.03    55.84     15.0     20.4 
##      .25      .50      .75      .90      .95 
##     56.5     96.0    141.5    149.6    152.3 
## 
## lowest :   4  12  13  15  16, highest: 152 153 154 155 156
## ---------------------------------------------------------------------------
# Data-quality overview per column: zero, NA and infinite counts,
# plus the column type and number of unique values.
df_status(ruspini)
##   variable q_zeros p_zeros q_na p_na q_inf p_inf    type unique
## 1        x       0       0    0    0     0     0 integer     56
## 2        y       0       0    0    0     0     0 integer     58
# Numeric profile per column: mean, standard deviation, variation coefficient,
# percentiles, skewness, kurtosis, IQR and central ranges.
profiling_num(ruspini)
##   variable     mean  std_dev variation_coef p_01 p_05 p_25 p_50  p_75
## 1        x 54.88000 30.50253      0.5558041 4.74 11.4 31.5   52  76.5
## 2        y 92.02667 48.70262      0.5292229 9.92 15.0 56.5   96 141.5
##    p_95   p_99   skewness kurtosis iqr       range_98      range_80
## 1 108.6 115.52  0.3201771 2.123066  45 [4.74, 115.52]    [16.2, 99]
## 2 152.3 155.26 -0.2742345 1.666166  85 [9.92, 155.26] [20.4, 149.6]
# Plot the numeric variables of ruspini (presumably one distribution plot per
# column; the rendered figure is not included in this transcript).
plot_num(ruspini)

Store the data as a data frame

# Keep a working copy of ruspini as a data frame and display every row.
ruspini_data <- as.data.frame(ruspini)
print(ruspini_data)
##      x   y
## 1    4  53
## 2    5  63
## 3   10  59
## 4    9  77
## 5   13  49
## 6   13  69
## 7   12  88
## 8   15  75
## 9   18  61
## 10  19  65
## 11  22  74
## 12  27  72
## 13  28  76
## 14  24  58
## 15  27  55
## 16  28  60
## 17  30  52
## 18  31  60
## 19  32  61
## 20  36  72
## 21  28 147
## 22  32 149
## 23  35 153
## 24  33 154
## 25  38 151
## 26  41 150
## 27  38 145
## 28  38 143
## 29  32 143
## 30  34 141
## 31  44 156
## 32  44 149
## 33  44 143
## 34  46 142
## 35  47 149
## 36  49 152
## 37  50 142
## 38  53 144
## 39  52 152
## 40  55 155
## 41  54 124
## 42  60 136
## 43  63 139
## 44  86 132
## 45  85 115
## 46  85  96
## 47  78  94
## 48  74  96
## 49  97 122
## 50  98 116
## 51  98 124
## 52  99 119
## 53  99 128
## 54 101 115
## 55 108 111
## 56 110 111
## 57 108 116
## 58 111 126
## 59 115 117
## 60 117 115
## 61  70   4
## 62  77  12
## 63  83  21
## 64  61  15
## 65  69  15
## 66  78  16
## 67  66  18
## 68  58  13
## 69  64  20
## 70  69  21
## 71  66  23
## 72  61  25
## 73  76  27
## 74  72  31
## 75  64  30

Create a numeric vector of length 15 to store the total within-cluster sum of squares (WSS) for each candidate number of clusters, k = 1 to 15. The upper limit of 15 is arbitrary; you could just as well start with 10, 20 or 25.

# Largest number of clusters to evaluate for the elbow plot.
max_k <- 15
# kmeans() picks its starting centres at random; fix the seed so the
# WSS values below can be reproduced from run to run.
set.seed(123)
# Preallocate one slot per candidate k for the total within-cluster SS.
wss <- numeric(max_k)
wss
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
for (k in seq_len(max_k)) {
  # tot.withinss is the sum of the per-cluster withinss values, so there is
  # no need to add them up by hand.
  wss[k] <- kmeans(ruspini_data, centers = k, nstart = 25)$tot.withinss
}
wss
##  [1] 244373.867  89337.832  51063.475  12881.051  10126.720   8575.407
##  [7]   7126.199   6149.639   5550.633   4446.282   3897.252   3501.088
## [13]   3159.283   2899.617   2673.267

As you can see above, we ran the K-Means algorithm for every number of clusters from 1 through 15 (15 being an arbitrary upper limit) and stored the WSS of each run on the ruspini data in the numeric vector ‘wss’. nstart = 25 means that, for each number of clusters, the algorithm is run 25 times from different random starting centres and the solution with the lowest total within-cluster sum of squares is kept.

Now plot the WSS for each cluster and try to find out the “Elbow” point

# Elbow plot: WSS against the number of clusters, points joined by lines.
plot(
  x = 1:15,
  y = wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within Sum of Squares"
)

After 4 clusters the “Within Sum of Squares” values decrease almost linearly; there is no further significant drop. Hence, 4 is considered the elbow point. Now let us run the K-Means algorithm with only 4 clusters and examine the various components of the result.

# Fit K-Means with 4 centres (best of 35 random starts) and show the fit.
km_4 <- kmeans(ruspini_data, centers = 4, nstart = 35)
print(km_4)
## K-means clustering with 4 clusters of sizes 20, 23, 15, 17
## 
## Cluster means:
##          x        y
## 1 20.15000  64.9500
## 2 43.91304 146.0435
## 3 68.93333  19.4000
## 4 98.17647 114.8824
## 
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
##  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  2  2  2  2  2 
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 
##  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  4  4  4  4  4  4  4 
## 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 
##  4  4  4  4  4  4  4  4  4  4  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3 
## 
## Within cluster sum of squares by cluster:
## [1] 3689.500 3176.783 1456.533 4558.235
##  (between_SS / total_SS =  94.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

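The output reports the ratio of “between SS” to “total SS” (between_SS / total_SS), i.e. the share of the total variation that is explained by the clustering. The closely related ratio of “between SS” to “within SS”, each divided by its degrees of freedom, is the statistic used in an F-Test. If this ratio is high, it signifies that the groups are well formed, with small distances within them and large distances between them. Thus, the higher the ratio, the better the clustering.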

Let us also see how this ratio varies if we take up 3 or 5 clusters and even with higher number of clusters.

# Fit K-Means with 3 centres (best of 35 random starts) and show the fit.
km_3 <- kmeans(ruspini_data, centers = 3, nstart = 35)
print(km_3)
## K-means clustering with 3 clusters of sizes 17, 35, 23
## 
## Cluster means:
##          x         y
## 1 98.17647 114.88235
## 2 41.05714  45.42857
## 3 43.91304 146.04348
## 
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
##  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  3  3  3  3  3 
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 
##  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  1  1  1  1  1  1  1 
## 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 
##  1  1  1  1  1  1  1  1  1  1  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2 
## 
## Within cluster sum of squares by cluster:
## [1]  4558.235 43328.457  3176.783
##  (between_SS / total_SS =  79.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# Fit K-Means with 5 centres (best of 35 random starts) and show the fit.
km_5 <- kmeans(ruspini_data, centers = 5, nstart = 35)
print(km_5)
## K-means clustering with 5 clusters of sizes 15, 23, 13, 4, 20
## 
## Cluster means:
##           x        y
## 1  68.93333  19.4000
## 2  43.91304 146.0435
## 3 103.61538 119.3846
## 4  80.50000 100.2500
## 5  20.15000  64.9500
## 
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
##  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  2  2  2  2  2 
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 
##  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  3  4  4  4  4  3  3 
## 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 
##  3  3  3  3  3  3  3  3  3  3  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 
## 
## Within cluster sum of squares by cluster:
## [1] 1456.533 3176.783 1422.154  381.750 3689.500
##  (between_SS / total_SS =  95.9 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

As can be seen from the above two results, with 3 clusters the ratio (between_SS / total_SS) drops significantly, from 94.7% to 79.1%, suggesting that some points are still not grouped properly. With 5 clusters the ratio improves to 95.9%, but is that a significant improvement? Below you will see that the ratio approaches 100% as the number of clusters keeps increasing, but the gain from each additional cluster is small. Thus, for this data, we can consider 4 or 5 clusters to be the “optimal” number of clusters.

# Fit K-Means with 6 centres (best of 35 random starts) and show the fit.
km_6 <- kmeans(ruspini_data, centers = 6, nstart = 35)
print(km_6)
## K-means clustering with 6 clusters of sizes 12, 4, 20, 13, 11, 15
## 
## Cluster means:
##           x        y
## 1  36.41667 148.4167
## 2  80.50000 100.2500
## 3  20.15000  64.9500
## 4 103.61538 119.3846
## 5  52.09091 143.4545
## 6  68.93333  19.4000
## 
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
##  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  1  1  1  1  1 
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 
##  1  1  1  1  1  1  1  5  5  5  5  5  5  5  5  5  5  5  4  2  2  2  2  4  4 
## 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 
##  4  4  4  4  4  4  4  4  4  4  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6 
## 
## Within cluster sum of squares by cluster:
## [1]  519.8333  381.7500 3689.5000 1422.1538 1105.6364 1456.5333
##  (between_SS / total_SS =  96.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# Fit K-Means with 10 centres (best of 35 random starts) and show the fit.
km_10 <- kmeans(ruspini_data, centers = 10, nstart = 35)
print(km_10)
## K-means clustering with 10 clusters of sizes 8, 7, 5, 3, 3, 8, 6, 15, 10, 10
## 
## Cluster means:
##            x         y
## 1   18.12500  74.50000
## 2   29.71429  59.71429
## 3   10.00000  57.00000
## 4   59.00000 133.00000
## 5   79.00000  95.33333
## 6   95.37500 121.37500
## 7  111.50000 116.00000
## 8   68.93333  19.40000
## 9   48.40000 148.40000
## 10  34.90000 147.60000
## 
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
##  3  3  3  1  3  1  1  1  3  1  1  1  1  2  2  2  2  2  2  2 10 10 10 10 10 
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 
## 10 10 10 10 10  9  9  9  9  9  9  9  9  9  9  4  4  4  6  6  5  5  5  6  6 
## 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 
##  6  6  6  6  7  7  7  7  7  7  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8 
## 
## Within cluster sum of squares by cluster:
##  [1]  666.87500  326.85714  270.00000  168.00000   64.66667  549.75000
##  [7]  221.50000 1456.53333  404.80000  317.30000
##  (between_SS / total_SS =  98.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# Fit K-Means with 15 centres (best of 35 random starts) and show the fit.
km_15 <- kmeans(ruspini_data, centers = 15, nstart = 35)
print(km_15)
## K-means clustering with 15 clusters of sizes 6, 6, 3, 3, 6, 4, 2, 6, 3, 3, 4, 10, 10, 3, 6
## 
## Cluster means:
##            x         y
## 1   98.66667 120.66667
## 2   28.66667  57.66667
## 3   79.33333  16.33333
## 4   59.00000 133.00000
## 5   68.00000  26.16667
## 6    8.00000  56.00000
## 7   85.50000 123.50000
## 8  111.50000 116.00000
## 9   16.66667  65.00000
## 10  12.00000  80.00000
## 11  28.25000  73.50000
## 12  48.40000 148.40000
## 13  34.90000 147.60000
## 14  79.00000  95.33333
## 15  64.66667  14.16667
## 
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
##  6  6  6 10  6  9 10 10  9  9 11 11 11  2  2  2  2  2  2 11 13 13 13 13 13 
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 
## 13 13 13 13 13 12 12 12 12 12 12 12 12 12 12  4  4  4  7  7 14 14 14  1  1 
## 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 
##  1  1  1  1  8  8  8  8  8  8 15  3  3 15 15  3 15 15 15  5  5  5  5  5  5 
## 
## Within cluster sum of squares by cluster:
##  [1] 132.66667 104.66667  61.33333 168.00000 226.83333 170.00000 145.00000
##  [8] 221.50000  52.66667 116.00000 111.75000 404.80000 317.30000  64.66667
## [15] 262.16667
##  (between_SS / total_SS =  99.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# Fit K-Means with 25 centres (best of 35 random starts) and show the fit.
km_25 <- kmeans(ruspini_data, centers = 25, nstart = 35)
print(km_25)
## K-means clustering with 25 clusters of sizes 2, 2, 4, 2, 3, 4, 2, 1, 3, 3, 2, 3, 6, 3, 3, 4, 3, 2, 4, 1, 3, 3, 5, 3, 4
## 
## Cluster means:
##            x         y
## 1    8.50000  51.00000
## 2   74.00000  29.00000
## 3   44.00000 151.00000
## 4    7.50000  61.00000
## 5  108.66667 112.66667
## 6   48.25000 142.75000
## 7   30.00000 148.00000
## 8   54.00000 124.00000
## 9   16.66667  65.00000
## 10  12.00000  80.00000
## 11  61.50000 137.50000
## 12  98.00000 124.66667
## 13  28.66667  57.66667
## 14  99.33333 116.66667
## 15  52.00000 153.00000
## 16  35.50000 143.00000
## 17  79.33333  16.33333
## 18  85.50000 123.50000
## 19  65.00000  24.75000
## 20  70.00000   4.00000
## 21 114.33333 119.33333
## 22  79.00000  95.33333
## 23  63.60000  16.20000
## 24  35.33333 152.66667
## 25  28.25000  73.50000
## 
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
##  1  4  4 10  1  9 10 10  9  9 25 25 25 13 13 13 13 13 13 25  7  7 24 24 24 
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 
##  3 16 16 16 16  3  3  6  6  3 15  6  6 15 15  8 11 11 18 18 22 22 22 12 14 
## 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 
## 12 14 12 14  5  5  5 21 21 21 20 17 17 23 23 17 23 23 23 19 19 19  2  2 19 
## 
## Within cluster sum of squares by cluster:
##  [1]  48.50000  16.00000  52.00000  20.50000  19.33333  51.50000  10.00000
##  [8]   0.00000  52.66667 116.00000   9.00000  20.66667 104.66667  13.33333
## [15]  24.00000  35.00000  61.33333 145.00000  78.75000   0.00000  87.33333
## [22]  64.66667 104.00000  17.33333 111.75000
##  (between_SS / total_SS =  99.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"