# Uncomment and install below packages if not already available
# install.packages("funModeling")
# install.packages("cluster")
# Use libraries
library(cluster)
library(funModeling)
Check the structure of the “ruspini” data.
# Compactly display the structure of ruspini (class, dimensions, column types).
utils::str(ruspini)
## 'data.frame': 75 obs. of 2 variables:
## $ x: int 4 5 10 9 13 13 12 15 18 19 ...
## $ y: int 53 63 59 77 49 69 88 75 61 65 ...
Perform various explorations of the ruspini data.
# Print a per-variable summary of ruspini (counts, missing/distinct values,
# mean and quantiles, lowest/highest values).
# NOTE(review): describe() is not defined in this file; presumably it is
# attached through funModeling's dependencies (looks like Hmisc) — confirm.
describe(ruspini)
## ruspini
##
## 2 Variables 75 Observations
## ---------------------------------------------------------------------------
## x
## n missing distinct Info Mean Gmd .05 .10
## 75 0 56 1 54.88 35.1 11.4 16.2
## .25 .50 .75 .90 .95
## 31.5 52.0 76.5 99.0 108.6
##
## lowest : 4 5 9 10 12, highest: 108 110 111 115 117
## ---------------------------------------------------------------------------
## y
## n missing distinct Info Mean Gmd .05 .10
## 75 0 58 1 92.03 55.84 15.0 20.4
## .25 .50 .75 .90 .95
## 56.5 96.0 141.5 149.6 152.3
##
## lowest : 4 12 13 15 16, highest: 152 153 154 155 156
## ---------------------------------------------------------------------------
# Per-column health check: zeros, NAs, infinite values, type and unique counts.
funModeling::df_status(ruspini)
## variable q_zeros p_zeros q_na p_na q_inf p_inf type unique
## 1 x 0 0 0 0 0 0 integer 56
## 2 y 0 0 0 0 0 0 integer 58
# Numeric profile of each column: mean, standard deviation, percentiles,
# skewness, kurtosis, IQR and percentile ranges.
funModeling::profiling_num(ruspini)
## variable mean std_dev variation_coef p_01 p_05 p_25 p_50 p_75
## 1 x 54.88000 30.50253 0.5558041 4.74 11.4 31.5 52 76.5
## 2 y 92.02667 48.70262 0.5292229 9.92 15.0 56.5 96 141.5
## p_95 p_99 skewness kurtosis iqr range_98 range_80
## 1 108.6 115.52 0.3201771 2.123066 45 [4.74, 115.52] [16.2, 99]
## 2 152.3 155.26 -0.2742345 1.666166 85 [9.92, 155.26] [20.4, 149.6]
# Plot the numeric variables of ruspini.
# NOTE(review): presumably one histogram per variable — confirm against the
# funModeling documentation.
funModeling::plot_num(ruspini)
Store the data as a data frame
# Keep a data-frame copy of ruspini for the clustering steps, then display it.
ruspini_data <- as.data.frame(x = ruspini)
ruspini_data
## x y
## 1 4 53
## 2 5 63
## 3 10 59
## 4 9 77
## 5 13 49
## 6 13 69
## 7 12 88
## 8 15 75
## 9 18 61
## 10 19 65
## 11 22 74
## 12 27 72
## 13 28 76
## 14 24 58
## 15 27 55
## 16 28 60
## 17 30 52
## 18 31 60
## 19 32 61
## 20 36 72
## 21 28 147
## 22 32 149
## 23 35 153
## 24 33 154
## 25 38 151
## 26 41 150
## 27 38 145
## 28 38 143
## 29 32 143
## 30 34 141
## 31 44 156
## 32 44 149
## 33 44 143
## 34 46 142
## 35 47 149
## 36 49 152
## 37 50 142
## 38 53 144
## 39 52 152
## 40 55 155
## 41 54 124
## 42 60 136
## 43 63 139
## 44 86 132
## 45 85 115
## 46 85 96
## 47 78 94
## 48 74 96
## 49 97 122
## 50 98 116
## 51 98 124
## 52 99 119
## 53 99 128
## 54 101 115
## 55 108 111
## 56 110 111
## 57 108 116
## 58 111 126
## 59 115 117
## 60 117 115
## 61 70 4
## 62 77 12
## 63 83 21
## 64 61 15
## 65 69 15
## 66 78 16
## 67 66 18
## 68 58 13
## 69 64 20
## 70 69 21
## 71 66 23
## 72 61 25
## 73 76 27
## 74 72 31
## 75 64 30
Create a numeric vector of length 15 that will store the Within Sum of Squares (WSS) for each candidate number of clusters, from 1 to 15. The upper limit is arbitrary: you may just as well start with 10, 20, 25 or any other number of your choice.
# Pre-allocate one slot per candidate cluster count (1 to 15) to hold the
# total within-cluster sum of squares (WSS) of each fit.
wss <- numeric(15)
wss
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# Fit k-means once for every k covered by `wss`. Iterating over
# seq_along(wss) ties the loop bounds to the vector length, so changing the
# 15 above can neither leave slots unfilled nor silently grow the vector.
# kmeans() draws random starting centers, so values may vary between runs
# unless a seed is set beforehand.
for (k in seq_along(wss)) {
  # tot.withinss is the sum of the per-cluster withinss values.
  wss[k] <- kmeans(ruspini_data, centers = k, nstart = 25)$tot.withinss
}
wss
## [1] 244373.867 89337.832 51063.475 12881.051 10126.720 8575.407
## [7] 7126.199 6149.639 5550.633 4446.282 3897.252 3501.088
## [13] 3159.283 2899.617 2673.267
As you can see above, we have executed the K-Means algorithm for an arbitrarily chosen range of 1 through 15 clusters, and stored the total WSS obtained on the ruspini data for each number of clusters in the numeric vector ‘wss’. nstart = 25 means that, for each number of clusters, the algorithm is run 25 times from different random starting centers and the run with the lowest total WSS is kept.
Now plot the WSS against the number of clusters and try to find the “Elbow” point.
# Elbow plot: total WSS against the number of clusters, drawn as points
# joined by lines.
plot(
  x = seq_along(wss),
  y = wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within Sum of Squares"
)
After 4 clusters the “Within Sum of Squares” values decrease almost linearly; there is no further significant drop. Hence, 4 is considered the elbow point. Now let us run the K-Means algorithm with only 4 clusters and look at the various results.
# Fit k-means with 4 centers using 35 random starts, then show the fit.
km_4 <- kmeans(x = ruspini_data, centers = 4, nstart = 35)
km_4
## K-means clustering with 4 clusters of sizes 20, 23, 15, 17
##
## Cluster means:
## x y
## 1 20.15000 64.9500
## 2 43.91304 146.0435
## 3 68.93333 19.4000
## 4 98.17647 114.8824
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 4 4 4 4 4 4
## 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 3689.500 3176.783 1456.533 4558.235
## (between_SS / total_SS = 94.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
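The ratio printed above, “between SS” / “total SS”, is the share of the total variation that is explained by the separation between the clusters. It is closely related to the F-Test statistic, which compares the “between SS” with the “Within Sum of Squares”. If this ratio is high, it signifies that the groups are correctly formed, with the least distance within them and the maximum distance between them. Thus, the higher the ratio, the better the clustering.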
Let us also see how this ratio varies if we take 3 or 5 clusters, and even higher numbers of clusters.
# Fit k-means with 3 centers using 35 random starts, then show the fit.
km_3 <- kmeans(x = ruspini_data, centers = 3, nstart = 35)
km_3
## K-means clustering with 3 clusters of sizes 17, 35, 23
##
## Cluster means:
## x y
## 1 98.17647 114.88235
## 2 41.05714 45.42857
## 3 43.91304 146.04348
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1
## 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 4558.235 43328.457 3176.783
## (between_SS / total_SS = 79.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Fit k-means with 5 centers using 35 random starts, then show the fit.
km_5 <- kmeans(x = ruspini_data, centers = 5, nstart = 35)
km_5
## K-means clustering with 5 clusters of sizes 15, 23, 13, 4, 20
##
## Cluster means:
## x y
## 1 68.93333 19.4000
## 2 43.91304 146.0435
## 3 103.61538 119.3846
## 4 80.50000 100.2500
## 5 20.15000 64.9500
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 2 2 2 2 2
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 4 4 4 4 3 3
## 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 1456.533 3176.783 1422.154 381.750 3689.500
## (between_SS / total_SS = 95.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
As can be seen from the above two results, when we have 3 clusters the ratio (i.e. the F-Test result) drops significantly, suggesting that there are still some points which are not grouped / clustered properly. With 5 clusters we notice that this ratio has improved. But has it improved significantly? That is the question. Below you will see that the ratio approaches 100% as we use more and more clusters, but it does not grow significantly with each increase in the number of clusters. Thus we can consider that, for this data, 4 or 5 clusters is the “Optimal” cluster value.
# Fit k-means with 6 centers using 35 random starts, then show the fit.
km_6 <- kmeans(x = ruspini_data, centers = 6, nstart = 35)
km_6
## K-means clustering with 6 clusters of sizes 12, 4, 20, 13, 11, 15
##
## Cluster means:
## x y
## 1 36.41667 148.4167
## 2 80.50000 100.2500
## 3 20.15000 64.9500
## 4 103.61538 119.3846
## 5 52.09091 143.4545
## 6 68.93333 19.4000
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## 1 1 1 1 1 1 1 5 5 5 5 5 5 5 5 5 5 5 4 2 2 2 2 4 4
## 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## 4 4 4 4 4 4 4 4 4 4 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
##
## Within cluster sum of squares by cluster:
## [1] 519.8333 381.7500 3689.5000 1422.1538 1105.6364 1456.5333
## (between_SS / total_SS = 96.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Fit k-means with 10 centers using 35 random starts, then show the fit.
km_10 <- kmeans(x = ruspini_data, centers = 10, nstart = 35)
km_10
## K-means clustering with 10 clusters of sizes 8, 7, 5, 3, 3, 8, 6, 15, 10, 10
##
## Cluster means:
## x y
## 1 18.12500 74.50000
## 2 29.71429 59.71429
## 3 10.00000 57.00000
## 4 59.00000 133.00000
## 5 79.00000 95.33333
## 6 95.37500 121.37500
## 7 111.50000 116.00000
## 8 68.93333 19.40000
## 9 48.40000 148.40000
## 10 34.90000 147.60000
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 3 3 3 1 3 1 1 1 3 1 1 1 1 2 2 2 2 2 2 2 10 10 10 10 10
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## 10 10 10 10 10 9 9 9 9 9 9 9 9 9 9 4 4 4 6 6 5 5 5 6 6
## 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## 6 6 6 6 7 7 7 7 7 7 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
##
## Within cluster sum of squares by cluster:
## [1] 666.87500 326.85714 270.00000 168.00000 64.66667 549.75000
## [7] 221.50000 1456.53333 404.80000 317.30000
## (between_SS / total_SS = 98.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Fit k-means with 15 centers using 35 random starts, then show the fit.
km_15 <- kmeans(x = ruspini_data, centers = 15, nstart = 35)
km_15
## K-means clustering with 15 clusters of sizes 6, 6, 3, 3, 6, 4, 2, 6, 3, 3, 4, 10, 10, 3, 6
##
## Cluster means:
## x y
## 1 98.66667 120.66667
## 2 28.66667 57.66667
## 3 79.33333 16.33333
## 4 59.00000 133.00000
## 5 68.00000 26.16667
## 6 8.00000 56.00000
## 7 85.50000 123.50000
## 8 111.50000 116.00000
## 9 16.66667 65.00000
## 10 12.00000 80.00000
## 11 28.25000 73.50000
## 12 48.40000 148.40000
## 13 34.90000 147.60000
## 14 79.00000 95.33333
## 15 64.66667 14.16667
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 6 6 6 10 6 9 10 10 9 9 11 11 11 2 2 2 2 2 2 11 13 13 13 13 13
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## 13 13 13 13 13 12 12 12 12 12 12 12 12 12 12 4 4 4 7 7 14 14 14 1 1
## 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## 1 1 1 1 8 8 8 8 8 8 15 3 3 15 15 3 15 15 15 5 5 5 5 5 5
##
## Within cluster sum of squares by cluster:
## [1] 132.66667 104.66667 61.33333 168.00000 226.83333 170.00000 145.00000
## [8] 221.50000 52.66667 116.00000 111.75000 404.80000 317.30000 64.66667
## [15] 262.16667
## (between_SS / total_SS = 99.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Fit k-means with 25 centers using 35 random starts, then show the fit.
km_25 <- kmeans(x = ruspini_data, centers = 25, nstart = 35)
km_25
## K-means clustering with 25 clusters of sizes 2, 2, 4, 2, 3, 4, 2, 1, 3, 3, 2, 3, 6, 3, 3, 4, 3, 2, 4, 1, 3, 3, 5, 3, 4
##
## Cluster means:
## x y
## 1 8.50000 51.00000
## 2 74.00000 29.00000
## 3 44.00000 151.00000
## 4 7.50000 61.00000
## 5 108.66667 112.66667
## 6 48.25000 142.75000
## 7 30.00000 148.00000
## 8 54.00000 124.00000
## 9 16.66667 65.00000
## 10 12.00000 80.00000
## 11 61.50000 137.50000
## 12 98.00000 124.66667
## 13 28.66667 57.66667
## 14 99.33333 116.66667
## 15 52.00000 153.00000
## 16 35.50000 143.00000
## 17 79.33333 16.33333
## 18 85.50000 123.50000
## 19 65.00000 24.75000
## 20 70.00000 4.00000
## 21 114.33333 119.33333
## 22 79.00000 95.33333
## 23 63.60000 16.20000
## 24 35.33333 152.66667
## 25 28.25000 73.50000
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 1 4 4 10 1 9 10 10 9 9 25 25 25 13 13 13 13 13 13 25 7 7 24 24 24
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## 3 16 16 16 16 3 3 6 6 3 15 6 6 15 15 8 11 11 18 18 22 22 22 12 14
## 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## 12 14 12 14 5 5 5 21 21 21 20 17 17 23 23 17 23 23 23 19 19 19 2 2 19
##
## Within cluster sum of squares by cluster:
## [1] 48.50000 16.00000 52.00000 20.50000 19.33333 51.50000 10.00000
## [8] 0.00000 52.66667 116.00000 9.00000 20.66667 104.66667 13.33333
## [15] 24.00000 35.00000 61.33333 145.00000 78.75000 0.00000 87.33333
## [22] 64.66667 104.00000 17.33333 111.75000
## (between_SS / total_SS = 99.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"