# K-means clustering

# Elbow curve, kselection, and animation for selecting the value of k used to form the clusters

#insurance dataset

# Read the insurance data from a CSV file chosen interactively.
Insurance.Dataset <- read.csv(file = file.choose())

# First look at the raw data: spreadsheet viewer, then column types.
View(Insurance.Dataset)
str(object = Insurance.Dataset)
## 'data.frame':    100 obs. of  5 variables:
##  $ Premiums.Paid: int  2800 2950 3100 3250 3400 3550 3700 3850 4000 6225 ...
##  $ Age          : int  26 27 28 30 32 35 44 45 46 56 ...
##  $ Days.to.Renew: int  233 130 144 65 56 89 95 48 76 200 ...
##  $ Claims.made  : num  3890 2294 2565 1978 2009 ...
##  $ Income       : int  28000 29500 31000 32500 34000 35500 37000 38500 40000 41500 ...
# Number of rows and columns.
dim(Insurance.Dataset)
## [1] 100   5
# Per-column summary statistics.
summary(object = Insurance.Dataset)
##  Premiums.Paid        Age        Days.to.Renew    Claims.made   
##  Min.   : 2800   Min.   :23.00   Min.   :  1.0   Min.   : 1978  
##  1st Qu.: 6975   1st Qu.:34.00   1st Qu.: 56.0   1st Qu.: 5221  
##  Median :11825   Median :45.00   Median : 89.0   Median : 8386  
##  Mean   :12542   Mean   :46.11   Mean   :120.4   Mean   :12579  
##  3rd Qu.:15475   3rd Qu.:54.50   3rd Qu.:186.5   3rd Qu.:14671  
##  Max.   :29900   Max.   :82.00   Max.   :321.0   Max.   :99677  
##      Income      
##  Min.   : 28000  
##  1st Qu.: 65125  
##  Median :102250  
##  Mean   :102250  
##  3rd Qu.:139375  
##  Max.   :176500
# Standardise every column (centre to mean 0, scale to sd 1) so that variables
# measured on large scales, such as Income, do not dominate the distance
# computations. scale() returns a numeric matrix, not a data frame.
insurance <- scale(Insurance.Dataset)
View(insurance)
# attach() is deliberately not used: nothing in this script refers to the
# scaled columns by bare name, and attaching a temporary data frame only
# clutters the search path (once more on every re-run).
# If the data contained missing values, they would have to be removed before
# clustering, e.g.:
# insurance <- na.omit(insurance)

### Hierarchical clustering (dendrogram) ----
# Pairwise Euclidean distances between the rows of the scaled data.
d <- dist(x = insurance, method = "euclidean")
# Agglomerative clustering with complete linkage on that distance matrix.
fit <- hclust(d = d, method = "complete")
# A negative hang makes all leaf labels hang down from height 0.
plot(x = fit, hang = -1)

# Reading the dendrogram, the data can be cut into either 3 or 5 clusters.

# Animated demonstration of k-means (needs the "animation" package).
# install.packages("animation")
library(animation)
# NOTE(review): kmeans.ani() appears to be documented for two-column input,
# while `insurance` has five columns -- confirm what the animation displays.
km <- kmeans.ani(x = insurance, centers = 5)

## The animation was run with 5 clusters.


# Determine the number of clusters with a scree (elbow) plot ----
# For each candidate number of clusters, fit k-means on the scaled data and
# record the total within-cluster sum of squares (tot.withinss, which equals
# sum(withinss)).
# The fixed seed makes the curve reproducible; nstart = 25 keeps the best of
# 25 random starts so a single unlucky initialisation cannot distort the curve.
set.seed(123)
k_values <- 1:20
wss <- vapply(
  k_values,
  function(n_centers) {
    kmeans(insurance, centers = n_centers, nstart = 25)$tot.withinss
  },
  numeric(1)
)
# Look for an "elbow": the point after which adding clusters barely reduces
# the within-groups sum of squares.
plot(
  k_values, wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within groups sum of squares"
)
title(sub = "K-Means Clustering Scree-Plot")

# Select k for k-means clustering with the kselection package ----
# install.packages("kselection")
library(kselection)

# install.packages("doParallel")
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel

# Use at most 4 workers, but never more than the machine reports having
# (na.rm = TRUE covers the case where detectCores() cannot tell and returns NA).
registerDoParallel(cores = min(4L, parallel::detectCores(), na.rm = TRUE))
k <- kselection(
  insurance,
  parallel = TRUE,
  k_threshold = 0.9,
  max_centers = 6
)
# Release any worker processes that registerDoParallel() started implicitly;
# they are no longer needed once kselection() has returned.
stopImplicitCluster()
k
## f(k) finds 2 clusters
summary(k)
##             Length Class  Mode    
## k           1      -none- numeric 
## f_k         6      -none- numeric 
## max_centers 1      -none- numeric 
## k_threshold 1      -none- numeric 
## fun_cluster 1      -none- function
# Final clustering with k = 5, the value suggested by the elbow curve and the
# dendrogram.
# k-means starts from randomly chosen centres, so without a fixed seed the
# cluster numbering, sizes and even the partition change from run to run.
# set.seed() makes the result reproducible, and nstart = 25 keeps the best of
# 25 random starts instead of relying on a single one.
# The recorded output below came from an unseeded run and will not match
# exactly.
set.seed(123)
km <- kmeans(insurance, centers = 5, nstart = 25)
km
## K-means clustering with 5 clusters of sizes 28, 18, 10, 8, 36
## 
## Cluster means:
##   Premiums.Paid         Age Days.to.Renew Claims.made      Income
## 1    -0.9898365 -0.63540567    -0.5163935  -0.6468855 -1.11532157
## 2     1.6833769  1.14018246     0.4976644   1.1637588  1.28301648
## 3    -0.3324016 -1.06641583     1.4070629   0.4835244  0.01378764
## 4    -0.4995412  1.39620547     1.4164319   0.3602506 -0.94790023
## 5     0.1315273 -0.08992808    -0.5528062  -0.2931143  0.43277869
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 4 4 4 4 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 4
##  [36] 4 3 5 5 1 5 5 5 5 5 5 3 3 3 3 3 5 5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 2 3
##  [71] 3 3 5 5 5 5 5 5 5 5 2 2 5 2 2 2 2 2 2 5 2 5 2 5 2 2 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 29.77750 71.45934 17.44616 12.58531 39.87052
##  (between_SS / total_SS =  65.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# Structural overview of the fitted k-means object: length, class and mode of
# each of its components.
summary(km)
##              Length Class  Mode   
## cluster      100    -none- numeric
## centers       25    -none- numeric
## totss          1    -none- numeric
## withinss       5    -none- numeric
## tot.withinss   1    -none- numeric
## betweenss      1    -none- numeric
## size           5    -none- numeric
## iter           1    -none- numeric
## ifault         1    -none- numeric
#View(km)

# Append each observation's cluster label to the original (unscaled) data.
# km$cluster is in the same row order as the data, so it must be attached
# unsorted: sorting the labels first, as sort(km$cluster) did, pairs them with
# the wrong rows.
f_clust <- data.frame(cluster = km$cluster, Insurance.Dataset)
# Print the rows grouped by cluster; f_clust itself keeps the original order.
f_clust[order(f_clust$cluster), ]
# NOTE: the recorded output below was produced with sort(km$cluster), so its
# cluster column does not reflect the real cluster membership.
##     sort.km.cluster. Premiums.Paid Age Days.to.Renew Claims.made Income
## 1                  1          2800  26           233    3890.076  28000
## 2                  1          2950  27           130    2294.444  29500
## 3                  1          3100  28           144    2564.545  31000
## 4                  1          3250  30            65    1978.261  32500
## 5                  1          3400  32            56    2009.091  34000
## 6                  1          3550  35            89    2349.455  35500
## 7                  1          3700  44            95    2503.346  37000
## 8                  1          3850  45            48    2217.405  38500
## 9                  1          4000  46            76    2527.778  40000
## 10                 1          6225  56           200    6908.232  41500
## 11                 1          6450  67           211    7672.549  43000
## 12                 1          6675  69           245   10208.824  44500
## 13                 1          6900  70           261   12192.233  46000
## 14                 1          4750  34           278   10052.326  47500
## 15                 1          4900  44           182    4900.000  49000
## 16                 1          7575  45            60    4535.033  50500
## 17                 1          5200  23            12    2688.636  52000
## 18                 1          8025  53             2    4034.669  53500
## 19                 1          5500  48             1    2757.576  55000
## 20                 1          5650  49            14    2938.000  56500
## 21                 1          5800  41            17    3042.075  58000
## 22                 1          5950  42            65    3621.739  59500
## 23                 1          9150  50            56    5406.818  61000
## 24                 1          6250  26            89    4136.364  62500
## 25                 1          6400  27            95    4330.112  64000
## 26                 1          6550  28            48    3772.468  65500
## 27                 1          6700  30            76    4234.028  67000
## 28                 1          6850  32            39    3836.000  68500
## 29                 2          7000  35            34    3860.606  70000
## 30                 2          7150  44            57    4238.762  71500
## 31                 2          7300  45            85    4762.007  73000
## 32                 2          7450  46           165    6813.568  74500
## 33                 2         11400  56           234   15960.000  76000
## 34                 2         11625  67           256   19590.278  77500
## 35                 2         11850  69           233   16463.359  79000
## 36                 2         12075  70           321   51108.140  80500
## 37                 2          8200  34           233   11392.366  82000
## 38                 2         12525  44           130    9741.667  83500
## 39                 2         12750  45           144   10547.727  85000
## 40                 2         12975  23            65    7897.826  86500
## 41                 2         13200  53            56    7800.000  88000
## 42                 2          8950  48            89    5923.273  89500
## 43                 2          9100  49            95    6156.877  91000
## 44                 2          9250  41            48    5327.532  92500
## 45                 2          9400  42            76    5940.278  94000
## 46                 2         14325  50           123   10818.050  95500
## 47                 3          9700  26           156    8487.500  97000
## 48                 3          9850  27           245   15064.706  98500
## 49                 3         10000  28           261   17669.903 100000
## 50                 3         10150  30           278   21480.233 101500
## 51                 3         10300  32           182   10300.000 103000
## 52                 3         10450  35            60    6256.250 104500
## 53                 3         10600  44            12    5480.682 106000
## 54                 3         10750  45             2    5404.696 107500
## 55                 3         10900  46             1    5465.014 109000
## 56                 3         16575  56            14    8619.000 110500
## 57                 4         16800  67            17    8811.527 112000
## 58                 4         17025  69            65   10363.043 113500
## 59                 4         23000  70            56   13590.909 115000
## 60                 4         11650  34            89    7710.182 116500
## 61                 4         11800  44            95    7983.643 118000
## 62                 4         11950  45            48    6882.595 119500
## 63                 4         12100  23            76    7646.528 121000
## 64                 4         18375  53            39   10290.000 122500
## 65                 5         12400  48            34    6838.788 124000
## 66                 5         12550  49            57    7440.065 125500
## 67                 5         12700  41            85    8284.588 127000
## 68                 5         12850  42           165   11752.261 128500
## 69                 5         19500  50           234   27300.000 130000
## 70                 5         13150  32           256   22160.185 131500
## 71                 5         13300  34           233   18477.863 133000
## 72                 5         13450  36           321   56927.907 134500
## 73                 5         13600  39            65    8278.261 136000
## 74                 5         13750  42            56    8125.000 137500
## 75                 5         13900  44            89    9199.273 139000
## 76                 5         14050  45            95    9505.948 140500
## 77                 5         14200  48            48    8178.481 142000
## 78                 5         14350  49            76    9068.403 143500
## 79                 5         21750  54            39   12180.000 145000
## 80                 5         14650  32            34    8079.697 146500
## 81                 5         29600  77            57   17547.883 148000
## 82                 5         29900  82            85   19504.660 149500
## 83                 5         15100  34           165   13810.050 151000
## 84                 5         15250  56           234   21350.000 152500
## 85                 5         23100  63           256   38927.778 154000
## 86                 5         23325  62           233   32405.725 155500
## 87                 5         23550  59           321   99676.744 157000
## 88                 5         23775  58           233   33030.916 158500
## 89                 5         24000  52           130   18666.667 160000
## 90                 5         16150  45           144   13360.455 161500
## 91                 5         24450  54            65   14882.609 163000
## 92                 5         16450  36            56    9720.455 164500
## 93                 5         16600  82            89   10986.182 166000
## 94                 5         16750  34            95   11332.714 167500
## 95                 5         25350  56            48   14600.316 169000
## 96                 5         25575  63            76   16161.979 170500
## 97                 5         25800  62           166   23715.152 172000
## 98                 5         26025  59           167   24043.401 173500
## 99                 5         26250  58           245   40147.059 175000
## 100                5         26475  52           261   46781.068 176500

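# k-means starts from randomly chosen centres, so unless a seed is fixed with set.seed() the clusters change every time the code is executed; this is why the results from my own run differ from those shown in the published output.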