# Elbow curve, kselection, and animation for selecting the k value to form clusters
# Insurance dataset ----
# The CSV is picked through an interactive file dialog.
Insurance.Dataset <- read.csv(file.choose())
# View() opens a data viewer, which is only useful in an interactive session.
if (interactive()) {
  View(Insurance.Dataset)
}
str(Insurance.Dataset)
## 'data.frame': 100 obs. of 5 variables:
## $ Premiums.Paid: int 2800 2950 3100 3250 3400 3550 3700 3850 4000 6225 ...
## $ Age : int 26 27 28 30 32 35 44 45 46 56 ...
## $ Days.to.Renew: int 233 130 144 65 56 89 95 48 76 200 ...
## $ Claims.made : num 3890 2294 2565 1978 2009 ...
## $ Income : int 28000 29500 31000 32500 34000 35500 37000 38500 40000 41500 ...
dim(Insurance.Dataset)
## [1] 100 5
summary(Insurance.Dataset)
## Premiums.Paid Age Days.to.Renew Claims.made
## Min. : 2800 Min. :23.00 Min. : 1.0 Min. : 1978
## 1st Qu.: 6975 1st Qu.:34.00 1st Qu.: 56.0 1st Qu.: 5221
## Median :11825 Median :45.00 Median : 89.0 Median : 8386
## Mean :12542 Mean :46.11 Mean :120.4 Mean :12579
## 3rd Qu.:15475 3rd Qu.:54.50 3rd Qu.:186.5 3rd Qu.:14671
## Max. :29900 Max. :82.00 Max. :321.0 Max. :99677
## Income
## Min. : 28000
## 1st Qu.: 65125
## Median :102250
## Mean :102250
## 3rd Qu.:139375
## Max. :176500

# Standardise every column (centre to mean 0, scale to sd 1) so that no single
# variable dominates the distance computations used by the clustering below.
# scale() returns a numeric matrix.
insurance <- scale(Insurance.Dataset)
if (interactive()) {
  View(insurance)
}
# The scaled columns are deliberately NOT attach()ed: the rest of the script
# always refers to `insurance` explicitly, and attach() only adds masking
# copies of the columns to the search path.
#mean(insurance)
#`is.na<-`(insurance)
#insurance<-na.omit(insurance)
### Hierarchical clustering for a dendrogram
# Pairwise Euclidean distances between the scaled observations.
d <- dist(insurance, method = "euclidean")
# Agglomerate with complete linkage on that distance matrix.
fit <- hclust(d, method = "complete")
# hang = -1 lines all leaf labels up at the bottom of the plot.
plot(fit, hang = -1)

# Reading the dendrogram suggests forming either 3 or 5 clusters.
# install.packages("animation")
library(animation)
# Animated k-means run with 5 centres; the result is stored in `km`.
km <- kmeans.ani(insurance, centers = 5)










## by kmeans animation formed 5 clusters
# Determine the number of clusters with a scree (elbow) plot.
# kmeans() starts from randomly chosen centres. Fixing the seed and keeping the
# best of several restarts makes the curve reproducible and avoids poor local
# optima that would make the within-group sum of squares jump around (or even
# rise) as k grows, hiding the elbow.
set.seed(123)
# Try up to 20 clusters, but stay below the number of distinct observations,
# which kmeans() requires.
max_k <- min(20L, nrow(unique(insurance)) - 1L)
k_values <- seq_len(max_k)
# Total within-cluster sum of squares for each candidate k.
wss <- vapply(
  k_values,
  function(n_centers) {
    kmeans(insurance, centers = n_centers, nstart = 25)$tot.withinss
  },
  numeric(1)
)
# Look for an "elbow" in the scree plot.
plot(
  k_values, wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within groups sum of squares"
)
title(sub = "K-Means Clustering Scree-Plot")

# Choose k for k-means clustering with the kselection package.
# install.packages("kselection")
library(kselection)
# install.packages("doParallel")
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
# Register a parallel backend with 4 cores for the parallel = TRUE call below.
registerDoParallel(cores = 4)
k <- kselection(
  insurance,
  parallel = TRUE,
  k_threshold = 0.9,
  max_centers = 6
)
k
## f(k) finds 2 clusters
summary(k)
## Length Class Mode
## k 1 -none- numeric
## f_k 6 -none- numeric
## max_centers 1 -none- numeric
## k_threshold 1 -none- numeric
## fun_cluster 1 -none- function
# Final clustering with k = 5, as suggested by the elbow curve and dendrogram.
# kmeans() picks its starting centres at random, so the seed is fixed and the
# best of 25 restarts is kept: the fit is then reproducible and less likely to
# be a poor local optimum.
set.seed(123)
km <- kmeans(insurance, centers = 5, nstart = 25)
km
# NOTE: the output pasted below comes from an earlier, unseeded single-start
# run; cluster numbering, sizes and sums of squares may differ on a re-run.
## K-means clustering with 5 clusters of sizes 28, 18, 10, 8, 36
##
## Cluster means:
## Premiums.Paid Age Days.to.Renew Claims.made Income
## 1 -0.9898365 -0.63540567 -0.5163935 -0.6468855 -1.11532157
## 2 1.6833769 1.14018246 0.4976644 1.1637588 1.28301648
## 3 -0.3324016 -1.06641583 1.4070629 0.4835244 0.01378764
## 4 -0.4995412 1.39620547 1.4164319 0.3602506 -0.94790023
## 5 0.1315273 -0.08992808 -0.5528062 -0.2931143 0.43277869
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 4 4 4 4 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 4
## [36] 4 3 5 5 1 5 5 5 5 5 5 3 3 3 3 3 5 5 5 5 5 5 5 2 5 5 5 5 5 5 5 5 5 2 3
## [71] 3 3 5 5 5 5 5 5 5 5 2 2 5 2 2 2 2 2 2 5 2 5 2 5 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 29.77750 71.45934 17.44616 12.58531 39.87052
## (between_SS / total_SS = 65.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
summary(km)
## Length Class Mode
## cluster 100 -none- numeric
## centers 25 -none- numeric
## totss 1 -none- numeric
## withinss 5 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 5 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
#View(km)
# Append each observation's cluster label to the original (unscaled) data.
# km$cluster is already in the same row order as Insurance.Dataset, so it is
# bound unchanged. Sorting the label vector on its own before binding (as this
# line used to do) reorders the labels but not the rows, pairing every
# observation with a label that belongs to a different observation.
f_clust <- data.frame(cluster = km$cluster, Insurance.Dataset)
# Show the table grouped by cluster; ordering whole rows keeps each label with
# its own observation, and f_clust itself stays aligned with Insurance.Dataset.
f_clust[order(f_clust$cluster), ]
# NOTE: the listing pasted below was produced by the earlier sort()-based
# version, so its cluster column does not correspond to the rows shown next to
# it (compare with the "Clustering vector" printed by km). Re-run to refresh.
## sort.km.cluster. Premiums.Paid Age Days.to.Renew Claims.made Income
## 1 1 2800 26 233 3890.076 28000
## 2 1 2950 27 130 2294.444 29500
## 3 1 3100 28 144 2564.545 31000
## 4 1 3250 30 65 1978.261 32500
## 5 1 3400 32 56 2009.091 34000
## 6 1 3550 35 89 2349.455 35500
## 7 1 3700 44 95 2503.346 37000
## 8 1 3850 45 48 2217.405 38500
## 9 1 4000 46 76 2527.778 40000
## 10 1 6225 56 200 6908.232 41500
## 11 1 6450 67 211 7672.549 43000
## 12 1 6675 69 245 10208.824 44500
## 13 1 6900 70 261 12192.233 46000
## 14 1 4750 34 278 10052.326 47500
## 15 1 4900 44 182 4900.000 49000
## 16 1 7575 45 60 4535.033 50500
## 17 1 5200 23 12 2688.636 52000
## 18 1 8025 53 2 4034.669 53500
## 19 1 5500 48 1 2757.576 55000
## 20 1 5650 49 14 2938.000 56500
## 21 1 5800 41 17 3042.075 58000
## 22 1 5950 42 65 3621.739 59500
## 23 1 9150 50 56 5406.818 61000
## 24 1 6250 26 89 4136.364 62500
## 25 1 6400 27 95 4330.112 64000
## 26 1 6550 28 48 3772.468 65500
## 27 1 6700 30 76 4234.028 67000
## 28 1 6850 32 39 3836.000 68500
## 29 2 7000 35 34 3860.606 70000
## 30 2 7150 44 57 4238.762 71500
## 31 2 7300 45 85 4762.007 73000
## 32 2 7450 46 165 6813.568 74500
## 33 2 11400 56 234 15960.000 76000
## 34 2 11625 67 256 19590.278 77500
## 35 2 11850 69 233 16463.359 79000
## 36 2 12075 70 321 51108.140 80500
## 37 2 8200 34 233 11392.366 82000
## 38 2 12525 44 130 9741.667 83500
## 39 2 12750 45 144 10547.727 85000
## 40 2 12975 23 65 7897.826 86500
## 41 2 13200 53 56 7800.000 88000
## 42 2 8950 48 89 5923.273 89500
## 43 2 9100 49 95 6156.877 91000
## 44 2 9250 41 48 5327.532 92500
## 45 2 9400 42 76 5940.278 94000
## 46 2 14325 50 123 10818.050 95500
## 47 3 9700 26 156 8487.500 97000
## 48 3 9850 27 245 15064.706 98500
## 49 3 10000 28 261 17669.903 100000
## 50 3 10150 30 278 21480.233 101500
## 51 3 10300 32 182 10300.000 103000
## 52 3 10450 35 60 6256.250 104500
## 53 3 10600 44 12 5480.682 106000
## 54 3 10750 45 2 5404.696 107500
## 55 3 10900 46 1 5465.014 109000
## 56 3 16575 56 14 8619.000 110500
## 57 4 16800 67 17 8811.527 112000
## 58 4 17025 69 65 10363.043 113500
## 59 4 23000 70 56 13590.909 115000
## 60 4 11650 34 89 7710.182 116500
## 61 4 11800 44 95 7983.643 118000
## 62 4 11950 45 48 6882.595 119500
## 63 4 12100 23 76 7646.528 121000
## 64 4 18375 53 39 10290.000 122500
## 65 5 12400 48 34 6838.788 124000
## 66 5 12550 49 57 7440.065 125500
## 67 5 12700 41 85 8284.588 127000
## 68 5 12850 42 165 11752.261 128500
## 69 5 19500 50 234 27300.000 130000
## 70 5 13150 32 256 22160.185 131500
## 71 5 13300 34 233 18477.863 133000
## 72 5 13450 36 321 56927.907 134500
## 73 5 13600 39 65 8278.261 136000
## 74 5 13750 42 56 8125.000 137500
## 75 5 13900 44 89 9199.273 139000
## 76 5 14050 45 95 9505.948 140500
## 77 5 14200 48 48 8178.481 142000
## 78 5 14350 49 76 9068.403 143500
## 79 5 21750 54 39 12180.000 145000
## 80 5 14650 32 34 8079.697 146500
## 81 5 29600 77 57 17547.883 148000
## 82 5 29900 82 85 19504.660 149500
## 83 5 15100 34 165 13810.050 151000
## 84 5 15250 56 234 21350.000 152500
## 85 5 23100 63 256 38927.778 154000
## 86 5 23325 62 233 32405.725 155500
## 87 5 23550 59 321 99676.744 157000
## 88 5 23775 58 233 33030.916 158500
## 89 5 24000 52 130 18666.667 160000
## 90 5 16150 45 144 13360.455 161500
## 91 5 24450 54 65 14882.609 163000
## 92 5 16450 36 56 9720.455 164500
## 93 5 16600 82 89 10986.182 166000
## 94 5 16750 34 95 11332.714 167500
## 95 5 25350 56 48 14600.316 169000
## 96 5 25575 63 76 16161.979 170500
## 97 5 25800 62 166 23715.152 172000
## 98 5 26025 59 167 24043.401 173500
## 99 5 26250 58 245 40147.059 175000
## 100 5 26475 52 261 46781.068 176500