data <- read.csv('F:/Machine Learning/Data Science/Machine Learning/cluster/mtcars.csv')
str(data)
## 'data.frame': 32 obs. of 12 variables:
## $ Model: Factor w/ 32 levels "AMC Javelin",..: 18 19 5 13 14 31 7 21 20 22 ...
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : int 6 6 4 6 8 6 8 4 4 6 ...
## $ disp : num 160 160 108 258 360 ...
## $ hp : int 110 110 93 110 175 105 245 62 95 123 ...
## $ drat : num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec : num 16.5 17 18.6 19.4 17 ...
## $ vs : int 0 0 1 1 0 1 0 1 1 1 ...
## $ am : int 1 1 1 0 0 0 0 0 0 0 ...
## $ gear : int 4 4 4 3 3 3 3 4 4 4 ...
## $ carb : int 4 4 1 1 2 1 4 2 2 4 ...
#scatter plot - check for possible clusters
plot(wt ~ mpg, data)
with(data, text(mpg, wt, labels = Model, pos = 1, cex = .4))
There are four cars with high mileage and low weight. Two groups in the middle have moderate weight and mileage, and at the top left there are three cars with high weight and low mileage. Very broadly, we could form four clusters, but these are based on only two variables, wt and mpg.
#Normalization of data
z <- data[, -1]                      #first variable (car model) is excluded
m <- apply(z, 2, mean)               #column means
s <- apply(z, 2, sd)                 #column standard deviations
z <- scale(z, center = m, scale = s)
Normalization is needed so that all variables are on a level playing field; otherwise variables measured on larger scales, such as disp and hp, would dominate the distance calculation.
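As a quick sanity check (a minimal sketch using the z matrix defined above), every scaled column should now have mean 0 and standard deviation 1.
#scaled columns: means should be numerically zero, standard deviations one
round(colMeans(z), 10)
apply(z, 2, sd)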
#calculate Euclidean distance
distance <- dist(z)
print(distance, digits = 3)
## 1 2 3 4 5 6 7 8 9 10 11 12
## 2 0.408
## 3 3.243 3.176
## 4 4.401 4.263 3.437
## 5 3.880 3.820 5.003 3.042
## 6 4.844 4.676 3.868 0.994 3.399
## 7 4.190 4.175 5.896 4.340 1.891 4.596
## 8 3.997 3.821 2.501 2.534 4.607 2.911 5.621
## 9 4.918 4.670 3.312 3.270 5.360 3.409 6.307 1.768
## 10 3.138 2.988 3.295 2.986 3.760 3.386 4.116 2.347 3.174
## 11 3.293 3.117 3.344 2.971 3.840 3.290 4.202 2.331 2.964 0.408
## 12 3.856 3.733 5.167 3.247 1.216 3.374 1.720 4.610 5.283 3.504 3.545
## 13 3.726 3.614 5.014 3.096 1.058 3.266 1.748 4.479 5.139 3.425 3.466 0.394
## 14 3.859 3.728 5.084 3.135 1.277 3.210 1.828 4.529 5.094 3.461 3.457 0.490
## 15 5.450 5.285 6.770 4.685 2.884 4.625 2.496 6.057 6.472 4.848 4.808 2.375
## 16 5.480 5.313 6.820 4.787 2.968 4.741 2.497 6.120 6.534 4.865 4.831 2.415
## 17 5.097 4.936 6.525 4.603 2.626 4.715 2.131 5.848 6.320 4.533 4.550 2.130
## 18 4.024 3.941 1.783 4.185 5.878 4.731 6.927 2.848 3.476 4.104 4.194 6.103
## 19 4.053 4.051 2.646 5.245 6.505 5.885 7.297 3.743 4.279 4.352 4.460 6.743
## 20 4.345 4.272 2.212 4.551 6.233 5.110 7.282 3.188 3.639 4.450 4.527 6.489
## 21 4.330 4.214 2.574 2.136 4.410 2.538 5.459 1.819 2.360 3.009 2.959 4.523
## 22 4.109 4.053 5.179 3.103 1.037 3.205 2.157 4.768 5.584 3.990 4.042 1.205
## 23 3.760 3.685 4.850 2.935 0.836 3.138 2.093 4.443 5.171 3.571 3.604 1.055
## 24 4.119 4.105 5.928 4.674 2.354 5.015 1.055 5.707 6.375 4.028 4.126 2.174
## 25 4.172 4.088 5.272 3.182 0.548 3.508 1.984 4.786 5.529 3.973 4.052 1.304
## 26 3.611 3.566 1.065 3.983 5.603 4.496 6.575 2.737 3.468 3.781 3.855 5.825
## 27 2.595 2.659 2.967 5.363 5.533 5.905 6.168 4.070 4.911 4.206 4.357 5.751
## 28 3.559 3.643 2.350 4.865 5.891 5.409 6.567 3.680 4.669 4.127 4.311 6.148
## 29 3.624 3.700 5.672 6.218 4.570 6.660 3.976 6.343 7.004 4.764 4.902 4.662
## 30 2.217 2.311 4.671 5.726 4.874 6.039 4.505 5.200 6.035 3.896 4.054 4.630
## 31 4.976 5.007 7.289 7.454 5.796 7.698 4.508 7.603 8.153 5.761 5.867 5.444
## 32 2.906 2.788 0.980 3.584 4.910 4.022 5.590 2.487 3.193 2.780 2.820 4.948
## 13 14 15 16 17 18 19 20 21 22 23 24
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14 0.417
## 15 2.619 2.446
## 16 2.682 2.517 0.296
## 17 2.383 2.330 1.064 0.908
## 18 5.918 6.052 7.797 7.846 7.435
## 19 6.561 6.688 8.393 8.423 7.970 1.924
## 20 6.285 6.420 8.201 8.253 7.828 0.576 1.780
## 21 4.347 4.368 6.148 6.216 5.970 3.180 3.965 3.439
## 22 1.153 1.225 2.837 2.955 2.868 6.190 6.921 6.581 4.562
## 23 0.943 0.963 2.899 2.991 2.818 5.851 6.451 6.219 4.192 0.783
## 24 2.263 2.317 2.815 2.741 2.266 6.950 7.103 7.297 5.564 2.778 2.481
## 25 1.265 1.456 2.570 2.656 2.304 6.115 6.795 6.486 4.686 1.194 1.177 2.453
## 26 5.643 5.747 7.527 7.580 7.224 0.944 1.836 1.263 2.877 5.853 5.495 6.579
## 27 5.610 5.748 7.343 7.363 6.937 3.220 2.993 3.411 4.653 5.870 5.489 6.007
## 28 5.959 6.132 7.831 7.878 7.466 2.391 2.862 2.592 4.245 6.143 5.921 6.648
## 29 4.652 4.741 5.291 5.236 4.817 6.617 6.484 6.912 6.711 5.008 4.728 3.631
## 30 4.555 4.672 5.805 5.807 5.455 5.466 5.486 5.773 5.835 5.008 4.831 4.505
## 31 5.465 5.526 5.604 5.536 5.217 8.181 8.187 8.480 8.061 6.018 5.940 4.457
## 32 4.829 4.881 6.404 6.430 6.110 2.194 2.613 2.601 2.753 5.151 4.754 5.510
## 25 26 27 28 29 30 31
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16
## 17
## 18
## 19
## 20
## 21
## 22
## 23
## 24
## 25
## 26 5.883
## 27 5.805 2.904
## 28 6.164 2.179 2.561
## 29 4.703 6.226 4.693 5.554
## 30 5.096 5.109 3.631 4.219 3.065
## 31 5.868 7.870 6.575 6.985 3.029 3.372
## 32 5.156 1.621 2.888 2.665 5.216 4.249 6.745
This gives the Euclidean distance between every pair of records.
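As a worked example (a small sketch using the objects above), the 0.408 reported between cars 1 and 2 can be reproduced directly from the standardized data.
#Euclidean distance between the first two (standardized) cars
sqrt(sum((z[1, ] - z[2, ])^2))
#the same value read from the distance object
as.matrix(distance)[1, 2]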
#by default, hclust uses the complete linkage method
hc.c <- hclust(distance)
#dendrogram with labels
plot(hc.c, labels=data$Model, cex=.5)
The cars Hornet 4 Drive and Valiant appear very close to each other in the dendrogram, so they are joined into one cluster early on.
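To make the grouping easier to see, clusters can be outlined directly on the dendrogram (a small illustrative sketch; the choice of three clusters here is only for display).
#draw boxes around three clusters on the complete-linkage dendrogram
rect.hclust(hc.c, k = 3, border = "red")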
#clustering with average linkage method
hc.a <- hclust(distance, method = "average")
plot(hc.a, hang=-1, cex=.5)
The cars in the 10th and 11th observations are very similar, so they are grouped into one cluster almost immediately.
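To check which cars those are and how close they really are (a quick check using the objects above; in the standard mtcars row order these are Merc 280 and Merc 280C):
#names of the 10th and 11th cars and their pairwise distance
data$Model[10:11]
as.matrix(distance)[10, 11]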
#membership based on complete linkage
member.c <- cutree(hc.c, 3)
#membership based on average linkage
member.a <- cutree(hc.a, 3)
#comparison table
table(member.c, member.a)
## member.a
## member.c 1 2 3
## 1 3 0 2
## 2 15 0 0
## 3 0 12 0
Because the cluster numbers assigned by the two methods are arbitrary, the table should be read row by row: most cars land in a single matching cluster under both methods, but the five cars in complete-linkage cluster 1 are split between average-linkage clusters 1 and 3.
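To see the assignment for each individual car (a small sketch using the membership vectors above; memb is just a throwaway name for illustration):
#side-by-side membership for each car under the two linkage methods
memb <- data.frame(Model = data$Model, complete = member.c, average = member.a)
head(memb)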
#calculate cluster means (on the standardized scale)
aggregate(z, list(member.c), mean)
## Group.1 mpg cyl disp hp drat
## 1 1 -0.2639188 0.3429602 -0.05907659 0.7600688 0.4478156
## 2 2 0.7570512 -0.9262258 -0.79938884 -0.8073490 0.5625263
## 3 3 -0.8363478 1.0148821 1.02385129 0.6924910 -0.8897477
## wt qsec vs am gear carb
## 1 -0.2210111 -1.2494801 -0.8680278 1.1899014 1.2367782 1.4781451
## 2 -0.6514165 0.7326758 0.9837648 0.2546807 0.3331960 -0.6268574
## 3 0.9063586 -0.3952280 -0.8680278 -0.8141431 -0.9318192 0.1676779
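These means are on the standardized scale, so they show how each cluster sits relative to the overall average. To get the means in the original units instead (a minimal sketch using the objects above):
#cluster means of the original, unscaled variables
aggregate(data[, -1], list(cluster = member.c), mean)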
#Silhouette plot
library(cluster)
plot(silhouette(cutree(hc.c,4), distance), cex=.5)
A silhouette width close to 1 means a car is much closer to the members of its own cluster than to the nearest other cluster, while values near zero or negative flag poorly assigned cars; the closer the members of a cluster are to each other, the better the cluster.
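One way to use this (a small optional sketch, not part of the original analysis) is to compare the average silhouette width for a few candidate numbers of clusters and prefer the larger value.
#average silhouette width for 2 to 5 clusters (complete linkage)
for (k in 2:5) {
  sil <- silhouette(cutree(hc.c, k), distance)
  cat(k, "clusters: average width =", round(mean(sil[, "sil_width"]), 3), "\n")
}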
#Screeplot
wss <- (nrow(z)-1)*sum(apply(z,2,var))  #within-group SS for a single cluster = total SS
for (i in 2:20) wss[i] <- sum(kmeans(z, centers=i)$withinss)  #total within-group SS for 2 to 20 clusters
plot(1:20, wss, type="b", xlab="No. of clusters", ylab="Within-group SS")
When we go from one cluster to two, the drop in the within-group sum of squares is very large. With five or more clusters the improvement is no longer significant, so the scree plot indicates that we should go for a lower number of clusters.
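Note that kmeans starts from random centroids, so the curve (and the clustering below) can vary between runs. An optional tweak, not in the original code, is to fix the seed and use several random starts for a more stable result.
#reproducible k-means with multiple random starts
set.seed(123)
kmeans(z, centers = 3, nstart = 25)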
kclust <- kmeans(z,3)
kclust
## K-means clustering with 3 clusters of sizes 7, 14, 11
##
## Cluster means:
## mpg cyl disp hp drat wt
## 1 0.1082193 -0.5849321 -0.4486701 -0.6496905 -0.04967936 -0.02346989
## 2 -0.8280518 1.0148821 0.9874085 0.9119628 -0.68691115 0.79918068
## 3 0.9850172 -0.9194387 -0.9711844 -0.7472405 0.90586470 -1.00220367
## qsec vs am gear carb
## 1 1.18548414 1.1160357 -0.8141431 -0.1573201 -0.4145882
## 2 -0.60248536 -0.8680278 -0.5278510 -0.5445697 0.4256439
## 3 0.01240056 0.3945581 1.1899014 0.7932015 -0.2778997
##
## Clustering vector:
## [1] 3 3 3 1 2 1 2 1 1 1 1 2 2 2 2 2 2 3 3 3 1 2 2 2 2 3 3 3 2 3 2 3
##
## Within cluster sum of squares by cluster:
## [1] 21.28798 64.93441 50.64064
## (between_SS / total_SS = 59.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
plot(mpg ~ disp, data, col=kclust$cluster)
Clustering is good when the between-cluster distance is high and the within-cluster distance is low; here between_SS / total_SS is 59.9%, so the three clusters account for about 60% of the total variation.
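That ratio from the k-means printout can also be pulled out directly (a quick check using the kclust object above).
#proportion of total variation explained by the clustering
kclust$betweenss / kclust$totss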