set.seed(7)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)
# create index for the data partition
inTrain <- createDataPartition(y = iris$Species, p = 0.70, list = FALSE)
# training & testing data frames
training <- iris[inTrain, ]
testing <- iris[-inTrain, ]
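As a quick check (not part of the original analysis), createDataPartition() stratifies the split by the outcome, so each Species should contribute roughly 70% of its rows (35 of 50) to the training set and the remainder to the testing set:
# stratification check: expect about 35 training and 15 testing rows per Species
table(training$Species)
table(testing$Species)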
# scatterplot matrix of the training data, coloured by Species (column 5)
pairs(training, col=training[,5], pch=20)
library(GGally)
ggpairs(training, aes(colour=Species), title="Iris Variable Relationships") +
# enlarge and bold the plot title
theme(plot.title = element_text(size=15, face='bold'))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
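The stat_bin() messages above come from the faceted histogram panels that pair Species with each numeric variable and are informational only. If desired, a binwidth could be supplied to those panels, for example via GGally's wrap() helper (a hedged sketch assuming the default "facethist" combo panels):
ggpairs(training, aes(colour=Species), title="Iris Variable Relationships",
        lower=list(combo=wrap("facethist", binwidth=0.25)))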
For “Petal.Length” and “Petal.Width”, the “Species” are clearly separated into 3 groups with relatively little overlap.
So we focus on various combinations of these two predictors (“Petal.Length” and “Petal.Width”) with 3 clusters.
# k-means (k = 3) on each petal measurement separately
km1 = kmeans(training$Petal.Length, 3, nstart=100)
km2 = kmeans(training$Petal.Width, 3, nstart=100)
# note: c() concatenates the two columns into a single vector of 210 values
km3 = kmeans(c(training$Petal.Length, training$Petal.Width), 3, nstart=100)
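Because c() stacks the two columns, km3 clusters the pooled one-dimensional values rather than the (Petal.Length, Petal.Width) pairs. A minimal sketch of the two-column alternative (km.both is a hypothetical name, not part of the original analysis):
# cluster the two petal measurements jointly as a two-column matrix
km.both <- kmeans(cbind(training$Petal.Length, training$Petal.Width), 3, nstart=100)
table(training$Species, km.both$cluster)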
print("Total within groups sum of squares (tot.withinss) of K-Means clusting with Petal.Length")
## [1] "Total within groups sum of squares (tot.withinss) of K-Means clusting with Petal.Length"
km1$tot.withinss
## [1] 16.13926
print("Total within groups sum of squares (tot.withinss) of K-Means clusting with Petal.Width")
## [1] "Total within groups sum of squares (tot.withinss) of K-Means clusting with Petal.Width"
km2$tot.withinss
## [1] 3.788665
print("Total within groups sum of squares (tot.withinss) of K-Means clusting with Petal.Length & Petal.Width")
## [1] "Total within groups sum of squares (tot.withinss) of K-Means clusting with Petal.Length & Petal.Width"
km3$tot.withinss
## [1] 58.52404
print("K-Means clusting with Petal.Length")
## [1] "K-Means clusting with Petal.Length"
km1
## K-means clustering with 3 clusters of sizes 37, 33, 35
##
## Cluster means:
## [,1]
## 1 4.278378
## 2 5.596970
## 3 1.445714
##
## Clustering vector:
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1
## [71] 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 7.3627027 7.9296970 0.8468571
## (between_SS / total_SS = 95.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
print("K-Means clusting with Petal.Width")
## [1] "K-Means clusting with Petal.Width"
km2
## K-means clustering with 3 clusters of sizes 35, 39, 31
##
## Cluster means:
## [,1]
## 1 0.240000
## 2 1.353846
## 3 2.067742
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 3 3 3 3 2 3 3 3 3 3 3 3 3 3 2 3 3 3 3 2 3 3 2 2 3 3 3 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 0.224000 1.576923 1.987742
## (between_SS / total_SS = 93.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
print("K-Means clusting with Petal.Length & Petal.Width")
## [1] "K-Means clusting with Petal.Length & Petal.Width"
km3
## K-means clustering with 3 clusters of sizes 106, 35, 69
##
## Cluster means:
## [,1]
## 1 1.608491
## 2 0.240000
## 3 4.927536
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1
## [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [106] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 16.34236 0.22400 41.95768
## (between_SS / total_SS = 92.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
"K-Means clusting with Petal.Length"
## [1] "K-Means clusting with Petal.Length"
table(training[,5], km1$cluster)
##
## 1 2 3
## setosa 0 0 35
## versicolor 33 2 0
## virginica 4 31 0
"K-Means clusting with Petal.Width"
## [1] "K-Means clusting with Petal.Width"
table(training[,5], km2$cluster)
##
## 1 2 3
## setosa 35 0 0
## versicolor 0 34 1
## virginica 0 5 30
In the training data, the single-predictor “Petal.Length” model appears to be the best choice: it has the highest between_SS / total_SS ratio (95.0%) and the second-lowest total within-groups sum of squares (tot.withinss = 16.13926).
The next best model is the single-predictor “Petal.Width” model, with a between_SS / total_SS ratio of 93.7% and the lowest tot.withinss (3.788665).
(The cluster means combine to give the centroids (centres) of the clusters in the space defined by the input variables: the set of means shown for cluster 1 are the coordinates of that cluster's centroid, computed as the mean of each variable over the samples assigned to that cluster.
The between_SS / total_SS figure of 95% is the proportion of the total variance in the data that is explained by the clustering: k-means minimises the within-group dispersion (sum of squares) of the samples, which in turn maximises the between-group dispersion, so assigning the samples to k clusters rather than n (one per sample) clusters still accounts for 95% of the total sum of squares.)
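For reference, the between_SS / total_SS ratio reported in the output can be recovered directly from the components of the kmeans objects (a worked check, not part of the original output):
# proportion of total variance explained by each clustering
km1$betweenss / km1$totss  # about 0.950 for the Petal.Length model
km2$betweenss / km2$totss  # about 0.937 for the Petal.Width model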
test.km1 = kmeans(testing$Petal.Length, 3, nstart=100)
test.km2 = kmeans(testing$Petal.Width, 3, nstart=100)
print("Total within groups sum of squares (tot.withinss) of K-Means clusting with Petal.Length")
## [1] "Total within groups sum of squares (tot.withinss) of K-Means clusting with Petal.Length"
test.km1$tot.withinss
## [1] 8.213937
print("Total within groups sum of squares (tot.withinss) of K-Means clusting with Petal.Width")
## [1] "Total within groups sum of squares (tot.withinss) of K-Means clusting with Petal.Width"
test.km2$tot.withinss
## [1] 1.082667
print("K-Means clusting with Petal.Length")
## [1] "K-Means clusting with Petal.Length"
test.km1
## K-means clustering with 3 clusters of sizes 17, 13, 15
##
## Cluster means:
## [,1]
## 1 4.317647
## 2 5.707692
## 3 1.500000
##
## Clustering vector:
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
## [36] 2 1 1 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 4.004706 3.609231 0.600000
## (between_SS / total_SS = 94.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
print("K-Means clusting with Petal.Width")
## [1] "K-Means clusting with Petal.Width"
test.km2
## K-means clustering with 3 clusters of sizes 15, 15, 15
##
## Cluster means:
## [,1]
## 1 0.260000
## 2 2.086667
## 3 1.293333
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 0.3160000 0.4173333 0.3493333
## (between_SS / total_SS = 95.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
print("K-Means clusting with Petal.Length")
## [1] "K-Means clusting with Petal.Length"
table(testing[,5], test.km1$cluster)
##
## 1 2 3
## setosa 0 0 15
## versicolor 15 0 0
## virginica 2 13 0
print("K-Means clusting with Petal.Width")
## [1] "K-Means clusting with Petal.Width"
table(testing[,5], test.km2$cluster)
##
## 1 2 3
## setosa 15 0 0
## versicolor 0 0 15
## virginica 0 15 0
In the testing data, “Petal.Width” appears to perform slightly better than “Petal.Length”.
The between_SS / total_SS ratio is 94.1% for “Petal.Length” and 95.9% for “Petal.Width”.
The single-predictor “Petal.Width” model has the higher between_SS / total_SS ratio (95.9%) and the lower total within-groups sum of squares (tot.withinss = 1.082667).
The single-predictor “Petal.Length” model has the lower ratio (94.1%) and the higher tot.withinss (8.213937).
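To compare the cluster/Species tables above on a single scale, each cluster could be labelled with its majority species and the resulting agreement computed (cluster_accuracy is a hypothetical helper, not part of the original analysis):
# agreement when each cluster is mapped to its most frequent Species
cluster_accuracy <- function(species, cluster) {
  tab <- table(species, cluster)
  sum(apply(tab, 2, max)) / sum(tab)
}
cluster_accuracy(testing[,5], test.km1$cluster)  # Petal.Length clusters
cluster_accuracy(testing[,5], test.km2$cluster)  # Petal.Width clusters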