# Load the mall-customers data set and preview the raw table.
df <- read.csv(
  "G:\\RStudio\\udemy\\ml\\Machine Learning AZ\\Part 4 - Clustering\\Section 24 - K-Means Clustering\\K_Means\\Mall_Customers.csv"
)
head(df)

# Keep only columns 4 and 5 as clustering features, then preview the
# reduced table.
df <- df[, c(4L, 5L)]
head(df)
# Using the elbow method to find the optimal number of clusters.
# For each candidate k in 1..10, fit k-means on df and record the total
# within-cluster sum of squares (WCSS); the "elbow" of the resulting curve
# suggests a reasonable k.
set.seed(6)
# Preallocate a numeric vector instead of growing an untyped one in the loop.
wcss <- numeric(10)
for (i in 1:10) {
  # Sum the per-cluster within-SS values to get the total WCSS for k = i.
  wcss[i] <- sum(kmeans(df, i)$withinss)
}
plot(
  1:10, wcss,
  type = "b",
  main = "clusters of clients",
  xlab = "Number of clusters",
  ylab = "WCSS"
)
NA
# So, based on the elbow method, the optimal number of clusters is 5.
# Fit the final k-means model with 5 clusters. The seed is fixed so the
# random starts are reproducible; nstart = 10 tries ten random
# initialisations, and each run is capped at 300 iterations.
# Note: the fitted object is stored under the name `kmeans`, which is also
# the name of the fitting function; later code reads `kmeans$cluster`.
set.seed(29)
kmeans <- kmeans(
  x = df,
  centers = 5,
  iter.max = 300,
  nstart = 10
)
# Show the fit summary (sizes, centers, assignments, within-cluster SS).
kmeans
K-means clustering with 5 clusters of sizes 22, 81, 23, 39, 35
Cluster means:
Annual.Income..k.. Spending.Score..1.100.
1 25.72727 79.36364
2 55.29630 49.51852
3 26.30435 20.91304
4 86.53846 82.12821
5 88.20000 17.11429
Clustering vector:
[1] 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 2 3 1 2 2 2 2 2 2 2
[54] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[107] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 5 4 2 4 5 4 5 4 2 4 5 4 5 4 5 4 5 4 2 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5
[160] 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4
Within cluster sum of squares by cluster:
[1] 3519.455 9875.111 5098.696 13444.051 12511.143
(between_SS / total_SS = 83.5 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
[8] "iter" "ifault"
# Visualise the fitted clusters with cluster::clusplot().
library(cluster)
# NOTE(review): clusplot() may draw the data on principal-component axes
# rather than the raw columns, in which case the axis labels below are only
# approximate -- confirm against the clusplot documentation.
clusplot(
  df,
  kmeans$cluster,     # cluster assignment for each row of df
  lines = 0,          # per clusplot docs: no lines drawn between ellipses
  shade = TRUE,       # shade the cluster ellipses
  color = TRUE,       # colour the cluster ellipses
  labels = 2,         # per clusplot docs: label all points and ellipses
  plotchar = FALSE,   # use the same plotting symbol for every cluster
  span = TRUE,        # spanning ellipse around each cluster's points
  main = "Clusters of clients",   # fixed typo: was "cleints"
  xlab = "Annual Income",
  ylab = "Spending Score"
)