rm(list = ls())
data("iris") # Loading the data set
df <- scale(iris[,-5]) # Scaling the data
# View the firt 3 rows of the data
head(df,6)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] -0.8976739 1.01560199 -1.335752 -1.311052
## [2,] -1.1392005 -0.13153881 -1.335752 -1.311052
## [3,] -1.3807271 0.32731751 -1.392399 -1.311052
## [4,] -1.5014904 0.09788935 -1.279104 -1.311052
## [5,] -1.0184372 1.24503015 -1.335752 -1.311052
## [6,] -0.5353840 1.93331463 -1.165809 -1.048667
dim(df) #[1] 50 4
## [1] 150 4
library(factoextra)
## 载入需要的程辑包:ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#####################################################1__Determining Optimal Clusters
?fviz_nbclust
## starting httpd help server ...
## done
fviz_nbclust(df, FUN = kmeans, method = "wss") #+

##################################
wss <- (nrow(df)-1)*sum(apply(df,2,var))
for (i in 2:10) wss[i] <- sum(kmeans(df,centers=i)$withinss)
plot(1:10, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")

# geom_vline(xintercept = 5, linetype = 2)
##Computing k-means clustering
# Compute k-means with k = 4
set.seed(123)
#?kmeans
####################################################2____kmeans
km.res <- kmeans(df, centers = 3, algorithm = c("Hartigan-Wong"))
# Print the results
print(km.res)
## K-means clustering with 3 clusters of sizes 50, 53, 47
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 -1.01119138 0.85041372 -1.3006301 -1.2507035
## 2 -0.05005221 -0.88042696 0.3465767 0.2805873
## 3 1.13217737 0.08812645 0.9928284 1.0141287
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2 2 3 2 2 2
## [75] 2 3 3 3 2 2 2 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3
## [112] 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 3 3 3 3 3 3 2 2 3 3 3 2 3 3 3 2 3 3 3 2 3
## [149] 3 2
##
## Within cluster sum of squares by cluster:
## [1] 47.35062 44.08754 47.45019
## (between_SS / total_SS = 76.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
dd <- cbind(iris, cluster = km.res$cluster)
head(dd)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species cluster
## 1 5.1 3.5 1.4 0.2 setosa 1
## 2 4.9 3.0 1.4 0.2 setosa 1
## 3 4.7 3.2 1.3 0.2 setosa 1
## 4 4.6 3.1 1.5 0.2 setosa 1
## 5 5.0 3.6 1.4 0.2 setosa 1
## 6 5.4 3.9 1.7 0.4 setosa 1
table(dd$Species, dd$cluster)
##
## 1 2 3
## setosa 50 0 0
## versicolor 0 39 11
## virginica 0 14 36
plot(df, col = km.res$cluster)

##############ref https://uc-r.github.io/kmeans_clustering