# k-means.R

# NOTE: the workspace is intentionally not cleared with rm(list = ls());
# wiping the caller's global environment is a destructive side effect and
# nothing below depends on it (every object used is assigned in this script).
data("iris") # Load the built-in iris data set
# Drop column 5 (the Species label) and standardise the four numeric
# measurements so that each one contributes equally to the distances.
df <- scale(iris[, -5])
# View the first 6 rows of the scaled data
head(df, 6)

##      Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,]   -0.8976739  1.01560199    -1.335752   -1.311052
## [2,]   -1.1392005 -0.13153881    -1.335752   -1.311052
## [3,]   -1.3807271  0.32731751    -1.392399   -1.311052
## [4,]   -1.5014904  0.09788935    -1.279104   -1.311052
## [5,]   -1.0184372  1.24503015    -1.335752   -1.311052
## [6,]   -0.5353840  1.93331463    -1.165809   -1.048667

dim(df) # rows x columns of the scaled matrix

## [1] 150   4

library(factoextra)

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

##################################################### 1. Determining Optimal Clusters
# Argument reference: help("fviz_nbclust", package = "factoextra").
# The interactive `?fviz_nbclust` call is not executed here because it would
# start the help server on every run of the script.

# Elbow method: plot the within-cluster sum of squares ("wss") obtained by
# kmeans over a range of cluster counts.
# Seed the RNG because kmeans chooses its initial centres at random.
set.seed(123)
# print() so the plot is also drawn when the script is run via source(),
# where top-level values are not auto-printed.
print(fviz_nbclust(df, FUN = kmeans, method = "wss"))

##################################
# Manual elbow curve: total within-cluster sum of squares for k = 1..10.
# Seed the RNG and use several random starts per k so the curve is
# reproducible and not distorted by a single poor initialisation.
set.seed(123)
wss <- c(
  # k = 1: a single cluster, so the within-cluster sum of squares is the
  # total sum of squares, (n - 1) * sum of the column variances.
  (nrow(df) - 1) * sum(apply(df, 2, var)),
  # k = 2..10: tot.withinss is the sum of the per-cluster withinss values.
  vapply(
    2:10,
    function(k) kmeans(df, centers = k, nstart = 25)$tot.withinss,
    numeric(1)
  )
)
plot(1:10, wss,
  type = "b", xlab = "Number of Clusters",
  ylab = "Within groups sum of squares"
)

#  geom_vline(xintercept = 5, linetype = 2)
#################################################### 2. k-means clustering
# Fix the RNG seed so the randomly chosen initial centres are reproducible.
set.seed(123)
# Partition the scaled data into k = 3 clusters (see ?kmeans for options).
km.res <- kmeans(
  df,
  centers = 3,
  algorithm = "Hartigan-Wong"
)
# Print the fit: cluster sizes, centres, assignments and sums of squares.
print(km.res)

## K-means clustering with 3 clusters of sizes 50, 53, 47
## 
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1  -1.01119138  0.85041372   -1.3006301  -1.2507035
## 2  -0.05005221 -0.88042696    0.3465767   0.2805873
## 3   1.13217737  0.08812645    0.9928284   1.0141287
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2 2 3 2 2 2
##  [75] 2 3 3 3 2 2 2 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3
## [112] 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 3 3 3 3 3 3 2 2 3 3 3 2 3 3 3 2 3 3 3 2 3
## [149] 3 2
## 
## Within cluster sum of squares by cluster:
## [1] 47.35062 44.08754 47.45019
##  (between_SS / total_SS =  76.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# Copy the original (unscaled) iris data and append each observation's
# cluster assignment as a new `cluster` column.
dd <- iris
dd[["cluster"]] <- km.res$cluster
# Show the first rows to check the new column.
head(dd, n = 6L)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species cluster
## 1          5.1         3.5          1.4         0.2  setosa       1
## 2          4.9         3.0          1.4         0.2  setosa       1
## 3          4.7         3.2          1.3         0.2  setosa       1
## 4          4.6         3.1          1.5         0.2  setosa       1
## 5          5.0         3.6          1.4         0.2  setosa       1
## 6          5.4         3.9          1.7         0.4  setosa       1

# Cross-tabulate the true species (rows) against the assigned cluster (columns).
table(dd[["Species"]], dd[["cluster"]])

##             
##               1  2  3
##   setosa     50  0  0
##   versicolor  0 39 11
##   virginica   0 14 36

# Scatter plot coloured by cluster. plot() on a numeric matrix uses only its
# first two columns (here Sepal.Length vs Sepal.Width), made explicit below.
plot(df[, 1:2], col = km.res$cluster)

############## Reference: https://uc-r.github.io/kmeans_clustering

# k-means.R

# liyix

# 2022-06-28