R语言做K均值聚类的一个简单小例子

小明的数据分析笔记本

2022-08-02

本文是公众号“小明的数据分析笔记本”2019年11月30日推文的示例数据和代码。

library(factoextra)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Read the wine data; the first line of the CSV is treated as column names.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved constant.
df <- read.csv("Wine.csv", header = TRUE)
# Standardise columns 2-14 (scale() centres and scales each column by
# default) before clustering.
winescale <- scale(df[, 2:14])
# Preview the first rows of the standardised matrix.
head(winescale)
##        Alcohol  Malic.acid        Ash        Acl         Mg   Phenols
## [1,] 1.5143408 -0.56066822  0.2313998 -1.1663032 1.90852151 0.8067217
## [2,] 0.2455968 -0.49800856 -0.8256672 -2.4838405 0.01809398 0.5670481
## [3,] 0.1963252  0.02117152  1.1062139 -0.2679823 0.08810981 0.8067217
## [4,] 1.6867914 -0.34583508  0.4865539 -0.8069748 0.92829983 2.4844372
## [5,] 0.2948684  0.22705328  1.8352256  0.4506745 1.27837900 0.8067217
## [6,] 1.4773871 -0.51591132  0.3043010 -1.2860793 0.85828399 1.5576991
##      Flavanoids Nonflavanoid.phenols    Proanth  Color.int        Hue        OD
## [1,]  1.0319081           -0.6577078  1.2214385  0.2510088  0.3611585 1.8427215
## [2,]  0.7315653           -0.8184106 -0.5431887 -0.2924962  0.4049085 1.1103172
## [3,]  1.2121137           -0.4970050  2.1299594  0.2682629  0.3174085 0.7863692
## [4,]  1.4623994           -0.9791134  1.0292513  1.1827317 -0.4263410 1.1807407
## [5,]  0.6614853            0.2261576  0.4002753 -0.3183774  0.3611585 0.4483365
## [6,]  1.3622851           -0.1755994  0.6623487  0.7298108  0.4049085 0.3356589
##          Proline
## [1,]  1.01015939
## [2,]  0.96252635
## [3,]  1.39122370
## [4,]  2.32800680
## [5,] -0.03776747
## [6,]  2.23274072
# Elbow plot: within-cluster sum of squares ("wss") for a range of cluster
# counts, with a dark-red vertical reference line drawn at k = 3.
fviz_nbclust(x = winescale, FUNcluster = kmeans, method = "wss") +
  geom_vline(xintercept = 3, linetype = 5, colour = "darkred")

# Fit k-means with 3 centres, keeping the best result out of 25 random
# starts, then print a summary of the fitted object.
winekmeans <- kmeans(winescale, centers = 3, nstart = 25)
print(winekmeans)
## K-means clustering with 3 clusters of sizes 51, 62, 65
## 
## Cluster means:
##      Alcohol Malic.acid        Ash        Acl          Mg     Phenols
## 1  0.1644436  0.8690954  0.1863726  0.5228924 -0.07526047 -0.97657548
## 2  0.8328826 -0.3029551  0.3636801 -0.6084749  0.57596208  0.88274724
## 3 -0.9234669 -0.3929331 -0.4931257  0.1701220 -0.49032869 -0.07576891
##    Flavanoids Nonflavanoid.phenols     Proanth  Color.int        Hue         OD
## 1 -1.21182921           0.72402116 -0.77751312  0.9388902 -1.1615122 -1.2887761
## 2  0.97506900          -0.56050853  0.57865427  0.1705823  0.4726504  0.7770551
## 3  0.02075402          -0.03343924  0.05810161 -0.8993770  0.4605046  0.2700025
##      Proline
## 1 -0.4059428
## 2  1.1220202
## 3 -0.7517257
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 1 3 3 3 3 3 3 3 3 3 3 3 2
##  [75] 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [112] 3 3 3 3 3 3 3 1 3 3 2 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 326.3537 385.6983 558.6971
##  (between_SS / total_SS =  44.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Cluster centres: one row per cluster, one column per standardised variable.
winekmeans[["centers"]]
##      Alcohol Malic.acid        Ash        Acl          Mg     Phenols
## 1  0.1644436  0.8690954  0.1863726  0.5228924 -0.07526047 -0.97657548
## 2  0.8328826 -0.3029551  0.3636801 -0.6084749  0.57596208  0.88274724
## 3 -0.9234669 -0.3929331 -0.4931257  0.1701220 -0.49032869 -0.07576891
##    Flavanoids Nonflavanoid.phenols     Proanth  Color.int        Hue         OD
## 1 -1.21182921           0.72402116 -0.77751312  0.9388902 -1.1615122 -1.2887761
## 2  0.97506900          -0.56050853  0.57865427  0.1705823  0.4726504  0.7770551
## 3  0.02075402          -0.03343924  0.05810161 -0.8993770  0.4605046  0.2700025
##      Proline
## 1 -0.4059428
## 2  1.1220202
## 3 -0.7517257
# Number of observations assigned to each cluster.
winekmeans[["size"]]
## [1] 51 62 65
# Visualise the k-means result: observations drawn as points, a "norm"
# ellipse around each cluster, the "jco" colour palette, an empty title
# and the minimal ggplot2 theme.
fviz_cluster(
  object = winekmeans,
  data = winescale,
  geom = "point",
  ellipse.type = "norm",
  palette = "jco",
  main = "",
  ggtheme = theme_minimal()
)

欢迎大家关注我的公众号

小明的数据分析笔记本