实验二 聚类分析

熟悉K-means聚类、K-medoids聚类、层次聚类以及基于密度的聚类方法;对iris数据集进行聚类分析,比较聚类结果的差异。

## 设置显示方式
knitr::opts_chunk$set(echo = TRUE,message = FALSE,warning = FALSE,
                      fig.width = 9.5,fig.height = 6)
rm(list = ls());gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 363955 19.5     592000 31.7   460000 24.6
## Vcells 552169  4.3    1023718  7.9   786371  6.0
## 准备工作 ,加载包
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(GGally)
library(gridExtra)
library(cluster)
library(ggfortify)


theme_set(theme_bw(base_family = "STKaiti"))

数据准备工作

iris <- iris
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
## 对数据可视化,矩阵散点图

GGally::ggscatmat(iris,columns = 1:4,color = "Species")

## 生成聚类使用的数据
iris_cl <- iris[,1:4]
str(iris_cl)
## 'data.frame':    150 obs. of  4 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...

该数据集为iris数据集,是三种鸢尾花的4种测量指标。从散点矩阵图中可以观察4种指标之间的相关性和分布。接下来对我们使用鸢尾花数据进行聚类分析,学习一些聚类算法。

K-means聚类

选择聚类的数目

## Select number of clusters
## 探索由聚类数目k的变化
## 计算聚类的残差平方和
set.seed(123)

k1 <- kmeans(iris_cl,2)
## Total within-cluster sum of squares 族内平房和的和
k1$tot.withinss
## [1] 152.348
# The between-cluster sum of squares 组间平房和
k1$betweenss
## [1] 529.0226
## 计算组内平方和  组间平房和
tot_withinss <- vector()
betweenss <- vector()
for(ii in 1:15){
  k1 <- kmeans(iris_cl,ii)
  tot_withinss[ii] <- k1$tot.withinss
  betweenss[ii] <- k1$betweenss
}

kmeanvalue <- data.frame(kk = 1:15,
                         tot_withinss = tot_withinss,
                         betweenss = betweenss)


p1 <- ggplot(kmeanvalue,aes(x = kk,y = tot_withinss))+
  geom_point() +
  geom_line() +
  labs(x = "kmean 聚类个数",y = "value") +
  ggtitle("Total within-cluster sum of squares")+
  theme(plot.title = element_text(hjust = 0.5))

p1

p2 <- ggplot(kmeanvalue,aes(x = kk,y = betweenss))+
  geom_point() +
  geom_line() +
  labs(x = "kmean 聚类个数",y = "value") +
  ggtitle("The between-cluster sum of squares") +
  theme(plot.title = element_text(hjust = 0.5))

p2

grid.arrange(p1,p2,nrow=2)

我们可以看到随着聚类个数的增加组间内平方和的总和在减少,组间平房和的总和在增加,根据图和数据的背景我们将数据聚类为三类。

kmean聚类聚为3类

k3 <- kmeans(iris_cl,3)
k3
## K-means clustering with 3 clusters of sizes 21, 33, 96
## 
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     4.738095    2.904762     1.790476   0.3523810
## 2     5.175758    3.624242     1.472727   0.2727273
## 3     6.314583    2.895833     4.973958   1.7031250
## 
## Clustering vector:
##   [1] 2 1 1 1 2 2 2 2 1 1 2 2 1 1 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 1 1 2 2 2 1
##  [36] 2 2 2 1 2 2 1 1 2 2 1 2 1 2 2 3 3 3 3 3 3 3 1 3 3 1 3 3 3 3 3 3 3 3 3
##  [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 1 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [141] 3 3 3 3 3 3 3 3 3 3
## 
## Within cluster sum of squares by cluster:
## [1]  17.669524   6.432121 118.651875
##  (between_SS / total_SS =  79.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
table(k3$cluster)
## 
##  1  2  3 
## 21 33 96
## 对聚类结果可视化
clusplot(iris_cl,k3$cluster,main = "kmean cluster number=3")

## 表示聚类效果
si1 <- silhouette(k3$cluster,dist(iris_cl,method = "euclidean"))

plot(si1,main = "kmean silhouette",col = "red")

kmeans 聚类为3类时每类的数量分别为96,33,21。可以看出有一类数目很多。上面的数据并没有对数据标准化,我们对数据标准化后在进行聚类分析比较结果。

数据标准化后聚类

iris_scale <- scale(iris_cl)
ks3 <- kmeans(iris_scale,3)
ks3
## K-means clustering with 3 clusters of sizes 96, 33, 21
## 
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1    0.5690971  -0.3705265    0.6888118   0.6609378
## 2   -0.8135055   1.3145538   -1.2825372  -1.2156393
## 3   -1.3232208  -0.3718921   -1.1334386  -1.1111395
## 
## Clustering vector:
##   [1] 2 3 3 3 2 2 2 2 3 3 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 3 3 2 2 2 3
##  [36] 3 2 2 3 2 2 3 3 2 2 3 2 3 2 2 1 1 1 1 1 1 1 3 1 1 3 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 3 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 149.25899  17.33362  23.15862
##  (between_SS / total_SS =  68.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
table(ks3$cluster)
## 
##  1  2  3 
## 96 33 21
## 对聚类结果可视化
clusplot(iris_scale,ks3$cluster,main = "data scale kmean cluster number=3")

## 表示聚类效果
sis1 <- silhouette(ks3$cluster,dist(iris_scale,method = "euclidean"))

plot(sis1,main = "data scale kmean silhouette",col = "red")

我们可以看出数据标准化后进行kmeans聚类分析,的到的聚类结果更好

K-medoids聚类

pam3 <- cluster::pam(iris_cl,3)
pam3
## Medoids:
##       ID Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,]   8          5.0         3.4          1.5         0.2
## [2,]  79          6.0         2.9          4.5         1.5
## [3,] 113          6.8         3.0          5.5         2.1
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
## Objective function:
##     build      swap 
## 0.6709391 0.6542077 
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"
plot(pam3)

autoplot(pam3, frame = TRUE) +
  ggtitle("K-medoids聚类") +
  theme(plot.title = element_text(hjust = 0.5))

autoplot(pam3,frame = TRUE,frame.type = "norm")

K-medoids聚类的结果中每类的样本数量分别为50,62,38,虽然没有把最后两类很好的却分开来,但是从聚类的散点图赏和轮廓图可以看出聚类的效果很好,并且受到极端值的影响比较小。

层次聚类

hc1 <- hclust(dist(iris_scale),method = "ward.D2")
hc1$labels <- paste(iris$Species,1:150,sep = "-")
##  可视化结果
par(family = "STKaiti",cex = 0.45)
plot(hc1)
rect.hclust(hc1, k=3, border="red") 

## 分组情况
group <- cutree(hc1 , k=3)    
group 
##       setosa-1       setosa-2       setosa-3       setosa-4       setosa-5 
##              1              1              1              1              1 
##       setosa-6       setosa-7       setosa-8       setosa-9      setosa-10 
##              1              1              1              1              1 
##      setosa-11      setosa-12      setosa-13      setosa-14      setosa-15 
##              1              1              1              1              1 
##      setosa-16      setosa-17      setosa-18      setosa-19      setosa-20 
##              1              1              1              1              1 
##      setosa-21      setosa-22      setosa-23      setosa-24      setosa-25 
##              1              1              1              1              1 
##      setosa-26      setosa-27      setosa-28      setosa-29      setosa-30 
##              1              1              1              1              1 
##      setosa-31      setosa-32      setosa-33      setosa-34      setosa-35 
##              1              1              1              1              1 
##      setosa-36      setosa-37      setosa-38      setosa-39      setosa-40 
##              1              1              1              1              1 
##      setosa-41      setosa-42      setosa-43      setosa-44      setosa-45 
##              1              2              1              1              1 
##      setosa-46      setosa-47      setosa-48      setosa-49      setosa-50 
##              1              1              1              1              1 
##  versicolor-51  versicolor-52  versicolor-53  versicolor-54  versicolor-55 
##              3              3              3              2              3 
##  versicolor-56  versicolor-57  versicolor-58  versicolor-59  versicolor-60 
##              2              3              2              3              2 
##  versicolor-61  versicolor-62  versicolor-63  versicolor-64  versicolor-65 
##              2              3              2              3              2 
##  versicolor-66  versicolor-67  versicolor-68  versicolor-69  versicolor-70 
##              3              2              2              2              2 
##  versicolor-71  versicolor-72  versicolor-73  versicolor-74  versicolor-75 
##              3              3              3              3              3 
##  versicolor-76  versicolor-77  versicolor-78  versicolor-79  versicolor-80 
##              3              3              3              3              2 
##  versicolor-81  versicolor-82  versicolor-83  versicolor-84  versicolor-85 
##              2              2              2              3              2 
##  versicolor-86  versicolor-87  versicolor-88  versicolor-89  versicolor-90 
##              3              3              2              2              2 
##  versicolor-91  versicolor-92  versicolor-93  versicolor-94  versicolor-95 
##              2              3              2              2              2 
##  versicolor-96  versicolor-97  versicolor-98  versicolor-99 versicolor-100 
##              2              2              3              2              2 
##  virginica-101  virginica-102  virginica-103  virginica-104  virginica-105 
##              3              3              3              3              3 
##  virginica-106  virginica-107  virginica-108  virginica-109  virginica-110 
##              3              2              3              3              3 
##  virginica-111  virginica-112  virginica-113  virginica-114  virginica-115 
##              3              3              3              3              3 
##  virginica-116  virginica-117  virginica-118  virginica-119  virginica-120 
##              3              3              3              3              2 
##  virginica-121  virginica-122  virginica-123  virginica-124  virginica-125 
##              3              3              3              3              3 
##  virginica-126  virginica-127  virginica-128  virginica-129  virginica-130 
##              3              3              3              3              3 
##  virginica-131  virginica-132  virginica-133  virginica-134  virginica-135 
##              3              3              3              3              3 
##  virginica-136  virginica-137  virginica-138  virginica-139  virginica-140 
##              3              3              3              3              3 
##  virginica-141  virginica-142  virginica-143  virginica-144  virginica-145 
##              3              3              3              3              3 
##  virginica-146  virginica-147  virginica-148  virginica-149  virginica-150 
##              3              3              3              3              3

基于密度的聚类方法

基于密度聚类的优良特性,它可以对抗噪声,能处理任意形状和大小的簇,这样可以发现K均值不能发现的簇。但是对于高维数据,点之间极为稀疏,密度就很难定义了。

library(fpc)
## 简单查看一下DBSCAN算法的性质
# 生成数据
x1 <- seq(0,pi,length.out=100)
y1 <- sin(x1) + 0.1*rnorm(100)
x2 <- 1.5+ seq(0,pi,length.out=100)
y2 <- cos(x2) + 0.1*rnorm(100)
data <- data.frame(c(x1,x2),c(y1,y2))
names(data) <- c('x','y')
 

# 用fpc包中的dbscan函数进行密度聚类
model1 <- dbscan(data,eps=0.3,MinPts=4)
## 聚类结果
model1$cluster
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
## [106] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [176] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 没有点杯识别为噪声
model1
## dbscan Pts=200 MinPts=4 eps=0.3
##         1   2
## seed  100 100
## total 100 100
ggplot(data,aes(x = x,y=y))+
  geom_point(size=2.5, aes(shape=factor(model1$cluster)))+
  theme(legend.position ="bottom",plot.title = element_text(hjust = 0.5)) +
  ggtitle("iris data基于密度的聚类") 

可以看出基于密度的聚类算法将数据识别为了3类,border(边界)上的点单独识别为了一类,类标号为0。其余的数据识别为了两类,从图上看出这样的聚类结果时很好的。

iris data基于密度的聚类方法

为了便于在二维空间展示聚类结果,先对数据降维,然后聚类

iris_2d <- princomp(iris_cl)
summary(iris_2d)
## Importance of components:
##                           Comp.1     Comp.2     Comp.3      Comp.4
## Standard deviation     2.0494032 0.49097143 0.27872586 0.153870700
## Proportion of Variance 0.9246187 0.05306648 0.01710261 0.005212184
## Cumulative Proportion  0.9246187 0.97768521 0.99478782 1.000000000
## 前两维的数据累积贡献率达到97%,使用前两维进行聚类分析

irisnew <- as.data.frame(iris_2d$scores[,1:2])
summary(irisnew)
##      Comp.1            Comp.2        
##  Min.   :-3.2238   Min.   :-1.37417  
##  1st Qu.:-2.5303   1st Qu.:-0.32492  
##  Median : 0.5546   Median : 0.02216  
##  Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 1.5501   3rd Qu.: 0.32542  
##  Max.   : 3.7956   Max.   : 1.26597
## 基于密度的聚类方法

modl2 <- dbscan(irisnew,eps = 0.6,MinPts = 5)
modl2
## dbscan Pts=150 MinPts=5 eps=0.6
##        0  1  2
## border 6  1  2
## seed   0 49 92
## total  6 50 94
ggplot(irisnew,aes(x = Comp.1,y = Comp.2)) +
  geom_point(aes(shape = factor(modl2$cluster))) +
  theme(legend.position ="bottom",plot.title = element_text(hjust = 0.5)) +
  ggtitle("iris data基于密度的聚类:eps = 0.6,MinPts = 5") 

modl3 <- dbscan(irisnew,eps = 1,MinPts = 5)
modl3
## dbscan Pts=150 MinPts=5 eps=1
##        1   2
## seed  50 100
## total 50 100
ggplot(irisnew,aes(x = Comp.1,y = Comp.2)) +
  geom_point(aes(shape = factor(modl3$cluster))) +
  theme(legend.position ="bottom",plot.title = element_text(hjust = 0.5)) +
  ggtitle("iris data基于密度的聚类:eps = 1,MinPts = 5") 

从密度聚类的结果可以看出,在eps = 0.6,MinPts = 5时:有6个数据被当作了边界点单独居委了一类,这些数据也可以当做噪声点,因为它们离群体比较远。

当eps = 1,MinPts = 5时,数据被聚类为了2类,样本量分别维50,100