聚类分析适用于不同类型的数据,只是针对不同的数据类型所采用的具体方法不同。其目的是按照一定的规则把分类对象分成若干类;分类的依据是对象之间的相异度(如距离)或相似度(如相关系数)。
聚类分析类型:Q型聚类(对样品聚类)和R型聚类(对变量聚类)。
聚类分析方法:系统聚类法(层次聚类,见下文的 hclust)和快速聚类法(k-means,见下文的 kmeans)。
# ---- Toy example: five observations measured on two variables ----
x1 <- c(5, 7, 3, 6, 6)
x2 <- c(7, 1, 2, 5, 6)

# Scatter plot of the points, each labelled with its observation number.
plot(x1, x2)
text(x1, x2, labels = 1:5, adj = -0.5)

# Bind the two variables column-wise into a 5 x 2 data matrix.
X <- cbind(x1, x2)

# ---- Distance matrices ----
dist(X) # Euclidean distance is the default
##          1        2        3        4
## 2 6.324555
## 3 5.385165 4.123106
## 4 2.236068 4.123106 4.242641
## 5 1.414214 5.099020 5.000000 1.000000

dist(X, diag = TRUE) # also print the main diagonal
##          1        2        3        4        5
## 1 0.000000
## 2 6.324555 0.000000
## 3 5.385165 4.123106 0.000000
## 4 2.236068 4.123106 4.242641 0.000000
## 5 1.414214 5.099020 5.000000 1.000000 0.000000

dist(X, method = "manhattan") # Manhattan distance
##   1 2 3 4
## 2 8
## 3 7 5
## 4 3 5 6
## 5 2 6 7 1

dist(X, method = "minkowski", p = 1) # Minkowski with p = 1 is Manhattan
##   1 2 3 4
## 2 8
## 3 7 5
## 4 3 5 6
## 5 2 6 7 1

dist(X, upper = TRUE) # also print the upper triangle
##          1        2        3        4        5
## 1          6.324555 5.385165 2.236068 1.414214
## 2 6.324555          4.123106 4.123106 5.099020
## 3 5.385165 4.123106          4.242641 5.000000
## 4 2.236068 4.123106 4.242641          1.000000
## 5 1.414214 5.099020 5.000000 1.000000

dist(X, method = "minkowski", p = 2) # Minkowski with p = 2 is Euclidean
##          1        2        3        4
## 2 6.324555
## 3 5.385165 4.123106
## 4 2.236068 4.123106 4.242641
## 5 1.414214 5.099020 5.000000 1.000000

# ---- Hierarchical clustering: single linkage (nearest neighbour) ----
hc <- hclust(dist(X), method = "single")
# Merge history: columns 1-2 are the merged items (negative = original
# observation, positive = cluster formed at that earlier step); column 3 is
# the height at which the merge happens.
cbind(hc$merge, hc$height)
##      [,1] [,2]     [,3]
## [1,]   -4   -5 1.000000
## [2,]   -1    1 1.414214
## [3,]   -2    2 4.123106
## [4,]   -3    3 4.123106
plot(hc) # dendrogram

# ---- Hierarchical clustering: Ward's method ("ward.D") ----
hc <- hclust(dist(X), method = "ward.D")
cbind(hc$merge, hc$height) # merge history, same layout as above
##      [,1] [,2]     [,3]
## [1,]   -4   -5 1.000000
## [2,]   -1    1 2.100188
## [3,]   -2   -3 4.123106
## [4,]    2    3 8.355856
plot(hc) # dendrogram
# ---- Hierarchical clustering of the d7.2 data set with mvstats::H.clust ----
# NOTE(review): this absolute path is specific to one machine; the script only
# runs where this directory exists and contains "d7.2.txt". Consider replacing
# it with a project-relative path.
setwd("C:/Users/lenovo/Desktop")
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
d7.2 <- read.table("d7.2.txt", header = TRUE)
plot(d7.2) # quick visual overview of the data
library(mvstats)

# Each call below uses Euclidean distance with a different linkage method and
# draws the resulting dendrogram (plot = TRUE); the returned object is
# auto-printed, giving the summaries recorded after each call.
H.clust(d7.2, "euclidean", "single", plot = TRUE) # single linkage
##
## Call:
## hclust(d = D, method = m)
##
## Cluster method : single
## Distance : euclidean
## Number of objects: 31
H.clust(d7.2, "euclidean", "complete", plot = TRUE) # complete linkage
##
## Call:
## hclust(d = D, method = m)
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 31
H.clust(d7.2, "euclidean", "median", plot = TRUE) # median method
##
## Call:
## hclust(d = D, method = m)
##
## Cluster method : median
## Distance : euclidean
## Number of objects: 31
H.clust(d7.2, "euclidean", "average", plot = TRUE) # average linkage
##
## Call:
## hclust(d = D, method = m)
##
## Cluster method : average
## Distance : euclidean
## Number of objects: 31
H.clust(d7.2, "euclidean", "centroid", plot = TRUE) # centroid method
##
## Call:
## hclust(d = D, method = m)
##
## Cluster method : centroid
## Distance : euclidean
## Number of objects: 31
# Ward's method. hclust() renamed "ward" to "ward.D" and emits a message when
# the old name is used; requesting "ward.D" directly selects the same method
# without that message.
H.clust(d7.2, "euclidean", "ward.D", plot = TRUE)
##
## Call:
## hclust(d = D, method = m)
##
## Cluster method : ward.D
## Distance : euclidean
## Number of objects: 31
# ---- Simulated two-group data (100 observations per group) ----
# Fix the RNG seed so the simulated data and the k-means result are
# reproducible from run to run.
set.seed(123)
n_obs <- 100 # observations per group
n_var <- 10  # variables (columns)

# Group 1: n_obs x n_var matrix of normal draws with mean 0 and sd 0.3.
x1 <- matrix(rnorm(n_obs * n_var, mean = 0, sd = 0.3), ncol = n_var)
# Group 2: same shape and sd, but mean 1.
x2 <- matrix(rnorm(n_obs * n_var, mean = 1, sd = 0.3), ncol = n_var)
# Stack the groups: rows 1..n_obs are group 1, the remaining rows group 2.
x <- rbind(x1, x2)

# Hierarchical clustering (Euclidean distance, complete linkage).
H.clust(x, "euclidean", "complete")
##
## Call:
## hclust(d = D, method = m)
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 200

# k-means clustering into two clusters.
cl <- kmeans(x, 2)

# Plot symbols encode the TRUE group ("1" / "2"); colours encode the cluster
# assigned by k-means, so mismatches between the two are visible at a glance.
# The lengths are derived from n_obs so they always match the rows of x.
pch1 <- rep("1", n_obs)
pch2 <- rep("2", n_obs)
# plot() on a matrix draws only its first two columns.
plot(x, col = cl$cluster, pch = c(pch1, pch2), cex = 0.7)
# Mark the fitted cluster centres (again first two coordinates).
points(cl$centers, col = 3, pch = "*", cex = 3)
# ---- Simulated two-group data (1000 observations per group) ----
# Fix the RNG seed so the simulated data and the k-means result are
# reproducible from run to run.
set.seed(123)
n_obs <- 1000 # observations per group
n_var <- 10   # variables (columns)

# Group 1: n_obs x n_var matrix of normal draws with mean 0 and sd 0.3.
x1 <- matrix(rnorm(n_obs * n_var, mean = 0, sd = 0.3), ncol = n_var)
# Group 2: same shape and sd, but mean 1.
x2 <- matrix(rnorm(n_obs * n_var, mean = 1, sd = 0.3), ncol = n_var)
# Stack the groups: rows 1..n_obs are group 1, the remaining rows group 2.
x <- rbind(x1, x2)

# k-means clustering into two clusters.
cl <- kmeans(x, 2)

# Plot symbols encode the TRUE group ("1" / "2"); colours encode the cluster
# assigned by k-means. The lengths are derived from n_obs so they always
# match the rows of x.
pch1 <- rep("1", n_obs)
pch2 <- rep("2", n_obs)
# plot() on a matrix draws only its first two columns.
plot(x, col = cl$cluster, pch = c(pch1, pch2), cex = 0.7)
# Mark the fitted cluster centres (again first two coordinates).
points(cl$centers, col = 3, pch = "*", cex = 3)