分群問題
距離計算
# Two binary vectors used to compare distance measures.
x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)

# Euclidean distance
help("dist")
rbind(x, y)
## [,1] [,2] [,3] [,4] [,5] [,6]
## x 0 0 1 1 1 1
## y 1 0 1 1 0 1
dist(rbind(x, y), method = "euclidean")
## x
## y 1.414214
# The same value computed by hand from the definition.
sqrt(sum((x - y)^2))
## [1] 1.414214
# Minkowski distance with p = 2 gives the same value as the Euclidean one.
dist(rbind(x, y), method = "minkowski", p = 2)
## x
## y 1.414214

# City-block (Manhattan) distance
dist(rbind(x, y), method = "manhattan")
## x
## y 2
# The same value computed by hand: sum of absolute differences.
sum(abs(x - y))
## [1] 2
# Minkowski distance with p = 1 gives the same value as the Manhattan one.
dist(rbind(x, y), method = "minkowski", p = 1)
## x
## y 2
Hierarchical Clustering
聚合式(bottom-up)
# Work from the course directory so the relative data path below resolves.
setwd("~/lecture/riii")
customer <- read.csv(file = "data/customer.csv", header = TRUE)

# Peek at the first rows.
head(customer)
## ID Visit.Time Average.Expense Sex Age
## 1 1 3 5.7 0 10
## 2 2 5 14.5 0 27
## 3 3 16 33.5 0 32
## 4 4 5 15.9 0 30
## 5 5 16 24.9 0 23
## 6 6 3 12.0 0 15
# Column types and dimensions.
str(customer)
## 'data.frame': 60 obs. of 5 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Visit.Time : int 3 5 16 5 16 3 12 14 6 3 ...
## $ Average.Expense: num 5.7 14.5 33.5 15.9 24.9 12 28.5 18.8 23.8 5.3 ...
## $ Sex : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Age : int 10 27 32 30 23 15 33 27 16 11 ...
# Standardize the numeric variables; the first column (ID) is dropped.
customer_s <- scale(customer[, -1])
help("scale")
# After standardization a variable has mean 0 and standard deviation 1
# (checked here on the second column of customer_s).
round(mean(customer_s[, 2]), digits = 3)
## [1] 0
round(sd(customer_s[, 2]), digits = 3)
## [1] 1
help("hclust")
# Agglomerative clustering on Euclidean distances using the "ward.D2" method.
hc <- hclust(dist(customer_s, method = "euclidean"), method = "ward.D2")
plot(hc, hang = -0.01, cex = 0.7)

# Complete-linkage tree for comparison.
# NOTE(review): this tree is built from the raw `customer` data frame
# (unscaled, ID column included), unlike `hc` above -- confirm that this is
# intentional.
hc3 <- hclust(dist(customer, method = "euclidean"), method = "complete")
plot(hc3, hang = -0.01, cex = 0.8)

cutree
# Cut the `hc` tree into four groups and list each observation's group.
fit <- cutree(hc, k = 4)
fit
## [1] 1 1 2 1 2 1 2 2 1 1 1 2 2 1 1 1 2 1 2 3 4 3 4 3 3 4 4 3 4 4 4 3 3 3 4
## [36] 4 3 4 4 4 4 4 4 4 3 3 4 4 4 3 4 3 3 4 4 4 3 4 4 3
# Number of observations per group.
table(fit)
## fit
## 1 2 3 4
## 11 8 16 25
# Redraw the dendrogram and outline the 4-group (red) and 3-group (blue) cuts.
plot(hc, hang = -0.01, cex = 0.7)
rect.hclust(hc, k = 4, border = "red")
rect.hclust(hc, k = 3, border = "blue")

# Profile the customers assigned to group 1.
c_1 <- customer[fit == 1, ]
summary(c_1)
## ID Visit.Time Average.Expense Sex Age
## Min. : 1.000 Min. :3.000 Min. : 4.60 Min. :0 Min. : 9
## 1st Qu.: 5.000 1st Qu.:3.500 1st Qu.: 7.15 1st Qu.:0 1st Qu.:12
## Median :10.000 Median :5.000 Median :14.50 Median :0 Median :16
## Mean : 9.636 Mean :4.909 Mean :12.71 Mean :0 Mean :17
## 3rd Qu.:14.500 3rd Qu.:6.000 3rd Qu.:16.00 3rd Qu.:0 3rd Qu.:20
## Max. :18.000 Max. :8.000 Max. :23.80 Max. :0 Max. :30
分裂式階層式(top-down)
# install.packages("cluster")
library(cluster)
help("diana")
# Divisive hierarchical clustering of the standardized data, using Euclidean
# dissimilarities.
dv <- diana(customer_s, metric = "euclidean")
summary(dv)
## Merge:
## [,1] [,2]
## [1,] -24 -50
## [2,] -28 -46
## [3,] -7 -13
## [4,] -30 -35
## [5,] -21 -40
## [6,] -54 -58
## [7,] -23 -26
## [8,] -1 -10
## [9,] 7 -51
## [10,] -27 -59
## [11,] 5 -39
## [12,] -32 -45
## [13,] -8 -12
## [14,] -2 -4
## [15,] -14 -18
## [16,] 11 -43
## [17,] -44 -49
## [18,] 9 -56
## [19,] -37 -60
## [20,] -6 -11
## [21,] -29 -48
## [22,] -5 -19
## [23,] 10 -36
## [24,] -42 17
## [25,] -25 12
## [26,] 18 -41
## [27,] 21 -38
## [28,] 13 -17
## [29,] -34 -52
## [30,] 16 6
## [31,] 8 20
## [32,] 26 4
## [33,] 19 -57
## [34,] -47 -55
## [35,] 25 -53
## [36,] 24 -31
## [37,] 30 36
## [38,] -3 3
## [39,] -9 15
## [40,] -33 33
## [41,] 32 23
## [42,] 22 28
## [43,] 31 -15
## [44,] 37 27
## [45,] -20 40
## [46,] -22 35
## [47,] 44 34
## [48,] 14 39
## [49,] 1 29
## [50,] 45 2
## [51,] 38 42
## [52,] 43 -16
## [53,] 46 49
## [54,] 52 48
## [55,] 47 41
## [56,] 50 53
## [57,] 54 55
## [58,] 51 56
## [59,] 57 58
## Order of objects:
## [1] 1 10 6 11 15 16 2 4 9 14 18 21 40 39 43 54 58 42 44 49 31 29 48
## [24] 38 47 55 23 26 51 56 41 30 35 27 59 36 3 7 13 5 19 8 12 17 20 33
## [47] 37 60 57 28 46 22 25 32 45 53 24 50 34 52
## Height:
## [1] 0.11775833 0.92338041 0.50974266 1.47360965 2.04722777 2.51250579
## [7] 0.36355872 1.79099892 1.08967479 0.39308959 3.57679780 0.00000000
## [13] 0.21833707 0.44391855 0.80354844 0.08334529 0.98499722 0.70126085
## [19] 0.44921797 0.98499722 1.48962560 0.55960408 0.76573069 1.77868059
## [25] 0.97891452 2.79693737 0.09525176 0.12305649 0.48657744 0.76517620
## [31] 0.93270565 0.00000000 1.28196769 0.16054657 0.60321756 5.85655734
## [37] 1.07657773 0.00000000 1.98611220 0.59473487 1.44920797 0.33912975
## [43] 0.78523518 3.88572195 1.51921913 1.18521332 0.50902071 0.97225583
## [49] 1.91123321 0.00000000 3.39304108 1.52798723 0.72296652 0.31544012
## [55] 0.98335831 2.45910026 0.00000000 1.85224545 0.79085454
## Divisive coefficient:
## [1] 0.9117911
##
## 1770 dissimilarities, summarized :
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.845 2.572 2.595 3.354 5.857
## Metric : euclidean
## Number of objects : 60
##
## Available components:
## [1] "order" "height" "dc" "merge" "diss" "call" "data"
# Draw the default plots for the divisive clustering.
plot(dv)


# cutree() is documented for trees produced by hclust(), so convert the diana
# result with as.hclust() before cutting it into four groups.
fit2 <- cutree(as.hclust(dv), k = 4)
# Profile the customers assigned to group 1.
c_1 <- customer[fit2 == 1, ]
summary(c_1)
## ID Visit.Time Average.Expense Sex Age
## Min. : 1.000 Min. :3.000 Min. : 4.60 Min. :0 Min. : 9
## 1st Qu.: 5.000 1st Qu.:3.500 1st Qu.: 7.15 1st Qu.:0 1st Qu.:12
## Median :10.000 Median :5.000 Median :14.50 Median :0 Median :16
## Mean : 9.636 Mean :4.909 Mean :12.71 Mean :0 Mean :17
## 3rd Qu.:14.500 3rd Qu.:6.000 3rd Qu.:16.00 3rd Qu.:0 3rd Qu.:20
## Max. :18.000 Max. :8.000 Max. :23.80 Max. :0 Max. :30
k-means
# Structure of the standardized matrix that k-means is run on.
str(customer_s)
## num [1:60, 1:4] -1.202 -0.757 1.692 -0.757 1.692 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:4] "Visit.Time" "Average.Expense" "Sex" "Age"
## - attr(*, "scaled:center")= Named num [1:4] 8.4 17.058 0.683 21.433
## ..- attr(*, "names")= chr [1:4] "Visit.Time" "Average.Expense" "Sex" "Age"
## - attr(*, "scaled:scale")= Named num [1:4] 4.492 8.399 0.469 9.285
## ..- attr(*, "names")= chr [1:4] "Visit.Time" "Average.Expense" "Sex" "Age"
# Fix the random seed so the k-means result is repeatable.
set.seed(22)
fit <- kmeans(customer_s, centers = 4)
help("kmeans")
# Grouped bar chart of the cluster centers, one group of bars per cluster.
barplot(t(fit$centers), beside = TRUE, xlab = "cluster", ylab = "value")

help("barplot")
# Cluster centers in standardized units.
fit$centers
## Visit.Time Average.Expense Sex Age
## 1 1.3302016 1.0155226 -1.4566845 0.5591307
## 2 -0.7771737 -0.5178412 -1.4566845 -0.4774599
## 3 0.8571173 0.9887331 0.6750489 1.0505015
## 4 -0.6322632 -0.7299063 0.6750489 -0.6411604
投影至二維空間
# install.packages("cluster")
library(cluster)
# Draw the k-means clusters with clusplot().
clusplot(customer_s, fit$cluster, color = TRUE, shade = TRUE)

# Two panels side by side: the full plot with an orange rectangle marking a
# region, then a second plot restricted to that same region.
par(mfrow = c(1, 2))
clusplot(customer_s, fit$cluster, color = TRUE, shade = TRUE)
rect(
  xleft = -0.7, ybottom = -1.7, xright = 2.2, ytop = -1.2,
  border = "orange", lwd = 2
)
clusplot(
  customer_s, fit$cluster,
  color = TRUE, xlim = c(-0.7, 2.2), ylim = c(-1.7, -1.2)
)

# Inspect what each principal component is made of.
pca <- princomp(customer_s)
pca$loadings
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4
## Visit.Time -0.576 -0.601 0.554
## Average.Expense -0.602 -0.146 -0.785
## Sex 0.989 -0.133
## Age -0.550 0.148 0.775 0.274
##
## Comp.1 Comp.2 Comp.3 Comp.4
## SS loadings 1.00 1.00 1.00 1.00
## Proportion Var 0.25 0.25 0.25 0.25
## Cumulative Var 0.25 0.50 0.75 1.00
Evaluating model
# Silhouette
library(cluster)
# Back to a single plotting panel.
par(mfrow = c(1, 1))
set.seed(22)
km <- kmeans(customer_s, centers = 4)
# Silhouette widths of the k-means assignment on the standardized data.
kms <- silhouette(x = km$cluster, dist = dist(customer_s))
summary(kms)
## Silhouette of 60 units in 4 clusters from silhouette.default(x = km$cluster, dist = dist(customer_s)) :
## Cluster sizes and average silhouette widths:
## 8 11 16 25
## 0.5464597 0.4080823 0.3794910 0.5164434
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1931 0.4030 0.4890 0.4641 0.5422 0.6333
plot(kms)

選擇k-means最佳k值
# Within-cluster sum of squares for each candidate k.
nk <- 2:10
set.seed(22)
# The seed is also reset inside the function, so every k starts from the same
# random state.
WSS <- vapply(
  nk,
  function(k) {
    set.seed(22)
    kmeans(customer_s, centers = k)$tot.withinss
  },
  numeric(1)
)
WSS
## [1] 123.49224 93.08341 61.34890 48.76431 43.08965 40.25820 29.58014
## [8] 26.97709 24.99510
plot(
  x = nk, y = WSS, type = "l",
  xlab = "number of k", ylab = "within sum of squares"
)
# install.packages("fpc")
# install.packages("robustbase", repos = "http://R-Forge.R-project.org")
library(fpc)

help("cluster.stats")
# Full set of validation statistics for a two-cluster k-means solution.
cluster.stats(
  d = dist(customer_s),
  clustering = kmeans(customer_s, centers = 2)$cluster
)
## $n
## [1] 60
##
## $cluster.number
## [1] 2
##
## $cluster.size
## [1] 24 36
##
## $min.cluster.size
## [1] 24
##
## $noisen
## [1] 0
##
## $diameter
## [1] 3.885722 3.576798
##
## $average.distance
## [1] 2.045880 1.798055
##
## $median.distance
## [1] 2.243328 1.996810
##
## $separation
## [1] 0.9276315 0.9276315
##
## $average.toother
## [1] 3.351368 3.351368
##
## $separation.matrix
## [,1] [,2]
## [1,] 0.0000000 0.9276315
## [2,] 0.9276315 0.0000000
##
## $ave.between.matrix
## [,1] [,2]
## [1,] 0.000000 3.351368
## [2,] 3.351368 0.000000
##
## $average.between
## [1] 3.351368
##
## $average.within
## [1] 1.873552
##
## $n.between
## [1] 864
##
## $n.within
## [1] 906
##
## $max.diameter
## [1] 3.885722
##
## $min.separation
## [1] 0.9276315
##
## $within.cluster.ss
## [1] 123.4922
##
## $clus.avg.silwidths
## 1 2
## 0.3827935 0.4454536
##
## $avg.silwidth
## [1] 0.4203896
##
## $g2
## NULL
##
## $g3
## NULL
##
## $pearsongamma
## [1] 0.6564321
##
## $dunn
## [1] 0.2387282
##
## $dunn2
## [1] 1.638105
##
## $entropy
## [1] 0.6730117
##
## $wb.ratio
## [1] 0.559041
##
## $ch
## [1] 52.84097
##
## $cwidegap
## [1] 2.148705 2.131733
##
## $widestgap
## [1] 2.148705
##
## $sindex
## [1] 1.010004
##
## $corrected.rand
## NULL
##
## $vi
## NULL
# Average silhouette width for each k in nk; the seed is reset for every k.
sapply(nk, function(k) {
  set.seed(22)
  labels_k <- kmeans(customer_s, centers = k)$cluster
  cluster.stats(dist(customer_s), labels_k)$avg.silwidth
})
## [1] 0.4203896 0.4092890 0.4640587 0.4308448 0.4196734 0.4349898 0.4396910
## [8] 0.3979406 0.3666168
# The same sweep, stored: within-cluster sum of squares per k ...
WSS <- sapply(nk, function(k) {
  set.seed(22)
  labels_k <- kmeans(customer_s, centers = k)$cluster
  cluster.stats(dist(customer_s), labels_k)$within.cluster.ss
})
# ... and average silhouette width per k.
SW <- sapply(2:10, function(k) {
  set.seed(22)
  labels_k <- kmeans(customer_s, centers = k)$cluster
  cluster.stats(dist(customer_s), labels_k)$avg.silwidth
})
WSS
## [1] 123.49224 93.08341 61.34890 48.76431 43.08965 40.25820 29.58014
## [8] 26.97709 24.99510
plot(
  x = nk, y = WSS, type = "l",
  xlab = "number of k", ylab = "within sum of squares"
)

# Average silhouette width for each candidate number of clusters.
nk <- 2:10
# The seed is reset for every k so each run starts from the same random state.
# vapply() keeps the result a plain numeric vector.
SW <- vapply(
  nk,
  function(k) {
    set.seed(22)
    labels_k <- kmeans(customer_s, centers = k)$cluster
    cluster.stats(dist(customer_s), labels_k)$avg.silwidth
  },
  numeric(1)
)
plot(
  x = nk, y = SW, type = "l",
  xlab = "number of clusters", ylab = "average silhouette width"
)

# The k with the largest average silhouette width.
nk[which.max(SW)]
## [1] 4
model comparison
# Hierarchical clusterings with single and complete linkage, each cut into
# four groups.
single_c <- hclust(dist(customer_s), method = "single")
hc_single <- cutree(single_c, k = 4)
complete_c <- hclust(dist(customer_s), method = "complete")
hc_complete <- cutree(complete_c, k = 4)
# Seeded four-cluster k-means for comparison.
set.seed(22)
km <- kmeans(customer_s, centers = 4)
# Validation statistics for the k-means result; show two of them.
cs <- cluster.stats(dist(customer_s), km$cluster)
cs[c("within.cluster.ss", "avg.silwidth")]
## $within.cluster.ss
## [1] 61.3489
##
## $avg.silwidth
## [1] 0.4640587
# Compute the same two statistics for each of the three clusterings; the
# result has one column per method.
q <- sapply(
  list(
    kmeans = km$cluster,
    hc_single = hc_single,
    hc_complete = hc_complete
  ),
  function(cluster_ids) {
    stats_k <- cluster.stats(dist(customer_s), cluster_ids)
    stats_k[c("within.cluster.ss", "avg.silwidth")]
  }
)
q
## kmeans hc_single hc_complete
## within.cluster.ss 61.3489 136.0092 65.94076
## avg.silwidth 0.4640587 0.2481926 0.4255961
density-based method-DBSCAN
# install.packages("mlbench")
# The mlbench package provides many methods to generate simulated data with
# different shapes and sizes. In this example, we generate a Cassini problem
# graph.
library(mlbench)
# install.packages("fpc")
library(fpc)
set.seed(2)
p <- mlbench.cassini(n = 500)
plot(p$x)

help("mlbench.cassini")
# DBSCAN run on a precomputed distance matrix (hence method = "dist").
ds <- dbscan(data = dist(p$x), eps = 0.2, MinPts = 2, method = "dist")
ds
## dbscan Pts=500 MinPts=2 eps=0.2
## 1 2 3
## seed 200 200 100
## total 200 200 100
# Plot the points with their DBSCAN cluster assignment.
plot(ds, p$x)

# Three query points, one per row: (0, 0), (0, -1.5) and (1, 1).
y <- matrix(
  c(
    0, 0,
    0, -1.5,
    1, 1
  ),
  nrow = 3, ncol = 2, byrow = TRUE
)
y
## [,1] [,2]
## [1,] 0 0.0
## [2,] 0 -1.5
## [3,] 1 1.0
# Assign each query point in y to one of the DBSCAN clusters found on p$x.
predict(ds, data = p$x, newdata = y)
## [1] 3 1 2