分群問題

距離計算

# Two binary vectors used to illustrate the distance measures below.
x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)

# Euclidean distance: dist() treats each row of its input as one observation,
# so the two vectors are stacked as rows first.
help("dist")
rbind(x, y)

##   [,1] [,2] [,3] [,4] [,5] [,6]
## x    0    0    1    1    1    1
## y    1    0    1    1    0    1

dist(rbind(x, y), method = "euclidean")

##          x
## y 1.414214

# The same value computed by hand.
sqrt(sum((x - y)^2))

## [1] 1.414214

# Minkowski distance with p = 2 gives the same result as Euclidean.
dist(rbind(x, y), method = "minkowski", p = 2)

##          x
## y 1.414214

# City-block (Manhattan) distance.
dist(rbind(x, y), method = "manhattan")

##   x
## y 2

# The same value computed by hand.
sum(abs(x - y))

## [1] 2

# Minkowski distance with p = 1 gives the same result as Manhattan.
dist(rbind(x, y), method = "minkowski", p = 1)

##   x
## y 2

Hierarchical Clustering

聚合式(bottom-up)

# Switch to the course directory; the CSV path below is relative to it.
# NOTE(review): this location is machine-specific -- adjust before running.
setwd('~/lecture/riii')
# Load the customer data set (the file has a header row).
customer <- read.csv(file = 'data/customer.csv', header = TRUE)
# Preview the first rows.
head(customer)

##   ID Visit.Time Average.Expense Sex Age
## 1  1          3             5.7   0  10
## 2  2          5            14.5   0  27
## 3  3         16            33.5   0  32
## 4  4          5            15.9   0  30
## 5  5         16            24.9   0  23
## 6  6          3            12.0   0  15

# Inspect the dimensions and column types.
str(customer)

## 'data.frame':    60 obs. of  5 variables:
##  $ ID             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Visit.Time     : int  3 5 16 5 16 3 12 14 6 3 ...
##  $ Average.Expense: num  5.7 14.5 33.5 15.9 24.9 12 28.5 18.8 23.8 5.3 ...
##  $ Sex            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age            : int  10 27 32 30 23 15 33 27 16 11 ...

# Standardize the numeric variables; the first column (ID) is dropped first.
customer_s <- scale(customer[, -1])
help("scale")

# After standardization a variable has mean 0 and standard deviation 1;
# verify this on the second column of the scaled matrix.
round(mean(customer_s[, 2]), digits = 3)

## [1] 0

round(sd(customer_s[, 2]), digits = 3)

## [1] 1

help("hclust")
# Agglomerative (bottom-up) clustering of the standardized data using
# Euclidean distances and Ward's minimum-variance criterion.
hc <- hclust(dist(customer_s, method = "euclidean"), method = "ward.D2")
plot(hc, hang = -0.01, cex = 0.7)

# Complete-linkage tree for comparison. It is built from the standardized
# matrix as well: the raw `customer` frame still contains the ID column and
# unscaled variables, which would otherwise drive the distances.
hc3 <- hclust(dist(customer_s, method = "euclidean"), method = "complete")
plot(hc3, hang = -0.01, cex = 0.8)

cutree

# Cut the Ward dendrogram into four clusters; the result holds one cluster
# label per customer.
fit <- cutree(hc, k = 4)
print(fit)

##  [1] 1 1 2 1 2 1 2 2 1 1 1 2 2 1 1 1 2 1 2 3 4 3 4 3 3 4 4 3 4 4 4 3 3 3 4
## [36] 4 3 4 4 4 4 4 4 4 3 3 4 4 4 3 4 3 3 4 4 4 3 4 4 3

# Number of customers in each cluster.
table(fit)

## fit
##  1  2  3  4 
## 11  8 16 25

# Redraw the dendrogram and outline the 4-cluster cut in red and the
# 3-cluster cut in blue.
plot(hc, hang = -0.01, cex = 0.7)
rect.hclust(hc, k = 4, border = "red")
rect.hclust(hc, k = 3, border = "blue")

# Profile the customers assigned to cluster 1.
c_1 <- customer[fit == 1, ]
summary(c_1)

##        ID           Visit.Time    Average.Expense      Sex         Age    
##  Min.   : 1.000   Min.   :3.000   Min.   : 4.60   Min.   :0   Min.   : 9  
##  1st Qu.: 5.000   1st Qu.:3.500   1st Qu.: 7.15   1st Qu.:0   1st Qu.:12  
##  Median :10.000   Median :5.000   Median :14.50   Median :0   Median :16  
##  Mean   : 9.636   Mean   :4.909   Mean   :12.71   Mean   :0   Mean   :17  
##  3rd Qu.:14.500   3rd Qu.:6.000   3rd Qu.:16.00   3rd Qu.:0   3rd Qu.:20  
##  Max.   :18.000   Max.   :8.000   Max.   :23.80   Max.   :0   Max.   :30

分裂式階層式(top-down)

# install.packages('cluster')
library(cluster)
help("diana")
# Divisive (top-down) hierarchical clustering of the standardized data
# using Euclidean distances.
dv <- diana(customer_s, metric = "euclidean")
summary(dv)

## Merge:
##       [,1] [,2]
##  [1,]  -24  -50
##  [2,]  -28  -46
##  [3,]   -7  -13
##  [4,]  -30  -35
##  [5,]  -21  -40
##  [6,]  -54  -58
##  [7,]  -23  -26
##  [8,]   -1  -10
##  [9,]    7  -51
## [10,]  -27  -59
## [11,]    5  -39
## [12,]  -32  -45
## [13,]   -8  -12
## [14,]   -2   -4
## [15,]  -14  -18
## [16,]   11  -43
## [17,]  -44  -49
## [18,]    9  -56
## [19,]  -37  -60
## [20,]   -6  -11
## [21,]  -29  -48
## [22,]   -5  -19
## [23,]   10  -36
## [24,]  -42   17
## [25,]  -25   12
## [26,]   18  -41
## [27,]   21  -38
## [28,]   13  -17
## [29,]  -34  -52
## [30,]   16    6
## [31,]    8   20
## [32,]   26    4
## [33,]   19  -57
## [34,]  -47  -55
## [35,]   25  -53
## [36,]   24  -31
## [37,]   30   36
## [38,]   -3    3
## [39,]   -9   15
## [40,]  -33   33
## [41,]   32   23
## [42,]   22   28
## [43,]   31  -15
## [44,]   37   27
## [45,]  -20   40
## [46,]  -22   35
## [47,]   44   34
## [48,]   14   39
## [49,]    1   29
## [50,]   45    2
## [51,]   38   42
## [52,]   43  -16
## [53,]   46   49
## [54,]   52   48
## [55,]   47   41
## [56,]   50   53
## [57,]   54   55
## [58,]   51   56
## [59,]   57   58
## Order of objects:
##  [1]  1 10  6 11 15 16  2  4  9 14 18 21 40 39 43 54 58 42 44 49 31 29 48
## [24] 38 47 55 23 26 51 56 41 30 35 27 59 36  3  7 13  5 19  8 12 17 20 33
## [47] 37 60 57 28 46 22 25 32 45 53 24 50 34 52
## Height:
##  [1] 0.11775833 0.92338041 0.50974266 1.47360965 2.04722777 2.51250579
##  [7] 0.36355872 1.79099892 1.08967479 0.39308959 3.57679780 0.00000000
## [13] 0.21833707 0.44391855 0.80354844 0.08334529 0.98499722 0.70126085
## [19] 0.44921797 0.98499722 1.48962560 0.55960408 0.76573069 1.77868059
## [25] 0.97891452 2.79693737 0.09525176 0.12305649 0.48657744 0.76517620
## [31] 0.93270565 0.00000000 1.28196769 0.16054657 0.60321756 5.85655734
## [37] 1.07657773 0.00000000 1.98611220 0.59473487 1.44920797 0.33912975
## [43] 0.78523518 3.88572195 1.51921913 1.18521332 0.50902071 0.97225583
## [49] 1.91123321 0.00000000 3.39304108 1.52798723 0.72296652 0.31544012
## [55] 0.98335831 2.45910026 0.00000000 1.85224545 0.79085454
## Divisive coefficient:
## [1] 0.9117911
## 
## 1770 dissimilarities, summarized :
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.845   2.572   2.595   3.354   5.857 
## Metric :  euclidean 
## Number of objects : 60
## 
## Available components:
## [1] "order"  "height" "dc"     "merge"  "diss"   "call"   "data"

# Plot the divisive clustering result.
plot(dv)

# cutree() is documented for hclust trees, so convert the diana object
# explicitly instead of relying on it happening to carry a `merge` component.
fit2 <- cutree(as.hclust(dv), k = 4)
# Profile the customers assigned to cluster 1 of the divisive solution.
c_1 <- customer[fit2 == 1, ]
summary(c_1)

##        ID           Visit.Time    Average.Expense      Sex         Age    
##  Min.   : 1.000   Min.   :3.000   Min.   : 4.60   Min.   :0   Min.   : 9  
##  1st Qu.: 5.000   1st Qu.:3.500   1st Qu.: 7.15   1st Qu.:0   1st Qu.:12  
##  Median :10.000   Median :5.000   Median :14.50   Median :0   Median :16  
##  Mean   : 9.636   Mean   :4.909   Mean   :12.71   Mean   :0   Mean   :17  
##  3rd Qu.:14.500   3rd Qu.:6.000   3rd Qu.:16.00   3rd Qu.:0   3rd Qu.:20  
##  Max.   :18.000   Max.   :8.000   Max.   :23.80   Max.   :0   Max.   :30

k-means

# Structure of the standardized matrix that k-means will be run on.
str(customer_s)

##  num [1:60, 1:4] -1.202 -0.757 1.692 -0.757 1.692 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:4] "Visit.Time" "Average.Expense" "Sex" "Age"
##  - attr(*, "scaled:center")= Named num [1:4] 8.4 17.058 0.683 21.433
##   ..- attr(*, "names")= chr [1:4] "Visit.Time" "Average.Expense" "Sex" "Age"
##  - attr(*, "scaled:scale")= Named num [1:4] 4.492 8.399 0.469 9.285
##   ..- attr(*, "names")= chr [1:4] "Visit.Time" "Average.Expense" "Sex" "Age"

# k-means starts from random centers, so fix the seed for reproducibility.
set.seed(22)
fit <- kmeans(customer_s, centers = 4)
help("kmeans")

# Grouped bars: one group per variable, one bar per cluster center.
barplot(t(fit$centers), beside = TRUE, xlab = "cluster", ylab = "value")

help("barplot")
# Cluster centers on the standardized scale.
fit$centers

##   Visit.Time Average.Expense        Sex        Age
## 1  1.3302016       1.0155226 -1.4566845  0.5591307
## 2 -0.7771737      -0.5178412 -1.4566845 -0.4774599
## 3  0.8571173       0.9887331  0.6750489  1.0505015
## 4 -0.6322632      -0.7299063  0.6750489 -0.6411604

投影至二維空間

# install.packages("cluster")
library(cluster)
# Draw the k-means clusters in a two-dimensional projection of the data.
clusplot(customer_s, fit$cluster, color = TRUE, shade = TRUE)

# Side by side: the full cluster plot with an orange box marking a region,
# and a second plot whose axis limits are restricted to that region.
par(mfrow = c(1, 2))
clusplot(customer_s, fit$cluster, color = TRUE, shade = TRUE)
rect(xleft = -0.7, ybottom = -1.7, xright = 2.2, ytop = -1.2,
     border = "orange", lwd = 2)
clusplot(customer_s, fit$cluster, color = TRUE,
         xlim = c(-0.7, 2.2), ylim = c(-1.7, -1.2))

# Inspect the principal-component loadings to see what each component is
# made of.
pca <- princomp(customer_s)
pca$loadings

## 
## Loadings:
##                 Comp.1 Comp.2 Comp.3 Comp.4
## Visit.Time      -0.576        -0.601  0.554
## Average.Expense -0.602        -0.146 -0.785
## Sex                     0.989 -0.133       
## Age             -0.550  0.148  0.775  0.274
## 
##                Comp.1 Comp.2 Comp.3 Comp.4
## SS loadings      1.00   1.00   1.00   1.00
## Proportion Var   0.25   0.25   0.25   0.25
## Cumulative Var   0.25   0.50   0.75   1.00

Evaluating model

# Silhouette analysis of a 4-cluster k-means solution.
# Go back to a single plotting panel first.
par(mfrow = c(1, 1))
set.seed(22)
library(cluster)
km <- kmeans(customer_s, centers = 4)
# Silhouette widths from the cluster labels and the pairwise distances.
kms <- silhouette(x = km$cluster, dist = dist(customer_s))
summary(kms)

## Silhouette of 60 units in 4 clusters from silhouette.default(x = km$cluster, dist = dist(customer_s)) :
##  Cluster sizes and average silhouette widths:
##         8        11        16        25 
## 0.5464597 0.4080823 0.3794910 0.5164434 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1931  0.4030  0.4890  0.4641  0.5422  0.6333

# Silhouette plot of the same object.
plot(kms)

選擇k-means最佳k值

# Total within-cluster sum of squares for each candidate number of clusters.
nk <- 2:10
# The seed is reset inside the function so every k starts from the same RNG
# state; a separate set.seed() before the loop would be redundant.
# vapply() guarantees a plain numeric vector with one value per k.
WSS <- vapply(
  nk,
  function(k) {
    set.seed(22)
    kmeans(customer_s, centers = k)$tot.withinss
  },
  numeric(1)
)
WSS

## [1] 123.49224  93.08341  61.34890  48.76431  43.08965  40.25820  29.58014
## [8]  26.97709  24.99510

# Elbow plot: within sum of squares against the number of clusters.
plot(x = nk, y = WSS, type = "l",
     xlab = "number of k", ylab = "within sum of squares")

# install.packages("fpc")
# install.packages("robustbase", repos="http://R-Forge.R-project.org")
library(fpc)

help("cluster.stats")
# Fix the seed right before kmeans(), as every other k-means call in this
# script does, so the result does not depend on leftover RNG state.
set.seed(22)
# Validation statistics for a 2-cluster k-means solution.
cluster.stats(dist(customer_s), kmeans(customer_s, centers = 2)$cluster)

## $n
## [1] 60
## 
## $cluster.number
## [1] 2
## 
## $cluster.size
## [1] 24 36
## 
## $min.cluster.size
## [1] 24
## 
## $noisen
## [1] 0
## 
## $diameter
## [1] 3.885722 3.576798
## 
## $average.distance
## [1] 2.045880 1.798055
## 
## $median.distance
## [1] 2.243328 1.996810
## 
## $separation
## [1] 0.9276315 0.9276315
## 
## $average.toother
## [1] 3.351368 3.351368
## 
## $separation.matrix
##           [,1]      [,2]
## [1,] 0.0000000 0.9276315
## [2,] 0.9276315 0.0000000
## 
## $ave.between.matrix
##          [,1]     [,2]
## [1,] 0.000000 3.351368
## [2,] 3.351368 0.000000
## 
## $average.between
## [1] 3.351368
## 
## $average.within
## [1] 1.873552
## 
## $n.between
## [1] 864
## 
## $n.within
## [1] 906
## 
## $max.diameter
## [1] 3.885722
## 
## $min.separation
## [1] 0.9276315
## 
## $within.cluster.ss
## [1] 123.4922
## 
## $clus.avg.silwidths
##         1         2 
## 0.3827935 0.4454536 
## 
## $avg.silwidth
## [1] 0.4203896
## 
## $g2
## NULL
## 
## $g3
## NULL
## 
## $pearsongamma
## [1] 0.6564321
## 
## $dunn
## [1] 0.2387282
## 
## $dunn2
## [1] 1.638105
## 
## $entropy
## [1] 0.6730117
## 
## $wb.ratio
## [1] 0.559041
## 
## $ch
## [1] 52.84097
## 
## $cwidegap
## [1] 2.148705 2.131733
## 
## $widestgap
## [1] 2.148705
## 
## $sindex
## [1] 1.010004
## 
## $corrected.rand
## NULL
## 
## $vi
## NULL

# Evaluate k-means for k = 2..10. The distance matrix and the per-k
# cluster.stats() result are computed once and reused for both criteria,
# instead of re-running the whole sweep for every statistic.
nk <- 2:10
customer_dist <- dist(customer_s)
stats_by_k <- lapply(nk, function(k) {
  set.seed(22)  # same RNG state for every k
  cluster.stats(customer_dist, kmeans(customer_s, centers = k)$cluster)
})

# Average silhouette width for each k.
SW <- vapply(stats_by_k, function(s) s[["avg.silwidth"]], numeric(1))
SW

## [1] 0.4203896 0.4092890 0.4640587 0.4308448 0.4196734 0.4349898 0.4396910
## [8] 0.3979406 0.3666168

# Within-cluster sum of squares for each k.
WSS <- vapply(stats_by_k, function(s) s[["within.cluster.ss"]], numeric(1))
WSS

## [1] 123.49224  93.08341  61.34890  48.76431  43.08965  40.25820  29.58014
## [8]  26.97709  24.99510

plot(x = nk, y = WSS, type = "l",
     xlab = "number of k", ylab = "within sum of squares")

# Average silhouette width against the number of clusters.
plot(x = nk, y = SW, type = "l",
     xlab = "number of clusters", ylab = "average silhouette width")

# The k with the largest average silhouette width.
nk[which.max(SW)]

## [1] 4

model comparison

# Hierarchical clustering with single linkage, cut into four clusters.
single_c <- hclust(dist(customer_s), method = "single")
hc_single <- cutree(single_c, k = 4)

# Hierarchical clustering with complete linkage, cut into four clusters.
complete_c <- hclust(dist(customer_s), method = "complete")
hc_complete <- cutree(complete_c, k = 4)

# k-means with four centers; the seed is fixed because the start is random.
set.seed(22)
km <- kmeans(customer_s, centers = 4)

# Validation statistics for the k-means solution; show only the two
# criteria used for the comparison.
cs <- cluster.stats(dist(customer_s), km$cluster)
cs[c("within.cluster.ss", "avg.silwidth")]

## $within.cluster.ss
## [1] 61.3489
## 
## $avg.silwidth
## [1] 0.4640587

# Compare the three clusterings on within-cluster sum of squares and
# average silhouette width (one column per method).
# The distance matrix is computed once, and the function argument is called
# `labels` rather than `c` so it no longer masks base::c() inside the body.
q <- local({
  customer_dist <- dist(customer_s)
  sapply(
    list(kmeans = km$cluster,
         hc_single = hc_single,
         hc_complete = hc_complete),
    function(labels) {
      cluster.stats(customer_dist, labels)[c("within.cluster.ss", "avg.silwidth")]
    }
  )
})
q

##                   kmeans    hc_single hc_complete
## within.cluster.ss 61.3489   136.0092  65.94076   
## avg.silwidth      0.4640587 0.2481926 0.4255961

density-based method-DBSCAN

http://123android.blogspot.tw/2012/01/28dec11-data-mining.html

# install.packages("mlbench")
# The mlbench package provides many methods to generate simulated data with
# different shapes and sizes. In this example we generate a Cassini problem.
library(mlbench)
# install.packages("fpc")
library(fpc)
set.seed(2)
p <- mlbench.cassini(n = 500)
plot(p$x)

help("mlbench.cassini")

# DBSCAN on the pairwise distance matrix of the simulated points
# (method = "dist" tells dbscan() that `data` is a distance object).
ds <- dbscan(data = dist(p$x), eps = 0.2, MinPts = 2, method = "dist")
print(ds)

## dbscan Pts=500 MinPts=2 eps=0.2
##         1   2   3
## seed  200 200 100
## total 200 200 100

# Plot the points together with the DBSCAN result.
plot(ds, p$x)

# Three new points to classify, one per row: (0, 0), (0, -1.5) and (1, 1).
y <- rbind(c(0, 0), c(0, -1.5), c(1, 1))
y

##      [,1] [,2]
## [1,]    0  0.0
## [2,]    0 -1.5
## [3,]    1  1.0

# Assign each new point to a cluster of the fitted DBSCAN model.
predict(ds, data = p$x, newdata = y)

## [1] 3 1 2

R_basic5

York Lin

2018年07月05日

分群問題

距離計算

Hierarchical Clustering

聚合式(bottom-up)

cutree

分裂式階層式(top-down)

k-means

投影至二維空間

Evaluating model

選擇k-means最佳k值

model comparison

density-based method-DBSCAN