Ad-hoc methods: customer grade and customer type (new/existing), current value and future value
Statistical methods: clustering, e.g. k-means
Time-consuming, and the rules for merging segments are unclear
Customer segments drift meaninglessly as the underlying variables fluctuate
- e.g. segments A, B, C in the first month become B, D, F in the second month
- e.g. even when segments A, B, C persist, their distributions shift or their profiles change outright
Solution: k-means + SRM
Segmentation and profiling must be automated to achieve consistent quality
Profiles of the segments are listed in order of the variables that best discriminate between them, so diverse sub-segments can be compared on a fixed set of variables
Building the data mart as soon as the data arrives: half a day to one day
Segmentation via cluster analysis plus report writing: one day
Profiling first, clustering later? Nonsense.
Profiling by SQL, Excel pivot tables, or OLAP: one week
Cluster analysis: 30 minutes
Create the data
a <- c(1, 4)
b <- c(2, 1)
c <- c(4, 6)
d <- c(4, 3)
e <- c(5, 1)
data <- data.frame(a, b, c, d, e)
data <- t(data) # transpose so that rows a-e are the observations
data
## [,1] [,2]
## a 1 4
## b 2 1
## c 4 6
## d 4 3
## e 5 1
Compute the distances
eu <- round(dist(data, method = "euclidean"), digits = 2)
eu
## a b c d
## b 3.16
## c 3.61 5.39
## d 3.16 2.83 3.00
## e 5.00 3.00 5.10 2.24
man <- round(dist(data, method = "manhattan"), digits = 2)
man
## a b c d
## b 4
## c 5 7
## d 4 4 3
## e 7 3 6 3
can <- round(dist(data, method = "canberra"), digits = 2)
can
## a b c d
## b 0.93
## c 0.80 1.05
## d 0.74 0.83 0.33
## e 1.27 0.43 0.83 0.61
min <- round(dist(data, method = "minkowski"), digits = 2)
min
## a b c d
## b 3.16
## c 3.61 5.39
## d 3.16 2.83 3.00
## e 5.00 3.00 5.10 2.24
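dist() uses p = 2 by default for the Minkowski metric, so min above coincides with the Euclidean distances; other exponents can be requested via the p argument. A quick sketch (the p values are chosen purely for illustration):
round(dist(data, method = "minkowski", p = 1), digits = 2)  # reproduces the Manhattan distances
round(dist(data, method = "minkowski", p = 3), digits = 2)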
Start with many clusters and gradually reduce their number
Not only the distances between observation vectors (see 2.사) but also the distances between clusters (the linkage) must be defined
Principle
labels = row.names(data)
plot(data, xlim = c(0, 6), ylim = c(0, 6))
text(data, labels = labels, pos = 4)
dist(data)
## a b c d
## b 3.162
## c 3.606 5.385
## d 3.162 2.828 3.000
## e 5.000 3.000 5.099 2.236
dist(data)^2  # squared Euclidean distances
## a b c d
## b 10
## c 13 29
## d 10 8 9
## e 25 9 26 5
Plots (dendrograms)
(m1 <- hclust(dist(data)^2, method = "single"))
##
## Call:
## hclust(d = dist(data)^2, method = "single")
##
## Cluster method : single
## Distance : euclidean
## Number of objects: 5
plot(m1)
(m2 <- hclust(dist(data)^2, method = "complete"))
##
## Call:
## hclust(d = dist(data)^2, method = "complete")
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 5
plot(m2)
(m3 <- hclust(dist(data)^2, method = "ward"))  # note: newer R versions rename this method to "ward.D" (see also "ward.D2")
##
## Call:
## hclust(d = dist(data)^2, method = "ward")
##
## Cluster method : ward
## Distance : euclidean
## Number of objects: 5
plot(m3)
(m4 <- hclust(dist(data)^2, method = "average"))
##
## Call:
## hclust(d = dist(data)^2, method = "average")
##
## Cluster method : average
## Distance : euclidean
## Number of objects: 5
plot(m4)
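Each dendrogram above can be cut at a chosen height to obtain any number of groups, which is how the "start with many clusters and keep reducing their number" idea is applied in practice. A minimal sketch with cutree():
cutree(m1, k = 2)  # membership of a-e when the single-linkage tree is cut into 2 groups
cutree(m1, k = 3)  # the same tree cut into 3 groups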
K-means
Using iris
rm(list = ls(all = TRUE))
data <- iris
head(data)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
data$Species <- NULL
head(data)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
# start with k = 3, using the prior knowledge that there are 3 species
(m <- kmeans(data, 3))
## K-means clustering with 3 clusters of sizes 38, 62, 50
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 6.850 3.074 5.742 2.071
## 2 5.902 2.748 4.394 1.434
## 3 5.006 3.428 1.462 0.246
##
## Clustering vector:
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 1
## [106] 1 2 1 1 1 1 1 1 2 2 1 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 1 1 2 1 1 1 1 2 1
## [141] 1 1 2 1 1 1 2 1 1 2
##
## Within cluster sum of squares by cluster:
## [1] 23.88 39.82 15.15
## (between_SS / total_SS = 88.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
table(iris$Species, m$cluster)
##
## 1 2 3
## setosa 0 0 50
## versicolor 2 48 0
## virginica 36 14 0
plot(data[c("Sepal.Length", "Sepal.Width")], main = "kmeans", col = m$cluster)
plot(iris$Sepal.Length, iris$Sepal.Width, main = "true", col = c(1, 2, 3)[unclass(iris$Species)])
# starting with k = 4 increases the mismatch with the true species
(m <- kmeans(data, 4))
## K-means clustering with 4 clusters of sizes 45, 50, 27, 28
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 6.264 2.884 4.887 1.667
## 2 5.006 3.428 1.462 0.246
## 3 7.015 3.096 5.919 2.156
## 4 5.532 2.636 3.961 1.229
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 4 1 4 1 4 1 4 4 4 4 1 4 1 4 4 1 4
## [71] 1 4 1 1 1 1 1 1 1 4 4 4 4 1 4 1 1 1 4 4 4 1 4 4 4 4 4 1 4 4 3 1 3 1 3
## [106] 3 4 3 3 3 1 1 3 1 1 1 1 3 3 1 3 1 3 1 3 3 1 1 3 3 3 3 3 1 1 3 3 1 1 3
## [141] 3 3 1 3 3 3 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 17.014 15.151 15.351 9.749
## (between_SS / total_SS = 91.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
table(iris$Species, m$cluster)
##
## 1 2 3 4
## setosa 0 50 0 0
## versicolor 23 0 0 27
## virginica 22 0 27 1
plot(data[c("Sepal.Length", "Sepal.Width")], col = m$cluster)
Using wine
rm(list = ls(all = TRUE))
library(HDclassif)
## Loading required package: MASS
data(wine, package = "HDclassif")
data <- wine
head(data)
## class V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
## 1 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065
## 2 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
## 3 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185
## 4 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480
## 5 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735
## 6 1 14.20 1.76 2.45 15.2 112 3.27 3.39 0.34 1.97 6.75 1.05 2.85 1450
str(data)
## 'data.frame': 178 obs. of 14 variables:
## $ class: int 1 1 1 1 1 1 1 1 1 1 ...
## $ V1 : num 14.2 13.2 13.2 14.4 13.2 ...
## $ V2 : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ V3 : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ V4 : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ V5 : int 127 100 101 113 118 112 96 121 97 98 ...
## $ V6 : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ V7 : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ V8 : num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ V9 : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ V10 : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ V11 : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ V12 : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ V13 : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
# drop the class column
data <- na.omit(data[, -1])
data <- scale(data) # standardize so all variables are on a comparable scale
str(data)
## num [1:178, 1:13] 1.514 0.246 0.196 1.687 0.295 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:178] "1" "2" "3" "4" ...
## ..$ : chr [1:13] "V1" "V2" "V3" "V4" ...
## - attr(*, "scaled:center")= Named num [1:13] 13 2.34 2.37 19.49 99.74 ...
## ..- attr(*, "names")= chr [1:13] "V1" "V2" "V3" "V4" ...
## - attr(*, "scaled:scale")= Named num [1:13] 0.812 1.117 0.274 3.34 14.282 ...
## ..- attr(*, "names")= chr [1:13] "V1" "V2" "V3" "V4" ...
head(data)
## V1 V2 V3 V4 V5 V6 V7 V8 V9
## 1 1.5143 -0.56067 0.2314 -1.1663 1.90852 0.8067 1.0319 -0.6577 1.2214
## 2 0.2456 -0.49801 -0.8257 -2.4838 0.01809 0.5670 0.7316 -0.8184 -0.5432
## 3 0.1963 0.02117 1.1062 -0.2680 0.08811 0.8067 1.2121 -0.4970 2.1300
## 4 1.6868 -0.34584 0.4866 -0.8070 0.92830 2.4844 1.4624 -0.9791 1.0293
## 5 0.2949 0.22705 1.8352 0.4507 1.27838 0.8067 0.6615 0.2262 0.4003
## 6 1.4774 -0.51591 0.3043 -1.2861 0.85828 1.5577 1.3623 -0.1756 0.6623
## V10 V11 V12 V13
## 1 0.2510 0.3612 1.8427 1.01016
## 2 -0.2925 0.4049 1.1103 0.96253
## 3 0.2683 0.3174 0.7864 1.39122
## 4 1.1827 -0.4263 1.1807 2.32801
## 5 -0.3184 0.3612 0.4483 -0.03777
## 6 0.7298 0.4049 0.3357 2.23274
wss <- 0
wss
## [1] 0
for (i in 1:15) {
wss[i] <- sum(kmeans(data, centers = i)$withinss)
}
wss
## [1] 2301.0 1649.7 1270.7 1180.0 1111.4 1051.6 984.7 944.7 878.4 865.5
## [11] 839.8 787.1 764.7 761.2 708.2
# centers : either the number of clusters, say k, or a set of initial
# (distinct) cluster centres. If a number, a random set of (distinct) rows
# in x is chosen as the initial centres
# withinss: Vector of within-cluster sum of squares, one component per
# cluster.
plot(1:15, wss, type = "b", xlab = "Number of Clusters", ylab = "Within group sum of squares")
# --- the curve drops steeply up to 3 clusters, so k = 3 is a reasonable choice
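Note that the elbow curve was computed on the scaled matrix data, whereas the run below clusters the raw data2, in which V13 sits on a much larger scale and dominates the Euclidean distances. As a minimal sketch (the seed and nstart are arbitrary choices), the scaled matrix can be clustered directly and compared with the classes:
set.seed(1)  # k-means depends on the random initial centres
m_scaled <- kmeans(data, centers = 3, nstart = 25)
table(wine$class, m_scaled$cluster)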
data2 <- wine
data2$class <- NULL
m <- kmeans(data2, 3)
m
## K-means clustering with 3 clusters of sizes 28, 100, 50
##
## Cluster means:
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
## 1 13.82 1.774 2.490 16.96 105.36 2.924 3.111 0.2986 1.987 6.203 1.1036
## 2 12.60 2.464 2.329 20.70 93.74 2.050 1.634 0.3987 1.422 4.695 0.9119
## 3 13.34 2.397 2.372 18.51 108.60 2.432 2.215 0.3236 1.707 5.144 0.9667
## V12 V13
## 1 2.985 1301.5
## 2 2.382 517.8
## 3 2.863 894.6
##
## Clustering vector:
## [1] 3 3 1 1 3 1 1 1 3 3 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 1 1 3 3 1 1 3 1 3
## [36] 3 3 1 3 3 3 3 3 2 3 3 3 3 3 1 1 1 1 1 3 1 3 1 1 2 2 2 2 2 2 2 2 2 3 3
## [71] 3 2 2 3 3 2 2 2 3 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 3 2 2 2 2
## [106] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2
## [141] 2 3 2 2 3 3 2 2 2 2 2 2 2 2 2 3 2 3 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 3 3
## [176] 3 3 2
##
## Within cluster sum of squares by cluster:
## [1] 550201 1263331 815784
## (between_SS / total_SS = 85.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
table(wine$class, m$cluster)
##
## 1 2 3
## 1 28 1 30
## 2 0 62 9
## 3 0 37 11
plot(data2[c("V1", "V2")], col = m$cluster)
points(m$centers[, c("V1", "V2")], col = 1:3, pch = 8, cex = 2)
Advantages of PAM (k-medoids, used below): a more robust variant of k-means; tolerates missing data; profiles are expressed through actual observations (the medoids)
Disadvantage: processing time rises sharply on large data sets
Determining the optimal number of clusters
Using iris
rm(list = ls(all = TRUE)) #--- clear all objects from the workspace
library(cluster)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
(m1 <- kmeans(iris[, -5], 3))
## K-means clustering with 3 clusters of sizes 62, 38, 50
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.902 2.748 4.394 1.434
## 2 6.850 3.074 5.742 2.071
## 3 5.006 3.428 1.462 0.246
##
## Clustering vector:
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2
## [106] 2 1 2 2 2 2 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2
## [141] 2 2 1 2 2 2 1 2 2 1
##
## Within cluster sum of squares by cluster:
## [1] 39.82 23.88 15.15
## (between_SS / total_SS = 88.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
table(iris$Species, m1$cluster)
##
## 1 2 3
## setosa 0 0 50
## versicolor 48 2 0
## virginica 14 36 0
(m2 <- pam(iris[, -5], 3))
## Medoids:
## ID Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] 8 5.0 3.4 1.5 0.2
## [2,] 79 6.0 2.9 4.5 1.5
## [3,] 113 6.8 3.0 5.5 2.1
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
## Objective function:
## build swap
## 0.6709 0.6542
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
table(iris$Species, m2$cluster)
##
## 1 2 3
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 14 36
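pam objects also carry silhouette information (printed in full by summary(m2) below); as a sketch, it can be plotted directly:
plot(m2, which.plots = 2)  # silhouette plot of the pam solution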
summary(m1)
## Length Class Mode
## cluster 150 -none- numeric
## centers 12 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
summary(m2)
## Medoids:
## ID Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] 8 5.0 3.4 1.5 0.2
## [2,] 79 6.0 2.9 4.5 1.5
## [3,] 113 6.8 3.0 5.5 2.1
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
## Objective function:
## build swap
## 0.6709 0.6542
##
## Numerical information per cluster:
## size max_diss av_diss diameter separation
## [1,] 50 1.237 0.4846 2.429 1.6401
## [2,] 62 1.838 0.7470 2.678 0.2646
## [3,] 38 1.723 0.7260 2.419 0.2646
##
## Isolated clusters:
## L-clusters: character(0)
## L*-clusters: character(0)
##
## Silhouette plot information:
## cluster neighbor sil_width
## 8 1 2 0.85391
## 1 1 2 0.85296
## 50 1 2 0.85210
## 18 1 2 0.85102
## 40 1 2 0.85033
## 41 1 2 0.84942
## 5 1 2 0.84930
## 29 1 2 0.84364
## 28 1 2 0.84202
## 38 1 2 0.84189
## 12 1 2 0.83591
## 27 1 2 0.83347
## 36 1 2 0.83225
## 3 1 2 0.82932
## 35 1 2 0.82877
## 22 1 2 0.82591
## 10 1 2 0.82529
## 7 1 2 0.82165
## 20 1 2 0.82030
## 48 1 2 0.81902
## 49 1 2 0.81823
## 30 1 2 0.81785
## 2 1 2 0.81549
## 31 1 2 0.81519
## 47 1 2 0.81340
## 13 1 2 0.81056
## 46 1 2 0.80978
## 4 1 2 0.80501
## 11 1 2 0.80310
## 44 1 2 0.80024
## 32 1 2 0.79899
## 26 1 2 0.79866
## 37 1 2 0.79415
## 24 1 2 0.79411
## 23 1 2 0.79297
## 43 1 2 0.78658
## 21 1 2 0.78418
## 17 1 2 0.77568
## 25 1 2 0.77504
## 39 1 2 0.76857
## 33 1 2 0.76273
## 9 1 2 0.75215
## 6 1 2 0.74828
## 45 1 2 0.74699
## 14 1 2 0.74615
## 34 1 2 0.72225
## 19 1 2 0.70686
## 15 1 2 0.70259
## 16 1 2 0.64377
## 42 1 2 0.63900
## 100 2 3 0.63064
## 95 2 3 0.62754
## 93 2 3 0.62445
## 97 2 3 0.62206
## 83 2 3 0.61973
## 90 2 3 0.61436
## 70 2 3 0.61158
## 68 2 3 0.61073
## 89 2 3 0.60716
## 96 2 3 0.60657
## 91 2 3 0.59561
## 81 2 3 0.59466
## 54 2 3 0.59294
## 56 2 3 0.59221
## 72 2 3 0.58969
## 62 2 3 0.58710
## 60 2 3 0.57829
## 65 2 1 0.56310
## 82 2 1 0.56076
## 98 2 3 0.55780
## 67 2 3 0.55751
## 79 2 3 0.55294
## 63 2 3 0.55108
## 85 2 3 0.54077
## 80 2 1 0.50698
## 92 2 3 0.50353
## 75 2 3 0.50005
## 74 2 3 0.49425
## 88 2 3 0.48936
## 64 2 3 0.48217
## 107 2 3 0.46682
## 69 2 3 0.46150
## 86 2 3 0.41599
## 76 2 3 0.38549
## 52 2 3 0.38119
## 61 2 1 0.37479
## 55 2 3 0.36885
## 139 2 3 0.35125
## 59 2 3 0.34419
## 122 2 3 0.33356
## 71 2 3 0.32910
## 120 2 3 0.32869
## 66 2 3 0.32459
## 73 2 3 0.31046
## 127 2 3 0.30904
## 94 2 1 0.29362
## 114 2 3 0.28503
## 57 2 3 0.28233
## 58 2 1 0.26525
## 84 2 3 0.26087
## 128 2 3 0.25227
## 102 2 3 0.23225
## 143 2 3 0.23225
## 124 2 3 0.20297
## 150 2 3 0.18544
## 87 2 3 0.16655
## 99 2 1 0.14132
## 134 2 3 0.13900
## 77 2 3 0.12629
## 147 2 3 0.10417
## 51 2 3 0.02672
## 115 2 3 0.02636
## 144 3 2 0.61325
## 103 3 2 0.61194
## 121 3 2 0.60703
## 125 3 2 0.58015
## 126 3 2 0.57818
## 141 3 2 0.57023
## 145 3 2 0.56708
## 110 3 2 0.56152
## 131 3 2 0.56017
## 108 3 2 0.55917
## 105 3 2 0.55778
## 113 3 2 0.55510
## 136 3 2 0.55187
## 106 3 2 0.54384
## 140 3 2 0.53445
## 130 3 2 0.51609
## 123 3 2 0.51237
## 101 3 2 0.49928
## 118 3 2 0.49487
## 132 3 2 0.48442
## 119 3 2 0.48341
## 133 3 2 0.46255
## 137 3 2 0.45550
## 129 3 2 0.45434
## 109 3 2 0.44076
## 146 3 2 0.42514
## 117 3 2 0.42111
## 142 3 2 0.41026
## 116 3 2 0.39825
## 138 3 2 0.38878
## 104 3 2 0.36076
## 149 3 2 0.35245
## 148 3 2 0.31493
## 111 3 2 0.26063
## 112 3 2 0.22965
## 78 3 2 0.11798
## 53 3 2 0.05340
## 135 3 2 0.05329
## Average silhouette width per cluster:
## [1] 0.7981 0.4173 0.4511
## Average silhouette width of total data set:
## [1] 0.5528
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
Using Adult
rm(list = ls(all = TRUE))
data(AdultUCI, package = "arules")
data <- AdultUCI
idx <- sample(nrow(data[complete.cases(data), ]), 1000)
head(idx)
## [1] 10531 27789 1060 10270 12822 24575
# ?complete.cases : Return a logical vector indicating which cases are
# complete, i.e., have no missing values.
system.time(m <- kmeans(data[idx, c(1, 3, 5, 11:13)], 2))
## user system elapsed
## 0 0 0
table(m$cluster, data[idx, "income"])
##
## small large
## 1 165 75
## 2 594 166
library(cluster)
system.time(m <- pam(data[idx, c(1, 3, 5, 11:13)], 2))
## user system elapsed
## 0.15 0.00 0.16
table(m$cluster, data[idx, "income"])
##
## small large
## 1 193 86
## 2 566 155
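Because pam() slows down sharply on larger data (the disadvantage noted earlier), the cluster package also provides clara(), which applies pam() to sub-samples; a minimal sketch on the same columns (samples = 50 is an arbitrary choice):
system.time(mc <- clara(data[idx, c(1, 3, 5, 11:13)], 2, samples = 50))
table(mc$clustering, data[idx, "income"])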
rm(list = ls(all = TRUE)) #--- clear all objects from the workspace
data <- iris
(idx <- sample(1:dim(data)[1], 40))
## [1] 116 4 13 121 12 5 44 150 119 14 43 34 27 36 134 131 59
## [18] 149 56 82 28 86 53 127 58 138 136 92 135 45 18 39 48 95
## [35] 66 57 8 1 112 109
(train <- iris[idx, ])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 116 6.4 3.2 5.3 2.3 virginica
## 4 4.6 3.1 1.5 0.2 setosa
## 13 4.8 3.0 1.4 0.1 setosa
## 121 6.9 3.2 5.7 2.3 virginica
## 12 4.8 3.4 1.6 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 44 5.0 3.5 1.6 0.6 setosa
## 150 5.9 3.0 5.1 1.8 virginica
## 119 7.7 2.6 6.9 2.3 virginica
## 14 4.3 3.0 1.1 0.1 setosa
## 43 4.4 3.2 1.3 0.2 setosa
## 34 5.5 4.2 1.4 0.2 setosa
## 27 5.0 3.4 1.6 0.4 setosa
## 36 5.0 3.2 1.2 0.2 setosa
## 134 6.3 2.8 5.1 1.5 virginica
## 131 7.4 2.8 6.1 1.9 virginica
## 59 6.6 2.9 4.6 1.3 versicolor
## 149 6.2 3.4 5.4 2.3 virginica
## 56 5.7 2.8 4.5 1.3 versicolor
## 82 5.5 2.4 3.7 1.0 versicolor
## 28 5.2 3.5 1.5 0.2 setosa
## 86 6.0 3.4 4.5 1.6 versicolor
## 53 6.9 3.1 4.9 1.5 versicolor
## 127 6.2 2.8 4.8 1.8 virginica
## 58 4.9 2.4 3.3 1.0 versicolor
## 138 6.4 3.1 5.5 1.8 virginica
## 136 7.7 3.0 6.1 2.3 virginica
## 92 6.1 3.0 4.6 1.4 versicolor
## 135 6.1 2.6 5.6 1.4 virginica
## 45 5.1 3.8 1.9 0.4 setosa
## 18 5.1 3.5 1.4 0.3 setosa
## 39 4.4 3.0 1.3 0.2 setosa
## 48 4.6 3.2 1.4 0.2 setosa
## 95 5.6 2.7 4.2 1.3 versicolor
## 66 6.7 3.1 4.4 1.4 versicolor
## 57 6.3 3.3 4.7 1.6 versicolor
## 8 5.0 3.4 1.5 0.2 setosa
## 1 5.1 3.5 1.4 0.2 setosa
## 112 6.4 2.7 5.3 1.9 virginica
## 109 6.7 2.5 5.8 1.8 virginica
train$Species <- NULL
(m <- hclust(dist(train), method = "ave"))  # "ave" partially matches the "average" linkage
##
## Call:
## hclust(d = dist(train), method = "ave")
##
## Cluster method : average
## Distance : euclidean
## Number of objects: 40
plot(m, hang = -1, labels = iris$Species[idx])
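To compare the tree with the true species, cut it into three groups; a short sketch (the exact counts depend on the random sample idx drawn above):
groups <- cutree(m, k = 3)  # cluster membership of the 40 sampled flowers
table(groups, iris$Species[idx])
rect.hclust(m, k = 3)  # outline the three clusters on the dendrogram just plotted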
?fpc : Flexible procedures for clustering
?dbscan
Clusters require a minimum number of points (MinPts) within a maximum distance (eps) around one of their members (the seed). Any point within eps of any point that satisfies the seed condition is a cluster member (recursively). Some points may not belong to any cluster (noise).
We have clustered a 100,000 x 2 dataset in 40 minutes on a Pentium M 1600 MHz.
rm(list = ls(all = TRUE)) #--- clear all objects from the workspace
data <- iris[-5]
library(fpc)
## Loading required package: mclust
## Package 'mclust' version 4.2
## Loading required package: flexmix
## Loading required package: lattice
(m <- dbscan(data, eps = 0.42, MinPts = 5))
## dbscan Pts=150 MinPts=5 eps=0.42
## 0 1 2 3
## border 29 6 10 12
## seed 0 42 27 24
## total 29 48 37 36
table(m$cluster, iris$Species)
##
## setosa versicolor virginica
## 0 2 10 17
## 1 48 0 0
## 2 0 37 0
## 3 0 3 33
plot(m, data)
library(fpc)
(m <- dbscan(data, eps = 0.42, MinPts = 10))
## dbscan Pts=150 MinPts=10 eps=0.42
## 0 1 2 3
## border 81 14 9 12
## seed 0 30 1 3
## total 81 44 10 15
table(m$cluster, iris$Species)
##
## setosa versicolor virginica
## 0 6 25 50
## 1 44 0 0
## 2 0 10 0
## 3 0 15 0
plot(m, data)
library(fpc)
(m <- dbscan(data, eps = 0.5, MinPts = 5))
## dbscan Pts=150 MinPts=5 eps=0.5
## 0 1 2
## border 17 4 12
## seed 0 45 72
## total 17 49 84
table(m$cluster, iris$Species)
##
## setosa versicolor virginica
## 0 1 6 10
## 1 49 0 0
## 2 0 44 40
plot(m, data)
plot(m, data[c(1, 4)])
plotcluster(data, m$cluster)
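A common way to choose eps is to look at each point's distance to its MinPts-th nearest neighbour and take the "knee" of the sorted curve; a rough base-R sketch for MinPts = 5:
d <- as.matrix(dist(data))  # pairwise distances on the four iris variables
knn5 <- apply(d, 1, function(x) sort(x)[6])  # 5th nearest neighbour; position 1 is the point itself
plot(sort(knn5), type = "l", ylab = "distance to 5th nearest neighbour")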
set.seed(435)
(idx <- sample(1:nrow(iris), 10))
## [1] 120 77 39 126 33 74 113 27 72 73
newData <- iris[idx, -5]
(newData <- newData + matrix(runif(10 * 4, min = 0, max = 0.2), nrow = 10, ncol = 4))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 120 6.047 2.331 5.015 1.5108
## 77 6.872 2.898 4.955 1.5875
## 39 4.532 3.185 1.430 0.3469
## 126 7.374 3.305 6.165 1.8097
## 33 5.255 4.167 1.697 0.2792
## 74 6.206 2.820 4.890 1.3135
## 113 6.933 3.120 5.548 2.2059
## 27 5.199 3.458 1.631 0.5135
## 72 6.183 2.913 4.047 1.4119
## 73 6.311 2.600 4.969 1.6016
(pred <- predict(m, data, newData))
## Loading required package: class
## [1] 2 2 1 2 1 2 2 1 2 2
plot(data[c(1, 4)], col = 1 + m$cluster)
points(newData[c(1, 4)], pch = "*", col = 1 + pred, cex = 3)
table(pred, iris$Species[idx])
##
## pred setosa versicolor virginica
## 1 3 0 0
## 2 0 4 3
rm(list = ls(all = TRUE)) #--- clear all objects from the workspace
(data <- rbind(cbind(rnorm(10, 0, 0.5), rnorm(10, 0, 0.5)), cbind(rnorm(15,
5, 0.5), rnorm(15, 5, 0.5)), cbind(rnorm(3, 3.2, 0.5), rnorm(3, 3.2, 0.5))))
## [,1] [,2]
## [1,] 0.34663 -1.12843
## [2,] -0.36434 0.02269
## [3,] 0.67791 0.39530
## [4,] -0.83303 -0.54488
## [5,] -0.69094 0.05315
## [6,] -0.19260 0.04469
## [7,] 0.20382 0.89237
## [8,] 0.06322 0.48460
## [9,] -0.21612 0.40975
## [10,] 0.39748 -0.16921
## [11,] 3.89270 5.62290
## [12,] 4.95182 4.52439
## [13,] 4.50071 4.58014
## [14,] 5.52677 5.26108
## [15,] 5.03939 5.56245
## [16,] 4.74013 5.74071
## [17,] 4.94086 5.76824
## [18,] 4.29632 4.45295
## [19,] 5.16870 4.34875
## [20,] 4.27382 5.32184
## [21,] 4.61125 5.10828
## [22,] 5.06349 4.95069
## [23,] 4.37634 5.46680
## [24,] 5.50856 4.30084
## [25,] 5.27155 5.37243
## [26,] 3.39470 3.12617
## [27,] 2.47825 2.18290
## [28,] 3.18953 2.78227
(m <- fanny(data, 2))
## Fuzzy Clustering object of class 'fanny' :
## m.ship.expon. 2
## objective 14.18
## tolerance 1e-15
## iterations 10
## converged 1
## maxit 500
## n 28
## Membership coefficients (in %, rounded):
## [,1] [,2]
## [1,] 88 12
## [2,] 96 4
## [3,] 92 8
## [4,] 91 9
## [5,] 94 6
## [6,] 97 3
## [7,] 92 8
## [8,] 96 4
## [9,] 96 4
## [10,] 94 6
## [11,] 11 89
## [12,] 6 94
## [13,] 6 94
## [14,] 7 93
## [15,] 5 95
## [16,] 6 94
## [17,] 7 93
## [18,] 8 92
## [19,] 8 92
## [20,] 6 94
## [21,] 4 96
## [22,] 4 96
## [23,] 6 94
## [24,] 10 90
## [25,] 6 94
## [26,] 32 68
## [27,] 54 46
## [28,] 38 62
## Fuzzyness coefficients:
## dunn_coeff normalized
## 0.8399 0.6798
## Closest hard clustering:
## [1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2
##
## Available components:
## [1] "membership" "coeff" "memb.exp" "clustering" "k.crisp"
## [6] "objective" "convergence" "diss" "call" "silinfo"
## [11] "data"
plot(m)
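The membership matrix can be inspected directly or collapsed to a crisp assignment; a short sketch:
round(m$membership, 2)  # membership degrees per point, as printed above in %
hard <- apply(m$membership, 1, which.max)  # matches the "closest hard clustering" above
plot(data, col = hard, pch = 19)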
rm(list = ls(all = TRUE)) #--- clear all objects from the workspace