x = c(10,11,12,10,11,12,20,21,22,20,21,22)
y = c(4000,3900,4000,1000,800,1000,4000,3900,4000,1000,800,1000)
simpleSquare = data.frame(x,y)
simpleSquare
## x y
## 1 10 4000
## 2 11 3900
## 3 12 4000
## 4 10 1000
## 5 11 800
## 6 12 1000
## 7 20 4000
## 8 21 3900
## 9 22 4000
## 10 20 1000
## 11 21 800
## 12 22 1000
ss_scaled = scale(simpleSquare)
ss_scaled
## x y
## [1,] -1.1338934 0.977290
## [2,] -0.9449112 0.914239
## [3,] -0.7559289 0.977290
## [4,] -1.1338934 -0.914239
## [5,] -0.9449112 -1.040341
## [6,] -0.7559289 -0.914239
## [7,] 0.7559289 0.977290
## [8,] 0.9449112 0.914239
## [9,] 1.1338934 0.977290
## [10,] 0.7559289 -0.914239
## [11,] 0.9449112 -1.040341
## [12,] 1.1338934 -0.914239
## attr(,"scaled:center")
## x y
## 16 2450
## attr(,"scaled:scale")
## x y
## 5.291503 1586.018457
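# the attributes above store each column's mean and sd, so the standardization
# can be undone; e.g. recover the first few original x values (illustrative
# check, not part of the original analysis):
head(ss_scaled[, "x"] * attr(ss_scaled, "scaled:scale")["x"] +
       attr(ss_scaled, "scaled:center")["x"]) # should give 10 11 12 10 11 12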
summary(ss_scaled)
## x y
## Min. :-1.1339 Min. :-1.0403
## 1st Qu.:-0.9449 1st Qu.:-0.9142
## Median : 0.0000 Median : 0.0000
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.9449 3rd Qu.: 0.9773
## Max. : 1.1339 Max. : 0.9773
# check that each standardized column has standard deviation 1
sd(ss_scaled[,'x'])
## [1] 1
sd(ss_scaled[,'y'])
## [1] 1
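# sanity check: scale() with the defaults is just (value - column mean) / column sd;
# reproduce the standardized x column by hand (illustrative)
all.equal(unname(ss_scaled[, "x"]), (x - mean(x)) / sd(x)) # should print TRUE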

ss_scaled_dist = dist(ss_scaled) # pairwise Euclidean distances between the standardized rows
ss_scaled_dist
## 1 2 3 4 5 6 7
## 2 0.1992228
## 3 0.3779645 0.1992228
## 4 1.8915291 1.8382183 1.9289218
## 5 2.0264622 1.9545800 2.0264622 0.2271915
## 6 1.9289218 1.8382183 1.8915291 0.3779645 0.2271915
## 7 1.8898224 1.7020084 1.5118579 2.6738195 2.6388808 2.4214864
## 8 2.0797606 1.8898224 1.7020084 2.7685304 2.7187886 2.4972363 0.1992228
## 9 2.2677868 2.0797606 1.8898224 2.9530898 2.8969404 2.6738195 0.3779645
## 10 2.6738195 2.4972363 2.4214864 1.8898224 1.7055084 1.5118579 1.8915291
## 11 2.8969404 2.7187886 2.6388808 2.0826258 1.8898224 1.7055084 2.0264622
## 12 2.9530898 2.7685304 2.6738195 2.2677868 2.0826258 1.8898224 1.9289218
## 8 9 10 11
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9 0.1992228
## 10 1.8382183 1.9289218
## 11 1.9545800 2.0264622 0.2271915
## 12 1.8382183 1.8915291 0.3779645 0.2271915
min(ss_scaled_dist) # nearest pair of points
## [1] 0.1992228
max(ss_scaled_dist) # farthest pair of points
## [1] 2.95309
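# dist() defaults to Euclidean distance; recompute the smallest entry by hand
# to confirm (rows 1 and 2 of the standardized data):
sqrt(sum((ss_scaled[1, ] - ss_scaled[2, ])^2)) # should match 0.1992228 above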

result_hclust = hclust(ss_scaled_dist) # complete-linkage hierarchical clustering (hclust default)
result_hclust
##
## Call:
## hclust(d = ss_scaled_dist)
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 12
plot(result_hclust)
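# optionally outline the k = 4 clusters on the dendrogram just plotted
# (rect.hclust draws boxes around the corresponding branches; illustrative)
rect.hclust(result_hclust, k = 4, border = "red")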

require(NbClust)
## Loading required package: NbClust
nc = NbClust(ss_scaled, min.nc = 2, max.nc = 6, method = "kmeans")
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 1 proposed 2 as the best number of clusters
## * 2 proposed 3 as the best number of clusters
## * 17 proposed 4 as the best number of clusters
## * 2 proposed 6 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 4
##
##
## *******************************************************************
barplot(table(nc$Best.nc[1, ]), xlab = "Number of Clusters", ylab = "Number of Criteria",
        main = "Number of Clusters chosen by criteria")

wssplot = function(data, nc = 6, seed = 1234, plot = TRUE) {
  # for k = 1, the within-group sum of squares is the total variance around the grand mean
  wss = (nrow(data) - 1) * sum(apply(data, 2, var))
  for (i in 2:nc) {
    set.seed(seed)
    wss[i] = sum(kmeans(data, centers = i)$withinss)
  }
  if (plot) plot(1:nc, wss, type = "b",
                 xlab = "Number of clusters",
                 ylab = "Within group sum of squares")
}
wssplot(ss_scaled)

#https://smlee729.github.io/r/machine%20learning/2015/03/20/1-k-means.html
fit.km = kmeans(ss_scaled, centers = 4, nstart = 20) # k-means with k = 4 and 20 random starts
print(fit.km)
## K-means clustering with 4 clusters of sizes 3, 3, 3, 3
##
## Cluster means:
## x y
## 1 0.9449112 0.956273
## 2 -0.9449112 -0.956273
## 3 -0.9449112 0.956273
## 4 0.9449112 -0.956273
##
## Clustering vector:
## [1] 3 3 3 2 2 2 1 1 1 4 4 4
##
## Within cluster sum of squares by cluster:
## [1] 0.07407885 0.08202970 0.07407885 0.08202970
## (between_SS / total_SS = 98.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
plot(ss_scaled, col = fit.km$cluster, cex = 2, pch = 1, lwd = 2) # points colored by k-means cluster
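# for symmetry with the PAM medoid plot below, the k-means centroids can be
# overlaid on the scatter plot the same way (illustrative)
points(fit.km$centers, pch = 3, cex = 3) # mark each centroid with +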

cutree(result_hclust, k=2)
## [1] 1 1 1 1 1 1 2 2 2 2 2 2
cutree(result_hclust, k=3)
## [1] 1 1 1 1 1 1 2 2 2 3 3 3
cutree(result_hclust, k=4)
## [1] 1 1 1 2 2 2 3 3 3 4 4 4
cutree(result_hclust, k=5)
## [1] 1 1 1 2 2 2 3 3 3 4 4 5
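# cutree can also cut at a height h instead of a fixed k; from the distance
# matrix above, any height between roughly 0.4 and 2.0 separates the four squares
cutree(result_hclust, h = 1)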
group2 = cutree(result_hclust, k=2)
group3 = cutree(result_hclust, k=3)
group4 = cutree(result_hclust, k=4)
group5 = cutree(result_hclust, k=5)
par(mfrow = c(2, 2))
plot(ss_scaled, pch = 16, cex = 2, col = group2, main = "2 groups")
plot(ss_scaled, pch = 16, cex = 2, col = group3, main = "3 groups")
plot(ss_scaled, pch = 16, cex = 2, col = group4, main = "4 groups")
plot(ss_scaled, pch = 16, cex = 2, col = group5, main = "5 groups")

require(cluster)
## Loading required package: cluster
pam_result = pam(simpleSquare, k = 4, stand = T) # cluster with the PAM algorithm (number of clusters: 4)
pam_result$medoids # the observations that serve as each cluster's center (medoids)
## x y
## [1,] 11 3900
## [2,] 11 800
## [3,] 21 3900
## [4,] 21 800
pam_result$clustering
## [1] 1 1 1 2 2 2 3 3 3 4 4 4
clusplot(pam_result, main = "Bivariate Cluster Plot")
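# pam() also stores silhouette information; an average width close to 1
# indicates tight, well-separated clusters (quick numeric quality check)
pam_result$silinfo$avg.width
plot(silhouette(pam_result), main = "Silhouette plot") # per-point silhouette widths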

plot(simpleSquare, pch = 16, cex = 2, col = pam_result$clustering, main = "4 groups")
points(pam_result$medoids, pch = 3, cex = 3) # mark each cluster's medoid with +

require(ggplot2)
ss_df = as.data.frame(ss_scaled) # a matrix has no $ operator, so convert to a data frame first
ss_df$clustering = factor(pam_result$clustering)
ggplot(data = ss_df, aes(x = x, y = y, color = clustering, shape = clustering)) +
  geom_point() + ggtitle("Clustering")
