x = c(10,11,12,10,11,12,20,21,22,20,21,22)
y = c(4000,3900,4000,1000,800,1000,4000,3900,4000,1000,800,1000)
simpleSquare = data.frame(x,y)
simpleSquare
## x y
## 1 10 4000
## 2 11 3900
## 3 12 4000
## 4 10 1000
## 5 11 800
## 6 12 1000
## 7 20 4000
## 8 21 3900
## 9 22 4000
## 10 20 1000
## 11 21 800
## 12 22 1000
ss_scaled = scale(simpleSquare)
ss_scaled
## x y
## [1,] -1.1338934 0.977290
## [2,] -0.9449112 0.914239
## [3,] -0.7559289 0.977290
## [4,] -1.1338934 -0.914239
## [5,] -0.9449112 -1.040341
## [6,] -0.7559289 -0.914239
## [7,] 0.7559289 0.977290
## [8,] 0.9449112 0.914239
## [9,] 1.1338934 0.977290
## [10,] 0.7559289 -0.914239
## [11,] 0.9449112 -1.040341
## [12,] 1.1338934 -0.914239
## attr(,"scaled:center")
## x y
## 16 2450
## attr(,"scaled:scale")
## x y
## 5.291503 1586.018457
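# the attributes above store each column's mean and sd, so the standardization
# can be undone; e.g. recover the first few original x values (illustrative
# check, not part of the original analysis):
head(ss_scaled[, "x"] * attr(ss_scaled, "scaled:scale")["x"] +
       attr(ss_scaled, "scaled:center")["x"]) # should give 10 11 12 10 11 12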
summary(ss_scaled)
## x y
## Min. :-1.1339 Min. :-1.0403
## 1st Qu.:-0.9449 1st Qu.:-0.9142
## Median : 0.0000 Median : 0.0000
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.9449 3rd Qu.: 0.9773
## Max. : 1.1339 Max. : 0.9773
# check that each standardized column has standard deviation 1
sd(ss_scaled[,'x'])
## [1] 1
sd(ss_scaled[,'y'])
## [1] 1
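# sanity check: scale() with the defaults is just (value - column mean) / column sd;
# reproduce the standardized x column by hand (illustrative)
all.equal(unname(ss_scaled[, "x"]), (x - mean(x)) / sd(x)) # should print TRUE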

ss_scaled_dist = dist(ss_scaled) # pairwise Euclidean distances between the standardized rows
ss_scaled_dist
## 1 2 3 4 5 6 7
## 2 0.1992228
## 3 0.3779645 0.1992228
## 4 1.8915291 1.8382183 1.9289218
## 5 2.0264622 1.9545800 2.0264622 0.2271915
## 6 1.9289218 1.8382183 1.8915291 0.3779645 0.2271915
## 7 1.8898224 1.7020084 1.5118579 2.6738195 2.6388808 2.4214864
## 8 2.0797606 1.8898224 1.7020084 2.7685304 2.7187886 2.4972363 0.1992228
## 9 2.2677868 2.0797606 1.8898224 2.9530898 2.8969404 2.6738195 0.3779645
## 10 2.6738195 2.4972363 2.4214864 1.8898224 1.7055084 1.5118579 1.8915291
## 11 2.8969404 2.7187886 2.6388808 2.0826258 1.8898224 1.7055084 2.0264622
## 12 2.9530898 2.7685304 2.6738195 2.2677868 2.0826258 1.8898224 1.9289218
## 8 9 10 11
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9 0.1992228
## 10 1.8382183 1.9289218
## 11 1.9545800 2.0264622 0.2271915
## 12 1.8382183 1.8915291 0.3779645 0.2271915
min(ss_scaled_dist) # nearest pair of points
## [1] 0.1992228
max(ss_scaled_dist) # farthest pair of points
## [1] 2.95309
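# dist() defaults to Euclidean distance; recompute the smallest entry by hand
# to confirm (rows 1 and 2 of the standardized data):
sqrt(sum((ss_scaled[1, ] - ss_scaled[2, ])^2)) # should match 0.1992228 above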

result_hclust = hclust(ss_scaled_dist) # complete-linkage hierarchical clustering (hclust default)
result_hclust
##
## Call:
## hclust(d = ss_scaled_dist)
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 12
plot(result_hclust)
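# optionally outline the k = 4 clusters on the dendrogram just plotted
# (rect.hclust draws boxes around the corresponding branches; illustrative)
rect.hclust(result_hclust, k = 4, border = "red")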

require(NbClust)
## Loading required package: NbClust
nc = NbClust(ss_scaled, min.nc = 2, max.nc = 6, method = "kmeans")
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 1 proposed 2 as the best number of clusters
## * 2 proposed 3 as the best number of clusters
## * 17 proposed 4 as the best number of clusters
## * 2 proposed 6 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 4
##
##
## *******************************************************************
barplot(table(nc$Best.nc[1, ]), xlab = "Number of Clusters", ylab = "Number of Criteria",
        main = "Number of Clusters chosen by criteria")

wssplot = function(data, nc = 6, seed = 1234, plot = TRUE) {
  # for k = 1, the within-group sum of squares is the total variance around the grand mean
  wss = (nrow(data) - 1) * sum(apply(data, 2, var))
  for (i in 2:nc) {
    set.seed(seed)
    wss[i] = sum(kmeans(data, centers = i)$withinss)
  }
  if (plot) plot(1:nc, wss, type = "b",
                 xlab = "Number of clusters",
                 ylab = "Within group sum of squares")
}
wssplot(ss_scaled)

#https://smlee729.github.io/r/machine%20learning/2015/03/20/1-k-means.html
fit.km = kmeans(ss_scaled, centers = 4, nstart = 20) # k-means with k = 4 and 20 random starts
print(fit.km)
## K-means clustering with 4 clusters of sizes 3, 3, 3, 3
##
## Cluster means:
## x y
## 1 0.9449112 0.956273
## 2 -0.9449112 -0.956273
## 3 -0.9449112 0.956273
## 4 0.9449112 -0.956273
##
## Clustering vector:
## [1] 3 3 3 2 2 2 1 1 1 4 4 4
##
## Within cluster sum of squares by cluster:
## [1] 0.07407885 0.08202970 0.07407885 0.08202970
## (between_SS / total_SS = 98.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
plot(ss_scaled, col = fit.km$cluster, cex = 2, pch = 1, lwd = 2) # points colored by k-means cluster
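# for symmetry with the PAM medoid plot below, the k-means centroids can be
# overlaid on the scatter plot the same way (illustrative)
points(fit.km$centers, pch = 3, cex = 3) # mark each centroid with +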

cutree(result_hclust, k=2)
## [1] 1 1 1 1 1 1 2 2 2 2 2 2
cutree(result_hclust, k=3)
## [1] 1 1 1 1 1 1 2 2 2 3 3 3
cutree(result_hclust, k=4)
## [1] 1 1 1 2 2 2 3 3 3 4 4 4
cutree(result_hclust, k=5)
## [1] 1 1 1 2 2 2 3 3 3 4 4 5
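# cutree can also cut at a height h instead of a fixed k; from the distance
# matrix above, any height between roughly 0.4 and 2.0 separates the four squares
cutree(result_hclust, h = 1)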
group2 = cutree(result_hclust, k=2)
group3 = cutree(result_hclust, k=3)
group4 = cutree(result_hclust, k=4)
group5 = cutree(result_hclust, k=5)
par(mfrow = c(2, 2))
plot(ss_scaled, pch = 16, cex = 2, col = group2, main = "2 groups")
plot(ss_scaled, pch = 16, cex = 2, col = group3, main = "3 groups")
plot(ss_scaled, pch = 16, cex = 2, col = group4, main = "4 groups")
plot(ss_scaled, pch = 16, cex = 2, col = group5, main = "5 groups")

require(cluster)
## Loading required package: cluster
pam_result = pam(simpleSquare, k = 4, stand = T) # cluster with the PAM algorithm (number of clusters: 4)
pam_result$medoids # the observations that serve as each cluster's center (medoids)
## x y
## [1,] 11 3900
## [2,] 11 800
## [3,] 21 3900
## [4,] 21 800
pam_result$clustering
## [1] 1 1 1 2 2 2 3 3 3 4 4 4
clusplot(pam_result, main = "Bivariate Cluster Plot")
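# pam() also stores silhouette information; an average width close to 1
# indicates tight, well-separated clusters (quick numeric quality check)
pam_result$silinfo$avg.width
plot(silhouette(pam_result), main = "Silhouette plot") # per-point silhouette widths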

plot(simpleSquare, pch = 16, cex = 2, col = pam_result$clustering, main = "4 groups")
points(pam_result$medoids, pch = 3, cex = 3) # mark each cluster's medoid with +

require(ggplot2)
ss_df = as.data.frame(ss_scaled) # a matrix has no $ operator, so convert to a data frame first
ss_df$clustering = factor(pam_result$clustering)
ggplot(data = ss_df, aes(x = x, y = y, color = clustering, shape = clustering)) +
  geom_point() + ggtitle("Clustering")
