Penentuan jumlah cluster
## Koefisien Silhouette dan Elbow
# Average silhouette width: the curve reaches its maximum at k = 3, so the
# suggested number of clusters is 3.
fviz_nbclust(x = df, FUNcluster = kmeans, method = "silhouette")

# Total within-cluster sum of squares (elbow): the curve flattens at k = 3,
# so the suggested number of clusters is 3.
fviz_nbclust(x = df, FUNcluster = kmeans, method = "wss")

# Gap statistic: the curve reaches its maximum at k = 3, so the suggested
# number of clusters is 3.
fviz_nbclust(x = df, FUNcluster = kmeans, method = "gap_stat")
K-Means
# Fit k-means with 3 centers, trying 25 random starts, then print the fit.
# NOTE(review): no RNG seed is set, so the cluster numbering may differ
# between runs -- confirm whether a fixed seed is wanted.
km <- kmeans(df, nstart = 25, centers = 3)
km
## K-means clustering with 3 clusters of sizes 27, 3, 4
##
## Cluster means:
## X1 X2 X3 X4 X5 X6
## 1 -0.2807163 -0.0567297 -0.1641965 -0.2802687 -0.2801241 -0.05819569
## 2 -0.2532779 -0.1164034 -0.7044065 2.8063111 2.7374587 1.53704231
## 3 2.0847934 0.4702280 1.6366316 -0.2129196 -0.1622566 -0.75996082
## X7 X8 X9 X10 X11 X12
## 1 -0.25956355 -0.2945811 -0.35511977 -0.2727544 -0.3355776 -0.30946989
## 2 2.23976164 -0.2402113 -0.01667494 2.9431467 0.2969241 -0.01772481
## 3 0.07223272 2.1685807 2.40956464 -0.3662678 2.0424559 2.10221535
## X13 X14 X15 X16 X17
## 1 -0.2240174 0.07818793 -0.3021905 -0.1258505 -0.1947817
## 2 -0.4540052 -1.30794912 -0.2061604 -0.1877947 0.3295694
## 3 1.8526212 0.45319332 2.1944058 0.9903371 1.0675995
##
## Clustering vector:
## [1] 1 1 1 1 3 3 1 1 2 2 2 1 1 1 3 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 234.53092 15.62688 29.39630
## (between_SS / total_SS = 50.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# The clustering assigns 27 observations to cluster 1, 3 to cluster 2 and
# 4 to cluster 3, so cluster 1 has the most members.
# Show the internal structure of the fitted kmeans object.
str(object = km)
## List of 9
## $ cluster : int [1:34] 1 1 1 1 3 3 1 1 2 2 ...
## $ centers : num [1:3, 1:17] -0.2807 -0.2533 2.0848 -0.0567 -0.1164 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:17] "X1" "X2" "X3" "X4" ...
## $ totss : num 561
## $ withinss : num [1:3] 234.5 15.6 29.4
## $ tot.withinss: num 280
## $ betweenss : num 281
## $ size : int [1:3] 27 3 4
## $ iter : int 3
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
Visualisasi
# Plot the k-means clusters as points only, titled "k=3".
fviz_cluster(object = km, data = df, geom = "point") + ggtitle("k=3")

# Plot the same clustering with the "jco" palette and a minimal theme.
fviz_cluster(km, data = df, ggtheme = theme_minimal(), palette = "jco")

# For clusters to be good and interpretable, the within-cluster variance
# should be as homogeneous as possible and the between-cluster variance as
# heterogeneous as possible.
# In the cluster plot, observations inside a cluster lie close together and
# form a group, so the within-cluster variance is reasonably homogeneous.
# Observations from different clusters lie far apart, so the between-cluster
# variance is reasonably heterogeneous.
# The analysis can therefore continue with cluster profiling and with
# interpreting the differences between clusters.
# Linkage methods to compare. Each element is named after itself so that
# results computed over `m` are labelled with the method name.
m <- setNames(nm = c("average", "single", "complete", "ward"))
# Compute the agglomerative coefficient for one linkage method.
#
# x:    linkage method name, passed to agnes() as `method`.
# data: data to cluster. Defaults to the script-level `df`, so existing
#       one-argument calls such as sapply(m, ac) behave exactly as before;
#       passing it explicitly removes the hidden dependency on a global.
# Returns the `ac` component of the fitted agnes object.
ac <- function(x, data = df) {
  agnes(data, method = x)$ac
}
# Calculate the agglomerative coefficient for each linkage method.
# vapply() is used instead of sapply() so the result is always a named
# numeric vector with one value per method; anything else raises an error
# rather than silently changing the return type.
vapply(m, ac, numeric(1))
## average single complete ward
## 0.6644974 0.5116934 0.7835753 0.8432422
# Fit agglomerative clustering on df with the "ward" linkage and print it.
clust <- agnes(x = df, method = "ward")
clust
## Call: agnes(x = df, method = "ward")
## Agglomerative coefficient: 0.8432422
## Order of objects:
## [1] 1 34 8 29 26 33 20 30 21 4 32 14 7 13 3 28 2 17 16 31 12 19 25 22 27
## [26] 23 24 9 10 11 5 15 18 6
## Height (summary):
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.702 2.484 3.056 4.501 4.803 18.490
##
## Available components:
## [1] "order" "height" "ac" "merge" "diss" "call" "method" "data"
# Draw the dendrogram of the agnes fit, titled "Dendrogram".
pltree(clust, main = "Dendrogram", hang = -1, cex = 0.6)

# Calculate the gap statistic for each number of clusters (up to 10) using
# hierarchical clustering (hcut) and B = 50 bootstrap reference samples.
# - The seed makes the simulated reference samples, and so the plot,
#   repeatable between runs.
# - The clustering function is passed by its full argument name
#   `FUNcluster` instead of relying on partial matching of `FUN`.
# - `nstart` is a kmeans() option that hcut() does not use, so it is no
#   longer passed.
set.seed(123)
gap_stat <- clusGap(df, FUNcluster = hcut, K.max = 10, B = 50)
# Plot the number of clusters against the gap statistic.
fviz_gap_stat(gap_stat)

# Euclidean distance matrix between the observations in df.
d <- dist(x = df, method = "euclidean")
# Hierarchical clustering on that matrix using Ward's method ("ward.D2").
final_clust <- hclust(d = d, method = "ward.D2")
# Cut the dendrogram into 3 clusters and show each observation's label.
groups <- cutree(tree = final_clust, k = 3)
groups
## [1] 1 1 1 1 2 2 1 1 3 3 3 1 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# Count the observations in each cluster.
table(groups)
## groups
## 1 2 3
## 27 4 3
# Read the spreadsheet that holds the Provinsi names used to label the
# clusters, then print it.
df2 <- read_excel(path = "databersih.xlsx")
df2
## # A tibble: 34 × 18
## Provinsi X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Aceh 34.4 98.5 8.98 1093 103 1059 78 5.8 2.02 2681 22.9
## 2 Sumatera U… 44.6 96.2 8.55 1317 243 722 27 9.78 2.92 1640 39.5
## 3 Sumatera B… 40.8 98.0 8.53 1838 138 614 706 8.22 3.27 4517 37.5
## 4 Riau 37.4 98.6 8.47 392 117 464 41 8.56 2.73 850 26.7
## 5 Jambi 56.2 97.1 9.19 924 263 430 418 19.8 10.1 1627 45.4
## 6 Sumatera S… 58.2 99.5 11.0 2253 138 248 284 27.6 13.1 3085 60.6
## 7 Bengkulu 35.0 97.4 7.28 458 16 389 13 5 2.98 577 27.3
## 8 lampung 37.3 98.2 8.15 598 162 1001 32 9.1 3.49 1804 26.7
## 9 Kepulauan … 39.5 98.6 8.14 5512 807 2592 647 10.4 4.7 18571 37.0
## 10 Kepulauan … 33.3 96.0 7.27 5910 1192 812 535 6.32 3.16 14490 30.7
## # ℹ 24 more rows
## # ℹ 6 more variables: X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>,
## # X17 <dbl>
# Pair each province name with its hierarchical cluster label.
# data.frame() replaces cbind() on the two vectors: cbind() builds a
# character matrix, which turns the integer cluster labels into strings and
# leaves the province column without a name.
final_data <- data.frame(Provinsi = df2$Provinsi, cluster = groups)
final_data
## Provinsi cluster
## 1 Aceh 1
## 2 Sumatera Utara 1
## 3 Sumatera Barat 1
## 4 Riau 1
## 5 Jambi 2
## 6 Sumatera Selatan 2
## 7 Bengkulu 1
## 8 lampung 1
## 9 Kepulauan Bangka Belitung 3
## 10 Kepulauan Riau 3
## 11 DKI Jakarta 3
## 12 Jawa Barat 1
## 13 Jawa Tengah 1
## 14 DI Yogyakarta 1
## 15 Jawa Timur 2
## 16 Banten 1
## 17 Bali 1
## 18 Nusa Tenggara Barat 2
## 19 Nusa Tenggara Timur 1
## 20 Kalimantan Barat 1
## 21 Kalimantan Tengah 1
## 22 Kalimantan Selatan 1
## 23 Kalimantan Timur 1
## 24 Kalimantan Utara 1
## 25 Sulawesi Utara 1
## 26 Sulawesi Tengah 1
## 27 Sulawesi Selatan 1
## 28 Sulawesi Tenggara 1
## 29 Gorontalo 1
## 30 Sulawesi Barat 1
## 31 Maluku 1
## 32 Maluku Utara 1
## 33 Papua Barat 1
## 34 Papua 1
# Display the first six rows of the final data.
head(final_data)
## Provinsi cluster
## 1 Aceh 1
## 2 Sumatera Utara 1
## 3 Sumatera Barat 1
## 4 Riau 1
## 5 Jambi 2
## 6 Sumatera Selatan 2
# View() opens a data viewer window, so it is only called when the session
# is interactive; batch runs and knitting skip it.
if (interactive()) {
  View(final_data)
}