Analisis kluster dan menghitung nilai jarak

library(factoextra)

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(cluster)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ✔ readr     2.1.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Read data

datamultivariat<- read.csv("datamultivariat.csv")
head(datamultivariat)

##     Objek X1 X2 X3 X4
## 1 Objek 1  5  5  3  2
## 2 objek 2  5  3  3  3
## 3 Objek 3  5  5  3  2
## 4 Objek 4  1  1  1  2
## 5 Objek 5  2  1  1  2

Missing Value

Melakukan pengecekan missing value

# cek missing value
anyNA(datamultivariat)

## [1] FALSE

mengecek skala tiap variabel

summary(datamultivariat)

##     Objek                 X1            X2          X3            X4     
##  Length:5           Min.   :1.0   Min.   :1   Min.   :1.0   Min.   :2.0  
##  Class :character   1st Qu.:2.0   1st Qu.:1   1st Qu.:1.0   1st Qu.:2.0  
##  Mode  :character   Median :5.0   Median :3   Median :3.0   Median :2.0  
##                     Mean   :3.6   Mean   :3   Mean   :2.2   Mean   :2.2  
##                     3rd Qu.:5.0   3rd Qu.:5   3rd Qu.:3.0   3rd Qu.:2.0  
##                     Max.   :5.0   Max.   :5   Max.   :3.0   Max.   :3.0

datamulti<-datamultivariat[2:5]
datamulti

##   X1 X2 X3 X4
## 1  5  5  3  2
## 2  5  3  3  3
## 3  5  5  3  2
## 4  1  1  1  2
## 5  2  1  1  2

standisasi Data

scaledata<-scale(datamulti)
scaledata

##              X1 X2         X3         X4
## [1,]  0.7181848  1  0.7302967 -0.4472136
## [2,]  0.7181848  0  0.7302967  1.7888544
## [3,]  0.7181848  1  0.7302967 -0.4472136
## [4,] -1.3337719 -1 -1.0954451 -0.4472136
## [5,] -0.8207827 -1 -1.0954451 -0.4472136
## attr(,"scaled:center")
##  X1  X2  X3  X4 
## 3.6 3.0 2.2 2.2 
## attr(,"scaled:scale")
##        X1        X2        X3        X4 
## 1.9493589 2.0000000 1.0954451 0.4472136

Menghitung Jarak Metode Euclidean

#menghitung kemiripan dan ketidakmiripan
#metode euclidean
us_dist <- dist(x = scaledata, method = "euclidean")
us_dist

##           1         2         3         4
## 2 2.4494897                              
## 3 0.0000000 2.4494897                    
## 4 3.3976256 3.6801983 3.3976256          
## 5 3.1147639 3.4207827 3.1147639 0.5129892

Menghitung jarak Metode Mahalanobis

library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

library(StatMatch)

## Loading required package: proxy

## 
## Attaching package: 'proxy'

## The following objects are masked from 'package:stats':
## 
##     as.dist, dist

## The following object is masked from 'package:base':
## 
##     as.matrix

## Loading required package: survey

## Loading required package: grid

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

## Loading required package: lpSolve

#jarak Mahalanobis antara dua observasi (vektor baris) dalam suatu kumpulan data
(md1 <- mahalanobis.dist(scaledata[1:4,1:2]))

##         [,1]    [,2]    [,3]    [,4]
## [1,] 0.00000 2.12132 0.00000 2.12132
## [2,] 2.12132 0.00000 2.12132 2.44949
## [3,] 0.00000 2.12132 0.00000 2.12132
## [4,] 2.12132 2.44949 2.12132 0.00000

View(md1)

#Analisis Cluster Hierarchical

#analisis clustering hierarchichal
#linked method .membandingkan rupa dendogram yang dihasilkan oleh tiap method
#single linkage
us_hc_single <- hclust(d = us_dist, method = "single")
fviz_dend(us_hc_single, cex = 0.5, 
          main = "Cluster Dendrogram Complete single")

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## cutree untuk menghasilkan cluster data

# cutree untuk menghasilkan cluster data
single_clust <- cutree(us_hc_single,  k = 4)
head(single_clust)

## [1] 1 2 1 3 4

table(single_clust)

## single_clust
## 1 2 3 4 
## 2 1 1 1

fviz_dend(us_hc_single, k = 4, k_colors = "jco", rect = T, 
          main = "single Linkage Cluster")

## menghitung cophenetic distance

#menghitung cophenetic distance
single_coph <- cophenetic(us_hc_single)
cors<- cor(single_coph, us_dist)
cors

## [1] 0.9903457

#average linkage
us_hc_avg <- hclust(d = us_dist, method = "average")
fviz_dend(us_hc_avg, cex = 0.5, 
          main = "Cluster Dendrogram Complete average")

# cutree untuk menghasilkan cluster data
avg_clust <- cutree(us_hc_avg,  k = 4)
head(avg_clust)

## [1] 1 2 1 3 4

table(avg_clust)

## avg_clust
## 1 2 3 4 
## 2 1 1 1

fviz_dend(us_hc_avg, k = 4, k_colors = "jco", rect = T, 
          main = "average Linkage Cluster")

#menghitung cophenetic distance
avg_coph <- cophenetic(us_hc_avg)
coravg<-cor(avg_coph, us_dist)

#centroid linkage
set.seed(100)
us_hc_centroid <- hclust(d = us_dist, method = "centroid")
fviz_dend(us_hc_centroid, cex = 0.5, 
          main = "Cluster Dendrogram Complete centroid")

# cutree untuk menghasilkan cluster data
centroid_clust <- cutree(us_hc_centroid,  k = 4)
head(centroid_clust)

## [1] 1 2 1 3 4

table(centroid_clust)

## centroid_clust
## 1 2 3 4 
## 2 1 1 1

fviz_dend(us_hc_centroid, k = 4, k_colors = "jco", rect = T, 
          main = "centroid Linkage Cluster")

#menghitung cophenetic distance
centroid_coph <- cophenetic(us_hc_centroid)
corcen<-cor(centroid_coph, us_dist)

#Ward’s Minimum Variance
us_hc_ward <- hclust(d = us_dist, method = "ward.D2")
fviz_dend(us_hc_ward, cex = 0.5, 
          main = "Cluster Dendrogram Complete ward")

# cutree untuk menghasilkan cluster data
ward_clust <- cutree(us_hc_ward,  k = 4)
head(ward_clust)

## [1] 1 2 1 3 4

table(ward_clust)

## ward_clust
## 1 2 3 4 
## 2 1 1 1

fviz_dend(us_hc_ward, k = 4, k_colors = "jco", rect = T, 
          main = "ward Linkage Cluster")

#menghitung cophenetic distance
ward_coph <- cophenetic(us_hc_ward)
corward<-cor(ward_coph, us_dist)

#Complete Linkage
us_hc_complete <- hclust(d = us_dist, method = "complete")
fviz_dend(us_hc_complete, cex = 0.5, 
          main = "Cluster Dendrogram Complete Linkage")

# cutree untuk menghasilkan cluster data
complete_clust <- cutree(us_hc_complete,  k = 4)
head(complete_clust)

## [1] 1 2 1 3 4

table(complete_clust)

## complete_clust
## 1 2 3 4 
## 2 1 1 1

fviz_dend(us_hc_complete, k = 4, k_colors = "jco", rect = T, 
          main = "Complete Linkage Cluster")

#menghitung cophenetic distance
complete_coph <- cophenetic(us_hc_complete)
corcomp<-cor(complete_coph, us_dist)

###
KorCop<-data.frame(cors,coravg,corcen,corward,corcomp)
KorCop

##        cors    coravg    corcen   corward   corcomp
## 1 0.9903457 0.9922194 0.9720919 0.9801705 0.9898231

#pemilihan methode clustering "average linkage" karena memiliki korelasi lebih tinggi
#cluster validasi
library(clValid)
# internal measures
internal <- clValid(scaledata, nClust = 2:4,
                    clMethods = "hierarchical",
                    validation = "internal",
                    metric = "euclidean",
                    method = "complete")

## Warning in clValid(scaledata, nClust = 2:4, clMethods = "hierarchical", :
## rownames for data not specified, using 1:nrow(data)

Analisis Cluster K-Means

library(readr)  #Membaca data
library(dplyr)  #Data processing
library(DT)

Dataset

dataaku <- datamultivariat
datatable(dataaku, caption = "Indikator")

data_standardized <- round(scale(dataaku[,2:5]),4) #Hanya memilih kolom/variabel yang berisikan indikator kelahiran dan kematian
datatable(data_standardized, caption = "Data Hasil Standardisasi")

menentukan jumlah cluster

Dalam metode k-means banyaknya klaster ditentukan sendiri oleh pengguna. Maka dari itu perlu dicari jumlah klaster yang optimum yang dapat mengelompokkan objek dengan baik (Perlu diketahui bahwa metode ini relatif subjektif). Salah satu metode yang digunakan adalah Elbow Plot. Elbow Plot merupakan plot antara banyak klaster dengan total within-cluster variation (total dari simpangan per kluster). Banyak klaster yang dipilih adalah bagian “siku” atau titik dimana terdapat penurunan yang tajam sebelum titik tersebut dan disusul penurunan yang tidak tajam setelah titik tersebut. Hal ini karena penambahan jumlah klaster tidak membawa pengaruh banyak atas variasi yang ada di dalam klaster tersebut.

jumlah_klaster <- 4  #Vektor yang berisikan jumlah klaster yang ingin dilihat nilai dari total within-cluster sum of squares
within_ss <- c()  #Vektor kosong yang akan diisi nilai total within-cluster sum of squares
for (i in jumlah_klaster) {
  within_ss <- c(within_ss, kmeans(x = data_standardized, centers = i, nstart = 25)$tot.withinss)
}
plot(x = jumlah_klaster, y = within_ss, type = "b", xlab = "Number of Cluster",
     ylab = "Total Within Sum of Squares", main = "Elbow Plot")
abline(v = 4, col = 'red')

Dilihat dari Elbow Plot diatas, banyaknya klaster yang dipilih adalah 4 klaster. ## Analisis kluster Dalam fungsi kmeans() terdapat parameter nstart yang digunakan untuk memberitahu fungsi berapa kali inisiasi awal yang pengguna inginkan. Pada contoh ini, parameter nstartakan diisi sebanyak 25 sehingga akan ada inisiasi centroid awal yang berbeda sebanyak 25 kali. Setelah itu algoritma K-Means akan memilih hasil klaster terbaik dengan nilai total within-cluster variation yang terkecil dari ke-25 inisiasi centroid awal. Total within-cluster variation yang kecil menunjukkan bahwa secara umum klaster yang terbentuk diisikan oleh observasi yang homogen atau serupa.

#analisis cluster
set.seed(123)
kmeans_clustering <- kmeans(x = data_standardized, centers = 4, nstart = 25)  #parameter nstart digunakan untuk memberitahu fungsi berapa kali inisiasi centroid awal (secara acak) yang akan dibentuk dan centers digunakan untuk memberitahu fungsi berapa jumlah klaster yang akan dibentuk.
kmeans_clustering

## K-means clustering with 4 clusters of sizes 1, 1, 2, 1
## 
## Cluster means:
##        X1 X2      X3      X4
## 1 -1.3338 -1 -1.0954 -0.4472
## 2 -0.8208 -1 -1.0954 -0.4472
## 3  0.7182  1  0.7303 -0.4472
## 4  0.7182  0  0.7303  1.7889
## 
## Clustering vector:
## [1] 3 4 3 1 2
## 
## Within cluster sum of squares by cluster:
## [1] 0 0 0 0
##  (between_SS / total_SS = 100.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Output dari fungsi kmeans() terdiri atas beberapa informasi sebagai berikut :

cluster : vector yang berisikan lokasi klaster tiap objek. centers : matriks yang berisikan centroid/rata-rata nilai tiap klaster. withinss: vektor yang berisikan simpangan tiap klaster yang terbentuk. tot.withinss : total dari simpangan tiap klaster yang terbentuk. Biasa digunakan untuk membuat Elbow Plot guna mengetahui banyak klaster yang sebaiknya dipilih. size : jumlah objek pada tiap klaster.

#mengembalikan nilai centroid
dataaku %>%
  mutate(Klaster = kmeans_clustering$cluster) %>%
  group_by(Klaster) %>%
  summarise(Mean_X1 = mean(X1), Mean_X2 = mean(X2), Mean_X3 = mean(X3), Mean_X4 = mean(X4))

## # A tibble: 4 × 5
##   Klaster Mean_X1 Mean_X2 Mean_X3 Mean_X4
##     <int>   <dbl>   <dbl>   <dbl>   <dbl>
## 1       1       1       1       1       2
## 2       2       2       1       1       2
## 3       3       5       5       3       2
## 4       4       5       3       3       3

dataaku %>%
  mutate (Klaster = kmeans_clustering$cluster) %>%
  select(Objek, Klaster) %>%
  arrange(Klaster)

##     Objek Klaster
## 1 Objek 4       1
## 2 Objek 5       2
## 3 Objek 1       3
## 4 Objek 3       3
## 5 objek 2       4

Dilihat dari rata-rata angka pada masing-masing klaster, maka dapat diinterpretasikan bahwa:

Klaster 1 berisikan objek 4 Klaster 2 berisikan objek 5 Klaster 3 berisikan objek 1 dan 3. Klaster 4 berisikan objek 2