library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(cluster)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
datamultivariat<- read.csv("datamultivariat.csv")
head(datamultivariat)
## Objek X1 X2 X3 X4
## 1 Objek 1 5 5 3 2
## 2 objek 2 5 3 3 3
## 3 Objek 3 5 5 3 2
## 4 Objek 4 1 1 1 2
## 5 Objek 5 2 1 1 2
Melakukan pengecekan missing value
# cek missing value
anyNA(datamultivariat)
## [1] FALSE
summary(datamultivariat)
## Objek X1 X2 X3 X4
## Length:5 Min. :1.0 Min. :1 Min. :1.0 Min. :2.0
## Class :character 1st Qu.:2.0 1st Qu.:1 1st Qu.:1.0 1st Qu.:2.0
## Mode :character Median :5.0 Median :3 Median :3.0 Median :2.0
## Mean :3.6 Mean :3 Mean :2.2 Mean :2.2
## 3rd Qu.:5.0 3rd Qu.:5 3rd Qu.:3.0 3rd Qu.:2.0
## Max. :5.0 Max. :5 Max. :3.0 Max. :3.0
datamulti<-datamultivariat[2:5]
datamulti
## X1 X2 X3 X4
## 1 5 5 3 2
## 2 5 3 3 3
## 3 5 5 3 2
## 4 1 1 1 2
## 5 2 1 1 2
scaledata<-scale(datamulti)
scaledata
## X1 X2 X3 X4
## [1,] 0.7181848 1 0.7302967 -0.4472136
## [2,] 0.7181848 0 0.7302967 1.7888544
## [3,] 0.7181848 1 0.7302967 -0.4472136
## [4,] -1.3337719 -1 -1.0954451 -0.4472136
## [5,] -0.8207827 -1 -1.0954451 -0.4472136
## attr(,"scaled:center")
## X1 X2 X3 X4
## 3.6 3.0 2.2 2.2
## attr(,"scaled:scale")
## X1 X2 X3 X4
## 1.9493589 2.0000000 1.0954451 0.4472136
#menghitung kemiripan dan ketidakmiripan
#metode euclidean
us_dist <- dist(x = scaledata, method = "euclidean")
us_dist
## 1 2 3 4
## 2 2.4494897
## 3 0.0000000 2.4494897
## 4 3.3976256 3.6801983 3.3976256
## 5 3.1147639 3.4207827 3.1147639 0.5129892
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(StatMatch)
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: lpSolve
#jarak Mahalanobis antara dua observasi (vektor baris) dalam suatu kumpulan data
(md1 <- mahalanobis.dist(scaledata[1:4,1:2]))
## [,1] [,2] [,3] [,4]
## [1,] 0.00000 2.12132 0.00000 2.12132
## [2,] 2.12132 0.00000 2.12132 2.44949
## [3,] 0.00000 2.12132 0.00000 2.12132
## [4,] 2.12132 2.44949 2.12132 0.00000
View(md1)
#Analisis Cluster Hierarchical
#analisis clustering hierarchichal
#linked method .membandingkan rupa dendogram yang dihasilkan oleh tiap method
#single linkage
us_hc_single <- hclust(d = us_dist, method = "single")
fviz_dend(us_hc_single, cex = 0.5,
main = "Cluster Dendrogram Complete single")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## cutree untuk menghasilkan cluster data
# cutree untuk menghasilkan cluster data
single_clust <- cutree(us_hc_single, k = 4)
head(single_clust)
## [1] 1 2 1 3 4
table(single_clust)
## single_clust
## 1 2 3 4
## 2 1 1 1
fviz_dend(us_hc_single, k = 4, k_colors = "jco", rect = T,
main = "single Linkage Cluster")
## menghitung cophenetic distance
#menghitung cophenetic distance
single_coph <- cophenetic(us_hc_single)
cors<- cor(single_coph, us_dist)
cors
## [1] 0.9903457
#average linkage
us_hc_avg <- hclust(d = us_dist, method = "average")
fviz_dend(us_hc_avg, cex = 0.5,
main = "Cluster Dendrogram Complete average")
# cutree untuk menghasilkan cluster data
avg_clust <- cutree(us_hc_avg, k = 4)
head(avg_clust)
## [1] 1 2 1 3 4
table(avg_clust)
## avg_clust
## 1 2 3 4
## 2 1 1 1
fviz_dend(us_hc_avg, k = 4, k_colors = "jco", rect = T,
main = "average Linkage Cluster")
#menghitung cophenetic distance
avg_coph <- cophenetic(us_hc_avg)
coravg<-cor(avg_coph, us_dist)
#centroid linkage
set.seed(100)
us_hc_centroid <- hclust(d = us_dist, method = "centroid")
fviz_dend(us_hc_centroid, cex = 0.5,
main = "Cluster Dendrogram Complete centroid")
# cutree untuk menghasilkan cluster data
centroid_clust <- cutree(us_hc_centroid, k = 4)
head(centroid_clust)
## [1] 1 2 1 3 4
table(centroid_clust)
## centroid_clust
## 1 2 3 4
## 2 1 1 1
fviz_dend(us_hc_centroid, k = 4, k_colors = "jco", rect = T,
main = "centroid Linkage Cluster")
#menghitung cophenetic distance
centroid_coph <- cophenetic(us_hc_centroid)
corcen<-cor(centroid_coph, us_dist)
#Ward’s Minimum Variance
us_hc_ward <- hclust(d = us_dist, method = "ward.D2")
fviz_dend(us_hc_ward, cex = 0.5,
main = "Cluster Dendrogram Complete ward")
# cutree untuk menghasilkan cluster data
ward_clust <- cutree(us_hc_ward, k = 4)
head(ward_clust)
## [1] 1 2 1 3 4
table(ward_clust)
## ward_clust
## 1 2 3 4
## 2 1 1 1
fviz_dend(us_hc_ward, k = 4, k_colors = "jco", rect = T,
main = "ward Linkage Cluster")
#menghitung cophenetic distance
ward_coph <- cophenetic(us_hc_ward)
corward<-cor(ward_coph, us_dist)
#Complete Linkage
us_hc_complete <- hclust(d = us_dist, method = "complete")
fviz_dend(us_hc_complete, cex = 0.5,
main = "Cluster Dendrogram Complete Linkage")
# cutree untuk menghasilkan cluster data
complete_clust <- cutree(us_hc_complete, k = 4)
head(complete_clust)
## [1] 1 2 1 3 4
table(complete_clust)
## complete_clust
## 1 2 3 4
## 2 1 1 1
fviz_dend(us_hc_complete, k = 4, k_colors = "jco", rect = T,
main = "Complete Linkage Cluster")
#menghitung cophenetic distance
complete_coph <- cophenetic(us_hc_complete)
corcomp<-cor(complete_coph, us_dist)
###
KorCop<-data.frame(cors,coravg,corcen,corward,corcomp)
KorCop
## cors coravg corcen corward corcomp
## 1 0.9903457 0.9922194 0.9720919 0.9801705 0.9898231
#pemilihan methode clustering "average linkage" karena memiliki korelasi lebih tinggi
#cluster validasi
library(clValid)
# internal measures
internal <- clValid(scaledata, nClust = 2:4,
clMethods = "hierarchical",
validation = "internal",
metric = "euclidean",
method = "complete")
## Warning in clValid(scaledata, nClust = 2:4, clMethods = "hierarchical", :
## rownames for data not specified, using 1:nrow(data)
library(readr) #Membaca data
library(dplyr) #Data processing
library(DT)
dataaku <- datamultivariat
datatable(dataaku, caption = "Indikator")
data_standardized <- round(scale(dataaku[,2:5]),4) #Hanya memilih kolom/variabel yang berisikan indikator kelahiran dan kematian
datatable(data_standardized, caption = "Data Hasil Standardisasi")
Dalam metode k-means banyaknya klaster ditentukan sendiri oleh pengguna. Maka dari itu perlu dicari jumlah klaster yang optimum yang dapat mengelompokkan objek dengan baik (Perlu diketahui bahwa metode ini relatif subjektif). Salah satu metode yang digunakan adalah Elbow Plot. Elbow Plot merupakan plot antara banyak klaster dengan total within-cluster variation (total dari simpangan per kluster). Banyak klaster yang dipilih adalah bagian “siku” atau titik dimana terdapat penurunan yang tajam sebelum titik tersebut dan disusul penurunan yang tidak tajam setelah titik tersebut. Hal ini karena penambahan jumlah klaster tidak membawa pengaruh banyak atas variasi yang ada di dalam klaster tersebut.
jumlah_klaster <- 4 #Vektor yang berisikan jumlah klaster yang ingin dilihat nilai dari total within-cluster sum of squares
within_ss <- c() #Vektor kosong yang akan diisi nilai total within-cluster sum of squares
for (i in jumlah_klaster) {
within_ss <- c(within_ss, kmeans(x = data_standardized, centers = i, nstart = 25)$tot.withinss)
}
plot(x = jumlah_klaster, y = within_ss, type = "b", xlab = "Number of Cluster",
ylab = "Total Within Sum of Squares", main = "Elbow Plot")
abline(v = 4, col = 'red')
Dilihat dari Elbow Plot diatas, banyaknya klaster yang dipilih adalah 4
klaster. ## Analisis kluster Dalam fungsi kmeans() terdapat parameter
nstart yang digunakan untuk memberitahu fungsi berapa kali inisiasi awal
yang pengguna inginkan. Pada contoh ini, parameter nstartakan diisi
sebanyak 25 sehingga akan ada inisiasi centroid awal yang berbeda
sebanyak 25 kali. Setelah itu algoritma K-Means akan memilih hasil
klaster terbaik dengan nilai total within-cluster variation yang
terkecil dari ke-25 inisiasi centroid awal. Total within-cluster
variation yang kecil menunjukkan bahwa secara umum klaster yang
terbentuk diisikan oleh observasi yang homogen atau serupa.
#analisis cluster
set.seed(123)
kmeans_clustering <- kmeans(x = data_standardized, centers = 4, nstart = 25) #parameter nstart digunakan untuk memberitahu fungsi berapa kali inisiasi centroid awal (secara acak) yang akan dibentuk dan centers digunakan untuk memberitahu fungsi berapa jumlah klaster yang akan dibentuk.
kmeans_clustering
## K-means clustering with 4 clusters of sizes 1, 1, 2, 1
##
## Cluster means:
## X1 X2 X3 X4
## 1 -1.3338 -1 -1.0954 -0.4472
## 2 -0.8208 -1 -1.0954 -0.4472
## 3 0.7182 1 0.7303 -0.4472
## 4 0.7182 0 0.7303 1.7889
##
## Clustering vector:
## [1] 3 4 3 1 2
##
## Within cluster sum of squares by cluster:
## [1] 0 0 0 0
## (between_SS / total_SS = 100.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Output dari fungsi kmeans() terdiri atas beberapa informasi sebagai berikut :
cluster : vector yang berisikan lokasi klaster tiap objek. centers : matriks yang berisikan centroid/rata-rata nilai tiap klaster. withinss: vektor yang berisikan simpangan tiap klaster yang terbentuk. tot.withinss : total dari simpangan tiap klaster yang terbentuk. Biasa digunakan untuk membuat Elbow Plot guna mengetahui banyak klaster yang sebaiknya dipilih. size : jumlah objek pada tiap klaster.
#mengembalikan nilai centroid
dataaku %>%
mutate(Klaster = kmeans_clustering$cluster) %>%
group_by(Klaster) %>%
summarise(Mean_X1 = mean(X1), Mean_X2 = mean(X2), Mean_X3 = mean(X3), Mean_X4 = mean(X4))
## # A tibble: 4 × 5
## Klaster Mean_X1 Mean_X2 Mean_X3 Mean_X4
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 1 1 2
## 2 2 2 1 1 2
## 3 3 5 5 3 2
## 4 4 5 3 3 3
dataaku %>%
mutate (Klaster = kmeans_clustering$cluster) %>%
select(Objek, Klaster) %>%
arrange(Klaster)
## Objek Klaster
## 1 Objek 4 1
## 2 Objek 5 2
## 3 Objek 1 3
## 4 Objek 3 3
## 5 objek 2 4
Dilihat dari rata-rata angka pada masing-masing klaster, maka dapat diinterpretasikan bahwa:
Klaster 1 berisikan objek 4 Klaster 2 berisikan objek 5 Klaster 3 berisikan objek 1 dan 3. Klaster 4 berisikan objek 2