DMBD P12
K-Means
Persiapan
library(stats) # Untuk kmeans()
library(cluster) # Untuk silhouette() jika diperlukan manual
library(dplyr) # For %>%, mutate(), group_by(), summarise() used in the K-Means analysis
library(factoextra) # Untuk fviz_nbclust(), fviz_cluster()## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# 1. Memuat dan Mempersiapkan Data
# Menggunakan dataset bawaan R 'USArrests'
data("USArrests")
df <- USArrests
# Melihat beberapa baris pertama dan ringkasan data
cat("Data Asli (USArrests):\n")## Data Asli (USArrests):
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
## Murder Assault UrbanPop Rape
## Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
## 1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
## Median : 7.250 Median :159.0 Median :66.00 Median :20.10
## Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
## 3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
## Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
# (Opsional) Uji Multikolinearitas jika ada banyak variabel numerik
# library(car)
# model_vif <- lm(Murder ~ Assault + UrbanPop + Rape, data=df)
# vif_values <- vif(model_vif)
# print("Nilai VIF:")
# print(vif_values)
# Jika VIF > 10, pertimbangkan untuk menangani multikolinearitas
# 2. Data preparation
# Feature scaling (standardization): scale() centers each column and divides
# it by its standard deviation.
# K-Means is sensitive to the scale of the variables, so clustering is done
# on the standardized matrix rather than on df directly.
df_scaled <- scale(df)
cat("\nData Setelah Penskalaan:\n")##
## Data Setelah Penskalaan:
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
##
## Menentukan Jumlah Klaster Optimal...
Metode Elbow (WCSS - Within-Cluster Sum of Squares)
# Elbow method (WCSS - within-cluster sum of squares)
# fviz_nbclust() computes the WCSS for a range of k values and plots it.
p_elbow <- fviz_nbclust(df_scaled, kmeans, method = "wss") +
  # Dashed guide line at the chosen k; adjust xintercept to match the plot
  geom_vline(xintercept = 4, linetype = 2, color = "steelblue") +
  labs(subtitle = "Elbow Method (K-Means)")
# print() and cat() must be separate statements; fused on one line they do
# not parse.
print(p_elbow)
cat("Interpretasi Metode Elbow: Cari 'siku' pada plot. Titik di mana
penambahan klaster baru tidak lagi signifikan mengurangi WCSS.\n")
## Interpretasi Metode Elbow: Cari 'siku' pada plot. Titik di mana
## penambahan klaster baru tidak lagi signifikan mengurangi WCSS.
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
## Connecticut 3.3 110 77 11.1
## Delaware 5.9 238 72 15.8
## Florida 15.4 335 80 31.9
## Georgia 17.4 211 60 25.8
## Hawaii 5.3 46 83 20.2
## Idaho 2.6 120 54 14.2
## Illinois 10.4 249 83 24.0
## Indiana 7.2 113 65 21.0
## Iowa 2.2 56 57 11.3
## Kansas 6.0 115 66 18.0
## Kentucky 9.7 109 52 16.3
## Louisiana 15.4 249 66 22.2
## Maine 2.1 83 51 7.8
## Maryland 11.3 300 67 27.8
## Massachusetts 4.4 149 85 16.3
## Michigan 12.1 255 74 35.1
## Minnesota 2.7 72 66 14.9
## Mississippi 16.1 259 44 17.1
## Missouri 9.0 178 70 28.2
## Montana 6.0 109 53 16.4
## Nebraska 4.3 102 62 16.5
## Nevada 12.2 252 81 46.0
## New Hampshire 2.1 57 56 9.5
## New Jersey 7.4 159 89 18.8
## New Mexico 11.4 285 70 32.1
## New York 11.1 254 86 26.1
## North Carolina 13.0 337 45 16.1
## North Dakota 0.8 45 44 7.3
## Ohio 7.3 120 75 21.4
## Oklahoma 6.6 151 68 20.0
## Oregon 4.9 159 67 29.3
## Pennsylvania 6.3 106 72 14.9
## Rhode Island 3.4 174 87 8.3
## South Carolina 14.4 279 48 22.5
## South Dakota 3.8 86 45 12.8
## Tennessee 13.2 188 59 26.9
## Texas 12.7 201 80 25.5
## Utah 3.2 120 80 22.9
## Vermont 2.2 48 32 11.2
## Virginia 8.5 156 63 20.7
## Washington 4.0 145 73 26.2
## West Virginia 5.7 81 39 9.3
## Wisconsin 2.6 53 66 10.8
## Wyoming 6.8 161 60 15.6
# Silhouette method
# fviz_nbclust() can also compute the average silhouette width for each k.
p_silhouette <- fviz_nbclust(df_scaled, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette Method (K-Means)")
# print() and cat() must be separate statements; fused on one line they do
# not parse.
print(p_silhouette)
cat("Interpretasi Metode Silhouette: Pilih jumlah klaster (k) yang
memberikan rata-rata skor Silhouette tertinggi.\n")
## Interpretasi Metode Silhouette: Pilih jumlah klaster (k) yang
## memberikan rata-rata skor Silhouette tertinggi.
# For USArrests, k = 2 often gives the highest silhouette width, but k = 4
# can also be a good choice depending on interpretation.
# Metode Gap Statistic (lebih intensif komputasi)
# p_gap <- fviz_nbclust(df_scaled, kmeans, method = "gap_stat", nboot = 50) +
# labs(subtitle = "Gap Statistic Method (K-Means)")
# print(p_gap)
# Berdasarkan metode di atas, mari kita pilih k.
# Misalnya, kita pilih k = 4 berdasarkan metode Elbow (atau k=2 dari Silhouette)
# Untuk demonstrasi ini, kita akan gunakan k=4
k_optimal <- 4
cat(paste("\nJumlah klaster optimal yang dipilih:", k_optimal, "\n"))##
## Jumlah klaster optimal yang dipilih: 4
##
## Menjalankan K-Means dengan k = 4 ...
set.seed(123) # Fix the RNG so the random starting centroids are reproducible
# Fit K-Means on the standardized data with the chosen number of clusters.
kmeans_result <- kmeans(df_scaled, centers = k_optimal, nstart = 25)
# nstart = 25 runs the algorithm 25 times from different random initial
# centroids and keeps the best run (the one with the lowest WCSS).
# Melihat hasil clustering
print(kmeans_result)## K-means clustering with 4 clusters of sizes 8, 13, 16, 13
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 1.4118898 0.8743346 -0.8145211 0.01927104
## 2 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 3 -0.4894375 -0.3826001 0.5758298 -0.26165379
## 4 0.6950701 1.0394414 0.7226370 1.27693964
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 4 4 1 4
## Colorado Connecticut Delaware Florida Georgia
## 4 3 3 4 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 2 4 3 2
## Kansas Kentucky Louisiana Maine Maryland
## 3 2 1 2 4
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 4 2 1 4
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 4 2 3
## New Mexico New York North Carolina North Dakota Ohio
## 4 4 1 2 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 4 3 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 2 2 3
##
## Within cluster sum of squares by cluster:
## [1] 8.316061 11.952463 16.212213 19.922437
## (between_SS / total_SS = 71.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Visualisasi Hasil Klaster
##
## Memvisualisasikan Hasil Klaster...
# fviz_cluster() uses Principal Component Analysis (PCA) internally to draw
# the multi-dimensional data in 2D.
# Each argument sits on its own line so that a trailing comment cannot
# swallow the argument that follows it (this previously disabled geom).
p_cluster <- fviz_cluster(
  kmeans_result,
  data = df_scaled,
  ellipse.type = "confidence", # One of "confidence", "convex", "euclid", "norm", "t"
  geom = "point", # Draw observations as points only (no text labels)
  palette = "jco", # Colour palette
  ggtheme = theme_bw()
) +
  labs(title = paste("K-Means Clustering Results (k =", k_optimal,
                     ")"))
print(p_cluster)
# (Optional) Visualization with state names as labels
# rownames(df_scaled) <- rownames(df) # Pastikan nama baris ada
# p_cluster_labels <- fviz_cluster(kmeans_result, data = df_scaled,
# ellipse.type = "confidence",
# geom = c("point", "text"), #Tambahkan teks
# repel = TRUE, # Hindari tumpangtindih label
# labelsize = 8,
# palette = "jco",
# ggtheme = theme_bw()) +
# labs(title = paste("K-Means Clustering Results with Labels (k =",k_optimal, ")"))
# print(p_cluster_labels)
# 6. Menganalisis Hasil Lebih Lanjut
cat("\nAnalisis Hasil Klaster:\n")##
## Analisis Hasil Klaster:
## Ukuran Klaster:
## [1] 8 13 16 13
# Cluster centres (centroids) expressed in the original measurement units.
# The model was fitted on standardized data, so each centroid coordinate is
# mapped back with x * sd + mean, using the per-column mean and standard
# deviation that scale() stored as attributes on df_scaled.
original_means <- attr(df_scaled, "scaled:center")
original_sds <- attr(df_scaled, "scaled:scale")
# Column-wise: multiply by the column sd first, then add the column mean.
centroids_original_scale <- sweep(
  sweep(kmeans_result$centers, 2, original_sds, `*`),
  2, original_means, `+`
)
cat("\nCentroid Klaster (Skala Asli):\n")##
## Centroid Klaster (Skala Asli):
## Murder Assault UrbanPop Rape
## 1 13.93750 243.62500 53.75000 21.41250
## 2 3.60000 78.53846 52.07692 12.17692
## 3 5.65625 138.87500 73.87500 18.78125
## 4 10.81538 257.38462 76.00000 33.19231
# Attach each observation's cluster assignment to the unscaled data
df_clustered <- mutate(df, Cluster = kmeans_result$cluster)
cat("\nData Asli dengan Label Klaster:\n")##
## Data Asli dengan Label Klaster:
## Murder Assault UrbanPop Rape Cluster
## Alabama 13.2 236 58 21.2 1
## Alaska 10.0 263 48 44.5 4
## Arizona 8.1 294 80 31.0 4
## Arkansas 8.8 190 50 19.5 1
## California 9.0 276 91 40.6 4
## Colorado 7.9 204 78 38.7 4
##
## Ringkasan Statistik per Klaster (Skala Asli):
df_clustered %>%
group_by(Cluster) %>%
summarise(across(everything(), list(mean = mean, median = median, sd
= sd))) %>%
print(n = Inf)## # A tibble: 4 × 13
## Cluster Murder_mean Murder_median Murder_sd Assault_mean Assault_median
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 13.9 13.8 2.60 244. 242.
## 2 2 3.6 2.6 2.36 78.5 81
## 3 3 5.66 5.95 1.65 139. 147
## 4 4 10.8 11.1 2.08 257. 255
## # ℹ 7 more variables: Assault_sd <dbl>, UrbanPop_mean <dbl>,
## # UrbanPop_median <dbl>, UrbanPop_sd <dbl>, Rape_mean <dbl>,
## # Rape_median <dbl>, Rape_sd <dbl>
# (Opsional) Menyimpan hasil plot
# ggsave("kmeans_elbow_plot.png", plot = p_elbow)
# ggsave("kmeans_silhouette_plot.png", plot = p_silhouette)
# ggsave("kmeans_cluster_plot.png", plot = p_cluster)
cat("\nProses K-Means clustering selesai.\n")##
## Proses K-Means clustering selesai.
K-Medoids (PAM)
Persiapan
library(cluster) # Untuk pam(), daisy()
library(factoextra) # Untuk fviz_nbclust(), fviz_cluster()
library(dplyr) # Untuk manipulasi data
library(ggplot2) # Untuk plot kustom jika diperlukan
# --- Contoh 1: K-Medoids (PAM) pada Data Numerik (USArrests) ---
cat("--- Contoh 1: K-Medoids (PAM) pada Data Numerik (USArrests)
---\n")## --- Contoh 1: K-Medoids (PAM) pada Data Numerik (USArrests)
## ---
# 1. Memuat dan Mempersiapkan Data
data("USArrests")
df_pam_numeric <- USArrests
cat("Data Asli (USArrests):\n")## Data Asli (USArrests):
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
# 2. Persiapan Data
# Penskalaan Fitur (Standardisasi)
df_pam_numeric_scaled <- scale(df_pam_numeric)
cat("\nData Numerik Setelah Penskalaan:\n")##
## Data Numerik Setelah Penskalaan:
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
# 3. Menentukan Jumlah Klaster (k) Optimal
cat("\nMenentukan Jumlah Klaster Optimal untuk Data Numerik...\n")##
## Menentukan Jumlah Klaster Optimal untuk Data Numerik...
# Metode Silhouette (lebih cocok untuk PAM)
# Argumen FUNcluster untuk fviz_nbclust bisa pam
# Untuk data numerik, kita bisa langsung berikan data yang sudah diskalakan
p_silhouette_pam_numeric <- fviz_nbclust(df_pam_numeric_scaled, pam,
method = "silhouette") +
labs(subtitle = "Silhouette Method for PAM (Numeric Data)")
print(p_silhouette_pam_numeric)# Misalkan k=2 atau k=4 adalah pilihan yang baik berdasarkan plot Silhouette. Kita pilih k=2.
k_optimal_pam_numeric <- 2
cat(paste("\nJumlah klaster optimal yang dipilih untuk data numerik:",
k_optimal_pam_numeric, "\n"))##
## Jumlah klaster optimal yang dipilih untuk data numerik: 2
# 4. Menjalankan Algoritma PAM
cat("\nMenjalankan PAM pada Data Numerik dengan k =",
k_optimal_pam_numeric, "...\n")##
## Menjalankan PAM pada Data Numerik dengan k = 2 ...
# Fixed seed, kept so reruns of this section start from the same RNG state
set.seed(123)
# Partition the standardized data around k medoids using Manhattan distance.
# NOTE(review): the fviz_nbclust(..., pam, method = "silhouette") search that
# selected k does not pass metric = "manhattan", so k was presumably chosen
# under pam()'s default metric - confirm this mismatch is intended.
pam_results_numeric <- pam(df_pam_numeric_scaled, k =
k_optimal_pam_numeric, metric = "manhattan")
# 'metric' may be "euclidean" or "manhattan"; Manhattan is often more robust.
# When passing a distance matrix instead: pam(dist_matrix, k, diss = TRUE)
print(pam_results_numeric)## Medoids:
## ID Murder Assault UrbanPop Rape
## New Mexico 31 0.8292944 1.3708088 0.3081225 1.1603196
## Nebraska 27 -0.8008247 -0.8250772 -0.2445636 -0.5052109
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 1 1 2 1
## Colorado Connecticut Delaware Florida Georgia
## 1 2 2 1 1
## Hawaii Idaho Illinois Indiana Iowa
## 2 2 1 2 2
## Kansas Kentucky Louisiana Maine Maryland
## 2 2 1 2 1
## Massachusetts Michigan Minnesota Mississippi Missouri
## 2 1 2 1 1
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 1 2 2
## New Mexico New York North Carolina North Dakota Ohio
## 1 1 1 2 2
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 2 2 2 2 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 1 2 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 2 2 2 2 2
## Objective function:
## build swap
## 2.563358 2.360113
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
##
## Memvisualisasikan Hasil Klaster PAM untuk Data Numerik...
p_cluster_pam_numeric <- fviz_cluster(pam_results_numeric, data =
df_pam_numeric_scaled,
ellipse.type = "confidence",
geom = "point",
palette = "jco",
ggtheme = theme_bw()) +
labs(title = paste("PAM Clustering Results (Numeric Data, k =",
k_optimal_pam_numeric, ")"))
print(p_cluster_pam_numeric)# --- Contoh 2: K-Medoids (PAM) pada Data Campuran menggunakan Gower Distance ---
cat("\n\n--- Contoh 2: K-Medoids (PAM) pada Data Campuran (Gower Distance) ---\n")##
##
## --- Contoh 2: K-Medoids (PAM) pada Data Campuran (Gower Distance) ---
1. Membuat atau Memuat Data Campuran
# 1. Create (or load) a mixed-type data set
# Synthetic sample of 100 people with two numeric columns (Age, Income), one
# ordered factor (Education) and two unordered factors (Gender, OwnsHouse).
set.seed(456)
df_mixed <- local({
  n_obs <- 100
  education_levels <- c("HighSchool", "Bachelor", "Master", "PhD")

  # The random draws happen in this exact order so the RNG stream - and
  # therefore the generated data - is reproducible for the fixed seed.
  age <- sample(20:60, size = n_obs, replace = TRUE)
  income <- rnorm(n_obs, mean = 50000, sd = 15000)
  education <- factor(
    sample(education_levels, size = n_obs, replace = TRUE),
    levels = education_levels,
    ordered = TRUE
  )
  gender <- factor(sample(c("Male", "Female"), size = n_obs, replace = TRUE))
  owns_house <- factor(sample(c("Yes", "No"), size = n_obs, replace = TRUE))

  # All categorical columns are already factors at this point (Education
  # stays ordered), so no further as.factor() conversion is required.
  data.frame(
    Age = age,
    Income = income,
    Education = education,
    Gender = gender,
    OwnsHouse = owns_house
  )
})
cat("Data Campuran:\n")## Data Campuran:
## Age Income Education Gender OwnsHouse
## 1 56 76442.77 Bachelor Male No
## 2 54 39176.62 PhD Male Yes
## 3 57 47417.08 HighSchool Female Yes
## 4 40 52938.07 Master Male No
## 5 46 85278.59 Master Male No
## 6 44 68017.19 HighSchool Female Yes
## 'data.frame': 100 obs. of 5 variables:
## $ Age : int 56 54 57 40 46 44 33 50 28 34 ...
## $ Income : num 76443 39177 47417 52938 85279 ...
## $ Education: Ord.factor w/ 4 levels "HighSchool"<"Bachelor"<..: 2 4 1 3 3 1 4 3 2 3 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 1 2 2 1 2 1 1 2 ...
## $ OwnsHouse: Factor w/ 2 levels "No","Yes": 1 2 2 1 1 2 2 2 1 2 ...
2. Persiapan Data: Menghitung Matriks Jarak Gower
# 2. Persiapan Data: Menghitung Matriks Jarak Gower
# Penskalaan tidak diperlukan secara eksplisit sebelum Gower, karena Gower menangani normalisasi internal
# untuk variabel numerik berdasarkan rentang.
cat("\nMenghitung Matriks Jarak Gower...\n")##
## Menghitung Matriks Jarak Gower...
gower_dist_matrix <- daisy(df_mixed, metric = "gower")
# daisy() akan otomatis mendeteksi tipe variabel (numerik, nominal, ordinal, biner simetris/asimetris)
# Anda bisa menentukan tipe secara manual dengan argumen 'type' jika diperlukan.
# Contoh: type = list(ordratio = "Education_column_index_or_name")
# Melihat sebagian kecil dari matriks jarak
# print(as.matrix(gower_dist_matrix)[1:5, 1:5])
3. Menentukan Jumlah Klaster (k) Optimal untuk Data Campuran
# 3. Menentukan Jumlah Klaster (k) Optimal untuk Data Campuran
cat("\nMenentukan Jumlah Klaster Optimal untuk Data Campuran (menggunakan Gower dist)...\n")##
## Menentukan Jumlah Klaster Optimal untuk Data Campuran (menggunakan Gower dist)...
# Candidate numbers of clusters to evaluate
k_values_pam_mixed <- 2:8
# Average silhouette width for every candidate k.
# vapply() guarantees a numeric vector with one value per k, whatever the
# individual fits return.
avg_sil_values_pam_mixed <- vapply(k_values_pam_mixed, function(k) {
  # diss = TRUE because the input is a dissimilarity object, not raw data
  pam_fit <- pam(gower_dist_matrix, k = k, diss = TRUE)
  si <- silhouette(pam_fit$clustering, gower_dist_matrix)
  # silhouette() yields a bare NA instead of a "silhouette" matrix when the
  # widths are undefined (e.g. a single cluster); checking the class covers
  # that case, which an is.null()/nrow() test cannot detect.
  if (!inherits(si, "silhouette") || nrow(si) == 0) {
    return(NA_real_)
  }
  mean(si[, "sil_width"]) # Mean silhouette width over all observations
}, numeric(1))
# Drop candidates whose silhouette width could not be computed
valid_indices <- !is.na(avg_sil_values_pam_mixed)
k_values_pam_mixed_valid <- k_values_pam_mixed[valid_indices]
avg_sil_values_pam_mixed_valid <- avg_sil_values_pam_mixed[valid_indices]
# Plot the results and pick the k with the largest average silhouette width
if (length(avg_sil_values_pam_mixed_valid) > 0) {
  plot(k_values_pam_mixed_valid, avg_sil_values_pam_mixed_valid,
       type = "b", pch = 19, frame.plot = FALSE,
       xlab = "Number of Clusters (k)", ylab = "Average Silhouette Width (Gower)",
       main = "Silhouette Method for PAM (Mixed Data)")
  k_optimal_pam_mixed <- k_values_pam_mixed_valid[which.max(avg_sil_values_pam_mixed_valid)]
  # Mark the selected k on the plot
  abline(v = k_optimal_pam_mixed, col = "red", lty = 2)
} else {
  # No candidate produced a usable silhouette width: fall back to a default
  cat("Tidak dapat menentukan k optimal dengan metode Silhouette. Menggunakan nilai default k = 3.\n")
  k_optimal_pam_mixed <- 3
}
##
## Jumlah klaster optimal yang dipilih untuk data campuran: 4
4. Menjalankan Algoritma PAM pada Data Campuran
# 4. Menjalankan Algoritma PAM pada Data Campuran
cat("\nMenjalankan PAM pada Data Campuran dengan k =", k_optimal_pam_mixed, "...\n")##
## Menjalankan PAM pada Data Campuran dengan k = 4 ...
set.seed(123)
pam_results_mixed <- pam(gower_dist_matrix, k = k_optimal_pam_mixed, diss = TRUE)
print(pam_results_mixed)## Medoids:
## ID
## [1,] 77 77
## [2,] 76 76
## [3,] 80 80
## [4,] 15 15
## Clustering vector:
## [1] 1 2 3 1 1 3 2 3 4 2 2 2 1 2 4 3 2 3 4 3 4 4 1 3 2 2 1 2 2 2 4 2 4 2 3 4 3
## [38] 2 2 3 4 2 2 3 3 3 1 4 4 3 2 2 3 3 3 1 1 3 3 2 2 4 4 1 2 2 1 4 3 3 4 3 3 3
## [75] 4 2 1 4 1 3 2 2 4 2 3 2 1 3 3 1 4 2 1 4 1 1 1 1 4 2
## Objective function:
## build swap
## 0.1460674 0.1454166
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call"
##
## Memvisualisasikan Hasil Klaster PAM untuk Data Campuran...
# Menambahkan data asli ke objek hasil PAM agar bisa divisualisasikan dengan fviz_cluster
pam_results_mixed_for_viz <- pam_results_mixed
pam_results_mixed_for_viz$data <- df_mixed
# Alternatif: MDS (jika visualisasi PCA dirasa tidak sesuai)
# mds_coords <- cmdscale(gower_dist_matrix, k = 2)
# df_plot_mixed <- as.data.frame(mds_coords)
# colnames(df_plot_mixed) <- c("Dim1", "Dim2")
# df_plot_mixed$Cluster <- as.factor(pam_results_mixed$clustering)
#
# p_mds_pam_mixed <- ggplot(df_plot_mixed, aes(x = Dim1, y = Dim2, color = Cluster, shape = Cluster)) +
# geom_point(size = 3, alpha = 0.7) +
# labs(title = paste("MDS Plot of PAM Clustering (Gower, k =", k_optimal_pam_mixed, ")")) +
# theme_bw()
# print(p_mds_pam_mixed)
5. Hasil
##
## Analisis Hasil Klaster (Data Campuran):
## Indeks Medoid:
## [1] 77 76 80 15
##
## Data Medoid Aktual:
## Age Income Education Gender OwnsHouse
## 77 45 51443.02 Master Male No
## 76 41 49983.41 Bachelor Male Yes
## 80 45 40741.44 Master Female Yes
## 15 33 46104.56 Master Female No
# Tambahkan hasil klaster ke data asli
df_mixed_clustered <- df_mixed %>%
mutate(Cluster = as.factor(pam_results_mixed$clustering))
cat("\nData Campuran dengan Label Klaster:\n")##
## Data Campuran dengan Label Klaster:
## Age Income Education Gender OwnsHouse Cluster
## 1 56 76442.77 Bachelor Male No 1
## 2 54 39176.62 PhD Male Yes 2
## 3 57 47417.08 HighSchool Female Yes 3
## 4 40 52938.07 Master Male No 1
## 5 46 85278.59 Master Male No 1
## 6 44 68017.19 HighSchool Female Yes 3
##
## Ringkasan per Klaster (Variabel Numerik):
df_mixed_clustered %>%
group_by(Cluster) %>%
summarise(across(where(is.numeric), list(mean = mean, median = median), .names = "{.col}_{.fn}")) %>%
print(n = Inf)## # A tibble: 4 × 5
## Cluster Age_mean Age_median Income_mean Income_median
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 1 41.8 44 53498. 51731.
## 2 2 39.5 38 47927. 50221.
## 3 3 45.7 47.5 49169. 46508.
## 4 4 38.6 36 49747. 52585.
##
## Ringkasan per Klaster (Variabel Kategorik):
df_mixed_clustered %>%
group_by(Cluster) %>%
summarise(across(where(is.factor),
~names(which.max(table(.))),
.names = "{.col}_mode")) %>%
print(n = Inf)## # A tibble: 4 × 4
## Cluster Education_mode Gender_mode OwnsHouse_mode
## <fct> <chr> <chr> <chr>
## 1 1 Bachelor Male No
## 2 2 Bachelor Male Yes
## 3 3 Master Female Yes
## 4 4 Master Female No
##
## Proses K-Medoids (PAM) clustering selesai.