#install necessary package
#install.packages("flexclust") #kmedian
#install.packages("dbscan") #dbscan
#install.packages("meanShiftR") #meanshift
#install.packages("e1071") # fuzzy c-means
#install.packages("cluster") #silhouette score
#install.packages("fpc") #another metrics
#install.packages("mclust") # ARI metrics to compare cluster with true label
#install.packages("tidyverse")
library(tidyverse)
library(flexclust)
library(dbscan)
library(meanShiftR)
library(e1071)
library(cluster)
library(fpc)
library(mclust)
# Load the raw dataset and keep only its numeric columns for clustering.
# NOTE(review): the path is an absolute, machine-specific location; the script
# only runs where this exact file exists.
data <- read.csv("D:/SEMESTER 4/Materi Sofia/AnMul/Modul 3 Clustering/Dataset - Updated.csv")
# vapply() always yields a logical mask (sapply() can return a list on unusual
# input), and drop = FALSE keeps a data frame even when one column matches.
data_num <- data[, vapply(data, is.numeric, logical(1)), drop = FALSE]
# Show the structure of the retained columns.
str(data_num)
## 'data.frame': 1205 obs. of 11 variables:
## $ Age : int 22 22 27 20 20 22 20 23 22 26 ...
## $ Systolic.BP : int 90 110 110 100 90 120 110 110 90 110 ...
## $ Diastolic : int 60 70 70 70 60 70 70 80 60 70 ...
## $ BS : num 9 7.1 7.5 7.2 7.5 7.01 9 7 6.4 12 ...
## $ Body.Temp : int 100 98 98 98 98 98 102 98 98 100 ...
## $ BMI : num 18 20.4 23 21.2 19.7 24 17.6 21.3 22 30.2 ...
## $ Previous.Complications: int 1 0 1 0 0 0 0 0 0 1 ...
## $ Preexisting.Diabetes : int 1 0 0 0 0 0 1 0 0 1 ...
## $ Gestational.Diabetes : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Mental.Health : int 1 0 0 0 0 0 0 0 0 1 ...
## $ Heart.Rate : int 80 74 72 74 74 76 78 74 72 80 ...
#install.packages('psych')
# Per-column summary of the numeric variables (quartiles, mean, NA counts).
summary(data_num)
## Age Systolic.BP Diastolic BS
## Min. :10.00 Min. : 70.0 Min. : 40.00 Min. : 3.000
## 1st Qu.:21.00 1st Qu.:100.0 1st Qu.: 65.00 1st Qu.: 6.000
## Median :25.00 Median :120.0 Median : 80.00 Median : 6.900
## Mean :27.48 Mean :116.8 Mean : 77.17 Mean : 7.501
## 3rd Qu.:31.00 3rd Qu.:130.0 3rd Qu.: 90.00 3rd Qu.: 7.900
## Max. :65.00 Max. :200.0 Max. :140.00 Max. :19.000
## NA's :5 NA's :4 NA's :2
## Body.Temp BMI Previous.Complications Preexisting.Diabetes
## Min. : 97.0 Min. : 0.00 Min. :0.0000 Min. :0.0000
## 1st Qu.: 98.0 1st Qu.:20.45 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 98.0 Median :23.00 Median :0.0000 Median :0.0000
## Mean : 98.4 Mean :23.32 Mean :0.1754 Mean :0.2884
## 3rd Qu.: 98.0 3rd Qu.:25.00 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :103.0 Max. :37.00 Max. :1.0000 Max. :1.0000
## NA's :18 NA's :2 NA's :2
## Gestational.Diabetes Mental.Health Heart.Rate
## Min. :0.0000 Min. :0.0000 Min. :58.00
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:70.00
## Median :0.0000 Median :0.0000 Median :76.00
## Mean :0.1178 Mean :0.3344 Mean :75.82
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:80.00
## Max. :1.0000 Max. :1.0000 Max. :92.00
## NA's :2
# psych is loaded for describe(), used on the next lines.
library(psych)
# More complete descriptive statistics
describe(data_num)
## vars n mean sd median trimmed mad min max
## Age 1 1205 27.48 9.20 25.0 26.27 7.41 10 65
## Systolic.BP 2 1200 116.82 18.72 120.0 116.60 14.83 70 200
## Diastolic 3 1201 77.17 14.31 80.0 76.42 14.83 40 140
## BS 4 1203 7.50 3.05 6.9 7.04 1.48 3 19
## Body.Temp 5 1205 98.40 1.09 98.0 98.08 0.00 97 103
## BMI 6 1187 23.32 3.88 23.0 23.05 2.97 0 37
## Previous.Complications 7 1203 0.18 0.38 0.0 0.09 0.00 0 1
## Preexisting.Diabetes 8 1203 0.29 0.45 0.0 0.24 0.00 0 1
## Gestational.Diabetes 9 1205 0.12 0.32 0.0 0.02 0.00 0 1
## Mental.Health 10 1205 0.33 0.47 0.0 0.29 0.00 0 1
## Heart.Rate 11 1203 75.82 7.23 76.0 75.66 5.93 58 92
## range skew kurtosis se
## Age 55 1.33 2.06 0.26
## Systolic.BP 130 0.26 0.35 0.54
## Diastolic 100 0.38 -0.30 0.41
## BS 16 1.58 2.61 0.09
## Body.Temp 6 2.47 4.88 0.03
## BMI 37 0.46 1.21 0.11
## Previous.Complications 1 1.70 0.91 0.01
## Preexisting.Diabetes 1 0.93 -1.13 0.01
## Gestational.Diabetes 1 2.37 3.61 0.01
## Mental.Health 1 0.70 -1.51 0.01
## Heart.Rate 34 0.21 -0.18 0.21
# Count the missing values in each numeric column.
colSums(is.na(data_num))
## Age Systolic.BP Diastolic
## 0 5 4
## BS Body.Temp BMI
## 2 0 18
## Previous.Complications Preexisting.Diabetes Gestational.Diabetes
## 2 2 0
## Mental.Health Heart.Rate
## 0 2
# Missing-value handling: the NA entries of each column are replaced by that
# column's median (computed with the NAs ignored).
# NOTE(review): the summaries show BMI with a minimum of 0, which looks like
# an unflagged missing value; it is left untouched here -- confirm.
data_num1 <- data.frame(lapply(data_num, function(column) {
  replace(column, is.na(column), median(column, na.rm = TRUE))
}))
# Verify that no missing values remain.
colSums(is.na(data_num1))
## Age Systolic.BP Diastolic
## 0 0 0
## BS Body.Temp BMI
## 0 0 0
## Previous.Complications Preexisting.Diabetes Gestational.Diabetes
## 0 0 0
## Mental.Health Heart.Rate
## 0 0
# Per-column summary after imputation.
summary(data_num1)
## Age Systolic.BP Diastolic BS
## Min. :10.00 Min. : 70.0 Min. : 40.00 Min. : 3.0
## 1st Qu.:21.00 1st Qu.:100.0 1st Qu.: 65.00 1st Qu.: 6.0
## Median :25.00 Median :120.0 Median : 80.00 Median : 6.9
## Mean :27.48 Mean :116.8 Mean : 77.18 Mean : 7.5
## 3rd Qu.:31.00 3rd Qu.:130.0 3rd Qu.: 90.00 3rd Qu.: 7.9
## Max. :65.00 Max. :200.0 Max. :140.00 Max. :19.0
## Body.Temp BMI Previous.Complications Preexisting.Diabetes
## Min. : 97.0 Min. : 0.00 Min. :0.0000 Min. :0.000
## 1st Qu.: 98.0 1st Qu.:21.00 1st Qu.:0.0000 1st Qu.:0.000
## Median : 98.0 Median :23.00 Median :0.0000 Median :0.000
## Mean : 98.4 Mean :23.31 Mean :0.1751 Mean :0.288
## 3rd Qu.: 98.0 3rd Qu.:25.00 3rd Qu.:0.0000 3rd Qu.:1.000
## Max. :103.0 Max. :37.00 Max. :1.0000 Max. :1.000
## Gestational.Diabetes Mental.Health Heart.Rate
## Min. :0.0000 Min. :0.0000 Min. :58.00
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:70.00
## Median :0.0000 Median :0.0000 Median :76.00
## Mean :0.1178 Mean :0.3344 Mean :75.82
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:80.00
## Max. :1.0000 Max. :1.0000 Max. :92.00
# Extended descriptive statistics after imputation.
describe(data_num1)
## vars n mean sd median trimmed mad min max
## Age 1 1205 27.48 9.20 25.0 26.27 7.41 10 65
## Systolic.BP 2 1205 116.83 18.68 120.0 116.61 14.83 70 200
## Diastolic 3 1205 77.18 14.28 80.0 76.43 14.83 40 140
## BS 4 1205 7.50 3.05 6.9 7.04 1.48 3 19
## Body.Temp 5 1205 98.40 1.09 98.0 98.08 0.00 97 103
## BMI 6 1205 23.31 3.85 23.0 23.05 2.97 0 37
## Previous.Complications 7 1205 0.18 0.38 0.0 0.09 0.00 0 1
## Preexisting.Diabetes 8 1205 0.29 0.45 0.0 0.24 0.00 0 1
## Gestational.Diabetes 9 1205 0.12 0.32 0.0 0.02 0.00 0 1
## Mental.Health 10 1205 0.33 0.47 0.0 0.29 0.00 0 1
## Heart.Rate 11 1205 75.82 7.22 76.0 75.66 5.93 58 92
## range skew kurtosis se
## Age 55 1.33 2.06 0.26
## Systolic.BP 130 0.26 0.36 0.54
## Diastolic 100 0.38 -0.29 0.41
## BS 16 1.58 2.62 0.09
## Body.Temp 6 2.47 4.88 0.03
## BMI 37 0.46 1.28 0.11
## Previous.Complications 1 1.71 0.92 0.01
## Preexisting.Diabetes 1 0.94 -1.13 0.01
## Gestational.Diabetes 1 2.37 3.61 0.01
## Mental.Health 1 0.70 -1.51 0.01
## Heart.Rate 34 0.21 -0.17 0.21
# Build a compact descriptive table from describe() and export it to Excel.
desc <- describe(data_num1)
#install.packages("openxlsx")
library(openxlsx)
# Mean, median and SD are rounded to three decimals; min and max are kept
# exactly as reported.
desc_table <- data.frame(
  Variable = rownames(desc),
  Mean = round(desc[["mean"]], digits = 3),
  Median = round(desc[["median"]], digits = 3),
  SD = round(desc[["sd"]], digits = 3),
  Min = desc[["min"]],
  Max = desc[["max"]]
)
write.xlsx(x = desc_table, file = "tabel_deskriptif.xlsx", rowNames = FALSE)
# Histogram of every numeric variable, one facet per variable.
library(ggplot2)
# stack() reshapes to long form: `values` holds the observations and `ind`
# the name of the column each one came from.
data_long <- stack(data_num1)
ggplot(data = data_long, mapping = aes(x = values)) +
  geom_histogram(fill = "skyblue", colour = "black", bins = 30) +
  facet_wrap(vars(ind), scales = "free") +
  theme_minimal()
# Boxplots of all numeric variables on their original scale.
library(ggplot2)
library(tidyr)
library(dplyr)
# Long form: one row per (variable, value) pair.
data_long <- pivot_longer(
  data_num1,
  cols = everything(),
  names_to = "Variable",
  values_to = "Value"
)
ggplot(data_long, aes(Variable, Value)) +
  geom_boxplot(alpha = 0.7, fill = "#69b3a2") +
  labs(
    title = "Boxplot Seluruh Variabel Numerik",
    x = "Variabel",
    y = "Nilai"
  ) +
  theme_minimal(base_size = 12) +
  # Rotate the variable names so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Standardise every column with scale() and redraw the boxplots on that scale.
df_scaled <- scale(data_num1)
data_scaled_long <- pivot_longer(
  as.data.frame(df_scaled),
  cols = everything(),
  names_to = "Variable",
  values_to = "Value"
)
ggplot(data_scaled_long, aes(Variable, Value)) +
  geom_boxplot(alpha = 0.7, fill = "#E69F00") +
  labs(
    title = "Boxplot Setelah Standarisasi",
    y = "Z-Score"
  ) +
  theme_minimal(base_size = 12) +
  # Rotate the variable names so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Choose K with the elbow method: total within-cluster sum of squares for
# k = 1..10, each fit using 20 random starts.
# k-means starts are random, so the seed is fixed to make the curve
# reproducible from run to run.
set.seed(123)
# vapply() enforces exactly one numeric value per k.
wss <- vapply(1:10, function(k) {
  kmeans(df_scaled, centers = k, nstart = 20)$tot.withinss
}, numeric(1))
# Single-panel layout for the plot below.
par(mfrow = c(1, 1))
plot(1:10, wss, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares",
     main = "Elbow Method")
Berdasarkan metode Elbow, diperoleh jumlah cluster optimal sebanyak 3,
yang ditunjukkan oleh adanya titik siku (elbow) pada k = 3, di mana
penurunan nilai within-cluster sum of squares (WSS) mulai melandai
setelah titik tersebut.
# Silhouette Analysis
# Average silhouette width of a k-means partition of `df_scaled`.
#
# k: number of clusters to fit (25 random starts are used).
# Returns a single number: the mean of the third column of the silhouette
# result, i.e. the per-observation silhouette widths.
avg_sil <- function(k) {
  fit <- kmeans(df_scaled, centers = k, nstart = 25)
  widths <- silhouette(fit$cluster, dist(df_scaled))[, 3]
  mean(widths)
}
# Average silhouette width for k = 2..10.
k_values <- 2:10
# The underlying k-means fits use random starts; fix the seed so the curve is
# reproducible.
set.seed(123)
# vapply() enforces exactly one numeric value per k.
avg_sil_values <- vapply(k_values, avg_sil, numeric(1))
# Single-panel layout for the plot below.
par(mfrow = c(1, 1))
plot(k_values, avg_sil_values, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Average Silhouette Width",
     main = "Silhouette Analysis")
Berdasarkan metode Silhouette, diperoleh jumlah cluster optimal sebanyak
3, yang ditunjukkan oleh nilai rata-rata silhouette tertinggi pada k =
3. Hasil ini juga konsisten dengan metode Elbow, sehingga dipilih 3
cluster sebagai jumlah cluster terbaik dalam analisis K-Means.
# Final K-means fit with K = 3.
# A fixed seed plus several random starts keeps the partition and its label
# numbering stable between runs; the K-selection steps already use multiple
# starts, whereas a single start can land in a poor local optimum.
set.seed(123)
km_res <- kmeans(df_scaled, centers = 3, nstart = 25)
# K-median
# NOTE(review): k = 4 is used here although K-means uses 3 clusters --
# confirm this is intentional.
kmed_res <- kcca(df_scaled, k = 4, family = kccaFamily("kmedians"))
library(dbscan)
library(cluster)
# Grid search over DBSCAN parameters, scored by average silhouette width.
# `by` is written out in full rather than relying on partial matching of `b`.
eps_values <- seq(0.1, 0.8, by = 0.05)
minpts_values <- c(3, 4, 5)
# The distance matrix does not depend on eps/MinPts, so it is computed once
# instead of once per parameter pair.
dbscan_dist <- dist(df_scaled)
# One single-row data frame per usable parameter pair; bound together once
# after the loop instead of growing a data frame with rbind() each iteration.
dbscan_rows <- list()
for (minpts in minpts_values) {
  for (eps in eps_values) {
    # Both dbscan and fpc export dbscan(); fpc is attached later, so the bare
    # name resolves to fpc's version (the one taking `MinPts`). It is called
    # explicitly so the choice does not depend on attach order.
    db <- fpc::dbscan(df_scaled, eps = eps, MinPts = minpts)
    # Labels as returned, where 0 marks noise. Noise points are NOT removed:
    # the whole label vector, zeros included, is passed to silhouette().
    cluster <- db$cluster
    # Require at least two distinct labels and at least one non-noise point.
    if (length(unique(cluster)) > 1 && sum(cluster != 0) > 0) {
      sil <- silhouette(cluster, dbscan_dist)
      # Stored as mean_sil so the avg_sil() helper is not overwritten.
      mean_sil <- mean(sil[, 3])
      dbscan_rows[[length(dbscan_rows) + 1L]] <- data.frame(
        eps = eps,
        MinPts = minpts,
        SI = mean_sil
      )
    }
  }
}
# Fall back to an empty, correctly-typed table when no pair qualified.
hasil_dbscan <- if (length(dbscan_rows) > 0) {
  do.call(rbind, dbscan_rows)
} else {
  data.frame(eps = numeric(0), MinPts = numeric(0), SI = numeric(0))
}
# Ten best parameter pairs; head() avoids NA padding rows when fewer than ten
# pairs qualified.
head(hasil_dbscan[order(-hasil_dbscan$SI), ], 10)
## eps MinPts SI
## 43 0.75 5 -0.01133496
## 44 0.80 5 -0.06396163
## 30 0.80 4 -0.08250852
## 42 0.70 5 -0.08814403
## 29 0.75 4 -0.12114277
## 41 0.65 5 -0.17107168
## 27 0.65 4 -0.18148380
## 28 0.70 4 -0.18539311
## 15 0.80 3 -0.20149989
## 14 0.75 3 -0.24260329
# Parameter pair with the highest average silhouette in the grid search.
best <- hasil_dbscan[which.max(hasil_dbscan$SI), ]
best
## eps MinPts SI
## 43 0.75 5 -0.01133496
# DBSCAN with the best parameters from the grid search (eps = 0.75,
# MinPts = 5). fpc::dbscan is named explicitly: it is the version that takes
# `MinPts`, and the dbscan package exports a function with the same name.
db_res <- fpc::dbscan(df_scaled, eps = 0.75, MinPts = 5)
# Cluster labels (0 = noise). They live in `$cluster`, the same field the
# grid search reads; `$assignment` does not exist on this object and yielded
# NULL, which is why the table used to print as "table of extent 0".
clustersdb <- db_res$cluster
table(clustersdb)
## < table of extent 0 >
library(meanShiftR)
library(cluster)
# Grid search over the mean-shift bandwidth, scored by average silhouette.
bandwidth_values <- seq(0.5, 3, by = 0.2)
# The distance matrix is the same for every bandwidth; compute it once.
ms_dist <- dist(df_scaled)
# One single-row data frame per usable bandwidth; bound together once after
# the loop instead of growing a data frame with rbind() each iteration.
ms_rows <- list()
for (bw in bandwidth_values) {
  # The same bandwidth is applied to every column.
  bw_vector <- rep(bw, ncol(df_scaled))
  ms <- meanShift(df_scaled, bandwidth = bw_vector)
  cluster <- ms$assignment
  n_cluster <- length(unique(cluster))
  # A silhouette needs more than one cluster.
  if (n_cluster > 1) {
    sil <- silhouette(cluster, ms_dist)
    # Stored as mean_sil so the avg_sil() helper is not overwritten.
    mean_sil <- mean(sil[, 3])
    ms_rows[[length(ms_rows) + 1L]] <- data.frame(
      bandwidth = bw,
      SI = mean_sil,
      jumlah_cluster = n_cluster
    )
  }
}
# Fall back to an empty, correctly-typed table when no bandwidth qualified.
hasil_ms <- if (length(ms_rows) > 0) {
  do.call(rbind, ms_rows)
} else {
  data.frame(
    bandwidth = numeric(0),
    SI = numeric(0),
    jumlah_cluster = integer(0)
  )
}
# All qualifying bandwidths, best silhouette first.
hasil_ms[order(-hasil_ms$SI), ]
## bandwidth SI jumlah_cluster
## 6 1.5 0.3041680576 2
## 5 1.3 0.1064641722 72
## 3 0.9 0.0857818583 260
## 4 1.1 0.0509506503 106
## 2 0.7 0.0160748049 422
## 1 0.5 -0.0005164585 781
# Mean Shift with bandwidth 1.5, the top-scoring value in the bandwidth search
bw <- 1.5
ms_res <- meanShift(df_scaled, bandwidth = rep(bw, ncol(df_scaled)))
# Resulting cluster labels and their sizes
clustersms <- ms_res$assignment
table(clustersms)
## clustersms
## 1 2
## 1195 10
# Fuzzy C-means with 2 clusters and fuzzifier m = 2.
# The fit starts from random centres, so the seed is fixed: without it the
# cluster numbering (1 vs 2) can swap between runs and the written
# interpretation of "cluster 1" / "cluster 2" would no longer match.
set.seed(123)
fcm_res <- cmeans(df_scaled, centers = 2, m = 2)
library(cluster)
library(fpc)
# Pairwise distances on the standardised data, used for silhouette scoring.
dist_mat <- dist(df_scaled)
# Evaluation helper: average silhouette width of a clustering.
#
# cluster_label: cluster labels, one per observation.
# dist_matrix:   distances the labels are scored against. Defaults to the
#                global `dist_mat`, looked up when the function is called, so
#                existing one-argument calls behave as before.
# Returns a named numeric vector c(Silhouette = <mean width>). When
# silhouette() cannot produce a width matrix (for example when only one
# cluster is present) the value is NA instead of an indexing error.
evaluate_cluster <- function(cluster_label, dist_matrix = dist_mat) {
  sil <- silhouette(cluster_label, dist_matrix)
  if (!is.matrix(sil)) {
    return(c(Silhouette = NA_real_))
  }
  # Column 3 holds the per-observation silhouette widths.
  c(Silhouette = mean(sil[, 3]))
}
# Silhouette score of every method on the standardised data
eval_kmeans <- evaluate_cluster(km_res$cluster)
eval_kmedians <- evaluate_cluster(clusters(kmed_res))
eval_dbscan <- evaluate_cluster(db_res$cluster)
eval_meanshift <- evaluate_cluster(ms_res$assignment)
eval_fuzzy <- evaluate_cluster(fcm_res$cluster)
# Stack the scores into one table, one row per method
results <- do.call(rbind, list(
  Kmeans = eval_kmeans,
  Kmedians = eval_kmedians,
  DBSCAN = eval_dbscan,
  MeanShift = eval_meanshift,
  Fuzzy = eval_fuzzy
))
round(results, digits = 3)
## Silhouette
## Kmeans 0.359
## Kmedians 0.136
## DBSCAN -0.011
## MeanShift 0.304
## Fuzzy 0.323
# Repeat every method on the first three principal components.
pca <- prcomp(df_scaled)
data_pca <- pca$x[, 1:3]
# K-means, K-medians and fuzzy c-means start from random centres; fix the
# seed so the fits and their label numbering are reproducible.
set.seed(123)
# Several random starts, consistent with the K-selection steps.
km_res1 <- kmeans(data_pca, centers = 3, nstart = 25)
# K-median
kmed_res1 <- kcca(data_pca, k = 4, family = kccaFamily("kmedians"))
# DBSCAN -- fpc::dbscan is named explicitly because it is the version that
# takes `MinPts`; the dbscan package exports a function with the same name.
# NOTE(review): eps/MinPts are the values tuned on the full standardised
# data, not re-tuned for the PCA scores -- confirm this is intended.
db_res1 <- fpc::dbscan(data_pca, eps = 0.75, MinPts = 5)
# Mean Shift -- no bandwidth is passed, so the function's default applies.
ms_res1 <- meanShift(data_pca)
# Fuzzy C-means with fuzzifier m = 2
fcm_res1 <- cmeans(data_pca, centers = 2, m = 2) # m = 2
library(cluster)
library(fpc)
# Distances on the PCA scores; replaces the earlier `dist_mat`.
dist_mat <- dist(data_pca)
# Evaluation helper: average silhouette width of a clustering.
#
# cluster_label: cluster labels, one per observation.
# dist_matrix:   distances the labels are scored against. Defaults to the
#                global `dist_mat`, looked up when the function is called, so
#                existing one-argument calls behave as before.
# Returns a named numeric vector c(Silhouette = <mean width>). When
# silhouette() cannot produce a width matrix (for example when only one
# cluster is present) the value is NA instead of an indexing error.
evaluate_cluster <- function(cluster_label, dist_matrix = dist_mat) {
  sil <- silhouette(cluster_label, dist_matrix)
  if (!is.matrix(sil)) {
    return(c(Silhouette = NA_real_))
  }
  # Column 3 holds the per-observation silhouette widths.
  c(Silhouette = mean(sil[, 3]))
}
# Silhouette score of every method on the PCA scores
eval_kmeans <- evaluate_cluster(km_res1$cluster)
eval_kmedians <- evaluate_cluster(clusters(kmed_res1))
eval_dbscan <- evaluate_cluster(db_res1$cluster)
eval_meanshift <- evaluate_cluster(ms_res1$assignment)
eval_fuzzy <- evaluate_cluster(fcm_res1$cluster)
# Stack the scores into one table, one row per method
results <- do.call(rbind, list(
  Kmeans = eval_kmeans,
  Kmedians = eval_kmedians,
  DBSCAN = eval_dbscan,
  MeanShift = eval_meanshift,
  Fuzzy = eval_fuzzy
))
round(results, digits = 3)
## Silhouette
## Kmeans 0.434
## Kmedians 0.297
## DBSCAN 0.242
## MeanShift 0.100
## Fuzzy 0.459
Nilai Silhouette menunjukkan seberapa baik suatu data berada dalam cluster-nya dibandingkan cluster lain. Semakin tinggi nilainya, maka kualitas clustering semakin baik. Berdasarkan hasil yang diperoleh, Fuzzy C-Means memiliki performa terbaik dengan nilai Silhouette 0,459 yang menunjukkan bahwa cluster yang terbentuk cukup baik.
# Attach the fuzzy c-means labels to the standardised data and profile each
# cluster by its column means.
# NOTE(review): the labels come from `fcm_res` (fit on the full standardised
# data), while the best silhouette in the comparison belongs to the fit on
# the PCA scores (`fcm_res1`) -- confirm which model should be profiled.
df_cluster <- as.data.frame(df_scaled)
df_cluster[["Cluster"]] <- as.factor(fcm_res$cluster)
mean_cluster <- summarise(
  group_by(df_cluster, Cluster),
  across(where(is.numeric), mean)
)
mean_cluster
## # A tibble: 2 × 12
## Cluster Age Systolic.BP Diastolic BS Body.Temp BMI
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 -0.233 -0.269 -0.291 -0.470 -0.0906 -0.429
## 2 2 0.433 0.501 0.541 0.874 0.169 0.799
## # ℹ 5 more variables: Previous.Complications <dbl>, Preexisting.Diabetes <dbl>,
## # Gestational.Diabetes <dbl>, Mental.Health <dbl>, Heart.Rate <dbl>
library(dplyr)
library(tidyr)
library(openxlsx)
# Mean and standard deviation of every variable within each cluster;
# columns are named "<variable>_<stat>".
summary_cluster <- summarise(
  group_by(df_cluster, Cluster),
  across(
    where(is.numeric),
    list(mean = mean, sd = sd),
    .names = "{.col}_{.fn}"
  )
)
# Long form, then split "<variable>_<stat>" at the last underscore.
summary_long <- summary_cluster %>%
  pivot_longer(
    cols = -Cluster,
    names_to = "Variable_Stat",
    values_to = "Value"
  ) %>%
  separate(Variable_Stat, into = c("Variable", "Stat"), sep = "_(?=[^_]+$)")
# Wide form with one column per cluster, ordered by variable then statistic.
summary_final <- summary_long %>%
  pivot_wider(
    names_from = Cluster,
    values_from = Value
  ) %>%
  arrange(Variable, Stat)
summary_final
## # A tibble: 22 × 4
## Variable Stat `1` `2`
## <chr> <chr> <dbl> <dbl>
## 1 Age mean -0.233 0.433
## 2 Age sd 0.879 1.07
## 3 BMI mean -0.429 0.799
## 4 BMI sd 0.639 1.06
## 5 BS mean -0.470 0.874
## 6 BS sd 0.444 1.15
## 7 Body.Temp mean -0.0906 0.169
## 8 Body.Temp sd 0.888 1.16
## 9 Diastolic mean -0.291 0.541
## 10 Diastolic sd 0.826 1.07
## # ℹ 12 more rows
# Export the per-cluster summary table to an Excel workbook.
write.xlsx(summary_final,
file = "EDA_Cluster_Fuzzy.xlsx",
sheetName = "Summary Cluster",
rowNames = FALSE)
#install.packages('pheatmap')
library(pheatmap)
# Heatmap of the per-cluster means.
# `mean_cluster` is a tibble, and assigning row names to a tibble is
# deprecated (it raises a warning), so a plain data frame is used instead.
heatmap_data <- as.data.frame(mean_cluster)
rownames(heatmap_data) <- paste("Cluster", heatmap_data$Cluster)
heatmap_data$Cluster <- NULL
# The means are computed from the standardised data, so they are already on a
# common scale and are plotted as-is. Column scaling is not applied: with the
# two clusters fitted here it maps every column to the same pair of values
# (+/- 0.707), which hides how strongly each variable separates the clusters.
pheatmap(as.matrix(heatmap_data),
         scale = "none",
         main = "Profil Cluster berdasarkan Rata-rata (Heatmap)")
Berdasarkan hasil analisis menggunakan metode Fuzzy C-Means dengan
jumlah klaster optimal sebanyak 2, diperoleh bahwa klaster 2 memiliki
nilai rata-rata yang lebih tinggi pada beberapa variabel utama, sehingga
menunjukkan kelompok dengan tingkat risiko kesehatan yang lebih tinggi.
Sementara itu, klaster 1 memiliki nilai rata-rata yang lebih rendah dan
cenderung lebih homogen, sehingga merepresentasikan kelompok dengan
risiko kesehatan yang lebih rendah.
library(ggplot2)
# Density of Age for each fuzzy c-means cluster.
# df_cluster is built from the standardised matrix, so the x-axis shows the
# standardised Age values rather than age in years.
ggplot(df_cluster, aes(x = Age, fill = as.factor(Cluster))) +
geom_density(alpha = 0.5) +
labs(title = "Distribusi Age per Cluster")
Cluster 1 merepresentasikan kelompok dengan usia relatif lebih muda dan
homogen, sedangkan Cluster 2 merepresentasikan kelompok dengan usia
lebih tua dan variasi yang lebih besar. Perbedaan ini menunjukkan bahwa
variabel usia berkontribusi dalam membedakan karakteristik antar
cluster.
library(dplyr)
library(ggplot2)
# K-means labels drawn on the principal-component scores.
data_plot <- as.data.frame(data_pca)
data_plot$cluster <- as.factor(km_res$cluster)
# Convex-hull vertices of each cluster in the PC1-PC2 plane.
hull_data <- slice(group_by(data_plot, cluster), chull(PC1, PC2))
# Scatter plot with one shaded hull per cluster.
ggplot(data_plot, aes(PC1, PC2, colour = cluster)) +
  geom_point(size = 2) +
  geom_polygon(
    aes(fill = cluster),
    data = hull_data,
    colour = "red",
    alpha = 0.3
  ) +
  theme_minimal() +
  labs(
    title = "PCA Cluster Plot with K-means",
    x = "Principal Component 1",
    y = "Principal Component 2"
  )
library(dplyr)
library(ggplot2)
# K-medians labels drawn on the principal-component scores.
data_plot <- as.data.frame(data_pca)
data_plot$cluster <- as.factor(clusters(kmed_res))
# For every cluster keep only the rows on its convex hull (PC1 vs PC2).
hull_data <- slice(group_by(data_plot, cluster), chull(PC1, PC2))
# Points first, then the translucent hull polygons on top.
ggplot(data = data_plot, mapping = aes(PC1, PC2, colour = cluster)) +
  geom_point(size = 2) +
  geom_polygon(
    mapping = aes(fill = cluster),
    data = hull_data,
    alpha = 0.3,
    colour = "red"
  ) +
  theme_minimal() +
  labs(
    x = "Principal Component 1",
    y = "Principal Component 2",
    title = "PCA Cluster Plot with Kmedian"
  )
library(dplyr)
library(ggplot2)
# Fuzzy c-means (hard) labels drawn on the principal-component scores.
data_plot <- as.data.frame(data_pca)
data_plot$cluster <- as.factor(fcm_res$cluster)
# Outline of each cluster: the rows forming its convex hull in PC1-PC2.
hull_data <- data_plot %>%
  group_by(cluster) %>%
  slice(chull(PC1, PC2))
# Scatter of the scores plus a shaded hull for each cluster.
ggplot(data_plot, aes(PC1, PC2, colour = cluster)) +
  geom_point(size = 2) +
  geom_polygon(
    aes(fill = cluster),
    data = hull_data,
    colour = "red",
    alpha = 0.3
  ) +
  labs(
    title = "PCA Cluster Plot with FCM",
    x = "Principal Component 1",
    y = "Principal Component 2"
  ) +
  theme_minimal()
library(dplyr)
library(ggplot2)
# DBSCAN labels drawn on the principal-component scores.
data_plot <- as.data.frame(data_pca)
data_plot$cluster <- as.factor(db_res$cluster)
# Convex hull of each cluster in the PC1-PC2 plane. Label 0 is DBSCAN noise,
# not a cluster, so it gets no hull; otherwise the scattered noise points
# produce one large polygon covering the real clusters.
hull_data <- data_plot %>%
  dplyr::filter(cluster != "0") %>%
  group_by(cluster) %>%
  slice(chull(PC1, PC2))
# All points (noise included) are still plotted; only the hulls skip noise.
ggplot(data_plot, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(size = 2) +
  geom_polygon(data = hull_data,
               aes(fill = cluster),
               alpha = 0.3,
               color = "red") +
  labs(title = "PCA Cluster Plot with DBSCAN",
       x = "Principal Component 1",
       y = "Principal Component 2") +
  theme_minimal()
library(ggplot2)
library(dplyr)
# Mean Shift refit with bandwidth 1.5 on every column
bw <- 1.5
ms_res <- meanShift(df_scaled, bandwidth = rep(bw, ncol(df_scaled)))
# Flatten the assignment before it is turned into a factor
ms_cluster <- unlist(ms_res$assignment)
# PCA on the standardised data; the first two components are kept
pca_res <- prcomp(df_scaled, center = TRUE, scale. = TRUE)
data_pca_ms <- as.data.frame(pca_res$x[, 1:2])
names(data_pca_ms) <- c("PC1", "PC2")
data_pca_ms$cluster <- as.factor(ms_cluster)
# Convex-hull vertices of each cluster in the PC1-PC2 plane
hull_data_ms <- slice(group_by(data_pca_ms, cluster), chull(PC1, PC2))
# Scatter plot with one shaded hull per cluster
ggplot(data_pca_ms, aes(PC1, PC2, colour = cluster)) +
  geom_point(size = 2) +
  geom_polygon(
    aes(fill = cluster),
    data = hull_data_ms,
    alpha = 0.3
  ) +
  theme_minimal() +
  labs(
    title = "PCA Cluster Plot with Meanshift",
    x = "Principal Component 1",
    y = "Principal Component 2"
  )
Berdasarkan hasil analisis yang telah dilakukan, dapat disimpulkan bahwa proses clustering pada data maternal health yang diawali dengan scaling dan reduksi dimensi menggunakan PCA mampu meningkatkan kualitas pengelompokan data. Hasil evaluasi menggunakan Silhouette Index menunjukkan bahwa metode Fuzzy C-Means dengan jumlah klaster optimal sebanyak 2 memberikan performa terbaik dengan nilai sebesar 0,459, yang mengindikasikan bahwa struktur klaster yang terbentuk cukup baik dan memiliki pemisahan yang jelas. Analisis eksplorasi data (EDA) pada metode terbaik, yaitu Fuzzy C-Means, diperoleh bahwa data terbagi menjadi dua klaster utama, di mana klaster 2 merepresentasikan kelompok dengan tingkat risiko kesehatan yang lebih tinggi, ditandai dengan nilai rata-rata variabel yang lebih besar, sedangkan klaster 1 merepresentasikan kelompok dengan risiko kesehatan yang lebih rendah dan karakteristik yang lebih homogen. Dengan demikian, metode Fuzzy C-Means mampu memberikan segmentasi yang jelas dan informatif dalam mengidentifikasi tingkat risiko kesehatan ibu hamil.
Sumber Dataset: https://data.mendeley.com/datasets/8k9pvpmykk/1#:~:text=This%20dataset%20provides%20a%20comprehensive%20collection%20of%20maternal,blood%20pressure%2C%20gestational%20age%2C%20and%20fetal%20health%20status