Modul 3: Clustering (K-Means, K-Medians, DBSCAN, Mean Shift, Fuzzy C-Means)

Link Dataset: https://archive.ics.uci.edu/dataset/913/forty+soybean+cultivars+from+subsequent+harvests

1. Load Data

#import data
data <- read.csv("cultivars.csv", sep=";")
print(head(data))
##   Season   Cultivar Repetition    PH   IFP   NLP    NGP  NGL  NS    MHG      GY
## 1      1 NEO 760 CE          1 58.80 15.20  98.2 177.80 1.81 5.2 152.20 3232.82
## 2      1 NEO 760 CE          2 58.60 13.40 102.0 195.00 1.85 7.2 141.69 3517.36
## 3      1 NEO 760 CE          3 63.40 17.20 100.4 203.00 2.02 6.8 148.81 3391.46
## 4      1 NEO 760 CE          4 60.27 15.27 100.2 191.93 1.89 6.4 148.50 3312.58
## 5      1  MANU IPRO          1 81.20 18.00  98.8 173.00 1.75 7.4 145.59 3230.99
## 6      1  MANU IPRO          2 75.80 20.80  69.2 128.00 1.85 7.2 154.87 3374.80
str(data)
## 'data.frame':    320 obs. of  11 variables:
##  $ Season    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Cultivar  : chr  "NEO 760 CE" "NEO 760 CE" "NEO 760 CE" "NEO 760 CE" ...
##  $ Repetition: int  1 2 3 4 1 2 3 4 1 2 ...
##  $ PH        : num  58.8 58.6 63.4 60.3 81.2 ...
##  $ IFP       : num  15.2 13.4 17.2 15.3 18 ...
##  $ NLP       : num  98.2 102 100.4 100.2 98.8 ...
##  $ NGP       : num  178 195 203 192 173 ...
##  $ NGL       : num  1.81 1.85 2.02 1.89 1.75 1.85 1.7 1.77 2.3 2.62 ...
##  $ NS        : num  5.2 7.2 6.8 6.4 7.4 7.2 6.8 7.13 7.2 6.2 ...
##  $ MHG       : num  152 142 149 148 146 ...
##  $ GY        : num  3233 3517 3391 3313 3231 ...
#Pre-processing
sum(is.na(data))
## [1] 0
# Filter kolom numerik
data <- data %>%
  select(where(is.numeric))

# Scaling
data_scaled <- scale(data)
set.seed(123)
summary(data_scaled)
##      Season          Repetition            PH               IFP          
##  Min.   :-0.9984   Min.   :-1.3395   Min.   :-2.3204   Min.   :-2.73286  
##  1st Qu.:-0.9984   1st Qu.:-0.6698   1st Qu.:-0.6069   1st Qu.:-0.61667  
##  Median : 0.0000   Median : 0.0000   Median :-0.1325   Median : 0.04464  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.9984   3rd Qu.: 0.6698   3rd Qu.: 0.6654   3rd Qu.: 0.61667  
##  Max.   : 0.9984   Max.   : 1.3395   Max.   : 2.9485   Max.   : 3.61571  
##       NLP               NGP               NGL                 NS         
##  Min.   :-1.9378   Min.   :-1.4429   Min.   :-1.60792   Min.   :-2.4901  
##  1st Qu.:-0.7344   1st Qu.:-0.6618   1st Qu.:-0.34619   1st Qu.:-0.7268  
##  Median :-0.2286   Median :-0.1998   Median :-0.01291   Median :-0.1842  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.6045   3rd Qu.: 0.4342   3rd Qu.: 0.22515   3rd Qu.: 0.6296  
##  Max.   : 3.1847   Max.   : 9.0639   Max.   :14.96121   Max.   : 3.3423  
##       MHG                GY         
##  Min.   :-2.1025   Min.   :-3.7382  
##  1st Qu.:-0.7377   1st Qu.:-0.5804  
##  Median :-0.1107   Median :-0.0423  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.7572   3rd Qu.: 0.5760  
##  Max.   : 2.4294   Max.   : 3.0048

2. Penentuan Jumlah Klaster Optimal (Elbow & Silhouette)

# Elbow Method
wss <- sapply(1:10, function(k){
  kmeans(data_scaled, centers = k, nstart = 20)$tot.withinss
})

par(mfrow = c(1, 1))
plot(1:10, wss, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares",
     main = "Elbow Method")

# Silhouette Analysis
avg_sil <- function(k) {
  km_res <- kmeans(data_scaled, centers = k, nstart = 25)
  ss <- silhouette(km_res$cluster, dist(data_scaled))
  mean(ss[, 3])
}

k_values <- 2:10
avg_sil_values <- sapply(k_values, avg_sil)

plot(k_values, avg_sil_values, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Average Silhouette Width",
     main = "Silhouette Analysis")

3. Implementasi Algoritma Clustering

# k = 2, diambil dari hasil silhouette score

# 1. K-means
km_res <- kmeans(data_scaled, centers = 2)

# 2. K-median 
kmed_res <- kcca(data_scaled, k = 2, family = kccaFamily("kmedians"))

# 3. DBSCAN
# eps ditentukan dengan trial error, didapatkan 2
db_res <- dbscan(data_scaled, eps = 2, MinPts = 5) 

# 4. Mean Shift
ms_res <- meanShift(data_scaled)

# 5. Fuzzy C-means
fcm_res <- cmeans(data_scaled, centers = 2, m = 2)

4. Visualisasi dengan PCA

pca_res <- prcomp(data_scaled, center = FALSE, scale. = FALSE)
df_pca <- data.frame(PC1 = pca_res$x[,1], PC2 = pca_res$x[,2])

par(mfrow = c(2, 3), mar = c(4, 4, 2, 1))

# Plot menggunakan proyeksi PCA
plot(df_pca, col = km_res$cluster, main = "K-means")
plot(df_pca, col = clusters(kmed_res), main = "K-medians")
plot(df_pca, col = db_res$cluster + 1L, main = "DBSCAN (0 = Noise)")
plot(df_pca, col = ms_res$assignment, main = "Mean Shift")
plot(df_pca, col = fcm_res$cluster, main = "Fuzzy C-means")

# reload data raw sementara untuk mengambil label cultivar asli
df_raw <- read.csv("cultivars.csv", sep=";")
plot(df_pca, col = as.numeric(as.factor(df_raw$Cultivar)), main = "Original Cultivar")

par(mfrow = c(1, 1))

5. Evaluasi Metrik

cluster_labels <- list(
  KMeans = km_res$cluster,
  KMedians = clusters(kmed_res),
  DBSCAN = db_res$cluster + 1L,
  MeanShift = ms_res$assignment,
  FCM = fcm_res$cluster
)

# 1. Silhouette score
sil_km <- mean(silhouette(km_res$cluster, dist(data_scaled))[,3])
sil_kmed <- mean(silhouette(clusters(kmed_res), dist(data_scaled))[,3])
sil_db <- mean(silhouette(db_res$cluster + 1L, dist(data_scaled))[,3])
sil_ms <- mean(silhouette(ms_res$assignment, dist(data_scaled))[,3])
sil_fcm <- mean(silhouette(fcm_res$cluster, dist(data_scaled))[,3])

results_sil <- data.frame(
  Method = c("KMeans","KMedians","DBSCAN","MeanShift","FCM"),
  Silhouette = c(sil_km, sil_kmed, sil_db, sil_ms, sil_fcm)
)
print(results_sil)
##      Method  Silhouette
## 1    KMeans  0.20342516
## 2  KMedians  0.16193964
## 3    DBSCAN  0.09167038
## 4 MeanShift -0.08297168
## 5       FCM  0.15393362
# 2. DUNN INDEX, SS, DAN ARI
dunn_scores <- c()
ss_scores <- c()
ari_scores <- c()

true_labels <- as.numeric(as.factor(df_raw$Cultivar))
dist_matrix <- dist(data_scaled)

for(name in names(cluster_labels)) {
  labels <- cluster_labels[[name]]
  
  # cegah eror bilah hanya memiliki 1 klaster
  if(length(unique(labels)) > 1) {
    stats <- cluster.stats(dist_matrix, as.numeric(labels))
    dunn_scores <- c(dunn_scores, stats$dunn)
    ss_scores <- c(ss_scores, stats$within.cluster.ss)
  } else {
    dunn_scores <- c(dunn_scores, NA)
    ss_scores <- c(ss_scores, NA)
  }
  
  # menghitung ari score
  ari <- adjustedRandIndex(labels, true_labels)
  ari_scores <- c(ari_scores, ari)
}

# membuat frame eval
results_eval <- data.frame(
  Method = names(cluster_labels),
  Dunn_Index = dunn_scores,
  Within_SS = ss_scores,
  ARI_Score = ari_scores
)

print(results_eval)
##      Method Dunn_Index Within_SS     ARI_Score
## 1    KMeans 0.05523105 2656.6714  0.0086263833
## 2  KMedians 0.06410796 2717.1733 -0.0041994156
## 3    DBSCAN 0.06136258 2608.6094  0.0028838193
## 4 MeanShift 0.20967810  438.3158 -0.0100197888
## 5       FCM 0.05547429 2724.2361 -0.0000995468