Link Dataset: https://archive.ics.uci.edu/dataset/913/forty+soybean+cultivars+from+subsequent+harvests
#import data
data <- read.csv("cultivars.csv", sep=";")
print(head(data))
## Season Cultivar Repetition PH IFP NLP NGP NGL NS MHG GY
## 1 1 NEO 760 CE 1 58.80 15.20 98.2 177.80 1.81 5.2 152.20 3232.82
## 2 1 NEO 760 CE 2 58.60 13.40 102.0 195.00 1.85 7.2 141.69 3517.36
## 3 1 NEO 760 CE 3 63.40 17.20 100.4 203.00 2.02 6.8 148.81 3391.46
## 4 1 NEO 760 CE 4 60.27 15.27 100.2 191.93 1.89 6.4 148.50 3312.58
## 5 1 MANU IPRO 1 81.20 18.00 98.8 173.00 1.75 7.4 145.59 3230.99
## 6 1 MANU IPRO 2 75.80 20.80 69.2 128.00 1.85 7.2 154.87 3374.80
str(data)
## 'data.frame': 320 obs. of 11 variables:
## $ Season : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Cultivar : chr "NEO 760 CE" "NEO 760 CE" "NEO 760 CE" "NEO 760 CE" ...
## $ Repetition: int 1 2 3 4 1 2 3 4 1 2 ...
## $ PH : num 58.8 58.6 63.4 60.3 81.2 ...
## $ IFP : num 15.2 13.4 17.2 15.3 18 ...
## $ NLP : num 98.2 102 100.4 100.2 98.8 ...
## $ NGP : num 178 195 203 192 173 ...
## $ NGL : num 1.81 1.85 2.02 1.89 1.75 1.85 1.7 1.77 2.3 2.62 ...
## $ NS : num 5.2 7.2 6.8 6.4 7.4 7.2 6.8 7.13 7.2 6.2 ...
## $ MHG : num 152 142 149 148 146 ...
## $ GY : num 3233 3517 3391 3313 3231 ...
#Pre-processing
sum(is.na(data))
## [1] 0
# Filter kolom numerik
data <- data %>%
select(where(is.numeric))
# Scaling
data_scaled <- scale(data)
set.seed(123)
summary(data_scaled)
## Season Repetition PH IFP
## Min. :-0.9984 Min. :-1.3395 Min. :-2.3204 Min. :-2.73286
## 1st Qu.:-0.9984 1st Qu.:-0.6698 1st Qu.:-0.6069 1st Qu.:-0.61667
## Median : 0.0000 Median : 0.0000 Median :-0.1325 Median : 0.04464
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.9984 3rd Qu.: 0.6698 3rd Qu.: 0.6654 3rd Qu.: 0.61667
## Max. : 0.9984 Max. : 1.3395 Max. : 2.9485 Max. : 3.61571
## NLP NGP NGL NS
## Min. :-1.9378 Min. :-1.4429 Min. :-1.60792 Min. :-2.4901
## 1st Qu.:-0.7344 1st Qu.:-0.6618 1st Qu.:-0.34619 1st Qu.:-0.7268
## Median :-0.2286 Median :-0.1998 Median :-0.01291 Median :-0.1842
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.6045 3rd Qu.: 0.4342 3rd Qu.: 0.22515 3rd Qu.: 0.6296
## Max. : 3.1847 Max. : 9.0639 Max. :14.96121 Max. : 3.3423
## MHG GY
## Min. :-2.1025 Min. :-3.7382
## 1st Qu.:-0.7377 1st Qu.:-0.5804
## Median :-0.1107 Median :-0.0423
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7572 3rd Qu.: 0.5760
## Max. : 2.4294 Max. : 3.0048
# Elbow Method
wss <- sapply(1:10, function(k){
kmeans(data_scaled, centers = k, nstart = 20)$tot.withinss
})
par(mfrow = c(1, 1))
plot(1:10, wss, type = "b", pch = 19, frame = FALSE,
xlab = "Number of clusters K",
ylab = "Total within-clusters sum of squares",
main = "Elbow Method")
# Silhouette Analysis
avg_sil <- function(k) {
km_res <- kmeans(data_scaled, centers = k, nstart = 25)
ss <- silhouette(km_res$cluster, dist(data_scaled))
mean(ss[, 3])
}
k_values <- 2:10
avg_sil_values <- sapply(k_values, avg_sil)
plot(k_values, avg_sil_values, type = "b", pch = 19, frame = FALSE,
xlab = "Number of clusters K",
ylab = "Average Silhouette Width",
main = "Silhouette Analysis")
# k = 2, diambil dari hasil silhouette score
# 1. K-means
km_res <- kmeans(data_scaled, centers = 2)
# 2. K-median
kmed_res <- kcca(data_scaled, k = 2, family = kccaFamily("kmedians"))
# 3. DBSCAN
# eps ditentukan dengan trial error, didapatkan 2
db_res <- dbscan(data_scaled, eps = 2, MinPts = 5)
# 4. Mean Shift
ms_res <- meanShift(data_scaled)
# 5. Fuzzy C-means
fcm_res <- cmeans(data_scaled, centers = 2, m = 2)
pca_res <- prcomp(data_scaled, center = FALSE, scale. = FALSE)
df_pca <- data.frame(PC1 = pca_res$x[,1], PC2 = pca_res$x[,2])
par(mfrow = c(2, 3), mar = c(4, 4, 2, 1))
# Plot menggunakan proyeksi PCA
plot(df_pca, col = km_res$cluster, main = "K-means")
plot(df_pca, col = clusters(kmed_res), main = "K-medians")
plot(df_pca, col = db_res$cluster + 1L, main = "DBSCAN (0 = Noise)")
plot(df_pca, col = ms_res$assignment, main = "Mean Shift")
plot(df_pca, col = fcm_res$cluster, main = "Fuzzy C-means")
# reload data raw sementara untuk mengambil label cultivar asli
df_raw <- read.csv("cultivars.csv", sep=";")
plot(df_pca, col = as.numeric(as.factor(df_raw$Cultivar)), main = "Original Cultivar")
par(mfrow = c(1, 1))
cluster_labels <- list(
KMeans = km_res$cluster,
KMedians = clusters(kmed_res),
DBSCAN = db_res$cluster + 1L,
MeanShift = ms_res$assignment,
FCM = fcm_res$cluster
)
# 1. Silhouette score
sil_km <- mean(silhouette(km_res$cluster, dist(data_scaled))[,3])
sil_kmed <- mean(silhouette(clusters(kmed_res), dist(data_scaled))[,3])
sil_db <- mean(silhouette(db_res$cluster + 1L, dist(data_scaled))[,3])
sil_ms <- mean(silhouette(ms_res$assignment, dist(data_scaled))[,3])
sil_fcm <- mean(silhouette(fcm_res$cluster, dist(data_scaled))[,3])
results_sil <- data.frame(
Method = c("KMeans","KMedians","DBSCAN","MeanShift","FCM"),
Silhouette = c(sil_km, sil_kmed, sil_db, sil_ms, sil_fcm)
)
print(results_sil)
## Method Silhouette
## 1 KMeans 0.20342516
## 2 KMedians 0.16193964
## 3 DBSCAN 0.09167038
## 4 MeanShift -0.08297168
## 5 FCM 0.15393362
# 2. DUNN INDEX, SS, DAN ARI
dunn_scores <- c()
ss_scores <- c()
ari_scores <- c()
true_labels <- as.numeric(as.factor(df_raw$Cultivar))
dist_matrix <- dist(data_scaled)
for(name in names(cluster_labels)) {
labels <- cluster_labels[[name]]
# cegah eror bilah hanya memiliki 1 klaster
if(length(unique(labels)) > 1) {
stats <- cluster.stats(dist_matrix, as.numeric(labels))
dunn_scores <- c(dunn_scores, stats$dunn)
ss_scores <- c(ss_scores, stats$within.cluster.ss)
} else {
dunn_scores <- c(dunn_scores, NA)
ss_scores <- c(ss_scores, NA)
}
# menghitung ari score
ari <- adjustedRandIndex(labels, true_labels)
ari_scores <- c(ari_scores, ari)
}
# membuat frame eval
results_eval <- data.frame(
Method = names(cluster_labels),
Dunn_Index = dunn_scores,
Within_SS = ss_scores,
ARI_Score = ari_scores
)
print(results_eval)
## Method Dunn_Index Within_SS ARI_Score
## 1 KMeans 0.05523105 2656.6714 0.0086263833
## 2 KMedians 0.06410796 2717.1733 -0.0041994156
## 3 DBSCAN 0.06136258 2608.6094 0.0028838193
## 4 MeanShift 0.20967810 438.3158 -0.0100197888
## 5 FCM 0.05547429 2724.2361 -0.0000995468