Tugas ini bertujuan untuk mengimplementasikan metode clustering dalam analisis multivariat menggunakan dataset Automobile yang terdiri dari 26 fitur. Pertama, dilakukan tahap preprocessing data dengan memilih variabel numerik saja sehingga tersisa 15 fitur numerik, kemudian dilakukan proses untuk menangani missing value. Selanjutnya dilakukan eksplorasi data untuk menentukan jumlah cluster yang optimal menggunakan metode Elbow dan Silhouette.
Setelah jumlah cluster ditentukan, proses pengelompokan dilakukan menggunakan lima metode, yaitu K-Means, K-Median, DBSCAN, Mean Shift, dan Fuzzy C-Means. Setiap metode menghasilkan pola cluster yang berbeda sesuai dengan karakteristik algoritma masing-masing. Hasil clustering kemudian divisualisasikan untuk mempermudah pengelompokan data, serta dievaluasi menggunakan metrik seperti Silhouette Score dan Dunn Index untuk menilai kualitas cluster yang terbentuk.
# Read the preprocessed automobile table; the first row of the CSV holds the
# column names.
data <- read.csv("processed_automobile_data.csv", header = TRUE)

# Remove the symboling column before any further processing.
data <- data[, names(data) != "symboling"]

# Keep only the numeric columns and display their structure.
numeric_cols <- sapply(data, is.numeric)
data_numeric <- data[, numeric_cols]
str(data_numeric)
## 'data.frame': 159 obs. of 15 variables:
## $ normalized.losses: int 164 164 158 158 192 192 188 188 121 98 ...
## $ wheel.base : num 99.8 99.4 105.8 105.8 101.2 ...
## $ length : num 177 177 193 193 177 ...
## $ width : num 66.2 66.4 71.4 71.4 64.8 64.8 64.8 64.8 60.3 63.6 ...
## $ height : num 54.3 54.3 55.7 55.9 54.3 54.3 54.3 54.3 53.2 52 ...
## $ curb.weight : int 2337 2824 2844 3086 2395 2395 2710 2765 1488 1874 ...
## $ engine.size : int 109 136 136 131 108 108 164 164 61 90 ...
## $ bore : num 3.19 3.19 3.19 3.13 3.5 3.5 3.31 3.31 2.91 3.03 ...
## $ stroke : num 3.4 3.4 3.4 3.4 2.8 2.8 3.19 3.19 3.03 3.11 ...
## $ compression.ratio: num 10 8 8.5 8.3 8.8 8.8 9 9 9.5 9.6 ...
## $ horsepower : int 102 115 110 140 101 101 121 121 48 70 ...
## $ peak.rpm : int 5500 5500 5500 5500 5800 5800 4250 4250 5100 5400 ...
## $ city.mpg : int 24 18 19 17 23 23 21 21 47 38 ...
## $ highway.mpg : int 30 22 25 20 29 29 28 28 53 43 ...
## $ price : int 13950 17450 17710 23875 16430 16925 20970 21105 5151 6295 ...
# Rebuild the numeric subset, discard rows that contain missing values, and
# standardise every column (scale() defaults: centre each column and divide
# it by its standard deviation).
numeric_cols <- sapply(data, is.numeric)
data_numeric <- na.omit(data[, numeric_cols])
df <- scale(data_numeric)

# scale() returns a matrix; print its class and dimensions as a sanity check.
print(class(df))
## [1] "matrix" "array"
print(dim(df))
## [1] 159 15
# Select the numeric columns from the raw table once more. Note that this
# replaces the na.omit()-cleaned copy of `data_numeric` built earlier.
numeric_cols <- sapply(data, is.numeric)
data_numeric <- data[numeric_cols]

# Split off a `status` column as the label and keep the rest for clustering.
# NOTE(review): the str() output earlier in this script lists no `status`
# column, so `label` is presumably NULL and `data_clustering` is just
# `data_numeric` -- confirm which column was meant to hold the labels.
label <- data_numeric$status
keep_cols <- colnames(data_numeric) != "status"
data_clustering <- data_numeric[, keep_cols]

# Number of missing values in each column.
colSums(is.na(data_numeric))
## normalized.losses wheel.base length width
## 0 0 0 0
## height curb.weight engine.size bore
## 0 0 0 0
## stroke compression.ratio horsepower peak.rpm
## 0 0 0 0
## city.mpg highway.mpg price
## 0 0 0
# Per-column summary statistics.
# NOTE(review): describe() is not base R; the column layout of the output
# below looks like psych::describe -- confirm which package provides it.
describe(data_numeric)
## vars n mean sd median trimmed mad min
## normalized.losses 1 159 121.13 35.65 113.00 118.43 32.62 65.00
## wheel.base 2 159 98.26 5.17 96.90 97.72 3.56 86.60
## length 3 159 172.41 11.52 172.40 172.49 9.04 141.10
## width 4 159 65.61 1.95 65.40 65.39 1.78 60.30
## height 5 159 53.90 2.27 54.10 53.85 2.37 49.40
## curb.weight 6 159 2461.14 481.94 2340.00 2422.71 492.22 1488.00
## engine.size 7 159 119.23 30.46 110.00 114.88 19.27 61.00
## bore 8 159 3.30 0.27 3.27 3.29 0.36 2.54
## stroke 9 159 3.24 0.29 3.27 3.27 0.21 2.07
## compression.ratio 10 159 10.16 3.89 9.00 9.08 0.59 7.00
## horsepower 11 159 95.84 30.72 88.00 92.43 29.65 48.00
## peak.rpm 12 159 5113.84 465.75 5200.00 5114.34 444.78 4150.00
## city.mpg 13 159 26.52 6.10 26.00 26.09 5.93 15.00
## highway.mpg 14 159 32.08 6.46 32.00 31.74 5.93 18.00
## price 15 159 11445.73 5877.86 9233.00 10543.25 3534.52 5118.00
## max range skew kurtosis se
## normalized.losses 256.00 191.0 0.82 0.52 2.83
## wheel.base 115.60 29.0 0.90 0.53 0.41
## length 202.60 61.5 -0.06 -0.27 0.91
## width 71.70 11.4 0.90 0.74 0.15
## height 59.80 10.4 0.17 -0.34 0.18
## curb.weight 4066.00 2578.0 0.77 0.07 38.22
## engine.size 258.00 197.0 1.46 2.75 2.42
## bore 3.94 1.4 0.15 -0.86 0.02
## stroke 4.17 2.1 -0.97 2.35 0.02
## compression.ratio 23.00 16.0 2.66 5.40 0.31
## horsepower 200.00 152.0 0.90 0.21 2.44
## peak.rpm 6600.00 2450.0 0.15 0.31 36.94
## city.mpg 49.00 34.0 0.72 1.02 0.48
## highway.mpg 54.00 36.0 0.59 0.72 0.51
## price 35056.00 29938.0 1.56 2.41 466.14
# NOTE(review): this line replaces the standardised automobile matrix built
# earlier in this script with the built-in iris measurements (column 5
# removed). Every clustering step that uses `df` from here on therefore runs
# on iris, not on the automobile data -- confirm that this is intended.
df <- scale(iris[, -5])

# Elbow method: total within-cluster sum of squares for k = 1..10.
# set.seed() fixes the random starts used by kmeans(); vapply() guarantees a
# plain numeric vector with exactly one value per k (sapply() would silently
# change its return type on unexpected results).
set.seed(123)
wss <- vapply(
  1:10,
  function(k) kmeans(df, centers = k, nstart = 20)$tot.withinss,
  numeric(1)
)
# Use a single-panel layout, then draw the elbow curve: one point per
# candidate k against its total within-cluster sum of squares.
par(mfrow = c(1, 1))
k_grid <- 1:10
plot(
  x = k_grid, y = wss,
  type = "b", pch = 19, frame = FALSE,
  main = "Elbow Method",
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares"
)
# Average silhouette width of a k-means partition into `k` clusters.
#
# k: number of clusters, passed to kmeans() as `centers`.
# Returns the mean of the third column of the silhouette() result computed
# from the pairwise distances between the rows of `df`.
# Note: `df` is read from the enclosing environment, not passed in.
avg_sil <- function(k) {
  fit <- kmeans(df, centers = k, nstart = 25)
  sil <- silhouette(fit$cluster, dist(df))
  widths <- sil[, 3]
  mean(widths)
}
# Silhouette analysis: average silhouette width for k = 2..10.
# vapply() enforces one numeric value per k, so an unexpected return from
# avg_sil() fails loudly instead of silently changing the result type.
k_values <- 2:10
avg_sil_values <- vapply(k_values, avg_sil, numeric(1))

# Single-panel layout, then plot the average width against k.
par(mfrow = c(1, 1))
plot(k_values, avg_sil_values, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Average Silhouette Width",
     main = "Silhouette Analysis")
# Fit the five clustering methods on `df`.

# K-means with 3 clusters. Several random starts are used, as in the
# silhouette analysis, so the final fit is not a single-start local optimum.
km_res <- kmeans(df, centers = 3, nstart = 25)

# K-medians with 3 clusters via kcca() and the "kmedians" family.
kmed_res <- kcca(df, k = 3, family = kccaFamily("kmedians"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'

# DBSCAN with neighbourhood radius 0.7 and at least 5 points per core region.
# NOTE(review): more than one package exports dbscan(); confirm which one is
# attached, since the argument spelling `MinPts` depends on it.
db_res <- dbscan(df, eps = 0.7, MinPts = 5)

# Mean shift with the function's default settings.
ms_res <- meanShift(df)

# Fuzzy C-means with 3 clusters.
fcm_res <- cmeans(df, centers = 3, m = 2) # m = 2 is the fuzziness parameter
# One scatter plot of `df` per clustering result, coloured by cluster
# assignment, plus a final panel coloured by the iris species factor.
# DBSCAN labels are shifted by one so that label 0 still maps to a colour.
panels <- list(
  list(colours = km_res$cluster, title = "K-means"),
  list(colours = clusters(kmed_res), title = "K-medians"),
  list(colours = db_res$cluster + 1L, title = "DBSCAN (0 = Noise)"),
  list(colours = ms_res$assignment, title = "Mean Shift"),
  list(colours = fcm_res$cluster, title = "Fuzzy C-means"),
  list(colours = as.numeric(iris$Species), title = "Original Species")
)
for (panel in panels) {
  plot(df, col = panel[["colours"]], main = panel[["title"]])
}
# Evaluate the k-means partition. The pairwise distance matrix is computed
# once and shared by the silhouette and cluster.stats() calls.
dist_mat <- dist(df)

# Average silhouette width (third column of the silhouette() result).
km_sil <- silhouette(km_res$cluster, dist_mat)
mean(km_sil[, 3])
## [1] 0.4599482

# Dunn index and within-cluster sum of squares from cluster.stats().
stats <- cluster.stats(dist_mat, km_res$cluster)
paste("Dunn Index:", stats$dunn)
## [1] "Dunn Index: 0.0264966519696275"
paste("Within-cluster SS:", stats$within.cluster.ss)
## [1] "Within-cluster SS: 138.888359717351"

# Agreement between the k-means clusters and the iris species labels.
ari_score <- adjustedRandIndex(km_res$cluster, iris$Species)
print(paste("Adjusted Rand Index:", ari_score))
## [1] "Adjusted Rand Index: 0.620135180887038"