Deskripsi Tugas

Tugas ini bertujuan untuk mengimplementasikan metode clustering dalam analisis multivariat menggunakan dataset Automobile yang terdiri dari 26 fitur. Pertama, dilakukan tahap preprocessing data dengan memilih variabel numerik saja sehingga tersisa 15 fitur numerik, lalu dilakukan proses untuk menangani missing value. Selanjutnya dilakukan eksplorasi data untuk menentukan jumlah cluster yang optimal menggunakan metode Elbow dan Silhouette.

Setelah jumlah cluster ditentukan, proses pengelompokan dilakukan menggunakan lima metode, yaitu K-Means, K-Median, DBSCAN, Mean Shift, dan Fuzzy C-Means. Setiap metode menghasilkan pola cluster yang berbeda sesuai dengan karakteristik algoritma masing-masing. Hasil clustering kemudian divisualisasikan untuk mempermudah pengelompokan data, serta dievaluasi menggunakan metrik seperti Silhouette Score dan Dunn Index untuk menilai kualitas cluster yang terbentuk.

# Load the pre-processed Automobile data; the first row of the CSV holds the
# column names.
data <- read.csv("processed_automobile_data.csv", header = TRUE)
# Exclude `symboling` from the analysis. `drop = FALSE` keeps the result a
# data frame even if only one column were left.
data <- data[, !(names(data) %in% "symboling"), drop = FALSE]

# Keep only the numeric columns. vapply() guarantees a logical mask (sapply()
# can change its return type on unusual input), and `drop = FALSE` prevents a
# single surviving column from collapsing into a bare vector.
is_numeric_col <- vapply(data, is.numeric, logical(1))
data_numeric <- data[, is_numeric_col, drop = FALSE]

# Show the structure of the retained numeric features.
str(data_numeric)
## 'data.frame':    159 obs. of  15 variables:
##  $ normalized.losses: int  164 164 158 158 192 192 188 188 121 98 ...
##  $ wheel.base       : num  99.8 99.4 105.8 105.8 101.2 ...
##  $ length           : num  177 177 193 193 177 ...
##  $ width            : num  66.2 66.4 71.4 71.4 64.8 64.8 64.8 64.8 60.3 63.6 ...
##  $ height           : num  54.3 54.3 55.7 55.9 54.3 54.3 54.3 54.3 53.2 52 ...
##  $ curb.weight      : int  2337 2824 2844 3086 2395 2395 2710 2765 1488 1874 ...
##  $ engine.size      : int  109 136 136 131 108 108 164 164 61 90 ...
##  $ bore             : num  3.19 3.19 3.19 3.13 3.5 3.5 3.31 3.31 2.91 3.03 ...
##  $ stroke           : num  3.4 3.4 3.4 3.4 2.8 2.8 3.19 3.19 3.03 3.11 ...
##  $ compression.ratio: num  10 8 8.5 8.3 8.8 8.8 9 9 9.5 9.6 ...
##  $ horsepower       : int  102 115 110 140 101 101 121 121 48 70 ...
##  $ peak.rpm         : int  5500 5500 5500 5500 5800 5800 4250 4250 5100 5400 ...
##  $ city.mpg         : int  24 18 19 17 23 23 21 21 47 38 ...
##  $ highway.mpg      : int  30 22 25 20 29 29 28 28 53 43 ...
##  $ price            : int  13950 17450 17710 23875 16430 16925 20970 21105 5151 6295 ...
# Select the numeric columns of `data`, discard every row that contains a
# missing value, then standardise each column with scale().
numeric_mask <- sapply(data, is.numeric)
data_numeric <- na.omit(data[, numeric_mask])
df <- scale(data_numeric)

# Sanity check: report the class and the dimensions of the scaled object.
print(class(df))
## [1] "matrix" "array"
print(dim(df))
## [1] 159  15
# `data_numeric` is intentionally NOT re-derived from `data` here: doing so
# would silently discard the na.omit() applied earlier.

# Optional reference label. `[[` with an explicit membership test avoids the
# partial matching of `$` and makes the "no such column" case explicit: `label`
# is NULL when `data_numeric` has no column named "status".
label_col <- "status"
label <- if (label_col %in% names(data_numeric)) {
  data_numeric[[label_col]]
} else {
  NULL
}
# Clustering features: every column except the label column. `drop = FALSE`
# keeps a data frame even when a single column remains.
data_clustering <- data_numeric[, colnames(data_numeric) != label_col,
                                drop = FALSE]

# Number of missing values per column.
colSums(is.na(data_numeric))
## normalized.losses        wheel.base            length             width 
##                 0                 0                 0                 0 
##            height       curb.weight       engine.size              bore 
##                 0                 0                 0                 0 
##            stroke compression.ratio        horsepower          peak.rpm 
##                 0                 0                 0                 0 
##          city.mpg       highway.mpg             price 
##                 0                 0                 0
# Descriptive statistics for every numeric feature.
describe(data_numeric)
##                   vars   n     mean      sd  median  trimmed     mad     min
## normalized.losses    1 159   121.13   35.65  113.00   118.43   32.62   65.00
## wheel.base           2 159    98.26    5.17   96.90    97.72    3.56   86.60
## length               3 159   172.41   11.52  172.40   172.49    9.04  141.10
## width                4 159    65.61    1.95   65.40    65.39    1.78   60.30
## height               5 159    53.90    2.27   54.10    53.85    2.37   49.40
## curb.weight          6 159  2461.14  481.94 2340.00  2422.71  492.22 1488.00
## engine.size          7 159   119.23   30.46  110.00   114.88   19.27   61.00
## bore                 8 159     3.30    0.27    3.27     3.29    0.36    2.54
## stroke               9 159     3.24    0.29    3.27     3.27    0.21    2.07
## compression.ratio   10 159    10.16    3.89    9.00     9.08    0.59    7.00
## horsepower          11 159    95.84   30.72   88.00    92.43   29.65   48.00
## peak.rpm            12 159  5113.84  465.75 5200.00  5114.34  444.78 4150.00
## city.mpg            13 159    26.52    6.10   26.00    26.09    5.93   15.00
## highway.mpg         14 159    32.08    6.46   32.00    31.74    5.93   18.00
## price               15 159 11445.73 5877.86 9233.00 10543.25 3534.52 5118.00
##                        max   range  skew kurtosis     se
## normalized.losses   256.00   191.0  0.82     0.52   2.83
## wheel.base          115.60    29.0  0.90     0.53   0.41
## length              202.60    61.5 -0.06    -0.27   0.91
## width                71.70    11.4  0.90     0.74   0.15
## height               59.80    10.4  0.17    -0.34   0.18
## curb.weight        4066.00  2578.0  0.77     0.07  38.22
## engine.size         258.00   197.0  1.46     2.75   2.42
## bore                  3.94     1.4  0.15    -0.86   0.02
## stroke                4.17     2.1 -0.97     2.35   0.02
## compression.ratio    23.00    16.0  2.66     5.40   0.31
## horsepower          200.00   152.0  0.90     0.21   2.44
## peak.rpm           6600.00  2450.0  0.15     0.31  36.94
## city.mpg             49.00    34.0  0.72     1.02   0.48
## highway.mpg          54.00    36.0  0.59     0.72   0.51
## price             35056.00 29938.0  1.56     2.41 466.14
# ---- Clustering analysis on the Automobile data ----
# Standardise the NA-free numeric Automobile features. The previous version
# overwrote `df` with scale(iris[, -5]) here, so the elbow/silhouette plots,
# all five clusterings and every reported metric described the iris data
# instead of the Automobile data.
# NOTE: the console output that used to be recorded in this section was
# produced on iris and has been removed; re-run the script to regenerate it.
df <- scale(na.omit(data_numeric))

# Pairwise distances between observations, computed once and reused by every
# silhouette / cluster.stats call below.
df_dist <- dist(df)

set.seed(123)

# Elbow method: total within-cluster sum of squares for k = 1..10.
wss <- vapply(1:10, function(k) {
  kmeans(df, centers = k, nstart = 20)$tot.withinss
}, numeric(1))
par(mfrow = c(1, 1))
plot(1:10, wss, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares",
     main = "Elbow Method")

# Average silhouette width of a k-means partition of `df`.
#   k: number of clusters.
# Returns the mean of the `sil_width` column of the silhouette object.
avg_sil <- function(k) {
  km_res <- kmeans(df, centers = k, nstart = 25)
  ss <- silhouette(km_res$cluster, df_dist)
  mean(ss[, "sil_width"])
}
k_values <- 2:10
avg_sil_values <- vapply(k_values, avg_sil, numeric(1))
par(mfrow = c(1, 1))
plot(k_values, avg_sil_values, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Average Silhouette Width",
     main = "Silhouette Analysis")

# Number of clusters: the k with the largest average silhouette width. The
# previous hard-coded 3 was a leftover from iris (three species). Cross-check
# this value against the elbow plot and override it here if they disagree.
k_opt <- k_values[which.max(avg_sil_values)]

# Re-seed so the final models do not depend on how many random numbers the
# exploration above consumed.
set.seed(123)
km_res <- kmeans(df, centers = k_opt, nstart = 25)
kmed_res <- kcca(df, k = k_opt, family = kccaFamily("kmedians"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
# NOTE(review): eps = 0.7 / MinPts = 5 are carried over unchanged; they were
# used with the iris matrix before and presumably need re-tuning for this
# data -- confirm.
db_res <- dbscan(df, eps = 0.7, MinPts = 5)
ms_res <- meanShift(df)
fcm_res <- cmeans(df, centers = k_opt, m = 2) # m = 2 is the fuzziness parameter

# plot() on a matrix only draws its first two columns, which hides most of the
# feature space. Project onto the first two principal components instead
# (`df` is already standardised).
pca_xy <- prcomp(df)$x[, 1:2]

plot(pca_xy, col = km_res$cluster, main = "K-means")

plot(pca_xy, col = clusters(kmed_res), main = "K-medians")

# DBSCAN labels noise as 0; shift by one so noise gets a visible colour.
plot(pca_xy, col = db_res$cluster + 1L, main = "DBSCAN (0 = Noise)")

plot(pca_xy, col = ms_res$assignment, main = "Mean Shift")

plot(pca_xy, col = fcm_res$cluster, main = "Fuzzy C-means")

# Internal validation of the k-means solution.
mean(silhouette(km_res$cluster, df_dist)[, "sil_width"])
stats <- cluster.stats(df_dist, km_res$cluster)
paste("Dunn Index:", stats$dunn)
paste("Within-cluster SS:", stats$within.cluster.ss)

# External validation only makes sense when a reference label with one entry
# per clustered row exists. `iris$Species` (150 rows) is unrelated to this data
# and must not be used.
ari_score <- NA_real_
if (exists("label") && !is.null(label) && length(label) == nrow(df)) {
  plot(pca_xy, col = as.numeric(factor(label)), main = "Reference labels")
  ari_score <- adjustedRandIndex(km_res$cluster, label)
  print(paste("Adjusted Rand Index:", ari_score))
}