KELOMPOK: 13
M. Arifin Ilham
Indah Faizah Salsabillah Ramdhany Wadjo
Judul modul: Implementasi Clustering pada Mobile Price Classification menggunakan metode K-Means, K-Medians, DBSCAN, Mean Shift, dan Fuzzy
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(flexclust)
library(dbscan)
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(meanShiftR)
library(e1071)
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:flexclust':
##
## bclust
##
## The following object is masked from 'package:ggplot2':
##
## element
library(cluster)
library(fpc)
##
## Attaching package: 'fpc'
##
## The following object is masked from 'package:dbscan':
##
## dbscan
library(corrplot)
## corrplot 0.95 loaded
LOAD DATA
# Read the raw handset table from the working directory and inspect its structure.
df <- read.csv(file = "Mobile Price Classification.csv")
str(object = df)
## 'data.frame': 1000 obs. of 21 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ battery_power: int 1043 841 1807 1546 1434 1464 1718 833 1111 1520 ...
## $ blue : int 1 1 1 0 0 1 0 0 1 0 ...
## $ clock_speed : num 1.8 0.5 2.8 0.5 1.4 2.9 2.4 2.4 2.9 0.5 ...
## $ dual_sim : int 1 1 0 1 0 1 0 1 1 0 ...
## $ fc : int 14 4 1 18 11 5 1 0 9 1 ...
## $ four_g : int 0 1 0 1 1 1 0 0 1 0 ...
## $ int_memory : int 5 61 27 25 49 50 47 62 25 25 ...
## $ m_dep : num 0.1 0.8 0.9 0.5 0.5 0.8 1 0.8 0.6 0.5 ...
## $ mobile_wt : int 193 191 186 96 108 198 156 111 101 171 ...
## $ n_cores : int 3 5 3 8 6 8 2 1 5 3 ...
## $ pc : int 16 12 4 20 18 9 3 2 19 20 ...
## $ px_height : int 226 746 1270 295 749 569 1283 1312 556 52 ...
## $ px_width : int 1412 857 1366 1752 810 939 1374 1880 876 1009 ...
## $ ram : int 3476 3895 2396 3893 1773 3506 3873 1495 3485 651 ...
## $ sc_h : int 12 6 17 10 15 10 14 7 11 6 ...
## $ sc_w : int 7 0 10 0 8 7 2 2 9 0 ...
## $ talk_time : int 2 7 10 7 7 3 10 18 10 5 ...
## $ three_g : int 0 1 0 1 1 1 0 0 1 1 ...
## $ touch_screen : int 1 0 1 1 0 1 0 1 1 0 ...
## $ wifi : int 0 0 1 0 1 1 0 1 0 1 ...
# Per-column summary statistics of the raw table (id column still included).
df %>% summary()
## id battery_power blue clock_speed
## Min. : 1.0 Min. : 500 Min. :0.000 Min. :0.500
## 1st Qu.: 250.8 1st Qu.: 895 1st Qu.:0.000 1st Qu.:0.700
## Median : 500.5 Median :1246 Median :1.000 Median :1.500
## Mean : 500.5 Mean :1249 Mean :0.516 Mean :1.541
## 3rd Qu.: 750.2 3rd Qu.:1629 3rd Qu.:1.000 3rd Qu.:2.300
## Max. :1000.0 Max. :1999 Max. :1.000 Max. :3.000
## dual_sim fc four_g int_memory
## Min. :0.000 Min. : 0.000 Min. :0.000 Min. : 2.00
## 1st Qu.:0.000 1st Qu.: 1.000 1st Qu.:0.000 1st Qu.:18.00
## Median :1.000 Median : 3.000 Median :0.000 Median :34.50
## Mean :0.517 Mean : 4.593 Mean :0.487 Mean :33.65
## 3rd Qu.:1.000 3rd Qu.: 7.000 3rd Qu.:1.000 3rd Qu.:49.00
## Max. :1.000 Max. :19.000 Max. :1.000 Max. :64.00
## m_dep mobile_wt n_cores pc
## Min. :0.1000 Min. : 80.0 Min. :1.000 Min. : 0.00
## 1st Qu.:0.3000 1st Qu.:109.8 1st Qu.:2.000 1st Qu.: 5.00
## Median :0.5000 Median :139.0 Median :4.000 Median :10.00
## Mean :0.5175 Mean :139.5 Mean :4.328 Mean :10.05
## 3rd Qu.:0.8000 3rd Qu.:170.0 3rd Qu.:6.000 3rd Qu.:16.00
## Max. :1.0000 Max. :200.0 Max. :8.000 Max. :20.00
## px_height px_width ram sc_h
## Min. : 0.0 Min. : 501.0 Min. : 263 Min. : 5.00
## 1st Qu.: 263.8 1st Qu.: 831.8 1st Qu.:1237 1st Qu.: 8.00
## Median : 564.5 Median :1250.0 Median :2154 Median :12.00
## Mean : 627.1 Mean :1239.8 Mean :2139 Mean :11.99
## 3rd Qu.: 903.0 3rd Qu.:1637.8 3rd Qu.:3066 3rd Qu.:16.00
## Max. :1907.0 Max. :1998.0 Max. :3989 Max. :19.00
## sc_w talk_time three_g touch_screen wifi
## Min. : 0.000 Min. : 2.00 Min. :0.000 Min. :0.0 Min. :0.000
## 1st Qu.: 2.000 1st Qu.: 6.75 1st Qu.:1.000 1st Qu.:0.0 1st Qu.:0.000
## Median : 5.000 Median :11.00 Median :1.000 Median :0.5 Median :1.000
## Mean : 5.316 Mean :11.09 Mean :0.756 Mean :0.5 Mean :0.507
## 3rd Qu.: 8.000 3rd Qu.:16.00 3rd Qu.:1.000 3rd Qu.:1.0 3rd Qu.:1.000
## Max. :18.000 Max. :20.00 Max. :1.000 Max. :1.0 Max. :1.000
CLEANING DATA
Pada tahap ini, dilakukan cleaning data, dan hanya mengambil fitur numerik agar membantu penelitian.
# Drop the row identifier, then keep only the numeric columns for clustering.
df <- select(df, -id)
df_num <- select(df, where(is.numeric))
# Count missing values per column.
colSums(is.na(df_num))
## battery_power blue clock_speed dual_sim fc
## 0 0 0 0 0
## four_g int_memory m_dep mobile_wt n_cores
## 0 0 0 0 0
## pc px_height px_width ram sc_h
## 0 0 0 0 0
## sc_w talk_time three_g touch_screen wifi
## 0 0 0 0 0
# Replace missing entries with the median of their own column.
# Columns are visited by name rather than via 1:ncol(), which would iterate
# over c(1, 0) on a zero-column data frame. A column is only written to when
# it actually contains an NA, so complete columns are left untouched.
for (col_name in names(df_num)) {
  is_missing <- is.na(df_num[[col_name]])
  if (any(is_missing)) {
    df_num[[col_name]][is_missing] <- median(df_num[[col_name]], na.rm = TRUE)
  }
}
# Re-check: every count should now be zero.
colSums(is.na(df_num))
## battery_power blue clock_speed dual_sim fc
## 0 0 0 0 0
## four_g int_memory m_dep mobile_wt n_cores
## 0 0 0 0 0
## pc px_height px_width ram sc_h
## 0 0 0 0 0
## sc_w talk_time three_g touch_screen wifi
## 0 0 0 0 0
EKSPLORASI DATA
Eksplorasi data digunakan untuk melihat sebaran data dan potensi outlier
# Descriptive statistics for every numeric column.
df_num %>% summary()
## battery_power blue clock_speed dual_sim
## Min. : 500 Min. :0.000 Min. :0.500 Min. :0.000
## 1st Qu.: 895 1st Qu.:0.000 1st Qu.:0.700 1st Qu.:0.000
## Median :1246 Median :1.000 Median :1.500 Median :1.000
## Mean :1249 Mean :0.516 Mean :1.541 Mean :0.517
## 3rd Qu.:1629 3rd Qu.:1.000 3rd Qu.:2.300 3rd Qu.:1.000
## Max. :1999 Max. :1.000 Max. :3.000 Max. :1.000
## fc four_g int_memory m_dep
## Min. : 0.000 Min. :0.000 Min. : 2.00 Min. :0.1000
## 1st Qu.: 1.000 1st Qu.:0.000 1st Qu.:18.00 1st Qu.:0.3000
## Median : 3.000 Median :0.000 Median :34.50 Median :0.5000
## Mean : 4.593 Mean :0.487 Mean :33.65 Mean :0.5175
## 3rd Qu.: 7.000 3rd Qu.:1.000 3rd Qu.:49.00 3rd Qu.:0.8000
## Max. :19.000 Max. :1.000 Max. :64.00 Max. :1.0000
## mobile_wt n_cores pc px_height
## Min. : 80.0 Min. :1.000 Min. : 0.00 Min. : 0.0
## 1st Qu.:109.8 1st Qu.:2.000 1st Qu.: 5.00 1st Qu.: 263.8
## Median :139.0 Median :4.000 Median :10.00 Median : 564.5
## Mean :139.5 Mean :4.328 Mean :10.05 Mean : 627.1
## 3rd Qu.:170.0 3rd Qu.:6.000 3rd Qu.:16.00 3rd Qu.: 903.0
## Max. :200.0 Max. :8.000 Max. :20.00 Max. :1907.0
## px_width ram sc_h sc_w
## Min. : 501.0 Min. : 263 Min. : 5.00 Min. : 0.000
## 1st Qu.: 831.8 1st Qu.:1237 1st Qu.: 8.00 1st Qu.: 2.000
## Median :1250.0 Median :2154 Median :12.00 Median : 5.000
## Mean :1239.8 Mean :2139 Mean :11.99 Mean : 5.316
## 3rd Qu.:1637.8 3rd Qu.:3066 3rd Qu.:16.00 3rd Qu.: 8.000
## Max. :1998.0 Max. :3989 Max. :19.00 Max. :18.000
## talk_time three_g touch_screen wifi
## Min. : 2.00 Min. :0.000 Min. :0.0 Min. :0.000
## 1st Qu.: 6.75 1st Qu.:1.000 1st Qu.:0.0 1st Qu.:0.000
## Median :11.00 Median :1.000 Median :0.5 Median :1.000
## Mean :11.09 Mean :0.756 Mean :0.5 Mean :0.507
## 3rd Qu.:16.00 3rd Qu.:1.000 3rd Qu.:1.0 3rd Qu.:1.000
## Max. :20.00 Max. :1.000 Max. :1.0 Max. :1.000
# Side-by-side boxplots of the unscaled variables to spot spread and outliers
# (this is a boxplot, not a correlation plot), followed by the column names.
boxplot(
  x = df_num,
  main = "Boxplot Variabel Penelitian",
  col = "lightblue",
  outline = TRUE,
  las = 2,
  cex.axis = 0.7
)
names(df_num)
## [1] "battery_power" "blue" "clock_speed" "dual_sim"
## [5] "fc" "four_g" "int_memory" "m_dep"
## [9] "mobile_wt" "n_cores" "pc" "px_height"
## [13] "px_width" "ram" "sc_h" "sc_w"
## [17] "talk_time" "three_g" "touch_screen" "wifi"
SCALE DATA
Tahap standardisasi data (mean=0, sd=1). Wajib dilakukan untuk algoritma berbasis jarak (K-Means, K-Medians, DBSCAN) agar variabel dengan skala besar tidak mendominasi.
# Standardize each column (subtract its mean, divide by its standard
# deviation) and preview the first six rows of the resulting matrix.
df_scaled <- scale(df_num, center = TRUE, scale = TRUE)
head(df_scaled, n = 6L)
## battery_power blue clock_speed dual_sim fc four_g
## [1,] -0.4752135 0.9680116 0.3124444 0.9660754 2.10762144 -0.9738421
## [2,] -0.9423107 0.9680116 -1.2552039 0.9660754 -0.13286058 1.0258337
## [3,] 1.2914311 0.9680116 1.5183276 -1.0340807 -0.80500519 -0.9738421
## [4,] 0.6879046 -1.0320124 -1.2552039 0.9660754 3.00381425 1.0258337
## [5,] 0.4289200 -1.0320124 -0.1699090 -1.0340807 1.43547683 1.0258337
## [6,] 0.4982909 0.9680116 1.6389159 0.9660754 0.09118762 1.0258337
## int_memory m_dep mobile_wt n_cores pc px_height
## [1,] -1.5804779 -1.48650297 1.5347668 -0.5803804 0.9755379 -0.9265269
## [2,] 1.5085477 1.00583734 1.4773805 0.2936864 0.3192729 0.2745919
## [3,] -0.3669321 1.36188596 1.3339149 -0.5803804 -0.9932570 1.4849501
## [4,] -0.4772545 -0.06230851 -1.2484667 1.6047866 1.6318028 -0.7671476
## [5,] 0.8466137 -0.06230851 -0.9041492 0.7307198 1.3036703 0.2815215
## [6,] 0.9017748 1.00583734 1.6782324 1.6047866 -0.1729258 -0.1342504
## px_width ram sc_h sc_w talk_time three_g
## [1,] 0.3917156 1.2287579 0.001157245 0.3971640 -1.6525285 -1.7593358
## [2,] -0.8705919 1.6138356 -1.387536602 -1.2537554 -0.7430467 0.5678279
## [3,] 0.2870920 0.2361950 1.158402117 1.1047009 -0.1973576 -1.7593358
## [4,] 1.1650212 1.6119975 -0.461740704 -1.2537554 -0.7430467 0.5678279
## [5,] -0.9774900 -0.3363667 0.695504168 0.6330097 -0.7430467 0.5678279
## [6,] -0.6840888 1.2563291 -0.461740704 0.3971640 -1.4706322 0.5678279
## touch_screen wifi
## [1,] 0.9994999 -1.0135922
## [2,] -0.9994999 -1.0135922
## [3,] 0.9994999 0.9856035
## [4,] 0.9994999 -1.0135922
## [5,] -0.9994999 0.9856035
## [6,] 0.9994999 0.9856035
ELBOW METHOD
Elbow Method digunakan untuk mencari nilai k (jumlah cluster) yang sesuai dengan melihat penurunan within-cluster sum of squares (WSS) seiring bertambahnya k.
# Total within-cluster sum of squares for k = 1..10.
# - set.seed() makes the random starts reproducible.
# - iter.max is raised from the default of 10, which was too few for some
#   fits to converge.
# - vapply() guarantees a numeric vector with one value per k.
set.seed(123)
wss <- vapply(
  1:10,
  function(k) {
    kmeans(df_scaled, centers = k, nstart = 20, iter.max = 100)$tot.withinss
  },
  numeric(1)
)
## Warning: did not converge in 10 iterations
# Elbow curve: WSS against the number of clusters.
plot(
  x = seq_along(wss),
  y = wss,
  type = "b",
  pch = 19,
  main = "Elbow Method",
  xlab = "Jumlah Cluster",
  ylab = "WSS"
)
SILHOUETTE
Tahap ini digunakan untuk menghitung rata-rata silhouette coefficient untuk setiap k. Nilai mendekati 1 menandakan cluster yang padat dan terpisah baik.
#' Average silhouette width of a k-means partition
#'
#' @param k Number of clusters; a single value of at least 2.
#' @param data Numeric matrix or data frame to cluster. Defaults to the
#'   global `df_scaled`, so existing `avg_sil(k)` calls keep working.
#' @param nstart Number of random starts passed to `kmeans()`; several starts
#'   keep one unlucky initialisation from distorting the curve.
#' @param iter.max Maximum iterations passed to `kmeans()`.
#' @return A single number: the mean of the per-point silhouette widths.
avg_sil <- function(k, data = df_scaled, nstart = 20, iter.max = 100) {
  stopifnot(length(k) == 1L, k >= 2)
  fit <- kmeans(data, centers = k, nstart = nstart, iter.max = iter.max)
  sil <- silhouette(fit$cluster, dist(data))
  mean(sil[, "sil_width"])
}
# Average silhouette width for k = 2..10.
# Seeded so the curve is reproducible; vapply() guarantees one numeric value
# per k. Explicit axis labels replace the default y label, which would
# otherwise be the deparsed plotting expression.
set.seed(123)
k <- 2:10
sil_by_k <- vapply(k, avg_sil, numeric(1))
plot(k, sil_by_k, type = "b", pch = 19,
     xlab = "Jumlah Cluster",
     ylab = "Rata-rata Silhouette",
     main = "Silhouette Method")
METODE
Partisi data menjadi 3 cluster dengan meminimalkan jarak Euclidean ke centroid. set.seed agar hasil dapat direproduksi.
# Final K-Means model with 3 clusters.
# Uses the same number of random starts as the elbow search (nstart = 20) so
# the reported partition is not the result of a single initialisation, and a
# higher iter.max so each start has room to converge.
set.seed(123)
km <- kmeans(df_scaled, centers = 3, nstart = 20, iter.max = 100)
Menggunakan median sebagai pusat cluster. Lebih tahan terhadap outlier.
# K-Medians with 3 clusters, using flexclust's "kmedians" family.
kmed <- kcca(
  x = df_scaled,
  k = 3,
  family = kccaFamily("kmedians")
)
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
Density-based clustering. Mengelompokkan titik yang rapat (eps = radius, minPts = minimal titik untuk membentuk cluster). Bisa mendeteksi noise (pada paket dbscan di R, noise diberi label 0).
# DBSCAN from the dbscan package (namespaced because fpc masks dbscan()).
# NOTE(review): eps = 1 may be small relative to typical pairwise distances
# across 20 standardized columns, which would leave most points as noise;
# confirm with a k-nearest-neighbour distance plot.
db <- dbscan::dbscan(x = df_scaled, eps = 1, minPts = 5)
Algoritma berbasis density gradient yang tidak perlu menentukan jumlah cluster di awal. Parameter default digunakan.
# Mean Shift on the standardized data; no other arguments are supplied.
ms <- meanShift(queryData = df_scaled)
Setiap titik memiliki derajat keanggotaan terhadap setiap cluster (0-1). Parameter m = fuzziness (semakin besar, semakin kabur cluster).
# Fuzzy C-Means with 3 clusters and fuzziness exponent m = 2.
fcm <- cmeans(x = df_scaled, centers = 3, m = 2)
VISUALISASI
# Compare the five partitions on one pair of standardized features.
x <- df_scaled[, "px_height"]
y <- df_scaled[, "ram"]
cols <- c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33")

# Draw one scatter panel of `x` against `y`, coloured by cluster label.
#   labels  - positive integer cluster label per row
#   title   - panel title
#   palette - colours to cycle through
# Labels wrap around the palette: indexing it directly would give NA (an
# invisible point) for any label larger than the number of colours.
# Labels within 1..length(palette) map to the same colour as direct indexing.
plot_clusters <- function(labels, title, palette = cols) {
  colour_idx <- (as.integer(labels) - 1L) %% length(palette) + 1L
  plot(x, y,
       col = palette[colour_idx],
       pch = 19, cex = 0.5,
       main = title,
       xlab = "Pixel Height", ylab = "RAM")
}

# 2 x 3 panel grid; remember the previous settings so they can be restored.
old_par <- par(mfrow = c(2, 3), mar = c(4, 4, 2, 1))
plot_clusters(km$cluster, "K-Means")
plot_clusters(clusters(kmed), "K-Medians")
# dbscan labels noise as 0, so shift by one to get a valid colour index.
plot_clusters(db$cluster + 1, "DBSCAN")
plot_clusters(ms$assignment, "Mean Shift")
plot_clusters(fcm$cluster, "Fuzzy C-Means")
# The overview panel shows the K-Means labels again.
plot_clusters(km$cluster, "Cluster Overview")
par(old_par)
METRICS
Metrik evaluasi Silhouette Coefficient digunakan untuk mengukur seberapa mirip suatu titik dengan clusternya sendiri dibanding cluster lain. Rentang -1 hingga 1 (semakin tinggi semakin baik).
# Mean silhouette width of the K-Means partition.
mean(
  silhouette(x = km$cluster, dist = dist(df_scaled))[, "sil_width"]
)
## [1] 0.05337554
Metrik evaluasi Dunn Index adalah rasio antara jarak antar-cluster terkecil dengan diameter dalam-cluster terbesar. Semakin besar, semakin baik (cluster padat dan terpisah).
# Cluster validation statistics for the K-Means partition; show the Dunn index.
stats <- cluster.stats(d = dist(df_scaled), clustering = km$cluster)
stats[["dunn"]]
## [1] 0.2133994
Metrik evaluasi Within-cluster SS memperlihatkan jumlah kuadrat jarak titik ke centroid clusternya. Semakin kecil, semakin padat cluster (hanya untuk K-Means).
# Within-cluster sum of squares taken from the validation statistics.
stats[["within.cluster.ss"]]
## [1] 17965.11