# Faiz Azmin Ahmad (24031554175), Cut Azzahra Firdausi Syakiena Nazwa (24031554180)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(flexclust)
library(dbscan)
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(meanShiftR)
library(e1071)
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:flexclust':
##
## bclust
##
## The following object is masked from 'package:ggplot2':
##
## element
library(cluster)
library(fpc)
##
## Attaching package: 'fpc'
##
## The following object is masked from 'package:dbscan':
##
## dbscan
library(mclust)
## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
##
## The following object is masked from 'package:dplyr':
##
## count
##
## The following object is masked from 'package:purrr':
##
## map
# Load the training set, then inspect its structure
data <- read.csv(file = "train.csv")
str(data)
## 'data.frame': 2000 obs. of 21 variables:
## $ battery_power: int 842 1021 563 615 1821 1859 1821 1954 1445 509 ...
## $ blue : int 0 1 1 1 1 0 0 0 1 1 ...
## $ clock_speed : num 2.2 0.5 0.5 2.5 1.2 0.5 1.7 0.5 0.5 0.6 ...
## $ dual_sim : int 0 1 1 0 0 1 0 1 0 1 ...
## $ fc : int 1 0 2 0 13 3 4 0 0 2 ...
## $ four_g : int 0 1 1 0 1 0 1 0 0 1 ...
## $ int_memory : int 7 53 41 10 44 22 10 24 53 9 ...
## $ m_dep : num 0.6 0.7 0.9 0.8 0.6 0.7 0.8 0.8 0.7 0.1 ...
## $ mobile_wt : int 188 136 145 131 141 164 139 187 174 93 ...
## $ n_cores : int 2 3 5 6 2 1 8 4 7 5 ...
## $ pc : int 2 6 6 9 14 7 10 0 14 15 ...
## $ px_height : int 20 905 1263 1216 1208 1004 381 512 386 1137 ...
## $ px_width : int 756 1988 1716 1786 1212 1654 1018 1149 836 1224 ...
## $ ram : int 2549 2631 2603 2769 1411 1067 3220 700 1099 513 ...
## $ sc_h : int 9 17 11 16 8 17 13 16 17 19 ...
## $ sc_w : int 7 3 2 8 2 1 8 3 1 10 ...
## $ talk_time : int 19 7 9 11 15 10 18 5 20 12 ...
## $ three_g : int 0 1 1 1 1 1 1 1 1 1 ...
## $ touch_screen : int 0 1 1 0 1 0 0 1 0 0 ...
## $ wifi : int 1 0 0 0 0 0 1 1 0 0 ...
## $ price_range : int 1 2 2 2 1 1 3 0 0 0 ...
# Per-column summary statistics
summary(object = data)
## battery_power blue clock_speed dual_sim
## Min. : 501.0 Min. :0.000 Min. :0.500 Min. :0.0000
## 1st Qu.: 851.8 1st Qu.:0.000 1st Qu.:0.700 1st Qu.:0.0000
## Median :1226.0 Median :0.000 Median :1.500 Median :1.0000
## Mean :1238.5 Mean :0.495 Mean :1.522 Mean :0.5095
## 3rd Qu.:1615.2 3rd Qu.:1.000 3rd Qu.:2.200 3rd Qu.:1.0000
## Max. :1998.0 Max. :1.000 Max. :3.000 Max. :1.0000
## fc four_g int_memory m_dep
## Min. : 0.000 Min. :0.0000 Min. : 2.00 Min. :0.1000
## 1st Qu.: 1.000 1st Qu.:0.0000 1st Qu.:16.00 1st Qu.:0.2000
## Median : 3.000 Median :1.0000 Median :32.00 Median :0.5000
## Mean : 4.309 Mean :0.5215 Mean :32.05 Mean :0.5018
## 3rd Qu.: 7.000 3rd Qu.:1.0000 3rd Qu.:48.00 3rd Qu.:0.8000
## Max. :19.000 Max. :1.0000 Max. :64.00 Max. :1.0000
## mobile_wt n_cores pc px_height
## Min. : 80.0 Min. :1.000 Min. : 0.000 Min. : 0.0
## 1st Qu.:109.0 1st Qu.:3.000 1st Qu.: 5.000 1st Qu.: 282.8
## Median :141.0 Median :4.000 Median :10.000 Median : 564.0
## Mean :140.2 Mean :4.521 Mean : 9.916 Mean : 645.1
## 3rd Qu.:170.0 3rd Qu.:7.000 3rd Qu.:15.000 3rd Qu.: 947.2
## Max. :200.0 Max. :8.000 Max. :20.000 Max. :1960.0
## px_width ram sc_h sc_w
## Min. : 500.0 Min. : 256 Min. : 5.00 Min. : 0.000
## 1st Qu.: 874.8 1st Qu.:1208 1st Qu.: 9.00 1st Qu.: 2.000
## Median :1247.0 Median :2146 Median :12.00 Median : 5.000
## Mean :1251.5 Mean :2124 Mean :12.31 Mean : 5.767
## 3rd Qu.:1633.0 3rd Qu.:3064 3rd Qu.:16.00 3rd Qu.: 9.000
## Max. :1998.0 Max. :3998 Max. :19.00 Max. :18.000
## talk_time three_g touch_screen wifi
## Min. : 2.00 Min. :0.0000 Min. :0.000 Min. :0.000
## 1st Qu.: 6.00 1st Qu.:1.0000 1st Qu.:0.000 1st Qu.:0.000
## Median :11.00 Median :1.0000 Median :1.000 Median :1.000
## Mean :11.01 Mean :0.7615 Mean :0.503 Mean :0.507
## 3rd Qu.:16.00 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:1.000
## Max. :20.00 Max. :1.0000 Max. :1.000 Max. :1.000
## price_range
## Min. :0.00
## 1st Qu.:0.75
## Median :1.50
## Mean :1.50
## 3rd Qu.:2.25
## Max. :3.00
# Preview the first six rows
head(data, n = 6L)
## battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt
## 1 842 0 2.2 0 1 0 7 0.6 188
## 2 1021 1 0.5 1 0 1 53 0.7 136
## 3 563 1 0.5 1 2 1 41 0.9 145
## 4 615 1 2.5 0 0 0 10 0.8 131
## 5 1821 1 1.2 0 13 1 44 0.6 141
## 6 1859 0 0.5 1 3 0 22 0.7 164
## n_cores pc px_height px_width ram sc_h sc_w talk_time three_g touch_screen
## 1 2 2 20 756 2549 9 7 19 0 0
## 2 3 6 905 1988 2631 17 3 7 1 1
## 3 5 6 1263 1716 2603 11 2 9 1 1
## 4 6 9 1216 1786 2769 16 8 11 1 0
## 5 2 14 1208 1212 1411 8 2 15 1 1
## 6 1 7 1004 1654 1067 17 1 10 1 0
## wifi price_range
## 1 1 1
## 2 0 2
## 3 0 2
## 4 0 2
## 5 0 1
## 6 0 1
# Split the clustering features from the label
df <- data[c(
  "battery_power", "clock_speed", "int_memory", "mobile_wt", "n_cores",
  "pc", "px_height", "px_width", "ram", "talk_time"
)] # numeric features used for clustering
label <- data[["price_range"]] # ground truth
# Standardise every feature (centre on the mean, divide by the sd)
df_scaled <- scale(df, center = TRUE, scale = TRUE)
set.seed(123)
# Choosing the number of clusters K
# Elbow method: total within-cluster sum of squares for K = 1..10.
# vapply() guarantees a numeric vector with one value per K.
wss <- vapply(1:10, function(k) {
  # iter.max is raised from the default of 10 because kmeans() warned
  # "did not converge in 10 iterations" on this data, which means the
  # reported sum of squares came from unfinished runs.
  kmeans(df_scaled, centers = k, nstart = 20, iter.max = 100)$tot.withinss
}, numeric(1))
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
# Draw the elbow curve on a single-panel layout
par(mfrow = c(1, 1))
plot(
  x = 1:10, y = wss,
  type = "b", pch = 19, frame.plot = FALSE,
  main = "Elbow Method",
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares"
)
# Silhouette Analysis
# The pairwise distances do not depend on k, so compute them once and
# reuse them for every candidate k instead of rebuilding them per call.
sil_dist <- dist(df_scaled)
# Mean silhouette width of a k-means partition.
#   k : number of clusters
#   x : data to cluster; defaults to the scaled feature matrix df_scaled
#   d : dissimilarities between the rows of x; built with dist(x) when NULL
# Returns a single numeric value.
avg_sil <- function(k, x = df_scaled, d = NULL) {
  if (is.null(d)) {
    d <- dist(x)
  }
  # iter.max is raised from the default of 10, which produced
  # "did not converge in 10 iterations" warnings on this data.
  fit <- kmeans(x, centers = k, nstart = 25, iter.max = 100)
  widths <- silhouette(fit$cluster, d)
  mean(widths[, "sil_width"])
}
# Evaluate every candidate number of clusters
k_values <- 2:10
sil_values <- vapply(k_values, avg_sil, numeric(1), d = sil_dist)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
# Plot the mean silhouette width against the number of clusters
plot(
  x = k_values, y = sil_values,
  type = "b", pch = 19,
  main = "Silhouette Analysis",
  xlab = "Jumlah Cluster (k)",
  ylab = "Silhouette Score"
)
# Clustering
# --- 1. K-means ---
# Use several random starts and a higher iteration cap. With the defaults
# (one start, iter.max = 10) the result depends on a single random
# initialisation, and kmeans() has already warned on this data that it
# "did not converge in 10 iterations".
km <- kmeans(df_scaled, centers = 4, nstart = 25, iter.max = 100)
# --- 2. K-median ---
kmed <- kcca(df_scaled, k = 4, family = kccaFamily("kmedians"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
# --- 3. DBSCAN ---
# Both the 'dbscan' and 'fpc' packages are attached and both export
# dbscan(). This call uses fpc's argument spelling (MinPts), so name the
# namespace explicitly instead of relying on which package was attached last.
db <- fpc::dbscan(df_scaled, eps = 2, MinPts = 5)
# --- 4. Mean Shift ---
ms <- meanShift(df_scaled)
# --- 5. Fuzzy C-means ---
fcm <- cmeans(df_scaled, centers = 4, m = 2)
# Compare every partition (and the true labels) in the plane of the first
# two principal components. Keep the previous graphics settings so they can
# be restored once the 2 x 3 panel is complete.
old_par <- par(mfrow = c(2, 3), mar = c(4, 4, 2, 1))
pca <- prcomp(df_scaled)
pc_scores <- pca$x[, 1:2]
plot(pc_scores, col = km$cluster, pch = 19, cex = 0.6, main = "K-Means")
plot(pc_scores, col = clusters(kmed), pch = 19, cex = 0.6, main = "K-Median")
# DBSCAN cluster ids are re-coded through a factor so they start at 1
plot(pc_scores, col = as.numeric(as.factor(db$cluster)), pch = 19, cex = 0.6,
     main = "DBSCAN")
plot(pc_scores, col = ms$assignment, pch = 19, cex = 0.6, main = "Mean Shift")
plot(pc_scores, col = fcm$cluster, pch = 19, cex = 0.6, main = "Fuzzy C-Means")
# price_range is coded 0..3 and colour 0 is the device background colour,
# so shift by one; otherwise every class-0 point is drawn invisibly.
plot(pc_scores, col = label + 1, pch = 19, cex = 0.6, main = "Label Asli")
par(old_par)
# Metrics
# Both the silhouette and cluster.stats() need the full pairwise distance
# matrix; build it once and share it instead of computing it twice.
d_scaled <- dist(df_scaled)
# 1. Silhouette (column 3 of the silhouette matrix is the width per point)
sil_km <- mean(silhouette(km$cluster, d_scaled)[, 3])
sil_km
## [1] 0.07405444
# 2. Dunn-Index
stats <- cluster.stats(d_scaled, km$cluster)
paste("Dunn Index:", stats$dunn)
## [1] "Dunn Index: 0.135384026121787"
paste("Within-cluster SS:", stats$within.cluster.ss)
## [1] "Within-cluster SS: 16169.8203129589"
# 3. ARI Score: agreement between the k-means partition and the true labels
ari_km <- adjustedRandIndex(km$cluster, label)
ari_km
## [1] 0.1569832
print(paste("Adjusted Rand Index:", ari_km))
## [1] "Adjusted Rand Index: 0.156983229772753"
# Visualisation
# Feature distributions: reshape to long form, one histogram per feature
data_long <- pivot_longer(data = df, cols = everything())
ggplot(data = data_long, mapping = aes(x = value)) +
  geom_histogram(bins = 30, colour = "white", fill = "skyblue") +
  facet_wrap(vars(name), scales = "free") +
  labs(title = "Distribusi Fitur") +
  theme_minimal()
# Features vs. label: one boxplot per clustering feature, split by price range
# Reuse the column names of df rather than repeating the feature list.
fitur_names <- names(df)
# A 2 x 5 grid holds exactly the 10 plots. Keep the old layout so it can be
# restored; otherwise later base plots are squeezed into the same grid.
old_par <- par(mfrow = c(2, 5))
for (f in fitur_names) {
  boxplot(data[[f]] ~ data$price_range,
          main = f,
          xlab = "Price Range",
          ylab = "",
          col = "lightblue")
}
par(old_par)
# Clustering features plus the label, used for the correlation matrix
fitur <- data[c(fitur_names, "price_range")]
library(corrplot)
## corrplot 0.95 loaded
# Pairwise correlations, drawn as a colour-coded upper triangle
cor_matrix <- cor(x = fitur)
corrplot(
  corr = cor_matrix,
  type = "upper",
  method = "color",
  tl.srt = 45,
  tl.col = "black"
)
# Label distribution
table(data[["price_range"]])
##
## 0 1 2 3
## 500 500 500 500
barplot(
  height = table(data[["price_range"]]),
  main = "Distribusi Price Range"
)
# Cluster distribution
table(km[["cluster"]])
##
## 1 2 3 4
## 493 473 506 528
barplot(
  height = table(km[["cluster"]]),
  main = "Distribusi Cluster K-Means"
)
# Variance explained by each principal component
summary(object = pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.2297 1.0407 1.0258 1.0161 1.0074 0.99763 0.98757
## Proportion of Variance 0.1512 0.1083 0.1052 0.1033 0.1015 0.09953 0.09753
## Cumulative Proportion 0.1512 0.2595 0.3647 0.4680 0.5695 0.66901 0.76654
## PC8 PC9 PC10
## Standard deviation 0.96304 0.9597 0.69728
## Proportion of Variance 0.09274 0.0921 0.04862
## Cumulative Proportion 0.85928 0.9514 1.00000
# Mean of every scaled feature within each k-means cluster
aggregate(x = df_scaled, by = list(cluster = km$cluster), FUN = mean)
## cluster battery_power clock_speed int_memory mobile_wt n_cores
## 1 1 0.06351926 0.01499105 -0.90286958 0.16142781 0.12713277
## 2 2 -0.35703201 0.25617471 0.85448807 -0.01079650 0.05249149
## 3 3 0.06660639 -0.04107187 0.02891296 -0.04594594 -0.01932630
## 4 4 0.19670134 -0.20412662 0.04983312 -0.09702372 -0.14720800
## pc px_height px_width ram talk_time
## 1 0.2833607 -0.3454561 -0.2320511 -0.6844737 -0.02205975
## 2 -0.3460108 -0.4313601 -0.3796982 -0.4903126 0.24059154
## 3 -0.1032245 1.3034216 1.0222103 0.1067133 0.00124206
## 4 0.1443142 -0.5401291 -0.4228029 0.9760729 -0.19612278