Tugas Analisis Multivariat

Faiz Azmin Ahmad (24031554175), Cut Azzahra Firdausi Syakiena Nazwa (24031554180)

Library

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(flexclust)
library(dbscan)
## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(meanShiftR)
library(e1071)
## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:flexclust':
## 
##     bclust
## 
## The following object is masked from 'package:ggplot2':
## 
##     element
library(cluster)
library(fpc)
## 
## Attaching package: 'fpc'
## 
## The following object is masked from 'package:dbscan':
## 
##     dbscan
library(mclust)
## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## 
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## The following object is masked from 'package:purrr':
## 
##     map
# Read the data set from "train.csv" in the current working directory
data <- read.csv("train.csv")

# Inspect the data: column names, types and the first few values
str(data)
## 'data.frame':    2000 obs. of  21 variables:
##  $ battery_power: int  842 1021 563 615 1821 1859 1821 1954 1445 509 ...
##  $ blue         : int  0 1 1 1 1 0 0 0 1 1 ...
##  $ clock_speed  : num  2.2 0.5 0.5 2.5 1.2 0.5 1.7 0.5 0.5 0.6 ...
##  $ dual_sim     : int  0 1 1 0 0 1 0 1 0 1 ...
##  $ fc           : int  1 0 2 0 13 3 4 0 0 2 ...
##  $ four_g       : int  0 1 1 0 1 0 1 0 0 1 ...
##  $ int_memory   : int  7 53 41 10 44 22 10 24 53 9 ...
##  $ m_dep        : num  0.6 0.7 0.9 0.8 0.6 0.7 0.8 0.8 0.7 0.1 ...
##  $ mobile_wt    : int  188 136 145 131 141 164 139 187 174 93 ...
##  $ n_cores      : int  2 3 5 6 2 1 8 4 7 5 ...
##  $ pc           : int  2 6 6 9 14 7 10 0 14 15 ...
##  $ px_height    : int  20 905 1263 1216 1208 1004 381 512 386 1137 ...
##  $ px_width     : int  756 1988 1716 1786 1212 1654 1018 1149 836 1224 ...
##  $ ram          : int  2549 2631 2603 2769 1411 1067 3220 700 1099 513 ...
##  $ sc_h         : int  9 17 11 16 8 17 13 16 17 19 ...
##  $ sc_w         : int  7 3 2 8 2 1 8 3 1 10 ...
##  $ talk_time    : int  19 7 9 11 15 10 18 5 20 12 ...
##  $ three_g      : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ touch_screen : int  0 1 1 0 1 0 0 1 0 0 ...
##  $ wifi         : int  1 0 0 0 0 0 1 1 0 0 ...
##  $ price_range  : int  1 2 2 2 1 1 3 0 0 0 ...
# Per-column summary statistics (min, quartiles, mean, max)
summary(data)
##  battery_power         blue        clock_speed       dual_sim     
##  Min.   : 501.0   Min.   :0.000   Min.   :0.500   Min.   :0.0000  
##  1st Qu.: 851.8   1st Qu.:0.000   1st Qu.:0.700   1st Qu.:0.0000  
##  Median :1226.0   Median :0.000   Median :1.500   Median :1.0000  
##  Mean   :1238.5   Mean   :0.495   Mean   :1.522   Mean   :0.5095  
##  3rd Qu.:1615.2   3rd Qu.:1.000   3rd Qu.:2.200   3rd Qu.:1.0000  
##  Max.   :1998.0   Max.   :1.000   Max.   :3.000   Max.   :1.0000  
##        fc             four_g         int_memory        m_dep       
##  Min.   : 0.000   Min.   :0.0000   Min.   : 2.00   Min.   :0.1000  
##  1st Qu.: 1.000   1st Qu.:0.0000   1st Qu.:16.00   1st Qu.:0.2000  
##  Median : 3.000   Median :1.0000   Median :32.00   Median :0.5000  
##  Mean   : 4.309   Mean   :0.5215   Mean   :32.05   Mean   :0.5018  
##  3rd Qu.: 7.000   3rd Qu.:1.0000   3rd Qu.:48.00   3rd Qu.:0.8000  
##  Max.   :19.000   Max.   :1.0000   Max.   :64.00   Max.   :1.0000  
##    mobile_wt        n_cores            pc           px_height     
##  Min.   : 80.0   Min.   :1.000   Min.   : 0.000   Min.   :   0.0  
##  1st Qu.:109.0   1st Qu.:3.000   1st Qu.: 5.000   1st Qu.: 282.8  
##  Median :141.0   Median :4.000   Median :10.000   Median : 564.0  
##  Mean   :140.2   Mean   :4.521   Mean   : 9.916   Mean   : 645.1  
##  3rd Qu.:170.0   3rd Qu.:7.000   3rd Qu.:15.000   3rd Qu.: 947.2  
##  Max.   :200.0   Max.   :8.000   Max.   :20.000   Max.   :1960.0  
##     px_width           ram            sc_h            sc_w       
##  Min.   : 500.0   Min.   : 256   Min.   : 5.00   Min.   : 0.000  
##  1st Qu.: 874.8   1st Qu.:1208   1st Qu.: 9.00   1st Qu.: 2.000  
##  Median :1247.0   Median :2146   Median :12.00   Median : 5.000  
##  Mean   :1251.5   Mean   :2124   Mean   :12.31   Mean   : 5.767  
##  3rd Qu.:1633.0   3rd Qu.:3064   3rd Qu.:16.00   3rd Qu.: 9.000  
##  Max.   :1998.0   Max.   :3998   Max.   :19.00   Max.   :18.000  
##    talk_time        three_g        touch_screen        wifi      
##  Min.   : 2.00   Min.   :0.0000   Min.   :0.000   Min.   :0.000  
##  1st Qu.: 6.00   1st Qu.:1.0000   1st Qu.:0.000   1st Qu.:0.000  
##  Median :11.00   Median :1.0000   Median :1.000   Median :1.000  
##  Mean   :11.01   Mean   :0.7615   Mean   :0.503   Mean   :0.507  
##  3rd Qu.:16.00   3rd Qu.:1.0000   3rd Qu.:1.000   3rd Qu.:1.000  
##  Max.   :20.00   Max.   :1.0000   Max.   :1.000   Max.   :1.000  
##   price_range  
##  Min.   :0.00  
##  1st Qu.:0.75  
##  Median :1.50  
##  Mean   :1.50  
##  3rd Qu.:2.25  
##  Max.   :3.00
# Show the first six rows
head(data)
##   battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt
## 1           842    0         2.2        0  1      0          7   0.6       188
## 2          1021    1         0.5        1  0      1         53   0.7       136
## 3           563    1         0.5        1  2      1         41   0.9       145
## 4           615    1         2.5        0  0      0         10   0.8       131
## 5          1821    1         1.2        0 13      1         44   0.6       141
## 6          1859    0         0.5        1  3      0         22   0.7       164
##   n_cores pc px_height px_width  ram sc_h sc_w talk_time three_g touch_screen
## 1       2  2        20      756 2549    9    7        19       0            0
## 2       3  6       905     1988 2631   17    3         7       1            1
## 3       5  6      1263     1716 2603   11    2         9       1            1
## 4       6  9      1216     1786 2769   16    8        11       1            0
## 5       2 14      1208     1212 1411    8    2        15       1            1
## 6       1  7      1004     1654 1067   17    1        10       1            0
##   wifi price_range
## 1    1           1
## 2    0           2
## 3    0           2
## 4    0           2
## 5    0           1
## 6    0           1

Data Preparation

# Separate the clustering features from the label.
# Only the ten numeric, non-binary columns listed here are used; the binary
# flags and fc, m_dep, sc_h, sc_w are left out of the clustering input.
df <- data[c(
  "battery_power", "clock_speed", "int_memory", "mobile_wt", "n_cores",
  "pc", "px_height", "px_width", "ram", "talk_time"
)]
label <- data[["price_range"]]   # ground truth, not used for fitting

# Standardise every feature (centre and scale), then fix the RNG seed for the
# randomised steps that follow.
df_scaled <- scale(df, center = TRUE, scale = TRUE)
set.seed(123)

Elbow Method

# Choosing the number of clusters K
# Elbow method: total within-cluster sum of squares for K = 1..10, keeping the
# best of 20 random starts per K.
# iter.max is raised above the kmeans() default of 10 because the default run
# reported "did not converge in 10 iterations", i.e. some of the WSS values
# came from unconverged fits. vapply() guarantees a numeric vector result.
wss <- vapply(1:10, function(k) {
  kmeans(df_scaled, centers = k, nstart = 20, iter.max = 100)$tot.withinss
}, numeric(1))
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
# Plot the elbow curve on a single-panel layout
par(mfrow = c(1, 1))
plot(
  x = 1:10,
  y = wss,
  type = "b",
  pch = 19,
  frame.plot = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares",
  main = "Elbow Method"
)

# Silhouette Analysis

# Compute the silhouette score

# The pairwise distance matrix does not depend on k, so it is computed once
# here instead of once per candidate k inside avg_sil().
sil_dist <- dist(df_scaled)

# Average silhouette width of a k-means partition of df_scaled.
#   k: number of clusters (>= 2; silhouette is undefined for one cluster)
# Returns a single numeric value: the mean of the sil_width column.
# iter.max is raised above the default of 10 because the default run reported
# "did not converge in 10 iterations".
avg_sil <- function(k) {
  km <- kmeans(df_scaled, centers = k, nstart = 25, iter.max = 100)
  ss <- silhouette(km$cluster, sil_dist)
  mean(ss[, "sil_width"])
}

# Evaluate every candidate number of clusters
k_values <- 2:10
sil_values <- vapply(k_values, avg_sil, numeric(1))
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
# Plot the average silhouette score against the number of clusters
plot(
  x = k_values,
  y = sil_values,
  type = "b",
  pch = 19,
  xlab = "Jumlah Cluster (k)",
  ylab = "Silhouette Score",
  main = "Silhouette Analysis"
)

# Clustering

# --- 1. K-means ---
# Keep the best of several random starts and allow more iterations, as the
# elbow/silhouette runs already use multiple starts; a single start (the
# kmeans() default) can stop in a poor local optimum.
# NOTE(review): the metrics and tables printed further down were produced with
# a single start and must be regenerated after this change.
km <- kmeans(df_scaled, centers = 4, nstart = 25, iter.max = 100)
# --- 2. K-median ---
# k-medians via flexclust's kcca() with the "kmedians" family
kmed <- kcca(df_scaled, k = 4, family = kccaFamily("kmedians"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
# --- 3. DBSCAN ---
# Both the dbscan and fpc packages export dbscan(); fpc is attached later and
# masks the other one, and `MinPts` is fpc's spelling of the argument. Naming
# the package makes the call independent of the library() order.
db <- fpc::dbscan(df_scaled, eps = 2, MinPts = 5)
# --- 4. Mean Shift ---
# Run with the package defaults
ms <- meanShift(df_scaled)
# --- 5. Fuzzy C-means ---
# Four clusters with fuzzifier m = 2
fcm <- cmeans(df_scaled, centers = 4, m = 2)

Visualisasi

# Show every clustering result on the first two principal components of the
# scaled features, one panel per method plus the true labels (2 x 3 grid).
par(mfrow = c(2, 3), mar = c(4, 4, 2, 1))
pca <- prcomp(df_scaled)

plot(pca$x[, 1:2], col = km$cluster, pch = 19, cex = 0.6, main = "K-Means")
plot(pca$x[, 1:2], col = clusters(kmed), pch = 19, cex = 0.6, main = "K-Median")
# DBSCAN cluster ids are re-coded through a factor so they start at 1
plot(pca$x[, 1:2], col = as.numeric(as.factor(db$cluster)), pch = 19, cex = 0.6,
     main = "DBSCAN")
plot(pca$x[, 1:2], col = ms$assignment, pch = 19, cex = 0.6, main = "Mean Shift")
plot(pca$x[, 1:2], col = fcm$cluster, pch = 19, cex = 0.6, main = "Fuzzy C-Means")
# price_range is coded 0..3 and colour index 0 is the background colour, so
# class 0 would be drawn invisibly; shift the labels to 1..4 for plotting.
plot(pca$x[, 1:2], col = label + 1, pch = 19, cex = 0.6, main = "Label Asli")

# Metrik

# 1. Silhouette: mean silhouette width of the k-means partition
sil_km <- local({
  sil <- silhouette(km$cluster, dist(df_scaled))
  mean(sil[, 3])
})
sil_km
## [1] 0.07405444
# 2. Dunn index and within-cluster sum of squares, as reported by cluster.stats()
stats <- cluster.stats(d = dist(df_scaled), clustering = km$cluster)
paste("Dunn Index:", stats[["dunn"]])
## [1] "Dunn Index: 0.135384026121787"
paste("Within-cluster SS:", stats[["within.cluster.ss"]])
## [1] "Within-cluster SS: 16169.8203129589"
# 3. Adjusted Rand Index of the k-means clusters against the price_range label
ari_km <- adjustedRandIndex(x = km$cluster, y = label)
ari_km
## [1] 0.1569832
print(paste("Adjusted Rand Index:", ari_km))
## [1] "Adjusted Rand Index: 0.156983229772753"
# Visualisation
# Feature distributions: reshape the selected features to long format
# (one row per observation/feature pair) and draw one histogram per feature.
data_long <- pivot_longer(
  data = df,
  cols = everything(),
  names_to = "name",
  values_to = "value"
)

ggplot(data = data_long, mapping = aes(x = value)) +
  geom_histogram(bins = 30, fill = "skyblue", colour = "white") +
  facet_wrap(vars(name), scales = "free") +
  labs(title = "Distribusi Fitur") +
  theme_minimal()

# Feature vs. label comparison: one boxplot per clustering feature, grouped by
# price_range, on a 2 x 5 grid (exactly 10 panels).
# The previous graphics settings are saved and restored afterwards so the grid
# layout does not leak into plots drawn later in the script.
old_par <- par(mfrow = c(2, 5))

# Same ten columns that were selected into `df` for clustering
fitur_names <- names(df)

for (f in fitur_names) {
  boxplot(data[[f]] ~ data$price_range,
          main = f,
          xlab = "Price Range",
          ylab = "",
          col = "lightblue")
}

par(old_par)

# Clustering features plus the label, used for the correlation matrix
fitur <- data[, c(names(df), "price_range")]

library(corrplot)
## corrplot 0.95 loaded
# Pairwise correlations between the features and price_range
cor_matrix <- cor(x = fitur)

# Upper triangle only, coloured tiles, black labels rotated by 45 degrees
corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  tl.col = "black",
  tl.srt = 45
)

# Label distribution: counts per price_range value
print(table(data[["price_range"]]))
## 
##   0   1   2   3 
## 500 500 500 500
barplot(
  height = table(data[["price_range"]]),
  main = "Distribusi Price Range"
)

# Cluster distribution: number of observations per k-means cluster
print(table(km[["cluster"]]))
## 
##   1   2   3   4 
## 493 473 506 528
barplot(
  height = table(km[["cluster"]]),
  main = "Distribusi Cluster K-Means"
)

# Standard deviation and (cumulative) proportion of variance per component
summary(pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.2297 1.0407 1.0258 1.0161 1.0074 0.99763 0.98757
## Proportion of Variance 0.1512 0.1083 0.1052 0.1033 0.1015 0.09953 0.09753
## Cumulative Proportion  0.1512 0.2595 0.3647 0.4680 0.5695 0.66901 0.76654
##                            PC8    PC9    PC10
## Standard deviation     0.96304 0.9597 0.69728
## Proportion of Variance 0.09274 0.0921 0.04862
## Cumulative Proportion  0.85928 0.9514 1.00000
# Mean of every standardised feature within each k-means cluster
aggregate(
  x = df_scaled,
  by = list(cluster = km$cluster),
  FUN = mean
)
##   cluster battery_power clock_speed  int_memory   mobile_wt     n_cores
## 1       1    0.06351926  0.01499105 -0.90286958  0.16142781  0.12713277
## 2       2   -0.35703201  0.25617471  0.85448807 -0.01079650  0.05249149
## 3       3    0.06660639 -0.04107187  0.02891296 -0.04594594 -0.01932630
## 4       4    0.19670134 -0.20412662  0.04983312 -0.09702372 -0.14720800
##           pc  px_height   px_width        ram   talk_time
## 1  0.2833607 -0.3454561 -0.2320511 -0.6844737 -0.02205975
## 2 -0.3460108 -0.4313601 -0.3796982 -0.4903126  0.24059154
## 3 -0.1032245  1.3034216  1.0222103  0.1067133  0.00124206
## 4  0.1443142 -0.5401291 -0.4228029  0.9760729 -0.19612278