install.packages(c(
  "flexclust",
  "dbscan",
  "meanShiftR",
  "e1071",
  "cluster",
  "fpc",
  "mclust",
  "dplyr",
  "psych",
  "factoextra",
  "openxlsx"
))
## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(flexclust)
library(dbscan)
## 
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(meanShiftR)
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:flexclust':
## 
##     bclust
library(cluster)
library(fpc)
## 
## Attaching package: 'fpc'
## The following object is masked from 'package:dbscan':
## 
##     dbscan
library(mclust)
## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:mclust':
## 
##     count
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:mclust':
## 
##     sim
library(factoextra)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## The following object is masked from 'package:e1071':
## 
##     element
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
data <- read.csv("hcvdat0.csv")
str(data)
## 'data.frame':    615 obs. of  14 variables:
##  $ X       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Category: chr  "0=Blood Donor" "0=Blood Donor" "0=Blood Donor" "0=Blood Donor" ...
##  $ Age     : int  32 32 32 32 32 32 32 32 32 32 ...
##  $ Sex     : chr  "m" "m" "m" "m" ...
##  $ ALB     : num  38.5 38.5 46.9 43.2 39.2 41.6 46.3 42.2 50.9 42.4 ...
##  $ ALP     : num  52.5 70.3 74.7 52 74.1 43.3 41.3 41.9 65.5 86.3 ...
##  $ ALT     : num  7.7 18 36.2 30.6 32.6 18.5 17.5 35.8 23.2 20.3 ...
##  $ AST     : num  22.1 24.7 52.6 22.6 24.8 19.7 17.8 31.1 21.2 20 ...
##  $ BIL     : num  7.5 3.9 6.1 18.9 9.6 12.3 8.5 16.1 6.9 35.2 ...
##  $ CHE     : num  6.93 11.17 8.84 7.33 9.15 ...
##  $ CHOL    : num  3.23 4.8 5.2 4.74 4.32 6.05 4.79 4.6 4.1 4.45 ...
##  $ CREA    : num  106 74 86 80 76 111 70 109 83 81 ...
##  $ GGT     : num  12.1 15.6 33.2 33.8 29.9 91 16.9 21.5 13.7 15.9 ...
##  $ PROT    : num  69 76.5 79.3 75.7 68.7 74 74.5 67.1 71.3 69.9 ...
colSums(is.na(data))
##        X Category      Age      Sex      ALB      ALP      ALT      AST 
##        0        0        0        0        1       18        1        0 
##      BIL      CHE     CHOL     CREA      GGT     PROT 
##        0        0       10        0        0        1
data_clean <- na.omit(data)
colSums(is.na(data_clean))
##        X Category      Age      Sex      ALB      ALP      ALT      AST 
##        0        0        0        0        0        0        0        0 
##      BIL      CHE     CHOL     CREA      GGT     PROT 
##        0        0        0        0        0        0
label <- data_clean$Category
data_clustering <- data_clean[, sapply(data_clean, is.numeric)]
colSums(is.na(data_clustering))
##    X  Age  ALB  ALP  ALT  AST  BIL  CHE CHOL CREA  GGT PROT 
##    0    0    0    0    0    0    0    0    0    0    0    0
describe(data_clustering)
##      vars   n   mean     sd median trimmed    mad   min     max   range  skew
## X       1 589 298.65 174.14 296.00  297.54 222.39  1.00  613.00  612.00  0.04
## Age     2 589  47.42   9.93  47.00   47.03  11.86 23.00   77.00   54.00  0.29
## ALB     3 589  41.62   5.76  41.90   41.89   4.74 14.90   82.20   67.30 -0.10
## ALP     4 589  68.12  25.92  66.20   66.38  20.31 11.30  416.60  405.30  4.73
## ALT     5 589  26.58  20.86  22.70   23.92  10.67  0.90  325.30  324.40  6.78
## AST     6 589  33.77  32.87  25.70   26.91   7.41 10.60  324.00  313.40  5.22
## BIL     7 589  11.02  17.41   7.10    8.15   3.85  0.80  209.00  208.20  8.05
## CHE     8 589   8.20   2.19   8.26    8.23   1.97  1.42   16.41   14.99 -0.07
## CHOL    9 589   5.39   1.13   5.31    5.36   1.08  1.43    9.67    8.24  0.38
## CREA   10 589  81.67  50.70  77.00   78.04  14.83  8.00 1079.10 1071.10 14.88
## GGT    11 589  38.20  54.30  22.80   27.62  13.05  4.50  650.90  646.40  5.91
## PROT   12 589  71.89   5.35  72.10   72.17   4.45 44.80   86.50   41.70 -1.06
##      kurtosis   se
## X       -1.18 7.18
## Age     -0.50 0.41
## ALB      6.03 0.24
## ALP     56.08 1.07
## ALT     80.63 0.86
## AST     33.30 1.35
## BIL     78.83 0.72
## CHE      1.32 0.09
## CHOL     0.70 0.05
## CREA   267.71 2.09
## GGT     46.73 2.24
## PROT     3.58 0.22
cor(data_clustering)
##                X         Age          ALB         ALP         ALT         AST
## X     1.00000000  0.44305790 -0.315204550  0.01794376 -0.20023304  0.30360292
## Age   0.44305790  1.00000000 -0.191093637  0.17771977 -0.04057647  0.07273886
## ALB  -0.31520455 -0.19109364  1.000000000 -0.14611991  0.03949714 -0.17760895
## ALP   0.01794376  0.17771977 -0.146119911  1.00000000  0.22160301  0.06702428
## ALT  -0.20023304 -0.04057647  0.039497139  0.22160301  1.00000000  0.19865775
## AST   0.30360292  0.07273886 -0.177608947  0.06702428  0.19865775  1.00000000
## BIL   0.17651109  0.03965486 -0.169597498  0.05837241 -0.10679662  0.30957974
## CHE  -0.27853454 -0.07586328  0.360919403  0.02948169  0.22434447 -0.19727042
## CHOL -0.05794709  0.12474161  0.210419878  0.12590008  0.14999727 -0.20121300
## CREA -0.02016270 -0.02514225  0.001433247  0.15390895 -0.03610554 -0.01794810
## GGT   0.22146275  0.14337927 -0.147598318  0.46130000  0.21970686  0.47777362
## PROT -0.16648242 -0.15975998  0.570725680 -0.06308514  0.01678633  0.01740394
##              BIL         CHE         CHOL         CREA          GGT        PROT
## X     0.17651109 -0.27853454 -0.057947087 -0.020162704  0.221462754 -0.16648242
## Age   0.03965486 -0.07586328  0.124741615 -0.025142253  0.143379268 -0.15975998
## ALB  -0.16959750  0.36091940  0.210419878  0.001433247 -0.147598318  0.57072568
## ALP   0.05837241  0.02948169  0.125900079  0.153908950  0.461299996 -0.06308514
## ALT  -0.10679662  0.22434447  0.149997271 -0.036105541  0.219706857  0.01678633
## AST   0.30957974 -0.19727042 -0.201213004 -0.017948098  0.477773617  0.01740394
## BIL   1.00000000 -0.32071323 -0.181569556  0.019909617  0.210566559 -0.05257491
## CHE  -0.32071323  1.00000000  0.428018276 -0.012119999 -0.095716131  0.30628754
## CHOL -0.18156956  0.42801828  1.000000000 -0.051464078  0.008822692  0.24504950
## CREA  0.01990962 -0.01212000 -0.051464078  1.000000000  0.125353469 -0.03011070
## GGT   0.21056656 -0.09571613  0.008822692  0.125353469  1.000000000 -0.03712701
## PROT -0.05257491  0.30628754  0.245049503 -0.030110695 -0.037127008  1.00000000
corrplot::corrplot(cor(data_clustering),tl.col = "black",type= "full",tl.srt=40,tl.cex = 0.5)

boxplot(data_clustering, main = "Boxplot Variabel Penelitian", las = 2)

df <- scale(data_clustering)
set.seed(123)

5 Metode Clustering

1. K-Means

CLustering

Elbow
wss <- sapply(1:10, function(k){
  kmeans(df, centers = k, nstart = 20)$tot.withinss
})

plot(1:10, wss, type="b", pch=19,
     xlab="Jumlah Cluster (K)",
     ylab="Total Within Sum of Squares",
     main="Elbow Method")

Silhouette Analysis
avg_sil <- function(k){
  km_res <- kmeans(df, centers = k, nstart = 25)
  ss <- silhouette(km_res$cluster, dist(df))
  mean(ss[,3])
}

k_values <- 2:10
avg_sil_values <- sapply(k_values, avg_sil)

plot(k_values, avg_sil_values, type="b", pch=19,
     xlab="Jumlah Cluster",
     ylab="Average Silhouette Width",
     main="Silhouette Analysis")

Visualisasi

km_res <- kmeans(df, centers = 2)
plot(df, col = km_res$cluster, main = "K-Means")

fviz_cluster(list(data = df, cluster = km_res$cluster), main="K-Means")

EValuasi Model

Silhouette
mean(silhouette(km_res$cluster, dist(df))[,3])
## [1] 0.5861834
Dunn-Index
stats <- cluster.stats(dist(df), km_res$cluster)
paste("Dunn Index:", stats$dunn)
## [1] "Dunn Index: 0.0596453549648971"
paste("Within-cluster SS:", stats$within.cluster.ss)
## [1] "Within-cluster SS: 5978.52998558064"
Ari Score
ari_score <- adjustedRandIndex(km_res$cluster, label)
print(paste("Adjusted Rand Index:", ari_score))
## [1] "Adjusted Rand Index: 0.563738585944082"

2. K-Median

Clustering

Silhouette
avg_sil_kmed <- function(k){
  kmed_res <- pam(df, k = k)
  ss <- silhouette(kmed_res$clustering, dist(df))
  mean(ss[,3])
}

k_values <- 2:10
sil_values <- sapply(k_values, avg_sil_kmed)

plot(k_values, sil_values, type="b", pch=19,
     xlab="Jumlah Cluster",
     ylab="Average Silhouette",
     main="Silhouette Analysis")

Visualisasi

kmed_res <- pam(df, k = 2)
plot(df, col = kmed_res$clustering, main = "K-Median (PAM)")

fviz_cluster(list(data = df, cluster = kmed_res$cluster), main="K-Median")

Evaluasi Model

Silhouette
mean(silhouette(kmed_res$cluster, dist(df))[,3])
## [1] 0.140085
Dunn-Index
stats_kmed <- cluster.stats(dist(df), kmed_res$cluster)
paste("Dunn Index:", stats_kmed$dunn)
## [1] "Dunn Index: 0.0300536885545642"
paste("Within-cluster SS:", stats_kmed$within.cluster.ss)
## [1] "Within-cluster SS: 6344.313794379"
Ari Score
ari_kmed <- adjustedRandIndex(kmed_res$cluster, label)
print(paste("Adjusted Rand Index:", ari_kmed))
## [1] "Adjusted Rand Index: 0.0631381788606234"

3.DBSCAN

Clustering

db_res <- dbscan(df, eps = 1.0, MinPts = 3)

Visualisasi

plot(df, col = db_res$cluster + 1, main = "DBSCAN (0 = Noise)")

fviz_cluster(list(data = df, cluster = db_res$cluster), main="DBSCAN")

Evaluasi Model

valid <- db_res$cluster != 0
cluster_db <- db_res$cluster[valid]
data_db <- df[valid, ]
label_db <- label[valid]
Silhouette
if(length(unique(cluster_db)) > 1){
  sil_db <- silhouette(cluster_db, dist(data_db))
  mean(sil_db[,3])
} else {
  print("Cluster tidak cukup untuk dihitung")
}
## [1] 0.0858742
Dunn-Index
if(length(unique(cluster_db)) > 1){
  stats_db <- cluster.stats(
    dist(data_db),
    as.integer(as.factor(cluster_db))
  )
  
  print(paste("Dunn Index:", stats_db$dunn))
  print(paste("Within-cluster SS:", stats_db$within.cluster.ss))
} else {
  print("Cluster tidak cukup untuk dihitung")
}
## [1] "Dunn Index: 0.256447262949843"
## [1] "Within-cluster SS: 135.116546236516"
Ari Score
ari_db <- adjustedRandIndex(cluster_db, label_db)
print(paste("Adjusted Rand Index:", ari_db))
## [1] "Adjusted Rand Index: 0"

4. Mean Shift

Clustering

bw <- rep(3, ncol(df))
ms_res <- meanShift(df, bandwidth = bw)

Visualisasi

plot(df, col = ms_res$assignment, main = "Mean Shift")

Evaluasi

Silhouette
mean(silhouette(ms_res$assignment, dist(df))[,3])
## [1] 0.7634329
table(ms_res$assignment)
## 
##   1   2   3   4 
## 586   1   1   1
Dunn-Index
stats_ms <- cluster.stats(dist(df), ms_res$assignment)
paste("Dunn Index:", stats_ms$dunn)
## [1] "Dunn Index: 0.645479153697263"
paste("Within-cluster SS:", stats_ms$within.cluster.ss)
## [1] "Within-cluster SS: 6022.40198541558"
Ari Score
ari_ms <- adjustedRandIndex(ms_res$assignment, label)
print(paste("Adjusted Rand Index:", ari_ms))
## [1] "Adjusted Rand Index: 0.0763427160484205"

5. Fuzzy C-Means

Clustering

Silhouette Analysis
avg_sil_fcm <- function(k){
  fcm_res <- cmeans(df, centers = k, m = 2)
  ss <- silhouette(fcm_res$cluster, dist(df))
  mean(ss[,3])
}

k_values_fcm <- 2:10
avg_sil_values_fcm <- sapply(k_values_fcm, avg_sil_fcm)

plot(k_values_fcm, avg_sil_values_fcm, type="b", pch=19,
     xlab="Jumlah Cluster",
     ylab="Average Silhouette Width",
     main="Silhouette Analysis")

Visualisasi

fcm_res <- cmeans(df, centers = 2, m = 2)
plot(df, col = fcm_res$cluster, main = "Fuzzy C-Means")

fviz_cluster(list(data = df, cluster = fcm_res$cluster), main="Fuzzy C-Means")

Evaluasi Model

Silhouette
mean(silhouette(fcm_res$cluster, dist(df))[,3])
## [1] 0.1341802
Dunn-Index
stats_fcm <- cluster.stats(dist(df), fcm_res$cluster)
paste("Dunn Index:", stats_fcm$dunn)
## [1] "Dunn Index: 0.0302171322300775"
paste("Within-cluster SS:", stats_fcm$within.cluster.ss)
## [1] "Within-cluster SS: 6270.8956362676"
Ari Score
ari_score <- adjustedRandIndex(fcm_res$cluster, label)
print(paste("Adjusted Rand Index:", ari_score))
## [1] "Adjusted Rand Index: 0.0143781832065237"