install.packages(c(
"flexclust",
"dbscan",
"meanShiftR",
"e1071",
"cluster",
"fpc",
"mclust",
"dplyr",
"psych",
"factoextra",
"openxlsx"
))
## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(flexclust)
library(dbscan)
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(meanShiftR)
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:flexclust':
##
## bclust
library(cluster)
library(fpc)
##
## Attaching package: 'fpc'
## The following object is masked from 'package:dbscan':
##
## dbscan
library(mclust)
## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:mclust':
##
## count
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:mclust':
##
## sim
library(factoextra)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## The following object is masked from 'package:e1071':
##
## element
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
data <- read.csv("hcvdat0.csv")
str(data)
## 'data.frame': 615 obs. of 14 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Category: chr "0=Blood Donor" "0=Blood Donor" "0=Blood Donor" "0=Blood Donor" ...
## $ Age : int 32 32 32 32 32 32 32 32 32 32 ...
## $ Sex : chr "m" "m" "m" "m" ...
## $ ALB : num 38.5 38.5 46.9 43.2 39.2 41.6 46.3 42.2 50.9 42.4 ...
## $ ALP : num 52.5 70.3 74.7 52 74.1 43.3 41.3 41.9 65.5 86.3 ...
## $ ALT : num 7.7 18 36.2 30.6 32.6 18.5 17.5 35.8 23.2 20.3 ...
## $ AST : num 22.1 24.7 52.6 22.6 24.8 19.7 17.8 31.1 21.2 20 ...
## $ BIL : num 7.5 3.9 6.1 18.9 9.6 12.3 8.5 16.1 6.9 35.2 ...
## $ CHE : num 6.93 11.17 8.84 7.33 9.15 ...
## $ CHOL : num 3.23 4.8 5.2 4.74 4.32 6.05 4.79 4.6 4.1 4.45 ...
## $ CREA : num 106 74 86 80 76 111 70 109 83 81 ...
## $ GGT : num 12.1 15.6 33.2 33.8 29.9 91 16.9 21.5 13.7 15.9 ...
## $ PROT : num 69 76.5 79.3 75.7 68.7 74 74.5 67.1 71.3 69.9 ...
colSums(is.na(data))
## X Category Age Sex ALB ALP ALT AST
## 0 0 0 0 1 18 1 0
## BIL CHE CHOL CREA GGT PROT
## 0 0 10 0 0 1
data_clean <- na.omit(data)
colSums(is.na(data_clean))
## X Category Age Sex ALB ALP ALT AST
## 0 0 0 0 0 0 0 0
## BIL CHE CHOL CREA GGT PROT
## 0 0 0 0 0 0
label <- data_clean$Category
data_clustering <- data_clean[, sapply(data_clean, is.numeric)]
colSums(is.na(data_clustering))
## X Age ALB ALP ALT AST BIL CHE CHOL CREA GGT PROT
## 0 0 0 0 0 0 0 0 0 0 0 0
describe(data_clustering)
## vars n mean sd median trimmed mad min max range skew
## X 1 589 298.65 174.14 296.00 297.54 222.39 1.00 613.00 612.00 0.04
## Age 2 589 47.42 9.93 47.00 47.03 11.86 23.00 77.00 54.00 0.29
## ALB 3 589 41.62 5.76 41.90 41.89 4.74 14.90 82.20 67.30 -0.10
## ALP 4 589 68.12 25.92 66.20 66.38 20.31 11.30 416.60 405.30 4.73
## ALT 5 589 26.58 20.86 22.70 23.92 10.67 0.90 325.30 324.40 6.78
## AST 6 589 33.77 32.87 25.70 26.91 7.41 10.60 324.00 313.40 5.22
## BIL 7 589 11.02 17.41 7.10 8.15 3.85 0.80 209.00 208.20 8.05
## CHE 8 589 8.20 2.19 8.26 8.23 1.97 1.42 16.41 14.99 -0.07
## CHOL 9 589 5.39 1.13 5.31 5.36 1.08 1.43 9.67 8.24 0.38
## CREA 10 589 81.67 50.70 77.00 78.04 14.83 8.00 1079.10 1071.10 14.88
## GGT 11 589 38.20 54.30 22.80 27.62 13.05 4.50 650.90 646.40 5.91
## PROT 12 589 71.89 5.35 72.10 72.17 4.45 44.80 86.50 41.70 -1.06
## kurtosis se
## X -1.18 7.18
## Age -0.50 0.41
## ALB 6.03 0.24
## ALP 56.08 1.07
## ALT 80.63 0.86
## AST 33.30 1.35
## BIL 78.83 0.72
## CHE 1.32 0.09
## CHOL 0.70 0.05
## CREA 267.71 2.09
## GGT 46.73 2.24
## PROT 3.58 0.22
cor(data_clustering)
## X Age ALB ALP ALT AST
## X 1.00000000 0.44305790 -0.315204550 0.01794376 -0.20023304 0.30360292
## Age 0.44305790 1.00000000 -0.191093637 0.17771977 -0.04057647 0.07273886
## ALB -0.31520455 -0.19109364 1.000000000 -0.14611991 0.03949714 -0.17760895
## ALP 0.01794376 0.17771977 -0.146119911 1.00000000 0.22160301 0.06702428
## ALT -0.20023304 -0.04057647 0.039497139 0.22160301 1.00000000 0.19865775
## AST 0.30360292 0.07273886 -0.177608947 0.06702428 0.19865775 1.00000000
## BIL 0.17651109 0.03965486 -0.169597498 0.05837241 -0.10679662 0.30957974
## CHE -0.27853454 -0.07586328 0.360919403 0.02948169 0.22434447 -0.19727042
## CHOL -0.05794709 0.12474161 0.210419878 0.12590008 0.14999727 -0.20121300
## CREA -0.02016270 -0.02514225 0.001433247 0.15390895 -0.03610554 -0.01794810
## GGT 0.22146275 0.14337927 -0.147598318 0.46130000 0.21970686 0.47777362
## PROT -0.16648242 -0.15975998 0.570725680 -0.06308514 0.01678633 0.01740394
## BIL CHE CHOL CREA GGT PROT
## X 0.17651109 -0.27853454 -0.057947087 -0.020162704 0.221462754 -0.16648242
## Age 0.03965486 -0.07586328 0.124741615 -0.025142253 0.143379268 -0.15975998
## ALB -0.16959750 0.36091940 0.210419878 0.001433247 -0.147598318 0.57072568
## ALP 0.05837241 0.02948169 0.125900079 0.153908950 0.461299996 -0.06308514
## ALT -0.10679662 0.22434447 0.149997271 -0.036105541 0.219706857 0.01678633
## AST 0.30957974 -0.19727042 -0.201213004 -0.017948098 0.477773617 0.01740394
## BIL 1.00000000 -0.32071323 -0.181569556 0.019909617 0.210566559 -0.05257491
## CHE -0.32071323 1.00000000 0.428018276 -0.012119999 -0.095716131 0.30628754
## CHOL -0.18156956 0.42801828 1.000000000 -0.051464078 0.008822692 0.24504950
## CREA 0.01990962 -0.01212000 -0.051464078 1.000000000 0.125353469 -0.03011070
## GGT 0.21056656 -0.09571613 0.008822692 0.125353469 1.000000000 -0.03712701
## PROT -0.05257491 0.30628754 0.245049503 -0.030110695 -0.037127008 1.00000000
corrplot::corrplot(cor(data_clustering),tl.col = "black",type= "full",tl.srt=40,tl.cex = 0.5)

boxplot(data_clustering, main = "Boxplot Variabel Penelitian", las = 2)

df <- scale(data_clustering)
set.seed(123)
5 Metode Clustering
1. K-Means
CLustering
Elbow
wss <- sapply(1:10, function(k){
kmeans(df, centers = k, nstart = 20)$tot.withinss
})
plot(1:10, wss, type="b", pch=19,
xlab="Jumlah Cluster (K)",
ylab="Total Within Sum of Squares",
main="Elbow Method")

Silhouette Analysis
avg_sil <- function(k){
km_res <- kmeans(df, centers = k, nstart = 25)
ss <- silhouette(km_res$cluster, dist(df))
mean(ss[,3])
}
k_values <- 2:10
avg_sil_values <- sapply(k_values, avg_sil)
plot(k_values, avg_sil_values, type="b", pch=19,
xlab="Jumlah Cluster",
ylab="Average Silhouette Width",
main="Silhouette Analysis")

Visualisasi
km_res <- kmeans(df, centers = 2)
plot(df, col = km_res$cluster, main = "K-Means")

fviz_cluster(list(data = df, cluster = km_res$cluster), main="K-Means")

EValuasi Model
Silhouette
mean(silhouette(km_res$cluster, dist(df))[,3])
## [1] 0.5861834
Dunn-Index
stats <- cluster.stats(dist(df), km_res$cluster)
paste("Dunn Index:", stats$dunn)
## [1] "Dunn Index: 0.0596453549648971"
paste("Within-cluster SS:", stats$within.cluster.ss)
## [1] "Within-cluster SS: 5978.52998558064"
Ari Score
ari_score <- adjustedRandIndex(km_res$cluster, label)
print(paste("Adjusted Rand Index:", ari_score))
## [1] "Adjusted Rand Index: 0.563738585944082"
3.DBSCAN
Clustering
db_res <- dbscan(df, eps = 1.0, MinPts = 3)
Visualisasi
plot(df, col = db_res$cluster + 1, main = "DBSCAN (0 = Noise)")

fviz_cluster(list(data = df, cluster = db_res$cluster), main="DBSCAN")

Evaluasi Model
valid <- db_res$cluster != 0
cluster_db <- db_res$cluster[valid]
data_db <- df[valid, ]
label_db <- label[valid]
Silhouette
if(length(unique(cluster_db)) > 1){
sil_db <- silhouette(cluster_db, dist(data_db))
mean(sil_db[,3])
} else {
print("Cluster tidak cukup untuk dihitung")
}
## [1] 0.0858742
Dunn-Index
if(length(unique(cluster_db)) > 1){
stats_db <- cluster.stats(
dist(data_db),
as.integer(as.factor(cluster_db))
)
print(paste("Dunn Index:", stats_db$dunn))
print(paste("Within-cluster SS:", stats_db$within.cluster.ss))
} else {
print("Cluster tidak cukup untuk dihitung")
}
## [1] "Dunn Index: 0.256447262949843"
## [1] "Within-cluster SS: 135.116546236516"
Ari Score
ari_db <- adjustedRandIndex(cluster_db, label_db)
print(paste("Adjusted Rand Index:", ari_db))
## [1] "Adjusted Rand Index: 0"
4. Mean Shift
Clustering
bw <- rep(3, ncol(df))
ms_res <- meanShift(df, bandwidth = bw)
Visualisasi
plot(df, col = ms_res$assignment, main = "Mean Shift")

Evaluasi
Silhouette
mean(silhouette(ms_res$assignment, dist(df))[,3])
## [1] 0.7634329
table(ms_res$assignment)
##
## 1 2 3 4
## 586 1 1 1
Dunn-Index
stats_ms <- cluster.stats(dist(df), ms_res$assignment)
paste("Dunn Index:", stats_ms$dunn)
## [1] "Dunn Index: 0.645479153697263"
paste("Within-cluster SS:", stats_ms$within.cluster.ss)
## [1] "Within-cluster SS: 6022.40198541558"
Ari Score
ari_ms <- adjustedRandIndex(ms_res$assignment, label)
print(paste("Adjusted Rand Index:", ari_ms))
## [1] "Adjusted Rand Index: 0.0763427160484205"
5. Fuzzy C-Means
Clustering
Silhouette Analysis
avg_sil_fcm <- function(k){
fcm_res <- cmeans(df, centers = k, m = 2)
ss <- silhouette(fcm_res$cluster, dist(df))
mean(ss[,3])
}
k_values_fcm <- 2:10
avg_sil_values_fcm <- sapply(k_values_fcm, avg_sil_fcm)
plot(k_values_fcm, avg_sil_values_fcm, type="b", pch=19,
xlab="Jumlah Cluster",
ylab="Average Silhouette Width",
main="Silhouette Analysis")

Visualisasi
fcm_res <- cmeans(df, centers = 2, m = 2)
plot(df, col = fcm_res$cluster, main = "Fuzzy C-Means")

fviz_cluster(list(data = df, cluster = fcm_res$cluster), main="Fuzzy C-Means")

Evaluasi Model
Silhouette
mean(silhouette(fcm_res$cluster, dist(df))[,3])
## [1] 0.1341802
Dunn-Index
stats_fcm <- cluster.stats(dist(df), fcm_res$cluster)
paste("Dunn Index:", stats_fcm$dunn)
## [1] "Dunn Index: 0.0302171322300775"
paste("Within-cluster SS:", stats_fcm$within.cluster.ss)
## [1] "Within-cluster SS: 6270.8956362676"
Ari Score
ari_score <- adjustedRandIndex(fcm_res$cluster, label)
print(paste("Adjusted Rand Index:", ari_score))
## [1] "Adjusted Rand Index: 0.0143781832065237"