library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.5.3
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
## 
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(meanShiftR)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:flexclust':
## 
##     bclust
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
## 
## Attaching package: 'fpc'
## The following object is masked from 'package:dbscan':
## 
##     dbscan
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:e1071':
## 
##     element

Load & Preprocessing

# Read the raw time series and keep only the rows recorded at province level.
df <- read.csv("covid_19_indonesia_time_series_all.csv") %>%
  filter(Location.Level == "Province")

# Collapse the series to one row per Location: the average of each daily and
# cumulative count, ignoring missing values.
df_final <- summarise(
  group_by(df, Location),
  cases = mean(New.Cases, na.rm = TRUE),
  deaths = mean(New.Deaths, na.rm = TRUE),
  total_cases = mean(Total.Cases, na.rm = TRUE),
  total_deaths = mean(Total.Deaths, na.rm = TRUE)
)

# Centre and scale the numeric features; the first column (Location) is the
# label and is left out of the matrix.
df_scaled <- scale(df_final[, -1])

Elbow Method

# Fix the RNG seed: kmeans() starts from random centres, so without a seed the
# elbow curve can differ from run to run.
set.seed(123)

# Candidate cluster counts: at most 10, and kept below the number of rows so
# kmeans() is never asked for as many centres as there are observations.
k_grid <- seq_len(min(10L, nrow(df_scaled) - 1L))

# Total within-cluster sum of squares for each candidate k.
# vapply() guarantees a numeric vector regardless of the grid's length.
wss <- vapply(
  k_grid,
  function(k) kmeans(df_scaled, centers = k, nstart = 20)$tot.withinss,
  numeric(1)
)

plot(k_grid, wss, type = "b",
     xlab = "Number of clusters (k)",
     ylab = "Total within-cluster sum of squares")

# Number of clusters used by the models fitted below.
k <- 3

Clustering Models

# Seed the RNG so the randomly initialised fits below return the same
# partition (and therefore the same downstream scores) on every run.
set.seed(123)

# K-means: k centres, best of 25 random starts.
km <- kmeans(df_scaled, centers = k, nstart = 25)

# K-medians: same k, fitted with flexclust's "kmedians" family.
kmed <- kcca(df_scaled, k = k, family = kccaFamily("kmedians"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
# DBSCAN
# Called through its namespace because fpc, attached later, masks dbscan().
# NOTE(review): in the recorded run this fit scored NA in the silhouette table
# (fewer than two non-noise clusters) -- eps / minPts probably need tuning,
# e.g. with dbscan::kNNdistplot(); confirm before relying on this model.
db <- dbscan::dbscan(df_scaled, eps = 0.5, minPts = 3)

# Mean Shift (package defaults)
ms <- meanShift(df_scaled)

# Fuzzy C-means
# cmeans() picks its initial centres at random when `centers` is a count, so
# seed the RNG to make the memberships reproducible.
set.seed(123)
fcm <- cmeans(df_scaled, centers = k)

Silhouette Evaluation

# Pairwise distances between the scaled observations (dist() default metric).
dist_mat <- dist(df_scaled)

# Mean silhouette width of a hard clustering.
#   labels: one integer cluster code per observation covered by `d`
#   d:      dissimilarity object for those same observations
# Returns NA when the score is undefined: fewer than two clusters, or as many
# clusters as observations. In those cases silhouette() itself returns a bare
# NA, and indexing that result by column would raise an error.
mean_silhouette <- function(labels, d) {
  labels <- as.integer(labels)
  n_clusters <- length(unique(labels))
  if (n_clusters < 2L || n_clusters >= length(labels)) {
    return(NA_real_)
  }
  mean(silhouette(labels, d)[, "sil_width"])
}

# K-means
sil_km <- mean_silhouette(km$cluster, dist_mat)

# K-medians
sil_kmed <- mean_silhouette(clusters(kmed), dist_mat)

# Fuzzy C-means, scored on the hard labels stored in fcm$cluster
sil_fcm <- mean_silhouette(fcm$cluster, dist_mat)

# DBSCAN: points labelled 0 (noise) are excluded before scoring, so this value
# is computed on a subset of the rows, unlike the other methods.
# drop = FALSE keeps the subset a matrix even if only one row remains.
idx <- db$cluster != 0
sil_db <- mean_silhouette(
  db$cluster[idx],
  dist(df_scaled[idx, , drop = FALSE])
)

# Mean Shift: the helper flattens the assignment object to a plain vector.
sil_ms <- mean_silhouette(ms$assignment, dist_mat)

# RESULT TABLE
hasil <- data.frame(
  Method = c("K-means", "K-medians", "DBSCAN", "MeanShift", "Fuzzy"),
  Silhouette = c(sil_km, sil_kmed, sil_db, sil_ms, sil_fcm)
)

hasil
##      Method Silhouette
## 1   K-means  0.5939236
## 2 K-medians  0.5721922
## 3    DBSCAN         NA
## 4 MeanShift  0.9031243
## 5     Fuzzy  0.5939236

Best Method

# Row of the comparison table with the highest mean silhouette width.
# which.max() skips NA scores and returns the first maximum on ties.
best <- hasil[which.max(hasil[["Silhouette"]]), , drop = FALSE]
best
##      Method Silhouette
## 4 MeanShift  0.9031243

EDA

# Attach each row's k-means label to the per-location summary.
# NOTE(review): the profile below uses the k-means labels although the
# silhouette comparison in the recorded run ranks a different method highest;
# confirm that this is intentional.
df_final$cluster <- km$cluster

# Mean of every feature within each cluster. Only the four numeric features
# are aggregated: dropping just the first column would also average the newly
# added `cluster` column and print it a second time.
feature_cols <- c("cases", "deaths", "total_cases", "total_deaths")
aggregate(df_final[, feature_cols],
          by = list(cluster = df_final$cluster),
          FUN = mean)
##   cluster      cases    deaths total_cases total_deaths
## 1       1   56.96633  1.521855    23955.67     687.7004
## 2       2 1034.10249 26.243828   405127.54   12255.6685
## 3       3  216.22500  4.599798    83862.97    2071.1536

PCA Visualization

# Principal components of the scaled feature matrix.
pca <- prcomp(df_scaled)

# Scatter of the first two component scores, one point per row, coloured by
# the cluster label stored on df_final.
plot(
  pca$x[, c("PC1", "PC2")],
  pch = 19,
  col = df_final$cluster,
  main = "Cluster Visualization (PCA)"
)

Boxplot

# Distribution of the averaged daily case count within each cluster.
boxplot(
  cases ~ cluster,
  main = "Cases per Cluster",
  data = df_final
)