Logo Branding

Deden Ahmad Rabani

Data Enthusiast | Statistics and Data Science | IPB University


Clustering K-Means


Package

# Memuat library yang diperlukan
library(readxl)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(readxl)
library(factoextra)
library(cluster)
library(ggplot2)
library(tidyr)
library(ggiraphExtra)
library(spdep)
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: sf
## Linking to GEOS 3.12.2, GDAL 3.9.3, PROJ 9.4.1; sf_use_s2() is TRUE
library(sf)

Data Preprocessing

setwd("C:/Users/Deden Ahmad Rabani/OneDrive - apps.ipb.ac.id/Semester 4/Gammaphics") 
# Membaca file Excel
datak <- read_excel("ikp.xlsx", sheet = "IKP22")

# Menampilkan beberapa baris pertama dari data
head(datak)
## # A tibble: 6 × 5
##   PROVINSI            IK    IA    IP   IKP
##   <chr>            <dbl> <dbl> <dbl> <dbl>
## 1 ACEH              80.2  69.3  66.2  70.2
## 2 SUMATERA UTARA    80.2  75.0  64.4  71.2
## 3 SUMATERA BARAT    87.5  84.2  71.8  79.4
## 4 RIAU              38.8  86.5  69.8  67.6
## 5 JAMBI             58.5  82    67.0  69.5
## 6 SUMATERA SELATAN  81.6  74.1  61.3  69.6
str(datak)
## tibble [34 × 5] (S3: tbl_df/tbl/data.frame)
##  $ PROVINSI: chr [1:34] "ACEH" "SUMATERA UTARA" "SUMATERA BARAT" "RIAU" ...
##  $ IK      : num [1:34] 80.2 80.2 87.5 38.8 58.5 ...
##  $ IA      : num [1:34] 69.3 75 84.2 86.5 82 ...
##  $ IP      : num [1:34] 66.2 64.4 71.8 69.8 67 ...
##  $ IKP     : num [1:34] 70.2 71.2 79.5 67.6 69.5 ...
str(datak)
## tibble [34 × 5] (S3: tbl_df/tbl/data.frame)
##  $ PROVINSI: chr [1:34] "ACEH" "SUMATERA UTARA" "SUMATERA BARAT" "RIAU" ...
##  $ IK      : num [1:34] 80.2 80.2 87.5 38.8 58.5 ...
##  $ IA      : num [1:34] 69.3 75 84.2 86.5 82 ...
##  $ IP      : num [1:34] 66.2 64.4 71.8 69.8 67 ...
##  $ IKP     : num [1:34] 70.2 71.2 79.5 67.6 69.5 ...
data <- datak[,-1] #Hapus Row Provinsi
data <- data[,-4] #Hapus Row IKP
data
## # A tibble: 34 × 3
##       IK    IA    IP
##    <dbl> <dbl> <dbl>
##  1 80.2   69.3  66.2
##  2 80.2   75.0  64.4
##  3 87.5   84.2  71.8
##  4 38.8   86.5  69.8
##  5 58.5   82    67.0
##  6 81.6   74.1  61.3
##  7 69.9   75.2  61.5
##  8 94.9   79.6  67.7
##  9 36.4   92.5  77.5
## 10  5.83  88.9  72.5
## # ℹ 24 more rows

Penentuan Cluster Teroptimal

Bisa Subjektif (Tentuin mau berapa klaster) atau sesuai teori optimal:

Metode Silhouette

Hint: Lihat k yang nilai silhouette paling tinggi

# Load required libraries
library(cluster)
library(ggplot2)

# Compute distance matrix
dist_matrix <- dist(data)

# Initialize silhouette width vector
sil_width <- c(0)  # set silhouette width for k = 1 as 0

# Loop over several k values
for(k in 2:10) {
  # Perform k-means clustering
  cluster <- kmeans(data, centers=k)
  
  # Compute silhouette widths
  sil <- silhouette(cluster$cluster, dist_matrix)
  
  # Compute and store the average silhouette width
  sil_width[k] <- mean(sil[, 3])
}

# Create a data frame for plotting
sil_df <- data.frame(
  k = 1:10,
  silhouette_width = sil_width[1:10]
)

# Create a ggplot
ggplot(sil_df, aes(x=k, y=silhouette_width)) +
  geom_line(color = "darkblue", size = 1.2) +
  geom_point(color = "red", size = 4) +
  labs(title = "Silhouette Method to Determine Optimal k",
       x = "Number of Clusters",
       y = "Average Silhouette Width") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, color = "darkblue", size = 14, face = "bold.italic"),
    axis.title.x = element_text(color = "darkblue", size = 12, face = "bold"),
    axis.title.y = element_text(color = "darkblue", size = 12, face = "bold"),
    axis.text.x = element_text(color = "black", size = 10),
    axis.text.y = element_text(color = "black", size = 10)
  ) + scale_x_continuous(breaks = 1:10)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

fviz_nbclust(data, hcut, method = "silhouette")

Metode Elbow

Hint: Lihat k yang nilai WSS paling membuat sudut tajam

# Menghitung matriks jarak
d <- dist(data)

# Menghitung jumlah kluster k optimal menggunakan metode elbow
wss <- c(NA)
for(i in 1:10) {
  km.res <- kmeans(data, centers = i)
  wss[i] <- km.res$tot.withinss
}

# Membuat dataframe untuk plotting
df <- data.frame(
  k = 1:10,
  wss = wss
)

# Membuat plot dengan ggplot2
ggplot(df, aes(x = k, y = wss)) +
  geom_line(color = "darkblue", size = 1.2) +
  geom_point(color = "red", size = 4) +
  labs(title = "Elbow Method to Determine Optimal k",
       x = "Number of Clusters",
       y = "Total Within Sum of Squares") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, color = "darkblue", size = 14, face = "bold.italic"),
    axis.title.x = element_text(color = "darkblue", size = 12, face = "bold"),
    axis.title.y = element_text(color = "darkblue", size = 12, face = "bold"),
    axis.text.x = element_text(color = "black", size = 10),
    axis.text.y = element_text(color = "black", size = 10)
  ) + scale_x_continuous(breaks = 1:10)

K-Means (k=4)

Modelling K-Means

set.seed(123)

# Melakukan klustering k-means dengan k = 3 pada data yang telah distandarisasi
km.res <- eclust(data, FUNcluster = "kmeans", k = 4, graph = FALSE)

# Menampilkan hasil klustering
print(km.res)
## K-means clustering with 4 clusters of sizes 5, 11, 11, 7
## 
## Cluster means:
##         IK       IA       IP
## 1  9.96400 74.44800 62.55400
## 2 89.12364 82.51909 73.65727
## 3 77.58000 77.19000 65.02182
## 4 46.45857 85.83857 70.83571
## 
## Clustering vector:
##  [1] 3 3 2 4 4 3 3 2 4 1 1 2 2 2 2 3 2 2 3 3 4 2 4 4 3 3 2 3 2 3 4 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 2325.3017  808.2114 1092.8916 1002.1799
##  (between_SS / total_SS =  83.7 %)
## 
## Available components:
## 
##  [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
##  [6] "betweenss"    "size"         "iter"         "ifault"       "silinfo"     
## [11] "nbclust"      "data"

Visualisasi K-Means

set.seed(123)
factoextra::fviz_cluster(km.res, data = data,
             geom = "point",  
             ellipse.type = "convex",  
             pointsize = 2,  
             palette = "jco",  
             ggtheme = theme_minimal())

Profilling Kluster

# Ambil data centeroids
centers <- km.res$centers

# Konvert data centeroids ke data frame
centers_df <- as.data.frame(centers)

# Tambahkan Kolom Kluster
centers_df$cluster <- rownames(centers_df)

# Konversi Data Wide ke Long
library(tidyr)
centers_long <- gather(centers_df, key = "variable", value = "value", -cluster)
centers_df
##         IK       IA       IP cluster
## 1  9.96400 74.44800 62.55400       1
## 2 89.12364 82.51909 73.65727       2
## 3 77.58000 77.19000 65.02182       3
## 4 46.45857 85.83857 70.83571       4

Dari hasil diatas, bisa dilihat variabel apa yang tinggi/rendah disetiap klasternya. Visualisasinya kayak dibawah. (Paling Rekomendasi: Radar Chart)

Bar Chart Per Cluster

# Plot the cluster profiles
library(ggplot2)
ggplot(centers_long, aes(x = cluster, y = value, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "Cluster", y = "Mean Value", fill = "Variable") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Profil Klaster Demografi di Indonesia")

Bar Chart Per Variabel

library(ggplot2)

# Melt the data for easier plotting
melted_data <- reshape2::melt(centers_df, id.vars = "cluster")

# Plotting
ggplot(melted_data, aes(x = factor(cluster), y = value, fill = factor(cluster))) +
  geom_bar(stat = "identity") +
  facet_wrap(~variable, scales = "free_y") +
  theme_minimal() +
  labs(
    title = "Perbandingan Cluster",
    x = "Cluster",
    y = "Nilai",
    fill = "Cluster"
  ) +
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "Set1")

Radar Chart

# Add the cluster assignments to your data
data$Cluster <- km.res$cluster

# Load the 'ggiraphExtra' library for the ggRadar function
library(ggiraphExtra)

# Radar Plot
old_par <- par()
par(oma = c(10, 4, 4, 8))

ggRadar(
  data = data,
  mapping = aes(colours = Cluster),
) + 
theme_light() +
theme(
  text = element_text(size = 10),  # Mengubah ukuran font global
  title = element_text(size = 12),  # Mengubah ukuran font judul
  axis.text = element_text(size = 10),  # Mengubah ukuran font label sumbu
  legend.text = element_text(size = 8)  # Mengubah ukuran font legenda
)

par(old_par)
## Warning in par(old_par): graphical parameter "cin" cannot be set
## Warning in par(old_par): graphical parameter "cra" cannot be set
## Warning in par(old_par): graphical parameter "csi" cannot be set
## Warning in par(old_par): graphical parameter "cxy" cannot be set
## Warning in par(old_par): graphical parameter "din" cannot be set
## Warning in par(old_par): graphical parameter "page" cannot be set

Klasterisasi Daerah

data_cluster <- data
data_cluster$cluster <- km.res$cluster #Ambil Angka Cluster
data_cluster <- cbind(datak[,1],data_cluster) #Ambil Provinsi sama Kode BPS
data_cluster
##                      PROVINSI    IK    IA    IP Cluster cluster
## 1                        ACEH 80.17 69.34 66.25       3       3
## 2              SUMATERA UTARA 80.19 75.02 64.44       3       3
## 3              SUMATERA BARAT 87.53 84.23 71.84       2       2
## 4                        RIAU 38.78 86.50 69.81       4       4
## 5                       JAMBI 58.48 82.00 67.04       4       4
## 6            SUMATERA SELATAN 81.57 74.11 61.29       3       3
## 7                    BENGKULU 69.92 75.18 61.53       3       3
## 8                     LAMPUNG 94.94 79.62 67.72       2       2
## 9   KEPULAUAN BANGKA BELITUNG 36.38 92.47 77.54       4       4
## 10             KEPULAUAN RIAU  5.83 88.90 72.53       1       1
## 11                DKI JAKARTA  0.00 83.22 81.12       1       1
## 12                 JAWA BARAT 84.59 79.11 73.71       2       2
## 13                JAWA TENGAH 88.88 81.47 80.69       2       2
## 14 DAERAH LSTIMEWA YOGYAKARTA 84.20 79.66 79.97       2       2
## 15                 JAWA TIMUR 90.64 79.42 74.27       2       2
## 16                     BANTEN 82.81 83.99 62.79       3       3
## 17                       BALI 79.08 94.00 82.08       2       2
## 18        NUSA TENGGARA BARAT 92.85 76.24 67.81       2       2
## 19        NUSA TENGGARA TIMUR 86.25 61.62 60.55       3       3
## 20           KALIMANTAN BARAT 76.21 86.33 56.29       3       3
## 21          KALIMANTAN TENGAH 49.87 91.07 67.55       4       4
## 22         KALIMANTAN SELATAN 88.69 90.68 69.01       2       2
## 23           KALIMANTAN TIMUR 52.33 90.27 79.43       4       4
## 24           KALIMANTAN UTARA 43.50 89.99 72.21       4       4
## 25             SULAWESI UTARA 70.37 81.42 71.42       3       3
## 26            SULAWESI TENGAH 77.98 80.39 70.88       3       3
## 27           SULAWESI SELATAN 94.71 83.52 71.22       2       2
## 28          SULAWESI TENGGARA 71.80 78.24 74.38       3       3
## 29                  GORONTALO 94.25 79.76 71.91       2       2
## 30             SULAWESI BARAT 76.11 83.45 65.42       3       3
## 31                     MALUKU 45.87 68.57 62.27       4       4
## 32               MALUKU UTARA 14.50 88.12 60.61       1       1
## 33                PAPUA BARAT 11.81 62.11 56.17       1       1
## 34                      PAPUA 17.68 49.89 42.34       1       1
str(data_cluster)
## 'data.frame':    34 obs. of  6 variables:
##  $ PROVINSI: chr  "ACEH" "SUMATERA UTARA" "SUMATERA BARAT" "RIAU" ...
##  $ IK      : num  80.2 80.2 87.5 38.8 58.5 ...
##  $ IA      : num  69.3 75 84.2 86.5 82 ...
##  $ IP      : num  66.2 64.4 71.8 69.8 67 ...
##  $ Cluster : int  3 3 2 4 4 3 3 2 4 1 ...
##  $ cluster : int  3 3 2 4 4 3 3 2 4 1 ...

Baca Data Geospasial

library(spdep)
library(sf)
map_indonesia <- st_read("C:/Users/Deden Ahmad Rabani/OneDrive - apps.ipb.ac.id/Semester 4/Gammaphics/Batas Provinsi/BATAS_PROVINSI_DESEMBER_2019_DUKCAPIL.shp")
## Reading layer `BATAS_PROVINSI_DESEMBER_2019_DUKCAPIL' from data source 
##   `C:\Users\Deden Ahmad Rabani\OneDrive - apps.ipb.ac.id\Semester 4\Gammaphics\Batas Provinsi\BATAS_PROVINSI_DESEMBER_2019_DUKCAPIL.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 34 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XYZ
## Bounding box:  xmin: 95.01079 ymin: -11.00762 xmax: 141.0194 ymax: 6.07693
## z_range:       zmin: 2.65e-05 zmax: 2.65e-05
## Geodetic CRS:  WGS 84

Merged Data Asli dan Data Geospasial

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
Data_Indonesia <- map_indonesia %>%
  inner_join(data_cluster, by = c("PROVINSI" = "PROVINSI"))
Data_Indonesia
## Simple feature collection with 33 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XYZ
## Bounding box:  xmin: 95.01079 ymin: -11.00762 xmax: 141.0194 ymax: 6.07693
## z_range:       zmin: 2.65e-05 zmax: 2.65e-05
## Geodetic CRS:  WGS 84
## First 10 features:
##    OBJECTID    PROVINSI Shape_Leng Shape_Area    IK    IA    IP Cluster cluster
## 1         1        ACEH  27.455786 4.62543602 80.17 69.34 66.25       3       3
## 2         2        BALI   6.026646 0.45871702 79.08 94.00 82.08       2       2
## 3         3      BANTEN   9.282228 0.76491115 82.81 83.99 62.79       3       3
## 4         4    BENGKULU  11.706367 1.63012892 69.92 75.18 61.53       3       3
## 5         6 DKI JAKARTA   3.123689 0.05342567  0.00 83.22 81.12       1       1
## 6         7   GORONTALO  11.013237 0.97760000 94.25 79.76 71.91       2       2
## 7         8       JAMBI  11.835072 3.97771124 58.48 82.00 67.04       4       4
## 8         9  JAWA BARAT  11.614950 3.03278484 84.59 79.11 73.71       2       2
## 9        10 JAWA TENGAH  15.456349 2.81983779 88.88 81.47 80.69       2       2
## 10       11  JAWA TIMUR  33.571284 3.93831998 90.64 79.42 74.27       2       2
##                          geometry
## 1  MULTIPOLYGON Z (((97.39178 ...
## 2  MULTIPOLYGON Z (((115.1251 ...
## 3  MULTIPOLYGON Z (((105.5498 ...
## 4  MULTIPOLYGON Z (((102.3862 ...
## 5  MULTIPOLYGON Z (((106.8768 ...
## 6  MULTIPOLYGON Z (((121.4254 ...
## 7  MULTIPOLYGON Z (((104.4071 ...
## 8  MULTIPOLYGON Z (((108.685 -...
## 9  MULTIPOLYGON Z (((108.8835 ...
## 10 MULTIPOLYGON Z (((114.2038 ...

Visualisasi Peta

cluster_full <- ggplot() +  
  geom_sf(data = Data_Indonesia, aes(fill = factor(`cluster`))) +
  geom_sf(data = Data_Indonesia, fill = NA, color = "black") +  # Tambahkan baris ini
  scale_fill_manual(values = c("#CC415E", "#E9CE2C", "#66D7D1", "#2F41DD"), name = "Keterangan") +
  theme_minimal() +
  theme(
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 10, face = "bold"),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    panel.background = element_blank(),
    panel.grid = element_blank()
  ) +
  scale_x_continuous(labels = function(x) paste0(x, "°")) +
  scale_y_continuous(labels = function(y) paste0(y, "°"))

cluster_full