data <- read.csv("C:/Users/Darren Eduardo/Downloads/Project UAS_Analisis Multivariat/spmb2023_1_skd_polstatstis.csv")
asal_count <- data %>%
  count(kode.asal, name = "jumlah")

ggplot(data, aes(x = factor(kode.asal), fill = factor(kode.asal))) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  labs(title = "Count Plot: Jumlah Data per Kode Asal",
       x = "Kode Asal",
       y = "Jumlah") +
  theme_minimal() +
  theme(legend.position = "none")

jk_count <- data %>%
  count(kode.pendidikan, name = "jumlah")

ggplot(data, aes(x = factor(kode.pendidikan), fill = factor(kode.pendidikan))) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  labs(title = "Count Plot: Jumlah Data per Kode Pendidikan",
       x = "Kode Pendidikan",
       y = "Jumlah") +
  theme_minimal() +
  theme(legend.position = "none")

pendidikan_count <- data %>%
  count(jenis.kelamin, name = "jumlah")

ggplot(data, aes(x = factor(jenis.kelamin), fill = factor(jenis.kelamin))) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  labs(title = "Count Plot: Jumlah Data per Jenis Kelamin",
       x = "Jenis Kelamin",
       y = "Jumlah") +
  theme_minimal() +
  theme(legend.position = "none")

Hilangkan NA pada fitur yang penting

data <- data %>%
  drop_na(skd.twk, skd.tiu, skd.tkp)

Normalisasi hanya kolom SKD

skd_scaled <- data %>%
  select(skd.twk, skd.tiu, skd.tkp) %>%
  scale()
non_skd_data <- data %>%
  select(kode.pendidikan, kode.asal, jenis.kelamin) %>%
  mutate(
    pendidikan_encoded = case_when(
      kode.pendidikan == 3001000 ~ 0,  # SMA
      kode.pendidikan == 3002000 ~ 1,  # MA
      kode.pendidikan == 3171110 ~ 2,  # SMK-TIK
      TRUE ~ -1  
    ),
    jk_encoded = case_when(
      jenis.kelamin == "L" ~ 0,
      jenis.kelamin == "P" ~ 1,
      TRUE ~ NA_real_ 
    )
  )

Gabungkan data untuk clustering

data_for_clustering <- cbind(as.data.frame(skd_scaled), non_skd_data)
data_for_clustering <- cbind(as.data.frame(skd_scaled), 
                              pendidikan = non_skd_data$pendidikan_encoded,
                              asal = non_skd_data$kode.asal,
                              jenis_kelamin = non_skd_data$jk_encoded)
sapply(data_for_clustering, max)
##       skd.twk       skd.tiu       skd.tkp    pendidikan          asal 
##      2.799798      1.788658      1.128078      2.000000     92.000000 
## jenis_kelamin 
##      1.000000

K-Means Clustering

Hitung silhouette score untuk setiap k

get_silhouette <- function(k, data) {
  km <- kmeans(data, centers = k, nstart = 25)
  ss <- silhouette(km$cluster, dist(data))
  mean(ss[, 3])
}

silhouette_scores <- sapply(2:5, get_silhouette, data = data_for_clustering)
names(silhouette_scores) <- paste0("k = ", 2:5)
print(silhouette_scores)
##     k = 2     k = 3     k = 4     k = 5 
## 0.7095836 0.7716911 0.7863142 0.7843474

Simpan hasil klaster ke data asli

set.seed(123)
kmeans_result <- kmeans(data_for_clustering, centers = 4, nstart = 25)

data$cluster_kmeans <- as.factor(kmeans_result$cluster)

PCA untuk visualisasi

pca_res <- prcomp(data_for_clustering, scale. = TRUE)
pca_data <- as.data.frame(pca_res$x[, 1:2])
pca_data$cluster <- data$cluster_kmeans

find_hull <- function(df) df[chull(df$PC1, df$PC2), ]
hulls <- pca_data %>% group_by(cluster) %>% do(find_hull(.))

ggplot(pca_data, aes(x = PC1, y = PC2, color = cluster, shape = cluster)) +
  geom_point(size = 3, alpha = 0.8) +
  geom_polygon(data = hulls, aes(fill = cluster), alpha = 0.2, color = NA, show.legend = FALSE) +
  scale_shape_manual(values = c(16, 17, 15, 18, 8)) + 
  scale_fill_manual(values = c("#FF6F61", "#00BFC4", "#C77CFF", "#7CAE00", "#F8766D")) +
  scale_color_manual(values = c("#FF6F61", "#00BFC4", "#C77CFF", "#7CAE00", "#F8766D")) +
  labs(title = "K-Means Clustering", x = "PC1", y = "PC2") +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0, face = "bold"),
    legend.position = "right"
  )

Silhouette Score visual

sil <- silhouette(kmeans_result$cluster, dist(data_for_clustering))
fviz_silhouette(sil)
##   cluster size ave.sil.width
## 1       1 8108          0.80
## 2       2 8976          0.85
## 3       3 2101          0.59
## 4       4 2030          0.67

data %>%
  group_by(cluster_kmeans) %>%
  summarise(across(c(skd.twk, skd.tiu, skd.tkp), list(mean = mean, sd = sd)))
## # A tibble: 4 × 7
##   cluster_kmeans skd.twk_mean skd.twk_sd skd.tiu_mean skd.tiu_sd skd.tkp_mean
##   <fct>                 <dbl>      <dbl>        <dbl>      <dbl>        <dbl>
## 1 1                      78.2       22.5        103.        37.2         169.
## 2 2                      81.2       24.3        113.        38.5         168.
## 3 3                      79.5       22.8        101.        38.4         169.
## 4 4                      74.3       23.7         90.6       38.4         165.
## # ℹ 1 more variable: skd.tkp_sd <dbl>
data %>%
  group_by(cluster_kmeans, jenis.kelamin) %>%
  summarise(jumlah = n(), .groups = "drop") %>%
  arrange(cluster_kmeans)
## # A tibble: 8 × 3
##   cluster_kmeans jenis.kelamin jumlah
##   <fct>          <chr>          <int>
## 1 1              L               2535
## 2 1              P               5573
## 3 2              L               2938
## 4 2              P               6038
## 5 3              L                728
## 6 3              P               1373
## 7 4              L                669
## 8 4              P               1361
data %>%
  group_by(cluster_kmeans, kode.asal) %>%
  summarise(jumlah = n(), .groups = "drop") %>%
  arrange(cluster_kmeans)
## # A tibble: 34 × 3
##    cluster_kmeans kode.asal jumlah
##    <fct>              <int>  <int>
##  1 1                     11    534
##  2 1                     12   2650
##  3 1                     13   1292
##  4 1                     14    648
##  5 1                     15    518
##  6 1                     16   1082
##  7 1                     17    398
##  8 1                     18    616
##  9 1                     19    114
## 10 1                     21    256
## # ℹ 24 more rows
data %>%
  group_by(cluster_kmeans, kode.pendidikan) %>%
  summarise(jumlah = n(), .groups = "drop") %>%
  arrange(cluster_kmeans)
## # A tibble: 12 × 3
##    cluster_kmeans kode.pendidikan jumlah
##    <fct>                    <int>  <int>
##  1 1                      3001000   6776
##  2 1                      3002000   1205
##  3 1                      3171110    127
##  4 2                      3001000   7637
##  5 2                      3002000   1039
##  6 2                      3171110    300
##  7 3                      3001000   1697
##  8 3                      3002000    357
##  9 3                      3171110     47
## 10 4                      3001000   1691
## 11 4                      3002000    286
## 12 4                      3171110     53

Fuzzy C-Means

get_silhouette_fcm <- function(k, data) {
  fcm_result <- fcm(data, centers = k, m = 2)
  cluster_labels <- apply(fcm_result$u, 1, which.max)
  sil <- silhouette(cluster_labels, dist(data))
  mean(sil[, 3])
}

silhouette_scores <- sapply(2:5, get_silhouette_fcm, data = data_for_clustering)
names(silhouette_scores) <- paste0("k = ", 2:5)
print(silhouette_scores)
##     k = 2     k = 3     k = 4     k = 5 
## 0.7095836 0.7621403 0.7863142 0.7773195

Ambil label klaster berdasarkan membership tertinggi

set.seed(123)
fcm_result <- fcm(data_for_clustering, centers = 4, m = 2)

data$cluster_fcm <- as.factor(apply(fcm_result$u, 1, which.max))
pca_res <- prcomp(data_for_clustering, scale. = TRUE)
pca_data <- as.data.frame(pca_res$x[, 1:2])
pca_data$cluster <- data$cluster_fcm

find_hull <- function(df) df[chull(df$PC1, df$PC2), ]
hulls <- pca_data %>% group_by(cluster) %>% do(find_hull(.))

ggplot(pca_data, aes(x = PC1, y = PC2, color = cluster, shape = cluster)) +
  geom_point(size = 3, alpha = 0.8) +
  geom_polygon(data = hulls, aes(fill = cluster), alpha = 0.2, color = NA, show.legend = FALSE) +
  scale_shape_manual(values = c(16, 17, 15, 18, 8)) + # Tambah variasi jika klaster > 2
  scale_fill_manual(values = c("#FF6F61", "#00BFC4", "#C77CFF", "#7CAE00", "#F8766D")) +
  scale_color_manual(values = c("#FF6F61", "#00BFC4", "#C77CFF", "#7CAE00", "#F8766D")) +
  labs(title = "Visualisasi Fuzzy C-Means", x = "PC1", y = "PC2") +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0, face = "bold"),
    legend.position = "right"
  )

Gunakan label dari membership tertinggi

cluster_labels <- apply(fcm_result$u, 1, which.max)
sil <- silhouette(cluster_labels, dist(data_for_clustering))
fviz_silhouette(sil)
##   cluster size ave.sil.width
## 1       1 8108          0.80
## 2       2 2101          0.59
## 3       3 2030          0.67
## 4       4 8976          0.85

data %>%
  group_by(cluster_fcm) %>%
  summarise(across(c(skd.twk, skd.tiu, skd.tkp), list(mean = mean, sd = sd)))
## # A tibble: 4 × 7
##   cluster_fcm skd.twk_mean skd.twk_sd skd.tiu_mean skd.tiu_sd skd.tkp_mean
##   <fct>              <dbl>      <dbl>        <dbl>      <dbl>        <dbl>
## 1 1                   78.2       22.5        103.        37.2         169.
## 2 2                   79.5       22.8        101.        38.4         169.
## 3 3                   74.3       23.7         90.6       38.4         165.
## 4 4                   81.2       24.3        113.        38.5         168.
## # ℹ 1 more variable: skd.tkp_sd <dbl>
data %>%
  group_by(cluster_fcm, jenis.kelamin) %>%
  summarise(jumlah = n(), .groups = "drop") %>%
  arrange(cluster_fcm)
## # A tibble: 8 × 3
##   cluster_fcm jenis.kelamin jumlah
##   <fct>       <chr>          <int>
## 1 1           L               2535
## 2 1           P               5573
## 3 2           L                728
## 4 2           P               1373
## 5 3           L                669
## 6 3           P               1361
## 7 4           L               2938
## 8 4           P               6038
data %>%
  group_by(cluster_fcm, kode.asal) %>%
  summarise(jumlah = n(), .groups = "drop") %>%
  arrange(cluster_fcm)
## # A tibble: 34 × 3
##    cluster_fcm kode.asal jumlah
##    <fct>           <int>  <int>
##  1 1                  11    534
##  2 1                  12   2650
##  3 1                  13   1292
##  4 1                  14    648
##  5 1                  15    518
##  6 1                  16   1082
##  7 1                  17    398
##  8 1                  18    616
##  9 1                  19    114
## 10 1                  21    256
## # ℹ 24 more rows
data %>%
  group_by(cluster_fcm, kode.pendidikan) %>%
  summarise(jumlah = n(), .groups = "drop") %>%
  arrange(cluster_fcm)
## # A tibble: 12 × 3
##    cluster_fcm kode.pendidikan jumlah
##    <fct>                 <int>  <int>
##  1 1                   3001000   6776
##  2 1                   3002000   1205
##  3 1                   3171110    127
##  4 2                   3001000   1697
##  5 2                   3002000    357
##  6 2                   3171110     47
##  7 3                   3001000   1691
##  8 3                   3002000    286
##  9 3                   3171110     53
## 10 4                   3001000   7637
## 11 4                   3002000   1039
## 12 4                   3171110    300

#Dbscan

k <- 8
kNN <- kNNdist(data_for_clustering, k = k)
distance_values <- sort(kNN)

plot(distance_values, type = "l", main = "kNN Distance Plot", 
     xlab = "Points sorted by distance", ylab = "k-distance")
grid()

x <- 1:length(distance_values)
y <- distance_values
second_diff <- diff(diff(y))

best_elbow_index <- which.max(second_diff) + 1
optimal_eps <- y[best_elbow_index]

plot(x, y, type = "b", main = "K-distance Graph", xlab = "Points", ylab = "Distance")
abline(h = optimal_eps, col = "red", lty = 2)

cat("Epsilon optimal berdasarkan perhitungan manual:", round(optimal_eps, 3), "\n")
## Epsilon optimal berdasarkan perhitungan manual: 2.853
hasil_dbscan <- dbscan(data_for_clustering, eps = 1.2 , minPts = 5)
print(hasil_dbscan)
## DBSCAN clustering for 21215 objects.
## Parameters: eps = 1.2, minPts = 5
## Using euclidean distances and borderpoints = TRUE
## The clustering contains 16 cluster(s) and 80 noise points.
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##   80 7554 8530 1488  275  916  435  240 1091   12   32  200   44   64  226    8 
##   16 
##   20 
## 
## Available fields: cluster, eps, minPts, metric, borderPoints
dbscan_result <- dbscan(data_for_clustering, eps = 1.2 , minPts = 5)

Visualisasi hasil DBSCAN (tanpa noise = cluster 0)

non_noise_idx <- dbscan_result$cluster != 0

fviz_cluster(list(data = data_for_clustering[non_noise_idx, ],
                  cluster = dbscan_result$cluster[non_noise_idx]),
             geom = "point",
             palette = rainbow(length(unique(dbscan_result$cluster[non_noise_idx]))),
             ggtheme = theme_minimal(),
             main = paste("Visualisasi DBSCAN (eps =", round(optimal_eps, 3), ")"))

final_data <- as.data.frame(data_for_clustering)
final_data$cluster_dbscan <- as.factor(dbscan_result$cluster)
table(final_data$cluster_dbscan)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##   80 7554 8530 1488  275  916  435  240 1091   12   32  200   44   64  226    8 
##   16 
##   20
aggregate(. ~ cluster_dbscan, data = final_data, FUN = mean)
##    cluster_dbscan     skd.twk     skd.tiu      skd.tkp pendidikan     asal
## 1               0 -0.47705909 -0.68432149 -1.527256428 1.08750000 47.21250
## 2               1  0.07375976  0.03157076  0.191502273 0.17434472 13.83320
## 3               2  0.26092110  0.32866779  0.226042958 0.18112544 33.36600
## 4               3 -0.02458071 -0.18540620  0.162117056 0.19690860 73.17003
## 5               4 -3.37090733 -2.74298006 -4.448883947 0.25454545 13.93818
## 6               5  0.16811928  0.06985848  0.221196788 0.27292576 62.52729
## 7               6 -3.37090733 -2.74298006 -4.448883947 0.19540230 33.48736
## 8               7  0.30580488  0.22632203  0.241754950 0.18750000 21.00000
## 9               8  0.12509133 -0.11608952  0.167182590 0.15307058 52.21632
## 10              9 -3.37090733 -2.74298006 -4.448883947 0.08333333 21.00000
## 11             10 -3.37090733 -2.74298006 -4.448883947 0.12500000 52.40625
## 12             11 -0.08021887 -0.35480664  0.115766715 0.05500000 91.33500
## 13             12 -3.37090733 -2.74298006 -4.448883947 0.29545455 62.65909
## 14             13 -3.37090733 -2.74298006 -4.448883947 0.20312500 73.23438
## 15             14 -0.24695221 -0.84438973 -0.002144051 0.14601770 81.38496
## 16             15 -3.37090733 -2.74298006 -4.448883947 0.00000000 81.25000
## 17             16 -3.37090733 -2.74298006 -4.448883947 0.35000000 91.35000
##    jenis_kelamin
## 1      0.4250000
## 2      0.6879799
## 3      0.6752638
## 4      0.6525538
## 5      0.6545455
## 6      0.6157205
## 7      0.6321839
## 8      0.7208333
## 9      0.6966086
## 10     0.5833333
## 11     0.6562500
## 12     0.7400000
## 13     0.5000000
## 14     0.6718750
## 15     0.7477876
## 16     1.0000000
## 17     0.6500000
data$cluster_dbscan <- final_data$cluster_dbscan
data %>%
  group_by(cluster_dbscan) %>%
  summarise(across(c(skd.twk, skd.tiu, skd.tkp), list(mean = mean, sd = sd), .names = "{.col}_{.fn}"))
## # A tibble: 17 × 7
##    cluster_dbscan skd.twk_mean skd.twk_sd skd.tiu_mean skd.tiu_sd skd.tkp_mean
##    <fct>                 <dbl>      <dbl>        <dbl>      <dbl>        <dbl>
##  1 0                      68         26.6         79.5       34.4         111.
##  2 1                      80.9       17.0        107.        32.0         176.
##  3 2                      85.3       16.3        119.        29.6         177.
##  4 3                      78.6       17.0         98.8       32.5         174.
##  5 4                       0          0            0          0             0 
##  6 5                      83.2       16.6        109.        31.2         177.
##  7 6                       0          0            0          0             0 
##  8 7                      86.4       16.4        115.        32.0         177.
##  9 8                      82.1       17.2        101.        35.0         175.
## 10 9                       0          0            0          0             0 
## 11 10                      0          0            0          0             0 
## 12 11                     77.3       19.7         92.2       36.8         173.
## 13 12                      0          0            0          0             0 
## 14 13                      0          0            0          0             0 
## 15 14                     73.4       18.1         73.3       30.1         168.
## 16 15                      0          0            0          0             0 
## 17 16                      0          0            0          0             0 
## # ℹ 1 more variable: skd.tkp_sd <dbl>

1. Rata-rata semua variabel numerik per klaster

numeric_summary <- final_data %>%
  group_by(cluster_dbscan) %>%
  summarise(
    across(
      .cols = where(is.numeric),
      .fns = ~ mean(.x, na.rm = TRUE),
      .names = "mean_{.col}"
    )
  )
print(numeric_summary)
## # A tibble: 17 × 7
##    cluster_dbscan mean_skd.twk mean_skd.tiu mean_skd.tkp mean_pendidikan
##    <fct>                 <dbl>        <dbl>        <dbl>           <dbl>
##  1 0                   -0.477       -0.684      -1.53             1.09  
##  2 1                    0.0738       0.0316      0.192            0.174 
##  3 2                    0.261        0.329       0.226            0.181 
##  4 3                   -0.0246      -0.185       0.162            0.197 
##  5 4                   -3.37        -2.74       -4.45             0.255 
##  6 5                    0.168        0.0699      0.221            0.273 
##  7 6                   -3.37        -2.74       -4.45             0.195 
##  8 7                    0.306        0.226       0.242            0.188 
##  9 8                    0.125       -0.116       0.167            0.153 
## 10 9                   -3.37        -2.74       -4.45             0.0833
## 11 10                  -3.37        -2.74       -4.45             0.125 
## 12 11                  -0.0802      -0.355       0.116            0.055 
## 13 12                  -3.37        -2.74       -4.45             0.295 
## 14 13                  -3.37        -2.74       -4.45             0.203 
## 15 14                  -0.247       -0.844      -0.00214          0.146 
## 16 15                  -3.37        -2.74       -4.45             0     
## 17 16                  -3.37        -2.74       -4.45             0.35  
## # ℹ 2 more variables: mean_asal <dbl>, mean_jenis_kelamin <dbl>

2. Distribusi Gender per klaster

jenis_kelamin <- final_data %>%
  group_by(cluster_dbscan, jenis_kelamin) %>%
  summarise(count = n(), .groups = "drop_last") %>%
  mutate(prop = count / sum(count))

print(jenis_kelamin)
## # A tibble: 33 × 4
## # Groups:   cluster_dbscan [17]
##    cluster_dbscan jenis_kelamin count  prop
##    <fct>                  <dbl> <int> <dbl>
##  1 0                          0    46 0.575
##  2 0                          1    34 0.425
##  3 1                          0  2357 0.312
##  4 1                          1  5197 0.688
##  5 2                          0  2770 0.325
##  6 2                          1  5760 0.675
##  7 3                          0   517 0.347
##  8 3                          1   971 0.653
##  9 4                          0    95 0.345
## 10 4                          1   180 0.655
## # ℹ 23 more rows

3. Distribusi Kode Asal per klaster

mean_asal <- final_data %>%
  group_by(cluster_dbscan, asal) %>%
  summarise(count = n(), .groups = "drop_last") %>%
  mutate(prop = count / sum(count))

print(mean_asal)
## # A tibble: 93 × 4
## # Groups:   cluster_dbscan [17]
##    cluster_dbscan  asal count   prop
##    <fct>          <int> <int>  <dbl>
##  1 0                 12    10 0.125 
##  2 0                 13     2 0.025 
##  3 0                 14     3 0.0375
##  4 0                 15     3 0.0375
##  5 0                 16     4 0.05  
##  6 0                 18     1 0.0125
##  7 0                 21     4 0.05  
##  8 0                 31     2 0.025 
##  9 0                 33     2 0.025 
## 10 0                 34     2 0.025 
## # ℹ 83 more rows

4. Distribusi Kode Pendidikan per klaster

mean_pendidikan <- final_data %>%
  group_by(cluster_dbscan, pendidikan) %>%
  summarise(count = n(), .groups = "drop_last") %>%
  mutate(prop = count / sum(count))

print(mean_pendidikan)
## # A tibble: 47 × 4
## # Groups:   cluster_dbscan [17]
##    cluster_dbscan pendidikan count   prop
##    <fct>               <dbl> <int>  <dbl>
##  1 0                       0    25 0.312 
##  2 0                       1    23 0.288 
##  3 0                       2    32 0.4   
##  4 1                       0  6344 0.840 
##  5 1                       1  1103 0.146 
##  6 1                       2   107 0.0142
##  7 2                       0  7265 0.852 
##  8 2                       1   985 0.115 
##  9 2                       2   280 0.0328
## 10 3                       0  1230 0.827 
## # ℹ 37 more rows