data <- read.csv("C:/Users/Darren Eduardo/Downloads/Project UAS_Analisis Multivariat/spmb2023_1_skd_polstatstis.csv")
asal_count <- data %>%
count(kode.asal, name = "jumlah")
ggplot(data, aes(x = factor(kode.asal), fill = factor(kode.asal))) +
geom_bar() +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
labs(title = "Count Plot: Jumlah Data per Kode Asal",
x = "Kode Asal",
y = "Jumlah") +
theme_minimal() +
theme(legend.position = "none")
jk_count <- data %>%
count(kode.pendidikan, name = "jumlah")
ggplot(data, aes(x = factor(kode.pendidikan), fill = factor(kode.pendidikan))) +
geom_bar() +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
labs(title = "Count Plot: Jumlah Data per Kode Pendidikan",
x = "Kode Pendidikan",
y = "Jumlah") +
theme_minimal() +
theme(legend.position = "none")
pendidikan_count <- data %>%
count(jenis.kelamin, name = "jumlah")
ggplot(data, aes(x = factor(jenis.kelamin), fill = factor(jenis.kelamin))) +
geom_bar() +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
labs(title = "Count Plot: Jumlah Data per Jenis Kelamin",
x = "Jenis Kelamin",
y = "Jumlah") +
theme_minimal() +
theme(legend.position = "none")
data <- data %>%
drop_na(skd.twk, skd.tiu, skd.tkp)
skd_scaled <- data %>%
select(skd.twk, skd.tiu, skd.tkp) %>%
scale()
non_skd_data <- data %>%
select(kode.pendidikan, kode.asal, jenis.kelamin) %>%
mutate(
pendidikan_encoded = case_when(
kode.pendidikan == 3001000 ~ 0, # SMA
kode.pendidikan == 3002000 ~ 1, # MA
kode.pendidikan == 3171110 ~ 2, # SMK-TIK
TRUE ~ -1
),
jk_encoded = case_when(
jenis.kelamin == "L" ~ 0,
jenis.kelamin == "P" ~ 1,
TRUE ~ NA_real_
)
)
data_for_clustering <- cbind(as.data.frame(skd_scaled), non_skd_data)
data_for_clustering <- cbind(as.data.frame(skd_scaled),
pendidikan = non_skd_data$pendidikan_encoded,
asal = non_skd_data$kode.asal,
jenis_kelamin = non_skd_data$jk_encoded)
sapply(data_for_clustering, max)
## skd.twk skd.tiu skd.tkp pendidikan asal
## 2.799798 1.788658 1.128078 2.000000 92.000000
## jenis_kelamin
## 1.000000
get_silhouette <- function(k, data) {
km <- kmeans(data, centers = k, nstart = 25)
ss <- silhouette(km$cluster, dist(data))
mean(ss[, 3])
}
silhouette_scores <- sapply(2:5, get_silhouette, data = data_for_clustering)
names(silhouette_scores) <- paste0("k = ", 2:5)
print(silhouette_scores)
## k = 2 k = 3 k = 4 k = 5
## 0.7095836 0.7716911 0.7863142 0.7843474
set.seed(123)
kmeans_result <- kmeans(data_for_clustering, centers = 4, nstart = 25)
data$cluster_kmeans <- as.factor(kmeans_result$cluster)
pca_res <- prcomp(data_for_clustering, scale. = TRUE)
pca_data <- as.data.frame(pca_res$x[, 1:2])
pca_data$cluster <- data$cluster_kmeans
find_hull <- function(df) df[chull(df$PC1, df$PC2), ]
hulls <- pca_data %>% group_by(cluster) %>% do(find_hull(.))
ggplot(pca_data, aes(x = PC1, y = PC2, color = cluster, shape = cluster)) +
geom_point(size = 3, alpha = 0.8) +
geom_polygon(data = hulls, aes(fill = cluster), alpha = 0.2, color = NA, show.legend = FALSE) +
scale_shape_manual(values = c(16, 17, 15, 18, 8)) +
scale_fill_manual(values = c("#FF6F61", "#00BFC4", "#C77CFF", "#7CAE00", "#F8766D")) +
scale_color_manual(values = c("#FF6F61", "#00BFC4", "#C77CFF", "#7CAE00", "#F8766D")) +
labs(title = "K-Means Clustering", x = "PC1", y = "PC2") +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0, face = "bold"),
legend.position = "right"
)
sil <- silhouette(kmeans_result$cluster, dist(data_for_clustering))
fviz_silhouette(sil)
## cluster size ave.sil.width
## 1 1 8108 0.80
## 2 2 8976 0.85
## 3 3 2101 0.59
## 4 4 2030 0.67
data %>%
group_by(cluster_kmeans) %>%
summarise(across(c(skd.twk, skd.tiu, skd.tkp), list(mean = mean, sd = sd)))
## # A tibble: 4 × 7
## cluster_kmeans skd.twk_mean skd.twk_sd skd.tiu_mean skd.tiu_sd skd.tkp_mean
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 78.2 22.5 103. 37.2 169.
## 2 2 81.2 24.3 113. 38.5 168.
## 3 3 79.5 22.8 101. 38.4 169.
## 4 4 74.3 23.7 90.6 38.4 165.
## # ℹ 1 more variable: skd.tkp_sd <dbl>
data %>%
group_by(cluster_kmeans, jenis.kelamin) %>%
summarise(jumlah = n(), .groups = "drop") %>%
arrange(cluster_kmeans)
## # A tibble: 8 × 3
## cluster_kmeans jenis.kelamin jumlah
## <fct> <chr> <int>
## 1 1 L 2535
## 2 1 P 5573
## 3 2 L 2938
## 4 2 P 6038
## 5 3 L 728
## 6 3 P 1373
## 7 4 L 669
## 8 4 P 1361
data %>%
group_by(cluster_kmeans, kode.asal) %>%
summarise(jumlah = n(), .groups = "drop") %>%
arrange(cluster_kmeans)
## # A tibble: 34 × 3
## cluster_kmeans kode.asal jumlah
## <fct> <int> <int>
## 1 1 11 534
## 2 1 12 2650
## 3 1 13 1292
## 4 1 14 648
## 5 1 15 518
## 6 1 16 1082
## 7 1 17 398
## 8 1 18 616
## 9 1 19 114
## 10 1 21 256
## # ℹ 24 more rows
data %>%
group_by(cluster_kmeans, kode.pendidikan) %>%
summarise(jumlah = n(), .groups = "drop") %>%
arrange(cluster_kmeans)
## # A tibble: 12 × 3
## cluster_kmeans kode.pendidikan jumlah
## <fct> <int> <int>
## 1 1 3001000 6776
## 2 1 3002000 1205
## 3 1 3171110 127
## 4 2 3001000 7637
## 5 2 3002000 1039
## 6 2 3171110 300
## 7 3 3001000 1697
## 8 3 3002000 357
## 9 3 3171110 47
## 10 4 3001000 1691
## 11 4 3002000 286
## 12 4 3171110 53
get_silhouette_fcm <- function(k, data) {
fcm_result <- fcm(data, centers = k, m = 2)
cluster_labels <- apply(fcm_result$u, 1, which.max)
sil <- silhouette(cluster_labels, dist(data))
mean(sil[, 3])
}
silhouette_scores <- sapply(2:5, get_silhouette_fcm, data = data_for_clustering)
names(silhouette_scores) <- paste0("k = ", 2:5)
print(silhouette_scores)
## k = 2 k = 3 k = 4 k = 5
## 0.7095836 0.7621403 0.7863142 0.7773195
set.seed(123)
fcm_result <- fcm(data_for_clustering, centers = 4, m = 2)
data$cluster_fcm <- as.factor(apply(fcm_result$u, 1, which.max))
pca_res <- prcomp(data_for_clustering, scale. = TRUE)
pca_data <- as.data.frame(pca_res$x[, 1:2])
pca_data$cluster <- data$cluster_fcm
find_hull <- function(df) df[chull(df$PC1, df$PC2), ]
hulls <- pca_data %>% group_by(cluster) %>% do(find_hull(.))
ggplot(pca_data, aes(x = PC1, y = PC2, color = cluster, shape = cluster)) +
geom_point(size = 3, alpha = 0.8) +
geom_polygon(data = hulls, aes(fill = cluster), alpha = 0.2, color = NA, show.legend = FALSE) +
scale_shape_manual(values = c(16, 17, 15, 18, 8)) + # Tambah variasi jika klaster > 2
scale_fill_manual(values = c("#FF6F61", "#00BFC4", "#C77CFF", "#7CAE00", "#F8766D")) +
scale_color_manual(values = c("#FF6F61", "#00BFC4", "#C77CFF", "#7CAE00", "#F8766D")) +
labs(title = "Visualisasi Fuzzy C-Means", x = "PC1", y = "PC2") +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0, face = "bold"),
legend.position = "right"
)
cluster_labels <- apply(fcm_result$u, 1, which.max)
sil <- silhouette(cluster_labels, dist(data_for_clustering))
fviz_silhouette(sil)
## cluster size ave.sil.width
## 1 1 8108 0.80
## 2 2 2101 0.59
## 3 3 2030 0.67
## 4 4 8976 0.85
data %>%
group_by(cluster_fcm) %>%
summarise(across(c(skd.twk, skd.tiu, skd.tkp), list(mean = mean, sd = sd)))
## # A tibble: 4 × 7
## cluster_fcm skd.twk_mean skd.twk_sd skd.tiu_mean skd.tiu_sd skd.tkp_mean
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 78.2 22.5 103. 37.2 169.
## 2 2 79.5 22.8 101. 38.4 169.
## 3 3 74.3 23.7 90.6 38.4 165.
## 4 4 81.2 24.3 113. 38.5 168.
## # ℹ 1 more variable: skd.tkp_sd <dbl>
data %>%
group_by(cluster_fcm, jenis.kelamin) %>%
summarise(jumlah = n(), .groups = "drop") %>%
arrange(cluster_fcm)
## # A tibble: 8 × 3
## cluster_fcm jenis.kelamin jumlah
## <fct> <chr> <int>
## 1 1 L 2535
## 2 1 P 5573
## 3 2 L 728
## 4 2 P 1373
## 5 3 L 669
## 6 3 P 1361
## 7 4 L 2938
## 8 4 P 6038
data %>%
group_by(cluster_fcm, kode.asal) %>%
summarise(jumlah = n(), .groups = "drop") %>%
arrange(cluster_fcm)
## # A tibble: 34 × 3
## cluster_fcm kode.asal jumlah
## <fct> <int> <int>
## 1 1 11 534
## 2 1 12 2650
## 3 1 13 1292
## 4 1 14 648
## 5 1 15 518
## 6 1 16 1082
## 7 1 17 398
## 8 1 18 616
## 9 1 19 114
## 10 1 21 256
## # ℹ 24 more rows
data %>%
group_by(cluster_fcm, kode.pendidikan) %>%
summarise(jumlah = n(), .groups = "drop") %>%
arrange(cluster_fcm)
## # A tibble: 12 × 3
## cluster_fcm kode.pendidikan jumlah
## <fct> <int> <int>
## 1 1 3001000 6776
## 2 1 3002000 1205
## 3 1 3171110 127
## 4 2 3001000 1697
## 5 2 3002000 357
## 6 2 3171110 47
## 7 3 3001000 1691
## 8 3 3002000 286
## 9 3 3171110 53
## 10 4 3001000 7637
## 11 4 3002000 1039
## 12 4 3171110 300
#Dbscan
k <- 8
kNN <- kNNdist(data_for_clustering, k = k)
distance_values <- sort(kNN)
plot(distance_values, type = "l", main = "kNN Distance Plot",
xlab = "Points sorted by distance", ylab = "k-distance")
grid()
x <- 1:length(distance_values)
y <- distance_values
second_diff <- diff(diff(y))
best_elbow_index <- which.max(second_diff) + 1
optimal_eps <- y[best_elbow_index]
plot(x, y, type = "b", main = "K-distance Graph", xlab = "Points", ylab = "Distance")
abline(h = optimal_eps, col = "red", lty = 2)
cat("Epsilon optimal berdasarkan perhitungan manual:", round(optimal_eps, 3), "\n")
## Epsilon optimal berdasarkan perhitungan manual: 2.853
hasil_dbscan <- dbscan(data_for_clustering, eps = 1.2 , minPts = 5)
print(hasil_dbscan)
## DBSCAN clustering for 21215 objects.
## Parameters: eps = 1.2, minPts = 5
## Using euclidean distances and borderpoints = TRUE
## The clustering contains 16 cluster(s) and 80 noise points.
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 80 7554 8530 1488 275 916 435 240 1091 12 32 200 44 64 226 8
## 16
## 20
##
## Available fields: cluster, eps, minPts, metric, borderPoints
dbscan_result <- dbscan(data_for_clustering, eps = 1.2 , minPts = 5)
non_noise_idx <- dbscan_result$cluster != 0
fviz_cluster(list(data = data_for_clustering[non_noise_idx, ],
cluster = dbscan_result$cluster[non_noise_idx]),
geom = "point",
palette = rainbow(length(unique(dbscan_result$cluster[non_noise_idx]))),
ggtheme = theme_minimal(),
main = paste("Visualisasi DBSCAN (eps =", round(optimal_eps, 3), ")"))
final_data <- as.data.frame(data_for_clustering)
final_data$cluster_dbscan <- as.factor(dbscan_result$cluster)
table(final_data$cluster_dbscan)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 80 7554 8530 1488 275 916 435 240 1091 12 32 200 44 64 226 8
## 16
## 20
aggregate(. ~ cluster_dbscan, data = final_data, FUN = mean)
## cluster_dbscan skd.twk skd.tiu skd.tkp pendidikan asal
## 1 0 -0.47705909 -0.68432149 -1.527256428 1.08750000 47.21250
## 2 1 0.07375976 0.03157076 0.191502273 0.17434472 13.83320
## 3 2 0.26092110 0.32866779 0.226042958 0.18112544 33.36600
## 4 3 -0.02458071 -0.18540620 0.162117056 0.19690860 73.17003
## 5 4 -3.37090733 -2.74298006 -4.448883947 0.25454545 13.93818
## 6 5 0.16811928 0.06985848 0.221196788 0.27292576 62.52729
## 7 6 -3.37090733 -2.74298006 -4.448883947 0.19540230 33.48736
## 8 7 0.30580488 0.22632203 0.241754950 0.18750000 21.00000
## 9 8 0.12509133 -0.11608952 0.167182590 0.15307058 52.21632
## 10 9 -3.37090733 -2.74298006 -4.448883947 0.08333333 21.00000
## 11 10 -3.37090733 -2.74298006 -4.448883947 0.12500000 52.40625
## 12 11 -0.08021887 -0.35480664 0.115766715 0.05500000 91.33500
## 13 12 -3.37090733 -2.74298006 -4.448883947 0.29545455 62.65909
## 14 13 -3.37090733 -2.74298006 -4.448883947 0.20312500 73.23438
## 15 14 -0.24695221 -0.84438973 -0.002144051 0.14601770 81.38496
## 16 15 -3.37090733 -2.74298006 -4.448883947 0.00000000 81.25000
## 17 16 -3.37090733 -2.74298006 -4.448883947 0.35000000 91.35000
## jenis_kelamin
## 1 0.4250000
## 2 0.6879799
## 3 0.6752638
## 4 0.6525538
## 5 0.6545455
## 6 0.6157205
## 7 0.6321839
## 8 0.7208333
## 9 0.6966086
## 10 0.5833333
## 11 0.6562500
## 12 0.7400000
## 13 0.5000000
## 14 0.6718750
## 15 0.7477876
## 16 1.0000000
## 17 0.6500000
data$cluster_dbscan <- final_data$cluster_dbscan
data %>%
group_by(cluster_dbscan) %>%
summarise(across(c(skd.twk, skd.tiu, skd.tkp), list(mean = mean, sd = sd), .names = "{.col}_{.fn}"))
## # A tibble: 17 × 7
## cluster_dbscan skd.twk_mean skd.twk_sd skd.tiu_mean skd.tiu_sd skd.tkp_mean
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 68 26.6 79.5 34.4 111.
## 2 1 80.9 17.0 107. 32.0 176.
## 3 2 85.3 16.3 119. 29.6 177.
## 4 3 78.6 17.0 98.8 32.5 174.
## 5 4 0 0 0 0 0
## 6 5 83.2 16.6 109. 31.2 177.
## 7 6 0 0 0 0 0
## 8 7 86.4 16.4 115. 32.0 177.
## 9 8 82.1 17.2 101. 35.0 175.
## 10 9 0 0 0 0 0
## 11 10 0 0 0 0 0
## 12 11 77.3 19.7 92.2 36.8 173.
## 13 12 0 0 0 0 0
## 14 13 0 0 0 0 0
## 15 14 73.4 18.1 73.3 30.1 168.
## 16 15 0 0 0 0 0
## 17 16 0 0 0 0 0
## # ℹ 1 more variable: skd.tkp_sd <dbl>
numeric_summary <- final_data %>%
group_by(cluster_dbscan) %>%
summarise(
across(
.cols = where(is.numeric),
.fns = ~ mean(.x, na.rm = TRUE),
.names = "mean_{.col}"
)
)
print(numeric_summary)
## # A tibble: 17 × 7
## cluster_dbscan mean_skd.twk mean_skd.tiu mean_skd.tkp mean_pendidikan
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 0 -0.477 -0.684 -1.53 1.09
## 2 1 0.0738 0.0316 0.192 0.174
## 3 2 0.261 0.329 0.226 0.181
## 4 3 -0.0246 -0.185 0.162 0.197
## 5 4 -3.37 -2.74 -4.45 0.255
## 6 5 0.168 0.0699 0.221 0.273
## 7 6 -3.37 -2.74 -4.45 0.195
## 8 7 0.306 0.226 0.242 0.188
## 9 8 0.125 -0.116 0.167 0.153
## 10 9 -3.37 -2.74 -4.45 0.0833
## 11 10 -3.37 -2.74 -4.45 0.125
## 12 11 -0.0802 -0.355 0.116 0.055
## 13 12 -3.37 -2.74 -4.45 0.295
## 14 13 -3.37 -2.74 -4.45 0.203
## 15 14 -0.247 -0.844 -0.00214 0.146
## 16 15 -3.37 -2.74 -4.45 0
## 17 16 -3.37 -2.74 -4.45 0.35
## # ℹ 2 more variables: mean_asal <dbl>, mean_jenis_kelamin <dbl>
jenis_kelamin <- final_data %>%
group_by(cluster_dbscan, jenis_kelamin) %>%
summarise(count = n(), .groups = "drop_last") %>%
mutate(prop = count / sum(count))
print(jenis_kelamin)
## # A tibble: 33 × 4
## # Groups: cluster_dbscan [17]
## cluster_dbscan jenis_kelamin count prop
## <fct> <dbl> <int> <dbl>
## 1 0 0 46 0.575
## 2 0 1 34 0.425
## 3 1 0 2357 0.312
## 4 1 1 5197 0.688
## 5 2 0 2770 0.325
## 6 2 1 5760 0.675
## 7 3 0 517 0.347
## 8 3 1 971 0.653
## 9 4 0 95 0.345
## 10 4 1 180 0.655
## # ℹ 23 more rows
mean_asal <- final_data %>%
group_by(cluster_dbscan, asal) %>%
summarise(count = n(), .groups = "drop_last") %>%
mutate(prop = count / sum(count))
print(mean_asal)
## # A tibble: 93 × 4
## # Groups: cluster_dbscan [17]
## cluster_dbscan asal count prop
## <fct> <int> <int> <dbl>
## 1 0 12 10 0.125
## 2 0 13 2 0.025
## 3 0 14 3 0.0375
## 4 0 15 3 0.0375
## 5 0 16 4 0.05
## 6 0 18 1 0.0125
## 7 0 21 4 0.05
## 8 0 31 2 0.025
## 9 0 33 2 0.025
## 10 0 34 2 0.025
## # ℹ 83 more rows
mean_pendidikan <- final_data %>%
group_by(cluster_dbscan, pendidikan) %>%
summarise(count = n(), .groups = "drop_last") %>%
mutate(prop = count / sum(count))
print(mean_pendidikan)
## # A tibble: 47 × 4
## # Groups: cluster_dbscan [17]
## cluster_dbscan pendidikan count prop
## <fct> <dbl> <int> <dbl>
## 1 0 0 25 0.312
## 2 0 1 23 0.288
## 3 0 2 32 0.4
## 4 1 0 6344 0.840
## 5 1 1 1103 0.146
## 6 1 2 107 0.0142
## 7 2 0 7265 0.852
## 8 2 1 985 0.115
## 9 2 2 280 0.0328
## 10 3 0 1230 0.827
## # ℹ 37 more rows