#library yang di butuhkan
# Data Wrangling
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.1
## Warning: package 'tidyr' was built under R version 4.5.1
## Warning: package 'dplyr' was built under R version 4.5.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(Hmisc)
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
# Machine Learning
## Clustering
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.1
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.5.1
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## Clustering Visualization
library(Rtsne)
## Warning: package 'Rtsne' was built under R version 4.5.1
library(rgl)
## Warning: package 'rgl' was built under R version 4.5.1
library(plotly)
## Warning: package 'plotly' was built under R version 4.5.1
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:Hmisc':
##
## subplot
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Knit
library(knitr)
## Warning: package 'knitr' was built under R version 4.5.1
library(rgl)
#====================EKSPLORASI DATA==========================================
library(readxl)
DATA_NEW <- read_excel("DATA NEW.xlsx")
head(DATA_NEW)
## # A tibble: 6 × 7
## Provinsi Imunisasi PAUD Sumber_Air_Minum_Layak Persentase_ASI_Ekskl…¹
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Aceh 25.9 30.6 90.8 67.8
## 2 Sumatera Utara 41.1 23.3 92.9 66.4
## 3 Sumatera Barat 39.8 28.1 86.5 76.4
## 4 Riau 45.6 22.6 92.2 71.5
## 5 Jambi 53.7 31.7 82.2 74.3
## 6 Sumatera Selatan 54.5 26.6 87.2 75.7
## # ℹ abbreviated name: ¹Persentase_ASI_Eksklusif
## # ℹ 2 more variables: Prevalensi_Stunting <chr>,
## # Prevalensi_Ketidakcukupan_Pangan <chr>
# Mengonversi Kolom Kategorikal
# Mengubah Kolom 1 (Provinsi) dari character menjadi factor
DATA_NEW$Provinsi <- as.factor(DATA_NEW$Provinsi)
cat("--- Struktur Data Setelah Pembersihan Akhir ---\n")
## --- Struktur Data Setelah Pembersihan Akhir ---
str(DATA_NEW)
## tibble [38 × 7] (S3: tbl_df/tbl/data.frame)
## $ Provinsi : Factor w/ 38 levels "Aceh","Bali",..: 1 38 36 30 8 37 4 19 17 18 ...
## $ Imunisasi : num [1:38] 25.9 41.1 39.8 45.6 53.7 ...
## $ PAUD : num [1:38] 30.6 23.3 28.1 22.6 31.7 ...
## $ Sumber_Air_Minum_Layak : num [1:38] 90.8 92.9 86.5 92.2 82.2 ...
## $ Persentase_ASI_Eksklusif : num [1:38] 67.8 66.4 76.4 71.5 74.3 ...
## $ Prevalensi_Stunting : chr [1:38] "sedang" "sedang" "sedang" "sedang" ...
## $ Prevalensi_Ketidakcukupan_Pangan: chr [1:38] "rendah" "rendah" "rendah" "rendah" ...
#pendalaman data
summary(DATA_NEW)
## Provinsi Imunisasi PAUD Sumber_Air_Minum_Layak
## Aceh : 1 Min. :18.41 Min. : 3.80 Min. :30.64
## Bali : 1 1st Qu.:53.54 1st Qu.:26.67 1st Qu.:82.22
## Banten : 1 Median :62.13 Median :31.63 Median :89.02
## Bengkulu : 1 Mean :60.46 Mean :31.94 Mean :87.01
## DI Yogyakarta: 1 3rd Qu.:71.45 3rd Qu.:36.48 3rd Qu.:94.65
## DKI Jakarta : 1 Max. :85.58 Max. :65.67 Max. :99.96
## (Other) :32
## Persentase_ASI_Eksklusif Prevalensi_Stunting Prevalensi_Ketidakcukupan_Pangan
## Min. :44.64 Length:38 Length:38
## 1st Qu.:65.61 Class :character Class :character
## Median :72.42 Mode :character Mode :character
## Mean :70.49
## 3rd Qu.:76.44
## Max. :83.70
##
table(DATA_NEW$Provinsi)
##
## Aceh Bali Banten
## 1 1 1
## Bengkulu DI Yogyakarta DKI Jakarta
## 1 1 1
## Gorontalo Jambi Jawa Barat
## 1 1 1
## Jawa Tengah Jawa Timur Kalimantan Barat
## 1 1 1
## Kalimantan Selatan Kalimantan Tengah Kalimantan Timur
## 1 1 1
## Kalimantan Utara Kep. Bangka Belitung Kep. Riau
## 1 1 1
## Lampung Maluku Maluku Utara
## 1 1 1
## NTB NTT Papua
## 1 1 1
## Papua Selatan Papua Barat Papua Barat Daya
## 1 1 1
## Papua Pegunungan Papua Tengah Riau
## 1 1 1
## Sulawesi Barat Sulawesi Selatan Sulawesi Tengah
## 1 1 1
## Sulawesi Tenggara Sulawesi Utara Sumatera Barat
## 1 1 1
## Sumatera Selatan Sumatera Utara
## 1 1
# Memuat library yang dibutuhkan
library(tidyverse)
# Mengubah kolom Prevalensi Stunting dan Prevalensi Ketidakcukupan Pangan
# menjadi tipe data factor.
library(dplyr)
library(cluster)
DATA_NEW_CLEAN <- DATA_NEW
DATA_NEW_CLEAN <- DATA_NEW_CLEAN %>%
# Mengubah kolom Provinsi (karakter) menjadi Factor
mutate(Provinsi = as.factor(Provinsi))
DATA_NEW <- DATA_NEW %>%
mutate(`Prevalensi_Stunting` = as.factor(`Prevalensi_Stunting`),
`Prevalensi_Ketidakcukupan_Pangan` = as.factor(`Prevalensi_Ketidakcukupan_Pangan`))
DATA_NEW %>%
select(Imunisasi, PAUD, Sumber_Air_Minum_Layak, Persentase_ASI_Eksklusif) %>%
hist.data.frame()

# Cek kembali struktur data untuk memverifikasi perubahan
cat("--- Struktur Data Setelah Konversi ke Factor ---\n")
## --- Struktur Data Setelah Konversi ke Factor ---
str(DATA_NEW)
## tibble [38 × 7] (S3: tbl_df/tbl/data.frame)
## $ Provinsi : Factor w/ 38 levels "Aceh","Bali",..: 1 38 36 30 8 37 4 19 17 18 ...
## $ Imunisasi : num [1:38] 25.9 41.1 39.8 45.6 53.7 ...
## $ PAUD : num [1:38] 30.6 23.3 28.1 22.6 31.7 ...
## $ Sumber_Air_Minum_Layak : num [1:38] 90.8 92.9 86.5 92.2 82.2 ...
## $ Persentase_ASI_Eksklusif : num [1:38] 67.8 66.4 76.4 71.5 74.3 ...
## $ Prevalensi_Stunting : Factor w/ 4 levels "rendah","sangat tinggi",..: 3 3 3 3 1 1 1 1 3 1 ...
## $ Prevalensi_Ketidakcukupan_Pangan: Factor w/ 2 levels "rendah","sedang": 1 1 1 1 1 1 1 1 1 1 ...
#IMPLEMENTASI PAM/K-MEDOIDS (ada 3 langkah)
#Langka 1 (Perhitungan jarak)
library(cluster)
DATA_NEW_CLEAN <- DATA_NEW
DATA_NEW_gower <- daisy(x = DATA_NEW_CLEAN,
metric = "gower")
cat("\nPerhitungan jarak Gower berhasil diselesaikan!\n")
##
## Perhitungan jarak Gower berhasil diselesaikan!
print(DATA_NEW_gower[1:5])
## [1] 0.2017115 0.2186205 0.2199256 0.3889251 0.3921714
#Sebagai pembuktian apakah metode Gower Distance memang bisa menempatkan data-data kita berdasarkan kemiripannya, kita bisa mencetak pasangan yang paling mirip dan berbeda dalam data kita dengan menggunakan fungsi di bawah ini.
#===========a).CONTOH PASANGAN PALING IDENTIK
example <- as.matrix(DATA_NEW_gower)
DATA_NEW[which(example == min(example[example != min(example)]),
arr.ind = TRUE)[1, ], ]
## # A tibble: 2 × 7
## Provinsi Imunisasi PAUD Sumber_Air_Minum_Layak Persentase_ASI_Ekskl…¹
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Sulawesi Tengga… 72.4 32.4 95.3 66.4
## 2 Sulawesi Utara 68.3 29.6 94.5 65.0
## # ℹ abbreviated name: ¹Persentase_ASI_Eksklusif
## # ℹ 2 more variables: Prevalensi_Stunting <fct>,
## # Prevalensi_Ketidakcukupan_Pangan <fct>
#===========b).CONTOH PASANGAN PALING TIDAK IDENTIK
DATA_NEW[which(example == max(example[example != max(example)]),
arr.ind = TRUE)[1, ], ]
## # A tibble: 2 × 7
## Provinsi Imunisasi PAUD Sumber_Air_Minum_Layak Persentase_ASI_Ekskl…¹
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Papua Pegunungan 18.4 3.8 30.6 82.2
## 2 Maluku Utara 53.5 37.1 88.9 65.6
## # ℹ abbreviated name: ¹Persentase_ASI_Eksklusif
## # ℹ 2 more variables: Prevalensi_Stunting <fct>,
## # Prevalensi_Ketidakcukupan_Pangan <fct>
#LANGKAH 2 (Penentuan Nilai Cluster(Kelompok))
#===========a).Elbow Method
set.seed(123)
fviz_nbclust(x = as.matrix(DATA_NEW_gower),
FUNcluster = pam,
method = "wss") +
labs(subtitle = "Elbow Method")
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#===========b).Silhouette Method
set.seed(123)
fviz_nbclust(x = as.matrix(DATA_NEW_gower),
FUNcluster = pam,
method = "silhouette") +
labs(subtitle = "Silhouette Method")

#===========c).Gap Statistic Method
set.seed(123)
fviz_nbclust(x = as.matrix(DATA_NEW_gower),
FUNcluster = pam,
method = "gap_stat") +
labs(subtitle = "Gap Statistic Method")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#3).LANGKAH 3 (CLUSTERING)
pam_fit <- pam(x = DATA_NEW_gower,
k = 2)
DATA_NEW[pam_fit$medoids, ]
## # A tibble: 2 × 7
## Provinsi Imunisasi PAUD Sumber_Air_Minum_Layak Persentase_ASI_Eksklu…¹
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Sulawesi Tengah 61.6 36.5 87.9 67.3
## 2 Lampung 74.4 32.4 84.5 76.4
## # ℹ abbreviated name: ¹Persentase_ASI_Eksklusif
## # ℹ 2 more variables: Prevalensi_Stunting <fct>,
## # Prevalensi_Ketidakcukupan_Pangan <fct>
#=========a).METODE STATISTIKA DESKRIPTIF
pam_results <- DATA_NEW %>%
mutate(cluster = pam_fit$clustering) %>%
group_by(cluster) %>%
do(the_summary = summary(.))
#KELOMPOK 1
pam_results$the_summary[[1]]
## Provinsi Imunisasi PAUD Sumber_Air_Minum_Layak
## Aceh : 1 Min. :25.88 Min. : 9.98 Min. :71.90
## Banten : 1 1st Qu.:50.66 1st Qu.:22.95 1st Qu.:82.25
## Gorontalo : 1 Median :57.73 Median :30.64 Median :88.92
## Kalimantan Barat : 1 Mean :57.13 Mean :29.78 Mean :87.59
## Kalimantan Selatan: 1 3rd Qu.:69.50 3rd Qu.:34.39 3rd Qu.:93.13
## Kalimantan Tengah : 1 Max. :73.39 Max. :50.31 Max. :96.13
## (Other) :17
## Persentase_ASI_Eksklusif Prevalensi_Stunting
## Min. :44.64 rendah : 0
## 1st Qu.:63.60 sangat tinggi: 0
## Median :66.42 sedang :21
## Mean :67.04 tinggi : 2
## 3rd Qu.:72.42
## Max. :83.70
##
## Prevalensi_Ketidakcukupan_Pangan cluster
## rendah:21 Min. :1
## sedang: 2 1st Qu.:1
## Median :1
## Mean :1
## 3rd Qu.:1
## Max. :1
##
#KELOMPOK 2
pam_results$the_summary[[2]]
## Provinsi Imunisasi PAUD Sumber_Air_Minum_Layak
## Bali :1 Min. :18.41 Min. : 3.80 Min. :30.64
## Bengkulu :1 1st Qu.:58.26 1st Qu.:29.12 1st Qu.:83.33
## DI Yogyakarta:1 Median :70.99 Median :32.42 Median :89.42
## DKI Jakarta :1 Mean :65.56 Mean :35.26 Mean :86.13
## Jambi :1 3rd Qu.:74.83 3rd Qu.:41.17 3rd Qu.:96.17
## Jawa Barat :1 Max. :85.58 Max. :65.67 Max. :99.96
## (Other) :9
## Persentase_ASI_Eksklusif Prevalensi_Stunting
## Min. :63.58 rendah :12
## 1st Qu.:73.83 sangat tinggi: 1
## Median :76.40 sedang : 0
## Mean :75.79 tinggi : 2
## 3rd Qu.:79.90
## Max. :82.25
##
## Prevalensi_Ketidakcukupan_Pangan cluster
## rendah:14 Min. :2
## sedang: 1 1st Qu.:2
## Median :2
## Mean :2
## 3rd Qu.:2
## Max. :2
##
#=========b).METODE VISUALISASI
library(Rtsne)
set.seed(123)
tsne_obj <- Rtsne(X = DATA_NEW_gower,
is_distance = TRUE,
perplexity = 10) # <-- Solusi untuk error perplexity
K <- 3
# 3. Buat dataframe untuk visualisasi (Asumsi tsne_obj dan pam_fit sudah ada)
tsne_data <- tsne_obj$Y %>%
data.frame() %>%
setNames(c("X", "Y")) %>%
mutate(Provinsi = DATA_NEW$Provinsi,
cluster = factor(pam_fit$clustering))
# 4. Plot Visualisasi
ggplot(aes(x = X, y = Y), data = tsne_data) +
geom_point(aes(color = cluster), size = 3) +
geom_text(aes(label = Provinsi), check_overlap = TRUE, vjust = -1) +
labs(title = paste("Visualisasi Clustering 2D t-SNE (k =", K, ")"), # Sekarang K sudah ditemukan
color = "Klaster") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
axis.title = element_blank())

#CLUSTERING DENGAN MENGGUNAKN PLOT 3 DIMENSI
#Selain menampilkan hasil pengelompokan dengan menggunakan plot secara 2 dimensi, kita juga dapat menampilkan hasil clustering dengan menggunakan plot 3 dimensi.
# PENTING: Pastikan library Rtsne, dplyr, dan cluster sudah dimuat
library(Rtsne)
library(dplyr)
library(cluster)
set.seed(123)
# 1. Perbaiki Rtsne 3D dengan menambahkan parameter perplexity
# Gunakan perplexity = 10 (nilai aman untuk 38 sampel)
tsne_obj_3d <- Rtsne(X = DATA_NEW_gower,
is_distance = TRUE,
dims = 3, # Menentukan output 3 dimensi
perplexity = 10) # <-- Solusi untuk error perplexity
# 2. Buat dataframe untuk visualisasi (Sekarang objek tsne_obj_3d sudah berhasil dibuat)
tsne_data_3d <- tsne_obj_3d$Y %>%
data.frame() %>%
setNames(c("X", "Y", "Z")) %>%
mutate(Provinsi = DATA_NEW$Provinsi, # Tambahkan label Provinsi
cluster = factor(pam_fit$clustering),
Observasi = row_number())
head(tsne_data_3d)
## X Y Z Provinsi cluster Observasi
## 1 -50.86054 -10.9773675 0.72540 Aceh 1 1
## 2 -55.29086 -9.8285991 17.16906 Sumatera Utara 1 2
## 3 -35.84983 -0.4365006 11.65410 Sumatera Barat 1 3
## 4 -54.31266 4.8162107 16.67430 Riau 1 4
## 5 43.61005 5.5001702 -82.36551 Jambi 2 5
## 6 38.87114 -6.2920433 -73.70671 Sumatera Selatan 2 6
#VISUALISASI DATA 3D
tsne_3d <- plot_ly(tsne_data_3d,
x = ~X,
y = ~Y,
z = ~Z,
color = ~cluster,
colors = c('#BF382A','#0C4B8E'),
size = 3,
text = ~paste("Product No:", Observasi),
hoverinfo = 'text') %>%
layout(title = "Visualisasi Clustering 3D",
scene = list(xaxis = list(title = ''),
yaxis = list(title = ''),
zaxis = list(title = '')))
tsne_3d <- tsne_3d %>%
add_markers()
tsne_3d