#library yang di butuhkan
# Data Wrangling
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.1
## Warning: package 'tidyr' was built under R version 4.5.1
## Warning: package 'dplyr' was built under R version 4.5.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(Hmisc)
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Machine Learning
## Clustering 
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.1
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.5.1
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## Clustering Visualization
library(Rtsne)
## Warning: package 'Rtsne' was built under R version 4.5.1
library(rgl)
## Warning: package 'rgl' was built under R version 4.5.1
library(plotly)
## Warning: package 'plotly' was built under R version 4.5.1
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:Hmisc':
## 
##     subplot
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
# Knit
library(knitr)
## Warning: package 'knitr' was built under R version 4.5.1
library(rgl)

#====================EKSPLORASI DATA==========================================
library(readxl)
DATA_NEW <- read_excel("DATA NEW.xlsx")
head(DATA_NEW)
## # A tibble: 6 × 7
##   Provinsi         Imunisasi  PAUD Sumber_Air_Minum_Layak Persentase_ASI_Ekskl…¹
##   <chr>                <dbl> <dbl>                  <dbl>                  <dbl>
## 1 Aceh                  25.9  30.6                   90.8                   67.8
## 2 Sumatera Utara        41.1  23.3                   92.9                   66.4
## 3 Sumatera Barat        39.8  28.1                   86.5                   76.4
## 4 Riau                  45.6  22.6                   92.2                   71.5
## 5 Jambi                 53.7  31.7                   82.2                   74.3
## 6 Sumatera Selatan      54.5  26.6                   87.2                   75.7
## # ℹ abbreviated name: ¹​Persentase_ASI_Eksklusif
## # ℹ 2 more variables: Prevalensi_Stunting <chr>,
## #   Prevalensi_Ketidakcukupan_Pangan <chr>
# Mengonversi Kolom Kategorikal
# Mengubah Kolom 1 (Provinsi) dari character menjadi factor
DATA_NEW$Provinsi <- as.factor(DATA_NEW$Provinsi)
cat("--- Struktur Data Setelah Pembersihan Akhir ---\n")
## --- Struktur Data Setelah Pembersihan Akhir ---
str(DATA_NEW)
## tibble [38 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Provinsi                        : Factor w/ 38 levels "Aceh","Bali",..: 1 38 36 30 8 37 4 19 17 18 ...
##  $ Imunisasi                       : num [1:38] 25.9 41.1 39.8 45.6 53.7 ...
##  $ PAUD                            : num [1:38] 30.6 23.3 28.1 22.6 31.7 ...
##  $ Sumber_Air_Minum_Layak          : num [1:38] 90.8 92.9 86.5 92.2 82.2 ...
##  $ Persentase_ASI_Eksklusif        : num [1:38] 67.8 66.4 76.4 71.5 74.3 ...
##  $ Prevalensi_Stunting             : chr [1:38] "sedang" "sedang" "sedang" "sedang" ...
##  $ Prevalensi_Ketidakcukupan_Pangan: chr [1:38] "rendah" "rendah" "rendah" "rendah" ...
#pendalaman data
summary(DATA_NEW)
##           Provinsi    Imunisasi          PAUD       Sumber_Air_Minum_Layak
##  Aceh         : 1   Min.   :18.41   Min.   : 3.80   Min.   :30.64         
##  Bali         : 1   1st Qu.:53.54   1st Qu.:26.67   1st Qu.:82.22         
##  Banten       : 1   Median :62.13   Median :31.63   Median :89.02         
##  Bengkulu     : 1   Mean   :60.46   Mean   :31.94   Mean   :87.01         
##  DI Yogyakarta: 1   3rd Qu.:71.45   3rd Qu.:36.48   3rd Qu.:94.65         
##  DKI Jakarta  : 1   Max.   :85.58   Max.   :65.67   Max.   :99.96         
##  (Other)      :32                                                         
##  Persentase_ASI_Eksklusif Prevalensi_Stunting Prevalensi_Ketidakcukupan_Pangan
##  Min.   :44.64            Length:38           Length:38                       
##  1st Qu.:65.61            Class :character    Class :character                
##  Median :72.42            Mode  :character    Mode  :character                
##  Mean   :70.49                                                                
##  3rd Qu.:76.44                                                                
##  Max.   :83.70                                                                
## 
table(DATA_NEW$Provinsi)
## 
##                 Aceh                 Bali               Banten 
##                    1                    1                    1 
##             Bengkulu        DI Yogyakarta          DKI Jakarta 
##                    1                    1                    1 
##            Gorontalo                Jambi           Jawa Barat 
##                    1                    1                    1 
##          Jawa Tengah           Jawa Timur     Kalimantan Barat 
##                    1                    1                    1 
##   Kalimantan Selatan    Kalimantan Tengah     Kalimantan Timur 
##                    1                    1                    1 
##     Kalimantan Utara Kep. Bangka Belitung            Kep. Riau 
##                    1                    1                    1 
##              Lampung               Maluku         Maluku Utara 
##                    1                    1                    1 
##                  NTB                  NTT                Papua 
##                    1                    1                    1 
##       Papua  Selatan          Papua Barat     Papua Barat Daya 
##                    1                    1                    1 
##     Papua Pegunungan         Papua Tengah                 Riau 
##                    1                    1                    1 
##       Sulawesi Barat     Sulawesi Selatan      Sulawesi Tengah 
##                    1                    1                    1 
##    Sulawesi Tenggara       Sulawesi Utara       Sumatera Barat 
##                    1                    1                    1 
##     Sumatera Selatan       Sumatera Utara 
##                    1                    1
# Memuat library yang dibutuhkan
library(tidyverse)

# Mengubah kolom Prevalensi Stunting dan Prevalensi Ketidakcukupan Pangan
# menjadi tipe data factor.
library(dplyr)
library(cluster) 
DATA_NEW_CLEAN <- DATA_NEW
DATA_NEW_CLEAN <- DATA_NEW_CLEAN %>%
  # Mengubah kolom Provinsi (karakter) menjadi Factor
  mutate(Provinsi = as.factor(Provinsi))

DATA_NEW <- DATA_NEW %>%
  mutate(`Prevalensi_Stunting` = as.factor(`Prevalensi_Stunting`),
         `Prevalensi_Ketidakcukupan_Pangan` = as.factor(`Prevalensi_Ketidakcukupan_Pangan`))

DATA_NEW %>% 
  select(Imunisasi, PAUD, Sumber_Air_Minum_Layak, Persentase_ASI_Eksklusif) %>% 
  hist.data.frame()

# Cek kembali struktur data untuk memverifikasi perubahan
cat("--- Struktur Data Setelah Konversi ke Factor ---\n")
## --- Struktur Data Setelah Konversi ke Factor ---
str(DATA_NEW)
## tibble [38 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Provinsi                        : Factor w/ 38 levels "Aceh","Bali",..: 1 38 36 30 8 37 4 19 17 18 ...
##  $ Imunisasi                       : num [1:38] 25.9 41.1 39.8 45.6 53.7 ...
##  $ PAUD                            : num [1:38] 30.6 23.3 28.1 22.6 31.7 ...
##  $ Sumber_Air_Minum_Layak          : num [1:38] 90.8 92.9 86.5 92.2 82.2 ...
##  $ Persentase_ASI_Eksklusif        : num [1:38] 67.8 66.4 76.4 71.5 74.3 ...
##  $ Prevalensi_Stunting             : Factor w/ 4 levels "rendah","sangat tinggi",..: 3 3 3 3 1 1 1 1 3 1 ...
##  $ Prevalensi_Ketidakcukupan_Pangan: Factor w/ 2 levels "rendah","sedang": 1 1 1 1 1 1 1 1 1 1 ...
#IMPLEMENTASI PAM/K-MEDOIDS (ada 3 langkah)
#Langka 1 (Perhitungan jarak)
library(cluster)
DATA_NEW_CLEAN <- DATA_NEW
DATA_NEW_gower <- daisy(x = DATA_NEW_CLEAN,
                        metric = "gower")

cat("\nPerhitungan jarak Gower berhasil diselesaikan!\n")
## 
## Perhitungan jarak Gower berhasil diselesaikan!
print(DATA_NEW_gower[1:5])
## [1] 0.2017115 0.2186205 0.2199256 0.3889251 0.3921714
#Sebagai pembuktian apakah metode Gower Distance memang bisa menempatkan data-data kita berdasarkan kemiripannya, kita bisa mencetak pasangan yang paling mirip dan berbeda dalam data kita dengan menggunakan fungsi di bawah ini.
#===========a).CONTOH PASANGAN PALING IDENTIK
example <- as.matrix(DATA_NEW_gower)

DATA_NEW[which(example == min(example[example != min(example)]),
               arr.ind = TRUE)[1, ], ]
## # A tibble: 2 × 7
##   Provinsi         Imunisasi  PAUD Sumber_Air_Minum_Layak Persentase_ASI_Ekskl…¹
##   <fct>                <dbl> <dbl>                  <dbl>                  <dbl>
## 1 Sulawesi Tengga…      72.4  32.4                   95.3                   66.4
## 2 Sulawesi Utara        68.3  29.6                   94.5                   65.0
## # ℹ abbreviated name: ¹​Persentase_ASI_Eksklusif
## # ℹ 2 more variables: Prevalensi_Stunting <fct>,
## #   Prevalensi_Ketidakcukupan_Pangan <fct>
#===========b).CONTOH PASANGAN PALING TIDAK IDENTIK
DATA_NEW[which(example == max(example[example != max(example)]),
               arr.ind = TRUE)[1, ], ]
## # A tibble: 2 × 7
##   Provinsi         Imunisasi  PAUD Sumber_Air_Minum_Layak Persentase_ASI_Ekskl…¹
##   <fct>                <dbl> <dbl>                  <dbl>                  <dbl>
## 1 Papua Pegunungan      18.4   3.8                   30.6                   82.2
## 2 Maluku Utara          53.5  37.1                   88.9                   65.6
## # ℹ abbreviated name: ¹​Persentase_ASI_Eksklusif
## # ℹ 2 more variables: Prevalensi_Stunting <fct>,
## #   Prevalensi_Ketidakcukupan_Pangan <fct>
#LANGKAH 2 (Penentuan Nilai Cluster(Kelompok))
#===========a).Elbow Method
set.seed(123)

fviz_nbclust(x = as.matrix(DATA_NEW_gower),
             FUNcluster = pam,
             method = "wss") +
  labs(subtitle = "Elbow Method")
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#===========b).Silhouette Method
set.seed(123)

fviz_nbclust(x = as.matrix(DATA_NEW_gower),
             FUNcluster = pam,
             method = "silhouette") +
  labs(subtitle = "Silhouette Method")

#===========c).Gap Statistic Method
set.seed(123)

fviz_nbclust(x = as.matrix(DATA_NEW_gower),
             FUNcluster = pam,
             method = "gap_stat") +
  labs(subtitle = "Gap Statistic Method")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#3).LANGKAH 3 (CLUSTERING)
pam_fit <- pam(x = DATA_NEW_gower, 
               k = 2)
DATA_NEW[pam_fit$medoids, ]
## # A tibble: 2 × 7
##   Provinsi        Imunisasi  PAUD Sumber_Air_Minum_Layak Persentase_ASI_Eksklu…¹
##   <fct>               <dbl> <dbl>                  <dbl>                   <dbl>
## 1 Sulawesi Tengah      61.6  36.5                   87.9                    67.3
## 2 Lampung              74.4  32.4                   84.5                    76.4
## # ℹ abbreviated name: ¹​Persentase_ASI_Eksklusif
## # ℹ 2 more variables: Prevalensi_Stunting <fct>,
## #   Prevalensi_Ketidakcukupan_Pangan <fct>
#=========a).METODE STATISTIKA DESKRIPTIF
pam_results <- DATA_NEW %>%
  mutate(cluster = pam_fit$clustering) %>%
  group_by(cluster) %>%
  do(the_summary = summary(.))
#KELOMPOK 1
pam_results$the_summary[[1]]
##                Provinsi    Imunisasi          PAUD       Sumber_Air_Minum_Layak
##  Aceh              : 1   Min.   :25.88   Min.   : 9.98   Min.   :71.90         
##  Banten            : 1   1st Qu.:50.66   1st Qu.:22.95   1st Qu.:82.25         
##  Gorontalo         : 1   Median :57.73   Median :30.64   Median :88.92         
##  Kalimantan Barat  : 1   Mean   :57.13   Mean   :29.78   Mean   :87.59         
##  Kalimantan Selatan: 1   3rd Qu.:69.50   3rd Qu.:34.39   3rd Qu.:93.13         
##  Kalimantan Tengah : 1   Max.   :73.39   Max.   :50.31   Max.   :96.13         
##  (Other)           :17                                                         
##  Persentase_ASI_Eksklusif    Prevalensi_Stunting
##  Min.   :44.64            rendah       : 0      
##  1st Qu.:63.60            sangat tinggi: 0      
##  Median :66.42            sedang       :21      
##  Mean   :67.04            tinggi       : 2      
##  3rd Qu.:72.42                                  
##  Max.   :83.70                                  
##                                                 
##  Prevalensi_Ketidakcukupan_Pangan    cluster 
##  rendah:21                        Min.   :1  
##  sedang: 2                        1st Qu.:1  
##                                   Median :1  
##                                   Mean   :1  
##                                   3rd Qu.:1  
##                                   Max.   :1  
## 
#KELOMPOK 2
pam_results$the_summary[[2]]
##           Provinsi   Imunisasi          PAUD       Sumber_Air_Minum_Layak
##  Bali         :1   Min.   :18.41   Min.   : 3.80   Min.   :30.64         
##  Bengkulu     :1   1st Qu.:58.26   1st Qu.:29.12   1st Qu.:83.33         
##  DI Yogyakarta:1   Median :70.99   Median :32.42   Median :89.42         
##  DKI Jakarta  :1   Mean   :65.56   Mean   :35.26   Mean   :86.13         
##  Jambi        :1   3rd Qu.:74.83   3rd Qu.:41.17   3rd Qu.:96.17         
##  Jawa Barat   :1   Max.   :85.58   Max.   :65.67   Max.   :99.96         
##  (Other)      :9                                                         
##  Persentase_ASI_Eksklusif    Prevalensi_Stunting
##  Min.   :63.58            rendah       :12      
##  1st Qu.:73.83            sangat tinggi: 1      
##  Median :76.40            sedang       : 0      
##  Mean   :75.79            tinggi       : 2      
##  3rd Qu.:79.90                                  
##  Max.   :82.25                                  
##                                                 
##  Prevalensi_Ketidakcukupan_Pangan    cluster 
##  rendah:14                        Min.   :2  
##  sedang: 1                        1st Qu.:2  
##                                   Median :2  
##                                   Mean   :2  
##                                   3rd Qu.:2  
##                                   Max.   :2  
## 
#=========b).METODE VISUALISASI
library(Rtsne)
set.seed(123)
tsne_obj <- Rtsne(X = DATA_NEW_gower, 
                  is_distance = TRUE,
                  perplexity = 10) # <-- Solusi untuk error perplexity
K <- 3 
# 3. Buat dataframe untuk visualisasi (Asumsi tsne_obj dan pam_fit sudah ada)
tsne_data <- tsne_obj$Y %>%
  data.frame() %>%
  setNames(c("X", "Y")) %>%
  mutate(Provinsi = DATA_NEW$Provinsi,
         cluster = factor(pam_fit$clustering)) 
# 4. Plot Visualisasi
ggplot(aes(x = X, y = Y), data = tsne_data) +
  geom_point(aes(color = cluster), size = 3) + 
  geom_text(aes(label = Provinsi), check_overlap = TRUE, vjust = -1) + 
  labs(title = paste("Visualisasi Clustering 2D t-SNE (k =", K, ")"), # Sekarang K sudah ditemukan
       color = "Klaster") + 
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        axis.title = element_blank())

#CLUSTERING DENGAN MENGGUNAKN PLOT 3 DIMENSI
#Selain menampilkan hasil pengelompokan dengan menggunakan plot secara 2 dimensi, kita juga dapat menampilkan hasil clustering dengan menggunakan plot 3 dimensi.
# PENTING: Pastikan library Rtsne, dplyr, dan cluster sudah dimuat
library(Rtsne)
library(dplyr)
library(cluster)

set.seed(123)
# 1. Perbaiki Rtsne 3D dengan menambahkan parameter perplexity
# Gunakan perplexity = 10 (nilai aman untuk 38 sampel)
tsne_obj_3d <- Rtsne(X = DATA_NEW_gower, 
                     is_distance = TRUE,
                     dims = 3, # Menentukan output 3 dimensi
                     perplexity = 10) # <-- Solusi untuk error perplexity
# 2. Buat dataframe untuk visualisasi (Sekarang objek tsne_obj_3d sudah berhasil dibuat)
tsne_data_3d <- tsne_obj_3d$Y %>%
  data.frame() %>%
  setNames(c("X", "Y", "Z")) %>%
  mutate(Provinsi = DATA_NEW$Provinsi, # Tambahkan label Provinsi
         cluster = factor(pam_fit$clustering),
         Observasi = row_number())

head(tsne_data_3d)
##           X           Y         Z         Provinsi cluster Observasi
## 1 -50.86054 -10.9773675   0.72540             Aceh       1         1
## 2 -55.29086  -9.8285991  17.16906   Sumatera Utara       1         2
## 3 -35.84983  -0.4365006  11.65410   Sumatera Barat       1         3
## 4 -54.31266   4.8162107  16.67430             Riau       1         4
## 5  43.61005   5.5001702 -82.36551            Jambi       2         5
## 6  38.87114  -6.2920433 -73.70671 Sumatera Selatan       2         6
#VISUALISASI DATA 3D
tsne_3d <- plot_ly(tsne_data_3d,
                   x = ~X,
                   y = ~Y,
                   z = ~Z,
                   color = ~cluster,
                   colors = c('#BF382A','#0C4B8E'),
                   size = 3,
                   text = ~paste("Product No:", Observasi),
                   hoverinfo = 'text') %>% 
  layout(title = "Visualisasi Clustering 3D",
         scene = list(xaxis = list(title = ''),
                      yaxis = list(title = ''),
                      zaxis = list(title = '')))

tsne_3d <- tsne_3d %>% 
  add_markers() 

tsne_3d