Load Packages

library(tidyverse)
library(cluster)
library(factoextra)
library(fpc)
library(e1071)
library(fclust)
library(corrplot)
library(ggplot2)
library(NbClust)
library(clusterSim)
library(ggrepel)
library(dbscan)

Baca Data

# Load the source CSV into a data frame.
# NOTE(review): the path is absolute and machine-specific — confirm it exists
# (or swap in a project-relative path) before running on another machine.
data <- read.csv(
  file = 'D:/Semester 4/ANMUL/Kelompok 8/data jatim 2023.csv'
)
# Auto-print the whole table for a visual check.
data
##      Kabupaten.Kota Penduduk.Miskin..persen. PDRB..miliar.rupiah. AHH..tahun.
## 1           Pacitan                    13.65             12244.97       72.86
## 2          Ponorogo                     9.53             15870.05       73.55
## 3        Trenggalek                    10.63             14212.06       74.64
## 4       Tulungagung                     6.53             30234.61       74.91
## 5            Blitar                     8.69             28239.86       74.34
## 6            Kediri                    10.72             32195.50       73.27
## 7            Malang                     9.45             75744.29       73.26
## 8          Lumajang                     8.93             24808.35       70.96
## 9            Jember                     9.51             59984.00       70.03
## 10       Banyuwangi                     7.34             60848.35       71.38
## 11        Bondowoso                    13.34             15075.62       67.60
## 12        Situbondo                    11.90             15018.99       69.94
## 13      Probolinggo                    17.19             25904.93       68.12
## 14         Pasuruan                     9.24            119252.55       70.81
## 15         Sidoarjo                     5.00            160950.78       74.69
## 16        Mojokerto                     9.80             66982.68       73.25
## 17          Jombang                     9.15             31602.77       73.22
## 18          Nganjuk                    10.89             20598.57       72.28
## 19           Madiun                    11.04             14895.81       72.28
## 20          Magetan                     9.80             14562.68       73.29
## 21            Ngawi                    14.40             14904.51       73.20
## 22       Bojonegoro                    12.18             63310.69       72.57
## 23            Tuban                    14.91             49984.23       72.36
## 24         Lamongan                    12.42             30709.18       73.22
## 25           Gresik                    10.96            113825.43       73.30
## 26        Bangkalan                    19.35             17164.20       70.79
## 27          Sampang                    21.76             14674.11       68.64
## 28        Pamekasan                    13.85             12628.69       68.31
## 29          Sumenep                    18.70             26244.79       72.47
## 30      Kota Kediri                     7.15             91631.35       74.67
## 31      Kota Blitar                     7.30              5455.81       74.66
## 32      Kota Malang                     4.26             60119.82       74.13
## 33 Kota Probolinggo                     6.48              9408.51       70.99
## 34    Kota Pasuruan                     6.60              6637.08       72.31
## 35   Kota Mojokerto                     5.77              5399.62       74.10
## 36      Kota Madiun                     4.74             11764.40       73.44
## 37    Kota Surabaya                     4.65            459030.72       74.75
## 38        Kota Batu                     3.31             12936.60       73.29
##    Lama.Sekolah..tahun. Pengeluaran.Perkapita..ribu.rupiah.
## 1                  7.88                                9681
## 2                  7.78                               10658
## 3                  7.90                               10465
## 4                  8.66                               11565
## 5                  7.83                               11499
## 6                  8.24                               11952
## 7                  7.75                               10791
## 8                  7.14                                9720
## 9                  6.52                               10277
## 10                 7.76                               12820
## 11                 6.36                               11255
## 12                 6.90                               10702
## 13                 6.29                               11756
## 14                 7.44                               11239
## 15                10.78                               15311
## 16                 9.11                               13467
## 17                 8.77                               11999
## 18                 8.24                               12821
## 19                 7.95                               12259
## 20                 8.67                               12495
## 21                 7.78                               11897
## 22                 7.45                               10776
## 23                 7.40                               11174
## 24                 8.34                               12019
## 25                10.01                               13870
## 26                 5.99                                9438
## 27                 5.07                                9363
## 28                 7.15                                9420
## 29                 5.94                                9807
## 30                10.69                               13276
## 31                10.78                               14548
## 32                10.94                               17222
## 33                 9.56                               12999
## 34                 9.78                               14250
## 35                11.05                               14422
## 36                11.82                               17115
## 37                10.70                               18977
## 38                 9.85                               13603
##    Tingkat.Pengangguran.TPT..persen. Akses.Terhadap.Sumber.Air.Minum.Layak
## 1                               1.83                                 80.19
## 2                               4.66                                 94.42
## 3                               4.52                                 84.87
## 4                               5.65                                 97.90
## 5                               4.91                                 96.41
## 6                               5.79                                 92.00
## 7                               5.70                                 98.18
## 8                               3.67                                 95.49
## 9                               4.01                                 97.51
## 10                              4.75                                 97.44
## 11                              4.15                                 94.43
## 12                              3.27                                 97.45
## 13                              3.24                                 97.75
## 14                              5.48                                 97.63
## 15                              8.05                                 97.19
## 16                              4.67                                 98.90
## 17                              4.66                                 98.01
## 18                              4.68                                 98.86
## 19                              5.14                                 96.39
## 20                              4.16                                 99.61
## 21                              2.41                                 97.51
## 22                              4.63                                 97.59
## 23                              4.40                                 95.22
## 24                              5.46                                 79.26
## 25                              6.82                                 92.54
## 26                              6.18                                 96.30
## 27                              2.72                                 94.29
## 28                              1.74                                 97.91
## 29                              1.71                                 98.10
## 30                              4.06                                 99.60
## 31                              5.24                                 98.79
## 32                              6.80                                 99.06
## 33                              4.53                                100.00
## 34                              5.64                                 99.38
## 35                              4.73                                 99.95
## 36                              5.85                                 99.10
## 37                              6.76                                 98.15
## 38                              4.52                                 99.16
##      IPM
## 1  70.19
## 2  72.50
## 3  71.73
## 4  74.61
## 5  72.49
## 6  73.96
## 7  72.16
## 8  67.87
## 9  68.64
## 10 72.61
## 11 67.99
## 12 69.16
## 13 67.79
## 14 70.29
## 15 81.55
## 16 75.53
## 17 74.60
## 18 73.71
## 19 72.97
## 20 75.41
## 21 72.47
## 22 70.85
## 23 70.34
## 24 74.53
## 25 77.98
## 26 65.75
## 27 64.13
## 28 67.96
## 29 68.61
## 30 80.44
## 31 80.63
## 32 83.39
## 33 75.43
## 34 77.17
## 35 80.07
## 36 82.71
## 37 83.45
## 38 78.18

EDA

# Compact overview: row/column counts plus the type and first values of each column.
glimpse(data)
## Rows: 38
## Columns: 9
## $ Kabupaten.Kota                        <chr> "Pacitan", "Ponorogo", "Trenggal…
## $ Penduduk.Miskin..persen.              <dbl> 13.65, 9.53, 10.63, 6.53, 8.69, …
## $ PDRB..miliar.rupiah.                  <dbl> 12244.97, 15870.05, 14212.06, 30…
## $ AHH..tahun.                           <dbl> 72.86, 73.55, 74.64, 74.91, 74.3…
## $ Lama.Sekolah..tahun.                  <dbl> 7.88, 7.78, 7.90, 8.66, 7.83, 8.…
## $ Pengeluaran.Perkapita..ribu.rupiah.   <int> 9681, 10658, 10465, 11565, 11499…
## $ Tingkat.Pengangguran.TPT..persen.     <dbl> 1.83, 4.66, 4.52, 5.65, 4.91, 5.…
## $ Akses.Terhadap.Sumber.Air.Minum.Layak <dbl> 80.19, 94.42, 84.87, 97.90, 96.4…
## $ IPM                                   <dbl> 70.19, 72.50, 71.73, 74.61, 72.4…
# Base-R structure dump of the same data frame (class, dimensions, column types).
str(data)
## 'data.frame':    38 obs. of  9 variables:
##  $ Kabupaten.Kota                       : chr  "Pacitan" "Ponorogo" "Trenggalek" "Tulungagung" ...
##  $ Penduduk.Miskin..persen.             : num  13.65 9.53 10.63 6.53 8.69 ...
##  $ PDRB..miliar.rupiah.                 : num  12245 15870 14212 30235 28240 ...
##  $ AHH..tahun.                          : num  72.9 73.5 74.6 74.9 74.3 ...
##  $ Lama.Sekolah..tahun.                 : num  7.88 7.78 7.9 8.66 7.83 8.24 7.75 7.14 6.52 7.76 ...
##  $ Pengeluaran.Perkapita..ribu.rupiah.  : int  9681 10658 10465 11565 11499 11952 10791 9720 10277 12820 ...
##  $ Tingkat.Pengangguran.TPT..persen.    : num  1.83 4.66 4.52 5.65 4.91 5.79 5.7 3.67 4.01 4.75 ...
##  $ Akses.Terhadap.Sumber.Air.Minum.Layak: num  80.2 94.4 84.9 97.9 96.4 ...
##  $ IPM                                  : num  70.2 72.5 71.7 74.6 72.5 ...
# Per-column summary: five-number summary and mean for numeric columns,
# length/class/mode for the character column.
summary(data)
##  Kabupaten.Kota     Penduduk.Miskin..persen. PDRB..miliar.rupiah.
##  Length:38          Min.   : 3.310           Min.   :  5400      
##  Class :character   1st Qu.: 7.188           1st Qu.: 14590      
##  Mode  :character   Median : 9.665           Median : 25357      
##                     Mean   :10.293           Mean   : 48554      
##                     3rd Qu.:12.360           3rd Qu.: 60086      
##                     Max.   :21.760           Max.   :459031      
##   AHH..tahun.    Lama.Sekolah..tahun. Pengeluaran.Perkapita..ribu.rupiah.
##  Min.   :67.60   Min.   : 5.070       Min.   : 9363                      
##  1st Qu.:71.09   1st Qu.: 7.410       1st Qu.:10720                      
##  Median :73.21   Median : 7.925       Median :11924                      
##  Mean   :72.42   Mean   : 8.376       Mean   :12287                      
##  3rd Qu.:73.52   3rd Qu.: 9.725       3rd Qu.:13419                      
##  Max.   :74.91   Max.   :11.820       Max.   :18977                      
##  Tingkat.Pengangguran.TPT..persen. Akses.Terhadap.Sumber.Air.Minum.Layak
##  Min.   :1.710                     Min.   : 79.26                       
##  1st Qu.:4.082                     1st Qu.: 95.69                       
##  Median :4.665                     Median : 97.61                       
##  Mean   :4.663                     Mean   : 96.12                       
##  3rd Qu.:5.600                     3rd Qu.: 98.84                       
##  Max.   :8.050                     Max.   :100.00                       
##       IPM       
##  Min.   :64.13  
##  1st Qu.:70.22  
##  Median :72.79  
##  Mean   :73.68  
##  3rd Qu.:76.76  
##  Max.   :83.45

Pre-Processing

Cek missing value

# Count missing values (NA) in every column; a result of all zeros means the data is complete.
colSums(is.na(data))
##                        Kabupaten.Kota              Penduduk.Miskin..persen. 
##                                     0                                     0 
##                  PDRB..miliar.rupiah.                           AHH..tahun. 
##                                     0                                     0 
##                  Lama.Sekolah..tahun.   Pengeluaran.Perkapita..ribu.rupiah. 
##                                     0                                     0 
##     Tingkat.Pengangguran.TPT..persen. Akses.Terhadap.Sumber.Air.Minum.Layak 
##                                     0                                     0 
##                                   IPM 
##                                     0

Cek data duplikat

# Count rows that are exact duplicates of an earlier row (0 means no duplicates).
sum(duplicated(data))
## [1] 0

Mengambil data numerik

# Keep only the numeric columns (drops the character label column).
# vapply() pins the column mask to a logical vector regardless of input, and
# drop = FALSE keeps the result a data frame even if only a single numeric
# column were present (plain `[ , mask]` would collapse it to a bare vector).
data_num <- data[, vapply(data, is.numeric, logical(1)), drop = FALSE]
# Print the numeric-only table for a visual check.
data_num
##    Penduduk.Miskin..persen. PDRB..miliar.rupiah. AHH..tahun.
## 1                     13.65             12244.97       72.86
## 2                      9.53             15870.05       73.55
## 3                     10.63             14212.06       74.64
## 4                      6.53             30234.61       74.91
## 5                      8.69             28239.86       74.34
## 6                     10.72             32195.50       73.27
## 7                      9.45             75744.29       73.26
## 8                      8.93             24808.35       70.96
## 9                      9.51             59984.00       70.03
## 10                     7.34             60848.35       71.38
## 11                    13.34             15075.62       67.60
## 12                    11.90             15018.99       69.94
## 13                    17.19             25904.93       68.12
## 14                     9.24            119252.55       70.81
## 15                     5.00            160950.78       74.69
## 16                     9.80             66982.68       73.25
## 17                     9.15             31602.77       73.22
## 18                    10.89             20598.57       72.28
## 19                    11.04             14895.81       72.28
## 20                     9.80             14562.68       73.29
## 21                    14.40             14904.51       73.20
## 22                    12.18             63310.69       72.57
## 23                    14.91             49984.23       72.36
## 24                    12.42             30709.18       73.22
## 25                    10.96            113825.43       73.30
## 26                    19.35             17164.20       70.79
## 27                    21.76             14674.11       68.64
## 28                    13.85             12628.69       68.31
## 29                    18.70             26244.79       72.47
## 30                     7.15             91631.35       74.67
## 31                     7.30              5455.81       74.66
## 32                     4.26             60119.82       74.13
## 33                     6.48              9408.51       70.99
## 34                     6.60              6637.08       72.31
## 35                     5.77              5399.62       74.10
## 36                     4.74             11764.40       73.44
## 37                     4.65            459030.72       74.75
## 38                     3.31             12936.60       73.29
##    Lama.Sekolah..tahun. Pengeluaran.Perkapita..ribu.rupiah.
## 1                  7.88                                9681
## 2                  7.78                               10658
## 3                  7.90                               10465
## 4                  8.66                               11565
## 5                  7.83                               11499
## 6                  8.24                               11952
## 7                  7.75                               10791
## 8                  7.14                                9720
## 9                  6.52                               10277
## 10                 7.76                               12820
## 11                 6.36                               11255
## 12                 6.90                               10702
## 13                 6.29                               11756
## 14                 7.44                               11239
## 15                10.78                               15311
## 16                 9.11                               13467
## 17                 8.77                               11999
## 18                 8.24                               12821
## 19                 7.95                               12259
## 20                 8.67                               12495
## 21                 7.78                               11897
## 22                 7.45                               10776
## 23                 7.40                               11174
## 24                 8.34                               12019
## 25                10.01                               13870
## 26                 5.99                                9438
## 27                 5.07                                9363
## 28                 7.15                                9420
## 29                 5.94                                9807
## 30                10.69                               13276
## 31                10.78                               14548
## 32                10.94                               17222
## 33                 9.56                               12999
## 34                 9.78                               14250
## 35                11.05                               14422
## 36                11.82                               17115
## 37                10.70                               18977
## 38                 9.85                               13603
##    Tingkat.Pengangguran.TPT..persen. Akses.Terhadap.Sumber.Air.Minum.Layak
## 1                               1.83                                 80.19
## 2                               4.66                                 94.42
## 3                               4.52                                 84.87
## 4                               5.65                                 97.90
## 5                               4.91                                 96.41
## 6                               5.79                                 92.00
## 7                               5.70                                 98.18
## 8                               3.67                                 95.49
## 9                               4.01                                 97.51
## 10                              4.75                                 97.44
## 11                              4.15                                 94.43
## 12                              3.27                                 97.45
## 13                              3.24                                 97.75
## 14                              5.48                                 97.63
## 15                              8.05                                 97.19
## 16                              4.67                                 98.90
## 17                              4.66                                 98.01
## 18                              4.68                                 98.86
## 19                              5.14                                 96.39
## 20                              4.16                                 99.61
## 21                              2.41                                 97.51
## 22                              4.63                                 97.59
## 23                              4.40                                 95.22
## 24                              5.46                                 79.26
## 25                              6.82                                 92.54
## 26                              6.18                                 96.30
## 27                              2.72                                 94.29
## 28                              1.74                                 97.91
## 29                              1.71                                 98.10
## 30                              4.06                                 99.60
## 31                              5.24                                 98.79
## 32                              6.80                                 99.06
## 33                              4.53                                100.00
## 34                              5.64                                 99.38
## 35                              4.73                                 99.95
## 36                              5.85                                 99.10
## 37                              6.76                                 98.15
## 38                              4.52                                 99.16
##      IPM
## 1  70.19
## 2  72.50
## 3  71.73
## 4  74.61
## 5  72.49
## 6  73.96
## 7  72.16
## 8  67.87
## 9  68.64
## 10 72.61
## 11 67.99
## 12 69.16
## 13 67.79
## 14 70.29
## 15 81.55
## 16 75.53
## 17 74.60
## 18 73.71
## 19 72.97
## 20 75.41
## 21 72.47
## 22 70.85
## 23 70.34
## 24 74.53
## 25 77.98
## 26 65.75
## 27 64.13
## 28 67.96
## 29 68.61
## 30 80.44
## 31 80.63
## 32 83.39
## 33 75.43
## 34 77.17
## 35 80.07
## 36 82.71
## 37 83.45
## 38 78.18

Statistika deskriptif

# Descriptive statistics (min, quartiles, median, mean, max) for each numeric column.
summary(data_num)
##  Penduduk.Miskin..persen. PDRB..miliar.rupiah.  AHH..tahun.   
##  Min.   : 3.310           Min.   :  5400       Min.   :67.60  
##  1st Qu.: 7.188           1st Qu.: 14590       1st Qu.:71.09  
##  Median : 9.665           Median : 25357       Median :73.21  
##  Mean   :10.293           Mean   : 48554       Mean   :72.42  
##  3rd Qu.:12.360           3rd Qu.: 60086       3rd Qu.:73.52  
##  Max.   :21.760           Max.   :459031       Max.   :74.91  
##  Lama.Sekolah..tahun. Pengeluaran.Perkapita..ribu.rupiah.
##  Min.   : 5.070       Min.   : 9363                      
##  1st Qu.: 7.410       1st Qu.:10720                      
##  Median : 7.925       Median :11924                      
##  Mean   : 8.376       Mean   :12287                      
##  3rd Qu.: 9.725       3rd Qu.:13419                      
##  Max.   :11.820       Max.   :18977                      
##  Tingkat.Pengangguran.TPT..persen. Akses.Terhadap.Sumber.Air.Minum.Layak
##  Min.   :1.710                     Min.   : 79.26                       
##  1st Qu.:4.082                     1st Qu.: 95.69                       
##  Median :4.665                     Median : 97.61                       
##  Mean   :4.663                     Mean   : 96.12                       
##  3rd Qu.:5.600                     3rd Qu.: 98.84                       
##  Max.   :8.050                     Max.   :100.00                       
##       IPM       
##  Min.   :64.13  
##  1st Qu.:70.22  
##  Median :72.79  
##  Mean   :73.68  
##  3rd Qu.:76.76  
##  Max.   :83.45

Korelasi antar variabel dengan menggunakan biplot

# Run PCA on the numeric columns. scale. = TRUE standardises every variable to
# unit variance before the decomposition, which matters here because the
# columns are on very different scales.
pca_result <- prcomp(data_num, scale. = TRUE)

# Biplot showing observations and variable vectors on the first two components.
# factoextra is already attached with the other packages at the top of the
# file, so it is not loaded a second time here.
fviz_pca_biplot(
  pca_result,
  repel = TRUE,     # keep labels from overlapping each other
  col.var = "red",  # colour of the variable vectors
  col.ind = "blue"  # colour of the individual (observation) points
)

Mengubah ke dataframe

# Coerce data_num to a base data frame.
# NOTE(review): `data` is reported as a 'data.frame' by str(), and data_num is a
# column subset of it, so this is presumably a no-op safeguard — confirm and
# consider removing.
data_num <- as.data.frame(data_num)
# Print the table to confirm its contents.
data_num
##    Penduduk.Miskin..persen. PDRB..miliar.rupiah. AHH..tahun.
## 1                     13.65             12244.97       72.86
## 2                      9.53             15870.05       73.55
## 3                     10.63             14212.06       74.64
## 4                      6.53             30234.61       74.91
## 5                      8.69             28239.86       74.34
## 6                     10.72             32195.50       73.27
## 7                      9.45             75744.29       73.26
## 8                      8.93             24808.35       70.96
## 9                      9.51             59984.00       70.03
## 10                     7.34             60848.35       71.38
## 11                    13.34             15075.62       67.60
## 12                    11.90             15018.99       69.94
## 13                    17.19             25904.93       68.12
## 14                     9.24            119252.55       70.81
## 15                     5.00            160950.78       74.69
## 16                     9.80             66982.68       73.25
## 17                     9.15             31602.77       73.22
## 18                    10.89             20598.57       72.28
## 19                    11.04             14895.81       72.28
## 20                     9.80             14562.68       73.29
## 21                    14.40             14904.51       73.20
## 22                    12.18             63310.69       72.57
## 23                    14.91             49984.23       72.36
## 24                    12.42             30709.18       73.22
## 25                    10.96            113825.43       73.30
## 26                    19.35             17164.20       70.79
## 27                    21.76             14674.11       68.64
## 28                    13.85             12628.69       68.31
## 29                    18.70             26244.79       72.47
## 30                     7.15             91631.35       74.67
## 31                     7.30              5455.81       74.66
## 32                     4.26             60119.82       74.13
## 33                     6.48              9408.51       70.99
## 34                     6.60              6637.08       72.31
## 35                     5.77              5399.62       74.10
## 36                     4.74             11764.40       73.44
## 37                     4.65            459030.72       74.75
## 38                     3.31             12936.60       73.29
##    Lama.Sekolah..tahun. Pengeluaran.Perkapita..ribu.rupiah.
## 1                  7.88                                9681
## 2                  7.78                               10658
## 3                  7.90                               10465
## 4                  8.66                               11565
## 5                  7.83                               11499
## 6                  8.24                               11952
## 7                  7.75                               10791
## 8                  7.14                                9720
## 9                  6.52                               10277
## 10                 7.76                               12820
## 11                 6.36                               11255
## 12                 6.90                               10702
## 13                 6.29                               11756
## 14                 7.44                               11239
## 15                10.78                               15311
## 16                 9.11                               13467
## 17                 8.77                               11999
## 18                 8.24                               12821
## 19                 7.95                               12259
## 20                 8.67                               12495
## 21                 7.78                               11897
## 22                 7.45                               10776
## 23                 7.40                               11174
## 24                 8.34                               12019
## 25                10.01                               13870
## 26                 5.99                                9438
## 27                 5.07                                9363
## 28                 7.15                                9420
## 29                 5.94                                9807
## 30                10.69                               13276
## 31                10.78                               14548
## 32                10.94                               17222
## 33                 9.56                               12999
## 34                 9.78                               14250
## 35                11.05                               14422
## 36                11.82                               17115
## 37                10.70                               18977
## 38                 9.85                               13603
##    Tingkat.Pengangguran.TPT..persen. Akses.Terhadap.Sumber.Air.Minum.Layak
## 1                               1.83                                 80.19
## 2                               4.66                                 94.42
## 3                               4.52                                 84.87
## 4                               5.65                                 97.90
## 5                               4.91                                 96.41
## 6                               5.79                                 92.00
## 7                               5.70                                 98.18
## 8                               3.67                                 95.49
## 9                               4.01                                 97.51
## 10                              4.75                                 97.44
## 11                              4.15                                 94.43
## 12                              3.27                                 97.45
## 13                              3.24                                 97.75
## 14                              5.48                                 97.63
## 15                              8.05                                 97.19
## 16                              4.67                                 98.90
## 17                              4.66                                 98.01
## 18                              4.68                                 98.86
## 19                              5.14                                 96.39
## 20                              4.16                                 99.61
## 21                              2.41                                 97.51
## 22                              4.63                                 97.59
## 23                              4.40                                 95.22
## 24                              5.46                                 79.26
## 25                              6.82                                 92.54
## 26                              6.18                                 96.30
## 27                              2.72                                 94.29
## 28                              1.74                                 97.91
## 29                              1.71                                 98.10
## 30                              4.06                                 99.60
## 31                              5.24                                 98.79
## 32                              6.80                                 99.06
## 33                              4.53                                100.00
## 34                              5.64                                 99.38
## 35                              4.73                                 99.95
## 36                              5.85                                 99.10
## 37                              6.76                                 98.15
## 38                              4.52                                 99.16
##      IPM
## 1  70.19
## 2  72.50
## 3  71.73
## 4  74.61
## 5  72.49
## 6  73.96
## 7  72.16
## 8  67.87
## 9  68.64
## 10 72.61
## 11 67.99
## 12 69.16
## 13 67.79
## 14 70.29
## 15 81.55
## 16 75.53
## 17 74.60
## 18 73.71
## 19 72.97
## 20 75.41
## 21 72.47
## 22 70.85
## 23 70.34
## 24 74.53
## 25 77.98
## 26 65.75
## 27 64.13
## 28 67.96
## 29 68.61
## 30 80.44
## 31 80.63
## 32 83.39
## 33 75.43
## 34 77.17
## 35 80.07
## 36 82.71
## 37 83.45
## 38 78.18

Distribusi masing-masing variabel numerik

# Histogram of every numeric feature: reshape to long format (one row per
# feature/value pair) and draw one panel per feature with free axis scales.
data_num %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  ggplot(mapping = aes(value)) +
  geom_histogram(bins = 30, fill = "steelblue", colour = "white") +
  facet_wrap(vars(variable), scales = "free") +
  ggtitle("Distribusi Fitur Numerik") +
  theme_minimal()

Handling outliers

# Count the outliers in each numeric column using the 1.5 * IQR fences:
# a value is an outlier when it lies below Q1 - 1.5 * IQR or above
# Q3 + 1.5 * IQR. Missing values are ignored.
# vapply() guarantees one integer count per column, and the local names no
# longer shadow stats::IQR().
outlier_counts <- vapply(
  data_num,
  function(x) {
    quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
    fence_width <- 1.5 * (quartiles[2] - quartiles[1])
    sum(
      x < (quartiles[1] - fence_width) | x > (quartiles[2] + fence_width),
      na.rm = TRUE
    )
  },
  integer(1)
)

# Show the number of outliers per column
outlier_counts
##              Penduduk.Miskin..persen.                  PDRB..miliar.rupiah. 
##                                     1                                     2 
##                           AHH..tahun.                  Lama.Sekolah..tahun. 
##                                     0                                     0 
##   Pengeluaran.Perkapita..ribu.rupiah.     Tingkat.Pengangguran.TPT..persen. 
##                                     1                                     3 
## Akses.Terhadap.Sumber.Air.Minum.Layak                                   IPM 
##                                     3                                     0
# Boxplots of the standardized features BEFORE outlier handling
as.data.frame(scale(data_num)) %>%
  pivot_longer(cols = everything(), names_to = "Fitur", values_to = "Nilai") %>%
  ggplot(mapping = aes(x = Fitur, y = Nilai)) +
  geom_boxplot(fill = "skyblue") +
  coord_flip() +
  labs(title = "Boxplot Sebelum Handling Outliers") +
  theme_minimal()

# handling outliers
# Cap (winsorize) a numeric vector at its 1.5 * IQR fences.
#
# x: numeric vector; NA values are ignored when computing the quartiles and
#    are left as NA in the result.
# Returns x with every value below Q1 - 1.5 * IQR raised to that fence and
# every value above Q3 + 1.5 * IQR lowered to that fence.
handle_outliers_iqr <- function(x) {
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  fence_width <- 1.5 * (quartiles[2] - quartiles[1])
  floor_value <- quartiles[1] - fence_width
  ceiling_value <- quartiles[2] + fence_width

  x <- replace(x, x < floor_value, floor_value)
  replace(x, x > ceiling_value, ceiling_value)
}

# Cap the outliers of every numeric column at its IQR fences.
data_winsor <- mutate(data_num, across(everything(), handle_outliers_iqr))

# Standardize the capped data with scale() and turn the result back into a
# data frame that carries the original column names.
data_scaled <- as.data.frame(scale(data_winsor))
names(data_scaled) <- names(data_num)

# Boxplots of the standardized features AFTER outlier handling
pivot_longer(
  data_scaled,
  cols = everything(),
  names_to = "Fitur",
  values_to = "Nilai"
) %>%
  ggplot(mapping = aes(x = Fitur, y = Nilai)) +
  geom_boxplot(fill = "lightcoral") +
  coord_flip() +
  labs(title = "Boxplot Setelah Handling Outliers") +
  theme_minimal()

Implementasi Algoritma

K-Means

Menentukan jumlah cluster

Silhouette method

library(cluster)

# Average silhouette width of the K-means solution for K = 2..5.
k_values <- 2:5

# The pairwise distances do not depend on K, so compute them only once.
point_dist <- dist(data_scaled)

avg_silhouette <- vapply(k_values, function(k) {
  # kmeans() starts from random centers; fix the seed (same value as the ICD
  # loop) so the scores are reproducible between runs.
  set.seed(123)
  km <- kmeans(data_scaled, centers = k, nstart = 25)
  mean(silhouette(km$cluster, point_dist)[, "sil_width"])
}, numeric(1))

# One row per K, built in a single step instead of growing with rbind()
silhouette_scores <- data.frame(K = k_values, Silhouette = avg_silhouette)

print(silhouette_scores)
##   K Silhouette
## 1 2  0.3046194
## 2 3  0.2570559
## 3 4  0.2573574
## 4 5  0.2645056

Calinski-Harabasz Index

library(NbClust)

# Choose the number of K-means clusters (between 2 and 5) with the
# Calinski-Harabasz index
set.seed(123)
ch_result <- NbClust(
  data = data_scaled,
  method = "kmeans",
  index = "ch",
  distance = "euclidean",
  min.nc = 2,
  max.nc = 5
)

# Show the index per K, the best number of clusters and the best partition
ch_result
## $All.index
##       2       3       4       5 
## 23.9056 20.8596 18.0180 18.3573 
## 
## $Best.nc
## Number_clusters     Value_Index 
##          2.0000         23.9056 
## 
## $Best.partition
##  [1] 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 1 1 2 2 1 2 2 2 2 1 2 2 2 2 1 1 1 1 1 1 1 1 1

ICD rate

# Fungsi untuk hitung intra-cluster distance
# Mean within-cluster squared distance.
#
# data:     numeric matrix or data frame, one observation per row.
# clusters: vector of cluster labels, one per row of `data`.
# Returns the sum over all clusters of the squared Euclidean distances between
# each observation and its cluster mean, divided by the number of rows.
intra_dist <- function(data, clusters) {
  data <- as.matrix(data)
  total_ss <- 0
  for (k in unique(clusters)) {
    # drop = FALSE keeps a one-member cluster as a 1-row matrix
    cluster_points <- data[clusters == k, , drop = FALSE]
    center <- colMeans(cluster_points)
    # Subtract each column's mean from that column. A plain
    # `cluster_points - center` recycles `center` down the columns
    # (column-major order) and therefore subtracts the wrong values.
    deviations <- sweep(cluster_points, 2, center)
    total_ss <- total_ss + sum(deviations^2)
  }
  total_ss / nrow(data)
}

# Fungsi untuk hitung rata-rata jarak antar-centroid
# Mean pairwise distance (dist() defaults) between the rows of `centers`,
# where each row is one cluster centroid.
inter_dist <- function(centers) {
  mean(dist(centers))
}

# ICD rate for K = 2..5: intra-cluster distance divided by the mean distance
# between the K-means centroids. One single-row data frame per K, stacked at
# the end.
icd_rows <- lapply(2:5, function(k) {
  set.seed(123)
  km <- kmeans(data_scaled, centers = k, nstart = 25)

  data.frame(
    K = k,
    ICD_Rate = intra_dist(data_scaled, km$cluster) / inter_dist(km$centers)
  )
})
icd_results <- do.call(rbind, icd_rows)

# Show the result
print(icd_results)
##   K ICD_Rate
## 1 2 2.296684
## 2 3 2.132510
## 3 4 2.049413
## 4 5 2.222454

Berdasarkan hasil ketiga metode tersebut, indeks Calinski-Harabasz dan silhouette menunjukkan bahwa jumlah klaster yang paling optimal adalah 2, sedangkan ICD rate menunjukkan nilai terkecil pada jumlah klaster 4. Namun, jumlah klaster 2 tetap dipilih karena perbedaan nilai ICD rate antarjumlah klaster tidak cukup besar.

Implementasi K-Means

# Final K-means model with K = 2. kmeans() draws random starting centers, so
# fix the seed to keep the cluster labels (and everything derived from them)
# reproducible between runs.
set.seed(123)
kmeans_result <- kmeans(data_scaled, centers = 2, nstart = 25)

Visualisasi

# Cluster plot of the K-means result with normal-type ellipses
fviz_cluster(
  object = kmeans_result,
  data = data_scaled,
  ggtheme = theme_minimal(),
  ellipse.type = "norm"
) +
  labs(title = "K-Means Clustering")

### Menggabungkan hasil cluster ke data asli

# Store the K-means label of every row as a factor column of the original data
data[["Cluster_KMeans"]] <- as.factor(kmeans_result[["cluster"]])

Menambahkan hasil cluster ke data asli

# Copy of the data with the K-means label stored as factor column `Cluster`
data_clustered <- mutate(data, Cluster = factor(kmeans_result$cluster))

# Write the result to the working directory, then print that directory
write.csv(
  x = data_clustered,
  file = "Anmul Clustering K-Means.csv",
  row.names = FALSE
)
getwd()
## [1] "D:/Semester 4/ANMUL/Kelompok 8"

Karakteristik Tiap Cluster

# Mean of every numeric column per K-means cluster, opened in the data viewer
group_by(data_clustered, Cluster) %>%
  summarise(
    across(where(is.numeric), function(col) mean(col, na.rm = TRUE))
  ) %>%
  View()

Visualisasi dengan nama kabupaten

library(ggrepel)

# Project the standardized data onto its first two principal components and
# plot every region, coloured by K-means cluster and labelled with its name.
data_pca <- prcomp(data_scaled, scale. = FALSE)
pca_df <- as.data.frame(data_pca$x[, 1:2])
pca_df$cluster <- factor(kmeans_result$cluster)
# Use the full column name: `data$Kabupaten` only worked through partial
# matching of `$` and breaks as soon as the match is ambiguous or `data`
# becomes a tibble.
pca_df$Kabupaten <- data[["Kabupaten.Kota"]]

ggplot(pca_df, aes(x = PC1, y = PC2, color = cluster, label = Kabupaten)) +
  geom_point(size = 3) +
  geom_text_repel(size = 3, max.overlaps = Inf) +
  theme_minimal() +
  labs(title = "Peta Cluster Wilayah Jawa Timur Berdasarkan K-Means")

## DBSCAN

Menentukan nilai epsilon

library(dbscan)
# k-nearest-neighbour distance plot of the standardized data, used to pick eps.
# NOTE(review): the DBSCAN runs below use minPts = 5; the dbscan documentation
# suggests k = minPts - 1 for this plot, so k = 2 may not match -- confirm.
kNNdistplot(data_scaled, k = 2)  

Mencari rekomendasi nilai minpts terbaiknya

# Compare minPts = 4, 5, 6 at eps = 1.87: report the number of clusters and
# the number of noise points (label 0) for each setting.
for (m in c(4, 5, 6)) {
  # Both fpc and dbscan export dbscan(); qualify the call so the result does
  # not depend on the order in which the packages were attached.
  result <- dbscan::dbscan(data_scaled, eps = 1.87, minPts = m)
  # Count only real clusters. `length(unique(...)) - 1` is off by one
  # whenever a run produces no noise points.
  n_clusters <- length(setdiff(unique(result$cluster), 0))
  cat("minPts =", m, "-> clusters:", n_clusters,
      ", noise:", sum(result$cluster == 0), "\n")
}
## minPts = 4 -> clusters: 1 , noise: 8 
## minPts = 5 -> clusters: 2 , noise: 9 
## minPts = 6 -> clusters: 2 , noise: 10

Silhouette method

library(fpc)
library(cluster)

# Result table; stays empty when the silhouette cannot be evaluated
silhouette_scores <- data.frame(Eps = numeric(), MinPts = integer(), Silhouette = numeric())

# Fixed parameters
eps <- 1.87
minPts <- 5

# Run DBSCAN
db <- fpc::dbscan(data_scaled, eps = eps, MinPts = minPts)

# Label 0 marks noise, not a cluster. Exclude noise both when counting the
# clusters and when computing the silhouette, consistent with the ICD rate
# below, which also uses only the clustered points.
is_clustered <- db$cluster > 0
n_clusters <- length(unique(db$cluster[is_clustered]))

# The silhouette is only defined for 2 <= clusters <= (points - 1)
if (n_clusters >= 2 && n_clusters < sum(is_clustered)) {
  ss <- silhouette(
    db$cluster[is_clustered],
    dist(data_scaled[is_clustered, , drop = FALSE])
  )
  avg_sil <- mean(ss[, "sil_width"])
  silhouette_scores <- data.frame(Eps = eps, MinPts = minPts, Silhouette = avg_sil)
}

print(silhouette_scores)
##    Eps MinPts Silhouette
## 1 1.87      5  0.1369504

ICD rate

# ICD rate of the DBSCAN solution, computed on the clustered points only
# (rows labelled 0 are left out).
db <- fpc::dbscan(data_scaled, eps = 1.87, MinPts = 5)
is_clustered <- db$cluster > 0
clustered_points <- data_scaled[is_clustered, ]
clustered_labels <- db$cluster[is_clustered]

# Per-cluster column means; [-1] removes the grouping column that aggregate()
# puts first
centroids <- aggregate(clustered_points, by = list(clustered_labels), FUN = mean)[-1]

icd <- intra_dist(clustered_points, clustered_labels) /
  inter_dist(as.matrix(centroids))
icd
## [1] 1.683253

Implementasi DBSCAN

# Final DBSCAN model (eps = 1.87, minPts = 5). Both fpc and dbscan export
# dbscan(); this call uses the `minPts` argument of dbscan::dbscan(), so
# qualify it instead of relying on the package attach order.
dbscan_result <- dbscan::dbscan(data_scaled, eps = 1.87, minPts = 5)

Visualisasi

# Cluster plot of the DBSCAN result with normal-type ellipses
fviz_cluster(
  object = dbscan_result,
  data = data_scaled,
  ggtheme = theme_minimal(),
  ellipse.type = "norm"
) +
  labs(title = "DBSCAN Clustering")

Simpan di csv

# Copy of the data with the DBSCAN label stored as factor column
# `Cluster_DBSCAN`
data_dbscan <- mutate(data, Cluster_DBSCAN = factor(dbscan_result$cluster))

# Write the result to the working directory, then print that directory
write.csv(
  x = data_dbscan,
  file = "Anmul_clusters_DBSCAN.csv",
  row.names = FALSE
)
getwd()
## [1] "D:/Semester 4/ANMUL/Kelompok 8"

Gabungkan hasil ke data asli

# Store the DBSCAN label of every row as a factor column of the original data
data[["Cluster_DBSCAN"]] <- as.factor(dbscan_result[["cluster"]])

# Print the labels and their levels
data[["Cluster_DBSCAN"]]
##  [1] 0 1 1 1 1 1 1 2 1 1 2 2 2 1 0 1 1 1 1 1 0 1 1 1 0 0 0 2 0 1 1 0 1 1 1 1 0 1
## Levels: 0 1 2

Karakteristik Tiap Cluster

# Mean of every numeric column per DBSCAN label, opened in the data viewer
group_by(data_dbscan, Cluster_DBSCAN) %>%
  summarise(
    across(where(is.numeric), function(col) mean(col, na.rm = TRUE))
  ) %>%
  View()

Visualisasi dengan nama kabupaten

library(ggrepel)

# Project the standardized data onto its first two principal components and
# plot every region, coloured by DBSCAN label and labelled with its name.
data_pca <- prcomp(data_scaled, scale. = FALSE)
pca_df <- as.data.frame(data_pca$x[, 1:2])
pca_df$cluster <- factor(dbscan_result$cluster)
# Use the full column name: `data$Kabupaten` only worked through partial
# matching of `$` and breaks as soon as the match is ambiguous or `data`
# becomes a tibble.
pca_df$Kabupaten <- data[["Kabupaten.Kota"]]

ggplot(pca_df, aes(x = PC1, y = PC2, color = cluster, label = Kabupaten)) +
  geom_point(size = 3) +
  geom_text_repel(size = 3, max.overlaps = Inf) +
  theme_minimal() +
  labs(title = "Peta Cluster Wilayah Jawa Timur Berdasarkan DBSCAN")

## Fuzzy C-Means

### Menentukan cluster

### Silhouette Method

library(e1071) 
library(cluster)

# Average silhouette width of the fuzzy c-means solution for K = 2..5, using
# the hard label (cluster with the largest membership) of every observation.
k_values <- 2:5

# The pairwise distances do not depend on K, so compute them only once.
point_dist <- dist(data_scaled)

avg_silhouette <- vapply(k_values, function(k) {
  # cmeans() starts from random centers; fix the seed so the scores are
  # reproducible between runs.
  set.seed(123)
  fcm <- cmeans(data_scaled, centers = k, m = 2, iter.max = 100, verbose = FALSE)
  hard_labels <- apply(fcm$membership, 1, which.max)

  mean(silhouette(hard_labels, point_dist)[, "sil_width"])
}, numeric(1))

# One row per K, built in a single step instead of growing with rbind()
silhouette_scores <- data.frame(K = k_values, Silhouette = avg_silhouette)

print(silhouette_scores)
##   K Silhouette
## 1 2  0.2993860
## 2 3  0.2504029
## 3 4  0.2227702
## 4 5  0.2550641

Calinski-Harabasz index

library(clusterSim) 

# Calinski-Harabasz index (clusterSim::index.G1) of the fuzzy c-means solution
# for K = 2..5, using the hard label of every observation.
k_values <- 2:5

ch_values <- vapply(k_values, function(k) {
  # cmeans() starts from random centers; fix the seed so the index values are
  # reproducible between runs.
  set.seed(123)
  fcm <- cmeans(data_scaled, centers = k, m = 2)
  hard_labels <- apply(fcm$membership, 1, which.max)

  index.G1(data_scaled, hard_labels)
}, numeric(1))

# One row per K, built in a single step instead of growing with rbind()
ch_results <- data.frame(K = k_values, CH = ch_values)

print(ch_results)
##   K       CH
## 1 2 23.54144
## 2 3 20.72743
## 3 4 18.33874
## 4 5 18.51890

ICD Rate

# Mean within-cluster squared distance.
#
# data:     numeric matrix or data frame, one observation per row.
# clusters: vector of cluster labels, one per row of `data`.
# Returns the sum over all clusters of the squared Euclidean distances between
# each observation and its cluster mean, divided by the number of rows.
intra_dist <- function(data, clusters) {
  data <- as.matrix(data)
  total_ss <- 0
  for (k in unique(clusters)) {
    # drop = FALSE keeps a one-member cluster as a 1-row matrix
    cluster_points <- data[clusters == k, , drop = FALSE]
    center <- colMeans(cluster_points)
    # Subtract each column's mean from that column. A plain
    # `cluster_points - center` recycles `center` down the columns
    # (column-major order) and therefore subtracts the wrong values.
    deviations <- sweep(cluster_points, 2, center)
    total_ss <- total_ss + sum(deviations^2)
  }
  total_ss / nrow(data)
}

# Mean pairwise distance (dist() defaults) between the rows of `centers`,
# where each row is one cluster centroid.
inter_dist <- function(centers) {
  mean(dist(centers))
}

# ICD rate of the fuzzy c-means solution for K = 2..5: intra-cluster distance
# of the hard labels divided by the mean distance between the fuzzy centroids.
icd_rows <- lapply(2:5, function(k) {
  # cmeans() starts from random centers; fix the seed so the rates are
  # reproducible between runs.
  set.seed(123)
  fcm <- cmeans(data_scaled, centers = k, m = 2)
  hard_labels <- apply(fcm$membership, 1, which.max)

  data.frame(
    K = k,
    ICD_Rate = intra_dist(data_scaled, hard_labels) / inter_dist(fcm$centers)
  )
})

# Stack the per-K rows once instead of growing the table inside a loop
icd_results <- do.call(rbind, icd_rows)

print(icd_results)
##   K ICD_Rate
## 1 2 2.508908
## 2 3 2.348585
## 3 4 2.239839
## 4 5 2.286752

Implementasi Fuzzy C-Means

# Final fuzzy c-means model with 2 clusters and fuzzifier m = 2. cmeans()
# starts from random centers, so fix the seed to keep the memberships and the
# hard labels reproducible between runs.
set.seed(123)
fcm_result <- cmeans(data_scaled, centers = 2, m = 2)

Lihat hasil cluster

fcm_result$cluster  # hard label: one cluster index per observation
##  [1] 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2 1 1 1 1 1 1 1 1 1
fcm_result$membership  # membership degree of each observation in each cluster
##                1         2
##  [1,] 0.23710131 0.7628987
##  [2,] 0.27312420 0.7268758
##  [3,] 0.33456478 0.6654352
##  [4,] 0.77098856 0.2290114
##  [5,] 0.47269365 0.5273063
##  [6,] 0.42203090 0.5779691
##  [7,] 0.49000277 0.5099972
##  [8,] 0.09777591 0.9022241
##  [9,] 0.14657440 0.8534256
## [10,] 0.49301829 0.5069817
## [11,] 0.16925290 0.8307471
## [12,] 0.09055720 0.9094428
## [13,] 0.18031940 0.8196806
## [14,] 0.40947853 0.5905215
## [15,] 0.75923560 0.2407644
## [16,] 0.84819257 0.1518074
## [17,] 0.69012425 0.3098758
## [18,] 0.51722269 0.4827773
## [19,] 0.31186263 0.6881374
## [20,] 0.66859481 0.3314052
## [21,] 0.23396097 0.7660390
## [22,] 0.18478846 0.8152115
## [23,] 0.07946657 0.9205334
## [24,] 0.39116453 0.6088355
## [25,] 0.64891410 0.3510859
## [26,] 0.20982259 0.7901774
## [27,] 0.19921891 0.8007811
## [28,] 0.19605449 0.8039455
## [29,] 0.19839521 0.8016048
## [30,] 0.82830030 0.1716997
## [31,] 0.88279423 0.1172058
## [32,] 0.83416122 0.1658388
## [33,] 0.71308262 0.2869174
## [34,] 0.87557260 0.1244274
## [35,] 0.86618898 0.1338110
## [36,] 0.82410852 0.1758915
## [37,] 0.76578808 0.2342119
## [38,] 0.86886759 0.1311324

Visualisasi hasil cluster Fuzzy C-Means (berdasarkan label keras)

# Cluster plot of the fuzzy c-means hard labels (points only, normal-type
# ellipses)
fviz_cluster(
  object = list(data = data_scaled, cluster = fcm_result$cluster),
  geom = "point",
  ellipse.type = "norm"
) +
  labs(title = "Fuzzy C-Means Clustering")

### Simpan di csv

# Copy of the data with the fuzzy c-means hard label stored as factor column
# `Cluster_FCM`
data_fcm <- mutate(data, Cluster_FCM = factor(fcm_result$cluster))

# Write the result to the working directory, then print that directory
write.csv(
  x = data_fcm,
  file = "Anmul_FuzzyCMeans_Clusters.csv",
  row.names = FALSE
)
getwd()
## [1] "D:/Semester 4/ANMUL/Kelompok 8"

Menggabungkan hasil cluster ke data asli

# Store the fuzzy c-means hard label of every row as a factor column of the
# original data
data[["Cluster_FCM"]] <- as.factor(fcm_result[["cluster"]])

Karakteristik Tiap Cluster

# Mean of every numeric column per fuzzy c-means cluster, opened in the data
# viewer
group_by(data_fcm, Cluster_FCM) %>%
  summarise(
    across(where(is.numeric), function(col) mean(col, na.rm = TRUE))
  ) %>%
  View()

Visualisasi dengan nama kabupaten

library(ggrepel)

# Project the standardized data onto its first two principal components and
# plot every region, coloured by fuzzy c-means hard label and labelled with
# its name.
data_pca <- prcomp(data_scaled, scale. = FALSE)
pca_df <- as.data.frame(data_pca$x[, 1:2])
pca_df$cluster <- factor(fcm_result$cluster)
# Use the full column name: `data$Kabupaten` only worked through partial
# matching of `$` and breaks as soon as the match is ambiguous or `data`
# becomes a tibble.
pca_df$Kabupaten <- data[["Kabupaten.Kota"]]

ggplot(pca_df, aes(x = PC1, y = PC2, color = cluster, label = Kabupaten)) +
  geom_point(size = 3) +
  geom_text_repel(size = 3, max.overlaps = Inf) +
  theme_minimal() +
  labs(title = "Peta Cluster Wilayah Jawa Timur Berdasarkan Fuzzy C-MEANS")