library(tidyverse)
library(cluster)
library(factoextra)
library(fpc)
library(e1071)
library(fclust)
library(corrplot)
library(ggplot2)
library(NbClust)
library(clusterSim)
library(ggrepel)
library(dbscan)
# Load the indicator table (one row per regency/city, per the printed output)
# and print it.
# NOTE(review): absolute, machine-specific path; the script only runs where
# this exact file location exists.
data <- read.csv('D:/Semester 4/ANMUL/Kelompok 8/data jatim 2023.csv')
data
## Kabupaten.Kota Penduduk.Miskin..persen. PDRB..miliar.rupiah. AHH..tahun.
## 1 Pacitan 13.65 12244.97 72.86
## 2 Ponorogo 9.53 15870.05 73.55
## 3 Trenggalek 10.63 14212.06 74.64
## 4 Tulungagung 6.53 30234.61 74.91
## 5 Blitar 8.69 28239.86 74.34
## 6 Kediri 10.72 32195.50 73.27
## 7 Malang 9.45 75744.29 73.26
## 8 Lumajang 8.93 24808.35 70.96
## 9 Jember 9.51 59984.00 70.03
## 10 Banyuwangi 7.34 60848.35 71.38
## 11 Bondowoso 13.34 15075.62 67.60
## 12 Situbondo 11.90 15018.99 69.94
## 13 Probolinggo 17.19 25904.93 68.12
## 14 Pasuruan 9.24 119252.55 70.81
## 15 Sidoarjo 5.00 160950.78 74.69
## 16 Mojokerto 9.80 66982.68 73.25
## 17 Jombang 9.15 31602.77 73.22
## 18 Nganjuk 10.89 20598.57 72.28
## 19 Madiun 11.04 14895.81 72.28
## 20 Magetan 9.80 14562.68 73.29
## 21 Ngawi 14.40 14904.51 73.20
## 22 Bojonegoro 12.18 63310.69 72.57
## 23 Tuban 14.91 49984.23 72.36
## 24 Lamongan 12.42 30709.18 73.22
## 25 Gresik 10.96 113825.43 73.30
## 26 Bangkalan 19.35 17164.20 70.79
## 27 Sampang 21.76 14674.11 68.64
## 28 Pamekasan 13.85 12628.69 68.31
## 29 Sumenep 18.70 26244.79 72.47
## 30 Kota Kediri 7.15 91631.35 74.67
## 31 Kota Blitar 7.30 5455.81 74.66
## 32 Kota Malang 4.26 60119.82 74.13
## 33 Kota Probolinggo 6.48 9408.51 70.99
## 34 Kota Pasuruan 6.60 6637.08 72.31
## 35 Kota Mojokerto 5.77 5399.62 74.10
## 36 Kota Madiun 4.74 11764.40 73.44
## 37 Kota Surabaya 4.65 459030.72 74.75
## 38 Kota Batu 3.31 12936.60 73.29
## Lama.Sekolah..tahun. Pengeluaran.Perkapita..ribu.rupiah.
## 1 7.88 9681
## 2 7.78 10658
## 3 7.90 10465
## 4 8.66 11565
## 5 7.83 11499
## 6 8.24 11952
## 7 7.75 10791
## 8 7.14 9720
## 9 6.52 10277
## 10 7.76 12820
## 11 6.36 11255
## 12 6.90 10702
## 13 6.29 11756
## 14 7.44 11239
## 15 10.78 15311
## 16 9.11 13467
## 17 8.77 11999
## 18 8.24 12821
## 19 7.95 12259
## 20 8.67 12495
## 21 7.78 11897
## 22 7.45 10776
## 23 7.40 11174
## 24 8.34 12019
## 25 10.01 13870
## 26 5.99 9438
## 27 5.07 9363
## 28 7.15 9420
## 29 5.94 9807
## 30 10.69 13276
## 31 10.78 14548
## 32 10.94 17222
## 33 9.56 12999
## 34 9.78 14250
## 35 11.05 14422
## 36 11.82 17115
## 37 10.70 18977
## 38 9.85 13603
## Tingkat.Pengangguran.TPT..persen. Akses.Terhadap.Sumber.Air.Minum.Layak
## 1 1.83 80.19
## 2 4.66 94.42
## 3 4.52 84.87
## 4 5.65 97.90
## 5 4.91 96.41
## 6 5.79 92.00
## 7 5.70 98.18
## 8 3.67 95.49
## 9 4.01 97.51
## 10 4.75 97.44
## 11 4.15 94.43
## 12 3.27 97.45
## 13 3.24 97.75
## 14 5.48 97.63
## 15 8.05 97.19
## 16 4.67 98.90
## 17 4.66 98.01
## 18 4.68 98.86
## 19 5.14 96.39
## 20 4.16 99.61
## 21 2.41 97.51
## 22 4.63 97.59
## 23 4.40 95.22
## 24 5.46 79.26
## 25 6.82 92.54
## 26 6.18 96.30
## 27 2.72 94.29
## 28 1.74 97.91
## 29 1.71 98.10
## 30 4.06 99.60
## 31 5.24 98.79
## 32 6.80 99.06
## 33 4.53 100.00
## 34 5.64 99.38
## 35 4.73 99.95
## 36 5.85 99.10
## 37 6.76 98.15
## 38 4.52 99.16
## IPM
## 1 70.19
## 2 72.50
## 3 71.73
## 4 74.61
## 5 72.49
## 6 73.96
## 7 72.16
## 8 67.87
## 9 68.64
## 10 72.61
## 11 67.99
## 12 69.16
## 13 67.79
## 14 70.29
## 15 81.55
## 16 75.53
## 17 74.60
## 18 73.71
## 19 72.97
## 20 75.41
## 21 72.47
## 22 70.85
## 23 70.34
## 24 74.53
## 25 77.98
## 26 65.75
## 27 64.13
## 28 67.96
## 29 68.61
## 30 80.44
## 31 80.63
## 32 83.39
## 33 75.43
## 34 77.17
## 35 80.07
## 36 82.71
## 37 83.45
## 38 78.18
# Compact column-wise overview of the raw data (types and first values).
glimpse(data)
## Rows: 38
## Columns: 9
## $ Kabupaten.Kota <chr> "Pacitan", "Ponorogo", "Trenggal…
## $ Penduduk.Miskin..persen. <dbl> 13.65, 9.53, 10.63, 6.53, 8.69, …
## $ PDRB..miliar.rupiah. <dbl> 12244.97, 15870.05, 14212.06, 30…
## $ AHH..tahun. <dbl> 72.86, 73.55, 74.64, 74.91, 74.3…
## $ Lama.Sekolah..tahun. <dbl> 7.88, 7.78, 7.90, 8.66, 7.83, 8.…
## $ Pengeluaran.Perkapita..ribu.rupiah. <int> 9681, 10658, 10465, 11565, 11499…
## $ Tingkat.Pengangguran.TPT..persen. <dbl> 1.83, 4.66, 4.52, 5.65, 4.91, 5.…
## $ Akses.Terhadap.Sumber.Air.Minum.Layak <dbl> 80.19, 94.42, 84.87, 97.90, 96.4…
## $ IPM <dbl> 70.19, 72.50, 71.73, 74.61, 72.4…
# Base-R structure dump of the raw data.
str(data)
## 'data.frame': 38 obs. of 9 variables:
## $ Kabupaten.Kota : chr "Pacitan" "Ponorogo" "Trenggalek" "Tulungagung" ...
## $ Penduduk.Miskin..persen. : num 13.65 9.53 10.63 6.53 8.69 ...
## $ PDRB..miliar.rupiah. : num 12245 15870 14212 30235 28240 ...
## $ AHH..tahun. : num 72.9 73.5 74.6 74.9 74.3 ...
## $ Lama.Sekolah..tahun. : num 7.88 7.78 7.9 8.66 7.83 8.24 7.75 7.14 6.52 7.76 ...
## $ Pengeluaran.Perkapita..ribu.rupiah. : int 9681 10658 10465 11565 11499 11952 10791 9720 10277 12820 ...
## $ Tingkat.Pengangguran.TPT..persen. : num 1.83 4.66 4.52 5.65 4.91 5.79 5.7 3.67 4.01 4.75 ...
## $ Akses.Terhadap.Sumber.Air.Minum.Layak: num 80.2 94.4 84.9 97.9 96.4 ...
## $ IPM : num 70.2 72.5 71.7 74.6 72.5 ...
# Five-number summary plus mean for every column of the raw data.
summary(data)
## Kabupaten.Kota Penduduk.Miskin..persen. PDRB..miliar.rupiah.
## Length:38 Min. : 3.310 Min. : 5400
## Class :character 1st Qu.: 7.188 1st Qu.: 14590
## Mode :character Median : 9.665 Median : 25357
## Mean :10.293 Mean : 48554
## 3rd Qu.:12.360 3rd Qu.: 60086
## Max. :21.760 Max. :459031
## AHH..tahun. Lama.Sekolah..tahun. Pengeluaran.Perkapita..ribu.rupiah.
## Min. :67.60 Min. : 5.070 Min. : 9363
## 1st Qu.:71.09 1st Qu.: 7.410 1st Qu.:10720
## Median :73.21 Median : 7.925 Median :11924
## Mean :72.42 Mean : 8.376 Mean :12287
## 3rd Qu.:73.52 3rd Qu.: 9.725 3rd Qu.:13419
## Max. :74.91 Max. :11.820 Max. :18977
## Tingkat.Pengangguran.TPT..persen. Akses.Terhadap.Sumber.Air.Minum.Layak
## Min. :1.710 Min. : 79.26
## 1st Qu.:4.082 1st Qu.: 95.69
## Median :4.665 Median : 97.61
## Mean :4.663 Mean : 96.12
## 3rd Qu.:5.600 3rd Qu.: 98.84
## Max. :8.050 Max. :100.00
## IPM
## Min. :64.13
## 1st Qu.:70.22
## Median :72.79
## Mean :73.68
## 3rd Qu.:76.76
## Max. :83.45
# Number of missing values per column.
colSums(is.na(data))
## Kabupaten.Kota Penduduk.Miskin..persen.
## 0 0
## PDRB..miliar.rupiah. AHH..tahun.
## 0 0
## Lama.Sekolah..tahun. Pengeluaran.Perkapita..ribu.rupiah.
## 0 0
## Tingkat.Pengangguran.TPT..persen. Akses.Terhadap.Sumber.Air.Minum.Layak
## 0 0
## IPM
## 0
# Number of fully duplicated rows.
sum(duplicated(data))
## [1] 0
# Keep only the numeric columns (this drops the character region-name column)
# and print the result.
data_num <- data[, vapply(data, is.numeric, logical(1))]
data_num
## Penduduk.Miskin..persen. PDRB..miliar.rupiah. AHH..tahun.
## 1 13.65 12244.97 72.86
## 2 9.53 15870.05 73.55
## 3 10.63 14212.06 74.64
## 4 6.53 30234.61 74.91
## 5 8.69 28239.86 74.34
## 6 10.72 32195.50 73.27
## 7 9.45 75744.29 73.26
## 8 8.93 24808.35 70.96
## 9 9.51 59984.00 70.03
## 10 7.34 60848.35 71.38
## 11 13.34 15075.62 67.60
## 12 11.90 15018.99 69.94
## 13 17.19 25904.93 68.12
## 14 9.24 119252.55 70.81
## 15 5.00 160950.78 74.69
## 16 9.80 66982.68 73.25
## 17 9.15 31602.77 73.22
## 18 10.89 20598.57 72.28
## 19 11.04 14895.81 72.28
## 20 9.80 14562.68 73.29
## 21 14.40 14904.51 73.20
## 22 12.18 63310.69 72.57
## 23 14.91 49984.23 72.36
## 24 12.42 30709.18 73.22
## 25 10.96 113825.43 73.30
## 26 19.35 17164.20 70.79
## 27 21.76 14674.11 68.64
## 28 13.85 12628.69 68.31
## 29 18.70 26244.79 72.47
## 30 7.15 91631.35 74.67
## 31 7.30 5455.81 74.66
## 32 4.26 60119.82 74.13
## 33 6.48 9408.51 70.99
## 34 6.60 6637.08 72.31
## 35 5.77 5399.62 74.10
## 36 4.74 11764.40 73.44
## 37 4.65 459030.72 74.75
## 38 3.31 12936.60 73.29
## Lama.Sekolah..tahun. Pengeluaran.Perkapita..ribu.rupiah.
## 1 7.88 9681
## 2 7.78 10658
## 3 7.90 10465
## 4 8.66 11565
## 5 7.83 11499
## 6 8.24 11952
## 7 7.75 10791
## 8 7.14 9720
## 9 6.52 10277
## 10 7.76 12820
## 11 6.36 11255
## 12 6.90 10702
## 13 6.29 11756
## 14 7.44 11239
## 15 10.78 15311
## 16 9.11 13467
## 17 8.77 11999
## 18 8.24 12821
## 19 7.95 12259
## 20 8.67 12495
## 21 7.78 11897
## 22 7.45 10776
## 23 7.40 11174
## 24 8.34 12019
## 25 10.01 13870
## 26 5.99 9438
## 27 5.07 9363
## 28 7.15 9420
## 29 5.94 9807
## 30 10.69 13276
## 31 10.78 14548
## 32 10.94 17222
## 33 9.56 12999
## 34 9.78 14250
## 35 11.05 14422
## 36 11.82 17115
## 37 10.70 18977
## 38 9.85 13603
## Tingkat.Pengangguran.TPT..persen. Akses.Terhadap.Sumber.Air.Minum.Layak
## 1 1.83 80.19
## 2 4.66 94.42
## 3 4.52 84.87
## 4 5.65 97.90
## 5 4.91 96.41
## 6 5.79 92.00
## 7 5.70 98.18
## 8 3.67 95.49
## 9 4.01 97.51
## 10 4.75 97.44
## 11 4.15 94.43
## 12 3.27 97.45
## 13 3.24 97.75
## 14 5.48 97.63
## 15 8.05 97.19
## 16 4.67 98.90
## 17 4.66 98.01
## 18 4.68 98.86
## 19 5.14 96.39
## 20 4.16 99.61
## 21 2.41 97.51
## 22 4.63 97.59
## 23 4.40 95.22
## 24 5.46 79.26
## 25 6.82 92.54
## 26 6.18 96.30
## 27 2.72 94.29
## 28 1.74 97.91
## 29 1.71 98.10
## 30 4.06 99.60
## 31 5.24 98.79
## 32 6.80 99.06
## 33 4.53 100.00
## 34 5.64 99.38
## 35 4.73 99.95
## 36 5.85 99.10
## 37 6.76 98.15
## 38 4.52 99.16
## IPM
## 1 70.19
## 2 72.50
## 3 71.73
## 4 74.61
## 5 72.49
## 6 73.96
## 7 72.16
## 8 67.87
## 9 68.64
## 10 72.61
## 11 67.99
## 12 69.16
## 13 67.79
## 14 70.29
## 15 81.55
## 16 75.53
## 17 74.60
## 18 73.71
## 19 72.97
## 20 75.41
## 21 72.47
## 22 70.85
## 23 70.34
## 24 74.53
## 25 77.98
## 26 65.75
## 27 64.13
## 28 67.96
## 29 68.61
## 30 80.44
## 31 80.63
## 32 83.39
## 33 75.43
## 34 77.17
## 35 80.07
## 36 82.71
## 37 83.45
## 38 78.18
# Summary statistics of the numeric features only.
summary(data_num)
## Penduduk.Miskin..persen. PDRB..miliar.rupiah. AHH..tahun.
## Min. : 3.310 Min. : 5400 Min. :67.60
## 1st Qu.: 7.188 1st Qu.: 14590 1st Qu.:71.09
## Median : 9.665 Median : 25357 Median :73.21
## Mean :10.293 Mean : 48554 Mean :72.42
## 3rd Qu.:12.360 3rd Qu.: 60086 3rd Qu.:73.52
## Max. :21.760 Max. :459031 Max. :74.91
## Lama.Sekolah..tahun. Pengeluaran.Perkapita..ribu.rupiah.
## Min. : 5.070 Min. : 9363
## 1st Qu.: 7.410 1st Qu.:10720
## Median : 7.925 Median :11924
## Mean : 8.376 Mean :12287
## 3rd Qu.: 9.725 3rd Qu.:13419
## Max. :11.820 Max. :18977
## Tingkat.Pengangguran.TPT..persen. Akses.Terhadap.Sumber.Air.Minum.Layak
## Min. :1.710 Min. : 79.26
## 1st Qu.:4.082 1st Qu.: 95.69
## Median :4.665 Median : 97.61
## Mean :4.663 Mean : 96.12
## 3rd Qu.:5.600 3rd Qu.: 98.84
## Max. :8.050 Max. :100.00
## IPM
## Min. :64.13
## 1st Qu.:70.22
## Median :72.79
## Mean :73.68
## 3rd Qu.:76.76
## Max. :83.45
# PCA on the numeric features; scale. = TRUE rescales each variable before
# the decomposition.
pca_result <- prcomp(x = data_num, scale. = TRUE)
# Biplot of the observations together with the variable vectors.
library(factoextra)
fviz_pca_biplot(
  pca_result,
  col.ind = "blue", # colour of the individual (observation) points
  col.var = "red",  # colour of the variable vectors
  repel = TRUE      # keep text labels from overlapping
)
# Make sure the numeric feature table is a plain data.frame, then print it.
data_num <- as.data.frame(data_num)
data_num
## Penduduk.Miskin..persen. PDRB..miliar.rupiah. AHH..tahun.
## 1 13.65 12244.97 72.86
## 2 9.53 15870.05 73.55
## 3 10.63 14212.06 74.64
## 4 6.53 30234.61 74.91
## 5 8.69 28239.86 74.34
## 6 10.72 32195.50 73.27
## 7 9.45 75744.29 73.26
## 8 8.93 24808.35 70.96
## 9 9.51 59984.00 70.03
## 10 7.34 60848.35 71.38
## 11 13.34 15075.62 67.60
## 12 11.90 15018.99 69.94
## 13 17.19 25904.93 68.12
## 14 9.24 119252.55 70.81
## 15 5.00 160950.78 74.69
## 16 9.80 66982.68 73.25
## 17 9.15 31602.77 73.22
## 18 10.89 20598.57 72.28
## 19 11.04 14895.81 72.28
## 20 9.80 14562.68 73.29
## 21 14.40 14904.51 73.20
## 22 12.18 63310.69 72.57
## 23 14.91 49984.23 72.36
## 24 12.42 30709.18 73.22
## 25 10.96 113825.43 73.30
## 26 19.35 17164.20 70.79
## 27 21.76 14674.11 68.64
## 28 13.85 12628.69 68.31
## 29 18.70 26244.79 72.47
## 30 7.15 91631.35 74.67
## 31 7.30 5455.81 74.66
## 32 4.26 60119.82 74.13
## 33 6.48 9408.51 70.99
## 34 6.60 6637.08 72.31
## 35 5.77 5399.62 74.10
## 36 4.74 11764.40 73.44
## 37 4.65 459030.72 74.75
## 38 3.31 12936.60 73.29
## Lama.Sekolah..tahun. Pengeluaran.Perkapita..ribu.rupiah.
## 1 7.88 9681
## 2 7.78 10658
## 3 7.90 10465
## 4 8.66 11565
## 5 7.83 11499
## 6 8.24 11952
## 7 7.75 10791
## 8 7.14 9720
## 9 6.52 10277
## 10 7.76 12820
## 11 6.36 11255
## 12 6.90 10702
## 13 6.29 11756
## 14 7.44 11239
## 15 10.78 15311
## 16 9.11 13467
## 17 8.77 11999
## 18 8.24 12821
## 19 7.95 12259
## 20 8.67 12495
## 21 7.78 11897
## 22 7.45 10776
## 23 7.40 11174
## 24 8.34 12019
## 25 10.01 13870
## 26 5.99 9438
## 27 5.07 9363
## 28 7.15 9420
## 29 5.94 9807
## 30 10.69 13276
## 31 10.78 14548
## 32 10.94 17222
## 33 9.56 12999
## 34 9.78 14250
## 35 11.05 14422
## 36 11.82 17115
## 37 10.70 18977
## 38 9.85 13603
## Tingkat.Pengangguran.TPT..persen. Akses.Terhadap.Sumber.Air.Minum.Layak
## 1 1.83 80.19
## 2 4.66 94.42
## 3 4.52 84.87
## 4 5.65 97.90
## 5 4.91 96.41
## 6 5.79 92.00
## 7 5.70 98.18
## 8 3.67 95.49
## 9 4.01 97.51
## 10 4.75 97.44
## 11 4.15 94.43
## 12 3.27 97.45
## 13 3.24 97.75
## 14 5.48 97.63
## 15 8.05 97.19
## 16 4.67 98.90
## 17 4.66 98.01
## 18 4.68 98.86
## 19 5.14 96.39
## 20 4.16 99.61
## 21 2.41 97.51
## 22 4.63 97.59
## 23 4.40 95.22
## 24 5.46 79.26
## 25 6.82 92.54
## 26 6.18 96.30
## 27 2.72 94.29
## 28 1.74 97.91
## 29 1.71 98.10
## 30 4.06 99.60
## 31 5.24 98.79
## 32 6.80 99.06
## 33 4.53 100.00
## 34 5.64 99.38
## 35 4.73 99.95
## 36 5.85 99.10
## 37 6.76 98.15
## 38 4.52 99.16
## IPM
## 1 70.19
## 2 72.50
## 3 71.73
## 4 74.61
## 5 72.49
## 6 73.96
## 7 72.16
## 8 67.87
## 9 68.64
## 10 72.61
## 11 67.99
## 12 69.16
## 13 67.79
## 14 70.29
## 15 81.55
## 16 75.53
## 17 74.60
## 18 73.71
## 19 72.97
## 20 75.41
## 21 72.47
## 22 70.85
## 23 70.34
## 24 74.53
## 25 77.98
## 26 65.75
## 27 64.13
## 28 67.96
## 29 68.61
## 30 80.44
## 31 80.63
## 32 83.39
## 33 75.43
## 34 77.17
## 35 80.07
## 36 82.71
## 37 83.45
## 38 78.18
# Histogram of every numeric feature, one facet per variable with free axes.
data_num %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(mapping = aes(x = value)) +
  geom_histogram(bins = 30, color = "white", fill = "steelblue") +
  facet_wrap(~ variable, scales = "free") +
  labs(title = "Distribusi Fitur Numerik") +
  theme_minimal()
# Count outliers per column with the IQR rule: a value is an outlier when it
# lies below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
outlier_counts <- vapply(data_num, function(col) {
  q_low <- quantile(col, 0.25, na.rm = TRUE)
  q_high <- quantile(col, 0.75, na.rm = TRUE)
  spread <- q_high - q_low
  below <- col < (q_low - 1.5 * spread)
  above <- col > (q_high + 1.5 * spread)
  sum(below | above, na.rm = TRUE)
}, integer(1))
# Show the number of outliers per variable.
outlier_counts
## Penduduk.Miskin..persen. PDRB..miliar.rupiah.
## 1 2
## AHH..tahun. Lama.Sekolah..tahun.
## 0 0
## Pengeluaran.Perkapita..ribu.rupiah. Tingkat.Pengangguran.TPT..persen.
## 1 3
## Akses.Terhadap.Sumber.Air.Minum.Layak IPM
## 3 0
# Boxplots of the standardised features BEFORE outlier handling.
data_num %>%
  scale() %>%
  as.data.frame() %>%
  pivot_longer(
    cols = everything(),
    names_to = "Fitur",
    values_to = "Nilai"
  ) %>%
  ggplot(mapping = aes(x = Fitur, y = Nilai)) +
  geom_boxplot(fill = "skyblue") +
  coord_flip() +
  ggtitle("Boxplot Sebelum Handling Outliers") +
  theme_minimal()
# Outlier handling: cap values at the IQR fences.
#
# x: numeric vector.
# Returns `x` with every value below Q1 - 1.5 * IQR replaced by that lower
# fence and every value above Q3 + 1.5 * IQR replaced by that upper fence.
handle_outliers_iqr <- function(x) {
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  spread <- quartiles[2] - quartiles[1]
  fence_low <- quartiles[1] - 1.5 * spread
  fence_high <- quartiles[2] + 1.5 * spread
  x[x < fence_low] <- fence_low
  x[x > fence_high] <- fence_high
  x
}
# Cap every column at its IQR fences.
data_winsor <- mutate(data_num, across(everything(), handle_outliers_iqr))
# Standardise the capped data (scale() defaults: centre and scale each column).
data_scaled <- as.data.frame(scale(data_winsor))
colnames(data_scaled) <- colnames(data_num)
# Boxplots of the standardised features AFTER outlier handling.
data_scaled %>%
  pivot_longer(
    cols = everything(),
    names_to = "Fitur",
    values_to = "Nilai"
  ) %>%
  ggplot(mapping = aes(x = Fitur, y = Nilai)) +
  geom_boxplot(fill = "lightcoral") +
  coord_flip() +
  ggtitle("Boxplot Setelah Handling Outliers") +
  theme_minimal()
library(cluster)
# Average silhouette width of the K-Means partition for K = 2..5.
silhouette_scores <- data.frame(K = integer(), Silhouette = numeric())
for (k in 2:5) {
  # kmeans() draws random starting centres. Fix the seed before every fit so
  # the scores are reproducible and comparable with the other seeded K scans
  # in this script.
  set.seed(123)
  km <- kmeans(data_scaled, centers = k, nstart = 25)
  ss <- silhouette(km$cluster, dist(data_scaled))
  # Column 3 of a silhouette object holds the per-observation widths.
  avg_sil <- mean(ss[, 3])
  silhouette_scores <- rbind(silhouette_scores, data.frame(K = k, Silhouette = avg_sil))
}
print(silhouette_scores)
## K Silhouette
## 1 2 0.3046194
## 2 3 0.2570559
## 3 4 0.2573574
## 4 5 0.2645056
library(NbClust)
# Pick the number of clusters (2..5) with the Calinski-Harabasz index,
# using K-Means on Euclidean distances.
set.seed(123)
ch_result <- NbClust(
  data = data_scaled,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 5,
  method = "kmeans",
  index = "ch"
)
# Show the index values, the best K and the corresponding partition.
ch_result
## $All.index
## 2 3 4 5
## 23.9056 20.8596 18.0180 18.3573
##
## $Best.nc
## Number_clusters Value_Index
## 2.0000 23.9056
##
## $Best.partition
## [1] 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 1 1 2 2 1 2 2 2 2 1 2 2 2 2 1 1 1 1 1 1 1 1 1
# Intra-cluster distance: mean squared Euclidean distance of each
# observation to the centroid of its own cluster.
#
# data:     numeric data frame or matrix, one row per observation.
# clusters: vector of cluster labels, one per row of `data`.
# Returns the total within-cluster sum of squares divided by nrow(data).
intra_dist <- function(data, clusters) {
  points <- as.matrix(data)
  total_ss <- 0
  for (k in unique(clusters)) {
    # drop = FALSE keeps a one-row cluster as a matrix so colMeans() works.
    cluster_points <- points[clusters == k, , drop = FALSE]
    center <- colMeans(cluster_points)
    # sweep() subtracts center[j] from column j. A bare
    # `cluster_points - center` recycles `center` down the rows
    # (column-major), pairing values with the wrong feature means.
    centered <- sweep(cluster_points, 2, center)
    total_ss <- total_ss + sum(centered^2)
  }
  total_ss / nrow(points)
}
# Inter-cluster distance: average pairwise Euclidean distance between
# centroids.
#
# centers: numeric matrix or data frame with one centroid per row.
inter_dist <- function(centers) {
  mean(dist(centers))
}
# ICD rate (intra-cluster distance / inter-centroid distance) of the K-Means
# partition for K = 2 to 5. The seed is reset before every fit.
icd_rows <- lapply(2:5, function(k) {
  set.seed(123)
  fit <- kmeans(data_scaled, centers = k, nstart = 25)
  within_d <- intra_dist(data_scaled, fit$cluster)
  between_d <- inter_dist(fit$centers)
  data.frame(K = k, ICD_Rate = within_d / between_d)
})
icd_results <- do.call(rbind, icd_rows)
# Show the results.
print(icd_results)
## K ICD_Rate
## 1 2 2.296684
## 2 3 2.132510
## 3 4 2.049413
## 4 5 2.222454
# Final K-Means model with K = 2.
# Seed the RNG right before the fit: kmeans() uses random starts, so without
# an explicit seed the cluster numbering (and everything exported from it)
# depends on whatever RNG state earlier code happened to leave behind.
set.seed(123)
kmeans_result <- kmeans(data_scaled, centers = 2, nstart = 25)
fviz_cluster(kmeans_result, data = data_scaled, ellipse.type = "norm",
             ggtheme = theme_minimal()) +
  ggtitle("K-Means Clustering")
### Attach the K-Means cluster labels to the original data
data[["Cluster_KMeans"]] <- as.factor(kmeans_result$cluster)
data_clustered <- mutate(data, Cluster = factor(kmeans_result$cluster))
# Export to the current working directory (printed below for reference).
write.csv(data_clustered, file = "Anmul Clustering K-Means.csv", row.names = FALSE)
getwd()
## [1] "D:/Semester 4/ANMUL/Kelompok 8"
# Per-cluster mean of every numeric column.
data_clustered %>%
  group_by(Cluster) %>%
  summarise(across(where(is.numeric), function(x) mean(x, na.rm = TRUE))) %>%
  View() # shown in the data viewer
library(ggrepel)
# Project the scaled data onto its first two principal components and colour
# the regions by their K-Means cluster.
data_pca <- prcomp(data_scaled, scale. = FALSE)
pca_df <- as.data.frame(data_pca$x[, 1:2])
pca_df$cluster <- factor(kmeans_result$cluster)
# Use the full column name: `data$Kabupaten` only worked through `$` partial
# matching on data.frames and would silently yield NULL on a tibble.
pca_df$Kabupaten <- data$Kabupaten.Kota
ggplot(pca_df, aes(x = PC1, y = PC2, color = cluster, label = Kabupaten)) +
  geom_point(size = 3) +
  geom_text_repel(size = 3, max.overlaps = Inf) +
  theme_minimal() +
  labs(title = "Peta Cluster Wilayah Jawa Timur Berdasarkan K-Means")
## DBSCAN
library(dbscan)
# k-nearest-neighbour distance plot used to eyeball eps.
# NOTE(review): k = 2 here while the model below uses minPts = 5; the dbscan
# documentation suggests k = minPts - 1. Confirm the intended k.
kNNdistplot(data_scaled, k = 2)
# Compare cluster and noise counts for several minPts values at eps = 1.87.
for (m in c(4, 5, 6)) {
  # Namespace-qualified because fpc, also attached by this script, exports
  # its own dbscan() with a different signature.
  result <- dbscan::dbscan(data_scaled, eps = 1.87, minPts = m)
  # Label 0 is noise. Count only the non-zero labels, so the number is also
  # right when no point is classified as noise (the former
  # `length(unique(...)) - 1` undercounted by one in that case).
  n_clusters <- length(setdiff(unique(result$cluster), 0))
  cat("minPts =", m, "-> clusters:", n_clusters,
      ", noise:", sum(result$cluster == 0), "\n")
}
## minPts = 4 -> clusters: 1 , noise: 8
## minPts = 5 -> clusters: 2 , noise: 9
## minPts = 6 -> clusters: 2 , noise: 10
library(fpc)
library(cluster)
# Result table for the DBSCAN silhouette score.
silhouette_scores <- data.frame(Eps = numeric(), MinPts = integer(), Silhouette = numeric())
# Fixed parameters.
eps <- 1.87
minPts <- 5
# Run DBSCAN.
db <- fpc::dbscan(data_scaled, eps = eps, MinPts = minPts)
# Label 0 marks noise, not a cluster. Leave noise points out of both the
# "more than one cluster" check and the silhouette itself; otherwise noise is
# scored as if it were a cluster, and one real cluster plus noise passes
# the guard.
is_clustered <- db$cluster > 0
# Only evaluate when there is more than one real cluster.
if (length(unique(db$cluster[is_clustered])) > 1) {
  ss <- silhouette(db$cluster[is_clustered],
                   dist(data_scaled[is_clustered, , drop = FALSE]))
  avg_sil <- mean(ss[, 3])
  silhouette_scores <- rbind(silhouette_scores, data.frame(Eps = eps, MinPts = minPts, Silhouette = avg_sil))
}
print(silhouette_scores)
## Eps MinPts Silhouette
## 1 1.87 5 0.1369504
# ICD rate of the DBSCAN solution, computed on clustered points only.
db <- fpc::dbscan(data_scaled, eps = 1.87, MinPts = 5)
cluster_labels <- db$cluster
valid_idx <- cluster_labels > 0 # label 0 = noise, excluded below
intra <- intra_dist(data_scaled[valid_idx, ], cluster_labels[valid_idx])
# Per-cluster column means; [-1] drops the grouping column added by aggregate().
centroids <- aggregate(
  x = data_scaled[valid_idx, ],
  by = list(cluster_labels[valid_idx]),
  FUN = mean
)[-1]
inter <- inter_dist(as.matrix(centroids))
icd <- intra / inter
icd
## [1] 1.683253
# Final DBSCAN model (eps = 1.87, minPts = 5) and its cluster plot.
dbscan_result <- dbscan(data_scaled, minPts = 5, eps = 1.87)
fviz_cluster(
  dbscan_result,
  data = data_scaled,
  ggtheme = theme_minimal(),
  ellipse.type = "norm"
) +
  ggtitle("DBSCAN Clustering")
# Attach the DBSCAN labels (0 = noise) to the data and export them.
data_dbscan <- mutate(data, Cluster_DBSCAN = factor(dbscan_result$cluster))
write.csv(data_dbscan, file = "Anmul_clusters_DBSCAN.csv", row.names = FALSE)
getwd()
## [1] "D:/Semester 4/ANMUL/Kelompok 8"
data[["Cluster_DBSCAN"]] <- as.factor(dbscan_result$cluster)
data[["Cluster_DBSCAN"]]
## [1] 0 1 1 1 1 1 1 2 1 1 2 2 2 1 0 1 1 1 1 1 0 1 1 1 0 0 0 2 0 1 1 0 1 1 1 1 0 1
## Levels: 0 1 2
# Per-cluster mean of every numeric column.
data_dbscan %>%
  group_by(Cluster_DBSCAN) %>%
  summarise(across(where(is.numeric), function(x) mean(x, na.rm = TRUE))) %>%
  View()
library(ggrepel)
# Project the scaled data onto its first two principal components and colour
# the regions by their DBSCAN label (0 = noise).
data_pca <- prcomp(data_scaled, scale. = FALSE)
pca_df <- as.data.frame(data_pca$x[, 1:2])
pca_df$cluster <- factor(dbscan_result$cluster)
# Use the full column name: `data$Kabupaten` only worked through `$` partial
# matching on data.frames and would silently yield NULL on a tibble.
pca_df$Kabupaten <- data$Kabupaten.Kota
ggplot(pca_df, aes(x = PC1, y = PC2, color = cluster, label = Kabupaten)) +
  geom_point(size = 3) +
  geom_text_repel(size = 3, max.overlaps = Inf) +
  theme_minimal() +
  labs(title = "Peta Cluster Wilayah Jawa Timur Berdasarkan DBSCAN")
## Fuzzy C-Means
### Menentukan cluster
### Silhouette Method
library(e1071)
library(cluster)
# Average silhouette width of the Fuzzy C-Means partition for K = 2..5,
# evaluated on the hard labels (cluster with the highest membership).
silhouette_scores <- data.frame(K = integer(), Silhouette = numeric())
for (k in 2:5) {
  # cmeans() starts from random centres. Fix the seed before every fit so the
  # result is reproducible and does not depend on RNG state left by earlier
  # code.
  set.seed(123)
  fcm <- cmeans(data_scaled, centers = k, m = 2, iter.max = 100, verbose = FALSE)
  hard_labels <- apply(fcm$membership, 1, which.max)
  ss <- silhouette(hard_labels, dist(data_scaled))
  avg_sil <- mean(ss[, 3])
  silhouette_scores <- rbind(silhouette_scores, data.frame(K = k, Silhouette = avg_sil))
}
print(silhouette_scores)
## K Silhouette
## 1 2 0.2993860
## 2 3 0.2504029
## 3 4 0.2227702
## 4 5 0.2550641
library(clusterSim)
# Calinski-Harabasz index (clusterSim::index.G1) of the Fuzzy C-Means hard
# partition for K = 2..5.
ch_results <- data.frame(K = integer(), CH = numeric())
for (k in 2:5) {
  # cmeans() starts from random centres. Fix the seed before every fit so the
  # index is reproducible and does not depend on RNG state left by earlier
  # code.
  set.seed(123)
  fcm <- cmeans(data_scaled, centers = k, m = 2)
  hard_labels <- apply(fcm$membership, 1, which.max)
  ch <- index.G1(data_scaled, hard_labels)
  ch_results <- rbind(ch_results, data.frame(K = k, CH = ch))
}
print(ch_results)
## K CH
## 1 2 23.54144
## 2 3 20.72743
## 3 4 18.33874
## 4 5 18.51890
# Intra-cluster distance: mean squared Euclidean distance of each
# observation to the centroid of its own cluster.
#
# data:     numeric data frame or matrix, one row per observation.
# clusters: vector of cluster labels, one per row of `data`.
# Returns the total within-cluster sum of squares divided by nrow(data).
intra_dist <- function(data, clusters) {
  points <- as.matrix(data)
  total_ss <- 0
  for (k in unique(clusters)) {
    # drop = FALSE keeps a one-row cluster as a matrix so colMeans() works.
    cluster_points <- points[clusters == k, , drop = FALSE]
    center <- colMeans(cluster_points)
    # sweep() subtracts center[j] from column j. A bare
    # `cluster_points - center` recycles `center` down the rows
    # (column-major), pairing values with the wrong feature means.
    centered <- sweep(cluster_points, 2, center)
    total_ss <- total_ss + sum(centered^2)
  }
  total_ss / nrow(points)
}
# Inter-cluster distance: average pairwise Euclidean distance between
# centroids (`centers` has one centroid per row).
inter_dist <- function(centers) {
  centroid_gaps <- dist(centers)
  mean(centroid_gaps)
}
# ICD rate (intra-cluster distance / inter-centroid distance) of the Fuzzy
# C-Means hard partition for K = 2..5.
icd_results <- data.frame(K = integer(), ICD_Rate = numeric())
for (k in 2:5) {
  # cmeans() starts from random centres. Fix the seed before every fit so the
  # rate is reproducible and does not depend on RNG state left by earlier
  # code.
  set.seed(123)
  fcm <- cmeans(data_scaled, centers = k, m = 2)
  hard_labels <- apply(fcm$membership, 1, which.max)
  intra <- intra_dist(data_scaled, hard_labels)
  inter <- inter_dist(fcm$centers)
  icd <- intra / inter
  icd_results <- rbind(icd_results, data.frame(K = k, ICD_Rate = icd))
}
print(icd_results)
## K ICD_Rate
## 1 2 2.508908
## 2 3 2.348585
## 3 4 2.239839
## 4 5 2.286752
# Final Fuzzy C-Means model with 2 clusters and fuzzifier m = 2.
# Seed the RNG right before the fit: cmeans() starts from random centres, so
# without it the cluster numbering (and everything exported from it) depends
# on whatever RNG state earlier code happened to leave behind.
set.seed(123)
fcm_result <- cmeans(data_scaled, centers = 2, m = 2)
fcm_result$cluster # hard label
## [1] 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2 1 1 1 1 1 1 1 1 1
fcm_result$membership # membership degree of each observation in each cluster
## 1 2
## [1,] 0.23710131 0.7628987
## [2,] 0.27312420 0.7268758
## [3,] 0.33456478 0.6654352
## [4,] 0.77098856 0.2290114
## [5,] 0.47269365 0.5273063
## [6,] 0.42203090 0.5779691
## [7,] 0.49000277 0.5099972
## [8,] 0.09777591 0.9022241
## [9,] 0.14657440 0.8534256
## [10,] 0.49301829 0.5069817
## [11,] 0.16925290 0.8307471
## [12,] 0.09055720 0.9094428
## [13,] 0.18031940 0.8196806
## [14,] 0.40947853 0.5905215
## [15,] 0.75923560 0.2407644
## [16,] 0.84819257 0.1518074
## [17,] 0.69012425 0.3098758
## [18,] 0.51722269 0.4827773
## [19,] 0.31186263 0.6881374
## [20,] 0.66859481 0.3314052
## [21,] 0.23396097 0.7660390
## [22,] 0.18478846 0.8152115
## [23,] 0.07946657 0.9205334
## [24,] 0.39116453 0.6088355
## [25,] 0.64891410 0.3510859
## [26,] 0.20982259 0.7901774
## [27,] 0.19921891 0.8007811
## [28,] 0.19605449 0.8039455
## [29,] 0.19839521 0.8016048
## [30,] 0.82830030 0.1716997
## [31,] 0.88279423 0.1172058
## [32,] 0.83416122 0.1658388
## [33,] 0.71308262 0.2869174
## [34,] 0.87557260 0.1244274
## [35,] 0.86618898 0.1338110
## [36,] 0.82410852 0.1758915
## [37,] 0.76578808 0.2342119
## [38,] 0.86886759 0.1311324
# Cluster plot of the Fuzzy C-Means hard labels on the scaled data.
fviz_cluster(
  list(data = data_scaled, cluster = fcm_result$cluster),
  geom = "point",
  ellipse.type = "norm"
) +
  ggtitle("Fuzzy C-Means Clustering")
### Save to CSV
data_fcm <- mutate(data, Cluster_FCM = factor(fcm_result$cluster))
write.csv(data_fcm, file = "Anmul_FuzzyCMeans_Clusters.csv", row.names = FALSE)
getwd()
## [1] "D:/Semester 4/ANMUL/Kelompok 8"
data[["Cluster_FCM"]] <- as.factor(fcm_result$cluster)
# Per-cluster mean of every numeric column.
data_fcm %>%
  group_by(Cluster_FCM) %>%
  summarise(across(where(is.numeric), function(x) mean(x, na.rm = TRUE))) %>%
  View()
library(ggrepel)
# Project the scaled data onto its first two principal components and colour
# the regions by their Fuzzy C-Means hard label.
data_pca <- prcomp(data_scaled, scale. = FALSE)
pca_df <- as.data.frame(data_pca$x[, 1:2])
pca_df$cluster <- factor(fcm_result$cluster)
# Use the full column name: `data$Kabupaten` only worked through `$` partial
# matching on data.frames and would silently yield NULL on a tibble.
pca_df$Kabupaten <- data$Kabupaten.Kota
ggplot(pca_df, aes(x = PC1, y = PC2, color = cluster, label = Kabupaten)) +
  geom_point(size = 3) +
  geom_text_repel(size = 3, max.overlaps = Inf) +
  theme_minimal() +
  labs(title = "Peta Cluster Wilayah Jawa Timur Berdasarkan Fuzzy C-MEANS")