library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(janitor)
## Warning: package 'janitor' was built under R version 4.4.3
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(skimr)
## Warning: package 'skimr' was built under R version 4.4.3
library(cluster)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(readr)
## Warning: package 'readr' was built under R version 4.4.2
df <- read_delim("C:\\Ayunda\\Kuliah\\Sem 5\\TPG\\0_df_long.csv", delim = ";")
## Rows: 340620 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (5): KabKot, Bulan, Produk, Kategori, NamaProv
## dbl (4): Tahun, Harga, KodeBPS, KodeProv
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 9
## KabKot Tahun Bulan Produk Harga Kategori KodeBPS KodeProv NamaProv
## <chr> <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl> <chr>
## 1 Kab. Aceh Barat 2022 Januari Beras … 11429 Beras 11.0 11 Aceh
## 2 Kab. Aceh Barat 2022 Januari Beras … 9979 Beras 11.0 11 Aceh
## 3 Kab. Aceh Barat 2022 Januari Bawang… 31000 Bawang 11.0 11 Aceh
## 4 Kab. Aceh Barat 2022 Januari Bawang… 28636 Bawang 11.0 11 Aceh
## 5 Kab. Aceh Barat 2022 Januari Cabai … 19409 Cabai 11.0 11 Aceh
## 6 Kab. Aceh Barat 2022 Januari Cabai … 45706 Cabai 11.0 11 Aceh
skim(df)
| Name | df |
| Number of rows | 340620 |
| Number of columns | 9 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| KabKot | 0 | 1.00 | 9 | 31 | 0 | 506 | 0 |
| Bulan | 0 | 1.00 | 3 | 9 | 0 | 12 | 0 |
| Produk | 0 | 1.00 | 9 | 22 | 0 | 15 | 0 |
| Kategori | 0 | 1.00 | 4 | 12 | 0 | 7 | 0 |
| NamaProv | 3450 | 0.99 | 4 | 25 | 0 | 33 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Tahun | 0 | 1.00 | 2023.44 | 1.09 | 2022.00 | 2022.00 | 2023.00 | 2024.00 | 2025.00 | ▇▇▁▇▆ |
| Harga | 61513 | 0.82 | 36737.49 | 33129.94 | 3000.00 | 15871.00 | 27000.00 | 40645.00 | 204423.00 | ▇▂▁▁▁ |
| KodeBPS | 0 | 1.00 | 43.81 | 25.43 | 11.01 | 17.09 | 35.21 | 71.01 | 92.71 | ▇▆▂▇▂ |
| KodeProv | 0 | 1.00 | 43.60 | 25.47 | 11.00 | 17.00 | 35.00 | 71.00 | 92.00 | ▇▆▂▇▂ |
terdapat missing value pada “NamaProv” dan “Harga”
Untuk menghilangkan missing value pada NamaProv dilakukan penyesuaian pada KodeProv kemudian data NA pada NamaProv diisi dengan data yang sesuai dengan KodeProv
#Memasukan kode provinsi berdasarkan BPS
prov_ref <- tibble::tribble(
~KodeProv, ~NamaProv,
11, "ACEH",
12, "SUMATERA UTARA",
13, "SUMATERA BARAT",
14, "RIAU",
15, "JAMBI",
16, "SUMATERA SELATAN",
17, "BENGKULU",
18, "LAMPUNG",
19, "KEP. BANGKA BELITUNG",
21, "KEP. RIAU",
31, "DKI JAKARTA",
32, "JAWA BARAT",
33, "JAWA TENGAH",
34, "DI YOGYAKARTA",
35, "JAWA TIMUR",
36, "BANTEN",
51, "BALI",
52, "NUSA TENGGARA BARAT",
53, "NUSA TENGGARA TIMUR",
61, "KALIMANTAN BARAT",
62, "KALIMANTAN TENGAH",
63, "KALIMANTAN SELATAN",
64, "KALIMANTAN TIMUR",
65, "KALIMANTAN UTARA",
71, "SULAWESI UTARA",
72, "SULAWESI TENGAH",
73, "SULAWESI SELATAN",
74, "SULAWESI TENGGARA",
75, "GORONTALO",
76, "SULAWESI BARAT",
81, "MALUKU",
82, "MALUKU UTARA",
91, "PAPUA",
92, "PAPUA BARAT"
)
df_fix <- df %>%
select(-NamaProv) %>% # hapus NamaProv lama dari dataset
left_join(prov_ref, by = "KodeProv")
setelah kode dan nama provinsi sudah disesuaikan, kolom NamaProv dihapus
#Melihat apakah masih ada missing value pada kolom NamaProv di df_fix
sum(is.na(df_fix$NamaProv))
## [1] 0
tidak ada missing value, maka analisis bisa dilanjutkan
Clustering membutuhkan satu nilai representatif per variabel, sehingga: - Data harga mentah yang merupakan data deret waktu diubah menjadi tabel ringkas per provinsi dan per produk. - Rata - rata data produk per provinsi dihitung menjadi harga rata-rata tahunan “mean(Harga)”
prov_prod <- df_fix %>%
group_by(NamaProv, Produk) %>%
summarise(Harga_rata = mean(Harga, na.rm = TRUE), .groups = "drop")
prov_wide <- prov_prod %>%
pivot_wider(
names_from = Produk,
values_from = Harga_rata
)
Data di-pivot jadi format wide: - Baris = provinsi - Kolom = produk - isi tabel = harga rata-rata produk setiap provinsi
skimr::skim(prov_wide)
| Name | prov_wide |
| Number of rows | 34 |
| Number of columns | 16 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 15 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| NamaProv | 0 | 1 | 4 | 20 | 0 | 34 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Bawang Merah | 0 | 1 | 38892.23 | 6966.80 | 30665.53 | 34394.83 | 36602.71 | 41218.80 | 61651.31 | ▇▃▂▁▁ |
| Bawang Putih (Bonggol) | 0 | 1 | 37259.07 | 5711.65 | 30428.64 | 33700.77 | 35364.85 | 39833.81 | 53914.19 | ▇▅▃▁▂ |
| Beras Medium | 0 | 1 | 12895.31 | 1151.32 | 11695.03 | 12012.44 | 12588.83 | 13408.19 | 17042.24 | ▇▃▂▁▁ |
| Beras Premium | 0 | 1 | 14732.22 | 1456.72 | 13282.52 | 13599.76 | 14171.18 | 15852.48 | 19808.42 | ▇▂▂▁▁ |
| Cabai Merah Besar | 0 | 1 | 53521.54 | 12896.37 | 32500.00 | 43711.07 | 49957.84 | 60576.52 | 85901.07 | ▅▇▅▃▁ |
| Cabai Merah Keriting | 0 | 1 | 50576.96 | 10207.16 | 34581.79 | 42842.33 | 47866.27 | 56660.92 | 73535.41 | ▆▇▅▃▃ |
| Cabai Rawit Merah | 0 | 1 | 60288.95 | 12720.79 | 39686.95 | 52093.13 | 55811.52 | 70262.55 | 90467.69 | ▆▇▅▅▂ |
| Daging Ayam Ras | 0 | 1 | 37739.70 | 5605.60 | 28071.29 | 34054.73 | 35896.35 | 42393.78 | 48624.06 | ▂▇▃▃▃ |
| Daging Sapi Murni | 0 | 1 | 137207.38 | 11889.05 | 114401.34 | 132610.34 | 135184.00 | 145758.79 | 158547.28 | ▂▂▇▂▅ |
| Gula Konsumsi | 0 | 1 | 16571.51 | 1088.92 | 15164.15 | 15949.76 | 16426.69 | 16937.73 | 21233.97 | ▇▇▂▁▁ |
| Jagung Tk. Peternak | 0 | 1 | 7974.49 | 2884.15 | 5527.08 | 6135.80 | 7192.13 | 8845.52 | 21000.00 | ▇▂▁▁▁ |
| Minyak Goreng Curah | 0 | 1 | 16210.78 | 1116.82 | 14850.51 | 15301.09 | 16091.73 | 16788.72 | 19630.53 | ▇▆▃▁▁ |
| Minyak Goreng Kemasan | 0 | 1 | 19495.90 | 1988.03 | 17365.40 | 18097.94 | 19084.96 | 20128.14 | 25746.63 | ▇▆▂▂▁ |
| Minyakita | 0 | 1 | 17788.12 | 869.36 | 16796.61 | 17192.80 | 17558.25 | 18251.84 | 20710.39 | ▇▃▃▁▁ |
| Telur Ayam Ras | 0 | 1 | 29826.50 | 3784.51 | 25510.96 | 27143.48 | 28865.61 | 30934.60 | 41734.88 | ▇▅▂▁▁ |
Mengecek : - Apakah ada missing value - Melihat apakah skala antar produk berbeda jauh - Ringkasan statisik variabel
prov_scaled <- prov_wide %>% as.data.frame()
rownames(prov_scaled) <- prov_scaled$NamaProv
prov_scaled$NamaProv <- NULL
prov_scaled <- scale(prov_scaled)
library(factoextra)
fviz_nbclust(prov_scaled, kmeans, method = "wss")
Cluster optimalnya ada pada k=4
K-Means dijalankan dengan: - 4 cluster - nstart = 25 , untuk menghindari solusi lokal
set.seed(123)
km4 <- kmeans(prov_scaled, centers = 4, nstart = 25)
Hasil cluster dipasangkan kembali dengan nama provinsi. Setiap provinisinya diberi label 1-4 sesuai K-Means
hasil_cluster <- data.frame(
Provinsi = rownames(prov_scaled),
Cluster = km4$cluster
)
hasil_cluster
## Provinsi Cluster
## ACEH ACEH 3
## BALI BALI 3
## BANTEN BANTEN 3
## BENGKULU BENGKULU 3
## DI YOGYAKARTA DI YOGYAKARTA 3
## DKI JAKARTA DKI JAKARTA 1
## GORONTALO GORONTALO 1
## JAMBI JAMBI 3
## JAWA BARAT JAWA BARAT 3
## JAWA TENGAH JAWA TENGAH 3
## JAWA TIMUR JAWA TIMUR 3
## KALIMANTAN BARAT KALIMANTAN BARAT 4
## KALIMANTAN SELATAN KALIMANTAN SELATAN 4
## KALIMANTAN TENGAH KALIMANTAN TENGAH 4
## KALIMANTAN TIMUR KALIMANTAN TIMUR 4
## KALIMANTAN UTARA KALIMANTAN UTARA 4
## KEP. BANGKA BELITUNG KEP. BANGKA BELITUNG 4
## KEP. RIAU KEP. RIAU 4
## LAMPUNG LAMPUNG 3
## MALUKU MALUKU 2
## MALUKU UTARA MALUKU UTARA 2
## NUSA TENGGARA BARAT NUSA TENGGARA BARAT 1
## NUSA TENGGARA TIMUR NUSA TENGGARA TIMUR 1
## PAPUA PAPUA 2
## PAPUA BARAT PAPUA BARAT 2
## RIAU RIAU 3
## SULAWESI BARAT SULAWESI BARAT 3
## SULAWESI SELATAN SULAWESI SELATAN 3
## SULAWESI TENGAH SULAWESI TENGAH 1
## SULAWESI TENGGARA SULAWESI TENGGARA 1
## SULAWESI UTARA SULAWESI UTARA 1
## SUMATERA BARAT SUMATERA BARAT 3
## SUMATERA SELATAN SUMATERA SELATAN 3
## SUMATERA UTARA SUMATERA UTARA 3
fviz_cluster(km4, data = prov_scaled,
geom = "point",
ellipse.type = "norm",
repel = TRUE,
labelsize = 6)
## Mengurutkan Clustering berdasarkan harga tertinggi ke terendah
prov_cluster <- prov_wide %>%
left_join(hasil_cluster, by = c("NamaProv" = "Provinsi"))
Hitung rata-rata keseluruhan harga produk per cluster. Agar mempermudah menentukan urutan cluster dari paling mahal ke paling murah
cluster_mean <- prov_cluster %>%
group_by(Cluster) %>%
summarise(MeanHarga = mean(c_across(where(is.numeric)), na.rm = TRUE))
cluster_mean
## # A tibble: 4 × 2
## Cluster MeanHarga
## <int> <dbl>
## 1 1 35934.
## 2 2 44534.
## 3 3 33493.
## 4 4 40476.
cluster_order <- cluster_mean %>%
arrange(desc(MeanHarga)) %>%
mutate(ClusterBaru = row_number())
cluster_order
## # A tibble: 4 × 3
## Cluster MeanHarga ClusterBaru
## <int> <dbl> <int>
## 1 2 44534. 1
## 2 4 40476. 2
## 3 1 35934. 3
## 4 3 33493. 4
Keterangan: Cluster 1 : Harga Mahal Cluster 2 : Harga Agak mahal Cluster 3 : Harga Normal Cluster 4 : Harga Murah
prov_final <- hasil_cluster %>%
left_join(cluster_order, by = "Cluster") %>%
select(Provinsi, ClusterLama = Cluster, ClusterBaru, MeanHarga)
prov_final
## Provinsi ClusterLama ClusterBaru MeanHarga
## 1 ACEH 3 4 33492.78
## 2 BALI 3 4 33492.78
## 3 BANTEN 3 4 33492.78
## 4 BENGKULU 3 4 33492.78
## 5 DI YOGYAKARTA 3 4 33492.78
## 6 DKI JAKARTA 1 3 35934.14
## 7 GORONTALO 1 3 35934.14
## 8 JAMBI 3 4 33492.78
## 9 JAWA BARAT 3 4 33492.78
## 10 JAWA TENGAH 3 4 33492.78
## 11 JAWA TIMUR 3 4 33492.78
## 12 KALIMANTAN BARAT 4 2 40475.69
## 13 KALIMANTAN SELATAN 4 2 40475.69
## 14 KALIMANTAN TENGAH 4 2 40475.69
## 15 KALIMANTAN TIMUR 4 2 40475.69
## 16 KALIMANTAN UTARA 4 2 40475.69
## 17 KEP. BANGKA BELITUNG 4 2 40475.69
## 18 KEP. RIAU 4 2 40475.69
## 19 LAMPUNG 3 4 33492.78
## 20 MALUKU 2 1 44534.03
## 21 MALUKU UTARA 2 1 44534.03
## 22 NUSA TENGGARA BARAT 1 3 35934.14
## 23 NUSA TENGGARA TIMUR 1 3 35934.14
## 24 PAPUA 2 1 44534.03
## 25 PAPUA BARAT 2 1 44534.03
## 26 RIAU 3 4 33492.78
## 27 SULAWESI BARAT 3 4 33492.78
## 28 SULAWESI SELATAN 3 4 33492.78
## 29 SULAWESI TENGAH 1 3 35934.14
## 30 SULAWESI TENGGARA 1 3 35934.14
## 31 SULAWESI UTARA 1 3 35934.14
## 32 SUMATERA BARAT 3 4 33492.78
## 33 SUMATERA SELATAN 3 4 33492.78
## 34 SUMATERA UTARA 3 4 33492.78
# Buat mapping cluster lama → cluster baru
map_cluster <- cluster_order %>%
select(Cluster, ClusterBaru)
# Tambahkan label cluster baru ke hasil_cluster
hasil_cluster2 <- hasil_cluster %>%
left_join(map_cluster, by = c("Cluster" = "Cluster")) %>%
arrange(match(Provinsi, rownames(prov_scaled)))
km4_new <- km4
km4_new$cluster <- hasil_cluster2$ClusterBaru
fviz_cluster(km4_new, data = prov_scaled,
geom = "point",
ellipse.type = "norm",
repel = TRUE,
labelsize = 6,
main = "Cluster Provinsi Berdasarkan Harga (Cluster Baru)")