library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(zoo) # interpolasi missing
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(forecast) # ARIMA
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(dbscan) # DBSCAN + kNNdistplot
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(cluster) # evaluasi silhouette
library(ggplot2)
library(factoextra) # visualisasi clustering
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(readxl)
library(dplyr)
library(zoo)
library(imputeTS)
##
## Attaching package: 'imputeTS'
## The following object is masked from 'package:zoo':
##
## na.locf
library(dtwclust)
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loading required package: dtw
## Loaded dtw v1.23-1. See ?dtw for help, citation("dtw") for use in publication.
## dtwclust:
## Setting random number generator to L'Ecuyer-CMRG (see RNGkind()).
## To read the included vignettes type: browseVignettes("dtwclust").
## See news(package = "dtwclust") after package updates.
library(fpc) # silhouette
##
## Attaching package: 'fpc'
## The following object is masked from 'package:dbscan':
##
## dbscan
library(forecast)
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(pheatmap)
berasmedium <- read_excel("C:/Users/Zahra Mahendra Putri/Documents/STATISTIKA UNTIRTA/MATA KULIAH/SEMESTER 5/Data Challenge/berasmedium.xlsx"
)
summary_prov <- berasmedium %>%
pivot_longer(-Date, names_to = "Provinsi", values_to = "Harga") %>%
group_by(Provinsi) %>%
summarise(
mean = mean(Harga, na.rm = TRUE),
median = median(Harga, na.rm = TRUE),
min = min(Harga, na.rm = TRUE),
max = max(Harga, na.rm = TRUE),
sd = sd(Harga, na.rm = TRUE),
missing = sum(is.na(Harga))
)
print(summary_prov)
## # A tibble: 34 × 7
## Provinsi mean median min max sd missing
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 Aceh 12014. 11560 10570 13680 1039. 37
## 2 Bali 12323. 12135 10000 15430 1410. 36
## 3 Banten 11499. 11180 9390 15080 1347. 37
## 4 Bengkulu 11896. 11430 9880 14290 1397. 37
## 5 DI Yogyakarta 11601. 11550 9330 15040 1522. 36
## 6 DKI Jakarta 11812. 10880 9900 15000 1522. 37
## 7 Gorontalo 12139. 12330 10010 16390 1579. 35
## 8 Jambi 11408. 10890 9990 13290 1062. 38
## 9 Jawa Barat 11594. 11140 9950 15180 1341. 37
## 10 Jawa Tengah 11755. 11370 9320 15180 1432. 34
## # ℹ 24 more rows
berasmedium$Date <- as.Date(berasmedium$Date, format = "%d/%m/%Y")
berasmedium_interp <- berasmedium %>%
mutate(across(-Date, ~na_kalman(., model = "auto.arima")))
data_norm <- berasmedium_interp
data_norm[,-1] <- scale(berasmedium_interp[,-1])
ts_matrix <- t(as.matrix(data_norm[,-1]))
colnames(ts_matrix) <- data_norm$Date
rownames(ts_matrix) <- colnames(data_norm)[-1]
library(TSclust)
## Loading required package: pdc
# fungsi jarak DTW
dtw_dist_fn <- function(a, b) {
dtw::dtw(a, b, distance.only = TRUE)$distance
}
# hitung jarak pairwise antar provinsi
dist_dtw <- proxy::dist(ts_matrix, method = dtw_dist_fn)
# konversi ke matriks untuk visual
dist_matrix_full <- as.matrix(dist_dtw)
# heatmap jarak
pheatmap(dist_matrix_full,
show_rownames = TRUE, show_colnames = TRUE,
main = "Matriks Jarak DTW antar Provinsi")
# 8. Hierarchie clustering
# clustering hierarkis dengan complete linkage
hc <- hclust(dist_dtw, method = "complete")
# dendrogram
plot(hc, main = "Dendrogram (DTW + Complete Linkage)", cex = 0.7, xlab = "", sub = "")
# 9. Menentukan Nilai K
sil_scores <- c()
# coba dari k=2 sampai k=10
for (k in 2:10) {
clust <- cutree(hc, k = k)
sil <- silhouette(clust, dist_dtw)
sil_scores[k] <- mean(sil[, "sil_width"])
}
# tampilkan skor silhouette
sil_scores
## [1] NA 0.6570453 0.5813476 0.5321804 0.1597908 0.1808526 0.1314637
## [8] 0.1353379 0.1314386 0.1275519
# plot silhouette vs k
plot(2:10, sil_scores[2:10], type = "b",
xlab = "Jumlah Cluster (k)",
ylab = "Average Silhouette Width",
main = "Evaluasi k dengan Silhouette")
# 10. Hasil Clustering
# hasil cluster dengan k = 3
clusters_h <- cutree(hc, k = 3)
# simpan hasil ke dataframe
cluster_df <- data.frame(Provinsi = names(clusters_h),
Cluster = as.integer(clusters_h))
cluster_df
## Provinsi Cluster
## 1 Aceh 1
## 2 Bali 1
## 3 Banten 1
## 4 Bengkulu 1
## 5 DI Yogyakarta 1
## 6 DKI Jakarta 1
## 7 Gorontalo 1
## 8 Jambi 1
## 9 Jawa Barat 1
## 10 Jawa Tengah 1
## 11 Jawa Timur 1
## 12 Kalimantan Barat 1
## 13 Kalimantan Selatan 2
## 14 Kalimantan Tengah 1
## 15 Kalimantan Timur 1
## 16 Kalimantan Utara 1
## 17 Kepulauan Bangka Belitung 1
## 18 Kepulauan Riau 3
## 19 Lampung 1
## 20 Maluku Utara 1
## 21 Maluku 1
## 22 Nusa Tenggara Barat 1
## 23 Nusa Tenggara Timur 1
## 24 Papua Barat 1
## 25 Papua 1
## 26 Riau 3
## 27 Sulawesi Barat 1
## 28 Sulawesi Selatan 1
## 29 Sulawesi Tengah 1
## 30 Sulawesi Tenggara 1
## 31 Sulawesi Utara 1
## 32 Sumatera Barat 1
## 33 Sumatera Selatan 1
## 34 Sumatera Utara 1
library(reshape2)
library(dplyr)
# Data frame normalisasi + cluster
plot_df <- data.frame(Date = data_norm$Date, berasmedium_interp[,-1])
# Ubah ke long format
plot_long <- melt(plot_df, id.vars = "Date",
variable.name = "Provinsi", value.name = "Harga")
# Tambahkan info cluster
plot_long <- left_join(plot_long, cluster_df, by = "Provinsi")
library(ggplot2)
library(patchwork)
# --- Dendrogram ---
p1 <- fviz_dend(hc, k = 3, rect = TRUE, show_labels = TRUE) +
ggtitle("Dendrogram dengan k = 3")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# --- Time series per cluster ---
p2 <- ggplot(plot_long, aes(x = Date, y = Harga, group = Provinsi)) +
geom_line(alpha = 0.4) +
facet_wrap(~Cluster, scales = "free_y") +
theme_minimal() +
labs(title = "Deret Waktu per Cluster",
x = "Tanggal", y = "Harga (Z-score)")
# --- Gabung jadi 1 tampilan ---
p1 / p2 # susun atas-bawah
# 12. Kategori Harga
# hitung rata-rata harga tiap provinsi (sebelum normalisasi)
avg_price <- berasmedium_interp %>%
summarise(across(-Date, mean, na.rm = TRUE)) %>%
t() %>%
as.data.frame()
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(-Date, mean, na.rm = TRUE)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
colnames(avg_price) <- "Rata2_Harga"
avg_price$Provinsi <- rownames(avg_price)
# gabungkan dengan cluster
cluster_price <- left_join(cluster_df, avg_price, by = "Provinsi")
# tentukan kuantil
qnt <- quantile(cluster_price$Rata2_Harga, probs = c(0.33, 0.66))
# fungsi kategori
kategori <- function(x) {
if (x <= qnt[1]) return("Rendah")
else if (x <= qnt[2]) return("Sedang")
else return("Tinggi")
}
cluster_price$Kategori <- sapply(cluster_price$Rata2_Harga, kategori)
# hasil tabel
cluster_price <- cluster_price %>%
arrange(Cluster, Kategori, desc(Rata2_Harga))
print(cluster_price)
## Provinsi Cluster Rata2_Harga Kategori
## 1 Jawa Tengah 1 11695.82 Rendah
## 2 Jawa Barat 1 11547.42 Rendah
## 3 DI Yogyakarta 1 11537.00 Rendah
## 4 Lampung 1 11530.30 Rendah
## 5 Sumatera Selatan 1 11518.56 Rendah
## 6 Sulawesi Barat 1 11490.68 Rendah
## 7 Banten 1 11445.05 Rendah
## 8 Jambi 1 11370.23 Rendah
## 9 Jawa Timur 1 11254.39 Rendah
## 10 Nusa Tenggara Barat 1 11186.14 Rendah
## 11 Sulawesi Selatan 1 11175.83 Rendah
## 12 Sulawesi Utara 1 12398.05 Sedang
## 13 Nusa Tenggara Timur 1 12269.98 Sedang
## 14 Bali 1 12257.75 Sedang
## 15 Kepulauan Bangka Belitung 1 12173.42 Sedang
## 16 Gorontalo 1 12075.47 Sedang
## 17 Aceh 1 11968.83 Sedang
## 18 Bengkulu 1 11839.07 Sedang
## 19 Sulawesi Tengah 1 11835.90 Sedang
## 20 DKI Jakarta 1 11769.97 Sedang
## 21 Sulawesi Tenggara 1 11704.13 Sedang
## 22 Papua Barat 1 14158.44 Tinggi
## 23 Papua 1 14135.08 Tinggi
## 24 Maluku Utara 1 13948.78 Tinggi
## 25 Kalimantan Utara 1 13900.12 Tinggi
## 26 Sumatera Barat 1 13627.51 Tinggi
## 27 Kalimantan Timur 1 13590.20 Tinggi
## 28 Maluku 1 13457.43 Tinggi
## 29 Kalimantan Barat 1 13287.22 Tinggi
## 30 Kalimantan Tengah 1 12981.05 Tinggi
## 31 Sumatera Utara 1 12505.08 Tinggi
## 32 Kalimantan Selatan 2 12409.31 Sedang
## 33 Kepulauan Riau 3 13164.46 Tinggi
## 34 Riau 3 12750.02 Tinggi
ggplot(cluster_price, aes(x = factor(Cluster), y = Rata2_Harga,
color = Kategori, label = Provinsi)) +
geom_point(size = 3) +
geom_text(vjust = -0.8, size = 3, check_overlap = TRUE) +
theme_minimal() +
labs(title = paste("Cluster dan Kategori Harga Beras (k =3)"),
x = "Cluster", y = "Harga Rata-rata")