library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(zoo) # interpolasi missing
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(forecast) # ARIMA
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(dbscan) # DBSCAN + kNNdistplot
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(cluster) # evaluasi silhouette
library(ggplot2)
library(factoextra) # visualisasi clustering
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(readxl)
library(dplyr)
library(zoo)
library(imputeTS)
##
## Attaching package: 'imputeTS'
## The following object is masked from 'package:zoo':
##
## na.locf
library(dtwclust)
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loading required package: dtw
## Loaded dtw v1.23-1. See ?dtw for help, citation("dtw") for use in publication.
## dtwclust:
## Setting random number generator to L'Ecuyer-CMRG (see RNGkind()).
## To read the included vignettes type: browseVignettes("dtwclust").
## See news(package = "dtwclust") after package updates.
library(fpc) # silhouette
##
## Attaching package: 'fpc'
## The following object is masked from 'package:dbscan':
##
## dbscan
library(forecast)
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(pheatmap)
berasmedium <- read_excel("C:/Users/Zahra Mahendra Putri/Documents/STATISTIKA UNTIRTA/MATA KULIAH/SEMESTER 5/Data Challenge/berasmedium.xlsx"
)
summary_prov <- berasmedium %>%
pivot_longer(-Date, names_to = "Provinsi", values_to = "Harga") %>%
group_by(Provinsi) %>%
summarise(
mean = mean(Harga, na.rm = TRUE),
median = median(Harga, na.rm = TRUE),
min = min(Harga, na.rm = TRUE),
max = max(Harga, na.rm = TRUE),
sd = sd(Harga, na.rm = TRUE),
missing = sum(is.na(Harga))
)
print(summary_prov)
## # A tibble: 34 × 7
## Provinsi mean median min max sd missing
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 Aceh 12014. 11560 10570 13680 1039. 37
## 2 Bali 12323. 12135 10000 15430 1410. 36
## 3 Banten 11499. 11180 9390 15080 1347. 37
## 4 Bengkulu 11896. 11430 9880 14290 1397. 37
## 5 DI Yogyakarta 11601. 11550 9330 15040 1522. 36
## 6 DKI Jakarta 11812. 10880 9900 15000 1522. 37
## 7 Gorontalo 12139. 12330 10010 16390 1579. 35
## 8 Jambi 11408. 10890 9990 13290 1062. 38
## 9 Jawa Barat 11594. 11140 9950 15180 1341. 37
## 10 Jawa Tengah 11755. 11370 9320 15180 1432. 34
## # ℹ 24 more rows
berasmedium$Date <- as.Date(berasmedium$Date, format = "%d/%m/%Y")
berasmedium_interp <- berasmedium %>%
mutate(across(-Date, ~na_kalman(., model = "auto.arima")))
data_norm <- berasmedium_interp
data_norm[,-1] <- scale(berasmedium_interp[,-1])
ts_matrix <- t(as.matrix(data_norm[,-1]))
colnames(ts_matrix) <- data_norm$Date
rownames(ts_matrix) <- colnames(data_norm)[-1]
library(TSclust)
## Loading required package: pdc
dist_matrix <- diss(ts_matrix, METHOD = "DTW")
kNNdistplot(as.matrix(dist_matrix), k = 5)
abline(h = 200, col = "red", lty = 2)
# 9. Clustering DBSCAN
# --- DBSCAN pakai versi modern (rekomendasi) ---
db_res <- dbscan::dbscan(as.matrix(dist_matrix), eps = 400, minPts = 5)
table(db_res$cluster)
##
## 0 1
## 3 31
for (e in c(200, 300, 400, 500, 600, 700)) {
cl <- fpc::dbscan(dist_matrix, eps = e, MinPts = 5, method = "dist")
cat("eps =", e, "-> jumlah cluster =", length(unique(cl$cluster)), "\n")
}
## eps = 200 -> jumlah cluster = 2
## eps = 300 -> jumlah cluster = 2
## eps = 400 -> jumlah cluster = 1
## eps = 500 -> jumlah cluster = 1
## eps = 600 -> jumlah cluster = 1
## eps = 700 -> jumlah cluster = 1
library(cluster)
sil <- silhouette(db_res$cluster, dist_matrix)
mean_sil <- mean(sil[, 3], na.rm = TRUE)
print(paste("Mean silhouette width:", round(mean_sil, 3)))
## [1] "Mean silhouette width: 0.63"
plot(sil, border = NA, main = "Silhouette Plot (DTW + DBSCAN)")
# 11. Identifikasi provinsi ekstrem (cluster noise = 0)
extreme_prov <- rownames(ts_matrix)[db_res$cluster == 0]
print(extreme_prov)
## [1] "Kalimantan Selatan" "Kepulauan Riau" "Riau"
# --- Ambil hasil cluster DBSCAN ---
cluster_result <- data.frame(
Provinsi = rownames(ts_matrix),
Cluster = db_res$cluster
)
# --- Buat data kategori harga per provinsi (pakai quantile) ---
all_prices <- unlist(berasmedium_interp[,-1])
qnt <- quantile(all_prices, probs = c(0.33, 0.66))
kategori <- function(x) {
if (x <= qnt[1]) return("Rendah")
else if (x <= qnt[2]) return("Sedang")
else return("Tinggi")
}
data_kategori <- berasmedium_interp %>%
pivot_longer(-Date, names_to = "Provinsi", values_to = "Harga") %>%
mutate(Kategori = sapply(Harga, kategori))
# --- Gabungkan cluster + kategori harga ---
data_cluster_kategori <- data_kategori %>%
left_join(cluster_result, by = "Provinsi")
# --- Tabel ringkas per provinsi ---
summary_cluster <- data_cluster_kategori %>%
group_by(Provinsi, Cluster) %>%
summarise(
mean_harga = mean(Harga, na.rm = TRUE),
kategori_dominan = names(sort(table(Kategori), decreasing = TRUE))[1],
.groups = "drop"
)
print(summary_cluster)
## # A tibble: 34 × 4
## Provinsi Cluster mean_harga kategori_dominan
## <chr> <int> <dbl> <chr>
## 1 Aceh 1 11969. Rendah
## 2 Bali 1 12258. Tinggi
## 3 Banten 1 11445. Rendah
## 4 Bengkulu 1 11839. Rendah
## 5 DI Yogyakarta 1 11537. Rendah
## 6 DKI Jakarta 1 11770. Rendah
## 7 Gorontalo 1 12075. Rendah
## 8 Jambi 1 11370. Rendah
## 9 Jawa Barat 1 11547. Rendah
## 10 Jawa Tengah 1 11696. Rendah
## # ℹ 24 more rows
ggplot(data_cluster_kategori, aes(x = Date, y = Harga, color = Kategori)) +
geom_line(alpha = 0.7) +
facet_wrap(~Cluster, scales = "free_y") +
theme_minimal() +
labs(title = "Harga Beras per Cluster (DTW + DBSCAN)",
y = "Harga", x = "Tanggal")
# heatmap
library(pheatmap)
# --- matrix harga (tanpa Date) ---
heatmap_mat <- as.matrix(berasmedium_interp[,-1])
rownames(heatmap_mat) <- berasmedium_interp$Date
heatmap_mat <- t(heatmap_mat) # baris = Provinsi, kolom = waktu
# --- Anotasi cluster + kategori ---
prov_cluster <- data.frame(
Cluster = factor(db_res$cluster)
)
rownames(prov_cluster) <- rownames(heatmap_mat)
# Kategori harga dominan per provinsi
prov_kategori <- data_kategori %>%
group_by(Provinsi) %>%
summarise(kategori_dominan = names(sort(table(Kategori), decreasing = TRUE))[1])
# Gabung ke anotasi
prov_cluster$Kategori <- prov_kategori$kategori_dominan[match(rownames(prov_cluster), prov_kategori$Provinsi)]
# --- Heatmap ---
pheatmap(
heatmap_mat,
cluster_rows = TRUE, cluster_cols = FALSE,
annotation_row = prov_cluster,
scale = "row", # biar per provinsi kelihatan pola naik turunnya
main = "Heatmap Harga Beras per Provinsi (Cluster + Kategori Harga)"
)