Sistem Deteksi Dini Gejolak Harga Pangan Multidimensi

1. Setup

packages <- c(
  "tidyverse", "lubridate", "zoo", "caret",
  "glmnet", "pROC", "PRROC", "janitor", "readxl",
  "rpart", "rpart.plot"
)

for (pkg in packages) {
  if (!require(pkg, character.only = TRUE, quietly = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
    library(pkg, character.only = TRUE)
  }
}

select  <- dplyr::select
filter  <- dplyr::filter
arrange <- dplyr::arrange

theme_set(theme_minimal() + theme(
  plot.title    = element_text(face = "bold", size = 13, hjust = 0.5),
  plot.subtitle = element_text(hjust = 0.5, size = 10, color = "gray40"),
  axis.text.x   = element_text(angle = 45, hjust = 1)
))

2. Import Data

file_path <- "~/SEMESTER 6/Satria Data_Data SEC5.xlsx"

if (!file.exists(file_path)) stop("File tidak ditemukan.")

data_raw <- read_excel(
  file_path,
  sheet = "dataanalisis"
) %>%
  clean_names()

cat("Dimensi awal:", dim(data_raw), "\n")

## Dimensi awal: 840 11

3. Deduplikasi

data_raw <- data_raw %>%
  group_by(tahun, bulan, komoditas) %>%
  summarise(
    harga_eceran = median(harga_eceran, na.rm = TRUE),
    inflasi_komoditas = median(inflasi_komoditas, na.rm = TRUE),
    volume_impor_kg = median(volume_impor_kg, na.rm = TRUE),
    nilai_impor_usd = median(nilai_impor_usd, na.rm = TRUE),
    usd_to_idr = median(usd_to_idr, na.rm = TRUE),
    index_enso = median(index_enso, na.rm = TRUE),
    fao_value = median(fao_value, na.rm = TRUE),
    .groups = "drop"
  )

cat("Dimensi setelah deduplikasi:", dim(data_raw), "\n")

## Dimensi setelah deduplikasi: 840 10

4. Data Makro Bulanan

data_makro <- data_raw %>%
  group_by(tahun, bulan) %>%
  summarise(
    usd_to_idr = first(usd_to_idr),
    index_enso = first(index_enso),
    fao_value = first(fao_value),
    .groups = "drop"
  )

cat("Dimensi data makro per bulan:", dim(data_makro), "\n")

## Dimensi data makro per bulan: 120 5

5. Pivot Data Komoditas

data_pivot <- data_raw %>%
  select(
    tahun, bulan, komoditas,
    harga_eceran,
    inflasi_komoditas,
    volume_impor_kg,
    nilai_impor_usd
  ) %>%
  pivot_wider(
    id_cols = c(tahun, bulan),
    names_from = komoditas,
    values_from = c(
      harga_eceran,
      inflasi_komoditas,
      volume_impor_kg,
      nilai_impor_usd
    ),
    names_sep = "_"
  )

cat("Dimensi data pivot per bulan:", dim(data_pivot), "\n")

## Dimensi data pivot per bulan: 120 30

6. Gabungkan Data

data_wide <- data_pivot %>%
  left_join(data_makro, by = c("tahun", "bulan")) %>%
  arrange(tahun, bulan) %>%
  mutate(date = make_date(tahun, bulan, 1))

cat("Dimensi data wide FINAL:", dim(data_wide), "\n")

## Dimensi data wide FINAL: 120 34

7. Imputasi Missing Value

data_wide <- data_wide %>%
  mutate(across(where(is.numeric),
                ~ zoo::na.locf(., na.rm = FALSE))) %>%
  mutate(across(where(is.numeric),
                ~ ifelse(is.na(.),
                         median(., na.rm = TRUE),
                         .)))

8. Fitur Kalender dan Iklim

ramadan_months <- tibble(
  tahun = c(2016,2016, 2017,2017,2017, 2018,2018, 2019,2019,
            2020,2020,2020, 2021,2021,2021, 2022,2022,
            2023,2023,2023, 2024,2024, 2025,2025),
  bulan = c(6,7, 5,6,7, 5,6, 5,6, 4,5,6, 4,5,6, 4,5, 3,4,5, 3,4, 3,4),
  ramadan_dummy = 1L
)
data_wide <- data_wide %>%
  left_join(ramadan_months, by = c("tahun", "bulan")) %>%
  mutate(ramadan_dummy = replace_na(ramadan_dummy, 0L),
         el_nino_dummy = ifelse(index_enso > 0.5, 1L, 0L),
         panen_padi_dummy = ifelse(bulan %in% c(3,4), 1L, 0L),
         panen_bawang_dummy = ifelse(bulan %in% c(6,7), 1L, 0L),
         bulan_sin = sin(2*pi*bulan/12),
         bulan_cos = cos(2*pi*bulan/12))

9. Lag Harga dan Volume

harga_cols <- grep("^harga_eceran_", names(data_wide), value = TRUE)
vol_cols   <- grep("^volume_impor_kg_", names(data_wide), value = TRUE)
for (col in c(harga_cols, vol_cols)) {
  data_wide[[paste0("lag1_", col)]] <- lag(data_wide[[col]], 1)
}

10. Harga Impor dalam Rupiah

nilai_cols <- grep("^nilai_impor_usd_", names(data_wide), value = TRUE)
vol_cols_nm <- grep("^volume_impor_kg_", names(data_wide), value = TRUE)
for (i in seq_along(nilai_cols)) {
  kom <- gsub("nilai_impor_usd_", "", nilai_cols[i])
  data_wide[[paste0("harga_impor_idr_", kom)]] <-
    (data_wide[[nilai_cols[i]]] * data_wide$usd_to_idr) / (data_wide[[vol_cols_nm[i]]] + 1e-9)
}

11. Definisi Target Gejolak

bobot_ihk <- c(beras=0.036, cabai=0.012, bawang_merah=0.008,
               daging_ayam=0.020, daging_sapi=0.012, gula=0.010, telur=0.018)
bobot_ihk <- bobot_ihk / sum(bobot_ihk)
inflasi_cols <- grep("^inflasi_komoditas_", names(data_wide), value = TRUE)
inflasi_mat <- as.matrix(data_wide[, inflasi_cols])
komoditas_order <- gsub("inflasi_komoditas_", "", inflasi_cols)
bobot_ordered <- bobot_ihk[komoditas_order]
bobot_ordered[is.na(bobot_ordered)] <- mean(bobot_ihk)
bobot_ordered <- bobot_ordered / sum(bobot_ordered)
data_wide$inflasi_tertimbang <- as.numeric(inflasi_mat %*% bobot_ordered)

for (col in harga_cols) {
  kom <- gsub("harga_eceran_", "", col)
  data_wide[[paste0("harga_change_", kom)]] <-
    (data_wide[[col]] - lag(data_wide[[col]])) / (lag(data_wide[[col]]) + 1e-9) * 100
}
spike_cols <- grep("^harga_change_", names(data_wide), value = TRUE)
data_wide$pct_spike <- rowMeans(data_wide[, spike_cols] > 5, na.rm = TRUE) * 100

harga_mat <- as.matrix(data_wide[, harga_cols])
data_wide$harga_tertimbang <- as.numeric(harga_mat %*% bobot_ordered)
threshold_ekstrem <- median(data_wide$harga_tertimbang, na.rm = TRUE) +
  1.5 * IQR(data_wide$harga_tertimbang, na.rm = TRUE)

data_wide$status_gejolak <- ifelse(
  data_wide$inflasi_tertimbang > 2 |
    data_wide$pct_spike > 30 |
    data_wide$harga_tertimbang > threshold_ekstrem, 1L, 0L)
data_wide$status_gejolak[1] <- 0L

proporsi_gejolak <- mean(data_wide$status_gejolak)*100
cat("\nProporsi gejolak:", round(proporsi_gejolak,1), "%\n")

## 
## Proporsi gejolak: 29.2 %

12. Persiapan Data Model

data_model <- data_wide %>%
  arrange(date) %>%
  mutate(target = lead(status_gejolak, 1)) %>%
  filter(!is.na(target)) %>%
  select(-date, -tahun, -bulan, -status_gejolak,
         -starts_with("harga_change_"),
         -inflasi_tertimbang, -harga_tertimbang, -pct_spike) %>%
  mutate(across(everything(), as.numeric))

# --- PERBAIKAN: Hapus fitur impor tidak relevan (telur, daging ayam) -------
fitur_tidak_relevan <- c(
  "volume_impor_kg_Telur", "nilai_impor_usd_Telur",
  "volume_impor_kg_Daging Ayam", "nilai_impor_usd_Daging Ayam"
)
data_model <- data_model[, !names(data_model) %in% fitur_tidak_relevan]

X <- data_model %>% select(-target)
y <- data_model$target
cat(sprintf("\nData model: %d obs x %d fitur\n", nrow(X), ncol(X)))

## 
## Data model: 119 obs x 54 fitur

print(table(y))

## y
##  0  1 
## 84 35

13. Fungsi Helper

hitung_prauc <- function(actual, prob) {
  if (length(unique(actual)) < 2) return(NA)
  pr <- PRROC::pr.curve(scores.class0 = prob[actual==1], scores.class1 = prob[actual==0], curve=FALSE)
  pr$auc.integral
}
hitung_f2 <- function(actual, pred) {
  tp <- sum(actual==1 & pred==1); fp <- sum(actual==0 & pred==1); fn <- sum(actual==1 & pred==0)
  denom <- 5*tp + 4*fn + fp; if(denom==0) return(0); 5*tp/denom
}
evaluasi_threshold <- function(actual, prob, thresholds = seq(0.1,0.7,0.05)) {
  map_dfr(thresholds, function(thr) {
    pred <- ifelse(prob >= thr, 1L, 0L)
    if(length(unique(pred))<2) return(tibble(threshold=thr, recall=NA, precision=NA, specificity=NA, f2=0, f1=NA))
    cm <- caret::confusionMatrix(as.factor(pred), as.factor(actual), positive="1")
    tibble(threshold=thr, recall=cm$byClass["Sensitivity"], precision=cm$byClass["Pos Pred Value"],
           specificity=cm$byClass["Specificity"], f2=hitung_f2(actual, pred), f1=cm$byClass["F1"])
  })
}

14. Baseline

cat("\n========== BASELINE ==========\n")

## 
## ========== BASELINE ==========

prauc_persist <- hitung_prauc(y[-1], as.numeric(y[-length(y)]))
f2_persist <- hitung_f2(y[-1], y[-length(y)])
cat(sprintf("Persistence | PR-AUC: %.4f | F2: %.4f\n", prauc_persist, f2_persist))

## Persistence | PR-AUC: 0.4506 | F2: 0.5172

rule_pred <- as.numeric(data_wide$el_nino_dummy[-nrow(data_wide)]==1 & data_wide$inflasi_tertimbang[-nrow(data_wide)]>1)
prauc_rule <- hitung_prauc(y, rule_pred)
f2_rule <- hitung_f2(y, rule_pred)
cat(sprintf("Rule-based  | PR-AUC: %.4f | F2: %.4f\n", prauc_rule, f2_rule))

## Rule-based  | PR-AUC: 0.2926 | F2: 0.0680

15. Elastic Net

cat("\n========== ELASTIC NET ==========\n")

## 
## ========== ELASTIC NET ==========

window_size <- 60
n <- nrow(X)
pred_en <- rep(NA_real_, n - window_size)
actual_en <- rep(NA_real_, n - window_size)
success <- 0

for (i in seq(window_size, n-1)) {
  idx <- i - window_size + 1
  X_train <- as.matrix(X[1:i, ])
  y_train <- y[1:i]
  X_test  <- as.matrix(X[i+1, , drop=FALSE])
  y_test  <- y[i+1]
  actual_en[idx] <- y_test
  
  na_rows <- which(apply(X_train, 1, anyNA))
  if (length(na_rows) > 0) {
    X_train <- X_train[-na_rows, , drop=FALSE]
    y_train <- y_train[-na_rows]
  }
  if (length(unique(y_train)) < 2) next
  if (nrow(X_train) < 10) next
  
  vars <- apply(X_train, 2, var, na.rm=TRUE)
  keep <- which(vars > 0 & !is.na(vars))
  if (length(keep) < 3) next
  X_train <- X_train[, keep, drop=FALSE]
  X_test  <- X_test[,  keep, drop=FALSE]
  if (anyNA(X_test)) next
  
  means <- colMeans(X_train)
  sds <- apply(X_train, 2, sd); sds[sds==0] <- 1
  X_tr_s <- scale(X_train, center=means, scale=sds)
  X_te_s <- matrix((X_test - means)/sds, nrow=1)
  
  set.seed(42)
  en_cv <- tryCatch(
    cv.glmnet(X_tr_s, y_train, family="binomial", alpha=0.5, nfolds=min(5, length(y_train)), type.measure="deviance"),
    error=function(e) NULL
  )
  if (!is.null(en_cv)) {
    prob <- predict(en_cv, newx=X_te_s, s="lambda.1se", type="response")[1,1]
    pred_en[idx] <- prob; success <- success+1
  } else {
    en_fit <- tryCatch(glmnet(X_tr_s, y_train, family="binomial", alpha=0.5), error=function(e) NULL)
    if (!is.null(en_fit)) {
      lam <- en_fit$lambda[ceiling(length(en_fit$lambda)/2)]
      prob <- predict(en_fit, newx=X_te_s, s=lam, type="response")[1,1]
      pred_en[idx] <- prob; success <- success+1
    }
  }
  if (idx %% 50 == 0) cat(sprintf("Iterasi %d/%d, sukses: %d\n", idx, n-window_size, success))
}

## Warning in lognet(x, is.sparse, y, weights, offset, alpha, nobs, nvars, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, y, weights, offset, alpha, nobs, nvars, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, y, weights, offset, alpha, nobs, nvars, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, y, weights, offset, alpha, nobs, nvars, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Iterasi 50/59, sukses: 50

valid <- !is.na(pred_en) & !is.na(actual_en)
pred_en <- pred_en[valid]; actual_en <- actual_en[valid]
cat(sprintf("\nPrediksi Elastic Net berhasil: %d dari %d iterasi (%.1f%%)\n", length(pred_en), n-window_size, 100*length(pred_en)/(n-window_size)))

## 
## Prediksi Elastic Net berhasil: 59 dari 59 iterasi (100.0%)

if (length(pred_en)==0) stop("Elastic Net gagal total.")
if (length(unique(actual_en))<2) stop("Hanya satu kelas pada actual.")

prauc_en <- hitung_prauc(actual_en, pred_en)
roc_en <- pROC::roc(actual_en, pred_en, levels=c(0,1), direction="<", quiet=TRUE)
auc_en <- as.numeric(pROC::auc(roc_en))
cat(sprintf("PR-AUC: %.4f | ROC-AUC: %.4f\n", prauc_en, auc_en))

## PR-AUC: 0.7286 | ROC-AUC: 0.7012

thr_eval_en <- evaluasi_threshold(actual_en, pred_en)
best_idx_en <- which.max(thr_eval_en$f2)
best_thr_en <- thr_eval_en$threshold[best_idx_en]
cat(sprintf("Threshold optimal (F2): %.2f | Recall: %.3f | Precision: %.3f | F2: %.4f\n",
            best_thr_en, thr_eval_en$recall[best_idx_en], thr_eval_en$precision[best_idx_en], thr_eval_en$f2[best_idx_en]))

## Threshold optimal (F2): 0.15 | Recall: 1.000 | Precision: 0.414 | F2: 0.7792

16. Cost-Benefit Analysis

# Asumsi: biaya false negative (gejolak tidak terdeteksi) = 5x biaya false positive
biaya_FN <- 5
biaya_FP <- 1

cost_df <- thr_eval_en %>%
  mutate(
    FN_rate = 1 - recall,
    FP_rate = 1 - specificity,
    n_actual_1 = sum(actual_en == 1),
    n_actual_0 = sum(actual_en == 0),
    total_biaya = (FN_rate * n_actual_1 * biaya_FN) + (FP_rate * n_actual_0 * biaya_FP)
  )
best_cost_idx <- which.min(cost_df$total_biaya)
best_cost_thr <- cost_df$threshold[best_cost_idx]

cat(sprintf("\nThreshold optimal berdasarkan cost (FN=5xFP): %.2f\n", best_cost_thr))

## 
## Threshold optimal berdasarkan cost (FN=5xFP): 0.15

cat(sprintf("  Recall: %.3f | Precision: %.3f | F2: %.4f\n",
            cost_df$recall[best_cost_idx], cost_df$precision[best_cost_idx], cost_df$f2[best_cost_idx]))

##   Recall: 1.000 | Precision: 0.414 | F2: 0.7792

p_cost <- ggplot(cost_df, aes(x = threshold, y = total_biaya)) +
  geom_line(color = "purple", linewidth = 1) +
  geom_point(size = 2) +
  geom_vline(xintercept = best_cost_thr, linetype = "dotted", color = "gray40") +
  labs(title = "Total Biaya Kesalahan vs Threshold",
       subtitle = sprintf("Minimum biaya pada threshold = %.2f", best_cost_thr),
       x = "Threshold", y = "Estimasi Total Biaya") +
  theme_minimal()
print(p_cost)

## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).

17. Decision Tree

cat("\n========== DECISION TREE (rpart) ==========\n")

## 
## ========== DECISION TREE (rpart) ==========

pred_dt <- rep(NA_real_, n - window_size)
actual_dt <- rep(NA_real_, n - window_size)
success_dt <- 0

for (i in seq(window_size, n-1)) {
  idx <- i - window_size + 1
  X_train <- as.data.frame(X[1:i, ])
  y_train <- y[1:i]
  X_test  <- as.data.frame(X[i+1, , drop=FALSE])
  y_test  <- y[i+1]
  actual_dt[idx] <- y_test
  
  na_rows <- which(apply(X_train, 1, anyNA))
  if (length(na_rows) > 0) {
    X_train <- X_train[-na_rows, , drop=FALSE]
    y_train <- y_train[-na_rows]
  }
  if (length(unique(y_train)) < 2) next
  if (nrow(X_train) < 10) next
  
  vars <- apply(X_train, 2, var, na.rm=TRUE)
  keep <- which(vars > 0 & !is.na(vars))
  if (length(keep) < 2) next
  X_train <- X_train[, keep, drop=FALSE]
  X_test  <- X_test[, keep, drop=FALSE]
  if (anyNA(X_test)) next
  
  set.seed(42)
  dt_model <- tryCatch(
    rpart(y_train ~ ., data = cbind(y_train, X_train), method = "class",
          control = rpart.control(maxdepth=3, minsplit=20, cp=0.01)),
    error = function(e) NULL
  )
  if (!is.null(dt_model)) {
    prob <- predict(dt_model, newdata = X_test, type = "prob")[, "1"]
    if (length(prob) == 0) prob <- 0.5
    pred_dt[idx] <- prob
    success_dt <- success_dt + 1
  }
  if (idx %% 50 == 0) cat(sprintf("Iterasi %d/%d, sukses: %d\n", idx, n-window_size, success_dt))
}

## Iterasi 50/59, sukses: 50

valid_dt <- !is.na(pred_dt) & !is.na(actual_dt)
pred_dt <- pred_dt[valid_dt]; actual_dt <- actual_dt[valid_dt]
cat(sprintf("\nPrediksi Decision Tree berhasil: %d dari %d iterasi (%.1f%%)\n", 
            length(pred_dt), n-window_size, 100*length(pred_dt)/(n-window_size)))

## 
## Prediksi Decision Tree berhasil: 59 dari 59 iterasi (100.0%)

if (length(pred_dt) > 0 && length(unique(actual_dt)) >= 2) {
  prauc_dt <- hitung_prauc(actual_dt, pred_dt)
  roc_dt <- pROC::roc(actual_dt, pred_dt, levels=c(0,1), direction="<", quiet=TRUE)
  auc_dt <- as.numeric(pROC::auc(roc_dt))
  cat(sprintf("PR-AUC: %.4f | ROC-AUC: %.4f\n", prauc_dt, auc_dt))
  thr_eval_dt <- evaluasi_threshold(actual_dt, pred_dt)
  best_idx_dt <- which.max(thr_eval_dt$f2)
  best_thr_dt <- thr_eval_dt$threshold[best_idx_dt]
  cat(sprintf("Threshold optimal (F2): %.2f | Recall: %.3f | Precision: %.3f | F2: %.4f\n",
              best_thr_dt, thr_eval_dt$recall[best_idx_dt], thr_eval_dt$precision[best_idx_dt], thr_eval_dt$f2[best_idx_dt]))
}

## PR-AUC: 0.5986 | ROC-AUC: 0.5970
## Threshold optimal (F2): 0.10 | Recall: 0.875 | Precision: 0.420 | F2: 0.7192

18. Visualisasi

# Plot status gejolak
p_status <- ggplot(data_wide, aes(x = date, y = status_gejolak)) +
  geom_line(color = "gray60", linewidth = 0.6) +
  geom_point(aes(color = factor(status_gejolak)), size = 2) +
  scale_color_manual(values = c("0" = "#22c55e", "1" = "#ef4444"),
                     labels = c("Aman", "Gejolak")) +
  labs(title = "Status Gejolak Harga Pangan Nasional (2016-2025)",
       x = "Tanggal", y = "Status", color = NULL) +
  theme(legend.position = "bottom")
print(p_status)

# PR Curve perbandingan
if (exists("prauc_en") && exists("prauc_dt")) {
  pr_en_curve <- PRROC::pr.curve(scores.class0 = pred_en[actual_en==1], scores.class1 = pred_en[actual_en==0], curve=TRUE)
  pr_dt_curve <- PRROC::pr.curve(scores.class0 = pred_dt[actual_dt==1], scores.class1 = pred_dt[actual_dt==0], curve=TRUE)
  pr_df_en <- as.data.frame(pr_en_curve$curve); names(pr_df_en) <- c("recall", "precision", "threshold")
  pr_df_dt <- as.data.frame(pr_dt_curve$curve); names(pr_df_dt) <- c("recall", "precision", "threshold")
  pr_df_en$model <- "Elastic Net"
  pr_df_dt$model <- "Decision Tree"
  pr_comb <- rbind(pr_df_en, pr_df_dt)
  p_pr_comb <- ggplot(pr_comb, aes(x = recall, y = precision, color = model)) +
    geom_line(linewidth = 1) +
    geom_hline(yintercept = mean(y), linetype = "dashed", color = "gray50") +
    scale_color_manual(values = c("Elastic Net" = "#3B82F6", "Decision Tree" = "#22C55E")) +
    labs(title = "Perbandingan Precision-Recall Curve",
         subtitle = sprintf("PR-AUC | EN: %.4f | DT: %.4f", prauc_en, prauc_dt),
         x = "Recall", y = "Precision", color = NULL) +
    theme(legend.position = "bottom")
  print(p_pr_comb)
}

# Trade-off Elastic Net
p_trade_en <- thr_eval_en %>%
  select(threshold, recall, precision, f2) %>%
  pivot_longer(-threshold, names_to = "metrik", values_to = "nilai") %>%
  ggplot(aes(x = threshold, y = nilai, color = metrik)) +
  geom_line(linewidth = 1) +
  geom_vline(xintercept = best_thr_en, linetype = "dotted", color = "gray40") +
  scale_color_manual(values = c("recall" = "#3B82F6", "precision" = "#EF4444", "f2" = "#22C55E"),
                     labels = c("f2" = "F2-score", "precision" = "Precision", "recall" = "Recall")) +
  labs(title = "Trade-off Recall vs Precision (Elastic Net)",
       subtitle = sprintf("Threshold optimal (F2) = %.2f", best_thr_en),
       x = "Threshold Probabilitas", y = "Nilai Metrik", color = NULL) +
  theme(legend.position = "bottom")
print(p_trade_en)

## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).

# Trade-off Decision Tree
if (exists("thr_eval_dt")) {
  p_trade_dt <- thr_eval_dt %>%
    select(threshold, recall, precision, f2) %>%
    pivot_longer(-threshold, names_to = "metrik", values_to = "nilai") %>%
    ggplot(aes(x = threshold, y = nilai, color = metrik)) +
    geom_line(linewidth = 1) +
    geom_vline(xintercept = best_thr_dt, linetype = "dotted", color = "gray40") +
    scale_color_manual(values = c("recall" = "#3B82F6", "precision" = "#EF4444", "f2" = "#22C55E"),
                       labels = c("f2" = "F2-score", "precision" = "Precision", "recall" = "Recall")) +
    labs(title = "Trade-off Recall vs Precision (Decision Tree)",
         subtitle = sprintf("Threshold optimal (F2) = %.2f", best_thr_dt),
         x = "Threshold Probabilitas", y = "Nilai Metrik", color = NULL) +
    theme(legend.position = "bottom")
  print(p_trade_dt)
}

# ROC Curve perbandingan
if (exists("roc_dt")) {
  roc_compare <- ggroc(list("Elastic Net" = roc_en, "Decision Tree" = roc_dt), size = 1) +
    geom_abline(slope = 1, intercept = 1, linetype = "dashed", color = "gray50") +
    scale_color_manual(values = c("Elastic Net" = "#3B82F6", "Decision Tree" = "#22C55E")) +
    labs(title = "Kurva ROC: Elastic Net vs Decision Tree",
         subtitle = sprintf("ROC-AUC | EN: %.3f | DT: %.3f", auc_en, auc_dt),
         x = "1 - Specificity", y = "Sensitivity", color = NULL) +
    theme(legend.position = "bottom")
  print(roc_compare)
}

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the pROC package.
##   Please report the issue at <https://github.com/xrobin/pROC/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

19. Interpretasi Model

# a) Koefisien Elastic Net (model final)
X_full <- as.matrix(X); y_full <- y
na_rows_full <- which(apply(X_full, 1, anyNA))
if (length(na_rows_full) > 0) {
  X_full <- X_full[-na_rows_full, , drop=FALSE]; y_full <- y_full[-na_rows_full]
}
vars_full <- apply(X_full, 2, var, na.rm=TRUE)
keep_full <- which(vars_full > 0 & !is.na(vars_full))
X_full <- X_full[, keep_full, drop=FALSE]
means_f <- colMeans(X_full); sds_f <- apply(X_full, 2, sd); sds_f[sds_f==0] <- 1
X_scaled <- scale(X_full, center=means_f, scale=sds_f)
set.seed(42)
final_cv <- cv.glmnet(X_scaled, y_full, family="binomial", alpha=0.5, nfolds=5)
coef_full <- coef(final_cv, s = "lambda.1se")
coef_df <- data.frame(fitur = rownames(coef_full), koef = as.numeric(coef_full)) %>%
  filter(fitur != "(Intercept)", koef != 0) %>% arrange(desc(abs(koef)))
cat("\n========== KOEFISIEN ELASTIC NET (FINAL) ==========\n")

## 
## ========== KOEFISIEN ELASTIC NET (FINAL) ==========

cat(sprintf("Fitur aktif (koef ≠ 0): %d dari %d\n", nrow(coef_df), ncol(X)))

## Fitur aktif (koef ≠ 0): 8 dari 54

print(head(coef_df, 15))

##                               fitur         koef
## 1                harga_eceran_Beras 0.1788878658
## 2           lag1_harga_eceran_Beras 0.1556201540
## 3 lag1_volume_impor_kg_Bawang Merah 0.1005617838
## 4                         bulan_cos 0.0918205975
## 5                 harga_eceran_Gula 0.0822243240
## 6            lag1_harga_eceran_Gula 0.0389842422
## 7          harga_eceran_Daging Sapi 0.0180703149
## 8          harga_eceran_Daging Ayam 0.0005992081

# Plot koefisien harga eceran
coef_harga <- coef_df %>% filter(grepl("harga_eceran_", fitur)) %>%
  mutate(komoditas = gsub("harga_eceran_", "", fitur),
         arah = ifelse(koef > 0, "Positif", "Negatif"))
if (nrow(coef_harga) > 0) {
  p_coef <- ggplot(coef_harga, aes(x = reorder(komoditas, abs(koef)), y = koef, fill = arah)) +
    geom_bar(stat = "identity") + coord_flip() +
    scale_fill_manual(values = c("Positif" = "#EF4444", "Negatif" = "#3B82F6")) +
    labs(title = "Koefisien Elastic Net: Harga Eceran per Komoditas",
         x = "Komoditas", y = "Koefisien", fill = NULL) +
    theme(legend.position = "bottom")
  print(p_coef)
}

# b) Variable importance Decision Tree
X_train_dt <- X[complete.cases(X), ]
y_train_dt <- y[complete.cases(X)]
dt_final <- rpart(y_train_dt ~ ., data = cbind(y_train_dt, X_train_dt), method = "class",
                  control = rpart.control(maxdepth=3, minsplit=20, cp=0.01))
cat("\n========== VARIABLE IMPORTANCE DECISION TREE ==========\n")

## 
## ========== VARIABLE IMPORTANCE DECISION TREE ==========

print(dt_final$variable.importance)

##                harga_eceran_Beras          harga_eceran_Daging Ayam 
##                        10.8113622                        10.8113622 
##          harga_eceran_Daging Sapi                 harga_eceran_Gula 
##                        10.8113622                        10.8113622 
##                harga_eceran_Telur           lag1_harga_eceran_Beras 
##                        10.8113622                         9.7302260 
##                         bulan_cos lag1_volume_impor_kg_Bawang Merah 
##                         8.1814815                         3.6362140 
##              nilai_impor_usd_Gula      nilai_impor_usd_Bawang Merah 
##                         2.7676802                         1.3635802 
##      volume_impor_kg_Bawang Merah             volume_impor_kg_Cabai 
##                         1.3635802                         1.3635802 
##  lag1_volume_impor_kg_Daging Sapi             volume_impor_kg_Beras 
##                         1.1861487                         1.1861487 
##    inflasi_komoditas_Bawang Merah            inflasi_komoditas_Gula 
##                         0.9090535                         0.7907658 
##       nilai_impor_usd_Daging Sapi       volume_impor_kg_Daging Sapi 
##                         0.7907658                         0.7907658

rpart.plot(dt_final, main = "Decision Tree Final (maxdepth=3)", extra = 101, under = TRUE, fallen.leaves = TRUE)

20. Prediksi Bulan Depan

last_row <- as.matrix(X[nrow(X), keep_full, drop=FALSE])
last_scaled <- matrix((last_row - means_f) / sds_f, nrow=1)
prob_next <- predict(final_cv, newx=last_scaled, s="lambda.1se", type="response")[1,1]
bulan_depan <- tail(data_wide$date, 1) + months(1)

# Gunakan threshold berbasis cost untuk rekomendasi final
thresh_final <- best_cost_thr
thresh_siaga_final <- thresh_final * 0.6
status_final <- case_when(
  prob_next >= thresh_final ~ "WASPADA",
  prob_next >= thresh_siaga_final ~ "SIAGA",
  TRUE ~ "AMAN"
)
cat(sprintf("\nPrediksi %s: prob=%.3f -> %s (threshold cost=%.2f)\n", 
            format(bulan_depan, "%B %Y"), prob_next, status_final, thresh_final))

## 
## Prediksi Januari 2026: prob=0.663 -> WASPADA (threshold cost=0.15)

21. Ringkasan Eksekutif

cat("\n========== RINGKASAN EKSEKUTIF (DENGAN KOREKSI) ==========\n")

## 
## ========== RINGKASAN EKSEKUTIF (DENGAN KOREKSI) ==========

cat(sprintf("Total data bulan: %d | Observasi model: %d\n", nrow(data_wide), nrow(X)))

## Total data bulan: 120 | Observasi model: 119

cat(sprintf("Proporsi gejolak dalam data: %.1f%%\n", proporsi_gejolak))

## Proporsi gejolak dalam data: 29.2%

cat(sprintf("Elastic Net | PR-AUC: %.4f | ROC-AUC: %.4f\n", prauc_en, auc_en))

## Elastic Net | PR-AUC: 0.7286 | ROC-AUC: 0.7012

cat(sprintf("Threshold optimal F2: %.2f (Recall=%.3f, Precision=%.3f)\n", 
            best_thr_en, thr_eval_en$recall[best_idx_en], thr_eval_en$precision[best_idx_en]))

## Threshold optimal F2: 0.15 (Recall=1.000, Precision=0.414)

cat(sprintf("Threshold optimal cost (FN=5xFP): %.2f (Recall=%.3f, Precision=%.3f)\n",
            best_cost_thr, cost_df$recall[best_cost_idx], cost_df$precision[best_cost_idx]))

## Threshold optimal cost (FN=5xFP): 0.15 (Recall=1.000, Precision=0.414)

cat(sprintf("Prediksi %s: prob=%.3f -> %s\n", format(bulan_depan, "%B %Y"), prob_next, status_final))

## Prediksi Januari 2026: prob=0.663 -> WASPADA

cat("\nKeterbatasan yang diakui:\n")

## 
## Keterbatasan yang diakui:

cat("- Ukuran sampel kecil (119 bulan) → recall 100% rapuh, perlu validasi lebih lanjut\n")

## - Ukuran sampel kecil (119 bulan) → recall 100% rapuh, perlu validasi lebih lanjut

cat("- Multikolinieritas harga komoditas menyebabkan Decision Tree tidak diskriminatif\n")

## - Multikolinieritas harga komoditas menyebabkan Decision Tree tidak diskriminatif

cat("- Threshold 0,10 tidak realistis secara ekonomi; gunakan threshold berbasis cost (%.2f)\n", best_cost_thr)

## - Threshold 0,10 tidak realistis secara ekonomi; gunakan threshold berbasis cost (%.2f)
##  0.15

cat("- Fitur impor telur dan daging ayam dihapus karena tidak relevan secara substansi\n")

## - Fitur impor telur dan daging ayam dihapus karena tidak relevan secara substansi

cat("=========================================\n")

## =========================================

Satria Data SEC Metode Klasifikasi

SEC5

2026-06-02

Sistem Deteksi Dini Gejolak Harga Pangan Multidimensi

1. Setup

2. Import Data

3. Deduplikasi

4. Data Makro Bulanan

5. Pivot Data Komoditas

6. Gabungkan Data

7. Imputasi Missing Value

8. Fitur Kalender dan Iklim

9. Lag Harga dan Volume

10. Harga Impor dalam Rupiah

11. Definisi Target Gejolak

12. Persiapan Data Model

13. Fungsi Helper

14. Baseline

15. Elastic Net

16. Cost-Benefit Analysis

17. Decision Tree

18. Visualisasi

19. Interpretasi Model

20. Prediksi Bulan Depan

21. Ringkasan Eksekutif