#INSTALL & LOAD PACKAGE ──────────────────────────────────
for (pkg in c("forecast", "tseries", "TSA")) {
  if (!requireNamespace(pkg, quietly = TRUE))
    install.packages(pkg, repos = "https://cloud.r-project.org")
}
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'TSA':
##   method       from    
##   fitted.Arima forecast
##   plot.Arima   forecast
library(forecast)
## Warning: package 'forecast' was built under R version 4.5.2
library(tseries)
## Warning: package 'tseries' was built under R version 4.5.2
library(TSA)
## Warning: package 'TSA' was built under R version 4.5.3
## 
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
## 
##     acf, arima
## The following object is masked from 'package:utils':
## 
##     tar
# ================================================================
# 1. SPESIFIKASI MODEL ARIMA(2,0,3)
# ================================================================

# Koefisien AR
phi1 <- -0.5
phi2 <- -0.7

# Koefisien MA TRUE
ma1_true <- 0.7
ma2_true <- 0.5
ma3_true <- 0.3
# Konvensi R: MA dibalik
ma1 <- -ma1_true
ma2 <- -ma2_true
ma3 <- -ma3_true

# Standar deviasi error
sigma <- sqrt(1.15)

cat("================================================================\n")
## ================================================================
cat("  1. SPESIFIKASI MODEL ARIMA(2,0,3)\n")
##   1. SPESIFIKASI MODEL ARIMA(2,0,3)
cat("================================================================\n")
## ================================================================
cat(sprintf("  phi1 (AR1) = %4.1f\n", phi1))
##   phi1 (AR1) = -0.5
cat(sprintf("  phi2 (AR2) = %4.1f\n", phi2))
##   phi2 (AR2) = -0.7
cat(sprintf("  ma1 true   = %4.1f --> input R = %.1f\n",
            ma1_true, ma1))
##   ma1 true   =  0.7 --> input R = -0.7
cat(sprintf("  ma2 true   = %4.1f --> input R = %.1f\n",
            ma2_true, ma2))
##   ma2 true   =  0.5 --> input R = -0.5
cat(sprintf("  ma3 true   = %4.1f --> input R = %.1f\n",
            ma3_true, ma3))
##   ma3 true   =  0.3 --> input R = -0.3
cat(sprintf("  sigma      = %.4f\n", sigma))
##   sigma      = 1.0724
cat("================================================================\n\n")
## ================================================================

#Model yang digunakan adalah ARIMA(p=2, d=0, q=3). Artinya model ini memiliki 2 komponen autoregressive (AR), tidak memerlukan differencing karena data sudah stasioner pada level, dan memiliki 3 komponen moving average (MA).

# ================================================================
# 2. GENERATE DATA DENGAN MODEL ARIMA(2,0,3)
# ================================================================
set.seed(123)

ar_vec <- c(phi1, phi2)
ma_vec <- c(ma1, ma2, ma3)

ts_raw <- arima.sim(
  n = 200,
  model = list(
    order = c(2,0,3),
    ar = ar_vec,
    ma = ma_vec
  ),
  sd = sigma
)

cat("================================================================\n")
## ================================================================
cat("  2. GENERATE DATA (n = 200)\n")
##   2. GENERATE DATA (n = 200)
cat("================================================================\n")
## ================================================================
cat(sprintf("  Jumlah observasi : %d\n", length(ts_raw)))
##   Jumlah observasi : 200
cat(sprintf("  Mean             : %.4f\n", mean(ts_raw)))
##   Mean             : 0.0109
cat(sprintf("  Std Dev          : %.4f\n", sd(ts_raw)))
##   Std Dev          : 2.1640
cat(sprintf("  Min              : %.4f\n", min(ts_raw)))
##   Min              : -6.4241
cat(sprintf("  Max              : %.4f\n", max(ts_raw)))
##   Max              : 6.6883
cat("================================================================\n\n")
## ================================================================
# Plot data mentah
par(mar = c(4, 4, 3, 1))

plot(ts_raw,
     type = "l",
     col = "steelblue",
     lwd = 1.5,
     main = "Data Simulasi ARIMA(2,0,3) — 200 Observasi",
     xlab = "Waktu",
     ylab = "Nilai")

abline(v = 50,
       col = "red",
       lty = 2,
       lwd = 2)

legend("topleft",
       legend = "Batas burn-in (obs ke-50)",
       col = "red",
       lty = 2,
       bty = "n")

#Data simulasi dibentuk menggunakan fungsi arima.sim() berdasarkan model ARIMA(2,0,3) yang telah ditentukan sebelumnya. Penetapan set.seed(123) dilakukan agar proses pembangkitan data menghasilkan output yang konsisten dan dapat diulang kembali dengan hasil yang sama.

# ================================================================
# 3. HAPUS 50 DATA PERTAMA (BURN-IN) & SPLIT DATA
# ================================================================

data_ts <- ts(as.numeric(ts_raw)[51:200])

n <- length(data_ts)

# Split 80% training / 20% testing
n_train <- floor(0.8 * n)
n_test  <- n - n_train

train_data <- ts(data_ts[1:n_train],
                 start = 1,
                 frequency = 1)

test_data <- ts(data_ts[(n_train + 1):n],
                start = n_train + 1,
                frequency = 1)

cat("================================================================\n")
## ================================================================
cat("  3. HAPUS BURN-IN & SPLIT DATA\n")
##   3. HAPUS BURN-IN & SPLIT DATA
cat("================================================================\n")
## ================================================================
cat(sprintf("  Total data generate : 200 obs\n"))
##   Total data generate : 200 obs
cat(sprintf("  Dihapus burn-in     :  50 obs\n"))
##   Dihapus burn-in     :  50 obs
cat(sprintf("  Sisa data bersih    : %d obs\n", n))
##   Sisa data bersih    : 150 obs
cat(sprintf("  Training (80%%)      : %d obs\n", n_train))
##   Training (80%)      : 120 obs
cat(sprintf("  Testing  (20%%)      : %d obs\n", n_test))
##   Testing  (20%)      : 30 obs
cat("================================================================\n\n")
## ================================================================
# Plot split
par(mar = c(4, 4, 3, 1))

plot(data_ts,
     type = "l",
     col = "gray70",
     lwd = 1,
     main = "Data Bersih: Training vs Testing",
     xlab = "Waktu",
     ylab = "Nilai")

lines(1:n_train,
      as.numeric(train_data),
      col = "steelblue",
      lwd = 2)

lines((n_train+1):n,
      as.numeric(test_data),
      col = "firebrick",
      lwd = 2)

abline(v = n_train,
       col = "black",
       lty = 2,
       lwd = 1.5)

legend("topleft",
       legend = c(sprintf("Training (n=%d)", n_train),
                  sprintf("Testing (n=%d)", n_test)),
       col = c("steelblue", "firebrick"),
       lwd = 2,
       bty = "n")

#Sebanyak 50 observasi awal dihilangkan karena dianggap sebagai fase awal simulasi yang masih terpengaruh oleh nilai awal proses. Penghapusan ini dilakukan agar data yang dianalisis lebih stabil dan lebih mencerminkan karakteristik sebenarnya dari model ARIMA yang digunakan.

# ================================================================
# 4. UJI ADF (Augmented Dickey-Fuller)
# ================================================================

cat("================================================================\n")
## ================================================================
cat("  4. UJI ADF (Augmented Dickey-Fuller)\n")
##   4. UJI ADF (Augmented Dickey-Fuller)
cat("================================================================\n")
## ================================================================
cat("  H0: Data tidak stasioner\n")
##   H0: Data tidak stasioner
cat("  H1: Data stasioner\n")
##   H1: Data stasioner
cat("  Tolak H0 jika p-value < 0.05\n")
##   Tolak H0 jika p-value < 0.05
cat("----------------------------------------------------------------\n")
## ----------------------------------------------------------------
# Uji pada data level (d = 0)

adf_level <- adf.test(train_data,
                      alternative = "stationary")
## Warning in adf.test(train_data, alternative = "stationary"): p-value smaller
## than printed p-value
cat("  --- Data Level (d = 0) ---\n")
##   --- Data Level (d = 0) ---
cat(sprintf("  Statistik ADF : %.4f\n",
            adf_level$statistic))
##   Statistik ADF : -7.8411
cat(sprintf("  p-value       : %.4f\n",
            adf_level$p.value))
##   p-value       : 0.0100
cat(sprintf("  Kesimpulan    : %s\n\n",
            ifelse(adf_level$p.value < 0.05,
                   "STASIONER pada level (tidak perlu differencing)",
                   "Tidak Stasioner")))
##   Kesimpulan    : STASIONER pada level (tidak perlu differencing)
cat("================================================================\n\n")
## ================================================================
# ================================================================
# Plot Data Level
# ================================================================

par(mfrow = c(1,1),
    mar = c(4, 4, 3, 1))

plot(train_data,
     type = "l",
     col = "steelblue",
     lwd = 1.5,
     main = "Data Level (Training)",
     xlab = "Waktu",
     ylab = "Nilai")

abline(h = mean(train_data),
       col = "red",
       lty = 2,
       lwd = 2)

par(mfrow = c(1, 1))

#Uji Augmented Dickey-Fuller (ADF) digunakan untuk mengevaluasi apakah deret waktu sudah bersifat stasioner atau masih mengandung unit root. Proses pengujian dilakukan sesuai orde differencing pada model. Karena model yang digunakan adalah ARIMA(2,0,3), data sudah stasioner pada level sehingga tidak diperlukan proses differencing tambahan.

# ================================================================
# 5. ACF, PACF, DAN EACF
# ================================================================

cat("================================================================\n")
## ================================================================
cat("  5. ACF, PACF, DAN EACF\n")
##   5. ACF, PACF, DAN EACF
cat("================================================================\n\n")
## ================================================================
par(mfrow = c(1,2),
    mar = c(4, 4, 3, 1))

acf(train_data,
    lag.max = 20,
    col = "steelblue",
    lwd = 2,
    main = "ACF")

pacf(train_data,
     lag.max = 20,
     col = "firebrick",
     lwd = 2,
     main = "PACF")

par(mfrow = c(1,1))
eacf(train_data)
## AR/MA
##   0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x x o x o o o o o o  o  o  o 
## 1 x x x x x o o o o o o  o  o  o 
## 2 x x o x o o o o o o o  o  o  o 
## 3 x x o o o o o o o o o  o  o  o 
## 4 x x o x o o o o o o o  o  o  o 
## 5 x x x o o o o x o o o  o  o  o 
## 6 x x o o o o o o o o o  o  o  o 
## 7 x x o o o o o o o o o  o  o  o

#ACF, PACF, dan EACF digunakan untuk membantu menentukan orde AR (p) dan MA (q) pada data yang sudah stasioner (d=0)(d=0)(d=0). #ACF (Autocorrelation Function) Menunjukkan korelasi data dengan lag sebelumnya. Pola cut off pada ACF mengindikasikan komponen MA(q). #PACF (Partial Autocorrelation Function) Menunjukkan korelasi parsial antar lag. Pola cut off pada PACF mengindikasikan komponen AR(p). #EACF (Extended ACF) Digunakan untuk menentukan kandidat model ARIMA melalui pola simbol “o” dan “x”. Kandidat model dipilih dari segitiga “o”.

# ================================================================
# 6. KANDIDAT MODEL
# ================================================================

cat("================================================================\n")
## ================================================================
cat("  6. KANDIDAT MODEL\n")
##   6. KANDIDAT MODEL
cat("================================================================\n\n")
## ================================================================
model1 <- Arima(train_data,
                order = c(2,0,3))

model2 <- Arima(train_data,
                order = c(2,0,2))

model3 <- Arima(train_data,
                order = c(2,0,1))

cat("Semua kandidat model berhasil di-fit.\n\n")
## Semua kandidat model berhasil di-fit.
# ================================================================
# 7. MODEL TERBAIK — AIC, BIC, AICc
# ================================================================

models <- list(model1, model2, model3)

model_names <- c("ARIMA(2,0,3)",
                 "ARIMA(2,0,2)",
                 "ARIMA(2,0,1)")

tabel <- data.frame(
  Model  = model_names,
  AIC    = sapply(models, function(m) round(m$aic,4)),
  BIC    = sapply(models, function(m) round(m$bic,4)),
  AICc   = sapply(models, function(m) round(m$aicc,4)),
  LogLik = sapply(models, function(m)
    round(as.numeric(logLik(m)),4))
)

tabel <- tabel[order(tabel$AIC),]

rownames(tabel) <- NULL

cat("================================================================\n")
## ================================================================
cat("  7. PERBANDINGAN KRITERIA INFORMASI\n")
##   7. PERBANDINGAN KRITERIA INFORMASI
cat("================================================================\n\n")
## ================================================================
print(tabel)
##          Model      AIC      BIC     AICc    LogLik
## 1 ARIMA(2,0,2) 423.3633 440.0883 424.1067 -205.6817
## 2 ARIMA(2,0,3) 425.0239 444.5364 426.0239 -205.5120
## 3 ARIMA(2,0,1) 428.6901 442.6276 429.2165 -209.3451
cat("\nModel terbaik : ", tabel$Model[1], "\n\n")
## 
## Model terbaik :  ARIMA(2,0,2)
# Ringkasan model terbaik
cat("================================================================\n")
## ================================================================
cat("  RINGKASAN MODEL TERBAIK\n")
##   RINGKASAN MODEL TERBAIK
cat("================================================================\n\n")
## ================================================================
summary(model1)
## Series: train_data 
## ARIMA(2,0,3) with non-zero mean 
## 
## Coefficients:
##           ar1      ar2      ma1      ma2     ma3    mean
##       -0.4110  -0.4593  -0.3433  -0.4974  0.1020  0.0088
## s.e.   0.1678   0.1486   0.1818   0.1419  0.1794  0.0178
## 
## sigma^2 = 1.855:  log likelihood = -205.51
## AIC=425.02   AICc=426.02   BIC=444.54
## 
## Training set error measures:
##                        ME     RMSE      MAE      MPE    MAPE      MASE
## Training set -0.008519115 1.327663 1.060605 101.8146 206.716 0.3902745
##                      ACF1
## Training set -0.003498754

#Kriteria AIC, BIC, dan AICc digunakan untuk menentukan model terbaik. Model dengan nilai paling kecil dianggap paling baik karena mampu memberikan kecocokan model yang optimal dengan kompleksitas yang lebih efisien.

# ================================================================
# 8. DIAGNOSTIK MODEL
# ================================================================

res <- residuals(model1)

cat("================================================================\n")
## ================================================================
cat("  8. DIAGNOSTIK MODEL\n")
##   8. DIAGNOSTIK MODEL
cat("================================================================\n\n")
## ================================================================
# Plot diagnostik
par(mfrow = c(2,2),
    mar = c(4, 4, 3, 1))

plot(res,
     type = "l",
     col = "steelblue",
     lwd = 1.2,
     main = "Residual vs Waktu",
     xlab = "Waktu",
     ylab = "Residual")

abline(h = 0,
       col = "red",
       lty = 2,
       lwd = 1.5)

hist(res,
     breaks = 15,
     col = "lightblue",
     border = "white",
     main = "Histogram Residual",
     xlab = "Residual",
     probability = TRUE)

curve(dnorm(x,
            mean(res),
            sd(res)),
      add = TRUE,
      col = "red",
      lwd = 2)

acf(res,
    lag.max = 20,
    col = "darkgreen",
    lwd = 2,
    main = "ACF Residual")

qqnorm(res,
       main = "QQ-Plot Residual",
       col = "purple",
       pch = 16,
       cex = 0.7)

qqline(res,
       col = "red",
       lwd = 2)

par(mfrow = c(1,1))
# Uji Ljung-Box

cat("================================================================\n")
## ================================================================
cat("  UJI LJUNG-BOX\n")
##   UJI LJUNG-BOX
cat("================================================================\n\n")
## ================================================================
for (lg in c(5,10,15,20)) {

  lb <- Box.test(res,
                 lag = lg,
                 type = "Ljung-Box",
                 fitdf = 5)

cat(sprintf("Lag %2d | Statistik = %8.4f | p-value = %.4f\n",
              lg,
              lb$statistic,
              lb$p.value))
}
## Lag  5 | Statistik =   0.0905 | p-value = 0.0000
## Lag 10 | Statistik =   1.8947 | p-value = 0.8635
## Lag 15 | Statistik =   9.8032 | p-value = 0.4579
## Lag 20 | Statistik =  15.7874 | p-value = 0.3963
# Uji Jarque-Bera

cat("\n================================================================\n")
## 
## ================================================================
cat("  UJI JARQUE-BERA\n")
##   UJI JARQUE-BERA
cat("================================================================\n\n")
## ================================================================
jb <- jarque.bera.test(res)

cat(sprintf("Statistik JB : %.4f\n",
            jb$statistic))
## Statistik JB : 8.2204
cat(sprintf("p-value      : %.4f\n",
            jb$p.value))
## p-value      : 0.0164
# ================================================================
# FORECAST VS DATA TESTING
# ================================================================

cat("\n================================================================\n")
## 
## ================================================================
cat("  FORECAST VS DATA TESTING\n")
##   FORECAST VS DATA TESTING
cat("================================================================\n\n")
## ================================================================
fc     <- forecast(model1, h = n_test)

pred   <- as.numeric(fc$mean)

aktual <- as.numeric(test_data)

mae  <- mean(abs(pred - aktual))

rmse <- sqrt(mean((pred - aktual)^2))

mape <- mean(abs((pred - aktual) / aktual)) * 100

par(mar = c(4, 4, 3, 1))

plot(fc,
     main = "Forecast vs Aktual — ARIMA(2,0,3)",
     xlab = "Waktu",
     ylab = "Nilai",
     fcol = "firebrick",
     flwd = 2,
     shadecols = c("#d6e9f8", "#aed4f1"))

lines((n_train + 1):(n_train + n_test),
      aktual,
      col = "darkgreen",
      lwd = 2)

legend("topleft",
       legend = c("Aktual Testing",
                  "Forecast",
                  "CI 80%",
                  "CI 95%"),
       col = c("darkgreen",
               "firebrick",
               "#aed4f1",
               "#d6e9f8"),
       lwd = c(2,2,6,6),
       bty = "n")

cat(sprintf("MAE  : %.4f\n", mae))
## MAE  : 1.4644
cat(sprintf("RMSE : %.4f\n", rmse))
## RMSE : 1.7667
cat(sprintf("MAPE : %.4f%%\n\n", mape))
## MAPE : 96.9753%
accuracy(fc, aktual)
##                        ME     RMSE      MAE       MPE      MAPE      MASE
## Training set -0.008519115 1.327663 1.060605 101.81458 206.71602 0.3902745
## Test set      0.007521851 1.766654 1.464393  92.93831  96.97527 0.5388576
##                      ACF1
## Training set -0.003498754
## Test set               NA

#Diagnostik model dilakukan untuk memastikan bahwa residual memenuhi asumsi white noise, yaitu tidak mengandung autokorelasi, berdistribusi normal, dan memiliki variansi yang stabil.