=== 1. LOAD LIBRARY ===
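
The library chunk is not echoed in the source, so here is a minimal sketch of the loads implied by the functions used below (the exact set in the original may differ):

library(readxl)    # read_excel
library(dplyr)     # %>%, mutate, filter, arrange
library(tidyr)     # pivot_longer
library(zoo)       # as.yearmon
library(ggplot2)   # plots
library(forecast)  # holt, BoxCox, auto.arima, accuracy, autoplot
library(tseries)   # adf.test, kpss.test
library(MASS)      # boxcox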

=== 2. READ & TIDY THE DATA ===

file_path <- "C:/Users/USER/Downloads/data denda arw.xlsx"

raw_data <- read_excel(file_path, skip = 1, col_names = FALSE)
## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
names(raw_data)[1:6] <- c("Bulan","2020","2021","2022","2023","2024")

data_long <- raw_data %>%
  pivot_longer(cols = c("2020","2021","2022","2023","2024"),
               names_to = "Tahun", values_to = "Denda") %>%
  filter(!is.na(Denda))

data_long <- data_long %>%
  mutate(
    Tahun = as.integer(Tahun),
    Denda = as.numeric(Denda),
    Bulan = as.character(Bulan)
  )

bulan_order <- c("Januari","Februari","Maret","April","Mei","Juni",
                 "Juli","Agustus","September","Oktober","November","Desember")

data_long <- data_long %>%
  filter(Bulan %in% bulan_order) %>%
  mutate(Bulan = factor(Bulan, levels = bulan_order)) %>%
  arrange(Tahun, Bulan) %>%
  mutate(Date = as.yearmon(paste(Tahun, Bulan), "%Y %B")) %>%
  mutate(Date = as.Date(Date))

# Monthly series from Jan 2020; train on 2020-2023, test on 2024
ts_all <- ts(data_long$Denda, start = c(2020, 1), frequency = 12)
ts_training <- window(ts_all, end = c(2023, 12))
ts_testing  <- window(ts_all, start = c(2024, 1))

#3. PLOT TRAINING VS TESTING

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
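
For reference, a minimal sketch of such a plot, assuming ggplot2/forecast (the hidden chunk may differ; linewidth replaces the deprecated size aesthetic):

autoplot(ts_training, series = "Training") +
  autolayer(ts_testing, series = "Testing") +
  geom_vline(xintercept = 2024, linetype = "dashed", linewidth = 0.5) +
  labs(title = "Training vs Testing", y = "Denda", colour = "Series")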

#4. BOXPLOT

df_training <- data_long %>% filter(Tahun <= 2023)

# Month abbreviations (Jan-Dec) for the monthly boxplot
df_training$MonthAbb <- factor(
  month.abb[as.integer(format(df_training$Date, "%m"))],
  levels = month.abb
)

ggplot(df_training, aes(factor(Tahun), Denda)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot per Year")

ggplot(df_training, aes(MonthAbb, Denda)) +
  geom_boxplot(fill = "gold") +
  labs(title = "Boxplot per Month")

#5. IDENTIFICATION: ACF & PACF

par(mfrow=c(1,2))
acf(ts_training, main="ACF Training")
pacf(ts_training, main="PACF Training")

par(mfrow=c(1,1))

#6. STATIONARITY TESTS

cat("\n--- ADF TEST ---\n")
## 
## --- ADF TEST ---
print(adf.test(ts_training))
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_training
## Dickey-Fuller = -2.7682, Lag order = 3, p-value = 0.2673
## alternative hypothesis: stationary
cat("\n--- KPSS TEST ---\n")
## 
## --- KPSS TEST ---
print(kpss.test(ts_training))
## Warning in kpss.test(ts_training): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  ts_training
## KPSS Level = 0.18791, Truncation lag parameter = 3, p-value = 0.1
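
The two tests disagree: ADF fails to reject a unit root (p = 0.2673) while KPSS fails to reject level stationarity (p >= 0.1), so the required differencing order is worth checking before fitting ARIMA. A quick consensus check, a sketch not in the original output:

ndiffs(ts_training)  # forecast's estimate of d; the ARIMA candidates below use d = 1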

#7. BOX-COX TRANSFORMATION

y <- ts_training

# Box-Cox requires strictly positive data, so shift the series if any value <= 0
min_val <- min(y)
shift <- 0
if (min_val <= 0) {
  shift <- abs(min_val) + 1
  y_shift <- y + shift
  cat("Data <= 0 → shifted by:", shift, "\n")
} else {
  y_shift <- y
}

# Intercept-only model so MASS::boxcox can profile the likelihood over lambda
fit_dummy <- lm(y_shift ~ 1)

# Passing main= into boxcox() triggers a warning (it is forwarded to lm),
# so add the title to the profile plot separately
bc <- boxcox(fit_dummy, lambda = seq(-2, 2, 0.1))
title(main = "Box-Cox Transformation (95% CI)")

lambda_opt <- bc$x[which.max(bc$y)]
ci <- bc$x[bc$y > max(bc$y) - qchisq(0.95,1)/2]

cat("Lambda optimal =", lambda_opt, "\n")
## Lambda optimal = 0.3434343
cat("CI 95% =", ci[1], "sampai", ci[length(ci)], "\n")
## CI 95% = 0.1010101 sampai 0.6262626
y_bc <- BoxCox(y_shift, lambda_opt)

par(mfrow=c(2,1))
plot(y, main = "Original Data", ylab = "Denda")
plot(y_bc, main = paste("After Box-Cox (λ =", round(lambda_opt, 3), ")"), ylab = "Denda (BC)")

par(mfrow=c(1,1))
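
To read results back on the original scale, the transform can be inverted and the shift undone. A minimal sketch, not part of the original code:

# Invert the Box-Cox transform, then remove the positivity shift (if any)
y_back <- InvBoxCox(y_bc, lambda_opt) - shift
all.equal(as.numeric(y_back), as.numeric(y))  # should be TRUE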

#8. DOUBLE EXPONENTIAL SMOOTHING (HOLT) - MAIN MODEL

model_holt <- holt(ts_training,
                   initial="optimal",
                   h=length(ts_testing),
                   exponential=FALSE)

summary(model_holt)
## 
## Forecast method: Holt's method
## 
## Model Information:
## Holt's method 
## 
## Call:
## holt(y = ts_training, h = length(ts_testing), initial = "optimal", 
##     exponential = FALSE)
## 
##   Smoothing parameters:
##     alpha = 0.3681 
##     beta  = 1e-04 
## 
##   Initial states:
##     l = 122128399.7153 
##     b = -3780854.6827 
## 
##   sigma:  49224990
## 
##      AIC     AICc      BIC 
## 1891.985 1893.413 1901.341 
## 
## Error measures:
##                   ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set 5590486 47129340 35856275 -111.0776 154.1508 0.5740707 0.03568187
## 
## Forecasts:
##          Point Forecast      Lo 80    Hi 80      Lo 95     Hi 95
## Jan 2024       36038588  -27045775 99122951  -60440620 132517796
## Feb 2024       32284569  -34939549 99508686  -70525847 135094984
## Mar 2024       28530550  -42594833 99655933  -80246337 137307436
## Apr 2024       24776530  -50048943 99602003  -89659155 139212216
## May 2024       21022511  -57330383 99375405  -98807899 140852922
## Jun 2024       17268492  -64461514 98998498 -107726766 142263750
## Jul 2024       13514473  -71460260 98489205 -116443166 143472111
## Aug 2024        9760453  -78341249 97862156 -124979473 144500380
## Sep 2024        6006434  -85116605 97129474 -133354229 145367097
## Oct 2024        2252415  -91796510 96301340 -141583004 146087834
## Nov 2024       -1501604  -98389612 95386404 -149679025 146675817
## Dec 2024       -5255623 -104903331 94392084 -157653641 147142394
# Fitted values vs actuals on the training set
autoplot(ts_training, series = "Actual") +
  autolayer(model_holt$fitted, series = "Fitted Holt") +
  labs(title = "Double Exponential Smoothing (Holt)",
       y = "Denda")

# Forecast vs the testing set; autolayer() for a plain ts has no PI argument,
# which is what triggered the "Ignoring unknown parameters" warning
autoplot(model_holt) +
  autolayer(ts_testing, series = "Actual Testing") +
  labs(title = "Holt Forecast vs Testing")

# Holt accuracy
accuracy_holt <- accuracy(model_holt, ts_testing)
cat("\n=== HOLT MODEL ACCURACY ===\n")
## 
## === HOLT MODEL ACCURACY ===
print(accuracy_holt)
##                    ME     RMSE      MAE        MPE      MAPE      MASE
## Training set  5590486 47129340 35856275 -111.07762 154.15083 0.5740707
## Test set     59170573 62704951 59170573   80.32592  80.32592 0.9473403
##                    ACF1 Theil's U
## Training set 0.03568187        NA
## Test set     0.30017674  2.125483
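
The test-set MAPE of about 80% shows the fitted linear trend extrapolates too aggressively into 2024 (the point forecasts even turn negative from November). A damped-trend variant is a standard check in this situation; a sketch, not part of the original analysis:

# Holt with a damped trend, hypothetical comparison model
model_holt_damped <- holt(ts_training, damped = TRUE, h = length(ts_testing))
accuracy(model_holt_damped, ts_testing)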

#9. TREND REGRESSION

t <- seq_along(ts_training)
lm_trend <- lm(as.numeric(ts_training) ~ t)
summary(lm_trend)
## 
## Call:
## lm(formula = as.numeric(ts_training) ~ t)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -72820639 -39235110 -12320695  41268270  90725432 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 83708653   14122442   5.927 3.72e-07 ***
## t            -807803     501766  -1.610    0.114    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48160000 on 46 degrees of freedom
## Multiple R-squared:  0.05334,    Adjusted R-squared:  0.03276 
## F-statistic: 2.592 on 1 and 46 DF,  p-value: 0.1143
plot(ts_training, main="Regresi Tren", ylab="Denda")

# garis fitted training
lines(t, lm_trend$fitted.values, col="red", lwd=2)

# prediksi future
t_future <- (length(ts_training)+1):(length(ts_training)+length(ts_testing))
pred_trend <- predict(lm_trend, data.frame(t=t_future))

# garis prediksi testing
lines(t_future, pred_trend, col="darkgreen", lwd=2, lty=2)
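
For comparability with the Holt and ARIMA results, the trend forecasts can also be scored against the testing set. A minimal sketch, not in the original output:

# Score the regression forecasts on the 2024 testing set
accuracy(ts(pred_trend, start = c(2024, 1), frequency = 12), ts_testing)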

#10. ARIMA MODEL

auto_fit <- auto.arima(ts_training, stepwise=FALSE, approximation=FALSE)
summary(auto_fit)
## Series: ts_training 
## ARIMA(1,0,0) with non-zero mean 
## 
## Coefficients:
##          ar1      mean
##       0.3561  64314015
## s.e.  0.1351  10179903
## 
## sigma^2 = 2.136e+15:  log likelihood = -914.31
## AIC=1834.61   AICc=1835.16   BIC=1840.23
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -491164.4 45249023 38082072 -153.5304 183.1281 0.6097065
##                     ACF1
## Training set -0.02521728
# Manual candidates with d = 1, consistent with the ADF result above
m1 <- Arima(ts_training, order=c(1,1,0))
m2 <- Arima(ts_training, order=c(0,1,1))
m3 <- Arima(ts_training, order=c(1,1,1))

aic_df <- data.frame(
  Model=c("ARIMA(1,1,0)","ARIMA(0,1,1)","ARIMA(1,1,1)"),
  AIC=c(AIC(m1),AIC(m2),AIC(m3))
)
print(aic_df)
##          Model      AIC
## 1 ARIMA(1,1,0) 1805.586
## 2 ARIMA(0,1,1) 1799.419
## 3 ARIMA(1,1,1) 1800.475
best_idx <- which.min(aic_df$AIC)
best_model <- list(m1,m2,m3)[[best_idx]]

cat("Model ARIMA terbaik manual:", aic_df$Model[best_idx], "\n")
## Model ARIMA terbaik manual: ARIMA(0,1,1)
checkresiduals(best_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)
## Q* = 7.3811, df = 9, p-value = 0.5975
## 
## Model df: 1.   Total lags used: 10

With a Ljung-Box p-value of 0.5975, there is no evidence of residual autocorrelation, so the ARIMA(0,1,1) residuals are consistent with white noise.

fc_arima <- forecast(best_model, h=length(ts_testing))

autoplot(fc_arima) +
  autolayer(ts_testing, series="Actual") +
  labs(title="Forecast ARIMA")

acc_arima <- accuracy(fc_arima, ts_testing)
cat("\n=== AKURASI ARIMA ===\n")
## 
## === AKURASI ARIMA ===
print(acc_arima)
##                    ME     RMSE      MAE        MPE      MAPE      MASE
## Training set -4496131 46979884 36678750 -155.06616 184.33412 0.5872388
## Test set     28241042 39164048 29902878   29.92843  34.50019 0.4787549
##                    ACF1 Theil's U
## Training set 0.04257315        NA
## Test set     0.47109867  0.996765
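
Pulling the test-set scores together makes the comparison explicit. A sketch using the accuracy objects computed above:

# Side-by-side test-set accuracy for the two candidate models
comparison <- data.frame(
  Model = c("Holt", aic_df$Model[best_idx]),
  RMSE  = c(accuracy_holt["Test set", "RMSE"], acc_arima["Test set", "RMSE"]),
  MAPE  = c(accuracy_holt["Test set", "MAPE"], acc_arima["Test set", "MAPE"])
)
print(comparison)

On the testing set, ARIMA(0,1,1) is clearly the more accurate model (RMSE ≈ 39.2M vs 62.7M; MAPE ≈ 34.5% vs 80.3% for Holt).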