# 1. Load package
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(forecast)
## Warning: package 'forecast' was built under R version 4.4.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tseries)
## Warning: package 'tseries' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
library(zoo)
## Warning: package 'zoo' was built under R version 4.4.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# 1️⃣ Load Package
# ==========================================================
library(readxl)
library(dplyr)
library(lubridate)
library(ggplot2)
library(forecast)
library(tseries)
library(caret)
library(zoo)
library(gridExtra)

# ==========================================================
# 2️⃣ Import Data
# ==========================================================
data_pm25_raw <- read_excel("C:/Users/user/Downloads/PM2.5 Harian 2025.xlsx")

# Lihat struktur data
str(data_pm25_raw)
## tibble [273 × 2] (S3: tbl_df/tbl/data.frame)
##  $ Date : POSIXct[1:273], format: "2025-01-01" "2025-01-02" ...
##  $ PM2.5: num [1:273] 32 33 31 24 47 36 68 55 34 42 ...
head(data_pm25_raw)
## # A tibble: 6 × 2
##   Date                PM2.5
##   <dttm>              <dbl>
## 1 2025-01-01 00:00:00    32
## 2 2025-01-02 00:00:00    33
## 3 2025-01-03 00:00:00    31
## 4 2025-01-04 00:00:00    24
## 5 2025-01-05 00:00:00    47
## 6 2025-01-06 00:00:00    36
# ==========================================================
# 3️⃣ Pemeriksaan Awal (Data Quality Check)
# ==========================================================
# Pastikan kolom sesuai
colnames(data_pm25_raw) <- c("Date", "PM25")

# Cek missing value
sum(is.na(data_pm25_raw))
## [1] 0
# Cek duplikasi
sum(duplicated(data_pm25_raw))
## [1] 0
# Cek format tanggal
data_pm25_raw$Date <- as.Date(data_pm25_raw$Date)

# Cek nilai unik dan ringkasan
summary(data_pm25_raw)
##       Date                 PM25      
##  Min.   :2025-01-01   Min.   : 0.00  
##  1st Qu.:2025-03-10   1st Qu.:26.00  
##  Median :2025-05-17   Median :33.00  
##  Mean   :2025-05-17   Mean   :34.71  
##  3rd Qu.:2025-07-24   3rd Qu.:42.00  
##  Max.   :2025-09-30   Max.   :70.00
# Cek outlier (boxplot)
ggplot(data_pm25_raw, aes(y = PM25)) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Boxplot PM2.5 Harian", y = "PM2.5")

# 4️⃣ Preprocessing Data
# ==========================================================
# Isi missing value (jika ada)
data_pm25_raw$PM25 <- zoo::na.locf(data_pm25_raw$PM25, fromLast = TRUE)

# Urutkan berdasarkan tanggal
data_pm25 <- data_pm25_raw %>% arrange(Date)

# Buat objek time series harian
ts_pm25 <- ts(data_pm25$PM25, frequency = 365, start = c(2025, 1))
# 5️⃣ Eksplorasi Data Analisis (EDA)
# ==========================================================
# Plot deret waktu
ggplot(data_pm25, aes(x = Date, y = PM25)) +
  geom_line(color = "blue") +
  labs(title = "Pergerakan Harian PM2.5 Tahun 2025", x = "Tanggal", y = "PM2.5")

# Rata-rata bulanan
data_pm25_monthly <- data_pm25 %>%
  mutate(Bulan = floor_date(Date, "month")) %>%
  group_by(Bulan) %>%
  summarise(Mean_PM25 = mean(PM25))

ggplot(data_pm25_monthly, aes(x = Bulan, y = Mean_PM25)) +
  geom_line(color = "darkgreen") +
  geom_point() +
  labs(title = "Rata-rata PM2.5 Bulanan", x = "Bulan", y = "PM2.5")

# 6️⃣ Data Splitting (Training & Testing)
# ==========================================================
n <- nrow(data_pm25)
train_size <- round(0.8 * n)
train <- data_pm25[1:train_size, ]
test  <- data_pm25[(train_size + 1):n, ]

ts_train <- ts(train$PM25, frequency = 365)
ts_test <- ts(test$PM25, frequency = 365)

# ==========================================================
# 7️⃣ Uji Kestasioneran (ADF Test)
# ==========================================================
adf_result <- adf.test(ts_train)
## Warning in adf.test(ts_train): p-value smaller than printed p-value
adf_result
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_train
## Dickey-Fuller = -5.1549, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
# ==========================================================
# 8️⃣ Differencing (Jika p-value > 0.05)
# ==========================================================
if (adf_result$p.value > 0.05) {
  ts_train_diff <- diff(ts_train)
} else {
  ts_train_diff <- ts_train
}

adf.test(ts_train_diff)
## Warning in adf.test(ts_train_diff): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_train_diff
## Dickey-Fuller = -5.1549, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
# ==========================================================
# 9️⃣ Plot ACF & PACF
# ==========================================================
par(mfrow = c(1, 2))
acf(ts_train_diff, main = "Plot ACF PM2.5")
pacf(ts_train_diff, main = "Plot PACF PM2.5")

par(mfrow = c(1, 1))
# ==========================================================
# 🔟 Fit Model ARIMA
# ==========================================================
# Model 1: Auto ARIMA
model_auto <- auto.arima(ts_train)

# Model 2: Manual ARIMA (Differencing)
model_diff <- Arima(ts_train, order = c(1,1,1))

# Model 3: Seasonal ARIMA (Bulanan)
model_seasonal_alt <- auto.arima(ts_train, seasonal = TRUE)

# ==========================================================
# 1️⃣1️⃣ Uji Diagnostik Residual
# ==========================================================
checkresiduals(model_auto)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0) with non-zero mean
## Q* = 29.483, df = 43, p-value = 0.9421
## 
## Model df: 1.   Total lags used: 44
checkresiduals(model_diff)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1)
## Q* = 30.698, df = 42, p-value = 0.9016
## 
## Model df: 2.   Total lags used: 44
checkresiduals(model_seasonal_alt)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0) with non-zero mean
## Q* = 29.483, df = 43, p-value = 0.9421
## 
## Model df: 1.   Total lags used: 44
# ==========================================================
# 1️⃣2️⃣ Forecasting
# ==========================================================
forecast_auto <- forecast(model_auto, h = length(test$PM25))
forecast_diff <- forecast(model_diff, h = length(test$PM25))
forecast_seasonal_alt <- forecast(model_seasonal_alt, h = length(test$PM25))

# ==========================================================
# 1️⃣3️⃣ Evaluasi Model (Manual Calculation)
# ==========================================================
calc_metrics <- function(actual, predicted) {
  rmse <- sqrt(mean((actual - predicted)^2))
  mae <- mean(abs(actual - predicted))
  mape <- mean(abs((actual - predicted) / actual)) * 100
  return(c(RMSE = rmse, MAE = mae, MAPE = mape))
}


# Ambil nilai prediksi dari masing-masing model
pred_auto <- as.numeric(forecast_auto$mean)
pred_diff <- as.numeric(forecast_diff$mean)
pred_seasonal <- as.numeric(forecast_seasonal_alt$mean)


# Hitung metrik evaluasi untuk masing-masing model
acc_auto <- calc_metrics(test$PM25, pred_auto)
acc_diff <- calc_metrics(test$PM25, pred_diff)
acc_seasonal_alt <- calc_metrics(test$PM25, pred_seasonal)


# Gabungkan hasil ke dalam tabel perbandingan
library(dplyr)
comparison <- data.frame(
  Model = c("Auto ARIMA", "ARIMA Differencing", "Seasonal ARIMA (Bulanan)"),
  RMSE = c(acc_auto["RMSE"], acc_diff["RMSE"], acc_seasonal_alt["RMSE"]),
  MAE = c(acc_auto["MAE"], acc_diff["MAE"], acc_seasonal_alt["MAE"]),
  MAPE = c(acc_auto["MAPE"], acc_diff["MAPE"], acc_seasonal_alt["MAPE"])
)


# Bulatkan nilai metrik menjadi 3 desimal
comparison <- comparison %>% mutate(across(where(is.numeric), ~ round(., 3)))
print(comparison)
##                      Model   RMSE   MAE   MAPE
## 1               Auto ARIMA 12.147 9.822 27.654
## 2       ARIMA Differencing 12.149 9.823 27.655
## 3 Seasonal ARIMA (Bulanan) 12.147 9.822 27.654
# Menampilkan model terbaik berdasarkan RMSE dan MAPE
best_by_rmse <- comparison[which.min(comparison$RMSE), ]
cat("\nModel terbaik berdasarkan RMSE:\n")
## 
## Model terbaik berdasarkan RMSE:
print(best_by_rmse)
##        Model   RMSE   MAE   MAPE
## 1 Auto ARIMA 12.147 9.822 27.654
best_by_mape <- comparison[which.min(comparison$MAPE), ]
cat("\nModel terbaik berdasarkan MAPE:\n")
## 
## Model terbaik berdasarkan MAPE:
print(best_by_mape)
##        Model   RMSE   MAE   MAPE
## 1 Auto ARIMA 12.147 9.822 27.654
# ==========================================================
# 1️⃣4️⃣ Visualisasi Hasil Forecast
# ==========================================================
library(ggplot2)
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.4.3
## Registered S3 methods overwritten by 'ggfortify':
##   method                 from    
##   autoplot.Arima         forecast
##   autoplot.acf           forecast
##   autoplot.ar            forecast
##   autoplot.bats          forecast
##   autoplot.decomposed.ts forecast
##   autoplot.ets           forecast
##   autoplot.forecast      forecast
##   autoplot.stl           forecast
##   autoplot.ts            forecast
##   fitted.ar              forecast
##   fortify.ts             forecast
##   residuals.ar           forecast
# Konversi ulang data training dan testing menjadi time series dengan frekuensi harian
ts_train <- ts(train$PM25, frequency = 7)  # frekuensi mingguan untuk data harian
ts_test  <- ts(test$PM25, start = end(ts_train)[1] + 1/365, frequency = 7)

# ---------- Plot 1: Auto ARIMA ----------
autoplot(ts_train, series = "Training", color = "black") +
  autolayer(ts_test, series = "Testing (Aktual)", color = "red") +
  autolayer(forecast_auto$mean, series = "Forecast (Auto ARIMA)", color = "blue") +
  labs(title = "Perbandingan Aktual vs Forecast (Auto ARIMA)",
       x = "Waktu", y = "Konsentrasi PM2.5") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# ---------- Plot 2: ARIMA Differencing ----------
autoplot(ts_train, series = "Training", color = "black") +
  autolayer(ts_test, series = "Testing (Aktual)", color = "red") +
  autolayer(forecast_diff$mean, series = "Forecast (ARIMA Differencing)", color = "blue") +
  labs(title = "Perbandingan Aktual vs Forecast (ARIMA Differencing)",
       x = "Waktu", y = "Konsentrasi PM2.5") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# ---------- Plot 3: Seasonal ARIMA ----------
autoplot(ts_train, series = "Training", color = "black") +
  autolayer(ts_test, series = "Testing (Aktual)", color = "red") +
  autolayer(forecast_seasonal_alt$mean, series = "Forecast (Seasonal ARIMA)", color = "blue") +
  labs(title = "Perbandingan Aktual vs Forecast (Seasonal ARIMA)",
       x = "Waktu", y = "Konsentrasi PM2.5") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# ==========================================================
# ✅ Selesai — Model terbaik dapat dilihat dari nilai RMSE/MAE/MAPE terkecil
# ==========================================================