# 1. Load package
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(forecast)
## Warning: package 'forecast' was built under R version 4.4.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(tseries)
## Warning: package 'tseries' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
library(zoo)
## Warning: package 'zoo' was built under R version 4.4.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# 1️⃣ Load Package
# ==========================================================
library(readxl)
library(dplyr)
library(lubridate)
library(ggplot2)
library(forecast)
library(tseries)
library(caret)
library(zoo)
library(gridExtra)
# ==========================================================
# 2️⃣ Import Data
# ==========================================================
data_pm25_raw <- read_excel("C:/Users/user/Downloads/PM2.5 Harian 2025.xlsx")
# Lihat struktur data
str(data_pm25_raw)
## tibble [273 × 2] (S3: tbl_df/tbl/data.frame)
## $ Date : POSIXct[1:273], format: "2025-01-01" "2025-01-02" ...
## $ PM2.5: num [1:273] 32 33 31 24 47 36 68 55 34 42 ...
head(data_pm25_raw)
## # A tibble: 6 × 2
## Date PM2.5
## <dttm> <dbl>
## 1 2025-01-01 00:00:00 32
## 2 2025-01-02 00:00:00 33
## 3 2025-01-03 00:00:00 31
## 4 2025-01-04 00:00:00 24
## 5 2025-01-05 00:00:00 47
## 6 2025-01-06 00:00:00 36
# ==========================================================
# 3️⃣ Pemeriksaan Awal (Data Quality Check)
# ==========================================================
# Pastikan kolom sesuai
colnames(data_pm25_raw) <- c("Date", "PM25")
# Cek missing value
sum(is.na(data_pm25_raw))
## [1] 0
# Cek duplikasi
sum(duplicated(data_pm25_raw))
## [1] 0
# Cek format tanggal
data_pm25_raw$Date <- as.Date(data_pm25_raw$Date)
# Cek nilai unik dan ringkasan
summary(data_pm25_raw)
## Date PM25
## Min. :2025-01-01 Min. : 0.00
## 1st Qu.:2025-03-10 1st Qu.:26.00
## Median :2025-05-17 Median :33.00
## Mean :2025-05-17 Mean :34.71
## 3rd Qu.:2025-07-24 3rd Qu.:42.00
## Max. :2025-09-30 Max. :70.00
# Cek outlier (boxplot)
ggplot(data_pm25_raw, aes(y = PM25)) +
geom_boxplot(fill = "skyblue") +
labs(title = "Boxplot PM2.5 Harian", y = "PM2.5")

# 4️⃣ Preprocessing Data
# ==========================================================
# Isi missing value (jika ada)
data_pm25_raw$PM25 <- zoo::na.locf(data_pm25_raw$PM25, fromLast = TRUE)
# Urutkan berdasarkan tanggal
data_pm25 <- data_pm25_raw %>% arrange(Date)
# Buat objek time series harian
ts_pm25 <- ts(data_pm25$PM25, frequency = 365, start = c(2025, 1))
# 5️⃣ Eksplorasi Data Analisis (EDA)
# ==========================================================
# Plot deret waktu
ggplot(data_pm25, aes(x = Date, y = PM25)) +
geom_line(color = "blue") +
labs(title = "Pergerakan Harian PM2.5 Tahun 2025", x = "Tanggal", y = "PM2.5")

# Rata-rata bulanan
data_pm25_monthly <- data_pm25 %>%
mutate(Bulan = floor_date(Date, "month")) %>%
group_by(Bulan) %>%
summarise(Mean_PM25 = mean(PM25))
ggplot(data_pm25_monthly, aes(x = Bulan, y = Mean_PM25)) +
geom_line(color = "darkgreen") +
geom_point() +
labs(title = "Rata-rata PM2.5 Bulanan", x = "Bulan", y = "PM2.5")

# 6️⃣ Data Splitting (Training & Testing)
# ==========================================================
n <- nrow(data_pm25)
train_size <- round(0.8 * n)
train <- data_pm25[1:train_size, ]
test <- data_pm25[(train_size + 1):n, ]
ts_train <- ts(train$PM25, frequency = 365)
ts_test <- ts(test$PM25, frequency = 365)
# ==========================================================
# 7️⃣ Uji Kestasioneran (ADF Test)
# ==========================================================
adf_result <- adf.test(ts_train)
## Warning in adf.test(ts_train): p-value smaller than printed p-value
adf_result
##
## Augmented Dickey-Fuller Test
##
## data: ts_train
## Dickey-Fuller = -5.1549, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
# ==========================================================
# 8️⃣ Differencing (Jika p-value > 0.05)
# ==========================================================
if (adf_result$p.value > 0.05) {
ts_train_diff <- diff(ts_train)
} else {
ts_train_diff <- ts_train
}
adf.test(ts_train_diff)
## Warning in adf.test(ts_train_diff): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: ts_train_diff
## Dickey-Fuller = -5.1549, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
# ==========================================================
# 9️⃣ Plot ACF & PACF
# ==========================================================
par(mfrow = c(1, 2))
acf(ts_train_diff, main = "Plot ACF PM2.5")
pacf(ts_train_diff, main = "Plot PACF PM2.5")

par(mfrow = c(1, 1))
# ==========================================================
# 🔟 Fit Model ARIMA
# ==========================================================
# Model 1: Auto ARIMA
model_auto <- auto.arima(ts_train)
# Model 2: Manual ARIMA (Differencing)
model_diff <- Arima(ts_train, order = c(1,1,1))
# Model 3: Seasonal ARIMA (Bulanan)
model_seasonal_alt <- auto.arima(ts_train, seasonal = TRUE)
# ==========================================================
# 1️⃣1️⃣ Uji Diagnostik Residual
# ==========================================================
checkresiduals(model_auto)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0) with non-zero mean
## Q* = 29.483, df = 43, p-value = 0.9421
##
## Model df: 1. Total lags used: 44
checkresiduals(model_diff)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1)
## Q* = 30.698, df = 42, p-value = 0.9016
##
## Model df: 2. Total lags used: 44
checkresiduals(model_seasonal_alt)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0) with non-zero mean
## Q* = 29.483, df = 43, p-value = 0.9421
##
## Model df: 1. Total lags used: 44
# ==========================================================
# 1️⃣2️⃣ Forecasting
# ==========================================================
forecast_auto <- forecast(model_auto, h = length(test$PM25))
forecast_diff <- forecast(model_diff, h = length(test$PM25))
forecast_seasonal_alt <- forecast(model_seasonal_alt, h = length(test$PM25))
# ==========================================================
# 1️⃣3️⃣ Evaluasi Model (Manual Calculation)
# ==========================================================
calc_metrics <- function(actual, predicted) {
rmse <- sqrt(mean((actual - predicted)^2))
mae <- mean(abs(actual - predicted))
mape <- mean(abs((actual - predicted) / actual)) * 100
return(c(RMSE = rmse, MAE = mae, MAPE = mape))
}
# Ambil nilai prediksi dari masing-masing model
pred_auto <- as.numeric(forecast_auto$mean)
pred_diff <- as.numeric(forecast_diff$mean)
pred_seasonal <- as.numeric(forecast_seasonal_alt$mean)
# Hitung metrik evaluasi untuk masing-masing model
acc_auto <- calc_metrics(test$PM25, pred_auto)
acc_diff <- calc_metrics(test$PM25, pred_diff)
acc_seasonal_alt <- calc_metrics(test$PM25, pred_seasonal)
# Gabungkan hasil ke dalam tabel perbandingan
library(dplyr)
comparison <- data.frame(
Model = c("Auto ARIMA", "ARIMA Differencing", "Seasonal ARIMA (Bulanan)"),
RMSE = c(acc_auto["RMSE"], acc_diff["RMSE"], acc_seasonal_alt["RMSE"]),
MAE = c(acc_auto["MAE"], acc_diff["MAE"], acc_seasonal_alt["MAE"]),
MAPE = c(acc_auto["MAPE"], acc_diff["MAPE"], acc_seasonal_alt["MAPE"])
)
# Bulatkan nilai metrik menjadi 3 desimal
comparison <- comparison %>% mutate(across(where(is.numeric), ~ round(., 3)))
print(comparison)
## Model RMSE MAE MAPE
## 1 Auto ARIMA 12.147 9.822 27.654
## 2 ARIMA Differencing 12.149 9.823 27.655
## 3 Seasonal ARIMA (Bulanan) 12.147 9.822 27.654
# Menampilkan model terbaik berdasarkan RMSE dan MAPE
best_by_rmse <- comparison[which.min(comparison$RMSE), ]
cat("\nModel terbaik berdasarkan RMSE:\n")
##
## Model terbaik berdasarkan RMSE:
print(best_by_rmse)
## Model RMSE MAE MAPE
## 1 Auto ARIMA 12.147 9.822 27.654
best_by_mape <- comparison[which.min(comparison$MAPE), ]
cat("\nModel terbaik berdasarkan MAPE:\n")
##
## Model terbaik berdasarkan MAPE:
print(best_by_mape)
## Model RMSE MAE MAPE
## 1 Auto ARIMA 12.147 9.822 27.654
# ==========================================================
# 1️⃣4️⃣ Visualisasi Hasil Forecast
# ==========================================================
library(ggplot2)
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.4.3
## Registered S3 methods overwritten by 'ggfortify':
## method from
## autoplot.Arima forecast
## autoplot.acf forecast
## autoplot.ar forecast
## autoplot.bats forecast
## autoplot.decomposed.ts forecast
## autoplot.ets forecast
## autoplot.forecast forecast
## autoplot.stl forecast
## autoplot.ts forecast
## fitted.ar forecast
## fortify.ts forecast
## residuals.ar forecast
# Konversi ulang data training dan testing menjadi time series dengan frekuensi harian
ts_train <- ts(train$PM25, frequency = 7) # frekuensi mingguan untuk data harian
ts_test <- ts(test$PM25, start = end(ts_train)[1] + 1/365, frequency = 7)
# ---------- Plot 1: Auto ARIMA ----------
autoplot(ts_train, series = "Training", color = "black") +
autolayer(ts_test, series = "Testing (Aktual)", color = "red") +
autolayer(forecast_auto$mean, series = "Forecast (Auto ARIMA)", color = "blue") +
labs(title = "Perbandingan Aktual vs Forecast (Auto ARIMA)",
x = "Waktu", y = "Konsentrasi PM2.5") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))

# ---------- Plot 2: ARIMA Differencing ----------
autoplot(ts_train, series = "Training", color = "black") +
autolayer(ts_test, series = "Testing (Aktual)", color = "red") +
autolayer(forecast_diff$mean, series = "Forecast (ARIMA Differencing)", color = "blue") +
labs(title = "Perbandingan Aktual vs Forecast (ARIMA Differencing)",
x = "Waktu", y = "Konsentrasi PM2.5") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))

# ---------- Plot 3: Seasonal ARIMA ----------
autoplot(ts_train, series = "Training", color = "black") +
autolayer(ts_test, series = "Testing (Aktual)", color = "red") +
autolayer(forecast_seasonal_alt$mean, series = "Forecast (Seasonal ARIMA)", color = "blue") +
labs(title = "Perbandingan Aktual vs Forecast (Seasonal ARIMA)",
x = "Waktu", y = "Konsentrasi PM2.5") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))

# ==========================================================
# ✅ Selesai — Model terbaik dapat dilihat dari nilai RMSE/MAE/MAPE terkecil
# ==========================================================