library(readxl)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tseries)
# ⿥ Import dan pembersihan data
data <- read_excel("C:/Users/HP/Downloads/Data curah hujan 2020-2024 di Bandung (1).xlsx", 
                   sheet = "Sheet2")
colnames(data) <- c("Tanggal", "Curah_Hujan")
data <- na.omit(data)
data$Tanggal <- as.Date(data$Tanggal)

# Tambah kolom Tahun dan Bulan
data <- data %>%
  mutate(
    Tahun = format(Tanggal, "%Y"),
    Bulan = format(Tanggal, "%B")
  )
# ⿢ Pemisahan Data Training (2020–2023) & Testing (2024)
data_train <- data %>% filter(Tahun %in% c("2020", "2021", "2022", "2023"))
data_train
## # A tibble: 48 × 4
##    Tanggal    Curah_Hujan Tahun Bulan    
##    <date>           <dbl> <chr> <chr>    
##  1 2020-01-01        208. 2020  January  
##  2 2020-02-01        337  2020  February 
##  3 2020-03-01        291  2020  March    
##  4 2020-04-01        271  2020  April    
##  5 2020-05-01        292  2020  May      
##  6 2020-06-01         30  2020  June     
##  7 2020-07-01         64  2020  July     
##  8 2020-08-01         42  2020  August   
##  9 2020-09-01         88  2020  September
## 10 2020-10-01        327  2020  October  
## # ℹ 38 more rows
data_test  <- data %>% filter(Tahun == "2024")
data_test
## # A tibble: 12 × 4
##    Tanggal    Curah_Hujan Tahun Bulan    
##    <date>           <dbl> <chr> <chr>    
##  1 2024-01-01         335 2024  January  
##  2 2024-02-01         264 2024  February 
##  3 2024-03-01         272 2024  March    
##  4 2024-04-01         215 2024  April    
##  5 2024-05-01          71 2024  May      
##  6 2024-06-01         156 2024  June     
##  7 2024-07-01          30 2024  July     
##  8 2024-08-01          53 2024  August   
##  9 2024-09-01         136 2024  September
## 10 2024-10-01          81 2024  October  
## 11 2024-11-01         512 2024  November 
## 12 2024-12-01          90 2024  December
# ⿣ Statistik Deskriptif
cat("===== Statistik Deskriptif Keseluruhan (2020–2024) =====\n")
## ===== Statistik Deskriptif Keseluruhan (2020–2024) =====
print(summary(data$Curah_Hujan))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    18.0    79.0   166.7   181.1   271.2   512.0
cat("\nStandar Deviasi:", sd(data$Curah_Hujan), "\n")
## 
## Standar Deviasi: 118.0585
cat("\n===== Statistik Deskriptif Training (2020–2023) =====\n")
## 
## ===== Statistik Deskriptif Training (2020–2023) =====
print(summary(data_train$Curah_Hujan))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   84.25  179.75  180.24  272.25  454.30
cat("\nStandar Deviasi:", sd(data_train$Curah_Hujan), "\n")
## 
## Standar Deviasi: 113.0322
cat("\n===== Statistik Deskriptif Testing (2024) =====\n")
## 
## ===== Statistik Deskriptif Testing (2024) =====
print(summary(data_test$Curah_Hujan))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    30.0    78.5   146.0   184.6   266.0   512.0
cat("\nStandar Deviasi:", sd(data_test$Curah_Hujan), "\n")
## 
## Standar Deviasi: 141.9555
# Statistik per tahun
cat("\n===== Statistik Curah Hujan per Tahun =====\n")
## 
## ===== Statistik Curah Hujan per Tahun =====
print(
  data %>%
    group_by(Tahun) %>%
    summarise(
      Rata_rata = mean(Curah_Hujan),
      SD = sd(Curah_Hujan),
      Minimum = min(Curah_Hujan),
      Maksimum = max(Curah_Hujan)
    )
)
## # A tibble: 5 × 5
##   Tahun Rata_rata    SD Minimum Maksimum
##   <chr>     <dbl> <dbl>   <dbl>    <dbl>
## 1 2020       202.  115.    30       337 
## 2 2021       181.  114.    33.2     454.
## 3 2022       193.  111.    29.9     367.
## 4 2023       146.  118.    18       365 
## 5 2024       185.  142.    30       512
# ⿤ Visualisasi Data

# a. Time Series Plot dengan Garis Tren
ggplot(data, aes(x = Tanggal, y = Curah_Hujan)) +
  geom_line(color = "black", linewidth = 0.5) +
  geom_point(color = "darkred", size = 1.5) +
  geom_smooth(method = "loess", se = FALSE, color = "orange", linewidth = 1) +
  ggtitle("Curah Hujan Bulanan di Kota Bandung (2020–2024)") +
  xlab("Tahun") +
  ylab("Curah Hujan (mm)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# b. Boxplot Keseluruhan per Tahun
ggplot(data, aes(x = Tahun, y = Curah_Hujan, fill = Tahun)) +
  geom_boxplot(alpha = 0.7) +
  ggtitle("Boxplot Curah Hujan per Tahun (2020–2024)") +
  xlab("Tahun") +
  ylab("Curah Hujan (mm)") +
  theme_minimal() +
  theme(legend.position = "none")

# c. Boxplot Seluruh Data 
ggplot(data, aes(y = Curah_Hujan)) +
  geom_boxplot(fill = "steelblue", alpha = 0.8) +
  ggtitle("Boxplot Seluruh Data Curah Hujan Bandung (2020–2024)") +
  ylab("Curah Hujan (mm)") +
  theme_minimal()

# d. Boxplot Training dan Testing
data$Periode <- ifelse(data$Tahun == "2024", "Testing (2024)", "Training (2020–2023)")
ggplot(data, aes(x = Periode, y = Curah_Hujan, fill = Periode)) +
  geom_boxplot(alpha = 0.7) +
  ggtitle("Boxplot Curah Hujan Training dan Testing") +
  xlab("") +
  ylab("Curah Hujan (mm)") +
  theme_minimal() +
  theme(legend.position = "none")

# e. Boxplot per Bulan (urut sesuai kalender)
data$Bulan <- factor(
  data$Bulan,
  levels = c(
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
  ),
  ordered = TRUE
)

ggplot(data, aes(x = Bulan, y = Curah_Hujan, fill = Bulan)) +
  geom_boxplot(alpha = 0.8) +
  ggtitle("Boxplot Curah Hujan per Bulan (2020–2024)") +
  xlab("Bulan") +
  ylab("Curah Hujan (mm)") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# ⿼ PLOT ACF DAN PACF DATA TRAINING

# Konversi data training ke time series
train_ts <- ts(data_train$Curah_Hujan, start = c(2020, 1), frequency = 12)

# Plot ACF & PACF untuk data training
par(mfrow = c(1, 2))
acf(train_ts, main = "Plot ACF Curah Hujan")
pacf(train_ts, main = "Plot PACF Curah Hujan")

par(mfrow = c(1, 1))
# ⿌ UJI STASIONERITAS (ADF TEST)

cat("\n===== UJI STASIONERITAS (ADF TEST) DATA TRAINING =====\n")
## 
## ===== UJI STASIONERITAS (ADF TEST) DATA TRAINING =====
adf_result <- adf.test(train_ts)
## Warning in adf.test(train_ts): p-value smaller than printed p-value
print(adf_result)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train_ts
## Dickey-Fuller = -5.402, Lag order = 3, p-value = 0.01
## alternative hypothesis: stationary
# Jika data tidak stasioner (p-value > 0.05), lakukan differencing
if (adf_result$p.value > 0.05) {
  cat("\nData tidak stasioner. Melakukan differencing 1x...\n")
  diff_train <- diff(train_ts)
  
# Plot data setelah differencing
  autoplot(diff_train) +
    ggtitle("Data Training Setelah Differencing (1x)") +
    xlab("Tahun") + ylab("Perubahan Curah Hujan (mm)") +
    theme_minimal()
  
  # Plot ACF & PACF setelah differencing
  par(mfrow = c(1, 2))
  acf(diff_train, main = "ACF Setelah Differencing (1x)")
  pacf(diff_train, main = "PACF Setelah Differencing (1x)")
  par(mfrow = c(1, 1))
} else {
  cat("\nData sudah stasioner, tidak perlu differencing.\n")
}
## 
## Data sudah stasioner, tidak perlu differencing.
# âż§ IDENTIFIKASI TIME SERIES

# Plot time series data training
autoplot(train_ts) +
  ggtitle("Plot Time Series Curah Hujan Bandung (Training 2020–2023)") +
  xlab("Tahun") + ylab("Curah Hujan (mm)") +
  theme_minimal()

# Dekomposisi data
dekomposisi <- decompose(train_ts, type = "additive")
autoplot(dekomposisi) +
  ggtitle("Dekomposisi Time Series Curah Hujan (Additive Model)") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the forecast package.
##   Please report the issue at <https://github.com/robjhyndman/forecast/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Boxplot musiman
boxplot(Curah_Hujan ~ Bulan, data = data_train,
        main = "Pola Musiman Curah Hujan (2020–2023)",
        xlab = "Bulan", ylab = "Curah Hujan (mm)",
        col = "skyblue")

# ⿨ PEMODELAN EXPONENTIAL SMOOTHING TRIPLE (HOLT-WINTER)

# Model Holt-Winters (Additive)
hw_model <- HoltWinters(train_ts, seasonal = "additive")

# Lihat hasil parameter smoothing
hw_model
## Holt-Winters exponential smoothing with trend and additive seasonal component.
## 
## Call:
## HoltWinters(x = train_ts, seasonal = "additive")
## 
## Smoothing parameters:
##  alpha: 0
##  beta : 0
##  gamma: 0.2529663
## 
## Coefficients:
##            [,1]
## a    140.603676
## b     -1.281949
## s1   -57.971796
## s2   -31.353429
## s3    91.667702
## s4    67.886407
## s5    63.113306
## s6   -65.725114
## s7  -118.162011
## s8  -125.515738
## s9   -79.345660
## s10   89.162344
## s11  109.264516
## s12  122.965327
# Konversi data testing (2024) menjadi objek time series
test_ts <- ts(data_test$Curah_Hujan, start = c(2024, 1), frequency = 12)
# Plot hasil fitting
plot(hw_model, main = "Model Holt-Winters (Additive) Curah Hujan Bandung",
     ylab = "Curah Hujan (mm)", xlab = "Waktu")

# Peramalan 12 bulan ke depan (2024)
hw_forecast <- forecast(hw_model, h = 12)

# Plot hasil peramalan
plot(hw_forecast, main = "Peramalan Curah Hujan 2024 (Holt-Winters Additive)",
     ylab = "Curah Hujan (mm)", xlab = "Tahun")

# Akurasi model Holt-Winters
accuracy(hw_forecast, test_ts)
##                     ME     RMSE       MAE       MPE     MAPE      MASE
## Training set  7.245788  83.2868  59.86385 -20.53533 51.12713 0.7038666
## Test set     46.813507 142.8309 116.88674  -3.70363 83.21212 1.3743296
##                     ACF1 Theil's U
## Training set -0.09587046        NA
## Test set     -0.34781788 0.7602152
# ⿊PEMODELAN REGRESI TIME SERIES (MUSIMAN)

# Model regresi time series dengan trend dan musiman
reg_model <- tslm(train_ts ~ season)

# Ringkasan model
summary(reg_model)
## 
## Call:
## tslm(formula = train_ts ~ season)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -181.525  -53.744   -1.525   38.519  157.250 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   120.12      38.49   3.121  0.00354 **
## season2        59.63      54.43   1.095  0.28058   
## season3       135.47      54.43   2.489  0.01757 * 
## season4       145.00      54.43   2.664  0.01148 * 
## season5       116.60      54.43   2.142  0.03901 * 
## season6       -29.38      54.43  -0.540  0.59272   
## season7       -65.20      54.43  -1.198  0.23878   
## season8       -71.70      54.43  -1.317  0.19605   
## season9       -29.83      54.43  -0.548  0.58709   
## season10      123.40      54.43   2.267  0.02948 * 
## season11      181.75      54.43   3.339  0.00196 **
## season12      155.68      54.43   2.860  0.00700 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 76.97 on 36 degrees of freedom
## Multiple R-squared:  0.6448, Adjusted R-squared:  0.5363 
## F-statistic: 5.941 on 11 and 36 DF,  p-value: 2.058e-05
# Plot fitting model
plot(train_ts, main = "Fitting Regresi Time Series Curah Hujan (Training)")
lines(fitted(reg_model), col = "red")

# Peramalan 12 bulan ke depan
reg_forecast <- forecast(reg_model, h = 12)

# Plot hasil peramalan
plot(reg_forecast, main = "Peramalan Regresi Time Series Curah Hujan 2024",
     ylab = "Curah Hujan (mm)", xlab = "Tahun")

# Akurasi model regresi time series
accuracy(reg_forecast, test_ts)
##                        ME      RMSE       MAE       MPE     MAPE      MASE
## Training set 4.290116e-15  66.66049  51.07812 -27.83304 48.99192 0.6005658
## Test set     4.339583e+00 127.66896 102.52292 -43.30994 81.17429 1.2054429
##                    ACF1 Theil's U
## Training set  0.1096094        NA
## Test set     -0.3384045 0.5523028
# 🔟 PEMODELAN SARIMA (SEASONAL ARIMA)

# Pemodelan SARIMA otomatis
sarima_model <- auto.arima(train_ts, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
summary(sarima_model)
## Series: train_ts 
## ARIMA(0,0,0)(0,1,1)[12] 
## 
## Coefficients:
##          sma1
##       -0.7243
## s.e.   0.5088
## 
## sigma^2 = 7737:  log likelihood = -215.73
## AIC=435.46   AICc=435.83   BIC=438.63
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE      ACF1
## Training set -14.64359 75.11008 53.24365 -38.25788 55.80402 0.6260277 0.0444806
# Diagnostik residual SARIMA
checkresiduals(sarima_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,0)(0,1,1)[12]
## Q* = 9.0125, df = 9, p-value = 0.4361
## 
## Model df: 1.   Total lags used: 10
# Peramalan 12 bulan ke depan
sarima_forecast <- forecast(sarima_model, h = 12)

# Plot hasil peramalan
plot(sarima_forecast, main = "Peramalan Curah Hujan 2024 (Model SARIMA)",
     ylab = "Curah Hujan (mm)", xlab = "Tahun")

# Akurasi model SARIMA
accuracy(sarima_forecast, test_ts)
##                      ME      RMSE       MAE       MPE     MAPE      MASE
## Training set -14.643587  75.11008  53.24365 -38.25788 55.80402 0.6260277
## Test set       8.138647 130.30404 105.40688 -40.52971 81.39865 1.2393520
##                    ACF1 Theil's U
## Training set  0.0444806        NA
## Test set     -0.2989930 0.5564026
# 1⿥ PERBANDINGAN KINERJA ANTAR MODEL

# Ambil nilai RMSE, MAE, dan MAPE dari setiap model
akurasi_hw <- accuracy(hw_forecast, test_ts)[2, c("RMSE", "MAE", "MAPE")]
akurasi_reg <- accuracy(reg_forecast, test_ts)[2, c("RMSE", "MAE", "MAPE")]
akurasi_sarima <- accuracy(sarima_forecast, test_ts)[2, c("RMSE", "MAE", "MAPE")]

# Gabungkan hasil ke dalam satu tabel
akurasi_model <- rbind(
  Holt_Winters_Additive = akurasi_hw,
  Regresi_TimeSeries = akurasi_reg,
  SARIMA = akurasi_sarima
)

print("📊 Perbandingan Akurasi Model Peramalan Curah Hujan Tahun 2024:")
## [1] "📊 Perbandingan Akurasi Model Peramalan Curah Hujan Tahun 2024:"
print(akurasi_model)
##                           RMSE      MAE     MAPE
## Holt_Winters_Additive 142.8309 116.8867 83.21212
## Regresi_TimeSeries    127.6690 102.5229 81.17429
## SARIMA                130.3040 105.4069 81.39865
# Visualisasi perbandingan MAPE
barplot(akurasi_model[, "MAPE"],
        names.arg = rownames(akurasi_model),
        col = c("skyblue", "salmon", "lightgreen"),
        main = "Perbandingan MAPE Model Peramalan Curah Hujan 2024",
        ylab = "MAPE (%)")