library(readxl)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tseries)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# 1️⃣ Import dan pembersihan data
data <- read_excel("C:/Users/HP/Downloads/Data curah hujan 2020-2024 di Bandung (1).xlsx", 
                   sheet = "Sheet2")
colnames(data) <- c("Tanggal", "Curah_Hujan")
data <- na.omit(data)
data$Tanggal <- as.Date(data$Tanggal)

# Tambah kolom Tahun dan Bulan
data <- data %>%
  mutate(
    Tahun = format(Tanggal, "%Y"),
    Bulan = format(Tanggal, "%B")
  )
# 2️⃣ Pemisahan Data Training (2020–2023) & Testing (2024)
data_train <- data %>% filter(Tahun %in% c("2020", "2021", "2022", "2023"))
data_test  <- data %>% filter(Tahun == "2024")
# 3️⃣ Statistik Deskriptif
cat("===== Statistik Deskriptif Keseluruhan (2020–2024) =====\n")
## ===== Statistik Deskriptif Keseluruhan (2020–2024) =====
print(summary(data$Curah_Hujan))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    18.0    79.0   166.7   181.1   271.2   512.0
cat("\nStandar Deviasi:", sd(data$Curah_Hujan), "\n")
## 
## Standar Deviasi: 118.0585
cat("\n===== Statistik Deskriptif Training (2020–2023) =====\n")
## 
## ===== Statistik Deskriptif Training (2020–2023) =====
print(summary(data_train$Curah_Hujan))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   84.25  179.75  180.24  272.25  454.30
cat("\nStandar Deviasi:", sd(data_train$Curah_Hujan), "\n")
## 
## Standar Deviasi: 113.0322
cat("\n===== Statistik Deskriptif Testing (2024) =====\n")
## 
## ===== Statistik Deskriptif Testing (2024) =====
print(summary(data_test$Curah_Hujan))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    30.0    78.5   146.0   184.6   266.0   512.0
cat("\nStandar Deviasi:", sd(data_test$Curah_Hujan), "\n")
## 
## Standar Deviasi: 141.9555
# Statistik per tahun
print(
  data %>%
    group_by(Tahun) %>%
    summarise(
      Rata_rata = mean(Curah_Hujan),
      SD = sd(Curah_Hujan),
      Minimum = min(Curah_Hujan),
      Maksimum = max(Curah_Hujan)
    )
)
## # A tibble: 5 × 5
##   Tahun Rata_rata    SD Minimum Maksimum
##   <chr>     <dbl> <dbl>   <dbl>    <dbl>
## 1 2020       202.  115.    30       337 
## 2 2021       181.  114.    33.2     454.
## 3 2022       193.  111.    29.9     367.
## 4 2023       146.  118.    18       365 
## 5 2024       185.  142.    30       512
# 4️⃣ Visualisasi Data

# a. Time Series Plot dengan Garis Tren
ggplot(data, aes(x = Tanggal, y = Curah_Hujan)) +
  geom_line(color = "black", linewidth = 0.5) +
  geom_point(color = "darkred", size = 1.5) +
  geom_smooth(method = "loess", se = FALSE, color = "orange", linewidth = 1) +
  ggtitle("Curah Hujan Bulanan di Kota Bandung (2020–2024)") +
  xlab("Tahun") +
  ylab("Curah Hujan (mm)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# b. Boxplot Keseluruhan per Tahun
ggplot(data, aes(x = Tahun, y = Curah_Hujan, fill = Tahun)) +
  geom_boxplot(alpha = 0.7) +
  ggtitle("Boxplot Curah Hujan per Tahun (2020–2024)") +
  xlab("Tahun") +
  ylab("Curah Hujan (mm)") +
  theme_minimal() +
  theme(legend.position = "none")

# c. Boxplot Seluruh Data 
ggplot(data, aes(y = Curah_Hujan)) +
  geom_boxplot(fill = "steelblue", alpha = 0.8) +
  ggtitle("Boxplot Seluruh Data Curah Hujan Bandung (2020–2024)") +
  ylab("Curah Hujan (mm)") +
  theme_minimal()

# d. Boxplot Training dan Testing
data$Periode <- ifelse(data$Tahun == "2024", "Testing (2024)", "Training (2020–2023)")
ggplot(data, aes(x = Periode, y = Curah_Hujan, fill = Periode)) +
  geom_boxplot(alpha = 0.7) +
  ggtitle("Boxplot Curah Hujan Training dan Testing") +
  xlab("") +
  ylab("Curah Hujan (mm)") +
  theme_minimal() +
  theme(legend.position = "none")

# e. Boxplot per Bulan (urut sesuai kalender)
data$Bulan <- factor(
  data$Bulan,
  levels = c(
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
  ),
  ordered = TRUE
)

ggplot(data, aes(x = Bulan, y = Curah_Hujan, fill = Bulan)) +
  geom_boxplot(alpha = 0.8) +
  ggtitle("Boxplot Curah Hujan per Bulan (2020–2024)") +
  xlab("Bulan") +
  ylab("Curah Hujan (mm)") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# 5️⃣ PLOT ACF DAN PACF DATA TRAINING
# Konversi data training ke time series
train_ts <- ts(data_train$Curah_Hujan, start = c(2020, 1), frequency = 12)

test_ts <- ts(data_test$Curah_Hujan, start = c(2024, 1), frequency = 12)

# Plot ACF & PACF untuk data training
par(mfrow = c(1, 2))
acf(train_ts, main = "Plot ACF Curah Hujan")
pacf(train_ts, main = "Plot PACF Curah Hujan")

par(mfrow = c(1, 1))
# 6️⃣ UJI STASIONERITAS (ADF TEST)

cat("\n===== UJI STASIONERITAS (ADF TEST) DATA TRAINING =====\n")
## 
## ===== UJI STASIONERITAS (ADF TEST) DATA TRAINING =====
adf_result <- adf.test(train_ts)
## Warning in adf.test(train_ts): p-value smaller than printed p-value
print(adf_result)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train_ts
## Dickey-Fuller = -5.402, Lag order = 3, p-value = 0.01
## alternative hypothesis: stationary
# Jika data tidak stasioner (p-value > 0.05), lakukan differencing
if (adf_result$p.value > 0.05) {
  cat("\nData tidak stasioner. Melakukan differencing 1x...\n")
  diff_train <- diff(train_ts)
  
  # Plot data setelah differencing
  autoplot(diff_train) +
    ggtitle("Data Training Setelah Differencing (1x)") +
    xlab("Tahun") + ylab("Perubahan Curah Hujan (mm)") +
    theme_minimal()
  
  # Plot ACF & PACF setelah differencing
  par(mfrow = c(1, 2))
  acf(diff_train, main = "ACF Setelah Differencing (1x)")
  pacf(diff_train, main = "PACF Setelah Differencing (1x)")
  par(mfrow = c(1, 1))
} else {
  cat("\nData sudah stasioner, tidak perlu differencing.\n")
}
## 
## Data sudah stasioner, tidak perlu differencing.
# 7️⃣ IDENTIFIKASI TIME SERIES

# Plot time series data training
autoplot(train_ts) +
  ggtitle("Plot Time Series Curah Hujan Bandung (Training 2020–2023)") +
  xlab("Tahun") + ylab("Curah Hujan (mm)") +
  theme_minimal()

# Dekomposisi data
dekomposisi <- decompose(train_ts, type = "additive")
autoplot(dekomposisi) +
  ggtitle("Dekomposisi Time Series Curah Hujan (Additive Model)") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the forecast package.
##   Please report the issue at <https://github.com/robjhyndman/forecast/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Boxplot musiman
boxplot(Curah_Hujan ~ Bulan, data = data_train,
        main = "Pola Musiman Curah Hujan (2020–2023)",
        xlab = "Bulan", ylab = "Curah Hujan (mm)",
        col = "skyblue")

# 8️⃣ PEMODELAN EXPONENTIAL SMOOTHING TRIPLE (HOLT-WINTER)

# Model Holt-Winters (Additive)
hw_model <- HoltWinters(train_ts, seasonal = "additive")

# Lihat hasil parameter smoothing
hw_model
## Holt-Winters exponential smoothing with trend and additive seasonal component.
## 
## Call:
## HoltWinters(x = train_ts, seasonal = "additive")
## 
## Smoothing parameters:
##  alpha: 0
##  beta : 0
##  gamma: 0.2529663
## 
## Coefficients:
##            [,1]
## a    140.603676
## b     -1.281949
## s1   -57.971796
## s2   -31.353429
## s3    91.667702
## s4    67.886407
## s5    63.113306
## s6   -65.725114
## s7  -118.162011
## s8  -125.515738
## s9   -79.345660
## s10   89.162344
## s11  109.264516
## s12  122.965327
# Plot hasil fitting
plot(hw_model, main = "Model Holt-Winters (Additive) Curah Hujan Bandung",
     ylab = "Curah Hujan (mm)", xlab = "Waktu")

# Peramalan 12 bulan ke depan (2024)
hw_forecast <- forecast(hw_model, h = 12)
# Plot hasil peramalan
plot(hw_forecast, main = "Peramalan Curah Hujan 2024 (Holt-Winters Additive)",
     ylab = "Curah Hujan (mm)", xlab = "Tahun")

# Akurasi model Holt-Winters
accuracy(hw_forecast, test_ts)
##                     ME     RMSE       MAE       MPE     MAPE      MASE
## Training set  7.245788  83.2868  59.86385 -20.53533 51.12713 0.7038666
## Test set     46.813507 142.8309 116.88674  -3.70363 83.21212 1.3743296
##                     ACF1 Theil's U
## Training set -0.09587046        NA
## Test set     -0.34781788 0.7602152
# 9️⃣PEMODELAN REGRESI TIME SERIES (MUSIMAN)
# Model regresi time series dengan musiman
reg_model <- tslm(train_ts ~ season)

# Ringkasan model
summary(reg_model)
## 
## Call:
## tslm(formula = train_ts ~ season)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -181.525  -53.744   -1.525   38.519  157.250 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   120.12      38.49   3.121  0.00354 **
## season2        59.63      54.43   1.095  0.28058   
## season3       135.47      54.43   2.489  0.01757 * 
## season4       145.00      54.43   2.664  0.01148 * 
## season5       116.60      54.43   2.142  0.03901 * 
## season6       -29.38      54.43  -0.540  0.59272   
## season7       -65.20      54.43  -1.198  0.23878   
## season8       -71.70      54.43  -1.317  0.19605   
## season9       -29.83      54.43  -0.548  0.58709   
## season10      123.40      54.43   2.267  0.02948 * 
## season11      181.75      54.43   3.339  0.00196 **
## season12      155.68      54.43   2.860  0.00700 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 76.97 on 36 degrees of freedom
## Multiple R-squared:  0.6448, Adjusted R-squared:  0.5363 
## F-statistic: 5.941 on 11 and 36 DF,  p-value: 2.058e-05
# Plot fitting model
plot(train_ts, main = "Fitting Regresi Time Series Curah Hujan (Training)")
lines(fitted(reg_model), col = "red")

# Peramalan 12 bulan ke depan
reg_forecast <- forecast(reg_model, h = 12)
# Plot hasil peramalan
plot(reg_forecast, main = "Peramalan Regresi Time Series Curah Hujan 2024",
     ylab = "Curah Hujan (mm)", xlab = "Tahun")

# Akurasi model regresi time series
accuracy(reg_forecast, test_ts)
##                        ME      RMSE       MAE       MPE     MAPE      MASE
## Training set 4.290116e-15  66.66049  51.07812 -27.83304 48.99192 0.6005658
## Test set     4.339583e+00 127.66896 102.52292 -43.30994 81.17429 1.2054429
##                    ACF1 Theil's U
## Training set  0.1096094        NA
## Test set     -0.3384045 0.5523028
# 🔟 PEMODELAN SARIMA                                                                     
# Model Linear
fit <- lm(train_ts ~ 1)
fit
## 
## Call:
## lm(formula = train_ts ~ 1)
## 
## Coefficients:
## (Intercept)  
##       180.2
# BOX COX
bc <- boxcox(fit,
             lambda = seq(-2, 2, by = 0.1),   # rentang lambda
             plotit = TRUE)                  # grafik

# Ambil lambda optimal dan CI 95%
lambda_opt <- bc$x[which.max(bc$y)]

ci_95 <- bc$x[bc$y > max(bc$y) - 0.5 * qchisq(0.95, df = 1)]

cat("Lambda optimal =", lambda_opt, "\n")
## Lambda optimal = 0.5050505
cat("CI 95% =", range(ci_95), "\n")
## CI 95% = 0.1818182 0.9090909
# BOX–COX TRANSFORMATION
# Cari lambda optimal (metode Guerrero)
lambda_opt <- BoxCox.lambda(train_ts, method = "guerrero")
lambda_opt
## [1] 1.133134
# Transformasi data
train_ts_bc <- BoxCox(train_ts, lambda_opt)
# Plot perbandingan
par(mfrow = c(2,1))
plot(train_ts, main = "Data Asli", ylab = "Curah Hujan")
plot(train_ts_bc, main = paste("Data Setelah Box–Cox (lambda =", round(lambda_opt,3), ")"))

par(mfrow = c(1,1))
# UJI STASIONERITAS NON-SEASONAL (ADF)
adf_before <- adf.test(train_ts)       # sebelum Box–Cox
## Warning in adf.test(train_ts): p-value smaller than printed p-value
adf_after  <- adf.test(train_ts_bc)    # setelah Box–Cox
## Warning in adf.test(train_ts_bc): p-value smaller than printed p-value
print(adf_before)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train_ts
## Dickey-Fuller = -5.402, Lag order = 3, p-value = 0.01
## alternative hypothesis: stationary
print(adf_after)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train_ts_bc
## Dickey-Fuller = -5.3867, Lag order = 3, p-value = 0.01
## alternative hypothesis: stationary
train_season_diff <- diff(train_ts_bc, lag = 12, differences = 1)
adf.test(train_season_diff)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train_season_diff
## Dickey-Fuller = -2.6746, Lag order = 3, p-value = 0.3107
## alternative hypothesis: stationary
par(mfrow = c(1,2))
acf(train_ts_bc, main = "ACF Setelah Box-Cox")
pacf(train_ts_bc, main = "PACF Setelah Box-Cox")

par(mfrow = c(1,1))

# Karena 0.3107 > 0.05 maka tolak H0, sehingga tidak pakai D = 1
# MODEL SARIMA D = 0
final_model <- Arima(train_ts_bc,
                     order = c(1,0,1),
                     seasonal = c(0,0,0))  # no seasonal component

summary(final_model)
## Series: train_ts_bc 
## ARIMA(1,0,1) with non-zero mean 
## 
## Coefficients:
##           ar1     ma1      mean
##       -0.2572  0.7840  328.2016
## s.e.   0.3189  0.2517   39.7981
## 
## sigma^2 = 40656:  log likelihood = -321.56
## AIC=651.11   AICc=652.04   BIC=658.6
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.2080451 195.2314 168.1958 -103.8131 135.1982 0.9850177
##                    ACF1
## Training set 0.02989791
# Forecast 12 Bulan ke depan
forecast_final <- forecast(final_model, h = 12)
plot(forecast_final, main="Peramalan 2024 (SARIMA Final)")