library(readxl)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tseries)
# Import and clean the data.
# NOTE(review): the workbook path is an absolute, machine-specific location;
# the script only runs where this exact file exists.
data <- read_excel(
  path = "C:/Users/HP/Downloads/Data curah hujan 2020-2024 di Bandung (1).xlsx",
  sheet = "Sheet2"
)
# Give the two columns fixed names (date, rainfall).
names(data) <- c("Tanggal", "Curah_Hujan")
# Drop every row that contains a missing value.
data <- na.omit(data)
# Store the date column as class Date.
data[["Tanggal"]] <- as.Date(data[["Tanggal"]])

# Add year (Tahun) and month-name (Bulan) columns derived from the date.
# Both are character values produced by format(); the month name ("%B")
# is rendered according to the current locale.
data <- mutate(
  data,
  Tahun = format(Tanggal, "%Y"),
  Bulan = format(Tanggal, "%B")
)
# Split the data by year label: training (2020-2023) and testing (2024).
# Each subset is printed right after it is created.
data_train <- filter(data, Tahun %in% as.character(2020:2023))
print(data_train)
data_test <- filter(data, Tahun == "2024")
print(data_test)
# Descriptive statistics
# For the full data set, then the training and testing subsets, print the
# summary() of rainfall followed by its standard deviation.
cat("===== Statistik Deskriptif Keseluruhan (2020–2024) =====\n")
## ===== Statistik Deskriptif Keseluruhan (2020–2024) =====
print(with(data, summary(Curah_Hujan)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    18.0    79.0   166.7   181.1   271.2   512.0
cat("\nStandar Deviasi:", with(data, sd(Curah_Hujan)), "\n")
## 
## Standar Deviasi: 118.0585
cat("\n===== Statistik Deskriptif Training (2020–2023) =====\n")
## 
## ===== Statistik Deskriptif Training (2020–2023) =====
print(with(data_train, summary(Curah_Hujan)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   84.25  179.75  180.24  272.25  454.30
cat("\nStandar Deviasi:", with(data_train, sd(Curah_Hujan)), "\n")
## 
## Standar Deviasi: 113.0322
cat("\n===== Statistik Deskriptif Testing (2024) =====\n")
## 
## ===== Statistik Deskriptif Testing (2024) =====
print(with(data_test, summary(Curah_Hujan)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    30.0    78.5   146.0   184.6   266.0   512.0
cat("\nStandar Deviasi:", with(data_test, sd(Curah_Hujan)), "\n")
## 
## Standar Deviasi: 141.9555
# Per-year statistics: mean, standard deviation, minimum and maximum of
# rainfall for each value of Tahun.
cat("\n===== Statistik Curah Hujan per Tahun =====\n")
## 
## ===== Statistik Curah Hujan per Tahun =====
print(
  summarise(
    group_by(data, Tahun),
    Rata_rata = mean(Curah_Hujan),
    SD = sd(Curah_Hujan),
    Minimum = min(Curah_Hujan),
    Maksimum = max(Curah_Hujan)
  )
)
## # A tibble: 5 × 5
##   Tahun Rata_rata    SD Minimum Maksimum
##   <chr>     <dbl> <dbl>   <dbl>    <dbl>
## 1 2020       202.  115.    30       337 
## 2 2021       181.  114.    33.2     454.
## 3 2022       193.  111.    29.9     367.
## 4 2023       146.  118.    18       365 
## 5 2024       185.  142.    30       512
# Data visualisation
# a. Time series plot with a trend line: the observations drawn as a line
#    with points, overlaid with a LOESS smoother (no confidence band).
ggplot(data, aes(Tanggal, Curah_Hujan)) +
  geom_line(linewidth = 0.5, color = "black") +
  geom_point(size = 1.5, color = "darkred") +
  geom_smooth(method = "loess", color = "orange", linewidth = 1, se = FALSE) +
  labs(
    title = "Curah Hujan Bulanan di Kota Bandung (2020–2024)",
    x = "Tahun",
    y = "Curah Hujan (mm)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# b. Box plot of rainfall for each year; boxes are filled by year and the
#    redundant legend is hidden.
ggplot(data, aes(Tahun, Curah_Hujan, fill = Tahun)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Boxplot Curah Hujan per Tahun (2020–2024)",
    x = "Tahun",
    y = "Curah Hujan (mm)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# c. Single box plot of all rainfall observations
ggplot(data, aes(y = Curah_Hujan)) +
  geom_boxplot(alpha = 0.8, fill = "steelblue") +
  labs(
    title = "Boxplot Seluruh Data Curah Hujan Bandung (2020–2024)",
    y = "Curah Hujan (mm)"
  ) +
  theme_minimal()

# d. Box plot comparing the training and testing periods
# Label each row by period (2024 is testing, every other year training);
# the Periode column is added to `data` itself.
data <- mutate(
  data,
  Periode = ifelse(Tahun == "2024", "Testing (2024)", "Training (2020–2023)")
)
ggplot(data, aes(Periode, Curah_Hujan, fill = Periode)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Boxplot Curah Hujan Training dan Testing",
    x = "",
    y = "Curah Hujan (mm)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# e. Box plot of rainfall for each calendar month
# Bulan is a character column, so ggplot2 would lay the months out in
# alphabetical order on the x axis. Reorder its levels by the month number
# taken from Tanggal so the boxes run in calendar order. The reordered
# column exists only in the data passed to this plot; `data` is unchanged.
ggplot(
  mutate(data, Bulan = reorder(Bulan, as.integer(format(Tanggal, "%m")))),
  aes(x = Bulan, y = Curah_Hujan, fill = Bulan)
) +
  geom_boxplot(alpha = 0.8) +
  ggtitle("Boxplot Curah Hujan per Bulan (2020–2024)") +
  xlab("Bulan") + ylab("Curah Hujan (mm)") +
  theme_minimal() +
  theme(
    # Slanted labels keep the twelve month names from overlapping.
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# ACF and PACF plots of the training data
# Convert the training data to a monthly time series.
# ts() takes the values in row order and assumes they are evenly spaced, so
# the rows are sorted by date first and the start period is derived from the
# earliest date rather than hard-coded. Helper values stay inside local().
train_ts <- local({
  row_order <- order(data_train$Tanggal)
  dates <- data_train$Tanggal[row_order]
  years <- as.integer(format(dates, "%Y"))
  months <- as.integer(format(dates, "%m"))

  # A missing or duplicated month (e.g. a row dropped during cleaning) would
  # silently shift every later observation to the wrong calendar month.
  if (any(diff(years * 12L + months) != 1L)) {
    warning(
      "Training data are not consecutive months; ",
      "the time series alignment is unreliable.",
      call. = FALSE
    )
  }

  ts(
    data_train$Curah_Hujan[row_order],
    start = c(years[1], months[1]),
    frequency = 12
  )
})
train_ts
##        Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov   Dec
## 2020 207.6 337.0 291.0 271.0 292.0  30.0  64.0  42.0  88.0 327.0 207.0 262.0
## 2021 146.4 153.9 292.5 177.3 239.0  92.4  33.2  91.8  73.0 218.4 454.3 198.5
## 2022  59.5 117.1 238.9 336.2 146.9 150.6  98.5  29.9 182.2 366.7 307.2 277.7
## 2023  67.0 111.0 200.0 276.0 269.0  90.0  24.0  30.0  18.0  62.0 239.0 365.0
# Plot the ACF and PACF of the training series side by side.
# par() returns the previous settings, so the layout that was active before
# this block is restored afterwards instead of being forced to 1 x 1.
old_par <- par(mfrow = c(1, 2))
acf(train_ts, main = "Plot ACF Curah Hujan")
pacf(train_ts, main = "Plot PACF Curah Hujan")

par(old_par)
# Stationarity test (Augmented Dickey-Fuller) on the training series.
cat("\n===== UJI STASIONERITAS (ADF TEST) DATA TRAINING =====\n")
## 
## ===== UJI STASIONERITAS (ADF TEST) DATA TRAINING =====
# Run the test with adf.test()'s default settings and keep the result; its
# p.value element is used afterwards to decide whether to difference.
# As the printed result shows, the alternative hypothesis is "stationary".
# The recorded warning means the printed p-value (0.01) is only an upper
# bound: the actual p-value is smaller.
adf_result <- adf.test(train_ts)
## Warning in adf.test(train_ts): p-value smaller than printed p-value
print(adf_result)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train_ts
## Dickey-Fuller = -5.402, Lag order = 3, p-value = 0.01
## alternative hypothesis: stationary
# If the series is not stationary (p-value > 0.05), difference it once and
# inspect the differenced series; otherwise just report that no
# differencing is needed.
if (adf_result$p.value > 0.05) {
  cat("\nData tidak stasioner. Melakukan differencing 1x...\n")
  diff_train <- diff(train_ts)

  # Plot the series after differencing. A ggplot object is only drawn when
  # it is printed, and inside a braced block the value of any expression
  # that is not the last one is discarded, so print() is called explicitly.
  print(
    autoplot(diff_train) +
      ggtitle("Data Training Setelah Differencing (1x)") +
      xlab("Tahun") + ylab("Perubahan Curah Hujan (mm)") +
      theme_minimal()
  )

  # ACF & PACF after differencing, side by side; the previous plotting
  # layout is restored afterwards.
  old_par <- par(mfrow = c(1, 2))
  acf(diff_train, main = "ACF Setelah Differencing (1x)")
  pacf(diff_train, main = "PACF Setelah Differencing (1x)")
  par(old_par)
} else {
  cat("\nData sudah stasioner, tidak perlu differencing.\n")
}
## 
## Data sudah stasioner, tidak perlu differencing.