# ===========================================
# 1️⃣ Load Library
# ===========================================
library(readxl)
library(ggplot2)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(TTR)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tseries)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# ===========================================
# 2️⃣ Baca Data
# ===========================================

data <- read_excel("C:/Users/User/Downloads/data arw ekspor.xlsx")

# Ubah nama kolom agar aman
colnames(data) <- c("Tahun", "Bulan", "Ekspor")

# Pastikan kolom Ekspor numeric
data$Ekspor <- gsub("\\.", "", data$Ekspor)
data$Ekspor <- gsub(",", ".", data$Ekspor)
data$Ekspor <- as.numeric(data$Ekspor)

# Buat kolom tanggal
data$Tanggal <- seq(as.Date("2020-01-01"), by = "month", length.out = nrow(data))
data$Tahun <- format(data$Tanggal, "%Y")
data$Bulan <- format(data$Tanggal, "%b")
data$Bulan <- factor(data$Bulan, levels = month.abb)
# ===========================================
# 3️⃣ Cek Struktur Data
# ===========================================
str(data)
## tibble [60 × 4] (S3: tbl_df/tbl/data.frame)
##  $ Tahun  : chr [1:60] "2020" "2020" "2020" "2020" ...
##  $ Bulan  : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Ekspor : num [1:60] 1096 1058 1177 991 848 ...
##  $ Tanggal: Date[1:60], format: "2020-01-01" "2020-02-01" ...
summary(data)
##     Tahun               Bulan        Ekspor          Tanggal          
##  Length:60          Jan    : 5   Min.   : 793.4   Min.   :2020-01-01  
##  Class :character   Feb    : 5   1st Qu.:1374.9   1st Qu.:2021-03-24  
##  Mode  :character   Mar    : 5   Median :1908.2   Median :2022-06-16  
##                     Apr    : 5   Mean   :1898.3   Mean   :2022-06-16  
##                     May    : 5   3rd Qu.:2237.2   3rd Qu.:2023-09-08  
##                     Jun    : 5   Max.   :3404.5   Max.   :2024-12-01  
##                     (Other):30
colSums(is.na(data))
##   Tahun   Bulan  Ekspor Tanggal 
##       0       0       0       0
#Pemisahan Data Training (2020–2023) & Testing (2024)
data_train <- data %>% filter(Tahun %in% c("2020", "2021", "2022", "2023"))
data_train
## # A tibble: 48 × 4
##    Tahun Bulan Ekspor Tanggal   
##    <chr> <fct>  <dbl> <date>    
##  1 2020  Jan    1096. 2020-01-01
##  2 2020  Feb    1058  2020-02-01
##  3 2020  Mar    1177. 2020-03-01
##  4 2020  Apr     991. 2020-04-01
##  5 2020  May     848. 2020-05-01
##  6 2020  Jun     848. 2020-06-01
##  7 2020  Jul     876. 2020-07-01
##  8 2020  Aug     836. 2020-08-01
##  9 2020  Sep     793. 2020-09-01
## 10 2020  Oct     861. 2020-10-01
## # ℹ 38 more rows
# ===========================================
# 4️⃣ Pembagian Data Training & Testing
# ===========================================

# Buat objek time series (bulanan)
ts_ekspor <- ts(data$Ekspor, start = c(2020, 1), frequency = 12)

# Split data:
train <- window(ts_ekspor, end = c(2023, 12))  # 2020–2023
test  <- window(ts_ekspor, start = c(2024, 1)) # 2024

# Visualisasi pembagian data
autoplot(ts_ekspor) +
  autolayer(train, series = "Training", color = "blue") +
  autolayer(test, series = "Testing", color = "red") +
  ggtitle("Pembagian Data Training (2020–2023) dan Testing (2024)") +
  xlab("Tahun") +
  ylab("Nilai Ekspor (Juta USD)") +
  theme_minimal()

# ===========================================
# 5️⃣ Statistika Deskriptif
# ===========================================

# Hitung ukuran pemusatan & penyebaran
stat_deks <- data %>%
  summarise(
    Mean   = mean(Ekspor, na.rm = TRUE),
    Median = median(Ekspor, na.rm = TRUE),
    SD     = sd(Ekspor, na.rm = TRUE),
    Min    = min(Ekspor, na.rm = TRUE),
    Max    = max(Ekspor, na.rm = TRUE)
  )

# Tampilkan hasil
stat_deks
## # A tibble: 1 × 5
##    Mean Median    SD   Min   Max
##   <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 1898.  1908.  699.  793. 3404.
# ===========================================
# 6️⃣ Visualisasi Time Series dengan Garis Tren
# ===========================================

library(scales)  # untuk fungsi date_breaks & date_labels

ggplot(data, aes(x = Tanggal, y = Ekspor)) +
  geom_line(color = "black", linewidth = 0.5) +            # garis utama
  geom_point(color = "darkred", size = 1.5) +              # titik data
  geom_smooth(method = "loess", se = FALSE,                # garis tren LOESS
              color = "orange", linewidth = 1) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") + # tampilkan tahun di sumbu X
  ggtitle("Nilai Ekspor Bulanan Indonesia (2020–2024)") +
  xlab("Tahun") +
  ylab("Nilai Ekspor (Juta USD)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# ===========================================
# 7️⃣ Visualisasi Boxplot per Tahun & per Bulan
# ===========================================

# a. Boxplot per Tahun
ggplot(data, aes(x = Tahun, y = Ekspor, fill = Tahun)) +
  geom_boxplot() +
  ggtitle("Sebaran Nilai Ekspor per Tahun (2020–2024)") +
  xlab("Tahun") +
  ylab("Nilai Ekspor (Juta USD)") +
  theme_minimal()

# b. Boxplot per Bulan
ggplot(data, aes(x = Bulan, y = Ekspor, fill = Bulan)) +
  geom_boxplot() +
  ggtitle("Sebaran Nilai Ekspor per Bulan (2020–2024)") +
  xlab("Bulan") +
  ylab("Nilai Ekspor (Juta USD)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# ===========================================
# 8️⃣ Deteksi Outlier per Tahun dengan Boxplot
# ===========================================

# a. Deteksi outlier per tahun dengan metode IQR
outlier_check <- data %>%
  group_by(Tahun) %>%
  mutate(
    Q1 = quantile(Ekspor, 0.25, na.rm = TRUE),
    Q3 = quantile(Ekspor, 0.75, na.rm = TRUE),
    IQR = Q3 - Q1,
    Lower = Q1 - 1.5 * IQR,
    Upper = Q3 + 1.5 * IQR,
    Outlier = ifelse(Ekspor < Lower | Ekspor > Upper, TRUE, FALSE)
  )

# b. Simpan data yang termasuk outlier
outlier_values <- outlier_check %>% filter(Outlier == TRUE)
outlier_values
## # A tibble: 2 × 10
## # Groups:   Tahun [1]
##   Tahun Bulan Ekspor Tanggal       Q1    Q3   IQR Lower Upper Outlier
##   <chr> <fct>  <dbl> <date>     <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>  
## 1 2022  Jan    1011. 2022-01-01 2651. 3141.  490. 1917. 3875. TRUE   
## 2 2022  Feb    1838. 2022-02-01 2651. 3141.  490. 1917. 3875. TRUE
# c. Visualisasi boxplot per tahun untuk mendeteksi outlier
ggplot(data, aes(x = Tahun, y = Ekspor, fill = Tahun)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 19, outlier.size = 2.5) +
  ggtitle("Deteksi Outlier Nilai Ekspor per Tahun (2020–2024)") +
  xlab("Tahun") +
  ylab("Nilai Ekspor (Juta USD)") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 0, vjust = 0.5),
    legend.position = "none"
  )

# d. (Opsional) Visualisasi data outlier dengan label
ggplot(outlier_values, aes(x = Tahun, y = Ekspor)) +
  geom_point(color = "red", size = 3) +
  geom_text(aes(label = round(Ekspor, 2)), vjust = -1, color = "black", size = 3) +
  ggtitle("Nilai Outlier per Tahun pada Data Ekspor") +
  ylab("Nilai Ekspor (Juta USD)") +
  theme_minimal()

# ===========================================
#9️⃣ Uji Stasioneritas (ADF Test)
# ===========================================

# Uji stasioneritas awal menggunakan Augmented Dickey-Fuller Test
library(tseries)

adf_result <- adf.test(ts_ekspor)
print(adf_result)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_ekspor
## Dickey-Fuller = -1.3898, Lag order = 3, p-value = 0.8214
## alternative hypothesis: stationary
# Interpretasi awal:
# Jika p-value > 0.05 → data belum stasioner
# Jika p-value ≤ 0.05 → data sudah stasioner
# Jika p-value > 0.05, lakukan differencing ke-1
if (adf_result$p.value > 0.05) {
  diff1 <- diff(ts_ekspor, differences = 1)
  cat("\nData belum stasioner → dilakukan differencing ke-1\n")
  adf_result_diff1 <- adf.test(diff1)
  print(adf_result_diff1)
  
  # Jika setelah differencing ke-1 masih belum stasioner → differencing ke-2
  if (adf_result_diff1$p.value > 0.05) {
    diff2 <- diff(ts_ekspor, differences = 2)
    cat("\nMasih belum stasioner → dilakukan differencing ke-2\n")
    adf_result_diff2 <- adf.test(diff2)
    print(adf_result_diff2)
    
    data_final <- diff2
    cat("\nData akhir yang digunakan: differencing ke-2\n")
  } else {
    data_final <- diff1
    cat("\nData akhir yang digunakan: differencing ke-1\n")
  }
  
} else {
  data_final <- ts_ekspor
  cat("\nData sudah stasioner → tidak perlu differencing\n")
}
## 
## Data belum stasioner → dilakukan differencing ke-1
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff1
## Dickey-Fuller = -3.4867, Lag order = 3, p-value = 0.05083
## alternative hypothesis: stationary
## 
## 
## Masih belum stasioner → dilakukan differencing ke-2
## Warning in adf.test(diff2): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff2
## Dickey-Fuller = -5.6787, Lag order = 3, p-value = 0.01
## alternative hypothesis: stationary
## 
## 
## Data akhir yang digunakan: differencing ke-2
# Plot ACF dan PACF dari data_final
par(mfrow = c(1, 2))
acf(data_final, main = "Plot ACF Data Akhir (Stasioner)")
pacf(data_final, main = "Plot PACF Data Akhir (Stasioner)")

par(mfrow = c(1, 1))