library(tidyverse)
library(forecast)
library(MLmetrics)
library(tseries)
theme_set(theme_minimal())birth <- read.csv("data_input/nybirth.csv")
tail(birth)range(birth$date)#> [1] "1946-01-01" "1959-12-01"
birth_ts1 <- ts(data = birth$births,
start = c(1946,1),
frequency = 12)
birth_ts_q <- ts(data = birth$births,
start = 1946,
frequency = 3)birth_ts1 %>%
autoplot()class(birth_ts1)#> [1] "ts"
birth_ts1 %>%
decompose() %>%
autoplot()# birth_ts %>%
# decompose() %>%
# autoplot()birth_ts_q %>%
decompose(type = "additive") %>%
autoplot()#plot trend masih fluktuatif, pilih yang tahun (12)Splitting data duatahun terakhir sebagai data test, dan sisanya data train
birth_test <- tail(birth_ts1, 24) #hanya mengambil 24 bulan terakhir
birth_train <- head(birth_ts1, -24) #tidak mengambil 24 bulan terakhirbirth_train %>%
decompose() %>%
autoplot()birth_train %>%
decompose(type = "multiplicative") %>%
autoplot()Dengan mengggunakan metode smoothing auto
Dengan nilai smoothing mendefinisikan nilai alpha beta dan gamma (manual)
birth_hw <- HoltWinters(x = birth_train)
birth_hw2 <- HoltWinters(x = birth_train, alpha = 0.9, beta = 0.02 , gamma = 0.90)
#gamma = seasonal
#alpha = error (residual)
#beta = trendbirth_forecast_auto <- forecast(birth_hw, h = 24)
birth_forecast_manual <- forecast(birth_hw2, h = 24)birth_ts1 %>%
autoplot()+
autolayer(birth_hw$fitted[,1], series = "Auto", lwd = 1)+
autolayer(birth_hw2$fitted[,1], series = "Alpha 0.9; beta 0.02; gamma = 0.9", lwd = 1)birth_ts1 %>%
autoplot()+
autolayer(birth_forecast_auto) # autolayer(birth_forecast_manual, col = "red")birth_ts1 %>%
autoplot()+
autolayer(birth_forecast_manual, col = "red")MAPE(y_pred = birth_forecast_manual$mean, y_true = birth_test) * 100 #$ mean adalah poin forecastnya #> [1] 4.152653
MAPE(y_pred = birth_forecast_auto$mean, y_true = birth_test) * 100 #dikali 100(persen) supaya keluar nya bukan satu komaan#> [1] 4.674416
nilai MAPE antara model auto dan manual terpaut 0.5 persen, rata-rata prosentase error dari model manual lebih kecil (lebih baik)
birth_train_a <- birth_train
birth_test_a <- birth_test birth_train_a%>%
adf.test() #P-value 0.01, data sudah stasioner (?)#>
#> Augmented Dickey-Fuller Test
#>
#> data: .
#> Dickey-Fuller = -5.6971, Lag order = 5, p-value = 0.01
#> alternative hypothesis: stationary
birth_train_a %>%
autoplot()Data secara statistik sudah stasioner (p-value < 0.05),
namun setelah dilakukan pemeriksaan plot masih perlu dilakukan differencing
birth_train_a %>%
diff(lag = 12) %>%
diff(lag = 1 ) %>%
tsdisplay(lag.max = 36)Seasonal AR
*P : 1, 2
*D : 1
*Q : 1
Keseluruhan Data
*p : 2, 3
*d : 1
*q : 2, 3
Kombinasi model yang mungkin terbentuk adalah:
SARIMA: (p, d, q)(P, D, Q)[m]
*ARIMA(2,1,2)(1,1,1) [12]
*ARIMA(3,1,3)(2,1,1) [12]
birth_arima212_111 <- arima(x = birth_train_a, order = c(2,1,2), seasonal = c(1,1,1))
birth_arima313_211 <- arima(x = birth_train_a, order = c(3,1,3), seasonal = c(2,1,1))birth_auto_arima <- auto.arima(birth_train_a)
birth_auto_arima#> Series: birth_train_a
#> ARIMA(0,1,2)(0,1,2)[12]
#>
#> Coefficients:
#> ma1 ma2 sma1 sma2
#> -0.1054 -0.2263 -1.1221 0.3499
#> s.e. 0.0883 0.0822 0.1095 0.1127
#>
#> sigma^2 estimated as 0.4161: log likelihood=-135.08
#> AIC=280.15 AICc=280.63 BIC=294.53
birth_arima212_111$aic#> [1] 283.9502
birth_arima313_211$aic#> [1] 277.8386
birth_auto_arima$aic#> [1] 280.1547
Dari hasil penghitungan AIC dapat dilihat bahwa model SARIMA (3,1,3)(2,1,1)[12] memiliki lebih sedikit nilai AIC
accuracy(birth_arima212_111)#> ME RMSE MAE MPE MAPE MASE
#> Training set 0.1445184 0.604505 0.4600347 0.5880691 1.860877 0.3899662
#> ACF1
#> Training set -0.060737
accuracy(birth_arima313_211)#> ME RMSE MAE MPE MAPE MASE
#> Training set 0.123292 0.5692868 0.4284582 0.5079665 1.738566 0.3631992
#> ACF1
#> Training set -0.006509432
accuracy(birth_auto_arima)#> ME RMSE MAE MPE MAPE MASE
#> Training set 0.1329977 0.6057793 0.450476 0.5482743 1.825522 0.4734038
#> ACF1
#> Training set -0.01184833
Dari hasil penghitungan accuracy dapat dilihat bahwa model SARIMA (3,1,3)(2,1,1)[12] memiliki nilai MAPE yang lebih kecil
birth_a_forecast <- forecast(birth_arima313_211, h = 24)
birth_au_forecast <- forecast(birth_auto_arima, h = 24)birth_ts1 %>%
autoplot()+
autolayer(birth_a_forecast, lwd = 1, series = "ARIMA (3,1,3)(2,1,1)")MAPE(y_pred = birth_a_forecast$mean, birth_test_a)*100#> [1] 4.995938
MAPE(y_pred = birth_au_forecast$mean, birth_test_a)*100 #(?)#> [1] 4.403916
Ingin ditanyakan: mengapa model manual memiliki nilai MAPE yang lebih besar saat dievaluasi dengan data testnya? Sedangkan pada saat proses membuat model, model manual lebih unggul