New York Birth Rate Timeseries

Harnsen

12/28/2021

library(tidyverse) # package
library(forecast) # forecast ARIMA & HoltWinters
library(MLmetrics) # MAPE()
library(tseries) # ts() timeseries
library(TSstudio) # to visualize our timeseries
library(lubridate) # incase if we needed ymd_hms customize
library(padr) # timestamp more specific (lubridate)

Read Data

birth <- read.csv("data_input/nybirth.csv")
glimpse(birth$date)
##  chr [1:168] "1946-01-01" "1946-02-01" "1946-03-01" "1946-04-01" ...

Data Pre-processing

filtering date from 1950

birth_1950 <- birth %>% 
  mutate(year = year(date)) %>% 
  filter(year >= 1950)

Check missing value

birth %>% 
  is.na() %>% 
  colSums() 
##   date births 
##      0      0

there is no missing value in our data thus we can proceed the our next step

Converting Dataframe timeseries

birth_ts <- ts(data = birth$births,
               start = 1946,
               frequency = 12)
birth_ts_1950 <- ts(data = birth_1950$births,
                    start = 1950,
                    frequency = 12)

Data Manipulation

Splitting train test and train test for the prediction later, in this case we want to forecasting for 2 years on the future (24 monhts) as the frequency

# test
birth_test <- tail(birth_ts, 24)
# train
birth_train <- head(birth_ts, -24)

visualization EDA timeseries

now we can visualize our data from library(forecast), autoplot() to observe the frequency of the data. Additionally, we can also use library(TSstudio) to perform data analysis.

1946-1960

birth_ts %>% 
  autoplot()

1950-1960

birth_ts_1950 %>% 
  autoplot()

Exploratory Data Analysis

Seasonal

TSstudio::ts_seasonal(birth_ts, type = "all")

Heatmap

ts_heatmap(birth_ts)

Interactive Plot Birth rate

ts_plot(birth_ts,
        title = "New York Birth Rate",
        Ytitle = "Births ratio",
        Xtitle = "Years", 
        slider = TRUE)

Check Seasonal or Non-Seasonal

TSstudio::ts_cor(birth_ts)

HoltWinters Timeseries

birth_ts %>% autoplot()

Decomposition (additive & multiplicative)

Additive timeseries

birth_train %>% 
  decompose(type = "additive") %>% 
  autoplot()

Multiplicative timeseries

birth_train %>% 
  decompose(type = "multiplicative") %>% 
  autoplot()

Holt-Winters using Additive Seasonality

# a
birth_hw_auto <- HoltWinters(birth_train) # Auto

# b
birth_hw_manual <- HoltWinters(birth_train, alpha = 0.5, beta = 0.2, gamma = 0.6) # Manual
birth_forecast_a <- forecast(birth_hw_auto, h = 24)
birth_forecast_b <- forecast(birth_hw_manual, h = 24)
birth_ts %>% 
  autoplot() +
  autolayer(birth_hw_auto$fitted[,1], lwd = 1) +
  autolayer(birth_hw_manual$fitted[,1], lwd = 1)

birth_ts %>% 
  autoplot()+
  autolayer(birth_forecast_a$mean, series = "auto", lwd = 1)+
  autolayer(birth_forecast_b$mean, series = "manual", lwd = 1)

### MAPE RESULT

MAPE(y_pred = birth_forecast_a$mean,
     y_true = birth_test)*100
## [1] 4.674416
MAPE(y_pred = birth_forecast_b$mean,
     y_true = birth_test)*100
## [1] 11.39774

Model Evaluation Holtwinters

accuracy(birth_hw_auto$fitted[,1], birth_ts)
##                  ME      RMSE       MAE       MPE     MAPE      ACF1 Theil's U
## Test set 0.09549013 0.7536798 0.5737626 0.3378208 2.339772 0.1415391 0.4947687
accuracy(birth_hw_manual$fitted[,1], birth_ts)
##                  ME      RMSE       MAE        MPE    MAPE      ACF1 Theil's U
## Test set 0.02567983 0.8293213 0.6553268 0.03230736 2.66782 0.3416248 0.5377257

model SARIMA

Seasonal arima adalah metode arima dimana object time series yang ada memiliki pola seasonal[^10]. Tahapan dalam melakukan pemodelan menggunakan SARIMA sama seperti saat membuat pemodelan ARIMA.

birth_train %>% 
  diff(lag = 12) %>% 
  diff(lag = 1) %>% 
  adf.test()
## 
##  Augmented Dickey-Fuller Test
## 
## data:  .
## Dickey-Fuller = -6.5978, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary

auto SARIMA

  • Series: birth_ts
  • ARIMA(2,1,2)(1,1,1)[12]
birth_auto <- auto.arima(birth_ts)
birth_auto
## Series: birth_ts 
## ARIMA(2,1,2)(1,1,1)[12] 
## 
## Coefficients:
##          ar1      ar2      ma1     ma2     sar1     sma1
##       0.6539  -0.4540  -0.7255  0.2532  -0.2427  -0.8451
## s.e.  0.3004   0.2429   0.3228  0.2879   0.0985   0.0995
## 
## sigma^2 estimated as 0.4076:  log likelihood=-157.45
## AIC=328.91   AICc=329.67   BIC=350.21

manual SARIMA

Kesimpulan: data nya sudah di diffrencing 1x diff() untuk menghasilkan stasioner (p-value < alpha). sehingga nilai d pada model ARIMA adalah 1

birth_train %>% 
  diff(lag = 12) %>%
  diff(lag = 1) %>% 
  tsdisplay(lag.max = 36) 

Untuk seasosal, AR murni
* P : 1,2 * D : 1 * Q : 1

untuk keseluruhan data, MA murni * p: 2,3 * d: 1 * q: 2,3

birth_a <- arima (x = birth_train, order = c(2,1,2), seasonal = c(1,1,1))

birth_b <- arima (x = birth_train, order = c(2,1,3), seasonal = c(1,1,1))

birth_c <- arima (x = birth_train, order = c(3,1,3), seasonal = c(1,1,1))

birth_d <- arima (x = birth_train, order = c(3,1,2), seasonal = c(2,1,1))

birth_e <- arima (x = birth_train, order = c(2,1,2), seasonal = c(2,1,1))

cek nilai AIC

birth_a$aic
## [1] 283.9502
birth_b$aic
## [1] 285.4812
birth_c$aic
## [1] 287.3461
birth_d$aic # jackpot
## [1] 277.5694
birth_e$aic 
## [1] 281.1986
cat(
accuracy(birth_auto), # 1.8232
accuracy(birth_a),# 1.860
accuracy(birth_b), # 1.842
accuracy(birth_c), # 1.847
accuracy(birth_d), # 1.722 # jackpot
accuracy(birth_e) # 1.781
)
## 0.1273882 0.6012674 0.4578678 0.5110816 1.823201 0.4841123 -0.04405169 0.1445184 0.604505 0.4600347 0.5880691 1.860877 0.3899662 -0.060737 0.1492736 0.6036553 0.4552473 0.6074197 1.842082 0.385908 -0.05678732 0.1475945 0.6037679 0.456615 0.5999198 1.847341 0.3870674 -0.05458337 0.09476621 0.5693816 0.4233738 0.3806519 1.722433 0.3588892 -0.02140837 0.09770979 0.5786945 0.4387213 0.4022654 1.781933 0.3718991 -0.0948942

forecasting

birth_forecast_arima <- forecast(birth_d, h = 24)

EDA

birth_ts %>% 
  autoplot() +
  autolayer(birth_forecast_arima$mean, lwd = 1, series = "ARIMA (3,1,2) (2,1,1)") 

evaluation

MAPE(y_pred = birth_forecast_arima$mean, birth_test)*100
## [1] 8.472663
birth_ts %>% 
  autoplot() +
  autolayer(birth_forecast_arima$mean, lwd = 1, series = "SARIMA (3,1,2) (2,1,1)") +
  autolayer(birth_forecast_a$mean, series = "addictive") +
  autolayer(birth_forecast_b$mean, series = "multiplicative")

data.frame(y_pred = birth_forecast_arima$mean, birth_test)
##      y_pred birth_test
## 1  27.36446     27.132
## 2  26.33198     24.924
## 3  29.42753     28.963
## 4  27.45003     26.589
## 5  28.70995     27.931
## 6  29.20205     28.009
## 7  31.15892     29.229
## 8  31.17317     28.759
## 9  30.47068     28.405
## 10 30.74757     27.945
## 11 28.90249     25.912
## 12 29.82579     26.619
## 13 29.32366     26.076
## 14 27.78252     25.286
## 15 30.94898     27.660
## 16 29.44201     25.951
## 17 30.80229     26.398
## 18 30.20065     25.565
## 19 31.82085     28.865
## 20 31.45662     30.000
## 21 31.13976     29.261
## 22 31.52479     29.012
## 23 29.27402     26.992
## 24 30.11243     27.897