Berikut adalah simulasi forecasting sederhana jumlah angka pasien positif corona di Indonesia. Untuk medapatkan hasil yang maksimal disini akan digunakan berbagai model yang berbeda dan kemudian membandingkan dan memilih model terbaik.

Load Library

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v stringr 1.4.0
## v tidyr   1.0.0     v forcats 0.4.0
## v readr   1.3.1
## -- Conflicts ------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr) # data wrangling
library(lubridate) # date manipulation
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(forecast) # time series library
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(TTR) # for Simple moving average function
library(MLmetrics) # calculate error
## 
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
## 
##     Recall
library(tseries) # adf.test
library(fpp) # usconsumtion
## Loading required package: fma
## Loading required package: expsmooth
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(TSstudio) # mempercantik visualisasi timeseries
library(ggplot2)
library(tidyr)

Data pre-processing berupa data cleansing dan pelengkapan data

  1. Read Data
covid19<- read_csv("cases.csv")
## Parsed with column specification:
## cols(
##   date = col_character(),
##   new_tested = col_double(),
##   acc_tested = col_double(),
##   new_confirmed = col_double(),
##   acc_confirmed = col_double(),
##   acc_negative = col_double(),
##   being_checked = col_double(),
##   isolated = col_double(),
##   new_released = col_double(),
##   acc_released = col_double(),
##   new_deceased = col_double(),
##   acc_deceased = col_double(),
##   positive_rate = col_character(),
##   negative_rate = col_character(),
##   decease_rate = col_character(),
##   release_rate = col_character(),
##   dailypositive_rate = col_character()
## )
glimpse(covid19)
## Observations: 30
## Variables: 17
## $ date               <chr> "2-Mar-20", "3-Mar-20", "4-Mar-20", "5-Mar-20", ...
## $ new_tested         <dbl> NA, 2, 31, 16, 62, 4, 29, 60, 151, 99, 69, 143, ...
## $ acc_tested         <dbl> 339, 341, 372, 388, 450, 454, 483, 543, 694, 793...
## $ new_confirmed      <dbl> 2, 0, 0, 0, 2, 0, 2, 13, 8, 7, 0, 35, 27, 21, 17...
## $ acc_confirmed      <dbl> 2, 2, 2, 2, 4, 4, 6, 19, 27, 34, 34, 69, 96, 117...
## $ acc_negative       <dbl> 335, 337, 356, 371, 422, 422, 445, 487, 648, 744...
## $ being_checked      <dbl> 2, 2, 14, 15, 24, 28, 32, 37, 19, 15, 17, 19, 0,...
## $ isolated           <dbl> 2, 2, 2, 2, 4, 4, 6, 19, 27, 30, 27, 60, 83, 104...
## $ new_released       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 3, 0, 0, 1, ...
## $ acc_released       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 5, 8, 8, 8, 9, ...
## $ new_deceased       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 1, 0, 0, 2, ...
## $ acc_deceased       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 4, 5, 5, 5, 7, ...
## $ positive_rate      <chr> "0.59%", "0.59%", "0.54%", "0.52%", "0.89%", "0....
## $ negative_rate      <chr> "98.82%", "98.83%", "95.70%", "95.62%", "93.78%"...
## $ decease_rate       <chr> "0.00%", "0.00%", "0.00%", "0.00%", "0.00%", "0....
## $ release_rate       <chr> "0.00%", "0.00%", "0.00%", "0.00%", "0.00%", "0....
## $ dailypositive_rate <chr> "0.00%", "0.00%", "0.00%", "0.00%", "3.23%", "0....
  1. Cleansing Data
# ubah data tanggal menjadi date
covid19_clean <- covid19 %>% 
  mutate(date=dmy(date))
# Cek struktur data
glimpse(covid19_clean)
## Observations: 30
## Variables: 17
## $ date               <date> 2020-03-02, 2020-03-03, 2020-03-04, 2020-03-05,...
## $ new_tested         <dbl> NA, 2, 31, 16, 62, 4, 29, 60, 151, 99, 69, 143, ...
## $ acc_tested         <dbl> 339, 341, 372, 388, 450, 454, 483, 543, 694, 793...
## $ new_confirmed      <dbl> 2, 0, 0, 0, 2, 0, 2, 13, 8, 7, 0, 35, 27, 21, 17...
## $ acc_confirmed      <dbl> 2, 2, 2, 2, 4, 4, 6, 19, 27, 34, 34, 69, 96, 117...
## $ acc_negative       <dbl> 335, 337, 356, 371, 422, 422, 445, 487, 648, 744...
## $ being_checked      <dbl> 2, 2, 14, 15, 24, 28, 32, 37, 19, 15, 17, 19, 0,...
## $ isolated           <dbl> 2, 2, 2, 2, 4, 4, 6, 19, 27, 30, 27, 60, 83, 104...
## $ new_released       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 3, 0, 0, 1, ...
## $ acc_released       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 5, 8, 8, 8, 9, ...
## $ new_deceased       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 1, 0, 0, 2, ...
## $ acc_deceased       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 4, 5, 5, 5, 7, ...
## $ positive_rate      <chr> "0.59%", "0.59%", "0.54%", "0.52%", "0.89%", "0....
## $ negative_rate      <chr> "98.82%", "98.83%", "95.70%", "95.62%", "93.78%"...
## $ decease_rate       <chr> "0.00%", "0.00%", "0.00%", "0.00%", "0.00%", "0....
## $ release_rate       <chr> "0.00%", "0.00%", "0.00%", "0.00%", "0.00%", "0....
## $ dailypositive_rate <chr> "0.00%", "0.00%", "0.00%", "0.00%", "3.23%", "0....

simpan dalam object ke dalam bentuk time series

covid_ts <- ts(data = covid19$acc_confirmed, start = 1, frequency = 1)

Exploratory Data Analysis

# inspect pola data
covid_ts %>% autoplot()+
  labs(title = "Trend Kasus positif Covid19 di Indonesia")

Penjelasan pemilihan model time series dan asumsinya.

Pada bagian ini akan dipilih berbagai metode modelling untuk kemudian dibandingkan masing - masing mana yang memiliki error terkecil. Mengingat data yang digunakan tidak memiliki seasonal dan hanya memiliki trend maka model yang akan digunakan model yang hanya mempertimbangkan trend.

Model Simple Moving Avarage

covid_SMA <- SMA(x = covid_ts, n = 2)
covid_ts %>% autoplot()+
  autolayer(covid_SMA)

Model Exponential Smoothing

Oleh karena data yang digunakan sangat berfokus ke pada trend dan tidak pada seasonal maka di sini digunakan model = “ZZN” dan alpha = 0.9 untuk memberikan bobot lebih tinggi pada data terbaru atau terakhir

covid_ets <- ets(y = covid_ts, model = "ZZN",alpha = 0.9)
covid_ts %>% autoplot()+
  autolayer(covid_ets$fitted)

# Holt Winter Exponential Alasan pemilihan model ini adalah karena model holt winter exponential adalah model yang sangat memperhitungan trend dan tidak memperhitungkan seasonality. Dalam hal ini digunakan gamma = F

covid_holt <- HoltWinters(x = covid_ts,alpha = 0.9, gamma = F)
covid_ts %>% autoplot()+
  autolayer(covid_holt$fitted[,1])

Model ARIMA

Arima yang dipakai di sini adalah model auto arima. Model ARIMA juga sangat cocok untuk data yang hanya memiliki trend tanpa seasonality

covid_autoarima <- auto.arima(covid_ts)

Evaluasi model dan pemilihan model terbaik.

# Evaluasi model SMA

accuracy(f = covid_SMA, covid_ts)
##                ME    RMSE      MAE      MPE     MAPE      ACF1 Theil's U
## Test set 26.31034 35.6402 26.31034 9.195154 9.195154 0.8691336       0.5
# Evaluasi model Exponential Smoothing

accuracy(f = covid_ets$fitted, covid_ts)
##                ME     RMSE      MAE      MPE     MAPE       ACF1 Theil's U
## Test set 5.667802 17.41284 11.68498 5.235236 10.68887 -0.1711287 0.8642494
# Evaluasi model Holt-winter exponential

accuracy(f = covid_holt$fitted[,1], covid_ts)
##               ME   RMSE      MAE      MPE     MAPE       ACF1 Theil's U
## Test set 6.06919 18.024 12.51717 5.484259 11.32771 -0.1823818   0.86427
# Evaluasi model Auto ARIMA

accuracy(covid_autoarima)
##                    ME     RMSE      MAE      MPE     MAPE      MASE       ACF1
## Training set 5.495837 17.52418 11.86959 4.984128 10.65518 0.2255689 -0.1844848

Membandingkan model:

Dapat dilihat bahwa model dengan MAE terendah adalah “Model Exponential Smoothing” dan “Model ARIMA”. Untuk mendapatkan model terbaik maka akan dilakukan tuning di salah satu model yaitu model “ARIMA”.

# Inspect Auto ARIMA
covid_autoarima
## Series: covid_ts 
## ARIMA(1,2,0) 
## 
## Coefficients:
##           ar1
##       -0.3944
## s.e.   0.1706
## 
## sigma^2 estimated as 341.2:  log likelihood=-120.96
## AIC=245.92   AICc=246.4   BIC=248.59

Tuning secara manual

Berdasarkan model auto arima diketahui order p,d,q = 1,2,0. Berikutnya di sini akan dilakukan tuning terhadap nilai p dan q

#  mencari nilai p dan q
tsdisplay(diff(covid_ts))

p = 1 q = 1,2,3,4,5,6

Setelah melakukan beberapa kali tuning didapat model dengan order p,d,q = (1,2,6)

covid_arima <- Arima(y = covid_ts, order = c(1,2,6))

Membandingkan AIC ARIMA tuning secara manual dengan auto ARIMA

covid_arima$aic
## [1] 246.6975
covid_autoarima$aic
## [1] 245.922

Membandingkan error

accuracy(covid_autoarima)
##                    ME     RMSE      MAE      MPE     MAPE      MASE       ACF1
## Training set 5.495837 17.52418 11.86959 4.984128 10.65518 0.2255689 -0.1844848
accuracy(covid_arima)
##                    ME     RMSE     MAE      MPE     MAPE      MASE       ACF1
## Training set 4.469263 12.58229 8.65807 4.437282 8.602598 0.1645374 -0.1074958

Berdasarkan jabaran diatas didapatkan model dengan MAE terkecil yaitu model “covid_arima” yang merupakan model yang dituning sencara menaul. Model ini cukup baik dengan menghasilkan MAE 8.65807 dan secara AIC pun tidak jauh berbeda dengan auto arima (Model sebelum dituning)

Forcasting

# melakukan forcasting untuk 10 hari ke depan
forecast_covid<- forecast(covid_ets,h=10)

covid_ts %>% autoplot()+
  autolayer(forecast_covid$mean)

Berdasarkan hasil forrecasting dapat disimpulkan 12 hari kedepan tepatnya tanggal 10 april (10 hari dari data terakhir yaitu tanggal 31 maret) angka pasien positif corona bertambah menjadi 2715.752

forecast_covid$upper[10,2]
##      95% 
## 3238.948