R Markdown

library(knitr)
library(tidymodels)
library(workflowsets)
library(modeltime)
library(modeltime.ensemble)
library(timetk)
library(xgboost)
library(lubridate)
library(parsnip)
library(rsample)
library(dplyr)
library(workflows)
library(recipes)
library(tidyverse)
library(janitor)

# GluonTS Installation - Run 1st time
# install_gluonts(fresh_install = FALSE, include_pytorch = TRUE)
#data 
covid_df <-  read_csv("df.csv") 

covid_kenya <- covid_df %>% 
   select(-1) %>% 
  mutate(date = dmy(date)) %>% 
  rename(daily_infection = new_cases, cum_infection= total_cases, 
         daily_deaths = new_deaths, cum_deaths = total_deaths )
glimpse(covid_kenya)
## Rows: 1,387
## Columns: 5
## $ date            <date> 2020-03-15, 2020-03-16, 2020-03-17, 2020-03-18, 2020-…
## $ daily_infection <dbl> 1, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 31, 0, 0, 0,…
## $ cum_infection   <dbl> 1, 1, 1, 1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 38, 38, 38, …
## $ cum_deaths      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, …
## $ daily_deaths    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, …
nrow(covid_df)
## [1] 1387
df <- covid_kenya %>% 
  pivot_longer(
    cols =c(daily_infection, cum_infection, daily_deaths, cum_deaths),
    values_to = "value",
    names_to = "case_type"
  )
# attach(covid_kenya)

TIME SERIES PLOT OF DAILY AND CUMULTATIVE CASES (INFECTION & DEATHS)

df %>% group_by(case_type) %>% 
  plot_time_series(date, value, .facet_ncol = 2,  
                   .line_color = "red",
                   .line_size = 0.5,
                   .smooth_size = 1,
                   .smooth_alpha = 0.5,
                   .legend_show = TRUE,
                   .title = "Time Series Case Trend Plot",
                   .x_lab = "Date",
                   .y_lab = "Cases")
df %>% group_by(case_type) %>% 
  tk_summary_diagnostics()
## tk_augment_timeseries_signature(): Using the following .date_var variable: date
## # A tibble: 4 × 13
## # Groups:   case_type [4]
##   case_type   n.obs start      end        units scale tzone diff.minimum diff.q1
##   <chr>       <int> <date>     <date>     <chr> <chr> <chr>        <dbl>   <dbl>
## 1 daily_infe…  1387 2020-03-15 2023-12-31 days  day   UTC          86400   86400
## 2 cum_infect…  1387 2020-03-15 2023-12-31 days  day   UTC          86400   86400
## 3 daily_deat…  1387 2020-03-15 2023-12-31 days  day   UTC          86400   86400
## 4 cum_deaths   1387 2020-03-15 2023-12-31 days  day   UTC          86400   86400
## # ℹ 4 more variables: diff.median <dbl>, diff.mean <dbl>, diff.q3 <dbl>,
## #   diff.maximum <dbl>

ADDING HOLIDAYS

# Add Kenya holidays (2020–2023)
holiday_md <- c("01-01", "02-14", "03-08", "05-01", "06-01", 
                "10-10", "10-20", "12-12", "12-25", "12-26")

years <- 2020:2023
holiday_dates <- as.Date(unlist(lapply(years, function(y) {
  paste0(y, "-", holiday_md)
})))

kenya_holidays <- tibble(
  holiday = "kenya_holiday",
  ds = holiday_dates
)

Train/test split time series of COVID-19 new daily cumulative cases

#Data spliting
set.seed(123)
cum_df <- covid_kenya %>% select(date, cum_infection)

splits <- initial_time_split(cum_df, prop = 0.8)
splits %>% 
  tk_time_series_cv_plan() %>% 
  plot_time_series_cv_plan(date, 
                 cum_infection, 
                .line_alpha = 0.5,
                 .title = "Train-test split time series of COVID-19 new daily cumulative cases")

Modeling

ARIMA(NON-SEASONAL)

# Model 1: ARIMA
model_arima <- arima_reg(seasonal_period = "NONE") %>%
  set_engine("auto_arima") %>%
  fit(cum_infection ~ date, training(splits))
## Using period = 1 (no seasonal period).
model_arima
## parsnip model object
## 
## Series: outcome 
## ARIMA(5,2,4) 
## 
## Coefficients:
##          ar1      ar2      ar3      ar4      ar5      ma1     ma2      ma3
##       0.1070  -0.7778  -0.2722  -0.3204  -0.6527  -1.6623  1.7980  -1.3509
## s.e.  0.0268   0.0215   0.0327   0.0212   0.0256   0.0300  0.0434   0.0381
##          ma4
##       0.5896
## s.e.  0.0254
## 
## sigma^2 = 600087:  log likelihood = -8935.5
## AIC=17891   AICc=17891.2   BIC=17941.1

We’ll create a Feature Engineering Recipe that can be applied to the data to create features that machine learning models can key in on. This will be most useful for the __Elastic Net

SARIMA

# Model 2: Seasonal ARIMA
model_sarima <- arima_reg() %>%
  set_engine("auto_arima") %>%
  fit(cum_infection ~ date, training(splits))
## frequency = 7 observations per 1 week
model_sarima
## parsnip model object
## 
## Series: outcome 
## ARIMA(0,1,0)(1,1,0)[7] 
## 
## Coefficients:
##         sar1
##       0.3789
## s.e.  0.0277
## 
## sigma^2 = 272083:  log likelihood = -8451.18
## AIC=16906.35   AICc=16906.36   BIC=16916.36

Holt-Winters

# Model 3: Holt-Winters
model_hw <- exp_smoothing() %>%
  set_engine("ets") %>%
    fit(cum_infection ~ date, training(splits))
## frequency = 7 observations per 1 week
model_hw
## parsnip model object
## 
## ETS(A,A,A) 
## 
## Call:
## forecast::ets(y = outcome, model = model_ets, damped = damping_ets, 
##     alpha = alpha, beta = beta, gamma = gamma)
## 
##   Smoothing parameters:
##     alpha = 0.2046 
##     beta  = 0.0864 
##     gamma = 0.7954 
## 
##   Initial states:
##     l = -483.3038 
##     b = 78.0575 
##     s = -932.2004 -622.0629 -311.9074 -2.1555 310.8545 622.2743
##            935.1974
## 
##   sigma:  754.8896
## 
##      AIC     AICc      BIC 
## 22486.12 22486.40 22546.25

FaceBook Prophet

# Model 4: Prophet with holidays
model_prophet <- prophet_reg( #growth = "logistic",
                              season = "multiplicative",
                              changepoint_num = 10,
                              logistic_floor = "saturation"
                                # seasonality_yearly = TRUE
) %>%
  set_engine("prophet", holidays = kenya_holidays) %>%
  fit(cum_infection ~ date, training(splits))
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
model_prophet
## parsnip model object
## 
## PROPHET Model
## - growth: 'linear'
## - n.changepoints: 10
## - changepoint.range: 0.8
## - yearly.seasonality: 'auto'
## - weekly.seasonality: 'auto'
## - daily.seasonality: 'auto'
## - seasonality.mode: 'multiplicative'
## - changepoint.prior.scale: 0.05
## - seasonality.prior.scale: 10
## - holidays.prior.scale: c("kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", 
## "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday")
## - logistic_cap: NULL
## - logistic_floor: saturation
## - extra_regressors: 0
## PROPHET Model
## - growth: 'linear'
## - n.changepoints: 10
## - changepoint.range: 0.8
## - yearly.seasonality: 'auto'
## - weekly.seasonality: 'auto'
## - daily.seasonality: 'auto'
## - seasonality.mode: 'multiplicative'
## - changepoint.prior.scale: 0.05
## - seasonality.prior.scale: 10
## - holidays.prior.scale: c(18262, 18306, 18329, 18383, 18414, 18545, 18555, 18608, 18621, 18622, 18628, 18672, 18694, 18748, 18779, 18910, 18920, 18973, 18986, 18987, 18993, 19037, 19059, 19113, 19144, 19275, 19285, 19338, 19351, 19352, 19358, 19402, 19424, 19478, 19509, 19640, 19650, 19703, 19716, 19717)
## - logistic_cap: NULL
## - logistic_floor: saturation
## - extra_regressors: 0

Machine Learning Models

Preprocessing Recipe

recipe_spec <- recipe(cum_infection~date, training(splits)) %>% 
  step_timeseries_signature(date) %>% 
  step_rm(contains('xts'), contains('am.pm'), contains("minute"), contains("second"), 
          contains("hour")) %>% 
  step_fourier(date, period = 365, K=5) %>% 
  step_dummy(all_nominal()) 

recipe_spec %>% prep() %>% juice()
## # A tibble: 1,109 × 47
##    date       cum_infection date_index.num date_year date_year.iso date_half
##    <date>             <dbl>          <dbl>     <int>         <int>     <int>
##  1 2020-03-15             1     1584230400      2020          2020         1
##  2 2020-03-16             1     1584316800      2020          2020         1
##  3 2020-03-17             1     1584403200      2020          2020         1
##  4 2020-03-18             1     1584489600      2020          2020         1
##  5 2020-03-19             1     1584576000      2020          2020         1
##  6 2020-03-20             1     1584662400      2020          2020         1
##  7 2020-03-21             1     1584748800      2020          2020         1
##  8 2020-03-22             7     1584835200      2020          2020         1
##  9 2020-03-23             7     1584921600      2020          2020         1
## 10 2020-03-24             7     1585008000      2020          2020         1
## # ℹ 1,099 more rows
## # ℹ 41 more variables: date_quarter <int>, date_month <int>, date_day <int>,
## #   date_wday <int>, date_mday <int>, date_qday <int>, date_yday <int>,
## #   date_mweek <int>, date_week <int>, date_week.iso <int>, date_week2 <int>,
## #   date_week3 <int>, date_week4 <int>, date_mday7 <int>, date_sin365_K1 <dbl>,
## #   date_cos365_K1 <dbl>, date_sin365_K2 <dbl>, date_cos365_K2 <dbl>,
## #   date_sin365_K3 <dbl>, date_cos365_K3 <dbl>, date_sin365_K4 <dbl>, …

Prophet Boost

Set the model up using a workflow

model_prophet_boost <- prophet_boost(#season = "logistic"
                                     ) %>%
  set_engine("prophet_xgboost") 

workflow_fit_prophet_boost <- workflow() %>%
  add_model(model_prophet_boost) %>%
  add_recipe(recipe_spec) %>%
  fit(training(splits))
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

Model evaluation and selection

Organize the models with IDs and creates generic descriptions.

 # Combine models
models_tbl <- modeltime_table(
  model_arima,
  model_sarima,
  model_hw,
  model_prophet,
  workflow_fit_prophet_boost
)

Forecast (Testing Set)

Visualize the testing predictions (forecast).

# Calibrate and forecast
calibration_tbl <- models_tbl %>%
  modeltime_calibrate(new_data = testing(splits))

# Forecast visualization
forecast_tbl <- calibration_tbl %>%
  modeltime_forecast(
    new_data = testing(splits),
    actual_data = cum_df) %>%
  plot_modeltime_forecast(.interactive = FALSE) + 
  scale_y_continuous(labels = scales::comma)

forecast_tbl
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

Make an Ensemble

Then use ensemble_average() to turn that Modeltime Table into a Modeltime Ensemble. This is a fitted ensemble specification containing the ingredients to forecast future data and be refitted on data sets using the submodels.

# ensemble_fit <- models_tbl %>%
#     ensemble_average(type = "mean")
# 
# ensemble_fit

Accuracy (Testing Set)

# Evaluate accuracy on test set
accuracy_tbl <- calibration_tbl %>%
  modeltime_accuracy() %>% 
   table_modeltime_accuracy(.interactive = FALSE)

accuracy_tbl
Accuracy Table
.model_id .model_desc .type mae mape mase smape rmse rsq
1 ARIMA(5,2,4) Test 559.96 0.16 137.63 0.16 650.00 0.83
2 ARIMA(0,1,0)(1,1,0)[7] Test 567.08 0.16 139.38 0.17 658.16 0.83
3 ETS(A,A,A) Test 587.67 0.17 144.44 0.17 680.38 0.83
4 PROPHET Test 7658.34 2.23 1882.31 2.19 9599.89 0.74
5 PROPHET W/ XGBOOST ERRORS Test 7903.13 2.30 1942.47 2.26 9587.45 0.66

Refit and Forecast Forward (Global in world)

calibration_tbl %>%
  # Remove RANDOMFOREST model with low accuracy
  # filter(.model_id != 4) %>%
  
  # Refit and Forecast Forward
  modeltime_refit(cum_df) %>%
  modeltime_forecast(h = "3 months", actual_data = cum_df) %>%
  plot_modeltime_forecast(.interactive = FALSE, .title = "Compare all models forecast Plot", .y_lab = "New daily cases")
## Using period = 1 (no seasonal period).
## frequency = 7 observations per 1 week
## frequency = 7 observations per 1 week
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
## Optimization terminated abnormally. Falling back to Newton optimizer.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `.model = purrr::map2(...)`.
## Caused by warning in `.local()`:
## ! non-zero return code in optimizing
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

ARIMA model forecast

p1 <- calibration_tbl %>%
  # Remove RANDOMFOREST model with low accuracy
  filter(.model_id == 1) %>%
  
  # Refit and Forecast Forward
  modeltime_refit(cum_df) %>%
  modeltime_forecast(h = "3 months", actual_data = cum_df) %>%
  plot_modeltime_forecast(.interactive = FALSE, .title = "ARIMA model forecast Plot", .y_lab = "New daily cases") + scale_y_continuous(labels = scales::comma)
## Using period = 1 (no seasonal period).

SARIMA model forecast

  p2 <-  calibration_tbl %>%
  # Remove RANDOMFOREST model with low accuracy
  filter(.model_id == 2) %>%
  
  # Refit and Forecast Forward
  modeltime_refit(cum_df) %>%
  modeltime_forecast(h = "3 months", actual_data = cum_df) %>%
  plot_modeltime_forecast(.interactive = FALSE, .title = "SARIMA model forecast Plot", .y_lab = "New daily cases") + scale_y_continuous(labels = scales::comma)
## frequency = 7 observations per 1 week

Holt-Winters Model

p3 <-  calibration_tbl %>%
  # Remove RANDOMFOREST model with low accuracy
  filter(.model_id == 3) %>%
  
  # Refit and Forecast Forward
  modeltime_refit(cum_df) %>%
  modeltime_forecast(h = "3 months", actual_data = cum_df) %>%
  plot_modeltime_forecast(.interactive = FALSE, 
                          .title = "Holt-Winters model forecast Plot", 
                          .y_lab = "New daily cases") + 
  scale_y_continuous(labels = scales::comma)
## frequency = 7 observations per 1 week

PROPHET

 calibration_tbl %>%
  # Remove RANDOMFOREST model with low accuracy
  filter(.model_id == 4) %>%
  
    # Refit and Forecast Forward
  modeltime_refit(cum_df) %>%
  modeltime_forecast(h = "3 months", actual_data = cum_df) %>%
  plot_modeltime_forecast(.interactive = FALSE, .title = "Facebook-PROPHET model forecast Plot", .y_lab = "New daily cases") + scale_y_continuous(labels = scales::comma)
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
## Optimization terminated abnormally. Falling back to Newton optimizer.
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `.model = purrr::map2(...)`.
## Caused by warning in `.local()`:
## ! non-zero return code in optimizing
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

PROPHET + XGBOOST model forecast

p4 <- calibration_tbl %>%
  # Remove RANDOMFOREST model with low accuracy
  filter(.model_id == 5) %>%
  
    # Refit and Forecast Forward
  modeltime_refit(cum_df) %>%
  modeltime_forecast(h = "3 months", actual_data = cum_df) %>%
  plot_modeltime_forecast(.interactive = FALSE, .title = "Facebook-PROPHET + XGBOOST model forecast Plot", .y_lab = "New daily cases") + scale_y_continuous(labels = scales::comma)
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
library(patchwork)
# library(gridExtra)

# grid.arrange(p1, p2, p3, p4 ncol=2, top = "Model Forecasts Comparison")
combined_plot <- p1 + p2 + p3 +p4
# To arrange vertically: p1 / p2
# You can also add a common title and adjust layout
combined_plot <- combined_plot + plot_annotation(title = "Model Forecasts Comparison For Cumulative Cases")
# Print the combined plot
print(combined_plot)
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf