Load the following libraries.

library(tidyverse)
library(tidymodels)
library(modeltime)
library(timetk)   
library(lubridate)
library(readr)
library(dplyr)
library(tidyr)
library(glmnet)
library(randomForest)

Data on COVID-19 (coronavirus) by Our World in Data

COVID-19 dataset is a collection of the COVID-19 data maintained by Our World in Data. It is updated daily and includes data on confirmed cases, deaths, and testing, as well as other variables of potential interest.

We’ll start with a covid daily time series data set that includes new daily cases. We’ll simplify the data set to a univariate time series with columns, “date” and “value”.

data_daily  <- read.csv("~/Workdir/COVID-19-models-forecasting/data/owid-covid-data.csv")
data_daily <- data_daily %>% select(date, new_cases) %>% drop_na(new_cases)

time_series_covid19_confirmed_global <- aggregate(data_daily$new_cases, by=list(date=data_daily$date), FUN=sum)

covid19_confirmed_global_tbl <- time_series_covid19_confirmed_global %>%
  select(date, x) %>%
  rename(value=x) 

covid19_confirmed_global_tbl$date <- as.Date(covid19_confirmed_global_tbl$date, format = "%Y-%m-%d")
covid19_confirmed_global_tbl <- covid19_confirmed_global_tbl[covid19_confirmed_global_tbl[["date"]] >= "2020-01-17", ]

covid19_confirmed_global_tbl <- as.tibble(covid19_confirmed_global_tbl)
covid19_confirmed_global_tbl

Show COVID-19 new daily cases

covid19_confirmed_global_tbl %>%
  plot_time_series(date, value, .interactive = FALSE)

Train / Test

Split time series into training and testing sets. We use the last ****5 days** of data as the testing set.

Train/test split time serie

splits <- covid19_confirmed_global_tbl %>%
  time_series_split(assess = "5 days", cumulative = TRUE)

## Using date_var: date

splits %>%
  tk_time_series_cv_plan() %>%
  plot_time_series_cv_plan(date, value, .interactive = FALSE)

Modeling

Automatic Models

Several models to see this process :

Auto ARIMA

Auto Arima Model fitting process.

model_fit_arima <- arima_reg() %>%
  set_engine("auto_arima") %>%
  fit(value ~ date, training(splits))

## frequency = 7 observations per 1 week

model_fit_arima

## parsnip model object
## 
## Fit time:  342ms 
## Series: outcome 
## ARIMA(2,1,2)(0,0,2)[7] with drift 
## 
## Coefficients:
##          ar1      ar2      ma1     ma2    sma1    sma2      drift
##       1.0928  -0.5854  -1.6122  0.8593  0.1954  0.3654  2001.7886
## s.e.  0.0932   0.0879   0.0692  0.0662  0.1019  0.0941   858.2301
## 
## sigma^2 estimated as 2.12e+08:  log likelihood=-1759.24
## AIC=3534.47   AICc=3535.43   BIC=3559.08

Prophet

Prophet Model fitting process.

model_fit_prophet <- prophet_reg() %>%
  set_engine("prophet", yearly.seasonality = TRUE) %>%
  fit(value ~ date, training(splits))

## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

model_fit_prophet

## parsnip model object
## 
## Fit time:  61ms 
## PROPHET Model
## - growth: 'linear'
## - n.changepoints: 25
## - seasonality.mode: 'additive'
## - extra_regressors: 0

Machine Learning Models

Preprocessing Recipe

recipe_spec <- recipe(value ~ date, training(splits)) %>%
  step_timeseries_signature(date) %>%
  step_rm(contains("am.pm"), contains("hour"), contains("minute"),
          contains("second"), contains("xts")) %>%
  step_fourier(date, period = 365, K = 5) %>%
  step_dummy(all_nominal())

recipe_spec %>% prep() %>% juice()

With a recipe in-hand, we can set up our machine learning pipelines.

Elastic Net

Making an Elastic NET model

model_spec_glmnet <- linear_reg(penalty = 0.01, mixture = 0.5) %>%
  set_engine("glmnet")

Fitted workflow

workflow_fit_glmnet <- workflow() %>%
  add_model(model_spec_glmnet) %>%
  add_recipe(recipe_spec %>% step_rm(date)) %>%
  fit(training(splits))

Random Forest

Fit a Random Forest

model_spec_rf <- rand_forest(trees = 500, min_n = 50) %>%
  set_engine("randomForest")

workflow_fit_rf <- workflow() %>%
  add_model(model_spec_rf) %>%
  add_recipe(recipe_spec %>% step_rm(date)) %>%
  fit(training(splits))

Prophet Boost

Set the model up using a workflow

model_spec_prophet_boost <- prophet_boost() %>%
  set_engine("prophet_xgboost", yearly.seasonality = TRUE) 

workflow_fit_prophet_boost <- workflow() %>%
  add_model(model_spec_prophet_boost) %>%
  add_recipe(recipe_spec) %>%
  fit(training(splits))

## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

## [11:41:37] WARNING: amalgamation/../src/learner.cc:480: 
## Parameters: { validation } might not be used.
## 
##   This may not be accurate due to some parameters are only used in language bindings but
##   passed down to XGBoost core.  Or some parameters are not used but slip through this
##   verification. Please open an issue if you find above cases.

workflow_fit_prophet_boost

## ══ Workflow [trained] ══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: prophet_boost()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## 4 Recipe Steps
## 
## ● step_timeseries_signature()
## ● step_rm()
## ● step_fourier()
## ● step_dummy()
## 
## ── Model ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## PROPHET w/ XGBoost Errors
## ---
## Model 1: PROPHET
## - growth: 'linear'
## - n.changepoints: 25
## - seasonality.mode: 'additive'
## 
## ---
## Model 2: XGBoost Errors
## 
## xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0, 
##     colsample_bytree = 1, min_child_weight = 1, subsample = 1), 
##     data = x, nrounds = 15, early_stopping_rounds = NULL, objective = "reg:squarederror", 
##     validation = 0)

Model evaluation and selection

Organize the models with IDs and creates generic descriptions.

model_table <- modeltime_table(
  model_fit_arima, 
  model_fit_prophet,
  workflow_fit_glmnet,
  workflow_fit_rf,
  workflow_fit_prophet_boost
) 

model_table

Calibration

Quantify error and estimate confidence intervals.

calibration_table <- model_table %>%
  modeltime_calibrate(testing(splits))

calibration_table

Forecast (Testing Set)

Visualize the testing predictions (forecast).

calibration_table %>%
  modeltime_forecast(actual_data = covid19_confirmed_global_tbl) %>%
  plot_modeltime_forecast(.interactive = FALSE)

## 'new_data' is missing. Using '.calibration_data' to forecast.

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

Accuracy (Testing Set)

Calculate the testing accuracy to compare the models.

calibration_table %>%
  modeltime_accuracy() %>%
  table_modeltime_accuracy(.interactive = FALSE)

Accuracy Table

.model_id	.model_desc	.type	mae	mape	mase	smape	rmse	rsq
1	ARIMA(2,1,2)(0,0,2)[7] WITH DRIFT	Test	22526.77	6.26	1.22	6.51	25367.86	0.94
2	PROPHET	Test	21943.96	6.45	1.19	6.34	24090.94	0.02
3	GLMNET	Test	28088.56	7.77	1.53	8.18	32302.35	1.00
4	RANDOMFOREST	Test	82702.35	23.34	4.49	26.62	86107.41	0.36
5	PROPHET W/ XGBOOST ERRORS	Test	20410.54	6.14	1.11	5.92	24731.97	0.12

Analyze Results

From the accuracy measures and forecast results, we see that:

RANDOMFOREST model is not a good fit for this data.
The best model are GLMNET and Prophet + XGBoost Let’s exclude the RANDOMFOREST from our final model, then make future forecasts with the remaining models.

Refit and Forecast Forward (Global in world)

calibration_table %>%
  # Remove RANDOMFOREST model with low accuracy
  filter(.model_id != 4) %>%
  
  # Refit and Forecast Forward
  modeltime_refit(covid19_confirmed_global_tbl) %>%
  modeltime_forecast(h = "3 months", actual_data = covid19_confirmed_global_tbl) %>%
  plot_modeltime_forecast(.interactive = FALSE, .title = "Compare all models forecast Plot", .y_lab = "New daily cases")

## frequency = 7 observations per 1 week

## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

## [11:41:41] WARNING: amalgamation/../src/learner.cc:480: 
## Parameters: { validation } might not be used.
## 
##   This may not be accurate due to some parameters are only used in language bindings but
##   passed down to XGBoost core.  Or some parameters are not used but slip through this
##   verification. Please open an issue if you find above cases.

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

ARIMA model forecast

calibration_table %>%
  filter(.model_id == 1) %>%
  # Refit and Forecast Forward
  modeltime_refit(covid19_confirmed_global_tbl) %>%
  modeltime_forecast(h = "3 months", actual_data = covid19_confirmed_global_tbl) %>%
  plot_modeltime_forecast(.interactive = FALSE, .title = "ARIMA model forecast Plot", .y_lab = "New daily cases")

## frequency = 7 observations per 1 week

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

PROPHET model forecast

calibration_table %>%
  filter(.model_id == 2) %>%
  # Refit and Forecast Forward
  modeltime_refit(covid19_confirmed_global_tbl) %>%
  modeltime_forecast(h = "3 months", actual_data = covid19_confirmed_global_tbl) %>%
  plot_modeltime_forecast(.interactive = FALSE, .title = "PROPHET model forecast Plot", .y_lab = "New daily cases")

## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

GLMNET model forecast

calibration_table %>%
  filter(.model_id == 3) %>%
  # Refit and Forecast Forward
  modeltime_refit(covid19_confirmed_global_tbl) %>%
  modeltime_forecast(h = "3 months", actual_data = covid19_confirmed_global_tbl) %>%
  plot_modeltime_forecast(.interactive = FALSE, .title = "GLMNET model forecast Plot", .y_lab = "New daily cases")

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

PROPHET + XGBOOST model forecast

calibration_table %>%
  filter(.model_id == 5) %>%
  # Refit and Forecast Forward
  modeltime_refit(covid19_confirmed_global_tbl) %>%
  modeltime_forecast(h = "3 months", actual_data = covid19_confirmed_global_tbl) %>%
  plot_modeltime_forecast(.interactive = FALSE, .title = "PROPHET + XGBOOST model forecast Plot", .y_lab = "New daily cases")

## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

## [11:41:46] WARNING: amalgamation/../src/learner.cc:480: 
## Parameters: { validation } might not be used.
## 
##   This may not be accurate due to some parameters are only used in language bindings but
##   passed down to XGBoost core.  Or some parameters are not used but slip through this
##   verification. Please open an issue if you find above cases.

Data on COVID-19 (coronavirus) only in FRANCE

Filter COVID-19 dataset in France.

Show FRANCE COVID-19 new daily cases

Train / Test

Train/test split time serie

## Using date_var: date

Modeling

Automatic Models

Auto ARIMA

Auto Arima Model fitting process.

model_fit_arima <- arima_reg() %>%
  set_engine("auto_arima") %>%
  fit(value ~ date, training(splits))

## frequency = 7 observations per 1 week

model_fit_arima

## parsnip model object
## 
## Fit time:  527ms 
## Series: outcome 
## ARIMA(0,1,1)(0,0,2)[7] 
## 
## Coefficients:
##           ma1    sma1    sma2
##       -0.6935  0.1816  0.2081
## s.e.   0.0590  0.0874  0.0886
## 
## sigma^2 estimated as 816656:  log likelihood=-986.23
## AIC=1980.47   AICc=1980.82   BIC=1991.62

Prophet

Prophet Model fitting process.

model_fit_prophet <- prophet_reg() %>%
  set_engine("prophet", yearly.seasonality = TRUE) %>%
  fit(value ~ date, training(splits))

## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

model_fit_prophet

## parsnip model object
## 
## Fit time:  55ms 
## PROPHET Model
## - growth: 'linear'
## - n.changepoints: 25
## - seasonality.mode: 'additive'
## - extra_regressors: 0

Machine Learning Models

Preprocessing Recipe

Elastic Net

Making an Elastic NET model

model_spec_glmnet <- linear_reg(penalty = 0.01, mixture = 0.5) %>%
  set_engine("glmnet")

Fitted workflow

workflow_fit_glmnet <- workflow() %>%
  add_model(model_spec_glmnet) %>%
  add_recipe(recipe_spec %>% step_rm(date)) %>%
  fit(training(splits))

Random Forest

Fit a Random Forest

Prophet Boost

Set the model up using a workflow

## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

## [11:41:51] WARNING: amalgamation/../src/learner.cc:480: 
## Parameters: { validation } might not be used.
## 
##   This may not be accurate due to some parameters are only used in language bindings but
##   passed down to XGBoost core.  Or some parameters are not used but slip through this
##   verification. Please open an issue if you find above cases.

## ══ Workflow [trained] ══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: prophet_boost()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## 4 Recipe Steps
## 
## ● step_timeseries_signature()
## ● step_rm()
## ● step_fourier()
## ● step_dummy()
## 
## ── Model ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## PROPHET w/ XGBoost Errors
## ---
## Model 1: PROPHET
## - growth: 'linear'
## - n.changepoints: 25
## - seasonality.mode: 'additive'
## 
## ---
## Model 2: XGBoost Errors
## 
## xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0, 
##     colsample_bytree = 1, min_child_weight = 1, subsample = 1), 
##     data = x, nrounds = 15, early_stopping_rounds = NULL, objective = "reg:squarederror", 
##     validation = 0)

Model evaluation and selection

Organize the models with IDs and creates generic descriptions.

Calibration

Quantify error and estimate confidence intervals.

FRANCE Forecast (Testing Set)

Visualize the testing predictions (forecast).

## 'new_data' is missing. Using '.calibration_data' to forecast.

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

Accuracy (Testing Set)

Calculate the testing accuracy to compare the models.

Accuracy Table

.model_id	.model_desc	.type	mae	mape	mase	smape	rmse	rsq
1	ARIMA(0,1,1)(0,0,2)[7]	Test	618.20	Inf	0.55	167.35	724.97	0.34
2	PROPHET	Test	664.69	Inf	0.59	167.73	759.30	0.08
3	GLMNET	Test	730.82	Inf	0.65	187.91	925.11	0.06
4	RANDOMFOREST	Test	1235.75	Inf	1.10	129.72	1433.47	0.01
5	PROPHET W/ XGBOOST ERRORS	Test	409.69	Inf	0.36	127.77	461.89	0.61

Analyze Results

From the accuracy measures and forecast results, we see that:

RANDOMFOREST model is not a good fit for this data.
The best model is PROPHET W/ XGBOOST ERRORS Let’s exclude the RANDOMFOREST from our final model, then make future forecasts with the remaining models.

Refit and Forecast Forward (FRANCE ONLY)

## frequency = 7 observations per 1 week

## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

## [11:41:56] WARNING: amalgamation/../src/learner.cc:480: 
## Parameters: { validation } might not be used.
## 
##   This may not be accurate due to some parameters are only used in language bindings but
##   passed down to XGBoost core.  Or some parameters are not used but slip through this
##   verification. Please open an issue if you find above cases.

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

ARIMA model forecast

## frequency = 7 observations per 1 week

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

PROPHET model forecast

## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

GLMNET model forecast

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

PROPHET + XGBOOST model forecast

## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

## [11:42:01] WARNING: amalgamation/../src/learner.cc:480: 
## Parameters: { validation } might not be used.
## 
##   This may not be accurate due to some parameters are only used in language bindings but
##   passed down to XGBoost core.  Or some parameters are not used but slip through this
##   verification. Please open an issue if you find above cases.

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

COVID-19 - Forecasting and compare machine learning models

Adrien BORDERON

01 July, 2020

Data on COVID-19 (coronavirus) by Our World in Data

Show COVID-19 new daily cases

Train / Test

Train/test split time serie

Modeling

Automatic Models

Auto ARIMA

Prophet

Machine Learning Models

Preprocessing Recipe

Elastic Net

Random Forest

Prophet Boost

Model evaluation and selection

Calibration

Forecast (Testing Set)

Accuracy (Testing Set)

Analyze Results

Refit and Forecast Forward (Global in world)

ARIMA model forecast

PROPHET model forecast

GLMNET model forecast

PROPHET + XGBOOST model forecast

Data on COVID-19 (coronavirus) only in FRANCE

Show FRANCE COVID-19 new daily cases

Train / Test

Train/test split time serie

Modeling

Automatic Models

Auto ARIMA

Prophet

Machine Learning Models

Preprocessing Recipe

Elastic Net

Random Forest

Prophet Boost

Model evaluation and selection

Calibration

FRANCE Forecast (Testing Set)

Accuracy (Testing Set)

Analyze Results

Refit and Forecast Forward (FRANCE ONLY)

ARIMA model forecast

PROPHET model forecast

GLMNET model forecast

PROPHET + XGBOOST model forecast