library(knitr)
library(tidymodels)
library(workflowsets)
library(modeltime)
library(modeltime.ensemble)
library(timetk)
library(xgboost)
library(lubridate)
library(parsnip)
library(rsample)
library(dplyr)
library(workflows)
library(recipes)
library(tidyverse)
library(janitor)
# GluonTS Installation - Run 1st time
# install_gluonts(fresh_install = FALSE, include_pytorch = TRUE)
#data
covid_df <- read_csv("df.csv")
covid_kenya <- covid_df %>%
select(-1) %>%
mutate(date = dmy(date)) %>%
rename(daily_infection = new_cases, cum_infection= total_cases,
daily_deaths = new_deaths, cum_deaths = total_deaths )
glimpse(covid_kenya)
## Rows: 1,387
## Columns: 5
## $ date <date> 2020-03-15, 2020-03-16, 2020-03-17, 2020-03-18, 2020-…
## $ daily_infection <dbl> 1, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 31, 0, 0, 0,…
## $ cum_infection <dbl> 1, 1, 1, 1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 38, 38, 38, …
## $ cum_deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, …
## $ daily_deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, …
nrow(covid_df)
## [1] 1387
df <- covid_kenya %>%
pivot_longer(
cols =c(daily_infection, cum_infection, daily_deaths, cum_deaths),
values_to = "value",
names_to = "case_type"
)
# attach(covid_kenya)
df %>% group_by(case_type) %>%
plot_time_series(date, value, .facet_ncol = 2,
.line_color = "red",
.line_size = 0.5,
.smooth_size = 1,
.smooth_alpha = 0.5,
.legend_show = TRUE,
.title = "Time Series Case Trend Plot",
.x_lab = "Date",
.y_lab = "Cases")
df %>% group_by(case_type) %>%
tk_summary_diagnostics()
## tk_augment_timeseries_signature(): Using the following .date_var variable: date
## # A tibble: 4 × 13
## # Groups: case_type [4]
## case_type n.obs start end units scale tzone diff.minimum diff.q1
## <chr> <int> <date> <date> <chr> <chr> <chr> <dbl> <dbl>
## 1 daily_infe… 1387 2020-03-15 2023-12-31 days day UTC 86400 86400
## 2 cum_infect… 1387 2020-03-15 2023-12-31 days day UTC 86400 86400
## 3 daily_deat… 1387 2020-03-15 2023-12-31 days day UTC 86400 86400
## 4 cum_deaths 1387 2020-03-15 2023-12-31 days day UTC 86400 86400
## # ℹ 4 more variables: diff.median <dbl>, diff.mean <dbl>, diff.q3 <dbl>,
## # diff.maximum <dbl>
# Add Kenya holidays (2020–2023)
holiday_md <- c("01-01", "02-14", "03-08", "05-01", "06-01",
"10-10", "10-20", "12-12", "12-25", "12-26")
years <- 2020:2023
holiday_dates <- as.Date(unlist(lapply(years, function(y) {
paste0(y, "-", holiday_md)
})))
kenya_holidays <- tibble(
holiday = "kenya_holiday",
ds = holiday_dates
)
#Data spliting
set.seed(123)
cum_df <- covid_kenya %>% select(date, cum_infection)
splits <- initial_time_split(cum_df, prop = 0.8)
splits %>%
tk_time_series_cv_plan() %>%
plot_time_series_cv_plan(date,
cum_infection,
.line_alpha = 0.5,
.title = "Train-test split time series of COVID-19 new daily cumulative cases")
# Model 1: ARIMA
model_arima <- arima_reg(seasonal_period = "NONE") %>%
set_engine("auto_arima") %>%
fit(cum_infection ~ date, training(splits))
## Using period = 1 (no seasonal period).
model_arima
## parsnip model object
##
## Series: outcome
## ARIMA(5,2,4)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 ma2 ma3
## 0.1070 -0.7778 -0.2722 -0.3204 -0.6527 -1.6623 1.7980 -1.3509
## s.e. 0.0268 0.0215 0.0327 0.0212 0.0256 0.0300 0.0434 0.0381
## ma4
## 0.5896
## s.e. 0.0254
##
## sigma^2 = 600087: log likelihood = -8935.5
## AIC=17891 AICc=17891.2 BIC=17941.1
We’ll create a Feature Engineering Recipe that can be applied to the data to create features that machine learning models can key in on. This will be most useful for the __Elastic Net
# Model 2: Seasonal ARIMA
model_sarima <- arima_reg() %>%
set_engine("auto_arima") %>%
fit(cum_infection ~ date, training(splits))
## frequency = 7 observations per 1 week
model_sarima
## parsnip model object
##
## Series: outcome
## ARIMA(0,1,0)(1,1,0)[7]
##
## Coefficients:
## sar1
## 0.3789
## s.e. 0.0277
##
## sigma^2 = 272083: log likelihood = -8451.18
## AIC=16906.35 AICc=16906.36 BIC=16916.36
# Model 3: Holt-Winters
model_hw <- exp_smoothing() %>%
set_engine("ets") %>%
fit(cum_infection ~ date, training(splits))
## frequency = 7 observations per 1 week
model_hw
## parsnip model object
##
## ETS(A,A,A)
##
## Call:
## forecast::ets(y = outcome, model = model_ets, damped = damping_ets,
## alpha = alpha, beta = beta, gamma = gamma)
##
## Smoothing parameters:
## alpha = 0.2046
## beta = 0.0864
## gamma = 0.7954
##
## Initial states:
## l = -483.3038
## b = 78.0575
## s = -932.2004 -622.0629 -311.9074 -2.1555 310.8545 622.2743
## 935.1974
##
## sigma: 754.8896
##
## AIC AICc BIC
## 22486.12 22486.40 22546.25
# Model 4: Prophet with holidays
model_prophet <- prophet_reg( #growth = "logistic",
season = "multiplicative",
changepoint_num = 10,
logistic_floor = "saturation"
# seasonality_yearly = TRUE
) %>%
set_engine("prophet", holidays = kenya_holidays) %>%
fit(cum_infection ~ date, training(splits))
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
model_prophet
## parsnip model object
##
## PROPHET Model
## - growth: 'linear'
## - n.changepoints: 10
## - changepoint.range: 0.8
## - yearly.seasonality: 'auto'
## - weekly.seasonality: 'auto'
## - daily.seasonality: 'auto'
## - seasonality.mode: 'multiplicative'
## - changepoint.prior.scale: 0.05
## - seasonality.prior.scale: 10
## - holidays.prior.scale: c("kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday",
## "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday", "kenya_holiday")
## - logistic_cap: NULL
## - logistic_floor: saturation
## - extra_regressors: 0
## PROPHET Model
## - growth: 'linear'
## - n.changepoints: 10
## - changepoint.range: 0.8
## - yearly.seasonality: 'auto'
## - weekly.seasonality: 'auto'
## - daily.seasonality: 'auto'
## - seasonality.mode: 'multiplicative'
## - changepoint.prior.scale: 0.05
## - seasonality.prior.scale: 10
## - holidays.prior.scale: c(18262, 18306, 18329, 18383, 18414, 18545, 18555, 18608, 18621, 18622, 18628, 18672, 18694, 18748, 18779, 18910, 18920, 18973, 18986, 18987, 18993, 19037, 19059, 19113, 19144, 19275, 19285, 19338, 19351, 19352, 19358, 19402, 19424, 19478, 19509, 19640, 19650, 19703, 19716, 19717)
## - logistic_cap: NULL
## - logistic_floor: saturation
## - extra_regressors: 0
recipe_spec <- recipe(cum_infection~date, training(splits)) %>%
step_timeseries_signature(date) %>%
step_rm(contains('xts'), contains('am.pm'), contains("minute"), contains("second"),
contains("hour")) %>%
step_fourier(date, period = 365, K=5) %>%
step_dummy(all_nominal())
recipe_spec %>% prep() %>% juice()
## # A tibble: 1,109 × 47
## date cum_infection date_index.num date_year date_year.iso date_half
## <date> <dbl> <dbl> <int> <int> <int>
## 1 2020-03-15 1 1584230400 2020 2020 1
## 2 2020-03-16 1 1584316800 2020 2020 1
## 3 2020-03-17 1 1584403200 2020 2020 1
## 4 2020-03-18 1 1584489600 2020 2020 1
## 5 2020-03-19 1 1584576000 2020 2020 1
## 6 2020-03-20 1 1584662400 2020 2020 1
## 7 2020-03-21 1 1584748800 2020 2020 1
## 8 2020-03-22 7 1584835200 2020 2020 1
## 9 2020-03-23 7 1584921600 2020 2020 1
## 10 2020-03-24 7 1585008000 2020 2020 1
## # ℹ 1,099 more rows
## # ℹ 41 more variables: date_quarter <int>, date_month <int>, date_day <int>,
## # date_wday <int>, date_mday <int>, date_qday <int>, date_yday <int>,
## # date_mweek <int>, date_week <int>, date_week.iso <int>, date_week2 <int>,
## # date_week3 <int>, date_week4 <int>, date_mday7 <int>, date_sin365_K1 <dbl>,
## # date_cos365_K1 <dbl>, date_sin365_K2 <dbl>, date_cos365_K2 <dbl>,
## # date_sin365_K3 <dbl>, date_cos365_K3 <dbl>, date_sin365_K4 <dbl>, …
Set the model up using a workflow
model_prophet_boost <- prophet_boost(#season = "logistic"
) %>%
set_engine("prophet_xgboost")
workflow_fit_prophet_boost <- workflow() %>%
add_model(model_prophet_boost) %>%
add_recipe(recipe_spec) %>%
fit(training(splits))
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
Organize the models with IDs and creates generic descriptions.
# Combine models
models_tbl <- modeltime_table(
model_arima,
model_sarima,
model_hw,
model_prophet,
workflow_fit_prophet_boost
)
Visualize the testing predictions (forecast).
# Calibrate and forecast
calibration_tbl <- models_tbl %>%
modeltime_calibrate(new_data = testing(splits))
# Forecast visualization
forecast_tbl <- calibration_tbl %>%
modeltime_forecast(
new_data = testing(splits),
actual_data = cum_df) %>%
plot_modeltime_forecast(.interactive = FALSE) +
scale_y_continuous(labels = scales::comma)
forecast_tbl
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
Then use ensemble_average() to turn that Modeltime Table
into a Modeltime Ensemble. This is a
fitted ensemble specification containing the ingredients to
forecast future data and be refitted on data sets using the
submodels.
# ensemble_fit <- models_tbl %>%
# ensemble_average(type = "mean")
#
# ensemble_fit
# Evaluate accuracy on test set
accuracy_tbl <- calibration_tbl %>%
modeltime_accuracy() %>%
table_modeltime_accuracy(.interactive = FALSE)
accuracy_tbl
| Accuracy Table | ||||||||
| .model_id | .model_desc | .type | mae | mape | mase | smape | rmse | rsq |
|---|---|---|---|---|---|---|---|---|
| 1 | ARIMA(5,2,4) | Test | 559.96 | 0.16 | 137.63 | 0.16 | 650.00 | 0.83 |
| 2 | ARIMA(0,1,0)(1,1,0)[7] | Test | 567.08 | 0.16 | 139.38 | 0.17 | 658.16 | 0.83 |
| 3 | ETS(A,A,A) | Test | 587.67 | 0.17 | 144.44 | 0.17 | 680.38 | 0.83 |
| 4 | PROPHET | Test | 7658.34 | 2.23 | 1882.31 | 2.19 | 9599.89 | 0.74 |
| 5 | PROPHET W/ XGBOOST ERRORS | Test | 7903.13 | 2.30 | 1942.47 | 2.26 | 9587.45 | 0.66 |
calibration_tbl %>%
# Remove RANDOMFOREST model with low accuracy
# filter(.model_id != 4) %>%
# Refit and Forecast Forward
modeltime_refit(cum_df) %>%
modeltime_forecast(h = "3 months", actual_data = cum_df) %>%
plot_modeltime_forecast(.interactive = FALSE, .title = "Compare all models forecast Plot", .y_lab = "New daily cases")
## Using period = 1 (no seasonal period).
## frequency = 7 observations per 1 week
## frequency = 7 observations per 1 week
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
## Optimization terminated abnormally. Falling back to Newton optimizer.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `.model = purrr::map2(...)`.
## Caused by warning in `.local()`:
## ! non-zero return code in optimizing
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
p1 <- calibration_tbl %>%
# Remove RANDOMFOREST model with low accuracy
filter(.model_id == 1) %>%
# Refit and Forecast Forward
modeltime_refit(cum_df) %>%
modeltime_forecast(h = "3 months", actual_data = cum_df) %>%
plot_modeltime_forecast(.interactive = FALSE, .title = "ARIMA model forecast Plot", .y_lab = "New daily cases") + scale_y_continuous(labels = scales::comma)
## Using period = 1 (no seasonal period).
p2 <- calibration_tbl %>%
# Remove RANDOMFOREST model with low accuracy
filter(.model_id == 2) %>%
# Refit and Forecast Forward
modeltime_refit(cum_df) %>%
modeltime_forecast(h = "3 months", actual_data = cum_df) %>%
plot_modeltime_forecast(.interactive = FALSE, .title = "SARIMA model forecast Plot", .y_lab = "New daily cases") + scale_y_continuous(labels = scales::comma)
## frequency = 7 observations per 1 week
p3 <- calibration_tbl %>%
# Remove RANDOMFOREST model with low accuracy
filter(.model_id == 3) %>%
# Refit and Forecast Forward
modeltime_refit(cum_df) %>%
modeltime_forecast(h = "3 months", actual_data = cum_df) %>%
plot_modeltime_forecast(.interactive = FALSE,
.title = "Holt-Winters model forecast Plot",
.y_lab = "New daily cases") +
scale_y_continuous(labels = scales::comma)
## frequency = 7 observations per 1 week
calibration_tbl %>%
# Remove RANDOMFOREST model with low accuracy
filter(.model_id == 4) %>%
# Refit and Forecast Forward
modeltime_refit(cum_df) %>%
modeltime_forecast(h = "3 months", actual_data = cum_df) %>%
plot_modeltime_forecast(.interactive = FALSE, .title = "Facebook-PROPHET model forecast Plot", .y_lab = "New daily cases") + scale_y_continuous(labels = scales::comma)
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
## Optimization terminated abnormally. Falling back to Newton optimizer.
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `.model = purrr::map2(...)`.
## Caused by warning in `.local()`:
## ! non-zero return code in optimizing
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
p4 <- calibration_tbl %>%
# Remove RANDOMFOREST model with low accuracy
filter(.model_id == 5) %>%
# Refit and Forecast Forward
modeltime_refit(cum_df) %>%
modeltime_forecast(h = "3 months", actual_data = cum_df) %>%
plot_modeltime_forecast(.interactive = FALSE, .title = "Facebook-PROPHET + XGBOOST model forecast Plot", .y_lab = "New daily cases") + scale_y_continuous(labels = scales::comma)
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
library(patchwork)
# library(gridExtra)
# grid.arrange(p1, p2, p3, p4 ncol=2, top = "Model Forecasts Comparison")
combined_plot <- p1 + p2 + p3 +p4
# To arrange vertically: p1 / p2
# You can also add a common title and adjust layout
combined_plot <- combined_plot + plot_annotation(title = "Model Forecasts Comparison For Cumulative Cases")
# Print the combined plot
print(combined_plot)
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf