Here are the instructions on how to perform classical time series analysis and machine learning modeling in one framework.
setwd("~/Documents/Time Series Forecasting Using ML Models in R")
library(modeltime)
library(xgboost)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ recipes 1.0.9
## ✔ dials 1.2.0 ✔ rsample 1.2.0
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.4.4 ✔ tidyr 1.3.0
## ✔ infer 1.0.5 ✔ tune 1.1.2
## ✔ modeldata 1.2.0 ✔ workflows 1.1.3
## ✔ parsnip 1.1.1 ✔ workflowsets 1.0.1
## ✔ purrr 1.0.2 ✔ yardstick 1.2.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::slice() masks xgboost::slice()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ lubridate 1.9.3 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::slice() masks xgboost::slice()
## ✖ readr::spec() masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(timetk)
# Load data
bike_transactions_daily<- read.csv("day.csv")
str(bike_transactions_daily)
## 'data.frame': 731 obs. of 16 variables:
## $ instant : int 1 2 3 4 5 6 7 8 9 10 ...
## $ dteday : chr "2011-01-01" "2011-01-02" "2011-01-03" "2011-01-04" ...
## $ season : int 1 1 1 1 1 1 1 1 1 1 ...
## $ yr : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mnth : int 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday : int 6 0 1 2 3 4 5 6 0 1 ...
## $ workingday: int 0 0 1 1 1 1 1 0 0 1 ...
## $ weathersit: int 2 2 1 1 1 1 2 2 1 1 ...
## $ temp : num 0.344 0.363 0.196 0.2 0.227 ...
## $ atemp : num 0.364 0.354 0.189 0.212 0.229 ...
## $ hum : num 0.806 0.696 0.437 0.59 0.437 ...
## $ windspeed : num 0.16 0.249 0.248 0.16 0.187 ...
## $ casual : int 331 131 120 108 82 88 148 68 54 41 ...
## $ registered: int 654 670 1229 1454 1518 1518 1362 891 768 1280 ...
## $ cnt : int 985 801 1349 1562 1600 1606 1510 959 822 1321 ...
bike_transactions_tbl <- bike_sharing_daily %>%
select(date = dteday, value = cnt)
str(bike_transactions_tbl)
## tibble [731 × 2] (S3: tbl_df/tbl/data.frame)
## $ date : Date[1:731], format: "2011-01-01" "2011-01-02" ...
## $ value: num [1:731] 985 801 1349 1562 1600 ...
bike_transactions_tbl %>%
plot_time_series(date, value, .interactive = TRUE)
# Split Data 80/20
splits <- initial_time_split(bike_transactions_tbl, prop = 0.8)
set.seed(1234)
model_fit_arima_no_boost <- arima_reg() %>%
set_engine(engine = "auto_arima") %>%
fit(value ~ date, data = training(splits))
## frequency = 7 observations per 1 week
set.seed(1234)
model_fit_arima_boosted <- arima_boost(
min_n = 2,
learn_rate = 0.015
) %>%
set_engine(engine = "auto_arima_xgboost") %>%
fit(value ~ date + as.numeric(date) + factor(month(date, label = TRUE), ordered = FALSE),
data = training(splits))
## frequency = 7 observations per 1 week
set.seed(1234)
model_fit_ets <- exp_smoothing() %>%
set_engine(engine = "ets") %>%
fit(value ~ date, data = training(splits))
## frequency = 7 observations per 1 week
set.seed(1234)
model_fit_prophet <- prophet_reg() %>%
set_engine(engine = "prophet") %>%
fit(value ~ date, data = training(splits))
## Disabling yearly seasonality. Run prophet with yearly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
set.seed(1234)
model_fit_lm <- linear_reg() %>%
set_engine("lm") %>%
fit(value ~ as.numeric(date) + factor(month(date, label = TRUE), ordered = FALSE),
data = training(splits))
set.seed(1234)
model_fit_RF <- rand_forest(mode="regression",trees = 500, min_n = 50) %>%
set_engine("randomForest") %>%
fit(value ~ date + as.numeric(date) + factor(month(date, label = TRUE), ordered = FALSE),
data = training(splits))
models_tbl <- modeltime_table(
model_fit_arima_no_boost,
model_fit_arima_boosted,
model_fit_ets,
model_fit_prophet,
model_fit_lm,
model_fit_RF)
# View table
models_tbl
## # Modeltime Table
## # A tibble: 6 × 3
## .model_id .model .model_desc
## <int> <list> <chr>
## 1 1 <fit[+]> ARIMA(0,1,2) WITH DRIFT
## 2 2 <fit[+]> ARIMA(1,1,1) WITH DRIFT W/ XGBOOST ERRORS
## 3 3 <fit[+]> ETS(M,A,N)
## 4 4 <fit[+]> PROPHET
## 5 5 <fit[+]> LM
## 6 6 <fit[+]> RANDOMFOREST
calibration_tbl <- models_tbl %>%
modeltime_calibrate(new_data = testing(splits)) #Using the testing() dataset.
calibration_tbl
## # Modeltime Table
## # A tibble: 6 × 5
## .model_id .model .model_desc .type .calibration_data
## <int> <list> <chr> <chr> <list>
## 1 1 <fit[+]> ARIMA(0,1,2) WITH DRIFT Test <tibble>
## 2 2 <fit[+]> ARIMA(1,1,1) WITH DRIFT W/ XGBOOST… Test <tibble>
## 3 3 <fit[+]> ETS(M,A,N) Test <tibble>
## 4 4 <fit[+]> PROPHET Test <tibble>
## 5 5 <fit[+]> LM Test <tibble>
## 6 6 <fit[+]> RANDOMFOREST Test <tibble>
calibration_tbl %>%
modeltime_forecast(
new_data = testing(splits),
actual_data = bike_transactions_tbl
) %>%
plot_modeltime_forecast(
.legend_max_width = 25, # For mobile screens
.interactive = TRUE)
# Accuracy Metrics ----
calibration_tbl %>%
modeltime_accuracy() %>%
table_modeltime_accuracy(.interactive = TRUE)
refit_tbl <- calibration_tbl %>%
modeltime_refit(data = bike_transactions_tbl)
## frequency = 7 observations per 1 week
## frequency = 7 observations per 1 week
## frequency = 7 observations per 1 week
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
refit_tbl %>%
modeltime_forecast(h = "1 year", actual_data = bike_transactions_tbl) %>%
plot_modeltime_forecast(
.legend_max_width = 25, # For mobile screens
.interactive = TRUE)
As a result of refitting, the models have all changed.
The LM model looks much better now because the linear trend line has now
been fit to new data that follows the longer term trend.
The PROPHET model has a trend that is more representative of actual
trend.
A.M.D.G.