Forecasting Using a Time Series

Libraries

Load these libraries

library(tidyquant)

## Loading required package: lubridate

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

## Loading required package: PerformanceAnalytics

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'PerformanceAnalytics'

## The following object is masked from 'package:graphics':
## 
##     legend

## Loading required package: quantmod

## Loading required package: TTR

## Version 0.4-0 included new data defaults. See ?getSymbols.

## Loading required package: tidyverse

## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.1.0       v purrr   0.3.0  
## v tibble  2.0.1       v dplyr   0.8.0.1
## v tidyr   0.8.2       v stringr 1.4.0  
## v readr   1.3.1       v forcats 0.4.0

## -- Conflicts ---------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date()        masks base::date()
## x dplyr::filter()          masks stats::filter()
## x dplyr::first()           masks xts::first()
## x lubridate::intersect()   masks base::intersect()
## x dplyr::lag()             masks stats::lag()
## x dplyr::last()            masks xts::last()
## x lubridate::setdiff()     masks base::setdiff()
## x lubridate::union()       masks base::union()

library(timetk)
library(broom)
library(tidyverse)

Data

Using the Bike Sharing Dataset from the UCI Machine Learning Repository. Select the “day.csv” file which is aggregated to daily periodicity.

bikes <- read_csv("day.csv")

## Parsed with column specification:
## cols(
##   instant = col_double(),
##   dteday = col_date(format = ""),
##   season = col_double(),
##   yr = col_double(),
##   mnth = col_double(),
##   holiday = col_double(),
##   weekday = col_double(),
##   workingday = col_double(),
##   weathersit = col_double(),
##   temp = col_double(),
##   atemp = col_double(),
##   hum = col_double(),
##   windspeed = col_double(),
##   casual = col_double(),
##   registered = col_double(),
##   cnt = col_double()
## )

bikes <- bikes %>%
    select(dteday, cnt) %>%
    rename(date = dteday)

Splitting data test and data train

bikes %>%
    ggplot(aes(x = date, y = cnt)) +
    geom_rect(xmin = as.numeric(ymd("2012-07-01")),
              xmax = as.numeric(ymd("2013-01-01")),
              ymin = 0, ymax = 10000,
              fill = palette_green()[[4]], alpha = 0.01) +
    annotate("text", x = ymd("2011-10-01"), y = 7800,
             color = palette_green()[[1]], label = "Train Data") +
    annotate("text", x = ymd("2012-10-01"), y = 1550,
             color = palette_green()[[1]], label = "Test Data") +
    geom_point(alpha = 0.5, color = palette_green()[[1]]) +
    labs(title = "Bikes Sharing: Daily Scale", x = "") +
    theme_tq()

Split the data into train and test sets at “2012-07-01”

train <- bikes %>%
    filter(date < ymd("2012-07-01"))

test <- bikes %>%
    filter(date >= ymd("2012-07-01"))

Modeling

Start with the training set, which has the “date” and “cnt” columns.

The first step is to add the time series signature to the training set, which will be used this to learn the patterns.

The most efficient method is using tk_augment_timeseries_signature(), which adds the columns we need as additional columns.

train_augmented <- train %>%
    tk_augment_timeseries_signature()
train_augmented

## # A tibble: 547 x 30
##    date         cnt index.num  diff  year year.iso  half quarter month
##    <date>     <dbl>     <int> <int> <int>    <int> <int>   <int> <int>
##  1 2011-01-01   985    1.29e9    NA  2011     2010     1       1     1
##  2 2011-01-02   801    1.29e9 86400  2011     2010     1       1     1
##  3 2011-01-03  1349    1.29e9 86400  2011     2011     1       1     1
##  4 2011-01-04  1562    1.29e9 86400  2011     2011     1       1     1
##  5 2011-01-05  1600    1.29e9 86400  2011     2011     1       1     1
##  6 2011-01-06  1606    1.29e9 86400  2011     2011     1       1     1
##  7 2011-01-07  1510    1.29e9 86400  2011     2011     1       1     1
##  8 2011-01-08   959    1.29e9 86400  2011     2011     1       1     1
##  9 2011-01-09   822    1.29e9 86400  2011     2011     1       1     1
## 10 2011-01-10  1321    1.29e9 86400  2011     2011     1       1     1
## # ... with 537 more rows, and 21 more variables: month.xts <int>,
## #   month.lbl <ord>, day <int>, hour <int>, minute <int>, second <int>,
## #   hour12 <int>, am.pm <int>, wday <int>, wday.xts <int>, wday.lbl <ord>,
## #   mday <int>, qday <int>, yday <int>, mweek <int>, week <int>,
## #   week.iso <int>, week2 <int>, week3 <int>, week4 <int>, mday7 <int>

This is a number of fields that can be used for training, use these for modeling

Model using the augmented features

fit_lm <- lm(cnt ~ ., data = train_augmented)

Examine the model residuals to see if there is any significant pattern remaining using augment() from the broom package

Visualize the residuals of training set

fit_lm %>%
    augment() %>%
    ggplot(aes(x = date, y = .resid)) +
    geom_hline(yintercept = 0, color = "red") +
    geom_point(color = palette_green()[[1]], alpha = 0.5) +
    theme_tq() +
    labs(title = "Training Set: lm() Model Residuals", x = "") +
    scale_y_continuous(limits = c(-5000, 5000))

This is a quick idea of the overall error of the model on the training set. Note that what we really care about is the error on the test set, as this is a much better predictor of future model performance.

sqrt(mean(fit_lm$residuals^2))

## [1] 858.4697

Test Validation

With a suitable model (low residual error and random residuals) we can forecast using the “test” set for validation purposes.

test_augmented <- test %>%
    tk_augment_timeseries_signature()
test_augmented

## # A tibble: 184 x 30
##    date         cnt index.num  diff  year year.iso  half quarter month
##    <date>     <dbl>     <int> <int> <int>    <int> <int>   <int> <int>
##  1 2012-07-01  5531    1.34e9    NA  2012     2012     2       3     7
##  2 2012-07-02  6227    1.34e9 86400  2012     2012     2       3     7
##  3 2012-07-03  6660    1.34e9 86400  2012     2012     2       3     7
##  4 2012-07-04  7403    1.34e9 86400  2012     2012     2       3     7
##  5 2012-07-05  6241    1.34e9 86400  2012     2012     2       3     7
##  6 2012-07-06  6207    1.34e9 86400  2012     2012     2       3     7
##  7 2012-07-07  4840    1.34e9 86400  2012     2012     2       3     7
##  8 2012-07-08  4672    1.34e9 86400  2012     2012     2       3     7
##  9 2012-07-09  6569    1.34e9 86400  2012     2012     2       3     7
## 10 2012-07-10  6290    1.34e9 86400  2012     2012     2       3     7
## # ... with 174 more rows, and 21 more variables: month.xts <int>,
## #   month.lbl <ord>, day <int>, hour <int>, minute <int>, second <int>,
## #   hour12 <int>, am.pm <int>, wday <int>, wday.xts <int>, wday.lbl <ord>,
## #   mday <int>, qday <int>, yday <int>, mweek <int>, week <int>,
## #   week.iso <int>, week2 <int>, week3 <int>, week4 <int>, mday7 <int>

yhat_test <- predict(fit_lm, newdata = test_augmented)

## Warning in predict.lm(fit_lm, newdata = test_augmented): prediction from a
## rank-deficient fit may be misleading

Add the predictions (use add_column for numeric vectors) to the test set for comparison. Additionally, we can add the residuals using mutate(), which enables performing calculations between columns of a data frame.

pred_test <- test %>%
    add_column(yhat = yhat_test) %>%
    mutate(.resid = cnt - yhat)
pred_test

## # A tibble: 184 x 4
##    date         cnt  yhat .resid
##    <date>     <dbl> <dbl>  <dbl>
##  1 2012-07-01  5531 6827. -1296.
##  2 2012-07-02  6227 6840.  -613.
##  3 2012-07-03  6660 7044.  -384.
##  4 2012-07-04  7403 6905.   498.
##  5 2012-07-05  6241 7136.  -895.
##  6 2012-07-06  6207 7166.  -959.
##  7 2012-07-07  4840 7167. -2327.
##  8 2012-07-08  4672 6816. -2144.
##  9 2012-07-09  6569 6828.  -259.
## 10 2012-07-10  6290 7033.  -743.
## # ... with 174 more rows

Visualize with ggplot

ggplot(aes(x = date), data = bikes) +
    geom_rect(xmin = as.numeric(ymd("2012-07-01")),
              xmax = as.numeric(ymd("2013-01-01")),
              ymin = 0, ymax = 10000,
              fill = palette_green()[[4]], alpha = 0.01) +
    annotate("text", x = ymd("2011-10-01"), y = 7800,
             color = palette_green()[[1]], label = "Train Region") +
    annotate("text", x = ymd("2012-10-01"), y = 1550,
             color = palette_green()[[1]], label = "Test Region") + 
    geom_point(aes(x = date, y = cnt), data = train, alpha = 0.5, color = palette_green()[[1]]) +
    geom_point(aes(x = date, y = cnt), data = pred_test, alpha = 0.5, color = palette_green()[[1]]) +
    geom_point(aes(x = date, y = yhat), data = pred_test, alpha = 0.5, color = palette_green()[[2]]) +
    theme_tq()

Test Accuracy

error_tbl <- pred_test %>%
  mutate(pct_err = .resid/cnt * 100) %>%
  summarize(
    me = mean(.resid, na.rm = TRUE),
    rmse = mean(.resid^2, na.rm = TRUE)^0.5,
    mae = mean(abs(.resid), na.rm = TRUE),
    mape = mean(abs(pct_err), na.rm = TRUE),
    mpe = mean(pct_err, na.rm = TRUE)
  )

error_tbl

## # A tibble: 1 x 5
##      me  rmse   mae  mape   mpe
##   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -49.8 1448. 1061.  188. -174.

visualize the residuals of the test set

ggplot(aes(x = date, y = .resid), data = pred_test) +
    geom_hline(yintercept = 0, color = "red") +
    geom_point(color = palette_green()[[1]], alpha = 0.5) +
    geom_smooth() +
    theme_tq() +
    labs(title = "Test Set: lm() Model Residuals", x = "") +
    scale_y_continuous(limits = c(-5000, 5000))

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).

Forecasting

The first step is to create the date sequence. Use tk_get_timeseries_summary() to review the summary of the dates from the original dataset, “bikes”

# Extract bikes index
idx <- bikes %>%
    tk_index()

# Get time series summary from index
bikes_summary <- idx %>%
    tk_get_timeseries_summary()

The first six parameters are general summary information.

bikes_summary[1:6]

## # A tibble: 1 x 6
##   n.obs start      end        units scale tzone
##   <int> <date>     <date>     <chr> <chr> <chr>
## 1   731 2011-01-01 2012-12-31 days  day   UTC

The second six parameters are the periodicity information

bikes_summary[7:12]

## # A tibble: 1 x 6
##   diff.minimum diff.q1 diff.median diff.mean diff.q3 diff.maximum
##          <dbl>   <dbl>       <dbl>     <dbl>   <dbl>        <dbl>
## 1        86400   86400       86400     86400   86400        86400

the data is 100% regular because the median and mean differences are 86400 seconds or 1 day

idx_future <- idx %>%
    tk_make_future_timeseries(n_future = 180)

To make the prediction, use the future index to get the time series signature (tk_get_timeseries_signature()). Rename the column “index” to “date” so it matches the column names of the original data.

data_future <- idx_future %>%
    tk_get_timeseries_signature() %>%
    rename(date = index)

Make the prediction

pred_future <- predict(fit_lm, newdata = data_future)

## Warning in predict.lm(fit_lm, newdata = data_future): prediction from a
## rank-deficient fit may be misleading

Build the future data frame

bikes_future <- data_future %>%
    select(date) %>%
    add_column(cnt = pred_future)

Visualize

bikes %>%
    ggplot(aes(x = date, y = cnt)) +
    geom_rect(xmin = as.numeric(ymd("2012-07-01")),
              xmax = as.numeric(ymd("2013-01-01")),
              ymin = 0, ymax = 10000,
              fill = palette_green()[[4]], alpha = 0.01) +
    geom_rect(xmin = as.numeric(ymd("2013-01-01")),
              xmax = as.numeric(ymd("2013-07-01")),
              ymin = 0, ymax = 10000,
              fill = palette_green()[[3]], alpha = 0.01) +
    annotate("text", x = ymd("2011-10-01"), y = 7800,
             color = palette_green()[[1]], label = "Train Data") +
    annotate("text", x = ymd("2012-10-01"), y = 1550,
             color = palette_green()[[1]], label = "Test Data") +
    annotate("text", x = ymd("2013-4-01"), y = 1550,
             color = palette_green()[[1]], label = "Forecast Data") +
    geom_point(alpha = 0.5, color = palette_green()[[1]]) +
    geom_point(aes(x = date, y = cnt), data = bikes_future,
               alpha = 0.5, color = palette_green()[[2]]) +
    geom_smooth(aes(x = date, y = cnt), data = bikes_future,
                method = 'loess') + 
    labs(title = "Bikes Sharing: 6-Month Forecast", x = "") +
    theme_tq()

Error Forecasting

test_resid_sd <- sd(pred_test$.resid)

bikes_future <- bikes_future %>%
    mutate(
        lo.95 = cnt - 1.96 * test_resid_sd,
        lo.80 = cnt - 1.28 * test_resid_sd,
        hi.80 = cnt + 1.28 * test_resid_sd,
        hi.95 = cnt + 1.96 * test_resid_sd
        )

Visualize

bikes %>%
    ggplot(aes(x = date, y = cnt)) +
    geom_point(alpha = 0.5, color = palette_green()[[1]]) +
    geom_ribbon(aes(ymin = lo.95, ymax = hi.95), data = bikes_future, 
                fill = "#D5DBFF", color = NA, size = 0) +
    geom_ribbon(aes(ymin = lo.80, ymax = hi.80, fill = key), data = bikes_future,
                fill = "#596DD5", color = NA, size = 0, alpha = 0.8) +
    geom_point(aes(x = date, y = cnt), data = bikes_future,
               alpha = 0.5, color = palette_green()[[2]]) +
    geom_smooth(aes(x = date, y = cnt), data = bikes_future,
                method = 'loess', color = "white") + 
    labs(title = "Bikes Sharing: 6-Month Forecast with Prediction Intervals", x = "") +
    theme_tq()

lbb_mlforecasting

Ricky Ariwibowo

March 19, 2019