Load these libraries
library(tidyquant)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
## Loading required package: quantmod
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
## Loading required package: tidyverse
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.3.0
## v tibble 2.0.1 v dplyr 0.8.0.1
## v tidyr 0.8.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks xts::first()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks xts::last()
## x lubridate::setdiff() masks base::setdiff()
## x lubridate::union() masks base::union()
library(timetk)
library(broom)
library(tidyverse)
Using the Bike Sharing Dataset from the UCI Machine Learning Repository. Select the “day.csv” file which is aggregated to daily periodicity.
bikes <- read_csv("day.csv")
## Parsed with column specification:
## cols(
## instant = col_double(),
## dteday = col_date(format = ""),
## season = col_double(),
## yr = col_double(),
## mnth = col_double(),
## holiday = col_double(),
## weekday = col_double(),
## workingday = col_double(),
## weathersit = col_double(),
## temp = col_double(),
## atemp = col_double(),
## hum = col_double(),
## windspeed = col_double(),
## casual = col_double(),
## registered = col_double(),
## cnt = col_double()
## )
bikes <- bikes %>%
select(dteday, cnt) %>%
rename(date = dteday)
bikes %>%
ggplot(aes(x = date, y = cnt)) +
geom_rect(xmin = as.numeric(ymd("2012-07-01")),
xmax = as.numeric(ymd("2013-01-01")),
ymin = 0, ymax = 10000,
fill = palette_green()[[4]], alpha = 0.01) +
annotate("text", x = ymd("2011-10-01"), y = 7800,
color = palette_green()[[1]], label = "Train Data") +
annotate("text", x = ymd("2012-10-01"), y = 1550,
color = palette_green()[[1]], label = "Test Data") +
geom_point(alpha = 0.5, color = palette_green()[[1]]) +
labs(title = "Bikes Sharing: Daily Scale", x = "") +
theme_tq()
Split the data into train and test sets at “2012-07-01”
train <- bikes %>%
filter(date < ymd("2012-07-01"))
test <- bikes %>%
filter(date >= ymd("2012-07-01"))
Start with the training set, which has the “date” and “cnt” columns.
The first step is to add the time series signature to the training set, which will be used this to learn the patterns.
The most efficient method is using tk_augment_timeseries_signature(), which adds the columns we need as additional columns.
train_augmented <- train %>%
tk_augment_timeseries_signature()
train_augmented
## # A tibble: 547 x 30
## date cnt index.num diff year year.iso half quarter month
## <date> <dbl> <int> <int> <int> <int> <int> <int> <int>
## 1 2011-01-01 985 1.29e9 NA 2011 2010 1 1 1
## 2 2011-01-02 801 1.29e9 86400 2011 2010 1 1 1
## 3 2011-01-03 1349 1.29e9 86400 2011 2011 1 1 1
## 4 2011-01-04 1562 1.29e9 86400 2011 2011 1 1 1
## 5 2011-01-05 1600 1.29e9 86400 2011 2011 1 1 1
## 6 2011-01-06 1606 1.29e9 86400 2011 2011 1 1 1
## 7 2011-01-07 1510 1.29e9 86400 2011 2011 1 1 1
## 8 2011-01-08 959 1.29e9 86400 2011 2011 1 1 1
## 9 2011-01-09 822 1.29e9 86400 2011 2011 1 1 1
## 10 2011-01-10 1321 1.29e9 86400 2011 2011 1 1 1
## # ... with 537 more rows, and 21 more variables: month.xts <int>,
## # month.lbl <ord>, day <int>, hour <int>, minute <int>, second <int>,
## # hour12 <int>, am.pm <int>, wday <int>, wday.xts <int>, wday.lbl <ord>,
## # mday <int>, qday <int>, yday <int>, mweek <int>, week <int>,
## # week.iso <int>, week2 <int>, week3 <int>, week4 <int>, mday7 <int>
This is a number of fields that can be used for training, use these for modeling
Model using the augmented features
fit_lm <- lm(cnt ~ ., data = train_augmented)
Examine the model residuals to see if there is any significant pattern remaining using augment() from the broom package
Visualize the residuals of training set
fit_lm %>%
augment() %>%
ggplot(aes(x = date, y = .resid)) +
geom_hline(yintercept = 0, color = "red") +
geom_point(color = palette_green()[[1]], alpha = 0.5) +
theme_tq() +
labs(title = "Training Set: lm() Model Residuals", x = "") +
scale_y_continuous(limits = c(-5000, 5000))
This is a quick idea of the overall error of the model on the training set. Note that what we really care about is the error on the test set, as this is a much better predictor of future model performance.
sqrt(mean(fit_lm$residuals^2))
## [1] 858.4697
With a suitable model (low residual error and random residuals) we can forecast using the “test” set for validation purposes.
test_augmented <- test %>%
tk_augment_timeseries_signature()
test_augmented
## # A tibble: 184 x 30
## date cnt index.num diff year year.iso half quarter month
## <date> <dbl> <int> <int> <int> <int> <int> <int> <int>
## 1 2012-07-01 5531 1.34e9 NA 2012 2012 2 3 7
## 2 2012-07-02 6227 1.34e9 86400 2012 2012 2 3 7
## 3 2012-07-03 6660 1.34e9 86400 2012 2012 2 3 7
## 4 2012-07-04 7403 1.34e9 86400 2012 2012 2 3 7
## 5 2012-07-05 6241 1.34e9 86400 2012 2012 2 3 7
## 6 2012-07-06 6207 1.34e9 86400 2012 2012 2 3 7
## 7 2012-07-07 4840 1.34e9 86400 2012 2012 2 3 7
## 8 2012-07-08 4672 1.34e9 86400 2012 2012 2 3 7
## 9 2012-07-09 6569 1.34e9 86400 2012 2012 2 3 7
## 10 2012-07-10 6290 1.34e9 86400 2012 2012 2 3 7
## # ... with 174 more rows, and 21 more variables: month.xts <int>,
## # month.lbl <ord>, day <int>, hour <int>, minute <int>, second <int>,
## # hour12 <int>, am.pm <int>, wday <int>, wday.xts <int>, wday.lbl <ord>,
## # mday <int>, qday <int>, yday <int>, mweek <int>, week <int>,
## # week.iso <int>, week2 <int>, week3 <int>, week4 <int>, mday7 <int>
yhat_test <- predict(fit_lm, newdata = test_augmented)
## Warning in predict.lm(fit_lm, newdata = test_augmented): prediction from a
## rank-deficient fit may be misleading
Add the predictions (use add_column for numeric vectors) to the test set for comparison. Additionally, we can add the residuals using mutate(), which enables performing calculations between columns of a data frame.
pred_test <- test %>%
add_column(yhat = yhat_test) %>%
mutate(.resid = cnt - yhat)
pred_test
## # A tibble: 184 x 4
## date cnt yhat .resid
## <date> <dbl> <dbl> <dbl>
## 1 2012-07-01 5531 6827. -1296.
## 2 2012-07-02 6227 6840. -613.
## 3 2012-07-03 6660 7044. -384.
## 4 2012-07-04 7403 6905. 498.
## 5 2012-07-05 6241 7136. -895.
## 6 2012-07-06 6207 7166. -959.
## 7 2012-07-07 4840 7167. -2327.
## 8 2012-07-08 4672 6816. -2144.
## 9 2012-07-09 6569 6828. -259.
## 10 2012-07-10 6290 7033. -743.
## # ... with 174 more rows
ggplot(aes(x = date), data = bikes) +
geom_rect(xmin = as.numeric(ymd("2012-07-01")),
xmax = as.numeric(ymd("2013-01-01")),
ymin = 0, ymax = 10000,
fill = palette_green()[[4]], alpha = 0.01) +
annotate("text", x = ymd("2011-10-01"), y = 7800,
color = palette_green()[[1]], label = "Train Region") +
annotate("text", x = ymd("2012-10-01"), y = 1550,
color = palette_green()[[1]], label = "Test Region") +
geom_point(aes(x = date, y = cnt), data = train, alpha = 0.5, color = palette_green()[[1]]) +
geom_point(aes(x = date, y = cnt), data = pred_test, alpha = 0.5, color = palette_green()[[1]]) +
geom_point(aes(x = date, y = yhat), data = pred_test, alpha = 0.5, color = palette_green()[[2]]) +
theme_tq()
error_tbl <- pred_test %>%
mutate(pct_err = .resid/cnt * 100) %>%
summarize(
me = mean(.resid, na.rm = TRUE),
rmse = mean(.resid^2, na.rm = TRUE)^0.5,
mae = mean(abs(.resid), na.rm = TRUE),
mape = mean(abs(pct_err), na.rm = TRUE),
mpe = mean(pct_err, na.rm = TRUE)
)
error_tbl
## # A tibble: 1 x 5
## me rmse mae mape mpe
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -49.8 1448. 1061. 188. -174.
visualize the residuals of the test set
ggplot(aes(x = date, y = .resid), data = pred_test) +
geom_hline(yintercept = 0, color = "red") +
geom_point(color = palette_green()[[1]], alpha = 0.5) +
geom_smooth() +
theme_tq() +
labs(title = "Test Set: lm() Model Residuals", x = "") +
scale_y_continuous(limits = c(-5000, 5000))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
The first step is to create the date sequence. Use tk_get_timeseries_summary() to review the summary of the dates from the original dataset, “bikes”
# Extract bikes index
idx <- bikes %>%
tk_index()
# Get time series summary from index
bikes_summary <- idx %>%
tk_get_timeseries_summary()
The first six parameters are general summary information.
bikes_summary[1:6]
## # A tibble: 1 x 6
## n.obs start end units scale tzone
## <int> <date> <date> <chr> <chr> <chr>
## 1 731 2011-01-01 2012-12-31 days day UTC
The second six parameters are the periodicity information
bikes_summary[7:12]
## # A tibble: 1 x 6
## diff.minimum diff.q1 diff.median diff.mean diff.q3 diff.maximum
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 86400 86400 86400 86400 86400 86400
the data is 100% regular because the median and mean differences are 86400 seconds or 1 day
idx_future <- idx %>%
tk_make_future_timeseries(n_future = 180)
To make the prediction, use the future index to get the time series signature (tk_get_timeseries_signature()). Rename the column “index” to “date” so it matches the column names of the original data.
data_future <- idx_future %>%
tk_get_timeseries_signature() %>%
rename(date = index)
Make the prediction
pred_future <- predict(fit_lm, newdata = data_future)
## Warning in predict.lm(fit_lm, newdata = data_future): prediction from a
## rank-deficient fit may be misleading
Build the future data frame
bikes_future <- data_future %>%
select(date) %>%
add_column(cnt = pred_future)
Visualize
bikes %>%
ggplot(aes(x = date, y = cnt)) +
geom_rect(xmin = as.numeric(ymd("2012-07-01")),
xmax = as.numeric(ymd("2013-01-01")),
ymin = 0, ymax = 10000,
fill = palette_green()[[4]], alpha = 0.01) +
geom_rect(xmin = as.numeric(ymd("2013-01-01")),
xmax = as.numeric(ymd("2013-07-01")),
ymin = 0, ymax = 10000,
fill = palette_green()[[3]], alpha = 0.01) +
annotate("text", x = ymd("2011-10-01"), y = 7800,
color = palette_green()[[1]], label = "Train Data") +
annotate("text", x = ymd("2012-10-01"), y = 1550,
color = palette_green()[[1]], label = "Test Data") +
annotate("text", x = ymd("2013-4-01"), y = 1550,
color = palette_green()[[1]], label = "Forecast Data") +
geom_point(alpha = 0.5, color = palette_green()[[1]]) +
geom_point(aes(x = date, y = cnt), data = bikes_future,
alpha = 0.5, color = palette_green()[[2]]) +
geom_smooth(aes(x = date, y = cnt), data = bikes_future,
method = 'loess') +
labs(title = "Bikes Sharing: 6-Month Forecast", x = "") +
theme_tq()
test_resid_sd <- sd(pred_test$.resid)
bikes_future <- bikes_future %>%
mutate(
lo.95 = cnt - 1.96 * test_resid_sd,
lo.80 = cnt - 1.28 * test_resid_sd,
hi.80 = cnt + 1.28 * test_resid_sd,
hi.95 = cnt + 1.96 * test_resid_sd
)
Visualize
bikes %>%
ggplot(aes(x = date, y = cnt)) +
geom_point(alpha = 0.5, color = palette_green()[[1]]) +
geom_ribbon(aes(ymin = lo.95, ymax = hi.95), data = bikes_future,
fill = "#D5DBFF", color = NA, size = 0) +
geom_ribbon(aes(ymin = lo.80, ymax = hi.80, fill = key), data = bikes_future,
fill = "#596DD5", color = NA, size = 0, alpha = 0.8) +
geom_point(aes(x = date, y = cnt), data = bikes_future,
alpha = 0.5, color = palette_green()[[2]]) +
geom_smooth(aes(x = date, y = cnt), data = bikes_future,
method = 'loess', color = "white") +
labs(title = "Bikes Sharing: 6-Month Forecast with Prediction Intervals", x = "") +
theme_tq()