Including Plots

You can also embed plots, for example:

list=c("tidyverse","tidyquant","timekit","modelr","lubridate","timetk","plotly","stringr","tseries","DT","sweep","scales","readxl","broom","tibble","gam")


 list_packages <- list
new.packages <- list_packages[!(list_packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

R<-suppressWarnings(suppressMessages(sapply(list, library, character.only = TRUE)))
#options(na.action = na.warn)
modeldat=read_excel("/Users/nanaakwasiabayieboateng/Documents/memphisclassesbooks/Time series/Manpower/actvsfcdatajuly311.xlsx",sheet = "Sheet1")%>%na.omit()%>%dplyr::rename(Date=Week)
modeldat%>%tail()%>%knitr::kable()
Date Actual
2017-08-14 32
2017-08-21 30
2017-08-28 27
2017-09-04 32
2017-09-11 32
2017-09-18 36
modeldat[61,]
## # A tibble: 1 x 2
##         Date Actual
##       <dttm>  <dbl>
## 1 2017-03-27     29
dim(modeldat)
## [1] 86  2
glimpse(modeldat)
## Observations: 86
## Variables: 2
## $ Date   <dttm> 2016-01-11, 2016-01-18, 2016-01-25, 2016-02-01, 2016-0...
## $ Actual <dbl> 26, 27, 28, 22, 27, 31, 26, 27, 21, 23, 25, 23, 23, 23,...
modeldat%>%mutate(Date=as.Date(Date))%>%summary()%>%knitr::kable()
Date Actual
Min. :2016-01-11 Min. :21.00
1st Qu.:2016-06-07 1st Qu.:27.00
Median :2016-11-03 Median :30.00
Mean :2016-11-13 Mean :29.57
3rd Qu.:2017-04-22 3rd Qu.:32.00
Max. :2017-09-18 Max. :40.00

Split the data into train and test sets at “2012-07-01”.

# Split into training(first 60) and test sets(last 21)

modeldat=modeldat%>%mutate(Date=as.Date(Date))

train <- modeldat %>%
    dplyr::filter(Date < ymd("2017-03-27"))

test <- modeldat %>%
   dplyr:: filter(Date >= ymd("2017-03-27"))

Modeling

Start with the training set, which has the “date” and “cnt” columns.

# Training set
train%>%head()%>%knitr::kable()
Date Actual
2016-01-11 26
2016-01-18 27
2016-01-25 28
2016-02-01 22
2016-02-08 27
2016-02-15 31
test%>%tail()%>%knitr::kable()
Date Actual
2017-08-14 32
2017-08-21 30
2017-08-28 27
2017-09-04 32
2017-09-11 32
2017-09-18 36

A visualization will help understand how we plan to tackle the problem of forecasting the data. We’ll split the data into two regions: a training region and a testing region.

modeldat%>%ggplot(aes(Date,Actual))+geom_line()+

geom_point(alpha = 0.5, color = palette_light()[[1]], shape=20,size=2) +
  labs(title = "Drivers  Forecast ", x = "Date") +
    theme_tq()

modeldat%>%
    ggplot(aes(x = Date, y = Actual)) +
    geom_rect(xmin = as.numeric(ymd("2017-03-27")),
              xmax = as.numeric(ymd("2017-10-21")),
              ymin = 20, ymax = 50,
              fill = palette_light()[[3]], alpha = 0.01) +
  ggplot2::annotate("text", x = ymd("2016-03-17"), y = 45,
             color = palette_light()[[1]], label = "Train Region") +
   ggplot2::annotate("text", x = ymd("2017-06-01"), y = 45,
             color = palette_light()[[1]], label = "Test Region") +
  geom_point(alpha = 0.5, color = palette_light()[[1]], shape=20,size=2) +
  labs(title = "Bikes Sharing Dataset: Daily Scale", x = "") +
    theme_tq()

The first step is to add the time series signature to the training set, which will be used this to learn the patterns. The most efficient method is using tk_augment_timeseries_signature(), which adds the columns we need as additional columns.

# Add time series signature
train_augmented <- train %>%
    tk_augment_timeseries_signature()


train_augmented%>%head()%>%knitr::kable( format = "markdown", caption = "Training Data")
Date Actual index.num diff year year.iso half quarter month month.xts month.lbl day hour minute second hour12 am.pm wday wday.xts wday.lbl mday qday yday mweek week week.iso week2 week3 week4 mday7
2016-01-11 26 1452470400 NA 2016 2016 1 1 1 0 January 11 0 0 0 0 1 2 1 Monday 11 11 11 3 2 2 0 2 2 2
2016-01-18 27 1453075200 604800 2016 2016 1 1 1 0 January 18 0 0 0 0 1 2 1 Monday 18 18 18 4 3 3 1 0 3 3
2016-01-25 28 1453680000 604800 2016 2016 1 1 1 0 January 25 0 0 0 0 1 2 1 Monday 25 25 25 5 4 4 0 1 0 4
2016-02-01 22 1454284800 604800 2016 2016 1 1 2 1 February 1 0 0 0 0 1 2 1 Monday 1 32 32 6 5 5 1 2 1 1
2016-02-08 27 1454889600 604800 2016 2016 1 1 2 1 February 8 0 0 0 0 1 2 1 Monday 8 39 39 2 6 6 0 0 2 2
2016-02-15 31 1455494400 604800 2016 2016 1 1 2 1 February 15 0 0 0 0 1 2 1 Monday 15 46 46 3 7 7 1 1 3 3
#%>%knitr::kable( format = "html")


dat_augmented <- modeldat %>%
    tk_augment_timeseries_signature()

Now that we have a number of fields that can be used for training, we can use these for modeling. In practice, you will want to go through the process of pre-processing the data, centering and scaling if necessary, making dummy variables, removing correlated variables that are present, examining interactions, etc. For brevity, we do not do this here.

#fit an additive model
#fit an additive model

#fit_lm1<-gam(Actual~ s(Date)+s(quarter)+s(month)+s(day), data =dat_augmented )


fit_lm1<-gam::gam(Actual~ s(Date)+s(quarter)+s(month)+s(day), data =dat_augmented )

#fit_lm1<-lm(Actual~ 0+Date+quarter+month+day, data =dat_augmented )

summary(fit_lm1)
## 
## Call: gam::gam(formula = Actual ~ s(Date) + s(quarter) + s(month) + 
##     s(day), data = dat_augmented)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4968 -2.0114 -0.0306  1.7403  6.3624 
## 
## (Dispersion Parameter for gaussian family taken to be 10.2734)
## 
##     Null Deviance: 1575.081 on 85 degrees of freedom
## Residual Deviance: 719.1369 on 69.9998 degrees of freedom
## AIC: 460.6963 
## 
## Number of Local Scoring Iterations: 3 
## 
## Anova for Parametric Effects
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## s(Date)     1  57.73   57.73  5.6194   0.02052 *  
## s(quarter)  1 344.69  344.69 33.5521 1.814e-07 ***
## s(month)    1   7.94    7.94  0.7730   0.38230    
## s(day)      1   1.85    1.85  0.1804   0.67232    
## Residuals  70 719.14   10.27                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##             Npar Df  Npar F    Pr(F)    
## (Intercept)                             
## s(Date)           3  2.2142  0.09407 .  
## s(quarter)        2  2.2966  0.10813    
## s(month)          3 12.5093 1.22e-06 ***
## s(day)            3  0.3194  0.81132    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#remove first row it contains NA

smry_additvemod=as.list.data.frame(summary(fit_lm1))

smry_additvemod$anova%>%timetk::tk_tbl()%>%slice(-1)%>%knitr::kable()
index Npar Df Npar F Pr(F)
s(Date) 3 2.214190 0.0940729
s(quarter) 2 2.296616 0.1081313
s(month) 3 12.509343 0.0000012
s(day) 3 0.319370 0.8113225
# You can also choose to fill in missing values

smry_additvemod_anova=smry_additvemod$anova%>%timetk::tk_tbl()
smry_additvemod_anova%>%knitr::kable()
index Npar Df Npar F Pr(F)
(Intercept) NA NA NA
s(Date) 3 2.214190 0.0940729
s(quarter) 2 2.296616 0.1081313
s(month) 3 12.509343 0.0000012
s(day) 3 0.319370 0.8113225
# replace white spaces in column names with _

names(smry_additvemod_anova)<-gsub("\\s", "_", names(smry_additvemod_anova))

smry_additvemod_anova%>%knitr::kable()
index Npar_Df Npar_F Pr(F)
(Intercept) NA NA NA
s(Date) 3 2.214190 0.0940729
s(quarter) 2 2.296616 0.1081313
s(month) 3 12.509343 0.0000012
s(day) 3 0.319370 0.8113225
# You can also choose to fill in missing values
smry_additvemod_anova %>% na.omit()%>%knitr::kable()
index Npar_Df Npar_F Pr(F)
s(Date) 3 2.214190 0.0940729
s(quarter) 2 2.296616 0.1081313
s(month) 3 12.509343 0.0000012
s(day) 3 0.319370 0.8113225
smry_additvemod_panova=smry_additvemod$parametric.anova%>%timetk::tk_tbl()

# replace white spaces in column names with _

names(smry_additvemod_panova)<-gsub("\\s", "_", names(smry_additvemod_panova))

#%>% replace_na(list(F_value = "-"))

smry_additvemod_panova%>%replace_na(list(F_value = "-"))
## # A tibble: 5 x 6
##        index       Df     Sum_Sq    Mean_Sq           F_value     `Pr(>F)`
##        <chr>    <dbl>      <dbl>      <dbl>             <chr>        <dbl>
## 1    s(Date)  1.00000  57.730544  57.730544  5.61941467508312 2.052095e-02
## 2 s(quarter)  1.00000 344.694531 344.694531  33.5521092940803 1.813960e-07
## 3   s(month)  1.00000   7.941417   7.941417 0.773006974355052 3.822967e-01
## 4     s(day)  1.00000   1.853444   1.853444 0.180411748026327 6.723224e-01
## 5  Residuals 69.99983 719.136881  10.273409                 -           NA
tidy(fit_lm1)
##         term       df      sumsq     meansq  statistic      p.value
## 1    s(Date)  1.00000  57.730544  57.730544  5.6194147 2.052095e-02
## 2 s(quarter)  1.00000 344.694531 344.694531 33.5521093 1.813960e-07
## 3   s(month)  1.00000   7.941417   7.941417  0.7730070 3.822967e-01
## 4     s(day)  1.00000   1.853444   1.853444  0.1804117 6.723224e-01
## 5  Residuals 69.99983 719.136881  10.273409         NA           NA
tidy(fit_lm1)["p.value"]%>%knitr::kable()
p.value
0.0205210
0.0000002
0.3822967
0.6723224
NA
# add fitted,resid to the train_augmented data

aug=broom::augment_columns(fit_lm1,dat_augmented)

aug%>%head()%>%knitr::kable()
Date Actual index.num diff year year.iso half quarter month month.xts month.lbl day hour minute second hour12 am.pm wday wday.xts wday.lbl mday qday yday mweek week week.iso week2 week3 week4 mday7 .fitted .se.fit .resid .hat .sigma .cooksd .std.resid
2016-01-11 26 1452470400 NA 2016 2016 1 1 1 0 January 11 0 0 0 0 1 2 1 Monday 11 11 11 3 2 2 0 2 2 2 25.77137 1.724674 0.2286334 0.0732964 2.998084 0.0000869 0.0740989
2016-01-18 27 1453075200 604800 2016 2016 1 1 1 0 January 18 0 0 0 0 1 2 1 Monday 18 18 18 4 3 3 1 0 3 3 25.81127 1.644834 1.1887285 0.0684920 2.995037 0.0021714 0.3842661
2016-01-25 28 1453680000 604800 2016 2016 1 1 1 0 January 25 0 0 0 0 1 2 1 Monday 25 25 25 5 4 4 0 1 0 4 25.40803 1.583149 2.5919720 0.0785889 2.982963 0.0121068 0.8424542
2016-02-01 22 1454284800 604800 2016 2016 1 1 2 1 February 1 0 0 0 0 1 2 1 Monday 1 32 32 6 5 5 1 2 1 1 25.04003 1.673748 -3.0400257 0.0821441 2.977138 0.0175427 -0.9899944
2016-02-08 27 1454889600 604800 2016 2016 1 1 2 1 February 8 0 0 0 0 1 2 1 Monday 8 39 39 2 6 6 0 0 2 2 25.27167 1.269673 1.7283311 0.0569804 2.991591 0.0037261 0.5552765
2016-02-15 31 1455494400 604800 2016 2016 1 1 2 1 February 15 0 0 0 0 1 2 1 Monday 15 46 46 3 7 7 1 1 3 3 25.48693 1.217385 5.5130731 0.0467180 2.930984 0.0304190 1.7616749

The coefficients are given below

fit_lm1$coefficients
##  (Intercept)      s(Date)   s(quarter)     s(month)       s(day) 
## -5.959531079  0.001801862  3.206040745 -0.392055110 -0.016794206
library(RColorBrewer)
colors = brewer.pal(8, "Dark2")

#To manual add a legend,include the color="name" in the aes() of each plot

cols <- c("LINE1"="#f04546","LINE2"="#3591d1","BAR"="#62c76b")

aug%>%ggplot(aes(x=Date,y=Actual))+geom_line(aes(x=Date,y=.fitted,color="Fitted"),show.legend=TRUE)+geom_line(aes(x=Date,y=Actual,color="Observed"),data=dat_augmented, show.legend=TRUE)+
 labs(title = "Predicted Values vs Observed values", x = "Time")+
 
  theme_bw() +
# define a custom  background theme
#theme(panel.background = element_rect(fill = 'grey75'))  +
  

scale_colour_manual(name='GAM\n Method',values=c("Fitted"="red","Observed"='black'))+

#Make title bold and add a little space at the baseline (face, vjust)
  
theme(plot.title = element_text(size=14, face="bold", vjust=2))+
  
#change the position of the legend
  
theme(legend.position="top") + 

# Change the color of the  title of the legend (name)  
  
theme(legend.title = element_text(colour="chocolate", size=14, face="bold"))

We can examine the model residuals to see if there is any significant pattern remaining using augment() from the broom package.

library(ggfortify)


# ggplot2::autoplot(fit_lm1, which = 1:6, nrow=2, label.size = F,na.rm=TRUE)+ theme_tq()
unlist(tidy(fit_lm1)["p.value"])
##     p.value1     p.value2     p.value3     p.value4     p.value5 
## 2.052095e-02 1.813960e-07 3.822967e-01 6.723224e-01           NA
as.integer(unlist(tidy(fit_lm1)["p.value"]))
## [1]  0  0  0  0 NA

Visualize the residuals of training set

     aug %>%
     ggplot(aes(x =Date, y = .resid)) +
  geom_point(color = palette_light()[[1]], alpha = 0.5)+
  
geom_hline(yintercept = 0, color = "red") +
 
     theme_bw() +
     

     labs(title = "Training Set: Additive  Model Residuals (gam)", x = "") 

We can also get a quick idea of the overall error of the model on the training set. Note that what we really care about is the error on the test set, as this is a much better predictor of future model performance.

# RMSE
sqrt(mean(fit_lm1$residuals^2))
## [1] 2.891722

Test Validation

With a suitable model (low residual error and random residuals) we can forecast using the “test” set for validation purposes.

test%>%head()%>%knitr::kable()
Date Actual
2017-03-27 29
2017-04-03 30
2017-04-10 32
2017-04-17 36
2017-04-24 32
2017-05-01 27

We need to again augment the time series signature to the test set.

test_augmented <- test %>%
    tk_augment_timeseries_signature()
test_augmented%>%head()%>%knitr::kable()
Date Actual index.num diff year year.iso half quarter month month.xts month.lbl day hour minute second hour12 am.pm wday wday.xts wday.lbl mday qday yday mweek week week.iso week2 week3 week4 mday7
2017-03-27 29 1490572800 NA 2017 2017 1 1 3 2 March 27 0 0 0 0 1 2 1 Monday 27 86 86 5 13 13 1 1 1 4
2017-04-03 30 1491177600 604800 2017 2017 1 2 4 3 April 3 0 0 0 0 1 2 1 Monday 3 3 93 2 14 14 0 2 2 1
2017-04-10 32 1491782400 604800 2017 2017 1 2 4 3 April 10 0 0 0 0 1 2 1 Monday 10 10 100 3 15 15 1 0 3 2
2017-04-17 36 1492387200 604800 2017 2017 1 2 4 3 April 17 0 0 0 0 1 2 1 Monday 17 17 107 4 16 16 0 1 0 3
2017-04-24 32 1492992000 604800 2017 2017 1 2 4 3 April 24 0 0 0 0 1 2 1 Monday 24 24 114 5 17 17 1 2 1 4
2017-05-01 27 1493596800 604800 2017 2017 1 2 5 4 May 1 0 0 0 0 1 2 1 Monday 1 31 121 6 18 18 0 0 2 1

Next, use predict() to apply the model to the test set.

yhat_test <- predict(fit_lm1, newdata = test_augmented)

yhat_test%>%knitr::kable()
25.84304
29.18919
29.48293
29.63722
29.16704
30.37889
30.61470
30.82938
30.34374
29.97607
31.41751
31.65067
31.57561
31.26805
33.61198
33.89416
34.05252
33.60279
33.07173
32.20581
32.48775
32.18068
31.94176
30.18704
30.51578
30.64140

Add the predictions (use add_column for numeric vectors) to the test set for comparison. Additionally, we can add the residuals using mutate(), which enables performing calculations between columns of a data frame.

options(dplyr.tibble.print = 21)
pred_test <- test %>%
    add_column(yhat = yhat_test) %>%
    mutate(.resid = Actual - yhat)

pred_test%>%knitr::kable()
Date Actual yhat .resid
2017-03-27 29 25.84304 3.1569632
2017-04-03 30 29.18919 0.8108056
2017-04-10 32 29.48293 2.5170742
2017-04-17 36 29.63722 6.3627782
2017-04-24 32 29.16704 2.8329609
2017-05-01 27 30.37889 -3.3788875
2017-05-08 32 30.61470 1.3853030
2017-05-15 28 30.82938 -2.8293823
2017-05-22 28 30.34374 -2.3437352
2017-05-29 30 29.97607 0.0239255
2017-06-05 34 31.41751 2.5824942
2017-06-12 32 31.65067 0.3493289
2017-06-19 28 31.57561 -3.5756094
2017-06-26 34 31.26805 2.7319514
2017-07-03 32 33.61198 -1.6119768
2017-07-10 35 33.89416 1.1058382
2017-07-17 32 34.05252 -2.0525211
2017-07-24 30 33.60279 -3.6027883
2017-07-31 33 33.07173 -0.0717314
2017-08-07 33 32.20581 0.7941911
2017-08-14 32 32.48775 -0.4877525
2017-08-21 30 32.18068 -2.1806757
2017-08-28 27 31.94176 -4.9417578
2017-09-04 32 30.18704 1.8129565
2017-09-11 32 30.51578 1.4842172
2017-09-18 36 30.64140 5.3585999
#print(pred_test,n=21)
#sd(test_residuals)
# Calculating forecast error
test_residuals <- pred_test$.resid

timetk::tk_tbl(test_residuals)%>%head()%>%knitr::kable()
## Warning in tk_tbl.data.frame(as.data.frame(data), preserve_index,
## rename_index, : Warning: No index to preserve. Object otherwise converted
## to tibble successfully.
data
3.1569632
0.8108056
2.5170742
6.3627782
2.8329609
-3.3788875
pred_test%>% dplyr::rename(Predicted=yhat)%>%knitr::kable()
Date Actual Predicted .resid
2017-03-27 29 25.84304 3.1569632
2017-04-03 30 29.18919 0.8108056
2017-04-10 32 29.48293 2.5170742
2017-04-17 36 29.63722 6.3627782
2017-04-24 32 29.16704 2.8329609
2017-05-01 27 30.37889 -3.3788875
2017-05-08 32 30.61470 1.3853030
2017-05-15 28 30.82938 -2.8293823
2017-05-22 28 30.34374 -2.3437352
2017-05-29 30 29.97607 0.0239255
2017-06-05 34 31.41751 2.5824942
2017-06-12 32 31.65067 0.3493289
2017-06-19 28 31.57561 -3.5756094
2017-06-26 34 31.26805 2.7319514
2017-07-03 32 33.61198 -1.6119768
2017-07-10 35 33.89416 1.1058382
2017-07-17 32 34.05252 -2.0525211
2017-07-24 30 33.60279 -3.6027883
2017-07-31 33 33.07173 -0.0717314
2017-08-07 33 32.20581 0.7941911
2017-08-14 32 32.48775 -0.4877525
2017-08-21 30 32.18068 -2.1806757
2017-08-28 27 31.94176 -4.9417578
2017-09-04 32 30.18704 1.8129565
2017-09-11 32 30.51578 1.4842172
2017-09-18 36 30.64140 5.3585999

Visualize the results using ggplot().

ggplot(aes(x = Date), data = modeldat) +
   geom_rect(xmin = as.numeric(ymd("2017-03-27")),
              xmax = as.numeric(ymd("2017-10-21")),
              ymin = 5, ymax = 50,
              fill = palette_light()[[11]], alpha = 0.01) +
  annotate("text", x = ymd("2016-03-17"), y = 45,
             color = palette_light()[[1]], label = "Train Region") +
   annotate("text", x = ymd("2017-06-01"), y = 45,
             color = palette_light()[[1]], label = "Test Region") + 
    geom_point(aes(x = Date, y = Actual), data = train, alpha = 0.5, color = palette_light()[[1]]) +
      geom_point(aes(x = Date, y = Actual), data = pred_test, alpha = 0.5, color = palette_light()[[1]]) +
    geom_point(aes(x = Date, y = yhat), data = pred_test, alpha = 0.9, color = palette_light()[[6]]) +
  
    theme_bw()

Test Accuracy

The forecast accuracy can be evaluated on the test set using residual diagnostics and forecast accuracy measures.

# Calculating forecast error
test_residuals <- pred_test$.resid

pct_err <- test_residuals/pred_test$Actual * 100 # Percentage error

me   <- mean(test_residuals, na.rm=TRUE)
rmse <- mean(test_residuals^2, na.rm=TRUE)^0.5
mae  <- mean(abs(test_residuals), na.rm=TRUE)
mape <- mean(abs(pct_err), na.rm=TRUE)
mpe  <- mean(pct_err, na.rm=TRUE)
error_tbl <- tibble(me, rmse, mae, mape, mpe)
error_tbl
## # A tibble: 1 x 5
##          me     rmse      mae     mape      mpe
##       <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
## 1 0.2397142 2.805308 2.322546 7.477003 0.205155

Next we can visualize the residuals of the test set. The residuals of the model aren’t perfect, but we can work with it. The residuals show that the model predicts low in April to May and high in June to July.

ggplot(aes(x = Date, y = .resid), data = pred_test) +
    geom_hline(yintercept = 0, color = "red") +
    geom_point(color = palette_light()[[1]], alpha = 0.5) +
    geom_smooth(method = 'gam' ) +
    theme_tq() +
    labs(title = "Test Set: gam() Model Residuals", x = "") 

At this point you might go back to the model and try tweaking features using interactions or polynomial terms, adding other features that may be known in the future (e.g. temperature of day can be forecasted relatively accurately within 7 days), or try a completely different modeling technique with the hope of better predictions on the test set. Once you feel that your model is optimized, move on the final step of forecasting.

Forecasting

Let’s use our model to predict What are the expected future values for the next six months. The first step is to create the date sequence. Let’s use tk_get_timeseries_summary() to review the summary of the dates from the original dataset, “bikes”.

# Extract bikes index
idx <- modeldat %>%
    tk_index()

# Get time series summary from index
modeldat_summary <- idx %>%
    tk_get_timeseries_summary()
modeldat_summary
## # A tibble: 1 x 12
##   n.obs      start        end units scale tzone diff.minimum diff.q1
##   <int>     <date>     <date> <chr> <chr> <chr>        <dbl>   <dbl>
## 1    86 2016-01-11 2017-09-18  days  week   UTC       604800  604800
## # ... with 4 more variables: diff.median <dbl>, diff.mean <dbl>,
## #   diff.q3 <dbl>, diff.maximum <dbl>

The first six parameters are general summary information.

modeldat_summary[1:6]
## # A tibble: 1 x 6
##   n.obs      start        end units scale tzone
##   <int>     <date>     <date> <chr> <chr> <chr>
## 1    86 2016-01-11 2017-09-18  days  week   UTC

The second six parameters are the periodicity information.

modeldat_summary[7:12]
## # A tibble: 1 x 6
##   diff.minimum diff.q1 diff.median diff.mean diff.q3 diff.maximum
##          <dbl>   <dbl>       <dbl>     <dbl>   <dbl>        <dbl>
## 1       604800  604800      604800  626145.9  604800      1814400

From the summary, we know that the data is 100% regular because the median and mean differences are 86400 seconds or 1 day. We don’t need to do any special inspections when we use tk_make_future_timeseries(). If the data was irregular, meaning weekends or holidays were excluded, you’d want to account for this. Otherwise your forecast would be inaccurate.

idx_future <- idx %>%
    tk_make_future_timeseries(n_future = 12)


idx_future%>%knitr::kable() 
## Warning in kable_markdown(x = structure(c("2017-09-25", "2017-10-02",
## "2017-10-09", : The table should have a header (column names)
2017-09-25
2017-10-02
2017-10-09
2017-10-16
2017-10-23
2017-10-30
2017-11-06
2017-11-13
2017-11-20
2017-11-27
2017-12-04
2017-12-11

To make the prediction, we need to use the future index to get the time series signature (tk_get_timeseries_signature()). Make sure to rename the column “index” to “date” so it matches the column names of the original data.

data_future <- idx_future %>%
    tk_get_timeseries_signature() 

data_future=data_future%>%dplyr::rename(Date=index)%>%timetk::tk_tbl()
## Warning in tk_tbl.data.frame(.): Warning: No index to preserve. Object
## otherwise converted to tibble successfully.
data_future%>%knitr::kable()
Date index.num diff year year.iso half quarter month month.xts month.lbl day hour minute second hour12 am.pm wday wday.xts wday.lbl mday qday yday mweek week week.iso week2 week3 week4 mday7
2017-09-25 1506297600 NA 2017 2017 2 3 9 8 September 25 0 0 0 0 1 2 1 Monday 25 87 268 5 39 39 1 0 3 4
2017-10-02 1506902400 604800 2017 2017 2 4 10 9 October 2 0 0 0 0 1 2 1 Monday 2 2 275 1 40 40 0 1 0 1
2017-10-09 1507507200 604800 2017 2017 2 4 10 9 October 9 0 0 0 0 1 2 1 Monday 9 9 282 2 41 41 1 2 1 2
2017-10-16 1508112000 604800 2017 2017 2 4 10 9 October 16 0 0 0 0 1 2 1 Monday 16 16 289 3 42 42 0 0 2 3
2017-10-23 1508716800 604800 2017 2017 2 4 10 9 October 23 0 0 0 0 1 2 1 Monday 23 23 296 4 43 43 1 1 3 4
2017-10-30 1509321600 604800 2017 2017 2 4 10 9 October 30 0 0 0 0 1 2 1 Monday 30 30 303 5 44 44 0 2 0 5
2017-11-06 1509926400 604800 2017 2017 2 4 11 10 November 6 0 0 0 0 1 2 1 Monday 6 37 310 2 45 45 1 0 1 1
2017-11-13 1510531200 604800 2017 2017 2 4 11 10 November 13 0 0 0 0 1 2 1 Monday 13 44 317 3 46 46 0 1 2 2
2017-11-20 1511136000 604800 2017 2017 2 4 11 10 November 20 0 0 0 0 1 2 1 Monday 20 51 324 4 47 47 1 2 3 3
2017-11-27 1511740800 604800 2017 2017 2 4 11 10 November 27 0 0 0 0 1 2 1 Monday 27 58 331 5 48 48 0 0 0 4
2017-12-04 1512345600 604800 2017 2017 2 4 12 11 December 4 0 0 0 0 1 2 1 Monday 4 65 338 2 49 49 1 1 1 1
2017-12-11 1512950400 604800 2017 2017 2 4 12 11 December 11 0 0 0 0 1 2 1 Monday 11 72 345 3 50 50 0 2 2 2

Make the prediction.

pred_future <- predict(fit_lm1, newdata = data_future)
pred_future
##        1        2        3        4        5        6        7        8 
## 30.32402 34.12196 34.46041 34.73055 34.29086 33.91452 32.77320 33.08290 
##        9       10       11       12 
## 32.94137 32.73489 31.06453 31.39505

Build the future data frame.

#tidyverse_conflicts()
man_future<- data_future %>%
   dplyr:: select(Date) %>%
   tibble::add_column(Actual = pred_future)%>%timetk::tk_tbl()
## Warning in tk_tbl.data.frame(.): Warning: No index to preserve. Object
## otherwise converted to tibble successfully.
man_future%>%knitr::kable()
Date Actual
2017-09-25 30.32402
2017-10-02 34.12196
2017-10-09 34.46041
2017-10-16 34.73055
2017-10-23 34.29086
2017-10-30 33.91452
2017-11-06 32.77320
2017-11-13 33.08290
2017-11-20 32.94137
2017-11-27 32.73489
2017-12-04 31.06453
2017-12-11 31.39505
man_future%>%dplyr::rename(Forecast=Actual)%>%knitr::kable()
Date Forecast
2017-09-25 30.32402
2017-10-02 34.12196
2017-10-09 34.46041
2017-10-16 34.73055
2017-10-23 34.29086
2017-10-30 33.91452
2017-11-06 32.77320
2017-11-13 33.08290
2017-11-20 32.94137
2017-11-27 32.73489
2017-12-04 31.06453
2017-12-11 31.39505

Visualize the forecast.

modeldat%>%ggplot(aes(x = Date,y=Actual)) + 
   geom_rect(xmin = as.numeric(ymd("2017-03-27")),
              xmax = as.numeric(ymd("2017-08-14")),
              ymin = 20, ymax = 50,
              fill = palette_light()[[3]], alpha = 0.01) +
   geom_rect(xmin = as.numeric(ymd("2017-08-14")),
              xmax = as.numeric(ymd("2017-12-31")),
              ymin = 20, ymax = 50,
              fill = "#64A8D3", alpha = 0.01)+
  annotate("text", x = ymd("2016-03-17"), y = 45,
             color = palette_light()[[1]], label = "Train Region") +
   annotate("text", x = ymd("2017-06-01"), y = 45,
             color = palette_light()[[1]], label = "Test Reg") + 
   annotate("text", x = ymd("2017-10-01"), y = 45,
             color = palette_light()[[1]], label = "Forecast ")+
  geom_point(alpha = 0.5, color = palette_light()[[1]])+
  geom_point(aes(x = Date, y = Actual), data = man_future,
               alpha = 0.5, color = palette_light()[[2]])+
    geom_smooth(aes(x = Date, y = Actual), data = man_future,
                method = 'loess')

Forecast Error

A forecast is never perfect. We need prediction intervals to account for the variance from the model predictions to the actual data. There’s a number of methods to achieve this. We’ll follow the prediction interval methodology from Forecasting: Principles and Practice.

# Calculate standard deviation of residuals
test_resid_sd <- sd(test_residuals)

man_future <- man_future %>%
    mutate(
        lo.95 = Actual - 1.96 * test_resid_sd,
        lo.80 = Actual - 1.28 * test_resid_sd,
        hi.80 = Actual + 1.28 * test_resid_sd,
        hi.95 = Actual + 1.96 * test_resid_sd
        )
man_future%>%head()%>%knitr::kable()
Date Actual lo.95 lo.80 hi.80 hi.95
2017-09-25 30.32402 24.73723 26.67550 33.97253 35.91080
2017-10-02 34.12196 28.53518 30.47345 37.77048 39.70875
2017-10-09 34.46041 28.87362 30.81189 38.10892 40.04719
2017-10-16 34.73055 29.14376 31.08204 38.37906 40.31733
2017-10-23 34.29086 28.70407 30.64234 37.93937 39.87764
2017-10-30 33.91452 28.32774 30.26601 37.56304 39.50131

Now, plotting the forecast with the prediction intervals.

modeldat %>%
    ggplot(aes(x = Date, y = Actual)) +
    geom_point(alpha = 0.5, color = palette_light()[[1]]) +
    geom_ribbon(aes(ymin = lo.95, ymax = hi.95), data = man_future, 
                fill = "#D5DBFF", color = NA, size = 0) +
    geom_ribbon(aes(ymin = lo.80, ymax = hi.80, fill = key), data = man_future,
                fill = "#596DD5", color = NA, size = 0, alpha = 0.8) +
    geom_point(aes(x = Date, y = Actual), data = man_future,
               alpha = 0.5, color = palette_light()[[2]]) +
    geom_smooth(aes(x = Date, y = Actual), data = man_future,
                method = 'loess', color = "white") + 
    labs(title = " 3-Month Forecast of Drivers needed  with Prediction Intervals", x = "",subtitle="80% and 95% confidence bands") +
    theme_tq()

# Calculate standard deviation of residuals
test_resid_sd <- sd(test_residuals)

man_future <- man_future %>%
    mutate(
        lo.95 = Actual - 1.96 * test_resid_sd,
        lo.99 = Actual - 2.58 * test_resid_sd,
        hi.99 = Actual + 2.58 * test_resid_sd,
        hi.95 = Actual + 1.96 * test_resid_sd
        )
modeldat %>%
    ggplot(aes(x = Date, y = Actual)) +
    geom_point(alpha = 0.5, color = palette_light()[[1]]) +
    geom_ribbon(aes(ymin = lo.95, ymax = hi.95), data = man_future, 
                fill = "#D5DBFF", color = NA, size = 0) +
    geom_ribbon(aes(ymin = lo.80, ymax = hi.80, fill = key), data = man_future,
                fill = "#596DD5", color = NA, size = 0, alpha = 0.8) +
    geom_point(aes(x = Date, y = Actual), data = man_future,
               alpha = 0.5, color = palette_light()[[2]]) +
    geom_smooth(aes(x = Date, y = Actual), data = man_future,
                method = 'loess', color = "white") + 
    labs(title = " 3-Month Forecast of Drivers needed  with Prediction Intervals", x = "",subtitle="90% and 95% confidence bands") +
    theme_tq()

sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] splines   stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggfortify_0.4.1               RColorBrewer_1.1-2           
##  [3] bindrcpp_0.2                  gam_1.14-4                   
##  [5] foreach_1.4.3                 broom_0.4.2                  
##  [7] readxl_1.0.0.9000             scales_0.5.0.9000            
##  [9] sweep_0.2.0                   DT_0.2                       
## [11] tseries_0.10-42               stringr_1.2.0                
## [13] plotly_4.7.1                  timetk_0.1.0                 
## [15] modelr_0.1.1                  timekit_0.3.0                
## [17] tidyquant_0.5.2               quantmod_0.4-10              
## [19] TTR_0.23-2                    PerformanceAnalytics_1.4.3541
## [21] xts_0.10-0                    zoo_1.8-0                    
## [23] lubridate_1.6.0               dplyr_0.7.2                  
## [25] purrr_0.2.3                   readr_1.1.1                  
## [27] tidyr_0.7.1                   tibble_1.3.4                 
## [29] ggplot2_2.2.1.9000            tidyverse_1.1.1              
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.3.1        jsonlite_1.5      viridisLite_0.2.0
##  [4] assertthat_0.2.0  highr_0.6         cellranger_1.1.0 
##  [7] yaml_2.1.14       backports_1.1.0   lattice_0.20-35  
## [10] glue_1.1.1        quadprog_1.5-5    digest_0.6.12    
## [13] rvest_0.3.2       colorspace_1.3-2  Matrix_1.2-10    
## [16] htmltools_0.3.6   plyr_1.8.4        psych_1.7.5      
## [19] pkgconfig_2.0.1   haven_1.1.0       mgcv_1.8-17      
## [22] lazyeval_0.2.0    mnormt_1.5-5      magrittr_1.5     
## [25] evaluate_0.10.1   nlme_3.1-131      forcats_0.2.0    
## [28] xml2_1.1.1        foreign_0.8-69    tools_3.4.1      
## [31] data.table_1.10.4 hms_0.3           munsell_0.4.3    
## [34] compiler_3.4.1    rlang_0.1.2       grid_3.4.1       
## [37] iterators_1.0.8   htmlwidgets_0.9   labeling_0.3     
## [40] rmarkdown_1.6     gtable_0.2.0      codetools_0.2-15 
## [43] DBI_0.7           curl_2.8.1        reshape2_1.4.2   
## [46] R6_2.2.2          gridExtra_2.2.1   knitr_1.17       
## [49] bindr_0.1         rprojroot_1.2     Quandl_2.8.0     
## [52] stringi_1.1.5     parallel_3.4.1    Rcpp_0.12.12