Install any packages that are missing and load the libraries used throughout this analysis:
list=c("tidyverse","tidyquant","timekit","modelr","lubridate","timetk","plotly","stringr","tseries","DT","sweep","scales","readxl","broom","tibble","gam")
list_packages <- list
new.packages <- list_packages[!(list_packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
R<-suppressWarnings(suppressMessages(sapply(list, library, character.only = TRUE)))
#options(na.action = na.warn)
modeldat=read_excel("/Users/nanaakwasiabayieboateng/Documents/memphisclassesbooks/Time series/Manpower/actvsfcdatajuly311.xlsx",sheet = "Sheet1")%>%na.omit()%>%dplyr::rename(Date=Week)
modeldat%>%tail()%>%knitr::kable()
| Date | Actual |
|---|---|
| 2017-08-14 | 32 |
| 2017-08-21 | 30 |
| 2017-08-28 | 27 |
| 2017-09-04 | 32 |
| 2017-09-11 | 32 |
| 2017-09-18 | 36 |
modeldat[61,]
## # A tibble: 1 x 2
## Date Actual
## <dttm> <dbl>
## 1 2017-03-27 29
dim(modeldat)
## [1] 86 2
glimpse(modeldat)
## Observations: 86
## Variables: 2
## $ Date <dttm> 2016-01-11, 2016-01-18, 2016-01-25, 2016-02-01, 2016-0...
## $ Actual <dbl> 26, 27, 28, 22, 27, 31, 26, 27, 21, 23, 25, 23, 23, 23,...
modeldat%>%mutate(Date=as.Date(Date))%>%summary()%>%knitr::kable()
| Date | Actual |
|---|---|
| Min. :2016-01-11 | Min. :21.00 |
| 1st Qu.:2016-06-07 | 1st Qu.:27.00 |
| Median :2016-11-03 | Median :30.00 |
| Mean :2016-11-13 | Mean :29.57 |
| 3rd Qu.:2017-04-22 | 3rd Qu.:32.00 |
| Max. :2017-09-18 | Max. :40.00 |
Split the data into train and test sets at “2017-03-27”.
# Split into training (first 60 observations) and test (last 26 observations) sets
modeldat <- modeldat %>% mutate(Date = as.Date(Date))

train <- modeldat %>%
  dplyr::filter(Date < ymd("2017-03-27"))

test <- modeldat %>%
  dplyr::filter(Date >= ymd("2017-03-27"))
Start with the training set, which has the “Date” and “Actual” columns.
# Training set
train%>%head()%>%knitr::kable()
| Date | Actual |
|---|---|
| 2016-01-11 | 26 |
| 2016-01-18 | 27 |
| 2016-01-25 | 28 |
| 2016-02-01 | 22 |
| 2016-02-08 | 27 |
| 2016-02-15 | 31 |
test%>%tail()%>%knitr::kable()
| Date | Actual |
|---|---|
| 2017-08-14 | 32 |
| 2017-08-21 | 30 |
| 2017-08-28 | 27 |
| 2017-09-04 | 32 |
| 2017-09-11 | 32 |
| 2017-09-18 | 36 |
A visualization helps to show how we plan to tackle the forecasting problem: the data is split into two regions, a training region and a testing region.
modeldat%>%ggplot(aes(Date,Actual))+geom_line()+
geom_point(alpha = 0.5, color = palette_light()[[1]], shape=20,size=2) +
labs(title = "Drivers Forecast ", x = "Date") +
theme_tq()
modeldat%>%
ggplot(aes(x = Date, y = Actual)) +
geom_rect(xmin = as.numeric(ymd("2017-03-27")),
xmax = as.numeric(ymd("2017-10-21")),
ymin = 20, ymax = 50,
fill = palette_light()[[3]], alpha = 0.01) +
ggplot2::annotate("text", x = ymd("2016-03-17"), y = 45,
color = palette_light()[[1]], label = "Train Region") +
ggplot2::annotate("text", x = ymd("2017-06-01"), y = 45,
color = palette_light()[[1]], label = "Test Region") +
geom_point(alpha = 0.5, color = palette_light()[[1]], shape=20,size=2) +
labs(title = "Bikes Sharing Dataset: Daily Scale", x = "") +
theme_tq()
The first step is to add the time series signature to the training set, which the model will use to learn the patterns. The most efficient method is tk_augment_timeseries_signature(), which appends the signature fields we need as additional columns.
# Add time series signature
train_augmented <- train %>%
tk_augment_timeseries_signature()
train_augmented%>%head()%>%knitr::kable( format = "markdown", caption = "Training Data")
| Date | Actual | index.num | diff | year | year.iso | half | quarter | month | month.xts | month.lbl | day | hour | minute | second | hour12 | am.pm | wday | wday.xts | wday.lbl | mday | qday | yday | mweek | week | week.iso | week2 | week3 | week4 | mday7 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2016-01-11 | 26 | 1452470400 | NA | 2016 | 2016 | 1 | 1 | 1 | 0 | January | 11 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 11 | 11 | 11 | 3 | 2 | 2 | 0 | 2 | 2 | 2 |
| 2016-01-18 | 27 | 1453075200 | 604800 | 2016 | 2016 | 1 | 1 | 1 | 0 | January | 18 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 18 | 18 | 18 | 4 | 3 | 3 | 1 | 0 | 3 | 3 |
| 2016-01-25 | 28 | 1453680000 | 604800 | 2016 | 2016 | 1 | 1 | 1 | 0 | January | 25 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 25 | 25 | 25 | 5 | 4 | 4 | 0 | 1 | 0 | 4 |
| 2016-02-01 | 22 | 1454284800 | 604800 | 2016 | 2016 | 1 | 1 | 2 | 1 | February | 1 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 1 | 32 | 32 | 6 | 5 | 5 | 1 | 2 | 1 | 1 |
| 2016-02-08 | 27 | 1454889600 | 604800 | 2016 | 2016 | 1 | 1 | 2 | 1 | February | 8 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 8 | 39 | 39 | 2 | 6 | 6 | 0 | 0 | 2 | 2 |
| 2016-02-15 | 31 | 1455494400 | 604800 | 2016 | 2016 | 1 | 1 | 2 | 1 | February | 15 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 15 | 46 | 46 | 3 | 7 | 7 | 1 | 1 | 3 | 3 |
dat_augmented <- modeldat %>%
tk_augment_timeseries_signature()
Now that we have a number of fields that can be used for training, we can use them for modeling. In practice, you would want to pre-process the data first: centering and scaling if necessary, creating dummy variables, removing correlated variables, examining interactions, and so on. For brevity, we do not do this here; a minimal sketch of one such step is shown below.
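For illustration only, here is a hedged sketch of a light pre-processing pass using dplyr (the column choices are assumptions and the result is not used by the model that follows): keep the numeric signature columns, drop the diff column (its first value is NA) and any constant columns, then centre and scale the remaining predictors.

# illustrative pre-processing sketch (not used by the model below)
train_numeric <- train_augmented %>%
  dplyr::select_if(is.numeric) %>%                          # keep numeric signature fields
  dplyr::select(-diff) %>%                                  # diff has an NA in the first row
  dplyr::select_if(function(x) dplyr::n_distinct(x) > 1)    # drop constant columns

train_scaled <- train_numeric %>%
  dplyr::mutate_at(dplyr::vars(-Actual), function(x) as.numeric(scale(x)))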
# fit an additive model with smooth terms for Date, quarter, month and day
fit_lm1 <- gam::gam(Actual ~ s(Date) + s(quarter) + s(month) + s(day), data = dat_augmented)
# a plain linear alternative:
# fit_lm1 <- lm(Actual ~ 0 + Date + quarter + month + day, data = dat_augmented)
summary(fit_lm1)
##
## Call: gam::gam(formula = Actual ~ s(Date) + s(quarter) + s(month) +
## s(day), data = dat_augmented)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.4968 -2.0114 -0.0306 1.7403 6.3624
##
## (Dispersion Parameter for gaussian family taken to be 10.2734)
##
## Null Deviance: 1575.081 on 85 degrees of freedom
## Residual Deviance: 719.1369 on 69.9998 degrees of freedom
## AIC: 460.6963
##
## Number of Local Scoring Iterations: 3
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(Date) 1 57.73 57.73 5.6194 0.02052 *
## s(quarter) 1 344.69 344.69 33.5521 1.814e-07 ***
## s(month) 1 7.94 7.94 0.7730 0.38230
## s(day) 1 1.85 1.85 0.1804 0.67232
## Residuals 70 719.14 10.27
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(Date) 3 2.2142 0.09407 .
## s(quarter) 2 2.2966 0.10813
## s(month) 3 12.5093 1.22e-06 ***
## s(day) 3 0.3194 0.81132
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# drop the first row (the intercept); it contains only NA values
smry_additvemod=as.list.data.frame(summary(fit_lm1))
smry_additvemod$anova%>%timetk::tk_tbl()%>%slice(-1)%>%knitr::kable()
| index | Npar Df | Npar F | Pr(F) |
|---|---|---|---|
| s(Date) | 3 | 2.214190 | 0.0940729 |
| s(quarter) | 2 | 2.296616 | 0.1081313 |
| s(month) | 3 | 12.509343 | 0.0000012 |
| s(day) | 3 | 0.319370 | 0.8113225 |
# keep the full ANOVA table, including the intercept row with its NAs
smry_additvemod_anova=smry_additvemod$anova%>%timetk::tk_tbl()
smry_additvemod_anova%>%knitr::kable()
| index | Npar Df | Npar F | Pr(F) |
|---|---|---|---|
| (Intercept) | NA | NA | NA |
| s(Date) | 3 | 2.214190 | 0.0940729 |
| s(quarter) | 2 | 2.296616 | 0.1081313 |
| s(month) | 3 | 12.509343 | 0.0000012 |
| s(day) | 3 | 0.319370 | 0.8113225 |
# replace white spaces in column names with _
names(smry_additvemod_anova)<-gsub("\\s", "_", names(smry_additvemod_anova))
smry_additvemod_anova%>%knitr::kable()
| index | Npar_Df | Npar_F | Pr(F) |
|---|---|---|---|
| (Intercept) | NA | NA | NA |
| s(Date) | 3 | 2.214190 | 0.0940729 |
| s(quarter) | 2 | 2.296616 | 0.1081313 |
| s(month) | 3 | 12.509343 | 0.0000012 |
| s(day) | 3 | 0.319370 | 0.8113225 |
# alternatively, drop the rows with missing values
smry_additvemod_anova %>% na.omit()%>%knitr::kable()
| index | Npar_Df | Npar_F | Pr(F) |
|---|---|---|---|
| s(Date) | 3 | 2.214190 | 0.0940729 |
| s(quarter) | 2 | 2.296616 | 0.1081313 |
| s(month) | 3 | 12.509343 | 0.0000012 |
| s(day) | 3 | 0.319370 | 0.8113225 |
smry_additvemod_panova=smry_additvemod$parametric.anova%>%timetk::tk_tbl()
# replace white spaces in column names with _
names(smry_additvemod_panova)<-gsub("\\s", "_", names(smry_additvemod_panova))
#%>% replace_na(list(F_value = "-"))
smry_additvemod_panova%>%replace_na(list(F_value = "-"))
## # A tibble: 5 x 6
## index Df Sum_Sq Mean_Sq F_value `Pr(>F)`
## <chr> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 s(Date) 1.00000 57.730544 57.730544 5.61941467508312 2.052095e-02
## 2 s(quarter) 1.00000 344.694531 344.694531 33.5521092940803 1.813960e-07
## 3 s(month) 1.00000 7.941417 7.941417 0.773006974355052 3.822967e-01
## 4 s(day) 1.00000 1.853444 1.853444 0.180411748026327 6.723224e-01
## 5 Residuals 69.99983 719.136881 10.273409 - NA
tidy(fit_lm1)
## term df sumsq meansq statistic p.value
## 1 s(Date) 1.00000 57.730544 57.730544 5.6194147 2.052095e-02
## 2 s(quarter) 1.00000 344.694531 344.694531 33.5521093 1.813960e-07
## 3 s(month) 1.00000 7.941417 7.941417 0.7730070 3.822967e-01
## 4 s(day) 1.00000 1.853444 1.853444 0.1804117 6.723224e-01
## 5 Residuals 69.99983 719.136881 10.273409 NA NA
tidy(fit_lm1)["p.value"]%>%knitr::kable()
| p.value |
|---|
| 0.0205210 |
| 0.0000002 |
| 0.3822967 |
| 0.6723224 |
| NA |
# add fitted values and residuals to the augmented data
aug=broom::augment_columns(fit_lm1,dat_augmented)
aug%>%head()%>%knitr::kable()
| Date | Actual | index.num | diff | year | year.iso | half | quarter | month | month.xts | month.lbl | day | hour | minute | second | hour12 | am.pm | wday | wday.xts | wday.lbl | mday | qday | yday | mweek | week | week.iso | week2 | week3 | week4 | mday7 | .fitted | .se.fit | .resid | .hat | .sigma | .cooksd | .std.resid |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2016-01-11 | 26 | 1452470400 | NA | 2016 | 2016 | 1 | 1 | 1 | 0 | January | 11 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 11 | 11 | 11 | 3 | 2 | 2 | 0 | 2 | 2 | 2 | 25.77137 | 1.724674 | 0.2286334 | 0.0732964 | 2.998084 | 0.0000869 | 0.0740989 |
| 2016-01-18 | 27 | 1453075200 | 604800 | 2016 | 2016 | 1 | 1 | 1 | 0 | January | 18 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 18 | 18 | 18 | 4 | 3 | 3 | 1 | 0 | 3 | 3 | 25.81127 | 1.644834 | 1.1887285 | 0.0684920 | 2.995037 | 0.0021714 | 0.3842661 |
| 2016-01-25 | 28 | 1453680000 | 604800 | 2016 | 2016 | 1 | 1 | 1 | 0 | January | 25 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 25 | 25 | 25 | 5 | 4 | 4 | 0 | 1 | 0 | 4 | 25.40803 | 1.583149 | 2.5919720 | 0.0785889 | 2.982963 | 0.0121068 | 0.8424542 |
| 2016-02-01 | 22 | 1454284800 | 604800 | 2016 | 2016 | 1 | 1 | 2 | 1 | February | 1 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 1 | 32 | 32 | 6 | 5 | 5 | 1 | 2 | 1 | 1 | 25.04003 | 1.673748 | -3.0400257 | 0.0821441 | 2.977138 | 0.0175427 | -0.9899944 |
| 2016-02-08 | 27 | 1454889600 | 604800 | 2016 | 2016 | 1 | 1 | 2 | 1 | February | 8 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 8 | 39 | 39 | 2 | 6 | 6 | 0 | 0 | 2 | 2 | 25.27167 | 1.269673 | 1.7283311 | 0.0569804 | 2.991591 | 0.0037261 | 0.5552765 |
| 2016-02-15 | 31 | 1455494400 | 604800 | 2016 | 2016 | 1 | 1 | 2 | 1 | February | 15 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 15 | 46 | 46 | 3 | 7 | 7 | 1 | 1 | 3 | 3 | 25.48693 | 1.217385 | 5.5130731 | 0.0467180 | 2.930984 | 0.0304190 | 1.7616749 |
The model coefficients are given below:
fit_lm1$coefficients
## (Intercept) s(Date) s(quarter) s(month) s(day)
## -5.959531079 0.001801862 3.206040745 -0.392055110 -0.016794206
library(RColorBrewer)
colors <- brewer.pal(8, "Dark2")
# To manually add a legend, include color = "name" inside aes() for each geom
cols <- c("LINE1" = "#f04546", "LINE2" = "#3591d1", "BAR" = "#62c76b")
aug %>%
  ggplot(aes(x = Date, y = Actual)) +
  geom_line(aes(x = Date, y = .fitted, color = "Fitted"), show.legend = TRUE) +
  geom_line(aes(x = Date, y = Actual, color = "Observed"), data = dat_augmented, show.legend = TRUE) +
  labs(title = "Predicted Values vs Observed Values", x = "Time") +
  theme_bw() +
  # define a custom background theme
  # theme(panel.background = element_rect(fill = 'grey75')) +
  scale_colour_manual(name = "GAM\n Method", values = c("Fitted" = "red", "Observed" = "black")) +
  # make the title bold and add a little space at the baseline (face, vjust)
  theme(plot.title = element_text(size = 14, face = "bold", vjust = 2)) +
  # place the legend at the top
  theme(legend.position = "top") +
  # change the color of the legend title
  theme(legend.title = element_text(colour = "chocolate", size = 14, face = "bold"))
We can examine the model residuals, which were added with augment_columns() from the broom package, to see if any significant pattern remains. A quick Ljung-Box check is included below.
library(ggfortify)
# ggplot2::autoplot(fit_lm1, which = 1:6, nrow=2, label.size = F,na.rm=TRUE)+ theme_tq()
unlist(tidy(fit_lm1)["p.value"])
## p.value1 p.value2 p.value3 p.value4 p.value5
## 2.052095e-02 1.813960e-07 3.822967e-01 6.723224e-01 NA
as.integer(unlist(tidy(fit_lm1)["p.value"]))
## [1] 0 0 0 0 NA
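As a quick, hedged check for autocorrelation left in the residuals (an addition, not part of the original workflow), a Ljung-Box test with stats::Box.test can be applied; the lag of 10 is an arbitrary choice.

# Ljung-Box test on the model residuals; a small p-value would indicate
# remaining autocorrelation the model has not captured
Box.test(aug$.resid, lag = 10, type = "Ljung-Box")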
Visualize the residuals of the training set.
aug %>%
ggplot(aes(x =Date, y = .resid)) +
geom_point(color = palette_light()[[1]], alpha = 0.5)+
geom_hline(yintercept = 0, color = "red") +
theme_bw() +
labs(title = "Training Set: Additive Model Residuals (gam)", x = "")
We can also get a quick idea of the overall error of the model on the training set. Note that what we really care about is the error on the test set, as this is a much better predictor of future model performance.
# RMSE
sqrt(mean(fit_lm1$residuals^2))
## [1] 2.891722
With a suitable model (low residual error and random residuals) we can forecast using the “test” set for validation purposes.
test%>%head()%>%knitr::kable()
| Date | Actual |
|---|---|
| 2017-03-27 | 29 |
| 2017-04-03 | 30 |
| 2017-04-10 | 32 |
| 2017-04-17 | 36 |
| 2017-04-24 | 32 |
| 2017-05-01 | 27 |
We again need to add the time series signature, this time to the test set.
test_augmented <- test %>%
tk_augment_timeseries_signature()
test_augmented%>%head()%>%knitr::kable()
| Date | Actual | index.num | diff | year | year.iso | half | quarter | month | month.xts | month.lbl | day | hour | minute | second | hour12 | am.pm | wday | wday.xts | wday.lbl | mday | qday | yday | mweek | week | week.iso | week2 | week3 | week4 | mday7 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2017-03-27 | 29 | 1490572800 | NA | 2017 | 2017 | 1 | 1 | 3 | 2 | March | 27 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 27 | 86 | 86 | 5 | 13 | 13 | 1 | 1 | 1 | 4 |
| 2017-04-03 | 30 | 1491177600 | 604800 | 2017 | 2017 | 1 | 2 | 4 | 3 | April | 3 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 3 | 3 | 93 | 2 | 14 | 14 | 0 | 2 | 2 | 1 |
| 2017-04-10 | 32 | 1491782400 | 604800 | 2017 | 2017 | 1 | 2 | 4 | 3 | April | 10 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 10 | 10 | 100 | 3 | 15 | 15 | 1 | 0 | 3 | 2 |
| 2017-04-17 | 36 | 1492387200 | 604800 | 2017 | 2017 | 1 | 2 | 4 | 3 | April | 17 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 17 | 17 | 107 | 4 | 16 | 16 | 0 | 1 | 0 | 3 |
| 2017-04-24 | 32 | 1492992000 | 604800 | 2017 | 2017 | 1 | 2 | 4 | 3 | April | 24 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 24 | 24 | 114 | 5 | 17 | 17 | 1 | 2 | 1 | 4 |
| 2017-05-01 | 27 | 1493596800 | 604800 | 2017 | 2017 | 1 | 2 | 5 | 4 | May | 1 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 1 | 31 | 121 | 6 | 18 | 18 | 0 | 0 | 2 | 1 |
Next, use predict() to apply the model to the test set.
yhat_test <- predict(fit_lm1, newdata = test_augmented)
yhat_test%>%knitr::kable()
| 25.84304 |
| 29.18919 |
| 29.48293 |
| 29.63722 |
| 29.16704 |
| 30.37889 |
| 30.61470 |
| 30.82938 |
| 30.34374 |
| 29.97607 |
| 31.41751 |
| 31.65067 |
| 31.57561 |
| 31.26805 |
| 33.61198 |
| 33.89416 |
| 34.05252 |
| 33.60279 |
| 33.07173 |
| 32.20581 |
| 32.48775 |
| 32.18068 |
| 31.94176 |
| 30.18704 |
| 30.51578 |
| 30.64140 |
Add the predictions to the test set for comparison (add_column() works for a numeric vector). We can then compute the residuals with mutate(), which performs calculations between columns of a data frame.
options(tibble.print_max = 21)  # print up to 21 rows of a tibble
pred_test <- test %>%
add_column(yhat = yhat_test) %>%
mutate(.resid = Actual - yhat)
pred_test%>%knitr::kable()
| Date | Actual | yhat | .resid |
|---|---|---|---|
| 2017-03-27 | 29 | 25.84304 | 3.1569632 |
| 2017-04-03 | 30 | 29.18919 | 0.8108056 |
| 2017-04-10 | 32 | 29.48293 | 2.5170742 |
| 2017-04-17 | 36 | 29.63722 | 6.3627782 |
| 2017-04-24 | 32 | 29.16704 | 2.8329609 |
| 2017-05-01 | 27 | 30.37889 | -3.3788875 |
| 2017-05-08 | 32 | 30.61470 | 1.3853030 |
| 2017-05-15 | 28 | 30.82938 | -2.8293823 |
| 2017-05-22 | 28 | 30.34374 | -2.3437352 |
| 2017-05-29 | 30 | 29.97607 | 0.0239255 |
| 2017-06-05 | 34 | 31.41751 | 2.5824942 |
| 2017-06-12 | 32 | 31.65067 | 0.3493289 |
| 2017-06-19 | 28 | 31.57561 | -3.5756094 |
| 2017-06-26 | 34 | 31.26805 | 2.7319514 |
| 2017-07-03 | 32 | 33.61198 | -1.6119768 |
| 2017-07-10 | 35 | 33.89416 | 1.1058382 |
| 2017-07-17 | 32 | 34.05252 | -2.0525211 |
| 2017-07-24 | 30 | 33.60279 | -3.6027883 |
| 2017-07-31 | 33 | 33.07173 | -0.0717314 |
| 2017-08-07 | 33 | 32.20581 | 0.7941911 |
| 2017-08-14 | 32 | 32.48775 | -0.4877525 |
| 2017-08-21 | 30 | 32.18068 | -2.1806757 |
| 2017-08-28 | 27 | 31.94176 | -4.9417578 |
| 2017-09-04 | 32 | 30.18704 | 1.8129565 |
| 2017-09-11 | 32 | 30.51578 | 1.4842172 |
| 2017-09-18 | 36 | 30.64140 | 5.3585999 |
# Calculating forecast error
test_residuals <- pred_test$.resid
timetk::tk_tbl(test_residuals)%>%head()%>%knitr::kable()
## Warning in tk_tbl.data.frame(as.data.frame(data), preserve_index,
## rename_index, : Warning: No index to preserve. Object otherwise converted
## to tibble successfully.
| data |
|---|
| 3.1569632 |
| 0.8108056 |
| 2.5170742 |
| 6.3627782 |
| 2.8329609 |
| -3.3788875 |
pred_test%>% dplyr::rename(Predicted=yhat)%>%knitr::kable()
| Date | Actual | Predicted | .resid |
|---|---|---|---|
| 2017-03-27 | 29 | 25.84304 | 3.1569632 |
| 2017-04-03 | 30 | 29.18919 | 0.8108056 |
| 2017-04-10 | 32 | 29.48293 | 2.5170742 |
| 2017-04-17 | 36 | 29.63722 | 6.3627782 |
| 2017-04-24 | 32 | 29.16704 | 2.8329609 |
| 2017-05-01 | 27 | 30.37889 | -3.3788875 |
| 2017-05-08 | 32 | 30.61470 | 1.3853030 |
| 2017-05-15 | 28 | 30.82938 | -2.8293823 |
| 2017-05-22 | 28 | 30.34374 | -2.3437352 |
| 2017-05-29 | 30 | 29.97607 | 0.0239255 |
| 2017-06-05 | 34 | 31.41751 | 2.5824942 |
| 2017-06-12 | 32 | 31.65067 | 0.3493289 |
| 2017-06-19 | 28 | 31.57561 | -3.5756094 |
| 2017-06-26 | 34 | 31.26805 | 2.7319514 |
| 2017-07-03 | 32 | 33.61198 | -1.6119768 |
| 2017-07-10 | 35 | 33.89416 | 1.1058382 |
| 2017-07-17 | 32 | 34.05252 | -2.0525211 |
| 2017-07-24 | 30 | 33.60279 | -3.6027883 |
| 2017-07-31 | 33 | 33.07173 | -0.0717314 |
| 2017-08-07 | 33 | 32.20581 | 0.7941911 |
| 2017-08-14 | 32 | 32.48775 | -0.4877525 |
| 2017-08-21 | 30 | 32.18068 | -2.1806757 |
| 2017-08-28 | 27 | 31.94176 | -4.9417578 |
| 2017-09-04 | 32 | 30.18704 | 1.8129565 |
| 2017-09-11 | 32 | 30.51578 | 1.4842172 |
| 2017-09-18 | 36 | 30.64140 | 5.3585999 |
Visualize the results using ggplot().
ggplot(aes(x = Date), data = modeldat) +
geom_rect(xmin = as.numeric(ymd("2017-03-27")),
xmax = as.numeric(ymd("2017-10-21")),
ymin = 5, ymax = 50,
fill = palette_light()[[11]], alpha = 0.01) +
annotate("text", x = ymd("2016-03-17"), y = 45,
color = palette_light()[[1]], label = "Train Region") +
annotate("text", x = ymd("2017-06-01"), y = 45,
color = palette_light()[[1]], label = "Test Region") +
geom_point(aes(x = Date, y = Actual), data = train, alpha = 0.5, color = palette_light()[[1]]) +
geom_point(aes(x = Date, y = Actual), data = pred_test, alpha = 0.5, color = palette_light()[[1]]) +
geom_point(aes(x = Date, y = yhat), data = pred_test, alpha = 0.9, color = palette_light()[[6]]) +
theme_bw()
The forecast accuracy can be evaluated on the test set using residual diagnostics and forecast accuracy measures.
# Calculating forecast error
test_residuals <- pred_test$.resid
pct_err <- test_residuals/pred_test$Actual * 100 # Percentage error
me <- mean(test_residuals, na.rm=TRUE)
rmse <- mean(test_residuals^2, na.rm=TRUE)^0.5
mae <- mean(abs(test_residuals), na.rm=TRUE)
mape <- mean(abs(pct_err), na.rm=TRUE)
mpe <- mean(pct_err, na.rm=TRUE)
error_tbl <- tibble(me, rmse, mae, mape, mpe)
error_tbl
## # A tibble: 1 x 5
## me rmse mae mape mpe
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.2397142 2.805308 2.322546 7.477003 0.205155
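As a cross-check (an assumption, since the forecast package is not loaded above), accuracy() from the forecast package computes the same measures from the forecast and actual vectors:

# cross-check of the hand-computed error measures (ME, RMSE, MAE, MPE, MAPE)
forecast::accuracy(pred_test$yhat, pred_test$Actual)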
Next we can visualize the residuals of the test set. The residuals of the model aren’t perfect, but we can work with them. The residuals show that the model predicts low in April to May and high in June to July.
ggplot(aes(x = Date, y = .resid), data = pred_test) +
geom_hline(yintercept = 0, color = "red") +
geom_point(color = palette_light()[[1]], alpha = 0.5) +
geom_smooth(method = 'gam' ) +
theme_tq() +
labs(title = "Test Set: gam() Model Residuals", x = "")
At this point you might go back to the model and try tweaking features using interactions or polynomial terms, adding other features that may be known in the future (e.g. the temperature of a day can be forecasted relatively accurately within 7 days), or try a completely different modeling technique with the hope of better predictions on the test set; one such variant is sketched below. Once you feel that your model is optimized, move on to the final step of forecasting.
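As an illustration only (this is not the model used in this post), a variant might swap in a smooth of the week of the year and a quadratic day term, and be judged on its test-set RMSE:

# hypothetical alternative specification; compare its test-set RMSE with fit_lm1
fit_lm2 <- gam::gam(Actual ~ s(Date) + s(month) + s(week) + poly(day, 2),
                    data = dat_augmented)
sqrt(mean((test_augmented$Actual - predict(fit_lm2, newdata = test_augmented))^2))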
Let’s use our model to predict the expected future values for the next twelve weeks (roughly three months). The first step is to create the date sequence. Let’s use tk_get_timeseries_summary() to review the summary of the dates from the original dataset, modeldat.
# Extract the modeldat index
idx <- modeldat %>%
tk_index()
# Get time series summary from index
modeldat_summary <- idx %>%
tk_get_timeseries_summary()
modeldat_summary
## # A tibble: 1 x 12
## n.obs start end units scale tzone diff.minimum diff.q1
## <int> <date> <date> <chr> <chr> <chr> <dbl> <dbl>
## 1 86 2016-01-11 2017-09-18 days week UTC 604800 604800
## # ... with 4 more variables: diff.median <dbl>, diff.mean <dbl>,
## # diff.q3 <dbl>, diff.maximum <dbl>
The first six parameters are general summary information.
modeldat_summary[1:6]
## # A tibble: 1 x 6
## n.obs start end units scale tzone
## <int> <date> <date> <chr> <chr> <chr>
## 1 86 2016-01-11 2017-09-18 days week UTC
The second six parameters are the periodicity information.
modeldat_summary[7:12]
## # A tibble: 1 x 6
## diff.minimum diff.q1 diff.median diff.mean diff.q3 diff.maximum
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 604800 604800 604800 626145.9 604800 1814400
From the summary, we can see the data is essentially regular at a weekly frequency: the median difference is 604800 seconds, i.e. one week. The mean difference is slightly larger (626145.9 seconds) because a few weeks are missing, but no special handling is needed when we use tk_make_future_timeseries(). If the data were more irregular, say with weekends or holidays excluded, you would want to account for this, otherwise the forecast would be inaccurate; a sketch of how that could be handled appears after the future index below.
idx_future <- idx %>%
tk_make_future_timeseries(n_future = 12)
idx_future%>%knitr::kable()
## Warning in kable_markdown(x = structure(c("2017-09-25", "2017-10-02",
## "2017-10-09", : The table should have a header (column names)
| 2017-09-25 |
| 2017-10-02 |
| 2017-10-09 |
| 2017-10-16 |
| 2017-10-23 |
| 2017-10-30 |
| 2017-11-06 |
| 2017-11-13 |
| 2017-11-20 |
| 2017-11-27 |
| 2017-12-04 |
| 2017-12-11 |
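If the index had been irregular, a hedged sketch of accounting for it with tk_make_future_timeseries() might look like the following (the skipped date is a made-up placeholder, and the skip_values argument is assumed to be available in the installed timetk version):

# hypothetical example: extend further out and drop a known holiday week
idx_future_skipped <- idx %>%
  tk_make_future_timeseries(n_future = 15,
                            skip_values = ymd("2017-12-25"))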
To make the prediction, we need to use the future index to get the time series signature (tk_get_timeseries_signature()). Make sure to rename the column “index” to “Date” so it matches the column names of the original data.
data_future <- idx_future %>%
tk_get_timeseries_signature()
data_future=data_future%>%dplyr::rename(Date=index)%>%timetk::tk_tbl()
## Warning in tk_tbl.data.frame(.): Warning: No index to preserve. Object
## otherwise converted to tibble successfully.
data_future%>%knitr::kable()
| Date | index.num | diff | year | year.iso | half | quarter | month | month.xts | month.lbl | day | hour | minute | second | hour12 | am.pm | wday | wday.xts | wday.lbl | mday | qday | yday | mweek | week | week.iso | week2 | week3 | week4 | mday7 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2017-09-25 | 1506297600 | NA | 2017 | 2017 | 2 | 3 | 9 | 8 | September | 25 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 25 | 87 | 268 | 5 | 39 | 39 | 1 | 0 | 3 | 4 |
| 2017-10-02 | 1506902400 | 604800 | 2017 | 2017 | 2 | 4 | 10 | 9 | October | 2 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 2 | 2 | 275 | 1 | 40 | 40 | 0 | 1 | 0 | 1 |
| 2017-10-09 | 1507507200 | 604800 | 2017 | 2017 | 2 | 4 | 10 | 9 | October | 9 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 9 | 9 | 282 | 2 | 41 | 41 | 1 | 2 | 1 | 2 |
| 2017-10-16 | 1508112000 | 604800 | 2017 | 2017 | 2 | 4 | 10 | 9 | October | 16 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 16 | 16 | 289 | 3 | 42 | 42 | 0 | 0 | 2 | 3 |
| 2017-10-23 | 1508716800 | 604800 | 2017 | 2017 | 2 | 4 | 10 | 9 | October | 23 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 23 | 23 | 296 | 4 | 43 | 43 | 1 | 1 | 3 | 4 |
| 2017-10-30 | 1509321600 | 604800 | 2017 | 2017 | 2 | 4 | 10 | 9 | October | 30 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 30 | 30 | 303 | 5 | 44 | 44 | 0 | 2 | 0 | 5 |
| 2017-11-06 | 1509926400 | 604800 | 2017 | 2017 | 2 | 4 | 11 | 10 | November | 6 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 6 | 37 | 310 | 2 | 45 | 45 | 1 | 0 | 1 | 1 |
| 2017-11-13 | 1510531200 | 604800 | 2017 | 2017 | 2 | 4 | 11 | 10 | November | 13 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 13 | 44 | 317 | 3 | 46 | 46 | 0 | 1 | 2 | 2 |
| 2017-11-20 | 1511136000 | 604800 | 2017 | 2017 | 2 | 4 | 11 | 10 | November | 20 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 20 | 51 | 324 | 4 | 47 | 47 | 1 | 2 | 3 | 3 |
| 2017-11-27 | 1511740800 | 604800 | 2017 | 2017 | 2 | 4 | 11 | 10 | November | 27 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 27 | 58 | 331 | 5 | 48 | 48 | 0 | 0 | 0 | 4 |
| 2017-12-04 | 1512345600 | 604800 | 2017 | 2017 | 2 | 4 | 12 | 11 | December | 4 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 4 | 65 | 338 | 2 | 49 | 49 | 1 | 1 | 1 | 1 |
| 2017-12-11 | 1512950400 | 604800 | 2017 | 2017 | 2 | 4 | 12 | 11 | December | 11 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 11 | 72 | 345 | 3 | 50 | 50 | 0 | 2 | 2 | 2 |
Make the prediction.
pred_future <- predict(fit_lm1, newdata = data_future)
pred_future
## 1 2 3 4 5 6 7 8
## 30.32402 34.12196 34.46041 34.73055 34.29086 33.91452 32.77320 33.08290
## 9 10 11 12
## 32.94137 32.73489 31.06453 31.39505
Build the future data frame.
#tidyverse_conflicts()
man_future<- data_future %>%
dplyr:: select(Date) %>%
tibble::add_column(Actual = pred_future)%>%timetk::tk_tbl()
## Warning in tk_tbl.data.frame(.): Warning: No index to preserve. Object
## otherwise converted to tibble successfully.
man_future%>%knitr::kable()
| Date | Actual |
|---|---|
| 2017-09-25 | 30.32402 |
| 2017-10-02 | 34.12196 |
| 2017-10-09 | 34.46041 |
| 2017-10-16 | 34.73055 |
| 2017-10-23 | 34.29086 |
| 2017-10-30 | 33.91452 |
| 2017-11-06 | 32.77320 |
| 2017-11-13 | 33.08290 |
| 2017-11-20 | 32.94137 |
| 2017-11-27 | 32.73489 |
| 2017-12-04 | 31.06453 |
| 2017-12-11 | 31.39505 |
man_future%>%dplyr::rename(Forecast=Actual)%>%knitr::kable()
| Date | Forecast |
|---|---|
| 2017-09-25 | 30.32402 |
| 2017-10-02 | 34.12196 |
| 2017-10-09 | 34.46041 |
| 2017-10-16 | 34.73055 |
| 2017-10-23 | 34.29086 |
| 2017-10-30 | 33.91452 |
| 2017-11-06 | 32.77320 |
| 2017-11-13 | 33.08290 |
| 2017-11-20 | 32.94137 |
| 2017-11-27 | 32.73489 |
| 2017-12-04 | 31.06453 |
| 2017-12-11 | 31.39505 |
Visualize the forecast.
modeldat%>%ggplot(aes(x = Date,y=Actual)) +
geom_rect(xmin = as.numeric(ymd("2017-03-27")),
xmax = as.numeric(ymd("2017-08-14")),
ymin = 20, ymax = 50,
fill = palette_light()[[3]], alpha = 0.01) +
geom_rect(xmin = as.numeric(ymd("2017-08-14")),
xmax = as.numeric(ymd("2017-12-31")),
ymin = 20, ymax = 50,
fill = "#64A8D3", alpha = 0.01)+
annotate("text", x = ymd("2016-03-17"), y = 45,
color = palette_light()[[1]], label = "Train Region") +
annotate("text", x = ymd("2017-06-01"), y = 45,
color = palette_light()[[1]], label = "Test Reg") +
annotate("text", x = ymd("2017-10-01"), y = 45,
color = palette_light()[[1]], label = "Forecast ")+
geom_point(alpha = 0.5, color = palette_light()[[1]])+
geom_point(aes(x = Date, y = Actual), data = man_future,
alpha = 0.5, color = palette_light()[[2]])+
geom_smooth(aes(x = Date, y = Actual), data = man_future,
method = 'loess')
A forecast is never perfect. We need prediction intervals to account for the variance between the model predictions and the actual data. There are a number of methods to achieve this. We’ll follow the prediction interval methodology from Forecasting: Principles and Practice.
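The 1.28 and 1.96 multipliers used below are the standard normal quantiles for 80% and 95% coverage, under the assumption that the test-set residuals are roughly normal; they can be recovered with qnorm():

# normal quantiles behind the 80% and 95% prediction intervals
qnorm(c(0.90, 0.975))  # returns approximately 1.28 and 1.96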
# Calculate standard deviation of residuals
test_resid_sd <- sd(test_residuals)
man_future <- man_future %>%
mutate(
lo.95 = Actual - 1.96 * test_resid_sd,
lo.80 = Actual - 1.28 * test_resid_sd,
hi.80 = Actual + 1.28 * test_resid_sd,
hi.95 = Actual + 1.96 * test_resid_sd
)
man_future%>%head()%>%knitr::kable()
| Date | Actual | lo.95 | lo.80 | hi.80 | hi.95 |
|---|---|---|---|---|---|
| 2017-09-25 | 30.32402 | 24.73723 | 26.67550 | 33.97253 | 35.91080 |
| 2017-10-02 | 34.12196 | 28.53518 | 30.47345 | 37.77048 | 39.70875 |
| 2017-10-09 | 34.46041 | 28.87362 | 30.81189 | 38.10892 | 40.04719 |
| 2017-10-16 | 34.73055 | 29.14376 | 31.08204 | 38.37906 | 40.31733 |
| 2017-10-23 | 34.29086 | 28.70407 | 30.64234 | 37.93937 | 39.87764 |
| 2017-10-30 | 33.91452 | 28.32774 | 30.26601 | 37.56304 | 39.50131 |
Now plot the forecast with the prediction intervals.
modeldat %>%
ggplot(aes(x = Date, y = Actual)) +
geom_point(alpha = 0.5, color = palette_light()[[1]]) +
geom_ribbon(aes(ymin = lo.95, ymax = hi.95), data = man_future,
fill = "#D5DBFF", color = NA, size = 0) +
geom_ribbon(aes(ymin = lo.80, ymax = hi.80), data = man_future,
fill = "#596DD5", color = NA, size = 0, alpha = 0.8) +
geom_point(aes(x = Date, y = Actual), data = man_future,
alpha = 0.5, color = palette_light()[[2]]) +
geom_smooth(aes(x = Date, y = Actual), data = man_future,
method = 'loess', color = "white") +
labs(title = " 3-Month Forecast of Drivers needed with Prediction Intervals", x = "",subtitle="80% and 95% confidence bands") +
theme_tq()
# Calculate standard deviation of residuals
test_resid_sd <- sd(test_residuals)
man_future <- man_future %>%
mutate(
lo.95 = Actual - 1.96 * test_resid_sd,
lo.99 = Actual - 2.58 * test_resid_sd,
hi.99 = Actual + 2.58 * test_resid_sd,
hi.95 = Actual + 1.96 * test_resid_sd
)
modeldat %>%
ggplot(aes(x = Date, y = Actual)) +
geom_point(alpha = 0.5, color = palette_light()[[1]]) +
geom_ribbon(aes(ymin = lo.99, ymax = hi.99), data = man_future,
fill = "#D5DBFF", color = NA, size = 0) +
geom_ribbon(aes(ymin = lo.95, ymax = hi.95), data = man_future,
fill = "#596DD5", color = NA, size = 0, alpha = 0.8) +
geom_point(aes(x = Date, y = Actual), data = man_future,
alpha = 0.5, color = palette_light()[[2]]) +
geom_smooth(aes(x = Date, y = Actual), data = man_future,
method = 'loess', color = "white") +
labs(title = " 3-Month Forecast of Drivers needed with Prediction Intervals", x = "",subtitle="90% and 95% confidence bands") +
theme_tq()
sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] splines stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggfortify_0.4.1 RColorBrewer_1.1-2
## [3] bindrcpp_0.2 gam_1.14-4
## [5] foreach_1.4.3 broom_0.4.2
## [7] readxl_1.0.0.9000 scales_0.5.0.9000
## [9] sweep_0.2.0 DT_0.2
## [11] tseries_0.10-42 stringr_1.2.0
## [13] plotly_4.7.1 timetk_0.1.0
## [15] modelr_0.1.1 timekit_0.3.0
## [17] tidyquant_0.5.2 quantmod_0.4-10
## [19] TTR_0.23-2 PerformanceAnalytics_1.4.3541
## [21] xts_0.10-0 zoo_1.8-0
## [23] lubridate_1.6.0 dplyr_0.7.2
## [25] purrr_0.2.3 readr_1.1.1
## [27] tidyr_0.7.1 tibble_1.3.4
## [29] ggplot2_2.2.1.9000 tidyverse_1.1.1
##
## loaded via a namespace (and not attached):
## [1] httr_1.3.1 jsonlite_1.5 viridisLite_0.2.0
## [4] assertthat_0.2.0 highr_0.6 cellranger_1.1.0
## [7] yaml_2.1.14 backports_1.1.0 lattice_0.20-35
## [10] glue_1.1.1 quadprog_1.5-5 digest_0.6.12
## [13] rvest_0.3.2 colorspace_1.3-2 Matrix_1.2-10
## [16] htmltools_0.3.6 plyr_1.8.4 psych_1.7.5
## [19] pkgconfig_2.0.1 haven_1.1.0 mgcv_1.8-17
## [22] lazyeval_0.2.0 mnormt_1.5-5 magrittr_1.5
## [25] evaluate_0.10.1 nlme_3.1-131 forcats_0.2.0
## [28] xml2_1.1.1 foreign_0.8-69 tools_3.4.1
## [31] data.table_1.10.4 hms_0.3 munsell_0.4.3
## [34] compiler_3.4.1 rlang_0.1.2 grid_3.4.1
## [37] iterators_1.0.8 htmlwidgets_0.9 labeling_0.3
## [40] rmarkdown_1.6 gtable_0.2.0 codetools_0.2-15
## [43] DBI_0.7 curl_2.8.1 reshape2_1.4.2
## [46] R6_2.2.2 gridExtra_2.2.1 knitr_1.17
## [49] bindr_0.1 rprojroot_1.2 Quandl_2.8.0
## [52] stringi_1.1.5 parallel_3.4.1 Rcpp_0.12.12