Install any packages that are missing and load the libraries used throughout this analysis:
list=c("tidyverse","tidyquant","timekit","modelr","lubridate","timetk","plotly","stringr","tseries","DT","sweep","scales","readxl","broom","tibble","gam")
list_packages <- list
new.packages <- list_packages[!(list_packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
R<-suppressWarnings(suppressMessages(sapply(list, library, character.only = TRUE)))
#options(na.action = na.warn)
modeldat=read_excel("/Users/nanaakwasiabayieboateng/Documents/memphisclassesbooks/Time series/Manpower/actvsfcdatajuly311.xlsx",sheet = "Sheet1")%>%na.omit()%>%dplyr::rename(Date=Week)
modeldat%>%tail()%>%knitr::kable()
| Date | Actual |
|---|---|
| 2017-08-14 | 32 |
| 2017-08-21 | 30 |
| 2017-08-28 | 27 |
| 2017-09-04 | 32 |
| 2017-09-11 | 32 |
| 2017-09-18 | 36 |
modeldat[61,]
## # A tibble: 1 x 2
## Date Actual
## <dttm> <dbl>
## 1 2017-03-27 29
dim(modeldat)
## [1] 86 2
glimpse(modeldat)
## Observations: 86
## Variables: 2
## $ Date <dttm> 2016-01-11, 2016-01-18, 2016-01-25, 2016-02-01, 2016-0...
## $ Actual <dbl> 26, 27, 28, 22, 27, 31, 26, 27, 21, 23, 25, 23, 23, 23,...
modeldat%>%mutate(Date=as.Date(Date))%>%summary()%>%knitr::kable()
| Date | Actual |
|---|---|
| Min. :2016-01-11 | Min. :21.00 |
| 1st Qu.:2016-06-07 | 1st Qu.:27.00 |
| Median :2016-11-03 | Median :30.00 |
| Mean :2016-11-13 | Mean :29.57 |
| 3rd Qu.:2017-04-22 | 3rd Qu.:32.00 |
| Max. :2017-09-18 | Max. :40.00 |
Split the data into train and test sets at “2017-03-27”.
# Split into training (first 60 observations) and test (last 26 observations) sets
modeldat <- modeldat %>% mutate(Date = as.Date(Date))

train <- modeldat %>%
  dplyr::filter(Date < ymd("2017-03-27"))

test <- modeldat %>%
  dplyr::filter(Date >= ymd("2017-03-27"))
Start with the training set, which has the “Date” and “Actual” columns.
# Training set
train%>%head()%>%knitr::kable()
| Date | Actual |
|---|---|
| 2016-01-11 | 26 |
| 2016-01-18 | 27 |
| 2016-01-25 | 28 |
| 2016-02-01 | 22 |
| 2016-02-08 | 27 |
| 2016-02-15 | 31 |
test%>%tail()%>%knitr::kable()
| Date | Actual |
|---|---|
| 2017-08-14 | 32 |
| 2017-08-21 | 30 |
| 2017-08-28 | 27 |
| 2017-09-04 | 32 |
| 2017-09-11 | 32 |
| 2017-09-18 | 36 |
A visualization helps to show how we plan to tackle the forecasting problem: the data is split into two regions, a training region and a testing region.
modeldat%>%ggplot(aes(Date,Actual))+geom_line()+
geom_point(alpha = 0.5, color = palette_light()[[1]], shape=20,size=2) +
labs(title = "Drivers Forecast ", x = "Date") +
theme_tq()
modeldat%>%
ggplot(aes(x = Date, y = Actual)) +
geom_rect(xmin = as.numeric(ymd("2017-03-27")),
xmax = as.numeric(ymd("2017-10-21")),
ymin = 20, ymax = 50,
fill = palette_light()[[3]], alpha = 0.01) +
ggplot2::annotate("text", x = ymd("2016-03-17"), y = 45,
color = palette_light()[[1]], label = "Train Region") +
ggplot2::annotate("text", x = ymd("2017-06-01"), y = 45,
color = palette_light()[[1]], label = "Test Region") +
geom_point(alpha = 0.5, color = palette_light()[[1]], shape=20,size=2) +
labs(title = "Bikes Sharing Dataset: Daily Scale", x = "") +
theme_tq()
The first step is to add the time series signature to the training set, which the model will use to learn the patterns. The most efficient method is tk_augment_timeseries_signature(), which appends the signature fields we need as additional columns.
# Add time series signature
train_augmented <- train %>%
tk_augment_timeseries_signature()
train_augmented%>%head()%>%knitr::kable( format = "markdown", caption = "Training Data")
| Date | Actual | index.num | diff | year | year.iso | half | quarter | month | month.xts | month.lbl | day | hour | minute | second | hour12 | am.pm | wday | wday.xts | wday.lbl | mday | qday | yday | mweek | week | week.iso | week2 | week3 | week4 | mday7 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2016-01-11 | 26 | 1452470400 | NA | 2016 | 2016 | 1 | 1 | 1 | 0 | January | 11 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 11 | 11 | 11 | 3 | 2 | 2 | 0 | 2 | 2 | 2 |
| 2016-01-18 | 27 | 1453075200 | 604800 | 2016 | 2016 | 1 | 1 | 1 | 0 | January | 18 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 18 | 18 | 18 | 4 | 3 | 3 | 1 | 0 | 3 | 3 |
| 2016-01-25 | 28 | 1453680000 | 604800 | 2016 | 2016 | 1 | 1 | 1 | 0 | January | 25 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 25 | 25 | 25 | 5 | 4 | 4 | 0 | 1 | 0 | 4 |
| 2016-02-01 | 22 | 1454284800 | 604800 | 2016 | 2016 | 1 | 1 | 2 | 1 | February | 1 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 1 | 32 | 32 | 6 | 5 | 5 | 1 | 2 | 1 | 1 |
| 2016-02-08 | 27 | 1454889600 | 604800 | 2016 | 2016 | 1 | 1 | 2 | 1 | February | 8 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 8 | 39 | 39 | 2 | 6 | 6 | 0 | 0 | 2 | 2 |
| 2016-02-15 | 31 | 1455494400 | 604800 | 2016 | 2016 | 1 | 1 | 2 | 1 | February | 15 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 15 | 46 | 46 | 3 | 7 | 7 | 1 | 1 | 3 | 3 |
dat_augmented <- modeldat %>%
tk_augment_timeseries_signature()
Now that we have a number of fields that can be used for training, we can use them for modeling. In practice, you would want to pre-process the data first: centering and scaling if necessary, creating dummy variables, removing correlated variables, examining interactions, and so on. For brevity, we do not do this here; a minimal sketch of one such step is shown below.
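For illustration only, here is a hedged sketch of a light pre-processing pass using dplyr (the column choices are assumptions and the result is not used by the model that follows): keep the numeric signature columns, drop the diff column (its first value is NA) and any constant columns, then centre and scale the remaining predictors.

# illustrative pre-processing sketch (not used by the model below)
train_numeric <- train_augmented %>%
  dplyr::select_if(is.numeric) %>%                          # keep numeric signature fields
  dplyr::select(-diff) %>%                                  # diff has an NA in the first row
  dplyr::select_if(function(x) dplyr::n_distinct(x) > 1)    # drop constant columns

train_scaled <- train_numeric %>%
  dplyr::mutate_at(dplyr::vars(-Actual), function(x) as.numeric(scale(x)))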
# fit an additive model with smooth terms for Date, quarter, month and day
fit_lm1 <- gam::gam(Actual ~ s(Date) + s(quarter) + s(month) + s(day), data = dat_augmented)
# a plain linear alternative:
# fit_lm1 <- lm(Actual ~ 0 + Date + quarter + month + day, data = dat_augmented)
summary(fit_lm1)
##
## Call: gam::gam(formula = Actual ~ s(Date) + s(quarter) + s(month) +
## s(day), data = dat_augmented)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.4968 -2.0114 -0.0306 1.7403 6.3624
##
## (Dispersion Parameter for gaussian family taken to be 10.2734)
##
## Null Deviance: 1575.081 on 85 degrees of freedom
## Residual Deviance: 719.1369 on 69.9998 degrees of freedom
## AIC: 460.6963
##
## Number of Local Scoring Iterations: 3
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(Date) 1 57.73 57.73 5.6194 0.02052 *
## s(quarter) 1 344.69 344.69 33.5521 1.814e-07 ***
## s(month) 1 7.94 7.94 0.7730 0.38230
## s(day) 1 1.85 1.85 0.1804 0.67232
## Residuals 70 719.14 10.27
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(Date) 3 2.2142 0.09407 .
## s(quarter) 2 2.2966 0.10813
## s(month) 3 12.5093 1.22e-06 ***
## s(day) 3 0.3194 0.81132
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# drop the first row (the intercept); it contains only NA values
smry_additvemod=as.list.data.frame(summary(fit_lm1))
smry_additvemod$anova%>%timetk::tk_tbl()%>%slice(-1)%>%knitr::kable()
| index | Npar Df | Npar F | Pr(F) |
|---|---|---|---|
| s(Date) | 3 | 2.214190 | 0.0940729 |
| s(quarter) | 2 | 2.296616 | 0.1081313 |
| s(month) | 3 | 12.509343 | 0.0000012 |
| s(day) | 3 | 0.319370 | 0.8113225 |
# keep the full ANOVA table, including the intercept row with its NAs
smry_additvemod_anova=smry_additvemod$anova%>%timetk::tk_tbl()
smry_additvemod_anova%>%knitr::kable()
| index | Npar Df | Npar F | Pr(F) |
|---|---|---|---|
| (Intercept) | NA | NA | NA |
| s(Date) | 3 | 2.214190 | 0.0940729 |
| s(quarter) | 2 | 2.296616 | 0.1081313 |
| s(month) | 3 | 12.509343 | 0.0000012 |
| s(day) | 3 | 0.319370 | 0.8113225 |
# replace white spaces in column names with _
names(smry_additvemod_anova)<-gsub("\\s", "_", names(smry_additvemod_anova))
smry_additvemod_anova%>%knitr::kable()
| index | Npar_Df | Npar_F | Pr(F) |
|---|---|---|---|
| (Intercept) | NA | NA | NA |
| s(Date) | 3 | 2.214190 | 0.0940729 |
| s(quarter) | 2 | 2.296616 | 0.1081313 |
| s(month) | 3 | 12.509343 | 0.0000012 |
| s(day) | 3 | 0.319370 | 0.8113225 |
# alternatively, drop the rows with missing values
smry_additvemod_anova %>% na.omit()%>%knitr::kable()
| index | Npar_Df | Npar_F | Pr(F) |
|---|---|---|---|
| s(Date) | 3 | 2.214190 | 0.0940729 |
| s(quarter) | 2 | 2.296616 | 0.1081313 |
| s(month) | 3 | 12.509343 | 0.0000012 |
| s(day) | 3 | 0.319370 | 0.8113225 |
smry_additvemod_panova=smry_additvemod$parametric.anova%>%timetk::tk_tbl()
# replace white spaces in column names with _
names(smry_additvemod_panova)<-gsub("\\s", "_", names(smry_additvemod_panova))
#%>% replace_na(list(F_value = "-"))
smry_additvemod_panova%>%replace_na(list(F_value = "-"))
## # A tibble: 5 x 6
## index Df Sum_Sq Mean_Sq F_value `Pr(>F)`
## <chr> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 s(Date) 1.00000 57.730544 57.730544 5.61941467508312 2.052095e-02
## 2 s(quarter) 1.00000 344.694531 344.694531 33.5521092940803 1.813960e-07
## 3 s(month) 1.00000 7.941417 7.941417 0.773006974355052 3.822967e-01
## 4 s(day) 1.00000 1.853444 1.853444 0.180411748026327 6.723224e-01
## 5 Residuals 69.99983 719.136881 10.273409 - NA
tidy(fit_lm1)
## term df sumsq meansq statistic p.value
## 1 s(Date) 1.00000 57.730544 57.730544 5.6194147 2.052095e-02
## 2 s(quarter) 1.00000 344.694531 344.694531 33.5521093 1.813960e-07
## 3 s(month) 1.00000 7.941417 7.941417 0.7730070 3.822967e-01
## 4 s(day) 1.00000 1.853444 1.853444 0.1804117 6.723224e-01
## 5 Residuals 69.99983 719.136881 10.273409 NA NA
tidy(fit_lm1)["p.value"]%>%knitr::kable()
| p.value |
|---|
| 0.0205210 |
| 0.0000002 |
| 0.3822967 |
| 0.6723224 |
| NA |
# add fitted values and residuals to the augmented data
aug=broom::augment_columns(fit_lm1,dat_augmented)
aug%>%head()%>%knitr::kable()
| Date | Actual | index.num | diff | year | year.iso | half | quarter | month | month.xts | month.lbl | day | hour | minute | second | hour12 | am.pm | wday | wday.xts | wday.lbl | mday | qday | yday | mweek | week | week.iso | week2 | week3 | week4 | mday7 | .fitted | .se.fit | .resid | .hat | .sigma | .cooksd | .std.resid |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2016-01-11 | 26 | 1452470400 | NA | 2016 | 2016 | 1 | 1 | 1 | 0 | January | 11 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 11 | 11 | 11 | 3 | 2 | 2 | 0 | 2 | 2 | 2 | 25.77137 | 1.724674 | 0.2286334 | 0.0732964 | 2.998084 | 0.0000869 | 0.0740989 |
| 2016-01-18 | 27 | 1453075200 | 604800 | 2016 | 2016 | 1 | 1 | 1 | 0 | January | 18 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 18 | 18 | 18 | 4 | 3 | 3 | 1 | 0 | 3 | 3 | 25.81127 | 1.644834 | 1.1887285 | 0.0684920 | 2.995037 | 0.0021714 | 0.3842661 |
| 2016-01-25 | 28 | 1453680000 | 604800 | 2016 | 2016 | 1 | 1 | 1 | 0 | January | 25 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 25 | 25 | 25 | 5 | 4 | 4 | 0 | 1 | 0 | 4 | 25.40803 | 1.583149 | 2.5919720 | 0.0785889 | 2.982963 | 0.0121068 | 0.8424542 |
| 2016-02-01 | 22 | 1454284800 | 604800 | 2016 | 2016 | 1 | 1 | 2 | 1 | February | 1 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 1 | 32 | 32 | 6 | 5 | 5 | 1 | 2 | 1 | 1 | 25.04003 | 1.673748 | -3.0400257 | 0.0821441 | 2.977138 | 0.0175427 | -0.9899944 |
| 2016-02-08 | 27 | 1454889600 | 604800 | 2016 | 2016 | 1 | 1 | 2 | 1 | February | 8 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 8 | 39 | 39 | 2 | 6 | 6 | 0 | 0 | 2 | 2 | 25.27167 | 1.269673 | 1.7283311 | 0.0569804 | 2.991591 | 0.0037261 | 0.5552765 |
| 2016-02-15 | 31 | 1455494400 | 604800 | 2016 | 2016 | 1 | 1 | 2 | 1 | February | 15 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 15 | 46 | 46 | 3 | 7 | 7 | 1 | 1 | 3 | 3 | 25.48693 | 1.217385 | 5.5130731 | 0.0467180 | 2.930984 | 0.0304190 | 1.7616749 |
The model coefficients are given below:
fit_lm1$coefficients
## (Intercept) s(Date) s(quarter) s(month) s(day)
## -5.959531079 0.001801862 3.206040745 -0.392055110 -0.016794206
library(RColorBrewer)
colors <- brewer.pal(8, "Dark2")
# To manually add a legend, include color = "name" inside aes() for each geom
cols <- c("LINE1" = "#f04546", "LINE2" = "#3591d1", "BAR" = "#62c76b")
aug %>%
  ggplot(aes(x = Date, y = Actual)) +
  geom_line(aes(x = Date, y = .fitted, color = "Fitted"), show.legend = TRUE) +
  geom_line(aes(x = Date, y = Actual, color = "Observed"), data = dat_augmented, show.legend = TRUE) +
  labs(title = "Predicted Values vs Observed Values", x = "Time") +
  theme_bw() +
  # define a custom background theme
  # theme(panel.background = element_rect(fill = 'grey75')) +
  scale_colour_manual(name = "GAM\n Method", values = c("Fitted" = "red", "Observed" = "black")) +
  # make the title bold and add a little space at the baseline (face, vjust)
  theme(plot.title = element_text(size = 14, face = "bold", vjust = 2)) +
  # place the legend at the top
  theme(legend.position = "top") +
  # change the color of the legend title
  theme(legend.title = element_text(colour = "chocolate", size = 14, face = "bold"))
We can examine the model residuals, which were added with augment_columns() from the broom package, to see if any significant pattern remains. A quick Ljung-Box check is included below.
library(ggfortify)
# ggplot2::autoplot(fit_lm1, which = 1:6, nrow=2, label.size = F,na.rm=TRUE)+ theme_tq()
unlist(tidy(fit_lm1)["p.value"])
## p.value1 p.value2 p.value3 p.value4 p.value5
## 2.052095e-02 1.813960e-07 3.822967e-01 6.723224e-01 NA
as.integer(unlist(tidy(fit_lm1)["p.value"]))
## [1] 0 0 0 0 NA
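As a quick, hedged check for autocorrelation left in the residuals (an addition, not part of the original workflow), a Ljung-Box test with stats::Box.test can be applied; the lag of 10 is an arbitrary choice.

# Ljung-Box test on the model residuals; a small p-value would indicate
# remaining autocorrelation the model has not captured
Box.test(aug$.resid, lag = 10, type = "Ljung-Box")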
Visualize the residuals of the training set.
aug %>%
ggplot(aes(x =Date, y = .resid)) +
geom_point(color = palette_light()[[1]], alpha = 0.5)+
geom_hline(yintercept = 0, color = "red") +
theme_bw() +
labs(title = "Training Set: Additive Model Residuals (gam)", x = "")
We can also get a quick idea of the overall error of the model on the training set. Note that what we really care about is the error on the test set, as this is a much better predictor of future model performance.
# RMSE
sqrt(mean(fit_lm1$residuals^2))
## [1] 2.891722
With a suitable model (low residual error and random residuals) we can forecast using the “test” set for validation purposes.
test%>%head()%>%knitr::kable()
| Date | Actual |
|---|---|
| 2017-03-27 | 29 |
| 2017-04-03 | 30 |
| 2017-04-10 | 32 |
| 2017-04-17 | 36 |
| 2017-04-24 | 32 |
| 2017-05-01 | 27 |
We again need to add the time series signature, this time to the test set.
test_augmented <- test %>%
tk_augment_timeseries_signature()
test_augmented%>%head()%>%knitr::kable()
| Date | Actual | index.num | diff | year | year.iso | half | quarter | month | month.xts | month.lbl | day | hour | minute | second | hour12 | am.pm | wday | wday.xts | wday.lbl | mday | qday | yday | mweek | week | week.iso | week2 | week3 | week4 | mday7 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2017-03-27 | 29 | 1490572800 | NA | 2017 | 2017 | 1 | 1 | 3 | 2 | March | 27 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 27 | 86 | 86 | 5 | 13 | 13 | 1 | 1 | 1 | 4 |
| 2017-04-03 | 30 | 1491177600 | 604800 | 2017 | 2017 | 1 | 2 | 4 | 3 | April | 3 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 3 | 3 | 93 | 2 | 14 | 14 | 0 | 2 | 2 | 1 |
| 2017-04-10 | 32 | 1491782400 | 604800 | 2017 | 2017 | 1 | 2 | 4 | 3 | April | 10 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 10 | 10 | 100 | 3 | 15 | 15 | 1 | 0 | 3 | 2 |
| 2017-04-17 | 36 | 1492387200 | 604800 | 2017 | 2017 | 1 | 2 | 4 | 3 | April | 17 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 17 | 17 | 107 | 4 | 16 | 16 | 0 | 1 | 0 | 3 |
| 2017-04-24 | 32 | 1492992000 | 604800 | 2017 | 2017 | 1 | 2 | 4 | 3 | April | 24 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 24 | 24 | 114 | 5 | 17 | 17 | 1 | 2 | 1 | 4 |
| 2017-05-01 | 27 | 1493596800 | 604800 | 2017 | 2017 | 1 | 2 | 5 | 4 | May | 1 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 1 | 31 | 121 | 6 | 18 | 18 | 0 | 0 | 2 | 1 |
Next, use predict() to apply the model to the test set.
yhat_test <- predict(fit_lm1, newdata = test_augmented)
yhat_test%>%knitr::kable()
| 25.84304 |
| 29.18919 |
| 29.48293 |
| 29.63722 |
| 29.16704 |
| 30.37889 |
| 30.61470 |
| 30.82938 |
| 30.34374 |
| 29.97607 |
| 31.41751 |
| 31.65067 |
| 31.57561 |
| 31.26805 |
| 33.61198 |
| 33.89416 |
| 34.05252 |
| 33.60279 |
| 33.07173 |
| 32.20581 |
| 32.48775 |
| 32.18068 |
| 31.94176 |
| 30.18704 |
| 30.51578 |
| 30.64140 |
Add the predictions to the test set for comparison (add_column() works for a numeric vector). We can then compute the residuals with mutate(), which performs calculations between columns of a data frame.
options(tibble.print_max = 21)  # print up to 21 rows of a tibble
pred_test <- test %>%
add_column(yhat = yhat_test) %>%
mutate(.resid = Actual - yhat)
pred_test%>%knitr::kable()
| Date | Actual | yhat | .resid |
|---|---|---|---|
| 2017-03-27 | 29 | 25.84304 | 3.1569632 |
| 2017-04-03 | 30 | 29.18919 | 0.8108056 |
| 2017-04-10 | 32 | 29.48293 | 2.5170742 |
| 2017-04-17 | 36 | 29.63722 | 6.3627782 |
| 2017-04-24 | 32 | 29.16704 | 2.8329609 |
| 2017-05-01 | 27 | 30.37889 | -3.3788875 |
| 2017-05-08 | 32 | 30.61470 | 1.3853030 |
| 2017-05-15 | 28 | 30.82938 | -2.8293823 |
| 2017-05-22 | 28 | 30.34374 | -2.3437352 |
| 2017-05-29 | 30 | 29.97607 | 0.0239255 |
| 2017-06-05 | 34 | 31.41751 | 2.5824942 |
| 2017-06-12 | 32 | 31.65067 | 0.3493289 |
| 2017-06-19 | 28 | 31.57561 | -3.5756094 |
| 2017-06-26 | 34 | 31.26805 | 2.7319514 |
| 2017-07-03 | 32 | 33.61198 | -1.6119768 |
| 2017-07-10 | 35 | 33.89416 | 1.1058382 |
| 2017-07-17 | 32 | 34.05252 | -2.0525211 |
| 2017-07-24 | 30 | 33.60279 | -3.6027883 |
| 2017-07-31 | 33 | 33.07173 | -0.0717314 |
| 2017-08-07 | 33 | 32.20581 | 0.7941911 |
| 2017-08-14 | 32 | 32.48775 | -0.4877525 |
| 2017-08-21 | 30 | 32.18068 | -2.1806757 |
| 2017-08-28 | 27 | 31.94176 | -4.9417578 |
| 2017-09-04 | 32 | 30.18704 | 1.8129565 |
| 2017-09-11 | 32 | 30.51578 | 1.4842172 |
| 2017-09-18 | 36 | 30.64140 | 5.3585999 |
# Calculating forecast error
test_residuals <- pred_test$.resid
timetk::tk_tbl(test_residuals)%>%head()%>%knitr::kable()
## Warning in tk_tbl.data.frame(as.data.frame(data), preserve_index,
## rename_index, : Warning: No index to preserve. Object otherwise converted
## to tibble successfully.
| data |
|---|
| 3.1569632 |
| 0.8108056 |
| 2.5170742 |
| 6.3627782 |
| 2.8329609 |
| -3.3788875 |
pred_test%>% dplyr::rename(Predicted=yhat)%>%knitr::kable()
| Date | Actual | Predicted | .resid |
|---|---|---|---|
| 2017-03-27 | 29 | 25.84304 | 3.1569632 |
| 2017-04-03 | 30 | 29.18919 | 0.8108056 |
| 2017-04-10 | 32 | 29.48293 | 2.5170742 |
| 2017-04-17 | 36 | 29.63722 | 6.3627782 |
| 2017-04-24 | 32 | 29.16704 | 2.8329609 |
| 2017-05-01 | 27 | 30.37889 | -3.3788875 |
| 2017-05-08 | 32 | 30.61470 | 1.3853030 |
| 2017-05-15 | 28 | 30.82938 | -2.8293823 |
| 2017-05-22 | 28 | 30.34374 | -2.3437352 |
| 2017-05-29 | 30 | 29.97607 | 0.0239255 |
| 2017-06-05 | 34 | 31.41751 | 2.5824942 |
| 2017-06-12 | 32 | 31.65067 | 0.3493289 |
| 2017-06-19 | 28 | 31.57561 | -3.5756094 |
| 2017-06-26 | 34 | 31.26805 | 2.7319514 |
| 2017-07-03 | 32 | 33.61198 | -1.6119768 |
| 2017-07-10 | 35 | 33.89416 | 1.1058382 |
| 2017-07-17 | 32 | 34.05252 | -2.0525211 |
| 2017-07-24 | 30 | 33.60279 | -3.6027883 |
| 2017-07-31 | 33 | 33.07173 | -0.0717314 |
| 2017-08-07 | 33 | 32.20581 | 0.7941911 |
| 2017-08-14 | 32 | 32.48775 | -0.4877525 |
| 2017-08-21 | 30 | 32.18068 | -2.1806757 |
| 2017-08-28 | 27 | 31.94176 | -4.9417578 |
| 2017-09-04 | 32 | 30.18704 | 1.8129565 |
| 2017-09-11 | 32 | 30.51578 | 1.4842172 |
| 2017-09-18 | 36 | 30.64140 | 5.3585999 |
Visualize the results using ggplot().
ggplot(aes(x = Date), data = modeldat) +
geom_rect(xmin = as.numeric(ymd("2017-03-27")),
xmax = as.numeric(ymd("2017-10-21")),
ymin = 5, ymax = 50,
fill = palette_light()[[11]], alpha = 0.01) +
annotate("text", x = ymd("2016-03-17"), y = 45,
color = palette_light()[[1]], label = "Train Region") +
annotate("text", x = ymd("2017-06-01"), y = 45,
color = palette_light()[[1]], label = "Test Region") +
geom_point(aes(x = Date, y = Actual), data = train, alpha = 0.5, color = palette_light()[[1]]) +
geom_point(aes(x = Date, y = Actual), data = pred_test, alpha = 0.5, color = palette_light()[[1]]) +
geom_point(aes(x = Date, y = yhat), data = pred_test, alpha = 0.9, color = palette_light()[[6]]) +
theme_bw()
The forecast accuracy can be evaluated on the test set using residual diagnostics and forecast accuracy measures.
# Calculating forecast error
test_residuals <- pred_test$.resid
pct_err <- test_residuals/pred_test$Actual * 100 # Percentage error
me <- mean(test_residuals, na.rm=TRUE)
rmse <- mean(test_residuals^2, na.rm=TRUE)^0.5
mae <- mean(abs(test_residuals), na.rm=TRUE)
mape <- mean(abs(pct_err), na.rm=TRUE)
mpe <- mean(pct_err, na.rm=TRUE)
error_tbl <- tibble(me, rmse, mae, mape, mpe)
error_tbl
## # A tibble: 1 x 5
## me rmse mae mape mpe
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.2397142 2.805308 2.322546 7.477003 0.205155
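As a cross-check (an assumption, since the forecast package is not loaded above), accuracy() from the forecast package computes the same measures from the forecast and actual vectors:

# cross-check of the hand-computed error measures (ME, RMSE, MAE, MPE, MAPE)
forecast::accuracy(pred_test$yhat, pred_test$Actual)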
Next we can visualize the residuals of the test set. The residuals of the model aren’t perfect, but we can work with them. The residuals show that the model predicts low in April to May and high in June to July.
ggplot(aes(x = Date, y = .resid), data = pred_test) +
geom_hline(yintercept = 0, color = "red") +
geom_point(color = palette_light()[[1]], alpha = 0.5) +
geom_smooth(method = 'gam' ) +
theme_tq() +
labs(title = "Test Set: gam() Model Residuals", x = "")
At this point you might go back to the model and try tweaking features using interactions or polynomial terms, adding other features that may be known in the future (e.g. the temperature of a day can be forecasted relatively accurately within 7 days), or try a completely different modeling technique with the hope of better predictions on the test set; one such variant is sketched below. Once you feel that your model is optimized, move on to the final step of forecasting.
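As an illustration only (this is not the model used in this post), a variant might swap in a smooth of the week of the year and a quadratic day term, and be judged on its test-set RMSE:

# hypothetical alternative specification; compare its test-set RMSE with fit_lm1
fit_lm2 <- gam::gam(Actual ~ s(Date) + s(month) + s(week) + poly(day, 2),
                    data = dat_augmented)
sqrt(mean((test_augmented$Actual - predict(fit_lm2, newdata = test_augmented))^2))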
Let’s use our model to predict the expected future values for the next twelve weeks (roughly three months). The first step is to create the date sequence. Let’s use tk_get_timeseries_summary() to review the summary of the dates from the original dataset, modeldat.
# Extract the modeldat index
idx <- modeldat %>%
tk_index()
# Get time series summary from index
modeldat_summary <- idx %>%
tk_get_timeseries_summary()
modeldat_summary
## # A tibble: 1 x 12
## n.obs start end units scale tzone diff.minimum diff.q1
## <int> <date> <date> <chr> <chr> <chr> <dbl> <dbl>
## 1 86 2016-01-11 2017-09-18 days week UTC 604800 604800
## # ... with 4 more variables: diff.median <dbl>, diff.mean <dbl>,
## # diff.q3 <dbl>, diff.maximum <dbl>
The first six parameters are general summary information.
modeldat_summary[1:6]
## # A tibble: 1 x 6
## n.obs start end units scale tzone
## <int> <date> <date> <chr> <chr> <chr>
## 1 86 2016-01-11 2017-09-18 days week UTC
The second six parameters are the periodicity information.
modeldat_summary[7:12]
## # A tibble: 1 x 6
## diff.minimum diff.q1 diff.median diff.mean diff.q3 diff.maximum
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 604800 604800 604800 626145.9 604800 1814400
From the summary, we can see the data is essentially regular at a weekly frequency: the median difference is 604800 seconds, i.e. one week. The mean difference is slightly larger (626145.9 seconds) because a few weeks are missing, but no special handling is needed when we use tk_make_future_timeseries(). If the data were more irregular, say with weekends or holidays excluded, you would want to account for this, otherwise the forecast would be inaccurate; a sketch of how that could be handled appears after the future index below.
idx_future <- idx %>%
tk_make_future_timeseries(n_future = 12)
idx_future%>%knitr::kable()
## Warning in kable_markdown(x = structure(c("2017-09-25", "2017-10-02",
## "2017-10-09", : The table should have a header (column names)
| 2017-09-25 |
| 2017-10-02 |
| 2017-10-09 |
| 2017-10-16 |
| 2017-10-23 |
| 2017-10-30 |
| 2017-11-06 |
| 2017-11-13 |
| 2017-11-20 |
| 2017-11-27 |
| 2017-12-04 |
| 2017-12-11 |
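If the index had been irregular, a hedged sketch of accounting for it with tk_make_future_timeseries() might look like the following (the skipped date is a made-up placeholder, and the skip_values argument is assumed to be available in the installed timetk version):

# hypothetical example: extend further out and drop a known holiday week
idx_future_skipped <- idx %>%
  tk_make_future_timeseries(n_future = 15,
                            skip_values = ymd("2017-12-25"))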
To make the prediction, we need to use the future index to get the time series signature (tk_get_timeseries_signature()). Make sure to rename the column “index” to “Date” so it matches the column names of the original data.
data_future <- idx_future %>%
tk_get_timeseries_signature()
data_future=data_future%>%dplyr::rename(Date=index)%>%timetk::tk_tbl()
## Warning in tk_tbl.data.frame(.): Warning: No index to preserve. Object
## otherwise converted to tibble successfully.
data_future%>%knitr::kable()
| Date | index.num | diff | year | year.iso | half | quarter | month | month.xts | month.lbl | day | hour | minute | second | hour12 | am.pm | wday | wday.xts | wday.lbl | mday | qday | yday | mweek | week | week.iso | week2 | week3 | week4 | mday7 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2017-09-25 | 1506297600 | NA | 2017 | 2017 | 2 | 3 | 9 | 8 | September | 25 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 25 | 87 | 268 | 5 | 39 | 39 | 1 | 0 | 3 | 4 |
| 2017-10-02 | 1506902400 | 604800 | 2017 | 2017 | 2 | 4 | 10 | 9 | October | 2 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 2 | 2 | 275 | 1 | 40 | 40 | 0 | 1 | 0 | 1 |
| 2017-10-09 | 1507507200 | 604800 | 2017 | 2017 | 2 | 4 | 10 | 9 | October | 9 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 9 | 9 | 282 | 2 | 41 | 41 | 1 | 2 | 1 | 2 |
| 2017-10-16 | 1508112000 | 604800 | 2017 | 2017 | 2 | 4 | 10 | 9 | October | 16 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 16 | 16 | 289 | 3 | 42 | 42 | 0 | 0 | 2 | 3 |
| 2017-10-23 | 1508716800 | 604800 | 2017 | 2017 | 2 | 4 | 10 | 9 | October | 23 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 23 | 23 | 296 | 4 | 43 | 43 | 1 | 1 | 3 | 4 |
| 2017-10-30 | 1509321600 | 604800 | 2017 | 2017 | 2 | 4 | 10 | 9 | October | 30 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 30 | 30 | 303 | 5 | 44 | 44 | 0 | 2 | 0 | 5 |
| 2017-11-06 | 1509926400 | 604800 | 2017 | 2017 | 2 | 4 | 11 | 10 | November | 6 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 6 | 37 | 310 | 2 | 45 | 45 | 1 | 0 | 1 | 1 |
| 2017-11-13 | 1510531200 | 604800 | 2017 | 2017 | 2 | 4 | 11 | 10 | November | 13 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 13 | 44 | 317 | 3 | 46 | 46 | 0 | 1 | 2 | 2 |
| 2017-11-20 | 1511136000 | 604800 | 2017 | 2017 | 2 | 4 | 11 | 10 | November | 20 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 20 | 51 | 324 | 4 | 47 | 47 | 1 | 2 | 3 | 3 |
| 2017-11-27 | 1511740800 | 604800 | 2017 | 2017 | 2 | 4 | 11 | 10 | November | 27 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 27 | 58 | 331 | 5 | 48 | 48 | 0 | 0 | 0 | 4 |
| 2017-12-04 | 1512345600 | 604800 | 2017 | 2017 | 2 | 4 | 12 | 11 | December | 4 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 4 | 65 | 338 | 2 | 49 | 49 | 1 | 1 | 1 | 1 |
| 2017-12-11 | 1512950400 | 604800 | 2017 | 2017 | 2 | 4 | 12 | 11 | December | 11 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | Monday | 11 | 72 | 345 | 3 | 50 | 50 | 0 | 2 | 2 | 2 |
Make the prediction.
pred_future <- predict(fit_lm1, newdata = data_future)
pred_future
## 1 2 3 4 5 6 7 8
## 30.32402 34.12196 34.46041 34.73055 34.29086 33.91452 32.77320 33.08290
## 9 10 11 12
## 32.94137 32.73489 31.06453 31.39505
Build the future data frame.
#tidyverse_conflicts()
man_future<- data_future %>%
dplyr:: select(Date) %>%
tibble::add_column(Actual = pred_future)%>%timetk::tk_tbl()
## Warning in tk_tbl.data.frame(.): Warning: No index to preserve. Object
## otherwise converted to tibble successfully.
man_future%>%knitr::kable()
| Date | Actual |
|---|---|
| 2017-09-25 | 30.32402 |
| 2017-10-02 | 34.12196 |
| 2017-10-09 | 34.46041 |
| 2017-10-16 | 34.73055 |
| 2017-10-23 | 34.29086 |
| 2017-10-30 | 33.91452 |
| 2017-11-06 | 32.77320 |
| 2017-11-13 | 33.08290 |
| 2017-11-20 | 32.94137 |
| 2017-11-27 | 32.73489 |
| 2017-12-04 | 31.06453 |
| 2017-12-11 | 31.39505 |
man_future%>%dplyr::rename(Forecast=Actual)%>%knitr::kable()
| Date | Forecast |
|---|---|
| 2017-09-25 | 30.32402 |
| 2017-10-02 | 34.12196 |
| 2017-10-09 | 34.46041 |
| 2017-10-16 | 34.73055 |
| 2017-10-23 | 34.29086 |
| 2017-10-30 | 33.91452 |
| 2017-11-06 | 32.77320 |
| 2017-11-13 | 33.08290 |
| 2017-11-20 | 32.94137 |
| 2017-11-27 | 32.73489 |
| 2017-12-04 | 31.06453 |
| 2017-12-11 | 31.39505 |
Visualize the forecast.
modeldat%>%ggplot(aes(x = Date,y=Actual)) +
geom_rect(xmin = as.numeric(ymd("2017-03-27")),
xmax = as.numeric(ymd("2017-08-14")),
ymin = 20, ymax = 50,
fill = palette_light()[[3]], alpha = 0.01) +
geom_rect(xmin = as.numeric(ymd("2017-08-14")),
xmax = as.numeric(ymd("2017-12-31")),
ymin = 20, ymax = 50,
fill = "#64A8D3", alpha = 0.01)+
annotate("text", x = ymd("2016-03-17"), y = 45,
color = palette_light()[[1]], label = "Train Region") +
annotate("text", x = ymd("2017-06-01"), y = 45,
color = palette_light()[[1]], label = "Test Reg") +
annotate("text", x = ymd("2017-10-01"), y = 45,
color = palette_light()[[1]], label = "Forecast ")+
geom_point(alpha = 0.5, color = palette_light()[[1]])+
geom_point(aes(x = Date, y = Actual), data = man_future,
alpha = 0.5, color = palette_light()[[2]])+
geom_smooth(aes(x = Date, y = Actual), data = man_future,
method = 'loess')
A forecast is never perfect. We need prediction intervals to account for the variance between the model predictions and the actual data. There are a number of methods to achieve this. We’ll follow the prediction interval methodology from Forecasting: Principles and Practice.
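The 1.28 and 1.96 multipliers used below are the standard normal quantiles for 80% and 95% coverage, under the assumption that the test-set residuals are roughly normal; they can be recovered with qnorm():

# normal quantiles behind the 80% and 95% prediction intervals
qnorm(c(0.90, 0.975))  # returns approximately 1.28 and 1.96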
# Calculate standard deviation of residuals
test_resid_sd <- sd(test_residuals)
man_future <- man_future %>%
mutate(
lo.95 = Actual - 1.96 * test_resid_sd,
lo.80 = Actual - 1.28 * test_resid_sd,
hi.80 = Actual + 1.28 * test_resid_sd,
hi.95 = Actual + 1.96 * test_resid_sd
)
man_future%>%head()%>%knitr::kable()
| Date | Actual | lo.95 | lo.80 | hi.80 | hi.95 |
|---|---|---|---|---|---|
| 2017-09-25 | 30.32402 | 24.73723 | 26.67550 | 33.97253 | 35.91080 |
| 2017-10-02 | 34.12196 | 28.53518 | 30.47345 | 37.77048 | 39.70875 |
| 2017-10-09 | 34.46041 | 28.87362 | 30.81189 | 38.10892 | 40.04719 |
| 2017-10-16 | 34.73055 | 29.14376 | 31.08204 | 38.37906 | 40.31733 |
| 2017-10-23 | 34.29086 | 28.70407 | 30.64234 | 37.93937 | 39.87764 |
| 2017-10-30 | 33.91452 | 28.32774 | 30.26601 | 37.56304 | 39.50131 |
Now plot the forecast with the prediction intervals.
modeldat %>%
ggplot(aes(x = Date, y = Actual)) +
geom_point(alpha = 0.5, color = palette_light()[[1]]) +
geom_ribbon(aes(ymin = lo.95, ymax = hi.95), data = man_future,
fill = "#D5DBFF", color = NA, size = 0) +
geom_ribbon(aes(ymin = lo.80, ymax = hi.80), data = man_future,
fill = "#596DD5", color = NA, size = 0, alpha = 0.8) +
geom_point(aes(x = Date, y = Actual), data = man_future,
alpha = 0.5, color = palette_light()[[2]]) +
geom_smooth(aes(x = Date, y = Actual), data = man_future,
method = 'loess', color = "white") +
labs(title = " 3-Month Forecast of Drivers needed with Prediction Intervals", x = "",subtitle="80% and 95% confidence bands") +
theme_tq()
# Calculate standard deviation of residuals
test_resid_sd <- sd(test_residuals)
man_future <- man_future %>%
mutate(
lo.95 = Actual - 1.96 * test_resid_sd,
lo.99 = Actual - 2.58 * test_resid_sd,
hi.99 = Actual + 2.58 * test_resid_sd,
hi.95 = Actual + 1.96 * test_resid_sd
)
modeldat %>%
ggplot(aes(x = Date, y = Actual)) +
geom_point(alpha = 0.5, color = palette_light()[[1]]) +
geom_ribbon(aes(ymin = lo.99, ymax = hi.99), data = man_future,
fill = "#D5DBFF", color = NA, size = 0) +
geom_ribbon(aes(ymin = lo.95, ymax = hi.95), data = man_future,
fill = "#596DD5", color = NA, size = 0, alpha = 0.8) +
geom_point(aes(x = Date, y = Actual), data = man_future,
alpha = 0.5, color = palette_light()[[2]]) +
geom_smooth(aes(x = Date, y = Actual), data = man_future,
method = 'loess', color = "white") +
labs(title = " 3-Month Forecast of Drivers needed with Prediction Intervals", x = "",subtitle="90% and 95% confidence bands") +
theme_tq()
sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] splines stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggfortify_0.4.1 RColorBrewer_1.1-2
## [3] bindrcpp_0.2 gam_1.14-4
## [5] foreach_1.4.3 broom_0.4.2
## [7] readxl_1.0.0.9000 scales_0.5.0.9000
## [9] sweep_0.2.0 DT_0.2
## [11] tseries_0.10-42 stringr_1.2.0
## [13] plotly_4.7.1 timetk_0.1.0
## [15] modelr_0.1.1 timekit_0.3.0
## [17] tidyquant_0.5.2 quantmod_0.4-10
## [19] TTR_0.23-2 PerformanceAnalytics_1.4.3541
## [21] xts_0.10-0 zoo_1.8-0
## [23] lubridate_1.6.0 dplyr_0.7.2
## [25] purrr_0.2.3 readr_1.1.1
## [27] tidyr_0.7.1 tibble_1.3.4
## [29] ggplot2_2.2.1.9000 tidyverse_1.1.1
##
## loaded via a namespace (and not attached):
## [1] httr_1.3.1 jsonlite_1.5 viridisLite_0.2.0
## [4] assertthat_0.2.0 highr_0.6 cellranger_1.1.0
## [7] yaml_2.1.14 backports_1.1.0 lattice_0.20-35
## [10] glue_1.1.1 quadprog_1.5-5 digest_0.6.12
## [13] rvest_0.3.2 colorspace_1.3-2 Matrix_1.2-10
## [16] htmltools_0.3.6 plyr_1.8.4 psych_1.7.5
## [19] pkgconfig_2.0.1 haven_1.1.0 mgcv_1.8-17
## [22] lazyeval_0.2.0 mnormt_1.5-5 magrittr_1.5
## [25] evaluate_0.10.1 nlme_3.1-131 forcats_0.2.0
## [28] xml2_1.1.1 foreign_0.8-69 tools_3.4.1
## [31] data.table_1.10.4 hms_0.3 munsell_0.4.3
## [34] compiler_3.4.1 rlang_0.1.2 grid_3.4.1
## [37] iterators_1.0.8 htmlwidgets_0.9 labeling_0.3
## [40] rmarkdown_1.6 gtable_0.2.0 codetools_0.2-15
## [43] DBI_0.7 curl_2.8.1 reshape2_1.4.2
## [46] R6_2.2.2 gridExtra_2.2.1 knitr_1.17
## [49] bindr_0.1 rprojroot_1.2 Quandl_2.8.0
## [52] stringi_1.1.5 parallel_3.4.1 Rcpp_0.12.12