Full Article can be accessed in medium with the same title

#Piping and utility
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#Work with time data type
library(lubridate) 
## Warning: package 'lubridate' was built under R version 4.0.5
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(xts)
## Warning: package 'xts' was built under R version 4.0.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.5
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
 #Plotting Time Series
library(timetk)
library(ggplot2)
library(gghighlight)
#Moving Average
library(zoo)
#Forecasting
library(forecast)
## Warning: package 'forecast' was built under R version 4.0.5
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tseries)
## Warning: package 'tseries' was built under R version 4.0.5
#Test Coef
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.0.5
#XGBOOST
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.0.5
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
traffic <- read.csv("~/tugas/Disjas/traffic prediction/traffic.csv")
head(traffic)
##              DateTime Junction Vehicles          ID
## 1 2015-11-01 00:00:00        1       15 20151101001
## 2 2015-11-01 01:00:00        1       13 20151101011
## 3 2015-11-01 02:00:00        1       10 20151101021
## 4 2015-11-01 03:00:00        1        7 20151101031
## 5 2015-11-01 04:00:00        1        9 20151101041
## 6 2015-11-01 05:00:00        1        6 20151101051

Preprocessing

traffic <- traffic[,-4]
traffic$DateTime <- as_datetime(traffic$DateTime)
traffic$Junction <- as.factor(traffic$Junction)

Exploratory Data Analysis

traffic %>%
  group_by(Junction) %>%
  plot_time_series(DateTime, Vehicles,      
                   .color_var = year(DateTime),
                   # Facet formatting
                   .facet_ncol = 2, 
                   .facet_scales = "fixed",
                   .interactive = F,
                   .smooth = F)

ggplot(traffic, aes(x = Vehicles)) + 
  geom_boxplot()+
  facet_grid(. ~ Junction)+
  theme_minimal()+ coord_flip()

ggplot(traffic, aes(x = Vehicles)) + 
  geom_histogram()+
  facet_grid(. ~ Junction)+
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Make Smooth

smoothing <- function(x) {
   dt <- smooth_vec(x, span = 0.75, degree = 2)
   dt <- data.frame(dt)
   colnames(dt)<-"Smooth"
   return(dt)
}
Junc1 <- traffic %>% filter(Junction == 1) %>% select(DateTime,Vehicles)
Junc2 <- traffic %>% filter(Junction == 2) %>% select(DateTime,Vehicles)
Junc3 <- traffic %>% filter(Junction == 3) %>% select(DateTime,Vehicles)
Junc4 <- traffic %>% filter(Junction == 4) %>% select(DateTime,Vehicles)

Junc1S <- smoothing(Junc1$Vehicles)
Junc2S <- smoothing(Junc2$Vehicles)
Junc3S <- smoothing(Junc3$Vehicles)
Junc4S <- smoothing(Junc4$Vehicles)

traffic.S <- rbind(Junc1S,Junc2S,Junc3S,Junc4S)
traffic.S <- cbind(traffic,traffic.S)
ggplot(traffic.S, aes(x = DateTime, y = Smooth)) + 
  geom_line(aes(color = Junction), size = 1) +
  scale_color_manual(values = c("#03045e", "gray","gray","gray")) +
  theme_minimal()

Junc1.mod <- Junc1 %>%
    dplyr::mutate(.,hours =  hour(DateTime),
                  days =  day(DateTime),
                  months = month(DateTime),
                  years =  year(DateTime))
#avg line
ggplot(data=Junc1.mod, aes(x = hours, y = Vehicles)) + 
  geom_bar(stat = "identity",show.legend = FALSE) +
  theme_minimal() + 
  gghighlight(hours == '19', use_direct_label = F) +
  geom_hline(yintercept = nrow(dt)/24,color="cyan3")

Data Split

time_index <- seq(from = as.POSIXct("2015-11-01 00:00:00"), 
                  to = as.POSIXct("2017-06-30 23:00:00"), by = "hour")
Junc1.ts <- ts(Junc1$Vehicles, frequency = 24*30*3 ,start = time_index)

trIndex <- ceiling(0.8 * nrow(Junc1))
train <- subset(Junc1.ts, end = trIndex)
test <- subset(Junc1.ts, start = trIndex+1)

Time Series Model

train.dec<-decompose(train)
plot(train.dec)

RMSE<- function(fit,obs){
  err <- sqrt(sum((obs-fit)^2/length(obs)))
  return(err)
}
adf.test(train)
## Warning in adf.test(train): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train
## Dickey-Fuller = -7.8824, Lag order = 22, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(train, null="Trend")
## 
##  KPSS Test for Trend Stationarity
## 
## data:  train
## KPSS Trend = 0.18255, Truncation lag parameter = 13, p-value = 0.02254
ndiffs(train)
## [1] 1
auto.arima(train,trace = T,seasonal = F)
## 
##  Fitting models using approximations to speed things up...
## 
##  ARIMA(2,1,2)             with drift         : Inf
##  ARIMA(0,1,0)             with drift         : 73797.75
##  ARIMA(1,1,0)             with drift         : 73402.92
##  ARIMA(0,1,1)             with drift         : 73472.85
##  ARIMA(0,1,0)                                : 73795.75
##  ARIMA(2,1,0)             with drift         : 73280.01
##  ARIMA(3,1,0)             with drift         : 73238.81
##  ARIMA(4,1,0)             with drift         : 73224.99
##  ARIMA(5,1,0)             with drift         : 73116.63
##  ARIMA(5,1,1)             with drift         : 71805.52
##  ARIMA(4,1,1)             with drift         : 73216.27
##  ARIMA(5,1,2)             with drift         : 71783.98
##  ARIMA(4,1,2)             with drift         : Inf
##  ARIMA(5,1,3)             with drift         : 71668.94
##  ARIMA(4,1,3)             with drift         : 72918.87
##  ARIMA(5,1,4)             with drift         : Inf
##  ARIMA(4,1,4)             with drift         : Inf
##  ARIMA(5,1,3)                                : Inf
## 
##  Now re-fitting the best model(s) without approximations...
## 
##  ARIMA(5,1,3)             with drift         : Inf
##  ARIMA(5,1,2)             with drift         : 71776.85
## 
##  Best model: ARIMA(5,1,2)             with drift
## Series: train 
## ARIMA(5,1,2) with drift 
## 
## Coefficients:
##          ar1      ar2     ar3     ar4      ar5      ma1     ma2   drift
##       1.1886  -0.2284  0.0000  0.0183  -0.1555  -1.1438  0.1876  0.0045
## s.e.  0.0386   0.0417  0.0145  0.0144   0.0120   0.0386  0.0375  0.0120
## 
## sigma^2 = 27.39:  log likelihood = -35879.42
## AIC=71776.83   AICc=71776.85   BIC=71843.12
(arima512 <- arima(train, order = c(5,1,2)))
## 
## Call:
## arima(x = train, order = c(5, 1, 2))
## 
## Coefficients:
##          ar1      ar2     ar3     ar4      ar5      ma1     ma2
##       1.1887  -0.2287  0.0001  0.0183  -0.1555  -1.1440  0.1878
## s.e.  0.0386   0.0417  0.0145  0.0144   0.0120   0.0386  0.0375
## 
## sigma^2 estimated as 27.37:  log likelihood = -35879.49,  aic = 71774.97
coeftest(arima512)
## 
## z test of coefficients:
## 
##        Estimate  Std. Error  z value  Pr(>|z|)    
## ar1  1.1887e+00  3.8609e-02  30.7895 < 2.2e-16 ***
## ar2 -2.2870e-01  4.1712e-02  -5.4829 4.183e-08 ***
## ar3  8.6268e-05  1.4537e-02   0.0059    0.9953    
## ar4  1.8262e-02  1.4396e-02   1.2686    0.2046    
## ar5 -1.5553e-01  1.2009e-02 -12.9506 < 2.2e-16 ***
## ma1 -1.1440e+00  3.8581e-02 -29.6519 < 2.2e-16 ***
## ma2  1.8781e-01  3.7492e-02   5.0092 5.465e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(arima212 <- arima(train, order = c(2,1,2)))
## 
## Call:
## arima(x = train, order = c(2, 1, 2))
## 
## Coefficients:
##          ar1     ar2      ma1     ma2
##       1.7286  -0.996  -1.7030  0.9872
## s.e.  0.0010   0.001   0.0025  0.0014
## 
## sigma^2 estimated as 27.87:  log likelihood = -35985.68,  aic = 71981.35
coeftest(arima212)
## 
## z test of coefficients:
## 
##       Estimate Std. Error z value  Pr(>|z|)    
## ar1  1.7285513  0.0010321 1674.81 < 2.2e-16 ***
## ar2 -0.9960255  0.0010449 -953.22 < 2.2e-16 ***
## ma1 -1.7030208  0.0024522 -694.50 < 2.2e-16 ***
## ma2  0.9871818  0.0014095  700.40 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Box.test(arima512$residuals,type = "Ljung")
## 
##  Box-Ljung test
## 
## data:  arima512$residuals
## X-squared = 0.060102, df = 1, p-value = 0.8063
Box.test(arima212$residuals,type = "Ljung") #arima212 might be worse than arima512
## 
##  Box-Ljung test
## 
## data:  arima212$residuals
## X-squared = 20.725, df = 1, p-value = 5.301e-06

XGBOOST Model

train.xgb <- Junc1.mod[1:trIndex,]
test.xgb <- Junc1.mod[-(1:trIndex),]
train_Dmatrix <- train.xgb %>%
                 dplyr::select(hours,days,months,years) %>%
                 as.matrix() %>% 
                 xgb.DMatrix()

test_Dmatrix <- test.xgb %>%
                dplyr::select(hours,days,months,years) %>%
                as.matrix() %>% 
                xgb.DMatrix()

target <- train.xgb$Vehicles
xgb_grid <-expand.grid(
   list(
    nrounds = 100,
    max_depth = 10, # maximum depth of a tree
    colsample_bytree = seq(0.5), # subsample ratio of columns when construction each tree
    eta = 0.1, # learning rate
    gamma = 0, # minimum loss reduction
    min_child_weight = 1,  # minimum sum of instance weight (hessian) needed ina child
    subsample = 1 # subsample ratio of the training instances
))

xgb_model <- train(
   train_Dmatrix, target,
   tuneGrid = xgb_grid,
   method = "xgbTree",
   nthread = 1
)
## Warning in train.default(train_Dmatrix, target, tuneGrid = xgb_grid, method =
## "xgbTree", : The training data could not be converted to a data frame for saving
actual <- test
forecas.arima212 <- forecast(arima212, h=length(test)) 
autoplot(forecas.arima212) + autolayer(actual)  + autolayer(forecas.arima212$mean)

forecast.arima212.rmse.train <- sqrt(sum(arima212$residuals^2/length(train)))
forecast.arima212.rmse.test <- RMSE(fit = forecas.arima212$mean, obs = test)
forecas.arima512 <- forecast(arima512, h=length(test)) 
autoplot(forecas.arima512) + autolayer(actual) + autolayer(forecas.arima512$mean)

(forecast.arima512.rmse.train <- sqrt(sum(arima512$residuals^2/length(train))))
## [1] 5.231368
(forecast.arima512.rmse.test <- RMSE(fit = forecas.arima512$mean, obs = test))
## [1] 23.60246
xgb_pred <- xgb_model %>% predict(test_Dmatrix)
xgb_pred.rmse.test <- RMSE(xgb_pred,test.xgb$Vehicles)
pred.xgb.dt <- data.frame(DateTime=test.xgb$DateTime,Vehicles=xgb_pred,label="pred")
train.xgb.dt <- data.frame(DateTime=train.xgb$DateTime,Vehicles=train.xgb$Vehicles,label="train")
test.xgb.dt <- data.frame(DateTime=test.xgb$DateTime,Vehicles=test.xgb$Vehicles,label="actual")

plot.xgb <- rbind(train.xgb.dt,test.xgb.dt,pred.xgb.dt)
ggplot(plot.xgb, aes(x = DateTime, y = Vehicles)) + 
  geom_line(aes(color = label), size = 1) +
  scale_color_manual(values = c("tomato", "turquoise", "gray")) +
  theme_minimal()+
  ylim(-100, 200)

RMSE.DT <- data.frame(Method=c("Arima 212","Arima 512","XGBOOST"),
                      RMSE.TRAIN=c(forecast.arima212.rmse.train,forecast.arima512.rmse.train,xgb_model$results$RMSE),RMSE.TEST=c(forecast.arima212.rmse.test,forecast.arima512.rmse.test,xgb_pred.rmse.test))
RMSE.DT
##      Method RMSE.TRAIN RMSE.TEST
## 1 Arima 212   5.278592  24.97503
## 2 Arima 512   5.231368  23.60246
## 3   XGBOOST   5.226656  20.20291
train.full_Dmatrix <- Junc1.mod %>%
                dplyr::select(hours,days,months,years) %>%
                as.matrix() %>% 
                xgb.DMatrix()

xgb_model_pred <- train(
   train.full_Dmatrix, Junc1.mod$Vehicles,
   tuneGrid = xgb_grid,
   method = "xgbTree",
   nthread = 1
)
## Warning in train.default(train.full_Dmatrix, Junc1.mod$Vehicles, tuneGrid =
## xgb_grid, : The training data could not be converted to a data frame for saving
n.forecast <- 24*30*4
DateTime.na <- seq(ymd_hms("2017-07-01 00:00:00 "), by = "hour", length.out = n.forecast)
Vehicles.na <- rep(NA, n.forecast)
DT.na <- data.frame(DateTime=DateTime.na,Vehicles=Vehicles.na)

DT.na.mod <- DT.na %>%
    dplyr::mutate(.,hours =  hour(DateTime),
                  days =  day(DateTime),
                  months = month(DateTime),
                  years =  year(DateTime))

pred.n_Dmatrix <- DT.na.mod %>%
                dplyr::select(hours,days,months,years) %>%
                as.matrix() %>% 
                xgb.DMatrix()

xgb.full_pred <- xgb_model_pred %>% predict(pred.n_Dmatrix)
train.full <- data.frame(DateTime=Junc1.mod$DateTime,Vehicles=Junc1.mod$Vehicles,label="train")
pred.n <- data.frame(DateTime=DT.na.mod$DateTime,Vehicles=xgb.full_pred,label="pred")

plot.n <- rbind(train.full,pred.n)

ggplot(plot.n, aes(x = DateTime, y = Vehicles)) + 
  geom_line(aes(color = label), size = 1) +
  scale_color_manual(values = c("turquoise", "gray")) +
  theme_minimal()+
  ylim(-100, 200)