The full article can be accessed on Medium under the same title.
# Piping and utility
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Working with date-time data
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.0.5
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(xts)
## Warning: package 'xts' was built under R version 4.0.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.5
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
# Plotting time series
library(timetk)
library(ggplot2)
library(gghighlight)
# Moving average
library(zoo)
# Forecasting
library(forecast)
## Warning: package 'forecast' was built under R version 4.0.5
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(tseries)
## Warning: package 'tseries' was built under R version 4.0.5
# Coefficient significance tests
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.0.5
# XGBoost
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.0.5
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
traffic <- read.csv("~/tugas/Disjas/traffic prediction/traffic.csv")
head(traffic)
## DateTime Junction Vehicles ID
## 1 2015-11-01 00:00:00 1 15 20151101001
## 2 2015-11-01 01:00:00 1 13 20151101011
## 3 2015-11-01 02:00:00 1 10 20151101021
## 4 2015-11-01 03:00:00 1 7 20151101031
## 5 2015-11-01 04:00:00 1 9 20151101041
## 6 2015-11-01 05:00:00 1 6 20151101051
Preprocessing
traffic <- traffic[, -4]  # drop the ID column
traffic$DateTime <- as_datetime(traffic$DateTime)  # parse strings to POSIXct
traffic$Junction <- as.factor(traffic$Junction)
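A quick structure check (a small sanity-check sketch, not part of the original pipeline) confirms the conversions and that no timestamps failed to parse:
str(traffic)             # DateTime should now be POSIXct, Junction a factor
anyNA(traffic$DateTime)  # TRUE would indicate timestamps that failed to parse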
Exploratory Data Analysis
traffic %>%
  group_by(Junction) %>%
  plot_time_series(DateTime, Vehicles,
                   .color_var = year(DateTime),
                   # Facet formatting
                   .facet_ncol = 2,
                   .facet_scales = "fixed",
                   .interactive = F,
                   .smooth = F)
ggplot(traffic, aes(x = Vehicles)) +
  geom_boxplot() +
  facet_grid(. ~ Junction) +
  theme_minimal() +
  coord_flip()
ggplot(traffic, aes(x = Vehicles)) +
  geom_histogram() +
  facet_grid(. ~ Junction) +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
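The message above suggests choosing a binwidth explicitly; a minimal sketch (the value 5 is an assumption, tune to taste):
ggplot(traffic, aes(x = Vehicles)) +
  geom_histogram(binwidth = 5) +
  facet_grid(. ~ Junction) +
  theme_minimal()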
Smoothing the Series
smoothing <- function(x) {
  # LOESS smoothing (timetk::smooth_vec), returned as a one-column data frame
  dt <- smooth_vec(x, span = 0.75, degree = 2)
  dt <- data.frame(dt)
  colnames(dt) <- "Smooth"
  return(dt)
}
Junc1 <- traffic %>% filter(Junction == 1) %>% select(DateTime, Vehicles)
Junc2 <- traffic %>% filter(Junction == 2) %>% select(DateTime, Vehicles)
Junc3 <- traffic %>% filter(Junction == 3) %>% select(DateTime, Vehicles)
Junc4 <- traffic %>% filter(Junction == 4) %>% select(DateTime, Vehicles)
Junc1S <- smoothing(Junc1$Vehicles)
Junc2S <- smoothing(Junc2$Vehicles)
Junc3S <- smoothing(Junc3$Vehicles)
Junc4S <- smoothing(Junc4$Vehicles)
traffic.S <- rbind(Junc1S, Junc2S, Junc3S, Junc4S)  # assumes traffic rows are sorted by Junction, matching this rbind order
traffic.S <- cbind(traffic, traffic.S)
ggplot(traffic.S, aes(x = DateTime, y = Smooth)) +
  geom_line(aes(color = Junction), size = 1) +
  scale_color_manual(values = c("#03045e", "gray", "gray", "gray")) +
  theme_minimal()
Junc1.mod <- Junc1 %>%
  dplyr::mutate(hours = hour(DateTime),
                days = day(DateTime),
                months = month(DateTime),
                years = year(DateTime))
# Total vehicles per hour of day, highlighting 19:00, with a line at the average bar height
ggplot(data = Junc1.mod, aes(x = hours, y = Vehicles)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  theme_minimal() +
  gghighlight(hours == 19, use_direct_label = F) +
  geom_hline(yintercept = sum(Junc1.mod$Vehicles)/24, color = "cyan3")  # average bar height across the 24 hourly bins
Data Split
time_index <- seq(from = as.POSIXct("2015-11-01 00:00:00"),
                  to = as.POSIXct("2017-06-30 23:00:00"), by = "hour")
Junc1.ts <- ts(Junc1$Vehicles, frequency = 24*30*3, start = time_index)  # hourly series; seasonal period 2160 hours (~3 months)
trIndex <- ceiling(0.8 * nrow(Junc1))  # 80/20 chronological split
train <- subset(Junc1.ts, end = trIndex)
test <- subset(Junc1.ts, start = trIndex + 1)
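Hourly traffic usually shows daily (24) and weekly (24*7) seasonality as well; a sketch of an alternative representation with forecast::msts (an assumption, not used downstream):
Junc1.msts <- msts(Junc1$Vehicles, seasonal.periods = c(24, 24*7))
train.msts <- subset(Junc1.msts, end = trIndex)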
Time Series Model
train.dec <- decompose(train)
plot(train.dec)
RMSE <- function(fit, obs) {
  sqrt(mean((obs - fit)^2))  # root mean squared error
}
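A quick hand-checkable example of the helper (illustrative numbers only):
RMSE(fit = c(1, 2, 3), obs = c(1, 2, 5))  # sqrt(4/3), approximately 1.1547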
adf.test(train)
## Warning in adf.test(train): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: train
## Dickey-Fuller = -7.8824, Lag order = 22, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(train, null="Trend")
##
## KPSS Test for Trend Stationarity
##
## data: train
## KPSS Trend = 0.18255, Truncation lag parameter = 13, p-value = 0.02254
ndiffs(train)
## [1] 1
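The ADF test rejects a unit root while the KPSS test rejects trend stationarity, so the two tests disagree, and ndiffs() (KPSS-based by default) suggests a single difference. A small sketch (an addition, not in the original run) to re-check the differenced series:
adf.test(diff(train))
kpss.test(diff(train))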
auto.arima(train,trace = T,seasonal = F)
##
## Fitting models using approximations to speed things up...
##
## ARIMA(2,1,2) with drift : Inf
## ARIMA(0,1,0) with drift : 73797.75
## ARIMA(1,1,0) with drift : 73402.92
## ARIMA(0,1,1) with drift : 73472.85
## ARIMA(0,1,0) : 73795.75
## ARIMA(2,1,0) with drift : 73280.01
## ARIMA(3,1,0) with drift : 73238.81
## ARIMA(4,1,0) with drift : 73224.99
## ARIMA(5,1,0) with drift : 73116.63
## ARIMA(5,1,1) with drift : 71805.52
## ARIMA(4,1,1) with drift : 73216.27
## ARIMA(5,1,2) with drift : 71783.98
## ARIMA(4,1,2) with drift : Inf
## ARIMA(5,1,3) with drift : 71668.94
## ARIMA(4,1,3) with drift : 72918.87
## ARIMA(5,1,4) with drift : Inf
## ARIMA(4,1,4) with drift : Inf
## ARIMA(5,1,3) : Inf
##
## Now re-fitting the best model(s) without approximations...
##
## ARIMA(5,1,3) with drift : Inf
## ARIMA(5,1,2) with drift : 71776.85
##
## Best model: ARIMA(5,1,2) with drift
## Series: train
## ARIMA(5,1,2) with drift
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 ma2 drift
## 1.1886 -0.2284 0.0000 0.0183 -0.1555 -1.1438 0.1876 0.0045
## s.e. 0.0386 0.0417 0.0145 0.0144 0.0120 0.0386 0.0375 0.0120
##
## sigma^2 = 27.39: log likelihood = -35879.42
## AIC=71776.83 AICc=71776.85 BIC=71843.12
Refit the selected order with arima(), dropping the drift term (its estimate, 0.0045, is well within one standard error of zero), so the coefficients can be tested:
(arima512 <- arima(train, order = c(5,1,2)))
##
## Call:
## arima(x = train, order = c(5, 1, 2))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 ma2
## 1.1887 -0.2287 0.0001 0.0183 -0.1555 -1.1440 0.1878
## s.e. 0.0386 0.0417 0.0145 0.0144 0.0120 0.0386 0.0375
##
## sigma^2 estimated as 27.37: log likelihood = -35879.49, aic = 71774.97
coeftest(arima512)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 1.1887e+00 3.8609e-02 30.7895 < 2.2e-16 ***
## ar2 -2.2870e-01 4.1712e-02 -5.4829 4.183e-08 ***
## ar3 8.6268e-05 1.4537e-02 0.0059 0.9953
## ar4 1.8262e-02 1.4396e-02 1.2686 0.2046
## ar5 -1.5553e-01 1.2009e-02 -12.9506 < 2.2e-16 ***
## ma1 -1.1440e+00 3.8581e-02 -29.6519 < 2.2e-16 ***
## ma2 1.8781e-01 3.7492e-02 5.0092 5.465e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
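ar3 and ar4 are not significant; one option (a sketch, not pursued below) is to constrain them to zero with arima()'s fixed argument:
arima512r <- arima(train, order = c(5,1,2),
                   fixed = c(NA, NA, 0, 0, NA, NA, NA),  # order: ar1-ar5, ma1, ma2; a 0 pins that coefficient
                   transform.pars = FALSE)               # required when fixing AR terms
coeftest(arima512r)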
For comparison, fit the simpler ARIMA(2,1,2) that the automated search discarded early:
(arima212 <- arima(train, order = c(2,1,2)))
##
## Call:
## arima(x = train, order = c(2, 1, 2))
##
## Coefficients:
## ar1 ar2 ma1 ma2
## 1.7286 -0.996 -1.7030 0.9872
## s.e. 0.0010 0.001 0.0025 0.0014
##
## sigma^2 estimated as 27.87: log likelihood = -35985.68, aic = 71981.35
coeftest(arima212)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 1.7285513 0.0010321 1674.81 < 2.2e-16 ***
## ar2 -0.9960255 0.0010449 -953.22 < 2.2e-16 ***
## ma1 -1.7030208 0.0024522 -694.50 < 2.2e-16 ***
## ma2 0.9871818 0.0014095 700.40 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Box.test(arima512$residuals,type = "Ljung")
##
## Box-Ljung test
##
## data: arima512$residuals
## X-squared = 0.060102, df = 1, p-value = 0.8063
Box.test(arima212$residuals, type = "Ljung")  # unlike arima512, significant residual autocorrelation remains
##
## Box-Ljung test
##
## data: arima212$residuals
## X-squared = 20.725, df = 1, p-value = 5.301e-06
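forecast::checkresiduals() bundles the residual time plot, ACF, and Ljung-Box test into one call; a convenient alternative check (not part of the original output):
checkresiduals(arima512)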
XGBoost Model
train.xgb <- Junc1.mod[1:trIndex, ]
test.xgb <- Junc1.mod[-(1:trIndex), ]
train_Dmatrix <- train.xgb %>%
  dplyr::select(hours, days, months, years) %>%
  as.matrix() %>%
  xgb.DMatrix()
test_Dmatrix <- test.xgb %>%
  dplyr::select(hours, days, months, years) %>%
  as.matrix() %>%
  xgb.DMatrix()
target <- train.xgb$Vehicles
xgb_grid <- expand.grid(
  list(
    nrounds = 100,          # number of boosting rounds
    max_depth = 10,         # maximum depth of a tree
    colsample_bytree = 1,   # subsample ratio of columns when constructing each tree
    eta = 0.1,              # learning rate
    gamma = 0,              # minimum loss reduction
    min_child_weight = 1,   # minimum sum of instance weight (hessian) needed in a child
    subsample = 1           # subsample ratio of the training instances
  ))
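The train() call below uses caret's default bootstrap resampling, which ignores temporal order; a hedged sketch of time-aware resampling (the window sizes are assumptions):
tsControl <- trainControl(method = "timeslice",
                          initialWindow = 24*30*6,  # ~six months of hourly history
                          horizon = 24*7,           # validate one week ahead
                          fixedWindow = TRUE)
# then pass trControl = tsControl to train()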
xgb_model <- train(
  train_Dmatrix, target,
  tuneGrid = xgb_grid,
  method = "xgbTree",
  nthread = 1
)
## Warning in train.default(train_Dmatrix, target, tuneGrid = xgb_grid, method =
## "xgbTree", : The training data could not be converted to a data frame for saving
actual <- test
forecast.arima212 <- forecast(arima212, h = length(test))
autoplot(forecast.arima212) + autolayer(actual) + autolayer(forecast.arima212$mean)
forecast.arima212.rmse.train <- sqrt(sum(arima212$residuals^2/length(train)))  # in-sample RMSE from residuals
forecast.arima212.rmse.test <- RMSE(fit = forecast.arima212$mean, obs = test)
forecast.arima512 <- forecast(arima512, h = length(test))
autoplot(forecast.arima512) + autolayer(actual) + autolayer(forecast.arima512$mean)
(forecast.arima512.rmse.train <- sqrt(sum(arima512$residuals^2/length(train))))
## [1] 5.231368
(forecast.arima512.rmse.test <- RMSE(fit = forecast.arima512$mean, obs = test))
## [1] 23.60246
xgb_pred <- xgb_model %>% predict(test_Dmatrix)
xgb_pred.rmse.test <- RMSE(xgb_pred, test.xgb$Vehicles)
pred.xgb.dt <- data.frame(DateTime = test.xgb$DateTime, Vehicles = xgb_pred, label = "pred")
train.xgb.dt <- data.frame(DateTime = train.xgb$DateTime, Vehicles = train.xgb$Vehicles, label = "train")
test.xgb.dt <- data.frame(DateTime = test.xgb$DateTime, Vehicles = test.xgb$Vehicles, label = "actual")
plot.xgb <- rbind(train.xgb.dt, test.xgb.dt, pred.xgb.dt)
ggplot(plot.xgb, aes(x = DateTime, y = Vehicles)) +
  geom_line(aes(color = label), size = 1) +
  scale_color_manual(values = c("tomato", "turquoise", "gray")) +
  theme_minimal() +
  ylim(-100, 200)
RMSE.DT <- data.frame(Method = c("Arima 212", "Arima 512", "XGBOOST"),
                      RMSE.TRAIN = c(forecast.arima212.rmse.train,
                                     forecast.arima512.rmse.train,
                                     xgb_model$results$RMSE),  # caret's resampled RMSE stands in for XGBoost's train RMSE
                      RMSE.TEST = c(forecast.arima212.rmse.test,
                                    forecast.arima512.rmse.test,
                                    xgb_pred.rmse.test))
RMSE.DT
## Method RMSE.TRAIN RMSE.TEST
## 1 Arima 212 5.278592 24.97503
## 2 Arima 512 5.231368 23.60246
## 3 XGBOOST 5.226656 20.20291
XGBoost achieves the lowest test RMSE of the three models, so refit it on the full Junction 1 series to forecast beyond the observed data.
train.full_Dmatrix <- Junc1.mod %>%
  dplyr::select(hours, days, months, years) %>%
  as.matrix() %>%
  xgb.DMatrix()
xgb_model_pred <- train(
  train.full_Dmatrix, Junc1.mod$Vehicles,
  tuneGrid = xgb_grid,
  method = "xgbTree",
  nthread = 1
)
## Warning in train.default(train.full_Dmatrix, Junc1.mod$Vehicles, tuneGrid =
## xgb_grid, : The training data could not be converted to a data frame for saving
n.forecast <- 24*30*4  # roughly four months of hourly steps
DateTime.na <- seq(ymd_hms("2017-07-01 00:00:00"), by = "hour", length.out = n.forecast)
Vehicles.na <- rep(NA, n.forecast)
DT.na <- data.frame(DateTime = DateTime.na, Vehicles = Vehicles.na)
DT.na.mod <- DT.na %>%
  dplyr::mutate(hours = hour(DateTime),
                days = day(DateTime),
                months = month(DateTime),
                years = year(DateTime))
pred.n_Dmatrix <- DT.na.mod %>%
  dplyr::select(hours, days, months, years) %>%
  as.matrix() %>%
  xgb.DMatrix()
xgb.full_pred <- xgb_model_pred %>% predict(pred.n_Dmatrix)
train.full <- data.frame(DateTime = Junc1.mod$DateTime, Vehicles = Junc1.mod$Vehicles, label = "train")
pred.n <- data.frame(DateTime = DT.na.mod$DateTime, Vehicles = xgb.full_pred, label = "pred")
plot.n <- rbind(train.full, pred.n)
ggplot(plot.n, aes(x = DateTime, y = Vehicles)) +
  geom_line(aes(color = label), size = 1) +
  scale_color_manual(values = c("turquoise", "gray")) +
  theme_minimal() +
  ylim(-100, 200)
Because the model sees only calendar features (hour, day, month, year), this forecast repeats the learned seasonal pattern and cannot anticipate trend changes outside the training period.