Load packages and data

setwd("C:/Users/DellPC/Desktop/R_source_code/acb")


suppressMessages(library(tidyverse))
suppressMessages(library(fpp2))
suppressMessages(library(lubridate))
suppressMessages(library(plotly))
suppressMessages(library(ggsci))
suppressMessages(library(RColorBrewer))
suppressMessages(library(patchwork))
suppressMessages(library(psych))

df <- read.csv("acb_first_task.csv")%>% 
               mutate(date = as.Date(paste(as.character(YYYY_MM),'01'), format = "%Y%m%d")) %>%                   mutate(month = month(date), year = year(date)) %>% 
               mutate(year = as.factor(year), month = as.factor(month)) %>%
               arrange(year, month)

Descriptive data

describe(df$VALUES)
##    vars  n     mean       sd  median  trimmed      mad      min      max
## X1    1 78 49517.73 12428.84 46399.6 48588.66 14978.64 35151.18 74318.62
##       range skew kurtosis      se
## X1 39167.44 0.49    -1.16 1407.29

Histogram plot

gg0 <- df %>% ggplot(aes(x= VALUES, y=..density..)) +
               geom_histogram(fill = 'steelblue',color = 'white') +
               geom_density(color = 'red') +
               theme_light()

ggplotly(gg0)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Explore data via visualization

Check auto-correlation

ts <- ts(df$VALUES, frequency =12, start =c(2014,01))

ggAcf(ts) + theme_light() 

ggPacf(ts) + theme_light()

From 2 plots above, we can conclude that there is no seasonal pattern in this time series

Seasonal plot

ggseasonplot(ts, polar=TRUE) +
ylab("Values") +
ggtitle("Polar seasonal plot") + theme_light()

Mean through each month in each year

gg1<- df %>% ggplot(aes(x= month, y = VALUES)) + 
               geom_bar(stat = 'identity', aes(fill = factor(month)))   + 
               scale_fill_manual(values = c(brewer.pal(12,'Paired'))) +
               theme_light() +coord_flip()+ 
               facet_wrap(~year) + theme_light()

ggplotly(gg1)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

Mean through each month

Simple plot

ggsubseriesplot(ts)+theme_light() + theme_light()

Another plot

foo1<-  df %>% mutate(month = as.factor(month))%>%
               group_by(month,year) %>%
               summarise(avg = mean(VALUES)) 
## `summarise()` regrouping output by 'month' (override with `.groups` argument)
gg1<- foo1%>% ggplot(aes(x= month, y = avg, label = avg)) + 
               geom_bar(stat = 'identity', aes(fill = factor(month)))   + 
               scale_fill_manual(values = c(brewer.pal(12,'Paired'))) + 
               theme_light()  + 
               coord_flip()  + 
               geom_label()  + theme_light()

ggplotly(gg1)
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabel() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues

Mean through each year

foo<- df  %>% group_by( year) %>% summarise(avg = sum(VALUES)) 
## `summarise()` ungrouping output (override with `.groups` argument)
gg<- foo%>% ggplot(aes(x= fct_reorder(year, desc(avg)), y = avg, label = avg)) +geom_bar(stat = 'identity', aes(fill = factor(year)))   +scale_fill_aaas() +theme_light() +geom_label()

ggplotly(gg)
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabel() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues

Time series

train <- df %>% filter(year != 2020)
test <- df %>% filter(year == 2020)


ts <- ts(train$VALUES, frequency = 12, start= c(2014,01))

autoplot(ts) + theme_light()

ETS- Exponential Smoothing

fit_ets <- ets(ts)

summary(fit_ets)
## ETS(M,A,N) 
## 
## Call:
##  ets(y = ts) 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
##     beta  = 0.1125 
## 
##   Initial states:
##     l = 35444.6308 
##     b = 110.8762 
## 
##   sigma:  0.0121
## 
##      AIC     AICc      BIC 
## 1225.411 1226.320 1236.794 
## 
## Training set error measures:
##                    ME     RMSE      MAE      MPE      MAPE       MASE      ACF1
## Training set 82.52843 549.0264 451.8519 0.172323 0.9648735 0.07548958 0.2569702
#--> Multiple error, additive trend and no season

Check residual for Exponential smoothing model

checkresiduals(fit_ets) + theme_light()

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,A,N)
## Q* = 21.626, df = 10, p-value = 0.01713
## 
## Model df: 4.   Total lags used: 14
## NULL

MSE for ETS model

forecast_point<- forecast(fit_ets, h=6) %>% as.data.frame() %>% transmute(point = `Point Forecast`) 

mean((test$VALUES-forecast_point$point)^2)
## [1] 145150.5

Visualize final result

fc <- forecast(fit_ets, h=9) %>% 
               as.data.frame() %>% 
               mutate(point_forecast = `Point Forecast`)  %>% 
               head(6) %>% 
               mutate(date = test$date,
                       L80 = `Lo 80`,
                       H80 = `Hi 80`,
                       L95 = `Lo 95`,
                       H95  = `Hi 95` )


test <- test %>% mutate(point_forecast = VALUES)

test 
##   YYYY_MM   VALUES       date month year point_forecast
## 1  202001 71141.29 2020-01-01     1 2020       71141.29
## 2  202002 71075.09 2020-02-01     2 2020       71075.09
## 3  202003 72171.24 2020-03-01     3 2020       72171.24
## 4  202004 72947.07 2020-04-01     4 2020       72947.07
## 5  202005 73268.10 2020-05-01     5 2020       73268.10
## 6  202006 74318.62 2020-06-01     6 2020       74318.62
fitted_data_frame <- fitted(fit_ets) %>% as.data.frame() %>% mutate(VALUES = x, date = train$date)


gg1<- ggplot(data = fc, aes (x = date))+
               geom_ribbon(aes(ymin = L80, ymax = H80), alpha = 0.2, fill = 'blue') +
               geom_ribbon(aes(ymin = L95, ymax = H95), alpha = 0.2, fill = 'red') +
               geom_line(data = train, aes(x = date, y = VALUES),
                         color = 'red', lwd =1)  +
               geom_line(data = test,
                         aes(x= date, y = point_forecast), 
                         color = 'blue', lwd=1) + 
               geom_line(aes( y = point_forecast), 
                         color = 'green', lwd = 1)+
               theme_light() +
               geom_line(data = fitted_data_frame, aes(x= date, y=VALUES),
                         color ='darkorchid', lwd =1 ) + labs(title = 'ETS')
               
               
ggplotly(gg1)

ARIMA

Test stationary

Compare the p-value with 1-pct critical value

library(urca)

diff(diff(ts)) %>% ur.kpss() %>% summary()
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 3 lags. 
## 
## Value of test-statistic is: 0.0566 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

We need 2-difference times to make data stationary

Fit and choose parameter for ARIMA model

fit_arima<- auto.arima(ts, seasonal = FALSE)

summary(fit_arima)
## Series: ts 
## ARIMA(1,2,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.3427  -0.9296
## s.e.  0.1267   0.0428
## 
## sigma^2 estimated as 288695:  log likelihood=-539.04
## AIC=1084.09   AICc=1084.45   BIC=1090.83
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE      MAPE       MASE
## Training set 77.35829 522.1652 426.6649 0.1519535 0.9245955 0.07128166
##                      ACF1
## Training set -0.006406899

Forecast for the next 6 -months

forecast(fit_arima, h =6) %>% autoplot()  + theme_light()

Check residual for ARIMA model

checkresiduals(fit_arima) + theme_light()

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,2,1)
## Q* = 16.736, df = 12, p-value = 0.1598
## 
## Model df: 2.   Total lags used: 14
## NULL

MSE for ARIMA model

forecast_point<- forecast(fit_arima, h=6) %>% as.data.frame() %>% transmute(point = `Point Forecast`) 

mean((test$VALUES-forecast_point$point)^2)
## [1] 574491

Forecast for the next 9 months (including July, Auguest and October)

forecast(fit_arima, h=9) %>% as.data.frame() %>% transmute(point = `Point Forecast`)  %>% tail(3)
##      point
## 7 75896.90
## 8 76667.34
## 9 77437.66

Visualize final result

fc <- forecast(fit_arima, h=9) %>% 
               as.data.frame() %>% 
               mutate(point_forecast = `Point Forecast`)  %>% 
               head(6) %>% 
               mutate(date = test$date,
                       L80 = `Lo 80`,
                       H80 = `Hi 80`,
                       L95 = `Lo 95`,
                       H95  = `Hi 95` )


test <- test %>% mutate(point_forecast = VALUES)

test 
##   YYYY_MM   VALUES       date month year point_forecast
## 1  202001 71141.29 2020-01-01     1 2020       71141.29
## 2  202002 71075.09 2020-02-01     2 2020       71075.09
## 3  202003 72171.24 2020-03-01     3 2020       72171.24
## 4  202004 72947.07 2020-04-01     4 2020       72947.07
## 5  202005 73268.10 2020-05-01     5 2020       73268.10
## 6  202006 74318.62 2020-06-01     6 2020       74318.62
fitted_data_frame <- fitted(fit_arima) %>% as.data.frame() %>% mutate(VALUES = x, date = train$date)


gg<- ggplot(data = fc, aes (x = date))+
               geom_ribbon(aes(ymin = L80, ymax = H80), alpha = 0.2, fill = 'blue') +
               geom_ribbon(aes(ymin = L95, ymax = H95), alpha = 0.2, fill = 'red') +
               geom_line(data = train, aes(x = date, y = VALUES),
                         color = 'red', lwd =1)  +
               geom_line(data = test,
                         aes(x= date, y = point_forecast), 
                         color = 'blue', lwd=1) + 
               geom_line(aes( y = point_forecast), 
                         color = 'green', lwd = 1)+
               geom_line(data = fitted_data_frame, aes(x= date, y=VALUES),
                         color ='darkorchid', lwd =1 ) +
               theme_light()+ labs(title = 'Arima')
               
ggplotly(gg)

Update (comparing model and trainning set)

layout <- "AABB"
gg1 + gg + plot_layout(design = layout)