setwd("C:/Users/DellPC/Desktop/R_source_code/acb")
suppressMessages(library(tidyverse))
suppressMessages(library(fpp2))
suppressMessages(library(lubridate))
suppressMessages(library(plotly))
suppressMessages(library(ggsci))
suppressMessages(library(RColorBrewer))
suppressMessages(library(patchwork))
suppressMessages(library(psych))
df <- read.csv("acb_first_task.csv")%>%
mutate(date = as.Date(paste(as.character(YYYY_MM),'01'), format = "%Y%m%d")) %>% mutate(month = month(date), year = year(date)) %>%
mutate(year = as.factor(year), month = as.factor(month)) %>%
arrange(year, month)
describe(df$VALUES)
## vars n mean sd median trimmed mad min max
## X1 1 78 49517.73 12428.84 46399.6 48588.66 14978.64 35151.18 74318.62
## range skew kurtosis se
## X1 39167.44 0.49 -1.16 1407.29
gg0 <- df %>% ggplot(aes(x= VALUES, y=..density..)) +
geom_histogram(fill = 'steelblue',color = 'white') +
geom_density(color = 'red') +
theme_light()
ggplotly(gg0)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ts <- ts(df$VALUES, frequency =12, start =c(2014,01))
ggAcf(ts) + theme_light()
ggPacf(ts) + theme_light()
From 2 plots above, we can conclude that there is no seasonal pattern in this time series
ggseasonplot(ts, polar=TRUE) +
ylab("Values") +
ggtitle("Polar seasonal plot") + theme_light()
gg1<- df %>% ggplot(aes(x= month, y = VALUES)) +
geom_bar(stat = 'identity', aes(fill = factor(month))) +
scale_fill_manual(values = c(brewer.pal(12,'Paired'))) +
theme_light() +coord_flip()+
facet_wrap(~year) + theme_light()
ggplotly(gg1)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
ggsubseriesplot(ts)+theme_light() + theme_light()
foo1<- df %>% mutate(month = as.factor(month))%>%
group_by(month,year) %>%
summarise(avg = mean(VALUES))
## `summarise()` regrouping output by 'month' (override with `.groups` argument)
gg1<- foo1%>% ggplot(aes(x= month, y = avg, label = avg)) +
geom_bar(stat = 'identity', aes(fill = factor(month))) +
scale_fill_manual(values = c(brewer.pal(12,'Paired'))) +
theme_light() +
coord_flip() +
geom_label() + theme_light()
ggplotly(gg1)
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabel() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
foo<- df %>% group_by( year) %>% summarise(avg = sum(VALUES))
## `summarise()` ungrouping output (override with `.groups` argument)
gg<- foo%>% ggplot(aes(x= fct_reorder(year, desc(avg)), y = avg, label = avg)) +geom_bar(stat = 'identity', aes(fill = factor(year))) +scale_fill_aaas() +theme_light() +geom_label()
ggplotly(gg)
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabel() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
train <- df %>% filter(year != 2020)
test <- df %>% filter(year == 2020)
ts <- ts(train$VALUES, frequency = 12, start= c(2014,01))
autoplot(ts) + theme_light()
fit_ets <- ets(ts)
summary(fit_ets)
## ETS(M,A,N)
##
## Call:
## ets(y = ts)
##
## Smoothing parameters:
## alpha = 0.9999
## beta = 0.1125
##
## Initial states:
## l = 35444.6308
## b = 110.8762
##
## sigma: 0.0121
##
## AIC AICc BIC
## 1225.411 1226.320 1236.794
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 82.52843 549.0264 451.8519 0.172323 0.9648735 0.07548958 0.2569702
#--> Multiple error, additive trend and no season
checkresiduals(fit_ets) + theme_light()
##
## Ljung-Box test
##
## data: Residuals from ETS(M,A,N)
## Q* = 21.626, df = 10, p-value = 0.01713
##
## Model df: 4. Total lags used: 14
## NULL
forecast_point<- forecast(fit_ets, h=6) %>% as.data.frame() %>% transmute(point = `Point Forecast`)
mean((test$VALUES-forecast_point$point)^2)
## [1] 145150.5
fc <- forecast(fit_ets, h=9) %>%
as.data.frame() %>%
mutate(point_forecast = `Point Forecast`) %>%
head(6) %>%
mutate(date = test$date,
L80 = `Lo 80`,
H80 = `Hi 80`,
L95 = `Lo 95`,
H95 = `Hi 95` )
test <- test %>% mutate(point_forecast = VALUES)
test
## YYYY_MM VALUES date month year point_forecast
## 1 202001 71141.29 2020-01-01 1 2020 71141.29
## 2 202002 71075.09 2020-02-01 2 2020 71075.09
## 3 202003 72171.24 2020-03-01 3 2020 72171.24
## 4 202004 72947.07 2020-04-01 4 2020 72947.07
## 5 202005 73268.10 2020-05-01 5 2020 73268.10
## 6 202006 74318.62 2020-06-01 6 2020 74318.62
fitted_data_frame <- fitted(fit_ets) %>% as.data.frame() %>% mutate(VALUES = x, date = train$date)
gg1<- ggplot(data = fc, aes (x = date))+
geom_ribbon(aes(ymin = L80, ymax = H80), alpha = 0.2, fill = 'blue') +
geom_ribbon(aes(ymin = L95, ymax = H95), alpha = 0.2, fill = 'red') +
geom_line(data = train, aes(x = date, y = VALUES),
color = 'red', lwd =1) +
geom_line(data = test,
aes(x= date, y = point_forecast),
color = 'blue', lwd=1) +
geom_line(aes( y = point_forecast),
color = 'green', lwd = 1)+
theme_light() +
geom_line(data = fitted_data_frame, aes(x= date, y=VALUES),
color ='darkorchid', lwd =1 ) + labs(title = 'ETS')
ggplotly(gg1)
Compare the p-value with 1-pct critical value
library(urca)
diff(diff(ts)) %>% ur.kpss() %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 3 lags.
##
## Value of test-statistic is: 0.0566
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
We need 2-difference times to make data stationary
fit_arima<- auto.arima(ts, seasonal = FALSE)
summary(fit_arima)
## Series: ts
## ARIMA(1,2,1)
##
## Coefficients:
## ar1 ma1
## 0.3427 -0.9296
## s.e. 0.1267 0.0428
##
## sigma^2 estimated as 288695: log likelihood=-539.04
## AIC=1084.09 AICc=1084.45 BIC=1090.83
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 77.35829 522.1652 426.6649 0.1519535 0.9245955 0.07128166
## ACF1
## Training set -0.006406899
forecast(fit_arima, h =6) %>% autoplot() + theme_light()
checkresiduals(fit_arima) + theme_light()
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,2,1)
## Q* = 16.736, df = 12, p-value = 0.1598
##
## Model df: 2. Total lags used: 14
## NULL
forecast_point<- forecast(fit_arima, h=6) %>% as.data.frame() %>% transmute(point = `Point Forecast`)
mean((test$VALUES-forecast_point$point)^2)
## [1] 574491
forecast(fit_arima, h=9) %>% as.data.frame() %>% transmute(point = `Point Forecast`) %>% tail(3)
## point
## 7 75896.90
## 8 76667.34
## 9 77437.66
fc <- forecast(fit_arima, h=9) %>%
as.data.frame() %>%
mutate(point_forecast = `Point Forecast`) %>%
head(6) %>%
mutate(date = test$date,
L80 = `Lo 80`,
H80 = `Hi 80`,
L95 = `Lo 95`,
H95 = `Hi 95` )
test <- test %>% mutate(point_forecast = VALUES)
test
## YYYY_MM VALUES date month year point_forecast
## 1 202001 71141.29 2020-01-01 1 2020 71141.29
## 2 202002 71075.09 2020-02-01 2 2020 71075.09
## 3 202003 72171.24 2020-03-01 3 2020 72171.24
## 4 202004 72947.07 2020-04-01 4 2020 72947.07
## 5 202005 73268.10 2020-05-01 5 2020 73268.10
## 6 202006 74318.62 2020-06-01 6 2020 74318.62
fitted_data_frame <- fitted(fit_arima) %>% as.data.frame() %>% mutate(VALUES = x, date = train$date)
gg<- ggplot(data = fc, aes (x = date))+
geom_ribbon(aes(ymin = L80, ymax = H80), alpha = 0.2, fill = 'blue') +
geom_ribbon(aes(ymin = L95, ymax = H95), alpha = 0.2, fill = 'red') +
geom_line(data = train, aes(x = date, y = VALUES),
color = 'red', lwd =1) +
geom_line(data = test,
aes(x= date, y = point_forecast),
color = 'blue', lwd=1) +
geom_line(aes( y = point_forecast),
color = 'green', lwd = 1)+
geom_line(data = fitted_data_frame, aes(x= date, y=VALUES),
color ='darkorchid', lwd =1 ) +
theme_light()+ labs(title = 'Arima')
ggplotly(gg)
layout <- "AABB"
gg1 + gg + plot_layout(design = layout)