New package “FORECAST.TA” for forecasting apply Example For forecasting population in Chelyabinsk

That’s Algorithm Developed By

Makarovskikh Tatyana Anatolyevna “Макаровских Татьяна Анатольевна”

Abotaleb mostafa “Аботалеб Мостафа”

Department of Electrical Engineering and Computer Science

South ural state university, Chelyabinsk, Russian federation

# Imports
library(fpp2)

## Warning: package 'fpp2' was built under R version 4.0.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## -- Attaching packages --------------------------------------------------------------------------- fpp2 2.4 --

## v ggplot2   3.3.2     v fma       2.4  
## v forecast  8.13      v expsmooth 2.3

## Warning: package 'ggplot2' was built under R version 4.0.3

## Warning: package 'forecast' was built under R version 4.0.3

##

library(forecast)
library(ggplot2)
library("readxl")

## Warning: package 'readxl' was built under R version 4.0.3

library(moments)

## Warning: package 'moments' was built under R version 4.0.3

library(forecast)
require(forecast)  
require(tseries)

## Loading required package: tseries

## Warning: package 'tseries' was built under R version 4.0.3

require(markovchain)

## Loading required package: markovchain

## Warning: package 'markovchain' was built under R version 4.0.3

## Package:  markovchain
## Version:  0.8.5-3
## Date:     2020-12-03
## BugReport: https://github.com/spedygiorgio/markovchain/issues

require(data.table)

## Loading required package: data.table

Full_original_data<-read_excel("F:/Phd/ALL Russia Analysis/population in cheleabinsk.xlsx",sheet = "Sheet1")
y_lab<- "population in Chelyabinsk"   # input name of data
Actual_date_interval <- c("1950/12/31","2020/12/31")
Forecast_date_interval <- c("2021/12/31","2027/12/31")
validation_data_days <-7
frequency<-"years"

# Data Preparation & calculate some of statistics measures
original_data<-Full_original_data$Population

summary(original_data)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  572873  836315 1080595  981687 1117110 1227795

sd(original_data)  # calculate standard deviation

## [1] 190660.9

skewness(original_data)  # calculate Cofficient of skewness

## [1] -0.8021526

kurtosis(original_data)   # calculate Cofficient of kurtosis

## [1] 2.271894

rows <- NROW(original_data)
training_data<-original_data[1:(rows-validation_data_days)]
testing_data<-original_data[(rows-validation_data_days+1):rows]
AD<-fulldate<-seq(as.Date(Actual_date_interval[1]),as.Date(Actual_date_interval[2]), frequency)  #input range for actual date
FD<-seq(as.Date(Forecast_date_interval[1]),as.Date(Forecast_date_interval[2]), frequency)  #input range forecasting date
N_forecasting_days<-nrow(data.frame(FD)) 
validation_dates<-tail(AD,validation_data_days)
validation_data_by_name<-weekdays(validation_dates)
forecasting_data_by_name<-weekdays(FD)

##bats model
# Data Modeling
data_series<-ts(training_data)
autoplot(data_series ,xlab=paste ("Time in  ", frequency, sep=" "), ylab = y_lab, main=paste ("Actual Data :", y_lab, sep=" "))

model_bats<-bats(data_series)
accuracy(model_bats)  # accuracy on training data

##                    ME     RMSE  MAE        MPE      MAPE       MASE        ACF1
## Training set 169.4551 1643.034 1057 0.02326991 0.1130982 0.09539695 -0.01629944

# Print Model Parameters
model_bats

## BATS(1, {0,0}, 0.97, -)
## 
## Call: bats(y = data_series)
## 
## Parameters
##   Alpha: 1.078879
##   Beta: 1.564508
##   Damping Parameter: 0.97007
## 
## Seed States:
##           [,1]
## [1,] 562192.05
## [2,]  12593.13
## 
## Sigma: 1643.034
## AIC: 1223.919

plot(model_bats,xlab = paste ("Time in  ", frequency ,y_lab , sep=" "),  col.main="black", col.lab="blue", col.sub="black", cex.main=1, cex.lab=1, cex.sub=1,font.main=4, font.lab=4)

# Testing Data Evaluation
forecasting_bats <- predict(model_bats, h=N_forecasting_days+validation_data_days)
validation_forecast<-head(forecasting_bats$mean,validation_data_days)
MAPE_Per_Day<-round(  abs(((testing_data-validation_forecast)/testing_data)*100)  ,3)
paste ("MAPE % For ",validation_data_days,frequency,"by using bats Model for  ==> ",y_lab, sep=" ")

## [1] "MAPE % For  7 years by using bats Model for  ==>  population in Chelyabinsk"

MAPE_Mean_All<-paste(round(mean(MAPE_Per_Day),3),"% MAPE ",validation_data_days,frequency,y_lab,sep=" ")
MAPE_bats<-paste(round(MAPE_Per_Day,3),"%")
MAPE_bats_Model<-paste(MAPE_Per_Day ,"%")
paste (" MAPE that's Error of Forecasting for ",validation_data_days," days in bats Model for  ==> ",y_lab, sep=" ")

## [1] " MAPE that's Error of Forecasting for  7  days in bats Model for  ==>  population in Chelyabinsk"

paste(MAPE_Mean_All,"%")

## [1] "0.13 % MAPE  7 years population in Chelyabinsk %"

paste ("MAPE that's Error of Forecasting day by day for ",validation_data_days," days in bats Model for  ==> ",y_lab, sep=" ")

## [1] "MAPE that's Error of Forecasting day by day for  7  days in bats Model for  ==>  population in Chelyabinsk"

data.frame(date_bats=validation_dates,validation_data_by_name,actual_data=testing_data,forecasting_bats=validation_forecast,MAPE_bats_Model)

##    date_bats validation_data_by_name actual_data forecasting_bats
## 1 2014-12-31                   среда     1170685          1170834
## 2 2015-12-31                 четверг     1181855          1181644
## 3 2016-12-31                 суббота     1193148          1192130
## 4 2017-12-31             воскресенье     1204517          1202302
## 5 2018-12-31             понедельник     1215994          1212169
## 6 2019-12-31                 вторник     1222108          1221742
## 7 2020-12-31                 четверг     1227795          1231028
##   MAPE_bats_Model
## 1         0.013 %
## 2         0.018 %
## 3         0.085 %
## 4         0.184 %
## 5         0.315 %
## 6          0.03 %
## 7         0.263 %

data.frame(FD,forecating_date=forecasting_data_by_name,forecasting_by_bats=tail(forecasting_bats$mean,N_forecasting_days))

##           FD forecating_date forecasting_by_bats
## 1 2021-12-31         пятница             1240035
## 2 2022-12-31         суббота             1248774
## 3 2023-12-31     воскресенье             1257251
## 4 2024-12-31         вторник             1265474
## 5 2025-12-31           среда             1273451
## 6 2026-12-31         четверг             1281189
## 7 2027-12-31         пятница             1288695

plot(forecasting_bats)
x1_test <- ts(testing_data, start =(rows-validation_data_days+1) )
lines(x1_test, col='red',lwd=2)

graph1<-autoplot(forecasting_bats,xlab = paste ("Time in  ", frequency ,y_lab , sep=" "),  col.main="black", col.lab="blue", col.sub="black", cex.main=1, cex.lab=1, cex.sub=1,font.main=4, font.lab=4, ylab=y_lab)
graph1

## Error of forecasting
Error_bats<-abs(testing_data-validation_forecast)  # Absolute error of forecast (AEOF)
REOF_A_bats<-abs(((testing_data-validation_forecast)/testing_data)*100)  #Relative error of forecast (divided by actual)(REOF_A)
REOF_F_bats<-abs(((testing_data-validation_forecast)/validation_forecast)*100)  #Relative error of forecast (divided by forecast)(REOF_F)
correlation_bats<-cor(testing_data,validation_forecast, method = c("pearson"))     # correlation coefficient between predicted and actual values 
RMSE_bats<-sqrt(sum((Error_bats^2))/validation_data_days)   #  Root mean square forecast error
MAD_bats<-abs((sum(testing_data-validation_forecast))/validation_data_days)   # average forecast accuracy
AEOF_bats<-c(Error_bats)
REOF_Abats<-c(paste(round(REOF_A_bats,3),"%"))
REOF_Fbats<-c(paste(round(REOF_F_bats,3),"%"))
data.frame(correlation_bats,RMSE_bats,MAPE_Mean_All,MAD_bats) # analysis of Error  by using Bats Model shows result of correlation ,MSE ,MPER

##   correlation_bats RMSE_bats                                  MAPE_Mean_All
## 1        0.9949532  2111.977 0.13 % MAPE  7 years population in Chelyabinsk
##   MAD_bats
## 1  607.783

data.frame(validation_dates,Validation_day_name=validation_data_by_name,AEOF_bats,REOF_Abats,REOF_Fbats)   # Analysis of error shows result AEOF,REOF_A,REOF_F

##   validation_dates Validation_day_name AEOF_bats REOF_Abats REOF_Fbats
## 1       2014-12-31               среда  149.1266    0.013 %    0.013 %
## 2       2015-12-31             четверг  211.3941    0.018 %    0.018 %
## 3       2016-12-31             суббота 1018.4379    0.085 %    0.085 %
## 4       2017-12-31         воскресенье 2215.3219    0.184 %    0.184 %
## 5       2018-12-31         понедельник 3824.6530    0.315 %    0.316 %
## 6       2019-12-31             вторник  366.3193     0.03 %     0.03 %
## 7       2020-12-31             четверг 3232.5186    0.263 %    0.263 %

## TBATS Model

# Data Modeling
data_series<-ts(training_data)
model_TBATS<-forecast:::fitSpecificTBATS(data_series,use.box.cox=FALSE, use.beta=TRUE,  seasonal.periods=c(6),use.damping=FALSE,k.vector=c(2))
accuracy(model_TBATS)  # accuracy on training data

##                     ME     RMSE      MAE         MPE      MAPE      MASE
## Training set -8.676742 1824.837 1294.112 0.002005064 0.1410602 0.1167969
##                     ACF1
## Training set 0.009651064

# Print Model Parameters
model_TBATS

## TBATS(1, {0,0}, 1, {<6,2>})
## 
## Call: NULL
## 
## Parameters
##   Alpha: 1.044364
##   Beta: 1.394573
##   Damping Parameter: 1
##   Gamma-1 Values: 0.03650915
##   Gamma-2 Values: -0.0127078
## 
## Seed States:
##             [,1]
## [1,] 562497.7030
## [2,]  12578.3278
## [3,]   -852.2614
## [4,]    475.7496
## [5,]    991.4793
## [6,]   -451.6407
## 
## Sigma: 1824.837
## AIC: 1247.352

plot(model_TBATS,xlab = paste ("Time in  ", frequency ,y_lab , sep=" "),  col.main="black", col.lab="blue", col.sub="black", cex.main=1, cex.lab=1, cex.sub=1,font.main=4, font.lab=4, ylab=y_lab)

# Testing Data Evaluation
forecasting_tbats <- predict(model_TBATS, h=N_forecasting_days+validation_data_days)
validation_forecast<-head(forecasting_tbats$mean,validation_data_days)
MAPE_Per_Day<-round(  abs(((testing_data-validation_forecast)/testing_data)*100)  ,3)
paste ("MAPE % For ",validation_data_days,frequency,"by using TBATS Model for  ==> ",y_lab, sep=" ")

## [1] "MAPE % For  7 years by using TBATS Model for  ==>  population in Chelyabinsk"

MAPE_Mean_All<-paste(round(mean(MAPE_Per_Day),3),"% MAPE ",validation_data_days,frequency,y_lab,sep=" ")
MAPE_TBATS<-paste(round(MAPE_Per_Day,3),"%")
MAPE_TBATS_Model<-paste(MAPE_Per_Day ,"%")
paste (" MAPE that's Error of Forecasting for ",validation_data_days," days in TBATS Model for  ==> ",y_lab, sep=" ")

## [1] " MAPE that's Error of Forecasting for  7  days in TBATS Model for  ==>  population in Chelyabinsk"

paste(MAPE_Mean_All,"%")

## [1] "0.311 % MAPE  7 years population in Chelyabinsk %"

paste ("MAPE that's Error of Forecasting day by day for ",validation_data_days," days in TBATS Model for  ==> ",y_lab, sep=" ")

## [1] "MAPE that's Error of Forecasting day by day for  7  days in TBATS Model for  ==>  population in Chelyabinsk"

data.frame(date_TBATS=validation_dates,validation_data_by_name,actual_data=testing_data,forecasting_TBATS=validation_forecast,MAPE_TBATS_Model)

##   date_TBATS validation_data_by_name actual_data forecasting_TBATS
## 1 2014-12-31                   среда     1170685           1170377
## 2 2015-12-31                 четверг     1181855           1181124
## 3 2016-12-31                 суббота     1193148           1192406
## 4 2017-12-31             воскресенье     1204517           1204960
## 5 2018-12-31             понедельник     1215994           1218493
## 6 2019-12-31                 вторник     1222108           1230604
## 7 2020-12-31                 четверг     1227795           1241201
##   MAPE_TBATS_Model
## 1          0.026 %
## 2          0.062 %
## 3          0.062 %
## 4          0.037 %
## 5          0.206 %
## 6          0.695 %
## 7          1.092 %

data.frame(FD,forecating_date=forecasting_data_by_name,forecasting_by_TBATS=tail(forecasting_tbats$mean,N_forecasting_days))

##           FD forecating_date forecasting_by_TBATS
## 1 2021-12-31         пятница              1251947
## 2 2022-12-31         суббота              1263229
## 3 2023-12-31     воскресенье              1275784
## 4 2024-12-31         вторник              1289316
## 5 2025-12-31           среда              1301427
## 6 2026-12-31         четверг              1312024
## 7 2027-12-31         пятница              1322771

plot(forecasting_tbats)
x1_test <- ts(testing_data, start =(rows-validation_data_days+1) )
lines(x1_test, col='red',lwd=2)

graph2<-autoplot(forecasting_tbats,xlab = paste ("Time in  ", frequency ,y_lab , sep=" "),  col.main="black", col.lab="blue", col.sub="black", cex.main=1, cex.lab=1, cex.sub=1,font.main=4, font.lab=4, ylab=y_lab)
graph2

## Error of forecasting TBATS Model

Error_tbats<-abs(testing_data-validation_forecast)  # Absolute error of forecast (AEOF)
REOF_A_tbats1<-abs(((testing_data-validation_forecast)/testing_data)*100)  #Relative error of forecast (divided by actual)(REOF_A)
REOF_F_tbats<-abs(((testing_data-validation_forecast)/validation_forecast)*100)  #Relative error of forecast (divided by forecast)(REOF_F)
correlation_tbats<-cor(testing_data,validation_forecast, method = c("pearson"))     # correlation coefficient between predicted and actual values 
RMSE_tbats<-sqrt(sum((Error_tbats^2))/validation_data_days)   #  Root mean square forecast error
MAD_tbats<-abs((sum(testing_data-validation_forecast))/validation_data_days)   # average forecast accuracy
AEOF_tbats<-c(Error_tbats)
REOF_A_tbats<-c(paste(round(REOF_A_tbats1,3),"%"))
REOF_F_tbats<-c(paste(round(REOF_F_tbats,3),"%"))
data.frame(correlation_tbats,RMSE_tbats,MAPE_Mean_All,MAD_tbats) # analysis of Error  by using Holt's linear model shows result of correlation ,MSE ,MPER

##   correlation_tbats RMSE_tbats                                   MAPE_Mean_All
## 1         0.9923645    6088.86 0.311 % MAPE  7 years population in Chelyabinsk
##   MAD_tbats
## 1  3294.776

data.frame(validation_dates,Validation_day_name=validation_data_by_name,AEOF_tbats,REOF_A_tbats,REOF_F_tbats)   # Analysis of error shows result AEOF,REOF_A,REOF_F

##   validation_dates Validation_day_name AEOF_tbats REOF_A_tbats REOF_F_tbats
## 1       2014-12-31               среда   307.5421      0.026 %      0.026 %
## 2       2015-12-31             четверг   731.1978      0.062 %      0.062 %
## 3       2016-12-31             суббота   742.0549      0.062 %      0.062 %
## 4       2017-12-31         воскресенье   443.4422      0.037 %      0.037 %
## 5       2018-12-31         понедельник  2499.0083      0.206 %      0.205 %
## 6       2019-12-31             вторник  8495.8830      0.695 %       0.69 %
## 7       2020-12-31             четверг 13405.8902      1.092 %       1.08 %

## Holt's linear trend


# Data Modeling
data_series<-ts(training_data)
model_holt<-holt(data_series,h=N_forecasting_days+validation_data_days,lambda = "auto")
accuracy(model_holt)  # accuracy on training data

##                    ME     RMSE      MAE         MPE      MAPE       MASE
## Training set 35.89561 2057.755 1040.567 0.003636185 0.1131804 0.09391386
##                   ACF1
## Training set 0.3807252

# Print Model Parameters
summary(model_holt$model)

## Holt's method 
## 
## Call:
##  holt(y = data_series, h = N_forecasting_days + validation_data_days,  
## 
##  Call:
##      lambda = "auto") 
## 
##   Box-Cox transformation: lambda= 1.9999 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
##     beta  = 0.9999 
## 
##   Initial states:
##     l = 153115029226.739 
##     b = 11282238865.7469 
## 
##   sigma:  2213521530
## 
##      AIC     AICc      BIC 
## 3026.323 3027.357 3037.117 
## 
## Training set error measures:
##                    ME     RMSE      MAE         MPE      MAPE       MASE
## Training set 35.89561 2057.755 1040.567 0.003636185 0.1131804 0.09391386
##                   ACF1
## Training set 0.3807252

# Testing Data Evaluation
forecasting_holt <- predict(model_holt, h=N_forecasting_days+validation_data_days,lambda = "auto")
validation_forecast<-head(forecasting_holt$mean,validation_data_days)
MAPE_Per_Day<-round(  abs(((testing_data-validation_forecast)/testing_data)*100)  ,3)
paste ("MAPE % For ",validation_data_days,frequency,"by using holt Model for  ==> ",y_lab, sep=" ")

## [1] "MAPE % For  7 years by using holt Model for  ==>  population in Chelyabinsk"

MAPE_Mean_All<-paste(round(mean(MAPE_Per_Day),3),"% MAPE ",validation_data_days,frequency,y_lab,sep=" ")
MAPE_holt<-paste(round(MAPE_Per_Day,3),"%")
MAPE_holt_Model<-paste(MAPE_Per_Day ,"%")
paste (" MAPE that's Error of Forecasting for ",validation_data_days," days in holt Model for  ==> ",y_lab, sep=" ")

## [1] " MAPE that's Error of Forecasting for  7  days in holt Model for  ==>  population in Chelyabinsk"

paste(MAPE_Mean_All,"%")

## [1] "0.168 % MAPE  7 years population in Chelyabinsk %"

paste ("MAPE that's Error of Forecasting day by day for ",validation_data_days," days in holt Model for  ==> ",y_lab, sep=" ")

## [1] "MAPE that's Error of Forecasting day by day for  7  days in holt Model for  ==>  population in Chelyabinsk"

data.frame(date_holt=validation_dates,validation_data_by_name,actual_data=testing_data,forecasting_holt=validation_forecast,MAPE_holt_Model)

##    date_holt validation_data_by_name actual_data forecasting_holt
## 1 2014-12-31                   среда     1170685          1170463
## 2 2015-12-31                 четверг     1181855          1181206
## 3 2016-12-31                 суббота     1193148          1191852
## 4 2017-12-31             воскресенье     1204517          1202405
## 5 2018-12-31             понедельник     1215994          1212865
## 6 2019-12-31                 вторник     1222108          1223236
## 7 2020-12-31                 четверг     1227795          1233520
##   MAPE_holt_Model
## 1         0.019 %
## 2         0.055 %
## 3         0.109 %
## 4         0.175 %
## 5         0.257 %
## 6         0.092 %
## 7         0.466 %

data.frame(FD,forecating_date=forecasting_data_by_name,forecasting_by_holt=tail(forecasting_holt$mean,N_forecasting_days))

##           FD forecating_date forecasting_by_holt
## 1 2021-12-31         пятница             1243718
## 2 2022-12-31         суббота             1253834
## 3 2023-12-31     воскресенье             1263869
## 4 2024-12-31         вторник             1273825
## 5 2025-12-31           среда             1283703
## 6 2026-12-31         четверг             1293507
## 7 2027-12-31         пятница             1303236

plot(forecasting_holt)
x1_test <- ts(testing_data, start =(rows-validation_data_days+1) )
lines(x1_test, col='red',lwd=2)

graph3<-autoplot(forecasting_holt,xlab = paste ("Time in  ", frequency ,y_lab , sep=" "),  col.main="black", col.lab="blue", col.sub="black", cex.main=1, cex.lab=1, cex.sub=1,font.main=4, font.lab=4, ylab=y_lab)
graph3

## Error of forecasting by using Holt's linear model
Error_Holt<-abs(testing_data-validation_forecast)  # Absolute error of forecast (AEOF)
REOF_A_Holt1<-abs(((testing_data-validation_forecast)/testing_data)*100)  #Relative error of forecast (divided by actual)(REOF_A)
REOF_F_Holt<-abs(((testing_data-validation_forecast)/validation_forecast)*100)  #Relative error of forecast (divided by forecast)(REOF_F)
correlation_Holt<-cor(testing_data,validation_forecast, method = c("pearson"))     # correlation coefficient between predicted and actual values 
RMSE_Holt<-sqrt(sum((Error_Holt^2))/validation_data_days)   #  Root mean square forecast error
MAD_Holt<-abs((sum(testing_data-validation_forecast))/validation_data_days)   # average forecast accuracy
AEOF_Holt<-c(Error_Holt)
REOF_A_Holt<-c(paste(round(REOF_A_Holt1,3),"%"))
REOF_F_Holt<-c(paste(round(REOF_F_Holt,3),"%"))
REOF_A_Holt11<-mean(abs(((testing_data-validation_forecast)/testing_data)*100))
data.frame(correlation_Holt,RMSE_Holt,MAPE_Mean_All,MAD_Holt) # analysis of Error  by using Holt's linear model shows result of correlation ,MSE ,MPER

##   correlation_Holt RMSE_Holt                                   MAPE_Mean_All
## 1        0.9932091  2684.515 0.168 % MAPE  7 years population in Chelyabinsk
##   MAD_Holt
## 1 79.33375

data.frame(validation_dates,Validation_day_name=validation_data_by_name,AEOF_Holt,REOF_A_Holt,REOF_F_Holt)   # Analysis of error shows result AEOF,REOF_A,REOF_F

##   validation_dates Validation_day_name AEOF_Holt REOF_A_Holt REOF_F_Holt
## 1       2014-12-31               среда  222.2340     0.019 %     0.019 %
## 2       2015-12-31             четверг  648.9693     0.055 %     0.055 %
## 3       2016-12-31             суббота 1295.5323     0.109 %     0.109 %
## 4       2017-12-31         воскресенье 2112.3511     0.175 %     0.176 %
## 5       2018-12-31         понедельник 3128.9658     0.257 %     0.258 %
## 6       2019-12-31             вторник 1127.9785     0.092 %     0.092 %
## 7       2020-12-31             четверг 5724.7377     0.466 %     0.464 %

#Auto arima model
##################

require(tseries) # need to install tseries tj test Stationarity in time series 
paste ("tests For Check Stationarity in series  ==> ",y_lab, sep=" ")

## [1] "tests For Check Stationarity in series  ==>  population in Chelyabinsk"

kpss.test(data_series) # applay kpss test

## Warning in kpss.test(data_series): p-value smaller than printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  data_series
## KPSS Level = 1.5003, Truncation lag parameter = 3, p-value = 0.01

pp.test(data_series)   # applay pp test

## 
##  Phillips-Perron Unit Root Test
## 
## data:  data_series
## Dickey-Fuller Z(alpha) = -0.96716, Truncation lag parameter = 3,
## p-value = 0.9856
## alternative hypothesis: stationary

adf.test(data_series)  # applay adf test

## 
##  Augmented Dickey-Fuller Test
## 
## data:  data_series
## Dickey-Fuller = -3.0299, Lag order = 3, p-value = 0.1576
## alternative hypothesis: stationary

ndiffs(data_series)    # Doing first diffrencing on data

## [1] 2

#Taking the first difference
diff1_x1<-diff(data_series)
autoplot(diff1_x1, xlab = paste ("Time in  ", frequency ,y_lab , sep=" "),  col.main="black", col.lab="blue", col.sub="black", cex.main=1, cex.lab=1, cex.sub=1,font.main=4, font.lab=4, ylab=y_lab,main = "1nd differenced series")

## Warning: Ignoring unknown parameters: col.main, col.lab, col.sub, cex.main,
## cex.lab, cex.sub, font.main, font.lab

##Testing the stationary of the first differenced series
paste ("tests For Check Stationarity in series after taking first differences in  ==> ",y_lab, sep=" ")

## [1] "tests For Check Stationarity in series after taking first differences in  ==>  population in Chelyabinsk"

kpss.test(diff1_x1)   # applay kpss test after taking first differences

## Warning in kpss.test(diff1_x1): p-value smaller than printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  diff1_x1
## KPSS Level = 0.9331, Truncation lag parameter = 3, p-value = 0.01

pp.test(diff1_x1)     # applay pp test after taking first differences

## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff1_x1
## Dickey-Fuller Z(alpha) = -4.3629, Truncation lag parameter = 3, p-value
## = 0.861
## alternative hypothesis: stationary

adf.test(diff1_x1)    # applay adf test after taking first differences

## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff1_x1
## Dickey-Fuller = -1.634, Lag order = 3, p-value = 0.7231
## alternative hypothesis: stationary

#Taking the second difference
diff2_x1=diff(diff1_x1)
autoplot(diff2_x1, xlab = paste ("Time in  ", frequency ,y_lab , sep=" "),  col.main="black", col.lab="blue", col.sub="black", cex.main=1, cex.lab=1, cex.sub=1,font.main=4, font.lab=4, ylab=y_lab ,main = "2nd differenced series")

## Warning: Ignoring unknown parameters: col.main, col.lab, col.sub, cex.main,
## cex.lab, cex.sub, font.main, font.lab

##Testing the stationary of the first differenced series
paste ("tests For Check Stationarity in series after taking Second differences in",y_lab, sep=" ")

## [1] "tests For Check Stationarity in series after taking Second differences in population in Chelyabinsk"

kpss.test(diff2_x1)   # applay kpss test after taking Second differences

## Warning in kpss.test(diff2_x1): p-value greater than printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  diff2_x1
## KPSS Level = 0.16883, Truncation lag parameter = 3, p-value = 0.1

pp.test(diff2_x1)     # applay pp test after taking Second differences

## Warning in pp.test(diff2_x1): p-value smaller than printed p-value

## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff2_x1
## Dickey-Fuller Z(alpha) = -32.84, Truncation lag parameter = 3, p-value
## = 0.01
## alternative hypothesis: stationary

adf.test(diff2_x1)    # applay adf test after taking Second differences

## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff2_x1
## Dickey-Fuller = -3.5363, Lag order = 3, p-value = 0.046
## alternative hypothesis: stationary

####Fitting an ARIMA Model
#1. Using auto arima function
model1 <- auto.arima(data_series,stepwise=FALSE, approximation=FALSE, trace=T, test = c("kpss", "adf", "pp"))  #applaying auto arima

## 
##  ARIMA(0,2,0)                    : 1116.368
##  ARIMA(0,2,1)                    : 1094.213
##  ARIMA(0,2,2)                    : 1095.765
##  ARIMA(0,2,3)                    : 1097.968
##  ARIMA(0,2,4)                    : 1100.036
##  ARIMA(0,2,5)                    : 1102.28
##  ARIMA(1,2,0)                    : 1105.957
##  ARIMA(1,2,1)                    : 1095.733
##  ARIMA(1,2,2)                    : 1098.021
##  ARIMA(1,2,3)                    : 1100.257
##  ARIMA(1,2,4)                    : Inf
##  ARIMA(2,2,0)                    : 1104.269
##  ARIMA(2,2,1)                    : 1098.02
##  ARIMA(2,2,2)                    : 1100.25
##  ARIMA(2,2,3)                    : Inf
##  ARIMA(3,2,0)                    : 1103.24
##  ARIMA(3,2,1)                    : 1100.006
##  ARIMA(3,2,2)                    : Inf
##  ARIMA(4,2,0)                    : 1103.592
##  ARIMA(4,2,1)                    : 1102.398
##  ARIMA(5,2,0)                    : 1105.091
## 
## 
## 
##  Best model: ARIMA(0,2,1)

model1 # show the result of autoarima

## Series: data_series 
## ARIMA(0,2,1) 
## 
## Coefficients:
##          ma1
##       0.7915
## s.e.  0.0868
## 
## sigma^2 estimated as 2538995:  log likelihood=-545
## AIC=1094.01   AICc=1094.21   BIC=1098.26

#Make changes in the source of auto arima to run the best model
arima.string <- function (object, padding = FALSE) 
{
  order <- object$arma[c(1, 6, 2, 3, 7, 4, 5)]
  m <- order[7]
  result <- paste("ARIMA(", order[1], ",", order[2], ",", 
                  order[3], ")", sep = "")
  if (m > 1 && sum(order[4:6]) > 0) {
    result <- paste(result, "(", order[4], ",", order[5], 
                    ",", order[6], ")[", m, "]", sep = "")
  }
  if (padding && m > 1 && sum(order[4:6]) == 0) {
    result <- paste(result, "         ", sep = "")
    if (m <= 9) {
      result <- paste(result, " ", sep = "")
    }
    else if (m <= 99) {
      result <- paste(result, "  ", sep = "")
    }
    else {
      result <- paste(result, "   ", sep = "")
    }
  }
  if (!is.null(object$xreg)) {
    if (NCOL(object$xreg) == 1 && is.element("drift", names(object$coef))) {
      result <- paste(result, "with drift        ")
    }
    else {
      result <- paste("Regression with", result, "errors")
    }
  }
  else {
    if (is.element("constant", names(object$coef)) || is.element("intercept", 
                                                                 names(object$coef))) {
      result <- paste(result, "with non-zero mean")
    }
    else if (order[2] == 0 && order[5] == 0) {
      result <- paste(result, "with zero mean    ")
    }
    else {
      result <- paste(result, "                  ")
    }
  }
  if (!padding) {
    result <- gsub("[ ]*$", "", result)
  }
  return(result)
}






source("stringthearima.R")  
bestmodel <- arima.string(model1, padding = TRUE)
bestmodel <- substring(bestmodel,7,11)
bestmodel <- gsub(" ", "", bestmodel)
bestmodel <- gsub(")", "", bestmodel)
bestmodel <- strsplit(bestmodel, ",")[[1]]
bestmodel <- c(strtoi(bestmodel[1]),strtoi(bestmodel[2]),strtoi(bestmodel[3]))
bestmodel

## [1] 0 2 1

strtoi(bestmodel[3])

## [1] 1

#2. Using ACF and PACF Function
#par(mfrow=c(1,2))  # Code for making two plot in one graph 
acf(diff2_x1,xlab = paste ("Time in  ", frequency ,y_lab , sep=" "),  col.main="black", col.lab="blue", col.sub="black", cex.main=1, cex.lab=1, cex.sub=1,font.main=4, font.lab=4, ylab=y_lab, main=paste("ACF-2nd differenced series ",y_lab, sep=" ",lag.max=20))    # plot ACF "auto correlation function after taking second diffrences

pacf(diff2_x1,xlab = paste ("Time in  ", frequency ,y_lab , sep=" "),  col.main="black", col.lab="blue", col.sub="black", cex.main=1, cex.lab=1, cex.sub=1,font.main=4, font.lab=4, ylab=y_lab,main=paste("PACF-2nd differenced series ",y_lab, sep=" ",lag.max=20))   # plot PACF " Partial auto correlation function after taking second diffrences

library(forecast)   # install library forecast             
x1_model1= arima(data_series, order=c(bestmodel)) # Run Best model of auto arima  for forecasting
x1_model1  # Show result of best model of auto arima

## 
## Call:
## arima(x = data_series, order = c(bestmodel))
## 
## Coefficients:
##          ma1
##       0.7915
## s.e.  0.0868
## 
## sigma^2 estimated as 2488138:  log likelihood = -545,  aic = 1094.01

paste ("accuracy of autoarima Model For  ==> ",y_lab, sep=" ")

## [1] "accuracy of autoarima Model For  ==>  population in Chelyabinsk"

accuracy(x1_model1)  # aacuracy of best model from auto arima

##                     ME     RMSE      MAE          MPE       MAPE       MASE
## Training set -14.35341 1555.628 931.9643 0.0004364618 0.08954136 0.08411218
##                     ACF1
## Training set -0.07729962

x1_model1$x          # show result of best model from auto arima

## NULL

checkresiduals(x1_model1,xlab = paste ("Time in  ", frequency ,y_lab , sep=" "),  col.main="black", col.lab="blue", col.sub="black", cex.main=1, cex.lab=1, cex.sub=1,font.main=4, font.lab=4, ylab=y_lab)  # checkresiduals from best model from using auto arima

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,2,1)
## Q* = 4.7955, df = 9, p-value = 0.8518
## 
## Model df: 1.   Total lags used: 10

paste("Box-Ljung test , Ljung-Box test For Modelling for   ==> ",y_lab, sep=" ")

## [1] "Box-Ljung test , Ljung-Box test For Modelling for   ==>  population in Chelyabinsk"

Box.test(x1_model1$residuals^2, lag=20, type="Ljung-Box")   # Do test for resdulas by using Box-Ljung test , Ljung-Box test For Modelling

## 
##  Box-Ljung test
## 
## data:  x1_model1$residuals^2
## X-squared = 13.47, df = 20, p-value = 0.8563

library(tseries)
jarque.bera.test(x1_model1$residuals)  # Do test jarque.bera.test

## 
##  Jarque Bera Test
## 
## data:  x1_model1$residuals
## X-squared = 180.09, df = 2, p-value < 2.2e-16

#Actual Vs Fitted
plot(data_series, col='red',lwd=2, main="Actual vs Fitted Plot", xlab='Time in (days)', ylab=y_lab) # plot actual and Fitted model 
lines(fitted(x1_model1), col='blue')

#Test data

x1_test <- ts(testing_data, start =(rows-validation_data_days+1) ) # make testing data in time series and start from rows-6
forecasting_auto_arima <- forecast(x1_model1, h=N_forecasting_days+validation_data_days)
validation_forecast<-head(forecasting_auto_arima$mean,validation_data_days)
MAPE_Per_Day<-round(abs(((testing_data-validation_forecast)/testing_data)*100)  ,3)
paste ("MAPE % For ",validation_data_days,frequency,"by using bats Model for  ==> ",y_lab, sep=" ")

## [1] "MAPE % For  7 years by using bats Model for  ==>  population in Chelyabinsk"

MAPE_Mean_All<-paste(round(mean(MAPE_Per_Day),3),"% MAPE ",validation_data_days,frequency,y_lab,sep=" ")
MAPE_auto_arima<-paste(round(MAPE_Per_Day,3),"%")
MAPE_auto.arima_Model<-paste(MAPE_Per_Day ,"%")
paste (" MAPE that's Error of Forecasting for ",validation_data_days," days in bats Model for  ==> ",y_lab, sep=" ")

## [1] " MAPE that's Error of Forecasting for  7  days in bats Model for  ==>  population in Chelyabinsk"

paste(MAPE_Mean_All,"%")

## [1] "0.362 % MAPE  7 years population in Chelyabinsk %"

paste ("MAPE that's Error of Forecasting day by day for ",validation_data_days," days in bats Model for  ==> ",y_lab, sep=" ")

## [1] "MAPE that's Error of Forecasting day by day for  7  days in bats Model for  ==>  population in Chelyabinsk"

data.frame(date_auto.arima=validation_dates,validation_data_by_name,actual_data=testing_data,forecasting_auto.arima=validation_forecast,MAPE_auto.arima_Model)

##   date_auto.arima validation_data_by_name actual_data forecasting_auto.arima
## 1      2014-12-31                   среда     1170685                1171392
## 2      2015-12-31                 четверг     1181855                1183164
## 3      2016-12-31                 суббота     1193148                1194936
## 4      2017-12-31             воскресенье     1204517                1206708
## 5      2018-12-31             понедельник     1215994                1218480
## 6      2019-12-31                 вторник     1222108                1230253
## 7      2020-12-31                 четверг     1227795                1242025
##   MAPE_auto.arima_Model
## 1                0.06 %
## 2               0.111 %
## 3                0.15 %
## 4               0.182 %
## 5               0.204 %
## 6               0.666 %
## 7               1.159 %

data.frame(FD,forecating_date=forecasting_data_by_name,forecasting_by_auto.arima=tail(forecasting_auto_arima$mean,N_forecasting_days))

##           FD forecating_date forecasting_by_auto.arima
## 1 2021-12-31         пятница                   1253797
## 2 2022-12-31         суббота                   1265569
## 3 2023-12-31     воскресенье                   1277341
## 4 2024-12-31         вторник                   1289113
## 5 2025-12-31           среда                   1300885
## 6 2026-12-31         четверг                   1312657
## 7 2027-12-31         пятница                   1324429

plot(forecasting_auto_arima)
x1_test <- ts(testing_data, start =(rows-validation_data_days+1) )
lines(x1_test, col='red',lwd=2)

graph4<-autoplot(forecasting_auto_arima,xlab = paste ("Time in  ", frequency ,y_lab , sep=" "),  col.main="black", col.lab="blue", col.sub="black", cex.main=1, cex.lab=1, cex.sub=1,font.main=4, font.lab=4, ylab=y_lab)
graph4

## Error of forecasting
Error_auto.arima<-abs(testing_data-validation_forecast)  # Absolute error of forecast (AEOF)
REOF_A_auto.arima<-abs(((testing_data-validation_forecast)/testing_data)*100)  #Relative error of forecast (divided by actual)(REOF_A)
REOF_F_auto.arima<-abs(((testing_data-validation_forecast)/validation_forecast)*100)  #Relative error of forecast (divided by forecast)(REOF_F)
correlation_auto.arima<-cor(testing_data,validation_forecast, method = c("pearson"))     # correlation coefficient between predicted and actual values 
RMSE_auto.arima<-sqrt(sum((Error_auto.arima^2))/validation_data_days)   #  Root mean square forecast error
MAD_auto.arima<-abs((sum(testing_data-validation_forecast))/validation_data_days)   # average forecast accuracy
AEOF_auto.arima<-c(Error_auto.arima)
REOF_auto.arima1<-c(paste(round(REOF_A_auto.arima,3),"%"))
REOF_auto.arima2<-c(paste(round(REOF_F_auto.arima,3),"%"))
data.frame(correlation_auto.arima,RMSE_auto.arima,MAPE_Mean_All,MAD_auto.arima) # analysis of Error  by using Holt's linear model shows result of correlation ,MSE ,MPER

##   correlation_auto.arima RMSE_auto.arima
## 1              0.9924041        6383.138
##                                     MAPE_Mean_All MAD_auto.arima
## 1 0.362 % MAPE  7 years population in Chelyabinsk       4408.063

data.frame(validation_dates,Validation_day_name=validation_data_by_name,AEOF_auto.arima,REOF_A_auto.arima=REOF_auto.arima1,REOF_F_auto.arima=REOF_auto.arima2)   # Analysis of error shows result AEOF,REOF_A,REOF_F

##   validation_dates Validation_day_name AEOF_auto.arima REOF_A_auto.arima
## 1       2014-12-31               среда        707.0871            0.06 %
## 2       2015-12-31             четверг       1309.1741           0.111 %
## 3       2016-12-31             суббота       1788.2612            0.15 %
## 4       2017-12-31         воскресенье       2191.3483           0.182 %
## 5       2018-12-31         понедельник       2486.4353           0.204 %
## 6       2019-12-31             вторник       8144.5224           0.666 %
## 7       2020-12-31             четверг      14229.6094           1.159 %
##   REOF_F_auto.arima
## 1            0.06 %
## 2           0.111 %
## 3            0.15 %
## 4           0.182 %
## 5           0.204 %
## 6           0.662 %
## 7           1.146 %

# Choose Best model by least error

paste("System Summarizes  Error ==> ( MAPE ) of Forecasting  by using bats model and BATS Model, Holt's Linear Models , and autoarima for  ==> ", y_lab , sep=" ")

## [1] "System Summarizes  Error ==> ( MAPE ) of Forecasting  by using bats model and BATS Model, Holt's Linear Models , and autoarima for  ==>  population in Chelyabinsk"

M1<-mean(REOF_A_bats)

paste("System Summarizes  Error ==> ( MAPE ) of Forecasting  by using TBATS  Model For ==> ", y_lab , sep=" ")

## [1] "System Summarizes  Error ==> ( MAPE ) of Forecasting  by using TBATS  Model For ==>  population in Chelyabinsk"

M2<-mean(REOF_A_tbats1)

paste("System Summarizes  Error ==> ( MAPE ) of Forecasting  by using Holt's Linear << Exponential Smoothing >>  For ==> ", y_lab , sep=" ")

## [1] "System Summarizes  Error ==> ( MAPE ) of Forecasting  by using Holt's Linear << Exponential Smoothing >>  For ==>  population in Chelyabinsk"

M3<-REOF_A_Holt11

paste("System Summarizes  Error ==> ( MAPE ) of Forecasting  by using auto arima  Model For ==> ", y_lab , sep=" ")

## [1] "System Summarizes  Error ==> ( MAPE ) of Forecasting  by using auto arima  Model For ==>  population in Chelyabinsk"

M4<-mean(REOF_A_auto.arima)

paste("System Summarizes  Error ==> ( MAPE ) of Forecasting  by using autoarima  Model For ==> ", y_lab , sep=" ")

## [1] "System Summarizes  Error ==> ( MAPE ) of Forecasting  by using autoarima  Model For ==>  population in Chelyabinsk"

data.frame(validation_dates,forecating_date=forecasting_data_by_name,MAPE_bats_error=REOF_A_bats,MAPE_TBATS_error=REOF_A_tbats1,MAPE_Holt_error=REOF_A_Holt1,MAPE_autoarima_error = REOF_A_auto.arima)

##   validation_dates forecating_date MAPE_bats_error MAPE_TBATS_error
## 1       2014-12-31         пятница      0.01273841       0.02627027
## 2       2015-12-31         суббота      0.01788663       0.06186866
## 3       2016-12-31     воскресенье      0.08535721       0.06219303
## 4       2017-12-31         вторник      0.18391786       0.03681494
## 5       2018-12-31           среда      0.31452894       0.20551157
## 6       2019-12-31         четверг      0.02997438       0.69518267
## 7       2020-12-31         пятница      0.26327836       1.09186714
##   MAPE_Holt_error MAPE_autoarima_error
## 1      0.01898324           0.06039943
## 2      0.05491107           0.11077282
## 3      0.10858102           0.14987757
## 4      0.17536914           0.18192755
## 5      0.25731754           0.20447760
## 6      0.09229778           0.66643229
## 7      0.46626169           1.15895646

recommend_Model<-c(M1,M2,M3,M4)
best_recommended_model<-min(recommend_Model)
paste ("lodaing .....   ... . .Select Minimum MAPE from Models for select best Model ==> ", y_lab , sep=" ")

## [1] "lodaing .....   ... . .Select Minimum MAPE from Models for select best Model ==>  population in Chelyabinsk"

best_recommended_model

## [1] 0.1296688

paste ("Best Model For Forecasting  ==> ",y_lab, sep=" ")

## [1] "Best Model For Forecasting  ==>  population in Chelyabinsk"

if(best_recommended_model >= M1) {paste("System Recommend Bats Model That's better  For forecasting==> ",y_lab, sep=" ")}

## [1] "System Recommend Bats Model That's better  For forecasting==>  population in Chelyabinsk"

if(best_recommended_model >= M2) {paste("System Recommend  That's better TBATS  For forecasting ==> ",y_lab, sep=" ")}
if(best_recommended_model >= M3) {paste("System Recommend Holt's Linear Model < Exponential Smoothing Model >   That's better  For forecasting ==> ",y_lab, sep=" ")}
if(best_recommended_model >= M4) {paste("System Recommend auto arima Model  That's better  For forecasting ==> ",y_lab, sep=" ")}

message("System finished Forecasting  by using autoarima and Holt's ,and TBATS Model ==>",y_lab, sep=" ")

## System finished Forecasting  by using autoarima and Holt's ,and TBATS Model ==>population in Chelyabinsk

message(" Thank you for using our System For Modelling  ==> ",y_lab, sep=" ")

##  Thank you for using our System For Modelling  ==> population in Chelyabinsk