Research on the opportunities of automatic parameter selection in ARIMA models for short-term forecasting of COVID-19 cases

This is an algorithm developed to make auto ARIMA easy to use

By

Makarovskikh Tatyana Anatolyevna “Макаровских Татьяна Анатольевна”

Abotaleb Mostafa “Аботалеб Мостафа”

Department of Electrical Engineering and Computer Science

South Ural State University, Chelyabinsk, Russian Federation

# Imports
# Imports
library(fpp2)

## Warning: package 'fpp2' was built under R version 4.0.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## -- Attaching packages ------------------------------------------------------------------------------ fpp2 2.4 --

## v ggplot2   3.3.2     v fma       2.4  
## v forecast  8.13      v expsmooth 2.3

## Warning: package 'ggplot2' was built under R version 4.0.3

## Warning: package 'forecast' was built under R version 4.0.3

##

library(forecast)
library(ggplot2)
library("readxl")

## Warning: package 'readxl' was built under R version 4.0.3

library(moments)

## Warning: package 'moments' was built under R version 4.0.3

library(forecast)
require(forecast)  
require(tseries)

## Loading required package: tseries

## Warning: package 'tseries' was built under R version 4.0.3

require(markovchain)

## Loading required package: markovchain

## Warning: package 'markovchain' was built under R version 4.0.3

## Package:  markovchain
## Version:  0.8.5-3
## Date:     2020-12-03
## BugReport: https://github.com/spedygiorgio/markovchain/issues

require(data.table)

## Loading required package: data.table

# ---- User inputs ----
# NOTE(review): absolute, machine-specific path; the sheet name ends with a
# space -- presumably that matches the workbook, confirm before changing it.
Full_original_data <- read_excel("F:/Phd/ALL Russia Analysis/covidActualTS.xlsx", sheet = "Chelyabinsk ")
y_lab <- "COVID 19 Infection cases in Chelyabinsk "   # input name of data (pasted into labels and messages)
Actual_date_interval <- c("2020/03/12", "2020/10/27")   # first and last date of the observed data
Forecast_date_interval <- c("2020/10/28", "2020/11/5")  # first and last date to forecast
validation_data_days <- 11   # number of trailing observations held out for validation
frequency <- "days"          # step of the date sequences below; also pasted into axis labels

# Data Preparation & calculate some of statistics measures
original_data <- Full_original_data$Infected

summary(original_data)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   898.5  7795.5  7720.8 13666.5 18393.0

sd(original_data)  # calculate standard deviation

## [1] 6300.531

skewness(original_data)  # calculate coefficient of skewness

## [1] 0.09264073

kurtosis(original_data)   # calculate coefficient of kurtosis

## [1] 1.462645

rows <- NROW(original_data)
# Split into training / validation parts. head()/tail() give the same result
# as the index ranges 1:(rows - n) and (rows - n + 1):rows, but do not produce
# descending ranges when the hold-out size reaches 0 or `rows`.
training_data <- head(original_data, rows - validation_data_days)
testing_data <- tail(original_data, validation_data_days)
AD <- fulldate <- seq(as.Date(Actual_date_interval[1]), as.Date(Actual_date_interval[2]), by = frequency)  #input range for actual date
FD <- seq(as.Date(Forecast_date_interval[1]), as.Date(Forecast_date_interval[2]), by = frequency)  #input range forecasting date
N_forecasting_days <- length(FD)   # number of dates to forecast
validation_dates <- tail(AD, validation_data_days)
validation_data_by_name <- weekdays(validation_dates)
forecasting_data_by_name <- weekdays(FD)
data_series <- ts(training_data)   # only the training part is modelled
#plot  COVID 19 infection cases in Chelyabinsk
autoplot(data_series, xlab = paste("Time in  ", frequency, sep = " "), ylab = y_lab, main = paste("Actual Data :", y_lab, sep = " "))

#Auto arima model
##################

library(tseries) # tseries provides the stationarity tests used below (kpss.test, pp.test, adf.test)
paste("tests For Check Stationarity in series  ==> ", y_lab, sep = " ")

## [1] "tests For Check Stationarity in series  ==>  COVID 19 Infection cases in Chelyabinsk "

kpss.test(data_series) # apply KPSS test

## Warning in kpss.test(data_series): p-value smaller than printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  data_series
## KPSS Level = 4.4678, Truncation lag parameter = 4, p-value = 0.01

pp.test(data_series)   # apply Phillips-Perron test

## 
##  Phillips-Perron Unit Root Test
## 
## data:  data_series
## Dickey-Fuller Z(alpha) = -4.3559, Truncation lag parameter = 4, p-value
## = 0.8656
## alternative hypothesis: stationary

adf.test(data_series)  # apply augmented Dickey-Fuller test

## 
##  Augmented Dickey-Fuller Test
## 
## data:  data_series
## Dickey-Fuller = -3.1872, Lag order = 6, p-value = 0.09118
## alternative hypothesis: stationary

ndiffs(data_series)    # estimate how many differences are needed (does not difference the data itself)

## [1] 2

##Taking the first difference
diff1_x1 <- diff(data_series)
autoplot(diff1_x1, xlab = paste("Time in  ", frequency, y_lab, sep = " "), ylab = y_lab, main = "1st differenced series")

##Testing the stationarity of the first differenced series
paste("tests For Check Stationarity in series after taking first differences in  ==> ", y_lab, sep = " ")

## [1] "tests For Check Stationarity in series after taking first differences in  ==>  COVID 19 Infection cases in Chelyabinsk "

kpss.test(diff1_x1)   # apply KPSS test after taking first differences

## Warning in kpss.test(diff1_x1): p-value smaller than printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  diff1_x1
## KPSS Level = 1.5507, Truncation lag parameter = 4, p-value = 0.01

pp.test(diff1_x1)     # apply Phillips-Perron test after taking first differences

## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff1_x1
## Dickey-Fuller Z(alpha) = -7.6935, Truncation lag parameter = 4, p-value
## = 0.6768
## alternative hypothesis: stationary

adf.test(diff1_x1)    # apply augmented Dickey-Fuller test after taking first differences

## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff1_x1
## Dickey-Fuller = -1.3882, Lag order = 6, p-value = 0.8327
## alternative hypothesis: stationary

#Taking the second difference
diff2_x1 <- diff(diff1_x1)
autoplot(diff2_x1, xlab = paste("Time in  ", frequency, y_lab, sep = " "), ylab = y_lab, main = "2nd differenced series")

##Testing the stationarity of the second differenced series
paste("tests For Check Stationarity in series after taking Second differences in", y_lab, sep = " ")

## [1] "tests For Check Stationarity in series after taking Second differences in COVID 19 Infection cases in Chelyabinsk "

kpss.test(diff2_x1)   # apply KPSS test after taking second differences

## Warning in kpss.test(diff2_x1): p-value greater than printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  diff2_x1
## KPSS Level = 0.12227, Truncation lag parameter = 4, p-value = 0.1

pp.test(diff2_x1)     # apply Phillips-Perron test after taking second differences

## Warning in pp.test(diff2_x1): p-value smaller than printed p-value

## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff2_x1
## Dickey-Fuller Z(alpha) = -240.46, Truncation lag parameter = 4, p-value
## = 0.01
## alternative hypothesis: stationary

adf.test(diff2_x1)    # apply augmented Dickey-Fuller test after taking second differences

## Warning in adf.test(diff2_x1): p-value smaller than printed p-value

## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff2_x1
## Dickey-Fuller = -8.035, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary

####Fitting an ARIMA Model
#1. Using auto arima function
# stepwise = FALSE / approximation = FALSE: every candidate listed in the trace
# below is fitted rather than following the stepwise shortcut.
# `test` takes a single unit-root test name; passing the whole vector
# c("kpss", "adf", "pp") resolved to its first element, so "kpss" is now
# written out explicitly (same behaviour, no false impression that all three
# tests are applied). TRUE replaces the reassignable shorthand T.
model1 <- auto.arima(data_series, stepwise = FALSE, approximation = FALSE, trace = TRUE, test = "kpss")  # applying auto arima

## 
##  ARIMA(0,2,0)                    : 1797.84
##  ARIMA(0,2,1)                    : 1783.551
##  ARIMA(0,2,2)                    : 1784.025
##  ARIMA(0,2,3)                    : 1779.896
##  ARIMA(0,2,4)                    : 1780.11
##  ARIMA(0,2,5)                    : 1778.563
##  ARIMA(1,2,0)                    : 1785.471
##  ARIMA(1,2,1)                    : 1781.27
##  ARIMA(1,2,2)                    : 1782.888
##  ARIMA(1,2,3)                    : 1781.297
##  ARIMA(1,2,4)                    : 1778.875
##  ARIMA(2,2,0)                    : 1786.935
##  ARIMA(2,2,1)                    : 1782.608
##  ARIMA(2,2,2)                    : 1784.25
##  ARIMA(2,2,3)                    : 1781.725
##  ARIMA(3,2,0)                    : 1787.066
##  ARIMA(3,2,1)                    : 1782.283
##  ARIMA(3,2,2)                    : 1781.973
##  ARIMA(4,2,0)                    : 1775.63
##  ARIMA(4,2,1)                    : 1777.185
##  ARIMA(5,2,0)                    : 1777.583
## 
## 
## 
##  Best model: ARIMA(4,2,0)

model1 # show the result of auto arima

## Series: data_series 
## ARIMA(4,2,0) 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4
##       -0.2946  -0.0965  -0.1608  -0.2438
## s.e.   0.0656   0.0678   0.0675   0.0652
## 
## sigma^2 estimated as 203.2:  log likelihood=-882.67
## AIC=1775.35   AICc=1775.63   BIC=1792.24
# Build a printable label for a fitted ARIMA model.
#
# Arguments:
#   object   fitted model; its `arma`, `coef` and `xreg` components are read.
#   padding  if TRUE the label keeps its trailing blanks (fixed-width output);
#            if FALSE (default) trailing blanks are stripped.
# Value:
#   A single character string, e.g. "ARIMA(4,2,0)" or
#   "ARIMA(1,0,1)(1,1,0)[12]", optionally followed by a mean/drift note, or
#   wrapped as "Regression with ... errors" when regressors are present.
arima.string <- function(object, padding = FALSE) {
  # Reorder `arma` so positions 1-3 are the values printed inside "ARIMA(.,.,.)",
  # 4-6 the values printed in the seasonal part, and 7 the value in "[.]".
  spec <- object$arma[c(1, 6, 2, 3, 7, 4, 5)]
  period <- spec[7]
  seasonal_total <- sum(spec[4:6])
  coef_names <- names(object$coef)

  label <- paste0("ARIMA(", spec[1], ",", spec[2], ",", spec[3], ")")

  if (period > 1 && seasonal_total > 0) {
    # Seasonal part is shown only when it has at least one non-zero entry.
    label <- paste0(label, "(", spec[4], ",", spec[5], ",", spec[6], ")[", period, "]")
  }

  if (padding && period > 1 && seasonal_total == 0) {
    # Blank filler standing in for the omitted seasonal part; its width
    # depends on how many digits the period has.
    filler <- if (period <= 9) {
      " "
    } else if (period <= 99) {
      "  "
    } else {
      "   "
    }
    label <- paste0(label, "         ", filler)
  }

  if (is.null(object$xreg)) {
    note <- if ("constant" %in% coef_names || "intercept" %in% coef_names) {
      "with non-zero mean"
    } else if (spec[2] == 0 && spec[5] == 0) {
      "with zero mean    "
    } else {
      "                  "
    }
    label <- paste(label, note)
  } else if (NCOL(object$xreg) == 1 && "drift" %in% coef_names) {
    label <- paste(label, "with drift        ")
  } else {
    label <- paste("Regression with", label, "errors")
  }

  if (padding) {
    label
  } else {
    gsub("[ ]*$", "", label)
  }
}

# NOTE(review): kept because the contents of this file are not visible here;
# presumably it only (re)defines arima.string -- confirm, then drop the line.
source("stringthearima.R")
# Read the selected (p, d, q) straight from the fitted model instead of parsing
# characters 7-11 of its printed label: the fixed-width substring truncated any
# order with a two-digit term (e.g. "ARIMA(10,2,0)" yielded "10,2," and an NA
# for q). `arma` stores (p, q, P, Q, period, d, D), so c(1, 6, 2) is (p, d, q).
bestmodel <- as.integer(model1$arma[c(1, 6, 2)])
bestmodel

## [1] 4 2 0

library(forecast)   # load library forecast (accuracy() below)
# Refit the selected order with stats::arima. Only the non-seasonal order is
# carried over.
x1_model1 <- arima(data_series, order = bestmodel) # Run best model of auto arima for forecasting
x1_model1  # Show result of best model of auto arima

## 
## Call:
## arima(x = data_series, order = bestmodel)
## 
## Coefficients:
##           ar1      ar2      ar3      ar4
##       -0.2946  -0.0965  -0.1608  -0.2438
## s.e.   0.0656   0.0678   0.0675   0.0652
## 
## sigma^2 estimated as 199.5:  log likelihood = -882.67,  aic = 1775.35

paste("accuracy of autoarima Model For  ==> ", y_lab, sep = " ")

## [1] "accuracy of autoarima Model For  ==>  COVID 19 Infection cases in Chelyabinsk "

accuracy(x1_model1)  # accuracy of best model from auto arima

##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.8067865 14.05949 8.512275 0.7514777 3.240755 0.1083733
##                     ACF1
## Training set 0.002178755

# NOTE(review): this prints NULL (see the output below) -- the fitted object
# has no `x` component, so the line shows nothing useful.
x1_model1$x

## NULL

# Residual diagnostics of the refitted model (plots plus the Ljung-Box test printed below)
checkresiduals(x1_model1, xlab = paste("Time in  ", frequency, y_lab, sep = " "), col.main = "black", col.lab = "blue", col.sub = "black", cex.main = 1, cex.lab = 1, cex.sub = 1, font.main = 4, font.lab = 4, ylab = y_lab)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(4,2,0)
## Q* = 2.2434, df = 6, p-value = 0.896
## 
## Model df: 4.   Total lags used: 10

paste("Box-Ljung test , Ljung-Box test For Modelling for   ==> ", y_lab, sep = " ")

## [1] "Box-Ljung test , Ljung-Box test For Modelling for   ==>  COVID 19 Infection cases in Chelyabinsk "

# Ljung-Box test applied to the SQUARED residuals (autocorrelation in the squares), 20 lags
Box.test(x1_model1$residuals^2, lag = 20, type = "Ljung-Box")

## 
##  Box-Ljung test
## 
## data:  x1_model1$residuals^2
## X-squared = 53.207, df = 20, p-value = 7.578e-05

library(tseries)   # already attached earlier in the script; kept so this section also runs on its own
jarque.bera.test(x1_model1$residuals)  # Jarque-Bera normality test on the residuals

## 
##  Jarque Bera Test
## 
## data:  x1_model1$residuals
## X-squared = 553.5, df = 2, p-value < 2.2e-16

#Actual Vs Fitted
# NOTE(review): the 1x2 layout is not reset afterwards, so the next base plot
# lands in the second panel -- confirm that this is intended.
par(mfrow = c(1, 2))
plot(data_series, col = 'red', lwd = 2, main = "Actual vs Fitted Plot", xlab = 'Time in (days)', ylab = y_lab) # actual series in red
lines(fitted(x1_model1), col = 'blue')   # fitted values overlaid in blue

#Test data

# Hold-out observations as a time series whose index continues right after the training data
x1_test <- ts(testing_data, start = (rows - validation_data_days + 1))
# One forecast run covers the validation window first, then the future dates
forecasting_auto_arima <- forecast(x1_model1, h = N_forecasting_days + validation_data_days)
validation_forecast <- head(forecasting_auto_arima$mean, validation_data_days)
# Absolute percentage error per validation day, relative to the actual value
MAPE_Per_Day <- round(abs(((testing_data - validation_forecast) / testing_data) * 100), 3)
# The three messages below used to name the "bats Model" (copy-paste slip);
# the model evaluated here is the auto.arima one.
paste("MAPE % For ", validation_data_days, frequency, "by using auto.arima Model for  ==> ", y_lab, sep = " ")

## [1] "MAPE % For  11 days by using auto.arima Model for  ==>  COVID 19 Infection cases in Chelyabinsk "

MAPE_Mean_All <- paste(round(mean(MAPE_Per_Day), 3), "% MAPE ", validation_data_days, frequency, y_lab, sep = " ")
MAPE_auto_arima <- paste(round(MAPE_Per_Day, 3), "%")   # not used below; kept in case other code reads it
MAPE_auto.arima_Model <- paste(MAPE_Per_Day, "%")
paste(" MAPE that's Error of Forecasting for ", validation_data_days, " days in auto.arima Model for  ==> ", y_lab, sep = " ")

## [1] " MAPE that's Error of Forecasting for  11  days in auto.arima Model for  ==>  COVID 19 Infection cases in Chelyabinsk "

# MAPE_Mean_All already contains the "%" sign; pasting another one produced a stray trailing "%"
MAPE_Mean_All

## [1] "0.31 % MAPE  11 days COVID 19 Infection cases in Chelyabinsk "

paste("MAPE that's Error of Forecasting day by day for ", validation_data_days, " days in auto.arima Model for  ==> ", y_lab, sep = " ")

## [1] "MAPE that's Error of Forecasting day by day for  11  days in auto.arima Model for  ==>  COVID 19 Infection cases in Chelyabinsk "

# Validation table: date, weekday, actual value, forecast, percentage error
data.frame(date_auto.arima = validation_dates, validation_data_by_name, actual_data = testing_data, forecasting_auto.arima = validation_forecast, MAPE_auto.arima_Model)

##    date_auto.arima validation_data_by_name actual_data forecasting_auto.arima
## 1       2020-10-17                Saturday       17223               17223.78
## 2       2020-10-18                  Sunday       17325               17322.90
## 3       2020-10-19                  Monday       17427               17420.61
## 4       2020-10-20                 Tuesday       17526               17517.87
## 5       2020-10-21               Wednesday       17630               17615.96
## 6       2020-10-22                Thursday       17740               17714.48
## 7       2020-10-23                  Friday       17857               17813.21
## 8       2020-10-24                Saturday       17986               17911.82
## 9       2020-10-25                  Sunday       18122               18010.17
## 10      2020-10-26                  Monday       18255               18108.47
## 11      2020-10-27                 Tuesday       18393               18206.77
##    MAPE_auto.arima_Model
## 1                0.005 %
## 2                0.012 %
## 3                0.037 %
## 4                0.046 %
## 5                 0.08 %
## 6                0.144 %
## 7                0.245 %
## 8                0.412 %
## 9                0.617 %
## 10               0.803 %
## 11               1.012 %

# Forecast table for the future dates (the last N_forecasting_days of the forecast run)
data.frame(FD, forecating_date = forecasting_data_by_name, forecasting_by_auto.arima = tail(forecasting_auto_arima$mean, N_forecasting_days))

##           FD forecating_date forecasting_by_auto.arima
## 1 2020-10-28       Wednesday                  18305.15
## 2 2020-10-29        Thursday                  18403.58
## 3 2020-10-30          Friday                  18502.00
## 4 2020-10-31        Saturday                  18600.40
## 5 2020-11-01          Sunday                  18698.79
## 6 2020-11-02          Monday                  18797.17
## 7 2020-11-03         Tuesday                  18895.56
## 8 2020-11-04       Wednesday                  18993.95
## 9 2020-11-05        Thursday                  19092.34

plot(forecasting_auto_arima)
lines(x1_test, col = 'red', lwd = 2)   # overlay the held-out actual values (x1_test from above)

graph4 <- autoplot(forecasting_auto_arima, xlab = paste("Time in  ", frequency, y_lab, sep = " "), col.main = "black", col.lab = "blue", col.sub = "black", cex.main = 1, cex.lab = 1, cex.sub = 1, font.main = 4, font.lab = 4, ylab = y_lab)
graph4

## Error of forecasting
Error_auto.arima <- abs(testing_data - validation_forecast)  # Absolute error of forecast (AEOF)
REOF_A_auto.arima <- abs(((testing_data - validation_forecast) / testing_data) * 100)  # Relative error of forecast (divided by actual) (REOF_A)
REOF_F_auto.arima <- abs(((testing_data - validation_forecast) / validation_forecast) * 100)  # Relative error of forecast (divided by forecast) (REOF_F)
correlation_auto.arima <- cor(testing_data, validation_forecast, method = "pearson")     # correlation coefficient between predicted and actual values
RMSE_auto.arima <- sqrt(sum((Error_auto.arima^2)) / validation_data_days)   # Root mean square forecast error
# Mean absolute deviation: average of the ABSOLUTE errors. The previous form,
# abs(sum(actual - forecast) / n), let errors of opposite sign cancel and so
# reported the absolute mean error (bias) instead.
MAD_auto.arima <- sum(Error_auto.arima) / validation_data_days
AEOF_auto.arima <- c(Error_auto.arima)
REOF_auto.arima1 <- c(paste(round(REOF_A_auto.arima, 3), "%"))
REOF_auto.arima2 <- c(paste(round(REOF_F_auto.arima, 3), "%"))
data.frame(correlation_auto.arima, RMSE_auto.arima, MAPE_Mean_All, MAD_auto.arima) # error summary of the auto.arima forecast: correlation, RMSE, mean MAPE, MAD

##   correlation_auto.arima RMSE_auto.arima
## 1              0.9979072        83.68786
##                                                   MAPE_Mean_All MAD_auto.arima
## 1 0.31 % MAPE  11 days COVID 19 Infection cases in Chelyabinsk        56.31999

data.frame(validation_dates, Validation_day_name = validation_data_by_name, AEOF_auto.arima, REOF_A_auto.arima = REOF_auto.arima1, REOF_F_auto.arima = REOF_auto.arima2)   # Analysis of error shows result AEOF,REOF_A,REOF_F

##    validation_dates Validation_day_name AEOF_auto.arima REOF_A_auto.arima
## 1        2020-10-17            Saturday       0.7812086           0.005 %
## 2        2020-10-18              Sunday       2.0965450           0.012 %
## 3        2020-10-19              Monday       6.3910309           0.037 %
## 4        2020-10-20             Tuesday       8.1312243           0.046 %
## 5        2020-10-21           Wednesday      14.0394516            0.08 %
## 6        2020-10-22            Thursday      25.5174573           0.144 %
## 7        2020-10-23              Friday      43.7853963           0.245 %
## 8        2020-10-24            Saturday      74.1818364           0.412 %
## 9        2020-10-25              Sunday     111.8327297           0.617 %
## 10       2020-10-26              Monday     146.5349362           0.803 %
## 11       2020-10-27             Tuesday     186.2280264           1.012 %
##    REOF_F_auto.arima
## 1            0.005 %
## 2            0.012 %
## 3            0.037 %
## 4            0.046 %
## 5             0.08 %
## 6            0.144 %
## 7            0.246 %
## 8            0.414 %
## 9            0.621 %
## 10           0.809 %
## 11           1.023 %