Using auto arima for forecasting Deaths of COVID-19 in Chelyabinsk

Makarovskikh Tatyana Anatolyevna “Макаровских Татьяна Анатольевна”

Abotaleb Mostafa “Аботалеб Мостафа”

Department of Electrical Engineering and Computer Science

South Ural State University, Chelyabinsk, Russian Federation

# Imports
# Imports
library(fpp2)

## Warning: package 'fpp2' was built under R version 4.0.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## -- Attaching packages ------------------------------------------------------------------------------ fpp2 2.4 --

## v ggplot2   3.3.2     v fma       2.4  
## v forecast  8.13      v expsmooth 2.3

## Warning: package 'ggplot2' was built under R version 4.0.3

## Warning: package 'forecast' was built under R version 4.0.3

##

library(forecast)
library(ggplot2)
library("readxl")

## Warning: package 'readxl' was built under R version 4.0.3

library(moments)

## Warning: package 'moments' was built under R version 4.0.3

library(forecast)
require(forecast)  
require(tseries)

## Loading required package: tseries

## Warning: package 'tseries' was built under R version 4.0.3

require(markovchain)

## Loading required package: markovchain

## Warning: package 'markovchain' was built under R version 4.0.3

## Package:  markovchain
## Version:  0.8.5-3
## Date:     2020-12-03
## BugReport: https://github.com/spedygiorgio/markovchain/issues

require(data.table)

## Loading required package: data.table

# ---- Run configuration ----
# Source workbook and sheet. The sheet name is passed exactly as written,
# including its trailing space.
Full_original_data <- read_excel(
  "F:/Phd/ALL Russia Analysis/covidActualTS.xlsx",
  sheet = "Chelyabinsk "
)
# Label used for plot axes/titles and in the printed messages.
y_lab <- "COVID 19 Deaths cases in Chelyabinsk  "
# First and last calendar date of the observed series.
Actual_date_interval <- c("2020/03/12", "2020/10/27")
# First and last calendar date of the out-of-sample forecast window.
Forecast_date_interval <- c("2020/10/28", "2020/11/5")
# Number of trailing observations held out for validation.
validation_data_days <- 11
# Step between observations; later passed to seq() when building date axes.
frequency <- "days"

# Data preparation and descriptive statistics of the raw series
# The `Death` column of the workbook sheet is the series analysed below.
original_data<-Full_original_data$Death

summary(original_data)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    3.00   88.00   65.16  104.00  159.00

sd(original_data)  # standard deviation of the series

## [1] 53.3066

skewness(original_data)  # coefficient of skewness

## [1] 0.003022683

kurtosis(original_data)   # coefficient of kurtosis

## [1] 1.522648

# ---- Train / validation split and date axes ----
# The last `validation_data_days` observations are held out for validation;
# everything before them is used to fit the models.
rows <- NROW(original_data)
# Fail early on an impossible hold-out size instead of silently mis-slicing.
stopifnot(
  is.numeric(validation_data_days),
  length(validation_data_days) == 1,
  validation_data_days >= 0,
  validation_data_days < rows
)
# head()/tail() rather than 1:(rows - k) and (rows - k + 1):rows: the colon
# form counts backwards when k is 0 (or k >= rows) and then picks wrong rows.
training_data <- head(original_data, rows - validation_data_days)
testing_data <- tail(original_data, validation_data_days)

# Calendar dates of the observed series (also kept under the name `fulldate`).
AD <- fulldate <- seq(
  as.Date(Actual_date_interval[1]),
  as.Date(Actual_date_interval[2]),
  by = frequency
)
# Calendar dates of the out-of-sample forecast window.
FD <- seq(
  as.Date(Forecast_date_interval[1]),
  as.Date(Forecast_date_interval[2]),
  by = frequency
)
N_forecasting_days <- length(FD)

# Dates and weekday names for the validation and forecast windows.
validation_dates <- tail(AD, validation_data_days)
validation_data_by_name <- weekdays(validation_dates)
forecasting_data_by_name <- weekdays(FD)

# Training series as a ts object (default start = 1, frequency = 1).
data_series <- ts(training_data)

# Plot the training part of the series named by `y_lab`.
autoplot(
  data_series,
  xlab = paste("Time in  ", frequency, sep = " "),
  ylab = y_lab,
  main = paste("Actual Data :", y_lab, sep = " ")
)

# Auto ARIMA model
##################
# Step 1: check the training series for stationarity before fitting.

require(tseries) # tseries supplies the stationarity tests used below
paste ("tests For Check Stationarity in series  ==> ",y_lab, sep=" ")

## [1] "tests For Check Stationarity in series  ==>  COVID 19 Deaths cases in Chelyabinsk  "

kpss.test(data_series) # apply the KPSS test to the training series

## Warning in kpss.test(data_series): p-value smaller than printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  data_series
## KPSS Level = 4.3275, Truncation lag parameter = 4, p-value = 0.01

pp.test(data_series)   # apply the Phillips-Perron test (alternative hypothesis: stationary)

## 
##  Phillips-Perron Unit Root Test
## 
## data:  data_series
## Dickey-Fuller Z(alpha) = -4.3196, Truncation lag parameter = 4, p-value
## = 0.8677
## alternative hypothesis: stationary

adf.test(data_series)  # apply the augmented Dickey-Fuller test (alternative hypothesis: stationary)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  data_series
## Dickey-Fuller = -2.1162, Lag order = 6, p-value = 0.5272
## alternative hypothesis: stationary

ndiffs(data_series)    # report how many differences are suggested; the series itself is not modified here

## [1] 1

## Take the first difference of the training series and plot it
diff1_x1<-diff(data_series)
autoplot(diff1_x1, xlab = paste ("Time in  ", frequency ,y_lab , sep=" "), ylab=y_lab,main = "1nd differenced series")

## Test the stationarity of the first-differenced series
paste ("tests For Check Stationarity in series after taking first differences in  ==> ",y_lab, sep=" ")

## [1] "tests For Check Stationarity in series after taking first differences in  ==>  COVID 19 Deaths cases in Chelyabinsk  "

kpss.test(diff1_x1)   # apply the KPSS test to the first differences

## 
##  KPSS Test for Level Stationarity
## 
## data:  diff1_x1
## KPSS Level = 0.39903, Truncation lag parameter = 4, p-value = 0.07757

pp.test(diff1_x1)     # apply the Phillips-Perron test to the first differences

## Warning in pp.test(diff1_x1): p-value smaller than printed p-value

## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff1_x1
## Dickey-Fuller Z(alpha) = -217.64, Truncation lag parameter = 4, p-value
## = 0.01
## alternative hypothesis: stationary

adf.test(diff1_x1)    # apply the augmented Dickey-Fuller test to the first differences

## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff1_x1
## Dickey-Fuller = -3.1228, Lag order = 6, p-value = 0.1049
## alternative hypothesis: stationary

# Take the second difference (difference of the first differences) and plot it
diff2_x1=diff(diff1_x1)
autoplot(diff2_x1, xlab = paste ("Time in  ", frequency ,y_lab , sep=" "), ylab=y_lab ,main = "2nd differenced series")

## Test the stationarity of the second-differenced series
paste ("tests For Check Stationarity in series after taking Second differences in",y_lab, sep=" ")

## [1] "tests For Check Stationarity in series after taking Second differences in COVID 19 Deaths cases in Chelyabinsk  "

kpss.test(diff2_x1)   # apply the KPSS test to the second differences

## Warning in kpss.test(diff2_x1): p-value greater than printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  diff2_x1
## KPSS Level = 0.01464, Truncation lag parameter = 4, p-value = 0.1

pp.test(diff2_x1)     # apply the Phillips-Perron test to the second differences

## Warning in pp.test(diff2_x1): p-value smaller than printed p-value

## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff2_x1
## Dickey-Fuller Z(alpha) = -248.63, Truncation lag parameter = 4, p-value
## = 0.01
## alternative hypothesis: stationary

adf.test(diff2_x1)    # apply the augmented Dickey-Fuller test to the second differences

## Warning in adf.test(diff2_x1): p-value smaller than printed p-value

## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff2_x1
## Dickey-Fuller = -11.461, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary

#### Fitting an ARIMA model
# 1. Search with auto.arima on the training series. stepwise = FALSE and
#    approximation = FALSE request the full, exact search; trace = TRUE prints
#    every candidate model together with its selection criterion.
# `test` selects ONE unit-root test for choosing the differencing order. The
# previous call passed c("kpss", "adf", "pp"), i.e. the argument's own list of
# choices, which resolves to its first entry; "kpss" is written out so the call
# no longer reads as if all three tests were applied.
model1 <- auto.arima(
  data_series,
  stepwise = FALSE,
  approximation = FALSE,
  trace = TRUE,
  test = "kpss"
)

## 
##  ARIMA(0,1,0)                    : 894.3487
##  ARIMA(0,1,0) with drift         : 866.1337
##  ARIMA(0,1,1)                    : 887.0238
##  ARIMA(0,1,1) with drift         : 866.0249
##  ARIMA(0,1,2)                    : 889.0688
##  ARIMA(0,1,2) with drift         : 867.4303
##  ARIMA(0,1,3)                    : 874.5235
##  ARIMA(0,1,3) with drift         : 858.353
##  ARIMA(0,1,4)                    : 875.1446
##  ARIMA(0,1,4) with drift         : 860.029
##  ARIMA(0,1,5)                    : 875.2645
##  ARIMA(0,1,5) with drift         : 860.8915
##  ARIMA(1,1,0)                    : 886.3034
##  ARIMA(1,1,0) with drift         : 866.2097
##  ARIMA(1,1,1)                    : 855.8743
##  ARIMA(1,1,1) with drift         : 862.1341
##  ARIMA(1,1,2)                    : 856.5271
##  ARIMA(1,1,2) with drift         : 864.2279
##  ARIMA(1,1,3)                    : 847.323
##  ARIMA(1,1,3) with drift         : 846.1526
##  ARIMA(1,1,4)                    : 849.1109
##  ARIMA(1,1,4) with drift         : 848.1202
##  ARIMA(2,1,0)                    : 887.1569
##  ARIMA(2,1,0) with drift         : 868.1613
##  ARIMA(2,1,1)                    : 857.1905
##  ARIMA(2,1,1) with drift         : 864.2275
##  ARIMA(2,1,2)                    : 854.7511
##  ARIMA(2,1,2) with drift         : 854.1882
##  ARIMA(2,1,3)                    : 848.1552
##  ARIMA(2,1,3) with drift         : 847.3552
##  ARIMA(3,1,0)                    : 867.0422
##  ARIMA(3,1,0) with drift         : 857.01
##  ARIMA(3,1,1)                    : 847.9264
##  ARIMA(3,1,1) with drift         : 847.1016
##  ARIMA(3,1,2)                    : 849.8887
##  ARIMA(3,1,2) with drift         : 849.1264
##  ARIMA(4,1,0)                    : 861.7194
##  ARIMA(4,1,0) with drift         : 854.9335
##  ARIMA(4,1,1)                    : 849.9125
##  ARIMA(4,1,1) with drift         : 849.1459
##  ARIMA(5,1,0)                    : 860.2871
##  ARIMA(5,1,0) with drift         : 855.1957
## 
## 
## 
##  Best model: ARIMA(1,1,3) with drift

model1 # print the model selected by auto.arima (coefficients, standard errors, sigma^2, AIC/AICc/BIC)

## Series: data_series 
## ARIMA(1,1,3) with drift 
## 
## Coefficients:
##          ar1      ma1      ma2     ma3   drift
##       0.9213  -0.8964  -0.1518  0.2557  0.6597
## s.e.  0.0443   0.0781   0.0959  0.0726  0.2800
## 
## sigma^2 estimated as 2.738:  log likelihood=-416.88
## AIC=845.75   AICc=846.15   BIC=866.06

#Make changes in the source of auto arima to run the best model
# Build a human-readable label for a fitted ARIMA model.
#
# Arguments:
#   object  - fitted model; reads object$arma (compact model specification),
#             object$coef (named coefficients) and object$xreg (regressors).
#   padding - if TRUE, keep the trailing blanks that give every label the same
#             width; if FALSE (default), strip trailing blanks.
# Returns:
#   A single character string such as "ARIMA(1,1,3) with drift",
#   "ARIMA(0,1,1)(0,1,1)[12]" or "Regression with ARIMA(1,0,0) errors".
arima.string <- function(object, padding = FALSE) {
  # Reorder the compact specification into the order the label prints it:
  # positions 1-3 are the non-seasonal triple, 4-6 the seasonal triple,
  # 7 the seasonal period.
  spec <- object$arma[c(1, 6, 2, 3, 7, 4, 5)]
  period <- spec[7]
  seasonal_total <- sum(spec[4:6])
  coef_names <- names(object$coef)

  label <- paste0("ARIMA(", spec[1], ",", spec[2], ",", spec[3], ")")

  if (period > 1 && seasonal_total > 0) {
    # Seasonal part present: append "(P,D,Q)[m]".
    label <- paste0(
      label, "(", spec[4], ",", spec[5], ",", spec[6], ")[", period, "]"
    )
  } else if (padding && period > 1 && seasonal_total == 0) {
    # Seasonal period but no seasonal terms: pad with blanks to the width the
    # "(P,D,Q)[m]" part would have taken (9 plus 1-3 for the period digits).
    period_width <- if (period <= 9) {
      1
    } else if (period <= 99) {
      2
    } else {
      3
    }
    label <- paste0(label, strrep(" ", 9 + period_width))
  }

  if (is.null(object$xreg)) {
    has_mean <- any(c("constant", "intercept") %in% coef_names)
    suffix <- if (has_mean) {
      "with non-zero mean"
    } else if (spec[2] == 0 && spec[5] == 0) {
      "with zero mean    "
    } else {
      strrep(" ", 18)
    }
    label <- paste(label, suffix)
  } else if (NCOL(object$xreg) == 1 && "drift" %in% coef_names) {
    # A single regressor whose coefficient is named "drift".
    label <- paste(label, "with drift        ")
  } else {
    label <- paste("Regression with", label, "errors")
  }

  if (padding) {
    label
  } else {
    gsub("[ ]*$", "", label)
  }
}

# Kept from the original script: sources an external helper file. Its contents
# are not visible here (presumably it defines arima.string()); verify before
# removing this line.
source("stringthearima.R")
# Non-seasonal order of the model chosen by auto.arima, read directly from the
# fitted object. Positions 1, 6 and 2 of `arma` are the three numbers that
# arima.string() prints inside "ARIMA(., ., .)". Reading them directly avoids
# cutting characters 7-11 out of the printed label, which only works while
# every order is a single digit.
bestmodel <- as.integer(model1$arma[c(1, 6, 2)])
bestmodel

## [1] 1 1 3

# Leftover debugging line, kept commented out: strtoi(bestmodel[3])
library(forecast)   # attach the forecast package (library() loads it; it does not install it)
# Refit the order chosen by auto.arima with arima(); this refit is the model
# used for the diagnostics and forecasts that follow.
# NOTE(review): only `order` is passed, so no drift term is estimated here,
# while the auto.arima printout above reports "ARIMA(1,1,3) with drift".
# Confirm that dropping the drift is intended before relying on the forecasts.
x1_model1= arima(data_series, order=c(bestmodel)) # refit using the selected order
x1_model1  # print the refitted model

## 
## Call:
## arima(x = data_series, order = c(bestmodel))
## 
## Coefficients:
##          ar1      ma1      ma2     ma3
##       0.9604  -0.9205  -0.1491  0.2491
## s.e.  0.0249   0.0708   0.0975  0.0719
## 
## sigma^2 estimated as 2.711:  log likelihood = -418.52,  aic = 847.04

# ---- In-sample accuracy and residual diagnostics of the refitted model ----
paste("accuracy of autoarima Model For  ==> ",y_lab, sep=" ")

## [1] "accuracy of autoarima Model For  ==>  COVID 19 Deaths cases in Chelyabinsk  "

accuracy(x1_model1)  # training-set accuracy measures of the refitted model

##                    ME     RMSE       MAE      MPE     MAPE     MASE        ACF1
## Training set 0.169536 1.642885 0.8083013 1.535179 3.611448 1.198705 -0.01944753

x1_model1$x          # prints NULL (see below): this fitted object has no `x` component

## NULL

checkresiduals(x1_model1,xlab = paste ("Time in  ", frequency ,y_lab , sep=" "),  col.main="black", col.lab="blue", col.sub="black", cex.main=1, cex.lab=1, cex.sub=1,font.main=4, font.lab=4, ylab=y_lab)  # residual diagnostics of the refitted model; prints a Ljung-Box test

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,3)
## Q* = 11.098, df = 6, p-value = 0.0854
## 
## Model df: 4.   Total lags used: 10

paste("Box-Ljung test , Ljung-Box test For Modelling for   ==> ",y_lab, sep=" ")

## [1] "Box-Ljung test , Ljung-Box test For Modelling for   ==>  COVID 19 Deaths cases in Chelyabinsk  "

Box.test(x1_model1$residuals^2, lag=20, type="Ljung-Box")   # Ljung-Box test on the SQUARED residuals, 20 lags

## 
##  Box-Ljung test
## 
## data:  x1_model1$residuals^2
## X-squared = 21.665, df = 20, p-value = 0.359

library(tseries)
jarque.bera.test(x1_model1$residuals)  # Jarque-Bera test on the residuals

## 
##  Jarque Bera Test
## 
## data:  x1_model1$residuals
## X-squared = 10483, df = 2, p-value < 2.2e-16

# ---- Actual vs fitted ----
# Switch the graphics device to a 1 x 2 panel layout.
par(mfrow = c(1, 2))
# Training series as a thick red line ...
plot(
  data_series,
  main = "Actual vs Fitted Plot",
  xlab = "Timein (days)",
  ylab = y_lab,
  col = "red",
  lwd = 2
)
# ... with the fitted values of the refitted model overlaid in blue.
lines(fitted(x1_model1), col = "blue")

# ---- Validation on the held-out data ----

# Held-out observations as a ts whose time index continues the training series
# (it starts at rows - validation_data_days + 1).
x1_test <- ts(testing_data, start = (rows - validation_data_days + 1))
# One forecast run covers the validation window followed by the future window.
forecasting_auto_arima <- forecast(x1_model1, h = N_forecasting_days + validation_data_days)
# The first `validation_data_days` forecasts line up with the held-out data.
validation_forecast <- head(forecasting_auto_arima$mean, validation_data_days)
# Absolute percentage error per validation day, rounded to 3 decimals.
MAPE_Per_Day <- round(abs(((testing_data - validation_forecast) / testing_data) * 100), 3)
# The messages below name the auto arima model; they previously said "bats",
# a different model, although everything here comes from the ARIMA fit.
paste("MAPE % For ", validation_data_days, frequency, "by using auto arima Model for  ==> ", y_lab, sep = " ")

## [1] "MAPE % For  11 days by using auto arima Model for  ==>  COVID 19 Deaths cases in Chelyabinsk  "

# Mean MAPE over the validation window as a labelled string, plus the per-day
# values formatted with a percent sign.
MAPE_Mean_All <- paste(round(mean(MAPE_Per_Day), 3), "% MAPE ", validation_data_days, frequency, y_lab, sep = " ")
MAPE_auto_arima <- paste(round(MAPE_Per_Day, 3), "%")
MAPE_auto.arima_Model <- paste(MAPE_Per_Day, "%")
paste(" MAPE that's Error of Forecasting for ", validation_data_days, " days in auto arima Model for  ==> ", y_lab, sep = " ")

## [1] " MAPE that's Error of Forecasting for  11  days in auto arima Model for  ==>  COVID 19 Deaths cases in Chelyabinsk  "

# MAPE_Mean_All already contains its "%" sign, so it is printed as is instead
# of having a second "%" appended at the end.
MAPE_Mean_All

## [1] "2.305 % MAPE  11 days COVID 19 Deaths cases in Chelyabinsk  "

paste("MAPE that's Error of Forecasting day by day for ", validation_data_days, " days in auto arima Model for  ==> ", y_lab, sep = " ")

## [1] "MAPE that's Error of Forecasting day by day for  11  days in auto arima Model for  ==>  COVID 19 Deaths cases in Chelyabinsk  "

# Day-by-day table: date, weekday, actual value, forecast and MAPE.
data.frame(
  date_auto.arima = validation_dates,
  validation_data_by_name,
  actual_data = testing_data,
  forecasting_auto.arima = validation_forecast,
  MAPE_auto.arima_Model
)

##    date_auto.arima validation_data_by_name actual_data forecasting_auto.arima
## 1       2020-10-17                Saturday         147               147.5308
## 2       2020-10-18                  Sunday         147               148.3946
## 3       2020-10-19                  Monday         152               149.1766
## 4       2020-10-20                 Tuesday         152               149.9276
## 5       2020-10-21               Wednesday         155               150.6489
## 6       2020-10-22                Thursday         155               151.3416
## 7       2020-10-23                  Friday         155               152.0068
## 8       2020-10-24                Saturday         159               152.6457
## 9       2020-10-25                  Sunday         159               153.2592
## 10      2020-10-26                  Monday         159               153.8484
## 11      2020-10-27                 Tuesday         159               154.4143
##    MAPE_auto.arima_Model
## 1                0.361 %
## 2                0.949 %
## 3                1.857 %
## 4                1.363 %
## 5                2.807 %
## 6                 2.36 %
## 7                1.931 %
## 8                3.996 %
## 9                3.611 %
## 10                3.24 %
## 11               2.884 %

# Out-of-sample forecast table: date, weekday and point forecast. The future
# window is the last `N_forecasting_days` values of the forecast run.
data.frame(
  FD,
  forecating_date = forecasting_data_by_name,
  forecasting_by_auto.arima = tail(forecasting_auto_arima$mean, N_forecasting_days)
)

##           FD forecating_date forecasting_by_auto.arima
## 1 2020-10-28       Wednesday                  154.9577
## 2 2020-10-29        Thursday                  155.4796
## 3 2020-10-30          Friday                  155.9809
## 4 2020-10-31        Saturday                  156.4622
## 5 2020-11-01          Sunday                  156.9245
## 6 2020-11-02          Monday                  157.3685
## 7 2020-11-03         Tuesday                  157.7948
## 8 2020-11-04       Wednesday                  158.2043
## 9 2020-11-05        Thursday                  158.5975

# Base-graphics forecast plot with the held-out observations overlaid as a
# thick red line.
plot(forecasting_auto_arima)
x1_test <- ts(testing_data, start = (rows - validation_data_days + 1))
lines(x1_test, col = "red", lwd = 2)

# The same forecast drawn with autoplot(); the plot object is stored in
# `graph4` and then printed.
graph4 <- autoplot(
  forecasting_auto_arima,
  xlab = paste("Time in  ", frequency, y_lab),
  col.main = "black",
  col.lab = "blue",
  col.sub = "black",
  cex.main = 1,
  cex.lab = 1,
  cex.sub = 1,
  font.main = 4,
  font.lab = 4,
  ylab = y_lab
)
graph4

## Forecast error analysis on the validation window

# Absolute error of forecast (AEOF).
Error_auto.arima <- abs(testing_data - validation_forecast)
# Relative error of forecast in percent, divided by the actual value (REOF_A).
REOF_A_auto.arima <- abs((testing_data - validation_forecast) / testing_data * 100)
# Relative error of forecast in percent, divided by the forecast (REOF_F).
REOF_F_auto.arima <- abs((testing_data - validation_forecast) / validation_forecast * 100)
# Pearson correlation between actual and forecast values.
correlation_auto.arima <- cor(testing_data, validation_forecast, method = "pearson")
# Root mean square forecast error.
RMSE_auto.arima <- sqrt(sum(Error_auto.arima^2) / validation_data_days)
# Absolute value of the mean signed error.
# NOTE(review): the name suggests a mean absolute deviation, but signed errors
# are summed before abs() is taken, so opposite-sign errors cancel. Confirm
# which quantity is intended.
MAD_auto.arima <- abs(sum(testing_data - validation_forecast) / validation_data_days)

# Column-ready versions of the per-day errors.
AEOF_auto.arima <- c(Error_auto.arima)
REOF_auto.arima1 <- paste(round(REOF_A_auto.arima, 3), "%")
REOF_auto.arima2 <- paste(round(REOF_F_auto.arima, 3), "%")

# Summary of the auto arima validation errors: correlation, RMSE, mean MAPE
# and MAD.
data.frame(
  correlation_auto.arima,
  RMSE_auto.arima,
  MAPE_Mean_All,
  MAD_auto.arima
)

##   correlation_auto.arima RMSE_auto.arima
## 1              0.9591067        4.007607
##                                                  MAPE_Mean_All MAD_auto.arima
## 1 2.305 % MAPE  11 days COVID 19 Deaths cases in Chelyabinsk         3.255041

# Per-day error table: AEOF, REOF_A and REOF_F for every validation date.
data.frame(
  validation_dates,
  Validation_day_name = validation_data_by_name,
  AEOF_auto.arima,
  REOF_A_auto.arima = REOF_auto.arima1,
  REOF_F_auto.arima = REOF_auto.arima2
)

##    validation_dates Validation_day_name AEOF_auto.arima REOF_A_auto.arima
## 1        2020-10-17            Saturday       0.5308279           0.361 %
## 2        2020-10-18              Sunday       1.3946037           0.949 %
## 3        2020-10-19              Monday       2.8233790           1.857 %
## 4        2020-10-20             Tuesday       2.0723547           1.363 %
## 5        2020-10-21           Wednesday       4.3510952           2.807 %
## 6        2020-10-22            Thursday       3.6584208            2.36 %
## 7        2020-10-23              Friday       2.9931985           1.931 %
## 8        2020-10-24            Saturday       6.3543405           3.996 %
## 9        2020-10-25              Sunday       5.7408019           3.611 %
## 10       2020-10-26              Monday       5.1515791            3.24 %
## 11       2020-10-27             Tuesday       4.5857085           2.884 %
##    REOF_F_auto.arima
## 1             0.36 %
## 2             0.94 %
## 3            1.893 %
## 4            1.382 %
## 5            2.888 %
## 6            2.417 %
## 7            1.969 %
## 8            4.163 %
## 9            3.746 %
## 10           3.348 %
## 11            2.97 %