Forecasting Covid-19 infection cases in England by using BATS, TBATS, Holt’s Linear trend, and ARIMA model

Covid-19 infection cases in England
Makarovskikh Tatyana Anatolyevna “Макаровских Татьяна Анатольевна”
Abotaleb Mostafa “Аботалеб Мостафа”
Department of Electrical Engineering and Computer Science
South Ural State University, Chelyabinsk, Russian Federation
Pradeep Mishra
Department of Mathematics & Statistics
Jawaharlal Nehru Krishi Vishwavidyalaya, India
#Import
library(fpp2)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## -- Attaching packages ---------------------------------------------- fpp2 2.4 --
## v ggplot2   3.3.2     v fma       2.4  
## v forecast  8.13      v expsmooth 2.3
## 
library(forecast)
library(ggplot2)
library("readxl")
library(moments)
library(forecast)
require(forecast)  
require(tseries)
## Loading required package: tseries
require(markovchain)
## Loading required package: markovchain
## Package:  markovchain
## Version:  0.8.5-3
## Date:     2020-12-03
## BugReport: https://github.com/spedygiorgio/markovchain/issues
require(data.table)
## Loading required package: data.table
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Global variables ##
# Workbook and sheet that hold the time series; point the path at your own data
Full_original_data <- read_excel(
  "F:/Phd/vaccination/Vaccine data.xlsx",
  sheet = "England"
)
original_data <- Full_original_data$Cumulative_cases  # the series modelled below
y_lab <- "Covid 19 infection cases in England"  # label reused in plots and printed messages
Actual_date_interval <- c("2020/03/01", "2021/03/10")    # first and last date of the observed data
Forecast_date_interval <- c("2021/03/11", "2021/03/17")  # first and last date to forecast
validation_data_days <- 7  # trailing observations held out for validation
frequency <- "days"        # step used when the date sequences are built
country.name <- "England"
# Data preparation: descriptive statistics of the series
summary(original_data)  # minimum, quartiles, median, mean and maximum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0  121416  307188  982447 1493387 4229002
# Coefficient of skewness
data.frame(skewness = skewness(original_data))
##   skewness
## 1 1.420603
# Coefficient of kurtosis
data.frame(kurtosis = kurtosis(original_data))
##   kurtosis
## 1 3.583572
# Standard deviation
data.frame(Standard.deviation = sd(original_data))
##   Standard.deviation
## 1            1314063
# Processing of the input data
rows <- NROW(original_data)  # number of observations (days) in the series
# Split with head()/tail() instead of 1:n style indexing: with a hold-out of
# zero days the index (rows + 1):rows counts backwards and would return
# c(NA, last value) rather than an empty test set.
training_data <- head(original_data, rows - validation_data_days)  # training data
testing_data <- tail(original_data, validation_data_days)          # testing data
# Calendar dates of the observed series; AD and fulldate hold the same sequence
AD <- fulldate <- seq(as.Date(Actual_date_interval[1]), as.Date(Actual_date_interval[2]), by = frequency)
# Calendar dates of the forecast horizon
FD <- seq(as.Date(Forecast_date_interval[1]), as.Date(Forecast_date_interval[2]), by = frequency)
N_forecasting_days <- length(FD)  # number of days to forecast
validation_dates <- tail(AD, validation_data_days)     # dates of the hold-out observations
validation_data_by_name <- weekdays(validation_dates)  # weekday names of the validation dates
forecasting_data_by_name <- weekdays(FD)               # weekday names of the forecast dates
## BATS model
# Data modelling
data_series <- ts(training_data)  # turn the training data into a time series
autoplot(
  data_series,
  xlab = paste("Time in  ", frequency),
  ylab = y_lab,
  main = paste("Actual Data training data :", y_lab)
)

model_bats <- bats(data_series)
accuracy(model_bats)  # accuracy measures on the training data
##                    ME     RMSE      MAE MPE MAPE      MASE        ACF1
## Training set 107.6703 2314.496 1099.946 NaN  Inf 0.1116122 -0.03304286
# Print the fitted model and its parameters
model_bats
## BATS(1, {5,2}, 0.984, -)
## 
## Call: bats(y = data_series)
## 
## Parameters
##   Alpha: 0.3539349
##   Beta: 0.1366612
##   Damping Parameter: 0.984198
##   AR coefficients: 0.533598 0.725755 -0.702102 0.073871 0.226344
##   MA coefficients: 0.732587 -0.207675
## 
## Seed States:
##           [,1]
##  [1,] 26.32696
##  [2,] 31.20970
##  [3,]  0.00000
##  [4,]  0.00000
##  [5,]  0.00000
##  [6,]  0.00000
##  [7,]  0.00000
##  [8,]  0.00000
##  [9,]  0.00000
## 
## Sigma: 2314.496
## AIC: 9217.59
# Plot the fitted BATS model
plot(
  model_bats,
  xlab = paste("Time in  ", frequency, y_lab),
  col.main = "black", col.lab = "black", col.sub = "black",
  cex.main = 1, cex.lab = 1, cex.sub = 1,
  font.main = 4, font.lab = 4
)

# Testing data evaluation
# One forecast covers the validation window followed by the future window.
forecasting_bats <- predict(model_bats, h = N_forecasting_days + validation_data_days)
validation_forecast <- head(forecasting_bats$mean, validation_data_days)
# Absolute percentage error for each validation day, rounded to 3 decimals
MAPE_Per_Day <- round(abs((testing_data - validation_forecast) / testing_data * 100), 3)
paste("MAPE % For ", validation_data_days, frequency, "by using bats Model for  ==> ", y_lab)
## [1] "MAPE % For  7 days by using bats Model for  ==>  Covid 19 infection cases in England"
MAPE_Mean_All.bats_Model <- round(mean(MAPE_Per_Day), 3)
MAPE_Mean_All.bats <- paste(MAPE_Mean_All.bats_Model, "% MAPE ", validation_data_days, frequency, y_lab)
# MAPE_Per_Day is already rounded, so both label vectors hold the same text
MAPE_bats <- paste(MAPE_Per_Day, "%")
MAPE_bats_Model <- MAPE_bats
paste(" MAPE that's Error of Forecasting for ", validation_data_days, " days in bats Model for  ==> ", y_lab)
## [1] " MAPE that's Error of Forecasting for  7  days in bats Model for  ==>  Covid 19 infection cases in England"
# The summary string already carries its own "% MAPE" label; pasting another
# "%" onto it left a stray percent sign at the end of the printed line.
MAPE_Mean_All.bats
## [1] "0.063 % MAPE  7 days Covid 19 infection cases in England"
paste("MAPE that's Error of Forecasting day by day for ", validation_data_days, " days in bats Model for  ==> ", y_lab)
## [1] "MAPE that's Error of Forecasting day by day for  7  days in bats Model for  ==>  Covid 19 infection cases in England"
# Validation window: actual value, forecast and percentage error per day
data.frame(date_bats = validation_dates, validation_data_by_name, actual_data = testing_data, forecasting_bats = validation_forecast, MAPE_bats_Model)
##    date_bats validation_data_by_name actual_data forecasting_bats
## 1 2021-03-04                Thursday     4194789          4194670
## 2 2021-03-05                  Friday     4201362          4200661
## 3 2021-03-06                Saturday     4207308          4205988
## 4 2021-03-07                  Sunday     4213347          4210702
## 5 2021-03-08                  Monday     4218524          4215001
## 6 2021-03-09                 Tuesday     4223236          4219048
## 7 2021-03-10               Wednesday     4229002          4222973
##   MAPE_bats_Model
## 1         0.003 %
## 2         0.017 %
## 3         0.031 %
## 4         0.063 %
## 5         0.084 %
## 6         0.099 %
## 7         0.143 %
# Future window: the last N_forecasting_days values of the same forecast
data.frame(FD, forecating_date = forecasting_data_by_name, forecasting_by_bats = tail(forecasting_bats$mean, N_forecasting_days))
##           FD forecating_date forecasting_by_bats
## 1 2021-03-11        Thursday             4226744
## 2 2021-03-12          Friday             4230349
## 3 2021-03-13        Saturday             4233722
## 4 2021-03-14          Sunday             4236891
## 5 2021-03-15          Monday             4239857
## 6 2021-03-16         Tuesday             4242679
## 7 2021-03-17       Wednesday             4245363
plot(forecasting_bats)
# Overlay the held-out observations, positioned right after the training data
x1_test <- ts(testing_data, start = rows - validation_data_days + 1)
lines(x1_test, col = "red", lwd = 2)

graph1 <- autoplot(
  forecasting_bats,
  xlab = paste("Time in  ", frequency, y_lab),
  col.main = "black", col.lab = "black", col.sub = "black",
  cex.main = 1, cex.lab = 1, cex.sub = 1,
  font.main = 4, font.lab = 4,
  ylab = y_lab
)
graph1

## Error of forecasting (BATS model)
Error_bats <- abs(testing_data - validation_forecast)  # absolute error of forecast (AEOF)
REOF_A_bats <- abs((testing_data - validation_forecast) / testing_data * 100)         # relative error of forecast, divided by actual (REOF_A)
REOF_F_bats <- abs((testing_data - validation_forecast) / validation_forecast * 100)  # relative error of forecast, divided by forecast (REOF_F)
correlation_bats <- cor(testing_data, validation_forecast, method = "pearson")  # correlation between predicted and actual values
MSE_bats <- sum(Error_bats^2) / validation_data_days  # mean square forecast error
RMSE_bats <- sqrt(MSE_bats)                           # root mean square forecast error
# Mean absolute deviation: average the absolute errors. Taking abs() of the
# summed signed errors lets over- and under-forecasts cancel each other out.
MAD_bats <- mean(Error_bats)
AEOF_bats <- c(Error_bats)
REOF_Abats <- c(paste(round(REOF_A_bats, 3), "%"))
REOF_Fbats <- c(paste(round(REOF_F_bats, 3), "%"))
# Summary of the error analysis for the BATS model
data.frame(correlation_bats, MSE_bats, RMSE_bats, MAPE_Mean_All.bats_Model, MAD_bats)
##   correlation_bats MSE_bats RMSE_bats MAPE_Mean_All.bats_Model MAD_bats
## 1        0.9992753 10792275   3285.16                    0.063 2646.452
# Day-by-day errors: AEOF, REOF_A, REOF_F
data.frame(validation_dates, Validation_day_name = validation_data_by_name, AEOF_bats, REOF_Abats, REOF_Fbats)
##   validation_dates Validation_day_name AEOF_bats REOF_Abats REOF_Fbats
## 1       2021-03-04            Thursday  118.8012    0.003 %    0.003 %
## 2       2021-03-05              Friday  701.1709    0.017 %    0.017 %
## 3       2021-03-06            Saturday 1320.1971    0.031 %    0.031 %
## 4       2021-03-07              Sunday 2644.6495    0.063 %    0.063 %
## 5       2021-03-08              Monday 3522.8571    0.084 %    0.084 %
## 6       2021-03-09             Tuesday 4188.4078    0.099 %    0.099 %
## 7       2021-03-10           Wednesday 6029.0793    0.143 %    0.143 %
## TBATS model
# Data modelling
data_series <- ts(training_data)
# NOTE(review): the model is fitted through an internal (:::) forecast function
# with a fixed seasonal period of 6 on daily data; confirm that this period is
# intended rather than a weekly period of 7.
model_TBATS <- forecast:::fitSpecificTBATS(
  data_series,
  use.box.cox = FALSE,
  use.beta = TRUE,
  seasonal.periods = 6,
  use.damping = FALSE,
  k.vector = 2
)
accuracy(model_TBATS)  # accuracy measures on the training data
##                    ME     RMSE      MAE MPE MAPE      MASE        ACF1
## Training set 20.58434 2457.049 1257.295 NaN  Inf 0.1275785 0.005440903
# Print the fitted model and its parameters
model_TBATS
## TBATS(1, {0,0}, 1, {<6,2>})
## 
## Call: NULL
## 
## Parameters
##   Alpha: 1.08207
##   Beta: 0.7348984
##   Damping Parameter: 1
##   Gamma-1 Values: -0.004316243
##   Gamma-2 Values: -0.0007762335
## 
## Seed States:
##             [,1]
## [1,]   26.128245
## [2,]   30.060539
## [3,] -170.977452
## [4,]    5.194262
## [5,]   59.801821
## [6,]    3.380522
## 
## Sigma: 2457.049
## AIC: 9250.514
# Plot the fitted TBATS model
plot(
  model_TBATS,
  xlab = paste("Time in  ", frequency, y_lab),
  col.main = "black", col.lab = "black", col.sub = "black",
  cex.main = 1, cex.lab = 1, cex.sub = 1,
  font.main = 4, font.lab = 4,
  ylab = y_lab
)

# Testing data evaluation
# One forecast covers the validation window followed by the future window.
forecasting_tbats <- predict(model_TBATS, h = N_forecasting_days + validation_data_days)
validation_forecast <- head(forecasting_tbats$mean, validation_data_days)
# Absolute percentage error for each validation day, rounded to 3 decimals
MAPE_Per_Day <- round(abs((testing_data - validation_forecast) / testing_data * 100), 3)
paste("MAPE % For ", validation_data_days, frequency, "by using TBATS Model for  ==> ", y_lab)
## [1] "MAPE % For  7 days by using TBATS Model for  ==>  Covid 19 infection cases in England"
MAPE_Mean_All.TBATS_Model <- round(mean(MAPE_Per_Day), 3)
MAPE_Mean_All.TBATS <- paste(MAPE_Mean_All.TBATS_Model, "% MAPE ", validation_data_days, frequency, y_lab)
# MAPE_Per_Day is already rounded, so both label vectors hold the same text
MAPE_TBATS <- paste(MAPE_Per_Day, "%")
MAPE_TBATS_Model <- MAPE_TBATS
paste(" MAPE that's Error of Forecasting for ", validation_data_days, " days in TBATS Model for  ==> ", y_lab)
## [1] " MAPE that's Error of Forecasting for  7  days in TBATS Model for  ==>  Covid 19 infection cases in England"
# The summary string already carries its own "% MAPE" label; pasting another
# "%" onto it left a stray percent sign at the end of the printed line.
MAPE_Mean_All.TBATS
## [1] "0.049 % MAPE  7 days Covid 19 infection cases in England"
paste("MAPE that's Error of Forecasting day by day for ", validation_data_days, " days in TBATS Model for  ==> ", y_lab)
## [1] "MAPE that's Error of Forecasting day by day for  7  days in TBATS Model for  ==>  Covid 19 infection cases in England"
# Validation window: actual value, forecast and percentage error per day
data.frame(date_TBATS = validation_dates, validation_data_by_name, actual_data = testing_data, forecasting_TBATS = validation_forecast, MAPE_TBATS_Model)
##   date_TBATS validation_data_by_name actual_data forecasting_TBATS
## 1 2021-03-04                Thursday     4194789           4195005
## 2 2021-03-05                  Friday     4201362           4201747
## 3 2021-03-06                Saturday     4207308           4208312
## 4 2021-03-07                  Sunday     4213347           4214706
## 5 2021-03-08                  Monday     4218524           4221018
## 6 2021-03-09                 Tuesday     4223236           4227305
## 7 2021-03-10               Wednesday     4229002           4233851
##   MAPE_TBATS_Model
## 1          0.005 %
## 2          0.009 %
## 3          0.024 %
## 4          0.032 %
## 5          0.059 %
## 6          0.096 %
## 7          0.115 %
# Future window: the last N_forecasting_days values of the same forecast
data.frame(FD, forecating_date = forecasting_data_by_name, forecasting_by_TBATS = tail(forecasting_tbats$mean, N_forecasting_days))
##           FD forecating_date forecasting_by_TBATS
## 1 2021-03-11        Thursday              4240593
## 2 2021-03-12          Friday              4247158
## 3 2021-03-13        Saturday              4253552
## 4 2021-03-14          Sunday              4259864
## 5 2021-03-15          Monday              4266151
## 6 2021-03-16         Tuesday              4272697
## 7 2021-03-17       Wednesday              4279439
plot(forecasting_tbats)
# Overlay the held-out observations, positioned right after the training data
x1_test <- ts(testing_data, start = rows - validation_data_days + 1)
lines(x1_test, col = "red", lwd = 2)

graph2 <- autoplot(
  forecasting_tbats,
  xlab = paste("Time in  ", frequency, y_lab),
  col.main = "black", col.lab = "black", col.sub = "black",
  cex.main = 1, cex.lab = 1, cex.sub = 1,
  font.main = 4, font.lab = 4,
  ylab = y_lab
)
graph2

## Error of forecasting (TBATS model)
Error_tbats <- abs(testing_data - validation_forecast)  # absolute error of forecast (AEOF)
REOF_A_tbats1 <- abs((testing_data - validation_forecast) / testing_data * 100)        # relative error of forecast, divided by actual (REOF_A)
REOF_F_tbats <- abs((testing_data - validation_forecast) / validation_forecast * 100)  # relative error of forecast, divided by forecast (REOF_F)
correlation_tbats <- cor(testing_data, validation_forecast, method = "pearson")  # correlation between predicted and actual values
MSE_tbats <- sum(Error_tbats^2) / validation_data_days  # mean square forecast error
RMSE_tbats <- sqrt(MSE_tbats)                           # root mean square forecast error
# Mean absolute deviation: average the absolute errors. Taking abs() of the
# summed signed errors lets over- and under-forecasts cancel each other out.
MAD_tbats <- mean(Error_tbats)
AEOF_tbats <- c(Error_tbats)
REOF_A_tbats <- c(paste(round(REOF_A_tbats1, 3), "%"))
REOF_F_tbats <- c(paste(round(REOF_F_tbats, 3), "%"))  # the numeric vector is replaced by its formatted text
# Summary of the error analysis for the TBATS model
data.frame(correlation_tbats, MSE_tbats, RMSE_tbats, MAPE_Mean_All.TBATS_Model, MAD_tbats)
##   correlation_tbats MSE_tbats RMSE_tbats MAPE_Mean_All.TBATS_Model MAD_tbats
## 1         0.9992056   7049113   2655.017                     0.049  2053.894
# Day-by-day errors: AEOF, REOF_A, REOF_F
data.frame(validation_dates, Validation_day_name = validation_data_by_name, AEOF_tbats, REOF_A_tbats, REOF_F_tbats)
##   validation_dates Validation_day_name AEOF_tbats REOF_A_tbats REOF_F_tbats
## 1       2021-03-04            Thursday   216.1744      0.005 %      0.005 %
## 2       2021-03-05              Friday   385.0513      0.009 %      0.009 %
## 3       2021-03-06            Saturday  1004.4199      0.024 %      0.024 %
## 4       2021-03-07              Sunday  1359.3629      0.032 %      0.032 %
## 5       2021-03-08              Monday  2493.9516      0.059 %      0.059 %
## 6       2021-03-09             Tuesday  4069.1299      0.096 %      0.096 %
## 7       2021-03-10           Wednesday  4849.1707      0.115 %      0.115 %
## Holt's linear trend
# Data modelling
data_series <- ts(training_data)
# holt() fits the model and forecasts the validation and future windows in one
# call; lambda = "auto" requests a Box-Cox transformation.
model_holt <- holt(data_series, h = N_forecasting_days + validation_data_days, lambda = "auto")
accuracy(model_holt)  # accuracy measures on the training data
##                    ME     RMSE      MAE MPE MAPE      MASE       ACF1
## Training set -129.049 2501.326 1234.822 Inf  Inf 0.1252982 0.07811485
# Print the fitted model and its parameters
summary(model_holt$model)
## Holt's method 
## 
## Call:
##  holt(y = data_series, h = N_forecasting_days + validation_data_days,  
## 
##  Call:
##      lambda = "auto") 
## 
##   Box-Cox transformation: lambda= 0.3874 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
##     beta  = 0.7404 
## 
##   Initial states:
##     l = -2.3144 
##     b = -0.5316 
## 
##   sigma:  0.5849
## 
##      AIC     AICc      BIC 
## 2128.281 2128.424 2148.554 
## 
## Training set error measures:
##                    ME     RMSE      MAE MPE MAPE      MASE       ACF1
## Training set -129.049 2501.326 1234.822 Inf  Inf 0.1252982 0.07811485
# Testing data evaluation
# One forecast covers the validation window followed by the future window.
forecasting_holt <- predict(model_holt, h = N_forecasting_days + validation_data_days, lambda = "auto")
validation_forecast <- head(forecasting_holt$mean, validation_data_days)
# Absolute percentage error for each validation day, rounded to 3 decimals
MAPE_Per_Day <- round(abs((testing_data - validation_forecast) / testing_data * 100), 3)
paste("MAPE % For ", validation_data_days, frequency, "by using holt Model for  ==> ", y_lab)
## [1] "MAPE % For  7 days by using holt Model for  ==>  Covid 19 infection cases in England"
MAPE_Mean_All.Holt_Model <- round(mean(MAPE_Per_Day), 3)
MAPE_Mean_All.Holt <- paste(MAPE_Mean_All.Holt_Model, "% MAPE ", validation_data_days, frequency, y_lab)
# MAPE_Per_Day is already rounded, so both label vectors hold the same text
MAPE_holt <- paste(MAPE_Per_Day, "%")
MAPE_holt_Model <- MAPE_holt
paste(" MAPE that's Error of Forecasting for ", validation_data_days, " days in holt Model for  ==> ", y_lab)
## [1] " MAPE that's Error of Forecasting for  7  days in holt Model for  ==>  Covid 19 infection cases in England"
# The summary string already carries its own "% MAPE" label; pasting another
# "%" onto it left a stray percent sign at the end of the printed line.
MAPE_Mean_All.Holt
## [1] "0.026 % MAPE  7 days Covid 19 infection cases in England"
paste("MAPE that's Error of Forecasting day by day for ", validation_data_days, " days in holt Model for  ==> ", y_lab)
## [1] "MAPE that's Error of Forecasting day by day for  7  days in holt Model for  ==>  Covid 19 infection cases in England"
# Validation window: actual value, forecast and percentage error per day
data.frame(date_holt = validation_dates, validation_data_by_name, actual_data = testing_data, forecasting_holt = validation_forecast, MAPE_holt_Model)
##    date_holt validation_data_by_name actual_data forecasting_holt
## 1 2021-03-04                Thursday     4194789          4194630
## 2 2021-03-05                  Friday     4201362          4200861
## 3 2021-03-06                Saturday     4207308          4207098
## 4 2021-03-07                  Sunday     4213347          4213340
## 5 2021-03-08                  Monday     4218524          4219589
## 6 2021-03-09                 Tuesday     4223236          4225843
## 7 2021-03-10               Wednesday     4229002          4232102
##   MAPE_holt_Model
## 1         0.004 %
## 2         0.012 %
## 3         0.005 %
## 4             0 %
## 5         0.025 %
## 6         0.062 %
## 7         0.073 %
# Future window: the last N_forecasting_days values of the same forecast
data.frame(FD, forecating_date = forecasting_data_by_name, forecasting_by_holt = tail(forecasting_holt$mean, N_forecasting_days))
##           FD forecating_date forecasting_by_holt
## 1 2021-03-11        Thursday             4238368
## 2 2021-03-12          Friday             4244639
## 3 2021-03-13        Saturday             4250916
## 4 2021-03-14          Sunday             4257198
## 5 2021-03-15          Monday             4263486
## 6 2021-03-16         Tuesday             4269780
## 7 2021-03-17       Wednesday             4276079
plot(forecasting_holt)
# Overlay the held-out observations, positioned right after the training data
x1_test <- ts(testing_data, start = rows - validation_data_days + 1)
lines(x1_test, col = "red", lwd = 2)

graph3 <- autoplot(
  forecasting_holt,
  xlab = paste("Time in  ", frequency, y_lab),
  col.main = "black", col.lab = "black", col.sub = "black",
  cex.main = 1, cex.lab = 1, cex.sub = 1,
  font.main = 4, font.lab = 4,
  ylab = y_lab
)
graph3

## Error of forecasting by using Holt's linear model
Error_Holt <- abs(testing_data - validation_forecast)  # absolute error of forecast (AEOF)
REOF_A_Holt1 <- abs((testing_data - validation_forecast) / testing_data * 100)        # relative error of forecast, divided by actual (REOF_A)
REOF_F_Holt <- abs((testing_data - validation_forecast) / validation_forecast * 100)  # relative error of forecast, divided by forecast (REOF_F)
correlation_Holt <- cor(testing_data, validation_forecast, method = "pearson")  # correlation between predicted and actual values
MSE_Holt <- sum(Error_Holt^2) / validation_data_days  # mean square forecast error
RMSE_Holt <- sqrt(MSE_Holt)                           # root mean square forecast error
# Mean absolute deviation: average the absolute errors. Taking abs() of the
# summed signed errors lets over- and under-forecasts cancel each other out,
# which understated this value (the Holt forecasts err in both directions).
MAD_Holt <- mean(Error_Holt)
AEOF_Holt <- c(Error_Holt)
REOF_A_Holt <- c(paste(round(REOF_A_Holt1, 3), "%"))
REOF_F_Holt <- c(paste(round(REOF_F_Holt, 3), "%"))  # the numeric vector is replaced by its formatted text
REOF_A_Holt11 <- mean(abs((testing_data - validation_forecast) / testing_data * 100))  # unrounded mean percentage error
# Summary of the error analysis for Holt's linear model
data.frame(correlation_Holt, MSE_Holt, RMSE_Holt, MAPE_Mean_All.Holt_Model, MAD_Holt)
##   correlation_Holt MSE_Holt RMSE_Holt MAPE_Mean_All.Holt_Model MAD_Holt
## 1        0.9987401  2551846   1597.45                    0.026 1092.755
# Day-by-day errors: AEOF, REOF_A, REOF_F
data.frame(validation_dates, Validation_day_name = validation_data_by_name, AEOF_Holt, REOF_A_Holt, REOF_F_Holt)
##   validation_dates Validation_day_name   AEOF_Holt REOF_A_Holt REOF_F_Holt
## 1       2021-03-04            Thursday  159.426197     0.004 %     0.004 %
## 2       2021-03-05              Friday  501.119412     0.012 %     0.012 %
## 3       2021-03-06            Saturday  210.144753     0.005 %     0.005 %
## 4       2021-03-07              Sunday    6.500323         0 %         0 %
## 5       2021-03-08              Monday 1064.815771     0.025 %     0.025 %
## 6       2021-03-09             Tuesday 2606.805423     0.062 %     0.062 %
## 7       2021-03-10           Wednesday 3100.470527     0.073 %     0.073 %
# Auto ARIMA model
##################
require(tseries)  # provides the stationarity tests used below
paste("tests For Check Stationarity in series  ==> ", y_lab)
## [1] "tests For Check Stationarity in series  ==>  Covid 19 infection cases in England"
kpss.test(data_series)  # KPSS test on the undifferenced series
## Warning in kpss.test(data_series): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  data_series
## KPSS Level = 5.2335, Truncation lag parameter = 5, p-value = 0.01
pp.test(data_series)  # Phillips-Perron test on the undifferenced series
## Warning in pp.test(data_series): p-value greater than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  data_series
## Dickey-Fuller Z(alpha) = 1.1619, Truncation lag parameter = 5, p-value
## = 0.99
## alternative hypothesis: stationary
adf.test(data_series)  # augmented Dickey-Fuller test on the undifferenced series
## 
##  Augmented Dickey-Fuller Test
## 
## data:  data_series
## Dickey-Fuller = -3.0901, Lag order = 7, p-value = 0.1169
## alternative hypothesis: stationary
ndiffs(data_series)  # estimated number of differences needed for stationarity
## [1] 2
# Taking the first difference
diff1_x1 <- diff(data_series)
# The base-graphics colour arguments (col.main, col.lab, col.sub) are left out:
# autoplot() ignored them and only emitted a warning about unknown parameters.
autoplot(diff1_x1, xlab = paste("Time in  ", frequency, y_lab), ylab = y_lab, main = "1st differenced series")

# Testing the stationarity of the first differenced series
paste("tests For Check Stationarity in series after taking first differences in  ==> ", y_lab)
## [1] "tests For Check Stationarity in series after taking first differences in  ==>  Covid 19 infection cases in England"
kpss.test(diff1_x1)  # KPSS test after taking first differences
## Warning in kpss.test(diff1_x1): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  diff1_x1
## KPSS Level = 3.9593, Truncation lag parameter = 5, p-value = 0.01
pp.test(diff1_x1)  # Phillips-Perron test after taking first differences
## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff1_x1
## Dickey-Fuller Z(alpha) = -6.823, Truncation lag parameter = 5, p-value
## = 0.7286
## alternative hypothesis: stationary
adf.test(diff1_x1)  # augmented Dickey-Fuller test after taking first differences
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff1_x1
## Dickey-Fuller = -2.0691, Lag order = 7, p-value = 0.5484
## alternative hypothesis: stationary
# Taking the second difference
diff2_x1 <- diff(diff1_x1)
# The base-graphics colour arguments (col.main, col.lab, col.sub) are left out:
# autoplot() ignored them and only emitted a warning about unknown parameters.
autoplot(diff2_x1, xlab = paste("Time in  ", frequency, y_lab), ylab = y_lab, main = "2nd differenced series")

# Testing the stationarity of the second differenced series
paste("tests For Check Stationarity in series after taking Second differences in", y_lab)
## [1] "tests For Check Stationarity in series after taking Second differences in Covid 19 infection cases in England"
kpss.test(diff2_x1)  # KPSS test after taking second differences
## Warning in kpss.test(diff2_x1): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  diff2_x1
## KPSS Level = 0.13755, Truncation lag parameter = 5, p-value = 0.1
pp.test(diff2_x1)  # Phillips-Perron test after taking second differences
## Warning in pp.test(diff2_x1): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff2_x1
## Dickey-Fuller Z(alpha) = -432.73, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
adf.test(diff2_x1)  # augmented Dickey-Fuller test after taking second differences
## Warning in adf.test(diff2_x1): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff2_x1
## Dickey-Fuller = -4.9413, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
#### Fitting an ARIMA model
# 1. Using the auto.arima function
# Full (non-stepwise, non-approximated) search; trace = TRUE prints every
# candidate model together with its selection criterion.
# NOTE(review): three unit-root test names are supplied; confirm which one
# auto.arima() actually applies.
model1 <- auto.arima(
  data_series,
  stepwise = FALSE,
  approximation = FALSE,
  trace = TRUE,
  test = c("kpss", "adf", "pp")
)
## 
##  ARIMA(0,2,0)                    : 7844.595
##  ARIMA(0,2,1)                    : 7834.008
##  ARIMA(0,2,2)                    : 7834.23
##  ARIMA(0,2,3)                    : 7832.91
##  ARIMA(0,2,4)                    : 7834.673
##  ARIMA(0,2,5)                    : 7826.654
##  ARIMA(1,2,0)                    : 7835.357
##  ARIMA(1,2,1)                    : 7832.901
##  ARIMA(1,2,2)                    : 7834.931
##  ARIMA(1,2,3)                    : 7834.915
##  ARIMA(1,2,4)                    : 7836.462
##  ARIMA(2,2,0)                    : 7836.499
##  ARIMA(2,2,1)                    : 7834.926
##  ARIMA(2,2,2)                    : 7836.727
##  ARIMA(2,2,3)                    : 7797.437
##  ARIMA(3,2,0)                    : 7833.727
##  ARIMA(3,2,1)                    : 7832.388
##  ARIMA(3,2,2)                    : 7795.166
##  ARIMA(4,2,0)                    : 7828.51
##  ARIMA(4,2,1)                    : 7830.203
##  ARIMA(5,2,0)                    : 7829.337
## 
## 
## 
##  Best model: ARIMA(3,2,2)
model1  # show the model selected by auto.arima
## Series: data_series 
## ARIMA(3,2,2) 
## 
## Coefficients:
##          ar1      ar2      ar3      ma1     ma2
##       1.0807  -0.6514  -0.1829  -1.3424  0.8533
## s.e.  0.0741   0.0713   0.0560   0.0601  0.0456
## 
## sigma^2 estimated as 5543503:  log likelihood=-3891.48
## AIC=7794.96   AICc=7795.17   BIC=7819.26
#Make changes in the source of auto arima to run the best model
# Build the printable label of a fitted ARIMA model, e.g. "ARIMA(3,2,2)".
#
# object:  a fitted model carrying `arma`, `coef` and optionally `xreg`.
# padding: when TRUE the label keeps its blank filler (useful for aligned
#          listings); when FALSE trailing blanks are stripped.
# Returns a single character string.
arima.string <- function(object, padding = FALSE) {
  # Reordered `arma`: positions 1-3 are printed as the non-seasonal order,
  # 4-6 as the seasonal order and 7 as the seasonal period.
  spec <- object$arma[c(1, 6, 2, 3, 7, 4, 5)]
  period <- spec[7]
  label <- paste0("ARIMA(", spec[1], ",", spec[2], ",", spec[3], ")")

  # Seasonal part is shown only when at least one seasonal order is non-zero.
  if (period > 1 && sum(spec[4:6]) > 0) {
    label <- paste0(
      label, "(", spec[4], ",", spec[5], ",", spec[6], ")[", period, "]"
    )
  }

  # With padding, a model without seasonal terms gets blanks in place of the
  # seasonal part; the filler grows with the number of digits in the period.
  if (padding && period > 1 && sum(spec[4:6]) == 0) {
    filler <- if (period <= 9) {
      " "
    } else if (period <= 99) {
      "  "
    } else {
      "   "
    }
    label <- paste0(label, "         ", filler)
  }

  coef_names <- names(object$coef)
  if (is.null(object$xreg)) {
    # No regressors: describe the mean term instead.
    if (any(c("constant", "intercept") %in% coef_names)) {
      label <- paste(label, "with non-zero mean")
    } else if (spec[2] == 0 && spec[5] == 0) {
      label <- paste(label, "with zero mean    ")
    } else {
      label <- paste(label, "                  ")
    }
  } else if (NCOL(object$xreg) == 1 && is.element("drift", coef_names)) {
    label <- paste(label, "with drift        ")
  } else {
    label <- paste("Regression with", label, "errors")
  }

  if (padding) {
    label
  } else {
    gsub("[ ]*$", "", label)
  }
}


# Non-seasonal order of the model selected by auto.arima(), as an integer
# vector of length 3. It is read from the `arma` component of the fit (the
# same positions arima.string() prints as the first three numbers) instead of
# cutting a fixed character range out of the printed label, which stops
# working as soon as any order has two digits.
bestmodel <- as.integer(model1$arma[c(1, 6, 2)])
bestmodel
## [1] 3 2 2
bestmodel[3]
## [1] 2
# 2. Using the ACF and PACF functions
#par(mfrow=c(1,2))  # Code for making two plot in one graph 
# lag.max is passed to acf()/pacf() themselves. It used to sit inside paste(),
# where it only appended "20" to the title and left the number of lags at its
# default.
# ACF (autocorrelation function) of the second differenced series
acf(
  diff2_x1,
  lag.max = 20,
  xlab = paste("Time in  ", frequency, y_lab),
  col.main = "black", col.lab = "black", col.sub = "black",
  cex.main = 1, cex.lab = 1, cex.sub = 1,
  font.main = 4, font.lab = 4,
  ylab = y_lab,
  main = paste("ACF-2nd differenced series ", y_lab)
)

# PACF (partial autocorrelation function) of the second differenced series
pacf(
  diff2_x1,
  lag.max = 20,
  xlab = paste("Time in  ", frequency, y_lab),
  col.main = "black", col.lab = "black", col.sub = "black",
  cex.main = 1, cex.lab = 1, cex.sub = 1,
  font.main = 4, font.lab = 4,
  ylab = y_lab,
  main = paste("PACF-2nd differenced series ", y_lab)
)

library(forecast)  # attach the forecast package
# Refit the order selected by auto.arima() for forecasting
x1_model1 <- arima(data_series, order = c(bestmodel))
x1_model1  # coefficients, sigma^2, log likelihood and AIC of the refit
## 
## Call:
## arima(x = data_series, order = c(bestmodel))
## 
## Coefficients:
##          ar1      ar2      ar3      ma1     ma2
##       1.0807  -0.6514  -0.1829  -1.3424  0.8533
## s.e.  0.0741   0.0713   0.0560   0.0601  0.0456
## 
## sigma^2 estimated as 5478132:  log likelihood = -3891.48,  aic = 7794.96
paste("accuracy of autoarima Model For  ==> ", y_lab)
## [1] "accuracy of autoarima Model For  ==>  Covid 19 infection cases in England"
accuracy(x1_model1)  # accuracy measures of the refit on the training data
##                    ME    RMSE      MAE       MPE     MAPE      MASE        ACF1
## Training set 21.07222 2335.04 1094.787 0.5915269 1.861959 0.1110887 0.008794354
x1_model1$x  # the fitted object has no `x` component, so this prints NULL
## NULL
# Residual diagnostics of the refitted model
checkresiduals(
  x1_model1,
  xlab = paste("Time in  ", frequency, y_lab),
  col.main = "black", col.lab = "black", col.sub = "black",
  cex.main = 1, cex.lab = 1, cex.sub = 1,
  font.main = 4, font.lab = 4,
  ylab = y_lab
)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,2,2)
## Q* = 34.717, df = 5, p-value = 1.714e-06
## 
## Model df: 5.   Total lags used: 10
paste("Box-Ljung test , Ljung-Box test For Modelling for   ==> ", y_lab)
## [1] "Box-Ljung test , Ljung-Box test For Modelling for   ==>  Covid 19 infection cases in England"
Box.test(x1_model1$residuals^2, lag = 20, type = "Ljung-Box")  # Ljung-Box test on the squared residuals
## 
##  Box-Ljung test
## 
## data:  x1_model1$residuals^2
## X-squared = 349.6, df = 20, p-value < 2.2e-16
library(tseries)
jarque.bera.test(x1_model1$residuals)  # Jarque-Bera test on the residuals
## 
##  Jarque Bera Test
## 
## data:  x1_model1$residuals
## X-squared = 2878.5, df = 2, p-value < 2.2e-16
# Actual vs fitted: training series in red, fitted values in black
plot(data_series, col = "red", lwd = 2, main = "Actual vs Fitted Plot", xlab = "Time in (days)", ylab = y_lab)
lines(fitted(x1_model1), col = "black")

# Test data
# Validation observations as a time series whose index starts at
# rows - validation_data_days + 1 (rows - 6 for a 7-day validation window).
x1_test <- ts(testing_data, start = (rows - validation_data_days + 1))
# Forecast far enough ahead to cover the validation window plus the future days.
forecasting_auto_arima <- forecast(x1_model1, h = N_forecasting_days + validation_data_days)
# The first `validation_data_days` point forecasts are compared with the actual values.
validation_forecast <- head(forecasting_auto_arima$mean, validation_data_days)
# Absolute percentage error for each validation day, rounded to 3 decimals.
MAPE_Per_Day <- round(abs(((testing_data - validation_forecast) / testing_data) * 100), 3)
paste("MAPE % For ", validation_data_days, frequency, "by using auto ARIMA Model for  ==> ", y_lab, sep = " ")
## [1] "MAPE % For  7 days by using auto ARIMA Model for  ==>  Covid 19 infection cases in England"
MAPE_Mean_All.ARIMA_Model <- round(mean(MAPE_Per_Day), 3)   # numeric mean MAPE, used for the model comparison
MAPE_Mean_All.ARIMA <- paste(round(mean(MAPE_Per_Day), 3), "% MAPE ", validation_data_days, frequency, y_lab, sep = " ")   # same value as a printable label
MAPE_auto_arima <- paste(round(MAPE_Per_Day, 3), "%")
MAPE_auto.arima_Model <- paste(MAPE_Per_Day, "%")
paste(" MAPE that's Error of Forecasting for ", validation_data_days, " days in auto ARIMA Model for  ==> ", y_lab, sep = " ")
## [1] " MAPE that's Error of Forecasting for  7  days in auto ARIMA Model for  ==>  Covid 19 infection cases in England"
# The label already contains the "%" sign, so it is printed as is.
MAPE_Mean_All.ARIMA
## [1] "0.016 % MAPE  7 days Covid 19 infection cases in England"
paste("MAPE that's Error of Forecasting day by day for ", validation_data_days, " days in auto ARIMA Model for  ==> ", y_lab, sep = " ")
## [1] "MAPE that's Error of Forecasting day by day for  7  days in auto ARIMA Model for  ==>  Covid 19 infection cases in England"
# Validation table: date, weekday, actual value, forecast and per-day MAPE.
data.frame(date_auto.arima = validation_dates, validation_data_by_name, actual_data = testing_data, forecasting_auto.arima = validation_forecast, MAPE_auto.arima_Model)
##   date_auto.arima validation_data_by_name actual_data forecasting_auto.arima
## 1      2021-03-04                Thursday     4194789                4194933
## 2      2021-03-05                  Friday     4201362                4201416
## 3      2021-03-06                Saturday     4207308                4207587
## 4      2021-03-07                  Sunday     4213347                4213427
## 5      2021-03-08                  Monday     4218524                4219119
## 6      2021-03-09                 Tuesday     4223236                4224926
## 7      2021-03-10               Wednesday     4229002                4231013
##   MAPE_auto.arima_Model
## 1               0.003 %
## 2               0.001 %
## 3               0.007 %
## 4               0.002 %
## 5               0.014 %
## 6                0.04 %
## 7               0.048 %
# Future forecast table: the last `N_forecasting_days` point forecasts.
data.frame(FD, forecating_date = forecasting_data_by_name, forecasting_by_auto.arima = tail(forecasting_auto_arima$mean, N_forecasting_days))
##           FD forecating_date forecasting_by_auto.arima
## 1 2021-03-11        Thursday                   4237354
## 2 2021-03-12          Friday                   4243768
## 3 2021-03-13        Saturday                   4250042
## 4 2021-03-14          Sunday                   4256073
## 5 2021-03-15          Monday                   4261918
## 6 2021-03-16         Tuesday                   4267745
## 7 2021-03-17       Wednesday                   4273720
# Forecast plot with the actual validation values (x1_test, built above) in red.
plot(forecasting_auto_arima)
lines(x1_test, col = "red", lwd = 2)

graph4 <- autoplot(
  forecasting_auto_arima,
  xlab = paste("Time in  ", frequency, y_lab, sep = " "),
  col.main = "black", col.lab = "black", col.sub = "black",
  cex.main = 1, cex.lab = 1, cex.sub = 1,
  font.main = 4, font.lab = 4,
  ylab = y_lab
)
graph4

MAPE_Mean_All.ARIMA
## [1] "0.016 % MAPE  7 days Covid 19 infection cases in England"
## Error of forecasting
# Forecast-error measures for the validation window (actual vs. ARIMA forecast).
Error_auto.arima <- abs(testing_data - validation_forecast)  # Absolute error of forecast (AEOF)
REOF_A_auto.arima <- abs(((testing_data - validation_forecast) / testing_data) * 100)  # Relative error of forecast, divided by actual (REOF_A), in %
REOF_F_auto.arima <- abs(((testing_data - validation_forecast) / validation_forecast) * 100)  # Relative error of forecast, divided by forecast (REOF_F), in %
correlation_auto.arima <- cor(testing_data, validation_forecast, method = c("pearson"))  # Pearson correlation between predicted and actual values
MSE_auto.arima <- sum(Error_auto.arima^2) / validation_data_days  # Mean square forecast error
RMSE_auto.arima <- sqrt(MSE_auto.arima)  # Root mean square forecast error
# Mean absolute deviation: average the ABSOLUTE errors. Averaging the signed
# errors first would let over- and under-forecasts cancel each other out.
MAD_auto.arima <- sum(Error_auto.arima) / validation_data_days
AEOF_auto.arima <- c(Error_auto.arima)
REOF_auto.arima1 <- c(paste(round(REOF_A_auto.arima, 3), "%"))
REOF_auto.arima2 <- c(paste(round(REOF_F_auto.arima, 3), "%"))
# Summary of the error analysis: correlation, MSE, RMSE, mean MAPE and MAD
data.frame(correlation_auto.arima, MSE_auto.arima, RMSE_auto.arima, MAPE_Mean_All.ARIMA_Model, MAD_auto.arima)
##   correlation_auto.arima MSE_auto.arima RMSE_auto.arima
## 1              0.9994218        1051900        1025.622
##   MAPE_Mean_All.ARIMA_Model MAD_auto.arima
## 1                     0.016       693.2816
# Per-day error analysis: AEOF, REOF_A and REOF_F for each validation date
data.frame(validation_dates, Validation_day_name = validation_data_by_name, AEOF_auto.arima, REOF_A_auto.arima = REOF_auto.arima1, REOF_F_auto.arima = REOF_auto.arima2)
##   validation_dates Validation_day_name AEOF_auto.arima REOF_A_auto.arima
## 1       2021-03-04            Thursday       144.07792           0.003 %
## 2       2021-03-05              Friday        53.65876           0.001 %
## 3       2021-03-06            Saturday       278.88022           0.007 %
## 4       2021-03-07              Sunday        79.63687           0.002 %
## 5       2021-03-08              Monday       595.47565           0.014 %
## 6       2021-03-09             Tuesday      1690.37716            0.04 %
## 7       2021-03-10           Wednesday      2010.86464           0.048 %
##   REOF_F_auto.arima
## 1           0.003 %
## 2           0.001 %
## 3           0.007 %
## 4           0.002 %
## 5           0.014 %
## 6            0.04 %
## 7           0.048 %
# Table of MAPE per model for the country
# Mean validation MAPE (%) of each model, keyed by the label shown in the table.
mape_by_model <- c(
  "BATS Model" = MAPE_Mean_All.bats_Model,
  "TBATS Model" = MAPE_Mean_All.TBATS_Model,
  "Holt Model" = MAPE_Mean_All.Holt_Model,
  "ARIMA Model" = MAPE_Mean_All.ARIMA_Model
)
best_recommended_model <- min(mape_by_model)   # smallest mean MAPE among the four models
paste("System Choose Least Error ==> ( MAPE %) of Forecasting  by using BATS model and TBATS Model, Holt's Linear Models , and autoarima for  ==> ", y_lab, sep = " ")
## [1] "System Choose Least Error ==> ( MAPE %) of Forecasting  by using BATS model and TBATS Model, Holt's Linear Models , and autoarima for  ==>  Covid 19 infection cases in England"
best_recommended_model
## [1] 0.016
# Label(s) of the model(s) that reach the minimum; ties yield several labels
# and therefore several rows in the table below.
result <- names(mape_by_model)[mape_by_model <= best_recommended_model]
table.error <- data.frame(
  country.name,
  BATS.Model = MAPE_Mean_All.bats_Model,
  TBATS.Model = MAPE_Mean_All.TBATS_Model,
  Holt.Model = MAPE_Mean_All.Holt_Model,
  ARIMA.Model = MAPE_Mean_All.ARIMA_Model,
  Best.Model = result
)
library(ascii)
# NOTE(review): table() cross-tabulates the data frame, which is where the extra
# "Freq" column in the printed table comes from; kept as is so the output below
# stays valid. Printing ascii(table.error) directly may be what was intended.
print(ascii(table(table.error)), type = "rest")
## 
## +---+--------------+------------+-------------+------------+-------------+-------------+------+
## |   | country.name | BATS.Model | TBATS.Model | Holt.Model | ARIMA.Model | Best.Model  | Freq |
## +===+==============+============+=============+============+=============+=============+======+
## | 1 | England      | 0.063      | 0.049       | 0.026      | 0.016       | ARIMA Model | 1.00 |
## +---+--------------+------------+-------------+------------+-------------+-------------+------+
# message() has no `sep` argument (its arguments are concatenated without a
# separator), so the space before the label is part of the string itself.
message("System finished Forecasting  by using BATS, TBATS, Holt's Linear trend, and autoarima Model ==> ", y_lab)
## System finished Forecasting  by using BATS, TBATS, Holt's Linear trend, and autoarima Model ==> Covid 19 infection cases in England
message(" Thank you for using our System For Modelling  ==> ", y_lab)
##  Thank you for using our System For Modelling  ==> Covid 19 infection cases in England