Let’s install libraries

library(ggpubr)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
library(mvnormtest)
library(agricolae)
## Warning: package 'agricolae' was built under R version 4.0.5
library(gplots)
## Warning: package 'gplots' was built under R version 4.0.5
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(decoder)
## Warning: package 'decoder' was built under R version 4.0.5
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(covid19.analytics)
## Warning: package 'covid19.analytics' was built under R version 4.0.5
library(xts)
## Warning: package 'xts' was built under R version 4.0.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.5
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(fpp2)
## Warning: package 'fpp2' was built under R version 4.0.5
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## -- Attaching packages ---------------------------------------------- fpp2 2.4 --
## <U+221A> forecast  8.14     <U+221A> expsmooth 2.3 
## <U+221A> fma       2.4
## Warning: package 'forecast' was built under R version 4.0.5
## Warning: package 'fma' was built under R version 4.0.5
## Warning: package 'expsmooth' was built under R version 4.0.5
## -- Conflicts ------------------------------------------------- fpp2_conflicts --
## x forecast::gghistogram() masks ggpubr::gghistogram()

Data

In this lab, we are going to use dataset which is about active Covid-19 cases daily basis per each city and provinces.

Data source: https://github.com/dtandev/coronavirus/tree/master/data

Let"s download data from github repository.

url <- "https://raw.githubusercontent.com/dtandev/coronavirus/master/data/CoronavirusPL%20-%20Timeseries.csv"

covid_pl <- read.csv(url,header = T, sep = ",",encoding = "UTF-8")

Preparing dataset for analysis

infected <- covid_pl[covid_pl$Infection.Death.Recovery=="I",]
infected$Infection.Death.Recovery<- 1
infected_city <- aggregate(Infection.Death.Recovery~Province+Timestamp, data = infected, FUN = sum)
infected_city_mean <- aggregate(Infection.Death.Recovery~Province, data = infected_city, FUN = mean)

Drawing box plots

# drawing box plots
# Please use ggboxplot function from ggpubr package, if you already installed and loaded it, there is no need to do it again.  


ggboxplot(infected_city, x = "Province", y = "Infection.Death.Recovery", 
          color = "Province", ylab = "Number of infected", xlab = "Province")

Preparing data for infected cases only

## Data being read from JHU/CCSE repository
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Reading data from https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv
## Data retrieved on 2021-06-16 15:44:22 || Range of dates on data: 2020-01-22--2021-06-15 | Nbr of records: 276
## --------------------------------------------------------------------------------
## Processing...  POLAND
## Loading required package: pheatmap
## Warning: package 'pheatmap' was built under R version 4.0.5

Plots

Stationarity - transformations

ggseasonplot(zmiany_ts, year.labels=TRUE, year.labels.left=TRUE)

zmiany_ts <- ts(zmiany,start=c(2020,1,23),frequency=365)
zmiany_ts %>% plot(main="New Daily infections in Poland", xlab="Year", ylab="Infections in thousands")

ACF, PACF

# for values
ggAcf(zmiany_ts)

# residuals how coorelated
ggPacf(zmiany_ts)

ARIMA

There is no ‘log()’ function used because no matter what, ARIMA for zmiany_ts could not have been found.

At the end there are arima plots done and checkresiduals.

in my opinion the 32nd plot is relatively good.

Here it is presented:

zmiany_ts <- ts(zmiany,start=c(2020,1,23),frequency=365)
zmiany_ts <- zmiany_ts %>% diff(32)
zmiany_arima <- auto.arima(zmiany_ts)
## Warning: The chosen seasonal unit root test encountered an error when testing for the first difference.
## From stl(): series is not periodic or has less than two periods
## 0 seasonal differences will be used. Consider using a different unit root test.
checkresiduals(zmiany_arima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,0,1) with zero mean
## Q* = 510.99, df = 90, p-value < 2.2e-16
## 
## Model df: 6.   Total lags used: 96
plot <- zmiany_ts %>% ggplot() + ggtitle("32 Sample")

Forecasting and summary

library(forecast)
pred <- forecast(zmiany_arima, h=34)
pred %>%autoplot()

# The prediction is colored blue
fit <- naive(zmiany_ts)
summary(fit)
## 
## Forecast method: Naive method
## 
## Model Information:
## Call: naive(y = zmiany_ts) 
## 
## Residual sd: 4029.8262 
## 
## Error measures:
##                     ME     RMSE     MAE  MPE MAPE      MASE      ACF1
## Training set -6.375262 4029.826 2245.18 -Inf  Inf 0.1900376 0.2606508
## 
## Forecasts:
##           Point Forecast     Lo 80     Hi 80     Lo 95     Hi 95
## 2021.3973          -3041  -8205.43  2123.430 -10939.31  4857.314
## 2021.4000          -3041 -10344.61  4262.607 -14210.90  8128.903
## 2021.4027          -3041 -11986.06  5904.055 -16721.28 10639.282
## 2021.4055          -3041 -13369.86  7287.860 -18837.63 12755.628
## 2021.4082          -3041 -14589.02  8507.017 -20702.17 14620.168
## 2021.4110          -3041 -15691.22  9609.219 -22387.84 16305.840
## 2021.4137          -3041 -16704.80 10622.798 -23937.98 17855.975
## 2021.4164          -3041 -17648.21 11566.214 -25380.81 19298.806
## 2021.4192          -3041 -18534.29 12452.290 -26735.94 20653.943
## 2021.4219          -3041 -19372.36 13290.362 -28017.66 21935.663

Plots used to look for best transformation

# Using this for loop i foung the most (in my opinion) optimal transformation of zmiany_ts
library("gridExtra")
library(ggplot2)
myplots = list()
for(i in 1:50) {
  zmiany_ts <- ts(zmiany,start=c(2020,1,23),frequency=365)
  zmiany_ts <- zmiany_ts %>% diff(i)
  zmiany_arima <- auto.arima(zmiany_ts)
  checkresiduals(zmiany_arima)
  myplots[i] <- zmiany_ts %>% ggplot() + ggtitle(sprintf("Try number: %i", i))
  myplots[i]
}

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,3) with zero mean
## Q* = 1124.3, df = 97, p-value < 2.2e-16
## 
## Model df: 5.   Total lags used: 102

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,0) with zero mean
## Q* = 1898.2, df = 100, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 102

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,0,3) with zero mean
## Q* = 405.07, df = 95, p-value < 2.2e-16
## 
## Model df: 6.   Total lags used: 101

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,0,0) with zero mean
## Q* = 1064.9, df = 96, p-value < 2.2e-16
## 
## Model df: 5.   Total lags used: 101

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,1) with zero mean
## Q* = 1378.3, df = 98, p-value < 2.2e-16
## 
## Model df: 3.   Total lags used: 101

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,2) with zero mean
## Q* = 792.14, df = 98, p-value < 2.2e-16
## 
## Model df: 3.   Total lags used: 101

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,0,5) with zero mean
## Q* = 162.52, df = 93, p-value = 1.104e-05
## 
## Model df: 8.   Total lags used: 101

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(4,0,2) with zero mean
## Q* = 612.13, df = 94, p-value < 2.2e-16
## 
## Model df: 6.   Total lags used: 100

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(4,0,3) with non-zero mean
## Q* = 774.19, df = 92, p-value < 2.2e-16
## 
## Model df: 8.   Total lags used: 100

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,3) with zero mean
## Q* = 512.53, df = 95, p-value < 2.2e-16
## 
## Model df: 5.   Total lags used: 100

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(4,0,2) with zero mean
## Q* = 357.8, df = 94, p-value < 2.2e-16
## 
## Model df: 6.   Total lags used: 100

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,3) with zero mean
## Q* = 1436.6, df = 96, p-value < 2.2e-16
## 
## Model df: 4.   Total lags used: 100

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,2) with zero mean
## Q* = 972.22, df = 96, p-value < 2.2e-16
## 
## Model df: 3.   Total lags used: 99

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,3) with zero mean
## Q* = 227.58, df = 95, p-value = 6.907e-13
## 
## Model df: 4.   Total lags used: 99

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,3) with zero mean
## Q* = 699.89, df = 94, p-value < 2.2e-16
## 
## Model df: 5.   Total lags used: 99

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,3) with zero mean
## Q* = 1488.1, df = 95, p-value < 2.2e-16
## 
## Model df: 4.   Total lags used: 99

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,5) with zero mean
## Q* = 981.2, df = 93, p-value < 2.2e-16
## 
## Model df: 6.   Total lags used: 99

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(4,0,2) with zero mean
## Q* = 324.48, df = 92, p-value < 2.2e-16
## 
## Model df: 6.   Total lags used: 98

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,3) with zero mean
## Q* = 1556, df = 94, p-value < 2.2e-16
## 
## Model df: 4.   Total lags used: 98

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0) with zero mean
## Q* = 1588, df = 97, p-value < 2.2e-16
## 
## Model df: 1.   Total lags used: 98

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,2) with zero mean
## Q* = 253.35, df = 95, p-value = 2.22e-16
## 
## Model df: 3.   Total lags used: 98

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,2) with zero mean
## Q* = 824.85, df = 95, p-value < 2.2e-16
## 
## Model df: 3.   Total lags used: 98

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,0,3) with zero mean
## Q* = 390.63, df = 89, p-value < 2.2e-16
## 
## Model df: 8.   Total lags used: 97

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,0,1) with zero mean
## Q* = 636.99, df = 91, p-value < 2.2e-16
## 
## Model df: 6.   Total lags used: 97

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,4) with zero mean
## Q* = 723.59, df = 91, p-value < 2.2e-16
## 
## Model df: 6.   Total lags used: 97

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,3) with zero mean
## Q* = 1564.1, df = 93, p-value < 2.2e-16
## 
## Model df: 4.   Total lags used: 97

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,2) with zero mean
## Q* = 1069.3, df = 94, p-value < 2.2e-16
## 
## Model df: 3.   Total lags used: 97

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,2) with zero mean
## Q* = 393.31, df = 93, p-value < 2.2e-16
## 
## Model df: 3.   Total lags used: 96

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,2) with zero mean
## Q* = 807.13, df = 93, p-value < 2.2e-16
## 
## Model df: 3.   Total lags used: 96

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,3) with zero mean
## Q* = 1440.8, df = 92, p-value < 2.2e-16
## 
## Model df: 4.   Total lags used: 96

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,1) with zero mean
## Q* = 1711.8, df = 94, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 96

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,0,1) with zero mean
## Q* = 510.99, df = 90, p-value < 2.2e-16
## 
## Model df: 6.   Total lags used: 96

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,3) with zero mean
## Q* = 1533.3, df = 91, p-value < 2.2e-16
## 
## Model df: 4.   Total lags used: 95

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,2) with zero mean
## Q* = 968.56, df = 92, p-value < 2.2e-16
## 
## Model df: 3.   Total lags used: 95

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,3) with zero mean
## Q* = 418.85, df = 91, p-value < 2.2e-16
## 
## Model df: 4.   Total lags used: 95

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,2) with zero mean
## Q* = 807.62, df = 92, p-value < 2.2e-16
## 
## Model df: 3.   Total lags used: 95

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,0,1) with zero mean
## Q* = 1100.9, df = 89, p-value < 2.2e-16
## 
## Model df: 6.   Total lags used: 95

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,0,1) with zero mean
## Q* = 741.81, df = 88, p-value < 2.2e-16
## 
## Model df: 6.   Total lags used: 94

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,1) with zero mean
## Q* = 1993, df = 92, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 94

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,0,3) with non-zero mean
## Q* = 1628.6, df = 87, p-value < 2.2e-16
## 
## Model df: 7.   Total lags used: 94

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,2)
## Q* = 1015.8, df = 92, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 94

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,2)
## Q* = 530.98, df = 92, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 94

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,1,3)
## Q* = 819.2, df = 87, p-value < 2.2e-16
## 
## Model df: 6.   Total lags used: 93

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,1,3)
## Q* = 1593, df = 87, p-value < 2.2e-16
## 
## Model df: 6.   Total lags used: 93

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,1,2)
## Q* = 813.35, df = 88, p-value < 2.2e-16
## 
## Model df: 5.   Total lags used: 93

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,2)
## Q* = 483.73, df = 89, p-value < 2.2e-16
## 
## Model df: 4.   Total lags used: 93

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,1,3)
## Q* = 1442.4, df = 87, p-value < 2.2e-16
## 
## Model df: 6.   Total lags used: 93

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,2)
## Q* = 1033.4, df = 90, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 92

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(4,1,3)
## Q* = 301.07, df = 85, p-value < 2.2e-16
## 
## Model df: 7.   Total lags used: 92

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,2)
## Q* = 804.44, df = 88, p-value < 2.2e-16
## 
## Model df: 4.   Total lags used: 92