Let’s install libraries
library(ggpubr)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
library(mvnormtest)
library(agricolae)
## Warning: package 'agricolae' was built under R version 4.0.5
library(gplots)
## Warning: package 'gplots' was built under R version 4.0.5
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(decoder)
## Warning: package 'decoder' was built under R version 4.0.5
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(covid19.analytics)
## Warning: package 'covid19.analytics' was built under R version 4.0.5
library(xts)
## Warning: package 'xts' was built under R version 4.0.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.5
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(fpp2)
## Warning: package 'fpp2' was built under R version 4.0.5
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## -- Attaching packages ---------------------------------------------- fpp2 2.4 --
## <U+221A> forecast 8.14 <U+221A> expsmooth 2.3
## <U+221A> fma 2.4
## Warning: package 'forecast' was built under R version 4.0.5
## Warning: package 'fma' was built under R version 4.0.5
## Warning: package 'expsmooth' was built under R version 4.0.5
## -- Conflicts ------------------------------------------------- fpp2_conflicts --
## x forecast::gghistogram() masks ggpubr::gghistogram()
In this lab, we are going to use dataset which is about active Covid-19 cases daily basis per each city and provinces.
Data source: https://github.com/dtandev/coronavirus/tree/master/data
url <- "https://raw.githubusercontent.com/dtandev/coronavirus/master/data/CoronavirusPL%20-%20Timeseries.csv"
covid_pl <- read.csv(url,header = T, sep = ",",encoding = "UTF-8")
infected <- covid_pl[covid_pl$Infection.Death.Recovery=="I",]
infected$Infection.Death.Recovery<- 1
infected_city <- aggregate(Infection.Death.Recovery~Province+Timestamp, data = infected, FUN = sum)
infected_city_mean <- aggregate(Infection.Death.Recovery~Province, data = infected_city, FUN = mean)
# drawing box plots
# Please use ggboxplot function from ggpubr package, if you already installed and loaded it, there is no need to do it again.
ggboxplot(infected_city, x = "Province", y = "Infection.Death.Recovery",
color = "Province", ylab = "Number of infected", xlab = "Province")
## Data being read from JHU/CCSE repository
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Reading data from https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv
## Data retrieved on 2021-06-16 15:44:22 || Range of dates on data: 2020-01-22--2021-06-15 | Nbr of records: 276
## --------------------------------------------------------------------------------
## Processing... POLAND
## Loading required package: pheatmap
## Warning: package 'pheatmap' was built under R version 4.0.5
ggseasonplot(zmiany_ts, year.labels=TRUE, year.labels.left=TRUE)
zmiany_ts <- ts(zmiany,start=c(2020,1,23),frequency=365)
zmiany_ts %>% plot(main="New Daily infections in Poland", xlab="Year", ylab="Infections in thousands")
# for values
ggAcf(zmiany_ts)
# residuals how coorelated
ggPacf(zmiany_ts)
zmiany_ts <- ts(zmiany,start=c(2020,1,23),frequency=365)
zmiany_ts <- zmiany_ts %>% diff(32)
zmiany_arima <- auto.arima(zmiany_ts)
## Warning: The chosen seasonal unit root test encountered an error when testing for the first difference.
## From stl(): series is not periodic or has less than two periods
## 0 seasonal differences will be used. Consider using a different unit root test.
checkresiduals(zmiany_arima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,0,1) with zero mean
## Q* = 510.99, df = 90, p-value < 2.2e-16
##
## Model df: 6. Total lags used: 96
plot <- zmiany_ts %>% ggplot() + ggtitle("32 Sample")
library(forecast)
pred <- forecast(zmiany_arima, h=34)
pred %>%autoplot()
# The prediction is colored blue
fit <- naive(zmiany_ts)
summary(fit)
##
## Forecast method: Naive method
##
## Model Information:
## Call: naive(y = zmiany_ts)
##
## Residual sd: 4029.8262
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -6.375262 4029.826 2245.18 -Inf Inf 0.1900376 0.2606508
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2021.3973 -3041 -8205.43 2123.430 -10939.31 4857.314
## 2021.4000 -3041 -10344.61 4262.607 -14210.90 8128.903
## 2021.4027 -3041 -11986.06 5904.055 -16721.28 10639.282
## 2021.4055 -3041 -13369.86 7287.860 -18837.63 12755.628
## 2021.4082 -3041 -14589.02 8507.017 -20702.17 14620.168
## 2021.4110 -3041 -15691.22 9609.219 -22387.84 16305.840
## 2021.4137 -3041 -16704.80 10622.798 -23937.98 17855.975
## 2021.4164 -3041 -17648.21 11566.214 -25380.81 19298.806
## 2021.4192 -3041 -18534.29 12452.290 -26735.94 20653.943
## 2021.4219 -3041 -19372.36 13290.362 -28017.66 21935.663
# Using this for loop i foung the most (in my opinion) optimal transformation of zmiany_ts
library("gridExtra")
library(ggplot2)
myplots = list()
for(i in 1:50) {
zmiany_ts <- ts(zmiany,start=c(2020,1,23),frequency=365)
zmiany_ts <- zmiany_ts %>% diff(i)
zmiany_arima <- auto.arima(zmiany_ts)
checkresiduals(zmiany_arima)
myplots[i] <- zmiany_ts %>% ggplot() + ggtitle(sprintf("Try number: %i", i))
myplots[i]
}
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,3) with zero mean
## Q* = 1124.3, df = 97, p-value < 2.2e-16
##
## Model df: 5. Total lags used: 102
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,0) with zero mean
## Q* = 1898.2, df = 100, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 102
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,0,3) with zero mean
## Q* = 405.07, df = 95, p-value < 2.2e-16
##
## Model df: 6. Total lags used: 101
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,0,0) with zero mean
## Q* = 1064.9, df = 96, p-value < 2.2e-16
##
## Model df: 5. Total lags used: 101
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,1) with zero mean
## Q* = 1378.3, df = 98, p-value < 2.2e-16
##
## Model df: 3. Total lags used: 101
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,2) with zero mean
## Q* = 792.14, df = 98, p-value < 2.2e-16
##
## Model df: 3. Total lags used: 101
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,0,5) with zero mean
## Q* = 162.52, df = 93, p-value = 1.104e-05
##
## Model df: 8. Total lags used: 101
##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,0,2) with zero mean
## Q* = 612.13, df = 94, p-value < 2.2e-16
##
## Model df: 6. Total lags used: 100
##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,0,3) with non-zero mean
## Q* = 774.19, df = 92, p-value < 2.2e-16
##
## Model df: 8. Total lags used: 100
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,3) with zero mean
## Q* = 512.53, df = 95, p-value < 2.2e-16
##
## Model df: 5. Total lags used: 100
##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,0,2) with zero mean
## Q* = 357.8, df = 94, p-value < 2.2e-16
##
## Model df: 6. Total lags used: 100
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,3) with zero mean
## Q* = 1436.6, df = 96, p-value < 2.2e-16
##
## Model df: 4. Total lags used: 100
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,2) with zero mean
## Q* = 972.22, df = 96, p-value < 2.2e-16
##
## Model df: 3. Total lags used: 99
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,3) with zero mean
## Q* = 227.58, df = 95, p-value = 6.907e-13
##
## Model df: 4. Total lags used: 99
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,3) with zero mean
## Q* = 699.89, df = 94, p-value < 2.2e-16
##
## Model df: 5. Total lags used: 99
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,3) with zero mean
## Q* = 1488.1, df = 95, p-value < 2.2e-16
##
## Model df: 4. Total lags used: 99
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,5) with zero mean
## Q* = 981.2, df = 93, p-value < 2.2e-16
##
## Model df: 6. Total lags used: 99
##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,0,2) with zero mean
## Q* = 324.48, df = 92, p-value < 2.2e-16
##
## Model df: 6. Total lags used: 98
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,3) with zero mean
## Q* = 1556, df = 94, p-value < 2.2e-16
##
## Model df: 4. Total lags used: 98
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0) with zero mean
## Q* = 1588, df = 97, p-value < 2.2e-16
##
## Model df: 1. Total lags used: 98
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,2) with zero mean
## Q* = 253.35, df = 95, p-value = 2.22e-16
##
## Model df: 3. Total lags used: 98
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,2) with zero mean
## Q* = 824.85, df = 95, p-value < 2.2e-16
##
## Model df: 3. Total lags used: 98
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,0,3) with zero mean
## Q* = 390.63, df = 89, p-value < 2.2e-16
##
## Model df: 8. Total lags used: 97
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,0,1) with zero mean
## Q* = 636.99, df = 91, p-value < 2.2e-16
##
## Model df: 6. Total lags used: 97
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,4) with zero mean
## Q* = 723.59, df = 91, p-value < 2.2e-16
##
## Model df: 6. Total lags used: 97
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,3) with zero mean
## Q* = 1564.1, df = 93, p-value < 2.2e-16
##
## Model df: 4. Total lags used: 97
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,2) with zero mean
## Q* = 1069.3, df = 94, p-value < 2.2e-16
##
## Model df: 3. Total lags used: 97
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,2) with zero mean
## Q* = 393.31, df = 93, p-value < 2.2e-16
##
## Model df: 3. Total lags used: 96
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,2) with zero mean
## Q* = 807.13, df = 93, p-value < 2.2e-16
##
## Model df: 3. Total lags used: 96
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,3) with zero mean
## Q* = 1440.8, df = 92, p-value < 2.2e-16
##
## Model df: 4. Total lags used: 96
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,1) with zero mean
## Q* = 1711.8, df = 94, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 96
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,0,1) with zero mean
## Q* = 510.99, df = 90, p-value < 2.2e-16
##
## Model df: 6. Total lags used: 96
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,3) with zero mean
## Q* = 1533.3, df = 91, p-value < 2.2e-16
##
## Model df: 4. Total lags used: 95
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,2) with zero mean
## Q* = 968.56, df = 92, p-value < 2.2e-16
##
## Model df: 3. Total lags used: 95
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,3) with zero mean
## Q* = 418.85, df = 91, p-value < 2.2e-16
##
## Model df: 4. Total lags used: 95
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,2) with zero mean
## Q* = 807.62, df = 92, p-value < 2.2e-16
##
## Model df: 3. Total lags used: 95
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,0,1) with zero mean
## Q* = 1100.9, df = 89, p-value < 2.2e-16
##
## Model df: 6. Total lags used: 95
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,0,1) with zero mean
## Q* = 741.81, df = 88, p-value < 2.2e-16
##
## Model df: 6. Total lags used: 94
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,1) with zero mean
## Q* = 1993, df = 92, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 94
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,0,3) with non-zero mean
## Q* = 1628.6, df = 87, p-value < 2.2e-16
##
## Model df: 7. Total lags used: 94
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,2)
## Q* = 1015.8, df = 92, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 94
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,2)
## Q* = 530.98, df = 92, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 94
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,1,3)
## Q* = 819.2, df = 87, p-value < 2.2e-16
##
## Model df: 6. Total lags used: 93
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,1,3)
## Q* = 1593, df = 87, p-value < 2.2e-16
##
## Model df: 6. Total lags used: 93
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,1,2)
## Q* = 813.35, df = 88, p-value < 2.2e-16
##
## Model df: 5. Total lags used: 93
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,2)
## Q* = 483.73, df = 89, p-value < 2.2e-16
##
## Model df: 4. Total lags used: 93
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,1,3)
## Q* = 1442.4, df = 87, p-value < 2.2e-16
##
## Model df: 6. Total lags used: 93
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,2)
## Q* = 1033.4, df = 90, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 92
##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,1,3)
## Q* = 301.07, df = 85, p-value < 2.2e-16
##
## Model df: 7. Total lags used: 92
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,2)
## Q* = 804.44, df = 88, p-value < 2.2e-16
##
## Model df: 4. Total lags used: 92