Import require Libraries and Dataset
library(tidyverse)
library(lubridate) # To formate strings to date
library(forecast) # Models(ETS,ARIMA)
library(googlesheets4) # To load google sheet data to R
library(tseries) # for Augmented Dickey Fuller Test (Stationary Test)
library(gridExtra) #Additional Grids for Plots
Fetch Covid-19 Data from data.covidmyanmar.com
covid_myanmar_data_raw = read_sheet("https://docs.google.com/spreadsheets/d/1-Csmn_rXTQvnkJR8tnFkQEyKBnhq8fz-YxyHidhONiI",sheet="cumulative number")
Select useful variables and rename them
covid_myanmar_data <- covid_myanmar_data_raw %>%
select(c(`Total test`,`announced date`,`negative`,`positive`,`deceased`,`deased with underlying diseases`,`recovered`,`active case`,`ref`,`pui`,`hospital Q`,`facility q`,`cumulative cases`)) %>%
rename(
total_test = `Total test`,
date = `announced date`,
deased_with_conditions =`deased with underlying diseases`,
active_case = `active case`,
hospital_q = `hospital Q`,
facility_q = `facility q`,
cumulative_cases = `cumulative cases`
) %>%
mutate(date = ymd(date)) %>%
mutate(positivity_rate = positive/total_test*100)
Filter dates from the start of second wave
#1:2020-03-23
#second_wave_data <- filter(covid_myanmar_data, date>"2020-08-16" & date<"2020-12-31")
second_wave_data <- filter(covid_myanmar_data, date>"2020-08-16" & date<"2020-12-31")
ts_data <- ts(second_wave_data$positive)
First, we will try visualize our time series object to know the pattern.
autoplot(ts_data) +
ggtitle("Covid-19 Cases in Myanmar") +
theme(plot.title = element_text(hjust = 0.5)) +
ylab("Positive Cases")+
xlab("Days from the start of the Second Wave")
According to the plot, series is not Stationary and no seasonal satterns. Not enough data to decompose. There are some significant outliers in the series. We will detect all the values greater than 3 standard deviations from the mean and replace them with linear interpolation method.
ts_2 <- tsclean(ts_data)
Based on the following plot, we can assume extreme values are normalized.
autoplot(ts_data, series = "Original") +
autolayer(ts_2, series="Outlier Normalized") +
ggtitle("Comparison of the Original and Normalized Series") +
theme(plot.title = element_text(hjust = 0.5)) +
ylab("Positive Cases")+
xlab("Days from the start of the Second Wave")
fit_ets <- ets(ts_2,model = "AAN")
print(summary(fit_ets))
## ETS(A,A,N)
##
## Call:
## ets(y = ts_2, model = "AAN")
##
## Smoothing parameters:
## alpha = 0.3507
## beta = 0.0381
##
## Initial states:
## l = -3.7402
## b = 4.7434
##
## sigma: 142.7282
##
## AIC AICc BIC
## 2023.437 2023.899 2038.001
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -8.373659 140.6136 101.8935 -7.430332 21.14203 0.8344837
## ACF1
## Training set -0.02326358
## ME RMSE MAE MPE MAPE MASE
## Training set -8.373659 140.6136 101.8935 -7.430332 21.14203 0.8344837
## ACF1
## Training set -0.02326358
Forecasted Values
covid_forecast_ets <- forecast::forecast(fit_ets,h = 10)
autoplot(covid_forecast_ets)+
theme(plot.title = element_text(hjust = 0.5))
ndiffs(ts_2)
## [1] 1
diffed_ts <- diff(ts_2,differences = )
autoplot(diffed_ts) +
ggtitle("Time Plot: Covid-19 Trend in Myanmar 1st Differencing") +
ylab("Positive")
adf.test(diffed_ts)
## Warning in adf.test(diffed_ts): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: diffed_ts
## Dickey-Fuller = -6.4851, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
\(H_{0}\) : that a unit root is present in a time series sample. (non-stationary) p-value > 0.05
\(H_{A}\) : is different depending on which version of the test is used (stationary) p-value <= 0.05 p-value = 0.01 < 0.05 \(\therefore\) reject \(H_{0}\) The series is stationary after first differencing.
ggAcf(diffed_ts)+
ggtitle("Autocorrelation") +
theme(plot.title = element_text(hjust = 0.5))
ggPacf(diffed_ts)+
ggtitle("Partial Autocorrelation")+
theme(plot.title = element_text(hjust = 0.5))
ACF and PACF Lags are not stable. Proceed to test with Auto ARIMA Model ##### Testing with Auto AMIMA method
fit_arima_data <- auto.arima(ts_data)
#fit_arima_data <- arima(ts_2,order = c(1, 1, 1))
summary(fit_arima_data)
## Series: ts_data
## ARIMA(0,1,1)
##
## Coefficients:
## ma1
## -0.4996
## s.e. 0.0839
##
## sigma^2 estimated as 35645: log likelihood=-898.69
## AIC=1801.38 AICc=1801.47 BIC=1807.19
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 9.027877 187.4043 128.8072 -1.281323 22.48113 0.8806328 0.03871697
#,d=2,stepwise = FALSE, approximation = FALSE, trace = TRUE
covid_forecast <- forecast::forecast(fit_arima_data,h = 10)
autoplot(covid_forecast)
checkresiduals(fit_arima_data)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)
## Q* = 14.85, df = 9, p-value = 0.09513
##
## Model df: 1. Total lags used: 10
Variations in the residuals and statistically significance value on Lag 7. We Can Assume the modle is not good enough.
\(H_{0}\) : The data are independently distributed (No Correlations) p-value > 0.05 \(H_{A}\) : The data are not independently distributed (Serial Correlations) p-value <= 0.05 p-value = 0.09513 < 0.05 \(\therefore\) reject \(H_{0}\) There are Serial Correlations. Model is not fit.
shapiro.test(fit_arima_data$residuals)
##
## Shapiro-Wilk normality test
##
## data: fit_arima_data$residuals
## W = 0.94109, p-value = 1.643e-05
\(H_{0}\) : Sample come from Normally Distributed population (Normal) p-value > 0.05
\(H_{A}\) : Sample do not come from Normally Distributed population (Not Normal) p-value <= 0.05
p-value = 1.643e-05 < 0.05
\(\therefore\) reject \(H_{0}\)
Residuals are not Normally Distributed.
** According to Ljung-Box test and Shapiro-Wilk test, we can assume that the Model is not fitted. **
ln_ts <- log2(ts_2)
sqrt_ts <- sqrt(ts_2)
bc_ts <- BoxCox(ts_2,lambda = BoxCox.lambda(ts_data) )
p1 <- ggplot() +
geom_qq(aes(sample = ts_2)) +
ggtitle("Original Timeseries") +
theme(plot.title = element_text(hjust = 0.5))
p2 <- ggplot() +
geom_qq(aes(sample = ln_ts)) +
ggtitle("Log (ln)") +
theme(plot.title = element_text(hjust = 0.5))
p3 <- ggplot() +
geom_qq(aes(sample = sqrt_ts)) +
ggtitle("Square Root") +
theme(plot.title = element_text(hjust = 0.5))
p4 <- ggplot() +
geom_qq(aes(sample = bc_ts)) +
ggtitle("Box-Cox Power Transformation") +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(p1,p2,p3,p4)
According to the plots none of the transformation methods can normalize the series. Although there is no improvement, we can start testing with Log Transformation.
#### ARIMA with log Transformation
ndiffs(log(ts_2))
## [1] 2
Two differencing is needed
diffed_ts <- diff(log(ts_2),differences = 2)
autoplot(diffed_ts) +
ggtitle("Data After Differencing")
adf.test(diffed_ts)
## Warning in adf.test(diffed_ts): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: diffed_ts
## Dickey-Fuller = -15.694, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
\(H_{0}\) : that a unit root is present in a time series sample. (non-stationary) p-value > 0.05
\(H_{A}\) : is different depending on which version of the test is used (stationary) p-value <= 0.05 p-value = 0.01 < 0.05 \(\therefore\) reject \(H_{0}\) The series is stationary after first differencing.
ggAcf(diffed_ts)+
ggtitle("Autocorrelation") +
theme(plot.title = element_text(hjust = 0.5))
ggPacf(diffed_ts)+
ggtitle("Partial Autocorrelation")+
theme(plot.title = element_text(hjust = 0.5))
ACF and PACF Lags are not stable. Proceed to test with Auto ARIMA Model ##### Testing with Auto AMIMA method
fit_arima_data <- auto.arima(ts_data)
#fit_arima_data <- arima(ts_2,order = c(1, 1, 1))
summary(fit_arima_data)
## Series: ts_data
## ARIMA(0,1,1)
##
## Coefficients:
## ma1
## -0.4996
## s.e. 0.0839
##
## sigma^2 estimated as 35645: log likelihood=-898.69
## AIC=1801.38 AICc=1801.47 BIC=1807.19
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 9.027877 187.4043 128.8072 -1.281323 22.48113 0.8806328 0.03871697
#,d=2,stepwise = FALSE, approximation = FALSE, trace = TRUE
covid_forecast <- forecast::forecast(fit_arima_data,h = 10)
autoplot(covid_forecast)
checkresiduals(fit_arima_data)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)
## Q* = 14.85, df = 9, p-value = 0.09513
##
## Model df: 1. Total lags used: 10
Variations in the residuals and statistically significance value on Lag 7. We Can Assume the modle is not good enough.
\(H_{0}\) : The data are independently distributed (No Correlations) p-value > 0.05 \(H_{A}\) : The data are not independently distributed (Serial Correlations) p-value <= 0.05 p-value = 0.09513 < 0.05 \(\therefore\) reject \(H_{0}\) There are Serial Correlations. Model is not fit.
shapiro.test(fit_arima_data$residuals)
##
## Shapiro-Wilk normality test
##
## data: fit_arima_data$residuals
## W = 0.94109, p-value = 1.643e-05
\(H_{0}\) : Sample come from Normally Distributed population (Normal) p-value > 0.05
\(H_{A}\) : Sample do not come from Normally Distributed population (Not Normal) p-value <= 0.05
p-value = 1.643e-05 < 0.05
\(\therefore\) reject \(H_{0}\)
Residuals are not Normally Distributed.
** According to Ljung-Box test and Shapiro-Wilk test, we can assume that the Model is not fitted. **
### Conclusion