A series of R packages and libraries are installed beforehand.
if (!require("caret")) {
install.packages("caret", repos = "https://cran.asia/")
}
if (!require("dplyr")) {
install.packages("dplyr", repos = "https://cran.asia/")
}
if (!require("forecast")) {
install.packages("forecast", repos = "https://cran.asia/")
}
if (!require("fpp")) {
install.packages("fpp", repos = "https://cran.asia/")
}
if (!require("ggplot2")) {
install.packages("ggplot2", repos = "https://cran.asia/")
}
if (!require("gridExtra")) {
install.packages("gridExtra", repos = "https://cran.asia/")
}
if (!require("kableExtra")) {
install.packages("kableExtra", repos = "https://cran.asia/")
}
if (!require("lubridate")) {
install.packages("lubridate", repos="https://cran.asia/")
}
if (!require("plotly")) {
install.packages("plotly", repos = "https://cran.asia/")
}
if (!require("plyr")) {
install.packages("plyr", repos = "https://cran.asia/")
}
if (!require("raster")) {
install.packages("raster", repos = "https://cran.asia/")
}
if (!require("readxl")) {
install.packages("readxl", repos = "https://cran.asia/")
}
if (!require("scales")) {
install.packages("scales", repos = "https://cran.asia/")
}
if (!require("tidyquant")) {
install.packages("tidyquant", repos = "https://cran.asia/")
}
if (!require('tidyr')) {
install.packages('tidyr', repos = "https://cran.asia/")
}
if (!require("thematic")) {
install.packages("thematic", repos = "https://cran.asia/")
}
if (!require("tseries")) {
install.packages("tseries", repos = "https://cran.asia/")
}
if (!require("zoo")) {
install.packages("zoo", repos = "https://cran.asia/")
}
library(caret)
library(dplyr)
library(forecast)
library(fpp)
library(ggplot2)
library(gridExtra)
library(kableExtra)
library(lubridate)
library(plotly)
library(plyr)
library(raster)
library(readxl)
library(scales)
library(tidyquant)
library(tidyr)
library(tseries)
library(zoo)Introduction
Objectives
In this project, there are two objectives to be achieved:
- To predict the number of daily active cases
- To compare the performance of ARIMA model, Linear Regression and Poisson Distribution
Data Cleaning and Processing
Based on the aforementioned objectives, two datasets “cases_malaysia.csv” and “cases_state.csv” were obtained from https://github.com/MoH-Malaysia/covid19-public and imported to RStudio for further processing to carry out Exploraroty Data Analysis (EDA).
Data Ingestion
# covid_malaysia_endpoint <- "https://raw.githubusercontent.com/MoH-Malaysia/covid19-public/main/epidemic/cases_malasia.csv"
# covid_state_endpoint <- "https://raw.githubusercontent.com/MoH-Malaysia/covid19-public/main/epidemic/cases_state.csv"
covid_malaysia_endpoint <- "models/cases_malaysia.csv"
covid_state_endpoint <- "models/cases_state.csv"
df <- read.csv(covid_malaysia_endpoint, header = TRUE)
df_state <- read.csv(covid_state_endpoint, header = TRUE)df_country <- read.csv("models/cases_malaysia.csv", header=TRUE)
# Convert date column & sort
df_country$date <- as.Date(df_country$date, format="%Y-%m-%d")
df_country <- df_country[order(df_country$date),]
# Filter data
df_data <- df_country[,c("date", "cases_new", "cases_active")]Data Initialization
df_population <- data.frame(
c("Selangor", "Sabah", "Johor", "Sarawak", "Perak", "Kedah", "Kelantan", "Pulau Pinang", "W.P. Kuala Lumpur", "Pahang", "Terengganu", "Negeri Sembilan", "Melaka", "Perlis", "W.P. Putrajaya", "W.P. Labuan"),
c(6555400, 3833000, 3794000, 2822200, 2508900, 2194100, 1928800, 1774400, 1746600, 1684600, 1275100, 1129100, 937500, 255400, 116100, 100100)
)
colnames(df_population) <- c("NAME_1", "pop")
theme_opts <- list(theme(
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank(),
plot.background = element_blank(),
axis.line = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
plot.title = element_blank()
))
malaysia <- getData("GADM", country = "MYS", level = 1)
malaysia@data <- mutate(malaysia@data, NAME_1 = replace(NAME_1, NAME_1 == "Trengganu", "Terengganu"))
malaysia@data <- mutate(malaysia@data, NAME_1 = replace(NAME_1, NAME_1 == "Kuala Lumpur", "W.P. Kuala Lumpur"))
malaysia@data <- mutate(malaysia@data, NAME_1 = replace(NAME_1, NAME_1 == "Labuan", "W.P. Labuan"))
malaysia@data <- mutate(malaysia@data, NAME_1 = replace(NAME_1, NAME_1 == "Putrajaya", "W.P. Putrajaya"))
malaysia@data$id <- rownames(malaysia@data)Data Inspection
This is to obtain an overall understanding on the selected datasets of its strucutre and data context such as data types as well as to have a glance of minimum value, maximum value, median and mean of each of the columns.
# Check the structure of the dataframe
str(df)## 'data.frame': 708 obs. of 31 variables:
## $ date : chr "2020-01-25" "2020-01-26" "2020-01-27" "2020-01-28" ...
## $ cases_new : int 4 0 0 0 3 1 0 0 0 0 ...
## $ cases_import : int 4 0 0 0 3 1 0 0 0 0 ...
## $ cases_recovered : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_active : int 4 4 4 4 7 8 8 8 8 8 ...
## $ cases_cluster : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_unvax : int 4 0 0 0 3 1 0 0 0 0 ...
## $ cases_pvax : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_fvax : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_boost : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_child : int 0 0 0 0 1 0 0 0 0 0 ...
## $ cases_adolescent : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_adult : int 1 0 0 0 2 1 0 0 0 0 ...
## $ cases_elderly : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_0_4 : int 0 0 0 0 1 0 0 0 0 0 ...
## $ cases_5_11 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_12_17 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_18_29 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_30_39 : int 0 0 0 0 1 0 0 0 0 0 ...
## $ cases_40_49 : int 1 0 0 0 0 1 0 0 0 0 ...
## $ cases_50_59 : int 0 0 0 0 1 0 0 0 0 0 ...
## $ cases_60_69 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_70_79 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_80 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cluster_import : int NA NA NA NA NA NA NA NA NA NA ...
## $ cluster_religious : int NA NA NA NA NA NA NA NA NA NA ...
## $ cluster_community : int NA NA NA NA NA NA NA NA NA NA ...
## $ cluster_highRisk : int NA NA NA NA NA NA NA NA NA NA ...
## $ cluster_education : int NA NA NA NA NA NA NA NA NA NA ...
## $ cluster_detentionCentre: int NA NA NA NA NA NA NA NA NA NA ...
## $ cluster_workplace : int NA NA NA NA NA NA NA NA NA NA ...
str(df_state)## 'data.frame': 11344 obs. of 25 variables:
## $ date : chr "2020-01-25" "2020-01-25" "2020-01-25" "2020-01-25" ...
## $ state : chr "Johor" "Kedah" "Kelantan" "Melaka" ...
## $ cases_new : int 4 0 0 0 0 0 0 0 0 0 ...
## $ cases_import : int 4 0 0 0 0 0 0 0 0 0 ...
## $ cases_recovered : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_active : int 4 0 0 0 0 0 0 0 0 0 ...
## $ cases_cluster : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_unvax : int 4 0 0 0 0 0 0 0 0 0 ...
## $ cases_pvax : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_fvax : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_boost : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_child : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_adolescent: int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_adult : int 1 0 0 0 0 0 0 0 0 0 ...
## $ cases_elderly : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_0_4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_5_11 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_12_17 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_18_29 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_30_39 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_40_49 : int 1 0 0 0 0 0 0 0 0 0 ...
## $ cases_50_59 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_60_69 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_70_79 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cases_80 : int 0 0 0 0 0 0 0 0 0 0 ...
# Check the dimension of the dataframe
dim(df)## [1] 708 31
dim(df_state)## [1] 11344 25
# Check the first 6 rows
head(df) %>%
kable("html") %>%
scroll_box(width = "100%") %>%
kable_styling(font_size = 12)| date | cases_new | cases_import | cases_recovered | cases_active | cases_cluster | cases_unvax | cases_pvax | cases_fvax | cases_boost | cases_child | cases_adolescent | cases_adult | cases_elderly | cases_0_4 | cases_5_11 | cases_12_17 | cases_18_29 | cases_30_39 | cases_40_49 | cases_50_59 | cases_60_69 | cases_70_79 | cases_80 | cluster_import | cluster_religious | cluster_community | cluster_highRisk | cluster_education | cluster_detentionCentre | cluster_workplace |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2020-01-25 | 4 | 4 | 0 | 4 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | NA | NA | NA | NA | NA | NA | NA |
| 2020-01-26 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NA | NA | NA | NA | NA | NA | NA |
| 2020-01-27 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NA | NA | NA | NA | NA | NA | NA |
| 2020-01-28 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NA | NA | NA | NA | NA | NA | NA |
| 2020-01-29 | 3 | 3 | 0 | 7 | 0 | 3 | 0 | 0 | 0 | 1 | 0 | 2 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | NA | NA | NA | NA | NA | NA | NA |
| 2020-01-30 | 1 | 1 | 0 | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | NA | NA | NA | NA | NA | NA | NA |
head(df_state) %>%
kable("html") %>%
scroll_box(width = "100%") %>%
kable_styling(font_size = 12)| date | state | cases_new | cases_import | cases_recovered | cases_active | cases_cluster | cases_unvax | cases_pvax | cases_fvax | cases_boost | cases_child | cases_adolescent | cases_adult | cases_elderly | cases_0_4 | cases_5_11 | cases_12_17 | cases_18_29 | cases_30_39 | cases_40_49 | cases_50_59 | cases_60_69 | cases_70_79 | cases_80 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2020-01-25 | Johor | 4 | 4 | 0 | 4 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2020-01-25 | Kedah | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2020-01-25 | Kelantan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2020-01-25 | Melaka | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2020-01-25 | Negeri Sembilan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2020-01-25 | Pahang | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Examine the statistics data
summary(df) %>%
kable("html") %>%
scroll_box(width = "100%") %>%
kable_styling(font_size = 12)| date | cases_new | cases_import | cases_recovered | cases_active | cases_cluster | cases_unvax | cases_pvax | cases_fvax | cases_boost | cases_child | cases_adolescent | cases_adult | cases_elderly | cases_0_4 | cases_5_11 | cases_12_17 | cases_18_29 | cases_30_39 | cases_40_49 | cases_50_59 | cases_60_69 | cases_70_79 | cases_80 | cluster_import | cluster_religious | cluster_community | cluster_highRisk | cluster_education | cluster_detentionCentre | cluster_workplace | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Length:708 | Min. : 0.0 | Min. : 0.00 | Min. : 0 | Min. : 1 | Min. : 0.0 | Min. : 0.0 | Min. : 0 | Min. : 0.0 | Min. : 0.00 | Min. : 0.0 | Min. : 0.0 | Min. : 0.00 | Min. : 0.0 | Min. : 0.0 | Min. : 0.0 | Min. : 0.0 | Min. : 0.0 | Min. : 0.0 | Min. : 0.0 | Min. : 0.0 | Min. : 0.0 | Min. : 0.00 | Min. : 0 | Min. : 0.0000 | Min. : 0.00 | Min. : 0.0 | Min. : 0.0 | Min. : 0.00 | Min. : 0.00 | Min. : 14.0 | |
| Class :character | 1st Qu.: 53.5 | 1st Qu.: 3.00 | 1st Qu.: 51 | 1st Qu.: 1212 | 1st Qu.: 17.0 | 1st Qu.: 53.5 | 1st Qu.: 0 | 1st Qu.: 0.0 | 1st Qu.: 0.00 | 1st Qu.: 2.0 | 1st Qu.: 3.0 | 1st Qu.: 38.75 | 1st Qu.: 4.0 | 1st Qu.: 1.0 | 1st Qu.: 1.0 | 1st Qu.: 3.0 | 1st Qu.: 16.0 | 1st Qu.: 10.0 | 1st Qu.: 6.0 | 1st Qu.: 4.0 | 1st Qu.: 2.0 | 1st Qu.: 1.00 | 1st Qu.: 0 | 1st Qu.: 0.0000 | 1st Qu.: 0.00 | 1st Qu.: 43.0 | 1st Qu.: 4.0 | 1st Qu.: 2.25 | 1st Qu.: 2.00 | 1st Qu.: 172.5 | |
| Mode :character | Median : 1542.0 | Median : 6.00 | Median : 1303 | Median : 15124 | Median : 364.0 | Median : 1205.5 | Median : 0 | Median : 0.0 | Median : 0.00 | Median : 122.0 | Median : 68.0 | Median : 1156.50 | Median : 93.5 | Median : 46.5 | Median : 75.0 | Median : 68.0 | Median : 466.5 | Median : 391.5 | Median : 197.0 | Median : 119.0 | Median : 64.5 | Median : 22.00 | Median : 8 | Median : 0.0000 | Median : 4.00 | Median :137.5 | Median : 16.0 | Median : 16.50 | Median : 37.00 | Median : 494.5 | |
| NA | Mean : 3900.4 | Mean : 12.21 | Mean : 3798 | Mean : 45991 | Mean : 693.7 | Mean : 2389.9 | Mean : 557 | Mean : 942.0 | Mean : 11.48 | Mean : 522.4 | Mean : 257.1 | Mean : 2666.92 | Mean : 348.9 | Mean : 208.0 | Mean : 314.4 | Mean : 257.1 | Mean :1006.1 | Mean : 820.5 | Mean : 494.7 | Mean : 345.7 | Mean : 223.6 | Mean : 92.29 | Mean : 33 | Mean : 0.4727 | Mean : 23.19 | Mean :198.8 | Mean : 27.3 | Mean : 37.53 | Mean : 60.47 | Mean : 624.4 | |
| NA | 3rd Qu.: 5299.5 | 3rd Qu.: 13.00 | 3rd Qu.: 5119 | 3rd Qu.: 63406 | 3rd Qu.:1088.8 | 3rd Qu.: 3322.8 | 3rd Qu.: 92 | 3rd Qu.: 290.2 | 3rd Qu.: 0.00 | 3rd Qu.: 744.8 | 3rd Qu.: 291.5 | 3rd Qu.: 3596.00 | 3rd Qu.: 554.5 | 3rd Qu.: 294.2 | 3rd Qu.: 450.8 | 3rd Qu.: 291.5 | 3rd Qu.:1243.5 | 3rd Qu.:1136.8 | 3rd Qu.: 670.2 | 3rd Qu.: 518.0 | 3rd Qu.: 364.0 | 3rd Qu.:149.00 | 3rd Qu.: 50 | 3rd Qu.: 0.0000 | 3rd Qu.: 16.00 | 3rd Qu.:300.2 | 3rd Qu.: 41.0 | 3rd Qu.: 40.00 | 3rd Qu.: 86.00 | 3rd Qu.:1049.5 | |
| NA | Max. :24599.0 | Max. :366.00 | Max. :24855 | Max. :263850 | Max. :3394.0 | Max. :12684.0 | Max. :7318 | Max. :8448.0 | Max. :305.00 | Max. :3437.0 | Max. :1820.0 | Max. :16450.00 | Max. :1986.0 | Max. :1362.0 | Max. :2091.0 | Max. :1820.0 | Max. :6374.0 | Max. :4922.0 | Max. :3132.0 | Max. :2066.0 | Max. :1231.0 | Max. :581.00 | Max. :210 | Max. :54.0000 | Max. :359.00 | Max. :825.0 | Max. :189.0 | Max. :501.00 | Max. :439.00 | Max. :2338.0 | |
| NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA’s :342 | NA’s :342 | NA’s :342 | NA’s :342 | NA’s :342 | NA’s :342 | NA’s :342 |
summary(df_state) %>%
kable("html") %>%
scroll_box(width = "100%") %>%
kable_styling(font_size = 12)| date | state | cases_new | cases_import | cases_recovered | cases_active | cases_cluster | cases_unvax | cases_pvax | cases_fvax | cases_boost | cases_child | cases_adolescent | cases_adult | cases_elderly | cases_0_4 | cases_5_11 | cases_12_17 | cases_18_29 | cases_30_39 | cases_40_49 | cases_50_59 | cases_60_69 | cases_70_79 | cases_80 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Length:11344 | Length:11344 | Min. : 0.0 | Min. : 0.0000 | Min. : 0.0 | Min. : -2.0 | Min. : 0.0 | Min. : 0.0 | Min. : 0.00 | Min. : 0.00 | Min. : 0.0000 | Min. : 0.00 | Min. : 0.00 | Min. : 0.0 | Min. : 0.00 | Min. : 0 | Min. : 0.00 | Min. : 0.00 | Min. : 0.00 | Min. : 0.00 | Min. : 0.00 | Min. : 0.0 | Min. : 0.00 | Min. : 0.000 | Min. : 0.000 | |
| Class :character | Class :character | 1st Qu.: 0.0 | 1st Qu.: 0.0000 | 1st Qu.: 0.0 | 1st Qu.: 11.0 | 1st Qu.: 0.0 | 1st Qu.: 0.0 | 1st Qu.: 0.00 | 1st Qu.: 0.00 | 1st Qu.: 0.0000 | 1st Qu.: 0.00 | 1st Qu.: 0.00 | 1st Qu.: 0.0 | 1st Qu.: 0.00 | 1st Qu.: 0 | 1st Qu.: 0.00 | 1st Qu.: 0.00 | 1st Qu.: 0.00 | 1st Qu.: 0.00 | 1st Qu.: 0.00 | 1st Qu.: 0.0 | 1st Qu.: 0.00 | 1st Qu.: 0.000 | 1st Qu.: 0.000 | |
| Mode :character | Mode :character | Median : 19.0 | Median : 0.0000 | Median : 16.0 | Median : 274.5 | Median : 3.0 | Median : 14.0 | Median : 0.00 | Median : 0.00 | Median : 0.0000 | Median : 1.00 | Median : 1.00 | Median : 13.0 | Median : 1.00 | Median : 0 | Median : 1.00 | Median : 1.00 | Median : 4.00 | Median : 4.00 | Median : 2.00 | Median : 2.0 | Median : 1.00 | Median : 0.000 | Median : 0.000 | |
| NA | NA | Mean : 243.7 | Mean : 0.7916 | Mean : 237.3 | Mean : 2874.0 | Mean : 43.3 | Mean : 149.2 | Mean : 34.77 | Mean : 58.97 | Mean : 0.7377 | Mean : 32.64 | Mean : 16.06 | Mean : 166.6 | Mean : 21.81 | Mean : 13 | Mean : 19.64 | Mean : 16.06 | Mean : 62.85 | Mean : 51.27 | Mean : 30.91 | Mean : 21.6 | Mean : 13.97 | Mean : 5.768 | Mean : 2.063 | |
| NA | NA | 3rd Qu.: 237.0 | 3rd Qu.: 0.0000 | 3rd Qu.: 215.2 | 3rd Qu.: 2532.5 | 3rd Qu.: 40.0 | 3rd Qu.: 134.0 | 3rd Qu.: 3.00 | 3rd Qu.: 8.00 | 3rd Qu.: 0.0000 | 3rd Qu.: 29.00 | 3rd Qu.: 12.00 | 3rd Qu.: 160.0 | 3rd Qu.: 22.25 | 3rd Qu.: 11 | 3rd Qu.: 17.00 | 3rd Qu.: 12.00 | 3rd Qu.: 59.00 | 3rd Qu.: 50.00 | 3rd Qu.: 29.00 | 3rd Qu.: 22.0 | 3rd Qu.: 15.00 | 3rd Qu.: 6.000 | 3rd Qu.: 2.000 | |
| NA | NA | Max. :8792.0 | Max. :74.0000 | Max. :8801.0 | Max. :94137.0 | Max. :1545.0 | Max. :6112.0 | Max. :3890.00 | Max. :3610.00 | Max. :85.0000 | Max. :1002.00 | Max. :527.00 | Max. :6549.0 | Max. :637.00 | Max. :429 | Max. :608.00 | Max. :527.00 | Max. :2524.00 | Max. :2097.00 | Max. :1265.00 | Max. :699.0 | Max. :449.00 | Max. :157.000 | Max. :62.000 |
Handle missing/duplicate values
In this part of data cleaning step, it was found out that there was missing data in some of the columns in “df” dataset while there was none in “df_state”. The rows with missing data were data recorded for year 2020 and it was not being taken into consideration for predicting daily active cases because data in year 2020 was not of interest for this project.
# Check for the columns with missing values
colSums(is.na(df)) %>%
kable("html") %>%
scroll_box(width = "100%") %>%
kable_styling(font_size = 12)| x | |
|---|---|
| date | 0 |
| cases_new | 0 |
| cases_import | 0 |
| cases_recovered | 0 |
| cases_active | 0 |
| cases_cluster | 0 |
| cases_unvax | 0 |
| cases_pvax | 0 |
| cases_fvax | 0 |
| cases_boost | 0 |
| cases_child | 0 |
| cases_adolescent | 0 |
| cases_adult | 0 |
| cases_elderly | 0 |
| cases_0_4 | 0 |
| cases_5_11 | 0 |
| cases_12_17 | 0 |
| cases_18_29 | 0 |
| cases_30_39 | 0 |
| cases_40_49 | 0 |
| cases_50_59 | 0 |
| cases_60_69 | 0 |
| cases_70_79 | 0 |
| cases_80 | 0 |
| cluster_import | 342 |
| cluster_religious | 342 |
| cluster_community | 342 |
| cluster_highRisk | 342 |
| cluster_education | 342 |
| cluster_detentionCentre | 342 |
| cluster_workplace | 342 |
colSums(is.na(df_state)) %>%
kable("html") %>%
scroll_box(width = "100%") %>%
kable_styling(font_size = 12)| x | |
|---|---|
| date | 0 |
| state | 0 |
| cases_new | 0 |
| cases_import | 0 |
| cases_recovered | 0 |
| cases_active | 0 |
| cases_cluster | 0 |
| cases_unvax | 0 |
| cases_pvax | 0 |
| cases_fvax | 0 |
| cases_boost | 0 |
| cases_child | 0 |
| cases_adolescent | 0 |
| cases_adult | 0 |
| cases_elderly | 0 |
| cases_0_4 | 0 |
| cases_5_11 | 0 |
| cases_12_17 | 0 |
| cases_18_29 | 0 |
| cases_30_39 | 0 |
| cases_40_49 | 0 |
| cases_50_59 | 0 |
| cases_60_69 | 0 |
| cases_70_79 | 0 |
| cases_80 | 0 |
# Show first few rows of the missing values
head(df[rowSums(is.na(df)) > 0, ]) %>%
kable("html") %>%
scroll_box(width = "100%") %>%
kable_styling(font_size = 12)| date | cases_new | cases_import | cases_recovered | cases_active | cases_cluster | cases_unvax | cases_pvax | cases_fvax | cases_boost | cases_child | cases_adolescent | cases_adult | cases_elderly | cases_0_4 | cases_5_11 | cases_12_17 | cases_18_29 | cases_30_39 | cases_40_49 | cases_50_59 | cases_60_69 | cases_70_79 | cases_80 | cluster_import | cluster_religious | cluster_community | cluster_highRisk | cluster_education | cluster_detentionCentre | cluster_workplace |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2020-01-25 | 4 | 4 | 0 | 4 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | NA | NA | NA | NA | NA | NA | NA |
| 2020-01-26 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NA | NA | NA | NA | NA | NA | NA |
| 2020-01-27 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NA | NA | NA | NA | NA | NA | NA |
| 2020-01-28 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NA | NA | NA | NA | NA | NA | NA |
| 2020-01-29 | 3 | 3 | 0 | 7 | 0 | 3 | 0 | 0 | 0 | 1 | 0 | 2 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | NA | NA | NA | NA | NA | NA | NA |
| 2020-01-30 | 1 | 1 | 0 | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | NA | NA | NA | NA | NA | NA | NA |
head(df[rowSums(is.na(df_state)) > 0, ]) %>%
kable("html") %>%
scroll_box(width = "100%") %>%
kable_styling(font_size = 12)| date | cases_new | cases_import | cases_recovered | cases_active | cases_cluster | cases_unvax | cases_pvax | cases_fvax | cases_boost | cases_child | cases_adolescent | cases_adult | cases_elderly | cases_0_4 | cases_5_11 | cases_12_17 | cases_18_29 | cases_30_39 | cases_40_49 | cases_50_59 | cases_60_69 | cases_70_79 | cases_80 | cluster_import | cluster_religious | cluster_community | cluster_highRisk | cluster_education | cluster_detentionCentre | cluster_workplace |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
# The missing rows for df can be ignored as there are 2020 data. 2021 data contains more columns.
# There is no missing rows for df_state.
# Check for duplicate values
df[duplicated(df)]## data frame with 0 columns and 708 rows
df[duplicated(df_state)]## data frame with 0 columns and 708 rows
# There are no duplicated rowsPreprocessing
# Change date type from String to Date
df$date <- as.Date(df$date, format = "%Y-%m-%d")
df_state$date <- as.Date(df_state$date, format = "%Y-%m-%d")Total Cases
df_total_cases <- df_state %>%
group_by(state) %>%
summarise_at(vars(cases_new), list(cases_total = sum)) %>%
mutate(cases_total = cases_total / 1000) %>%
arrange(state) %>%
dplyr::rename(NAME_1 = state)
malaysia_map <- data.table::copy(malaysia)
malaysia_map@data <- join(malaysia_map@data, df_total_cases, by = "NAME_1")
malaysia_df <- fortify(malaysia_map)## Regions defined for each Polygons
malaysia_df <- join(malaysia_df, malaysia_map@data, by = "id")
# https://garthtarr.github.io/meatR/ggplot_extensions.html
# https://rstudio-pubs-static.s3.amazonaws.com/160207_ebe47475bb7744429b9bd4c908e2dc45.html
# show total cases on Malaysia map
ggplot() +
geom_polygon(data = malaysia_df, aes(x = long, y = lat, group = group, fill = cases_total), color = "white", size = 0.25) +
theme(aspect.ratio = 2 / 5) +
scale_fill_distiller(name = "No. of Total Cases (in '000)", palette = "YlOrRd", direction = 1, breaks = pretty_breaks(n = 5)) +
labs(title = "Total Cases Since Day 1") +
theme_opts# show total cases table
df_state %>%
group_by(state) %>%
summarise_at(vars(cases_new), list(cases_total = sum)) %>%
arrange(desc(cases_total)) %>%
kable("html") %>%
scroll_box(width = "100%") %>%
kable_styling(font_size = 12)| state | cases_total |
|---|---|
| Selangor | 788150 |
| Sarawak | 252341 |
| Johor | 245773 |
| Sabah | 240961 |
| W.P. Kuala Lumpur | 211176 |
| Kelantan | 169668 |
| Kedah | 168865 |
| Pulau Pinang | 161476 |
| Perak | 129352 |
| Negeri Sembilan | 113750 |
| Pahang | 95817 |
| Terengganu | 82932 |
| Melaka | 76620 |
| W.P. Labuan | 10841 |
| W.P. Putrajaya | 9428 |
| Perlis | 7204 |
Infection Rate
df_infection_rate <- df_state %>%
group_by(state) %>%
summarise_at(vars(cases_new), list(cases_total = sum)) %>%
dplyr::rename(NAME_1 = state) %>%
join(df_population, by = "NAME_1") %>%
mutate(rate = cases_total / pop) %>%
arrange(NAME_1)
malaysia_map <- data.table::copy(malaysia)
malaysia_map@data <- join(malaysia_map@data, df_infection_rate, by = "NAME_1")
malaysia_df <- fortify(malaysia_map)## Regions defined for each Polygons
malaysia_df <- join(malaysia_df, malaysia_map@data, by = "id")
# https://garthtarr.github.io/meatR/ggplot_extensions.html
# https://rstudio-pubs-static.s3.amazonaws.com/160207_ebe47475bb7744429b9bd4c908e2dc45.html
# show infection rate on Malaysia map
ggplot() +
geom_polygon(data = malaysia_df, aes(x = long, y = lat, group = group, fill = rate), color = "white", size = 0.25) +
theme(aspect.ratio = 2 / 5) +
scale_fill_distiller(name = "No. of Total Cases (%)", palette = "YlOrRd", direction = 1, breaks = pretty_breaks(n = 5), labels = percent) +
labs(title = "Infection Rate based on Total Population") +
theme_opts# show infection rate
df_state %>%
group_by(state) %>%
summarise_at(vars(cases_new), list(cases_total = sum)) %>%
dplyr::rename(NAME_1 = state) %>%
join(df_population, by = "NAME_1") %>%
dplyr::rename(state = NAME_1) %>%
mutate(rate = cases_total / pop) %>%
arrange(desc(rate)) %>%
mutate(rate = paste0(round(cases_total / pop * 100, 2), "%")) %>%
kable("html") %>%
scroll_box(width = "100%") %>%
kable_styling(font_size = 12)| state | cases_total | pop | rate |
|---|---|---|---|
| W.P. Kuala Lumpur | 211176 | 1746600 | 12.09% |
| Selangor | 788150 | 6555400 | 12.02% |
| W.P. Labuan | 10841 | 100100 | 10.83% |
| Negeri Sembilan | 113750 | 1129100 | 10.07% |
| Pulau Pinang | 161476 | 1774400 | 9.1% |
| Sarawak | 252341 | 2822200 | 8.94% |
| Kelantan | 169668 | 1928800 | 8.8% |
| Melaka | 76620 | 937500 | 8.17% |
| W.P. Putrajaya | 9428 | 116100 | 8.12% |
| Kedah | 168865 | 2194100 | 7.7% |
| Terengganu | 82932 | 1275100 | 6.5% |
| Johor | 245773 | 3794000 | 6.48% |
| Sabah | 240961 | 3833000 | 6.29% |
| Pahang | 95817 | 1684600 | 5.69% |
| Perak | 129352 | 2508900 | 5.16% |
| Perlis | 7204 | 255400 | 2.82% |
ARIMA
# Performing ARIMA on the number of COVID cases in Malaysia
# Aim to predict the next 60 days of new cases based on historical data
# A bit of cleaning
mycovidnewcases = df_country[,c(1:2)]
colnames(mycovidnewcases)## [1] "date" "cases_new"
str(mycovidnewcases)## 'data.frame': 708 obs. of 2 variables:
## $ date : Date, format: "2020-01-25" "2020-01-26" ...
## $ cases_new: int 4 0 0 0 3 1 0 0 0 0 ...
head(mycovidnewcases)## date cases_new
## 1 2020-01-25 4
## 2 2020-01-26 0
## 3 2020-01-27 0
## 4 2020-01-28 0
## 5 2020-01-29 3
## 6 2020-01-30 1
mycovidnewcases$date = as.Date(mycovidnewcases$date, format = "%d/%m/%Y")
covidplot = ggplot(mycovidnewcases, aes(date, cases_new)) +
geom_line()
covidplot- Plot ACF and PACF to understand the correlation in a time series data.
- Autocorrelation is the correlation between a time series and a delayed version of itself (lag).
- Autocorrelation Function (ACF) plots the correlation coefficient against the lag.
- The Partial Autocorrelation captures a ‘direct’ correlation between time series and a lagged version of itself
# Perform ADF and KPSS test to check stationarity
# Null hypothesis is that the series is not stationary.
adf = adf.test(mycovidnewcases[,2])
adf##
## Augmented Dickey-Fuller Test
##
## data: mycovidnewcases[, 2]
## Dickey-Fuller = -1.5953, Lag order = 8, p-value = 0.7496
## alternative hypothesis: stationary
kpss = kpss.test(mycovidnewcases[,2])## Warning in kpss.test(mycovidnewcases[, 2]): p-value smaller than printed p-value
kpss##
## KPSS Test for Level Stationarity
##
## data: mycovidnewcases[, 2]
## KPSS Level = 5.3702, Truncation lag parameter = 6, p-value = 0.01
The ADF and KPSS test suggests that the data is non-stationary and differencing is required.
Split data into train and test set.
training = mycovidnewcases[1:500,]
test = mycovidnewcases[501:709,]Check how many differencing is needed
ndiffs(training[,2]) # Differencing of one is required but ARIMA model will do this for us as long as we input correct order (value of d)## [1] 1
Use auto.arima to find out best model
summary(auto.arima(training[,2], trace = TRUE, ic = 'aicc', approximation = FALSE, stepwise = FALSE)) # Best model and order: ARIMA (3,1,2)##
## ARIMA(0,1,0) : 7250.838
## ARIMA(0,1,0) with drift : 7252.388
## ARIMA(0,1,1) : 7226.424
## ARIMA(0,1,1) with drift : 7227.317
## ARIMA(0,1,2) : 7215.297
## ARIMA(0,1,2) with drift : 7215.2
## ARIMA(0,1,3) : 7216.229
## ARIMA(0,1,3) with drift : 7215.997
## ARIMA(0,1,4) : 7218.209
## ARIMA(0,1,4) with drift : 7217.961
## ARIMA(0,1,5) : 7211.764
## ARIMA(0,1,5) with drift : 7212.052
## ARIMA(1,1,0) : 7234.383
## ARIMA(1,1,0) with drift : 7235.676
## ARIMA(1,1,1) : 7215.71
## ARIMA(1,1,1) with drift : 7215.553
## ARIMA(1,1,2) : 7216.471
## ARIMA(1,1,2) with drift : 7216.263
## ARIMA(1,1,3) : 7218.263
## ARIMA(1,1,3) with drift : 7218.036
## ARIMA(1,1,4) : 7219.852
## ARIMA(1,1,4) with drift : 7219.599
## ARIMA(2,1,0) : 7226.169
## ARIMA(2,1,0) with drift : 7227.126
## ARIMA(2,1,1) : 7215.972
## ARIMA(2,1,1) with drift : 7215.732
## ARIMA(2,1,2) : 7195.211
## ARIMA(2,1,2) with drift : 7195.784
## ARIMA(2,1,3) : 7200.711
## ARIMA(2,1,3) with drift : Inf
## ARIMA(3,1,0) : 7222.493
## ARIMA(3,1,0) with drift : 7223.085
## ARIMA(3,1,1) : 7215.7
## ARIMA(3,1,1) with drift : 7215.298
## ARIMA(3,1,2) : 7175.841
## ARIMA(3,1,2) with drift : 7176.026
## ARIMA(4,1,0) : 7212.231
## ARIMA(4,1,0) with drift : 7212.011
## ARIMA(4,1,1) : 7210.832
## ARIMA(4,1,1) with drift : 7210.051
## ARIMA(5,1,0) : 7207.49
## ARIMA(5,1,0) with drift : 7206.446
##
##
##
## Best model: ARIMA(3,1,2)
## Series: training[, 2]
## ARIMA(3,1,2)
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2
## 0.9861 -0.6205 -0.2513 -1.2967 0.8667
## s.e. 0.0565 0.0601 0.0502 0.0399 0.0339
##
## sigma^2 estimated as 101232: log likelihood=-3581.84
## AIC=7175.67 AICc=7175.84 BIC=7200.95
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 19.26179 316.2544 167.7446 NaN Inf 0.8891027 -0.03064234
Train the model with selected ARIMA model
trainingmodel = Arima(training[,2], order = c(3,1,2))
summary(trainingmodel)## Series: training[, 2]
## ARIMA(3,1,2)
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2
## 0.9861 -0.6205 -0.2513 -1.2967 0.8667
## s.e. 0.0565 0.0601 0.0502 0.0399 0.0339
##
## sigma^2 estimated as 101232: log likelihood=-3581.84
## AIC=7175.67 AICc=7175.84 BIC=7200.95
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 19.26179 316.2544 167.7446 NaN Inf 0.8891027 -0.03064234
coeftest(trainingmodel)##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.986093 0.056544 17.4392 < 2.2e-16 ***
## ar2 -0.620519 0.060052 -10.3331 < 2.2e-16 ***
## ar3 -0.251273 0.050227 -5.0028 5.652e-07 ***
## ma1 -1.296708 0.039865 -32.5272 < 2.2e-16 ***
## ma2 0.866657 0.033948 25.5291 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(trainingmodel$residuals)Make prediction using test set with the training model
testpred = forecast(test[,2], model = trainingmodel, h = 60, levels = c(95))
summary(testpred)##
## Forecast method: ARIMA(3,1,2)
##
## Model Information:
## Series: object
## ARIMA(3,1,2)
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2
## 0.9861 -0.6205 -0.2513 -1.2967 0.8667
## s.e. 0.0000 0.0000 0.0000 0.0000 0.0000
##
## sigma^2 estimated as 101232: log likelihood=-1706.85
## AIC=3415.7 AICc=3415.72 BIC=3419.04
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -17.281 916.5365 652.5599 -0.8898021 6.495591 0.810123 0.08568568
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 209 2967.914 2472.661 3463.167 2210.4895 3725.338
## 210 3118.206 2560.440 3675.972 2265.1766 3971.235
## 211 3407.344 2811.877 4002.811 2496.6557 4318.032
## 212 3623.620 2992.813 4254.427 2658.8838 4588.355
## 213 3619.708 2943.420 4295.996 2585.4145 4654.002
## 214 3408.995 2672.655 4145.335 2282.8606 4535.129
## 215 3149.295 2348.014 3950.577 1923.8413 4374.749
## 216 3024.942 2168.636 3881.247 1715.3352 4334.548
## 217 3116.412 2221.233 4011.592 1747.3536 4485.471
## 218 3349.030 2426.517 4271.543 1938.1680 4759.892
## 219 3552.900 2605.801 4499.999 2104.4376 5001.363
## 220 3586.608 2609.590 4563.625 2092.3877 5080.827
## 221 3434.890 2418.556 4451.224 1880.5420 4989.238
## 222 3213.140 2151.827 4274.452 1590.0023 4836.277
## 223 3080.147 1977.292 4183.001 1393.4761 4766.818
## 224 3124.726 1989.768 4259.685 1388.9569 4860.496
## 225 3306.930 2148.134 4465.727 1534.7040 5079.157
## 226 3492.356 2312.614 4672.097 1688.0963 5296.615
## 227 3550.939 2347.393 4754.486 1710.2740 5391.605
## 228 3447.866 2214.146 4681.586 1561.0538 5334.677
## 229 3263.281 1994.456 4532.106 1322.7800 5203.781
## 230 3130.502 1827.482 4433.521 1137.7058 5123.297
## 231 3140.007 1808.746 4471.268 1104.0186 5175.995
## 232 3278.153 1924.919 4631.388 1208.5601 5347.747
## 233 3441.844 2069.516 4814.172 1343.0498 5540.638
## 234 3515.147 2122.317 4907.978 1384.9966 5645.298
## 235 3451.146 2033.403 4868.889 1282.8950 5619.396
## 236 3301.417 1854.667 4748.167 1088.8035 5514.030
## 237 3175.066 1699.019 4651.112 917.6477 5432.484
## 238 3159.463 1657.909 4661.016 863.0351 5455.891
## 239 3260.103 1737.856 4782.350 932.0270 5588.179
## 240 3400.774 1860.566 4940.982 1045.2295 5756.319
## 241 3480.961 1922.290 5039.631 1097.1792 5864.742
## 242 3447.454 1867.267 5027.642 1030.7662 5864.143
## 243 3329.310 1724.279 4934.342 874.6264 5783.994
## 244 3213.452 1582.711 4844.193 719.4483 5707.455
## 245 3180.934 1526.842 4835.027 651.2182 5710.650
## 246 3250.448 1576.687 4924.209 690.6509 5810.245
## 247 3368.285 1677.354 5059.215 782.2299 5954.339
## 248 3449.519 1741.492 5157.545 837.3178 6061.720
## 249 3439.036 1711.832 5166.241 797.5048 6080.568
## 250 3348.683 1599.640 5097.726 673.7530 6023.614
## 251 3245.679 1473.689 5017.670 535.6543 5955.705
## 252 3202.808 1409.262 4996.353 459.8168 5945.799
## 253 3247.151 1434.836 5059.466 475.4548 6018.848
## 254 3343.363 1514.498 5172.228 546.3551 6140.371
## 255 3421.493 1576.505 5266.481 599.8279 6243.158
## 256 3427.693 1565.209 5290.177 579.2695 6276.117
## 257 3361.150 1479.076 5243.225 482.7665 6239.534
## 258 3272.054 1369.225 5174.883 361.9276 6182.180
## 259 3223.929 1301.085 5146.774 283.1921 6164.667
## 260 3248.481 1307.694 5189.268 280.3028 6216.659
## 261 3324.940 1368.131 5281.749 332.2592 6317.622
## 262 3397.194 1424.995 5369.394 380.9756 6413.413
## 263 3414.830 1426.385 5403.275 373.7654 6455.894
## 264 3368.173 1361.862 5374.484 299.7853 6436.561
## 265 3293.066 1267.773 5318.360 195.6471 6390.485
## 266 3243.524 1199.554 5287.494 117.5416 6369.507
## 267 3253.000 1191.864 5314.136 100.7647 6405.235
## 268 3311.958 1235.281 5388.634 135.9553 6487.961
plot(testpred)
grid(nx = NULL, ny = NULL,
lty = 2,
col = 'gray',
lwd = 2)accuracy(testpred)## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -17.281 916.5365 652.5599 -0.8898021 6.495591 0.810123 0.08568568
With rolling average
Rolling average of 7
mycovidroll7 = mycovidnewcases %>%
mutate(seven_avg = rollmean(mycovidnewcases[,2], 7, align = 'left', fill = 0)) %>%
relocate(seven_avg)
head(mycovidroll7)## seven_avg date cases_new
## 1 1.1428571 2020-01-25 4
## 2 0.5714286 2020-01-26 0
## 3 0.5714286 2020-01-27 0
## 4 0.5714286 2020-01-28 0
## 5 0.8571429 2020-01-29 3
## 6 0.7142857 2020-01-30 1
ggplot(mycovidroll7, aes(date, mycovidnewcases[,2])) +
geom_col(fill = 'pink') +
geom_line(aes(y = seven_avg), color = 'red', size = 0.75) +
geom_line(aes(y = mycovidnewcases[,2]), color = 'blue', size = 1) +
labs(title = 'MY COVID data', y = 'covid cases')Rolling average of 21
mycovidroll21 = mycovidnewcases %>%
mutate(twoone_avg = rollmean(mycovidnewcases[,2], 21, align = 'left', fill = 0)) %>%
relocate(twoone_avg)
ggplot(mycovidroll21, aes(date, mycovidnewcases[,2])) +
geom_col(fill = 'pink') +
geom_line(aes(y = mycovidroll7$seven_avg), color = 'red', size = 0.75) +
geom_line(aes(y = mycovidnewcases[,2]), color = 'blue', size = 1) +
geom_line(aes(y = twoone_avg), color = 'green', size = 0.9) +
labs(title = 'MY COVID data', y = 'covid cases')tail(mycovidroll21, 30)## twoone_avg date cases_new
## 679 4175.524 2021-12-03 5551
## 680 4079.190 2021-12-04 4896
## 681 3996.524 2021-12-05 4298
## 682 3924.143 2021-12-06 4262
## 683 3852.476 2021-12-07 4965
## 684 3754.000 2021-12-08 5020
## 685 3690.333 2021-12-09 5446
## 686 3621.333 2021-12-10 5058
## 687 3550.619 2021-12-11 4626
## 688 3491.571 2021-12-12 3490
## 689 0.000 2021-12-13 3504
## 690 0.000 2021-12-14 4097
## 691 0.000 2021-12-15 3900
## 692 0.000 2021-12-16 4262
## 693 0.000 2021-12-17 4362
## 694 0.000 2021-12-18 4083
## 695 0.000 2021-12-19 3108
## 696 0.000 2021-12-20 2589
## 697 0.000 2021-12-21 3140
## 698 0.000 2021-12-22 3519
## 699 0.000 2021-12-23 3510
## 700 0.000 2021-12-24 3528
## 701 0.000 2021-12-25 3160
## 702 0.000 2021-12-26 2778
## 703 0.000 2021-12-27 2757
## 704 0.000 2021-12-28 2897
## 705 0.000 2021-12-29 3683
## 706 0.000 2021-12-30 3997
## 707 0.000 2021-12-31 3573
## 708 0.000 2022-01-01 3386
ADF and KPSS test
adf21 = adf.test(mycovidroll21[,3])
adf21##
## Augmented Dickey-Fuller Test
##
## data: mycovidroll21[, 3]
## Dickey-Fuller = -1.5953, Lag order = 8, p-value = 0.7496
## alternative hypothesis: stationary
kpss21 = kpss.test(mycovidroll21[,3])## Warning in kpss.test(mycovidroll21[, 3]): p-value smaller than printed p-value
kpss21##
## KPSS Test for Level Stationarity
##
## data: mycovidroll21[, 3]
## KPSS Level = 5.3702, Truncation lag parameter = 6, p-value = 0.01
training21 = mycovidroll21[1:480,]
test21 = mycovidroll21[481:688,]Check differencing
ndiffs(training21[,3]) # one differencing## [1] 1
Find best ARIMA model
summary(auto.arima(training21[,3], trace = TRUE, ic = 'aicc', approximation = FALSE, stepwise = FALSE)) # Best model and order: ARIMA (2,1,2)##
## ARIMA(0,1,0) : 6874.031
## ARIMA(0,1,0) with drift : 6875.552
## ARIMA(0,1,1) : 6807.521
## ARIMA(0,1,1) with drift : 6807.511
## ARIMA(0,1,2) : 6795.197
## ARIMA(0,1,2) with drift : 6794.558
## ARIMA(0,1,3) : 6797.142
## ARIMA(0,1,3) with drift : 6796.535
## ARIMA(0,1,4) : 6798.078
## ARIMA(0,1,4) with drift : 6797.562
## ARIMA(0,1,5) : 6791.3
## ARIMA(0,1,5) with drift : 6791.109
## ARIMA(1,1,0) : 6838.23
## ARIMA(1,1,0) with drift : 6839.413
## ARIMA(1,1,1) : 6797.285
## ARIMA(1,1,1) with drift : 6796.665
## ARIMA(1,1,2) : 6797.167
## ARIMA(1,1,2) with drift : 6796.553
## ARIMA(1,1,3) : Inf
## ARIMA(1,1,3) with drift : Inf
## ARIMA(1,1,4) : 6778.573
## ARIMA(1,1,4) with drift : 6779.723
## ARIMA(2,1,0) : 6815.939
## ARIMA(2,1,0) with drift : 6816.723
## ARIMA(2,1,1) : 6796.415
## ARIMA(2,1,1) with drift : 6795.793
## ARIMA(2,1,2) : 6776.005
## ARIMA(2,1,2) with drift : 6777.13
## ARIMA(2,1,3) : 6777.842
## ARIMA(2,1,3) with drift : 6778.982
## ARIMA(3,1,0) : 6808.527
## ARIMA(3,1,0) with drift : 6808.95
## ARIMA(3,1,1) : 6797.085
## ARIMA(3,1,1) with drift : 6796.419
## ARIMA(3,1,2) : 6788.033
## ARIMA(3,1,2) with drift : 6787.925
## ARIMA(4,1,0) : 6796.194
## ARIMA(4,1,0) with drift : 6796.005
## ARIMA(4,1,1) : 6793.224
## ARIMA(4,1,1) with drift : 6792.409
## ARIMA(5,1,0) : 6791.779
## ARIMA(5,1,0) with drift : 6790.994
##
##
##
## Best model: ARIMA(2,1,2)
## Series: training21[, 3]
## ARIMA(2,1,2)
##
## Coefficients:
## ar1 ar2 ma1 ma2
## 1.3657 -0.4004 -1.8225 0.855
## s.e. 0.0662 0.0634 0.0410 0.039
##
## sigma^2 estimated as 80277: log likelihood=-3382.94
## AIC=6775.88 AICc=6776 BIC=6796.74
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 12.39903 281.8522 147.3184 NaN Inf 0.8651446 0.004717591
Train model
trainingmodel21 = Arima(training21[,3], order = c(2,1,2))
summary(trainingmodel)## Series: training[, 2]
## ARIMA(3,1,2)
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2
## 0.9861 -0.6205 -0.2513 -1.2967 0.8667
## s.e. 0.0565 0.0601 0.0502 0.0399 0.0339
##
## sigma^2 estimated as 101232: log likelihood=-3581.84
## AIC=7175.67 AICc=7175.84 BIC=7200.95
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 19.26179 316.2544 167.7446 NaN Inf 0.8891027 -0.03064234
coeftest(trainingmodel)##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.986093 0.056544 17.4392 < 2.2e-16 ***
## ar2 -0.620519 0.060052 -10.3331 < 2.2e-16 ***
## ar3 -0.251273 0.050227 -5.0028 5.652e-07 ***
## ma1 -1.296708 0.039865 -32.5272 < 2.2e-16 ***
## ma2 0.866657 0.033948 25.5291 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(trainingmodel$residuals)Make prediction
testpred21 = forecast(test21[,3], model = trainingmodel21, h = 30, levels = c(95))
summary(testpred21)##
## Forecast method: ARIMA(2,1,2)
##
## Model Information:
## Series: object
## ARIMA(2,1,2)
##
## Coefficients:
## ar1 ar2 ma1 ma2
## 1.3657 -0.4004 -1.8225 0.855
## s.e. 0.0000 0.0000 0.0000 0.000
##
## sigma^2 estimated as 80277: log likelihood=-1731.79
## AIC=3465.57 AICc=3465.59 BIC=3468.91
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -17.32896 1034.98 799.1886 -0.580743 8.243826 0.9630067 0.2607745
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 209 4014.154 3651.050 4377.258 3458.834 4569.474
## 210 4195.938 3782.723 4609.153 3563.981 4827.896
## 211 4234.317 3799.364 4669.270 3569.114 4899.520
## 212 4213.940 3763.192 4664.689 3524.580 4903.301
## 213 4170.745 3704.558 4636.932 3457.773 4883.717
## 214 4119.913 3636.751 4603.075 3380.981 4858.846
## 215 4067.790 3565.432 4570.148 3299.500 4836.080
## 216 4016.960 3493.021 4540.899 3215.664 4818.256
## 217 3968.414 3420.591 4516.236 3130.591 4806.236
## 218 3922.468 3348.660 4496.276 3044.904 4800.032
## 219 3879.161 3277.510 4480.811 2959.016 4799.305
## 220 3838.414 3207.319 4469.508 2873.238 4803.589
## 221 3800.107 3138.211 4462.004 2787.824 4812.390
## 222 3764.109 3070.278 4457.940 2702.986 4825.232
## 223 3730.286 3003.589 4456.983 2618.899 4841.673
## 224 3698.509 2938.194 4458.823 2535.708 4861.309
## 225 3668.654 2874.126 4463.182 2453.529 4883.780
## 226 3640.607 2811.406 4469.809 2372.453 4908.761
## 227 3614.258 2750.042 4478.475 2292.553 4935.963
## 228 3589.504 2690.032 4488.977 2213.880 4965.129
## 229 3566.249 2631.367 4501.131 2136.470 4996.028
## 230 3544.402 2574.032 4514.772 2060.349 5028.455
## 231 3523.878 2518.005 4529.750 1985.529 5062.227
## 232 3504.596 2463.262 4545.930 1912.013 5097.179
## 233 3486.482 2409.773 4563.190 1839.798 5133.165
## 234 3469.464 2357.509 4581.419 1768.876 5170.053
## 235 3453.477 2306.436 4600.518 1699.230 5207.724
## 236 3438.458 2256.522 4620.394 1630.843 5246.073
## 237 3424.348 2207.731 4640.965 1563.693 5285.002
## 238 3411.092 2160.030 4662.155 1497.757 5324.427
plot(testpred21)
grid(nx = NULL, ny = NULL,
lty = 2,
col = 'gray',
lwd = 2)accuracy(testpred21)## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -17.32896 1034.98 799.1886 -0.580743 8.243826 0.9630067 0.2607745
Linear Regression
Regression Preprocess
split_ratio <- 0.7
set.seed(168)
split_index <- createDataPartition(df_data$cases_new, p=split_ratio, list=FALSE)
data_train <- df_data[split_index,]
data_test <- df_data[-split_index,]Linear Regression Model Training (Gaussian Normal Distribution)
linear_model <- lm(cases_new~cases_active,data=data_train)
summary(linear_model)##
## Call:
## lm(formula = cases_new ~ cases_active, data = data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3742.5 -215.4 -127.7 221.8 4264.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.286e+02 5.175e+01 2.485 0.0133 *
## cases_active 8.244e-02 6.365e-04 129.506 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 948.8 on 494 degrees of freedom
## Multiple R-squared: 0.9714, Adjusted R-squared: 0.9713
## F-statistic: 1.677e+04 on 1 and 494 DF, p-value: < 2.2e-16
plot(linear_model)linear_prediction <- linear_model %>% predict(data_test)
linear_compare <- data.frame(actual=data_test$cases_new, predicted=linear_prediction)
head(linear_compare)## actual predicted
## 1 4 128.9024
## 5 3 129.1497
## 8 0 129.2322
## 10 0 129.2322
## 14 1 129.7268
## 16 1 129.7268
Poisson Regression Model Training
poisson_model <- glm(cases_new~cases_active, data=data_train, family=poisson(link="log"))
summary(poisson_model)##
## Call:
## glm(formula = cases_new ~ cases_active, family = poisson(link = "log"),
## data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -69.27 -50.04 -12.42 23.54 96.57
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.264e+00 1.223e-03 5938 <2e-16 ***
## cases_active 1.167e-05 7.085e-09 1647 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 3228235 on 495 degrees of freedom
## Residual deviance: 805402 on 494 degrees of freedom
## AIC: 809428
##
## Number of Fisher Scoring iterations: 5
poisson_prediction <- poisson_model %>% predict(data_test)
poisson_compare <- data.frame(actual=data_test$cases_new, predicted=poisson_prediction)
head(poisson_compare)## actual predicted
## 1 4 7.264387
## 5 3 7.264422
## 8 0 7.264434
## 10 0 7.264434
## 14 1 7.264504
## 16 1 7.264504
Performance Comparison
linear_performance <- data.frame(
MODEL = "Gaussian Linear",
R2 = R2(linear_prediction, data_test$cases_new),
RMSE = RMSE(linear_prediction, data_test$cases_new),
MAE = MAE(linear_prediction, data_test$cases_new)
)
linear_performance## MODEL R2 RMSE MAE
## 1 Gaussian Linear 0.9721543 906.3865 545.7654
poisson_performance <- data.frame(
MODEL = "Poisson GLM",
R2 = R2(poisson_prediction, data_test$cases_new),
RMSE = RMSE(poisson_prediction, data_test$cases_new),
MAE = MAE(poisson_prediction, data_test$cases_new)
)
poisson_performance## MODEL R2 RMSE MAE
## 1 Poisson GLM 0.9721543 6606.18 3816.115
Performance Charts
# Chart init
df_linear_predicted <- data.frame(date=data_test$date, cases_new=linear_prediction)
df_actual <- data_test
df_train <- data_train
lm_chart <- plot_ly()
# Predicted Data
lm_chart <- lm_chart %>%
add_trace(
x = df_linear_predicted[["date"]], y = df_linear_predicted[["cases_new"]],
name = "Predicted Data",
type = 'scatter',
mode = 'lines',
line = list(color = 'red', width = 3)
)
# Test Data
lm_chart <- lm_chart %>%
add_trace(
x = df_actual[["date"]], y = df_actual[["cases_new"]],
name = "Actual Data",
type = 'scatter',
mode = 'lines',
line = list(color = 'skyblue', width = 3)
)
lm_chart <- lm_chart %>%
add_trace(
x = df_train[["date"]], y = df_train[["cases_new"]],
name = "Train Data",
type = "scatter",
mode = "lines",
line = list(color = 'green', width = 2)
)
# Set figure title, x and y-axes titles
lm_chart <- lm_chart %>% layout(
title = "Linear Regression of Daily New Cases",
xaxis = list(title="Recorded Time"),
yaxis = list(title="Daily Count of New Cases")
)%>%
layout(plot_bgcolor='#e5ecf6',
xaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff')
)
lm_chart# Chart init
df_poisson_predicted <- data.frame(date=data_test$date, cases_new=poisson_prediction)
poisson_chart <- plot_ly()
# Predicted Data
poisson_chart <- poisson_chart %>%
add_trace(
x = df_poisson_predicted[["date"]], y = df_poisson_predicted[["cases_new"]],
name = "Predicted Data",
type = 'scatter',
mode = 'lines',
line = list(color = 'red', width = 3)
)
# Test Data
poisson_chart <- poisson_chart %>%
add_trace(
x = df_actual[["date"]], y = df_actual[["cases_new"]],
name = "Actual Data",
type = 'scatter',
mode = 'lines',
line = list(color = 'skyblue', width = 3)
)
poisson_chart <- poisson_chart %>%
add_trace(
x = df_train[["date"]], y = df_train[["cases_new"]],
name = "Train Data",
type = "scatter",
mode = "lines",
line = list(color = 'green', width = 1)
)
# Set figure title, x and y-axes titles
poisson_chart <- poisson_chart %>% layout(
title = "Poisson Regression of Daily New Cases",
xaxis = list(title="Recorded Time"),
yaxis = list(title="Daily Count of New Cases")
)%>%
layout(plot_bgcolor='#e5ecf6',
xaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff')
)
poisson_chartGame Time with ARIMA
Predict future new cases Using Predicted future active cases from ARIMA Model
init_year <- format(as.Date(df_data[1,1], format="%Y-%m-%d"),"%Y")
init_day <- yday(as.Date(df_data[1,1], format="%Y-%m-%d"))
data_arima <- ts(df_data$cases_active, start=c(init_year,init_day), frequency=365)
head(data_arima)## Time Series:
## Start = c(2020, 25)
## End = c(2020, 30)
## Frequency = 365
## [1] 4 4 4 4 7 8
ARIMA Training
arima_model <- auto.arima(df_data$cases_active, trace = TRUE, ic = 'aicc', approximation = FALSE, stepwise = FALSE)##
## ARIMA(0,1,0) : 12574.82
## ARIMA(0,1,0) with drift : 12576.06
## ARIMA(0,1,1) : 12201.06
## ARIMA(0,1,1) with drift : 12202.55
## ARIMA(0,1,2) : 12000.65
## ARIMA(0,1,2) with drift : 12002.3
## ARIMA(0,1,3) : 11949.82
## ARIMA(0,1,3) with drift : 11951.54
## ARIMA(0,1,4) : 11892.69
## ARIMA(0,1,4) with drift : 11894.47
## ARIMA(0,1,5) : 11833.57
## ARIMA(0,1,5) with drift : 11835.4
## ARIMA(1,1,0) : 11789.02
## ARIMA(1,1,0) with drift : 11790.96
## ARIMA(1,1,1) : 11637.72
## ARIMA(1,1,1) with drift : 11639.74
## ARIMA(1,1,2) : 11636.15
## ARIMA(1,1,2) with drift : 11638.18
## ARIMA(1,1,3) : 11637.46
## ARIMA(1,1,3) with drift : 11639.49
## ARIMA(1,1,4) : 11612.64
## ARIMA(1,1,4) with drift : 11614.67
## ARIMA(2,1,0) : 11710.42
## ARIMA(2,1,0) with drift : 11712.4
## ARIMA(2,1,1) : 11636.58
## ARIMA(2,1,1) with drift : 11638.61
## ARIMA(2,1,2) : 11638.07
## ARIMA(2,1,2) with drift : 11640.11
## ARIMA(2,1,3) : Inf
## ARIMA(2,1,3) with drift : 11640.24
## ARIMA(3,1,0) : 11684.41
## ARIMA(3,1,0) with drift : 11686.42
## ARIMA(3,1,1) : 11636.13
## ARIMA(3,1,1) with drift : 11638.16
## ARIMA(3,1,2) : Inf
## ARIMA(3,1,2) with drift : Inf
## ARIMA(4,1,0) : 11635.54
## ARIMA(4,1,0) with drift : 11637.56
## ARIMA(4,1,1) : 11623.02
## ARIMA(4,1,1) with drift : 11625.06
## ARIMA(5,1,0) : 11626.8
## ARIMA(5,1,0) with drift : 11628.84
##
##
##
## Best model: ARIMA(1,1,4)
arima_model## Series: df_data$cases_active
## ARIMA(1,1,4)
##
## Coefficients:
## ar1 ma1 ma2 ma3 ma4
## 0.9737 -0.6159 -0.1013 -0.0863 0.2176
## s.e. 0.0090 0.0383 0.0443 0.0453 0.0415
##
## sigma^2 estimated as 785818: log likelihood=-5800.26
## AIC=11612.52 AICc=11612.64 BIC=11639.89
ARIMA Predict
forecast_length <- 30
arima_predict <- forecast(arima_model, forecast_length)
head(arima_predict)## $method
## [1] "ARIMA(1,1,4)"
##
## $model
## Series: df_data$cases_active
## ARIMA(1,1,4)
##
## Coefficients:
## ar1 ma1 ma2 ma3 ma4
## 0.9737 -0.6159 -0.1013 -0.0863 0.2176
## s.e. 0.0090 0.0383 0.0443 0.0453 0.0415
##
## sigma^2 estimated as 785818: log likelihood=-5800.26
## AIC=11612.52 AICc=11612.64 BIC=11639.89
##
## $level
## [1] 80 95
##
## $mean
## Time Series:
## Start = 709
## End = 738
## Frequency = 1
## [1] 40614.16 40307.63 40048.57 39882.01 39719.83 39561.92 39408.15 39258.42
## [9] 39112.62 38970.65 38832.42 38697.81 38566.74 38439.11 38314.84 38193.83
## [17] 38076.00 37961.26 37849.54 37740.75 37634.82 37531.67 37431.23 37333.44
## [25] 37238.21 37145.48 37055.18 36967.26 36881.65 36798.29
##
## $lower
## Time Series:
## Start = 709
## End = 738
## Frequency = 1
## 80% 95%
## 709 39478.114 38876.7256
## 710 38391.848 37377.6936
## 711 37403.746 36003.6634
## 712 36566.876 34811.9487
## 713 35617.312 33445.5671
## 714 34581.553 31945.1080
## 715 33477.514 30338.0259
## 716 32317.611 28643.3705
## 717 31110.863 26874.9883
## 718 29864.124 25043.4190
## 719 28582.818 23157.0088
## 720 27271.371 21222.5802
## 721 25933.490 19245.8519
## 722 24572.339 17231.7126
## 723 23190.662 15184.4066
## 724 21790.866 13107.6638
## 725 20375.089 11004.7946
## 726 18945.235 8878.7600
## 727 17503.021 6732.2261
## 728 16049.997 4567.6059
## 729 14587.570 2387.0935
## 730 13117.025 192.6909
## 731 11639.533 -2013.7690
## 732 10156.171 -4230.6039
## 733 8667.926 -6456.2660
## 734 7175.708 -8689.3286
## 735 5680.356 -10928.4742
## 736 4182.644 -13172.4849
## 737 2683.288 -15420.2328
## 738 1182.949 -17670.6729
##
## $upper
## Time Series:
## Start = 709
## End = 738
## Frequency = 1
## 80% 95%
## 709 41750.21 42351.60
## 710 42223.42 43237.57
## 711 42693.38 44093.47
## 712 43197.15 44952.08
## 713 43822.36 45994.10
## 714 44542.28 47178.73
## 715 45338.78 48478.27
## 716 46199.22 49873.46
## 717 47114.38 51350.25
## 718 48077.18 52897.89
## 719 49082.01 54507.82
## 720 50124.25 56173.04
## 721 51199.99 57887.63
## 722 52305.88 59646.51
## 723 53439.01 61445.27
## 724 54596.79 63279.99
## 725 55776.90 65147.20
## 726 56977.28 67043.76
## 727 58196.05 68966.85
## 728 59431.50 70913.89
## 729 60682.07 72882.55
## 730 61946.32 74870.65
## 731 63222.94 76876.24
## 732 64510.70 78897.48
## 733 65808.48 80932.68
## 734 67115.25 82980.28
## 735 68430.01 85038.84
## 736 69751.88 87107.01
## 737 71080.02 89183.54
## 738 72413.63 91267.25
plot(arima_predict, main = "Predicted Active Cases", col.main = "black")Mixed Model Prediction with ARIMA
Combined Prediction Start!
last_date <- as.Date(df_data[(nrow(df_data)):nrow(df_data),1], format="%Y-%m-%d")
last_date <- last_date + 1
df_arima <- data.frame(
date=seq(last_date, by = "day", length.out = forecast_length),
cases_active=arima_predict$mean
)
# Gaussian
combined_linear_prediction <- linear_model %>% predict(df_arima)
df_combined_linear_predicted <- data.frame(date=df_arima$date, cases_new=combined_linear_prediction)
# Poisson
combined_poisson_prediction <- poisson_model %>% predict(df_arima)
df_combined_poisson_predicted <- data.frame(date=df_arima$date, cases_new=combined_poisson_prediction)Smoothen data for nicer presentation
df_data$month <- strftime(df_data$date, "%m")
df_data$year <- strftime(df_data$date, "%Y")
df_smooth <- df_data %>%
group_by(date=lubridate::floor_date(df_data$date, "month")) %>%
dplyr::summarize(cases_new = mean(cases_new)) %>%
data.frameShowtime!
combined_linear_chart <- plot_ly()
# Predicted Data
combined_linear_chart <- combined_linear_chart %>%
add_trace(
x = df_combined_linear_predicted[["date"]], y = df_combined_linear_predicted[["cases_new"]],
name = "Future Predicted Data",
type = 'scatter',
mode = 'lines',
line = list(color = 'red', width = 3)
)
# Test Data
combined_linear_chart <- combined_linear_chart %>%
add_trace(
x = df_smooth[["date"]], y = df_smooth[["cases_new"]],
name = "Actual Data (Rolled to Monthly)",
type = 'scatter',
mode = 'lines',
line = list(color = 'skyblue', width = 3)
)
# Set figure title, x and y-axes titles
combined_linear_chart <- combined_linear_chart %>% layout(
title = "Prediction of Daily New Cases (Gaussian)",
xaxis = list(title="Recorded Time"),
yaxis = list(title="Daily Count of New Cases")
)%>%
layout(plot_bgcolor='#e5ecf6',
xaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff')
)
combined_linear_chartcombined_poisson_chart <- plot_ly()
# Predicted Data
combined_poisson_chart <- combined_poisson_chart %>%
add_trace(
x = df_combined_poisson_predicted[["date"]], y = df_combined_poisson_predicted[["cases_new"]],
name = "Future Predicted Data",
type = 'scatter',
mode = 'lines',
line = list(color = 'red', width = 3)
)
# Test Data
combined_poisson_chart <- combined_poisson_chart %>%
add_trace(
x = df_smooth[["date"]], y = df_smooth[["cases_new"]],
name = "Actual Data (Rolled to Monthly)",
type = 'scatter',
mode = 'lines',
line = list(color = 'skyblue', width = 3)
)
# Set figure title, x and y-axes titles
combined_poisson_chart <- combined_poisson_chart %>% layout(
title = "Prediction of Daily New Cases (Poisson)",
xaxis = list(title="Recorded Time"),
yaxis = list(title="Daily Count of New Cases")
)%>%
layout(plot_bgcolor='#e5ecf6',
xaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff')
)
combined_poisson_chartIt seems like the Poisson model is really bad just by the sight of looking at the graph. Therefore do not relies too much on statistical benchmark such as R2 as the source of truth. The easiest way to determine on hindsight is to visualize, use them!
Final Thoughts
According to our models, COVID cases will continue to decline if large events such as the General Election, mass gatherings, and other unplanned occurrences do not occur.
The following table is the comparison of performance for all our models:
arima_performance <- data.frame(
MODEL = "ARIMA",
R2 = "NaN",
RMSE = accuracy(testpred)[, c("RMSE")],
MAE = accuracy(testpred)[, c("MAE")]
)
combined_performance <- rbind(linear_performance, poisson_performance, arima_performance)
kable(combined_performance)| MODEL | R2 | RMSE | MAE |
|---|---|---|---|
| Gaussian Linear | 0.972154274366168 | 906.3865 | 545.7654 |
| Poisson GLM | 0.972154274366168 | 6606.1804 | 3816.1151 |
| ARIMA | NaN | 916.5365 | 652.5599 |