This data is pulled from kaggle using the link below: https://www.kaggle.com/sudalairajkumar/novel-corona-virus-2019-dataset
The data covers the number of confirmed case, number of death, and number of recovered from 22/1/2020 to 14/4/2020
library(tidyverse)
library(dplyr)
library(lubridate)
library(plotly)
library(DT)
library(ggthemes)
library(ggplot2)
library(gganimate)
library(directlabels)
wd <- 'D:/Dataset/COVID/'
list.files(wd)
## [1] "COVID-19-geographic-disbtribution-worldwide-2020-04-13.xlsx"
## [2] "covid_19_data.csv"
## [3] "COVID19_line_list_data.csv"
## [4] "COVID19_open_line_list.csv"
## [5] "time_series_covid_19_confirmed.csv"
## [6] "time_series_covid_19_confirmed_US.csv"
## [7] "time_series_covid_19_deaths.csv"
## [8] "time_series_covid_19_deaths_US.csv"
## [9] "time_series_covid_19_recovered.csv"
covid_data <- read.csv(paste0(wd,'covid_19_data.csv'),header = T)
covid_country <- aggregate(cbind(Confirmed,Deaths,Recovered)~ObservationDate+Country.Region,data=covid_data,sum)
covid_country$ObservationDate <- as.Date(covid_country$ObservationDate,format='%m/%d/%Y')
covid_country <- covid_country %>%
dplyr::group_by(Country.Region) %>%
dplyr::filter(Confirmed>100) %>%
dplyr::mutate(DaySince100 = dplyr::row_number())
head(covid_country)
## # A tibble: 6 x 6
## # Groups: Country.Region [1]
## ObservationDate Country.Region Confirmed Deaths Recovered DaySince100
## <date> <fct> <dbl> <dbl> <dbl> <int>
## 1 2020-03-27 Afghanistan 110 4 2 1
## 2 2020-03-28 Afghanistan 110 4 2 2
## 3 2020-03-29 Afghanistan 120 4 2 3
## 4 2020-03-30 Afghanistan 170 4 2 4
## 5 2020-03-31 Afghanistan 174 4 5 5
## 6 2020-04-01 Afghanistan 237 4 5 6
To simplify the plot, we will focus on countries with more serious outbreak cases more than 20,000 instead.
length(unique(covid_country$Country.Region))
## [1] 125
covid_country_1 <- covid_country %>% group_by(Country.Region) %>%
filter(max(Confirmed)>20000)
length(unique(covid_country_1$Country.Region))
## [1] 15
ggplot(covid_country_1,aes(x=DaySince100,y=Confirmed,colour=Country.Region))+
geom_line(size = 1, alpha = 0.6)+
theme(legend.position = 'None')+
geom_dl(aes(label = Country.Region),method = list(dl.combine("last.points"), cex = 0.8))+
xlab('Number of Days Since > 100 cases')+
ylab('# of Confirmed Case')+
ggtitle('Number of confirmed cases since >100 cases by country')
Graph is still somewhat messy and couldn’t do much comparison in rate of change due to the difference in scale, where USA has the count in hundreds of thousands and the other are still in ten thousands. Let’s try doing this in logarithmic instead.
ggplot(covid_country_1,aes(x=DaySince100,y=log(Confirmed),colour=Country.Region))+
geom_line(size = 1, alpha = 0.6)+
theme(legend.position = 'None')+
geom_dl(aes(label = Country.Region),method = list(dl.combine("last.points"), cex = 0.8))+
xlab('Number of Days Since > 100 cases')+
ylab('Cummulative Confirmed Case - Log Scale')+
ggtitle('Number of confirmed cases since >100 cases by country')
All the countries show a similar pattern, with steep curve at the early stage, and slowly reducing the gradient and eventually stagnant. This looks better compared to the previous graph as graph are more comparable. However, we can still further improve the visualisation by making it interactive using plotly.
log_plot <- ggplot(covid_country_1,aes(x=DaySince100,y=log(Confirmed),colour=Country.Region,text =Confirmed))+
geom_line(size = 1, alpha = 0.6)+
theme(legend.position = 'None')+
geom_dl(aes(label = Country.Region),method = list(dl.combine("last.points"), cex = 0.8))+
xlab('Number of Days Since > 100 cases')+
ylab('Cummulative Confirmed Case - Log Scale')+
ggtitle('Number of confirmed cases since >100 cases by country')
ggplotly(log_plot, tooltip =c('Country.Region','DaySince100','text'))
Or making this into an animation.
p <- covid_country_1 %>%
ggplot(aes(x = DaySince100, y = log(Confirmed), color = Country.Region),text = Confirmed)+
geom_line(size = 1, alpha = 0.6)+
transition_reveal(DaySince100)+
theme(legend.position = "none", axis.title.x = element_blank())+
geom_segment(aes(xend = max(DaySince100)+1, yend = log(Confirmed)), linetype = 2, colour = 'grey') +
geom_text(aes(x = max(DaySince100)+1, label = Country.Region), hjust = 0)
p
Including all countries here seems packed. Let’s try again with a smaller subset, filtering to only shows country with total case more than 70,000.
p <- covid_country_1 %>% group_by(Country.Region) %>%
filter(max(Confirmed)>70000) %>%
ggplot(aes(x = DaySince100, y = log(Confirmed), color = Country.Region),text = Confirmed)+
geom_line(size = 1, alpha = 0.6)+
transition_reveal(DaySince100)+
theme(legend.position = "none", axis.title.x = element_blank())+
geom_segment(aes(xend = max(DaySince100)+1, yend = log(Confirmed)), linetype = 2, colour = 'grey') +
geom_text(aes(x = max(DaySince100)+1, label = Country.Region), hjust = 0)+
xlab('Number of Days Since > 100 cases')+
ylab('Cummulative Confirmed Case - Log Scale')+
ggtitle('Number of confirmed cases since >100 cases by country')
p