library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(ggrepel)
library(patchwork)
library(viridis)
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
## The following object is masked from 'package:scales':
##
## viridis_pal
Load data
# Download the long-format case table (one row per location, date and type).
# The column types are spelled out because type guessing read Province.State
# as logical (its leading values are empty), which turned every real province
# name into a parsing failure and an NA.
df <- read_csv(
  "https://raw.githubusercontent.com/RamiKrispin/coronavirus-csv/master/coronavirus_dataset.csv",
  col_types = cols(
    Province.State = col_character(),
    Country.Region = col_character(),
    Lat = col_double(),
    Long = col_double(),
    date = col_date(format = ""),
    cases = col_double(),
    type = col_character()
  )
)
## Parsed with column specification:
## cols(
## Province.State = col_logical(),
## Country.Region = col_character(),
## Lat = col_double(),
## Long = col_double(),
## date = col_date(format = ""),
## cases = col_double(),
## type = col_character()
## )
## Warning: 22968 parsing failures.
## row col expected actual file
## 8469 Province.State 1/0/T/F/TRUE/FALSE Alabama 'https://raw.githubusercontent.com/RamiKrispin/coronavirus-csv/master/coronavirus_dataset.csv'
## 8470 Province.State 1/0/T/F/TRUE/FALSE Alabama 'https://raw.githubusercontent.com/RamiKrispin/coronavirus-csv/master/coronavirus_dataset.csv'
## 8471 Province.State 1/0/T/F/TRUE/FALSE Alabama 'https://raw.githubusercontent.com/RamiKrispin/coronavirus-csv/master/coronavirus_dataset.csv'
## 8472 Province.State 1/0/T/F/TRUE/FALSE Alabama 'https://raw.githubusercontent.com/RamiKrispin/coronavirus-csv/master/coronavirus_dataset.csv'
## 8473 Province.State 1/0/T/F/TRUE/FALSE Alabama 'https://raw.githubusercontent.com/RamiKrispin/coronavirus-csv/master/coronavirus_dataset.csv'
## .... .............. .................. ....... ..............................................................................................
## See problems(...) for more details.
# Add running totals to the daily case counts.
# Rows are sorted by date first so that cumsum() accumulates chronologically:
#   country_total - running sum of `cases` within each country and case type
#   world_total   - running sum of `cases` within each case type, over all rows
# Both are accumulated row by row, so a date with several rows carries several
# partial values; the last row of a day holds that day's full total.
processed <- df %>%
  arrange(date) %>%
  group_by(Country.Region, type) %>%
  mutate(country_total = cumsum(cases)) %>%
  # group_by() replaces the previous grouping, no ungroup() needed in between
  group_by(type) %>%
  mutate(world_total = cumsum(cases)) %>%
  # hand back a plain tibble so the grouping does not leak into later steps
  ungroup()
World totals
# Worldwide running total over time, one free-scaled panel per case type.
# world_total has several partial values per date, so the rows are sorted in
# descending order and only the first row per (type, date) is kept, i.e. the
# largest value recorded for that day.
plot_world <- processed %>%
  arrange(desc(world_total)) %>%
  distinct(type, date, .keep_all = TRUE) %>%
  ggplot(aes(date, world_total, colour = type)) +
  geom_point() +
  geom_line() +
  facet_wrap(~type, scales = "free_y") +
  theme_minimal() +
  # each panel is already titled with its type, so the legend is redundant
  theme(legend.position = "none")
plot_world

Top countries
# Names of the nine countries with the highest confirmed total on their most
# recent date, as a character vector.
# A country can have several rows on the same date, each holding a partial
# running total, so within the latest date the largest total is the one kept.
# NOTE: the name `list` masks base::list as a variable; it is kept because the
# plotting code further down filters on it.
list <- processed %>%
  ungroup() %>%
  filter(type == "confirmed") %>%
  # latest date first; within a date, the complete (largest) total first
  arrange(desc(date), desc(country_total)) %>%
  distinct(Country.Region, .keep_all = TRUE) %>%
  arrange(desc(country_total)) %>%
  head(9) %>%
  pull(Country.Region)
Graph these countries
# Confirmed running total over time for the countries named in `list`,
# one free-scaled panel per country.
plot_confirmed_top <- processed %>%
  # narrow down to the rows of interest before sorting
  filter(type == "confirmed", Country.Region %in% list) %>%
  # keep the largest running total recorded per country and day
  arrange(desc(country_total)) %>%
  distinct(type, date, Country.Region, .keep_all = TRUE) %>%
  ggplot(aes(date, country_total, colour = Country.Region)) +
  geom_point() +
  geom_line() +
  facet_wrap(~ Country.Region, scales = "free_y") +
  theme_minimal() +
  # each panel is already titled with its country, so the legend is redundant
  theme(legend.position = "none")
plot_confirmed_top

Log graph of confirmed cases since breaking the 100 mark in select countries
# Confirmed cases per country on a log scale, aligned on the day each country
# first reached 100 confirmed cases (days 1 to 30 after that day).
plot_confirmed_100_cases <- processed %>%
  ungroup() %>%
  # select only confirmed cases with a running total of at least 100
  filter(type == "confirmed", country_total >= 100) %>%
  # sort by date so the first row per country is the day 100 was first reached
  arrange(date) %>%
  # only keep this first instance
  distinct(Country.Region, type, .keep_all = TRUE) %>%
  # take the join keys and name the date `threshold` ahead of the merge
  select(Country.Region, type, threshold = date) %>%
  # join back to the original data so every row gains a threshold column;
  # the keys are explicit so a future shared column cannot change the join
  right_join(processed, by = c("Country.Region", "type")) %>%
  # filter for only confirmed cases
  filter(type == "confirmed") %>%
  # whole days since the threshold day; a plain integer (not a difftime)
  # so it maps cleanly onto the continuous x scale
  mutate(days_since_threshold = as.integer(date - threshold)) %>%
  # only keep days after the threshold day (also drops countries below 100,
  # whose threshold is NA)
  filter(days_since_threshold > 0) %>%
  # select countries for graph
  filter(
    Country.Region %in% c(
      "Ireland", "Spain", "Italy", "US", "United Kingdom", "Iran", "France",
      "Germany", "Japan", "Korea, South", "China"
    )
  ) %>%
  # China has too many x values, limit to the first 30 days
  filter(days_since_threshold <= 30) %>%
  # some countries have multiple entries per day: keep the largest per day by
  # sorting descending and taking the first instance
  arrange(desc(country_total)) %>%
  distinct(Country.Region, date, .keep_all = TRUE) %>%
  # for labelling, fill `label` only on the last plotted day of each country
  group_by(Country.Region) %>%
  mutate(
    label = if_else(
      days_since_threshold == max(days_since_threshold),
      as.character(Country.Region),
      NA_character_
    )
  ) %>%
  ungroup() %>%
  # plot
  ggplot(aes(days_since_threshold, country_total, colour = Country.Region)) +
  geom_point() +
  geom_line() +
  # add text labels; the NA labels are intentional, so drop them silently
  geom_text_repel(
    aes(label = label),
    nudge_y = -0.5,
    nudge_x = 0.25,
    segment.color = "black",
    colour = "black",
    force = 5,
    na.rm = TRUE
  ) +
  theme_minimal() +
  # the upper limit follows the data: a fixed ceiling would silently drop
  # every point above it once a country outgrows it
  scale_y_log10(labels = comma, limits = c(100, NA)) +
  scale_x_continuous(breaks = seq(1, 30)) +
  annotation_logticks(sides = "l") +
  labs(
    x = "Days since 100th case",
    y = "Total confirmed cases by country",
    colour = "Country",
    title = "Confirmed COVID-19 cases by country",
    subtitle = "Log scale since 100th case",
    caption = "Data source: Johns Hopkins University Center for Systems Science and Engineering"
  ) +
  theme(legend.position = "bottom", panel.grid.minor.x = element_blank())
## Joining, by = c("Country.Region", "type")
plot_confirmed_100_cases
## Warning: Removed 210 rows containing missing values (geom_text_repel).

# Deaths per country on a log scale, aligned on the day each country first
# reached 10 deaths (days 1 to 30 after that day).
plot_confirmed_10_deaths <- processed %>%
  ungroup() %>%
  # death rows with a running total of at least 10
  filter(type == "death", country_total >= 10) %>%
  # earliest such row per country is the day the threshold was reached
  arrange(date) %>%
  distinct(Country.Region, type, .keep_all = TRUE) %>%
  select(Country.Region, type, threshold = date) %>%
  # attach the threshold date to every row of the original data;
  # the keys are explicit so a future shared column cannot change the join
  right_join(processed, by = c("Country.Region", "type")) %>%
  filter(type == "death") %>%
  # whole days since the threshold day; a plain integer (not a difftime)
  # so it maps cleanly onto the continuous x scale
  mutate(days_since_threshold = as.integer(date - threshold)) %>%
  # days after the threshold day only (also drops countries whose threshold
  # is NA because they never reached 10), capped at 30 days
  filter(days_since_threshold > 0, days_since_threshold <= 30) %>%
  filter(
    Country.Region %in% c(
      "Ireland", "Spain", "Italy", "US", "United Kingdom", "Iran", "France",
      "Germany", "Japan", "Korea, South", "China"
    )
  ) %>%
  # several rows can exist per day: keep the largest running total
  arrange(desc(country_total)) %>%
  distinct(Country.Region, date, .keep_all = TRUE) %>%
  # label only the last plotted day of each country
  group_by(Country.Region) %>%
  mutate(
    label = if_else(
      days_since_threshold == max(days_since_threshold),
      as.character(Country.Region),
      NA_character_
    )
  ) %>%
  ungroup() %>%
  ggplot(aes(days_since_threshold, country_total, colour = Country.Region)) +
  geom_point() +
  geom_line() +
  # the NA labels are intentional, so drop them silently
  geom_text_repel(
    aes(label = label),
    nudge_y = 0.25,
    nudge_x = 0.25,
    segment.color = "black",
    colour = "black",
    force = 5,
    na.rm = TRUE
  ) +
  theme_minimal() +
  # the upper limit follows the data: a fixed ceiling would silently drop
  # every point above it once a country outgrows it
  scale_y_log10(labels = comma, limits = c(10, NA)) +
  scale_x_continuous(breaks = seq(1, 30)) +
  annotation_logticks(sides = "l") +
  labs(
    x = "Days since 10th death",
    y = "Total confirmed deaths by country",
    colour = "Country",
    title = "Confirmed COVID-19 deaths by country",
    # this plot is aligned on the 10th death, not the 10th case
    subtitle = "Log scale since 10th death",
    caption = "Data source: Johns Hopkins University Center for Systems Science and Engineering"
  ) +
  theme(legend.position = "bottom", panel.grid.minor.x = element_blank())
## Joining, by = c("Country.Region", "type")
plot_confirmed_10_deaths
## Warning: Removed 148 rows containing missing values (geom_text_repel).

# Stack the cases plot on top of the deaths plot (patchwork `/`) and save the
# combined figure as plots/plot.png.
plot <- plot_confirmed_100_cases / plot_confirmed_10_deaths
# Create the output directory up front so the save does not depend on it
# already existing; no warning is raised when it is already there.
dir.create("plots", showWarnings = FALSE, recursive = TRUE)
# Name the plot explicitly rather than relying on whatever last_plot() holds.
ggsave(
  path = "plots",
  filename = "plot.png",
  plot = plot,
  width = 10,
  height = 10
)
## Warning: Removed 210 rows containing missing values (geom_text_repel).
## Warning: Removed 148 rows containing missing values (geom_text_repel).
plot
## Warning: Removed 210 rows containing missing values (geom_text_repel).
## Warning: Removed 148 rows containing missing values (geom_text_repel).

How about active cases
# Active cases (confirmed - death - recovered) over time for selected
# countries.
# The x axis runs 10 days past the last date to leave room for the labels.
start <- min(processed$date)
end <- max(processed$date) + 10
range <- c(start, end)
plot_active_cases <- processed %>%
  ungroup() %>%
  filter(
    Country.Region %in% c(
      "Ireland", "Spain", "Italy", "US", "United Kingdom", "Iran", "France",
      "Germany", "Japan", "Korea, South", "China"
    )
  ) %>%
  # One end-of-day total per country, date and type: a day can hold several
  # partial running totals, the largest is kept.
  arrange(desc(country_total)) %>%
  distinct(Country.Region, date, type, .keep_all = TRUE) %>%
  # Drop every other column before widening. Left in, they act as id columns
  # and each wide row would describe a single type only, so the three totals
  # would never line up on the same row.
  select(Country.Region, date, type, country_total) %>%
  pivot_wider(names_from = type, values_from = country_total) %>%
  # carry the last known total forward over days without a row for a type
  arrange(date) %>%
  group_by(Country.Region) %>%
  fill(confirmed, death, recovered) %>%
  ungroup() %>%
  # what is still missing lies before the first row of that type, where the
  # running total is zero
  replace_na(list(confirmed = 0, death = 0, recovered = 0)) %>%
  mutate(active_cases = confirmed - death - recovered) %>%
  # label only the most recent day of each country
  group_by(Country.Region) %>%
  mutate(
    label = if_else(
      date == max(date),
      as.character(Country.Region),
      NA_character_
    )
  ) %>%
  ungroup() %>%
  ggplot(aes(date, active_cases, colour = Country.Region)) +
  geom_point() +
  geom_line() +
  # the NA labels are intentional, so drop them silently
  geom_text_repel(
    aes(label = label),
    nudge_x = 8,
    segment.color = "black",
    colour = "black",
    force = 5,
    na.rm = TRUE
  ) +
  scale_x_date(date_breaks = "1 week", limits = range) +
  # axis breaks are only defined up to 100,000
  scale_y_continuous(breaks = seq(0, 100000, by = 10000)) +
  theme_minimal() +
  labs(
    x = "Date",
    y = "Number of active cases",
    title = "Number of active cases by country",
    caption = "Data source: Johns Hopkins University Center for Systems Science and Engineering",
    colour = "Country"
  ) +
  theme(legend.position = "bottom")
plot_active_cases
## Warning: Removed 627 rows containing missing values (geom_text_repel).

Mortality rate by country
# Deaths as a share of confirmed cases for selected countries, drawn as a
# horizontal bar chart ordered by that share.
plot_mortality_rate <- processed %>%
  ungroup() %>%
  filter(
    Country.Region %in% c(
      "Ireland", "Spain", "Italy", "US", "United Kingdom", "Iran", "France",
      "Germany", "Japan", "Korea, South", "China"
    ),
    type %in% c("confirmed", "death")
  ) %>%
  # Latest total per country and type. A day can hold several partial running
  # totals, so within the latest date the largest one is kept.
  arrange(desc(date), desc(country_total)) %>%
  distinct(Country.Region, type, .keep_all = TRUE) %>%
  # keep only the key and the value so the widened table has one row per
  # country with both totals side by side
  select(Country.Region, type, country_total) %>%
  pivot_wider(names_from = type, values_from = country_total) %>%
  # a country without any death rows has zero deaths so far
  replace_na(list(death = 0)) %>%
  mutate(death_rate = death / confirmed) %>%
  # order the bars by rate rather than alphabetically
  mutate(Country.Region = fct_reorder(Country.Region, death_rate)) %>%
  ggplot(aes(Country.Region, death_rate, fill = death_rate)) +
  geom_col() +
  scale_y_continuous(labels = percent_format()) +
  theme_minimal() +
  labs(
    x = "Country",
    y = "Mortality rate",
    title = "Mortality rate by country",
    subtitle = Sys.Date(),
    caption = "Data source: Johns Hopkins University Center for Systems Science and Engineering"
  ) +
  coord_flip() +
  scale_fill_gradient(low = "lightpink1", high = "red2") +
  theme(legend.position = "none")
plot_mortality_rate
