library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ggrepel)
library(patchwork)
library(viridis)
## Loading required package: viridisLite
## 
## Attaching package: 'viridis'
## The following object is masked from 'package:scales':
## 
##     viridis_pal

Load data

 # Download the case-level dataset (one row per province/country, date and type).
 # Province.State is empty for the first several thousand rows, so readr's type
 # guessing picks logical and every later province name ("Alabama", ...) becomes
 # a parsing failure and is read as NA. Declare it as character explicitly; the
 # remaining columns are still guessed.
 df <- read_csv(
   "https://raw.githubusercontent.com/RamiKrispin/coronavirus-csv/master/coronavirus_dataset.csv",
   col_types = cols(Province.State = col_character())
 )
## Parsed with column specification:
## cols(
##   Province.State = col_logical(),
##   Country.Region = col_character(),
##   Lat = col_double(),
##   Long = col_double(),
##   date = col_date(format = ""),
##   cases = col_double(),
##   type = col_character()
## )
## Warning: 22968 parsing failures.
##  row            col           expected  actual                                                                                           file
## 8469 Province.State 1/0/T/F/TRUE/FALSE Alabama 'https://raw.githubusercontent.com/RamiKrispin/coronavirus-csv/master/coronavirus_dataset.csv'
## 8470 Province.State 1/0/T/F/TRUE/FALSE Alabama 'https://raw.githubusercontent.com/RamiKrispin/coronavirus-csv/master/coronavirus_dataset.csv'
## 8471 Province.State 1/0/T/F/TRUE/FALSE Alabama 'https://raw.githubusercontent.com/RamiKrispin/coronavirus-csv/master/coronavirus_dataset.csv'
## 8472 Province.State 1/0/T/F/TRUE/FALSE Alabama 'https://raw.githubusercontent.com/RamiKrispin/coronavirus-csv/master/coronavirus_dataset.csv'
## 8473 Province.State 1/0/T/F/TRUE/FALSE Alabama 'https://raw.githubusercontent.com/RamiKrispin/coronavirus-csv/master/coronavirus_dataset.csv'
## .... .............. .................. ....... ..............................................................................................
## See problems(...) for more details.
# Add running totals to the raw records.
#   country_total: cumulative `cases` per country and type, in date order.
#   world_total:   cumulative `cases` per type across all countries, in date order.
# Both are row-by-row running sums, so a day with several rows (e.g. several
# provinces or countries) carries several intermediate values; downstream code
# picks one value per day from them.
# The second group_by() replaces the first grouping, and the final ungroup()
# returns a plain tibble so no grouping leaks into later pipelines.
processed <- df %>%
  arrange(date) %>%
  group_by(Country.Region, type) %>%
  mutate(country_total = cumsum(cases)) %>%
  group_by(type) %>%
  mutate(world_total = cumsum(cases)) %>%
  ungroup()

World totals

# World-wide cumulative counts over time, one facet per case type.
# Sorting by descending world_total and then keeping the first row for every
# (type, date) pair retains the highest running total recorded on each day.
plot_world <- processed %>%
  arrange(desc(world_total)) %>%
  distinct(type, date, .keep_all = TRUE) %>%
  ggplot(mapping = aes(x = date, y = world_total, colour = type)) +
  geom_point() +
  geom_line() +
  # each type gets its own y axis because the magnitudes differ
  facet_wrap(vars(type), scales = "free_y") +
  theme_minimal() +
  theme(legend.position = "none")

plot_world

Top countries

# Names of the nine countries with the highest cumulative confirmed count.
# NOTE(review): the name `list` masks base::list for value lookups; it is kept
# because later code refers to it.
list <- processed %>%
  ungroup() %>%
  filter(type == "confirmed") %>%
  # A country can have several rows on its latest date, each holding a partial
  # running total. Sort by date first and by total second so that distinct()
  # keeps the highest total of the latest day rather than the first row.
  arrange(desc(date), desc(country_total)) %>%
  distinct(Country.Region, .keep_all = TRUE) %>%
  # rank countries by that latest total and keep the top nine
  arrange(desc(country_total)) %>%
  head(9) %>%
  select(Country.Region) %>%
  as_vector()

Graph these countries

# Cumulative confirmed cases over time for the countries held in `list`,
# one facet (with its own y axis) per country.
plot_confirmed_top <- processed %>%
  # restrict to the rows of interest up front
  filter(type == "confirmed", Country.Region %in% list) %>%
  # keep the highest running total per country and day
  arrange(desc(country_total)) %>%
  distinct(type, date, Country.Region, .keep_all = TRUE) %>%
  ggplot(mapping = aes(x = date, y = country_total, colour = Country.Region)) +
  geom_point() +
  geom_line() +
  facet_wrap(vars(Country.Region), scales = "free_y") +
  theme_minimal() +
  theme(legend.position = "none")

plot_confirmed_top

Log graph of confirmed cases since breaking the 100 mark in selected countries

# Confirmed cases on a log scale, aligned on the day each country first
# reached 100 cumulative confirmed cases.
plot_confirmed_100_cases <- processed %>%
  ungroup() %>%
  # step 1: find, per country, the first date with at least 100 confirmed cases
  filter(type == "confirmed", country_total >= 100) %>%
  arrange(date) %>%
  distinct(Country.Region, type, .keep_all = TRUE) %>%
  # keep only the keys and the threshold date for the merge back
  select(Country.Region, type, threshold = date) %>%
  # step 2: attach the threshold date to every confirmed row of that country.
  # The join keys are spelled out instead of relying on the implicit natural
  # join; an inner join keeps only confirmed rows of countries that reached
  # the threshold, which is what the filters below would retain anyway.
  inner_join(ungroup(processed), by = c("Country.Region", "type")) %>%
  # days elapsed since the threshold, as a plain number so that comparisons
  # and the continuous x scale do not operate on a difftime object
  mutate(
    days_since_threshold = as.numeric(date - threshold, units = "days")
  ) %>%
  # keep days 1 to 30 after the threshold (China has far more days than that)
  filter(days_since_threshold > 0, days_since_threshold <= 30) %>%
  # countries shown in the graph
  filter(
    Country.Region %in% c(
      "Ireland", "Spain", "Italy", "US", "United Kingdom", "Iran",
      "France", "Germany", "Japan", "Korea, South", "China"
    )
  ) %>%
  # some countries have several rows per day; keep the highest total of each
  # day by sorting and taking the first row per country and date
  arrange(desc(country_total)) %>%
  distinct(Country.Region, date, .keep_all = TRUE) %>%
  # label only the last plotted point of every country
  group_by(Country.Region) %>%
  mutate(
    label = if_else(
      days_since_threshold == max(days_since_threshold),
      as.character(Country.Region),
      NA_character_
    )
  ) %>%
  ungroup() %>%
  ggplot(aes(days_since_threshold, country_total, colour = Country.Region)) +
  geom_point() +
  geom_line() +
  # country names next to the final point of each line
  geom_text_repel(
    aes(label = label),
    nudge_y = -0.5,
    nudge_x = 0.25,
    segment.color = "black",
    colour = "black",
    force = 5
  ) +
  theme_minimal() +
  scale_y_log10(labels = comma, limits = c(100, 100000)) +
  scale_x_continuous(breaks = seq(1, 30)) +
  annotation_logticks(sides = "l") +
  labs(
    x = "Days since 100th case",
    y = "Total confirmed cases by country",
    colour = "Country",
    title = "Confirmed COVID-19 cases by country",
    subtitle = "Log scale since 100th case",
    caption = "Data source: Johns Hopkins University Center for Systems Science and Engineering"
  ) +
  theme(legend.position = "bottom", panel.grid.minor.x = element_blank())
plot_confirmed_100_cases
## Warning: Removed 210 rows containing missing values (geom_text_repel).

# Deaths on a log scale, aligned on the day each country first reached
# 10 cumulative deaths.
plot_confirmed_10_deaths <- processed %>%
  ungroup() %>%
  # step 1: find, per country, the first date with at least 10 deaths
  filter(type == "death", country_total >= 10) %>%
  arrange(date) %>%
  distinct(Country.Region, type, .keep_all = TRUE) %>%
  select(Country.Region, type, threshold = date) %>%
  # step 2: attach the threshold date to every death row of that country,
  # with explicit join keys instead of the implicit natural join
  inner_join(ungroup(processed), by = c("Country.Region", "type")) %>%
  # numeric day count so comparisons and the x scale do not see a difftime
  mutate(
    days_since_threshold = as.numeric(date - threshold, units = "days")
  ) %>%
  # keep days 1 to 30 after the threshold
  filter(days_since_threshold > 0, days_since_threshold <= 30) %>%
  filter(
    Country.Region %in% c(
      "Ireland", "Spain", "Italy", "US", "United Kingdom", "Iran",
      "France", "Germany", "Japan", "Korea, South", "China"
    )
  ) %>%
  # several rows per day are possible; keep the highest total of each day
  arrange(desc(country_total)) %>%
  distinct(Country.Region, date, .keep_all = TRUE) %>%
  # label only the last plotted point of every country
  group_by(Country.Region) %>%
  mutate(
    label = if_else(
      days_since_threshold == max(days_since_threshold),
      as.character(Country.Region),
      NA_character_
    )
  ) %>%
  ungroup() %>%
  ggplot(aes(days_since_threshold, country_total, colour = Country.Region)) +
  geom_point() +
  geom_line() +
  geom_text_repel(
    aes(label = label),
    nudge_y = 0.25,
    nudge_x = 0.25,
    segment.color = "black",
    colour = "black",
    force = 5
  ) +
  theme_minimal() +
  scale_y_log10(labels = comma, limits = c(10, 10000)) +
  scale_x_continuous(breaks = seq(1, 30)) +
  annotation_logticks(sides = "l") +
  labs(
    x = "Days since 10th death",
    y = "Total confirmed deaths by country",
    colour = "Country",
    title = "Confirmed COVID-19 deaths by country",
    # the subtitle previously said "10th case", copied from the cases plot
    subtitle = "Log scale since 10th death",
    caption = "Data source: Johns Hopkins University Center for Systems Science and Engineering"
  ) +
  theme(legend.position = "bottom", panel.grid.minor.x = element_blank())
plot_confirmed_10_deaths
## Warning: Removed 148 rows containing missing values (geom_text_repel).

# Stack the cases plot above the deaths plot (patchwork's `/` operator).
plot <- plot_confirmed_100_cases / plot_confirmed_10_deaths

# Make sure the output directory exists: ggsave() can fail when it is missing.
dir.create("plots", showWarnings = FALSE, recursive = TRUE)
# Pass the combined plot explicitly instead of relying on last_plot().
ggsave(
  filename = "plot.png",
  plot = plot,
  path = "plots",
  width = 10,
  height = 10
)
## Warning: Removed 210 rows containing missing values (geom_text_repel).
## Warning: Removed 148 rows containing missing values (geom_text_repel).
plot
## Warning: Removed 210 rows containing missing values (geom_text_repel).

## Warning: Removed 148 rows containing missing values (geom_text_repel).

How about active cases?

# Active cases per country over time:
#   active = cumulative confirmed - cumulative deaths - cumulative recovered.

# The x axis runs ten days past the last observation so the country labels,
# which are nudged to the right of the final points, have room.
start <- min(processed$date)
end <- max(processed$date) + 10
range <- c(start, end)

plot_active_cases <- processed %>%
  ungroup() %>%
  filter(
    Country.Region %in% c(
      "Ireland", "Spain", "Italy", "US", "United Kingdom", "Iran",
      "France", "Germany", "Japan", "Korea, South", "China"
    )
  ) %>%
  # Collapse to ONE end-of-day total per country, type and date before
  # pivoting. Pivoting the row-by-row running totals and then keeping the
  # daily maximum of (confirmed - death - recovered) picks an intermediate
  # state, e.g. one where the day's confirmed cases are counted but its
  # deaths and recoveries are not yet, which can overstate active cases.
  group_by(Country.Region, type, date) %>%
  summarise(cases = sum(cases)) %>%
  group_by(Country.Region, type) %>%
  arrange(date, .by_group = TRUE) %>%
  mutate(country_total = cumsum(cases)) %>%
  ungroup() %>%
  select(Country.Region, date, type, country_total) %>%
  # one column per type: confirmed, death, recovered
  pivot_wider(names_from = type, values_from = country_total) %>%
  # a type without a row on some day keeps its previous cumulative value
  arrange(date) %>%
  group_by(Country.Region) %>%
  fill(confirmed, death, recovered) %>%
  ungroup() %>%
  # nothing recorded yet for a type means its cumulative total is still zero
  replace_na(list(confirmed = 0, death = 0, recovered = 0)) %>%
  mutate(active_cases = confirmed - death - recovered) %>%
  # label only the most recent point of every country
  group_by(Country.Region) %>%
  mutate(
    label = if_else(
      date == max(date),
      as.character(Country.Region),
      NA_character_
    )
  ) %>%
  ungroup() %>%
  ggplot(aes(date, active_cases, colour = Country.Region)) +
  geom_point() +
  geom_line() +
  geom_text_repel(
    aes(label = label),
    nudge_x = 8,
    segment.color = "black",
    colour = "black",
    force = 5
  ) +
  scale_x_date(date_breaks = "1 week", limits = range) +
  scale_y_continuous(breaks = seq(0, 100000, by = 10000)) +
  theme_minimal() +
  labs(
    x = "Date",
    y = "Number of active cases",
    title = "Number of active cases by country",
    caption = "Data source: Johns Hopkins University Center for Systems Science and Engineering",
    colour = "Country"
  ) +
  theme(legend.position = "bottom")

plot_active_cases
## Warning: Removed 627 rows containing missing values (geom_text_repel).

Mortality rate by country

# Mortality rate (total deaths / total confirmed cases) for selected
# countries, drawn as a horizontal bar chart ordered by rate.
plot_mortality_rate <- processed %>%
  ungroup() %>%
  filter(
    Country.Region %in% c(
      "Ireland", "Spain", "Italy", "US", "United Kingdom", "Iran",
      "France", "Germany", "Japan", "Korea, South", "China"
    ),
    type %in% c("confirmed", "death")
  ) %>%
  # Final totals per country and type, summed straight from the daily counts.
  # Taking the first row of the latest date from the pivoted running totals
  # would use a partial intra-day state for countries with several rows per
  # day.
  group_by(Country.Region, type) %>%
  summarise(total = sum(cases)) %>%
  ungroup() %>%
  # one column per type; a country without any death record gets 0 deaths
  pivot_wider(
    names_from = type,
    values_from = total,
    values_fill = list(total = 0)
  ) %>%
  mutate(
    death_rate = death / confirmed,
    # order the bars by mortality rate
    Country.Region = fct_reorder(Country.Region, death_rate)
  ) %>%
  ggplot(aes(Country.Region, death_rate, fill = death_rate)) +
  geom_col() +
  scale_y_continuous(labels = percent_format()) +
  theme_minimal() +
  labs(
    x = "Country",
    y = "Mortality rate",
    title = "Mortality rate by country",
    # the subtitle shows the date the plot was generated
    subtitle = format(Sys.Date()),
    caption = "Data source: Johns Hopkins University Center for Systems Science and Engineering"
  ) +
  coord_flip() +
  scale_fill_gradient(low = "lightpink1", high = "red2") +
  theme(legend.position = "none")

plot_mortality_rate