This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
library(utils)
# load data from ECDC
data <- read.csv("https://opendata.ecdc.europa.eu/covid19/casedistribution/csv", na.strings = "", fileEncoding = "UTF-8-BOM")
This R notebook loads data about the ongoing COVID-19 pandemic directly from ECDC.
Time to load some libraries. Not all of them will be used.
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## Warning: package 'purrr' was built under R version 3.6.2
## ── Conflicts ───────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(stringr)
library(tibble)
library(broom)
library(ggplot2)
library(knitr)
library(devtools)
## Loading required package: usethis
library(parallel)
library(foreach)
## Warning: package 'foreach' was built under R version 3.6.2
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
library(tictoc)
(library(EpiModel))
## Loading required package: deSolve
## Loading required package: networkDynamic
## Loading required package: network
## network: Classes for Relational Data
## Version 1.16.0 created on 2019-11-30.
## copyright (c) 2005, Carter T. Butts, University of California-Irvine
## Mark S. Handcock, University of California -- Los Angeles
## David R. Hunter, Penn State University
## Martina Morris, University of Washington
## Skye Bender-deMoll, University of Washington
## For citation information, type citation("network").
## Type help("network-package") to get started.
##
## networkDynamic: version 0.10.1, created on 2020-01-16
## Copyright (c) 2020, Carter T. Butts, University of California -- Irvine
## Ayn Leslie-Cook, University of Washington
## Pavel N. Krivitsky, University of Wollongong
## Skye Bender-deMoll, University of Washington
## with contributions from
## Zack Almquist, University of California -- Irvine
## David R. Hunter, Penn State University
## Li Wang
## Kirk Li, University of Washington
## Steven M. Goodreau, University of Washington
## Jeffrey Horner
## Martina Morris, University of Washington
## Based on "statnet" project software (statnet.org).
## For license and citation information see statnet.org/attribution
## or type citation("networkDynamic").
## Loading required package: tergm
## Loading required package: ergm
##
## ergm: version 3.10.4, created on 2019-06-10
## Copyright (c) 2019, Mark S. Handcock, University of California -- Los Angeles
## David R. Hunter, Penn State University
## Carter T. Butts, University of California -- Irvine
## Steven M. Goodreau, University of Washington
## Pavel N. Krivitsky, University of Wollongong
## Martina Morris, University of Washington
## with contributions from
## Li Wang
## Kirk Li, University of Washington
## Skye Bender-deMoll, University of Washington
## Chad Klumb
## Based on "statnet" project software (statnet.org).
## For license and citation information see statnet.org/attribution
## or type citation("ergm").
## NOTE: Versions before 3.6.1 had a bug in the implementation of the bd()
## constriant which distorted the sampled distribution somewhat. In
## addition, Sampson's Monks datasets had mislabeled vertices. See the
## NEWS and the documentation for more details.
## NOTE: Some common term arguments pertaining to vertex attribute and
## level selection have changed in 3.10.0. See terms help for more
## details. Use 'options(ergm.term=list(version="3.9.4"))' to use old
## behavior.
##
## tergm: version 3.6.1, created on 2019-06-12
## Copyright (c) 2019, Pavel N. Krivitsky, University of Wollongong
## Mark S. Handcock, University of California -- Los Angeles
## with contributions from
## David R. Hunter, Penn State University
## Steven M. Goodreau, University of Washington
## Martina Morris, University of Washington
## Nicole Bohme Carnegie, New York University
## Carter T. Butts, University of California -- Irvine
## Ayn Leslie-Cook, University of Washington
## Skye Bender-deMoll
## Li Wang
## Kirk Li, University of Washington
## Based on "statnet" project software (statnet.org).
## For license and citation information see statnet.org/attribution
## or type citation("tergm").
## [1] "EpiModel" "tergm" "ergm" "networkDynamic"
## [5] "network" "deSolve" "tictoc" "foreach"
## [9] "parallel" "devtools" "usethis" "knitr"
## [13] "broom" "lubridate" "magrittr" "forcats"
## [17] "stringr" "dplyr" "purrr" "readr"
## [21] "tidyr" "tibble" "ggplot2" "tidyverse"
## [25] "stats" "graphics" "grDevices" "utils"
## [29] "datasets" "methods" "base"
library(incidence)
##
## Attaching package: 'incidence'
## The following object is masked from 'package:broom':
##
## bootstrap
library(earlyR)
library(R0)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(dplyr)
library(ggsci)
Let’s arrange the data a bit smarter.
# set the serial interval to a lognormal distribution with a mean of 4.8 and a spread of 2.3
mGT <- generation.time("lognormal", c(4.8, 2.3))
#convert the date to a readable format
data <- data %>% group_by(countriesAndTerritories) %>%
mutate(Ndate = make_date(year, month, day))
#calculate a cumulative number of deaths from the daily incidence
data <- data %>% group_by(countriesAndTerritories) %>%
arrange(Ndate) %>%
mutate(Pdeaths = cumsum(deaths))
#calculate a cumulatice number of cases from the daily incidence
data <- data %>% group_by(countriesAndTerritories) %>%
arrange(Ndate) %>%
mutate(Pcases = cumsum(cases))
#calculate the number of cases per million population
data <- data %>% group_by(countriesAndTerritories) %>%
arrange(Ndate) %>%
mutate(CasesPerMil = (Pcases/popData2018*1000000))
#calculate deaths per milion
data <- data %>%
group_by(countriesAndTerritories) %>%
arrange(Ndate) %>%
mutate(DeathPerMil = (Pdeaths/popData2018*1000000))
#calculate the CFR from cumulative deaths divided by cases
data <- data %>%
group_by(countriesAndTerritories) %>%
arrange(Ndate) %>%
mutate(CFR = (Pdeaths/Pcases))
Let’s select some countries to work on. Set the countries you want to work with - use the official ISO 3166 three letter abbrevations. You can find the ISO codes here https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3
{
country1 <- readline(prompt = "Enter country 1: "); print(country1)
country2 <- readline(prompt = "Enter country 2: "); print(country2)
country3 <- readline(prompt = "Enter country 3: "); print(country3)
country4 <- readline(prompt = "Enter country 4: "); print(country4)
country5 <- readline(prompt = "Enter country 5: "); print(country5)
}
## Enter country 1:
## [1] ""
## Enter country 2:
## [1] ""
## Enter country 3:
## [1] ""
## Enter country 4:
## [1] ""
## Enter country 5:
## [1] ""
Now we’ve got all the data calculated and arranged. Time to create some visualizations. Let’s start by making a graph showing how the number of daily deaths evolve.
ggplot(data = data %>% filter(countryterritoryCode == country1 | countryterritoryCode == country2 | countryterritoryCode == country3 | countryterritoryCode == country4 | countryterritoryCode == country5)) + geom_smooth(mapping = aes(x = Ndate, y = deaths, color = countryterritoryCode)) + scale_color_lancet() #remember that you can define the countries yourself
#you can try changing the colors by changing scale_color_lancet to other options.
Now let’s plot the cumulative deaths.
ggplot(data = data %>% filter(countryterritoryCode == country1 | countryterritoryCode == country2 | countryterritoryCode == country3 | countryterritoryCode == country4 | countryterritoryCode == country5)) + geom_point(mapping = aes(x = Ndate, y = Pdeaths, color = countryterritoryCode)) + scale_color_lancet() #remember that you can define the countries yourself
And the cumulative cases.
ggplot(data = data %>% filter(countryterritoryCode == country1 | countryterritoryCode == country2 | countryterritoryCode == country3 | countryterritoryCode == country4 | countryterritoryCode == country5)) + geom_point(mapping = aes(x = Ndate, y = Pcases, color = countryterritoryCode)) + scale_color_lancet()
And now a graph of death per million population.
ggplot(data = data %>% filter(countryterritoryCode == country1 | countryterritoryCode == country2 | countryterritoryCode == country3 | countryterritoryCode == country4 | countryterritoryCode == country5)) + geom_point(mapping = aes(x = Ndate, y = DeathPerMil, color = countryterritoryCode)) + scale_color_lancet()
And finally a graph showing the CFR per country.
ggplot(data = data %>% filter(countryterritoryCode == country1 | countryterritoryCode == country2 | countryterritoryCode == country3 | countryterritoryCode == country4 | countryterritoryCode == country5)) + geom_smooth(mapping = aes(x = Ndate, y = CFR, color = countryterritoryCode)) + scale_color_lancet()
There’s always an option to do the graph either as points, which will be the most correct, or as a smooth line. Here R will do some underlying mathematical corrections. For messy data a smooth line is preferred - for instance plotting the CFR.