library(lubridate)
library(tidyverse)
library(RCurl)

Data source

The Johns Hopkins real-time data is being provided for visualisation online by a large number of sites.

https://coronavirus.jhu.edu/map.html?fbclid=IwAR346sosOhM6MvfNRK7Am3Ne7V4l04HUSb_Pavrx_TOsaSzj7HGYRWhpI74

These are very useful visual tools but it is not intuitively obvious how you can obtain the full longitudinal data from these visualisation applications in order to run your own analyses in R.

The data below is downloaded directly from the Johns Hopkins GitHub repository that is the source of the online maps. The raw csv is in a wide format that requires pivoting into a data frame and the conversion of the text column headers to dates. There are three sets of data: confirmed cases, deaths and recoveries.

Downloading and reformatting

# Download the cumulative confirmed-case time series from the Johns Hopkins
# CSSE repository. getURL() returns the file contents as a single string, so
# `URL` holds the raw CSV text rather than the address itself.
URL <- getURL("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv")
# check.names = FALSE keeps the date column headers exactly as written in the
# file instead of converting them to syntactic R names.
data <- read.csv(text = URL, check.names = FALSE)

# The first four columns describe the location; every column from the fifth
# onwards is one date. Stack those date columns into one row per location and
# date, with the counts going straight into `NCases`.
d <- pivot_longer(
  data,
  cols = 5:ncol(data),
  names_to = "Date",
  values_to = "NCases"
)
names(d)[1:4] <- c("Province", "Country", "Lat", "Long")
# The former column headers are text; parse them as month/day/two-digit year.
d$Date <- as.Date(d$Date, format = "%m/%d/%y")

Confirmed <- d

# Download the cumulative deaths time series from the Johns Hopkins CSSE
# repository. getURL() returns the file contents as a single string, so `URL`
# holds the raw CSV text rather than the address itself.
URL <- getURL("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv")
# check.names = FALSE keeps the date column headers exactly as written in the
# file instead of converting them to syntactic R names.
data <- read.csv(text = URL, check.names = FALSE)

# The first four columns describe the location; every column from the fifth
# onwards is one date. Stack those date columns into one row per location and
# date. The count column is called `NCases` here as well; it is given its
# own name only when the data is summarised by country.
d <- pivot_longer(
  data,
  cols = 5:ncol(data),
  names_to = "Date",
  values_to = "NCases"
)
names(d)[1:4] <- c("Province", "Country", "Lat", "Long")

# The former column headers are text; parse them as month/day/two-digit year.
d$Date <- as.Date(d$Date, format = "%m/%d/%y")

Deaths <- d

# Download the cumulative recoveries time series from the Johns Hopkins CSSE
# repository. getURL() returns the file contents as a single string, so `URL`
# holds the raw CSV text rather than the address itself.
URL <- getURL("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv")
# check.names = FALSE keeps the date column headers exactly as written in the
# file instead of converting them to syntactic R names.
data <- read.csv(text = URL, check.names = FALSE)

# The first four columns describe the location; every column from the fifth
# onwards is one date. Stack those date columns into one row per location and
# date. The count column is called `NCases` here as well; it is given its
# own name only when the data is summarised by country.
d <- pivot_longer(
  data,
  cols = 5:ncol(data),
  names_to = "Date",
  values_to = "NCases"
)
names(d)[1:4] <- c("Province", "Country", "Lat", "Long")

# The former column headers are text; parse them as month/day/two-digit year.
d$Date <- as.Date(d$Date, format = "%m/%d/%y")
Recovered <- d

Summarising to country, joining tables and calculating new cases

The raw Johns Hopkins data is provided at the level of provinces. It can be grouped and summarised at country level. A left join on country name and date provides a single table. Subtracting the previous day’s total from the cumulative total of cases provides the number of new cases which were reported each day.

# Collapse the province-level rows to one row per country and date by summing
# the cumulative counts. Each summary gets its own value column name so the
# three tables can be joined side by side.
confirmed_country <- Confirmed %>%
  group_by(Country, Date) %>%
  summarise(NCases = sum(NCases))
deaths_country <- Deaths %>%
  group_by(Country, Date) %>%
  summarise(NDeaths = sum(NCases))
recovered_country <- Recovered %>%
  group_by(Country, Date) %>%
  summarise(NRecovered = sum(NCases))

# Join deaths and recoveries onto the confirmed cases. The keys are spelled
# out so the join does not depend on which column names happen to be shared.
by_country <- confirmed_country %>%
  left_join(deaths_country, by = c("Country", "Date")) %>%
  left_join(recovered_country, by = c("Country", "Date"))

# New cases per day = cumulative total minus the previous day's total within
# the same country. Rows are put in date order first; the grouping by Country
# is stated explicitly so lag() never reaches across countries. The default of
# first(NCases) makes the first day of each country come out as 0. The result
# stays grouped by Country.
by_country <- by_country %>%
  arrange(Date) %>%
  group_by(Country) %>%
  mutate(New_cases = NCases - lag(NCases, default = first(NCases)))

Combined data

The time series, now grouped by country, can be ordered, searched, filtered and downloaded directly from this page.

# Show the combined country-level table.
# NOTE(review): aqm::dt() is not defined in this file; presumably it renders an
# interactive table that can be sorted, searched and downloaded -- confirm
# against the aqm package.
aqm::dt(by_country)

Confirmed daily cases in China

# Keep only the rows for China.
china <- filter(by_country, Country == "China")

# Daily new cases as points, with a loess trend line and no confidence band.
ggplot(data = china, mapping = aes(x = Date, y = New_cases)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE)

Daily cases in South Korea

# Daily new cases in South Korea as points, with a loess trend line and no
# confidence band.
ggplot(
  data = filter(by_country, Country == "Korea, South"),
  mapping = aes(x = Date, y = New_cases)
) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE)

Comparisons with South Korea

# Countries to compare, from 15 February 2020 onwards.
lst <- c("United Kingdom", "US", "Korea, South")

# One colour per country: daily new cases as points, with a loess trend line
# per country and no confidence band.
by_country %>%
  filter(Country %in% lst, Date >= as.Date("2020-02-15")) %>%
  ggplot(mapping = aes(x = Date, y = New_cases, colour = Country)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE)

Italy, Spain and Iran

# Countries to compare, from 15 February 2020 onwards.
lst <- c("Italy", "Spain", "Iran")

# One colour per country: daily new cases as points, with a loess trend line
# per country and no confidence band.
by_country %>%
  filter(Country %in% lst, Date >= as.Date("2020-02-15")) %>%
  ggplot(mapping = aes(x = Date, y = New_cases, colour = Country)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE)

Total outside Asia

# Countries left out of the total.
lst <- c("Korea, South", "China", "Japan", "Iran")

# Sum the daily new cases over all remaining countries, one total per date,
# then plot the totals from 15 February 2020 onwards with a loess trend line.
by_country %>%
  filter(!(Country %in% lst)) %>%
  group_by(Date) %>%
  summarise(New_cases = sum(New_cases)) %>%
  filter(Date >= as.Date("2020-02-15")) %>%
  ggplot(mapping = aes(x = Date, y = New_cases)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE)

# Countries left out of the total.
lst <- c("Korea, South", "China", "Japan", "Iran")

# Same daily totals as the untransformed plot, shown as log10(New_cases).
# log10() is not finite for totals of zero or below, so such days cannot be
# drawn on this scale.
by_country %>%
  filter(!(Country %in% lst)) %>%
  group_by(Date) %>%
  summarise(New_cases = sum(New_cases)) %>%
  filter(Date >= as.Date("2020-02-15")) %>%
  ggplot(mapping = aes(x = Date, y = log10(New_cases))) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE)