library(lubridate)
library(tidyverse)
library(RCurl)
The Johns Hopkins real-time data is being provided for online visualisation by a large number of sites.
These are very useful visual tools but it is not intuitively obvious how you can obtain the full longitudinal data from these visualisation applications in order to run your own analyses in R.
The data below is downloaded directly from the Johns Hopkins GitHub repository that is the source of the online maps. The raw csv is in a wide format that requires pivoting into a data frame and the conversion of the text column headers to dates. There are three sets of data: confirmed cases, deaths and recoveries.
# Download one Johns Hopkins CSSE time-series CSV and reshape it to long form.
#
# The raw file is wide: four identifying columns followed by one column per
# day, whose header is the date written as month/day/two-digit-year.
#
# series: the suffix of the file name, one of "Confirmed", "Deaths" or
#   "Recovered".
# Returns a tibble with columns Province, Country, Lat, Long, Date (class
#   Date) and NCases (the count taken from the wide date columns).
#
# NOTE(review): these file names are hard-coded; confirm that the repository
# still publishes them under these names before relying on the result.
read_jhu_series <- function(series) {
  base_url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"
  csv_text <- getURL(paste0(base_url, "time_series_19-covid-", series, ".csv"))
  wide <- read.csv(text = csv_text, check.names = FALSE)

  # A failed download (for example an error page instead of a CSV) parses to
  # a table with too few columns; 5:ncol(wide) would then count downwards and
  # fail with a confusing column-selection error, so stop early instead.
  if (ncol(wide) < 5) {
    stop(
      "Download of the '", series, "' series did not return the expected ",
      "table (found ", ncol(wide), " column(s), need at least 5).",
      call. = FALSE
    )
  }

  # Columns 5 onwards are the daily counts; stack them into Date/value pairs.
  long <- pivot_longer(wide, cols = 5:ncol(wide), names_to = "Date")
  names(long) <- c("Province", "Country", "Lat", "Long", "Date", "NCases")
  long$Date <- as.Date(long$Date, format = "%m/%d/%y")
  long
}

Confirmed <- read_jhu_series("Confirmed")
Deaths <- read_jhu_series("Deaths")
Recovered <- read_jhu_series("Recovered")
The raw Johns Hopkins data is provided at the level of provinces. It can be grouped and summarised at country level. A left join on country name and date provides a single table. Subtracting the previous day’s total from the cumulative total of cases provides the number of new cases which were reported each day.
# Collapse the province-level series to one row per country and day.
# Every input table carries its count in the NCases column; each sum is stored
# under its own name so the three tables can be joined side by side.
confirmed_country <- Confirmed %>%
  group_by(Country, Date) %>%
  summarise(NCases = sum(NCases)) %>%
  ungroup()
deaths_country <- Deaths %>%
  group_by(Country, Date) %>%
  summarise(NDeaths = sum(NCases)) %>%
  ungroup()
recovered_country <- Recovered %>%
  group_by(Country, Date) %>%
  summarise(NRecovered = sum(NCases)) %>%
  ungroup()

# Combine the three series into one table. The join keys are spelled out
# instead of being inferred from whichever column names happen to match.
by_country <- confirmed_country %>%
  left_join(deaths_country, by = c("Country", "Date")) %>%
  left_join(recovered_country, by = c("Country", "Date"))

# New cases per day = cumulative total minus the previous day's total, taken
# within each country. The grouping is stated explicitly here rather than
# relying on the grouping that summarise() leaves behind. A country's first
# day has no predecessor, so lag() falls back to that day's own total and
# New_cases is 0 there. The result is ungrouped so later steps start clean.
by_country <- by_country %>%
  arrange(Date) %>%
  group_by(Country) %>%
  mutate(New_cases = NCases - lag(NCases, default = first(NCases))) %>%
  ungroup()
The time series, now grouped by country, can be ordered, searched, filtered and downloaded directly from this page.
# Show the country-level table with aqm::dt().
# NOTE(review): aqm is not loaded with library() in this file; presumably dt()
# renders an interactive table -- confirm against the aqm package.
aqm::dt(by_country)
# Daily new cases for China: raw points plus a loess trend line without a
# confidence band.
china <- filter(by_country, Country == "China")
ggplot(china, aes(x = Date, y = New_cases)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE)
# Daily new cases for South Korea: raw points plus a loess trend line without
# a confidence band.
ggplot(
  filter(by_country, Country == "Korea, South"),
  aes(x = Date, y = New_cases)
) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE)
# Compare daily new cases for three countries from 15 February 2020 onwards,
# one colour per country, each with its own loess trend line.
lst <- c("United Kingdom", "US", "Korea, South")
by_country %>%
  filter(Country %in% lst, Date >= as.Date("2020-02-15")) %>%
  ggplot(aes(x = Date, y = New_cases, colour = Country)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE)
# Compare daily new cases for Italy, Spain and Iran from 15 February 2020
# onwards, one colour per country, each with its own loess trend line.
lst <- c("Italy", "Spain", "Iran")
by_country %>%
  filter(Country %in% lst, Date >= as.Date("2020-02-15")) %>%
  ggplot(aes(x = Date, y = New_cases, colour = Country)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE)
# Total daily new cases across every country except the four listed, from
# 15 February 2020 onwards, with a loess trend line.
lst <- c("Korea, South", "China", "Japan", "Iran")
by_country %>%
  filter(!(Country %in% lst)) %>%
  group_by(Date) %>%
  summarise(New_cases = sum(New_cases)) %>%
  filter(Date >= as.Date("2020-02-15")) %>%
  ggplot(aes(x = Date, y = New_cases)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE)
# Same rest-of-world total as the plot without the four listed countries, but
# with the daily count shown as log10(New_cases) on the y axis.
lst <- c("Korea, South", "China", "Japan", "Iran")
by_country %>%
  filter(!(Country %in% lst)) %>%
  group_by(Date) %>%
  summarise(New_cases = sum(New_cases)) %>%
  filter(Date >= as.Date("2020-02-15")) %>%
  ggplot(aes(x = Date, y = log10(New_cases))) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE)