Today is

# Print the current timestamp; the `##` line below appears to be the output
# captured when this report was last run.
Sys.time()
## [1] "2020-09-10 18:49:58 +07"

Loading COVID-19 dataset

library(dplyr, warn.conflicts=F, quietly=T)
library(ggplot2, warn.conflicts=F, quietly=T)
library(plotly, warn.conflicts=F, quietly=T)

# set.seed(1234)
filename <- "covid.csv"
# The existence guard is commented out, so the dataset is downloaded again on
# every run; restore the guard to reuse a previously downloaded file.
#if (!file.exists(filename)) {
  download.file("https://covid.ourworldindata.org/data/owid-covid-data.csv", destfile = filename, quiet = TRUE)
#}
# Read from `filename` so the download target and the input path cannot drift
# apart if the file name is ever changed.
covid <- read.csv(filename, header = TRUE)

Fix total_cases

# Convert the date column to Date values so rows can be compared
# chronologically.
covid[["date"]] <- as.Date(covid[["date"]])

# Fill the missing values of one numeric column of the global `covid` frame.
#
# A missing entry is replaced by the midpoint between
#   * the largest value recorded earlier for the same location (at least 0), and
#   * the smallest value recorded later for the same location.
# When the location has no later observation, the earlier value is carried
# forward. The previous fallback was the maximum over every location, which
# produced estimates far outside the location's own range.
#
# label: name of the column of `covid` to repair.
# Returns a numeric vector with one element per row of `covid`; entries that
# were present are returned unchanged.
fn.fix <- function(label) {
  values <- covid[[label]]
  fixed <- as.numeric(values)

  # Only the missing rows need any work.
  for (n in which(is.na(values))) {
    same_loc <- covid$location == covid$location[n]
    earlier <- values[same_loc & covid$date < covid$date[n]]
    later <- values[same_loc & covid$date > covid$date[n]]
    later <- later[!is.na(later)]

    lower <- max(0, earlier, na.rm = TRUE)
    upper <- if (length(later) > 0) min(later) else lower

    m <- mean(c(lower, upper))
    # Defensive default, kept from the original implementation.
    fixed[n] <- if (is.na(m)) 0 else m
  }

  fixed
}

# Repair the gaps in the cumulative case counts, then show the Thailand rows
# whose repaired count is positive.
covid$total_cases_fixed <- fn.fix("total_cases")
covid[covid$location == "Thailand" & covid$total_cases_fixed > 0, ] %>%
  select(location, date, total_cases, total_cases_fixed)

Fix total_cases_per_million and total_deaths_per_million

# Apply the same gap repair to both per-million columns.
covid$total_cases_per_million_fixed <- fn.fix("total_cases_per_million")
covid$total_deaths_per_million_fixed <- fn.fix("total_deaths_per_million")

# Show the Thailand rows whose repaired deaths-per-million value is positive.
covid[covid$location == "Thailand" & covid$total_deaths_per_million_fixed > 0, ] %>%
  select(location, date, total_deaths_per_million, total_deaths_per_million_fixed)

Max total cases, as of now

# One row per distinct location; summary columns are attached to it later.
location_lookup <- distinct(covid, location)

# For every location in `location_lookup`, return the largest value of column
# `label` among the rows of `covid` that report new cases (new_cases > 0).
# Locations without any such value get 0.
#
# label: name of the column of `covid` to summarise.
fn.create_lookup <- function(label) {
  peak_for <- function(loc) {
    reporting <- covid$location == loc & covid$new_cases > 0
    observed <- covid[reporting, ][[label]]
    max(0, observed, na.rm = TRUE)
  }

  sapply(location_lookup$location, peak_for)
}

# Attach the per-location peaks of the three tracked measures.
location_lookup$max_total_cases <- fn.create_lookup("total_cases")
location_lookup$max_total_cases_per_mil <- fn.create_lookup("total_cases_per_million")
location_lookup$max_total_deaths_per_mil <- fn.create_lookup("total_deaths_per_million")

# Show the resulting summary row for Thailand.
subset(location_lookup, location == "Thailand")

Days since each location passed 100 total cases

# For each location, the earliest date on which the repaired total case count
# was at least 100, or NA when no row of that location qualifies.
# sapply() drops the Date class while simplifying, so dates come back as
# plain day numbers.
date_over_100_lookup <- sapply(location_lookup$location, function (loc) {
  reached <- covid$location == loc & covid$total_cases_fixed >= 100
  dates_reached <- covid[reached, ]$date

  if (length(dates_reached) == 0) {
    return(NA)
  }

  min(dates_reached)
})

# Number of days between each row's date and the day its location first
# reached 100 (repaired) total cases. Rows dated before that day, and rows of
# locations that never reached 100 cases, get NA.
#
# The lookup is re-keyed by location name and matched in one vectorised step.
# This avoids a per-row sapply() and keeps the match by name even if
# `location` is stored as a factor. local() keeps the helper variables out of
# the global environment.
covid$day_over_100 <- local({
  first_day <- setNames(
    as.numeric(date_over_100_lookup),
    as.character(location_lookup$location)
  )
  elapsed <- as.numeric(covid$date) - first_day[as.character(covid$location)]

  # Dates before the 100-case day are not part of the series.
  elapsed[!is.na(elapsed) & elapsed < 0] <- NA
  unname(elapsed)
})

# First row with a defined day_over_100 for each location, ordered by date.
covid[!is.na(covid$day_over_100), ] %>%
  select(location, date, day_over_100) %>%
  distinct(location, .keep_all = TRUE) %>%
  arrange(date)

Top N’s

# Descending rank of column `label` of `location_lookup`: the largest value
# gets 1. Ties receive rank()'s default (averaged) rank.
#
# label: name of the column of `location_lookup` to rank.
fn.create_nth_lookup <- function(label) {
  scores <- location_lookup[[label]]
  ascending <- rank(scores)

  # Flip the ascending rank so the highest score comes first.
  length(scores) - ascending + 1
}

# Rank every location by its peak per-million cases and deaths (1 = highest).
location_lookup$nth_cases <- fn.create_nth_lookup("max_total_cases_per_mil")
location_lookup$nth_deaths <- fn.create_nth_lookup("max_total_deaths_per_mil")

# List the locations from highest to lowest for each measure.
select(
  arrange(location_lookup, desc(max_total_cases_per_mil)),
  location, max_total_cases_per_mil
)
select(
  arrange(location_lookup, desc(max_total_deaths_per_mil)),
  location, max_total_deaths_per_mil
)

Ranks

# Thailand's ranks and peak per-million figures.
location_lookup %>%
  subset(location == "Thailand") %>%
  select(location, nth_cases, nth_deaths, max_total_cases_per_mil, max_total_deaths_per_mil)

# Every location, ordered by the sum of its two ranks.
select(
  arrange(location_lookup, nth_cases + nth_deaths),
  location, nth_cases, nth_deaths, max_total_cases_per_mil, max_total_deaths_per_mil
)

# Every location except the "World" aggregate, ordered by peak total cases.
select(
  arrange(filter(location_lookup, location != "World"), desc(max_total_cases)),
  location, nth_cases, nth_deaths, max_total_cases_per_mil, max_total_deaths_per_mil
)

Top 10 locations, plus Thailand, since the day they passed 100 total cases

# Interactive line chart of new cases per million against days since the
# 100-case day, for the ten locations with the highest peak cases per million
# plus Thailand. Only rows with a defined day_over_100 and new_cases > 0 are
# drawn; the y axis is logarithmic.
#
# head() is used instead of `[1:10, ]` so that fewer than ten locations do not
# produce NA rows. The `+` operators trail each line so the chain does not
# depend on the enclosing parentheses to parse.
ggplotly(
  ggplot(
    covid %>% subset(
      !is.na(day_over_100) &
        new_cases > 0 &
        (location %in% head(
          arrange(location_lookup, desc(max_total_cases_per_mil))$location,
          10
        ) | location == "Thailand")
    ),
    aes(day_over_100, new_cases_per_million, color = location)
  ) +
    geom_line() +
    scale_y_log10() +
    labs(
      title = "New cases per million since day over 100 (highest total cases per million)",
      x = "Day over 100", y = "Number of new cases per million"
    )
)
# Interactive line chart of new deaths per million against days since the
# 100-case day, for the ten locations with the highest peak deaths per million
# plus Thailand. Only rows with a defined day_over_100 and new_deaths > 0 are
# drawn; the y axis is logarithmic.
#
# head() is used instead of `[1:10, ]` so that fewer than ten locations do not
# produce NA rows. The `+` operators trail each line so the chain does not
# depend on the enclosing parentheses to parse.
ggplotly(
  ggplot(
    covid %>% subset(
      !is.na(day_over_100) &
        new_deaths > 0 &
        (location %in% head(
          arrange(location_lookup, desc(max_total_deaths_per_mil))$location,
          10
        ) | location == "Thailand")
    ),
    aes(day_over_100, new_deaths_per_million, color = location)
  ) +
    geom_line() +
    scale_y_log10() +
    labs(
      title = "New deaths per million since day over 100 (highest total deaths per million)",
      x = "Day over 100", y = "Number of new deaths per million"
    )
)