1 CSSEGISandData

ref: https://ramikrispin.github.io/coronavirus/

# simplify code: https://uc-r.github.io/pipe
`%>%` <- magrittr::`%>%`

1.1 Confirmed Cases

1.1.1 raw file

raw_conf <- 
  read.csv(
    file = 
"https://raw.githubusercontent.com/
CSSEGISandData/
COVID-19/
master/
csse_covid_19_data/
csse_covid_19_time_series/
time_series_covid19_confirmed_global.csv",
    stringsAsFactors = FALSE)

# look at the first 3 rows and first 7 columns
raw_conf[1:3,1:7]

##   Province.State Country.Region     Lat    Long X1.22.20 X1.23.20 X1.24.20
## 1                   Afghanistan 33.0000 65.0000        0        0        0
## 2                       Albania 41.1533 20.1683        0        0        0
## 3                       Algeria 28.0339  1.6596        0        0        0

1.1.1.1 from CUM to DAILY

df_conf <- raw_conf[, 1:4]
           # first 4 cols are location details

# remaining columns relate to dates
for(i in 5:ncol(raw_conf)){

  raw_conf[,i] <- as.integer(raw_conf[,i])
  
  # separate the cases into DAILY
  if(i == 5){
    # first day is the same in both forms of the data
    df_conf[[names(raw_conf)[i]]] <- raw_conf[, i]
  } else {
    # diff of the cumulative total provided by CSSEGISandData
    df_conf[[names(raw_conf)[i]]] <- raw_conf[, i] - raw_conf[, i - 1]
  }

}

# look at the first 3 rows and first 7 columns
df_conf[1:3,1:7]

##   Province.State Country.Region     Lat    Long X1.22.20 X1.23.20 X1.24.20
## 1                   Afghanistan 33.0000 65.0000        0        0        0
## 2                       Albania 41.1533 20.1683        0        0        0
## 3                       Algeria 28.0339  1.6596        0        0        0

1.1.1.2 from WIDE to LONG,

df_conf1 <-  df_conf %>% tidyr::pivot_longer(cols = dplyr::starts_with("X"),
                                             names_to = "date_temp",
                                             values_to = "cases_temp")

# look at the first 3 rows and ALL columns
df_conf1[1:3,]

## # A tibble: 3 x 6
##   Province.State Country.Region   Lat  Long date_temp cases_temp
##   <chr>          <chr>          <dbl> <dbl> <chr>          <int>
## 1 ""             Afghanistan       33    65 X1.22.20           0
## 2 ""             Afghanistan       33    65 X1.23.20           0
## 3 ""             Afghanistan       33    65 X1.24.20           0

1.1.1.3 Parsing the date

df_conf1$month <- sub("X", "",
                      strsplit(df_conf1$date_temp, split = "\\.") %>%
                        purrr::map_chr(~.x[1]) )

df_conf1$day <- strsplit(df_conf1$date_temp, split = "\\.") %>%
  purrr::map_chr(~.x[2])


df_conf1$date <- as.Date(paste("2020", df_conf1$month, df_conf1$day, sep = "-"))

# look at the first 3 rows and ALL columns
df_conf1[1:3,]

## # A tibble: 3 x 9
##   Province.State Country.Region   Lat  Long date_temp cases_temp month day  
##   <chr>          <chr>          <dbl> <dbl> <chr>          <int> <chr> <chr>
## 1 ""             Afghanistan       33    65 X1.22.20           0 1     22   
## 2 ""             Afghanistan       33    65 X1.23.20           0 1     23   
## 3 ""             Afghanistan       33    65 X1.24.20           0 1     24   
## # ... with 1 more variable: date <date>

1.1.1.4 Aggregate the data

df_conf2 <- df_conf1 %>%
  dplyr::group_by(Province.State, Country.Region, Lat, Long, date) %>%
  dplyr::summarise(cases = sum(cases_temp)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(type = "confirmed",
                # Remove leading/trailing whitespace from character strings
                Country.Region = trimws(Country.Region),
                Province.State = trimws(Province.State))

# look at the first 3 rows and ALL columns
df_conf2[1:3,]

## # A tibble: 3 x 7
##   Province.State Country.Region   Lat  Long date       cases type     
##   <chr>          <chr>          <dbl> <dbl> <date>     <int> <chr>    
## 1 ""             Afghanistan       33    65 2020-01-22     0 confirmed
## 2 ""             Afghanistan       33    65 2020-01-23     0 confirmed
## 3 ""             Afghanistan       33    65 2020-01-24     0 confirmed

1.2 Death Cases

1.2.1 raw file

raw_death <- 
  read.csv(
    file = 
"https://raw.githubusercontent.com/
CSSEGISandData/
COVID-19/
master/
csse_covid_19_data/
csse_covid_19_time_series/
time_series_covid19_deaths_global.csv",
    stringsAsFactors = FALSE)

# look at the first 3 rows and first 7 columns
raw_death[1:3,1:7]

##   Province.State Country.Region     Lat    Long X1.22.20 X1.23.20 X1.24.20
## 1                   Afghanistan 33.0000 65.0000        0        0        0
## 2                       Albania 41.1533 20.1683        0        0        0
## 3                       Algeria 28.0339  1.6596        0        0        0

1.2.1.1 from CUM to DAILY

df_death <- raw_death[, 1:4]
           # first 4 cols are location details

# remaining columns relate to dates
for(i in 5:ncol(raw_death)){

  raw_death[,i] <- as.integer(raw_death[,i])
  
  # separate the cases into DAILY
  if(i == 5){
    # first day is the same in both forms of the data
    df_death[[names(raw_death)[i]]] <- raw_death[, i]
  } else {
    # diff of the cumulative total provided by CSSEGISandData
    df_death[[names(raw_death)[i]]] <- raw_death[, i] - raw_death[, i - 1]
  }

}

# look at the first 3 rows and first 7 columns
df_death[1:3,1:7]

##   Province.State Country.Region     Lat    Long X1.22.20 X1.23.20 X1.24.20
## 1                   Afghanistan 33.0000 65.0000        0        0        0
## 2                       Albania 41.1533 20.1683        0        0        0
## 3                       Algeria 28.0339  1.6596        0        0        0

1.2.1.2 from WIDE to LONG,

df_death1 <-  df_death %>% tidyr::pivot_longer(cols = dplyr::starts_with("X"),
                                             names_to = "date_temp",
                                             values_to = "cases_temp")

# look at the first 3 rows and ALL columns
df_death1[1:3,]

## # A tibble: 3 x 6
##   Province.State Country.Region   Lat  Long date_temp cases_temp
##   <chr>          <chr>          <dbl> <dbl> <chr>          <int>
## 1 ""             Afghanistan       33    65 X1.22.20           0
## 2 ""             Afghanistan       33    65 X1.23.20           0
## 3 ""             Afghanistan       33    65 X1.24.20           0

1.2.1.3 Parsing the date

df_death1$month <- sub("X", "",
                      strsplit(df_death1$date_temp, split = "\\.") %>%
                        purrr::map_chr(~.x[1]) )

df_death1$day <- strsplit(df_death1$date_temp, split = "\\.") %>%
  purrr::map_chr(~.x[2])


df_death1$date <- as.Date(paste("2020", df_death1$month, df_death1$day, sep = "-"))

# look at the first 3 rows and ALL columns
df_death1[1:3,]

## # A tibble: 3 x 9
##   Province.State Country.Region   Lat  Long date_temp cases_temp month day  
##   <chr>          <chr>          <dbl> <dbl> <chr>          <int> <chr> <chr>
## 1 ""             Afghanistan       33    65 X1.22.20           0 1     22   
## 2 ""             Afghanistan       33    65 X1.23.20           0 1     23   
## 3 ""             Afghanistan       33    65 X1.24.20           0 1     24   
## # ... with 1 more variable: date <date>

1.2.1.4 Aggregate the data

df_death2 <- df_death1 %>%
  dplyr::group_by(Province.State, Country.Region, Lat, Long, date) %>%
  dplyr::summarise(cases = sum(cases_temp)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(type = "death",
                # Remove leading/trailing whitespace from character strings
                Country.Region = trimws(Country.Region),
                Province.State = trimws(Province.State))

# look at the first 3 rows and ALL columns
df_death2[1:3,]

## # A tibble: 3 x 7
##   Province.State Country.Region   Lat  Long date       cases type 
##   <chr>          <chr>          <dbl> <dbl> <date>     <int> <chr>
## 1 ""             Afghanistan       33    65 2020-01-22     0 death
## 2 ""             Afghanistan       33    65 2020-01-23     0 death
## 3 ""             Afghanistan       33    65 2020-01-24     0 death

1.3 Recovered Cases

1.3.1 raw file

raw_recov <- 
  read.csv(
    file = 
"https://raw.githubusercontent.com/
CSSEGISandData/
COVID-19/
master/
csse_covid_19_data/
csse_covid_19_time_series/
time_series_covid19_recovered_global.csv",
    stringsAsFactors = FALSE)

# look at the first 3 rows and first 7 columns
raw_recov[1:3,1:7]

##   Province.State Country.Region     Lat    Long X1.22.20 X1.23.20 X1.24.20
## 1                   Afghanistan 33.0000 65.0000        0        0        0
## 2                       Albania 41.1533 20.1683        0        0        0
## 3                       Algeria 28.0339  1.6596        0        0        0

1.3.1.1 from CUM to DAILY

df_recov <- raw_recov[, 1:4]
           # first 4 cols are location details

# remaining columns relate to dates
for(i in 5:ncol(raw_recov)){

  raw_recov[,i] <- as.integer(raw_recov[,i])
  
  # separate the cases into DAILY
  if(i == 5){
    # first day is the same in both forms of the data
    df_recov[[names(raw_recov)[i]]] <- raw_recov[, i]
  } else {
    # diff of the cumulative total provided by CSSEGISandData
    df_recov[[names(raw_recov)[i]]] <- raw_recov[, i] - raw_recov[, i - 1]
  }

}

# look at the first 3 rows and first 7 columns
df_recov[1:3,1:7]

##   Province.State Country.Region     Lat    Long X1.22.20 X1.23.20 X1.24.20
## 1                   Afghanistan 33.0000 65.0000        0        0        0
## 2                       Albania 41.1533 20.1683        0        0        0
## 3                       Algeria 28.0339  1.6596        0        0        0

1.3.1.2 from WIDE to LONG,

df_recov1 <-  df_recov %>% tidyr::pivot_longer(cols = dplyr::starts_with("X"),
                                             names_to = "date_temp",
                                             values_to = "cases_temp")

# look at the first 3 rows and ALL columns
df_recov1[1:3,]

## # A tibble: 3 x 6
##   Province.State Country.Region   Lat  Long date_temp cases_temp
##   <chr>          <chr>          <dbl> <dbl> <chr>          <int>
## 1 ""             Afghanistan       33    65 X1.22.20           0
## 2 ""             Afghanistan       33    65 X1.23.20           0
## 3 ""             Afghanistan       33    65 X1.24.20           0

1.3.1.3 Parsing the date

df_recov1$month <- sub("X", "",
                      strsplit(df_recov1$date_temp, split = "\\.") %>%
                        purrr::map_chr(~.x[1]) )

df_recov1$day <- strsplit(df_recov1$date_temp, split = "\\.") %>%
  purrr::map_chr(~.x[2])


df_recov1$date <- as.Date(paste("2020", df_recov1$month, df_recov1$day, sep = "-"))

# look at the first 3 rows and ALL columns
df_recov1[1:3,]

## # A tibble: 3 x 9
##   Province.State Country.Region   Lat  Long date_temp cases_temp month day  
##   <chr>          <chr>          <dbl> <dbl> <chr>          <int> <chr> <chr>
## 1 ""             Afghanistan       33    65 X1.22.20           0 1     22   
## 2 ""             Afghanistan       33    65 X1.23.20           0 1     23   
## 3 ""             Afghanistan       33    65 X1.24.20           0 1     24   
## # ... with 1 more variable: date <date>

1.3.1.4 Aggregate the data

df_recov2 <- df_recov1 %>%
  dplyr::group_by(Province.State, Country.Region, Lat, Long, date) %>%
  dplyr::summarise(cases = sum(cases_temp)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(type = "recovered",
                # Remove leading/trailing whitespace from character strings
                Country.Region = trimws(Country.Region),
                Province.State = trimws(Province.State))

# look at the first 3 rows and ALL columns
df_recov2[1:3,]

## # A tibble: 3 x 7
##   Province.State Country.Region   Lat  Long date       cases type     
##   <chr>          <chr>          <dbl> <dbl> <date>     <int> <chr>    
## 1 ""             Afghanistan       33    65 2020-01-22     0 recovered
## 2 ""             Afghanistan       33    65 2020-01-23     0 recovered
## 3 ""             Afghanistan       33    65 2020-01-24     0 recovered

1.4 Cases, aggregated

coronavirus <- 
  dplyr::bind_rows(df_conf2, 
                   df_death2, 
                   df_recov2) %>%
  dplyr::select(date, 
                province = Province.State, 
                country = Country.Region, 
                lat = Lat, 
                long = Long, 
                type, cases) %>%
  as.data.frame()

1.4.1 Validation

if(ncol(coronavirus) != 7){
  stop("WRONG number of columns - should be 7")
} else if(nrow(coronavirus) < 100000){
  stop("WRONG number of raws - should be more than 100,000")
} else if(min(coronavirus$date) != as.Date("2020-01-22")){
  stop("WRONG starting date - should be 22nd Jan")
}

1.4.2 save

save(coronavirus, file = "coronavirus.RData") 
write.csv(coronavirus, "coronavirus.csv", row.names = FALSE)

COVID-19 Data

Kunal Haria

Last compiled on Tuesday 09 June 2020

1 CSSEGISandData

1.1 Confirmed Cases

1.1.1 raw file

1.1.1.1 from CUM to DAILY

1.1.1.2 from WIDE to LONG,

1.1.1.3 Parsing the date

1.1.1.4 Aggregate the data

1.2 Death Cases

1.2.1 raw file

1.2.1.1 from CUM to DAILY

1.2.1.2 from WIDE to LONG,

1.2.1.3 Parsing the date

1.2.1.4 Aggregate the data

1.3 Recovered Cases

1.3.1 raw file

1.3.1.1 from CUM to DAILY

1.3.1.2 from WIDE to LONG,

1.3.1.3 Parsing the date

1.3.1.4 Aggregate the data

1.4 Cases, aggregated

1.4.1 Validation

1.4.2 save