1 CSSEGISandData

# simplify code: https://uc-r.github.io/pipe
`%>%` <- magrittr::`%>%`

1.1 Confirmed Cases

1.1.1 raw file

raw_conf <- 
  read.csv(
    file = 
"https://raw.githubusercontent.com/
CSSEGISandData/
COVID-19/
master/
csse_covid_19_data/
csse_covid_19_time_series/
time_series_covid19_confirmed_global.csv",
    stringsAsFactors = FALSE)

# look at the first 3 rows and first 7 columns
raw_conf[1:3,1:7]
##   Province.State Country.Region     Lat    Long X1.22.20 X1.23.20 X1.24.20
## 1                   Afghanistan 33.0000 65.0000        0        0        0
## 2                       Albania 41.1533 20.1683        0        0        0
## 3                       Algeria 28.0339  1.6596        0        0        0

1.1.1.1 from CUM to DAILY

df_conf <- raw_conf[, 1:4]
           # first 4 cols are location details

# remaining columns relate to dates
for(i in 5:ncol(raw_conf)){

  raw_conf[,i] <- as.integer(raw_conf[,i])
  
  # separate the cases into DAILY
  if(i == 5){
    # first day is the same in both forms of the data
    df_conf[[names(raw_conf)[i]]] <- raw_conf[, i]
  } else {
    # diff of the cumulative total provided by CSSEGISandData
    df_conf[[names(raw_conf)[i]]] <- raw_conf[, i] - raw_conf[, i - 1]
  }

}

# look at the first 3 rows and first 7 columns
df_conf[1:3,1:7]
##   Province.State Country.Region     Lat    Long X1.22.20 X1.23.20 X1.24.20
## 1                   Afghanistan 33.0000 65.0000        0        0        0
## 2                       Albania 41.1533 20.1683        0        0        0
## 3                       Algeria 28.0339  1.6596        0        0        0

1.1.1.2 from WIDE to LONG,

df_conf1 <-  df_conf %>% tidyr::pivot_longer(cols = dplyr::starts_with("X"),
                                             names_to = "date_temp",
                                             values_to = "cases_temp")

# look at the first 3 rows and ALL columns
df_conf1[1:3,]
## # A tibble: 3 x 6
##   Province.State Country.Region   Lat  Long date_temp cases_temp
##   <chr>          <chr>          <dbl> <dbl> <chr>          <int>
## 1 ""             Afghanistan       33    65 X1.22.20           0
## 2 ""             Afghanistan       33    65 X1.23.20           0
## 3 ""             Afghanistan       33    65 X1.24.20           0

1.1.1.3 Parsing the date

df_conf1$month <- sub("X", "",
                      strsplit(df_conf1$date_temp, split = "\\.") %>%
                        purrr::map_chr(~.x[1]) )

df_conf1$day <- strsplit(df_conf1$date_temp, split = "\\.") %>%
  purrr::map_chr(~.x[2])


df_conf1$date <- as.Date(paste("2020", df_conf1$month, df_conf1$day, sep = "-"))

# look at the first 3 rows and ALL columns
df_conf1[1:3,]
## # A tibble: 3 x 9
##   Province.State Country.Region   Lat  Long date_temp cases_temp month day  
##   <chr>          <chr>          <dbl> <dbl> <chr>          <int> <chr> <chr>
## 1 ""             Afghanistan       33    65 X1.22.20           0 1     22   
## 2 ""             Afghanistan       33    65 X1.23.20           0 1     23   
## 3 ""             Afghanistan       33    65 X1.24.20           0 1     24   
## # ... with 1 more variable: date <date>

1.1.1.4 Aggregate the data

df_conf2 <- df_conf1 %>%
  dplyr::group_by(Province.State, Country.Region, Lat, Long, date) %>%
  dplyr::summarise(cases = sum(cases_temp)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(type = "confirmed",
                # Remove leading/trailing whitespace from character strings
                Country.Region = trimws(Country.Region),
                Province.State = trimws(Province.State))

# look at the first 3 rows and ALL columns
df_conf2[1:3,]
## # A tibble: 3 x 7
##   Province.State Country.Region   Lat  Long date       cases type     
##   <chr>          <chr>          <dbl> <dbl> <date>     <int> <chr>    
## 1 ""             Afghanistan       33    65 2020-01-22     0 confirmed
## 2 ""             Afghanistan       33    65 2020-01-23     0 confirmed
## 3 ""             Afghanistan       33    65 2020-01-24     0 confirmed

1.2 Death Cases

1.2.1 raw file

raw_death <- 
  read.csv(
    file = 
"https://raw.githubusercontent.com/
CSSEGISandData/
COVID-19/
master/
csse_covid_19_data/
csse_covid_19_time_series/
time_series_covid19_deaths_global.csv",
    stringsAsFactors = FALSE)

# look at the first 3 rows and first 7 columns
raw_death[1:3,1:7]
##   Province.State Country.Region     Lat    Long X1.22.20 X1.23.20 X1.24.20
## 1                   Afghanistan 33.0000 65.0000        0        0        0
## 2                       Albania 41.1533 20.1683        0        0        0
## 3                       Algeria 28.0339  1.6596        0        0        0

1.2.1.1 from CUM to DAILY

df_death <- raw_death[, 1:4]
           # first 4 cols are location details

# remaining columns relate to dates
for(i in 5:ncol(raw_death)){

  raw_death[,i] <- as.integer(raw_death[,i])
  
  # separate the cases into DAILY
  if(i == 5){
    # first day is the same in both forms of the data
    df_death[[names(raw_death)[i]]] <- raw_death[, i]
  } else {
    # diff of the cumulative total provided by CSSEGISandData
    df_death[[names(raw_death)[i]]] <- raw_death[, i] - raw_death[, i - 1]
  }

}

# look at the first 3 rows and first 7 columns
df_death[1:3,1:7]
##   Province.State Country.Region     Lat    Long X1.22.20 X1.23.20 X1.24.20
## 1                   Afghanistan 33.0000 65.0000        0        0        0
## 2                       Albania 41.1533 20.1683        0        0        0
## 3                       Algeria 28.0339  1.6596        0        0        0

1.2.1.2 from WIDE to LONG,

df_death1 <-  df_death %>% tidyr::pivot_longer(cols = dplyr::starts_with("X"),
                                             names_to = "date_temp",
                                             values_to = "cases_temp")

# look at the first 3 rows and ALL columns
df_death1[1:3,]
## # A tibble: 3 x 6
##   Province.State Country.Region   Lat  Long date_temp cases_temp
##   <chr>          <chr>          <dbl> <dbl> <chr>          <int>
## 1 ""             Afghanistan       33    65 X1.22.20           0
## 2 ""             Afghanistan       33    65 X1.23.20           0
## 3 ""             Afghanistan       33    65 X1.24.20           0

1.2.1.3 Parsing the date

df_death1$month <- sub("X", "",
                      strsplit(df_death1$date_temp, split = "\\.") %>%
                        purrr::map_chr(~.x[1]) )

df_death1$day <- strsplit(df_death1$date_temp, split = "\\.") %>%
  purrr::map_chr(~.x[2])


df_death1$date <- as.Date(paste("2020", df_death1$month, df_death1$day, sep = "-"))

# look at the first 3 rows and ALL columns
df_death1[1:3,]
## # A tibble: 3 x 9
##   Province.State Country.Region   Lat  Long date_temp cases_temp month day  
##   <chr>          <chr>          <dbl> <dbl> <chr>          <int> <chr> <chr>
## 1 ""             Afghanistan       33    65 X1.22.20           0 1     22   
## 2 ""             Afghanistan       33    65 X1.23.20           0 1     23   
## 3 ""             Afghanistan       33    65 X1.24.20           0 1     24   
## # ... with 1 more variable: date <date>

1.2.1.4 Aggregate the data

df_death2 <- df_death1 %>%
  dplyr::group_by(Province.State, Country.Region, Lat, Long, date) %>%
  dplyr::summarise(cases = sum(cases_temp)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(type = "death",
                # Remove leading/trailing whitespace from character strings
                Country.Region = trimws(Country.Region),
                Province.State = trimws(Province.State))

# look at the first 3 rows and ALL columns
df_death2[1:3,]
## # A tibble: 3 x 7
##   Province.State Country.Region   Lat  Long date       cases type 
##   <chr>          <chr>          <dbl> <dbl> <date>     <int> <chr>
## 1 ""             Afghanistan       33    65 2020-01-22     0 death
## 2 ""             Afghanistan       33    65 2020-01-23     0 death
## 3 ""             Afghanistan       33    65 2020-01-24     0 death

1.3 Recovered Cases

1.3.1 raw file

raw_recov <- 
  read.csv(
    file = 
"https://raw.githubusercontent.com/
CSSEGISandData/
COVID-19/
master/
csse_covid_19_data/
csse_covid_19_time_series/
time_series_covid19_recovered_global.csv",
    stringsAsFactors = FALSE)

# look at the first 3 rows and first 7 columns
raw_recov[1:3,1:7]
##   Province.State Country.Region     Lat    Long X1.22.20 X1.23.20 X1.24.20
## 1                   Afghanistan 33.0000 65.0000        0        0        0
## 2                       Albania 41.1533 20.1683        0        0        0
## 3                       Algeria 28.0339  1.6596        0        0        0

1.3.1.1 from CUM to DAILY

df_recov <- raw_recov[, 1:4]
           # first 4 cols are location details

# remaining columns relate to dates
for(i in 5:ncol(raw_recov)){

  raw_recov[,i] <- as.integer(raw_recov[,i])
  
  # separate the cases into DAILY
  if(i == 5){
    # first day is the same in both forms of the data
    df_recov[[names(raw_recov)[i]]] <- raw_recov[, i]
  } else {
    # diff of the cumulative total provided by CSSEGISandData
    df_recov[[names(raw_recov)[i]]] <- raw_recov[, i] - raw_recov[, i - 1]
  }

}

# look at the first 3 rows and first 7 columns
df_recov[1:3,1:7]
##   Province.State Country.Region     Lat    Long X1.22.20 X1.23.20 X1.24.20
## 1                   Afghanistan 33.0000 65.0000        0        0        0
## 2                       Albania 41.1533 20.1683        0        0        0
## 3                       Algeria 28.0339  1.6596        0        0        0

1.3.1.2 from WIDE to LONG,

df_recov1 <-  df_recov %>% tidyr::pivot_longer(cols = dplyr::starts_with("X"),
                                             names_to = "date_temp",
                                             values_to = "cases_temp")

# look at the first 3 rows and ALL columns
df_recov1[1:3,]
## # A tibble: 3 x 6
##   Province.State Country.Region   Lat  Long date_temp cases_temp
##   <chr>          <chr>          <dbl> <dbl> <chr>          <int>
## 1 ""             Afghanistan       33    65 X1.22.20           0
## 2 ""             Afghanistan       33    65 X1.23.20           0
## 3 ""             Afghanistan       33    65 X1.24.20           0

1.3.1.3 Parsing the date

df_recov1$month <- sub("X", "",
                      strsplit(df_recov1$date_temp, split = "\\.") %>%
                        purrr::map_chr(~.x[1]) )

df_recov1$day <- strsplit(df_recov1$date_temp, split = "\\.") %>%
  purrr::map_chr(~.x[2])


df_recov1$date <- as.Date(paste("2020", df_recov1$month, df_recov1$day, sep = "-"))

# look at the first 3 rows and ALL columns
df_recov1[1:3,]
## # A tibble: 3 x 9
##   Province.State Country.Region   Lat  Long date_temp cases_temp month day  
##   <chr>          <chr>          <dbl> <dbl> <chr>          <int> <chr> <chr>
## 1 ""             Afghanistan       33    65 X1.22.20           0 1     22   
## 2 ""             Afghanistan       33    65 X1.23.20           0 1     23   
## 3 ""             Afghanistan       33    65 X1.24.20           0 1     24   
## # ... with 1 more variable: date <date>

1.3.1.4 Aggregate the data

df_recov2 <- df_recov1 %>%
  dplyr::group_by(Province.State, Country.Region, Lat, Long, date) %>%
  dplyr::summarise(cases = sum(cases_temp)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(type = "recovered",
                # Remove leading/trailing whitespace from character strings
                Country.Region = trimws(Country.Region),
                Province.State = trimws(Province.State))

# look at the first 3 rows and ALL columns
df_recov2[1:3,]
## # A tibble: 3 x 7
##   Province.State Country.Region   Lat  Long date       cases type     
##   <chr>          <chr>          <dbl> <dbl> <date>     <int> <chr>    
## 1 ""             Afghanistan       33    65 2020-01-22     0 recovered
## 2 ""             Afghanistan       33    65 2020-01-23     0 recovered
## 3 ""             Afghanistan       33    65 2020-01-24     0 recovered

1.4 Cases, aggregated

coronavirus <- 
  dplyr::bind_rows(df_conf2, 
                   df_death2, 
                   df_recov2) %>%
  dplyr::select(date, 
                province = Province.State, 
                country = Country.Region, 
                lat = Lat, 
                long = Long, 
                type, cases) %>%
  as.data.frame()

1.4.1 Validation

if(ncol(coronavirus) != 7){
  stop("WRONG number of columns - should be 7")
} else if(nrow(coronavirus) < 100000){
  stop("WRONG number of raws - should be more than 100,000")
} else if(min(coronavirus$date) != as.Date("2020-01-22")){
  stop("WRONG starting date - should be 22nd Jan")
}

1.4.2 save

save(coronavirus, file = "coronavirus.RData") 
write.csv(coronavirus, "coronavirus.csv", row.names = FALSE)