# simplify code: https://uc-r.github.io/pipe
`%>%` <- magrittr::`%>%`
raw_conf <-
read.csv(
file =
"https://raw.githubusercontent.com/
CSSEGISandData/
COVID-19/
master/
csse_covid_19_data/
csse_covid_19_time_series/
time_series_covid19_confirmed_global.csv",
stringsAsFactors = FALSE)
# look at the first 3 rows and first 7 columns
raw_conf[1:3,1:7]
## Province.State Country.Region Lat Long X1.22.20 X1.23.20 X1.24.20
## 1 Afghanistan 33.0000 65.0000 0 0 0
## 2 Albania 41.1533 20.1683 0 0 0
## 3 Algeria 28.0339 1.6596 0 0 0
df_conf <- raw_conf[, 1:4]
# first 4 cols are location details
# remaining columns relate to dates
for(i in 5:ncol(raw_conf)){
raw_conf[,i] <- as.integer(raw_conf[,i])
# separate the cases into DAILY
if(i == 5){
# first day is the same in both forms of the data
df_conf[[names(raw_conf)[i]]] <- raw_conf[, i]
} else {
# diff of the cumulative total provided by CSSEGISandData
df_conf[[names(raw_conf)[i]]] <- raw_conf[, i] - raw_conf[, i - 1]
}
}
# look at the first 3 rows and first 7 columns
df_conf[1:3,1:7]
## Province.State Country.Region Lat Long X1.22.20 X1.23.20 X1.24.20
## 1 Afghanistan 33.0000 65.0000 0 0 0
## 2 Albania 41.1533 20.1683 0 0 0
## 3 Algeria 28.0339 1.6596 0 0 0
df_conf1 <- df_conf %>% tidyr::pivot_longer(cols = dplyr::starts_with("X"),
names_to = "date_temp",
values_to = "cases_temp")
# look at the first 3 rows and ALL columns
df_conf1[1:3,]
## # A tibble: 3 x 6
## Province.State Country.Region Lat Long date_temp cases_temp
## <chr> <chr> <dbl> <dbl> <chr> <int>
## 1 "" Afghanistan 33 65 X1.22.20 0
## 2 "" Afghanistan 33 65 X1.23.20 0
## 3 "" Afghanistan 33 65 X1.24.20 0
df_conf1$month <- sub("X", "",
strsplit(df_conf1$date_temp, split = "\\.") %>%
purrr::map_chr(~.x[1]) )
df_conf1$day <- strsplit(df_conf1$date_temp, split = "\\.") %>%
purrr::map_chr(~.x[2])
df_conf1$date <- as.Date(paste("2020", df_conf1$month, df_conf1$day, sep = "-"))
# look at the first 3 rows and ALL columns
df_conf1[1:3,]
## # A tibble: 3 x 9
## Province.State Country.Region Lat Long date_temp cases_temp month day
## <chr> <chr> <dbl> <dbl> <chr> <int> <chr> <chr>
## 1 "" Afghanistan 33 65 X1.22.20 0 1 22
## 2 "" Afghanistan 33 65 X1.23.20 0 1 23
## 3 "" Afghanistan 33 65 X1.24.20 0 1 24
## # ... with 1 more variable: date <date>
df_conf2 <- df_conf1 %>%
dplyr::group_by(Province.State, Country.Region, Lat, Long, date) %>%
dplyr::summarise(cases = sum(cases_temp)) %>%
dplyr::ungroup() %>%
dplyr::mutate(type = "confirmed",
# Remove leading/trailing whitespace from character strings
Country.Region = trimws(Country.Region),
Province.State = trimws(Province.State))
# look at the first 3 rows and ALL columns
df_conf2[1:3,]
## # A tibble: 3 x 7
## Province.State Country.Region Lat Long date cases type
## <chr> <chr> <dbl> <dbl> <date> <int> <chr>
## 1 "" Afghanistan 33 65 2020-01-22 0 confirmed
## 2 "" Afghanistan 33 65 2020-01-23 0 confirmed
## 3 "" Afghanistan 33 65 2020-01-24 0 confirmed
raw_death <-
read.csv(
file =
"https://raw.githubusercontent.com/
CSSEGISandData/
COVID-19/
master/
csse_covid_19_data/
csse_covid_19_time_series/
time_series_covid19_deaths_global.csv",
stringsAsFactors = FALSE)
# look at the first 3 rows and first 7 columns
raw_death[1:3,1:7]
## Province.State Country.Region Lat Long X1.22.20 X1.23.20 X1.24.20
## 1 Afghanistan 33.0000 65.0000 0 0 0
## 2 Albania 41.1533 20.1683 0 0 0
## 3 Algeria 28.0339 1.6596 0 0 0
df_death <- raw_death[, 1:4]
# first 4 cols are location details
# remaining columns relate to dates
for(i in 5:ncol(raw_death)){
raw_death[,i] <- as.integer(raw_death[,i])
# separate the cases into DAILY
if(i == 5){
# first day is the same in both forms of the data
df_death[[names(raw_death)[i]]] <- raw_death[, i]
} else {
# diff of the cumulative total provided by CSSEGISandData
df_death[[names(raw_death)[i]]] <- raw_death[, i] - raw_death[, i - 1]
}
}
# look at the first 3 rows and first 7 columns
df_death[1:3,1:7]
## Province.State Country.Region Lat Long X1.22.20 X1.23.20 X1.24.20
## 1 Afghanistan 33.0000 65.0000 0 0 0
## 2 Albania 41.1533 20.1683 0 0 0
## 3 Algeria 28.0339 1.6596 0 0 0
df_death1 <- df_death %>% tidyr::pivot_longer(cols = dplyr::starts_with("X"),
names_to = "date_temp",
values_to = "cases_temp")
# look at the first 3 rows and ALL columns
df_death1[1:3,]
## # A tibble: 3 x 6
## Province.State Country.Region Lat Long date_temp cases_temp
## <chr> <chr> <dbl> <dbl> <chr> <int>
## 1 "" Afghanistan 33 65 X1.22.20 0
## 2 "" Afghanistan 33 65 X1.23.20 0
## 3 "" Afghanistan 33 65 X1.24.20 0
df_death1$month <- sub("X", "",
strsplit(df_death1$date_temp, split = "\\.") %>%
purrr::map_chr(~.x[1]) )
df_death1$day <- strsplit(df_death1$date_temp, split = "\\.") %>%
purrr::map_chr(~.x[2])
df_death1$date <- as.Date(paste("2020", df_death1$month, df_death1$day, sep = "-"))
# look at the first 3 rows and ALL columns
df_death1[1:3,]
## # A tibble: 3 x 9
## Province.State Country.Region Lat Long date_temp cases_temp month day
## <chr> <chr> <dbl> <dbl> <chr> <int> <chr> <chr>
## 1 "" Afghanistan 33 65 X1.22.20 0 1 22
## 2 "" Afghanistan 33 65 X1.23.20 0 1 23
## 3 "" Afghanistan 33 65 X1.24.20 0 1 24
## # ... with 1 more variable: date <date>
df_death2 <- df_death1 %>%
dplyr::group_by(Province.State, Country.Region, Lat, Long, date) %>%
dplyr::summarise(cases = sum(cases_temp)) %>%
dplyr::ungroup() %>%
dplyr::mutate(type = "death",
# Remove leading/trailing whitespace from character strings
Country.Region = trimws(Country.Region),
Province.State = trimws(Province.State))
# look at the first 3 rows and ALL columns
df_death2[1:3,]
## # A tibble: 3 x 7
## Province.State Country.Region Lat Long date cases type
## <chr> <chr> <dbl> <dbl> <date> <int> <chr>
## 1 "" Afghanistan 33 65 2020-01-22 0 death
## 2 "" Afghanistan 33 65 2020-01-23 0 death
## 3 "" Afghanistan 33 65 2020-01-24 0 death
raw_recov <-
read.csv(
file =
"https://raw.githubusercontent.com/
CSSEGISandData/
COVID-19/
master/
csse_covid_19_data/
csse_covid_19_time_series/
time_series_covid19_recovered_global.csv",
stringsAsFactors = FALSE)
# look at the first 3 rows and first 7 columns
raw_recov[1:3,1:7]
## Province.State Country.Region Lat Long X1.22.20 X1.23.20 X1.24.20
## 1 Afghanistan 33.0000 65.0000 0 0 0
## 2 Albania 41.1533 20.1683 0 0 0
## 3 Algeria 28.0339 1.6596 0 0 0
df_recov <- raw_recov[, 1:4]
# first 4 cols are location details
# remaining columns relate to dates
for(i in 5:ncol(raw_recov)){
raw_recov[,i] <- as.integer(raw_recov[,i])
# separate the cases into DAILY
if(i == 5){
# first day is the same in both forms of the data
df_recov[[names(raw_recov)[i]]] <- raw_recov[, i]
} else {
# diff of the cumulative total provided by CSSEGISandData
df_recov[[names(raw_recov)[i]]] <- raw_recov[, i] - raw_recov[, i - 1]
}
}
# look at the first 3 rows and first 7 columns
df_recov[1:3,1:7]
## Province.State Country.Region Lat Long X1.22.20 X1.23.20 X1.24.20
## 1 Afghanistan 33.0000 65.0000 0 0 0
## 2 Albania 41.1533 20.1683 0 0 0
## 3 Algeria 28.0339 1.6596 0 0 0
df_recov1 <- df_recov %>% tidyr::pivot_longer(cols = dplyr::starts_with("X"),
names_to = "date_temp",
values_to = "cases_temp")
# look at the first 3 rows and ALL columns
df_recov1[1:3,]
## # A tibble: 3 x 6
## Province.State Country.Region Lat Long date_temp cases_temp
## <chr> <chr> <dbl> <dbl> <chr> <int>
## 1 "" Afghanistan 33 65 X1.22.20 0
## 2 "" Afghanistan 33 65 X1.23.20 0
## 3 "" Afghanistan 33 65 X1.24.20 0
df_recov1$month <- sub("X", "",
strsplit(df_recov1$date_temp, split = "\\.") %>%
purrr::map_chr(~.x[1]) )
df_recov1$day <- strsplit(df_recov1$date_temp, split = "\\.") %>%
purrr::map_chr(~.x[2])
df_recov1$date <- as.Date(paste("2020", df_recov1$month, df_recov1$day, sep = "-"))
# look at the first 3 rows and ALL columns
df_recov1[1:3,]
## # A tibble: 3 x 9
## Province.State Country.Region Lat Long date_temp cases_temp month day
## <chr> <chr> <dbl> <dbl> <chr> <int> <chr> <chr>
## 1 "" Afghanistan 33 65 X1.22.20 0 1 22
## 2 "" Afghanistan 33 65 X1.23.20 0 1 23
## 3 "" Afghanistan 33 65 X1.24.20 0 1 24
## # ... with 1 more variable: date <date>
df_recov2 <- df_recov1 %>%
dplyr::group_by(Province.State, Country.Region, Lat, Long, date) %>%
dplyr::summarise(cases = sum(cases_temp)) %>%
dplyr::ungroup() %>%
dplyr::mutate(type = "recovered",
# Remove leading/trailing whitespace from character strings
Country.Region = trimws(Country.Region),
Province.State = trimws(Province.State))
# look at the first 3 rows and ALL columns
df_recov2[1:3,]
## # A tibble: 3 x 7
## Province.State Country.Region Lat Long date cases type
## <chr> <chr> <dbl> <dbl> <date> <int> <chr>
## 1 "" Afghanistan 33 65 2020-01-22 0 recovered
## 2 "" Afghanistan 33 65 2020-01-23 0 recovered
## 3 "" Afghanistan 33 65 2020-01-24 0 recovered
coronavirus <-
dplyr::bind_rows(df_conf2,
df_death2,
df_recov2) %>%
dplyr::select(date,
province = Province.State,
country = Country.Region,
lat = Lat,
long = Long,
type, cases) %>%
as.data.frame()
if(ncol(coronavirus) != 7){
stop("WRONG number of columns - should be 7")
} else if(nrow(coronavirus) < 100000){
stop("WRONG number of raws - should be more than 100,000")
} else if(min(coronavirus$date) != as.Date("2020-01-22")){
stop("WRONG starting date - should be 22nd Jan")
}
save(coronavirus, file = "coronavirus.RData")
write.csv(coronavirus, "coronavirus.csv", row.names = FALSE)