#================
# IMPORT PACKAGES
#================

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.0     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'dplyr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.1.2
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Rename the raw CSV header columns to short lower-case names.
#
# input_data: data frame containing the columns `Province/State`,
#   `Country/Region`, `Lat` and `Long`.
# Returns input_data with those four columns renamed to subregion, country,
#   lat and long; every other column is left as it is.
covid_rename_columns <- function(input_data) {
  rename(
    input_data,
    subregion = "Province/State",
    country = "Country/Region",
    lat = "Lat",
    long = "Long"
  )
}
# Reshape the wide table (one column per date) into long format.
#
# input_data: data frame with the identifier columns country, subregion, lat
#   and long; every other column is treated as a measurement column.
# value_var_name: string naming the column that receives the measurements.
# Returns a long table with the identifier columns, a `date` column holding
#   the former column names, and the value column named by value_var_name.
covid_pivot_data <- function(input_data, value_var_name) {
  id_columns <- c("country", "subregion", "lat", "long")
  pivot_longer(
    input_data,
    # any_of() replaces the superseded one_of(): identical selection when the
    # identifier columns are present, and no "unknown columns" warning when
    # one of them is absent.
    cols = !any_of(id_columns),
    names_to = "date",
    values_to = value_var_name
  )
}
#======================
# DEFINE: Convert Dates
#======================

# Parse the month/day/year strings in the `date` column into Date values.
#
# input_data: data frame with a `date` column of strings such as "1/22/20".
# Returns input_data with `date` replaced by the parsed dates.
covid_convert_dates <- function(input_data) {
  mutate(input_data, date = mdy(date))
}
#=======================
# DEFINE: Rearrange Data
#=======================
# Put the identifier columns first and sort the rows.
#
# input_data: data frame with country, subregion, date, lat and long columns.
# Returns the data with columns ordered country, subregion, date, lat, long,
#   then all remaining columns, and rows sorted by country, subregion, date.
covid_rearrange_data <- function(input_data) {
  reordered <- select(
    input_data,
    country, subregion, date, lat, long,
    everything()
  )
  arrange(reordered, country, subregion, date)
}
#======================================
# DEFINE COVID DATA PROCESSING FUNCTION
#======================================

# Download one time-series CSV and run it through the full tidying pipeline.
#
# input_url: location of the CSV, passed straight to read_csv().
# value_var_name: string naming the measurement column in the result.
# Returns the table after renaming the columns, pivoting to long format,
#   parsing the dates, and reordering/sorting.
covid_get_data <- function(input_url, value_var_name) {
  read_csv(input_url) %>%
    covid_rename_columns() %>%
    covid_pivot_data(value_var_name) %>%
    covid_convert_dates() %>%
    covid_rearrange_data()
}
# Locations of the three global time-series CSVs (confirmed, deaths, recovered).
url_confirmed <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
url_deaths <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
url_recovered <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"

# Load the confirmed-cases table; counts land in the `confirmed` column.
covid_confirmed <- covid_get_data(url_confirmed, value_var_name = "confirmed")
## Rows: 280 Columns: 713
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr   (2): Province/State, Country/Region
## dbl (711): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20, ...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the deaths table; counts land in the `dead` column.
covid_deaths <- covid_get_data(url_deaths, value_var_name = "dead")
## Rows: 280 Columns: 713
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr   (2): Province/State, Country/Region
## dbl (711): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20, ...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the recovered table; counts land in the `recovered` column.
covid_recovered <- covid_get_data(url_recovered, value_var_name = "recovered")
## Rows: 265 Columns: 713
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr   (2): Province/State, Country/Region
## dbl (711): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20, ...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the tidied deaths table to check its columns and first rows.
print(covid_deaths)
## # A tibble: 198,520 x 6
##    country     subregion date         lat  long  dead
##    <chr>       <chr>     <date>     <dbl> <dbl> <dbl>
##  1 Afghanistan <NA>      2020-01-22  33.9  67.7     0
##  2 Afghanistan <NA>      2020-01-23  33.9  67.7     0
##  3 Afghanistan <NA>      2020-01-24  33.9  67.7     0
##  4 Afghanistan <NA>      2020-01-25  33.9  67.7     0
##  5 Afghanistan <NA>      2020-01-26  33.9  67.7     0
##  6 Afghanistan <NA>      2020-01-27  33.9  67.7     0
##  7 Afghanistan <NA>      2020-01-28  33.9  67.7     0
##  8 Afghanistan <NA>      2020-01-29  33.9  67.7     0
##  9 Afghanistan <NA>      2020-01-30  33.9  67.7     0
## 10 Afghanistan <NA>      2020-01-31  33.9  67.7     0
## # ... with 198,510 more rows
# Row count of the confirmed table, for comparison with the other two tables.
covid_confirmed %>% count()
## # A tibble: 1 x 1
##        n
##    <int>
## 1 198520
# Row count of the deaths table.
covid_deaths %>% count()
## # A tibble: 1 x 1
##        n
##    <int>
## 1 198520
# Row count of the recovered table.
covid_recovered %>% count()
## # A tibble: 1 x 1
##        n
##    <int>
## 1 187885
#-------------------------
# DROP UNNECESSARY COLUMNS
#-------------------------
# lat/long are dropped from the deaths and recovered tables, so after the
# join they come from covid_confirmed only.
covid_deaths <- select(covid_deaths, -c(long, lat))
covid_recovered <- select(covid_recovered, -c(long, lat))

# Preview the trimmed recovered table.
covid_recovered %>% head()
## # A tibble: 6 x 4
##   country     subregion date       recovered
##   <chr>       <chr>     <date>         <dbl>
## 1 Afghanistan <NA>      2020-01-22         0
## 2 Afghanistan <NA>      2020-01-23         0
## 3 Afghanistan <NA>      2020-01-24         0
## 4 Afghanistan <NA>      2020-01-25         0
## 5 Afghanistan <NA>      2020-01-26         0
## 6 Afghanistan <NA>      2020-01-27         0
# Combine the three measures into one table keyed by country/subregion/date.
# The keys are passed through `by`: the previous `on =` is not a left_join()
# argument, so the join keys were being inferred from the shared column names
# rather than stated. Left joins keep every row of covid_confirmed; rows with
# no match in a joined table get NA for that table's measure.
covid_data <- covid_confirmed %>%
  left_join(covid_deaths, by = c("country", "subregion", "date")) %>%
  left_join(covid_recovered, by = c("country", "subregion", "date"))
## Joining, by = c("country", "subregion", "date")
## Joining, by = c("country", "subregion", "date")
# Show the joined table: confirmed, dead and recovered side by side.
print(covid_data)
## # A tibble: 198,520 x 8
##    country     subregion date         lat  long confirmed  dead recovered
##    <chr>       <chr>     <date>     <dbl> <dbl>     <dbl> <dbl>     <dbl>
##  1 Afghanistan <NA>      2020-01-22  33.9  67.7         0     0         0
##  2 Afghanistan <NA>      2020-01-23  33.9  67.7         0     0         0
##  3 Afghanistan <NA>      2020-01-24  33.9  67.7         0     0         0
##  4 Afghanistan <NA>      2020-01-25  33.9  67.7         0     0         0
##  5 Afghanistan <NA>      2020-01-26  33.9  67.7         0     0         0
##  6 Afghanistan <NA>      2020-01-27  33.9  67.7         0     0         0
##  7 Afghanistan <NA>      2020-01-28  33.9  67.7         0     0         0
##  8 Afghanistan <NA>      2020-01-29  33.9  67.7         0     0         0
##  9 Afghanistan <NA>      2020-01-30  33.9  67.7         0     0         0
## 10 Afghanistan <NA>      2020-01-31  33.9  67.7         0     0         0
## # ... with 198,510 more rows
# Add new_cases: the change in `confirmed` from the previous date within each
# country/subregion. The first date of each group has no predecessor, so its
# new_cases is NA.
covid_data <- covid_data %>%
  arrange(country, subregion, date) %>%
  group_by(country, subregion) %>%
  mutate(new_cases = confirmed - dplyr::lag(confirmed, n = 1L)) %>%
  ungroup()
# Show the table again to check the added new_cases column.
print(covid_data)
## # A tibble: 198,520 x 9
##    country  subregion date         lat  long confirmed  dead recovered new_cases
##    <chr>    <chr>     <date>     <dbl> <dbl>     <dbl> <dbl>     <dbl>     <dbl>
##  1 Afghani~ <NA>      2020-01-22  33.9  67.7         0     0         0        NA
##  2 Afghani~ <NA>      2020-01-23  33.9  67.7         0     0         0         0
##  3 Afghani~ <NA>      2020-01-24  33.9  67.7         0     0         0         0
##  4 Afghani~ <NA>      2020-01-25  33.9  67.7         0     0         0         0
##  5 Afghani~ <NA>      2020-01-26  33.9  67.7         0     0         0         0
##  6 Afghani~ <NA>      2020-01-27  33.9  67.7         0     0         0         0
##  7 Afghani~ <NA>      2020-01-28  33.9  67.7         0     0         0         0
##  8 Afghani~ <NA>      2020-01-29  33.9  67.7         0     0         0         0
##  9 Afghani~ <NA>      2020-01-30  33.9  67.7         0     0         0         0
## 10 Afghani~ <NA>      2020-01-31  33.9  67.7         0     0         0         0
## # ... with 198,510 more rows
# Scatter plot of confirmed counts over time, one point per row of covid_data
# (i.e. per country/subregion/date).
# The title previously read "NYC Shooting confirmed", which does not describe
# this data set; it now names the COVID-19 data being plotted.
# NOTE(review): new_cases is derived above as a day-over-day difference of
# `confirmed`, which suggests `confirmed` is cumulative - confirm whether the
# "Daily confirmed" subtitle is the intended wording.
covid_data %>%
  ggplot(aes(x = date, y = confirmed)) +
  geom_point(color = "tomato") +
  labs(
    x = "date",
    y = "confirmed",
    title = "COVID-19 confirmed cases",
    subtitle = "Daily confirmed"
  )