#================
# IMPORT PACKAGES
#================
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'dplyr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.1.2
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Rename the four identifier columns of a wide table to short lower-case names.
#
# input_data: data frame with columns 'Province/State', 'Country/Region',
#             'Lat' and 'Long'.
# Returns input_data with those columns renamed to subregion, country,
# lat and long; every other column is left untouched.
covid_rename_columns <- function(input_data) {
  rename(
    input_data,
    subregion = "Province/State",
    country = "Country/Region",
    lat = "Lat",
    long = "Long"
  )
}
# Reshape a wide table (one column per date) into long format.
#
# input_data:     data frame whose identifier columns are country, subregion,
#                 lat and long; every remaining column is pivoted.
# value_var_name: string naming the column that receives the pivoted values.
# Returns a long data frame with the identifier columns, a 'date' column
# holding the former column names, and the value column.
covid_pivot_data <- function(input_data, value_var_name) {
  id_columns <- c("country", "subregion", "lat", "long")
  # any_of() replaces the superseded one_of(); like one_of() it tolerates
  # identifier columns that are absent, but it does so without a warning.
  pivot_longer(
    input_data,
    cols = -any_of(id_columns),
    names_to = "date",
    values_to = value_var_name
  )
}
#======================
# DEFINE: Convert Dates
#======================
# Parse the 'date' column from month/day/year text into Date values.
#
# input_data: data frame with a 'date' column that lubridate::mdy() can parse.
# Returns input_data with 'date' replaced by the parsed dates.
covid_convert_dates <- function(input_data) {
  mutate(input_data, date = mdy(date))
}
#=======================
# DEFINE: Rearrange Data
#=======================
# Put the identifier columns first and sort the rows.
#
# input_data: data frame with columns country, subregion, date, lat and long.
# Returns the data with those five columns leading (remaining columns follow
# in their existing order), sorted by country, subregion and date.
covid_rearrange_data <- function(input_data) {
  reordered <- select(
    input_data,
    country, subregion, date, lat, long,
    everything()
  )
  arrange(reordered, country, subregion, date)
}
#======================================
# DEFINE COVID DATA PROCESSING FUNCTION
#======================================
# Download one time-series CSV and run it through the full cleaning pipeline.
#
# input_url:      location of the CSV, passed straight to read_csv().
# value_var_name: string naming the value column created by the pivot step.
# Returns the long-format table after rename -> pivot -> date conversion ->
# column/row rearrangement.
covid_get_data <- function(input_url, value_var_name) {
  read_csv(input_url) %>%
    covid_rename_columns() %>%
    covid_pivot_data(value_var_name) %>%
    covid_convert_dates() %>%
    covid_rearrange_data()
}
# Source CSVs for the three global time series. Each URL is assembled from
# repository root, data directory and file name to keep lines short.
url_confirmed <- paste0(
  "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/",
  "csse_covid_19_data/csse_covid_19_time_series/",
  "time_series_covid19_confirmed_global.csv"
)
url_deaths <- paste0(
  "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/",
  "csse_covid_19_data/csse_covid_19_time_series/",
  "time_series_covid19_deaths_global.csv"
)
url_recovered <- paste0(
  "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/",
  "csse_covid_19_data/csse_covid_19_time_series/",
  "time_series_covid19_recovered_global.csv"
)
# Load the confirmed-cases series; counts land in a column named 'confirmed'.
covid_confirmed <- covid_get_data(
  input_url = url_confirmed,
  value_var_name = "confirmed"
)
## Rows: 280 Columns: 713
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): Province/State, Country/Region
## dbl (711): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20, ...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the deaths series; counts land in a column named 'dead'.
covid_deaths <- covid_get_data(
  input_url = url_deaths,
  value_var_name = "dead"
)
## Rows: 280 Columns: 713
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): Province/State, Country/Region
## dbl (711): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20, ...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the recovered series; counts land in a column named 'recovered'.
covid_recovered <- covid_get_data(
  input_url = url_recovered,
  value_var_name = "recovered"
)
## Rows: 265 Columns: 713
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): Province/State, Country/Region
## dbl (711): Lat, Long, 1/22/20, 1/23/20, 1/24/20, 1/25/20, 1/26/20, 1/27/20, ...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the cleaned deaths table.
covid_deaths %>% print()
## # A tibble: 198,520 x 6
## country subregion date lat long dead
## <chr> <chr> <date> <dbl> <dbl> <dbl>
## 1 Afghanistan <NA> 2020-01-22 33.9 67.7 0
## 2 Afghanistan <NA> 2020-01-23 33.9 67.7 0
## 3 Afghanistan <NA> 2020-01-24 33.9 67.7 0
## 4 Afghanistan <NA> 2020-01-25 33.9 67.7 0
## 5 Afghanistan <NA> 2020-01-26 33.9 67.7 0
## 6 Afghanistan <NA> 2020-01-27 33.9 67.7 0
## 7 Afghanistan <NA> 2020-01-28 33.9 67.7 0
## 8 Afghanistan <NA> 2020-01-29 33.9 67.7 0
## 9 Afghanistan <NA> 2020-01-30 33.9 67.7 0
## 10 Afghanistan <NA> 2020-01-31 33.9 67.7 0
## # ... with 198,510 more rows
# Row counts of the three long tables; the recorded output below shows that
# the recovered table has fewer rows than the other two.
covid_confirmed %>% count()
## # A tibble: 1 x 1
## n
## <int>
## 1 198520
covid_deaths %>% count()
## # A tibble: 1 x 1
## n
## <int>
## 1 198520
covid_recovered %>% count()
## # A tibble: 1 x 1
## n
## <int>
## 1 187885
#-------------------------
# DROP UNNECESSARY COLUMNS
#-------------------------
# covid_confirmed keeps lat/long, so remove the duplicate coordinate columns
# from the other two tables ahead of the joins further down.
covid_deaths <- select(covid_deaths, -c(long, lat))
covid_recovered <- select(covid_recovered, -c(long, lat))
covid_recovered %>% head()
## # A tibble: 6 x 4
## country subregion date recovered
## <chr> <chr> <date> <dbl>
## 1 Afghanistan <NA> 2020-01-22 0
## 2 Afghanistan <NA> 2020-01-23 0
## 3 Afghanistan <NA> 2020-01-24 0
## 4 Afghanistan <NA> 2020-01-25 0
## 5 Afghanistan <NA> 2020-01-26 0
## 6 Afghanistan <NA> 2020-01-27 0
# Combine the three series into one table keyed by country, subregion and date.
# left_join() takes its keys through `by` (a character vector); the previous
# `on = ...` was not a join argument, so the keys were never actually applied
# and dplyr fell back to a natural join on whatever columns happened to match.
# Left joins keep every confirmed row; rows with no match get NA.
covid_data <- covid_confirmed %>%
  left_join(covid_deaths, by = c("country", "subregion", "date")) %>%
  left_join(covid_recovered, by = c("country", "subregion", "date"))
## Joining, by = c("country", "subregion", "date")
## Joining, by = c("country", "subregion", "date")
# Inspect the joined table.
covid_data %>% print()
## # A tibble: 198,520 x 8
## country subregion date lat long confirmed dead recovered
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan <NA> 2020-01-22 33.9 67.7 0 0 0
## 2 Afghanistan <NA> 2020-01-23 33.9 67.7 0 0 0
## 3 Afghanistan <NA> 2020-01-24 33.9 67.7 0 0 0
## 4 Afghanistan <NA> 2020-01-25 33.9 67.7 0 0 0
## 5 Afghanistan <NA> 2020-01-26 33.9 67.7 0 0 0
## 6 Afghanistan <NA> 2020-01-27 33.9 67.7 0 0 0
## 7 Afghanistan <NA> 2020-01-28 33.9 67.7 0 0 0
## 8 Afghanistan <NA> 2020-01-29 33.9 67.7 0 0 0
## 9 Afghanistan <NA> 2020-01-30 33.9 67.7 0 0 0
## 10 Afghanistan <NA> 2020-01-31 33.9 67.7 0 0 0
## # ... with 198,510 more rows
# Add new_cases: the change in the confirmed count from the previous row,
# computed within each country/subregion after sorting by date. The first
# row of every group has no predecessor and therefore gets NA.
covid_data <- covid_data %>%
  arrange(country, subregion, date) %>%
  group_by(country, subregion) %>%
  mutate(new_cases = confirmed - lag(confirmed)) %>%
  ungroup()
# Inspect the table now that new_cases has been added.
covid_data %>% print()
## # A tibble: 198,520 x 9
## country subregion date lat long confirmed dead recovered new_cases
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghani~ <NA> 2020-01-22 33.9 67.7 0 0 0 NA
## 2 Afghani~ <NA> 2020-01-23 33.9 67.7 0 0 0 0
## 3 Afghani~ <NA> 2020-01-24 33.9 67.7 0 0 0 0
## 4 Afghani~ <NA> 2020-01-25 33.9 67.7 0 0 0 0
## 5 Afghani~ <NA> 2020-01-26 33.9 67.7 0 0 0 0
## 6 Afghani~ <NA> 2020-01-27 33.9 67.7 0 0 0 0
## 7 Afghani~ <NA> 2020-01-28 33.9 67.7 0 0 0 0
## 8 Afghani~ <NA> 2020-01-29 33.9 67.7 0 0 0 0
## 9 Afghani~ <NA> 2020-01-30 33.9 67.7 0 0 0 0
## 10 Afghani~ <NA> 2020-01-31 33.9 67.7 0 0 0 0
## # ... with 198,510 more rows
# Scatter plot of the confirmed count over time, one point per row of
# covid_data. The title previously read "NYC Shooting confirmed", left over
# from an unrelated analysis. The subtitle said "Daily", but `confirmed` is a
# running total (new_cases is derived from it by differencing).
covid_data %>%
  ggplot(aes(x = date, y = confirmed)) +
  # y is inherited from the ggplot() mapping, so it is not repeated here.
  geom_point(color = "tomato") +
  labs(
    x = "date",
    y = "confirmed",
    title = "COVID-19 confirmed cases",
    subtitle = "Cumulative confirmed cases by date"
  )
