# install.packages("tidyverse")
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.9
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.3
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'readr' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Read the WHO COVID-19 CSV into a tibble; column types are guessed by readr.
covid <- readr::read_csv(file = "WHO_covid_deaths.csv")
## Rows: 206901 Columns: 8
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): Date_reported, Country_code, Country, WHO_region
## dbl (4): New_cases, Cumulative_cases, New_deaths, Cumulative_deaths
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Compact overview of every column: its name, type, and first few values.
covid %>%
  glimpse()
## Rows: 206,901
## Columns: 8
## $ Date_reported <chr> "1/3/2020", "1/4/2020", "1/5/2020", "1/6/2020", "1/7~
## $ Country_code <chr> "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF"~
## $ Country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghan~
## $ WHO_region <chr> "EMRO", "EMRO", "EMRO", "EMRO", "EMRO", "EMRO", "EMR~
## $ New_cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ Cumulative_cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ New_deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ Cumulative_deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
Field Name | Type | Description |
---|---|---|
Date_reported | Date | Date of reporting to WHO |
Country_code | String | ISO Alpha-2 country code |
Country | String | Country, territory, area |
WHO_region | String | WHO regional offices: WHO Member States are grouped into six WHO regions – Regional Office for Africa (AFRO), Regional Office for the Americas (AMRO), Regional Office for South-East Asia (SEARO), Regional Office for Europe (EURO), Regional Office for the Eastern Mediterranean (EMRO), and Regional Office for the Western Pacific (WPRO) |
New_cases | Integer | New confirmed cases. Calculated by subtracting previous cumulative case count from current cumulative cases count |
Cumulative_cases | Integer | Cumulative confirmed cases reported to WHO to date |
New_deaths | Integer | New confirmed deaths. Calculated by subtracting previous cumulative deaths from current cumulative deaths |
Cumulative_deaths | Integer | Cumulative confirmed deaths reported to WHO to date |
# Embed the image file WHO_covid.PNG in the rendered output.
knitr::include_graphics(path = "WHO_covid.PNG")
# List the distinct WHO region codes, in order of first appearance.
covid %>%
  pull(WHO_region) %>%
  unique()
## [1] "EMRO" "EURO" "AFRO" "WPRO" "AMRO" "SEARO" "Other"
# Keep only the rows for the European region (EURO), then print the
# five-number summary and mean of its daily new-case counts.
europe <- filter(covid, WHO_region == "EURO")
summary(europe[["New_cases"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -42 8 239 4064 1716 500563
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Parse the month/day/year text in Date_reported into a proper Date column and
# derive the calendar month from it as a factor. Arguments to mutate() are
# evaluated in order, so `month` can use the freshly created `date`.
covidnew <- mutate(
  covid,
  date = as.Date(Date_reported, format = "%m/%d/%Y"),
  month = as.factor(month(date))
)
head(covidnew)
## # A tibble: 6 x 10
## Date_reported Country_code Country WHO_region New_cases Cumulative_cases
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 1/3/2020 AF Afghanistan EMRO 0 0
## 2 1/4/2020 AF Afghanistan EMRO 0 0
## 3 1/5/2020 AF Afghanistan EMRO 0 0
## 4 1/6/2020 AF Afghanistan EMRO 0 0
## 5 1/7/2020 AF Afghanistan EMRO 0 0
## 6 1/8/2020 AF Afghanistan EMRO 0 0
## # ... with 4 more variables: New_deaths <dbl>, Cumulative_deaths <dbl>,
## # date <date>, month <fct>
# Scatter plot of daily new deaths against report date, all rows together.
p1 <- ggplot(data = covidnew, mapping = aes(x = date, y = New_deaths)) +
  geom_point()
p1
# Keep only rows where both daily counts are non-negative. Conditions given as
# separate filter() arguments are combined with a logical AND.
covidclean <- filter(covidnew, New_cases >= 0, New_deaths >= 0)
head(covidclean)
## # A tibble: 6 x 10
## Date_reported Country_code Country WHO_region New_cases Cumulative_cases
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 1/3/2020 AF Afghanistan EMRO 0 0
## 2 1/4/2020 AF Afghanistan EMRO 0 0
## 3 1/5/2020 AF Afghanistan EMRO 0 0
## 4 1/6/2020 AF Afghanistan EMRO 0 0
## 5 1/7/2020 AF Afghanistan EMRO 0 0
## 6 1/8/2020 AF Afghanistan EMRO 0 0
## # ... with 4 more variables: New_deaths <dbl>, Cumulative_deaths <dbl>,
## # date <date>, month <fct>
# Line plot of daily new deaths over time, one colour per WHO region.
p2 <- ggplot(
  data = covidclean,
  mapping = aes(x = date, y = New_deaths, colour = WHO_region)
) +
  geom_line()
p2
Let’s focus on two of those regions: AMRO and SEARO.
# Restrict the cleaned data to the Americas (AMRO) and South-East Asia (SEARO)
# regions.
AMROSEARO <- filter(
  covidclean,
  WHO_region == "AMRO" | WHO_region == "SEARO"
)
head(AMROSEARO)
## # A tibble: 6 x 10
## Date_reported Country_code Country WHO_region New_cases Cumulative_cases
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 1/3/2020 AI Anguilla AMRO 0 0
## 2 1/4/2020 AI Anguilla AMRO 0 0
## 3 1/5/2020 AI Anguilla AMRO 0 0
## 4 1/6/2020 AI Anguilla AMRO 0 0
## 5 1/7/2020 AI Anguilla AMRO 0 0
## 6 1/8/2020 AI Anguilla AMRO 0 0
## # ... with 4 more variables: New_deaths <dbl>, Cumulative_deaths <dbl>,
## # date <date>, month <fct>
# Total deaths per country across the AMRO and SEARO rows, largest first.
# No select() is needed beforehand: group_by()/summarize() only touch Country
# and New_deaths, and every other column is dropped by summarize() anyway.
sumtable <- AMROSEARO %>%
  group_by(Country) %>%
  summarize(sum = sum(New_deaths)) %>%
  arrange(desc(sum))

# Ten countries with the highest totals. slice_max() replaces the superseded
# top_n(); like top_n() it keeps ties (with_ties = TRUE is the default), so more
# than ten rows can be returned if totals are tied at the cut-off.
top10 <- slice_max(sumtable, order_by = sum, n = 10)
top10
## # A tibble: 10 x 2
## Country sum
## <chr> <dbl>
## 1 United States of America 996970
## 2 Brazil 665142
## 3 India 524490
## 4 Mexico 324768
## 5 Peru 213106
## 6 Indonesia 156548
## 7 Colombia 139829
## 8 Argentina 128825
## 9 Chile 57810
## 10 Canada 40695
The US has the most deaths, at nearly 1 million, followed by Brazil, India, Mexico, and Peru.
Can we see those top 10 countries’ deaths plotted over time?
# Stacked area chart of daily deaths for the countries listed in `top10`.
# The country names are taken from the table instead of being retyped by hand,
# so the plot stays in sync with the ranking if the underlying data changes.
p3 <- covidclean %>%
  filter(Country %in% top10$Country) %>%
  ggplot(aes(x = date, y = New_deaths, fill = Country)) +
  geom_area() +
  scale_fill_viridis_d() +
  ggtitle("Daily Deaths for Countries with Most Deaths") +
  theme_bw()
p3
This dataset lacks population size, so only raw counts can be compared here. Rates of cases and deaths would be more appropriate for comparison purposes. The next tutorial will show how to join country population datasets with the covid dataset and plot rates rather than raw counts.