# Load the readr, ggplot2, and dplyr packages
library(readr)
## Warning: package 'readr' was built under R version 3.6.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Import the worldwide cumulative case counts from the datasets folder.
# Column types are guessed by readr; the guessed specification is echoed below.
confirmed_cases_worldwide <- read_csv(
  file = "datasets/confirmed_cases_worldwide.csv"
)
## Parsed with column specification:
## cols(
## date = col_date(format = ""),
## cum_cases = col_double()
## )
# Print the tibble to inspect its dimensions and first rows
confirmed_cases_worldwide
## # A tibble: 56 x 2
## date cum_cases
## <date> <dbl>
## 1 2020-01-22 555
## 2 2020-01-23 653
## 3 2020-01-24 941
## 4 2020-01-25 1434
## 5 2020-01-26 2118
## 6 2020-01-27 2927
## 7 2020-01-28 5578
## 8 2020-01-29 6166
## 9 2020-01-30 8234
## 10 2020-01-31 9927
## # ... with 46 more rows
# Plot the worldwide cumulative case count over time as a single line,
# with a descriptive title on the y-axis
confirmed_cases_worldwide %>%
  ggplot(aes(x = date, y = cum_cases)) +
  geom_line() +
  labs(y = "Cumulative confirmed cases")
# 3. China compared to the rest of the world
# Import the case counts that are split by the is_china column
confirmed_cases_china_vs_world <- read_csv(
  file = "datasets/confirmed_cases_china_vs_world.csv"
)
## Parsed with column specification:
## cols(
## is_china = col_character(),
## date = col_date(format = ""),
## cases = col_double(),
## cum_cases = col_double()
## )
# Inspect the column types and the first few values of each column
glimpse(confirmed_cases_china_vs_world)
## Observations: 112
## Variables: 4
## $ is_china <chr> "China", "China", "China", "China", "China", "China", "Ch...
## $ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01-...
## $ cases <dbl> 548, 95, 277, 486, 669, 802, 2632, 578, 2054, 1661, 2089,...
## $ cum_cases <dbl> 548, 643, 920, 1406, 2075, 2877, 5509, 6087, 8141, 9802, ...
# Build one cumulative-cases line per is_china value, told apart by color.
# The aesthetics are set on the line layer rather than in ggplot(), so layers
# added to this plot object later do not inherit them.
plt_cum_confirmed_cases_china_vs_world <- confirmed_cases_china_vs_world %>%
  ggplot() +
  geom_line(
    aes(x = date, y = cum_cases, group = is_china, color = is_china)
  ) +
  labs(y = "Cumulative confirmed cases")
# Display the stored plot
plt_cum_confirmed_cases_china_vs_world
# 4. Annotate the plot with the events in who_events
# Events to annotate on the plot, one row per event: the date it occurred
# (as a Date) and a two-line text label
who_events <- tibble(
  date = as.Date(c("2020-01-30", "2020-03-11", "2020-02-13")),
  event = c(
    "Global health\nemergency declared",
    "Pandemic\ndeclared",
    "China reporting\nchange"
  )
)
# Overlay the events on the China-vs-world plot: a dashed vertical line at
# each event date, plus the event label drawn at that date at a fixed height
# of 100000 on the y-axis
plt_cum_confirmed_cases_china_vs_world +
  geom_vline(
    data = who_events,
    mapping = aes(xintercept = date),
    linetype = "dashed"
  ) +
  geom_text(
    data = who_events,
    mapping = aes(x = date, label = event),
    y = 100000
  )
# 5. Linear trend for China after Feb 15
# Keep only the China rows dated 15 Feb 2020 or later
china_after_feb15 <- filter(
  confirmed_cases_china_vs_world,
  is_china == "China",
  date >= as.Date("2020-02-15")
)
# Plot cumulative cases over time for that subset and overlay a straight-line
# (linear regression) fit; se = FALSE hides the confidence band
china_after_feb15 %>%
  ggplot(aes(x = date, y = cum_cases)) +
  geom_line() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(y = "Cumulative confirmed cases")
## `geom_smooth()` using formula 'y ~ x'
# Keep every row whose is_china value is anything other than "China"
not_china <- confirmed_cases_china_vs_world %>%
  filter(is_china != "China")
# Using not_china, draw a line plot of cum_cases vs. date and overlay a
# linear-regression trend line without the confidence band.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
plt_not_china_trend_lin <- ggplot(
  data = not_china,
  mapping = aes(x = date, y = cum_cases)
) +
  geom_line() +
  geom_smooth(method = "lm", se = FALSE) +
  ylab("Cumulative confirmed cases")
# See the result
plt_not_china_trend_lin
## `geom_smooth()` using formula 'y ~ x'
# 7. Logarithmic scale on the y-axis
# Redraw the not-China trend plot with a base-10 logarithmic y-axis.
# The stored plot object is not modified; only the displayed copy gets the
# new scale.
plt_not_china_trend_lin + scale_y_log10()
## `geom_smooth()` using formula 'y ~ x'
# 8. Countries with the most confirmed cases
# Read the per-country case counts
confirmed_cases_by_country <- read_csv("datasets/confirmed_cases_by_country.csv")
## Parsed with column specification:
## cols(
## country = col_character(),
## province = col_character(),
## date = col_date(format = ""),
## cases = col_double(),
## cum_cases = col_double()
## )
glimpse(confirmed_cases_by_country)
## Observations: 13,272
## Variables: 5
## $ country <chr> "Afghanistan", "Albania", "Algeria", "Andorra", "Antigua ...
## $ province <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ date <date> 2020-01-22, 2020-01-22, 2020-01-22, 2020-01-22, 2020-01-...
## $ cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ cum_cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
# For each country take the largest cumulative count as its total, then keep
# the 7 countries with the highest totals.
# The ranking column is passed to top_n() explicitly: without it, top_n()
# ranks by whichever column happens to be last and emits a "Selecting by"
# message, so the result would silently change if a column were added.
# NOTE(review): max(cum_cases) is taken over all rows of a country; this
# presumably assumes cum_cases is already a country-level figure rather than
# a per-province one - confirm against the dataset.
top_countries_by_total_cases <- confirmed_cases_by_country %>%
  group_by(country) %>%
  summarize(total_cases = max(cum_cases)) %>%
  top_n(7, total_cases)
# See the result
top_countries_by_total_cases
## # A tibble: 7 x 2
## country total_cases
## <chr> <dbl>
## 1 France 7699
## 2 Germany 9257
## 3 Iran 16169
## 4 Italy 31506
## 5 Korea, South 8320
## 6 Spain 11748
## 7 US 6421
# Import the prepared cumulative case counts for the top 7 countries
confirmed_cases_top7_outside_china <- read_csv(
  file = "datasets/confirmed_cases_top7_outside_china.csv"
)
## Parsed with column specification:
## cols(
## country = col_character(),
## date = col_date(format = ""),
## cum_cases = col_double()
## )
# Inspect the column types and the first few values of each column
glimpse(confirmed_cases_top7_outside_china)
## Observations: 2,030
## Variables: 3
## $ country <chr> "Germany", "Iran", "Italy", "Korea, South", "Spain", "US"...
## $ date <date> 2020-02-18, 2020-02-18, 2020-02-18, 2020-02-18, 2020-02-...
## $ cum_cases <dbl> 16, 0, 3, 31, 2, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, ...
# Plot cumulative cases over time with one line per country, each country in
# its own color
confirmed_cases_top7_outside_china %>%
  ggplot() +
  geom_line(
    aes(x = date, y = cum_cases, group = country, color = country)
  ) +
  labs(y = "Cumulative confirmed cases")