Covid project

# Load the readr, ggplot2, and dplyr packages
library(readr)

## Warning: package 'readr' was built under R version 3.6.3

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.6.3

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Import the worldwide cumulative case counts from CSV
confirmed_cases_worldwide <- read_csv(
  file = "datasets/confirmed_cases_worldwide.csv"
)

## Parsed with column specification:
## cols(
##   date = col_date(format = ""),
##   cum_cases = col_double()
## )

# Display the imported tibble (auto-printed as a top-level expression)
confirmed_cases_worldwide

## # A tibble: 56 x 2
##    date       cum_cases
##    <date>         <dbl>
##  1 2020-01-22       555
##  2 2020-01-23       653
##  3 2020-01-24       941
##  4 2020-01-25      1434
##  5 2020-01-26      2118
##  6 2020-01-27      2927
##  7 2020-01-28      5578
##  8 2020-01-29      6166
##  9 2020-01-30      8234
## 10 2020-01-31      9927
## # ... with 46 more rows

# Plot worldwide cumulative confirmed cases over time as a single line,
# with a descriptive y-axis title
ggplot(
  data = confirmed_cases_worldwide,
  mapping = aes(x = date, y = cum_cases)
) +
  geom_line() +
  labs(y = "Cumulative confirmed cases")

# Import the China vs. world case counts from CSV
confirmed_cases_china_vs_world <- read_csv(
  file = "datasets/confirmed_cases_china_vs_world.csv"
)

## Parsed with column specification:
## cols(
##   is_china = col_character(),
##   date = col_date(format = ""),
##   cases = col_double(),
##   cum_cases = col_double()
## )

# Inspect column types and the first few values of each column
confirmed_cases_china_vs_world %>%
  glimpse()

## Observations: 112
## Variables: 4
## $ is_china  <chr> "China", "China", "China", "China", "China", "China", "Ch...
## $ date      <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01-...
## $ cases     <dbl> 548, 95, 277, 486, 669, 802, 2632, 578, 2054, 1661, 2089,...
## $ cum_cases <dbl> 548, 643, 920, 1406, 2075, 2877, 5509, 6087, 8141, 9802, ...

# Build a line chart of cumulative cases over time with one line per
# is_china value; the aesthetics live on the line layer, not the base plot
plt_cum_confirmed_cases_china_vs_world <-
  ggplot(data = confirmed_cases_china_vs_world) +
  geom_line(
    mapping = aes(x = date, y = cum_cases, group = is_china, color = is_china)
  ) +
  labs(y = "Cumulative confirmed cases")

# Render the chart
plt_cum_confirmed_cases_china_vs_world

# Timeline events to annotate on the plot: one row per event, holding the
# event date and a two-line label
who_events <- tribble(
  ~date,        ~event,
  "2020-01-30", "Global health\nemergency declared",
  "2020-03-11", "Pandemic\ndeclared",
  "2020-02-13", "China reporting\nchange"
)

# Dates are entered as text above; convert the column to Date
who_events <- mutate(who_events, date = as.Date(date))

# Annotate the China-vs-world chart with the events in who_events:
# a dashed vertical line at each event date, plus the event label drawn
# at that date and at y = 100000
plt_cum_confirmed_cases_china_vs_world +
  geom_vline(
    data = who_events,
    mapping = aes(xintercept = date),
    linetype = "dashed"
  ) +
  geom_text(
    data = who_events,
    mapping = aes(x = date, label = event),
    y = 100000
  )

# Keep only the China rows dated 15 February 2020 or later
china_after_feb15 <- filter(
  confirmed_cases_china_vs_world,
  is_china == "China",
  date >= as.Date("2020-02-15")
)

# Plot China's cumulative cases from 15 Feb onward and overlay a
# straight-line (linear regression) fit without its confidence band
ggplot(data = china_after_feb15, mapping = aes(x = date, y = cum_cases)) +
  geom_line() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(y = "Cumulative confirmed cases")

## `geom_smooth()` using formula 'y ~ x'

# Keep every row whose is_china value is anything other than "China"
not_china <- filter(confirmed_cases_china_vs_world, is_china != "China")

# Plot cumulative cases outside China over time and overlay a linear
# regression trend line, without the confidence band
plt_not_china_trend_lin <- ggplot(
  data = not_china,
  mapping = aes(x = date, y = cum_cases)
) +
  geom_line() +
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned
  geom_smooth(method = "lm", se = FALSE) +
  ylab("Cumulative confirmed cases")

# See the result
plt_not_china_trend_lin

## `geom_smooth()` using formula 'y ~ x'

# Redraw the same plot with a base-10 logarithmic y-axis
plt_not_china_trend_lin + scale_y_log10()

## `geom_smooth()` using formula 'y ~ x'

# Import the per-country case counts from CSV
confirmed_cases_by_country <- read_csv(
  file = "datasets/confirmed_cases_by_country.csv"
)

## Parsed with column specification:
## cols(
##   country = col_character(),
##   province = col_character(),
##   date = col_date(format = ""),
##   cases = col_double(),
##   cum_cases = col_double()
## )

# Inspect column types and the first few values of each column
confirmed_cases_by_country %>%
  glimpse()

## Observations: 13,272
## Variables: 5
## $ country   <chr> "Afghanistan", "Albania", "Algeria", "Andorra", "Antigua ...
## $ province  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ date      <date> 2020-01-22, 2020-01-22, 2020-01-22, 2020-01-22, 2020-01-...
## $ cases     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ cum_cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...

# For each country take the largest cumulative count as its total, then
# keep the 7 countries with the highest totals (ties, if any, are included)
top_countries_by_total_cases <- confirmed_cases_by_country %>%
  group_by(country) %>%
  summarize(total_cases = max(cum_cases)) %>%
  # Rank explicitly by total_cases rather than letting top_n() fall back
  # to the last column (which also emits a "Selecting by ..." message)
  top_n(7, total_cases)

# See the result
top_countries_by_total_cases

## # A tibble: 7 x 2
##   country      total_cases
##   <chr>              <dbl>
## 1 France              7699
## 2 Germany             9257
## 3 Iran               16169
## 4 Italy              31506
## 5 Korea, South        8320
## 6 Spain              11748
## 7 US                  6421

# Import the cumulative case counts for the top 7 countries outside China
confirmed_cases_top7_outside_china <- read_csv(
  file = "datasets/confirmed_cases_top7_outside_china.csv"
)

## Parsed with column specification:
## cols(
##   country = col_character(),
##   date = col_date(format = ""),
##   cum_cases = col_double()
## )

# Inspect column types and the first few values of each column
confirmed_cases_top7_outside_china %>%
  glimpse()

## Observations: 2,030
## Variables: 3
## $ country   <chr> "Germany", "Iran", "Italy", "Korea, South", "Spain", "US"...
## $ date      <date> 2020-02-18, 2020-02-18, 2020-02-18, 2020-02-18, 2020-02-...
## $ cum_cases <dbl> 16, 0, 3, 31, 2, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, ...

# Plot cumulative cases over time for the top 7 countries, drawing one
# colored line per country
ggplot(data = confirmed_cases_top7_outside_china) +
  geom_line(
    mapping = aes(x = date, y = cum_cases, group = country, color = country)
  ) +
  labs(y = "Cumulative confirmed cases")

Covid project

Daniel Smith

5/9/2020