# This project is based on the Visualizing COVID-19 project available at learn.datacamp.com.

# Attach the packages this script relies on. dplyr supplies glimpse(), `%>%`,
# filter(), mutate(), group_by(), summarize() and the re-exported tribble();
# ggplot2 supplies every plotting function used below.
library(dplyr)
library(ggplot2)

# Read the worldwide cumulative case counts used for the first line plot.
# The two columns are parsed, in file order, as Date and integer.
confirmed_cases_worldwide <- read.csv(
  "datasets/confirmed_cases_worldwide.csv",
  colClasses = c("Date", "integer")
)

# Show the result
glimpse(confirmed_cases_worldwide)
## Rows: 56
## Columns: 2
## $ date      <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01-...
## $ cum_cases <int> 555, 653, 941, 1434, 2118, 2927, 5578, 6166, 8234, 9927, ...
# Line plot of the cumulative number of cases confirmed globally over time.
# The date column is passed through as.Date() before it is plotted.
confirmed_cases_worldwide$date <- as.Date(confirmed_cases_worldwide$date)

g1 <- ggplot(confirmed_cases_worldwide, aes(date, cum_cases, group = 1)) +
  geom_line() +
  labs(
    title = "Cumulative Number of Cases Confirmed Globally over Time",
    x = "",
    y = "Number of Cases"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) # centre the plot title

g1

# Load the China vs. rest-of-world case counts for the comparison plot.
# Columns are parsed, in file order, as factor, Date, integer, integer.
confirmed_cases_china_vs_world <- read.csv(
  file = "datasets/confirmed_cases_china_vs_world.csv",
  colClasses = c("factor", "Date", rep("integer", 2))
)

# Inspect the structure of the loaded data
glimpse(confirmed_cases_china_vs_world)
## Rows: 112
## Columns: 4
## $ is_china  <fct> China, China, China, China, China, China, China, China, C...
## $ date      <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01-...
## $ cases     <int> 548, 95, 277, 486, 669, 802, 2632, 578, 2054, 1661, 2089,...
## $ cum_cases <int> 548, 643, 920, 1406, 2075, 2877, 5509, 6087, 8141, 9802, ...
# Compare cumulative cases in China with cases elsewhere: one coloured line
# per level of is_china, with the legend title suppressed.
g2 <- ggplot(confirmed_cases_china_vs_world) +
  geom_line(
    aes(x = date, y = cum_cases, group = is_china, colour = is_china)
  ) +
  labs(
    title = "Cumulative Number of Confirmed Cases over Time",
    x = "",
    y = "Cumulative Confirmed Cases"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.title = element_blank()
  )

g2

# WHO milestone events (code provided by DataCamp). The dates are entered as
# strings and then converted to Date.
who_events <- mutate(
  tribble(
    ~date, ~event,
    "2020-01-30", "Global health\nemergency declared",
    "2020-03-11", "Pandemic\ndeclared",
    "2020-02-13", "China reporting\nchange"
  ),
  date = as.Date(date)
)
# Overlay the WHO events on the China vs. elsewhere plot: a dashed vertical
# line at each event date plus its label, rotated 90 degrees and placed at
# y = 100000.
g2 +
  geom_vline(
    mapping = aes(xintercept = date),
    data = who_events,
    linetype = "dashed"
  ) +
  geom_text(
    mapping = aes(x = date, label = event),
    data = who_events,
    y = 100000,
    angle = 90,
    size = 3.5
  )

# China data after the reporting change: keep only the rows for China dated
# on or after 15 February 2020. The cutoff is built with as.Date() so the
# comparison is Date-to-Date instead of relying on implicit coercion of a
# character literal.
china_after_feb15 <- confirmed_cases_china_vs_world %>%
  filter(is_china == "China", date >= as.Date("2020-02-15"))
# Plot China's cumulative cases from Feb. 15 onward and overlay a GAM smooth
# drawn without a confidence band.
g3 <- ggplot(china_after_feb15, aes(date, cum_cases)) +
  geom_line() +
  geom_smooth(se = FALSE, method = "gam") +
  labs(
    title = "Cumulative Number of Cases Confirmed in China after Feb. 15",
    y = "Cumulative Confirmed Cases",
    x = "Date"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

g3
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'

# Keep only the rows for countries other than China
not_china <- filter(confirmed_cases_china_vs_world, is_china == "Not China")
# Base line plot of cumulative cases in countries other than China
g4 <- ggplot(not_china, aes(date, cum_cases)) +
  geom_line() +
  labs(
    title = "Cumulative Number of Cases Confirmed in Countries Other Than China",
    y = "Cumulative Confirmed Cases",
    x = "Date"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

# Overlay a GAM smooth, drawn without a confidence band
g4 + geom_smooth(se = FALSE, method = "gam")
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'

# Show cases outside China on a log10 y axis and overlay a linear-model fit
# (no confidence band), relabelling the y axis accordingly.
g4 +
  scale_y_log10() +
  geom_smooth(se = FALSE, method = "lm") +
  labs(y = "log Cumulative Confirmed Cases")
## `geom_smooth()` using formula 'y ~ x'

# Load the per-country case data. Columns are parsed, in file order, as
# factor, factor, Date, integer, integer.
confirmed_cases_by_country <- read.csv(
  file = "datasets/confirmed_cases_by_country.csv",
  colClasses = c(rep("factor", 2), "Date", rep("integer", 2))
)
glimpse(confirmed_cases_by_country)
## Rows: 13,272
## Columns: 5
## $ country   <fct> Afghanistan, Albania, Algeria, Andorra, Antigua and Barbu...
## $ province  <fct> , , , , , , , , , , , , , , , , , , , , , , , , , 
## $ date      <date> 2020-01-22, 2020-01-22, 2020-01-22, 2020-01-22, 2020-01-...
## $ cases     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ cum_cases <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
# Total the case counts per country, keep the seven countries with the
# largest totals, and sort them from most to fewest cases.
top_countries_by_total_cases <- confirmed_cases_by_country %>%
  group_by(country) %>%
  summarize(total_cases = sum(cases)) %>%
  # Rank explicitly by total_cases rather than letting top_n() fall back to
  # the last column of the table; an explicit `wt` also means top_n() no
  # longer prints its "Selecting by total_cases" message.
  top_n(n = 7, wt = total_cases) %>%
  arrange(desc(total_cases))
## `summarise()` ungrouping output (override with `.groups` argument)
# See the result
top_countries_by_total_cases
## # A tibble: 7 x 2
##   country      total_cases
##   <fct>              <int>
## 1 Italy              31506
## 2 Iran               16169
## 3 Spain              11748
## 4 Germany             9257
## 5 Korea, South        8320
## 6 France              7699
## 7 US                  6421
# Bar chart of the top 7 countries by number of cases as of March 17, 2020.
# Bars are ordered from the largest total to the smallest and filled by
# country; the legend title is suppressed.
g5 <- ggplot(
  top_countries_by_total_cases,
  aes(reorder(country, -total_cases), total_cases, fill = country)
) +
  geom_col() +
  labs(
    title = "Cumulative Number of Confirmed Cases by Country (3/17/2020)",
    x = "",
    y = "Cumulative Confirmed Cases"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.title = element_blank()
  )

g5

# Load the cumulative case counts for the top 7 countries outside China.
# Columns are parsed, in file order, as factor, Date, integer.
confirmed_cases_top7_outside_china <- read.csv(
  file = "datasets/confirmed_cases_top7_outside_china.csv",
  colClasses = c("factor", "Date", "integer")
)

glimpse(confirmed_cases_top7_outside_china)
## Rows: 2,030
## Columns: 3
## $ country   <fct> "Germany", "Iran", "Italy", "Korea, South", "Spain", "US"...
## $ date      <date> 2020-02-18, 2020-02-18, 2020-02-18, 2020-02-18, 2020-02-...
## $ cum_cases <int> 16, 0, 3, 31, 2, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, ...
# Line plot of cum_cases vs. date from confirmed_cases_top7_outside_china,
# with one coloured line per country and no legend title.
g6 <- ggplot(confirmed_cases_top7_outside_china) +
  geom_line(
    aes(x = date, y = cum_cases, group = country, colour = country)
  ) +
  labs(
    title = "Number of Confirmed Cases by Country (Top 7)",
    x = "",
    y = "Cumulative Confirmed Cases"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.title = element_blank()
  )

g6