PART I

library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(readxl)
library(tree)
## Registered S3 method overwritten by 'tree':
##   method     from
##   print.tree cli
# Import the raw COVID-19 extracts. With `header = FALSE` the first line of
# each file is kept as a data row and the columns are named V1, V2, ...
covid_deaths <- read.csv(
  file = "time_series_covid_19_deaths.csv",
  header = FALSE
)
covid_deaths_US <- read.csv(
  file = "time_series_covid_19_deaths_US.csv",
  header = FALSE
)
covid_confirmed <- read.csv(
  file = "time_series_covid_19_confirmed.csv",
  header = FALSE
)
covid_confirmed_US <- read.csv(
  file = "time_series_covid_19_confirmed_US.csv",
  header = FALSE
)
covid_recovered <- read.csv(
  file = "time_series_covid_19_recovered.csv",
  header = FALSE
)
covid_worldwid_cases <- read.csv(
  file = "covid_19_data.csv",
  header = FALSE
)
# NOTE(review): this passes a ".numbers" path to read.csv(), and the read
# reports "embedded nulls" warnings, which suggests the file is not plain CSV
# text. Confirm whether a CSV export of the line list should be read instead.
covid_worldwid_cases_list <- read.csv(
  file = "COVID19_line_list_data.numbers",
  header = FALSE
)
## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## line 1 appears to contain embedded nulls
## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## line 2 appears to contain embedded nulls
## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## line 3 appears to contain embedded nulls
## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## line 4 appears to contain embedded nulls
## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## line 5 appears to contain embedded nulls
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## embedded nul(s) found in input

1, From epidemic to pandemic

# Preview the first six rows of the case table.
head(covid_worldwid_cases, n = 6L)
##    V1              V2             V3             V4              V5        V6
## 1 SNo ObservationDate Province/State Country/Region     Last Update Confirmed
## 2   1      01/22/2020          Anhui Mainland China 1/22/2020 17:00       1.0
## 3   2      01/22/2020        Beijing Mainland China 1/22/2020 17:00      14.0
## 4   3      01/22/2020      Chongqing Mainland China 1/22/2020 17:00       6.0
## 5   4      01/22/2020         Fujian Mainland China 1/22/2020 17:00       1.0
## 6   5      01/22/2020          Gansu Mainland China 1/22/2020 17:00       0.0
##       V7        V8
## 1 Deaths Recovered
## 2    0.0       0.0
## 3    0.0       0.0
## 4    0.0       0.0
## 5    0.0       0.0
## 6    0.0       0.0
# Re-read the case table from the Excel workbook; this replaces the earlier
# CSV import that was stored under the same name.
covid_worldwid_cases <- read_excel("covid_19_data.xlsx")

# Scatter every confirmed count against its observation date. Only x and y
# are mapped: this first view pools all countries, so no colour aesthetic is
# used (a bare, unnamed `color` inside aes() maps nothing).
ggplot(covid_worldwid_cases, aes(x = ObservationDate, y = Confirmed)) +
  geom_jitter()

2, Confirmed cases throughout the world

# Distribution of confirmed counts, one box per country.
ggplot(
  data = covid_worldwid_cases,
  mapping = aes(x = Country, y = Confirmed)
) +
  geom_boxplot()

3, China compared to the rest of the world

library(dplyr)

# Countries compared in this section.
target <- c("US", "UK", "Mainland China", "Brazil", "Russia", "Italy")
covid_country <- covid_worldwid_cases %>%
  filter(Country %in% target)

# Confirmed counts over time, one colour per country.
ggplot(
  data = covid_country,
  mapping = aes(x = ObservationDate, y = Confirmed, color = Country)
) +
  geom_jitter(stat = "identity")

# Same view restricted to mainland China, coloured by province.
target2 <- "Mainland China"
covid_country2 <- covid_worldwid_cases %>%
  filter(Country %in% target2)

ggplot(
  data = covid_country2,
  mapping = aes(x = ObservationDate, y = Confirmed, color = Province)
) +
  geom_jitter(stat = "identity")

4, Let’s annotate! The dataset used for the graphics above contains observations from January 22nd through May 11th. The plots for the US and China show multiple lines and trends because confirmed cases are recorded by state or province. The two most prominent lines, in pink (“US”) and light green (“Mainland China”), are New York State and Hubei, the province in which the city of Wuhan lies, respectively. Within China, both the number of confirmed cases and the pace of infection were extremely high in Hubei, growing exponentially from at least the end of January, as Wuhan was the source of the outbreak.

5, Adding a trend line to China

# Parse ObservationDate as a complete month/day/year date. A format without
# the year ("%m/%d") makes as.Date() fill in the current year, so the parsed
# dates would depend on when the script is run.
# NOTE(review): assumes ObservationDate is text such as "01/22/2020", as in
# the CSV preview — confirm for the xlsx import.
covid_worldwid_cases$ObservationDate <- as.Date(
  covid_worldwid_cases$ObservationDate,
  format = "%m/%d/%Y"
)

# Rebuild the mainland-China subset so it picks up the parsed dates.
target2 <- c("Mainland China")
covid_country2 <- filter(covid_worldwid_cases, Country %in% target2)

# Confirmed counts per province with one linear trend line per province.
ggplot(
  covid_country2,
  aes(x = ObservationDate, y = Confirmed, color = Province)
) +
  geom_point() +
  geom_smooth(aes(group = Province), method = "lm")

6, And the rest of the world?

# Rebuild the selected-country subset from the current case table.
target <- c("US", "UK", "Mainland China", "Brazil", "Russia", "Italy")
covid_country <- filter(covid_worldwid_cases, Country %in% target)

# Confirmed counts per country with one linear trend line per country.
# The smoother is grouped by Country to match the colour mapping; grouping by
# Province instead fits a separate model to every province, and provinces with
# too few observations leave lm with no residual degrees of freedom, which
# makes the confidence band computation return NaN.
ggplot(
  covid_country,
  aes(x = ObservationDate, y = Confirmed, color = Country)
) +
  geom_jitter() +
  geom_smooth(aes(group = Country), method = "lm")
## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in qt((1 - level)/2, df): NaNs produced

7, Adding a logarithmic scale

# Same country comparison on a log10 y-axis.
# NOTE: rows with Confirmed equal to 0 become -Inf under log10, which is what
# triggers ggplot2's "infinite values" warning for this plot.
ggplot(
  data = covid_country,
  mapping = aes(x = ObservationDate, y = Confirmed, color = Country)
) +
  geom_jitter(stat = "identity") +
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis

8, Which countries outside of China have been hit hardest? The number of cases recorded in other parts of China was, in turn, significantly lower than in Hubei. We can assume that this is because Hubei province is geographically large, so its residents did not interact closely with people outside of Hubei. Comparing other nations with China as a whole, approximately 40 days after the first confirmed case was reported we see confirmed cases increasing in Italy, the United States, and the United Kingdom. After that, confirmed cases in those nations grow exponentially. As of May 11th, the United States has the highest number of confirmed cases in the world.

9, Plotting hardest hit countries as of Mid-March 2020

# Selected countries, restricted on the x-axis to 15 March - 11 May 2020.
target <- c("US", "UK", "Mainland China", "Brazil", "Russia", "Italy")
covid_country <- covid_worldwid_cases %>%
  filter(Country %in% target)

# Points outside the date limits are dropped by the scale, so ggplot2 reports
# them as removed rows.
ggplot(
  data = covid_country,
  mapping = aes(x = ObservationDate, y = Confirmed, color = Country)
) +
  geom_jitter(stat = "identity") +
  scale_x_date(limits = as.Date(c("2020-03-15", "2020-05-11")))
## Warning: Removed 3076 rows containing missing values (geom_point).