link dataset :“https://github.com/CSSEGISandData/COVID-19”

1.Load R Packages:

library(magrittr)
library(lubridate)
library(tidyverse)
library(gridExtra)
library(kableExtra)

2.Loading dataset:

data.confirmed = read.csv("D://COVID-19-master/COVID-19-master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv")
data.deaths = read.csv("D://COVID-19-master/COVID-19-master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv")
data.recovered = read.csv("D://COVID-19-master/COVID-19-master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv")

data.confirmed[,1:10] %>% sample_n(10) %>%
  kable("html",booktabs = TRUE) %>%
  kable_styling(font_size=10)

Province.State	Country.Region	Lat	Long	X1.26.20	X1.27.20
Los Angeles, CA	US	34.0522	-118.2437	1	1
Middlesex County, MA	US	42.4672	-71.2874	0	0
	Brunei	4.5353	114.7277	0	0
	Algeria	28.0339	1.6596	0	0
Diamond Princess cruise ship	Others	35.4437	139.6380	0	0
	Norway	60.4720	8.4689	0	0
Montgomery County, MD	US	39.1547	-77.2405	0	0
Fayette County, KY	US	38.0606	-84.4803	0	0
Jefferson Parish, LA	US	29.6499	-90.1121	0	0
	Peru	-9.1900	-75.0152	0	0

3.Data Preparation:

3.1 Data Cleaning:

## data cleaning and transformation
cleanData <- function(data) {
## remove some columns
data %<>% select(-c(Province.State, Lat, Long)) %>% rename(country=Country.Region)
## convert from wide to long format
data %<>% gather(key=date, value=count, -country)
## convert from character to date
data %<>% mutate(date = date %>% substr(2,8) %>% mdy())
## aggregate by country
data %<>% group_by(country, date) %>% summarise(count=sum(count)) %>% as.data.frame()
return(data)
}
## clean the three datasets
data.confirmed %<>% cleanData() %>% rename(confirmed=count)
data.deaths %<>% cleanData() %>% rename(deaths=count)
data.recovered %<>% cleanData() %>% rename(recovered=count)
## merge above 3 datasets into one, by country and date
data <- data.confirmed %>% merge(data.deaths) %>% merge(data.recovered)
## first 10 records when it first broke out in China
data %>% filter(country=='Mainland China') %>% head(10) %>%
kable("html", booktabs=T, caption="Raw Data (with first 10 Columns Only)",
format.args=list(big.mark=","))

Raw Data (with first 10 Columns Only)
country	date	confirmed	deaths	recovered
Mainland China	2020-01-22	547	17	28
Mainland China	2020-01-23	639	18	30
Mainland China	2020-01-24	916	26	36
Mainland China	2020-01-25	1,399	42	39
Mainland China	2020-01-26	2,062	56	49
Mainland China	2020-01-27	2,863	82	58
Mainland China	2020-01-28	5,494	131	101
Mainland China	2020-01-29	6,070	133	120
Mainland China	2020-01-30	8,124	171	135
Mainland China	2020-01-31	9,783	213	214

3.2 Cases for the Whole World:

data.world = data %>% group_by(date) %>% 
  summarise(country ="World",
            confirmed = sum(confirmed),
            deaths = sum(deaths),
            recovered = sum(recovered))
data %<>% rbind(data.world)
## remaining confirmed cases
data %<>% mutate(remaining.confirmed = confirmed - deaths - recovered)

3.3 Daily Increased and Death Rates:

## sort by country and date
data %<>% arrange(country, date)
## daily increases of deaths and cured cases
## set NA to the increases on day1
n = nrow(data)
day1 <- min(data$date)
data %<>% mutate(confirmed.inc = ifelse(date == day1, NA, confirmed - lag(confirmed, n=1)),
deaths.inc = ifelse(date == day1, NA, deaths - lag(deaths, n=1)),
recovered.inc = ifelse(date == day1, NA, recovered - lag(recovered, n=1)))
## death rate based on total deaths and cured cases
data %<>% mutate(rate.upper = (100 * deaths / (deaths + recovered)) %>% round(1))
## lower bound: death rate based on total confirmed cases
data %<>% mutate(rate.lower = (100 * deaths / confirmed) %>% round(1))
## death rate based on the number of death/cured on every single day
data %<>% mutate(rate.daily = (100 * deaths.inc / (deaths.inc + recovered.inc)) %>% round(1))

4.Visualization:

4.1 Top 10 countries with most confirmed cases:

## ranking by confirmed cases
data.latest <- data %>% filter(date == max(date)) %>%
         select(country, date, confirmed, deaths, recovered, remaining.confirmed) %>%
          mutate(ranking = dense_rank(desc(confirmed)))
## top 10 countries: 12 incl. 'World' and 'Others'
top.countries <- data.latest %>% filter(ranking <= 12) %>%
arrange(ranking) %>% pull(country) %>% as.character()
## move 'Others' to the end
top.countries %<>% setdiff('Others') %>% c('Others')
top.countries

##  [1] "World"          "Mainland China" "Italy"          "South Korea"   
##  [5] "Iran"           "France"         "Germany"        "Spain"         
##  [9] "US"             "Japan"          "Switzerland"    "Others"

df <- data.latest %>% filter(!is.na(country) & country!='World') %>%  
     mutate(country=ifelse(ranking <= 12, as.character(country), 'Others')) %>%
     mutate(country=country %>% factor(levels=c(top.countries)))
df %<>% group_by(country) %>% summarise(confirmed=sum(confirmed))
## precentage and label
df %<>% mutate(per = (100*confirmed/sum(confirmed)) %>% round(1)) %>%
mutate(txt = paste0(country, ':', confirmed, '(', per, '%)'))
# pie(df$confirmed, labels=df$txt, cex=0.7)
df %>% ggplot(aes(fill=country)) +
geom_bar(aes(x='', y=per), stat='identity') +
coord_polar("y", start=0) +
xlab('') + ylab('Percentage (%)') +
labs(title="Top 10 Countries with Most Confirmed Cases") +
scale_fill_discrete(name='Country', labels=df$txt)

## convert from wide to long format, for purpose of drawing a area plot
data.long <- data %>% select(c(country, date, confirmed, remaining.confirmed, recovered, deaths)) %>%
gather(key=type, value=count, -c(country, date))
## set factor levels to show them in a desirable order
data.long %<>% mutate(type = factor(type, c('confirmed', 'remaining.confirmed', 'recovered', 'deaths')))
## cases by type
df <- data.long %>% filter(country %in% top.countries) %<>%
mutate(country=country %>% factor(levels=c(top.countries)))
df %>% filter(country != 'World') %>%
ggplot(aes(x=date, y=count, fill=country)) +
geom_area() + xlab('Date') + ylab('Count') +
labs(title='Cases around the World') +
theme(legend.title=element_blank()) +
facet_wrap(~type, ncol=2, scales='free_y')

## excluding Mainland China
df %>% filter(!(country %in% c('World', 'Mainland China'))) %>%
ggplot(aes(x=date, y=count, fill=country)) +
geom_area() + xlab('Date') + ylab('Count') +
labs(title='Cases around the World (excl. China)') +
theme(legend.title=element_blank()) +
facet_wrap(~type, ncol=2, scales='free_y')

## if Australia in not in top 10, add it in and remove 'Others'
if(!('Australia' %in% top.countries)) {
top.countries %<>% setdiff('Others') %>% c('Australia')
df <- data.long %>% filter(country %in% top.countries) %<>%
mutate(country=country %>% factor(levels=c(top.countries)))
}
## cases by country
df %>% filter(type != 'confirmed') %>%
ggplot(aes(x=date, y=count, fill=type)) +
geom_area(alpha=0.5) + xlab('Date') + ylab('Count') +
labs(title='COVID-19 Cases by Country') +
scale_fill_manual(values=c('red', 'green', 'black')) +
theme(legend.title=element_blank(), legend.position='bottom') +
facet_wrap(~country, ncol=3, scales='free_y')

4.2 Current (or Remaining) Confirmed Cases:

# data %<>% filter(country=='Mainland China')
# data %<>% filter(country=='Australia')
data %<>% filter(country=='World')
n <- nrow(data)
## current confirmed and its increase
plot1 <- ggplot(data, aes(x=date, y=remaining.confirmed)) +
geom_point() + geom_smooth() +
xlab('Date') + ylab('Count') + labs(title='Current Confirmed Cases')
plot2 <- ggplot(data, aes(x=date, y=confirmed.inc)) +
geom_point() + geom_smooth() +
xlab('Date') + ylab('Count') + labs(title='Increase in Current Confirmed')
# + ylim(0, 4500)
grid.arrange(plot1, plot2, ncol=2)

4.3 Deaths and Cured Cases:

## a scatter plot with a smoothed line and vertical x-axis labels
plot1 <- ggplot(data, aes(x=date, y=deaths)) +
geom_point() + geom_smooth() +
xlab('Date') + ylab('Count') + labs(title='Deaths')
plot2 <- ggplot(data, aes(x=date, y=recovered)) +
geom_point() + geom_smooth() +
xlab('Date') + ylab('Count') + labs(title='Recovered Cases')
plot3 <- ggplot(data, aes(x=date, y=deaths.inc)) +
geom_point() + geom_smooth() +
xlab('Date') + ylab('Count') + labs(title='Increase in Deaths')
plot4 <- ggplot(data, aes(x=date, y=recovered.inc)) +
geom_point() + geom_smooth() +
xlab('Date') + ylab('Count') + labs(title='Increase in Recovered Cases')
## show four plots together, with 2 plots in each row
grid.arrange(plot1, plot2, plot3, plot4, nrow=2)

4.4 Death Rates:

## three death rates
plot1 <- ggplot(data, aes(x=date)) +
geom_line(aes(y=rate.upper, colour='Upper bound')) +
geom_line(aes(y=rate.lower, colour='Lower bound')) +
geom_line(aes(y=rate.daily, colour='Daily')) +
  xlab('Date') + ylab('Death Rate (%)') + labs(title='Overall') +
theme(legend.position='bottom', legend.title=element_blank()) +
ylim(0, 90)
## focusing on last 2 weeks
plot2 <- ggplot(data[n-(14:0),], aes(x=date)) +
geom_line(aes(y=rate.upper, colour='Upper bound')) +
geom_line(aes(y=rate.lower, colour='Lower bound')) +
geom_line(aes(y=rate.daily, colour='Daily')) +
xlab('Date') + ylab('Death Rate (%)') + labs(title='Last two weeks') +
theme(legend.position='bottom', legend.title=element_blank()) +
ylim(0, 10)
grid.arrange(plot1, plot2, ncol=2)

Phân tích dữ liệu COVID-19

Đặng Khải Hoàn

3/10/2020

1.Load R Packages:

2.Loading dataset:

3.Data Preparation:

3.1 Data Cleaning:

3.2 Cases for the Whole World:

3.3 Daily Increased and Death Rates:

4.Visualization:

4.1 Top 10 countries with most confirmed cases:

4.2 Current (or Remaining) Confirmed Cases:

4.3 Deaths and Cured Cases:

4.4 Death Rates: