library(tidyverse)
library(tidylog)
library(dplyr)
library(dslabs)
library(ggplot2)
library(plotly)
# Load the case data from processed.csv.
d <- read_csv("processed.csv")

# Growth rate of confirmed cases within each country, from one row to the
# next in date order: (present / past) - 1.
# The result is printed only; it is not stored back into `d`.
d %>%
  arrange(date) %>%
  group_by(country_region) %>%
  filter(confirmed_num > 0) %>%
  mutate(growth_confirmed = (confirmed_num / lag(confirmed_num)) - 1)
## # A tibble: 8,903 x 6
## # Groups: country_region [185]
## date country_region deaths_num confirmed_num recovered_num
## <date> <chr> <dbl> <dbl> <dbl>
## 1 2020-01-22 China 17 548 28
## 2 2020-01-22 Japan 0 2 0
## 3 2020-01-22 Korea, South 0 1 0
## 4 2020-01-22 Taiwan* 0 1 0
## 5 2020-01-22 Thailand 0 2 0
## 6 2020-01-22 US 0 1 0
## 7 2020-01-23 China 18 643 30
## 8 2020-01-23 Japan 0 2 0
## 9 2020-01-23 Korea, South 0 1 0
## 10 2020-01-23 Singapore 0 1 0
## # … with 8,893 more rows, and 1 more variable: growth_confirmed <dbl>
# Countries compared in the Gulf section.
g_countries <- c(
  "Saudi Arabia", "United Arab Emirates", "Bahrain",
  "Qatar", "Kuwait", "Oman"
)

# Index of the most recent date in the data; d$date[i_max] is reused by the
# later sections to select the latest snapshot.
i_max <- which.max(d$date)
d$date[i_max]
## [1] "2020-04-20"

# Latest-date rows for the Gulf countries only.
g_gulf <- d %>%
  filter(
    date == d$date[i_max],
    country_region %in% g_countries
  )
Saudi Arabia has the highest number of deaths among the Gulf countries, but the number is reasonable given the kingdom's population.
# Bar chart of deaths per Gulf country on the latest date, tallest bar first.
ggplot(
  g_gulf,
  aes(x = reorder(country_region, -deaths_num), y = deaths_num)
) +
  geom_col() +
  labs(
    x = "",
    y = "number of death cases",
    title = "Saudi Arabia has the highest death cases among gulf countries"
  )
The US has the highest number of confirmed cases.
# Names of the ten countries with the most confirmed cases on the latest date.
# The rows are sorted directly; reshaping to long format first would only
# repeat each country and force unique() to undo the duplication.
highest_confirmed_countries <-
  d %>%
  filter(date == d$date[i_max]) %>%
  arrange(desc(confirmed_num)) %>%
  pull(country_region) %>%
  unique() %>%
  head(10)
highest_confirmed_countries
## [1] "US" "Spain" "Italy" "France"
## [5] "Germany" "United Kingdom" "Turkey" "China"
## [9] "Iran" "Russia"
# Bar chart of confirmed cases on the latest date for the ten countries
# selected above, tallest bar first.
# The data is plotted as filtered: pivoting to long format beforehand would
# repeat every country row and draw identical bars on top of each other.
# NOTE: this reassigns highest_confirmed_countries from the character vector
# of country names to the plot object.
highest_confirmed_countries <-
  d %>%
  filter(
    date == d$date[i_max],
    country_region %in% highest_confirmed_countries
  ) %>%
  ggplot(aes(x = reorder(country_region, -confirmed_num), y = confirmed_num)) +
  geom_col(position = "dodge") +
  labs(
    x = "",
    y = "confirmed_num",
    title = " US has highest confirmed numbers"
  )
highest_confirmed_countries
# Names of the ten countries with the most deaths on the latest date.
# The rows are sorted directly; reshaping to long format first would only
# repeat each country and force unique() to undo the duplication.
highest_deaths_countries <-
  d %>%
  filter(date == d$date[i_max]) %>%
  arrange(desc(deaths_num)) %>%
  pull(country_region) %>%
  unique() %>%
  head(10)
highest_deaths_countries
## [1] "US" "Italy" "Spain" "France"
## [5] "United Kingdom" "Belgium" "Iran" "Germany"
## [9] "China" "Netherlands"
The US has the highest number of deaths.
# Bar chart of deaths on the latest date for the ten countries selected
# above, tallest bar first.
# The data is plotted as filtered: pivoting to long format beforehand would
# repeat every country row and draw identical bars on top of each other.
# NOTE: this reassigns highest_deaths_countries from the character vector of
# country names to the plot object.
highest_deaths_countries <-
  d %>%
  filter(
    date == d$date[i_max],
    country_region %in% highest_deaths_countries
  ) %>%
  ggplot(aes(x = reorder(country_region, -deaths_num), y = deaths_num)) +
  geom_col(position = "dodge") +
  labs(
    x = "",
    y = "deaths_num",
    title = " US has highest death numbers"
  )
highest_deaths_countries
# Names of the ten countries with the most recovered cases on the latest date.
# The rows are sorted directly; reshaping to long format first would only
# repeat each country and force unique() to undo the duplication.
highest_recovered_countries <-
  d %>%
  filter(date == d$date[i_max]) %>%
  arrange(desc(recovered_num)) %>%
  pull(country_region) %>%
  unique() %>%
  head(10)
highest_recovered_countries
## [1] "Germany" "Spain" "China" "US" "Iran"
## [6] "Italy" "France" "Brazil" "Switzerland" "Turkey"
Germany has the highest number of recovered cases.
# Bar chart of recovered cases on the latest date for the ten countries
# selected above, tallest bar first.
# The data is plotted as filtered: pivoting to long format beforehand would
# repeat every country row and draw identical bars on top of each other.
# NOTE: this reassigns highest_recovered_countries from the character vector
# of country names to the plot object.
highest_recovered_countries <-
  d %>%
  filter(
    date == d$date[i_max],
    country_region %in% highest_recovered_countries
  ) %>%
  ggplot(aes(x = reorder(country_region, -recovered_num), y = recovered_num)) +
  geom_col(position = "dodge") +
  labs(
    x = "",
    y = "recovered_num",
    title = " Germany has highest recovery numbers"
  )
highest_recovered_countries
# Ten rows with the most deaths on the latest date, keeping only the date,
# country and deaths columns.
deaths_top_countries <-
  d %>%
  filter(date == d$date[i_max]) %>%
  select(-confirmed_num, -recovered_num) %>%
  arrange(desc(deaths_num)) %>%
  head(10)

# Saudi Arabia's row for the same date, with the same columns.
Saudi <-
  d %>%
  filter(date == d$date[i_max], country_region == "Saudi Arabia") %>%
  select(-confirmed_num, -recovered_num)

# Append Saudi Arabia so it can be compared against the top ten.
deaths_top_countries <-
  deaths_top_countries %>%
  rbind(Saudi)

# Horizontal bar chart of deaths per country.
# coord_flip() only changes where each axis is drawn; labs() still names the
# aesthetics, so `x` is the country axis and `y` is the deaths axis. The
# deaths label therefore belongs on `y`.
deaths_top_countries %>%
  ggplot(aes(x = reorder(country_region, deaths_num), y = deaths_num)) +
  geom_col() +
  # Draws Saudi Arabia's bar a second time on top of the first; no distinct
  # fill is set here, so it looks the same as the other bars.
  geom_col(data = Saudi) +
  coord_flip() +
  scale_y_continuous(labels = scales::comma) +
  labs(
    x = "",
    y = "number of death cases",
    title = "Saudi Arabia has the lowest number of death cases compared with other countries"
  )
Among countries with a similar number of confirmed cases (between 10,000 and 10,700), Saudi Arabia has the third highest number of confirmed cases.
# Countries whose confirmed count on the latest date lies strictly between
# 10,000 and 10,700.
similar_countries <- d %>%
  filter(
    date == d$date[i_max],
    confirmed_num < 10700,
    confirmed_num > 10000
  ) %>%
  pull(country_region)

# Confirmed cases over time for those countries, one colour per country.
# The name is reused: from here on similar_countries holds the plot object.
similar_countries <-
  ggplot(
    filter(d, country_region %in% similar_countries),
    aes(x = date, y = confirmed_num, colour = country_region)
  ) +
  geom_point()

# Render the plot built above as an interactive plotly widget.
ggplotly(similar_countries)
Among countries with a similar number of deaths (between 100 and 130), Saudi Arabia has the second highest number of deaths.
# Horizontal bar chart of deaths for countries whose death count on the
# latest date lies strictly between 100 and 130.
d %>%
  filter(
    date == d$date[i_max],
    deaths_num < 130,
    deaths_num > 100
  ) %>%
  arrange(desc(confirmed_num)) %>%
  select(-c(recovered_num, confirmed_num)) %>%
  # Long format so the remaining measure column becomes the fill legend.
  pivot_longer(-c(date, country_region), names_to = "case_type") %>%
  ggplot(
    aes(x = reorder(country_region, value), y = value, fill = case_type)
  ) +
  geom_col(position = "dodge") +
  coord_flip()
Among countries with a similar number of recovered cases (between 1,480 and 2,000), Saudi Arabia has the fourth highest number of recovered cases.
# Horizontal bar chart of recovered cases for countries whose recovered count
# on the latest date lies strictly between 1,480 and 2,000.
d %>%
  filter(
    date == d$date[i_max],
    recovered_num < 2000,
    recovered_num > 1480
  ) %>%
  arrange(desc(confirmed_num)) %>%
  select(-c(deaths_num, confirmed_num)) %>%
  # Long format so the remaining measure column becomes the fill legend.
  pivot_longer(-c(date, country_region), names_to = "case_type") %>%
  ggplot(
    aes(x = reorder(country_region, value), y = value, fill = case_type)
  ) +
  geom_col(position = "dodge") +
  coord_flip()
# Confirmed-case trajectories for every country, keeping only rows with at
# least 5 confirmed cases. day_num numbers each country's remaining rows in
# date order, so curves are aligned on when the country reached 5 cases.
d_similar <-
  d %>%
  filter(confirmed_num >= 5) %>%
  arrange(date) %>%
  group_by(country_region) %>%
  mutate(day_num = row_number()) %>%
  select(-deaths_num, -recovered_num)

# One line per country, labelled with the country name at its latest-date
# point. The x label states the actual alignment threshold used above.
d_similar %>%
  ggplot(aes(x = day_num, y = confirmed_num, col = country_region)) +
  geom_line() +
  geom_text(
    data = d_similar %>% filter(date == d$date[i_max]),
    aes(label = country_region)
  ) +
  labs(
    x = "days since reaching 5 confirmed cases"
  ) +
  theme(
    legend.position = "none"
  )
# Interactive version of the plot just drawn.
ggplotly()
# Confirmed-case trajectories restricted to rows with at least 60,000
# confirmed cases (the "100" in the variable name does not match the
# threshold; the name is kept as-is). day_num numbers each country's
# remaining rows in date order.
d_all_100 <-
  d %>%
  filter(confirmed_num >= 60000) %>%
  arrange(date) %>%
  group_by(country_region) %>%
  mutate(day_num = row_number()) %>%
  select(-deaths_num, -recovered_num)

# One line per country, labelled with the country name at its latest-date
# point. The x label states the actual alignment threshold used above.
d_all_100 %>%
  ggplot(aes(x = day_num, y = confirmed_num, col = country_region)) +
  geom_line() +
  geom_text(
    data = d_all_100 %>% filter(date == d$date[i_max]),
    aes(label = country_region)
  ) +
  labs(
    x = "days since reaching 60,000 confirmed cases"
  ) +
  theme(
    legend.position = "none"
  )
# Interactive version of the plot just drawn.
ggplotly()
# Confirmed-case trajectories restricted to rows with at least 100,000
# confirmed cases (the "1400" in the variable name does not match the
# threshold; the name is kept as-is). day_num numbers each country's
# remaining rows in date order.
d_all_1400 <-
  d %>%
  filter(confirmed_num >= 100000) %>%
  arrange(date) %>%
  group_by(country_region) %>%
  mutate(day_num = row_number()) %>%
  select(-deaths_num, -recovered_num)

# One line per country, labelled with the country name at its latest-date
# point. The x label states the actual alignment threshold used above.
d_all_1400 %>%
  ggplot(aes(x = day_num, y = confirmed_num, col = country_region)) +
  geom_line() +
  geom_text(
    data = d_all_1400 %>% filter(date == d$date[i_max]),
    aes(label = country_region)
  ) +
  labs(
    x = "days since reaching 100,000 confirmed cases"
  ) +
  theme(
    legend.position = "none"
  )
# Interactive version of the plot just drawn.
ggplotly()