In this post, I would like to share the results of my analysis on COVID19 data obtanined from https://ourworldindata.org. Luckily, the dataset contains some additional relevant information such as diabetes prevalence and GDP per capita, that of course makes it fun to perform exploration.
library(tidyverse)
## -- Attaching packages ------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.1 v purrr 0.3.4
## v tibble 3.0.0 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggtext)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
covid2019 <- read.csv(file.choose(), header = T)
dim(covid2019)
## [1] 22242 33
I am not gonna perform time-series analalysis. Thus, in this example, the analysis is limited to the data in a single day to see total cases, total data, and other relevant data. I choose for the most recent updated data which is 04-06-2020.
Filter some variables relevant for your desired analysis. For my case, I would like to explore diabetes prevalence, hospital beds availability, covid death rate, and smokers. Thus, I excluded countries that have no data on such variables.
health.data <- covid2019 %>%
filter(date == "2020-06-04") %>%
filter(location != "World") %>%
filter(!is.na(diabetes_prevalence)) %>%
filter(!is.na(hospital_beds_per_thousand)) %>%
filter(!is.na(cvd_death_rate)) %>%
filter(!is.na(female_smokers)) %>%
filter(!is.na(male_smokers))
glimpse(health.data)
## Rows: 128
## Columns: 33
## $ iso_code <chr> "ALB", "DZA", "ARG", "ARM", "AUS", ...
## $ continent <chr> "Europe", "Africa", "South America"...
## $ location <chr> "Albania", "Algeria", "Argentina", ...
## $ date <chr> "2020-06-04", "2020-06-04", "2020-0...
## $ total_cases <int> 1184, 9626, 19255, 10524, 7229, 167...
## $ new_cases <int> 20, 0, 949, 515, 8, 31, 325, 0, 504...
## $ total_deaths <int> 33, 667, 583, 170, 102, 670, 76, 11...
## $ new_deaths <int> 0, 0, 14, 12, 0, 1, 5, 0, 1, 37, 0,...
## $ total_cases_per_million <dbl> 411.425, 219.516, 426.035, 3551.525...
## $ new_cases_per_million <dbl> 6.950, 0.000, 20.998, 173.797, 0.31...
## $ total_deaths_per_million <dbl> 11.467, 15.211, 12.899, 57.370, 4.0...
## $ new_deaths_per_million <dbl> 0.000, 0.000, 0.310, 4.050, 0.000, ...
## $ total_tests <dbl> NA, NA, 178448, NA, 1546329, 471466...
## $ new_tests <dbl> NA, NA, 5501, NA, 33629, 8508, NA, ...
## $ total_tests_per_thousand <dbl> NA, NA, 3.948, NA, 60.641, 52.348, ...
## $ new_tests_per_thousand <dbl> NA, NA, 0.122, NA, 1.319, 0.945, NA...
## $ new_tests_smoothed <dbl> NA, NA, 4767, NA, 25555, 6299, NA, ...
## $ new_tests_smoothed_per_thousand <dbl> NA, NA, 0.105, NA, 1.002, 0.699, NA...
## $ tests_units <chr> "", "", "tests performed", "", "tes...
## $ stringency_index <dbl> 67.59, NA, NA, NA, NA, NA, 85.19, N...
## $ population <dbl> 2877800, 43851043, 45195777, 296323...
## $ population_density <dbl> 104.871, 17.348, 16.177, 102.931, 3...
## $ median_age <dbl> 38.0, 29.1, 31.9, 35.7, 37.9, 44.4,...
## $ aged_65_older <dbl> 13.188, 6.211, 11.198, 11.232, 15.5...
## $ aged_70_older <dbl> 8.643, 3.857, 7.441, 7.571, 10.129,...
## $ gdp_per_capita <dbl> 11803.431, 13913.839, 18933.907, 87...
## $ extreme_poverty <dbl> 1.1, 0.5, 0.6, 1.8, 0.5, 0.7, NA, N...
## $ cvd_death_rate <dbl> 304.195, 278.364, 191.032, 341.010,...
## $ diabetes_prevalence <dbl> 10.08, 6.73, 5.50, 7.11, 5.07, 6.35...
## $ female_smokers <dbl> 7.1, 0.7, 16.2, 1.5, 13.0, 28.4, 0....
## $ male_smokers <dbl> 51.2, 30.4, 27.7, 52.1, 16.5, 30.9,...
## $ handwashing_facilities <dbl> NA, 83.741, NA, 94.043, NA, NA, 83....
## $ hospital_beds_per_thousand <dbl> 2.890, 1.900, 5.000, 4.200, 3.840, ...
cor1 <- ggplot(health.data, aes(x = total_deaths, y = median_age)) +
geom_smooth (method = lm) + geom_point() +
scale_x_continuous(breaks= pretty_breaks()) +
theme(axis.text = element_text(size = 10),
axis.title = element_markdown(size = 10),
axis.text.x = element_markdown(size = 10),
axis.text.y = element_markdown(size = 10),
panel.grid = element_blank(),
panel.background = element_blank(),
axis.line = element_line(color = "grey"),
plot.title = element_markdown(size = 10),
legend.position = "none")
ggplotly(cor1)
## `geom_smooth()` using formula 'y ~ x'
# Correlation Coefficient
cor_cof1 <- cor.test(health.data$total_deaths, health.data$median_age, method = "pearson")
print(cor_cof1)
##
## Pearson's product-moment correlation
##
## data: health.data$total_deaths and health.data$median_age
## t = 2.1927, df = 126, p-value = 0.03016
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.01881559 0.35349027
## sample estimates:
## cor
## 0.1917201
cor2<- ggplot(health.data, aes(x = female_smokers, y = gdp_per_capita)) +
geom_smooth (method = lm, se = F, color = "black") + geom_point() +
scale_x_continuous(breaks= pretty_breaks()) +
theme(axis.text = element_text(size = 10),
axis.title = element_markdown(size = 10),
axis.text.x = element_markdown(size = 10),
axis.text.y = element_markdown(size = 10),
panel.grid = element_blank(),
panel.background = element_blank(),
axis.line = element_line(color = "grey"),
plot.title = element_markdown(size = 10),
legend.position = "none")
ggplotly(cor2)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
# Let's inspect correlation coefficient and statistical significance
cor_cof2 <- cor.test(health.data$gdp_per_capita, health.data$female_smokers, method = "pearson")
print(cor_cof2)
##
## Pearson's product-moment correlation
##
## data: health.data$gdp_per_capita and health.data$female_smokers
## t = 4.0666, df = 125, p-value = 8.383e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1782194 0.4870328
## sample estimates:
## cor
## 0.341821
cor3 <- ggplot(health.data, aes(x = diabetes_prevalence, y = gdp_per_capita)) +
geom_smooth (method = lm, se = F, color = "black") + geom_point() +
scale_x_continuous(breaks= pretty_breaks()) +
theme(axis.text = element_text(size = 10),
axis.title = element_markdown(size = 10),
axis.text.x = element_markdown(size = 10),
axis.text.y = element_markdown(size = 10),
panel.grid = element_blank(),
panel.background = element_blank(),
axis.line = element_line(color = "grey"),
plot.title = element_markdown(size = 10),
legend.position = "none")
ggplotly(cor3)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
#Checking correlation coefficient and p-value
cor_cof3 <- cor.test(health.data$gdp_per_capita, health.data$female_smokers, method = "pearson")
print(cor_cof3)
##
## Pearson's product-moment correlation
##
## data: health.data$gdp_per_capita and health.data$female_smokers
## t = 4.0666, df = 125, p-value = 8.383e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1782194 0.4870328
## sample estimates:
## cor
## 0.341821
# filtering Asia and arrange according descending total cases
topasia <- health.data %>%
arrange(desc(total_cases)) %>%
filter(continent == "Asia")
# selecting Top 20 Countries
top20.asia <- topasia[1:20,]
# I tried to play around with ggplot color. Here I customize palette color that refers to theme provided by <https://wondernote.org>. You can also copy this code if you want to explore.
# Pastel color palettes
pastel <- c("#ABDEE6", "#CBAACB", "#FFFFB5", "#FFCCB6", "#F3B0C3",
"#C6DBDA", "#FEE1E8", "#FED7C3", "#F6EAC2", "#ECD5E3", "#FF968A", "#FFAEA5",
"#FFC5BF", "#FFD8BE", "#D4F0F0", "#8FCACA", "#CCE2CB", "#B6CFB6", "#97C1A9",
"#FCB9AA", "#FFDBCC", "#ECEAE4", "#A2E1DB", "#55CBCD")
#Gemstone color palettes
gemstone <- c("#53051D", "#9E1C5C", "#EF70AA", "#FF8C94", "#F12761",
"#00C6C7", "#96E5E2", "#00ACA5", "#006F60", "#005245",
"#EEAC4D", "#FFF2C3", "#EE84B3", "#740E4C", "#E2038D",
"#B57E79", "#FF6F68", "#00E937", "#006072", "#5C2A2E",
"#20503E", "#187B30", "#75E0B0", "#2D8498", "#4CB0A6")
# Plotting total death ranks
death_ranks_asia <- ggplot(top20.asia, aes(x = total_deaths, y = reorder(location, total_deaths))) +
geom_col(aes(fill = location)) +
scale_fill_manual(values = pastel) +
theme(axis.text = element_text(size = 10),
axis.title = element_markdown(size = 10),
axis.text.x = element_markdown(size = 10),
axis.text.y = element_markdown(size = 10),
panel.grid = element_blank(),
panel.background = element_blank(),
axis.line = element_line(color = "grey"),
plot.title = element_markdown(size = 10),
legend.position = "none") +
labs(title="Top 20 total deaths of COVID 19 in Asia",
source="ourworld in data",
x="Total Deaths",
y="Countries")
ggplotly(death_ranks_asia)
# Plotting total case ranks
case.rank.asia <- ggplot(top20.asia, aes(x = total_cases, y = reorder(location, total_cases))) +
geom_col(aes(fill = location)) +
scale_fill_manual(values = gemstone) +
theme(axis.text = element_text(size = 10),
axis.title = element_markdown(size = 10),
axis.text.x = element_markdown(size = 10),
axis.text.y = element_markdown(size = 10),
panel.grid = element_blank(),
panel.background = element_blank(),
axis.line = element_line(color = "grey"),
plot.title = element_markdown(size = 10),
legend.position = "none") +
labs(title="Top 20 Rank of total cases of COVID 19 in Asia",
source="ourworld in data",
x="Total Cases",
y="Countries")
ggplotly(case.rank.asia)
Desclaimer : This analysis is merely conducted for exercise and is not intended as rigorious scientific study. You my not use it as a reference for your scholarly works :)