Introduction

In this post, I would like to share the results of my analysis on COVID19 data obtanined from https://ourworldindata.org. Luckily, the dataset contains some additional relevant information such as diabetes prevalence and GDP per capita, that of course makes it fun to perform exploration.

Steps

  1. Packages. The followings are required packages.
library(tidyverse)
## -- Attaching packages ------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.1     v purrr   0.3.4
## v tibble  3.0.0     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggtext)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
  1. Open csv data and assign to new data frame
covid2019 <- read.csv(file.choose(), header = T)
dim(covid2019)
## [1] 22242    33
  1. I am not gonna perform time-series analalysis. Thus, in this example, the analysis is limited to the data in a single day to see total cases, total data, and other relevant data. I choose for the most recent updated data which is 04-06-2020.

  2. Filter some variables relevant for your desired analysis. For my case, I would like to explore diabetes prevalence, hospital beds availability, covid death rate, and smokers. Thus, I excluded countries that have no data on such variables.

 health.data <- covid2019 %>%
  filter(date == "2020-06-04") %>%
  filter(location != "World") %>%
  filter(!is.na(diabetes_prevalence)) %>%
  filter(!is.na(hospital_beds_per_thousand)) %>%
  filter(!is.na(cvd_death_rate)) %>%
  filter(!is.na(female_smokers)) %>%
  filter(!is.na(male_smokers))
glimpse(health.data)
## Rows: 128
## Columns: 33
## $ iso_code                        <chr> "ALB", "DZA", "ARG", "ARM", "AUS", ...
## $ continent                       <chr> "Europe", "Africa", "South America"...
## $ location                        <chr> "Albania", "Algeria", "Argentina", ...
## $ date                            <chr> "2020-06-04", "2020-06-04", "2020-0...
## $ total_cases                     <int> 1184, 9626, 19255, 10524, 7229, 167...
## $ new_cases                       <int> 20, 0, 949, 515, 8, 31, 325, 0, 504...
## $ total_deaths                    <int> 33, 667, 583, 170, 102, 670, 76, 11...
## $ new_deaths                      <int> 0, 0, 14, 12, 0, 1, 5, 0, 1, 37, 0,...
## $ total_cases_per_million         <dbl> 411.425, 219.516, 426.035, 3551.525...
## $ new_cases_per_million           <dbl> 6.950, 0.000, 20.998, 173.797, 0.31...
## $ total_deaths_per_million        <dbl> 11.467, 15.211, 12.899, 57.370, 4.0...
## $ new_deaths_per_million          <dbl> 0.000, 0.000, 0.310, 4.050, 0.000, ...
## $ total_tests                     <dbl> NA, NA, 178448, NA, 1546329, 471466...
## $ new_tests                       <dbl> NA, NA, 5501, NA, 33629, 8508, NA, ...
## $ total_tests_per_thousand        <dbl> NA, NA, 3.948, NA, 60.641, 52.348, ...
## $ new_tests_per_thousand          <dbl> NA, NA, 0.122, NA, 1.319, 0.945, NA...
## $ new_tests_smoothed              <dbl> NA, NA, 4767, NA, 25555, 6299, NA, ...
## $ new_tests_smoothed_per_thousand <dbl> NA, NA, 0.105, NA, 1.002, 0.699, NA...
## $ tests_units                     <chr> "", "", "tests performed", "", "tes...
## $ stringency_index                <dbl> 67.59, NA, NA, NA, NA, NA, 85.19, N...
## $ population                      <dbl> 2877800, 43851043, 45195777, 296323...
## $ population_density              <dbl> 104.871, 17.348, 16.177, 102.931, 3...
## $ median_age                      <dbl> 38.0, 29.1, 31.9, 35.7, 37.9, 44.4,...
## $ aged_65_older                   <dbl> 13.188, 6.211, 11.198, 11.232, 15.5...
## $ aged_70_older                   <dbl> 8.643, 3.857, 7.441, 7.571, 10.129,...
## $ gdp_per_capita                  <dbl> 11803.431, 13913.839, 18933.907, 87...
## $ extreme_poverty                 <dbl> 1.1, 0.5, 0.6, 1.8, 0.5, 0.7, NA, N...
## $ cvd_death_rate                  <dbl> 304.195, 278.364, 191.032, 341.010,...
## $ diabetes_prevalence             <dbl> 10.08, 6.73, 5.50, 7.11, 5.07, 6.35...
## $ female_smokers                  <dbl> 7.1, 0.7, 16.2, 1.5, 13.0, 28.4, 0....
## $ male_smokers                    <dbl> 51.2, 30.4, 27.7, 52.1, 16.5, 30.9,...
## $ handwashing_facilities          <dbl> NA, 83.741, NA, 94.043, NA, NA, 83....
## $ hospital_beds_per_thousand      <dbl> 2.890, 1.900, 5.000, 4.200, 3.840, ...
  1. Here some interesting results from correlation analysis I conducted.
  1. Total deaths have a positively small correlation with median age
cor1 <- ggplot(health.data, aes(x = total_deaths, y = median_age)) +
  geom_smooth (method = lm) + geom_point() +
  scale_x_continuous(breaks= pretty_breaks()) +
  theme(axis.text = element_text(size = 10), 
        axis.title = element_markdown(size = 10),
        axis.text.x = element_markdown(size = 10),
        axis.text.y = element_markdown(size = 10),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(color = "grey"),
        plot.title = element_markdown(size = 10),
        legend.position = "none")

ggplotly(cor1)
## `geom_smooth()` using formula 'y ~ x'
# Correlation Coefficient 
cor_cof1 <- cor.test(health.data$total_deaths, health.data$median_age, method = "pearson")
print(cor_cof1)
## 
##  Pearson's product-moment correlation
## 
## data:  health.data$total_deaths and health.data$median_age
## t = 2.1927, df = 126, p-value = 0.03016
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.01881559 0.35349027
## sample estimates:
##       cor 
## 0.1917201
  1. The total of female smokers is positively correlated with GDP per Capita
cor2<- ggplot(health.data, aes(x = female_smokers, y = gdp_per_capita)) +
  geom_smooth (method = lm, se = F, color = "black") + geom_point() +
  scale_x_continuous(breaks= pretty_breaks()) +
  theme(axis.text = element_text(size = 10), 
        axis.title = element_markdown(size = 10),
        axis.text.x = element_markdown(size = 10),
        axis.text.y = element_markdown(size = 10),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(color = "grey"),
        plot.title = element_markdown(size = 10),
        legend.position = "none")

ggplotly(cor2)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
# Let's inspect correlation coefficient and statistical significance
cor_cof2 <- cor.test(health.data$gdp_per_capita, health.data$female_smokers, method = "pearson")
print(cor_cof2)
## 
##  Pearson's product-moment correlation
## 
## data:  health.data$gdp_per_capita and health.data$female_smokers
## t = 4.0666, df = 125, p-value = 8.383e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1782194 0.4870328
## sample estimates:
##      cor 
## 0.341821
  1. Diabetes prevalence is also higher among the countries with higher gdp Per capita
cor3 <- ggplot(health.data, aes(x = diabetes_prevalence, y = gdp_per_capita)) +
  geom_smooth (method = lm, se = F, color = "black") + geom_point() +
  scale_x_continuous(breaks= pretty_breaks()) +
  theme(axis.text = element_text(size = 10), 
        axis.title = element_markdown(size = 10),
        axis.text.x = element_markdown(size = 10),
        axis.text.y = element_markdown(size = 10),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(color = "grey"),
        plot.title = element_markdown(size = 10),
        legend.position = "none")
ggplotly(cor3)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
#Checking correlation coefficient and p-value
cor_cof3 <- cor.test(health.data$gdp_per_capita, health.data$female_smokers, method = "pearson")
print(cor_cof3)
## 
##  Pearson's product-moment correlation
## 
## data:  health.data$gdp_per_capita and health.data$female_smokers
## t = 4.0666, df = 125, p-value = 8.383e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1782194 0.4870328
## sample estimates:
##      cor 
## 0.341821
  1. Investigating COVID19 in Asia, in particular top ranks of total cases in Asia.
# filtering Asia and arrange according descending total cases
topasia <- health.data %>%
  arrange(desc(total_cases)) %>%
  filter(continent == "Asia")

# selecting Top 20 Countries
top20.asia <- topasia[1:20,]
  1. Visualizing total cases in those top 20 countries
# I tried to play around with ggplot color. Here I customize palette color that refers to theme provided by <https://wondernote.org>. You can also copy this code if you want to explore.

# Pastel color palettes
pastel <- c("#ABDEE6", "#CBAACB", "#FFFFB5", "#FFCCB6", "#F3B0C3",
  "#C6DBDA", "#FEE1E8", "#FED7C3", "#F6EAC2", "#ECD5E3", "#FF968A", "#FFAEA5",
  "#FFC5BF", "#FFD8BE", "#D4F0F0", "#8FCACA", "#CCE2CB", "#B6CFB6", "#97C1A9",
  "#FCB9AA", "#FFDBCC", "#ECEAE4", "#A2E1DB", "#55CBCD")

#Gemstone color palettes

gemstone <- c("#53051D", "#9E1C5C", "#EF70AA", "#FF8C94", "#F12761",
              "#00C6C7", "#96E5E2", "#00ACA5", "#006F60", "#005245", 
              "#EEAC4D", "#FFF2C3", "#EE84B3", "#740E4C", "#E2038D",
              "#B57E79", "#FF6F68", "#00E937", "#006072", "#5C2A2E",
              "#20503E", "#187B30", "#75E0B0", "#2D8498", "#4CB0A6")

# Plotting total death ranks
death_ranks_asia <- ggplot(top20.asia, aes(x = total_deaths, y = reorder(location, total_deaths))) +
  geom_col(aes(fill = location)) +
  scale_fill_manual(values = pastel) +
  theme(axis.text = element_text(size = 10), 
        axis.title = element_markdown(size = 10),
        axis.text.x = element_markdown(size = 10),
        axis.text.y = element_markdown(size = 10),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(color = "grey"),
        plot.title = element_markdown(size = 10),
        legend.position = "none") +
    labs(title="Top 20 total deaths of COVID 19 in Asia", 
       source="ourworld in data",
       x="Total Deaths",
       y="Countries")

ggplotly(death_ranks_asia)
# Plotting total case ranks 
case.rank.asia <- ggplot(top20.asia, aes(x = total_cases, y = reorder(location, total_cases))) +
  geom_col(aes(fill = location)) + 
  scale_fill_manual(values = gemstone) +
  theme(axis.text = element_text(size = 10), 
        axis.title = element_markdown(size = 10),
        axis.text.x = element_markdown(size = 10),
        axis.text.y = element_markdown(size = 10),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(color = "grey"),
        plot.title = element_markdown(size = 10),
        legend.position = "none") +
  labs(title="Top 20 Rank of total cases of COVID 19 in Asia", 
       source="ourworld in data",
       x="Total Cases",
       y="Countries")
ggplotly(case.rank.asia)

Desclaimer : This analysis is merely conducted for exercise and is not intended as rigorious scientific study. You my not use it as a reference for your scholarly works :)