Steps

Packages. The followings are required packages.

library(tidyverse)

## -- Attaching packages ------------------------------------------------ tidyverse 1.3.0 --

## v ggplot2 3.3.1     v purrr   0.3.4
## v tibble  3.0.0     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(ggtext)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

Open csv data and assign to new data frame

covid2019 <- read.csv(file.choose(), header = T)
dim(covid2019)

## [1] 22242    33

I am not gonna perform time-series analalysis. Thus, in this example, the analysis is limited to the data in a single day to see total cases, total data, and other relevant data. I choose for the most recent updated data which is 04-06-2020.
Filter some variables relevant for your desired analysis. For my case, I would like to explore diabetes prevalence, hospital beds availability, covid death rate, and smokers. Thus, I excluded countries that have no data on such variables.

 health.data <- covid2019 %>%
  filter(date == "2020-06-04") %>%
  filter(location != "World") %>%
  filter(!is.na(diabetes_prevalence)) %>%
  filter(!is.na(hospital_beds_per_thousand)) %>%
  filter(!is.na(cvd_death_rate)) %>%
  filter(!is.na(female_smokers)) %>%
  filter(!is.na(male_smokers))
glimpse(health.data)

## Rows: 128
## Columns: 33
## $ iso_code                        <chr> "ALB", "DZA", "ARG", "ARM", "AUS", ...
## $ continent                       <chr> "Europe", "Africa", "South America"...
## $ location                        <chr> "Albania", "Algeria", "Argentina", ...
## $ date                            <chr> "2020-06-04", "2020-06-04", "2020-0...
## $ total_cases                     <int> 1184, 9626, 19255, 10524, 7229, 167...
## $ new_cases                       <int> 20, 0, 949, 515, 8, 31, 325, 0, 504...
## $ total_deaths                    <int> 33, 667, 583, 170, 102, 670, 76, 11...
## $ new_deaths                      <int> 0, 0, 14, 12, 0, 1, 5, 0, 1, 37, 0,...
## $ total_cases_per_million         <dbl> 411.425, 219.516, 426.035, 3551.525...
## $ new_cases_per_million           <dbl> 6.950, 0.000, 20.998, 173.797, 0.31...
## $ total_deaths_per_million        <dbl> 11.467, 15.211, 12.899, 57.370, 4.0...
## $ new_deaths_per_million          <dbl> 0.000, 0.000, 0.310, 4.050, 0.000, ...
## $ total_tests                     <dbl> NA, NA, 178448, NA, 1546329, 471466...
## $ new_tests                       <dbl> NA, NA, 5501, NA, 33629, 8508, NA, ...
## $ total_tests_per_thousand        <dbl> NA, NA, 3.948, NA, 60.641, 52.348, ...
## $ new_tests_per_thousand          <dbl> NA, NA, 0.122, NA, 1.319, 0.945, NA...
## $ new_tests_smoothed              <dbl> NA, NA, 4767, NA, 25555, 6299, NA, ...
## $ new_tests_smoothed_per_thousand <dbl> NA, NA, 0.105, NA, 1.002, 0.699, NA...
## $ tests_units                     <chr> "", "", "tests performed", "", "tes...
## $ stringency_index                <dbl> 67.59, NA, NA, NA, NA, NA, 85.19, N...
## $ population                      <dbl> 2877800, 43851043, 45195777, 296323...
## $ population_density              <dbl> 104.871, 17.348, 16.177, 102.931, 3...
## $ median_age                      <dbl> 38.0, 29.1, 31.9, 35.7, 37.9, 44.4,...
## $ aged_65_older                   <dbl> 13.188, 6.211, 11.198, 11.232, 15.5...
## $ aged_70_older                   <dbl> 8.643, 3.857, 7.441, 7.571, 10.129,...
## $ gdp_per_capita                  <dbl> 11803.431, 13913.839, 18933.907, 87...
## $ extreme_poverty                 <dbl> 1.1, 0.5, 0.6, 1.8, 0.5, 0.7, NA, N...
## $ cvd_death_rate                  <dbl> 304.195, 278.364, 191.032, 341.010,...
## $ diabetes_prevalence             <dbl> 10.08, 6.73, 5.50, 7.11, 5.07, 6.35...
## $ female_smokers                  <dbl> 7.1, 0.7, 16.2, 1.5, 13.0, 28.4, 0....
## $ male_smokers                    <dbl> 51.2, 30.4, 27.7, 52.1, 16.5, 30.9,...
## $ handwashing_facilities          <dbl> NA, 83.741, NA, 94.043, NA, NA, 83....
## $ hospital_beds_per_thousand      <dbl> 2.890, 1.900, 5.000, 4.200, 3.840, ...

Here some interesting results from correlation analysis I conducted.

Total deaths have a positively small correlation with median age

cor1 <- ggplot(health.data, aes(x = total_deaths, y = median_age)) +
  geom_smooth (method = lm) + geom_point() +
  scale_x_continuous(breaks= pretty_breaks()) +
  theme(axis.text = element_text(size = 10), 
        axis.title = element_markdown(size = 10),
        axis.text.x = element_markdown(size = 10),
        axis.text.y = element_markdown(size = 10),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(color = "grey"),
        plot.title = element_markdown(size = 10),
        legend.position = "none")

ggplotly(cor1)

## `geom_smooth()` using formula 'y ~ x'

# Correlation Coefficient 
cor_cof1 <- cor.test(health.data$total_deaths, health.data$median_age, method = "pearson")
print(cor_cof1)

## 
##  Pearson's product-moment correlation
## 
## data:  health.data$total_deaths and health.data$median_age
## t = 2.1927, df = 126, p-value = 0.03016
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.01881559 0.35349027
## sample estimates:
##       cor 
## 0.1917201

The total of female smokers is positively correlated with GDP per Capita

cor2<- ggplot(health.data, aes(x = female_smokers, y = gdp_per_capita)) +
  geom_smooth (method = lm, se = F, color = "black") + geom_point() +
  scale_x_continuous(breaks= pretty_breaks()) +
  theme(axis.text = element_text(size = 10), 
        axis.title = element_markdown(size = 10),
        axis.text.x = element_markdown(size = 10),
        axis.text.y = element_markdown(size = 10),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(color = "grey"),
        plot.title = element_markdown(size = 10),
        legend.position = "none")

ggplotly(cor2)

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

# Let's inspect correlation coefficient and statistical significance
cor_cof2 <- cor.test(health.data$gdp_per_capita, health.data$female_smokers, method = "pearson")
print(cor_cof2)

## 
##  Pearson's product-moment correlation
## 
## data:  health.data$gdp_per_capita and health.data$female_smokers
## t = 4.0666, df = 125, p-value = 8.383e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1782194 0.4870328
## sample estimates:
##      cor 
## 0.341821

Diabetes prevalence is also higher among the countries with higher gdp Per capita

cor3 <- ggplot(health.data, aes(x = diabetes_prevalence, y = gdp_per_capita)) +
  geom_smooth (method = lm, se = F, color = "black") + geom_point() +
  scale_x_continuous(breaks= pretty_breaks()) +
  theme(axis.text = element_text(size = 10), 
        axis.title = element_markdown(size = 10),
        axis.text.x = element_markdown(size = 10),
        axis.text.y = element_markdown(size = 10),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(color = "grey"),
        plot.title = element_markdown(size = 10),
        legend.position = "none")
ggplotly(cor3)

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

#Checking correlation coefficient and p-value
cor_cof3 <- cor.test(health.data$gdp_per_capita, health.data$female_smokers, method = "pearson")
print(cor_cof3)

## 
##  Pearson's product-moment correlation
## 
## data:  health.data$gdp_per_capita and health.data$female_smokers
## t = 4.0666, df = 125, p-value = 8.383e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1782194 0.4870328
## sample estimates:
##      cor 
## 0.341821

Investigating COVID19 in Asia, in particular top ranks of total cases in Asia.

# filtering Asia and arrange according descending total cases
topasia <- health.data %>%
  arrange(desc(total_cases)) %>%
  filter(continent == "Asia")

# selecting Top 20 Countries
top20.asia <- topasia[1:20,]

Visualizing total cases in those top 20 countries

# I tried to play around with ggplot color. Here I customize palette color that refers to theme provided by <https://wondernote.org>. You can also copy this code if you want to explore.

# Pastel color palettes
pastel <- c("#ABDEE6", "#CBAACB", "#FFFFB5", "#FFCCB6", "#F3B0C3",
  "#C6DBDA", "#FEE1E8", "#FED7C3", "#F6EAC2", "#ECD5E3", "#FF968A", "#FFAEA5",
  "#FFC5BF", "#FFD8BE", "#D4F0F0", "#8FCACA", "#CCE2CB", "#B6CFB6", "#97C1A9",
  "#FCB9AA", "#FFDBCC", "#ECEAE4", "#A2E1DB", "#55CBCD")

#Gemstone color palettes

gemstone <- c("#53051D", "#9E1C5C", "#EF70AA", "#FF8C94", "#F12761",
              "#00C6C7", "#96E5E2", "#00ACA5", "#006F60", "#005245", 
              "#EEAC4D", "#FFF2C3", "#EE84B3", "#740E4C", "#E2038D",
              "#B57E79", "#FF6F68", "#00E937", "#006072", "#5C2A2E",
              "#20503E", "#187B30", "#75E0B0", "#2D8498", "#4CB0A6")

# Plotting total death ranks
death_ranks_asia <- ggplot(top20.asia, aes(x = total_deaths, y = reorder(location, total_deaths))) +
  geom_col(aes(fill = location)) +
  scale_fill_manual(values = pastel) +
  theme(axis.text = element_text(size = 10), 
        axis.title = element_markdown(size = 10),
        axis.text.x = element_markdown(size = 10),
        axis.text.y = element_markdown(size = 10),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(color = "grey"),
        plot.title = element_markdown(size = 10),
        legend.position = "none") +
    labs(title="Top 20 total deaths of COVID 19 in Asia", 
       source="ourworld in data",
       x="Total Deaths",
       y="Countries")

ggplotly(death_ranks_asia)

# Plotting total case ranks 
case.rank.asia <- ggplot(top20.asia, aes(x = total_cases, y = reorder(location, total_cases))) +
  geom_col(aes(fill = location)) + 
  scale_fill_manual(values = gemstone) +
  theme(axis.text = element_text(size = 10), 
        axis.title = element_markdown(size = 10),
        axis.text.x = element_markdown(size = 10),
        axis.text.y = element_markdown(size = 10),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(color = "grey"),
        plot.title = element_markdown(size = 10),
        legend.position = "none") +
  labs(title="Top 20 Rank of total cases of COVID 19 in Asia", 
       source="ourworld in data",
       x="Total Cases",
       y="Countries")
ggplotly(case.rank.asia)

Desclaimer : This analysis is merely conducted for exercise and is not intended as rigorious scientific study. You my not use it as a reference for your scholarly works :)

Exploring COVID19 Data Using R for Beginner

Syifaul Muntafi

June 7, 2020

Introduction

Steps