Objective

  1. World Bank census data processing (merging multiple data frames)
  2. Data visualization (ggplot, plotly, gganimate)
  3. Correlation

Load packages

# Load every package used by the analysis in one call.
# NOTE(review): janitor and imputeTS are used further down in this script but
# are not listed here; they are reached via `janitor::` and `library(imputeTS)`.
pacman::p_load(
  rio,          # import() reads the Excel workbooks
  here,         # here() builds project-relative file paths
  tidyverse,    # %>%, pivot_longer(), dplyr verbs, ggplot2
  lubridate,    # NOTE(review): not used in the visible part of the script
  plotly,       # NOTE(review): not used in the visible part of the script
  gganimate,    # NOTE(review): not used in the visible part of the script
  ggExtra,     # marginal histogram 
  ggalt,       # scatterplot encircle
  ggcorrplot   # correlogram
  )    

Import data

# Each indicator is read from the "Data" sheet of its workbook, skipping the
# first three rows; the country metadata lives on a second sheet of the GDP
# workbook.

# GDP
wb_gdp <- here("data", "API_NY.GDP.MKTP.CD_DS2_en_excel_v2_5358382.xls") %>%
  import(which = "Data", skip = 3)
wb_metadata <- here("data", "API_NY.GDP.MKTP.CD_DS2_en_excel_v2_5358382.xls") %>%
  import(which = "Metadata - Countries")

# Population
wb_pop <- here("data", "API_SP.POP.TOTL_DS2_en_excel_v2_5358476.xls") %>%
  import(which = "Data", skip = 3)

# Life expectancy at birth
wb_life <- here("data", "API_SP.DYN.LE00.IN_DS2_en_excel_v2_5358902.xls") %>%
  import(which = "Data", skip = 3)

# GDP per capita
wb_gdpcap <- here("data", "API_NY.GDP.PCAP.CD_DS2_en_excel_v2_5358450.xls") %>%
  import(which = "Data", skip = 3)

# GDP growth (annual %)
wb_gdpgrowth <- here("data", "API_NY.GDP.MKTP.KD.ZG_DS2_en_excel_v2_5358368.xls") %>%
  import(which = "Data", skip = 3)

Prepare data

# Attach the country metadata (region, income group, ...) to every GDP row.
# merge() keeps only the country codes that appear in both tables.
dt <- merge(wb_gdp, wb_metadata, by = "Country Code")

# Reshape from wide (one column per year) to long (one row per country-year).
# The year columns are selected by their four-digit names instead of by
# position, so the reshape does not silently pick the wrong columns when the
# number of years or the column order of the workbook changes.
dt <- dt %>%
  pivot_longer(
    cols = matches("^[0-9]{4}$"),
    names_to = "year",
    values_to = "gdp"
  ) %>%
  # Drop the descriptive columns that are not needed downstream; any_of()
  # tolerates a column that is already absent, like `$<- NULL` did.
  select(
    -any_of(c("Indicator Code", "SpecialNotes", "TableName", "Indicator Name"))
  )

Note that we need to transform variable class for ggplot:

# Convert the year to integer and the two country identifiers to factors
dt <- dt %>%
  mutate(
    year = as.integer(year),
    `Country Code` = as.factor(`Country Code`),
    `Country Name` = as.factor(`Country Name`)
  )

# Count the missing cells across the whole table
dt %>% is.na() %>% sum()
## [1] 9288

Transform data from wide to long

# Reshape one indicator table from wide to long: columns 5 to 66 (the yearly
# values) become `year` / `value_name` pairs, and only the merge keys plus the
# indicator value are kept.
wb_to_long <- function(wide_tbl, value_name) {
  long_tbl <- pivot_longer(
    wide_tbl,
    cols = c(5:66),
    names_to = "year",
    values_to = value_name
  )
  select(long_tbl, `Country Code`, year, all_of(value_name))
}

# Population
wb_pop <- wb_to_long(wb_pop, "pop")

# Life expectancy at birth
wb_life <- wb_to_long(wb_life, "lifeExp")

# GDP per capita
wb_gdpcap <- wb_to_long(wb_gdpcap, "gdpPercap")

# GDP growth (annual %)
wb_gdpgrowth <- wb_to_long(wb_gdpgrowth, "gdpGrowth")

Add the population, life expectancy, GDP per capita, and GDP growth columns to the GDP data frame

# Tables to combine; the GDP table comes first so the others are folded into it.
df_list <- list(dt, wb_pop, wb_life, wb_gdpcap, wb_gdpgrowth)

# Base-R fold: merge the tables pairwise on the columns they share, keeping
# rows that are present on either side (all = TRUE).
full_dt <- Reduce(
  f = function(acc, nxt) merge(acc, nxt, all = TRUE),
  x = df_list
)

# Inspect the structure of the combined table
str(full_dt)
## 'data.frame':    16492 obs. of  10 variables:
##  $ Country Code: Factor w/ 266 levels "ABW","AFE","AFG",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year        : chr  "1960" "1961" "1962" "1963" ...
##  $ Country Name: Factor w/ 265 levels "Afghanistan",..: 13 13 13 13 13 13 13 13 13 13 ...
##  $ Region      : chr  "Latin America & Caribbean" "Latin America & Caribbean" "Latin America & Caribbean" "Latin America & Caribbean" ...
##  $ IncomeGroup : chr  "High income" "High income" "High income" "High income" ...
##  $ gdp         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ pop         : num  54608 55811 56682 57475 58178 ...
##  $ lifeExp     : num  64.2 64.5 64.8 65.1 65.3 ...
##  $ gdpPercap   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ gdpGrowth   : num  NA NA NA NA NA NA NA NA NA NA ...
# Normalise the column names; the plots below refer to the cleaned names
# (region, pop, life_exp, gdp_percap).
full_dt <- janitor::clean_names(full_dt)

Correlation

Scatterplot

library(ggplot2)
theme_set(theme_bw())  # pre-set the bw theme.

# Keep the years from 2020 onwards and drop rows whose region is missing.
# `year` is a character column after the merge (see the str() output), so it
# is converted before comparing; `year >= 2020` would compare strings.
full_dt <- full_dt %>%
  filter(as.integer(year) >= 2020, !is.na(region))

# Life expectancy against log GDP per capita, coloured by region and sized by
# population, with a loess trend line.
g1 <- ggplot(full_dt, aes(x = log(gdp_percap), y = life_exp)) +
  geom_point(aes(col = region, size = pop)) +
  geom_smooth(method = "loess", se = FALSE, formula = y ~ x) +
  # roughly the min-max of life_exp; a fixed range keeps the plot easy to read
  ylim(c(49, 89)) +
  labs(subtitle = "gdp_percap ~ life_exp",
       y = "lifeExp",
       x = "log(gdpPercap)",  # the x aesthetic is the log, so label it as such
       title = "Scatterplot",
       caption = "Source: WB 2023")

plot(g1)

Scatterplot Encircling

# Observations to highlight: life expectancy of 55 years or less.
full_dt_select <- full_dt %>% filter(life_exp <= 55)

# Same scatterplot as before, with the low-life-expectancy points encircled.
g2 <- ggplot(full_dt, aes(x = log(gdp_percap), y = life_exp)) +
  geom_point(aes(col = region, size = pop)) +
  geom_smooth(method = "loess", se = FALSE, formula = y ~ x) +
  xlim(c(5, 13)) +
  ylim(c(49, 89)) +
  geom_encircle(aes(x = log(gdp_percap), y = life_exp),
                data = full_dt_select,
                color = "red",   # colour of the enclosing curve
                size = 2,        # thickness of the enclosing curve
                expand = 0.08) +
  labs(subtitle = "gdp_percap ~ life_exp",
       y = "lifeExp",
       x = "log(gdpPercap)",  # the x aesthetic is the log, so label it as such
       title = "Scatterplot",
       caption = "Source: WB 2023")

plot(g2)

Jitter Plot

theme_set(theme_bw())  # pre-set the bw theme

# Plain scatterplot with a linear trend line.
# The `+` after geom_smooth() is required: without it labs() is evaluated as a
# separate statement, its value is printed to the console, and the labels are
# never added to the plot.
g3 <- ggplot(full_dt, aes(log(gdp_percap), life_exp)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE, formula = y ~ x) +
  labs(subtitle = "gdp per cap ~ life exp",
       x = "log(gdp_percap)",  # x is the first aesthetic: log GDP per capita
       y = "life_exp",         # y is the second aesthetic: life expectancy
       title = "Scatterplot with overlapping points",
       caption = "Source: WB 2023")
g3

The original data has more than 200 data points, but the chart seems to display fewer. This is because many points overlap and appear as a single dot. Overlap is most severe when both variables take integer values, which makes the problem easy to overlook, so be extra careful the next time you make a scatterplot with integers.

So how do we handle this? There are a few options. We can make a jitter plot with geom_jitter().

# Jitter plot: geom_jitter() replaces geom_point(). Keeping both layers would
# draw every observation twice (once at its true position, once displaced).
g3 <- ggplot(full_dt, aes(log(gdp_percap), life_exp)) +
  geom_smooth(method = lm, se = FALSE, formula = y ~ x) +
  # overlapping points are randomly displaced around their original position;
  # `width` controls the amount of horizontal displacement
  geom_jitter(width = .5, size = 1) +
  labs(subtitle = "gdp per cap ~ life exp",
       x = "log(gdp_percap)",  # x is the first aesthetic: log GDP per capita
       y = "life_exp",         # y is the second aesthetic: life expectancy
       title = "Scatterplot with overlapping points",
       caption = "Source: WB 2023")
g3

The second option to overcome the problem of overlapping data points is to use what is called a counts chart. Wherever more points overlap, the circle gets bigger.

# Counts chart: geom_count() sizes each circle by the number of observations
# at that position. (The earlier `labs(subtitle = "gdp")` was removed: the
# labs() call below overrode it immediately.)
g4 <- ggplot(full_dt, aes(log(gdp_percap), life_exp)) +
  geom_smooth(method = lm, se = FALSE, formula = y ~ x) +
  geom_count(col = "tomato3", show.legend = FALSE) +
  labs(subtitle = "gdp per cap ~ life exp",
       x = "log(gdp_percap)",  # x is the first aesthetic: log GDP per capita
       y = "life_exp",         # y is the second aesthetic: life expectancy
       title = "Scatterplot with overlapping points - count plot",
       caption = "Source: WB 2023")
g4

Marginal Histogram / Boxplot

library(ggplot2)
library(ggExtra)  # provides ggMarginal(); a bare library() only lists packages

# Base scatter (as a counts chart) with a linear trend line.
g4 <- ggplot(full_dt, aes(log(gdp_percap), life_exp)) +
  geom_smooth(method = lm, se = FALSE, formula = y ~ x) +
  geom_count()

# Add the marginal distributions of both variables as histograms.
ggMarginal(g4, type = "histogram", fill = "transparent")

# Alternative marginal displays:
# ggMarginal(g4, type = "boxplot", fill = "transparent")
# ggMarginal(g4, type = "density", fill = "transparent")

Correlogram

## Prepare data
# Replace NA values with the column mean
# option 1
# Dropping columns 1-5 and 10 leaves gdp, pop, life_exp and gdp_percap
# (column order as shown in the str() output of full_dt).
correlation <- full_dt %>% select(-c(1:5, 10))

# seq_len() is safe for zero columns, and `[[i]]` extracts a plain vector for
# both data frames and tibbles (`[ , i]` on a tibble would return a tibble).
for (i in seq_len(ncol(correlation))) {
  col_values <- correlation[[i]]
  correlation[[i]][is.na(col_values)] <- mean(col_values, na.rm = TRUE)
}

# option 2: the same mean imputation via imputeTS::na_mean().
# NOTE(review): the imputed table `t` is not used by the lines below, which
# continue from `correlation` (option 1).
t <- select(full_dt, -c(1:5, 10))
library(imputeTS)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
t <- na_mean(t)

# Standardise the columns, then build the correlation matrix rounded to one
# decimal place.
correlation <- scale(correlation)
corr <- correlation %>%
  cor() %>%
  round(1)

# Lower-triangle correlogram of the rounded correlation matrix, with the
# coefficients printed on the circles. The title now names the data actually
# plotted (it previously referred to mtcars).
g1 <- ggcorrplot(
  corr,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE,
  lab_size = 3,
  method = "circle",
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlogram of World Bank indicators",
  ggtheme = theme_bw
)

g1

References

  1. [Top 50 ggplot2 Visualizations - The Master List](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html#1.%20Correlation)