Collect data from the World Bank and make a Bubble Plot

#=======================================================================================================
#  Reference: https://cran.r-project.org/web/packages/wbstats/vignettes/Using_the_wbstats_package.html
#             https://www.r-bloggers.com/life-expectancy-animated/
#=======================================================================================================

# Attach the packages used by this part of the script: wbstats provides the
# cached World Bank metadata, tidyverse provides the dplyr verbs and the pipe.

library(wbstats)
library(tidyverse)

# Keep a handle on the metadata list that ships with wbstats:

general_information <- wb_cachelist

# Print only the top level of that list to see what it contains:
str(general_information, max.level = 1)


#---------------------------------------------------------------------
# Country table: the first element of the cached metadata list
#---------------------------------------------------------------------


df_countries <- general_information[[1]]


# Example insight: number of rows in each income group.

df_countries %>%
  group_by(income) %>%
  tally() %>%
  ungroup()


#------------------------------
#  Indicators provided by WB
#------------------------------


# Indicator table: the second element of the cached metadata list.
df_indicators <- general_information[[2]]

# Rows and columns of the indicator table:

dim(df_indicators)

# Browse the table by keyword. `preview_indicators()` keeps the rows whose
# `indicator` text contains `keyword` (the match is case-sensitive), takes
# the first six and opens them in the data viewer.

preview_indicators <- function(catalogue, keyword) {
  catalogue %>%
    filter(str_detect(indicator, keyword)) %>%
    head() %>%
    View()
}

for (keyword in c("gross domestic", "capital", "death", "tax", "Population")) {
  preview_indicators(df_indicators, keyword)
}

# Indicator codes used later for the bubble plot:

my_indicator <- c("SP.POP.TOTL", "SP.DYN.LE00.IN", "NY.GDP.PCAP.PP.CD")

# Look those codes up in the table, showing only its first two columns:

df_indicators %>%
  filter(indicatorID %in% my_indicator) %>%
  select(1:2)


# One indicator, country = "all", 2000-2018:

educ_data <- wb(
  country = "all",
  indicator = "PRJ.POP.2024.3.MF",
  startdate = 2000,
  enddate = 2018
)

# The same three country codes are used by the next two requests:

three_nations <- c("AFG", "AGO", "VNM")

# One indicator, three countries, 2016-2017:

educ_data_3nations <- wb(
  country = three_nations,
  indicator = "SH.TBS.MORT",
  startdate = 2016,
  enddate = 2017
)

# Example 1: two indicators, three countries, 2016-2017:

educ_tuberculosis <- wb(
  country = three_nations,
  indicator = c("SH.TBS.MORT", "PRJ.POP.2024.3.MF"),
  startdate = 2016,
  enddate = 2017
)

# Example 2: the indicators in `my_indicator`, country = "all", 2016-2017.
# This is the data set the bubble plot is built from.

mydf <- wb(
  country = "all",
  indicator = my_indicator,
  startdate = 2016,
  enddate = 2017
)



# Keep the 2016 observations and only the columns needed downstream:

mydf_small <- mydf %>%
  filter(date == 2016) %>%
  select(country, iso2c, indicatorID, value)


# Convert to wide form (one row per country, one column per indicator),
# drop rows with any missing value, and give the indicator columns short
# names. `mydf` comes straight from wb() and is never grouped, so no
# ungroup() is needed before reshaping.

mydf_small_wide <- mydf_small %>%
  pivot_wider(names_from = indicatorID, values_from = value) %>%
  drop_na() %>%
  rename(pop = SP.POP.TOTL, life = SP.DYN.LE00.IN, gdp = NY.GDP.PCAP.PP.CD)


# Region and income group per country, without the rows labelled
# "Aggregates". Factor columns are converted to character first so the
# comparison and the later join work on plain strings.

income_group <- df_countries %>%
  mutate(across(where(is.factor), as.character)) %>%
  filter(income != "Aggregates") %>%
  select(iso2c, region, income)


# Merge the two data sets and remove rows with missing values. Only
# countries present in both tables survive, so an inner join is used.

total_df <- mydf_small_wide %>%
  inner_join(income_group, by = "iso2c") %>%
  drop_na()


#-------------------------------------------------------------------------------------------------------------
# Page 38 from A. Deaton's book (https://www.amazon.com/gp/product/0691165629/ref=dbs_a_def_rwt_bibl_vppi_i0)
#-------------------------------------------------------------------------------------------------------------

library(ggrepel)
library(scales)
library(ggsci)
library(extrafont)
my_font <- "Roboto Condensed"


# Countries that get a text label on the plot:
my_country <- c(
  "Vietnam", "China", "India", "Thailand", "Malaysia", "France", "Cambodia",
  "Germany", "Japan", "Nigeria", "Indonesia", "Singapore", "Philippines"
)


# Apply the GDP cut-off once so that the points, the trend line and the
# labels are all drawn from the same rows (a labelled country above the
# cut-off would otherwise get a label without a point).
plot_df <- total_df %>%
  filter(gdp < 100000)

label_df <- plot_df %>%
  filter(country %in% my_country)


# Bubble plot: GDP per capita on x, life expectancy on y, bubble size by
# population, colour by income group. Size and colour are mapped on the
# point layer only, so the trend line and the labels do not inherit them.
ggplot(plot_df, aes(gdp, life)) +
  geom_point(aes(size = pop, color = income), alpha = 0.5) +
  # Single log-linear trend line: life ~ log(gdp).
  geom_smooth(
    method = "lm", formula = y ~ log(x),
    color = "orange", alpha = 0.1, se = FALSE
  ) +
  geom_text_repel(
    data = label_df, aes(label = country),
    color = "gray20", size = 5, force = 19, family = my_font
  ) +
  scale_x_continuous(breaks = seq(0, 100000, 10000), labels = dollar) +
  scale_y_continuous(breaks = seq(50, 85, 5)) +
  scale_size(range = c(1, 30)) +
  scale_color_lancet(name = "Group:") +
  # Hide the size legend ("none" replaces the deprecated FALSE).
  guides(size = "none") +
  labs(
    x = "GDP per capita",
    y = "Life expectancy",
    title = "The relationship between Life Expectancy and GDP per capita in 2016",
    subtitle = "According to WHO definitions, Life Expectancy at birth reflects the overall mortality level of a population and it is defined as\nthe average number of years that a newborn is expected to live if current mortality rates continue to apply.",
    caption = "Data Source: The World Bank"
  ) +
  theme_minimal() +
  theme(
    legend.position = c(0.83, 0.30),
    legend.title = element_text(
      family = my_font, size = 12, face = "bold", color = "grey30"
    ),
    legend.text = element_text(family = my_font, size = 12, color = "grey30"),
    plot.margin = unit(c(1, 1, 1, 1), "cm"),
    plot.title = element_text(family = my_font, size = 26, color = "gray10"),
    plot.subtitle = element_text(family = my_font, size = 15, color = "gray40"),
    plot.caption = element_text(
      family = my_font, size = 12, color = "gray40", face = "italic"
    ),
    axis.text = element_text(family = my_font, size = 14, color = "gray30"),
    axis.title = element_text(family = my_font, size = 15),
    # Alternative axis-title alignment, kept for reference:
    # axis.title.x = element_text(family = my_font, size = 15, hjust = 0),
    # axis.title.y = element_text(family = my_font, size = 15, hjust = 1),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = "seashell", color = NA)
  )
---
title: "wbstats - R Package for Collecting Data From World Bank"
author: "Nguyen Chi Dung"
subtitle: "Daily Graph Series"
output:
  html_document:
    code_download: yes
    code_folding: hide
    highlight: zenburn
    theme: flatly
    toc: yes
    toc_float: yes
  word_document:
    toc: yes
---

```{r setup,include=FALSE}
# Default options applied to every chunk in this document.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.retina = 2
)
```

# Collect data from the World Bank and make a Bubble Plot

![](C:\\Users\\Zbook\\Desktop\\Depocen_Course\\b2.jpg)


```{r, eval=FALSE}
#=======================================================================================================
#  Reference: https://cran.r-project.org/web/packages/wbstats/vignettes/Using_the_wbstats_package.html
#             https://www.r-bloggers.com/life-expectancy-animated/
#=======================================================================================================

# Attach the packages used by this part of the script: wbstats provides the
# cached World Bank metadata, tidyverse provides the dplyr verbs and the pipe.

library(wbstats)
library(tidyverse)

# Keep a handle on the metadata list that ships with wbstats:

general_information <- wb_cachelist

# Print only the top level of that list to see what it contains:
str(general_information, max.level = 1)


#---------------------------------------------------------------------
# Country table: the first element of the cached metadata list
#---------------------------------------------------------------------


df_countries <- general_information[[1]]


# Example insight: number of rows in each income group.

df_countries %>%
  group_by(income) %>%
  tally() %>%
  ungroup()


#------------------------------
#  Indicators provided by WB
#------------------------------


# Indicator table: the second element of the cached metadata list.
df_indicators <- general_information[[2]]

# Rows and columns of the indicator table:

dim(df_indicators)

# Browse the table by keyword. `preview_indicators()` keeps the rows whose
# `indicator` text contains `keyword` (the match is case-sensitive), takes
# the first six and opens them in the data viewer.

preview_indicators <- function(catalogue, keyword) {
  catalogue %>%
    filter(str_detect(indicator, keyword)) %>%
    head() %>%
    View()
}

for (keyword in c("gross domestic", "capital", "death", "tax", "Population")) {
  preview_indicators(df_indicators, keyword)
}

# Indicator codes used later for the bubble plot:

my_indicator <- c("SP.POP.TOTL", "SP.DYN.LE00.IN", "NY.GDP.PCAP.PP.CD")

# Look those codes up in the table, showing only its first two columns:

df_indicators %>%
  filter(indicatorID %in% my_indicator) %>%
  select(1:2)


# One indicator, country = "all", 2000-2018:

educ_data <- wb(
  country = "all",
  indicator = "PRJ.POP.2024.3.MF",
  startdate = 2000,
  enddate = 2018
)

# The same three country codes are used by the next two requests:

three_nations <- c("AFG", "AGO", "VNM")

# One indicator, three countries, 2016-2017:

educ_data_3nations <- wb(
  country = three_nations,
  indicator = "SH.TBS.MORT",
  startdate = 2016,
  enddate = 2017
)

# Example 1: two indicators, three countries, 2016-2017:

educ_tuberculosis <- wb(
  country = three_nations,
  indicator = c("SH.TBS.MORT", "PRJ.POP.2024.3.MF"),
  startdate = 2016,
  enddate = 2017
)

# Example 2: the indicators in `my_indicator`, country = "all", 2016-2017.
# This is the data set the bubble plot is built from.

mydf <- wb(
  country = "all",
  indicator = my_indicator,
  startdate = 2016,
  enddate = 2017
)



# Keep the 2016 observations and only the columns needed downstream:

mydf_small <- mydf %>%
  filter(date == 2016) %>%
  select(country, iso2c, indicatorID, value)


# Convert to wide form (one row per country, one column per indicator),
# drop rows with any missing value, and give the indicator columns short
# names. `mydf` comes straight from wb() and is never grouped, so no
# ungroup() is needed before reshaping.

mydf_small_wide <- mydf_small %>%
  pivot_wider(names_from = indicatorID, values_from = value) %>%
  drop_na() %>%
  rename(pop = SP.POP.TOTL, life = SP.DYN.LE00.IN, gdp = NY.GDP.PCAP.PP.CD)


# Region and income group per country, without the rows labelled
# "Aggregates". Factor columns are converted to character first so the
# comparison and the later join work on plain strings.

income_group <- df_countries %>%
  mutate(across(where(is.factor), as.character)) %>%
  filter(income != "Aggregates") %>%
  select(iso2c, region, income)


# Merge the two data sets and remove rows with missing values. Only
# countries present in both tables survive, so an inner join is used.

total_df <- mydf_small_wide %>%
  inner_join(income_group, by = "iso2c") %>%
  drop_na()


#-------------------------------------------------------------------------------------------------------------
# Page 38 from A. Deaton's book (https://www.amazon.com/gp/product/0691165629/ref=dbs_a_def_rwt_bibl_vppi_i0)
#-------------------------------------------------------------------------------------------------------------

library(ggrepel)
library(scales)
library(ggsci)
library(extrafont)
my_font <- "Roboto Condensed"


# Countries that get a text label on the plot:
my_country <- c(
  "Vietnam", "China", "India", "Thailand", "Malaysia", "France", "Cambodia",
  "Germany", "Japan", "Nigeria", "Indonesia", "Singapore", "Philippines"
)


# Apply the GDP cut-off once so that the points, the trend line and the
# labels are all drawn from the same rows (a labelled country above the
# cut-off would otherwise get a label without a point).
plot_df <- total_df %>%
  filter(gdp < 100000)

label_df <- plot_df %>%
  filter(country %in% my_country)


# Bubble plot: GDP per capita on x, life expectancy on y, bubble size by
# population, colour by income group. Size and colour are mapped on the
# point layer only, so the trend line and the labels do not inherit them.
ggplot(plot_df, aes(gdp, life)) +
  geom_point(aes(size = pop, color = income), alpha = 0.5) +
  # Single log-linear trend line: life ~ log(gdp).
  geom_smooth(
    method = "lm", formula = y ~ log(x),
    color = "orange", alpha = 0.1, se = FALSE
  ) +
  geom_text_repel(
    data = label_df, aes(label = country),
    color = "gray20", size = 5, force = 19, family = my_font
  ) +
  scale_x_continuous(breaks = seq(0, 100000, 10000), labels = dollar) +
  scale_y_continuous(breaks = seq(50, 85, 5)) +
  scale_size(range = c(1, 30)) +
  scale_color_lancet(name = "Group:") +
  # Hide the size legend ("none" replaces the deprecated FALSE).
  guides(size = "none") +
  labs(
    x = "GDP per capita",
    y = "Life expectancy",
    title = "The relationship between Life Expectancy and GDP per capita in 2016",
    subtitle = "According to WHO definitions, Life Expectancy at birth reflects the overall mortality level of a population and it is defined as\nthe average number of years that a newborn is expected to live if current mortality rates continue to apply.",
    caption = "Data Source: The World Bank"
  ) +
  theme_minimal() +
  theme(
    legend.position = c(0.83, 0.30),
    legend.title = element_text(
      family = my_font, size = 12, face = "bold", color = "grey30"
    ),
    legend.text = element_text(family = my_font, size = 12, color = "grey30"),
    plot.margin = unit(c(1, 1, 1, 1), "cm"),
    plot.title = element_text(family = my_font, size = 26, color = "gray10"),
    plot.subtitle = element_text(family = my_font, size = 15, color = "gray40"),
    plot.caption = element_text(
      family = my_font, size = 12, color = "gray40", face = "italic"
    ),
    axis.text = element_text(family = my_font, size = 14, color = "gray30"),
    axis.title = element_text(family = my_font, size = 15),
    # Alternative axis-title alignment, kept for reference:
    # axis.title.x = element_text(family = my_font, size = 15, hjust = 0),
    # axis.title.y = element_text(family = my_font, size = 15, hjust = 1),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = "seashell", color = NA)
  )

  
```

