Objective

  1. Census data processing (merging multiple data frames)
  2. Data visualization (ggplot, plotly, gganimate)

Load packages

pacman::p_load(
  rio,          
  here,         
  tidyverse,
  lubridate,
  plotly,
  gganimate
  )    

Import data

# Reads the "Data" sheet of a workbook stored in the project's data folder,
# skipping the first three rows of the sheet.
read_wb_data <- function(file_name) {
  import(here("data", file_name), which = "Data", skip = 3)
}

# GDP (the same workbook also provides the country metadata sheet)
gdp_workbook <- here("data", "API_NY.GDP.MKTP.CD_DS2_en_excel_v2_5358382.xls")
wb_gdp <- import(gdp_workbook, which = "Data", skip = 3)
wb_metadata <- import(gdp_workbook, which = "Metadata - Countries")

# Population
wb_pop <- read_wb_data("API_SP.POP.TOTL_DS2_en_excel_v2_5358476.xls")

# Life expectancy at birth
wb_life <- read_wb_data("API_SP.DYN.LE00.IN_DS2_en_excel_v2_5358902.xls")

# GDP per capita
wb_gdpcap <- read_wb_data("API_NY.GDP.PCAP.CD_DS2_en_excel_v2_5358450.xls")

# GDP growth (annual %)
wb_gdpgrowth <- read_wb_data("API_NY.GDP.MKTP.KD.ZG_DS2_en_excel_v2_5358368.xls")

Prepare data

# Build the long-format GDP table:
#   1. merge the GDP values with the country metadata on the country code;
#   2. reshape from wide (one column per year) to long (one row per
#      country and year);
#   3. drop the descriptive columns that are not used afterwards.
# The year columns are selected by name (four-digit column names) instead of
# by position, so the reshape keeps working if the workbook gains or loses a
# year or if the column order changes.
dt <- merge(wb_gdp, wb_metadata, by = "Country Code") %>%
  pivot_longer(
    cols = matches("^[0-9]{4}$"),
    names_to = "year",
    values_to = "gdp"
  ) %>%
  select(-c(`Indicator Code`, SpecialNotes, TableName, `Indicator Name`))

Note that we need to convert the variable classes before plotting with ggplot:

# Convert column classes: year becomes an integer, while the country code and
# the country name become factors
dt <- dt %>%
  mutate(
    year = as.integer(year),
    `Country Code` = as.factor(`Country Code`),
    `Country Name` = as.factor(`Country Name`)
  )

# Count the missing values over the whole data frame
sum(is.na(dt))
## [1] 9288

plot

Plot GDP for one country

# Prepare data: keep the rows for Austria and add a GDP column expressed in
# billions of dollars
dt_aut <- dt[dt$`Country Code` == "AUT", ]
dt_aut$GDP <- dt_aut$gdp / 1e9

# Plot the rescaled GDP column (not the raw gdp column) so that the plotted
# values agree with the "Bil dollars" axis label
ggplot(dt_aut, aes(x = year, y = GDP)) +
  geom_line() +
  labs(title = "GDP (Austria)",
       caption = "Source: Worldbank Data 2023",
       x = "Year",
       y = "Bil dollars") +
  theme_classic()

Plot for multiple countries

Line graph

If we want the graph with colours

# Prepare data - dplyr
# `%in%` tests each row for membership in the set of names. Comparing with
# `==` would recycle the four names along the column and keep only the rows
# where the name happens to line up with its recycled position.
dt_multi <- dt %>%
  filter(`Country Name` %in% c("Austria", "France", "Germany", "Switzerland")) %>%
  mutate(GDP = gdp / 1e9) # GDP in billions of dollars

# Prepare data - base R equivalent
# dt_multi <- dt[dt$`Country Name` %in% c("Austria", "France", "Germany", "Switzerland"), ]
# dt_multi$GDP <- dt_multi$gdp / 1e9

# Basic plot: one coloured line per country
a1 <- ggplot(dt_multi, aes(x = year, y = GDP, colour = `Country Name`)) +
  geom_line()

# Customised plot: same layers as the basic plot plus labels and a theme
a2 <- a1 +
  labs(title    = "GDP",
       subtitle = "In billion dollars", # matches the gdp / 1e9 scaling above
       caption  = "Source: Worldbank data 2023",
       x        = "Year") + # Rename the title of the x axis
  theme_classic() +
  theme(axis.title.y = element_blank()) # Delete the title of the y axis

# Put the 2 plots on the same page, one above the other
gridExtra::grid.arrange(a1, a2, ncol = 1, nrow = 2)

If we don’t want a plot with colours and want to customise the legend name

# Distinguish the countries by line type instead of colour.
# dt_multi$GDP is gdp / 1e9, so the subtitle states billions of dollars.
ggplot(dt_multi, aes(year, GDP, linetype = `Country Name`)) +
  geom_line() +
  labs(title    = "GDP",
       subtitle = "In billion dollars",
       caption  = "Source: Worldbank data 2023",
       x        = "Year") +
  scale_linetype_discrete(name = "Country Name") + # Customise the legend title
  theme_classic() +
  theme(axis.title.y = element_blank()) # Delete the title of the y axis

Bar graph

# Extract data: four countries, years 2000 to 2010. GDP is converted to
# billions of dollars so that the plotted values agree with the
# "In billion dollars" subtitle.
dt_gdp <- dt %>%
  filter(`Country Name` %in% c("Austria", "France", "Germany", "Switzerland"),
         year >= 2000, year <= 2010) %>%
  select(Year = year, `Country Code`, Country = `Country Name`, GDP = gdp) %>%
  mutate(GDP = GDP / 1e9)

# Basic graph: bars stacked by country; geom_col() uses the values in the
# data as bar heights
b1 <- ggplot(dt_gdp, aes(Year, GDP, fill = Country)) +
  geom_col() +
  labs(title = "GDP",
       subtitle = "In billion dollars",
       caption = "Source: Worldbank data 2023",
       x = "Year")

# Customised graph: same layers as the basic graph plus a palette and a theme
b2 <- b1 +
  # Change the standard set of colours used to fill the bars
  scale_fill_brewer(palette = "Set1") +
  theme_classic() +
  theme(axis.title.y = element_blank()) # Delete the title of the y axis

# Put the 2 plots on the same page, side by side (one row is enough for two
# plots; a second row would only leave half of the page empty)
gridExtra::grid.arrange(b1, b2, ncol = 2, nrow = 1)

# Put the 2 plots on the same page, one above the other
gridExtra::grid.arrange(b1, b2, ncol = 1, nrow = 2)

Bubble chart

Step 1: Prepare data. Transform the data from wide to long

# Reshapes one indicator table from wide (one column per year) to long.
#
# data:       a wide indicator table with a `Country Code` column and one
#             column per year whose name is the four-digit year.
# value_name: name (string) of the column that receives the indicator values.
#
# Returns a table with the columns `Country Code`, year and <value_name>.
# The year columns are selected by name rather than by position, and year is
# converted to integer so that it has the same type as dt$year when the
# tables are merged on (`Country Code`, year).
pivot_indicator <- function(data, value_name) {
  data %>%
    pivot_longer(
      cols = matches("^[0-9]{4}$"),
      names_to = "year",
      values_to = value_name
    ) %>%
    mutate(year = as.integer(year)) %>%
    select(`Country Code`, year, all_of(value_name))
}

# Population
wb_pop <- pivot_indicator(wb_pop, "pop")

# Life expectancy at birth
wb_life <- pivot_indicator(wb_life, "lifeExp")

# GDP per capita
wb_gdpcap <- pivot_indicator(wb_gdpcap, "gdpPercap")

# GDP growth (annual %)
wb_gdpgrowth <- pivot_indicator(wb_gdpgrowth, "gdpGrowth")

Add cols of pop, life expectancy, gdp per cap, and gdp growth to dataframe of gdp

# Fold the tables into one with base R: each step is a full outer merge
# (all = TRUE) on the columns that the two tables have in common
merge_keep_all <- function(left, right) {
  merge(left, right, all = TRUE)
}

df_list <- list(dt, wb_pop, wb_life, wb_gdpcap, wb_gdpgrowth)
full_dt <- Reduce(f = merge_keep_all, x = df_list)
head(full_dt, n = 10)
##    Country Code year Country Name                    Region IncomeGroup gdp
## 1           ABW 1960        Aruba Latin America & Caribbean High income  NA
## 2           ABW 1961        Aruba Latin America & Caribbean High income  NA
## 3           ABW 1962        Aruba Latin America & Caribbean High income  NA
## 4           ABW 1963        Aruba Latin America & Caribbean High income  NA
## 5           ABW 1964        Aruba Latin America & Caribbean High income  NA
## 6           ABW 1965        Aruba Latin America & Caribbean High income  NA
## 7           ABW 1966        Aruba Latin America & Caribbean High income  NA
## 8           ABW 1967        Aruba Latin America & Caribbean High income  NA
## 9           ABW 1968        Aruba Latin America & Caribbean High income  NA
## 10          ABW 1969        Aruba Latin America & Caribbean High income  NA
##      pop lifeExp gdpPercap gdpGrowth
## 1  54608  64.152        NA        NA
## 2  55811  64.537        NA        NA
## 3  56682  64.752        NA        NA
## 4  57475  65.132        NA        NA
## 5  58178  65.294        NA        NA
## 6  58782  65.502        NA        NA
## 7  59291  66.063        NA        NA
## 8  59522  66.439        NA        NA
## 9  59471  66.757        NA        NA
## 10 59330  67.168        NA        NA

Step 2: Plot graph

# Keep only the rows that have a Region value
full_dt <- filter(full_dt, !is.na(Region))

# Bubble chart on a linear x axis: bubble size follows population and
# bubble colour follows the country name; legends are hidden
p1 <- ggplot(
  full_dt,
  aes(x = gdpPercap, y = lifeExp, size = pop, colour = `Country Name`)
) +
  geom_point(show.legend = FALSE, alpha = 0.7) +
  scale_color_viridis_d() +
  scale_size(range = c(2, 12)) +
  labs(x = "GDP per capita", y = "Life expectancy")
p1

# Same chart with a log10-scaled x axis
p2 <- p1 + scale_x_log10()
p2

# Arrange both plots on the same page, one above the other
gridExtra::grid.arrange(p1, p2, ncol = 1, nrow = 2)

Step 3: Plot graph (years from 2020 onwards)

# Keep the years from 2020 onwards. Note that this overwrites full_dt, so
# every later chunk that reads full_dt only sees these years.
full_dt <- filter(full_dt, year >= 2020)

# Bubble chart on a linear x axis: bubble size follows population and
# bubble colour follows the country name; legends are hidden
p1 <- ggplot(
  full_dt,
  aes(x = gdpPercap, y = lifeExp, size = pop, colour = `Country Name`)
) +
  geom_point(show.legend = FALSE, alpha = 0.7) +
  scale_color_viridis_d() +
  scale_size(range = c(2, 12)) +
  labs(x = "GDP per capita", y = "Life expectancy")
p1

# Same chart with a log10-scaled x axis
p2 <- p1 + scale_x_log10()
p2

# Arrange both plots on the same page, one above the other
gridExtra::grid.arrange(p1, p2, ncol = 1, nrow = 2)

Using animation

# Note: the animation is not displayed in its dynamic form when the
# R Markdown document is published on RPubs.

# Turn the static plot p2 into an animation: the title shows the time of the
# current frame and transition_time() animates over the year column.
gif <- p2 +
  labs(title = "Year: {frame_time}") +
  transition_time(year)

gif
## NULL

Using plot_ly()

# Interactive bubble chart: one animation frame per year, marker size by
# population, colour by region, and the country name as hover text.
# The x axis uses a log scale.
fig <- plot_ly(
  full_dt,
  x = ~gdpPercap,
  y = ~lifeExp,
  size = ~pop,
  color = ~Region,
  frame = ~year,
  text = ~`Country Name`,
  hoverinfo = "text",
  type = 'scatter',
  mode = 'markers'
) %>%
  layout(xaxis = list(type = "log"))

fig

References

  1. [Creating Interactive Visualizations in R](https://ladal.edu.au/motion.html#Introduction);
  2. World Bank Open Data.