Motivation

I would like to create a dashboard that ranks the economic strength of all countries based on several important economic indicators. This document shows how to collect that data from public sources and create a ranking algorithm.

Collecting Fundamental Data

The Economist publishes country fundamental data on their website. Here is the code to collect that data.

library(tidyverse)
library(rvest)
library(RSelenium)
library(tidyquant)
library(cluster)
library(ggdendro)
library(directlabels)
library(psych) 


#Get html using a JS phantom browser
url.base <- "http://www.economist.com/indicators"
pJS <- phantom()
Sys.sleep(1) # give the binary a moment
remDr <- remoteDriver(browserName = 'phantomjs')
remDr$open()
remDr$navigate(url.base)
html.raw <- read_html(remDr$getPageSource()[[1]])

#Parse Data into a data frame
vector.text <- html.raw %>% 
  html_nodes("tr") %>%
  html_text() %>%
  str_split("\n") %>% unlist() %>% trimws() %>% .[. != ""]

index.start <- str_which(vector.text, "United States")
vector.text <- vector.text[index.start:length(vector.text)]

data.initial.raw  <- tibble(country             = vector.text,
                            GDP.latest          = lead(vector.text),
                            GDP.quarter         = lead(vector.text,2),
                            GDP.2018            = lead(vector.text,3),
                            GDP.2019            = lead(vector.text,4),
                            IP                  = lead(vector.text,5),
                            CPI.latest          = lead(vector.text,6),
                            CPI.year.ago        = lead(vector.text,7),
                            CPI.2018            = lead(vector.text,8),
                            unemployment.rate   = lead(vector.text,9),
                            trade.balance       = lead(vector.text,10),
                            CAB                 = lead(vector.text,11),
                            CAB.ratio           = lead(vector.text,12),
                            currency.today      = lead(vector.text,13),
                            currency.year.ago   = lead(vector.text,14),
                            budget.balance      = lead(vector.text,15),
                            interest.rate.short = lead(vector.text,16),
                            interest.rate.long  = lead(vector.text,17)) %>%
  filter(str_detect(country, "[0-9]") == FALSE) %>% 
  filter(country != "na") %>%
  filter(str_detect(country, "nil") == FALSE) %>% 
  filter(country != "-")

Data Cleaning

To clean the data, we must convert all entries to numeric and calculate derivative indicators such as the change in currency value. This will give us the dataset required for ranking countries.

convert.to.numeric <- function(x,n) {
  str_sub(x,0,n) %>% as.numeric()
}

data.initial.clean <-
  data.initial.raw %>%
    mutate(
      GDP.latest         = convert.to.numeric(GDP.latest,4),
      IP                 = convert.to.numeric(IP, 4),
      CPI.latest         = convert.to.numeric(CPI.latest, 4),
      CPI.year.ago       = convert.to.numeric(CPI.year.ago, 4),
      unemployment.rate  = convert.to.numeric(unemployment.rate, 3),
      CAB.ratio          = convert.to.numeric(CAB.ratio,4),
      currency.today     = currency.today %>% str_replace(.,",","") %>% as.numeric(),
      currency.year.ago  = currency.year.ago %>% str_replace(.,",","") %>% as.numeric(),
      budget.balance     = convert.to.numeric(budget.balance,4),
      interest.rate.10   = convert.to.numeric(interest.rate.long, 4)
    ) %>%
    select(country, GDP.latest,IP, CPI.latest, CPI.year.ago, unemployment.rate, CAB.ratio, currency.today, currency.year.ago, budget.balance, interest.rate.10) %>%
    filter(country != "+") %>%
    mutate(
      currency.delta     = (currency.today-currency.year.ago)/currency.year.ago,
      CPI.delta          = (CPI.latest - CPI.year.ago)/CPI.year.ago
    ) %>%
    select(-currency.today,-currency.year.ago, -CPI.year.ago)

Visualize Distributions

Next, we visualize the distributions of each indicator in our dataset.

library(tidyverse)
data.initial.clean <- read_csv("data.initial.clean.csv")

data.initial.clean %>%
  filter(currency.delta < 10) %>% # Venezuela is too wierd
  gather(key = "indicator",GDP.latest:CPI.delta, value = "value") %>%
  ggplot(aes(x = value)) +
  geom_histogram(aes(fill = indicator)) +
  facet_wrap(~indicator, scales = "free_x") +
  labs(
    title    = "Distribution of Initital Data",
    y        = "Count",
    x        = "Value"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

Transform the Data

We must transform the data so that later we can calculate z-scores. There are three steps: First, winsorize the data at the 95th percentile to control the effect of outliers on our scoring algorithm. Second, shift the distributions to the right so that all values are greater than zero. Third, take the log of all values.

# Define paramaters for Winsorizing data
winsorize.data <- function(x) {
  winsor(x, trim = 0.05, na.rm = TRUE)
}

# Define function for shifting distributions into positive territory (before applying logs)
get.lower.bound <- function(x) {
  x %>%
    min(., na.rm = TRUE) %>%
    abs() + 1
}

shift.to.the.right <- function(x) {
  x + get.lower.bound(x)
}

data.transformed <- 
  data.initial.clean %>%
    # Winsorize Data
    mutate_if(is.numeric, funs(winsorize.data)) %>%
    # Shift data to the right
    mutate_if(is.numeric, funs(shift.to.the.right)) %>%
    # Take the logarithm of the data
    mutate_if(is.numeric, funs(log)) 

Score Countries

Finally, we score the countries by taking the average of the Z-scores of the economic indicators, taking care to change the sign for economic indicators where a higher value implies a worse economic outcome.

data.z.scores <- 
  data.transformed %>%
    mutate_if(is.numeric, funs(scale)) %>%
    # Invert score for indicators where higher number implies a worse economic outcome
    mutate(
      CPI.latest        = (-1)*CPI.latest,
      unemployment.rate = (-1)*unemployment.rate,
      currency.delta    = (-1)*currency.delta
    ) %>%
    # Calculate final score
    rowwise() %>%
    mutate(
      score.final       = mean(c(GDP.latest, IP, CPI.latest, unemployment.rate, CAB.ratio,budget.balance,interest.rate.10,currency.delta,CPI.delta), na.rm = TRUE)
    ) %>% 
    ungroup() %>%
    arrange(desc(score.final))

data.z.scores %>%
  select(country, score.final) %>%
  left_join(data.initial.clean)

Here is the final ranking of all countries:

Country Final Score GDP IP CPI Unemployment CAB Budget 10Y Yields FX Delta CPI Delta
Thailand 0.77 4.0 2.6 1.1 1.2 10.0 -2.3 2.62 -8.8 175.0
China 0.59 6.8 7.0 1.8 3.9 1.1 -3.5 3.10 -8.0 50.0
Singapore 0.55 4.3 5.9 0.2 2.0 21.0 -0.7 2.63 -4.3 -71.4
Slovenia 0.50 6.0 4.2 1.5 9.1 5.7 0.2 NA -7.8 -16.7
Czech Republic 0.49 5.5 -1.0 1.9 2.2 0.7 0.9 1.89 -12.5 -5.0
Malaysia 0.46 5.9 3.1 1.3 3.3 3.2 -2.8 4.16 -8.6 -73.5
Estonia 0.42 5.0 7.6 2.9 6.8 2.3 -0.3 NA -7.8 -9.4
Switzerland 0.42 1.9 8.7 0.8 2.7 9.7 0.8 0.13 0.0 100.0
Hong Kong 0.40 4.7 0.7 2.6 2.9 4.0 0.8 2.22 0.8 420.0
Luxembourg 0.36 1.7 4.8 1.1 5.7 4.7 0.9 NA -7.8 -42.1
Netherlands 0.36 2.8 3.5 0.9 4.9 9.8 0.8 0.75 -7.8 -43.8
Taiwan 0.36 3.0 3.1 2.0 3.7 13.0 -0.9 1.01 -3.3 1900.0
Germany 0.34 2.3 3.2 1.6 3.4 7.7 1.0 0.64 -7.8 -20.0
Israel 0.27 3.0 6.5 0.4 3.6 3.5 -2.5 1.96 -0.3 -42.9
Iceland 0.26 1.5 NA 2.3 2.3 4.5 1.2 NA 0.0 21.1
Hungary 0.25 4.4 2.0 2.3 3.9 1.9 -2.6 2.98 -5.0 4.5
Philippines 0.25 6.8 13.0 4.5 5.3 -0.2 -1.9 6.03 6.1 40.6
Austria 0.22 2.9 5.1 1.8 5.0 2.4 -0.6 0.61 -7.8 -14.3
Chile 0.17 3.3 8.7 1.9 6.9 -0.6 -2.1 4.45 -5.2 -29.6
Lithuania 0.16 4.0 8.8 2.3 9.2 0.8 0.6 1.15 -7.8 -34.3
Vietnam 0.16 6.8 9.4 2.8 2.3 -0.4 -6.3 4.54 0.4 -34.9
Finland 0.14 2.2 7.0 0.8 8.8 0.9 -0.7 0.79 -7.8 0.0
Poland 0.13 4.4 1.9 1.6 6.6 -0.7 -2.2 3.29 -5.5 -20.0
Latvia 0.11 4.3 0.6 2.0 8.1 -0.2 -1.2 NA -7.8 -41.2
Sweden 0.11 3.3 6.8 1.7 6.5 4.0 0.6 0.76 -1.1 -10.5
India 0.11 7.2 4.4 4.6 5.9 -2.0 -3.5 7.90 4.7 53.3
Euro area 0.05 2.5 3.0 1.2 8.5 3.3 -0.9 0.64 -7.8 -36.8
South Korea 0.05 2.9 -4.3 1.6 4.1 4.7 0.7 2.82 -4.5 -15.8
Japan 0.04 0.9 2.4 1.1 2.5 4.0 -4.9 0.02 -2.7 450.0
Slovakia 0.04 3.6 -2.3 2.9 5.6 -0.6 -1.2 0.76 -7.8 262.5
Canada 0.02 2.9 4.5 2.3 5.8 -2.7 -2.0 2.48 -5.7 43.7
New Zealand 0.02 3.2 2.0 1.1 4.4 -2.7 1.0 2.76 0.7 -47.6
Indonesia 0.01 5.1 1.1 3.4 5.0 -2.1 -2.5 7.04 5.5 -19.0
United States -0.04 2.9 4.3 2.5 3.9 -2.8 -4.6 2.99 NA 13.6
Russia -0.07 0.9 0.9 2.4 5.0 3.4 -0.9 8.13 10.7 -41.5
Belgium -0.09 1.6 0.1 1.5 6.4 0.0 -0.9 0.87 -7.8 -34.8
France -0.09 2.1 1.8 1.6 8.8 -0.8 -2.4 0.79 -7.8 33.3
Italy -0.10 1.4 3.6 0.5 11.0 2.7 -2.0 1.96 -7.8 -73.7
Australia -0.10 2.4 1.6 1.9 5.5 -2.2 -1.2 2.83 -0.7 -9.5
Portugal -0.12 2.0 1.8 0.4 7.9 0.1 -1.0 1.75 -7.8 -80.0
Ireland -0.13 8.4 -9.9 -0.4 5.9 7.4 -0.2 1.05 -7.8 -144.4
Greece -0.14 1.8 1.1 NA 20.0 -1.2 0.2 4.13 -7.8 NA
Denmark -0.15 1.3 -9.8 0.8 4.1 7.8 -0.7 0.68 -7.4 -27.3
Norway -0.26 0.3 -6.7 2.4 3.9 6.3 4.9 1.94 -4.9 9.1
Pakistan -0.27 5.4 5.8 3.7 5.9 -5.8 -5.4 8.50 10.6 -22.9
Britain -0.28 1.2 2.9 2.5 4.2 -3.7 -1.8 1.53 -3.8 8.7
South Africa -0.30 1.5 2.3 3.8 26.0 -2.8 -3.6 8.49 -7.7 -37.7
Spain -0.34 2.9 -3.6 1.1 16.0 1.7 -2.6 1.21 -7.8 -57.7
Mexico -0.36 1.2 -3.7 4.6 3.2 -1.8 -2.3 7.78 5.6 -20.7
Turkey -0.41 7.3 6.8 10.0 10.0 -5.7 -2.8 14.00 26.0 -9.1
Ukraine -0.45 3.1 1.0 13.0 1.2 -4.2 -2.6 NA 0.0 8.3
Colombia -0.45 1.4 -1.4 3.1 9.4 -2.9 -2.0 6.62 0.2 -34.0
Peru -0.57 2.2 0.3 0.5 7.0 -1.7 -3.5 NA 0.3 -86.5
Brazil -0.71 2.1 1.3 2.8 13.0 -1.2 -7.0 8.35 18.7 -31.7
Argentina -0.79 3.9 6.1 25.0 7.2 -5.3 -5.5 6.23 66.7 NA
Venezuela -0.83 -8.8 NA NA 7.3 2.6 -15.0 8.24 699020.0 NA
Saudi Arabia -1.18 -0.7 NA 2.8 6.0 3.7 -7.3 NA 0.0 -500.0
Egypt -1.22 NA 6.2 13.0 10.0 -4.0 -9.8 NA -5.6 -58.1

Conclusion

The above code can be used to collect fundamental economic indicators for countries and then rank those countries based on strength of economic activity.