I would like to create a dashboard that ranks the economic strength of all countries based on several important economic indicators. This document shows how to collect that data from public sources and create a ranking algorithm.
The Economist publishes country fundamental data on their website. Here is the code to collect that data.
library(tidyverse)
library(rvest)
library(RSelenium)
library(tidyquant)
library(cluster)
library(ggdendro)
library(directlabels)
library(psych)
#Get html using a JS phantom browser
url.base <- "http://www.economist.com/indicators"
pJS <- phantom()
Sys.sleep(1) # give the binary a moment
remDr <- remoteDriver(browserName = 'phantomjs')
remDr$open()
remDr$navigate(url.base)
html.raw <- read_html(remDr$getPageSource()[[1]])
#Parse Data into a data frame
vector.text <- html.raw %>%
html_nodes("tr") %>%
html_text() %>%
str_split("\n") %>% unlist() %>% trimws() %>% .[. != ""]
index.start <- str_which(vector.text, "United States")
vector.text <- vector.text[index.start:length(vector.text)]
data.initial.raw <- tibble(country = vector.text,
GDP.latest = lead(vector.text),
GDP.quarter = lead(vector.text,2),
GDP.2018 = lead(vector.text,3),
GDP.2019 = lead(vector.text,4),
IP = lead(vector.text,5),
CPI.latest = lead(vector.text,6),
CPI.year.ago = lead(vector.text,7),
CPI.2018 = lead(vector.text,8),
unemployment.rate = lead(vector.text,9),
trade.balance = lead(vector.text,10),
CAB = lead(vector.text,11),
CAB.ratio = lead(vector.text,12),
currency.today = lead(vector.text,13),
currency.year.ago = lead(vector.text,14),
budget.balance = lead(vector.text,15),
interest.rate.short = lead(vector.text,16),
interest.rate.long = lead(vector.text,17)) %>%
filter(str_detect(country, "[0-9]") == FALSE) %>%
filter(country != "na") %>%
filter(str_detect(country, "nil") == FALSE) %>%
filter(country != "-")
To clean the data, we must convert all entries to numeric and calculate derivative indicators such as the change in currency value. This will give us the dataset required for ranking countries.
convert.to.numeric <- function(x,n) {
str_sub(x,0,n) %>% as.numeric()
}
data.initial.clean <-
data.initial.raw %>%
mutate(
GDP.latest = convert.to.numeric(GDP.latest,4),
IP = convert.to.numeric(IP, 4),
CPI.latest = convert.to.numeric(CPI.latest, 4),
CPI.year.ago = convert.to.numeric(CPI.year.ago, 4),
unemployment.rate = convert.to.numeric(unemployment.rate, 3),
CAB.ratio = convert.to.numeric(CAB.ratio,4),
currency.today = currency.today %>% str_replace(.,",","") %>% as.numeric(),
currency.year.ago = currency.year.ago %>% str_replace(.,",","") %>% as.numeric(),
budget.balance = convert.to.numeric(budget.balance,4),
interest.rate.10 = convert.to.numeric(interest.rate.long, 4)
) %>%
select(country, GDP.latest,IP, CPI.latest, CPI.year.ago, unemployment.rate, CAB.ratio, currency.today, currency.year.ago, budget.balance, interest.rate.10) %>%
filter(country != "+") %>%
mutate(
currency.delta = (currency.today-currency.year.ago)/currency.year.ago,
CPI.delta = (CPI.latest - CPI.year.ago)/CPI.year.ago
) %>%
select(-currency.today,-currency.year.ago, -CPI.year.ago)
Next, we visualize the distributions of each indicator in our dataset.
library(tidyverse)
data.initial.clean <- read_csv("data.initial.clean.csv")
data.initial.clean %>%
filter(currency.delta < 10) %>% # Venezuela is too wierd
gather(key = "indicator",GDP.latest:CPI.delta, value = "value") %>%
ggplot(aes(x = value)) +
geom_histogram(aes(fill = indicator)) +
facet_wrap(~indicator, scales = "free_x") +
labs(
title = "Distribution of Initital Data",
y = "Count",
x = "Value"
) +
theme_minimal() +
theme(legend.position = "none")
We must transform the data so that later we can calculate z-scores. There are three steps: First, winsorize the data at the 95th percentile to control the effect of outliers on our scoring algorithm. Second, shift the distributions to the right so that all values are greater than zero. Third, take the log of all values.
# Define paramaters for Winsorizing data
winsorize.data <- function(x) {
winsor(x, trim = 0.05, na.rm = TRUE)
}
# Define function for shifting distributions into positive territory (before applying logs)
get.lower.bound <- function(x) {
x %>%
min(., na.rm = TRUE) %>%
abs() + 1
}
shift.to.the.right <- function(x) {
x + get.lower.bound(x)
}
data.transformed <-
data.initial.clean %>%
# Winsorize Data
mutate_if(is.numeric, funs(winsorize.data)) %>%
# Shift data to the right
mutate_if(is.numeric, funs(shift.to.the.right)) %>%
# Take the logarithm of the data
mutate_if(is.numeric, funs(log))
Finally, we score the countries by taking the average of the Z-scores of the economic indicators, taking care to change the sign for economic indicators where a higher value implies a worse economic outcome.
data.z.scores <-
data.transformed %>%
mutate_if(is.numeric, funs(scale)) %>%
# Invert score for indicators where higher number implies a worse economic outcome
mutate(
CPI.latest = (-1)*CPI.latest,
unemployment.rate = (-1)*unemployment.rate,
currency.delta = (-1)*currency.delta
) %>%
# Calculate final score
rowwise() %>%
mutate(
score.final = mean(c(GDP.latest, IP, CPI.latest, unemployment.rate, CAB.ratio,budget.balance,interest.rate.10,currency.delta,CPI.delta), na.rm = TRUE)
) %>%
ungroup() %>%
arrange(desc(score.final))
data.z.scores %>%
select(country, score.final) %>%
left_join(data.initial.clean)
Here is the final ranking of all countries:
Country | Final Score | GDP | IP | CPI | Unemployment | CAB | Budget | 10Y Yields | FX Delta | CPI Delta |
---|---|---|---|---|---|---|---|---|---|---|
Thailand | 0.77 | 4.0 | 2.6 | 1.1 | 1.2 | 10.0 | -2.3 | 2.62 | -8.8 | 175.0 |
China | 0.59 | 6.8 | 7.0 | 1.8 | 3.9 | 1.1 | -3.5 | 3.10 | -8.0 | 50.0 |
Singapore | 0.55 | 4.3 | 5.9 | 0.2 | 2.0 | 21.0 | -0.7 | 2.63 | -4.3 | -71.4 |
Slovenia | 0.50 | 6.0 | 4.2 | 1.5 | 9.1 | 5.7 | 0.2 | NA | -7.8 | -16.7 |
Czech Republic | 0.49 | 5.5 | -1.0 | 1.9 | 2.2 | 0.7 | 0.9 | 1.89 | -12.5 | -5.0 |
Malaysia | 0.46 | 5.9 | 3.1 | 1.3 | 3.3 | 3.2 | -2.8 | 4.16 | -8.6 | -73.5 |
Estonia | 0.42 | 5.0 | 7.6 | 2.9 | 6.8 | 2.3 | -0.3 | NA | -7.8 | -9.4 |
Switzerland | 0.42 | 1.9 | 8.7 | 0.8 | 2.7 | 9.7 | 0.8 | 0.13 | 0.0 | 100.0 |
Hong Kong | 0.40 | 4.7 | 0.7 | 2.6 | 2.9 | 4.0 | 0.8 | 2.22 | 0.8 | 420.0 |
Luxembourg | 0.36 | 1.7 | 4.8 | 1.1 | 5.7 | 4.7 | 0.9 | NA | -7.8 | -42.1 |
Netherlands | 0.36 | 2.8 | 3.5 | 0.9 | 4.9 | 9.8 | 0.8 | 0.75 | -7.8 | -43.8 |
Taiwan | 0.36 | 3.0 | 3.1 | 2.0 | 3.7 | 13.0 | -0.9 | 1.01 | -3.3 | 1900.0 |
Germany | 0.34 | 2.3 | 3.2 | 1.6 | 3.4 | 7.7 | 1.0 | 0.64 | -7.8 | -20.0 |
Israel | 0.27 | 3.0 | 6.5 | 0.4 | 3.6 | 3.5 | -2.5 | 1.96 | -0.3 | -42.9 |
Iceland | 0.26 | 1.5 | NA | 2.3 | 2.3 | 4.5 | 1.2 | NA | 0.0 | 21.1 |
Hungary | 0.25 | 4.4 | 2.0 | 2.3 | 3.9 | 1.9 | -2.6 | 2.98 | -5.0 | 4.5 |
Philippines | 0.25 | 6.8 | 13.0 | 4.5 | 5.3 | -0.2 | -1.9 | 6.03 | 6.1 | 40.6 |
Austria | 0.22 | 2.9 | 5.1 | 1.8 | 5.0 | 2.4 | -0.6 | 0.61 | -7.8 | -14.3 |
Chile | 0.17 | 3.3 | 8.7 | 1.9 | 6.9 | -0.6 | -2.1 | 4.45 | -5.2 | -29.6 |
Lithuania | 0.16 | 4.0 | 8.8 | 2.3 | 9.2 | 0.8 | 0.6 | 1.15 | -7.8 | -34.3 |
Vietnam | 0.16 | 6.8 | 9.4 | 2.8 | 2.3 | -0.4 | -6.3 | 4.54 | 0.4 | -34.9 |
Finland | 0.14 | 2.2 | 7.0 | 0.8 | 8.8 | 0.9 | -0.7 | 0.79 | -7.8 | 0.0 |
Poland | 0.13 | 4.4 | 1.9 | 1.6 | 6.6 | -0.7 | -2.2 | 3.29 | -5.5 | -20.0 |
Latvia | 0.11 | 4.3 | 0.6 | 2.0 | 8.1 | -0.2 | -1.2 | NA | -7.8 | -41.2 |
Sweden | 0.11 | 3.3 | 6.8 | 1.7 | 6.5 | 4.0 | 0.6 | 0.76 | -1.1 | -10.5 |
India | 0.11 | 7.2 | 4.4 | 4.6 | 5.9 | -2.0 | -3.5 | 7.90 | 4.7 | 53.3 |
Euro area | 0.05 | 2.5 | 3.0 | 1.2 | 8.5 | 3.3 | -0.9 | 0.64 | -7.8 | -36.8 |
South Korea | 0.05 | 2.9 | -4.3 | 1.6 | 4.1 | 4.7 | 0.7 | 2.82 | -4.5 | -15.8 |
Japan | 0.04 | 0.9 | 2.4 | 1.1 | 2.5 | 4.0 | -4.9 | 0.02 | -2.7 | 450.0 |
Slovakia | 0.04 | 3.6 | -2.3 | 2.9 | 5.6 | -0.6 | -1.2 | 0.76 | -7.8 | 262.5 |
Canada | 0.02 | 2.9 | 4.5 | 2.3 | 5.8 | -2.7 | -2.0 | 2.48 | -5.7 | 43.7 |
New Zealand | 0.02 | 3.2 | 2.0 | 1.1 | 4.4 | -2.7 | 1.0 | 2.76 | 0.7 | -47.6 |
Indonesia | 0.01 | 5.1 | 1.1 | 3.4 | 5.0 | -2.1 | -2.5 | 7.04 | 5.5 | -19.0 |
United States | -0.04 | 2.9 | 4.3 | 2.5 | 3.9 | -2.8 | -4.6 | 2.99 | NA | 13.6 |
Russia | -0.07 | 0.9 | 0.9 | 2.4 | 5.0 | 3.4 | -0.9 | 8.13 | 10.7 | -41.5 |
Belgium | -0.09 | 1.6 | 0.1 | 1.5 | 6.4 | 0.0 | -0.9 | 0.87 | -7.8 | -34.8 |
France | -0.09 | 2.1 | 1.8 | 1.6 | 8.8 | -0.8 | -2.4 | 0.79 | -7.8 | 33.3 |
Italy | -0.10 | 1.4 | 3.6 | 0.5 | 11.0 | 2.7 | -2.0 | 1.96 | -7.8 | -73.7 |
Australia | -0.10 | 2.4 | 1.6 | 1.9 | 5.5 | -2.2 | -1.2 | 2.83 | -0.7 | -9.5 |
Portugal | -0.12 | 2.0 | 1.8 | 0.4 | 7.9 | 0.1 | -1.0 | 1.75 | -7.8 | -80.0 |
Ireland | -0.13 | 8.4 | -9.9 | -0.4 | 5.9 | 7.4 | -0.2 | 1.05 | -7.8 | -144.4 |
Greece | -0.14 | 1.8 | 1.1 | NA | 20.0 | -1.2 | 0.2 | 4.13 | -7.8 | NA |
Denmark | -0.15 | 1.3 | -9.8 | 0.8 | 4.1 | 7.8 | -0.7 | 0.68 | -7.4 | -27.3 |
Norway | -0.26 | 0.3 | -6.7 | 2.4 | 3.9 | 6.3 | 4.9 | 1.94 | -4.9 | 9.1 |
Pakistan | -0.27 | 5.4 | 5.8 | 3.7 | 5.9 | -5.8 | -5.4 | 8.50 | 10.6 | -22.9 |
Britain | -0.28 | 1.2 | 2.9 | 2.5 | 4.2 | -3.7 | -1.8 | 1.53 | -3.8 | 8.7 |
South Africa | -0.30 | 1.5 | 2.3 | 3.8 | 26.0 | -2.8 | -3.6 | 8.49 | -7.7 | -37.7 |
Spain | -0.34 | 2.9 | -3.6 | 1.1 | 16.0 | 1.7 | -2.6 | 1.21 | -7.8 | -57.7 |
Mexico | -0.36 | 1.2 | -3.7 | 4.6 | 3.2 | -1.8 | -2.3 | 7.78 | 5.6 | -20.7 |
Turkey | -0.41 | 7.3 | 6.8 | 10.0 | 10.0 | -5.7 | -2.8 | 14.00 | 26.0 | -9.1 |
Ukraine | -0.45 | 3.1 | 1.0 | 13.0 | 1.2 | -4.2 | -2.6 | NA | 0.0 | 8.3 |
Colombia | -0.45 | 1.4 | -1.4 | 3.1 | 9.4 | -2.9 | -2.0 | 6.62 | 0.2 | -34.0 |
Peru | -0.57 | 2.2 | 0.3 | 0.5 | 7.0 | -1.7 | -3.5 | NA | 0.3 | -86.5 |
Brazil | -0.71 | 2.1 | 1.3 | 2.8 | 13.0 | -1.2 | -7.0 | 8.35 | 18.7 | -31.7 |
Argentina | -0.79 | 3.9 | 6.1 | 25.0 | 7.2 | -5.3 | -5.5 | 6.23 | 66.7 | NA |
Venezuela | -0.83 | -8.8 | NA | NA | 7.3 | 2.6 | -15.0 | 8.24 | 699020.0 | NA |
Saudi Arabia | -1.18 | -0.7 | NA | 2.8 | 6.0 | 3.7 | -7.3 | NA | 0.0 | -500.0 |
Egypt | -1.22 | NA | 6.2 | 13.0 | 10.0 | -4.0 | -9.8 | NA | -5.6 | -58.1 |
The above code can be used to collect fundamental economic indicators for countries and then rank those countries based on strength of economic activity.