Data source: The World Bank. For computation, the following indicators were used:
# Load the raw economic indicator tables. skip = 4 makes read_csv() ignore
# the first four lines of each file before it starts parsing.
# NOTE(review): presumably those lines are the export preamble of the
# World Bank files - confirm against the CSVs.
fdi <- read_csv(file = "./data/csv/fdi.csv", skip = 4)
cpi <- read_csv(file = "./data/csv/cpi.csv", skip = 4)
gdp <- read_csv(file = "./data/csv/gdp.csv", skip = 4)
unemployment <- read_csv(file = "./data/csv/unemployment.csv", skip = 4)
Here I created a custom function to get rid of all unnecessary data
and keep only the most recent value of each indicator. The function searches
for the first non-empty field from the ‘tail’ side. For countries with no
value specified for any year, it picks up an improper string (the indicator
code) from the column preceding the indicator values. This string is changed
to NA (for clarity). The function returns a table with the columns: country
name, country code, and the indicator value.
# Reduce a wide indicator table to one row per country that holds the last
# non-missing entry of that country in the melted (long) table.
#
# df             - original data frame; must contain the columns
#                  "Country Name" and "Country Code"
# indicatorAbbrv - string, abbreviated name of the economic indicator, used
#                  as the header of the value column, e.g. "GDP", "CPI"
# nullableStr    - string that should be replaced with NA before the values
#                  are converted to numeric
#
# Returns a table with the columns CountryName, CountryCode and
# <indicatorAbbrv>; the last column is numeric.
lastValAndFormatting <- function(df, indicatorAbbrv, nullableStr) {
  id_cols <- c("Country Name", "Country Code")

  # Long format: every non-id column becomes a (variable, value) row.
  long_dt <- melt(as.data.table(df, keep.rownames = TRUE), id = id_cols)
  long_dt <- na.omit(long_dt)

  # value[.N] is the last remaining entry within each country group.
  last_dt <- long_dt[, value[.N], by = id_cols]
  names(last_dt) <- c("CountryName", "CountryCode", "IndicatorValue")

  # Blank out the placeholder string first, then convert the rest to numbers.
  last_dt <- replace_with_na(
    last_dt,
    replace = list(IndicatorValue = nullableStr)
  )
  last_dt$IndicatorValue <- as.numeric(last_dt$IndicatorValue)

  names(last_dt) <- c("CountryName", "CountryCode", indicatorAbbrv)
  last_dt
}
# Most recent value of every indicator, one table per indicator. The third
# argument is the string that the helper replaces with NA.
fdi_last <- lastValAndFormatting(fdi, "FDI", "BN.KLT.DINV.CD")
cpi_last <- lastValAndFormatting(cpi, "CPI", "FP.CPI.TOTL")
gdp_last <- lastValAndFormatting(gdp, "GDP", "NY.GDP.MKTP.CD")
ur_last <- lastValAndFormatting(unemployment, "UR", "SL.UEM.TOTL.ZS")

# Combine the indicators by country key instead of by row position, so that a
# country missing from (or ordered differently in) one file cannot shift the
# values of another country. sort = FALSE keeps the row order of gdp_last.
# Resulting columns: CountryName, CountryCode, GDP, CPI, FDI, UR.
country_keys <- c("CountryName", "CountryCode")
econ_data <- Reduce(
  function(x, y) merge(x, y, by = country_keys, sort = FALSE),
  list(gdp_last, cpi_last, fdi_last, ur_last)
)

# Keep only the countries that have a value for all four indicators.
econ_data <- na.omit(econ_data)
From the scree plots produced with the fviz_nbclust() function, I can conclude that the optimal number of clusters is 5, as computed by the elbow method (also close to the result computed by the average silhouette width method).
# Scale the four indicator columns once so that both diagnostics are computed
# on identical input.
scaled_indicators <- scale(econ_data[, 3:6])

# Elbow method. kmeans() starts from randomly chosen centres, so the seed is
# fixed to get the same curve on every run.
set.seed(1234)
fviz_nbclust(scaled_indicators, kmeans, method = "wss") +
  geom_vline(xintercept = 5, linetype = 2)

# Silhouette method (seeded for the same reason).
set.seed(1234)
fviz_nbclust(scaled_indicators, kmeans, method = "silhouette")
Seed is set to make the result reproducible.
# Fix the RNG state first: kmeans() picks its starting centres at random.
set.seed(1234)
cluster_input <- scale(econ_data[, 3:6])
kmeans_k5 <- kmeans(cluster_input, centers = 5, nstart = 100)

# plot the clusters
fviz_cluster(
  kmeans_k5,
  data = cluster_input,
  geom = "point",
  ellipse.type = "euclid"
)
Below I check how many countries fall into each cluster and what the mean values of the economic indicators are within each cluster.
# Number of countries assigned to each cluster.
econ_data %>% count(kmeans_k5$cluster)
## kmeans_k5$cluster n
## 1: 1 2
## 2: 2 6
## 3: 3 2
## 4: 4 128
## 5: 5 37
# Mean of every indicator within each cluster. The function is handed to
# summarize_all() directly because the funs() wrapper is deprecated in dplyr.
econ_data[, 3:6] %>%
  group_by(kmeans_k5$cluster) %>%
  summarize_all(mean)
## # A tibble: 5 × 5
## `kmeans_k5$cluster` GDP CPI FDI UR
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 2.32e10 19408. -2.33e 8 16.9
## 2 2 2.32e12 116. 9.92e10 4.46
## 3 3 2.05e13 127. -1.16e11 5.14
## 4 4 2.64e11 238. -1.75e 9 5.81
## 5 5 1.55e11 164. -3.17e 9 18.1
# Polygons of countries: http://thematicmapping.org/downloads/TM_WORLD_BORDERS_SIMPL-0.3.zip
# Read the shapefile layer that holds the country borders.
world_polygon <- readOGR(
  dsn = "./data/map_polygons/TM_WORLD_BORDERS_SIMPL-0.3",
  layer = "TM_WORLD_BORDERS_SIMPL-0.3",
  verbose = FALSE
)

# Centre of the initial map view, stored as c(longitude, latitude).
world_coordinates <- c(38.02213230455896, 18.457031704152197)

# Attach the clustering result to the polygons: ISO3 on the map side is
# matched with CountryCode on the data side.
# NOTE(review): econ_data_clustered is not created in the code shown here -
# confirm that it is defined before this point.
merged_data <- merge(
  x = world_polygon,
  y = econ_data_clustered,
  by.x = "ISO3",
  by.y = "CountryCode"
)

# Store the cluster column as plain numbers.
merged_data@data$cluster <- as.numeric(merged_data@data$cluster)
# Labels for pop-ups: one HTML snippet per polygon with the country name and
# its cluster number.
labels <- lapply(
  paste0(
    "<b> Country: </b>", merged_data@data$NAME, "<br>",
    "<b> Cluster: </b>", merged_data@data$cluster, "<br>"
  ),
  htmltools::HTML
)

# Palette that maps each cluster value to one of five heat colours.
factpal <- colorFactor(
  palette = heat.colors(5),
  domain = merged_data@data$cluster
)
# Choropleth map built with Leaflet: polygons are coloured by cluster and carry
# the prepared labels; the legend uses the same palette.
leaflet(data = merged_data) %>%
  addProviderTiles(provider = "OpenStreetMap.Mapnik") %>%
  setView(lng = world_coordinates[1], lat = world_coordinates[2], zoom = 3) %>%
  addPolygons(
    label = labels,
    color = ~factpal(cluster),
    fillOpacity = 0.5,
    stroke = FALSE
  ) %>%
  addLegend(position = "bottomleft", pal = factpal, values = ~cluster)