Libraries

# Attach every dependency with library(): unlike require(), which only warns
# and returns FALSE, it stops with an error as soon as a package is missing.
# The attach order is kept unchanged so that function masking stays the same.
library(magrittr)
library(tidyverse)
library(corrplot)
library(psych)
library(plotly)
library(htmlwidgets)
library(FactoMineR)
library(factoextra)
library(cluster)

Data & Assignment

Data and assignment were taken from https://www.kaggle.com/datasets/vipulgohel/clustering-pca-assignment

Assignment:

After its recent funding programmes, an international humanitarian NGO has raised around $10 million. Now the CEO of the NGO needs to decide how to use this money strategically and effectively. The significant issues that arise while making this decision mostly relate to choosing the countries in dire need of aid. Our job as data analysts is to classify the countries using the socio-economic and health factors that determine the overall development of nations. After this analysis, we need to suggest the countries that the CEO should focus on and give the highest priority.

# Load the data dictionary (column names with their descriptions) and show it.
data_dictionary <- readr::read_csv(file = "data-dictionary.csv")
data_dictionary
## # A tibble: 10 × 2
##    `Column Name` Description                                                    
##    <chr>         <chr>                                                          
##  1 country       Name of the country                                            
##  2 child_mort    Death of children under 5 years of age per 1000 live births    
##  3 exports       Exports of goods and services. Given as %age of the Total GDP  
##  4 health        Total health spending as %age of Total GDP                     
##  5 imports       Imports of goods and services. Given as %age of the Total GDP  
##  6 Income        Net income per person                                          
##  7 Inflation     The measurement of the annual growth rate of the Total GDP     
##  8 life_expec    The average number of years a new born child would live if the…
##  9 total_fer     The number of children that would be born to each woman if the…
## 10 gdpp          The GDP per capita. Calculated as the Total GDP divided by the…
# Load the country-level indicators and preview the first rows.
country <- readr::read_csv(file = "Country-data.csv")
country %>% head()
## # A tibble: 6 × 10
##   country    child…¹ exports health imports income infla…² life_…³ total…⁴  gdpp
##   <chr>        <dbl>   <dbl>  <dbl>   <dbl>  <dbl>   <dbl>   <dbl>   <dbl> <dbl>
## 1 Afghanist…    90.2    10     7.58    44.9   1610    9.44    56.2    5.82   553
## 2 Albania       16.6    28     6.55    48.6   9930    4.49    76.3    1.65  4090
## 3 Algeria       27.3    38.4   4.17    31.4  12900   16.1     76.5    2.89  4460
## 4 Angola       119      62.3   2.85    42.9   5900   22.4     60.1    6.16  3530
## 5 Antigua a…    10.3    45.5   6.03    58.9  19100    1.44    76.8    2.13 12200
## 6 Argentina     14.5    18.9   8.1     16    18700   20.9     75.8    2.37 10300
## # … with abbreviated variable names ¹​child_mort, ²​inflation, ³​life_expec,
## #   ⁴​total_fer
# Count the missing values in each column.
country %>%
  is.na() %>%
  colSums()
##    country child_mort    exports     health    imports     income  inflation 
##          0          0          0          0          0          0          0 
## life_expec  total_fer       gdpp 
##          0          0          0

There are no missing values. The country-name column has to be dropped, because PCA works only on numeric variables.

# Keep only the numeric indicators by dropping the country-name column.
country1 <- dplyr::select(country, -country)
# Using the country names as row names is left disabled:
# row.names(country1) = country$country
summary(country1)
##    child_mort        exports            health          imports        
##  Min.   :  2.60   Min.   :  0.109   Min.   : 1.810   Min.   :  0.0659  
##  1st Qu.:  8.25   1st Qu.: 23.800   1st Qu.: 4.920   1st Qu.: 30.2000  
##  Median : 19.30   Median : 35.000   Median : 6.320   Median : 43.3000  
##  Mean   : 38.27   Mean   : 41.109   Mean   : 6.816   Mean   : 46.8902  
##  3rd Qu.: 62.10   3rd Qu.: 51.350   3rd Qu.: 8.600   3rd Qu.: 58.7500  
##  Max.   :208.00   Max.   :200.000   Max.   :17.900   Max.   :174.0000  
##      income         inflation         life_expec      total_fer    
##  Min.   :   609   Min.   : -4.210   Min.   :32.10   Min.   :1.150  
##  1st Qu.:  3355   1st Qu.:  1.810   1st Qu.:65.30   1st Qu.:1.795  
##  Median :  9960   Median :  5.390   Median :73.10   Median :2.410  
##  Mean   : 17145   Mean   :  7.782   Mean   :70.56   Mean   :2.948  
##  3rd Qu.: 22800   3rd Qu.: 10.750   3rd Qu.:76.80   3rd Qu.:3.880  
##  Max.   :125000   Max.   :104.000   Max.   :82.80   Max.   :7.490  
##       gdpp       
##  Min.   :   231  
##  1st Qu.:  1330  
##  Median :  4660  
##  Mean   : 12964  
##  3rd Qu.: 14050  
##  Max.   :105000
# Plot the pairwise correlation matrix of the indicators.
corrplot(cor(country1))

Positive correlation is observed between child mortality & total fertility, exports & imports, income & GDP; negative correlation is between child mortality & life expectancy, total fertility & life expectancy. These correlations are quite logical.

PCA

# PCA on the centred and unit-variance-scaled indicators.
# The scaling argument of prcomp() is named `scale.`; it is spelled out in
# full here instead of relying on partial matching of `scale`.
pca <- prcomp(country1, center = TRUE, scale. = TRUE)
pca
## Standard deviations (1, .., p=9):
## [1] 2.0336314 1.2435217 1.0818425 0.9973889 0.8127847 0.4728437 0.3368067
## [8] 0.2971790 0.2586020
## 
## Rotation (n x k) = (9 x 9):
##                   PC1          PC2         PC3          PC4         PC5
## child_mort -0.4195194 -0.192883937  0.02954353 -0.370653262  0.16896968
## exports     0.2838970 -0.613163494 -0.14476069 -0.003091019 -0.05761584
## health      0.1508378  0.243086779  0.59663237 -0.461897497 -0.51800037
## imports     0.1614824 -0.671820644  0.29992674  0.071907461 -0.25537642
## income      0.3984411 -0.022535530 -0.30154750 -0.392159039  0.24714960
## inflation  -0.1931729  0.008404473 -0.64251951 -0.150441762 -0.71486910
## life_expec  0.4258394  0.222706743 -0.11391854  0.203797235 -0.10821980
## total_fer  -0.4037290 -0.155233106 -0.01954925 -0.378303645  0.13526221
## gdpp        0.3926448  0.046022396 -0.12297749 -0.531994575  0.18016662
##                     PC6         PC7         PC8         PC9
## child_mort -0.200628153  0.07948854  0.68274306  0.32754180
## exports     0.059332832  0.70730269  0.01419742 -0.12308207
## health     -0.007276456  0.24983051 -0.07249683  0.11308797
## imports     0.030031537 -0.59218953  0.02894642  0.09903717
## income     -0.160346990 -0.09556237 -0.35262369  0.61298247
## inflation  -0.066285372 -0.10463252  0.01153775 -0.02523614
## life_expec  0.601126516 -0.01848639  0.50466425  0.29403981
## total_fer   0.750688748 -0.02882643 -0.29335267 -0.02633585
## gdpp       -0.016778761 -0.24299776  0.24969636 -0.62564572

I will try to describe only PC1 and PC2: the further a component is from the first principal component, the less variance it explains, so it is not necessary to interpret them all.

PC1 has positive loadings on exports, health, imports, income, life_expec and gdpp, and negative loadings on child_mort, inflation and total_fer. Let’s say PC1 reflects a country’s stability and wealth: it is good for a country to have high exports, imports, income, health spending, life expectancy and GDP, and low child mortality and inflation. As for total fertility, developed countries are known to have low fertility rates, so the negative weight of total_fer is consistent with this interpretation of PC1. PC2 is dominated by exports and imports, which load on it with the same sign, so it is better read as trade openness (the overall volume of trade relative to GDP) than as trade balance.

# Biplot of the first two components: observations in dark grey,
# variable loadings in blue.
biplot(
  pca,
  col = c("darkgrey", "blue"),
  cex = 0.5
)

High correlations between: gdpp & income, total_fer & child_mort.

Choosing number of PCs

# Standard deviation and (cumulative) proportion of variance per component.
pca %>% summary()
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6    PC7
## Standard deviation     2.0336 1.2435 1.0818 0.9974 0.8128 0.47284 0.3368
## Proportion of Variance 0.4595 0.1718 0.1300 0.1105 0.0734 0.02484 0.0126
## Cumulative Proportion  0.4595 0.6313 0.7614 0.8719 0.9453 0.97015 0.9828
##                            PC8     PC9
## Standard deviation     0.29718 0.25860
## Proportion of Variance 0.00981 0.00743
## Cumulative Proportion  0.99257 1.00000

Approaches to choose number of components:

  1. Kaiser criterion: keep the principal components whose eigenvalues (squared standard deviations) are greater than 1 - thus we get 3 PCs

  2. explained variance: keep enough components to explain at least 70-80% of the total variance - with an 80% threshold this approach gives 4 PCs

  3. scree plot - in my opinion, it looks like we need 2 PCs by this method

# Scree plot: variance of each component, drawn as a line.
screeplot(pca, type = "lines")

I decided to stop at 3 PCs

# Keep the scores on the first three principal components for clustering.
pca1 <- data.frame(pca$x[, paste0("PC", 1:3)])
pca1 %>% head()
##           PC1         PC2        PC3
## 1 -2.90428986 -0.09533386  0.7159652
## 2  0.42862224  0.58639208  0.3324855
## 3 -0.28436983  0.45380957 -1.2178421
## 4 -2.92362976 -1.69047094 -1.5204709
## 5  1.03047668 -0.13624894  0.2250441
## 6  0.02234007  1.77385167 -0.8673884

Clustering with k-means

I chose k-means clustering because it’s one of the most common forms of clustering and quite simple and convenient. It makes K-number of clusters in which observations are very similar to each other, and observations in different clusters are very different from each other. Countries can be divided into developing and developed, so I expect 2 clusters.

1 Elbow Method

# Elbow method: total within-cluster sum of squares for each candidate k.
# k-means starts from random centres, so the RNG is seeded to make the
# plot reproducible between runs.
set.seed(123)
fviz_nbclust(pca1, kmeans, method = "wss")

One way to choose the optimum K value is to pick the number of clusters at the “bend of the elbow” - and it looks like the optimum number is 2.

2 Gap Statistic

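Another way to choose the optimum K value is the gap statistic: for each K it compares the observed within-cluster dispersion with the dispersion expected under a reference distribution that has no cluster structure; the larger the gap, the better the clustering.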

# Gap statistic for each candidate k.
# It is estimated from randomly generated reference data sets (and k-means
# itself starts from random centres), so the RNG is seeded to make the
# plot reproducible between runs.
set.seed(123)
fviz_nbclust(pca1, kmeans, method = "gap_stat")

Here, 2 is also the optimum number of clusters.

Clustering

# k-means with 2 clusters on the three retained principal components.
set.seed(123)
# Name the score columns explicitly so that the clustering and the plot never
# pick up the `cluster` label column that is attached to pca1 below.
pc_cols <- c("PC1", "PC2", "PC3")
km <- kmeans(pca1[, pc_cols], centers = 2)
# fviz_cluster() runs its own PCA when `data` has more than two columns, so
# it must receive only the variables that were actually clustered; passing
# pca1 with the label column included would distort the plot.
fviz_cluster(km, data = pca1[, pc_cols], labelsize = 7)
pca1$cluster <- km$cluster

Countries in dire need of aid

# Attach each country's cluster label to the original (unscaled) indicators.
country2 <- country
country2$cluster <- km$cluster
# Per-cluster mean of every indicator. summarise(across()) replaces the
# superseded summarise_all(); the resulting table is the same.
results <- country2 %>%
  dplyr::select(-country) %>%
  group_by(cluster) %>%
  summarise(across(everything(), mean))
# Transposed for reading: one row per indicator, one column per cluster.
t(results)
##                    [,1]        [,2]
## cluster        1.000000    2.000000
## child_mort    10.676667   70.522078
## exports       49.572222   31.216870
## health         7.583889    5.917792
## imports       49.968889   43.291765
## income     27506.666667 5033.285714
## inflation      4.611667   11.487221
## life_expec    76.952222   63.079221
## total_fer      1.886556    4.188571
## gdpp       22029.333333 2368.493506

Countries in cluster 2 have worse indicators: high child mortality rates and fertility rates, higher inflation, lower GDP, income, exports, imports, health spending, and lower life expectancy.

Which countries should be focused on and given the highest priority:

All factors

# Countries that are worse than the worse cluster mean on every indicator:
# above the largest cluster mean where high values are bad, below the
# smallest cluster mean where low values are bad.
country2 %>%
  filter(
    child_mort > max(results$child_mort) &
      exports < min(results$exports) &
      health < min(results$health) &
      imports < min(results$imports) &
      income < min(results$income) &
      inflation > max(results$inflation) &
      life_expec < min(results$life_expec) &
      total_fer > max(results$total_fer) &
      gdpp < min(results$gdpp)
  ) %>%
  dplyr::select(!cluster)
## # A tibble: 1 × 10
##   country child_mort exports health imports income infla…¹ life_…² total…³  gdpp
##   <chr>        <dbl>   <dbl>  <dbl>   <dbl>  <dbl>   <dbl>   <dbl>   <dbl> <dbl>
## 1 Guinea         109    30.3   4.93    43.2   1190    16.1      58    5.34   648
## # … with abbreviated variable names ¹​inflation, ²​life_expec, ³​total_fer

Considering that 10 million is not a particularly large sum, it is quite possible to send all the funds to one country - Guinea.

It is also quite reasonable to divide countries into those with difficulties with socio-economic factors and with health factors. In my opinion, health factors should be a priority.

Socio-economic factors

# Countries that are worse than the worse cluster mean on every
# socio-economic indicator; only those indicators are displayed.
country2 %>%
  filter(
    exports < min(results$exports) &
      imports < min(results$imports) &
      income < min(results$income) &
      inflation > max(results$inflation) &
      gdpp < min(results$gdpp)
  ) %>%
  dplyr::select(country, exports, imports, income, inflation, gdpp)
## # A tibble: 8 × 6
##   country      exports imports income inflation  gdpp
##   <chr>          <dbl>   <dbl>  <dbl>     <dbl> <dbl>
## 1 Burundi         8.92    39.2    764      12.3   231
## 2 Eritrea         4.79    23.3   1420      11.6   482
## 3 Guinea         30.3     43.2   1190      16.1   648
## 4 Malawi         22.8     34.9   1030      12.1   459
## 5 Nepal           9.58    36.4   1990      15.1   592
## 6 Sierra Leone   16.8     34.5   1220      17.2   399
## 7 Sudan          19.7     17.2   3370      19.6  1480
## 8 Yemen          30       34.4   4480      23.6  1310

Health factors

# Countries that are worse than the worse cluster mean on every health
# indicator; only those indicators are displayed.
country2 %>%
  filter(
    child_mort > max(results$child_mort) &
      health < min(results$health) &
      life_expec < min(results$life_expec) &
      total_fer > max(results$total_fer)
  ) %>%
  dplyr::select(country, child_mort, health, life_expec, total_fer)
## # A tibble: 14 × 5
##    country                  child_mort health life_expec total_fer
##    <chr>                         <dbl>  <dbl>      <dbl>     <dbl>
##  1 Angola                        119     2.85       60.1      6.16
##  2 Benin                         111     4.1        61.8      5.36
##  3 Cameroon                      108     5.13       57.3      5.11
##  4 Central African Republic      149     3.98       47.5      5.21
##  5 Chad                          150     4.53       56.5      6.59
##  6 Cote d'Ivoire                 111     5.3        56.3      5.27
##  7 Equatorial Guinea             111     4.48       60.9      5.21
##  8 Ghana                          74.7   5.22       62.2      4.27
##  9 Guinea                        109     4.93       58        5.34
## 10 Mali                          137     4.98       59.5      6.55
## 11 Mozambique                    101     5.21       54.5      5.56
## 12 Niger                         123     5.16       58.8      7.49
## 13 Nigeria                       130     5.07       60.5      5.84
## 14 Zambia                         83.1   5.89       52        5.4