library(magrittr)
library(tidyverse)
library(corrplot)
library(psych)
library(plotly)
library(htmlwidgets)
require(FactoMineR)
require(factoextra)
library(cluster)
Data and assignment were taken from https://www.kaggle.com/datasets/vipulgohel/clustering-pca-assignment
Assignment:
After its recent funding programmes, an international humanitarian NGO has raised around $10 million. The CEO of the NGO now needs to decide how to use this money strategically and effectively. The main difficulty in making this decision is identifying the countries in the direst need of aid. Our job as data analysts is to classify the countries using the socio-economic and health factors that determine the overall development of nations. After this analysis, we need to suggest the countries that the CEO should focus on and give the highest priority.
# Read the data dictionary that describes every column of the dataset,
# then display it.
data_dictionary <- read_csv(file = "data-dictionary.csv")
data_dictionary
## # A tibble: 10 × 2
## `Column Name` Description
## <chr> <chr>
## 1 country Name of the country
## 2 child_mort Death of children under 5 years of age per 1000 live births
## 3 exports Exports of goods and services. Given as %age of the Total GDP
## 4 health Total health spending as %age of Total GDP
## 5 imports Imports of goods and services. Given as %age of the Total GDP
## 6 Income Net income per person
## 7 Inflation The measurement of the annual growth rate of the Total GDP
## 8 life_expec The average number of years a new born child would live if the…
## 9 total_fer The number of children that would be born to each woman if the…
## 10 gdpp The GDP per capita. Calculated as the Total GDP divided by the…
# Read the per-country indicators and preview the first rows.
country <- read_csv(file = "Country-data.csv")
country %>% head()
## # A tibble: 6 × 10
## country child…¹ exports health imports income infla…² life_…³ total…⁴ gdpp
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanist… 90.2 10 7.58 44.9 1610 9.44 56.2 5.82 553
## 2 Albania 16.6 28 6.55 48.6 9930 4.49 76.3 1.65 4090
## 3 Algeria 27.3 38.4 4.17 31.4 12900 16.1 76.5 2.89 4460
## 4 Angola 119 62.3 2.85 42.9 5900 22.4 60.1 6.16 3530
## 5 Antigua a… 10.3 45.5 6.03 58.9 19100 1.44 76.8 2.13 12200
## 6 Argentina 14.5 18.9 8.1 16 18700 20.9 75.8 2.37 10300
## # … with abbreviated variable names ¹child_mort, ²inflation, ³life_expec,
## # ⁴total_fer
# Count the missing values in each column.
country %>%
  is.na() %>%
  colSums()
## country child_mort exports health imports income inflation
## 0 0 0 0 0 0 0
## life_expec total_fer gdpp
## 0 0 0
There are no missing values. Next, I need to drop the country name column, because it is an identifier rather than a numeric indicator.
# Work on a copy of the data without the `country` identifier column.
country1 <- dplyr::select(country, -country)
# Disabled: keeping the country names as row names.
# row.names(country1) = country$country
summary(object = country1)
## child_mort exports health imports
## Min. : 2.60 Min. : 0.109 Min. : 1.810 Min. : 0.0659
## 1st Qu.: 8.25 1st Qu.: 23.800 1st Qu.: 4.920 1st Qu.: 30.2000
## Median : 19.30 Median : 35.000 Median : 6.320 Median : 43.3000
## Mean : 38.27 Mean : 41.109 Mean : 6.816 Mean : 46.8902
## 3rd Qu.: 62.10 3rd Qu.: 51.350 3rd Qu.: 8.600 3rd Qu.: 58.7500
## Max. :208.00 Max. :200.000 Max. :17.900 Max. :174.0000
## income inflation life_expec total_fer
## Min. : 609 Min. : -4.210 Min. :32.10 Min. :1.150
## 1st Qu.: 3355 1st Qu.: 1.810 1st Qu.:65.30 1st Qu.:1.795
## Median : 9960 Median : 5.390 Median :73.10 Median :2.410
## Mean : 17145 Mean : 7.782 Mean :70.56 Mean :2.948
## 3rd Qu.: 22800 3rd Qu.: 10.750 3rd Qu.:76.80 3rd Qu.:3.880
## Max. :125000 Max. :104.000 Max. :82.80 Max. :7.490
## gdpp
## Min. : 231
## 1st Qu.: 1330
## Median : 4660
## Mean : 12964
## 3rd Qu.: 14050
## Max. :105000
# Plot the pairwise correlation matrix of the indicators.
corrplot(cor(country1))
Positive correlation is observed between child mortality & total fertility, exports & imports, income & GDP; negative correlation is between child mortality & life expectancy, total fertility & life expectancy. These correlations are quite logical.
# Principal component analysis of the indicators.
# The variables are centred and scaled to unit variance because their ranges
# differ by orders of magnitude (compare income with total_fer in the summary).
# prcomp()'s scaling argument is named `scale.` (with a trailing dot); it is
# spelled out in full instead of relying on partial matching of `scale`.
pca <- prcomp(country1, center = TRUE, scale. = TRUE)
pca
## Standard deviations (1, .., p=9):
## [1] 2.0336314 1.2435217 1.0818425 0.9973889 0.8127847 0.4728437 0.3368067
## [8] 0.2971790 0.2586020
##
## Rotation (n x k) = (9 x 9):
## PC1 PC2 PC3 PC4 PC5
## child_mort -0.4195194 -0.192883937 0.02954353 -0.370653262 0.16896968
## exports 0.2838970 -0.613163494 -0.14476069 -0.003091019 -0.05761584
## health 0.1508378 0.243086779 0.59663237 -0.461897497 -0.51800037
## imports 0.1614824 -0.671820644 0.29992674 0.071907461 -0.25537642
## income 0.3984411 -0.022535530 -0.30154750 -0.392159039 0.24714960
## inflation -0.1931729 0.008404473 -0.64251951 -0.150441762 -0.71486910
## life_expec 0.4258394 0.222706743 -0.11391854 0.203797235 -0.10821980
## total_fer -0.4037290 -0.155233106 -0.01954925 -0.378303645 0.13526221
## gdpp 0.3926448 0.046022396 -0.12297749 -0.531994575 0.18016662
## PC6 PC7 PC8 PC9
## child_mort -0.200628153 0.07948854 0.68274306 0.32754180
## exports 0.059332832 0.70730269 0.01419742 -0.12308207
## health -0.007276456 0.24983051 -0.07249683 0.11308797
## imports 0.030031537 -0.59218953 0.02894642 0.09903717
## income -0.160346990 -0.09556237 -0.35262369 0.61298247
## inflation -0.066285372 -0.10463252 0.01153775 -0.02523614
## life_expec 0.601126516 -0.01848639 0.50466425 0.29403981
## total_fer 0.750688748 -0.02882643 -0.29335267 -0.02633585
## gdpp -0.016778761 -0.24299776 0.24969636 -0.62564572
I will only interpret PC1 and PC2: the further we move from the first principal component, the less variance each component explains, so it is not necessary to interpret them all.
PC1 has positive loadings for exports, health, imports, income, life_expec and gdpp, and negative loadings for child_mort, inflation and total_fer. Let’s say PC1 represents a country’s stability and wealth: it is good for a country to have high exports, imports, income, health spending, life expectancy and GDP, and low child mortality and inflation. As for total fertility, developed countries are known to have low fertility rates, so the negative loading of total_fer is consistent with this interpretation of PC1. PC2 is dominated by exports and imports, which load with the same sign, so it can be interpreted as trade openness (the overall volume of trade relative to GDP) rather than trade balance.
# Draw the PCA biplot with small labels, using dark grey and blue as the
# two colours.
biplot(x = pca, col = c("darkgrey", "blue"), cex = 0.5)
High correlations between: gdpp & income, total_fer & child_mort.
# Report the standard deviation and the (cumulative) proportion of variance
# of each principal component.
summary(object = pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.0336 1.2435 1.0818 0.9974 0.8128 0.47284 0.3368
## Proportion of Variance 0.4595 0.1718 0.1300 0.1105 0.0734 0.02484 0.0126
## Cumulative Proportion 0.4595 0.6313 0.7614 0.8719 0.9453 0.97015 0.9828
## PC8 PC9
## Standard deviation 0.29718 0.25860
## Proportion of Variance 0.00981 0.00743
## Cumulative Proportion 0.99257 1.00000
Approaches to choose number of components:
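Kaiser criterion: keep as many principal components as there are eigenvalues greater than 1 - this gives 3 PCs (the squared standard deviations of PC1-PC3 are about 4.14, 1.55 and 1.17, while that of PC4 is about 0.99)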
cumulative proportion of explained variance (at least 70-80%) - to retain at least 80% of the information we need 4 PCs (87.2%), since 3 PCs explain only 76.1%
scree plot - in my opinion, it looks like we need 2 PCs by this method
# Scree plot of the principal components, drawn as a line chart.
plot(x = pca, type = "l")
I decided to stop at 3 PCs, which satisfies the eigenvalue criterion and retains about 76% of the variance.
# Keep the scores on the first three principal components as the clustering
# features and preview them.
pca1 <- data.frame(pca$x[, seq_len(3)])
pca1 %>% head()
## PC1 PC2 PC3
## 1 -2.90428986 -0.09533386 0.7159652
## 2 0.42862224 0.58639208 0.3324855
## 3 -0.28436983 0.45380957 -1.2178421
## 4 -2.92362976 -1.69047094 -1.5204709
## 5 1.03047668 -0.13624894 0.2250441
## 6 0.02234007 1.77385167 -0.8673884
I chose k-means clustering because it’s one of the most common forms of clustering and quite simple and convenient. It makes K-number of clusters in which observations are very similar to each other, and observations in different clusters are very different from each other. Countries can be divided into developing and developed, so I expect 2 clusters.
# Elbow method: within-cluster sum of squares for a range of cluster counts.
# kmeans() starts from randomly chosen centres, so the RNG seed is fixed to
# make the curve reproducible between runs.
set.seed(123)
fviz_nbclust(pca1, kmeans, method = "wss")
One way to choose the optimal value of K is the elbow method: pick the number of clusters at the “bend of the elbow” of the within-cluster sum of squares curve - here the optimum looks like 2.
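Another way to choose the optimal value of K is the gap statistic, which compares the within-cluster dispersion for each K with the dispersion expected for data with no cluster structure; a larger gap indicates a better-supported number of clusters.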
# Gap statistic for a range of cluster counts.
# The statistic is estimated from randomly generated reference data sets (and
# kmeans() itself starts from random centres), so the RNG seed is fixed to
# make the plot reproducible between runs.
set.seed(123)
fviz_nbclust(pca1, kmeans, method = "gap_stat")
Here, 2 is also the optimum number of clusters.
# Cluster the countries into two groups with k-means on the PC scores,
# visualise the clusters, and compare the mean indicators of each cluster.
set.seed(123)

# Use only the principal-component scores as features. Without this, the
# `cluster` label stored in pca1 below would be passed to fviz_cluster() as if
# it were an extra variable and would distort the cluster plot.
pc_scores <- pca1[, setdiff(names(pca1), "cluster"), drop = FALSE]

km <- kmeans(pc_scores, centers = 2)
pca1$cluster <- km$cluster
fviz_cluster(km, data = pc_scores, labelsize = 7)

# Attach the cluster labels to the original data (rows are in the same order)
# and compute the per-cluster mean of every indicator.
country2 <- country
country2$cluster <- km$cluster
results <- country2 %>%
  dplyr::select(-country) %>%
  group_by(cluster) %>%
  summarise(across(everything(), mean))
t(results)
## [,1] [,2]
## cluster 1.000000 2.000000
## child_mort 10.676667 70.522078
## exports 49.572222 31.216870
## health 7.583889 5.917792
## imports 49.968889 43.291765
## income 27506.666667 5033.285714
## inflation 4.611667 11.487221
## life_expec 76.952222 63.079221
## total_fer 1.886556 4.188571
## gdpp 22029.333333 2368.493506
Countries in cluster 2 have worse indicators: high child mortality rates and fertility rates, higher inflation, lower GDP, income, exports, imports, health spending, and lower life expectancy.
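Which countries should we focus on and give the highest priority? Here I select the countries that are worse than the less favourable of the two cluster means on every indicator: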
# Countries that lie beyond the less favourable cluster mean on every
# indicator at once.
country2 %>%
  filter(
    child_mort > max(results[["child_mort"]]) &
      exports < min(results[["exports"]]) &
      health < min(results[["health"]]) &
      imports < min(results[["imports"]]) &
      income < min(results[["income"]]) &
      inflation > max(results[["inflation"]]) &
      life_expec < min(results[["life_expec"]]) &
      total_fer > max(results[["total_fer"]]) &
      gdpp < min(results[["gdpp"]])
  ) %>%
  dplyr::select(-cluster)
## # A tibble: 1 × 10
## country child_mort exports health imports income infla…¹ life_…² total…³ gdpp
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Guinea 109 30.3 4.93 43.2 1190 16.1 58 5.34 648
## # … with abbreviated variable names ¹inflation, ²life_expec, ³total_fer
Considering that 10 million is not a particularly large sum, it is quite possible to send all the funds to one country - Guinea.
It is also quite reasonable to divide countries into those with difficulties with socio-economic factors and with health factors. In my opinion, health factors should be a priority.
# Countries that lie beyond the less favourable cluster mean on every
# socio-economic indicator; the health-related columns are dropped from
# the output.
country2 %>%
  filter(
    exports < min(results[["exports"]]) &
      imports < min(results[["imports"]]) &
      income < min(results[["income"]]) &
      inflation > max(results[["inflation"]]) &
      gdpp < min(results[["gdpp"]])
  ) %>%
  dplyr::select(
    -child_mort,
    -health,
    -life_expec,
    -total_fer,
    -cluster
  )
## # A tibble: 8 × 6
## country exports imports income inflation gdpp
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Burundi 8.92 39.2 764 12.3 231
## 2 Eritrea 4.79 23.3 1420 11.6 482
## 3 Guinea 30.3 43.2 1190 16.1 648
## 4 Malawi 22.8 34.9 1030 12.1 459
## 5 Nepal 9.58 36.4 1990 15.1 592
## 6 Sierra Leone 16.8 34.5 1220 17.2 399
## 7 Sudan 19.7 17.2 3370 19.6 1480
## 8 Yemen 30 34.4 4480 23.6 1310
# Countries that lie beyond the less favourable cluster mean on every
# health-related indicator; only the country name and those indicators
# are shown.
country2 %>%
  filter(
    child_mort > max(results[["child_mort"]]) &
      health < min(results[["health"]]) &
      life_expec < min(results[["life_expec"]]) &
      total_fer > max(results[["total_fer"]])
  ) %>%
  dplyr::select(
    country,
    child_mort,
    health,
    life_expec,
    total_fer
  )
## # A tibble: 14 × 5
## country child_mort health life_expec total_fer
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Angola 119 2.85 60.1 6.16
## 2 Benin 111 4.1 61.8 5.36
## 3 Cameroon 108 5.13 57.3 5.11
## 4 Central African Republic 149 3.98 47.5 5.21
## 5 Chad 150 4.53 56.5 6.59
## 6 Cote d'Ivoire 111 5.3 56.3 5.27
## 7 Equatorial Guinea 111 4.48 60.9 5.21
## 8 Ghana 74.7 5.22 62.2 4.27
## 9 Guinea 109 4.93 58 5.34
## 10 Mali 137 4.98 59.5 6.55
## 11 Mozambique 101 5.21 54.5 5.56
## 12 Niger 123 5.16 58.8 7.49
## 13 Nigeria 130 5.07 60.5 5.84
## 14 Zambia 83.1 5.89 52 5.4