HELP International has been able to raise around $10 million. Now the CEO of the NGO needs to decide how to use this money strategically and effectively. To do so, the CEO has to choose the countries that are in the direst need of aid. Hence, your job as a data scientist is to categorise the countries using socio-economic and health factors that determine the overall development of a country. Then you need to suggest the countries on which the CEO should focus the most.
To categorize the countries using socio-economic and health factors that determine the overall development of the country.
HELP International is an international humanitarian NGO that is committed to fighting poverty and providing the people of backward countries with basic amenities and relief during the time of disasters and natural calamities.
# Load the country-level dataset from the project's data folder.
country <- read.csv(file = "data/Country-data.csv")
# Preview the leading rows (head() shows the first 6 by default).
head(x = country)
# Inspect column names and types.
str(object = country)
## 'data.frame': 167 obs. of 10 variables:
## $ country : chr "Afghanistan" "Albania" "Algeria" "Angola" ...
## $ child_mort: num 90.2 16.6 27.3 119 10.3 14.5 18.1 4.8 4.3 39.2 ...
## $ exports : num 10 28 38.4 62.3 45.5 18.9 20.8 19.8 51.3 54.3 ...
## $ health : num 7.58 6.55 4.17 2.85 6.03 8.1 4.4 8.73 11 5.88 ...
## $ imports : num 44.9 48.6 31.4 42.9 58.9 16 45.3 20.9 47.8 20.7 ...
## $ income : int 1610 9930 12900 5900 19100 18700 6700 41400 43200 16000 ...
## $ inflation : num 9.44 4.49 16.1 22.4 1.44 20.9 7.77 1.16 0.873 13.8 ...
## $ life_expec: num 56.2 76.3 76.5 60.1 76.8 75.8 73.3 82 80.5 69.1 ...
## $ total_fer : num 5.82 1.65 2.89 6.16 2.13 2.37 1.69 1.93 1.44 1.92 ...
## $ gdpp : int 553 4090 4460 3530 12200 10300 3220 51900 46900 5840 ...
# Move the `country` name column into the row names so that only the
# numeric indicators remain as columns.
country <- tibble::column_to_rownames(country, var = "country")
# Count missing values per column.
colSums(is.na(country))
## child_mort exports health imports income inflation life_expec
## 0 0 0 0 0 0 0
## total_fer gdpp
## 0 0
There are no missing values in this data, so we can skip the imputation process.
Every variable is still skewed, and many variables appear to have outliers. For this data, I’ll keep the outliers.
# Correlation plot of all indicators (6 colour breaks, coefficients shown as
# labels).
GGally::ggcorr(country, nbreaks = 6, label = TRUE)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
Consider dropping income, life_expec and total_fer, as these variables are strongly correlated with the main variables that we need to explore.
# Remove the three indicators judged redundant; six columns remain.
country_clean <- select(country, -income, -life_expec, -total_fer)
glimpse(country_clean)
## Rows: 167
## Columns: 6
## $ child_mort <dbl> 90.2, 16.6, 27.3, 119.0, 10.3, 14.5, 18.1, 4.8, 4.3, 39.2, ~
## $ exports <dbl> 10.0, 28.0, 38.4, 62.3, 45.5, 18.9, 20.8, 19.8, 51.3, 54.3,~
## $ health <dbl> 7.58, 6.55, 4.17, 2.85, 6.03, 8.10, 4.40, 8.73, 11.00, 5.88~
## $ imports <dbl> 44.9, 48.6, 31.4, 42.9, 58.9, 16.0, 45.3, 20.9, 47.8, 20.7,~
## $ inflation <dbl> 9.440, 4.490, 16.100, 22.400, 1.440, 20.900, 7.770, 1.160, ~
## $ gdpp <int> 553, 4090, 4460, 3530, 12200, 10300, 3220, 51900, 46900, 58~
The variables are measured in many different units and ranges, which makes the interpretation and analysis of the raw data ambiguous. Scaling the data is the best option to bring all of the values onto one homogeneous scale.
library(scales)
#' Append rescaled copies of selected columns
#'
#' For every column position in `column.nos`, adds a new column named
#' `<original name>.scl` that holds `rescale()` (scales package) of that
#' column. The original columns are left in place. Prints a one-line status
#' message with the number of rescaled variables.
#'
#' @param dat A data frame (or tibble).
#' @param column.nos Vector of column positions to rescale.
#' @return `dat` with one extra `.scl` column per requested position.
rscl <- function(dat, column.nos) {
  nms <- names(dat)
  for (col in column.nos) {
    # `[[` always extracts a plain vector, also when `dat` is a tibble
    # (where `dat[, col]` would stay a one-column tibble).
    dat[[paste0(nms[col], ".scl")]] <- rescale(dat[[col]])
  }
  cat("Rescaled", length(column.nos), "variable(s)\n")
  dat
}
# Rescale all six indicator columns; `.scl` copies are appended.
country_scale <- rscl(country_clean, 1:6)
## Rescaled 6 variable(s)n
# Drop the six raw columns so that only the rescaled `.scl` ones remain.
country_scale <- select(country_scale, -(1:6))
# Statistical summary of the scaled data
summary(country_scale)
## child_mort.scl exports.scl health.scl imports.scl
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.02751 1st Qu.:0.1185 1st Qu.:0.1933 1st Qu.:0.1733
## Median :0.08130 Median :0.1746 Median :0.2803 Median :0.2486
## Mean :0.17366 Mean :0.2051 Mean :0.3111 Mean :0.2692
## 3rd Qu.:0.28968 3rd Qu.:0.2563 3rd Qu.:0.4220 3rd Qu.:0.3374
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## inflation.scl gdpp.scl
## Min. :0.00000 Min. :0.00000
## 1st Qu.:0.05563 1st Qu.:0.01049
## Median :0.08872 Median :0.04227
## Mean :0.11082 Mean :0.12154
## 3rd Qu.:0.13825 3rd Qu.:0.13190
## Max. :1.00000 Max. :1.00000
# Elbow plot ("wss" method) on the scaled data, for up to 10 clusters.
factoextra::fviz_nbclust(
  x = country_scale,
  FUNcluster = kmeans,
  method = "wss",
  k.max = 10
)
silhouette
# Silhouette method, run on the same rescaled data that kmeans() is fitted on,
# so that the suggested number of clusters applies to the final model.
factoextra::fviz_nbclust(
  country_scale,
  kmeans,
  method = "silhouette",
  k.max = 10
) +
  labs(subtitle = "Silhouette method")
gap stat
# Gap statistic method, run on the same rescaled data that kmeans() is fitted
# on, so that the suggested number of clusters applies to the final model.
factoextra::fviz_nbclust(
  country_scale,
  kmeans,
  method = "gap_stat",
  k.max = 10
) +
  labs(subtitle = "Gap Statistic method")
# Fix the RNG seed before fitting so the 3-cluster result can be reproduced.
set.seed(156)
km <- kmeans(x = country_scale, centers = 3)
# Print the fitted model.
km
## K-means clustering with 3 clusters of sizes 92, 28, 47
##
## Cluster means:
## child_mort.scl exports.scl health.scl imports.scl inflation.scl gdpp.scl
## 1 0.087332247 0.2215866 0.2784271 0.2801684 0.10880926 0.07926797
## 2 0.008798164 0.2837882 0.4943843 0.2945604 0.05346086 0.44469670
## 3 0.440862665 0.1259916 0.2658847 0.2326472 0.14892732 0.01175047
##
## Clustering vector:
## Afghanistan Albania
## 3 1
## Algeria Angola
## 1 3
## Antigua and Barbuda Argentina
## 1 1
## Armenia Australia
## 1 2
## Austria Azerbaijan
## 2 1
## Bahamas Bahrain
## 1 1
## Bangladesh Barbados
## 3 1
## Belarus Belgium
## 1 2
## Belize Benin
## 1 3
## Bhutan Bolivia
## 1 1
## Bosnia and Herzegovina Botswana
## 1 1
## Brazil Brunei
## 1 1
## Bulgaria Burkina Faso
## 1 3
## Burundi Cambodia
## 3 1
## Cameroon Canada
## 3 2
## Cape Verde Central African Republic
## 1 3
## Chad Chile
## 3 1
## China Colombia
## 1 1
## Comoros Congo, Dem. Rep.
## 3 3
## Congo, Rep. Costa Rica
## 1 1
## Cote d'Ivoire Croatia
## 3 1
## Cyprus Czech Republic
## 1 1
## Denmark Dominican Republic
## 2 1
## Ecuador Egypt
## 1 1
## El Salvador Equatorial Guinea
## 1 3
## Eritrea Estonia
## 3 1
## Fiji Finland
## 1 2
## France Gabon
## 2 3
## Gambia Georgia
## 3 1
## Germany Ghana
## 2 3
## Greece Grenada
## 2 1
## Guatemala Guinea
## 1 3
## Guinea-Bissau Guyana
## 3 1
## Haiti Hungary
## 3 1
## Iceland India
## 2 3
## Indonesia Iran
## 1 1
## Iraq Ireland
## 1 2
## Israel Italy
## 2 2
## Jamaica Japan
## 1 2
## Jordan Kazakhstan
## 1 1
## Kenya Kiribati
## 3 3
## Kuwait Kyrgyz Republic
## 1 1
## Lao Latvia
## 3 1
## Lebanon Lesotho
## 1 3
## Liberia Libya
## 3 1
## Lithuania Luxembourg
## 1 2
## Macedonia, FYR Madagascar
## 1 3
## Malawi Malaysia
## 3 1
## Maldives Mali
## 1 3
## Malta Mauritania
## 2 3
## Mauritius Micronesia, Fed. Sts.
## 1 1
## Moldova Mongolia
## 1 1
## Montenegro Morocco
## 1 1
## Mozambique Myanmar
## 3 3
## Namibia Nepal
## 1 3
## Netherlands New Zealand
## 2 2
## Niger Nigeria
## 3 3
## Norway Oman
## 2 1
## Pakistan Panama
## 3 1
## Paraguay Peru
## 1 1
## Philippines Poland
## 1 1
## Portugal Qatar
## 2 2
## Romania Russia
## 1 1
## Rwanda Samoa
## 3 1
## Saudi Arabia Senegal
## 1 3
## Serbia Seychelles
## 1 1
## Sierra Leone Singapore
## 3 2
## Slovak Republic Slovenia
## 1 2
## Solomon Islands South Africa
## 1 1
## South Korea Spain
## 1 2
## Sri Lanka St. Vincent and the Grenadines
## 1 1
## Sudan Suriname
## 3 1
## Sweden Switzerland
## 2 2
## Tajikistan Tanzania
## 3 3
## Thailand Timor-Leste
## 1 3
## Togo Tonga
## 3 1
## Tunisia Turkey
## 1 1
## Turkmenistan Uganda
## 1 3
## Ukraine United Arab Emirates
## 1 1
## United Kingdom United States
## 2 2
## Uruguay Uzbekistan
## 1 1
## Vanuatu Venezuela
## 1 1
## Vietnam Yemen
## 1 3
## Zambia
## 3
##
## Within cluster sum of squares by cluster:
## [1] 4.951952 4.689040 4.261413
## (between_SS / total_SS = 42.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Attach each country's k-means label (as a factor) to the scaled indicators
# and move the label to the first column.
country_clust <- bind_cols(
  na.omit(country_scale),
  cluster = as.factor(km$cluster)
) %>%
  select(cluster, everything())
country_clust
# Scatter plot of scaled health spending against scaled child mortality,
# coloured by cluster, with a hull drawn around each cluster's points.
ggplot(
  data = country_clust,
  mapping = aes(x = health.scl, y = child_mort.scl, colour = cluster)
) +
  geom_point(alpha = 0.5) +
  geom_mark_hull() +
  scale_color_brewer(palette = "Set1") +
  theme_minimal() +
  theme(legend.position = "right")
# Per-cluster mean of every scaled indicator, rounded to 2 decimals, with the
# cluster label moved into the row names. across() replaces the superseded
# summarise_if()/mutate_if() scoped verbs.
country_clust %>%
  group_by(cluster) %>%
  summarise(
    across(where(is.numeric), function(x) round(mean(x), digits = 2))
  ) %>%
  tibble::column_to_rownames("cluster")
Conclusion:
- highest child mortality rate is in cluster 3
- highest export rate is in cluster 2
- highest health spending rate is in cluster 2
- highest imports rate is in cluster 2
- highest inflation rate is in cluster 3
- highest GDP is in cluster 2
The countries in cluster 3 seem to have serious human-development problems: they face the highest child mortality and inflation rates, as well as the lowest health spending and GDP of the three clusters.
Let’s see the countries that are categorized into cluster 3:
# Rows of the original (unscaled) data for countries labelled cluster 3.
country[country_clust$cluster == "3", ]
The countries categorized into cluster 3 are dominated by African and Asian countries.
Reducing dimensions
# PCA on the standardised indicators, keeping 5 components. `gdpp`
# (column 6) is continuous, so it is declared as a supplementary
# *quantitative* variable; with `quali.sup` each distinct GDP value is
# reported as its own category.
country_pca <- PCA(
  country_clean,
  scale.unit = TRUE,
  ncp = 5,
  quanti.sup = 6,
  graph = FALSE
)
summary(country_pca)
##
## Call:
## PCA(X = country_clean, scale.unit = T, ncp = 5, quali.sup = 6,
## graph = F)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Variance 1.997 1.311 0.808 0.703 0.180
## % of var. 39.946 26.227 16.170 14.051 3.605
## Cumulative % of var. 39.946 66.174 82.343 96.395 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr cos2
## Afghanistan | 1.753 | -1.236 0.458 0.497 | -0.193 0.017 0.012 |
## Albania | 0.795 | 0.076 0.002 0.009 | -0.433 0.086 0.297 |
## Algeria | 1.433 | -0.771 0.178 0.289 | 0.696 0.221 0.236 |
## Angola | 2.946 | -1.188 0.423 0.163 | 2.513 2.884 0.728 |
## Antigua and Barbuda | 1.096 | 0.835 0.209 0.580 | -0.110 0.006 0.010 |
## Argentina | 2.102 | -1.365 0.558 0.422 | -0.606 0.168 0.083 |
## Armenia | 1.259 | -0.414 0.051 0.108 | 0.092 0.004 0.005 |
## Australia | 1.829 | -0.410 0.050 0.050 | -1.641 1.229 0.805 |
## Austria | 1.902 | 1.064 0.339 0.313 | -1.412 0.911 0.551 |
## Azerbaijan | 1.362 | -0.629 0.119 0.213 | 0.396 0.072 0.085 |
## Dim.3 ctr cos2
## Afghanistan 1.163 1.002 0.440 |
## Albania -0.375 0.104 0.223 |
## Algeria -0.982 0.714 0.470 |
## Angola 0.681 0.343 0.053 |
## Antigua and Barbuda -0.418 0.129 0.145 |
## Argentina -0.865 0.554 0.169 |
## Armenia -0.765 0.433 0.369 |
## Australia -0.673 0.336 0.136 |
## Austria 0.069 0.004 0.001 |
## Azerbaijan -0.668 0.331 0.241 |
##
## Variables
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## child_mort | -0.563 15.868 0.317 | 0.364 10.125 0.133 | 0.709
## exports | 0.821 33.780 0.675 | 0.475 17.187 0.225 | -0.057
## health | 0.230 2.658 0.053 | -0.751 42.982 0.564 | 0.397
## imports | 0.829 34.397 0.687 | 0.319 7.770 0.102 | 0.368
## inflation | -0.515 13.298 0.266 | 0.536 21.936 0.288 | -0.096
## ctr cos2
## child_mort 62.235 0.503 |
## exports 0.400 0.003 |
## health 19.449 0.157 |
## imports 16.782 0.136 |
## inflation 1.134 0.009 |
##
## Supplementary categories (the 10 first)
## Dist Dim.1 cos2 v.test Dim.2 cos2 v.test
## 231 | 2.573 | -1.291 0.252 -0.914 | -1.084 0.177 -0.946 |
## 327 | 3.034 | 0.514 0.029 0.364 | -0.698 0.053 -0.610 |
## 334 | 2.331 | -1.090 0.218 -0.771 | 0.963 0.171 0.841 |
## 348 | 2.354 | -1.105 0.221 -0.782 | 0.573 0.059 0.500 |
## 399 | 4.035 | -1.976 0.240 -1.398 | -0.634 0.025 -0.554 |
## 413 | 1.405 | -0.890 0.402 -0.630 | 0.674 0.230 0.589 |
## 419 | 1.704 | -0.933 0.300 -0.660 | 0.721 0.179 0.629 |
## 446 | 3.289 | -2.185 0.441 -1.546 | 0.619 0.035 0.540 |
## 459 | 1.599 | -1.361 0.724 -0.963 | 0.243 0.023 0.212 |
## 482 | 2.309 | -1.893 0.672 -1.339 | 0.475 0.042 0.415 |
## Dim.3 cos2 v.test
## 231 1.754 0.465 1.951 |
## 327 2.654 0.765 2.952 |
## 334 1.616 0.480 1.797 |
## 348 1.530 0.423 1.701 |
## 399 3.151 0.610 3.505 |
## 413 -0.060 0.002 -0.067 |
## 419 0.984 0.334 1.095 |
## 446 1.496 0.207 1.664 |
## 459 0.784 0.240 0.872 |
## 482 -0.692 0.090 -0.770 |
# Scree plot of the first 5 dimensions with the bars labelled.
factoextra::fviz_eig(
  country_pca,
  main = "Variance explained by each dimensions",
  addlabels = TRUE,
  ncp = 5
)
getting the first 3 dimensions
# Coordinates of every country on the first 3 principal dimensions, with the
# k-means cluster label placed in the first column.
country_pca_new <- country_pca$ind$coord[, 1:3] %>%
  data.frame() %>%
  bind_cols(cluster = as.factor(km$cluster)) %>%
  select(cluster, everything())
country_pca_new
graph
# Variable plot of the PCA: the 5 top-contributing variables, coloured by
# their contribution on a three-colour gradient, with non-overlapping labels.
factoextra::fviz_pca_var(
  country_pca,
  col.var = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  select.var = list(contrib = 5),
  repel = TRUE
)
GRAPH
# Cluster plot of the fitted k-means model with point labels hidden.
# NOTE(review): `km` was fitted on `country_scale` but the plot is given
# `country_clean`; confirm this is intended.
factoextra::fviz_cluster(
  object = km,
  data = country_clean,
  labelsize = 0
) +
  theme_minimal()