#install.packages(ggplot2)
library(ggplot2)
#install.packages("dplyr")
library(dplyr)
#install.packages("Hmisc")
library(Hmisc)
#install.packages("factoextra")
library(factoextra)
#install.packages("cluster")
library(cluster)
#install.packages("magrittr")
library(magrittr)
#install.packages("NbClust")
library(NbClust)
#install.packages("tidyr")
library(tidyr)
data <- read.table("./world_data_2023.csv", header=TRUE, sep=",", dec=".")
head(data)
## Country Population Urban Birth Fertility Physicians
## 1 Afghanistan 38.041754 9.797273 32.49 4.47 0.28
## 2 Albania 2.854191 1.747593 11.78 1.62 1.20
## 3 Algeria 43.053054 31.510100 24.28 3.02 1.72
## 4 Angola 31.825295 21.061025 40.73 5.52 0.21
## 5 Antigua and Barbuda 0.097118 0.023800 15.33 1.99 2.76
## 6 Argentina 44.938712 41.339571 17.02 2.26 3.96
## Maternal_mort Infant_mort Expectancy Density Land Co2 GDP
## 1 638 47.9 64.5 60 652.230 8.672 19101.354
## 2 15 7.8 78.5 105 28.748 4.536 15278.077
## 3 112 20.1 76.7 18 2381.741 150.006 169988.236
## 4 241 51.6 60.8 26 1246.700 34.693 94635.416
## 5 42 5.0 76.9 223 0.443 0.557 1727.759
## 6 39 8.8 76.5 17 2780.400 201.348 449663.447
## CPI Armed
## 1 149.90 323
## 2 119.05 9
## 3 151.36 317
## 4 261.73 117
## 5 113.81 0
## 6 232.75 105
This data set includes a list of 162 observations (country) and 15 variables.
This data set was retrieved from kaggle.com from the user Nidula Elgiriyewithana ⚡.
The objective of the following analysis is to create different groups from the data set based on several clustering variables.
summary(data[,-1])
## Population Urban Birth Fertility
## Min. : 0.0971 Min. : 0.0238 Min. : 6.40 Min. :0.980
## 1st Qu.: 3.5551 1st Qu.: 2.0331 1st Qu.:11.07 1st Qu.:1.695
## Median : 10.4776 Median : 5.6373 Median :17.91 Median :2.230
## Mean : 46.3877 Mean : 25.7659 Mean :20.17 Mean :2.666
## 3rd Qu.: 33.8290 3rd Qu.: 19.3212 3rd Qu.:28.54 3rd Qu.:3.505
## Max. :1397.7150 Max. :842.9340 Max. :46.08 Max. :6.910
## Physicians Maternal_mort Infant_mort Expectancy
## Min. :0.010 Min. : 2.0 Min. : 1.400 Min. :52.80
## 1st Qu.:0.370 1st Qu.: 12.0 1st Qu.: 5.825 1st Qu.:66.75
## Median :1.575 Median : 49.0 Median :13.350 Median :74.20
## Mean :1.835 Mean : 156.5 Mean :21.082 Mean :72.48
## 3rd Qu.:2.965 3rd Qu.: 192.8 3rd Qu.:33.525 3rd Qu.:77.80
## Max. :7.120 Max. :1140.0 Max. :84.500 Max. :84.20
## Density Land Co2 GDP
## Min. : 2.0 Min. : 0.298 Min. : 0.293 Min. : 1340
## 1st Qu.: 32.0 1st Qu.: 43.627 1st Qu.: 5.145 1st Qu.: 14233
## Median : 83.0 Median : 190.951 Median : 17.837 Median : 52722
## Mean : 211.8 Mean : 808.129 Mean : 204.583 Mean : 566423
## 3rd Qu.: 152.8 3rd Qu.: 638.597 3rd Qu.: 94.122 3rd Qu.: 303024
## Max. :8358.0 Max. :17098.240 Max. :9893.038 Max. :21427700
## CPI Armed
## Min. : 99.03 Min. : 0.00
## 1st Qu.: 114.66 1st Qu.: 10.25
## Median : 128.93 Median : 28.50
## Mean : 170.15 Mean : 154.43
## 3rd Qu.: 158.69 3rd Qu.: 135.50
## Max. :2740.27 Max. :3031.00
I called for the summary of the data set to check that all of the data is included in the analysis.
data_clu_std <- as.data.frame(scale(data[c(2, 5, 6, 9, 12, 13)]))
head(data_clu_std)
## Population Fertility Physicians Expectancy Co2 GDP
## 1 -0.052864555 1.4110966 -0.99246398 -1.0467676 -0.217427448 -0.2317626
## 2 -0.275747565 -0.8177892 -0.40509618 0.7890420 -0.222017695 -0.2333816
## 3 -0.021122270 0.2771021 -0.07310569 0.5530093 -0.060571060 -0.1678698
## 4 -0.092240487 2.2322651 -1.03715500 -1.5319458 -0.188548621 -0.1997778
## 5 -0.293211256 -0.5284250 0.59087529 0.5792352 -0.226433700 -0.2391195
## 6 -0.009178245 -0.3172674 1.35700719 0.5267834 -0.003590286 -0.0494417
I decided to use Population, Fertility, Physicians, Expectancy, Co2 and GDP as clustering variables. The general hypothesis of this analysis is that the clustering variables means that were selected will have a statistically significant difference within the formed groups.
data$Dissimilarity = sqrt(data_clu_std$Population^2 + data_clu_std$Fertility^2 + data_clu_std$Expectancy^2 + data_clu_std$Physicians^2 + data_clu_std$Co2^2 + data_clu_std$GDP^2)
head(data[order(-data$Dissimilarity), c("Country", "Dissimilarity")], 15)
## Country Dissimilarity
## 35 China 16.028600
## 156 United States 10.536766
## 69 India 8.792635
## 111 Niger 3.787735
## 33 Chad 3.626080
## 112 Nigeria 3.472245
## 57 Georgia 3.436074
## 95 Mali 3.287616
## 32 Central African Republic 3.271732
## 42 Democratic Republic of the Congo 3.228195
## 89 Lithuania 3.051820
## 131 Sierra Leone 2.955368
## 4 Angola 2.913679
## 77 Japan 2.902531
## 26 Burundi 2.859809
data <- data %>%
filter(!Country %in% c("China","United States", "India"))
data_clu_std <- as.data.frame(scale(data[c(2, 5, 6, 9, 12, 13)]))
I checked for dissimilarity to find possible outliers, given the differences between China, United States and India with the rest of the countries in the data set so I decided to remove them.
get_clust_tendency(data_clu_std,
n = nrow(data_clu_std) - 1,
graph = FALSE)
## $hopkins_stat
## [1] 0.8444727
##
## $plot
## NULL
The Hopkins statistic determined that the data selected is good for clustering.
rownames(data_clu_std) <- data$Country
I changed the row names in the data_cloud_std data set with the names of the countries to be able to identify them easily.
Distance <- get_dist(data_clu_std,
method = "euclidian")
fviz_dist(Distance,
gradient = list(low = "darkred",
mid = "grey95",
high = "white"))
fviz_nbclust(data_clu_std, kmeans, method = "wss") +
labs(subtitle = "Elbow method")
fviz_nbclust(data_clu_std, kmeans, method = "silhouette")+
labs(subtitle = "Silhouette analysis")
NbClust(data_clu_std,
distance = "euclidean",
min.nc = 2, max.nc = 10,
method = "kmeans",
index = "all")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 12 proposed 3 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 2 proposed 8 as the best number of clusters
## * 3 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 2 0.3287 62.0679 110.0306 -4.0222 196.1666 433678310407 32395.4721 679.4058
## 3 4.8993 107.1122 34.6002 1.2816 425.7343 230306597437 5426.3258 399.4550
## 4 0.7970 98.1464 38.7235 1.9865 625.1605 116807010021 3769.1155 326.9408
## 5 13.7665 101.0246 7.7665 4.9166 770.2562 73277842958 2036.0695 261.5884
## 6 0.1428 85.8875 22.1843 3.6325 825.8586 74381024157 1816.9657 249.0294
## 7 0.5686 85.0882 35.8806 5.0839 948.9931 46668577264 1365.1841 217.4938
## 8 80.9224 94.6480 5.6934 8.6608 1085.0939 25897697812 913.4189 175.9578
## 9 0.0388 86.0774 14.9123 7.8927 1136.3531 23744138997 887.9400 169.5644
## 10 1.1115 85.2050 13.8145 8.7254 1230.1800 16247681907 753.9913 154.2314
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky
## 2 5.2705 1.3953 0.2593 1.1851 0.4939 1.6788 -41.2425 -1.4584 0.3317
## 3 13.5761 2.3732 0.2208 0.9669 0.4362 0.8927 11.1774 0.4538 0.4361
## 4 18.3651 2.8996 0.1906 1.1754 0.3513 0.8197 10.7797 0.7694 0.4007
## 5 21.0886 3.6240 0.1724 1.0724 0.3628 0.7614 12.8517 1.1758 0.3801
## 6 23.0606 3.8068 0.1697 1.2219 0.2939 3.8744 -10.3865 -2.6864 0.3500
## 7 26.7715 4.3587 0.1580 1.1128 0.2801 0.5355 6.9401 2.8608 0.3315
## 8 30.8223 5.3877 0.1792 1.0468 0.2963 1.1285 -5.2376 -0.3285 0.3190
## 9 33.5413 5.5908 0.1768 1.1231 0.2753 0.7848 13.4332 1.0308 0.3020
## 10 37.4240 6.1466 0.1578 1.1926 0.2514 1.0880 -0.2426 -0.1556 0.2893
## Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 339.7029 0.5933 1.7908 0.1118 0.1100 0.0029 3.9274 1.8495 1.4949
## 3 133.1517 0.6043 1.9199 0.5268 0.0359 0.0025 3.1965 1.3464 1.1521
## 4 81.7352 0.4987 0.0783 1.0658 0.0610 0.0026 3.1038 1.1653 0.8721
## 5 52.3177 0.5236 4.8872 1.0906 0.0645 0.0027 3.1257 1.0599 0.8425
## 6 41.5049 0.4805 0.4765 1.3435 0.0324 0.0028 3.5709 1.0151 0.7668
## 7 31.0705 0.4714 0.0560 1.4622 0.0374 0.0029 2.9558 0.9653 0.6504
## 8 21.9947 0.4785 2.6210 1.4516 0.0450 0.0031 3.1078 0.9085 0.5893
## 9 18.8405 0.4589 0.8286 1.6018 0.0450 0.0032 3.6913 0.8891 0.6120
## 10 15.4231 0.4128 0.1077 2.0764 0.0450 0.0034 3.6293 0.8353 0.5689
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.4772 111.7593 1.0000
## 3 0.6622 47.4378 0.8421
## 4 0.3979 74.1480 0.5969
## 5 0.6256 24.5414 0.3199
## 6 0.4889 14.6349 1.0000
## 7 0.2864 19.9309 0.0221
## 8 0.1255 320.5498 1.0000
## 9 0.6378 27.8311 0.4056
## 10 -0.0981 -33.5791 1.0000
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 8.0000 3.0000 3.0000 10.0000 3.0000 3 3.00
## Value_Index 80.9224 107.1122 75.4303 8.7254 229.5677 89872125555 26969.15
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 3.0000 3.0000 8.0000 10.0000 3.0000 2.0000 2.0000
## Value_Index 207.4364 8.3056 -0.8258 0.1578 0.9669 0.4939 1.6788
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 2.0000 2.0000 3.0000 3.0000 3.0000 3.0000 2.0000
## Value_Index -41.2425 -1.4584 0.4361 206.5512 0.6043 1.9199 0.1118
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 2.00 0 7.0000 0 10.0000
## Value_Index 0.11 0 2.9558 0 0.5689
##
## $Best.partition
## Afghanistan Albania
## 2 3
## Algeria Angola
## 3 2
## Antigua and Barbuda Argentina
## 3 3
## Armenia Australia
## 3 3
## Austria Azerbaijan
## 3 3
## The Bahamas Bahrain
## 3 3
## Bangladesh Barbados
## 2 3
## Belgium Belize
## 3 3
## Benin Bhutan
## 2 3
## Bolivia Bosnia and Herzegovina
## 3 3
## Botswana Brazil
## 2 1
## Brunei Bulgaria
## 3 3
## Burkina Faso Burundi
## 2 2
## Ivory Coast Cape Verde
## 2 3
## Cambodia Cameroon
## 2 2
## Canada Central African Republic
## 1 2
## Chad Chile
## 2 3
## Colombia Republic of the Congo
## 3 2
## Costa Rica Croatia
## 3 3
## Cyprus Czech Republic
## 3 3
## Democratic Republic of the Congo Denmark
## 2 3
## Djibouti Dominican Republic
## 2 3
## Ecuador Egypt
## 3 2
## El Salvador Equatorial Guinea
## 3 2
## Estonia Ethiopia
## 3 2
## Fiji Finland
## 2 3
## France Gabon
## 1 2
## The Gambia Georgia
## 2 3
## Germany Ghana
## 1 2
## Greece Guatemala
## 3 3
## Guinea Guinea-Bissau
## 2 2
## Guyana Haiti
## 3 2
## Honduras Hungary
## 3 3
## Iceland Indonesia
## 3 1
## Iran Iraq
## 1 2
## Republic of Ireland Israel
## 3 3
## Italy Jamaica
## 1 3
## Japan Jordan
## 1 3
## Kazakhstan Kenya
## 3 2
## Kuwait Kyrgyzstan
## 3 3
## Laos Latvia
## 2 3
## Lebanon Lesotho
## 3 2
## Liberia Libya
## 2 3
## Lithuania Luxembourg
## 3 3
## Madagascar Malawi
## 2 2
## Malaysia Maldives
## 3 3
## Mali Malta
## 2 3
## Mauritania Mauritius
## 2 3
## Mexico Moldova
## 1 3
## Mongolia Montenegro
## 3 3
## Morocco Mozambique
## 3 2
## Myanmar Namibia
## 2 2
## Nepal Netherlands
## 3 3
## New Zealand Nicaragua
## 3 3
## Niger Nigeria
## 2 2
## Norway Oman
## 3 3
## Pakistan Panama
## 2 3
## Papua New Guinea Paraguay
## 2 3
## Peru Philippines
## 3 2
## Poland Portugal
## 3 3
## Qatar Romania
## 3 3
## Russia Rwanda
## 1 2
## Saudi Arabia Senegal
## 3 2
## Serbia Seychelles
## 3 3
## Sierra Leone Singapore
## 2 3
## Slovakia Slovenia
## 3 3
## South Africa South Korea
## 2 1
## Spain Sri Lanka
## 3 3
## Sudan Suriname
## 2 3
## Sweden Switzerland
## 3 3
## Syria Tajikistan
## 3 2
## Tanzania Thailand
## 2 3
## East Timor Togo
## 2 2
## Trinidad and Tobago Tunisia
## 3 3
## Turkey Uganda
## 3 2
## Ukraine United Arab Emirates
## 3 3
## United Kingdom Uruguay
## 1 3
## Venezuela Vietnam
## 3 3
## Yemen Zambia
## 2 2
## Zimbabwe
## 2
In the last 4 chunks I called for different models to determine the right amount of clusters for the analysis. In the correlation matrix we can observe that there might be different amounts of clusters that can be created as it might inicate 4 or 5 clusters. From the elbow method I interpreted 3 as the right amount of clusters. The silhouette analysis suggests 4 as the right amount of clusters. Finally I used the function NbClust to get which was the most precise amount of clusters, the model determined to 3 to be the best number of clusters to be used.
Clustering <- kmeans(data_clu_std,
centers = 3, #Number of groups
nstart = 25) #Number of different positions of initial leaders
Clustering
## K-means clustering with 3 clusters of sizes 16, 88, 55
##
## Cluster means:
## Population Fertility Physicians Expectancy Co2 GDP
## 1 1.5152568 -0.7441583 0.5898642 0.8728482 2.3157505 2.3999453
## 2 -0.3665258 -0.5732201 0.4988016 0.5615269 -0.2231156 -0.2177999
## 3 0.1456394 1.1336345 -0.9696795 -1.1523625 -0.3166878 -0.3496860
##
## Clustering vector:
## Afghanistan Albania
## 3 2
## Algeria Angola
## 2 3
## Antigua and Barbuda Argentina
## 2 2
## Armenia Australia
## 2 1
## Austria Azerbaijan
## 2 2
## The Bahamas Bahrain
## 2 2
## Bangladesh Barbados
## 3 2
## Belgium Belize
## 2 2
## Benin Bhutan
## 3 2
## Bolivia Bosnia and Herzegovina
## 2 2
## Botswana Brazil
## 3 1
## Brunei Bulgaria
## 2 2
## Burkina Faso Burundi
## 3 3
## Ivory Coast Cape Verde
## 3 2
## Cambodia Cameroon
## 3 3
## Canada Central African Republic
## 1 3
## Chad Chile
## 3 2
## Colombia Republic of the Congo
## 2 3
## Costa Rica Croatia
## 2 2
## Cyprus Czech Republic
## 2 2
## Democratic Republic of the Congo Denmark
## 3 2
## Djibouti Dominican Republic
## 3 2
## Ecuador Egypt
## 2 3
## El Salvador Equatorial Guinea
## 2 3
## Estonia Ethiopia
## 2 3
## Fiji Finland
## 3 2
## France Gabon
## 1 3
## The Gambia Georgia
## 3 2
## Germany Ghana
## 1 3
## Greece Guatemala
## 2 2
## Guinea Guinea-Bissau
## 3 3
## Guyana Haiti
## 2 3
## Honduras Hungary
## 2 2
## Iceland Indonesia
## 2 1
## Iran Iraq
## 1 3
## Republic of Ireland Israel
## 2 2
## Italy Jamaica
## 1 2
## Japan Jordan
## 1 2
## Kazakhstan Kenya
## 2 3
## Kuwait Kyrgyzstan
## 2 2
## Laos Latvia
## 3 2
## Lebanon Lesotho
## 2 3
## Liberia Libya
## 3 2
## Lithuania Luxembourg
## 2 2
## Madagascar Malawi
## 3 3
## Malaysia Maldives
## 2 2
## Mali Malta
## 3 2
## Mauritania Mauritius
## 3 2
## Mexico Moldova
## 1 2
## Mongolia Montenegro
## 2 2
## Morocco Mozambique
## 2 3
## Myanmar Namibia
## 3 3
## Nepal Netherlands
## 2 2
## New Zealand Nicaragua
## 2 2
## Niger Nigeria
## 3 3
## Norway Oman
## 2 2
## Pakistan Panama
## 3 2
## Papua New Guinea Paraguay
## 3 2
## Peru Philippines
## 2 3
## Poland Portugal
## 2 2
## Qatar Romania
## 2 2
## Russia Rwanda
## 1 3
## Saudi Arabia Senegal
## 1 3
## Serbia Seychelles
## 2 2
## Sierra Leone Singapore
## 3 2
## Slovakia Slovenia
## 2 2
## South Africa South Korea
## 3 1
## Spain Sri Lanka
## 1 2
## Sudan Suriname
## 3 2
## Sweden Switzerland
## 2 2
## Syria Tajikistan
## 2 3
## Tanzania Thailand
## 3 2
## East Timor Togo
## 3 3
## Trinidad and Tobago Tunisia
## 2 2
## Turkey Uganda
## 1 3
## Ukraine United Arab Emirates
## 2 2
## United Kingdom Uruguay
## 1 2
## Venezuela Vietnam
## 2 2
## Yemen Zambia
## 3 3
## Zimbabwe
## 3
##
## Within cluster sum of squares by cluster:
## [1] 139.5111 129.4391 129.9611
## (between_SS / total_SS = 57.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
I created the clusters with kmeans indicating 3 clusters with 25 iterations of the centers, we can now observe how many countries are assigned to each cluster and also which country was assigned to which cluster. We can observe that the 57.9% of the total variance is explained by the variance between the clusters, this indicated that our clustering proposal is successful.
fviz_cluster(Clustering,
palette = "Set1",
repel = TRUE,
ggtheme = theme_bw(),
labelsize = 5,
data = data_clu_std)
The visual representation of the clusters does show that we need to remove some countries.
data <- data %>%
filter(!Country %in% c("Japan", "Russia", "Pakistan", "Nigeria", "Thailand", "Vietnam","Tajikistan", "Guatemala"))
data_clu_std <- as.data.frame(scale(data[c(2, 5, 6, 9, 12, 13)]))
head(data_clu_std)
## Population Fertility Physicians Expectancy Co2 GDP
## 1 0.3885440 1.3949665 -0.9887986 -1.0414581 -0.4824655 -0.4280356
## 2 -0.5489911 -0.8227056 -0.4116393 0.7897341 -0.5102674 -0.4348052
## 3 0.5220647 0.2666772 -0.0854189 0.5542951 0.4675710 -0.1608702
## 4 0.2229131 2.2120037 -1.0327129 -1.5254160 -0.3075543 -0.2942925
## 5 -0.6224503 -0.5347973 0.5670220 0.5804550 -0.5370139 -0.4587979
## 6 0.5723061 -0.3247020 1.3198384 0.5281352 0.8126880 0.3343321
rownames(data_clu_std) <- data$Country
I decided to remove Japan, Russia, Pakistan, Nigeria, Vietnam, Thailand, Tajikistan and Guatemala, the first 4 were deleted because of their distance from the center of their clusters and the last 2 because of the proximity to other clusters.I deleted these countries in the original “data” data set and I assigned the data again do “data_clu_std” to have the same amount of observations on both data frames.
Clustering <- kmeans(data_clu_std,
centers = 3, #Number of groups
nstart = 25) #Number of different positions of initial leaders
Clustering
## K-means clustering with 3 clusters of sizes 50, 85, 16
##
## Cluster means:
## Population Fertility Physicians Expectancy Co2 GDP
## 1 0.03138653 1.1949870 -1.0112576 -1.1871686 -0.4229671 -0.3895997
## 2 -0.36288556 -0.5766333 0.5238457 0.5659901 -0.2177326 -0.2006129
## 3 1.82974665 -0.6709701 0.3772495 0.7030795 2.4784764 2.2832548
##
## Clustering vector:
## Afghanistan Albania
## 1 2
## Algeria Angola
## 2 1
## Antigua and Barbuda Argentina
## 2 2
## Armenia Australia
## 2 3
## Austria Azerbaijan
## 2 2
## The Bahamas Bahrain
## 2 2
## Bangladesh Barbados
## 3 2
## Belgium Belize
## 2 2
## Benin Bhutan
## 1 2
## Bolivia Bosnia and Herzegovina
## 2 2
## Botswana Brazil
## 1 3
## Brunei Bulgaria
## 2 2
## Burkina Faso Burundi
## 1 1
## Ivory Coast Cape Verde
## 1 2
## Cambodia Cameroon
## 1 1
## Canada Central African Republic
## 3 1
## Chad Chile
## 1 2
## Colombia Republic of the Congo
## 2 1
## Costa Rica Croatia
## 2 2
## Cyprus Czech Republic
## 2 2
## Democratic Republic of the Congo Denmark
## 1 2
## Djibouti Dominican Republic
## 1 2
## Ecuador Egypt
## 2 1
## El Salvador Equatorial Guinea
## 2 1
## Estonia Ethiopia
## 2 1
## Fiji Finland
## 1 2
## France Gabon
## 3 1
## The Gambia Georgia
## 1 2
## Germany Ghana
## 3 1
## Greece Guinea
## 2 1
## Guinea-Bissau Guyana
## 1 2
## Haiti Honduras
## 1 2
## Hungary Iceland
## 2 2
## Indonesia Iran
## 3 3
## Iraq Republic of Ireland
## 1 2
## Israel Italy
## 2 3
## Jamaica Jordan
## 2 2
## Kazakhstan Kenya
## 2 1
## Kuwait Kyrgyzstan
## 2 2
## Laos Latvia
## 1 2
## Lebanon Lesotho
## 2 1
## Liberia Libya
## 1 2
## Lithuania Luxembourg
## 2 2
## Madagascar Malawi
## 1 1
## Malaysia Maldives
## 2 2
## Mali Malta
## 1 2
## Mauritania Mauritius
## 1 2
## Mexico Moldova
## 3 2
## Mongolia Montenegro
## 2 2
## Morocco Mozambique
## 2 1
## Myanmar Namibia
## 1 1
## Nepal Netherlands
## 2 2
## New Zealand Nicaragua
## 2 2
## Niger Norway
## 1 2
## Oman Panama
## 2 2
## Papua New Guinea Paraguay
## 1 2
## Peru Philippines
## 2 1
## Poland Portugal
## 2 2
## Qatar Romania
## 2 2
## Rwanda Saudi Arabia
## 1 3
## Senegal Serbia
## 1 2
## Seychelles Sierra Leone
## 2 1
## Singapore Slovakia
## 2 2
## Slovenia South Africa
## 2 3
## South Korea Spain
## 3 3
## Sri Lanka Sudan
## 2 1
## Suriname Sweden
## 2 2
## Switzerland Syria
## 2 2
## Tanzania East Timor
## 1 1
## Togo Trinidad and Tobago
## 1 2
## Tunisia Turkey
## 2 3
## Uganda Ukraine
## 1 2
## United Arab Emirates United Kingdom
## 2 3
## Uruguay Venezuela
## 2 2
## Yemen Zambia
## 1 1
## Zimbabwe
## 1
##
## Within cluster sum of squares by cluster:
## [1] 79.53855 128.11649 132.64510
## (between_SS / total_SS = 62.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
fviz_cluster(Clustering,
palette = "Set1",
repel = TRUE,
ggtheme = theme_bw(),
labelsize = 5,
data = data_clu_std)
I did the same steps as before the rebuild the clusters, we can observe that now we have a better clustering, I will not remove any more countries.
Averages <- Clustering$centers
Averages #Average values of cluster variables to describe groups
## Population Fertility Physicians Expectancy Co2 GDP
## 1 0.03138653 1.1949870 -1.0112576 -1.1871686 -0.4229671 -0.3895997
## 2 -0.36288556 -0.5766333 0.5238457 0.5659901 -0.2177326 -0.2006129
## 3 1.82974665 -0.6709701 0.3772495 0.7030795 2.4784764 2.2832548
This chunk calls for the averages values of the cluster variables in each of the groups, this information is important to be able to give a description to these groups.
Figure <- as.data.frame(Averages)
Figure$id <- 1:nrow(Figure)
Figure <- pivot_longer(Figure, cols = c("Population", "Fertility", "Physicians","Expectancy", "Co2", "GDP"))
Figure$Group <- factor(Figure$id,
levels = c(1, 2, 3),
labels = c("1", "2", "3"))
Figure$ImeF <- factor(Figure$name,
levels = c("Population", "Fertility", "Physicians","Expectancy", "Co2", "GDP"),
labels = c("Population", "Fertility", "Physicians","Expectancy", "Co2", "GDP"))
library(ggplot2)
ggplot(Figure, aes(x = ImeF, y = value)) +
geom_hline(yintercept = 0) +
theme_bw() +
geom_point(aes(shape = Group, col = Group), size = 3) +
geom_line(aes(group = id), linewidth = 1) +
ylab("Averages") +
xlab("Cluster variables") +
scale_color_brewer(palette="Set1") +
ylim(-1.5, 2.5) +
theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))
This graph is a visual representation of the averages of the groups that were formed. Group 1 has countries that in average have a higher fertility rate, clearly less than average number of physicians relative to the population and life expectancy, also the the CO2 annual emissions and the GDP are slightly below average. This would mean that this is the group of countries that are less developed from the 151 that remained in the analysis. Group 2 is the biggest group, all of the measured variables are slightly deviated from the average, population, fertility rate, CO2 and GDP are slightly below average, while physicians relative to the population and life expectancy are above average. This would be the group of countries that are more developed than group 1. Group 3 is the smallest group, it shows to have the biggest populations but the lowest average fertility rates, physicians relative to the population and life expectancy are above average and CO2 emissions and GDP are highly above average. This is the group of countries that are as developed as group 2 but with the biggest economies and industry from the data set.
data$Group <- Clustering$cluster
I created a new column that includes the Group to which each observation belongs to.
fit <- aov(cbind(Population, Fertility, Physicians, Expectancy, Co2, GDP) ~ as.factor(Group),
data = data)
summary(fit)
## Response Population :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 91295 45647 56.297 < 2.2e-16 ***
## Residuals 148 120003 811
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Fertility :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 176.496 88.248 183.34 < 2.2e-16 ***
## Residuals 148 71.239 0.481
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Physicians :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 194.97 97.486 77.504 < 2.2e-16 ***
## Residuals 148 186.16 1.258
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Expectancy :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 6172.8 3086.39 176.04 < 2.2e-16 ***
## Residuals 148 2594.8 17.53
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Co2 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 2462366 1231183 212.53 < 2.2e-16 ***
## Residuals 148 857373 5793
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response GDP :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 3.0117e+13 1.5059e+13 125.72 < 2.2e-16 ***
## Residuals 148 1.7727e+13 1.1978e+11
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This ANOVA test shows us if there is significant difference between at least two groups that were just formed in each of the clustering variables that were used, in this case all of the p-values are approximate to 0, which means that we have the desired difference between groups within all of the variables, for this reason the selected variables are appropiate.
library(rstatix)
data %>%
group_by(Group) %>%
shapiro_test(Urban)
## # A tibble: 3 × 4
## Group variable statistic p
## <int> <chr> <dbl> <dbl>
## 1 1 Urban 0.746 6.17e- 8
## 2 2 Urban 0.710 1.14e-11
## 3 3 Urban 0.772 1.18e- 3
I chose the variable Urban to do the validation of the cluster, first I tested for normal distribution within groups for the variable Urban, as normality is violated in all three groups I will perform a non-parametric test to determine if the means of the groups for this variables are statistically different from each other.
kruskal.test(Urban ~ Group,
data = data)
##
## Kruskal-Wallis rank sum test
##
## data: Urban by Group
## Kruskal-Wallis chi-squared = 41.397, df = 2, p-value = 1.025e-09
kruskal_effsize(Urban ~ Group,
data = data)
## # A tibble: 1 × 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 Urban 151 0.266 eta2[H] large
Since the p-value from the Kruskal-Wallis rank sum test is below 0.05 we reject the null hypothesis, which means that the differnces between the means or Urban for groups 1, 2 and 3 are statistically significant, this tells us that the clustering successfuly divided the countries in 3 groups. The effect size for this test is large.
Conclusion:
The countries were clustered based on six key socioeconomic and demographic variables: Population, Fertility, Physicians, Expectancy, CO2, and GDP. The optimal number of clusters was determined to be three through multiple methods.
The resulting groups are characterized as:
Group 1: countries with high fertility rate, clearly less than average number of physicians relative to the population and life expectancy, CO2 annual emissions and GDP slightly below average. This would mean that this is the group of countries that are less developed. Group 2: biggest group, all of the measured variables are slightly deviated from the average. Population, fertility rate, CO2 and GDP are slightly below average. Physicians relative to the population and life expectancy are above average. This would be the group of countries within the average but are more developed than group 1. Group 3: smallest group, it shows to have the biggest populations but the lowest average fertility rates, physicians relative to the population and life expectancy are above average and CO2 emissions and GDP are highly above average. This is the group of countries that are as developed as group 2 but with the biggest economies and industry from the data set.
ANOVA results confirmed that there are statistically significant differences across clusters for all six clustering variables.
Validation with the Urban Population variable confirmed significant differences between clusters, supported by a Kruskal-Wallis test and a large effect size.
These findings indicate that the clustering solution effectively separates countries based on socioeconomic and demographic indicators. The rejection of both the main and secondary null hypotheses validates the robustness of the clusters and confirms that these groups capture meaningful distinctions between countries.