Column Name Description

country - Name of the country

child_mort - Death of children under 5 years of age per 1000 live births

exports - Exports of goods and services per capita. Given as %age of the GDP per capita

health - Total health spending per capita. Given as %age of GDP per capita

imports - Imports of goods and services per capita. Given as %age of the GDP per capita

Income - Net income per person

Inflation - The measurement of the annual growth rate of the Total GDP

life_expec - The average number of years a new born child would live if the current mortality patterns are to remain the same

total_fer - The number of children that would be born to each woman if the current age-fertility rates remain the same.

gdpp - The GDP per capita. Calculated as the Total GDP divided by the total population.

getwd()

library(factoextra)

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(flexclust)

## Loading required package: grid

## Loading required package: lattice

## Loading required package: modeltools

## Loading required package: stats4

library(fpc)
library(clustertend)
library(cluster)
library(ClusterR)

## Loading required package: gtools

library(gridExtra)
library(corrplot)

## corrplot 0.84 loaded

library(factoextra)

ctry <- read.csv(file = "C:\\Users\\cynar\\Desktop\\school\\Semester 1\\unsupervised learning\\Country-data.csv", head = T, sep = ",")  
View(ctry)

Converting Variables to Numeric

ctry$child_mort <- as.numeric(ctry$child_mort)
ctry$exports <- as.numeric(ctry$exports)
ctry$health <- as.numeric(ctry$health)
ctry$imports <- as.numeric(ctry$imports)
ctry$inflation <- as.numeric(ctry$inflation)
ctry$life_expec <- as.numeric(ctry$life_expec)
ctry$total_fer <- as.numeric(ctry$total_fer)

SUMMARY

summary(ctry)

##    country            child_mort        exports            health      
##  Length:167         Min.   :  2.60   Min.   :  0.109   Min.   : 1.810  
##  Class :character   1st Qu.:  8.25   1st Qu.: 23.800   1st Qu.: 4.920  
##  Mode  :character   Median : 19.30   Median : 35.000   Median : 6.320  
##                     Mean   : 38.27   Mean   : 41.109   Mean   : 6.816  
##                     3rd Qu.: 62.10   3rd Qu.: 51.350   3rd Qu.: 8.600  
##                     Max.   :208.00   Max.   :200.000   Max.   :17.900  
##     imports             income         inflation         life_expec   
##  Min.   :  0.0659   Min.   :   609   Min.   : -4.210   Min.   :32.10  
##  1st Qu.: 30.2000   1st Qu.:  3355   1st Qu.:  1.810   1st Qu.:65.30  
##  Median : 43.3000   Median :  9960   Median :  5.390   Median :73.10  
##  Mean   : 46.8902   Mean   : 17145   Mean   :  7.782   Mean   :70.56  
##  3rd Qu.: 58.7500   3rd Qu.: 22800   3rd Qu.: 10.750   3rd Qu.:76.80  
##  Max.   :174.0000   Max.   :125000   Max.   :104.000   Max.   :82.80  
##    total_fer          gdpp       
##  Min.   :1.150   Min.   :   231  
##  1st Qu.:1.795   1st Qu.:  1330  
##  Median :2.410   Median :  4660  
##  Mean   :2.948   Mean   : 12964  
##  3rd Qu.:3.880   3rd Qu.: 14050  
##  Max.   :7.490   Max.   :105000

Charting

child_mort

hist(ctry$child_mort, main="Child Mortality", 
     col="red", 
     ylim = c(0,100), xlim = c(0, 250),
     ylab="No. of Deaths",
     xlab = "per 1000 live births")

min(ctry$child_mort)

## [1] 2.6

exports

hist(ctry$exports,main="Exports", 
     col="red", 
     ylim = c(0,80), xlim = c(0, 200),
     ylab="No. of Exports",
     xlab = "%age per GDP capita")

health

hist(ctry$health,main="Total Health Spending", 
     col="red", 
     ylim = c(0,60), xlim = c(0, 20),
     ylab="No. of People",
     xlab = "%age of GDP per capita")

imports

hist(ctry$imports,main="Imports of goods and services", 
     col="red", 
     ylim = c(0,80), xlim = c(0, 200),
     ylab="No. of Imports",
     xlab = "%age of the GDP per capita")

income

hist(ctry$income,main="Net Income", 
     col="red", 
     ylim = c(0,100), xlim = c(600, 140000),
     ylab="No. of Customers",
     xlab = "Net income")

inflation

hist(ctry$inflation,main="Inflation", 
     col="red", 
     ylim = c(0,120), xlim = c(-10, 120),
     ylab="Number of Countries",
     xlab = "Annual Growth Rate")

life_expec

hist(ctry$life_expec,main="Life Expactancy", 
     col="red", 
     ylim = c(0,50), xlim = c(20, 90),
     ylab="No. of Births",
     xlab = "Expenctancy")

total_fer

hist(ctry$total_fer,main="Spending Score", 
     col="red", 
     ylim = c(0,60), xlim = c(0, 8),
     ylab="No. of Children",
     xlab = "Fertility Rate")

gdpp

hist(ctry$gdpp,main="GDP per capita", 
     col="red", 
     ylim = c(0,125), xlim = c(0, 125000),
     ylab="No. of Countries",
     xlab = "GDP")

Analysis

ctry_sub <- ctry[,2:10]

Corr Plot

cor_matrix <- cor(ctry_sub, method = "pearson", use = "everything")
cormat <- cor(ctry_sub)
corrplot(cormat, method="number", type = "lower")

From the correlation plot, it is evident that child mortality correlates with a number variables with some intermediate negative correlation to income, a strong negetive correlation to life expectancy, a high positive correlation with total fertility and intermediate negative correlation with gdp per capita. On the other hand gdp per capita also has intermediate correlation with exports, and considerable correlation with health, strong positive correlation with income, intermediate correlation with life expanctancy and intermediate negative correlation with total fertility.

Elbow Method

k_max <- 12

wss <- sapply(1:k_max, function(k){kmeans(ctry_sub, k, 
                                          nstart=50,iter.max = 1000 )$tot.withinss})

wss

##  [1] 117459687469  36528387934  21710210040  14753425389  10655378084
##  [6]   7655725089   6862317766   6195039586   3736749409   5456535669
## [11]   3384466669   3332200483

plot(1:k_max, wss,
     type="b", pch = 19, frame = FALSE, 
     xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares")

By using the elbow method, we observe that the optimal number of clusters is between 2 and 4, however we would further on analyse using another method to verify the optimal number of clusters. We therefore analyse the Silhouette Score Plot.

Silhouette Score Plot

sspK <- fviz_nbclust(ctry_sub, FUNcluster = kmeans, method = "silhouette") + 
  ggtitle("K-means")
sspP <- fviz_nbclust(ctry_sub, FUNcluster = cluster::pam, method = "silhouette") + 
  ggtitle("PAM")

grid.arrange(sspK, sspP, ncol=2)

The Silhouette clarifies that the optimal number of clusters is 2. Now we then compare K-Means and the Partitioning Around Medoids. However, given that the dataset is small, we expect K-Means to be a better fit than PAM

K-Means

km <- eclust(ctry_sub, k=2 , FUNcluster="kmeans", hc_metric="euclidean", graph=F)

km_clus <- fviz_cluster(km, data=mall_ana, elipse.type="convex", geom=c("point")) + ggtitle("K-means")
km_sil <- fviz_silhouette(km)

##   cluster size ave.sil.width
## 1       1   32          0.45
## 2       2  135          0.79

grid.arrange(km_clus, km_sil, ncol=2)

PAM

pam <- eclust(ctry_sub, k=2 , FUNcluster="pam", hc_metric="euclidean", graph=F)

pam_clus <- fviz_cluster(pam, data=mall_ana, elipse.type="convex", geom=c("point")) + ggtitle("PAM")
pam_sil <- fviz_silhouette(pam)

##   cluster size ave.sil.width
## 1       1  128          0.82
## 2       2   39          0.37

grid.arrange(pam_clus, pam_sil, ncol=2)

Right from the onset of analysis we can see that there are fewer observations which are interacting and on observation of the Silhouette width we see that there a very small portion which falls below zero while on PAM the Silhouette width is 0.71 and there is a fairly significant portion that falls below 0. On the cluster plot, we however observe fewer interactivity between clusters.

Clustering

Cynara Nyahoda

3/1/2021