getwd()
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(flexclust)
## Loading required package: grid
## Loading required package: lattice
## Loading required package: modeltools
## Loading required package: stats4
library(fpc)
library(clustertend)
library(cluster)
library(ClusterR)
## Loading required package: gtools
library(gridExtra)
library(corrplot)
## corrplot 0.84 loaded
library(factoextra)
ctry <- read.csv(file = "C:\\Users\\cynar\\Desktop\\school\\Semester 1\\unsupervised learning\\Country-data.csv", head = T, sep = ",")
View(ctry)
ctry$child_mort <- as.numeric(ctry$child_mort)
ctry$exports <- as.numeric(ctry$exports)
ctry$health <- as.numeric(ctry$health)
ctry$imports <- as.numeric(ctry$imports)
ctry$inflation <- as.numeric(ctry$inflation)
ctry$life_expec <- as.numeric(ctry$life_expec)
ctry$total_fer <- as.numeric(ctry$total_fer)
summary(ctry)
## country child_mort exports health
## Length:167 Min. : 2.60 Min. : 0.109 Min. : 1.810
## Class :character 1st Qu.: 8.25 1st Qu.: 23.800 1st Qu.: 4.920
## Mode :character Median : 19.30 Median : 35.000 Median : 6.320
## Mean : 38.27 Mean : 41.109 Mean : 6.816
## 3rd Qu.: 62.10 3rd Qu.: 51.350 3rd Qu.: 8.600
## Max. :208.00 Max. :200.000 Max. :17.900
## imports income inflation life_expec
## Min. : 0.0659 Min. : 609 Min. : -4.210 Min. :32.10
## 1st Qu.: 30.2000 1st Qu.: 3355 1st Qu.: 1.810 1st Qu.:65.30
## Median : 43.3000 Median : 9960 Median : 5.390 Median :73.10
## Mean : 46.8902 Mean : 17145 Mean : 7.782 Mean :70.56
## 3rd Qu.: 58.7500 3rd Qu.: 22800 3rd Qu.: 10.750 3rd Qu.:76.80
## Max. :174.0000 Max. :125000 Max. :104.000 Max. :82.80
## total_fer gdpp
## Min. :1.150 Min. : 231
## 1st Qu.:1.795 1st Qu.: 1330
## Median :2.410 Median : 4660
## Mean :2.948 Mean : 12964
## 3rd Qu.:3.880 3rd Qu.: 14050
## Max. :7.490 Max. :105000
child_mort
hist(ctry$child_mort, main="Child Mortality",
col="red",
ylim = c(0,100), xlim = c(0, 250),
ylab="No. of Deaths",
xlab = "per 1000 live births")
min(ctry$child_mort)
## [1] 2.6
exports
hist(ctry$exports,main="Exports",
col="red",
ylim = c(0,80), xlim = c(0, 200),
ylab="No. of Exports",
xlab = "%age per GDP capita")
health
hist(ctry$health,main="Total Health Spending",
col="red",
ylim = c(0,60), xlim = c(0, 20),
ylab="No. of People",
xlab = "%age of GDP per capita")
imports
hist(ctry$imports,main="Imports of goods and services",
col="red",
ylim = c(0,80), xlim = c(0, 200),
ylab="No. of Imports",
xlab = "%age of the GDP per capita")
income
hist(ctry$income,main="Net Income",
col="red",
ylim = c(0,100), xlim = c(600, 140000),
ylab="No. of Customers",
xlab = "Net income")
inflation
hist(ctry$inflation,main="Inflation",
col="red",
ylim = c(0,120), xlim = c(-10, 120),
ylab="Number of Countries",
xlab = "Annual Growth Rate")
life_expec
hist(ctry$life_expec,main="Life Expactancy",
col="red",
ylim = c(0,50), xlim = c(20, 90),
ylab="No. of Births",
xlab = "Expenctancy")
total_fer
hist(ctry$total_fer,main="Spending Score",
col="red",
ylim = c(0,60), xlim = c(0, 8),
ylab="No. of Children",
xlab = "Fertility Rate")
gdpp
hist(ctry$gdpp,main="GDP per capita",
col="red",
ylim = c(0,125), xlim = c(0, 125000),
ylab="No. of Countries",
xlab = "GDP")
ctry_sub <- ctry[,2:10]
cor_matrix <- cor(ctry_sub, method = "pearson", use = "everything")
cormat <- cor(ctry_sub)
corrplot(cormat, method="number", type = "lower")
From the correlation plot, it is evident that child mortality correlates with a number variables with some intermediate negative correlation to income, a strong negetive correlation to life expectancy, a high positive correlation with total fertility and intermediate negative correlation with gdp per capita. On the other hand gdp per capita also has intermediate correlation with exports, and considerable correlation with health, strong positive correlation with income, intermediate correlation with life expanctancy and intermediate negative correlation with total fertility.
k_max <- 12
wss <- sapply(1:k_max, function(k){kmeans(ctry_sub, k,
nstart=50,iter.max = 1000 )$tot.withinss})
wss
## [1] 117459687469 36528387934 21710210040 14753425389 10655378084
## [6] 7655725089 6862317766 6195039586 3736749409 5456535669
## [11] 3384466669 3332200483
plot(1:k_max, wss,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
By using the elbow method, we observe that the optimal number of clusters is between 2 and 4, however we would further on analyse using another method to verify the optimal number of clusters. We therefore analyse the Silhouette Score Plot.
sspK <- fviz_nbclust(ctry_sub, FUNcluster = kmeans, method = "silhouette") +
ggtitle("K-means")
sspP <- fviz_nbclust(ctry_sub, FUNcluster = cluster::pam, method = "silhouette") +
ggtitle("PAM")
grid.arrange(sspK, sspP, ncol=2)
The Silhouette clarifies that the optimal number of clusters is 2. Now we then compare K-Means and the Partitioning Around Medoids. However, given that the dataset is small, we expect K-Means to be a better fit than PAM
km <- eclust(ctry_sub, k=2 , FUNcluster="kmeans", hc_metric="euclidean", graph=F)
km_clus <- fviz_cluster(km, data=mall_ana, elipse.type="convex", geom=c("point")) + ggtitle("K-means")
km_sil <- fviz_silhouette(km)
## cluster size ave.sil.width
## 1 1 32 0.45
## 2 2 135 0.79
grid.arrange(km_clus, km_sil, ncol=2)
pam <- eclust(ctry_sub, k=2 , FUNcluster="pam", hc_metric="euclidean", graph=F)
pam_clus <- fviz_cluster(pam, data=mall_ana, elipse.type="convex", geom=c("point")) + ggtitle("PAM")
pam_sil <- fviz_silhouette(pam)
## cluster size ave.sil.width
## 1 1 128 0.82
## 2 2 39 0.37
grid.arrange(pam_clus, pam_sil, ncol=2)
Right from the onset of analysis we can see that there are fewer observations which are interacting and on observation of the Silhouette width we see that there a very small portion which falls below zero while on PAM the Silhouette width is 0.71 and there is a fairly significant portion that falls below 0. On the cluster plot, we however observe fewer interactivity between clusters.