library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(clValid)
library(ggplot2)
library(ggpubr)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Reading of data
world_indicator <- read.csv("/cloud/project/world_indicator/World Indicators.csv")
# Labeling the different rows with the country names
rownames(world_indicator) <- world_indicator$Country
# Removing dollar sign, percent symbols and commas
world_indicator$GDP = as.numeric(gsub("[\\$,]", "", world_indicator$GDP))
world_indicator$Health.Exp.Capita = as.numeric(gsub("[\\$,]", "", world_indicator$Health.Exp.Capita))
world_indicator$Business.Tax.Rate = as.numeric(gsub("[\\%,]", "", world_indicator$Business.Tax.Rate))
# Dropping the non numerical columns
newdf <- world_indicator[, 1:18]
df_actual <- world_indicator
# Dropping the columns that have more than 70 NA values
newdf <- newdf[ , colSums(is.na(newdf)) < 70]
# Removing the non-significant columns
newdf <- newdf[, -c(2,3,7,9,12,13)]
# Omitting the rows with NA values
newdf <- na.omit(newdf)
# Scaling of data
newdf <- as.data.frame(scale(newdf))
# This clustering is obtained by considering all the attributes including birth rate, GDP, health expenditure, infant mortality rate, life expectancy (males and females) and population (different age groups)
# Elbow method: Determining optimal number of clusters
fviz_nbclust(newdf, kmeans, method = "wss")
fviz_nbclust(newdf, kmeans, method = "silhouette")
# All the methods above report k = 2 as the optimal number of clusters
# Using k-means clustering with k=2
km_world_indi <- kmeans(newdf, 2, nstart = 20)
km_world_indi$cluster
## Algeria Angola
## 1 2
## Benin Botswana
## 2 2
## Burkina Faso Burundi
## 2 2
## Cameroon Central African Republic
## 2 2
## Chad Comoros
## 2 2
## Congo, Dem. Rep. Congo, Rep.
## 2 2
## Cote d'Ivoire Djibouti
## 2 2
## Egypt, Arab Rep. Equatorial Guinea
## 1 2
## Eritrea Ethiopia
## 2 2
## Gabon Gambia, The
## 2 2
## Ghana Guinea
## 2 2
## Guinea-Bissau Kenya
## 2 2
## Lesotho Liberia
## 2 2
## Libya Madagascar
## 1 2
## Malawi Mali
## 2 2
## Mauritania Mauritius
## 2 1
## Morocco Mozambique
## 1 2
## Namibia Niger
## 2 2
## Nigeria Rwanda
## 2 2
## Sao Tome and Principe Senegal
## 2 2
## Seychelles Sierra Leone
## 1 2
## South Africa South Sudan
## 2 2
## Sudan Swaziland
## 2 2
## Tanzania Togo
## 2 2
## Tunisia Uganda
## 1 2
## Zambia Afghanistan
## 2 2
## Armenia Azerbaijan
## 1 1
## Bangladesh Bhutan
## 2 2
## Brunei Darussalam Cambodia
## 1 2
## China Georgia
## 1 1
## India Indonesia
## 2 1
## Japan Kazakhstan
## 1 1
## Korea, Rep. Kyrgyz Republic
## 1 2
## Lao PDR Malaysia
## 2 1
## Maldives Mongolia
## 1 1
## Nepal Pakistan
## 2 2
## Philippines Singapore
## 2 1
## Sri Lanka Tajikistan
## 1 2
## Thailand Timor-Leste
## 1 2
## Turkmenistan Uzbekistan
## 2 2
## Vietnam Albania
## 1 1
## Austria Belarus
## 1 1
## Belgium Bosnia and Herzegovina
## 1 1
## Bulgaria Croatia
## 1 1
## Cyprus Czech Republic
## 1 1
## Denmark Estonia
## 1 1
## Finland France
## 1 1
## Germany Greece
## 1 1
## Hungary Iceland
## 1 1
## Ireland Italy
## 1 1
## Latvia Lithuania
## 1 1
## Luxembourg Macedonia, FYR
## 1 1
## Malta Moldova
## 1 1
## Montenegro Netherlands
## 1 1
## Norway Poland
## 1 1
## Portugal Romania
## 1 1
## Russian Federation Serbia
## 1 1
## Slovak Republic Slovenia
## 1 1
## Spain Sweden
## 1 1
## Switzerland Turkey
## 1 1
## Ukraine United Kingdom
## 1 1
## Bahrain Iran, Islamic Rep.
## 1 1
## Iraq Israel
## 2 1
## Jordan Kuwait
## 1 1
## Lebanon Oman
## 1 1
## Qatar Saudi Arabia
## 1 1
## United Arab Emirates Yemen, Rep.
## 1 2
## Australia Fiji
## 1 1
## Kiribati Micronesia, Fed. Sts.
## 2 2
## New Zealand Papua New Guinea
## 1 2
## Samoa Solomon Islands
## 2 2
## Tonga Vanuatu
## 2 2
## Antigua and Barbuda Argentina
## 1 1
## Bahamas, The Barbados
## 1 1
## Belize Bolivia
## 1 2
## Brazil Canada
## 1 1
## Chile Colombia
## 1 1
## Costa Rica Cuba
## 1 1
## Dominican Republic Ecuador
## 1 1
## El Salvador Grenada
## 1 1
## Guatemala Guyana
## 2 2
## Haiti Honduras
## 2 1
## Jamaica Mexico
## 1 1
## Nicaragua Panama
## 1 1
## Paraguay Peru
## 1 1
## St. Lucia St. Vincent and the Grenadines
## 1 1
## Suriname Trinidad and Tobago
## 1 1
## United States Uruguay
## 1 1
## Venezuela, RB
## 1
summary(km_world_indi)
## Length Class Mode
## cluster 177 -none- numeric
## centers 20 -none- numeric
## totss 1 -none- numeric
## withinss 2 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 2 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
# K-means clustering of World Indicator data
clusplot(newdf, km_world_indi$cluster, main = '2D representation of the Cluster solution',
color = TRUE, shade = TRUE, labels = 1, lines = 1)
# This clustering is obtained by considering all the attributes including birth rate, GDP, health expenditure, infant mortality rate, life expectancy (males and females) and population (different age groups)
hc_world_indi <- hclust(dist (newdf), method = "average")
summary(hc_world_indi)
## Length Class Mode
## merge 352 -none- numeric
## height 176 -none- numeric
## order 177 -none- numeric
## labels 177 -none- character
## method 1 -none- character
## call 3 -none- call
## dist.method 1 -none- character
plot(hc_world_indi,main = "Clustered Dendogram of World Indicator Data")
# Using k = 3
hcm_world_indi <- cutree(hc_world_indi, k = 3)
plot(hcm_world_indi, main = "Solution by Hierarchical Clustering of World Indicator")
1.) Connectivity - For a particular clustering partition C = {C1,…,CK} of the N observations into K disjoint clusters, the connectivity is defined as summation of x(i) (i ranging from 1 to N, total number of observations) multiplied with summation of nn(i,j) (for i ranging from 1 to N and j ranging from 1 to L, parameter giving the number of nearest neighbors to use. Based on the different internal validation measures, the connectivity has a value between zero and ∞ and should be minimized.
2.) Silhouette Width - The Silhouette Width is the average of each observation’s Silhouette value. For observation i, it is defined as the ratio of (bi − ai) and maximum value of (bi, ai), where ai is the average distance between i and all other observations in the same cluster, and bi is the average distance between i and the observations in the nearest neighboring cluster.Silhouette Width lies in the interval [−1, 1] and should be maximized.
3.) Dunn Index - The Dunn Index is the ratio of the smallest distance between observations not in the same cluster to the largest intra-cluster distance.Dunn Index has a value between zero and ∞ and should be maximized for determining better cluster quality.
# Since our data does not have labels, we cannot perform external validation
int_validate <- clValid(newdf, nClust = 2:6, clMethods = c("kmeans", "hierarchical"),
validation = "internal")
summary(int_validate)
##
## Clustering Methods:
## kmeans hierarchical
##
## Cluster sizes:
## 2 3 4 5 6
##
## Validation Measures:
## 2 3 4 5 6
##
## kmeans Connectivity 16.7060 19.6349 34.6282 52.4016 55.7194
## Dunn 0.0485 0.0836 0.1376 0.0810 0.1585
## Silhouette 0.3883 0.3943 0.3580 0.2800 0.2791
## hierarchical Connectivity 2.9290 5.8579 21.4710 24.6250 31.8575
## Dunn 0.7570 0.3923 0.1419 0.1419 0.1615
## Silhouette 0.6815 0.3546 0.3607 0.3351 0.3211
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 2.9290 hierarchical 2
## Dunn 0.7570 hierarchical 2
## Silhouette 0.6815 hierarchical 2
print(paste0("Thus it is evident from these calculated validation measures that hierarchical clustering has a better cluster quality as compared to k-means"))
## [1] "Thus it is evident from these calculated validation measures that hierarchical clustering has a better cluster quality as compared to k-means"
# Comparing clustering solutions
# It has been depicted in the previous question based on the calculated validation measures that hierarchical clustering has a better cluster quality as compared to k-means"))
# Grouping the list of the countries clustered via k-means
listofcluster_kmeans = order(km_world_indi$cluster)
grouped_countries_kmeans <- data.frame(kmeans_cluster = km_world_indi$cluster[listofcluster_kmeans])
# View(grouped_countries_kmeans)
# Grouping the list of the countries clustered via hierarchical
listofcluster_hier = order(hcm_world_indi)
grouped_countries_hier <- data.frame(hierarchical_cluster = hcm_world_indi[listofcluster_hier])
# View(grouped_countries_hier)
# This clustering is obtained considering 2 attributes - GDP and infant mortality rate
# Scatter Plot 1: Infant Mortality Rate v/s GDP (using k-means clustering)
newdf_scatter1 <- world_indicator[, c(5, 9)]
newdf_scatter1 <- na.omit(newdf_scatter1)
newdf_scatter1_scaled <- as.data.frame(scale(newdf_scatter1))
fviz_nbclust(newdf_scatter1_scaled, kmeans, method = "silhouette")
km_scatter1 <- kmeans(newdf_scatter1_scaled, 2, nstart = 20)
km_scatter1$cluster
## Algeria Angola
## 1 2
## Benin Botswana
## 2 2
## Burkina Faso Burundi
## 2 2
## Cameroon Central African Republic
## 2 2
## Chad Comoros
## 2 2
## Congo, Dem. Rep. Congo, Rep.
## 2 2
## Cote d'Ivoire Djibouti
## 2 2
## Egypt, Arab Rep. Equatorial Guinea
## 1 2
## Eritrea Ethiopia
## 2 2
## Gabon Gambia, The
## 2 2
## Ghana Guinea
## 2 2
## Guinea-Bissau Kenya
## 2 2
## Lesotho Liberia
## 2 2
## Libya Madagascar
## 1 2
## Malawi Mali
## 2 2
## Mauritania Mauritius
## 2 1
## Morocco Mozambique
## 1 2
## Namibia Niger
## 2 2
## Nigeria Rwanda
## 2 2
## Sao Tome and Principe Senegal
## 2 2
## Seychelles Sierra Leone
## 1 2
## South Africa South Sudan
## 1 2
## Sudan Swaziland
## 2 2
## Tanzania Togo
## 2 2
## Tunisia Uganda
## 1 2
## Zambia Zimbabwe
## 2 2
## Afghanistan Armenia
## 2 1
## Azerbaijan Bangladesh
## 1 2
## Bhutan Brunei Darussalam
## 1 1
## Cambodia China
## 2 1
## Georgia India
## 1 2
## Indonesia Japan
## 1 1
## Kazakhstan Korea, Rep.
## 1 1
## Kyrgyz Republic Lao PDR
## 1 2
## Malaysia Maldives
## 1 1
## Mongolia Nepal
## 1 2
## Pakistan Philippines
## 2 1
## Singapore Sri Lanka
## 1 1
## Tajikistan Thailand
## 2 1
## Timor-Leste Turkmenistan
## 2 2
## Uzbekistan Vietnam
## 2 1
## Albania Austria
## 1 1
## Belarus Belgium
## 1 1
## Bosnia and Herzegovina Bulgaria
## 1 1
## Croatia Cyprus
## 1 1
## Czech Republic Denmark
## 1 1
## Estonia Finland
## 1 1
## France Germany
## 1 1
## Greece Hungary
## 1 1
## Iceland Ireland
## 1 1
## Italy Latvia
## 1 1
## Lithuania Luxembourg
## 1 1
## Macedonia, FYR Malta
## 1 1
## Moldova Monaco
## 1 1
## Montenegro Netherlands
## 1 1
## Norway Poland
## 1 1
## Portugal Romania
## 1 1
## Russian Federation Serbia
## 1 1
## Slovak Republic Slovenia
## 1 1
## Spain Sweden
## 1 1
## Switzerland Turkey
## 1 1
## Ukraine United Kingdom
## 1 1
## Bahrain Iran, Islamic Rep.
## 1 1
## Iraq Israel
## 1 1
## Jordan Kuwait
## 1 1
## Lebanon Oman
## 1 1
## Qatar Saudi Arabia
## 1 1
## United Arab Emirates Yemen, Rep.
## 1 2
## Australia Fiji
## 1 1
## Kiribati Marshall Islands
## 2 1
## Micronesia, Fed. Sts. New Zealand
## 1 1
## Papua New Guinea Samoa
## 2 1
## Solomon Islands Tonga
## 1 1
## Vanuatu Antigua and Barbuda
## 1 1
## Argentina Bahamas, The
## 1 1
## Barbados Belize
## 1 1
## Bolivia Brazil
## 1 1
## Canada Chile
## 1 1
## Colombia Costa Rica
## 1 1
## Cuba Dominica
## 1 1
## Dominican Republic Ecuador
## 1 1
## El Salvador Grenada
## 1 1
## Guatemala Guyana
## 1 1
## Haiti Honduras
## 2 1
## Jamaica Mexico
## 1 1
## Nicaragua Panama
## 1 1
## Paraguay Peru
## 1 1
## St. Kitts and Nevis St. Lucia
## 1 1
## St. Vincent and the Grenadines Suriname
## 1 1
## Trinidad and Tobago United States
## 1 1
## Uruguay Venezuela, RB
## 1 1
summary(km_scatter1)
## Length Class Mode
## cluster 182 -none- numeric
## centers 4 -none- numeric
## totss 1 -none- numeric
## withinss 2 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 2 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
plot(newdf_scatter1$Infant.Mortality.Rate, newdf_scatter1$GDP,
main = "Infant Mortality Rate v/s GDP",
xlab = "Infant Mortality Rate",
ylab = "GDP",
col = km_scatter1$cluster, pch = 19, frame = FALSE)
# This clustering is obtained considering 2 attributes - Urban population and energy usage
# Scatter Plot 2: Urban Population v/s Energy Usage (using k-means clustering)
newdf_scatter2 <- world_indicator[,c(4,18)]
newdf_scatter2 <- na.omit(newdf_scatter2)
newdf_scatter2_scaled <- as.data.frame(scale(newdf_scatter2))
fviz_nbclust(newdf_scatter2_scaled, kmeans, method = "silhouette")
km_scatter2 <- kmeans(newdf_scatter2_scaled, 10, nstart = 20)
km_scatter2$cluster
## Algeria Angola Benin
## 10 6 6
## Botswana Cameroon Congo, Dem. Rep.
## 1 1 6
## Congo, Rep. Cote d'Ivoire Egypt, Arab Rep.
## 10 1 6
## Eritrea Ethiopia Gabon
## 7 7 4
## Ghana Kenya Libya
## 1 7 10
## Morocco Mozambique Namibia
## 1 6 6
## Nigeria Senegal South Africa
## 6 6 10
## Sudan Tanzania Togo
## 6 7 6
## Tunisia Zambia Zimbabwe
## 10 6 6
## Armenia Azerbaijan Bangladesh
## 10 1 6
## Brunei Darussalam Cambodia China
## 10 7 3
## Georgia Hong Kong SAR, China India
## 1 9 8
## Indonesia Japan Kazakhstan
## 1 2 1
## Korea, Dem. Rep. Korea, Rep. Kyrgyz Republic
## 1 5 6
## Malaysia Mongolia Myanmar
## 10 10 6
## Nepal Pakistan Philippines
## 7 6 6
## Singapore Sri Lanka Tajikistan
## 9 7 7
## Thailand Turkmenistan Uzbekistan
## 6 1 6
## Vietnam Albania Austria
## 6 1 10
## Belarus Belgium Bosnia and Herzegovina
## 10 9 6
## Bulgaria Croatia Cyprus
## 10 1 10
## Czech Republic Denmark Estonia
## 10 4 10
## Finland France Germany
## 4 5 5
## Greece Hungary Iceland
## 10 10 9
## Ireland Italy Latvia
## 1 10 10
## Lithuania Luxembourg Macedonia, FYR
## 10 4 1
## Malta Moldova Montenegro
## 9 6 10
## Netherlands Norway Poland
## 4 4 1
## Portugal Romania Russian Federation
## 1 1 2
## Serbia Slovak Republic Slovenia
## 1 1 1
## Spain Sweden Switzerland
## 5 4 10
## Turkey Ukraine United Kingdom
## 10 10 5
## Bahrain Iran, Islamic Rep. Iraq
## 4 5 10
## Israel Jordan Kuwait
## 9 4 9
## Lebanon Oman Qatar
## 4 10 9
## Saudi Arabia Syrian Arab Republic United Arab Emirates
## 5 1 4
## Yemen, Rep. Australia New Zealand
## 6 4 4
## Argentina Bolivia Brazil
## 4 10 5
## Canada Chile Colombia
## 5 4 10
## Costa Rica Cuba Dominican Republic
## 10 10 10
## Ecuador El Salvador Guatemala
## 10 10 1
## Haiti Honduras Jamaica
## 1 1 1
## Mexico Nicaragua Panama
## 5 1 10
## Paraguay Peru Trinidad and Tobago
## 1 10 7
## United States Uruguay Venezuela, RB
## 3 9 4
summary(km_scatter2)
## Length Class Mode
## cluster 135 -none- numeric
## centers 20 -none- numeric
## totss 1 -none- numeric
## withinss 10 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 10 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
plot_ly(newdf_scatter2,
x = ~Population.Urban,
y = ~Energy.Usage,
color = ~km_scatter2$cluster) %>%
layout(title = "Urban Population v/s Energy Usage",
xaxis = list(title = "Urban Pouplation"),
yaxis = list(title = "Energy Usage"))
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
# This clustering is obtained considering 2 attributes - Population 15-64 years and internet usage
# Scatter Plot 3: Population 15-64 years v/s Internet Usage (using hierarchical clustering)
newdf_scatter3 <- world_indicator[, c(10, 16)]
newdf_scatter3 <- na.omit(newdf_scatter3)
newdf_scatter3_scaled <- as.data.frame(scale(newdf_scatter3))
hc_scatter3 <- hclust(dist (newdf_scatter3_scaled), method = "average")
summary(hc_scatter3)
## Length Class Mode
## merge 374 -none- numeric
## height 187 -none- numeric
## order 188 -none- numeric
## labels 188 -none- character
## method 1 -none- character
## call 3 -none- call
## dist.method 1 -none- character
plot(hc_scatter3)
#Using k=4
hcm_scatter3 <- cutree(hc_scatter3, k = 4)
plot(hcm_scatter3)
newdf_scatter3$Class_hier <- hcm_scatter3
ggplot(newdf_scatter3, aes(x = Population.15.64, y = Internet.Usage, color = as.factor(Class_hier))) +
geom_point(stat = "identity") +
ggtitle("Population 15-64 years v/s Internet Usage") +
xlab("Pouplation 15-64 years") +
ylab("Internet Usage") +
labs(color = "Cluster")+
scale_colour_brewer(palette = "Set1") +
theme(plot.title = element_text(size = 13, face = "bold.italic", color = "black", hjust = 0.5),
axis.title.x = element_text(size = 7, face = "bold"),
axis.title.y = element_text(size = 7, face = "bold"),
axis.text.x = element_text(size = 5, angle = 0),
axis.text.y = element_text(size = 5, angle = 90),
legend.position = "right",
legend.title = element_text(size = 7, face = "bold"),
legend.text = element_text(size = 7, face = "bold"),
strip.text = element_text(size = 7))