rm(list = ls())library(xlsx)
library(dplyr)
library(plotly)
library(stringr)
library(cluster)
library(FactoMineR)
library(factoextra)
library(ggplot2)
library(reshape2)
library(ggthemes)
library(NbClust)hpi <- read.xlsx('hpi-data-2016-1.xlsx',sheetIndex = 5, header = TRUE)# Remove the unnecessary columns
hpi <- hpi[c(3:14)]
# remove header and footer
hpi <- hpi[-c(1:4), ]
hpi <- hpi[-c(141:158), ]# Assign first row as header
names(hpi) <- as.matrix(hpi[1, ])
hpi <- hpi[-1, ]
hpi[] <- lapply(hpi, function(x) type.convert(as.character(x)))
head(hpi,5)## Country Region Average Life \nExpectancy
## 6 Afghanistan Middle East and North Africa 59.668
## 7 Albania Post-communist 77.347
## 8 Algeria Middle East and North Africa 74.313
## 9 Argentina Americas 75.927
## 10 Armenia Post-communist 74.446
## Average Wellbeing\n(0-10) Happy Life Years Footprint\n(gha/capita)
## 6 3.8 12.39602 0.79
## 7 5.5 34.41474 2.21
## 8 5.6 30.46946 2.12
## 9 6.5 40.16667 3.14
## 10 4.3 24.01876 2.23
## Inequality of Outcomes Inequality-adjusted Life Expectancy
## 6 0.4265574 38.34882
## 7 0.1651337 69.67116
## 8 0.2448617 60.47454
## 9 0.1642383 68.34958
## 10 0.2166481 66.92168
## Inequality-adjusted Wellbeing Happy Planet Index GDP/capita\n($PPP)
## 6 3.390494 20.22535 690.8426
## 7 5.097650 36.76687 4247.4854
## 8 5.196449 33.30054 5583.6162
## 9 6.034707 35.19024 14357.4116
## 10 3.747140 25.66642 3565.5176
## Population
## 6 29726803
## 7 2900489
## 8 37439427
## 9 42095224
## 10 2978339
# rename columns
hpi <- hpi[,c(grep('Country', colnames(hpi)),
grep('Region', colnames(hpi)),
grep('Happy.Planet.Index', colnames(hpi)),
grep('Average.Life..Expectancy', colnames(hpi)),
grep('Happy.Life.Years', colnames(hpi)),
grep('Footprint..gha.capita.', colnames(hpi)),
grep('GDP.capita...PPP.', colnames(hpi)),
grep('Inequality.of.Outcomes', colnames(hpi)),
grep('Average.Wellbeing..0.10.', colnames(hpi)),
grep('Inequality.adjusted.Life.Expectancy', colnames(hpi)),
grep('Inequality.adjusted.Wellbeing', colnames(hpi)),
grep('Population', colnames(hpi)))]
names(hpi) <- c('country',
'region',
'hpi_index',
'life_expectancy',
'happy_years',
'footprint',
'gdp',
'inequality_outcomes',
'wellbeing',
'adj_life_expectancy',
'adj_wellbeing',
'population')# change data type
hpi$country <- as.character(hpi$country)
hpi$region <- as.character(hpi$region)
hpi$life_expectancy <- as.numeric(hpi$life_expectancy)
hpi$wellbeing <- as.numeric(hpi$wellbeing)
hpi$happy_years <- as.numeric(hpi$happy_years)
hpi$footprint <- as.numeric(hpi$footprint)
hpi$inequality_outcomes <- as.numeric(hpi$inequality_outcomes)
hpi$adj_life_expectancy <- as.numeric(hpi$adj_life_expectancy)
hpi$adj_wellbeing <- as.numeric(hpi$adj_wellbeing)
hpi$hpi <- as.numeric(hpi$hpi)
hpi$gdp <- as.numeric(hpi$gdp)
hpi$population <- as.numeric(hpi$population)Structure of Data
str(hpi)## 'data.frame': 140 obs. of 13 variables:
## $ country : chr "Afghanistan" "Albania" "Algeria" "Argentina" ...
## $ region : chr "Middle East and North Africa" "Post-communist" "Middle East and North Africa" "Americas" ...
## $ hpi_index : num 20.2 36.8 33.3 35.2 25.7 ...
## $ life_expectancy : num 59.7 77.3 74.3 75.9 74.4 ...
## $ happy_years : num 12.4 34.4 30.5 40.2 24 ...
## $ footprint : num 0.79 2.21 2.12 3.14 2.23 9.31 6.06 0.72 5.09 7.44 ...
## $ gdp : num 691 4247 5584 14357 3566 ...
## $ inequality_outcomes: num 0.427 0.165 0.245 0.164 0.217 ...
## $ wellbeing : num 3.8 5.5 5.6 6.5 4.3 7.2 7.4 4.7 5.7 6.9 ...
## $ adj_life_expectancy: num 38.3 69.7 60.5 68.3 66.9 ...
## $ adj_wellbeing : num 3.39 5.1 5.2 6.03 3.75 ...
## $ population : num 29726803 2900489 37439427 42095224 2978339 ...
## $ hpi : num 20.2 36.8 33.3 35.2 25.7 ...
summary(hpi[, 3:12])## hpi_index life_expectancy happy_years footprint
## Min. :12.78 Min. :48.91 Min. : 8.97 Min. : 0.610
## 1st Qu.:21.18 1st Qu.:65.36 1st Qu.:18.76 1st Qu.: 1.445
## Median :26.38 Median :73.61 Median :29.59 Median : 2.700
## Mean :26.44 Mean :71.05 Mean :30.35 Mean : 3.271
## 3rd Qu.:31.58 3rd Qu.:77.09 3rd Qu.:39.84 3rd Qu.: 4.525
## Max. :44.71 Max. :83.57 Max. :59.32 Max. :15.820
## NA's :1 NA's :1 NA's :1 NA's :1
## gdp inequality_outcomes wellbeing adj_life_expectancy
## Min. : 244.2 Min. :0.04322 Min. :2.867 Min. :27.32
## 1st Qu.: 1664.2 1st Qu.:0.13299 1st Qu.:4.550 1st Qu.:48.39
## Median : 5702.2 Median :0.21147 Median :5.300 Median :63.44
## Mean : 14005.0 Mean :0.23195 Mean :5.411 Mean :60.51
## 3rd Qu.: 15190.5 3rd Qu.:0.32546 3rd Qu.:6.250 3rd Qu.:72.59
## Max. :105447.1 Max. :0.50734 Max. :7.800 Max. :81.26
## NA's :1 NA's :1 NA's :1 NA's :1
## adj_wellbeing population
## Min. :2.421 Min. :2.475e+05
## 1st Qu.:4.041 1st Qu.:4.229e+06
## Median :4.870 Median :1.051e+07
## Mean :4.975 Mean :4.825e+07
## 3rd Qu.:5.708 3rd Qu.:3.387e+07
## Max. :7.625 Max. :1.351e+09
## NA's :1 NA's :1
Plot
# The Top 20 Happies Countries(according to Happy Planet Index)
hpi_sort <- hpi[with(hpi, order(-hpi_index)), ]
hpi_top_20 <- head(hpi_sort, 20)
hpi_top_20 <- hpi_top_20[, c(1, 10, 3, 4, 7)] # select column
head(hpi_top_20,5)## country adj_life_expectancy hpi_index life_expectancy gdp
## 34 Costa Rica 72.61555 44.71407 79.076 9733.397
## 85 Mexico 66.31197 40.69729 76.411 9703.371
## 32 Colombia 63.10067 40.69501 73.673 7885.061
## 140 Vanuatu 60.32133 40.57010 71.341 3158.421
## 142 Vietnam 64.79426 40.30759 75.477 1754.548
knitr::kable(head(hpi_top_20))| country | adj_life_expectancy | hpi_index | life_expectancy | gdp | |
|---|---|---|---|---|---|
| 34 | Costa Rica | 72.61555 | 44.71407 | 79.076 | 9733.397 |
| 85 | Mexico | 66.31197 | 40.69729 | 76.411 | 9703.371 |
| 32 | Colombia | 63.10067 | 40.69501 | 73.673 | 7885.061 |
| 140 | Vanuatu | 60.32133 | 40.57010 | 71.341 | 3158.421 |
| 142 | Vietnam | 64.79426 | 40.30759 | 75.477 | 1754.548 |
| 102 | Panama | 68.32565 | 39.50258 | 77.215 | 10138.521 |
hpi_bottom_20 <- tail(hpi_sort, 20)
hpi_bottom_20 <- hpi_bottom_20[, c(1, 10, 3, 4, 7)]
hpi_bottom_20## country adj_life_expectancy hpi_index life_expectancy
## 96 Niger 38.87858 16.84656 60.046
## 58 Hong Kong 81.26282 16.79968 83.572
## 27 Cameroon 33.09404 16.69824 54.610
## 75 Lesotho 32.55987 16.66514 48.947
## 21 Botswana 50.79682 16.60508 64.249
## 40 Djibouti 41.36547 16.42837 61.311
## 117 South Africa 41.79492 15.86899 56.285
## 55 Guinea 37.22515 15.85358 57.657
## 130 Trinidad and Tobago 58.44326 15.71835 70.116
## 25 Burundi 33.01281 15.57616 55.803
## 122 Swaziland 31.81028 15.53604 48.910
## 114 Sierra Leone 28.18260 15.26465 49.758
## 133 Turkmenistan 48.32899 14.60854 65.298
## 35 Cote d'Ivoire 30.63832 14.43856 50.830
## 86 Mongolia 56.87023 14.26947 68.570
## 17 Benin 37.26980 13.42236 59.167
## 129 Togo 39.63976 13.23327 58.601
## 78 Luxembourg 78.97029 13.15117 81.111
## 29 Chad 27.31849 12.77716 50.808
## 163 <NA> NA NA NA
## gdp
## 96 393.6434
## 58 36707.7742
## 27 1222.1921
## 75 1158.8042
## 21 6935.5937
## 40 1586.7801
## 117 7592.1580
## 55 487.3457
## 130 18322.3238
## 25 244.1965
## 122 3988.6672
## 114 618.9473
## 133 6797.7212
## 35 1281.3829
## 86 4377.2389
## 17 807.6885
## 129 580.4951
## 78 105447.0932
## 29 972.6793
## 163 NA
head(hpi_bottom_20)## country adj_life_expectancy hpi_index life_expectancy gdp
## 96 Niger 38.87858 16.84656 60.046 393.6434
## 58 Hong Kong 81.26282 16.79968 83.572 36707.7742
## 27 Cameroon 33.09404 16.69824 54.610 1222.1921
## 75 Lesotho 32.55987 16.66514 48.947 1158.8042
## 21 Botswana 50.79682 16.60508 64.249 6935.5937
## 40 Djibouti 41.36547 16.42837 61.311 1586.7801
knitr::kable(tail(hpi_bottom_20), "simple")| country | adj_life_expectancy | hpi_index | life_expectancy | gdp | |
|---|---|---|---|---|---|
| 86 | Mongolia | 56.87023 | 14.26947 | 68.570 | 4377.2389 |
| 17 | Benin | 37.26980 | 13.42236 | 59.167 | 807.6885 |
| 129 | Togo | 39.63976 | 13.23327 | 58.601 | 580.4951 |
| 78 | Luxembourg | 78.97029 | 13.15117 | 81.111 | 105447.0932 |
| 29 | Chad | 27.31849 | 12.77716 | 50.808 | 972.6793 |
| 163 | NA | NA | NA | NA | NA |
# life expectancy vs GDP per Capita in USD
ggplot(hpi, aes(x = gdp, y = life_expectancy)) +
geom_point(aes(size = population, color = region)) +
geom_smooth(method = 'loess') +
ggtitle('Life Expectancy and GDP per Capita in USD log10') +
theme_classic()# life expectancy vs GDP per Capita in USD log10
ggplot(hpi, aes(x = gdp, y = life_expectancy)) +
geom_point(aes(size = population, color = region)) +
coord_trans(x = 'log10') + #Log 10 GDP
geom_smooth(method = 'loess') +
ggtitle('Life Expectancy and GDP per Capita in USD log10') +
theme_classic()cor.test(hpi$gdp, hpi$life_expectancy)##
## Pearson's product-moment correlation
##
## data: hpi$gdp and hpi$life_expectancy
## t = 9.2785, df = 137, p-value = 3.381e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5071660 0.7138732
## sample estimates:
## cor
## 0.6212097
# life expectancy vs hpi_index
ggplot(hpi, aes(x = life_expectancy, y = hpi_index)) +
geom_point(aes(size = population, color = region)) +
geom_smooth(method = 'loess') +
ggtitle('Life Expectancy and Happy Planet Index Score') +
theme_classic()# gdp vs hpi_index
ggplot(hpi, aes(x = gdp, y = hpi_index)) +
geom_point(aes(size = population, color = region)) +
geom_smooth(method = 'loess') +
ggtitle('GDP per Capita(log10) and Happy Planet Index Score') +
coord_trans(x = 'log10')cor.test(hpi$gdp, hpi$hpi_index)##
## Pearson's product-moment correlation
##
## data: hpi$gdp and hpi$hpi_index
## t = 1.316, df = 137, p-value = 0.1904
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.05581018 0.27314830
## sample estimates:
## cor
## 0.1117289
hpi[, 3:12] <- scale(hpi[, 3:12])hpi <- hpi[complete.cases(hpi), ] # remove N/A row
summary(hpi[, 3:12])## hpi_index life_expectancy happy_years footprint
## Min. :-1.862908 Min. :-2.5566 Min. :-1.61298 Min. :-1.1538
## 1st Qu.:-0.716792 1st Qu.:-0.6566 1st Qu.:-0.87456 1st Qu.:-0.7918
## Median :-0.008026 Median : 0.2953 Median :-0.05687 Median :-0.2476
## Mean : 0.000000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.700832 3rd Qu.: 0.6973 3rd Qu.: 0.71649 3rd Qu.: 0.5437
## Max. : 2.490992 Max. : 1.4461 Max. : 2.18571 Max. : 5.4410
## gdp inequality_outcomes wellbeing adj_life_expectancy
## Min. :-0.69548 Min. :-1.5625 Min. :-2.20839 Min. :-2.2427
## 1st Qu.:-0.62371 1st Qu.:-0.8194 1st Qu.:-0.74719 1st Qu.:-0.8190
## Median :-0.41963 Median :-0.1696 Median :-0.09615 Median : 0.1979
## Mean : 0.00000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.05991 3rd Qu.: 0.7742 3rd Qu.: 0.72849 3rd Qu.: 0.8161
## Max. : 4.62151 Max. : 2.2800 Max. : 2.07396 Max. : 1.4022
## adj_wellbeing population
## Min. :-2.14433 Min. :-0.29950
## 1st Qu.:-0.78425 1st Qu.:-0.27466
## Median :-0.08843 Median :-0.23544
## Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.61531 3rd Qu.:-0.08973
## Max. : 2.22440 Max. : 8.12656
cor1 <- cor(hpi[, 3:12])
cor1## hpi_index life_expectancy happy_years footprint
## hpi_index 1.00000000 0.54070352 0.49802687 -0.13533513
## life_expectancy 0.54070352 1.00000000 0.87615960 0.62038782
## happy_years 0.49802687 0.87615960 1.00000000 0.74681186
## footprint -0.13533513 0.62038782 0.74681186 1.00000000
## gdp 0.11172886 0.62120966 0.79592972 0.79564994
## inequality_outcomes -0.46493884 -0.93658205 -0.91999078 -0.71502869
## wellbeing 0.50874386 0.68927171 0.93110239 0.66826129
## adj_life_expectancy 0.48632795 0.98280286 0.88917923 0.66640483
## adj_wellbeing 0.48578291 0.67610988 0.93254332 0.68073404
## population 0.06555464 0.01000986 -0.02838965 -0.05915617
## gdp inequality_outcomes wellbeing
## hpi_index 0.11172886 -0.4649388 0.50874386
## life_expectancy 0.62120966 -0.9365820 0.68927171
## happy_years 0.79592972 -0.9199908 0.93110239
## footprint 0.79564994 -0.7150287 0.66826129
## gdp 1.00000000 -0.6680200 0.71047750
## inequality_outcomes -0.66802003 1.0000000 -0.75875713
## wellbeing 0.71047750 -0.7587571 1.00000000
## adj_life_expectancy 0.64151048 -0.9727959 0.69844506
## adj_wellbeing 0.73076419 -0.7601709 0.99444050
## population -0.05212178 0.0071589 -0.02401221
## adj_life_expectancy adj_wellbeing population
## hpi_index 0.486327952 0.48578291 0.065554644
## life_expectancy 0.982802865 0.67610988 0.010009865
## happy_years 0.889179226 0.93254332 -0.028389653
## footprint 0.666404826 0.68073404 -0.059156173
## gdp 0.641510475 0.73076419 -0.052121784
## inequality_outcomes -0.972795855 -0.76017093 0.007158900
## wellbeing 0.698445055 0.99444050 -0.024012215
## adj_life_expectancy 1.000000000 0.68485608 -0.003310006
## adj_wellbeing 0.684856084 1.00000000 -0.026022587
## population -0.003310006 -0.02602259 1.000000000
qplot(x=Var1, y=Var2, data=melt(cor(hpi[, 3:12], use="p")), fill=value, geom="tile") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title="Heatmap of Correlation Matrix",
x=NULL, y=NULL)qplot(x=Var1, y=Var2, data=melt(cor(hpi[, 3:12], use="p")), fill=value, geom="tile") +
scale_fill_gradient2(limits=c(-1, 1)) + # fill color for
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title="Heatmap of Correlation Matrix",
x=NULL, y=NULL)hpi.pca <- PCA(hpi[, 3:12], graph = FALSE)
print(hpi.pca)## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 139 individuals, described by 10 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
eigenvalues <- hpi.pca$eig
head(eigenvalues)## eigenvalue percentage of variance cumulative percentage of variance
## comp 1 6.66951476 66.6951476 66.69515
## comp 2 1.31543659 13.1543659 79.84951
## comp 3 0.97056041 9.7056041 89.55512
## comp 4 0.69533213 6.9533213 96.50844
## comp 5 0.24206842 2.4206842 98.92912
## comp 6 0.05149289 0.5149289 99.44405
# 6 components fviz_screeplot(hpi.pca, addlabels = TRUE, ylim = c(0, 65))The scree plot shows us which components explain most of the variability in the data. In this case, almost 80% of the variances contained in the data are retained by the first two principal components.
head(hpi.pca$var$contrib)## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## hpi_index 3.542876 51.0853576 5.151187279 2.2511810 5.25202501
## life_expectancy 12.312754 2.2855289 0.006687887 18.2969362 0.32702458
## happy_years 14.785622 0.0129967 0.024409907 0.7566482 0.03471453
## footprint 8.986702 24.8600138 2.893931577 0.4839522 7.64700025
## gdp 9.666480 11.5881130 0.994732438 2.4805961 72.45373605
## inequality_outcomes 13.353997 0.3013117 0.007705717 10.0667736 2.86884477
fviz_pca_var(hpi.pca, col.var="contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE
) This highlights the most important variables in explaining the variations retained by the principal components. hpi_index, life_expectancy.
Using Pam Clustering Analysis to group countries by wealth, development, carbon emissions, and happiness.
number <- NbClust(hpi[, 3:12], distance = "euclidean",
min.nc = 2, max.nc = 15,
method = 'ward.D',
index = 'all',
alphaBeale = 0.1)## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 2 as the best number of clusters
## * 8 proposed 3 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 4 proposed 6 as the best number of clusters
## * 3 proposed 10 as the best number of clusters
## * 3 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
I will apply K=3 in the following steps.
set.seed(2017)
pam <- pam(hpi[, 3:12], diss = FALSE, 3, keep.data = TRUE)
fviz_silhouette(pam) #Number of countries assigned in each cluster.## cluster size ave.sil.width
## 1 1 42 0.46
## 2 2 66 0.32
## 3 3 31 0.37
# This prints out one typical country represents each cluster.
hpi$country[pam$id.med]## [1] "Liberia" "Romania" "Ireland"
fviz_cluster(pam, stand = FALSE, geom = "point",
ellipse.type = "norm")hpi['cluster'] <- as.factor(pam$clustering)
map <- map_data("world")
map <- left_join(map, hpi, by = c('region' = 'country'))
ggplot() + geom_polygon(data = map, aes(x = long, y = lat, group = group, fill=cluster, color=cluster)) +
labs(title = "Clustering Happy Planet Index", subtitle = "Based on data from:http://happyplanetindex.org/", x=NULL, y=NULL) + theme_minimal()