1 Objective

  1. Data wrangling
  2. Correlation test
  3. PCA
  4. Pam Clustering
  5. World map

2 Clear Workspace

rm(list = ls())

3 Setup

3.1 Load packages

library(xlsx)
library(dplyr)
library(plotly)
library(stringr)
library(cluster)
library(FactoMineR)
library(factoextra)
library(ggplot2)
library(reshape2)
library(ggthemes)
library(NbClust)

3.2 Load data

hpi <- read.xlsx('hpi-data-2016-1.xlsx',sheetIndex = 5, header = TRUE)

4 Data Pre-processing

# Remove the unnecessary columns
hpi <- hpi[c(3:14)]
# remove header and footer
hpi <- hpi[-c(1:4), ]
hpi <- hpi[-c(141:158), ]
# Assign first row as header 
names(hpi) <- as.matrix(hpi[1, ])
hpi <- hpi[-1, ]
hpi[] <- lapply(hpi, function(x) type.convert(as.character(x)))

head(hpi,5)
##        Country                       Region Average Life \nExpectancy
## 6  Afghanistan Middle East and North Africa                    59.668
## 7      Albania               Post-communist                    77.347
## 8      Algeria Middle East and North Africa                    74.313
## 9    Argentina                     Americas                    75.927
## 10     Armenia               Post-communist                    74.446
##    Average Wellbeing\n(0-10) Happy Life Years Footprint\n(gha/capita)
## 6                        3.8         12.39602                    0.79
## 7                        5.5         34.41474                    2.21
## 8                        5.6         30.46946                    2.12
## 9                        6.5         40.16667                    3.14
## 10                       4.3         24.01876                    2.23
##    Inequality of Outcomes Inequality-adjusted Life Expectancy
## 6               0.4265574                            38.34882
## 7               0.1651337                            69.67116
## 8               0.2448617                            60.47454
## 9               0.1642383                            68.34958
## 10              0.2166481                            66.92168
##    Inequality-adjusted Wellbeing Happy Planet Index GDP/capita\n($PPP)
## 6                       3.390494           20.22535           690.8426
## 7                       5.097650           36.76687          4247.4854
## 8                       5.196449           33.30054          5583.6162
## 9                       6.034707           35.19024         14357.4116
## 10                      3.747140           25.66642          3565.5176
##    Population
## 6    29726803
## 7     2900489
## 8    37439427
## 9    42095224
## 10    2978339
# rename columns
hpi <- hpi[,c(grep('Country', colnames(hpi)), 
                grep('Region', colnames(hpi)), 
                  grep('Happy.Planet.Index', colnames(hpi)), 
                    grep('Average.Life..Expectancy', colnames(hpi)),
                      grep('Happy.Life.Years', colnames(hpi)), 
                       grep('Footprint..gha.capita.', colnames(hpi)), 
                        grep('GDP.capita...PPP.', colnames(hpi)), 
                          grep('Inequality.of.Outcomes', colnames(hpi)), 
                           grep('Average.Wellbeing..0.10.', colnames(hpi)), 
                             grep('Inequality.adjusted.Life.Expectancy', colnames(hpi)),
                              grep('Inequality.adjusted.Wellbeing', colnames(hpi)), 
                                grep('Population', colnames(hpi)))]

names(hpi) <- c('country', 
                  'region',
                     'hpi_index', 
                        'life_expectancy', 
                           'happy_years', 
                             'footprint', 
                               'gdp', 
                                'inequality_outcomes', 
                                  'wellbeing', 
                                    'adj_life_expectancy', 
                                       'adj_wellbeing', 
                                         'population')
# change data type
hpi$country <- as.character(hpi$country)
hpi$region <- as.character(hpi$region)
hpi$life_expectancy <- as.numeric(hpi$life_expectancy)
hpi$wellbeing <- as.numeric(hpi$wellbeing)
hpi$happy_years <- as.numeric(hpi$happy_years)
hpi$footprint <- as.numeric(hpi$footprint)
hpi$inequality_outcomes <- as.numeric(hpi$inequality_outcomes)
hpi$adj_life_expectancy <- as.numeric(hpi$adj_life_expectancy)
hpi$adj_wellbeing <- as.numeric(hpi$adj_wellbeing)
hpi$hpi <- as.numeric(hpi$hpi)
hpi$gdp <- as.numeric(hpi$gdp)
hpi$population <- as.numeric(hpi$population)

Structure of Data

str(hpi)
## 'data.frame':    140 obs. of  13 variables:
##  $ country            : chr  "Afghanistan" "Albania" "Algeria" "Argentina" ...
##  $ region             : chr  "Middle East and North Africa" "Post-communist" "Middle East and North Africa" "Americas" ...
##  $ hpi_index          : num  20.2 36.8 33.3 35.2 25.7 ...
##  $ life_expectancy    : num  59.7 77.3 74.3 75.9 74.4 ...
##  $ happy_years        : num  12.4 34.4 30.5 40.2 24 ...
##  $ footprint          : num  0.79 2.21 2.12 3.14 2.23 9.31 6.06 0.72 5.09 7.44 ...
##  $ gdp                : num  691 4247 5584 14357 3566 ...
##  $ inequality_outcomes: num  0.427 0.165 0.245 0.164 0.217 ...
##  $ wellbeing          : num  3.8 5.5 5.6 6.5 4.3 7.2 7.4 4.7 5.7 6.9 ...
##  $ adj_life_expectancy: num  38.3 69.7 60.5 68.3 66.9 ...
##  $ adj_wellbeing      : num  3.39 5.1 5.2 6.03 3.75 ...
##  $ population         : num  29726803 2900489 37439427 42095224 2978339 ...
##  $ hpi                : num  20.2 36.8 33.3 35.2 25.7 ...
summary(hpi[, 3:12])
##    hpi_index     life_expectancy  happy_years      footprint     
##  Min.   :12.78   Min.   :48.91   Min.   : 8.97   Min.   : 0.610  
##  1st Qu.:21.18   1st Qu.:65.36   1st Qu.:18.76   1st Qu.: 1.445  
##  Median :26.38   Median :73.61   Median :29.59   Median : 2.700  
##  Mean   :26.44   Mean   :71.05   Mean   :30.35   Mean   : 3.271  
##  3rd Qu.:31.58   3rd Qu.:77.09   3rd Qu.:39.84   3rd Qu.: 4.525  
##  Max.   :44.71   Max.   :83.57   Max.   :59.32   Max.   :15.820  
##  NA's   :1       NA's   :1       NA's   :1       NA's   :1       
##       gdp           inequality_outcomes   wellbeing     adj_life_expectancy
##  Min.   :   244.2   Min.   :0.04322     Min.   :2.867   Min.   :27.32      
##  1st Qu.:  1664.2   1st Qu.:0.13299     1st Qu.:4.550   1st Qu.:48.39      
##  Median :  5702.2   Median :0.21147     Median :5.300   Median :63.44      
##  Mean   : 14005.0   Mean   :0.23195     Mean   :5.411   Mean   :60.51      
##  3rd Qu.: 15190.5   3rd Qu.:0.32546     3rd Qu.:6.250   3rd Qu.:72.59      
##  Max.   :105447.1   Max.   :0.50734     Max.   :7.800   Max.   :81.26      
##  NA's   :1          NA's   :1           NA's   :1       NA's   :1          
##  adj_wellbeing     population       
##  Min.   :2.421   Min.   :2.475e+05  
##  1st Qu.:4.041   1st Qu.:4.229e+06  
##  Median :4.870   Median :1.051e+07  
##  Mean   :4.975   Mean   :4.825e+07  
##  3rd Qu.:5.708   3rd Qu.:3.387e+07  
##  Max.   :7.625   Max.   :1.351e+09  
##  NA's   :1       NA's   :1

Plot

# The Top 20 Happies Countries(according to Happy Planet Index)
hpi_sort <- hpi[with(hpi, order(-hpi_index)), ]
hpi_top_20 <- head(hpi_sort, 20)
hpi_top_20 <- hpi_top_20[, c(1, 10, 3, 4, 7)] # select column 

head(hpi_top_20,5)
##        country adj_life_expectancy hpi_index life_expectancy      gdp
## 34  Costa Rica            72.61555  44.71407          79.076 9733.397
## 85      Mexico            66.31197  40.69729          76.411 9703.371
## 32    Colombia            63.10067  40.69501          73.673 7885.061
## 140    Vanuatu            60.32133  40.57010          71.341 3158.421
## 142    Vietnam            64.79426  40.30759          75.477 1754.548
knitr::kable(head(hpi_top_20))
country adj_life_expectancy hpi_index life_expectancy gdp
34 Costa Rica 72.61555 44.71407 79.076 9733.397
85 Mexico 66.31197 40.69729 76.411 9703.371
32 Colombia 63.10067 40.69501 73.673 7885.061
140 Vanuatu 60.32133 40.57010 71.341 3158.421
142 Vietnam 64.79426 40.30759 75.477 1754.548
102 Panama 68.32565 39.50258 77.215 10138.521
hpi_bottom_20 <- tail(hpi_sort, 20)
hpi_bottom_20 <- hpi_bottom_20[, c(1, 10, 3, 4, 7)]
hpi_bottom_20
##                 country adj_life_expectancy hpi_index life_expectancy
## 96                Niger            38.87858  16.84656          60.046
## 58            Hong Kong            81.26282  16.79968          83.572
## 27             Cameroon            33.09404  16.69824          54.610
## 75              Lesotho            32.55987  16.66514          48.947
## 21             Botswana            50.79682  16.60508          64.249
## 40             Djibouti            41.36547  16.42837          61.311
## 117        South Africa            41.79492  15.86899          56.285
## 55               Guinea            37.22515  15.85358          57.657
## 130 Trinidad and Tobago            58.44326  15.71835          70.116
## 25              Burundi            33.01281  15.57616          55.803
## 122           Swaziland            31.81028  15.53604          48.910
## 114        Sierra Leone            28.18260  15.26465          49.758
## 133        Turkmenistan            48.32899  14.60854          65.298
## 35        Cote d'Ivoire            30.63832  14.43856          50.830
## 86             Mongolia            56.87023  14.26947          68.570
## 17                Benin            37.26980  13.42236          59.167
## 129                Togo            39.63976  13.23327          58.601
## 78           Luxembourg            78.97029  13.15117          81.111
## 29                 Chad            27.31849  12.77716          50.808
## 163                <NA>                  NA        NA              NA
##             gdp
## 96     393.6434
## 58   36707.7742
## 27    1222.1921
## 75    1158.8042
## 21    6935.5937
## 40    1586.7801
## 117   7592.1580
## 55     487.3457
## 130  18322.3238
## 25     244.1965
## 122   3988.6672
## 114    618.9473
## 133   6797.7212
## 35    1281.3829
## 86    4377.2389
## 17     807.6885
## 129    580.4951
## 78  105447.0932
## 29     972.6793
## 163          NA
head(hpi_bottom_20)
##      country adj_life_expectancy hpi_index life_expectancy        gdp
## 96     Niger            38.87858  16.84656          60.046   393.6434
## 58 Hong Kong            81.26282  16.79968          83.572 36707.7742
## 27  Cameroon            33.09404  16.69824          54.610  1222.1921
## 75   Lesotho            32.55987  16.66514          48.947  1158.8042
## 21  Botswana            50.79682  16.60508          64.249  6935.5937
## 40  Djibouti            41.36547  16.42837          61.311  1586.7801
knitr::kable(tail(hpi_bottom_20), "simple")
country adj_life_expectancy hpi_index life_expectancy gdp
86 Mongolia 56.87023 14.26947 68.570 4377.2389
17 Benin 37.26980 13.42236 59.167 807.6885
129 Togo 39.63976 13.23327 58.601 580.4951
78 Luxembourg 78.97029 13.15117 81.111 105447.0932
29 Chad 27.31849 12.77716 50.808 972.6793
163 NA NA NA NA NA
# life expectancy vs GDP per Capita in USD 
ggplot(hpi, aes(x = gdp, y = life_expectancy)) + 
  geom_point(aes(size = population, color = region)) + 
  geom_smooth(method = 'loess') +
  ggtitle('Life Expectancy and GDP per Capita in USD log10') +
  theme_classic()

# life expectancy vs GDP per Capita in USD log10
ggplot(hpi, aes(x = gdp, y = life_expectancy)) + 
  geom_point(aes(size = population, color = region)) + 
  coord_trans(x = 'log10') + #Log 10 GDP 
  geom_smooth(method = 'loess') +
  ggtitle('Life Expectancy and GDP per Capita in USD log10') +
  theme_classic()

cor.test(hpi$gdp, hpi$life_expectancy)
## 
##  Pearson's product-moment correlation
## 
## data:  hpi$gdp and hpi$life_expectancy
## t = 9.2785, df = 137, p-value = 3.381e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5071660 0.7138732
## sample estimates:
##       cor 
## 0.6212097
# life expectancy vs hpi_index 
ggplot(hpi, aes(x = life_expectancy, y = hpi_index)) +
  geom_point(aes(size = population, color = region)) +
  geom_smooth(method = 'loess') +
  ggtitle('Life Expectancy and Happy Planet Index Score') + 
  theme_classic()

# gdp vs hpi_index
ggplot(hpi, aes(x = gdp, y = hpi_index)) +
  geom_point(aes(size = population, color = region)) +
  geom_smooth(method = 'loess') +
  ggtitle('GDP per Capita(log10) and Happy Planet Index Score') + 
  coord_trans(x = 'log10')

cor.test(hpi$gdp, hpi$hpi_index)
## 
##  Pearson's product-moment correlation
## 
## data:  hpi$gdp and hpi$hpi_index
## t = 1.316, df = 137, p-value = 0.1904
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.05581018  0.27314830
## sample estimates:
##       cor 
## 0.1117289

5 Scale the data

hpi[, 3:12] <- scale(hpi[, 3:12])
hpi <- hpi[complete.cases(hpi), ] # remove N/A row 

summary(hpi[, 3:12])
##    hpi_index         life_expectancy    happy_years         footprint      
##  Min.   :-1.862908   Min.   :-2.5566   Min.   :-1.61298   Min.   :-1.1538  
##  1st Qu.:-0.716792   1st Qu.:-0.6566   1st Qu.:-0.87456   1st Qu.:-0.7918  
##  Median :-0.008026   Median : 0.2953   Median :-0.05687   Median :-0.2476  
##  Mean   : 0.000000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.700832   3rd Qu.: 0.6973   3rd Qu.: 0.71649   3rd Qu.: 0.5437  
##  Max.   : 2.490992   Max.   : 1.4461   Max.   : 2.18571   Max.   : 5.4410  
##       gdp           inequality_outcomes   wellbeing        adj_life_expectancy
##  Min.   :-0.69548   Min.   :-1.5625     Min.   :-2.20839   Min.   :-2.2427    
##  1st Qu.:-0.62371   1st Qu.:-0.8194     1st Qu.:-0.74719   1st Qu.:-0.8190    
##  Median :-0.41963   Median :-0.1696     Median :-0.09615   Median : 0.1979    
##  Mean   : 0.00000   Mean   : 0.0000     Mean   : 0.00000   Mean   : 0.0000    
##  3rd Qu.: 0.05991   3rd Qu.: 0.7742     3rd Qu.: 0.72849   3rd Qu.: 0.8161    
##  Max.   : 4.62151   Max.   : 2.2800     Max.   : 2.07396   Max.   : 1.4022    
##  adj_wellbeing        population      
##  Min.   :-2.14433   Min.   :-0.29950  
##  1st Qu.:-0.78425   1st Qu.:-0.27466  
##  Median :-0.08843   Median :-0.23544  
##  Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.61531   3rd Qu.:-0.08973  
##  Max.   : 2.22440   Max.   : 8.12656
cor1 <- cor(hpi[, 3:12])
cor1
##                       hpi_index life_expectancy happy_years   footprint
## hpi_index            1.00000000      0.54070352  0.49802687 -0.13533513
## life_expectancy      0.54070352      1.00000000  0.87615960  0.62038782
## happy_years          0.49802687      0.87615960  1.00000000  0.74681186
## footprint           -0.13533513      0.62038782  0.74681186  1.00000000
## gdp                  0.11172886      0.62120966  0.79592972  0.79564994
## inequality_outcomes -0.46493884     -0.93658205 -0.91999078 -0.71502869
## wellbeing            0.50874386      0.68927171  0.93110239  0.66826129
## adj_life_expectancy  0.48632795      0.98280286  0.88917923  0.66640483
## adj_wellbeing        0.48578291      0.67610988  0.93254332  0.68073404
## population           0.06555464      0.01000986 -0.02838965 -0.05915617
##                             gdp inequality_outcomes   wellbeing
## hpi_index            0.11172886          -0.4649388  0.50874386
## life_expectancy      0.62120966          -0.9365820  0.68927171
## happy_years          0.79592972          -0.9199908  0.93110239
## footprint            0.79564994          -0.7150287  0.66826129
## gdp                  1.00000000          -0.6680200  0.71047750
## inequality_outcomes -0.66802003           1.0000000 -0.75875713
## wellbeing            0.71047750          -0.7587571  1.00000000
## adj_life_expectancy  0.64151048          -0.9727959  0.69844506
## adj_wellbeing        0.73076419          -0.7601709  0.99444050
## population          -0.05212178           0.0071589 -0.02401221
##                     adj_life_expectancy adj_wellbeing   population
## hpi_index                   0.486327952    0.48578291  0.065554644
## life_expectancy             0.982802865    0.67610988  0.010009865
## happy_years                 0.889179226    0.93254332 -0.028389653
## footprint                   0.666404826    0.68073404 -0.059156173
## gdp                         0.641510475    0.73076419 -0.052121784
## inequality_outcomes        -0.972795855   -0.76017093  0.007158900
## wellbeing                   0.698445055    0.99444050 -0.024012215
## adj_life_expectancy         1.000000000    0.68485608 -0.003310006
## adj_wellbeing               0.684856084    1.00000000 -0.026022587
## population                 -0.003310006   -0.02602259  1.000000000
qplot(x=Var1, y=Var2, data=melt(cor(hpi[, 3:12], use="p")), fill=value, geom="tile") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
             labs(title="Heatmap of Correlation Matrix", 
                  x=NULL, y=NULL)

qplot(x=Var1, y=Var2, data=melt(cor(hpi[, 3:12], use="p")), fill=value, geom="tile") +
  scale_fill_gradient2(limits=c(-1, 1)) + # fill color for 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
             labs(title="Heatmap of Correlation Matrix", 
                  x=NULL, y=NULL)

6 Principle Component Anlysis (PCA)

hpi.pca <- PCA(hpi[, 3:12], graph = FALSE)
print(hpi.pca)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 139 individuals, described by 10 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"
eigenvalues <- hpi.pca$eig
head(eigenvalues)
##        eigenvalue percentage of variance cumulative percentage of variance
## comp 1 6.66951476             66.6951476                          66.69515
## comp 2 1.31543659             13.1543659                          79.84951
## comp 3 0.97056041              9.7056041                          89.55512
## comp 4 0.69533213              6.9533213                          96.50844
## comp 5 0.24206842              2.4206842                          98.92912
## comp 6 0.05149289              0.5149289                          99.44405
# 6 components 
fviz_screeplot(hpi.pca, addlabels = TRUE, ylim = c(0, 65))

The scree plot shows us which components explain most of the variability in the data. In this case, almost 80% of the variances contained in the data are retained by the first two principal components.

head(hpi.pca$var$contrib)
##                         Dim.1      Dim.2       Dim.3      Dim.4       Dim.5
## hpi_index            3.542876 51.0853576 5.151187279  2.2511810  5.25202501
## life_expectancy     12.312754  2.2855289 0.006687887 18.2969362  0.32702458
## happy_years         14.785622  0.0129967 0.024409907  0.7566482  0.03471453
## footprint            8.986702 24.8600138 2.893931577  0.4839522  7.64700025
## gdp                  9.666480 11.5881130 0.994732438  2.4805961 72.45373605
## inequality_outcomes 13.353997  0.3013117 0.007705717 10.0667736  2.86884477
  1. Variables that are correlated with PC1 and PC2 are the most important in explaining the variability in the data set.
  2. The contribution of variables was extracted above: The larger the value of the contribution, the more the variable contributes to the component.
fviz_pca_var(hpi.pca, col.var="contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE 
             )

This highlights the most important variables in explaining the variations retained by the principal components. hpi_index, life_expectancy.

7 Pam Clustering Analysis

Using Pam Clustering Analysis to group countries by wealth, development, carbon emissions, and happiness.

number <- NbClust(hpi[, 3:12], distance = "euclidean",
                  min.nc = 2, max.nc = 15,
                  method = 'ward.D',
                  index = 'all',
                  alphaBeale = 0.1)

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 4 proposed 2 as the best number of clusters 
## * 8 proposed 3 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 4 proposed 6 as the best number of clusters 
## * 3 proposed 10 as the best number of clusters 
## * 3 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

I will apply K=3 in the following steps.

set.seed(2017)
pam <- pam(hpi[, 3:12], diss = FALSE, 3, keep.data = TRUE)

fviz_silhouette(pam) #Number of countries assigned in each cluster.
##   cluster size ave.sil.width
## 1       1   42          0.46
## 2       2   66          0.32
## 3       3   31          0.37

# This prints out one typical country represents each cluster.
hpi$country[pam$id.med]
## [1] "Liberia" "Romania" "Ireland"
fviz_cluster(pam, stand = FALSE, geom = "point",
             ellipse.type = "norm")

8 A world map of 3 clusters

hpi['cluster'] <- as.factor(pam$clustering)

map <- map_data("world")

map <- left_join(map, hpi, by = c('region' = 'country'))

ggplot() + geom_polygon(data = map, aes(x = long, y = lat, group = group, fill=cluster, color=cluster)) +
  labs(title = "Clustering Happy Planet Index", subtitle = "Based on data from:http://happyplanetindex.org/", x=NULL, y=NULL) + theme_minimal()