Library Imports

library(mclust)
## Package 'mclust' version 6.0.1
## Type 'citation("mclust")' for citing this R package in publications.
library(magrittr)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
## 
##     extract
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.4     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract()   masks magrittr::extract()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ purrr::map()       masks mclust::map()
## ✖ purrr::set_names() masks magrittr::set_names()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(NbClust)
library(cluster)
library(dplyr)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(lme4) 
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(haven) 
library(clustvarsel)
## Package 'clustvarsel' version 2.3.4
## Type 'citation("clustvarsel")' for citing this R package in publications.
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(corrplot)
## corrplot 0.92 loaded
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths

Descriptive Analysis

Dataset Loading

data <- read.csv("2018.csv")

Correlation Plot

plot(data[,-c(1,2)]) 

Missing Values

missing_values_per_variable <- colSums(is.na(data))
print(missing_values_per_variable)
##                 Overall.rank            Country.or.region 
##                            0                            0 
##                        Score               GDP.per.capita 
##                            0                            0 
##               Social.support      Healthy.life.expectancy 
##                            0                            0 
## Freedom.to.make.life.choices                   Generosity 
##                            0                            0 
##    Perceptions.of.corruption 
##                            0

Barplot of Scores

Scoreplot <- ggplot(data, aes(x = reorder(`Country.or.region`, -Score), y = Score, fill = Score)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  labs(title = "Country or Region and Happiness Score",
       x = "Country or region",
       y = "Happiness Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_gradient(low = "darkblue", high = "lightcoral") 

print(Scoreplot)

Top and Bottom 20 Countries

top_countries <- head(data[order(-data$Score), ], 20)
print(top_countries)
##    Overall.rank    Country.or.region Score GDP.per.capita Social.support
## 1             1              Finland 7.632          1.305          1.592
## 2             2               Norway 7.594          1.456          1.582
## 3             3              Denmark 7.555          1.351          1.590
## 4             4              Iceland 7.495          1.343          1.644
## 5             5          Switzerland 7.487          1.420          1.549
## 6             6          Netherlands 7.441          1.361          1.488
## 7             7               Canada 7.328          1.330          1.532
## 8             8          New Zealand 7.324          1.268          1.601
## 9             9               Sweden 7.314          1.355          1.501
## 10           10            Australia 7.272          1.340          1.573
## 11           11       United Kingdom 7.190          1.244          1.433
## 12           12              Austria 7.139          1.341          1.504
## 13           13           Costa Rica 7.072          1.010          1.459
## 14           14              Ireland 6.977          1.448          1.583
## 15           15              Germany 6.965          1.340          1.474
## 16           16              Belgium 6.927          1.324          1.483
## 17           17           Luxembourg 6.910          1.576          1.520
## 18           18        United States 6.886          1.398          1.471
## 19           19               Israel 6.814          1.301          1.559
## 20           20 United Arab Emirates 6.774          2.096          0.776
##    Healthy.life.expectancy Freedom.to.make.life.choices Generosity
## 1                    0.874                        0.681      0.202
## 2                    0.861                        0.686      0.286
## 3                    0.868                        0.683      0.284
## 4                    0.914                        0.677      0.353
## 5                    0.927                        0.660      0.256
## 6                    0.878                        0.638      0.333
## 7                    0.896                        0.653      0.321
## 8                    0.876                        0.669      0.365
## 9                    0.913                        0.659      0.285
## 10                   0.910                        0.647      0.361
## 11                   0.888                        0.464      0.262
## 12                   0.891                        0.617      0.242
## 13                   0.817                        0.632      0.143
## 14                   0.876                        0.614      0.307
## 15                   0.861                        0.586      0.273
## 16                   0.894                        0.583      0.188
## 17                   0.896                        0.632      0.196
## 18                   0.819                        0.547      0.291
## 19                   0.883                        0.533      0.354
## 20                   0.670                        0.284      0.186
##    Perceptions.of.corruption
## 1                      0.393
## 2                      0.340
## 3                      0.408
## 4                      0.138
## 5                      0.357
## 6                      0.295
## 7                      0.291
## 8                      0.389
## 9                      0.383
## 10                     0.302
## 11                     0.082
## 12                     0.224
## 13                     0.101
## 14                     0.306
## 15                     0.280
## 16                     0.240
## 17                     0.321
## 18                     0.133
## 19                     0.272
## 20                     0.182
# Bottom 20 least happy countries
bottom_countries <- head(data[order(data$Score), ], 20)
print(bottom_countries)
##     Overall.rank        Country.or.region Score GDP.per.capita Social.support
## 156          156                  Burundi 2.905          0.091          0.627
## 155          155 Central African Republic 3.083          0.024          0.000
## 154          154              South Sudan 3.254          0.337          0.608
## 153          153                 Tanzania 3.303          0.455          0.991
## 152          152                    Yemen 3.355          0.442          1.073
## 151          151                   Rwanda 3.408          0.332          0.896
## 150          150                    Syria 3.462          0.689          0.382
## 149          149                  Liberia 3.495          0.076          0.858
## 148          148                    Haiti 3.582          0.315          0.714
## 147          147                   Malawi 3.587          0.186          0.541
## 146          146                 Botswana 3.590          1.017          1.174
## 145          145              Afghanistan 3.632          0.332          0.537
## 144          144                 Zimbabwe 3.692          0.357          1.094
## 143          143               Madagascar 3.774          0.262          0.908
## 142          142                   Angola 3.795          0.730          1.125
## 141          141                  Lesotho 3.808          0.472          1.215
## 140          140                   Guinea 3.964          0.344          0.792
## 139          139                     Togo 3.999          0.259          0.474
## 138          138                  Ukraine 4.103          0.793          1.413
## 137          137                    Sudan 4.139          0.605          1.240
##     Healthy.life.expectancy Freedom.to.make.life.choices Generosity
## 156                   0.145                        0.065      0.149
## 155                   0.010                        0.305      0.218
## 154                   0.177                        0.112      0.224
## 153                   0.381                        0.481      0.270
## 152                   0.343                        0.244      0.083
## 151                   0.400                        0.636      0.200
## 150                   0.539                        0.088      0.376
## 149                   0.267                        0.419      0.206
## 148                   0.289                        0.025      0.392
## 147                   0.306                        0.531      0.210
## 146                   0.417                        0.557      0.042
## 145                   0.255                        0.085      0.191
## 144                   0.248                        0.406      0.132
## 143                   0.402                        0.221      0.155
## 142                   0.269                        0.000      0.079
## 141                   0.079                        0.423      0.116
## 140                   0.211                        0.394      0.185
## 139                   0.253                        0.434      0.158
## 138                   0.609                        0.163      0.187
## 137                   0.312                        0.016      0.134
##     Perceptions.of.corruption
## 156                     0.076
## 155                     0.038
## 154                     0.106
## 153                     0.097
## 152                     0.064
## 151                     0.444
## 150                     0.144
## 149                     0.030
## 148                     0.104
## 147                     0.080
## 146                     0.092
## 145                     0.036
## 144                     0.099
## 143                     0.049
## 142                     0.061
## 141                     0.112
## 140                     0.094
## 139                     0.101
## 138                     0.011
## 137                     0.082
top_20 <- head(data[order(-data$Score), ], 20)
bottom_20 <- head(data[order(data$Score), ], 20)

Variable Composition

data$Sum_Variables <- rowSums(data[, c("GDP.per.capita", "Social.support", "Healthy.life.expectancy", "Freedom.to.make.life.choices", "Generosity", "Perceptions.of.corruption")], na.rm = TRUE)
melted_data <- reshape2::melt(data, id.vars = c("Country.or.region", "Sum_Variables"), measure.vars = c("GDP.per.capita", "Social.support", "Healthy.life.expectancy", "Freedom.to.make.life.choices", "Generosity", "Perceptions.of.corruption"))
my_palette <- c("#66c2a5", "#fc8d62", "#8da0cb", "#e78ac3", "#a6d854", "#ffd92f")

# Grafico a barre con colori diversi per ogni variabile
Compplot <- ggplot(melted_data, aes(x = reorder(`Country.or.region`, -Sum_Variables), y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = my_palette) +
  labs(title = "Grafico composizione Score",
       x = "Country",
       y = "Valore") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),  # Etichette a 90 gradi
        legend.position = "top", legend.title = element_blank())

print(Compplot)

Composition Chart

Regression Analysis GDP and Score Relationship

scatter_plot <- ggplot(data, aes(x = `GDP.per.capita`, y = Score, text = `Country.or.region`)) +
  geom_point(aes(color = ifelse(`Country.or.region` %in% top_20$`Country.or.region`, "Top 20 Score Alto", 
                                ifelse(`Country.or.region` %in% bottom_20$`Country.or.region`, "Top 20 Score Basso", "Altri Paesi"))),
             size = 2, alpha = 0.8) +
  scale_color_manual(values = c("gray", "red", "blue"), 
                     labels = c("Altri Paesi", "Top 20 Score Alto", "Top 20 Score Basso")) +
  labs(title = "Rapporto tra Score e GDP",
       x = "GDP per capita",
       y = "Score") +
  theme_minimal() +
  guides(color = FALSE)  
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
plotly11 <- ggplotly(scatter_plot, tooltip = "text")
print(plotly11)

Score and Corruption Perception

Scatter_corr <- ggplot(data, aes(x = `Perceptions.of.corruption`, y = Score, text = `Country.or.region`)) +
  geom_point(aes(color = ifelse(`Country.or.region` %in% top_20$`Country.or.region`, "Top 20 Score Alto", 
                                ifelse(`Country.or.region` %in% bottom_20$`Country.or.region`, "Top 20 Score Basso", "Altri Paesi"))),
             size = 2, alpha = 0.8) +
  scale_color_manual(values = c("gray", "red", "blue"), 
                     labels = c("Altri Paesi", "Top 20 Score Alto", "Top 20 Score Basso")) +
  labs(title = "Rapporto tra Score e Perceptions of corruption",
       x = "Perceptions of corruption",
       y = "Score") +
  theme_minimal() +
  guides(color = guide_legend(title = NULL))

plotly22 <- ggplotly(Scatter_corr, tooltip = "text")
print(plotly22)

Score and Social Support

Scatter_social <- ggplot(data, aes(x = `Social.support`, y = Score, text = `Country.or.region`)) +
  geom_point(aes(color = ifelse(`Country.or.region` %in% top_20$`Country.or.region`, "Top 20 Score Alto", 
                                ifelse(`Country.or.region` %in% bottom_20$`Country.or.region`, "Top 20 Score Basso", "Altri Paesi"))),
             size = 2, alpha = 0.8) +
  scale_color_manual(values = c("gray", "red", "blue"), 
                     labels = c("Altri Paesi", "Top 20 Score Alto", "Top 20 Score Basso")) +
  labs(title = "Rapporto tra Score e Social support ",
       x = "Social Support",
       y = "Score") +
  theme_minimal() +
  guides(color = guide_legend(title = NULL))

plotly33 <- ggplotly(Scatter_social, tooltip = "text")
print(plotly33)

Freedom and GDP

Scatter_free_GDP <- ggplot(data, aes(x = `Freedom.to.make.life.choices`, y = `GDP.per.capita`, text = `Country.or.region`)) +
  geom_point(aes(color = ifelse(`Country.or.region` %in% top_20$`Country.or.region`, "Top 20 Score Alto", 
                                ifelse(`Country.or.region` %in% bottom_20$`Country.or.region`, "Top 20 Score Basso", "Altri Paesi"))),
             size = 2, alpha = 0.8) +
  scale_color_manual(values = c("gray", "red", "blue"), 
                     labels = c("Altri Paesi", "Top 20 Score Alto", "Top 20 Score Basso")) +
  labs(title = "Rapporto tra Freedom e GDP ",
       x = "Freedom",
       y = "GDP") +
  theme_minimal() +
  guides(color = guide_legend(title = NULL))

plotly44 <- ggplotly(Scatter_free_GDP, tooltip = "text")
print(plotly44)

Freedom and Generosity

Scatter_free_Gen <- ggplot(data, aes(x = `Freedom.to.make.life.choices`, y = `Generosity`, text = `Country.or.region`)) +
  geom_point(aes(color = ifelse(`Country.or.region` %in% top_20$`Country.or.region`, "Top 20 Score Alto", 
                                ifelse(`Country.or.region` %in% bottom_20$`Country.or.region`, "Top 20 Score Basso", "Altri Paesi"))),
             size = 2, alpha = 0.8) +
  scale_color_manual(values = c("gray", "red", "blue"), 
                     labels = c("Altri Paesi", "Top 20 Score Alto", "Top 20 Score Basso")) +
  labs(title = "Rapporto tra Freedom e Generosity ",
       x = "Freedom",
       y = "Generosity") +
  theme_minimal() +
  guides(color = guide_legend(title = NULL))  # Rimuovere il titolo della legenda


plotly55 <- ggplotly(Scatter_free_Gen, tooltip = "text")
print(plotly55)

Boxplots and Standardization

Original Boxplot

BOXPLOTORIG <- data[,-c(1,2)] %>%
  gather(Attributes, values) %>%
  ggplot(aes(x = reorder(Attributes, values, FUN = median), y = values, fill = Attributes)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Boxplot of All Variables") +
  theme_bw() +
  theme(axis.title.y = element_blank(), axis.title.x = element_blank()) +
  coord_flip()

print(BOXPLOTORIG)

Standardized Boxplot

datastan <- as.data.frame(scale(as.data.frame(data[,-c(1,2)])))

# Boxplot of standardized variables
BOXSTAND <- datastan %>%
  gather(Attributes, values) %>%
  ggplot(aes(x = reorder(Attributes, values, FUN = median), y = values, fill = Attributes)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Boxplot of Standardized Variables") +
  theme_bw() +
  theme(axis.title.y = element_blank(), axis.title.x = element_blank()) +
  coord_flip()

print(BOXSTAND)

Clustering Analysis

Hierarchical Clustering

# Hierarchical clustering method 
DistEuc = dist(datastan, method = "euclidean") # Using a dataset without categorical variables

EucWard = hclust(DistEuc, method = "ward.D2", members = NULL) # Choosing Ward method

# Plotting the dendrogram
plot(EucWard)
rect.hclust(EucWard, k = 3, border = "red")

fviz_dend(EucWard, k = 3, show_labels = F, rect = T) # Colored dendrogram

TaglioEucWard = cutree(EucWard, k = 3)

# Plotting the data with cluster colors
plot(datastan, col= TaglioEucWard)#il gruppo piu piccolo si distacca sempre, potrebbero essere anche solo 2 gruppi, ma con 3 dovremmo 

K-means Clustering

# K-means clustering
fviz_nbclust(datastan, kmeans, method = "silhouette") # Proposing 3 clusters based on silhouette method

fviz_nbclust(datastan, kmeans, method = "wss") # Proposing 3 clusters based on within-sum-of-squares

kmeans_result <- kmeans(datastan, 3, nstart = 100)

# Visualization of k-means clusters
cluster_visualization <- fviz_cluster(kmeans_result, data = datastan)
print(cluster_visualization)

# Tabulation of cluster occurrences
cluster_occurrences <- table(kmeans_result$cluster)
print(cluster_occurrences)
## 
##  1  2  3 
## 22 87 47
# Creating an additional column containing cluster membership
datastan$cluster <- kmeans_result$cluster
write.csv(datastan, "data_with_cluster.csv", row.names = FALSE)

Distribution of Score

# Histogram of Score variable
x <- data$Score
hist(x, breaks = 20, freq = FALSE)

# Clustering using Mclust
CLUSTERING <- Mclust(x, G = 3, modelNames = "V")
SEQ <- seq(min(x), max(x), length = 1500)

DENS1 <- dnorm(SEQ, mean = CLUSTERING$parameters$mean[1], 
               sd = sqrt(CLUSTERING$parameters$variance$sigmasq[1]))
DENS2 <- dnorm(SEQ, mean = CLUSTERING$parameters$mean[2], 
               sd = sqrt(CLUSTERING$parameters$variance$sigmasq[2]))
DENS3 <- dnorm(SEQ, mean = CLUSTERING$parameters$mean[3], 
               sd = sqrt(CLUSTERING$parameters$variance$sigmasq[3]))

P1 <- CLUSTERING$parameters$pro[1]
P2 <- CLUSTERING$parameters$pro[2]
P3 <- CLUSTERING$parameters$pro[3]

DENSTOT <- P1 * DENS1 + P2 * DENS2 + P3 * DENS3

# Plotting the density lines
lines(SEQ, DENSTOT)
lines(SEQ, P1 * DENS1, col = 2)
lines(SEQ, P2 * DENS2, col = 3)
lines(SEQ, P3 * DENS3, col = 4)

Variable Selection Heatmap of Correlations

# Heatmap for variable correlations
print(names(data))
##  [1] "Overall.rank"                 "Country.or.region"           
##  [3] "Score"                        "GDP.per.capita"              
##  [5] "Social.support"               "Healthy.life.expectancy"     
##  [7] "Freedom.to.make.life.choices" "Generosity"                  
##  [9] "Perceptions.of.corruption"    "Sum_Variables"
# Selecting only the variables of interest
variables <- c("Score", "GDP.per.capita", "Social.support", "Healthy.life.expectancy", "Freedom.to.make.life.choices", "Generosity", "Perceptions.of.corruption")
correlation_data <- data[, variables]

# Calculating the correlation matrix
cor_matrix <- cor(correlation_data, use = "complete.obs")

# Transforming the correlation matrix into a long format
melted_cor_matrix <- melt(cor_matrix)

# Creating a heatmap with annotations of values
heatmap_correlation <- ggplot(melted_cor_matrix, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  geom_text(aes(label = round(value, 2)), vjust = 1) +  # Adding value annotations
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0, limits = c(-1, 1)) +
  labs(title = "Heatmap of Correlations",
       x = "Variable",
       y = "Variable") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        axis.text.y = element_text(angle = 0, hjust = 1))

print(heatmap_correlation)

Selection with Boruta

Clusvarsel

# Installing and loading Boruta for variable selection
library(Boruta)
library(ranger)

# Boruta analysis for feature selection
boruta_result <- Boruta(Score ~ ., data = datastan[,-c(8,9)], doTrace = 2)
##  1. run of importance source...
##  2. run of importance source...
##  3. run of importance source...
##  4. run of importance source...
##  5. run of importance source...
##  6. run of importance source...
##  7. run of importance source...
##  8. run of importance source...
##  9. run of importance source...
##  10. run of importance source...
## After 10 iterations, +0.15 secs:
##  confirmed 5 attributes: Freedom.to.make.life.choices, GDP.per.capita, Healthy.life.expectancy, Perceptions.of.corruption, Social.support;
##  still have 1 attribute left.
##  11. run of importance source...
##  12. run of importance source...
##  13. run of importance source...
##  14. run of importance source...
## After 14 iterations, +0.2 secs:
##  confirmed 1 attribute: Generosity;
##  no more attributes left.
print(boruta_result)
## Boruta performed 14 iterations in 0.2012901 secs.
##  6 attributes confirmed important: Freedom.to.make.life.choices,
## GDP.per.capita, Generosity, Healthy.life.expectancy,
## Perceptions.of.corruption and 1 more;
##  No attributes deemed unimportant.
# Plotting the importance of variables
plot(boruta_result, cexAxis = 0.7, las = 2, xlab = "", main = "Boruta Analysis: Importance of Variables")

datastanclean <- datastan[, -c(8, 9)]

cluststanDBACK <- clustvarsel(datastanclean,G=1:3, direction = "backward") #Seleziona: Perceptions of corruption, Freedom to make life choices, Generosity, Healthy life expectancy
#seleziona G= 3 e variabili 1,3,4,5,7
plot(cluststanDBACK$model)

cluststanDBACK$model$classification
##   [1] 1 1 1 2 1 1 1 1 1 1 2 2 2 1 1 1 1 2 1 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2
##  [38] 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [75] 3 3 2 2 3 2 2 2 2 3 3 2 3 2 2 2 3 2 2 2 2 2 3 3 3 2 2 3 3 3 3 3 3 3 3 3 3
## [112] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [149] 3 3 3 3 3 3 3 3
cluststanDBACK$subset
##                        Score               Social.support 
##                            1                            3 
##      Healthy.life.expectancy Freedom.to.make.life.choices 
##                            4                            5 
##    Perceptions.of.corruption 
##                            7