library(mclust)
## Package 'mclust' version 6.0.1
## Type 'citation("mclust")' for citing this R package in publications.
library(magrittr)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::map() masks mclust::map()
## ✖ purrr::set_names() masks magrittr::set_names()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(NbClust)
library(cluster)
library(dplyr)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(haven)
library(clustvarsel)
## Package 'clustvarsel' version 2.3.4
## Type 'citation("clustvarsel")' for citing this R package in publications.
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(corrplot)
## corrplot 0.92 loaded
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
Dataset Loading
data <- read.csv("2018.csv")
Correlation Plot
plot(data[,-c(1,2)])
Missing Values
missing_values_per_variable <- colSums(is.na(data))
print(missing_values_per_variable)
## Overall.rank Country.or.region
## 0 0
## Score GDP.per.capita
## 0 0
## Social.support Healthy.life.expectancy
## 0 0
## Freedom.to.make.life.choices Generosity
## 0 0
## Perceptions.of.corruption
## 0
Barplot of Scores
Scoreplot <- ggplot(data, aes(x = reorder(`Country.or.region`, -Score), y = Score, fill = Score)) +
geom_bar(stat = "identity", width = 1, color = "white") +
labs(title = "Country or Region and Happiness Score",
x = "Country or region",
y = "Happiness Score") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_gradient(low = "darkblue", high = "lightcoral")
print(Scoreplot)
Top and Bottom 20 Countries
top_countries <- head(data[order(-data$Score), ], 20)
print(top_countries)
## Overall.rank Country.or.region Score GDP.per.capita Social.support
## 1 1 Finland 7.632 1.305 1.592
## 2 2 Norway 7.594 1.456 1.582
## 3 3 Denmark 7.555 1.351 1.590
## 4 4 Iceland 7.495 1.343 1.644
## 5 5 Switzerland 7.487 1.420 1.549
## 6 6 Netherlands 7.441 1.361 1.488
## 7 7 Canada 7.328 1.330 1.532
## 8 8 New Zealand 7.324 1.268 1.601
## 9 9 Sweden 7.314 1.355 1.501
## 10 10 Australia 7.272 1.340 1.573
## 11 11 United Kingdom 7.190 1.244 1.433
## 12 12 Austria 7.139 1.341 1.504
## 13 13 Costa Rica 7.072 1.010 1.459
## 14 14 Ireland 6.977 1.448 1.583
## 15 15 Germany 6.965 1.340 1.474
## 16 16 Belgium 6.927 1.324 1.483
## 17 17 Luxembourg 6.910 1.576 1.520
## 18 18 United States 6.886 1.398 1.471
## 19 19 Israel 6.814 1.301 1.559
## 20 20 United Arab Emirates 6.774 2.096 0.776
## Healthy.life.expectancy Freedom.to.make.life.choices Generosity
## 1 0.874 0.681 0.202
## 2 0.861 0.686 0.286
## 3 0.868 0.683 0.284
## 4 0.914 0.677 0.353
## 5 0.927 0.660 0.256
## 6 0.878 0.638 0.333
## 7 0.896 0.653 0.321
## 8 0.876 0.669 0.365
## 9 0.913 0.659 0.285
## 10 0.910 0.647 0.361
## 11 0.888 0.464 0.262
## 12 0.891 0.617 0.242
## 13 0.817 0.632 0.143
## 14 0.876 0.614 0.307
## 15 0.861 0.586 0.273
## 16 0.894 0.583 0.188
## 17 0.896 0.632 0.196
## 18 0.819 0.547 0.291
## 19 0.883 0.533 0.354
## 20 0.670 0.284 0.186
## Perceptions.of.corruption
## 1 0.393
## 2 0.340
## 3 0.408
## 4 0.138
## 5 0.357
## 6 0.295
## 7 0.291
## 8 0.389
## 9 0.383
## 10 0.302
## 11 0.082
## 12 0.224
## 13 0.101
## 14 0.306
## 15 0.280
## 16 0.240
## 17 0.321
## 18 0.133
## 19 0.272
## 20 0.182
# Bottom 20 least happy countries
bottom_countries <- head(data[order(data$Score), ], 20)
print(bottom_countries)
## Overall.rank Country.or.region Score GDP.per.capita Social.support
## 156 156 Burundi 2.905 0.091 0.627
## 155 155 Central African Republic 3.083 0.024 0.000
## 154 154 South Sudan 3.254 0.337 0.608
## 153 153 Tanzania 3.303 0.455 0.991
## 152 152 Yemen 3.355 0.442 1.073
## 151 151 Rwanda 3.408 0.332 0.896
## 150 150 Syria 3.462 0.689 0.382
## 149 149 Liberia 3.495 0.076 0.858
## 148 148 Haiti 3.582 0.315 0.714
## 147 147 Malawi 3.587 0.186 0.541
## 146 146 Botswana 3.590 1.017 1.174
## 145 145 Afghanistan 3.632 0.332 0.537
## 144 144 Zimbabwe 3.692 0.357 1.094
## 143 143 Madagascar 3.774 0.262 0.908
## 142 142 Angola 3.795 0.730 1.125
## 141 141 Lesotho 3.808 0.472 1.215
## 140 140 Guinea 3.964 0.344 0.792
## 139 139 Togo 3.999 0.259 0.474
## 138 138 Ukraine 4.103 0.793 1.413
## 137 137 Sudan 4.139 0.605 1.240
## Healthy.life.expectancy Freedom.to.make.life.choices Generosity
## 156 0.145 0.065 0.149
## 155 0.010 0.305 0.218
## 154 0.177 0.112 0.224
## 153 0.381 0.481 0.270
## 152 0.343 0.244 0.083
## 151 0.400 0.636 0.200
## 150 0.539 0.088 0.376
## 149 0.267 0.419 0.206
## 148 0.289 0.025 0.392
## 147 0.306 0.531 0.210
## 146 0.417 0.557 0.042
## 145 0.255 0.085 0.191
## 144 0.248 0.406 0.132
## 143 0.402 0.221 0.155
## 142 0.269 0.000 0.079
## 141 0.079 0.423 0.116
## 140 0.211 0.394 0.185
## 139 0.253 0.434 0.158
## 138 0.609 0.163 0.187
## 137 0.312 0.016 0.134
## Perceptions.of.corruption
## 156 0.076
## 155 0.038
## 154 0.106
## 153 0.097
## 152 0.064
## 151 0.444
## 150 0.144
## 149 0.030
## 148 0.104
## 147 0.080
## 146 0.092
## 145 0.036
## 144 0.099
## 143 0.049
## 142 0.061
## 141 0.112
## 140 0.094
## 139 0.101
## 138 0.011
## 137 0.082
top_20 <- head(data[order(-data$Score), ], 20)
bottom_20 <- head(data[order(data$Score), ], 20)
Variable Composition
data$Sum_Variables <- rowSums(data[, c("GDP.per.capita", "Social.support", "Healthy.life.expectancy", "Freedom.to.make.life.choices", "Generosity", "Perceptions.of.corruption")], na.rm = TRUE)
melted_data <- reshape2::melt(data, id.vars = c("Country.or.region", "Sum_Variables"), measure.vars = c("GDP.per.capita", "Social.support", "Healthy.life.expectancy", "Freedom.to.make.life.choices", "Generosity", "Perceptions.of.corruption"))
my_palette <- c("#66c2a5", "#fc8d62", "#8da0cb", "#e78ac3", "#a6d854", "#ffd92f")
# Grafico a barre con colori diversi per ogni variabile
Compplot <- ggplot(melted_data, aes(x = reorder(`Country.or.region`, -Sum_Variables), y = value, fill = variable)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = my_palette) +
labs(title = "Grafico composizione Score",
x = "Country",
y = "Valore") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1), # Etichette a 90 gradi
legend.position = "top", legend.title = element_blank())
print(Compplot)
Regression Analysis GDP and Score Relationship
scatter_plot <- ggplot(data, aes(x = `GDP.per.capita`, y = Score, text = `Country.or.region`)) +
geom_point(aes(color = ifelse(`Country.or.region` %in% top_20$`Country.or.region`, "Top 20 Score Alto",
ifelse(`Country.or.region` %in% bottom_20$`Country.or.region`, "Top 20 Score Basso", "Altri Paesi"))),
size = 2, alpha = 0.8) +
scale_color_manual(values = c("gray", "red", "blue"),
labels = c("Altri Paesi", "Top 20 Score Alto", "Top 20 Score Basso")) +
labs(title = "Rapporto tra Score e GDP",
x = "GDP per capita",
y = "Score") +
theme_minimal() +
guides(color = FALSE)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
plotly11 <- ggplotly(scatter_plot, tooltip = "text")
print(plotly11)
Score and Corruption Perception
Scatter_corr <- ggplot(data, aes(x = `Perceptions.of.corruption`, y = Score, text = `Country.or.region`)) +
geom_point(aes(color = ifelse(`Country.or.region` %in% top_20$`Country.or.region`, "Top 20 Score Alto",
ifelse(`Country.or.region` %in% bottom_20$`Country.or.region`, "Top 20 Score Basso", "Altri Paesi"))),
size = 2, alpha = 0.8) +
scale_color_manual(values = c("gray", "red", "blue"),
labels = c("Altri Paesi", "Top 20 Score Alto", "Top 20 Score Basso")) +
labs(title = "Rapporto tra Score e Perceptions of corruption",
x = "Perceptions of corruption",
y = "Score") +
theme_minimal() +
guides(color = guide_legend(title = NULL))
plotly22 <- ggplotly(Scatter_corr, tooltip = "text")
print(plotly22)
Score and Social Support
Scatter_social <- ggplot(data, aes(x = `Social.support`, y = Score, text = `Country.or.region`)) +
geom_point(aes(color = ifelse(`Country.or.region` %in% top_20$`Country.or.region`, "Top 20 Score Alto",
ifelse(`Country.or.region` %in% bottom_20$`Country.or.region`, "Top 20 Score Basso", "Altri Paesi"))),
size = 2, alpha = 0.8) +
scale_color_manual(values = c("gray", "red", "blue"),
labels = c("Altri Paesi", "Top 20 Score Alto", "Top 20 Score Basso")) +
labs(title = "Rapporto tra Score e Social support ",
x = "Social Support",
y = "Score") +
theme_minimal() +
guides(color = guide_legend(title = NULL))
plotly33 <- ggplotly(Scatter_social, tooltip = "text")
print(plotly33)
Freedom and GDP
Scatter_free_GDP <- ggplot(data, aes(x = `Freedom.to.make.life.choices`, y = `GDP.per.capita`, text = `Country.or.region`)) +
geom_point(aes(color = ifelse(`Country.or.region` %in% top_20$`Country.or.region`, "Top 20 Score Alto",
ifelse(`Country.or.region` %in% bottom_20$`Country.or.region`, "Top 20 Score Basso", "Altri Paesi"))),
size = 2, alpha = 0.8) +
scale_color_manual(values = c("gray", "red", "blue"),
labels = c("Altri Paesi", "Top 20 Score Alto", "Top 20 Score Basso")) +
labs(title = "Rapporto tra Freedom e GDP ",
x = "Freedom",
y = "GDP") +
theme_minimal() +
guides(color = guide_legend(title = NULL))
plotly44 <- ggplotly(Scatter_free_GDP, tooltip = "text")
print(plotly44)
Freedom and Generosity
Scatter_free_Gen <- ggplot(data, aes(x = `Freedom.to.make.life.choices`, y = `Generosity`, text = `Country.or.region`)) +
geom_point(aes(color = ifelse(`Country.or.region` %in% top_20$`Country.or.region`, "Top 20 Score Alto",
ifelse(`Country.or.region` %in% bottom_20$`Country.or.region`, "Top 20 Score Basso", "Altri Paesi"))),
size = 2, alpha = 0.8) +
scale_color_manual(values = c("gray", "red", "blue"),
labels = c("Altri Paesi", "Top 20 Score Alto", "Top 20 Score Basso")) +
labs(title = "Rapporto tra Freedom e Generosity ",
x = "Freedom",
y = "Generosity") +
theme_minimal() +
guides(color = guide_legend(title = NULL)) # Rimuovere il titolo della legenda
plotly55 <- ggplotly(Scatter_free_Gen, tooltip = "text")
print(plotly55)
Original Boxplot
BOXPLOTORIG <- data[,-c(1,2)] %>%
gather(Attributes, values) %>%
ggplot(aes(x = reorder(Attributes, values, FUN = median), y = values, fill = Attributes)) +
geom_boxplot(show.legend = FALSE) +
labs(title = "Boxplot of All Variables") +
theme_bw() +
theme(axis.title.y = element_blank(), axis.title.x = element_blank()) +
coord_flip()
print(BOXPLOTORIG)
Standardized Boxplot
datastan <- as.data.frame(scale(as.data.frame(data[,-c(1,2)])))
# Boxplot of standardized variables
BOXSTAND <- datastan %>%
gather(Attributes, values) %>%
ggplot(aes(x = reorder(Attributes, values, FUN = median), y = values, fill = Attributes)) +
geom_boxplot(show.legend = FALSE) +
labs(title = "Boxplot of Standardized Variables") +
theme_bw() +
theme(axis.title.y = element_blank(), axis.title.x = element_blank()) +
coord_flip()
print(BOXSTAND)
Hierarchical Clustering
# Hierarchical clustering method
DistEuc = dist(datastan, method = "euclidean") # Using a dataset without categorical variables
EucWard = hclust(DistEuc, method = "ward.D2", members = NULL) # Choosing Ward method
# Plotting the dendrogram
plot(EucWard)
rect.hclust(EucWard, k = 3, border = "red")
fviz_dend(EucWard, k = 3, show_labels = F, rect = T) # Colored dendrogram
TaglioEucWard = cutree(EucWard, k = 3)
# Plotting the data with cluster colors
plot(datastan, col= TaglioEucWard)#il gruppo piu piccolo si distacca sempre, potrebbero essere anche solo 2 gruppi, ma con 3 dovremmo
# K-means clustering
fviz_nbclust(datastan, kmeans, method = "silhouette") # Proposing 3 clusters based on silhouette method
fviz_nbclust(datastan, kmeans, method = "wss") # Proposing 3 clusters based on within-sum-of-squares
kmeans_result <- kmeans(datastan, 3, nstart = 100)
# Visualization of k-means clusters
cluster_visualization <- fviz_cluster(kmeans_result, data = datastan)
print(cluster_visualization)
# Tabulation of cluster occurrences
cluster_occurrences <- table(kmeans_result$cluster)
print(cluster_occurrences)
##
## 1 2 3
## 22 87 47
# Creating an additional column containing cluster membership
datastan$cluster <- kmeans_result$cluster
write.csv(datastan, "data_with_cluster.csv", row.names = FALSE)
Distribution of Score
# Histogram of Score variable
x <- data$Score
hist(x, breaks = 20, freq = FALSE)
# Clustering using Mclust
CLUSTERING <- Mclust(x, G = 3, modelNames = "V")
SEQ <- seq(min(x), max(x), length = 1500)
DENS1 <- dnorm(SEQ, mean = CLUSTERING$parameters$mean[1],
sd = sqrt(CLUSTERING$parameters$variance$sigmasq[1]))
DENS2 <- dnorm(SEQ, mean = CLUSTERING$parameters$mean[2],
sd = sqrt(CLUSTERING$parameters$variance$sigmasq[2]))
DENS3 <- dnorm(SEQ, mean = CLUSTERING$parameters$mean[3],
sd = sqrt(CLUSTERING$parameters$variance$sigmasq[3]))
P1 <- CLUSTERING$parameters$pro[1]
P2 <- CLUSTERING$parameters$pro[2]
P3 <- CLUSTERING$parameters$pro[3]
DENSTOT <- P1 * DENS1 + P2 * DENS2 + P3 * DENS3
# Plotting the density lines
lines(SEQ, DENSTOT)
lines(SEQ, P1 * DENS1, col = 2)
lines(SEQ, P2 * DENS2, col = 3)
lines(SEQ, P3 * DENS3, col = 4)
Variable Selection Heatmap of Correlations
# Heatmap for variable correlations
print(names(data))
## [1] "Overall.rank" "Country.or.region"
## [3] "Score" "GDP.per.capita"
## [5] "Social.support" "Healthy.life.expectancy"
## [7] "Freedom.to.make.life.choices" "Generosity"
## [9] "Perceptions.of.corruption" "Sum_Variables"
# Selecting only the variables of interest
variables <- c("Score", "GDP.per.capita", "Social.support", "Healthy.life.expectancy", "Freedom.to.make.life.choices", "Generosity", "Perceptions.of.corruption")
correlation_data <- data[, variables]
# Calculating the correlation matrix
cor_matrix <- cor(correlation_data, use = "complete.obs")
# Transforming the correlation matrix into a long format
melted_cor_matrix <- melt(cor_matrix)
# Creating a heatmap with annotations of values
heatmap_correlation <- ggplot(melted_cor_matrix, aes(x = Var1, y = Var2, fill = value)) +
geom_tile() +
geom_text(aes(label = round(value, 2)), vjust = 1) + # Adding value annotations
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0, limits = c(-1, 1)) +
labs(title = "Heatmap of Correlations",
x = "Variable",
y = "Variable") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
axis.text.y = element_text(angle = 0, hjust = 1))
print(heatmap_correlation)
Selection with Boruta
# Installing and loading Boruta for variable selection
library(Boruta)
library(ranger)
# Boruta analysis for feature selection
boruta_result <- Boruta(Score ~ ., data = datastan[,-c(8,9)], doTrace = 2)
## 1. run of importance source...
## 2. run of importance source...
## 3. run of importance source...
## 4. run of importance source...
## 5. run of importance source...
## 6. run of importance source...
## 7. run of importance source...
## 8. run of importance source...
## 9. run of importance source...
## 10. run of importance source...
## After 10 iterations, +0.15 secs:
## confirmed 5 attributes: Freedom.to.make.life.choices, GDP.per.capita, Healthy.life.expectancy, Perceptions.of.corruption, Social.support;
## still have 1 attribute left.
## 11. run of importance source...
## 12. run of importance source...
## 13. run of importance source...
## 14. run of importance source...
## After 14 iterations, +0.2 secs:
## confirmed 1 attribute: Generosity;
## no more attributes left.
print(boruta_result)
## Boruta performed 14 iterations in 0.2012901 secs.
## 6 attributes confirmed important: Freedom.to.make.life.choices,
## GDP.per.capita, Generosity, Healthy.life.expectancy,
## Perceptions.of.corruption and 1 more;
## No attributes deemed unimportant.
# Plotting the importance of variables
plot(boruta_result, cexAxis = 0.7, las = 2, xlab = "", main = "Boruta Analysis: Importance of Variables")
datastanclean <- datastan[, -c(8, 9)]
cluststanDBACK <- clustvarsel(datastanclean,G=1:3, direction = "backward") #Seleziona: Perceptions of corruption, Freedom to make life choices, Generosity, Healthy life expectancy
#seleziona G= 3 e variabili 1,3,4,5,7
plot(cluststanDBACK$model)
cluststanDBACK$model$classification
## [1] 1 1 1 2 1 1 1 1 1 1 2 2 2 1 1 1 1 2 1 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2
## [38] 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [75] 3 3 2 2 3 2 2 2 2 3 3 2 3 2 2 2 3 2 2 2 2 2 3 3 3 2 2 3 3 3 3 3 3 3 3 3 3
## [112] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [149] 3 3 3 3 3 3 3 3
cluststanDBACK$subset
## Score Social.support
## 1 3
## Healthy.life.expectancy Freedom.to.make.life.choices
## 4 5
## Perceptions.of.corruption
## 7