# Load the 2021 World Happiness Report CSV from the local project folder into
# a data frame, then print a compact column-by-column overview of it.
wHappiness_Df <- read.csv(
  file = "C:/Users/PC/Documents/R_4DS/WorldHappinessReport/world-happiness-report-2021.csv"
)
glimpse(wHappiness_Df)
Rows: 149
Columns: 20
$ ï..Country.name [3m[38;5;246m<fct>[39m[23m Finland, Denmark, Switzerland, Iceland, Netherlands, Norway, Sweden, Luxembourg, ~
$ Regional.indicator [3m[38;5;246m<fct>[39m[23m Western Europe, Western Europe, Western Europe, Western Europe, Western Europe, W~
$ Ladder.score [3m[38;5;246m<dbl>[39m[23m 7.842, 7.620, 7.571, 7.554, 7.464, 7.392, 7.363, 7.324, 7.277, 7.268, 7.183, 7.15~
$ Standard.error.of.ladder.score [3m[38;5;246m<dbl>[39m[23m 0.032, 0.035, 0.036, 0.059, 0.027, 0.035, 0.036, 0.037, 0.040, 0.036, 0.041, 0.03~
$ upperwhisker [3m[38;5;246m<dbl>[39m[23m 7.904, 7.687, 7.643, 7.670, 7.518, 7.462, 7.433, 7.396, 7.355, 7.337, 7.265, 7.22~
$ lowerwhisker [3m[38;5;246m<dbl>[39m[23m 7.780, 7.552, 7.500, 7.438, 7.410, 7.323, 7.293, 7.252, 7.198, 7.198, 7.102, 7.09~
$ Logged.GDP.per.capita [3m[38;5;246m<dbl>[39m[23m 10.775, 10.933, 11.117, 10.878, 10.932, 11.053, 10.867, 11.647, 10.643, 10.906, 1~
$ Social.support [3m[38;5;246m<dbl>[39m[23m 0.954, 0.954, 0.942, 0.983, 0.942, 0.954, 0.934, 0.908, 0.948, 0.934, 0.940, 0.93~
$ Healthy.life.expectancy [3m[38;5;246m<dbl>[39m[23m 72.000, 72.700, 74.400, 73.000, 72.400, 73.300, 72.700, 72.600, 73.400, 73.300, 7~
$ Freedom.to.make.life.choices [3m[38;5;246m<dbl>[39m[23m 0.949, 0.946, 0.919, 0.955, 0.913, 0.960, 0.945, 0.907, 0.929, 0.908, 0.914, 0.80~
$ Generosity [3m[38;5;246m<dbl>[39m[23m -0.098, 0.030, 0.025, 0.160, 0.175, 0.093, 0.086, -0.034, 0.134, 0.042, 0.159, 0.~
$ Perceptions.of.corruption [3m[38;5;246m<dbl>[39m[23m 0.186, 0.179, 0.292, 0.673, 0.338, 0.270, 0.237, 0.386, 0.242, 0.481, 0.442, 0.75~
$ Ladder.score.in.Dystopia [3m[38;5;246m<dbl>[39m[23m 2.43, 2.43, 2.43, 2.43, 2.43, 2.43, 2.43, 2.43, 2.43, 2.43, 2.43, 2.43, 2.43, 2.4~
$ Explained.by..Log.GDP.per.capita [3m[38;5;246m<dbl>[39m[23m 1.446, 1.502, 1.566, 1.482, 1.501, 1.543, 1.478, 1.751, 1.400, 1.492, 1.453, 1.37~
$ Explained.by..Social.support [3m[38;5;246m<dbl>[39m[23m 1.106, 1.108, 1.079, 1.172, 1.079, 1.108, 1.062, 1.003, 1.094, 1.062, 1.076, 1.07~
$ Explained.by..Healthy.life.expectancy [3m[38;5;246m<dbl>[39m[23m 0.741, 0.763, 0.816, 0.772, 0.753, 0.782, 0.763, 0.760, 0.785, 0.782, 0.801, 0.78~
$ Explained.by..Freedom.to.make.life.choices [3m[38;5;246m<dbl>[39m[23m 0.691, 0.686, 0.653, 0.698, 0.647, 0.703, 0.685, 0.639, 0.665, 0.640, 0.647, 0.50~
$ Explained.by..Generosity [3m[38;5;246m<dbl>[39m[23m 0.124, 0.208, 0.204, 0.293, 0.302, 0.249, 0.244, 0.166, 0.276, 0.215, 0.291, 0.20~
$ Explained.by..Perceptions.of.corruption [3m[38;5;246m<dbl>[39m[23m 0.481, 0.485, 0.413, 0.170, 0.384, 0.427, 0.448, 0.353, 0.445, 0.292, 0.317, 0.11~
$ Dystopia...residual [3m[38;5;246m<dbl>[39m[23m 3.253, 2.868, 2.839, 2.967, 2.798, 2.580, 2.683, 2.653, 2.612, 2.784, 2.598, 3.08~
## Check for missing values
# Count the NA entries in every column; each count is a single integer.
null_vars <- vapply(
  wHappiness_Df,
  function(column) sum(is.na(column)),
  integer(1)
)
# Show the counts as a one-row table with the columns side by side.
t(data.frame(null_vars))
ï..Country.name Regional.indicator Ladder.score Standard.error.of.ladder.score upperwhisker lowerwhisker
null_vars 0 0 0 0 0 0
Logged.GDP.per.capita Social.support Healthy.life.expectancy Freedom.to.make.life.choices Generosity
null_vars 0 0 0 0 0
Perceptions.of.corruption Ladder.score.in.Dystopia Explained.by..Log.GDP.per.capita Explained.by..Social.support
null_vars 0 0 0 0
Explained.by..Healthy.life.expectancy Explained.by..Freedom.to.make.life.choices Explained.by..Generosity
null_vars 0 0 0
Explained.by..Perceptions.of.corruption Dystopia...residual
null_vars 0 0
## Checking for Blank Values
# Print every row that holds at least one empty-string cell; zero rows printed
# means the data contains no blanks.
# `wHappiness_Df == ""` is a rows-by-columns logical matrix, so it is collapsed
# to one flag per row before it is used as the row index. Using the matrix
# itself as the row index would read it as one long vector, and a blank in any
# column after the first would then point past the last row and show up as a
# row of NAs instead of the offending record.
wHappiness_Df[rowSums(wHappiness_Df == "", na.rm = TRUE) > 0, ]
No missing or empty values; good to go.
# List the column names as imported, before any tidying.
colnames(wHappiness_Df)
[1] "ï..Country.name" "Regional.indicator"
[3] "Ladder.score" "Standard.error.of.ladder.score"
[5] "upperwhisker" "lowerwhisker"
[7] "Logged.GDP.per.capita" "Social.support"
[9] "Healthy.life.expectancy" "Freedom.to.make.life.choices"
[11] "Generosity" "Perceptions.of.corruption"
[13] "Ladder.score.in.Dystopia" "Explained.by..Log.GDP.per.capita"
[15] "Explained.by..Social.support" "Explained.by..Healthy.life.expectancy"
[17] "Explained.by..Freedom.to.make.life.choices" "Explained.by..Generosity"
[19] "Explained.by..Perceptions.of.corruption" "Dystopia...residual"
## Tidying the Column Name
# Collapse every run of two or more consecutive dots in the column names of
# `x` into a single dot.
#
# x: an object with column names (e.g. a data frame).
# Returns `x` with the tidied column names; the contents are untouched.
colClean <- function(x) {
  tidy_names <- gsub("\\.\\.+", ".", colnames(x))
  colnames(x) <- tidy_names
  x
}
# Tidy the dotted column names, then give the country column a short name.
# The column is selected by pattern rather than by its literal imported name,
# because that name begins with the mangled byte-order mark ("ï.."), whose
# exact spelling can depend on the platform and file encoding.
wHappiness_Df <- colClean(wHappiness_Df) %>%
  rename(Country = matches("Country\\.name$"))
# Confirm the cleaned column names.
names(wHappiness_Df)
[1] "Country" "Regional.indicator"
[3] "Ladder.score" "Standard.error.of.ladder.score"
[5] "upperwhisker" "lowerwhisker"
[7] "Logged.GDP.per.capita" "Social.support"
[9] "Healthy.life.expectancy" "Freedom.to.make.life.choices"
[11] "Generosity" "Perceptions.of.corruption"
[13] "Ladder.score.in.Dystopia" "Explained.by.Log.GDP.per.capita"
[15] "Explained.by.Social.support" "Explained.by.Healthy.life.expectancy"
[17] "Explained.by.Freedom.to.make.life.choices" "Explained.by.Generosity"
[19] "Explained.by.Perceptions.of.corruption" "Dystopia.residual"
# colnames(wHappiness_Df) <- gsub("Explained.by.", "", colnames(wHappiness_Df))
# names(wHappiness_Df)
# Horizontal bar chart of the number of rows (countries) per region, with the
# regions ordered by that count.
wHappiness_Df %>%
  count(Regional.indicator) %>%
  ggplot(aes(x = reorder(Regional.indicator, n), y = n)) +
  geom_col() +
  coord_flip()
# plotdata <- function(xx, y){
# if(is.integer(y) == TRUE){
# wHappiness_Df %>%
# group_by(xx) %>%
# summarise(mean_ = mean(y))
# } else {
# print("Enter Integer")
# }
# }
#
# plotdata(Regional.indicator, Healthy.life.expectancy)
# Per-region summary of healthy life expectancy: group size, mean, standard
# deviation, standard error of the mean, and the half-width of a two-sided
# 95% t confidence interval for the mean.
plotdata <- wHappiness_Df %>%
  group_by(Regional.indicator) %>%
  summarise(
    n = n(),
    mean_ = mean(Healthy.life.expectancy),
    sd = sd(Healthy.life.expectancy),
    se = sd / sqrt(n),
    # 0.975 is the upper-tail probability for a two-sided 95% interval; a
    # probability below 0.5 would yield a negative multiplier.
    ci = qt(0.975, df = n - 1) * se
  )
`summarise()` ungrouping output (override with `.groups` argument)
# Mean healthy life expectancy per region (regions ordered by that mean),
# drawn as connected points with error bars of one standard error either
# side, with the axes flipped.
ggplot(
  plotdata,
  aes(x = reorder(Regional.indicator, mean_), y = mean_, group = 1)
) +
  geom_point() +
  geom_line() +
  geom_errorbar(
    aes(ymin = mean_ - se, ymax = mean_ + se),
    width = 0.1
  ) +
  coord_flip()
# Build the clustering input by dropping the two label columns.
cluster_x <- wHappiness_Df %>%
  select(-Regional.indicator, -Country)
##
# Discard constant (zero-variance) columns.
cluster_x <- cluster_x[, which(vapply(cluster_x, var, numeric(1)) != 0)]
# List any zero-variance columns that remain; an empty result means none do.
which(vapply(cluster_x, var, numeric(1)) == 0)
named integer(0)
## Checking for clusters
library(factoextra)
library(gridExtra)
# Fix the random seed so the random starts below are reproducible.
set.seed(234)
## Fit k-means for k = (2, 3, 4, 5, 6)
# Every fit uses 25 random starts; the fits run in order of increasing k.
run_kmeans <- function(k) {
  kmeans(cluster_x, centers = k, nstart = 25)
}
k0 <- run_kmeans(2)
k1 <- run_kmeans(3)
k2 <- run_kmeans(4)
k3 <- run_kmeans(5)
k4 <- run_kmeans(6)
## Visualize cluster results
# One cluster plot per fit; each title states the k used for that fit
# (k0 was fitted with centers = 2, so its title reads k=2).
p0 <- fviz_cluster(k0, geom = "point", data = cluster_x) + ggtitle("K means k=2")
p1 <- fviz_cluster(k1, geom = "point", data = cluster_x) + ggtitle("K means k=3")
p2 <- fviz_cluster(k2, geom = "point", data = cluster_x) + ggtitle("K means k=4")
p3 <- fviz_cluster(k3, geom = "point", data = cluster_x) + ggtitle("K means k=5")
p4 <- fviz_cluster(k4, geom = "point", data = cluster_x) + ggtitle("K means k=6")
## Concatenate grids
# Arrange the five plots together in a single figure.
grid.arrange(p0, p1, p2, p3, p4)
## Determine the number of clusters with the elbow method
# Plot the within-cluster sum of squares ("wss") for k-means over a range of
# k, and mark k = 2 with a dashed vertical line.
fviz_nbclust(cluster_x, FUNcluster = kmeans, method = "wss") +
  geom_vline(xintercept = 2, linetype = "dashed")
k = 2 forms the perfect elbow
## Attach the cluster assignment from k0 (the k = 2 fit) to the dataset.
cluster_Df <- data.frame(wHappiness_Df, Cluster = k0$cluster)
# Store the cluster label as a factor.
cluster_Df$Cluster <- as.factor(cluster_Df$Cluster)
# Healthy life expectancy by region, one facet per cluster, with jittered
# points and flipped axes.
# No group_by() is applied first: ggplot2 does not use dplyr grouping when
# drawing, so it had no effect on the plot.
ggplot(
  cluster_Df,
  aes(x = Regional.indicator, y = Healthy.life.expectancy, colour = Cluster)
) +
  geom_jitter() +
  facet_wrap(~Cluster) +
  coord_flip()
NA
# Ladder score by region, one facet per cluster, with jittered points and
# flipped axes.
# No group_by() is applied first: ggplot2 does not use dplyr grouping when
# drawing, so it had no effect on the plot.
ggplot(
  cluster_Df,
  aes(x = Regional.indicator, y = Ladder.score, colour = Cluster)
) +
  geom_jitter() +
  facet_wrap(~Cluster) +
  coord_flip()
# Logged GDP per capita by region, one facet per cluster, with jittered points
# and flipped axes.
# No group_by() is applied first: ggplot2 does not use dplyr grouping when
# drawing, so it had no effect on the plot.
ggplot(
  cluster_Df,
  aes(x = Regional.indicator, y = Logged.GDP.per.capita, colour = Cluster)
) +
  geom_jitter() +
  facet_wrap(~Cluster) +
  coord_flip()
# Perceptions of corruption by region, one facet per cluster, with jittered
# points and flipped axes.
# No group_by() is applied first: ggplot2 does not use dplyr grouping when
# drawing, so it had no effect on the plot.
ggplot(
  cluster_Df,
  aes(x = Regional.indicator, y = Perceptions.of.corruption, colour = Cluster)
) +
  geom_jitter() +
  facet_wrap(~Cluster) +
  coord_flip()
Perception of corruption in the second cluster is more generalized, unlike the first cluster, which has a more defined and uniform perception of corruption, at least as measured on the survey’s scale.
At this juncture, our evaluation of the cluster analysis lets us describe the clusters in terms of the preconceived real-world categories of (Cluster 1) Developed and (Cluster 2) Developing clusters/countries, which are found, ironically, to rate higher on the scale of happiness. This indicates that more has to be done to ascertain the real reasons behind these metrics.
# Replace the numeric cluster labels with descriptive names: cluster 1 becomes
# "Developed_Region" and any other cluster "Developing_Region".
cluster_Df <- mutate(
  cluster_Df,
  Cluster = if_else(
    Cluster == 1,
    true = "Developed_Region",
    false = "Developing_Region"
  )
)