Research Questions
- What is the effect of Covid19 on happiness levels in 2020 and 2021?
- Which regions showed change in happiness levels?
- What is the general trend of happiness in the world over the last 3 years?
- What are the top 10 happiest and saddest countries in 2021?
- What factors contributed most to happiness scores?
# Print the version of R this analysis is being run with
getRversion()
## [1] '4.0.3'
Importing libraries and datasets
# Packages used by this analysis. Any package that is not yet installed is
# installed first; every package is then attached.
packages <- c(
  "tidyverse",
  "heatmaply",
  "visdat", # for exploring missing data structure
  "ggplot2",
  "naniar",
  "dplyr",
  "tidyr",
  "hrbrthemes",
  "ggchicklet",
  "ggalt",
  "corrplot", # correlogram
  "plotly",
  "cowplot",
  "patchwork",
  "RColorBrewer",
  "ggbeeswarm",
  "scales"
)
for (p in packages) {
  # requireNamespace() only tests availability (it does not attach), so the
  # install check and the attach step below stay separate concerns.
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  # library() stops with an error if the package still cannot be loaded,
  # instead of returning FALSE the way require() does.
  library(p, character.only = TRUE)
}
# Load the two source tables: the 2021 report and the multi-year table.
# NOTE(review): later code refers to the first column as `ï..country_name`,
# which looks like a UTF-8 byte-order mark absorbed into the header -- confirm
# before changing how these files are read.
df_2021 <- read.csv(file = "data/world-happiness-report-2021.csv")
df_all <- read.csv(file = "data/world-happiness-report.csv")
# Preview the first rows of each table (head() shows six rows by default)
head(x = df_all)
head(x = df_2021)
Exploring data
Visualizing datasets to check datatypes.
# One visdat overview per dataset; the x-axis title says which dataset it is
vis_data_1 <- vis_dat(df_2021) +
  xlab("Datatypes for 2021 dataset")
vis_data_2 <- vis_dat(df_all) +
  xlab("Datatypes for all dataset")
# Show the two overviews together
vis_data_1 + vis_data_2
Both datasets show numeric and character datatypes with few NA values.
Checking for missing values
Plotting missing values of both datasets.
# Per-variable missing-value counts for each dataset
miss_data_1 <- gg_miss_var(df_2021) +
  ylab("Checking for the missing ones in 2021")
miss_data_2 <- gg_miss_var(df_all) +
  ylab("Checking for the missing ones in full_data")
# Show the two charts together
miss_data_1 + miss_data_2
df_2021 has no missing values whereas df_all has some columns with missing values.
Treating NA values
Imputing missing values with mean of the columns
# Mean-impute every numeric column of df_all: each NA is replaced by the mean
# of the non-missing values in the same column.
# vapply() guarantees a logical mask (sapply() can change its return type),
# and the mask is computed once instead of twice.
numeric_cols <- vapply(df_all, is.numeric, logical(1))
df_all[numeric_cols] <- lapply(
  df_all[numeric_cols],
  function(x) ifelse(is.na(x), mean(x, na.rm = TRUE), x)
)
# Count the NAs left in each column after imputation
df_all %>% summarise(across(everything(), ~ sum(is.na(.))))
Cleaned dataset
Checking for NA values in dataset.
# Re-draw the per-variable missing-value chart for df_all
gg_miss_var(df_all) +
  ylab("Checking for the missing ones in full_data")
Dataset has no missing values after treatment.
Top 10 happiest and saddest countries in 2021
Mapping countries by region and filtering countries by ladder_score i.e. happiness level.
# Columns of interest: the happiness score and the six factors next to it
dimensions <- c(
  "ladder_score",
  "logged_GDP_per_capita",
  "social_support",
  "healthy_life_expectancy",
  "freedom_to_make_life_choices",
  "generosity",
  "perceptions_of_corruption"
)

# Country -> region lookup built from the 2021 table (duplicate rows removed)
country_region_dict <- df_2021 %>%
  select(country = ï..country_name, region = regional_indicator) %>%
  unique()

# Long format: one row per country and dimension. Corruption is re-expressed
# as `absence_of_corruption` (1 - perception) and the original perception
# rows are dropped after pivoting.
df_2021_long <- df_2021 %>%
  select(country = ï..country_name, all_of(dimensions)) %>%
  mutate(absence_of_corruption = 1 - perceptions_of_corruption) %>%
  pivot_longer(
    cols = c(all_of(dimensions), "absence_of_corruption"),
    names_to = "dimension",
    values_to = "score"
  ) %>%
  filter(dimension != "perceptions_of_corruption")

# Min-max rescale the scores within each dimension into `score_pct`
df_2021_tranformed <- df_2021_long %>%
  group_by(dimension) %>%
  mutate(
    min_value = min(score),
    max_value = max(score),
    score_pct = (score - min_value) / (max_value - min_value)
  ) %>%
  ungroup()
# Ten highest ladder scores. The rank is computed after slicing, so
# rank 1 is the highest score among the selected rows.
df_2021_top10 <- df_2021_tranformed %>%
  filter(dimension == "ladder_score") %>%
  slice_max(order_by = score, n = 10) %>%
  mutate(
    cat = "top_10",
    country_rank = rank(-score),
    country_label = paste0(country, " (", country_rank, ")")
  )

# Ten lowest ladder scores. The rank is computed over all countries before
# slicing, so rank 1 is the lowest score overall.
df_2021_bottom10 <- df_2021_tranformed %>%
  filter(dimension == "ladder_score") %>%
  mutate(
    country_rank = rank(score),
    country_label = paste0(country, " (", country_rank, ")")
  ) %>%
  slice_min(order_by = score, n = 10) %>%
  mutate(cat = "bottom_10")
Plotting top and bottom 10 countries
# Horizontal "score out of 10" bar chart for a ranked set of countries.
#   data      data frame with `country_label` and `score` columns
#   low, high end colours of the diverging fill scale (white at score 5)
#   title     plot title
#   subtitle  plot subtitle
# Returns a ggplot object. Both rankings share this one definition so that
# they cannot drift apart.
plot_country_ranking <- function(data, low, high, title, subtitle) {
  ggplot(data, aes(x = reorder(country_label, score))) +
    # Full-length (0-10) bar drawn behind the score bar
    geom_chicklet(aes(y = 10, fill = 4.9),
                  width = 0.5, radius = grid::unit(5, "pt")) +
    # The score itself, coloured by its value
    geom_chicklet(aes(y = score, fill = score),
                  width = 0.5, radius = grid::unit(5, "pt")) +
    # Label is mapped inside aes() so it always follows the rows of `data`
    geom_text(aes(y = score, label = round(score, 2)),
              nudge_y = 0.4, size = 3) +
    scale_y_continuous(expand = c(0, 0.1), position = "right",
                       limits = c(0, 10)) +
    scale_fill_gradient2(low = low, high = high, mid = "white",
                         midpoint = 5) +
    coord_flip() +
    labs(y = "Best possible life = 10", x = "",
         title = title,
         subtitle = subtitle,
         caption = "Source: The World Happiness Report 2021") +
    theme_ipsum(grid = "") +
    theme(plot.title = element_text(size = 15),
          plot.subtitle = element_text(size = 12),
          plot.caption = element_text(size = 10),
          axis.title.x = element_text(size = 10, color = "#555955"),
          axis.text.y = element_text(size = 10, color = "black"),
          axis.text.x = element_blank(),
          # "none" is the documented spelling for hiding the legend
          legend.position = "none")
}

top_10 <- plot_country_ranking(
  df_2021_top10,
  low = "black", high = "#818aeb",
  title = "Top 10 Happiest Countries in 2021",
  subtitle = "9 out of 10 happiest countries in Europe"
)
bottom_10 <- plot_country_ranking(
  df_2021_bottom10,
  low = "#074040", high = "#4cc2c2",
  title = "Top 10 Saddest Countries in 2021",
  subtitle = "Countries struck by poverty"
)
# Show both rankings together
top_10 + bottom_10
Most of the happiest countries are in Europe. Most of the saddest countries appear to be facing financial hardship.
Happiness trend from 2019 to 2020 (insights with respect to Covid-19)
Subsetting country, region, ladder_score for the years 2019 and 2020.
# Regional mean happiness for 2019 and 2020, using only countries that have a
# score in both years, plus the year-over-year difference.
df_2019_2020 <- df_all %>%
  filter(year >= 2019) %>%
  left_join(country_region_dict, by = c("ï..country_name" = "country")) %>%
  select(country = ï..country_name, region, year, ladder = life_ladder) %>%
  pivot_wider(
    names_from = "year",
    names_prefix = "year",
    values_from = "ladder"
  ) %>%
  filter(!is.na(year2019), !is.na(year2020)) %>%
  group_by(region) %>%
  summarise(
    happiness_2019 = mean(year2019, na.rm = TRUE),
    happiness_2020 = mean(year2020, na.rm = TRUE)
  ) %>%
  mutate(diff = happiness_2020 - happiness_2019) %>%
  arrange(diff) %>%
  # Store the sorted order as factor levels so later plots can reuse it
  mutate(region = factor(region, levels = region))
Plotting happiness levels during covid19
# Dumbbell chart of regional mean happiness, 2019 -> 2020. Regions whose mean
# rose and regions whose mean fell are drawn as separate layers with their own
# colour; a grey strip centred on x = 7.5 lists the change for every region.
plot_2020 <- ggplot() +
  # Regions whose mean rose
  geom_dumbbell(
    data = filter(df_2019_2020, diff > 0),
    aes(y = region, x = happiness_2019, xend = happiness_2020),
    size = 1.5, color = "#7FB185",
    colour_xend = "#7FB185", colour_x = "#7FB185",
    size_x = 2.5, size_xend = 5,
    dot_guide = TRUE, dot_guide_size = 0.5
  ) +
  # Regions whose mean fell
  geom_dumbbell(
    data = filter(df_2019_2020, diff < 0),
    aes(y = region, x = happiness_2019, xend = happiness_2020),
    size = 1.5, color = "#edae52",
    colour_xend = "#edae52", colour_x = "#edae52",
    size_x = 2.5, size_xend = 5,
    dot_guide = TRUE, dot_guide_size = 0.5
  ) +
  # Row order follows the factor levels (regions sorted by diff)
  scale_y_discrete(
    limits = levels(df_2019_2020$region),
    expand = c(0.075, 1)
  ) +
  labs(
    x = "", y = NULL,
    title = "Happiness in pre to amidst Covid",
    subtitle = "Regions see increases in happiness, despite Covid",
    caption = "Source: World Happiness Report (2021)"
  ) +
  # Grey background strip for the DIFF column
  geom_rect(
    data = df_2019_2020,
    aes(xmin = 7.35, xmax = 7.65, ymin = -Inf, ymax = Inf),
    fill = "#e3e2e1"
  ) +
  # Year headers, placed above the South Asia row
  geom_text(
    data = filter(df_2019_2020, region == "South Asia"),
    aes(x = happiness_2020, y = region, label = "2020"),
    color = "gray15", size = 3, vjust = -1.5
  ) +
  geom_text(
    data = filter(df_2019_2020, region == "South Asia"),
    aes(x = happiness_2019, y = region, label = "2019"),
    color = "gray15", size = 3, vjust = -1.5
  ) +
  # Value labels for rising regions: 2020 value, then 2019 value
  geom_text(
    data = filter(df_2019_2020, diff > 0),
    aes(x = happiness_2020, y = region, label = round(happiness_2020, 2)),
    size = 3, hjust = -0.5
  ) +
  geom_text(
    data = filter(df_2019_2020, diff > 0),
    aes(x = happiness_2019, y = region, label = round(happiness_2019, 2)),
    color = "gray15", size = 3, hjust = 1.3
  ) +
  # Value labels for falling regions: 2020 value, then 2019 value
  geom_text(
    data = filter(df_2019_2020, diff < 0),
    aes(x = happiness_2020, y = region, label = round(happiness_2020, 2)),
    size = 3, hjust = 1.5
  ) +
  geom_text(
    data = filter(df_2019_2020, diff < 0),
    aes(x = happiness_2019, y = region, label = round(happiness_2019, 2)),
    color = "gray15", size = 3, hjust = -0.3
  ) +
  # DIFF column: header above the South Asia row, then one value per region
  geom_text(
    data = filter(df_2019_2020, region == "South Asia"),
    aes(x = 7.5, y = region, label = "DIFF"),
    size = 3, vjust = -1.5, fontface = "bold"
  ) +
  geom_text(
    data = df_2019_2020,
    aes(label = round(diff, 2), y = region, x = 7.5),
    size = 3
  ) +
  theme_ipsum(grid = "") +
  theme(
    plot.title = element_text(size = 15),
    plot.subtitle = element_text(size = 12),
    plot.caption = element_text(size = 10),
    axis.title.x = element_text(size = 10, color = "#3a403a"),
    axis.text.y = element_text(size = 10, color = "black"),
    axis.text.x = element_blank(),
    legend.position = "left"
  )
Creating new dataframe to compare happiness levels amidst Covid to 2021 level
Combining dimensions from both datasets to form a new dataset with country, region, year and ladder_score.
# Tag every row of the 2021 table with its year
df_2021$year <- rep(2021, times = nrow(df_2021))
# Copy of the 2021 table with `ladder_score` exposed as `happiness_2021`
df_2021_new <- df_2021 %>%
  rename(happiness_2021 = ladder_score)
# Join the 2019/2020 regional summary to the 2021 rows of the same region
df_yr_score <- full_join(
  x = df_2019_2020,
  y = df_2021_new,
  by = c("region" = "regional_indicator")
)
Making new dataframe with region, country and ladder_score columns for year 2019,2020 and 2021
# Multi-year table: attach each country's region and keep the four shared
# columns, already named the way the 2021 table names them.
df_all_region <- df_all %>%
  left_join(country_region_dict, by = c("ï..country_name" = "country")) %>%
  select(
    country = ï..country_name,
    regional_indicator = region,
    year,
    ladder_score = life_ladder
  )
# 2021 table reduced to the same four columns, in the same order
df_2021_region <- df_2021 %>%
  select(country = ï..country_name, regional_indicator, year, ladder_score)
# Stack both tables; drop rows with no year or no matched region
df_final <- rbind(df_all_region, df_2021_region) %>%
  filter(!is.na(year), !is.na(regional_indicator))
# Keep 2019 onwards (the last three years)
df_final_19_20_21 <- df_final %>%
  filter(year >= 2019)
Plotting trends from 2020 to 2021
# Regional mean happiness for 2020 and 2021 plus the year-over-year difference.
# NOTE(review): unlike the 2019-2020 summary, countries are not restricted to
# those with a score in both years (the pairing filter is disabled), so the two
# regional means may cover different sets of countries -- confirm this is
# intended.
df_20_21 <- df_final_19_20_21 %>%
  filter(year >= 2020) %>%
  select(country, regional_indicator, year, ladder_score) %>%
  pivot_wider(
    names_from = "year",
    names_prefix = "year",
    values_from = "ladder_score"
  ) %>%
  group_by(regional_indicator) %>%
  summarise(
    happiness_2020 = mean(year2020, na.rm = TRUE),
    happiness_2021 = mean(year2021, na.rm = TRUE)
  ) %>%
  mutate(diff = happiness_2021 - happiness_2020) %>%
  arrange(diff) %>%
  # Store the sorted order as factor levels so the plot can reuse it
  mutate(
    regional_indicator = factor(regional_indicator, levels = regional_indicator)
  )
# Dumbbell chart of regional mean happiness, 2020 -> 2021. Regions whose mean
# rose and regions whose mean fell are drawn as separate layers with their own
# colour; a grey strip centred on x = 7.5 lists the change for every region.
plot_2021 <- ggplot() +
  # Regions whose mean rose
  geom_dumbbell(
    data = df_20_21 %>% filter(diff > 0),
    aes(y = regional_indicator, x = happiness_2020, xend = happiness_2021),
    size = 1.5, color = "#7FB185",
    colour_xend = "#7FB185", colour_x = "#7FB185",
    size_x = 2.5, size_xend = 5,
    dot_guide = TRUE, dot_guide_size = 0.5
  ) +
  # Regions whose mean fell
  geom_dumbbell(
    data = df_20_21 %>% filter(diff < 0),
    aes(y = regional_indicator, x = happiness_2020, xend = happiness_2021),
    size = 1.5, color = "#edae52",
    colour_xend = "#edae52", colour_x = "#edae52",
    size_x = 2.5, size_xend = 5,
    dot_guide = TRUE, dot_guide_size = 0.5
  ) +
  # Row order comes from df_20_21's own factor levels (regions sorted by diff);
  # df_2019_2020 has no `regional_indicator` column to take them from.
  scale_y_discrete(
    limits = levels(df_20_21$regional_indicator),
    expand = c(0.075, 1)
  ) +
  labs(
    x = "", y = NULL,
    title = "Happiness amidst Covid to 2021",
    subtitle = "Most regions see decreases in happiness.",
    caption = "Source: World Happiness Report (2021)"
  ) +
  # Grey background strip for the DIFF column
  geom_rect(
    data = df_20_21,
    aes(xmin = 7.35, xmax = 7.65, ymin = -Inf, ymax = Inf),
    fill = "#e3e2e1"
  ) +
  # Year headers, placed beside the Western Europe end points
  geom_text(
    data = df_20_21 %>% filter(regional_indicator == "Western Europe"),
    aes(x = happiness_2021, y = regional_indicator, label = "2021"),
    color = "gray15", size = 3, hjust = -1.5
  ) +
  geom_text(
    data = df_20_21 %>% filter(regional_indicator == "Western Europe"),
    aes(x = happiness_2020, y = regional_indicator, label = "2020"),
    color = "gray15", size = 3, hjust = 2.5
  ) +
  # Value labels for rising regions: each end point carries its own year's
  # value (2021 value at the 2021 point, 2020 value at the 2020 point)
  geom_text(
    data = df_20_21 %>% filter(diff > 0),
    aes(x = happiness_2021, y = regional_indicator,
        label = round(happiness_2021, 2)),
    size = 3, hjust = -0.5
  ) +
  geom_text(
    data = df_20_21 %>% filter(diff > 0),
    aes(x = happiness_2020, y = regional_indicator,
        label = round(happiness_2020, 2)),
    color = "gray15", size = 3, hjust = 1.3
  ) +
  # Value labels for falling regions
  geom_text(
    data = df_20_21 %>% filter(diff < 0),
    aes(x = happiness_2021, y = regional_indicator,
        label = round(happiness_2021, 2)),
    size = 3, hjust = 1.5
  ) +
  geom_text(
    data = df_20_21 %>% filter(diff < 0),
    aes(x = happiness_2020, y = regional_indicator,
        label = round(happiness_2020, 2)),
    color = "gray15", size = 3, hjust = -0.3
  ) +
  # DIFF column: header above the Western Europe row, then one value per region
  geom_text(
    data = df_20_21 %>% filter(regional_indicator == "Western Europe"),
    aes(x = 7.5, y = regional_indicator, label = "DIFF"),
    size = 3, vjust = -1.5, fontface = "bold"
  ) +
  geom_text(
    data = df_20_21,
    aes(label = round(diff, 2), y = regional_indicator, x = 7.5),
    size = 3
  ) +
  theme_ipsum(grid = "") +
  theme(
    plot.title = element_text(size = 15),
    plot.subtitle = element_text(size = 12),
    plot.caption = element_text(size = 10),
    axis.title.x = element_text(size = 10, color = "#3a403a"),
    axis.text.y = element_text(size = 10, color = "black"),
    axis.text.x = element_blank(),
    legend.position = "left"
  )
Comparing happiness trends: pre-Covid, amidst Covid, and 2021 levels.
Plotting trends amidst Covid and 2021 levels.
# Show the 2019-2020 and 2020-2021 charts together
plot_2020 + plot_2021
Despite Covid-19, about half of the countries in the world still saw a small increase in happiness from 2019 to 2020, whereas the majority of countries saw a dip in happiness from 2020 to 2021.
Factors correlating to happiness
Plotting correlation among factors related to happiness in 2021 dataset.
# Correlation matrix of the happiness score and its six factors (2021 data),
# with shorter column names for the axis labels
df_cor <- df_2021 %>%
  select(
    corruption = perceptions_of_corruption,
    generosity,
    freedom = freedom_to_make_life_choices,
    life_expectancy = healthy_life_expectancy,
    social_support,
    GDP_per_capita = logged_GDP_per_capita,
    happiness = ladder_score
  )
corr <- cor(df_cor)
# Interactive heatmap of the matrix; colour bar limits fixed to [-1, 1]
plot_ly(colors = "RdBu") %>%
  add_heatmap(x = rownames(corr), y = colnames(corr), z = corr) %>%
  colorbar(limits = c(-1, 1))
Top 3 contributors of happiness from the 2021 dataset are: 1. Life Expectancy 2. Social Support 3. GDP per capita
General trend of happiness levels over the regions.
Plotting overall trend of happiness scores across regions.
# Base plot: one point per country-year (2019 onwards), grouped by region.
# `text = country` is mapped so the country can appear in the hover tooltip.
region_level <- ggplot(
  data = df_final_19_20_21,
  mapping = aes(
    x = regional_indicator,
    y = ladder_score,
    fill = regional_indicator,
    text = country
  )
)
# Beeswarm layer, coloured per region, with wrapped x-axis labels and no legend
region_level4 <- region_level +
  geom_beeswarm(aes(color = regional_indicator)) +
  theme_classic() +
  theme(legend.position = "none", axis.text.x = element_text(angle = 0)) +
  scale_x_discrete(labels = wrap_format(10)) +
  scale_fill_brewer(palette = "Spectral") +
  scale_color_brewer(palette = "Spectral")
# Interactive version; the tooltip is limited to country and ladder score
ggplotly(p = region_level4, tooltip = c("country", "ladder_score"))
The happiest regions in the world are Western Europe and North America & ANZ.