Hide all warnings and messages in the rendered output
# Global chunk options: keep messages and warnings out of the report.
knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE
)
Packages
# Packages used throughout the analysis.
pacman::p_load(
  tidyverse, # data wrangling and plotting
  janitor,   # column-name cleaning
  visdat     # visual overview of missing values
)
# The dataset was uploaded to Google Drive and published there as a CSV file.
happiness <- readr::read_csv(
  file = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQQlW28r9deAndL9Jq0tBy93o3273SWFSR1Rrgtn2jS37rjfJALF5dlwATtqm7SPZYctV4hJxb34azG/pub?gid=684260656&single=true&output=csv"
)

# First rows, for a quick look at the data.
head(x = happiness)
## # A tibble: 6 × 11
## `Country name` year `Life Ladder` `Log GDP per capita` `Social support`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 2008 3.72 7.37 0.451
## 2 Afghanistan 2009 4.40 7.54 0.552
## 3 Afghanistan 2010 4.76 7.65 0.539
## 4 Afghanistan 2011 3.83 7.62 0.521
## 5 Afghanistan 2012 3.78 7.70 0.521
## 6 Afghanistan 2013 3.57 7.72 0.484
## # … with 6 more variables: Healthy life expectancy at birth <dbl>,
## # Freedom to make life choices <dbl>, Generosity <dbl>,
## # Perceptions of corruption <dbl>, Positive affect <dbl>,
## # Negative affect <dbl>
Clean the column names (remove white space, etc.). Clean names are easier to work with.
# Normalise the column names and overwrite the raw table with the result.
happiness <- janitor::clean_names(happiness)
check the variables
# Compact overview of every column and its type.
dplyr::glimpse(x = happiness)
## Rows: 1,949
## Columns: 11
## $ country_name <chr> "Afghanistan", "Afghanistan", "Afghan…
## $ year <dbl> 2008, 2009, 2010, 2011, 2012, 2013, 2…
## $ life_ladder <dbl> 3.724, 4.402, 4.758, 3.832, 3.783, 3.…
## $ log_gdp_per_capita <dbl> 7.370, 7.540, 7.647, 7.620, 7.705, 7.…
## $ social_support <dbl> 0.451, 0.552, 0.539, 0.521, 0.521, 0.…
## $ healthy_life_expectancy_at_birth <dbl> 50.80, 51.20, 51.60, 51.92, 52.24, 52…
## $ freedom_to_make_life_choices <dbl> 0.718, 0.679, 0.600, 0.496, 0.531, 0.…
## $ generosity <dbl> 0.168, 0.190, 0.121, 0.162, 0.236, 0.…
## $ perceptions_of_corruption <dbl> 0.882, 0.850, 0.707, 0.731, 0.776, 0.…
## $ positive_affect <dbl> 0.518, 0.584, 0.618, 0.611, 0.710, 0.…
## $ negative_affect <dbl> 0.258, 0.237, 0.275, 0.267, 0.268, 0.…
check the completeness of the dataset
Check the completeness of the dataset. The grey spots are NA values.
# Plot the type of every cell of the table, including missing ones.
happiness %>%
  visdat::vis_dat()

Data exploration
Baltic countries
# Yearly happiness score of the three Baltic countries, one line per country.
happiness %>%
  # keep only the Baltic countries
  filter(country_name %in% c("Latvia", "Estonia", "Lithuania")) %>%
  ggplot(mapping = aes(x = year, y = life_ladder, colour = country_name)) +
  geom_line() +
  labs(
    title = "Happiness in Baltic by year",
    x = "Year",
    y = "Happiness (0= worst, 10 = best)",
    colour = "Country"
  )

Group all the variables to make many plots at once
There are some absolute value variables and some related to a survey
log_gdp_per_capita
healthy_life_expectancy_at_birth
life_ladder: Happiness score or subjective well-being. 0 worst, 10 best
The ones from the survey can be collapsed into one column
# Survey-based variables of the Baltic countries, one facet per variable.
happiness %>%
  # keep only the Baltic countries
  filter(country_name %in% c("Latvia", "Estonia", "Lithuania")) %>%
  # Stack the six survey columns into a name/value pair. social_support is
  # listed on its own because healthy_life_expectancy_at_birth sits between
  # it and the remaining survey columns.
  pivot_longer(
    cols = c(social_support, freedom_to_make_life_choices:negative_affect),
    names_to = "name_variable",
    values_to = "name_value"
  ) %>%
  # shorter labels for the facet strips (old name = new name)
  mutate(
    name_variable = recode(
      name_variable,
      social_support = "Support",
      freedom_to_make_life_choices = "Freedom",
      generosity = "Generosity",
      perceptions_of_corruption = "Corruption",
      positive_affect = "Positive",
      negative_affect = "Negative"
    )
  ) %>%
  ggplot(mapping = aes(x = year, y = name_value, colour = country_name)) +
  geom_line() +
  # one column of panels per survey variable
  facet_grid(cols = vars(name_variable)) +
  labs(
    title = "Survey results in the Baltic countries",
    x = "Year",
    y = "Survey results",
    colour = "Country"
  )

Join with another dataset
It would be interesting to group by continent. I could add by hand the continent for each country, but it is much simpler if I take another database where each country has the continent information.
Take this as an example in case you want to combine a happiness variable with one from another source, for example, the World Bank. The key is that both datasets must share a column to match on. You can also match on two columns, for example, country and year.
Googling, I found this dataset with the required information of the countries and continents https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes/blob/master/all/all.csv
# Country table that carries the region (continent) of each country.
countries <- readr::read_csv(
  file = "https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv"
)

# First rows of the lookup table.
head(x = countries)
## # A tibble: 6 × 11
## name `alpha-2` `alpha-3` `country-code` `iso_3166-2` region `sub-region`
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Afghanis… AF AFG 004 ISO 3166-2:… Asia Southern Asia
## 2 Åland Is… AX ALA 248 ISO 3166-2:… Europe Northern Eur…
## 3 Albania AL ALB 008 ISO 3166-2:… Europe Southern Eur…
## 4 Algeria DZ DZA 012 ISO 3166-2:… Africa Northern Afr…
## 5 American… AS ASM 016 ISO 3166-2:… Ocean… Polynesia
## 6 Andorra AD AND 020 ISO 3166-2:… Europe Southern Eur…
## # … with 4 more variables: intermediate-region <chr>, region-code <chr>,
## # sub-region-code <chr>, intermediate-region-code <chr>
For how to join different datasets, see https://www.youtube.com/watch?v=Yg-pNqzDuN4 and this https://craig.rbind.io/post/2020-03-29-asgr-2-2-joining-data/
# Add the region information to the happiness data and overwrite the old
# table. The match is on happiness$country_name == countries$name. Because
# this is a left join, every happiness row is kept; a country whose name has
# no exact match in `countries` gets NA in the added columns.
happiness <- happiness %>%
  left_join(countries, by = c("country_name" = "name")) %>%
  # keep the original happiness columns plus the three new ones
  select(country_name:negative_affect, `alpha-3`, region, `sub-region`)
Now I have the happiness dataset with the information of the continents (region)
Summarize the life_ladder across continents
# Mean happiness score per continent and year, one line per continent.
happiness %>%
  # Drop countries that were not matched to a region in the join (region is
  # NA for them). is.na() states the intent; comparing against the string
  # "NA" only worked because filter() discards rows where the test is NA.
  filter(!is.na(region)) %>%
  # mean happiness by continent and year
  group_by(region, year) %>%
  summarise(
    # na.rm guards against a missing score turning a whole mean into NA
    mean_happiness = mean(life_ladder, na.rm = TRUE),
    # return an ungrouped table instead of one still grouped by region
    .groups = "drop"
  ) %>%
  # make the plot
  ggplot(aes(x = year, y = mean_happiness, color = region)) +
  geom_line() +
  labs(
    title = "Happiness by continent and year",
    y = "Happiness (0= worst, 10 = best)",
    x = "Year",
    color = "Continent"
  )
## Summary of all continents at once
I can also check all the variables by continent at once.
# Mean of every survey variable per continent and year, one facet per variable.
happiness %>%
  # move social_support next to the other survey columns so that they form
  # one contiguous range for pivot_longer()
  relocate(social_support, .after = healthy_life_expectancy_at_birth) %>%
  # drop countries without a matched region (NA after the join)
  filter(!is.na(region)) %>%
  # reshape: one row per country, year and survey variable
  pivot_longer(
    social_support:negative_affect,
    names_to = "happiness_variable",
    values_to = "happiness_value"
  ) %>%
  # mean value per variable, continent and year
  group_by(happiness_variable, region, year) %>%
  summarise(
    mean = mean(happiness_value, na.rm = TRUE),
    # ungroup so the mutate() below does not rewrite a grouping column
    .groups = "drop"
  ) %>%
  # shorter labels for the facet strips (old name = new name)
  mutate(
    happiness_variable = recode(
      happiness_variable,
      freedom_to_make_life_choices = "Freedom",
      generosity = "Generosity",
      negative_affect = "Negative",
      perceptions_of_corruption = "Corruption",
      positive_affect = "Positive",
      social_support = "Support"
    )
  ) %>%
  # and now plot
  ggplot(aes(x = year, y = mean, color = region, group = region)) +
  geom_line() +
  facet_grid(. ~ happiness_variable) +
  labs(
    title = "Several variables in one plot",
    y = "Mean value",
    x = "Year",
    color = "Continent"
  )

All the variables for the European countries, with Latvia highlighted
# Every survey variable for each European country, one facet per variable,
# with Latvia drawn in red and all other countries in grey.
happiness %>%
  # move social_support next to the other survey columns so that they form
  # one contiguous range for pivot_longer()
  relocate(social_support, .after = healthy_life_expectancy_at_birth) %>%
  # drop countries without a matched region (NA after the join)
  filter(!is.na(region)) %>%
  # keep only European countries
  filter(region == "Europe") %>% # comment this line to check the whole world
  # reshape: one row per country, year and survey variable
  pivot_longer(
    social_support:negative_affect,
    names_to = "happiness_variable",
    values_to = "happiness_value"
  ) %>%
  # mean value per variable, country and year
  group_by(happiness_variable, country_name, year) %>%
  summarise(
    mean = mean(happiness_value, na.rm = TRUE),
    # ungroup so the mutate() below does not rewrite a grouping column
    .groups = "drop"
  ) %>%
  # shorter labels for the facet strips (old name = new name)
  mutate(
    happiness_variable = recode(
      happiness_variable,
      freedom_to_make_life_choices = "Freedom",
      generosity = "Generosity",
      negative_affect = "Negative",
      perceptions_of_corruption = "Corruption",
      positive_affect = "Positive",
      social_support = "Support"
    )
  ) %>%
  # and now plot; the logical colour mapping splits Latvia from the rest
  ggplot(aes(
    x = year,
    y = mean,
    color = (country_name == "Latvia"),
    group = country_name
  )) +
  geom_line(alpha = .5) +
  # colours and legend labels, in the order FALSE, TRUE
  scale_color_manual(
    values = c("Grey", "red"),
    labels = c("Other", "Latvia")
  ) +
  # one panel per variable: no row variable ~ happiness_variable as columns
  facet_grid(. ~ happiness_variable) +
  labs(
    title = "Several variables in one plot",
    y = "Mean value",
    x = "Year",
    color = "Country"
  )
# gghighlight
Highlight Latvia among all the countries. I will use the gghighlight package
see https://cran.r-project.org/web/packages/gghighlight/vignettes/gghighlight.html
# Package used to highlight selected lines in the plots below.
pacman::p_load(char = "gghighlight")
# Life ladder over time for every country, with only Latvia highlighted.
# (The relocate() step copied from the facet plots was removed: nothing is
# pivoted here, so column order has no effect on the plot.)
happiness %>%
  # drop countries without a matched region (NA after the join)
  filter(!is.na(region)) %>%
  # add `filter(region == "Europe") %>%` here to restrict the plot to Europe
  ggplot(aes(
    x = year,
    y = life_ladder,
    color = region,
    group = country_name
  )) +
  geom_line() +
  gghighlight(country_name == "Latvia") +
  labs(
    title = "Life ladder, all the world",
    y = "Life ladder",
    x = "Year"
  )

For the Baltic countries
# Life ladder over time for every country, with the three Baltic countries
# highlighted. (The relocate() step copied from the facet plots was removed:
# nothing is pivoted here, so column order has no effect on the plot.)
happiness %>%
  # drop countries without a matched region (NA after the join)
  filter(!is.na(region)) %>%
  # add `filter(region == "Europe") %>%` here to restrict the plot to Europe
  ggplot(aes(
    x = year,
    y = life_ladder,
    color = country_name,
    group = country_name
  )) +
  geom_line() +
  gghighlight(country_name %in% c("Latvia", "Estonia", "Lithuania")) +
  labs(
    title = "Life ladder, all the world",
    y = "Life ladder",
    x = "Year"
  )
## check the happiest
# Life ladder over time for every country, highlighting scores above 7.6.
# (The relocate() step copied from the facet plots was removed: nothing is
# pivoted here, so column order has no effect on the plot.)
happiness %>%
  # drop countries without a matched region (NA after the join)
  filter(!is.na(region)) %>%
  # add `filter(region == "Europe") %>%` here to restrict the plot to Europe
  ggplot(aes(
    x = year,
    y = life_ladder,
    color = country_name,
    group = country_name
  )) +
  geom_line() +
  # 7.6 is an arbitrary threshold.
  # NOTE(review): the predicate is row-wise, not a per-country aggregate such
  # as max(life_ladder) > 7.6 -- confirm which highlighting is intended.
  gghighlight(life_ladder > 7.6) +
  labs(
    title = "Life ladder, all the world",
    y = "Life ladder",
    x = "Year"
  )
## and the lowest happiness score
# Life ladder over time for every country, highlighting scores below 3.
# (The relocate() step copied from the facet plots was removed: nothing is
# pivoted here, so column order has no effect on the plot.)
happiness %>%
  # drop countries without a matched region (NA after the join)
  filter(!is.na(region)) %>%
  # add `filter(region == "Europe") %>%` here to restrict the plot to Europe
  ggplot(aes(
    x = year,
    y = life_ladder,
    color = country_name,
    group = country_name
  )) +
  geom_line() +
  # NOTE(review): the predicate is row-wise, not a per-country aggregate such
  # as min(life_ladder) < 3 -- confirm which highlighting is intended.
  gghighlight(life_ladder < 3) +
  labs(
    title = "Life ladder, all the world",
    y = "Life ladder",
    x = "Year"
  )
## top three from Europe
# Life ladder over time for the European countries, highlighting the three
# countries with the highest maximum score.
# (The relocate() step copied from the facet plots was removed: nothing is
# pivoted here, so column order has no effect on the plot.)
happiness %>%
  # keep only European countries (rows with an NA region are dropped as well)
  filter(region == "Europe") %>%
  ggplot(aes(
    x = year,
    y = life_ladder,
    color = country_name,
    group = country_name
  )) +
  geom_line() +
  # rank the countries by their maximum score and highlight the top 3;
  # the trailing `+` was missing, so labs() was never added to the plot and
  # was printed as a separate object instead
  gghighlight(max(life_ladder), max_highlight = 3L) +
  labs(
    # the data are filtered to Europe, so the title says so
    title = "Life ladder, Europe",
    y = "Life ladder",
    x = "Year"
  )
no gghighlight needed
Now with all variables and no gghighlight needed
# Every survey variable for each European country, one facet per variable.
# Latvia is highlighted without gghighlight by mapping colour to a logical.
happiness %>%
  # move social_support next to the other survey columns so that they form
  # one contiguous range for pivot_longer()
  relocate(social_support, .after = healthy_life_expectancy_at_birth) %>%
  # drop countries without a matched region (NA after the join)
  filter(!is.na(region)) %>%
  # keep only European countries
  filter(region == "Europe") %>% # comment this line to check the whole world
  # reshape: one row per country, year and survey variable
  pivot_longer(
    social_support:negative_affect,
    names_to = "happiness_variable",
    values_to = "happiness_value"
  ) %>%
  # mean value per variable, country and year
  group_by(happiness_variable, country_name, year) %>%
  summarise(
    mean = mean(happiness_value, na.rm = TRUE),
    # ungroup so the mutate() below does not rewrite a grouping column
    .groups = "drop"
  ) %>%
  # shorter labels for the facet strips (old name = new name)
  mutate(
    happiness_variable = recode(
      happiness_variable,
      freedom_to_make_life_choices = "Freedom",
      generosity = "Generosity",
      negative_affect = "Negative",
      perceptions_of_corruption = "Corruption",
      positive_affect = "Positive",
      social_support = "Support"
    )
  ) %>%
  # and now plot; the logical colour mapping is the trick that replaces
  # gghighlight: TRUE for Latvia, FALSE for every other country
  ggplot(aes(
    x = year,
    y = mean,
    color = (country_name == "Latvia"),
    group = country_name
  )) +
  geom_line(alpha = .5) +
  # colours and legend labels, in the order FALSE, TRUE
  scale_color_manual(
    values = c("Grey", "red"),
    labels = c("Other", "Latvia")
  ) +
  # one panel per variable: no row variable ~ happiness_variable as columns
  facet_grid(. ~ happiness_variable) +
  labs(
    title = "Several variables in one plot",
    y = "Mean value",
    x = "Year",
    color = "Country"
  )

