Hide all warnings and messages

# Global chunk options: keep messages and warnings out of the rendered report
knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE
)

Packages

# Packages used throughout the report
pacman::p_load(
  tidyverse, # data wrangling and plotting
  janitor,   # clean_names()
  visdat     # visual overview of column types and NAs
)

# The dataset was uploaded to Google Drive and published as a CSV file
happiness <- read_csv(
  file = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQQlW28r9deAndL9Jq0tBy93o3273SWFSR1Rrgtn2jS37rjfJALF5dlwATtqm7SPZYctV4hJxb34azG/pub?gid=684260656&single=true&output=csv"
)

# First rows, to see the raw column names and types
happiness %>% head()
## # A tibble: 6 × 11
##   `Country name`  year `Life Ladder` `Log GDP per capita` `Social support`
##   <chr>          <dbl>         <dbl>                <dbl>            <dbl>
## 1 Afghanistan     2008          3.72                 7.37            0.451
## 2 Afghanistan     2009          4.40                 7.54            0.552
## 3 Afghanistan     2010          4.76                 7.65            0.539
## 4 Afghanistan     2011          3.83                 7.62            0.521
## 5 Afghanistan     2012          3.78                 7.70            0.521
## 6 Afghanistan     2013          3.57                 7.72            0.484
## # … with 6 more variables: Healthy life expectancy at birth <dbl>,
## #   Freedom to make life choices <dbl>, Generosity <dbl>,
## #   Perceptions of corruption <dbl>, Positive affect <dbl>,
## #   Negative affect <dbl>

Clean the column names (remove white space, etc.). Clean names are easier to work with.

# Replace the raw column names (spaces, capitals) with snake_case ones
happiness <- janitor::clean_names(happiness)

check the variables

# One line per column: name, type and first values
happiness %>% glimpse()
## Rows: 1,949
## Columns: 11
## $ country_name                     <chr> "Afghanistan", "Afghanistan", "Afghan…
## $ year                             <dbl> 2008, 2009, 2010, 2011, 2012, 2013, 2…
## $ life_ladder                      <dbl> 3.724, 4.402, 4.758, 3.832, 3.783, 3.…
## $ log_gdp_per_capita               <dbl> 7.370, 7.540, 7.647, 7.620, 7.705, 7.…
## $ social_support                   <dbl> 0.451, 0.552, 0.539, 0.521, 0.521, 0.…
## $ healthy_life_expectancy_at_birth <dbl> 50.80, 51.20, 51.60, 51.92, 52.24, 52…
## $ freedom_to_make_life_choices     <dbl> 0.718, 0.679, 0.600, 0.496, 0.531, 0.…
## $ generosity                       <dbl> 0.168, 0.190, 0.121, 0.162, 0.236, 0.…
## $ perceptions_of_corruption        <dbl> 0.882, 0.850, 0.707, 0.731, 0.776, 0.…
## $ positive_affect                  <dbl> 0.518, 0.584, 0.618, 0.611, 0.710, 0.…
## $ negative_affect                  <dbl> 0.258, 0.237, 0.275, 0.267, 0.268, 0.…

check the completeness of the dataset

Check the completeness of the dataset. The grey spots are NA values.

# Plot every cell of the dataset coloured by type; missing values stand out
happiness %>% visdat::vis_dat()

Data exploration

Baltic countries

# Life ladder over time for the three Baltic states, one line per country
happiness %>%
  filter(country_name %in% c("Latvia", "Estonia", "Lithuania")) %>%
  ggplot(aes(x = year, y = life_ladder, color = country_name)) +
  geom_line() +
  labs(
    title = "Happiness in Baltic by year",
    x = "Year",
    y = "Happiness (0= worst, 10 = best)",
    color = "Country"
  )

Group all the variables to make many plots at once

There are some absolute value variables and some related to a survey

  • log_gdp_per_capita

  • healthy_life_expectancy_at_birth

  • life_ladder: Happiness score or subjective well-being. 0 worst, 10 best

The ones from the survey can be collapsed into one column

  • social_support

  • freedom_to_make_life_choices

  • perceptions_of_corruption

  • positive_affect

  • negative_affect

# Survey-based indicators for the Baltic states, one facet per indicator
happiness %>%
  filter(country_name %in% c("Latvia", "Estonia", "Lithuania")) %>%
  # move social_support next to the other survey columns so that a single
  # column range covers all of them in the pivot below
  relocate(social_support, .after = healthy_life_expectancy_at_birth) %>%
  # long format: one row per country, year and indicator
  pivot_longer(
    cols = social_support:negative_affect,
    names_to = "name_variable",
    values_to = "name_value"
  ) %>%
  # short facet labels (old name = new label)
  mutate(
    name_variable = recode(
      name_variable,
      social_support = "Support",
      freedom_to_make_life_choices = "Freedom",
      generosity = "Generosity",
      perceptions_of_corruption = "Corruption",
      positive_affect = "Positive",
      negative_affect = "Negative"
    )
  ) %>%
  ggplot(aes(x = year, y = name_value, color = country_name)) +
  geom_line() +
  # one column of panels per indicator
  facet_grid(cols = vars(name_variable)) +
  labs(
    title = "Survey results in the Baltic countries",
    x = "Year",
    y = "Survey results",
    color = "Country"
  )

Join with another dataset

It would be interesting to group by continent. I could add by hand the continent for each country, but it is much simpler if I take another database where each country has the continent information.

Take this as an example in case you want to combine a happiness variable with data from another source, for example, the World Bank. The key is that both datasets must have a column in common to match on. You can also match on two columns, for example, country and year.

Googling, I found this dataset with the required information of the countries and continents https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes/blob/master/all/all.csv

# ISO 3166 country table: names, codes, region (continent) and sub-region
countries <- read_csv(
  file = "https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv"
)

# First rows, to see which columns are available for the join
countries %>% head()
## # A tibble: 6 × 11
##   name      `alpha-2` `alpha-3` `country-code` `iso_3166-2` region `sub-region` 
##   <chr>     <chr>     <chr>     <chr>          <chr>        <chr>  <chr>        
## 1 Afghanis… AF        AFG       004            ISO 3166-2:… Asia   Southern Asia
## 2 Åland Is… AX        ALA       248            ISO 3166-2:… Europe Northern Eur…
## 3 Albania   AL        ALB       008            ISO 3166-2:… Europe Southern Eur…
## 4 Algeria   DZ        DZA       012            ISO 3166-2:… Africa Northern Afr…
## 5 American… AS        ASM       016            ISO 3166-2:… Ocean… Polynesia    
## 6 Andorra   AD        AND       020            ISO 3166-2:… Europe Southern Eur…
## # … with 4 more variables: intermediate-region <chr>, region-code <chr>,
## #   sub-region-code <chr>, intermediate-region-code <chr>

For how to join different datasets, see https://www.youtube.com/watch?v=Yg-pNqzDuN4 and this https://craig.rbind.io/post/2020-03-29-asgr-2-2-joining-data/

# Overwrite `happiness` with a version that also carries the ISO alpha-3 code,
# the region (continent) and the sub-region of each country.
# Rows are matched on happiness$country_name == countries$name; being a left
# join, every happiness row is kept and rows without a match get NA in the
# added columns.
happiness <- happiness %>%
  left_join(countries, by = c("country_name" = "name")) %>%
  # keep the original happiness columns plus the three new ones
  select(country_name:negative_affect, `alpha-3`, region, `sub-region`)

Now I have the happiness dataset with the information of the continents (region)

Summarize the life_ladder across continents

# Mean life ladder per continent and year, one line per continent
happiness %>%
  # drop rows without a region (e.g. countries whose name found no match in
  # the join with `countries`); is.na() states this directly instead of
  # relying on the comparison with the string "NA" evaluating to NA
  filter(!is.na(region)) %>%
  # calculate the mean happiness by continent and year
  group_by(region, year) %>%
  # na.rm = TRUE so one missing score cannot blank a whole continent-year;
  # .groups = "drop" returns an ungrouped result
  summarise(mean_happiness = mean(life_ladder, na.rm = TRUE),
            .groups = "drop") %>%
  # make the plot
  ggplot(aes(x = year,
             y = mean_happiness,
             color = region)) +
  geom_line() +
  labs(title = "Happiness by continent and year",
       y = "Happiness (0= worst, 10 = best)",
       x = "Year",
       color = "Continent")

## Summary of all continents at once

I can also check all the variables by continent at once

# Mean of every survey indicator per continent and year, one facet per
# indicator and one line per continent
happiness %>%
  # move social_support next to the other survey columns so that a single
  # column range covers all of them in the pivot below
  relocate(social_support, .after = healthy_life_expectancy_at_birth) %>%
  # drop rows without a region (e.g. countries whose name found no match in
  # the join with `countries`)
  filter(!is.na(region)) %>%
  # long format: one row per country, year and indicator
  pivot_longer(social_support:negative_affect,
               names_to = "happiness_variable",
               values_to = "happiness_value") %>%
  # mean value per indicator, continent and year; ungroup afterwards so the
  # recoding below runs on a plain tibble
  group_by(happiness_variable, region, year) %>%
  summarise(mean = mean(happiness_value, na.rm = TRUE),
            .groups = "drop") %>%
  # short facet labels (old name = new label)
  mutate(happiness_variable = recode(happiness_variable,
                                     freedom_to_make_life_choices = "Freedom",
                                     generosity = "Generosity",
                                     negative_affect = "Negative",
                                     perceptions_of_corruption = "Corruption",
                                     positive_affect = "Positive",
                                     social_support = "Support")) %>%
  # and now plot
  ggplot(aes(x = year,
             y = mean,
             color = region,
             group = region)) +
  geom_line() +
  facet_grid(. ~ happiness_variable) +
  labs(title = "Several variables in one plot",
       y = "Mean value",
       x = "Year",
       color = "Continent")

All the variables for the European countries, with Latvia highlighted

# Every survey indicator for each European country, one facet per indicator;
# Latvia is drawn in red and all other countries in grey
happiness %>%
  # move social_support next to the other survey columns so that a single
  # column range covers all of them in the pivot below
  relocate(social_support, .after = healthy_life_expectancy_at_birth) %>%
  # drop rows without a region (e.g. countries whose name found no match in
  # the join with `countries`)
  filter(!is.na(region)) %>%
  # keep only European countries
  filter(region == "Europe") %>% # comment this line out to plot the whole world
  # long format: one row per country, year and indicator
  pivot_longer(social_support:negative_affect,
               names_to = "happiness_variable",
               values_to = "happiness_value") %>%
  # mean value per indicator, country and year (not per continent);
  # ungroup afterwards so the recoding below runs on a plain tibble
  group_by(happiness_variable, country_name, year) %>%
  summarise(mean = mean(happiness_value, na.rm = TRUE),
            .groups = "drop") %>%
  # short facet labels (old name = new label)
  mutate(happiness_variable = recode(happiness_variable,
                                     freedom_to_make_life_choices = "Freedom",
                                     generosity = "Generosity",
                                     negative_affect = "Negative",
                                     perceptions_of_corruption = "Corruption",
                                     positive_affect = "Positive",
                                     social_support = "Support")) %>%
  # and now plot; the logical colour aesthetic splits the lines into
  # "is Latvia" (TRUE) and "is not Latvia" (FALSE)
  ggplot(aes(x = year,
             y = mean,
             color = (country_name == "Latvia"),
             group = country_name)) +
  geom_line(alpha = .5) +
  # colours and legend labels, given in the order FALSE, TRUE
  scale_color_manual(values = c("Grey", "red"),
                     labels = c("Other", "Latvia")) +
  # one column of panels per indicator (rows ~ columns; "." means no rows)
  facet_grid(. ~ happiness_variable) +
  labs(title = "Several variables in one plot",
       y = "Mean value",
       x = "Year",
       color = "Country")

# gghighlight

Highlight Latvia among all the countries. I will use the gghighlight package

see https://cran.r-project.org/web/packages/gghighlight/vignettes/gghighlight.html

pacman::p_load(gghighlight)

# Life ladder of every country in the world, with only Latvia highlighted.
# No relocate() step is needed here: nothing is pivoted, and the plot refers
# to columns by name.
happiness %>%
  # drop rows without a region (e.g. countries whose name found no match in
  # the join with `countries`)
  filter(!is.na(region)) %>%
  # uncomment the next line to plot European countries only
  # filter(region == "Europe") %>%
  ggplot(aes(x = year,
             y = life_ladder,
             color = region,
             group = country_name)) +
  geom_line() +
  # keep Latvia coloured; gghighlight greys out the other lines
  gghighlight(country_name == "Latvia") +
  labs(title = "Life ladder, all the world",
       y = "Life ladder",
       x = "Year")

For the Baltic countries

# Life ladder of every country in the world, with the three Baltic states
# highlighted. No relocate() step is needed: nothing is pivoted here.
happiness %>%
  # drop rows without a region (e.g. countries whose name found no match in
  # the join with `countries`)
  filter(!is.na(region)) %>%
  # uncomment the next line to plot European countries only
  # filter(region == "Europe") %>%
  ggplot(aes(x = year,
             y = life_ladder,
             color = country_name,
             group = country_name)) +
  geom_line() +
  # keep the Baltic states coloured; gghighlight greys out the other lines
  gghighlight(country_name %in% c("Latvia", "Estonia", "Lithuania")) +
  labs(title = "Life ladder, all the world",
       y = "Life ladder",
       x = "Year")

## check the happiest

# Life ladder of every country in the world, highlighted by the predicate
# life_ladder > 7.6. No relocate() step is needed: nothing is pivoted here.
happiness %>%
  # drop rows without a region (e.g. countries whose name found no match in
  # the join with `countries`)
  filter(!is.na(region)) %>%
  # uncomment the next line to plot European countries only
  # filter(region == "Europe") %>%
  ggplot(aes(x = year,
             y = life_ladder,
             color = country_name,
             group = country_name)) +
  geom_line() +
  gghighlight(life_ladder > 7.6) + # an arbitrary threshold
  labs(title = "Life ladder, all the world",
       y = "Life ladder",
       x = "Year")

## and the lowest happiness score

# Life ladder of every country in the world, highlighted by the predicate
# life_ladder < 3. No relocate() step is needed: nothing is pivoted here.
happiness %>%
  # drop rows without a region (e.g. countries whose name found no match in
  # the join with `countries`)
  filter(!is.na(region)) %>%
  # uncomment the next line to plot European countries only
  # filter(region == "Europe") %>%
  ggplot(aes(x = year,
             y = life_ladder,
             color = country_name,
             group = country_name)) +
  geom_line() +
  gghighlight(life_ladder < 3) + # an arbitrary threshold
  labs(title = "Life ladder, all the world",
       y = "Life ladder",
       x = "Year")

## top three from Europe

# Life ladder of the European countries, highlighting the three countries
# with the highest maximum score.
happiness %>%
  # keep only European countries (this also drops rows without a region)
  filter(region == "Europe") %>%
  ggplot(aes(x = year,
             y = life_ladder,
             color = country_name,
             group = country_name)) +
  geom_line() +
  # rank the countries by their maximum life ladder and keep the top 3
  gghighlight(max(life_ladder), max_highlight = 3L) +
  # labs() must be chained with `+`; written as a separate statement it is
  # not applied to the plot and only prints a labels object
  labs(title = "Life ladder, Europe",
       y = "Life ladder",
       x = "Year")

no gghighlight needed

Now with all variables and no gghighlight needed

# Every survey indicator for each European country, one facet per indicator;
# Latvia is highlighted with a plain ggplot2 colour mapping
happiness %>%
  # move social_support next to the other survey columns so that a single
  # column range covers all of them in the pivot below
  relocate(social_support, .after = healthy_life_expectancy_at_birth) %>%
  # drop rows without a region (e.g. countries whose name found no match in
  # the join with `countries`)
  filter(!is.na(region)) %>%
  # keep only European countries
  filter(region == "Europe") %>% # comment this line out to plot the whole world
  # long format: one row per country, year and indicator
  pivot_longer(social_support:negative_affect,
               names_to = "happiness_variable",
               values_to = "happiness_value") %>%
  # mean value per indicator, country and year (not per continent);
  # ungroup afterwards so the recoding below runs on a plain tibble
  group_by(happiness_variable, country_name, year) %>%
  summarise(mean = mean(happiness_value, na.rm = TRUE),
            .groups = "drop") %>%
  # short facet labels (old name = new label)
  mutate(happiness_variable = recode(happiness_variable,
                                     freedom_to_make_life_choices = "Freedom",
                                     generosity = "Generosity",
                                     negative_affect = "Negative",
                                     perceptions_of_corruption = "Corruption",
                                     positive_affect = "Positive",
                                     social_support = "Support")) %>%
  # and now plot; the trick is the logical colour aesthetic, which splits the
  # lines into "is Latvia" (TRUE) and "is not Latvia" (FALSE)
  ggplot(aes(x = year,
             y = mean,
             color = (country_name == "Latvia"),
             group = country_name)) +
  geom_line(alpha = .5) +
  # colours and legend labels, given in the order FALSE, TRUE
  scale_color_manual(values = c("Grey", "red"),
                     labels = c("Other", "Latvia")) +
  # one column of panels per indicator (rows ~ columns; "." means no rows)
  facet_grid(. ~ happiness_variable) +
  labs(title = "Several variables in one plot",
       y = "Mean value",
       x = "Year",
       color = "Country")

