Visualisations with Gapminder using {ggplot2} and {plotly}

Gapminder data recap

The gapminder package provides the gapminder dataset, which records life expectancy, population and GDP per capita by country and year.

The data contains 1704 rows and 6 columns:

  • country
  • continent
  • year
  • lifeExp - life expectancy at birth
  • pop - total population
  • gdpPercap - per-capita GDP (Gross domestic product given in units of international dollars)
## print the first 6 rows of the data with head() (6 is the default; use head(gapminder, 10) for 10 rows)
head(gapminder)
## # A tibble: 6 × 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.
## print a summary of each column with summary(): level counts for the factor
## columns and min / quartiles / median / mean / max for the numeric columns
summary(gapminder)
##         country        continent        year         lifeExp     
##  Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
##  Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
##  Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
##  Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
##  Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
##  Australia  :  12                  Max.   :2007   Max.   :82.60  
##  (Other)    :1632                                                
##       pop              gdpPercap       
##  Min.   :6.001e+04   Min.   :   241.2  
##  1st Qu.:2.794e+06   1st Qu.:  1202.1  
##  Median :7.024e+06   Median :  3531.8  
##  Mean   :2.960e+07   Mean   :  7215.3  
##  3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
##  Max.   :1.319e+09   Max.   :113523.1  
## 
## print the structure of the data with str(): the dimensions, plus the type
## and first few values of each column
str(gapminder)
## tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame)
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num [1:1704] 28.8 30.3 32 34 36.1 ...
##  $ pop      : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num [1:1704] 779 821 853 836 740 ...

Introducing ggplot

Bar Charts

There are two types of bar charts:

  • geom_bar()
  • geom_col()

geom_bar() makes the height of the bar proportional to the number of cases in each group (..count..), (or the sum of the weights if the weight aesthetic is supplied).

geom_col() instead has the heights of the bars representing the values in the data.

## geom_bar: only an x aesthetic is supplied here - compare this plot with the
## geom_col plot that follows to see the differences
ggp_bar <- ggplot(gapminder, mapping = aes(x = continent)) +
    geom_bar()

## what does 'count' on the y axis represent in this graph?
ggp_bar

## geom_col
## the data argument can be wrangled in place with pipes - here gapminder is
## filtered down to the Oceania countries in 2007 before it is plotted.
ggp_col <- ggplot(
    data = gapminder %>% filter(year == 2007, continent == "Oceania"),
    mapping = aes(x = country, y = pop)
) +
    geom_col()
## here, the y axis shows the value of the variable passed to the y aesthetic.
ggp_col

## Now build on the geom_col plot above by adding more layers

ggp_col2 <- ggplot(
    data = gapminder %>% filter(year == 2007, continent == "Oceania"),
    mapping = aes(x = country, y = pop)
) +
    ## mapping fill to a variable adds a legend automatically
    geom_col(mapping = aes(fill = country)) +
    ## try changing the breaks
    scale_y_continuous(breaks = seq(from = 0, to = 21000000, by = 2000000)) +
    ggtitle("Population of Oceania by country in 2007") +
    labs(x = "Country", y = "Population", fill = "Country") +
    theme_bw()

ggp_col2

Scatter plots using geom_point()

## plot a scatter plot with geom_point
## work through this a bit at a time to understand what each section does

ggp_scatter <- ggplot(data = gapminder, # data
              aes(x = gdpPercap, y = lifeExp, colour = continent)) + #aesthetics
    geom_point(alpha = 0.5) + #geometries
    ## method = NULL leaves the choice of smoother to geom_smooth() (it reports the
    ## method it picked when the plot is printed); se = TRUE draws the confidence band.
    ## TRUE is spelled out because T is an ordinary variable that can be reassigned.
    geom_smooth(method = NULL, se = TRUE) +
    facet_wrap(~continent, scales = "free") + #facets
    labs(title = "Life expectancy at birth vs. Per capita GDP", x = "gdpPercap", y = "lifeExp", colour = "Continent") + #labels
    theme_light() + #themes
    theme(axis.text.x = element_text(angle = 45),
          axis.text = element_text(size = 7),
          legend.position = "bottom") +
    NULL #good to include a NULL because of the + convention.

ggp_scatter
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## repeat these plots using different variables

Histograms using geom_histogram()

Histograms are useful to show distributions.

## plot a histogram showing distribution of life expectancy
ggp_hist <- ggplot(data = gapminder, aes(x = lifeExp)) +
    ## after_stat(density) puts the bars on a density scale instead of counts so they
    ## share a y axis with geom_density(); it replaces the deprecated ..density.. notation.
    ## change binwidth or substitute out for bins =
    geom_histogram(aes(y = after_stat(density)), binwidth = 2, colour = "black", fill = "lightblue") +
    ## one density curve per continent, drawn semi-transparent over the histogram
    geom_density(aes(colour = continent, fill = continent), alpha = 0.1) +
    labs(title = "Life Expectancy", x = "Life Expectancy", y = "Density") +
    theme_light() +
    NULL
## instead of having different density plots overlaying the same graph, try facet_wrap().
ggp_hist

# ggp_hist2 <- ggplot() +
#     geom_histogram() +
#     geom_density() +
#     NULL

#ggp_hist2
## make another histogram with different variables

# ggp_hist3 <- ggplot() +
#     geom_histogram() +
#     NULL

#ggp_hist3

Box plots using geom_boxplot()

Boxplots are useful for visualising summary statistics and identifying outliers in the data.

## plot a boxplot

ggp_box <- ggplot(data = gapminder, aes(x = fct_reorder(continent, lifeExp), y = lifeExp, fill = continent)) + # useful reordering function supplied to the x aesthetic. Remove it and see the effects.
    geom_boxplot(outlier.color = NA) + #removed outliers in this plot because we are layering on geom_point() of the data, try commenting out the geom_point() and removing the outlier.color = NA command to see how outliers are presented by default.
    geom_jitter(alpha = 0.2) + #geom_jitter is a variation on geom_point(). Swap out for geom_point() to see the effects.
    scale_y_continuous(breaks = round(seq(min(gapminder$lifeExp), max(gapminder$lifeExp), by = 10),0)) + # seq() is for sequence, change the by = 10 to a different value to see the effects.
    coord_flip() + # comment coord_flip() out to see the effects.
    ## labs() names aesthetics, not screen positions: after coord_flip() x is still
    ## continent and y is still lifeExp, so the labels must follow the aesthetics.
    labs(title = "Boxplot of life expectancy by continent", x = "Continent", y = "Life Expectancy") +
    theme_light() +
    theme(legend.position = "none") + # removed the legend as not visually necessary in this plot.
    NULL

ggp_box

## wrangle gapminder data to find top 5 countries in the year 2007 with the highest life expectancy and create a new boxplot.
## exercise template: filter() and arrange() are empty placeholders to complete.
## Until they are filled in, no rows are removed or reordered, so head(5) simply
## returns the first 5 rows of gapminder.
top5 <- gapminder %>%
    filter() %>% #filter for year
    arrange() %>% #arrange by life expectancy (wrap the column in desc() to put the highest first)
    head(5)

Additional features and visualisations

Add additional columns that calculate the percentage change in life expectancy, population and GDP per capita between consecutive observations (the data is recorded every five years) using lag().

## wrangle some features
## for each country, calculate the percent change of each measure relative to
## that country's previous observation
df_rate_change <- gapminder %>%
    ## janitor::clean_names() is a handy way to make column names more coding
    ## friendly (lifeExp -> life_exp, gdpPercap -> gdp_percap).
    janitor::clean_names() %>%
    group_by(country) %>%
    ## lag() reads the previous row within each country, so the rows need to be in
    ## year order first.
    arrange(year) %>%
    ## (value / previous value - 1) * 100 gives the change as a %, rounded to 1
    ## decimal place.
    mutate(
        pc_lifeexp_change = round((life_exp / lag(life_exp) - 1) * 100, 1),
        pc_pop_change = round((pop / lag(pop) - 1) * 100, 1),
        pc_gdp_cap_change = round((gdp_percap / lag(gdp_percap) - 1) * 100, 1)
    ) %>%
    arrange(country)

## add in any other additional features you would like to visualise with mutate()
head(df_rate_change)
## # A tibble: 6 × 9
## # Groups:   country [1]
##   country     continent  year life_exp      pop gdp_pe…¹ pc_li…² pc_po…³ pc_gd…⁴
##   <fct>       <fct>     <int>    <dbl>    <int>    <dbl>   <dbl>   <dbl>   <dbl>
## 1 Afghanistan Asia       1952     28.8  8425333     779.    NA      NA      NA  
## 2 Afghanistan Asia       1957     30.3  9240934     821.     5.3     9.7     5.3
## 3 Afghanistan Asia       1962     32.0 10267083     853.     5.5    11.1     3.9
## 4 Afghanistan Asia       1967     34.0 11537966     836.     6.3    12.4    -2  
## 5 Afghanistan Asia       1972     36.1 13079460     740.     6.1    13.4   -11.5
## 6 Afghanistan Asia       1977     38.4 14880372     786.     6.5    13.8     6.2
## # … with abbreviated variable names ¹​gdp_percap, ²​pc_lifeexp_change,
## #   ³​pc_pop_change, ⁴​pc_gdp_cap_change

Here we will wrangle the dataframe we have just created (df_rate_change) to plot the percent change in population and GDP per capita for two countries per continent: the one with the highest life expectancy and the one with the lowest life expectancy, using 2007 as an example year to filter on.

## pick the countries whose percent changes will be plotted in the line graphs below
## first, find the country with the lowest life expectancy in 2007 in each continent
## NOTE: despite the *_pop_2007 names, both vectors are selected on life_exp, not pop.
low_pop_2007 <- df_rate_change %>%
    filter(year == 2007) %>%
    group_by(continent) %>%
    arrange(life_exp) %>%
    slice(1) %>% # slice() takes the first row per group - here it is important that the previous steps are correct. head() will only take the 1st row of the dataframe.
    ungroup() %>%
    pull(country) #pull() returns the country column as a vector (a factor here), use class(low_pop_2007) to see the structure of the outcome.

## then find the country with the highest life expectancy in 2007 in each continent
high_pop_2007 <- df_rate_change %>%
    filter(year == 2007) %>%
    group_by(continent) %>%
    arrange(desc(life_exp)) %>% # nesting desc() within arrange() will arrange in descending order.
    slice(1) %>%
    ungroup() %>%
    pull(country)

countries_to_plot <- c(as.character(low_pop_2007), as.character(high_pop_2007)) # have to change to character type from factors. See what happens if you remove the as.character() calls.
countries_to_plot
##  [1] "Swaziland"   "Haiti"       "Afghanistan" "Turkey"      "New Zealand"
##  [6] "Reunion"     "Canada"      "Japan"       "Iceland"     "Australia"
## plot the percent change in GDP per capita and in population for the selected
## countries, one facet row per continent
ggp_pc_rate <- ggplot(data = df_rate_change %>%
                  filter(country %in% countries_to_plot), aes(x = year, colour = country)) +
    ## GDP per capita change: points joined by a dashed line
    geom_point(aes(y = pc_gdp_cap_change)) +
    geom_line(aes(y = pc_gdp_cap_change), linetype = "dashed") + # difficult to add in the legend of linetypes when layering graphs like this.
    ## population change: points joined by a solid line
    geom_point(aes(y = pc_pop_change)) +
    geom_line(aes(y = pc_pop_change)) +
    facet_grid(rows = vars(continent)) +
    scale_x_continuous(breaks = seq(min(df_rate_change$year), max(df_rate_change$year), by = 5)) +
    scale_y_continuous(breaks = seq(-50, 50, by = 25)) +
    ## ylim is written out in full rather than relying on partial matching of y =
    coord_cartesian(xlim = c(1957, 2007), ylim = c(-50, 50)) +
    labs(title = "Change in GDP per capita and population over time", x = "Year", y = "Percent change (%)") +
    theme_light() +
    NULL

ggp_pc_rate

## ggplot loves long data. Reshape the two percent-change columns into long
## format: one row per country, year and type of change.

df_rate_change_pivot <- df_rate_change %>%
    filter(country %in% countries_to_plot) %>%
    select(country, continent, year, pc_pop_change, pc_gdp_cap_change) %>%
    pivot_longer(
        cols = c(pc_pop_change, pc_gdp_cap_change),
        names_to = "pc_change_type", # holds the name of the original column
        values_to = "pc_change" # holds the percent change itself
    )

## now make the graph
## with long data the type of change is a column, so it can be mapped to linetype
## and gets its own legend.
ggp_pc_rate_pivot <- ggplot(data = df_rate_change_pivot, aes(x = year, y = pc_change, colour = country)) +
    geom_point() +
    geom_line(aes(linetype = pc_change_type)) +
    facet_grid(rows = vars(continent)) +
    scale_x_continuous(breaks = seq(min(df_rate_change$year), max(df_rate_change$year), by = 5)) +
    scale_y_continuous(breaks = seq(-50, 50, by = 25)) +
    ## ylim is written out in full rather than relying on partial matching of y =
    coord_cartesian(xlim = c(1957, 2007), ylim = c(-50, 50)) +
    labs(title = "Change in GDP per capita and population over time", x = "Year", y = "Percent change (%)", colour = "Country", linetype = "% Change Variable") +
    theme_light() +
    NULL

ggp_pc_rate_pivot

Interactive graphs using {plotly}

A ggplot object can be passed in to ggplotly().

## build a static ggplot of life expectancy over time for the selected countries
ggp_for_plotly <- ggplot(
    data = df_rate_change %>% filter(country %in% countries_to_plot),
    mapping = aes(x = year, y = life_exp, colour = country)
) +
    ## the text aesthetic holds the hover label that ggplotly() is asked to show
    ## below via tooltip = "text"; <br> starts a new line in the tooltip.
    geom_point(
        mapping = aes(
            text = paste0(
                "Country: ", country,
                "<br>Year: ", year,
                "<br>Life Expectancy: ", life_exp,
                "<br>Life Expectancy (% change): ", pc_lifeexp_change
            )
        )
    ) +
    geom_line() +
    scale_x_continuous(
        breaks = seq(min(df_rate_change$year), max(df_rate_change$year), by = 5)
    ) +
    labs(
        title = "Change in life expectancy over time",
        x = "Year",
        y = "Life expectancy (years)",
        colour = "Country"
    ) +
    theme_light() +
    NULL


## convert the ggplot object to an interactive plotly graph
interactive_graph <- plotly::ggplotly(ggp_for_plotly, tooltip = "text")
interactive_graph