R: Common plots

Published

September 9, 2025

Load packages

  • Install the tidyverse and gapminder packages (if you haven’t installed them yet)
  • Load the tidyverse and gapminder packages
# install.packages("tidyverse")  ## Uncomment this line if you haven't installed tidyverse yet
# install.packages("gapminder") ## Uncomment this line if you haven't installed gapminder yet

# Load the required packages
library(tidyverse)
library(gapminder)

Dataset

Let’s inspect the gapminder dataset.

# Preview every column: its name, its type, and the first few values
gapminder |> glimpse()
Rows: 1,704
Columns: 6
$ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
$ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
$ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
$ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
$ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
$ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …

Questions

  1. How many rows and columns are there in the dataset?

  2. How many variables does the dataset have?

  3. What is the class of each variable? (Note: fct denotes a factor, R’s type for categorical variables with a fixed set of levels.)

# Show the top of the table: the first 6 rows
head(gapminder, n = 6)
# A tibble: 6 × 6
  country     continent  year lifeExp      pop gdpPercap
  <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
1 Afghanistan Asia       1952    28.8  8425333      779.
2 Afghanistan Asia       1957    30.3  9240934      821.
3 Afghanistan Asia       1962    32.0 10267083      853.
4 Afghanistan Asia       1967    34.0 11537966      836.
5 Afghanistan Asia       1972    36.1 13079460      740.
6 Afghanistan Asia       1977    38.4 14880372      786.
# Show the bottom of the table: the last 6 rows
tail(gapminder, n = 6)
# A tibble: 6 × 6
  country  continent  year lifeExp      pop gdpPercap
  <fct>    <fct>     <int>   <dbl>    <int>     <dbl>
1 Zimbabwe Africa     1982    60.4  7636524      789.
2 Zimbabwe Africa     1987    62.4  9216418      706.
3 Zimbabwe Africa     1992    60.4 10704340      693.
4 Zimbabwe Africa     1997    46.8 11404948      792.
5 Zimbabwe Africa     2002    40.0 11926563      672.
6 Zimbabwe Africa     2007    43.5 12311143      470.
# Compact structural overview: object class, dimensions, and column types
gapminder |> str()
tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame)
 $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ year     : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
 $ lifeExp  : num [1:1704] 28.8 30.3 32 34 36.1 ...
 $ pop      : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
 $ gdpPercap: num [1:1704] 779 821 853 836 740 ...
# Per-column summary: level counts for factors, quartiles and mean for numbers
gapminder |> summary()
        country        continent        year         lifeExp     
 Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
 Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
 Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
 Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
 Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
 Australia  :  12                  Max.   :2007   Max.   :82.60  
 (Other)    :1632                                                
      pop              gdpPercap       
 Min.   :6.001e+04   Min.   :   241.2  
 1st Qu.:2.794e+06   1st Qu.:  1202.1  
 Median :7.024e+06   Median :  3531.8  
 Mean   :2.960e+07   Mean   :  7215.3  
 3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
 Max.   :1.319e+09   Max.   :113523.1  
                                       
  • country and continent are factor (categorical) variables
  • year, lifeExp, pop and gdpPercap are numeric variables

Create dataframe

Let’s store gapminder in a new dataframe called df1

# Bind the gapminder data to the name df1, then print it
df1 <- gapminder
print(df1)
# A tibble: 1,704 × 6
   country     continent  year lifeExp      pop gdpPercap
   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
 1 Afghanistan Asia       1952    28.8  8425333      779.
 2 Afghanistan Asia       1957    30.3  9240934      821.
 3 Afghanistan Asia       1962    32.0 10267083      853.
 4 Afghanistan Asia       1967    34.0 11537966      836.
 5 Afghanistan Asia       1972    36.1 13079460      740.
 6 Afghanistan Asia       1977    38.4 14880372      786.
 7 Afghanistan Asia       1982    39.9 12881816      978.
 8 Afghanistan Asia       1987    40.8 13867957      852.
 9 Afghanistan Asia       1992    41.7 16317921      649.
10 Afghanistan Asia       1997    41.8 22227415      635.
# ℹ 1,694 more rows

Boxplot

Create a boxplot to compare the distribution of life expectancy by continent

# One box per continent, each summarising the spread of lifeExp
df1 |>
  ggplot(aes(x = continent, y = lifeExp)) +
  geom_boxplot(fill = "grey70") +
  labs(
    title = "Distribution of Life Expectancy by Continent",
    x = "",
    y = "Life Expectancy (Years)",
    caption = "Data: gapminder"
  )

Histogram

Create a histogram of the lifeExp (life expectancy) variable.

# Histogram of life expectancy across all country-year observations.
# `bins` is stated explicitly: 30 is the value geom_histogram() falls back to,
# so the picture is unchanged, but spelling it out avoids the
# "Pick better value" message and makes the bin count easy to tune.
ggplot(data = df1,
       mapping = aes(x = lifeExp)) +
  geom_histogram(bins = 30, color = "white", fill = "steelblue") +
  labs(title = "Histogram of Life Expectancy",
       x = "Life Expectancy (Years)",
       y = "Count",
       caption = "Data: gapminder")

Barplot

Create a barplot showing average life expectancy by continent

# STEP 1. Collapse df1 to one row per continent holding its mean lifeExp

avg_life_exp_continent <- df1 |>
  group_by(continent) |>
  summarise(
    avg_lifeExp = mean(lifeExp),
    .groups = "drop"   # return a plain, ungrouped tibble
  )

print(avg_life_exp_continent)
# A tibble: 5 × 2
  continent avg_lifeExp
  <fct>           <dbl>
1 Africa           48.9
2 Americas         64.7
3 Asia             60.1
4 Europe           71.9
5 Oceania          74.3
# STEP 2. Draw one bar per continent from the summary table

avg_life_exp_continent |>
  ggplot(aes(x = continent, y = avg_lifeExp)) +
  # `fill` paints the inside of each bar; `color` would only draw its outline
  geom_col(fill = "steelblue") +
  labs(
    title = "Average Life Expectancy by Continent",
    x = "",
    y = "Average Life Expectancy (Years)",
    caption = "Data: gapminder"
  )

Lineplot

#### THE ONE-COUNTRY CASE

# Choose the country once; the filter and the plot title both read this value

one_country <- "Argentina"

# Keep only the rows belonging to that country

data_one_country <- df1 |>
  filter(country == one_country)

# Create a line chart showing life expectancy trends for the chosen country.
# The title is built from `one_country` so it cannot drift out of sync with
# the filter. No colour aesthetic is mapped in this plot, so labs() sets no
# legend title for it.

ggplot(data = data_one_country,
       mapping = aes(x = year, y = lifeExp)) +
  geom_line() +
  labs(title = paste("Life Expectancy Trends for", one_country),
       x = "Year",
       y = "Life Expectancy (Years)",
       caption = "Data: gapminder")

Multiple lineplots

#### THE MULTIPLE-COUNTRY CASE

# Countries to compare

selected_countries <- c("Brazil", "China", "United States", "India")

# Keep only the rows whose country is in that set

data_selected_countries <- df1 |>
  filter(country %in% selected_countries)

# One line per country, distinguished by colour

data_selected_countries |>
  ggplot(aes(x = year, y = lifeExp, color = country)) +
  geom_line() +
  labs(
    title = "Life Expectancy Trends for Selected Countries",
    x = "",
    y = "Life Expectancy (Years)",
    color = "Country",
    caption = "Data: gapminder"
  )

Scatterplot

Create a scatterplot of life expectancy vs. GDP per capita

# One point per row of df1: GDP per capita on x, life expectancy on y
df1 |>
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  labs(
    title = "Life Expectancy vs. GDP per capita",
    subtitle = "Each point on the plot represents a country in a specific year",
    x = "GDP per capita ($)",
    y = "Life Expectancy (Years)",
    caption = "Data: gapminder"
  )

Add regression line to scatterplot

# Same scatterplot with red points, plus one straight-line fit over all rows
df1 |>
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(color = "red") +
  # method = "lm" fits a linear model; se = FALSE hides the confidence band
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "A Scatterplot with Regression Line",
    subtitle = "Not a great example of a regression line!",
    caption = "Data: gapminder"
  )

Add a regression line for each continent

# Mapping color to continent in ggplot() applies to both layers, so the points
# are coloured by continent and a separate line is fitted for each one
df1 |>
  ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "A Regression Line for Each Continent",
    subtitle = "Still not great!",
    caption = "Data: gapminder",
    color = "Continent"
  )

Use log of GDP per capita

# Build df2: df1 plus a log_income column holding the natural log of gdpPercap

df2 <- df1 |>
  mutate(log_income = log(gdpPercap))

# Scatterplot of life expectancy against log income

df2 |>
  ggplot(aes(x = log_income, y = lifeExp)) +
  geom_point()

# Same axes with red points and a fitted regression line (no confidence band)

df2 |>
  ggplot(aes(x = log_income, y = lifeExp)) +
  geom_point(color = "red") +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "A Much Better Regression Line!",
    subtitle = "Using logs can make a difference when the numbers are large",
    x = "log(GDP per capita)",
    y = "Life expectancy (years)",
    caption = "Data: gapminder"
  )

EXERCISE

  1. Create a histogram of the gdpPercap variable.

  2. Create a barplot to show the total population per continent in 2007.

  3. Create a scatterplot of pop vs. lifeExp and add colors for each continent.


SP | 04-common-plots.qmd