Plots in R

Preliminaries

Load packages

Check: Are the packages already installed? If not, first run

# One-time setup: uncomment and run the next line only if the packages are
# not installed yet. install.packages() needs the package names; called with
# no arguments it does not install anything specific.
# install.packages(c("tidyverse", "gapminder"))

# Attach the packages used throughout: tidyverse supplies %>%, dplyr and
# ggplot2; gapminder supplies the gapminder dataset.
library(tidyverse)
library(gapminder)

Make a copy of gapminder

# Bind the packaged dataset to df1; every later step works on df1
df1 <- gapminder::gapminder

Inspect the data

# Show the first 6 rows (6 is the default for head())
df1 %>%
  head(n = 6)
# A tibble: 6 × 6
  country     continent  year lifeExp      pop gdpPercap
  <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
1 Afghanistan Asia       1952    28.8  8425333      779.
2 Afghanistan Asia       1957    30.3  9240934      821.
3 Afghanistan Asia       1962    32.0 10267083      853.
4 Afghanistan Asia       1967    34.0 11537966      836.
5 Afghanistan Asia       1972    36.1 13079460      740.
6 Afghanistan Asia       1977    38.4 14880372      786.
# Show the last 6 rows (6 is the default for tail())
df1 %>%
  tail(n = 6)
# A tibble: 6 × 6
  country  continent  year lifeExp      pop gdpPercap
  <fct>    <fct>     <int>   <dbl>    <int>     <dbl>
1 Zimbabwe Africa     1982    60.4  7636524      789.
2 Zimbabwe Africa     1987    62.4  9216418      706.
3 Zimbabwe Africa     1992    60.4 10704340      693.
4 Zimbabwe Africa     1997    46.8 11404948      792.
5 Zimbabwe Africa     2002    40.0 11926563      672.
6 Zimbabwe Africa     2007    43.5 12311143      470.

Dimensions and variables

# Size of the data: number of rows first, then number of columns
df1 %>% dim()
[1] 1704    6
# One line per variable: its name, its class and its first few values
df1 %>% glimpse()
Rows: 1,704
Columns: 6
$ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
$ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
$ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
$ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
$ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
$ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
# Number of rows (country-year entries) recorded for each continent
count(df1, continent)
# A tibble: 5 × 2
  continent     n
  <fct>     <int>
1 Africa      624
2 Americas    300
3 Asia        396
4 Europe      360
5 Oceania      24

Descriptive statistics

## Summary of every column: level counts for the factors (country,
## continent); min, quartiles, median, mean and max for the numeric variables
summary(df1)
        country        continent        year         lifeExp     
 Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
 Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
 Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
 Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
 Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
 Australia  :  12                  Max.   :2007   Max.   :82.60  
 (Other)    :1632                                                
      pop              gdpPercap       
 Min.   :6.001e+04   Min.   :   241.2  
 1st Qu.:2.794e+06   1st Qu.:  1202.1  
 Median :7.024e+06   Median :  3531.8  
 Mean   :2.960e+07   Mean   :  7215.3  
 3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
 Max.   :1.319e+09   Max.   :113523.1  
                                       

Scatter plots

A basic plot

# Basic scatter plot: GDP per capita on the x-axis, life expectancy on the
# y-axis, one point per row of df1
ggplot(data = df1, mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

Points in red

# Same scatter plot with every point drawn in red. The color is a fixed
# value passed to geom_point(), not a variable mapped inside aes().
df1 %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(color = "red") +
  labs(title = "Seeing red?")

Mapping continent to color

# Here color is mapped to a variable inside aes(), so each continent gets
# its own color
ggplot(
  data = df1,
  mapping = aes(x = gdpPercap, y = lifeExp, color = continent)
) +
  geom_point() +
  labs(title = "One color for each continent!")

Filter one country

Select one country: China

# Keep only the rows for China.
# Note the 2 equals signs: == tests whether country equals "China".
filter(df1, country == "China")
# A tibble: 12 × 6
   country continent  year lifeExp        pop gdpPercap
   <fct>   <fct>     <int>   <dbl>      <int>     <dbl>
 1 China   Asia       1952    44    556263527      400.
 2 China   Asia       1957    50.5  637408000      576.
 3 China   Asia       1962    44.5  665770000      488.
 4 China   Asia       1967    58.4  754550000      613.
 5 China   Asia       1972    63.1  862030000      677.
 6 China   Asia       1977    64.0  943455000      741.
 7 China   Asia       1982    65.5 1000281000      962.
 8 China   Asia       1987    67.3 1084035000     1379.
 9 China   Asia       1992    68.7 1164970000     1656.
10 China   Asia       1997    70.4 1230075000     2289.
11 China   Asia       2002    72.0 1280400000     3119.
12 China   Asia       2007    73.0 1318683096     4959.

Create a new data frame for China

# Store the China-only rows in their own data frame for the plots below
df_China <- filter(df1, country == "China")

Scatter plot

# Life expectancy in China over time, drawn as points only
df_China %>%
  ggplot(aes(x = year, y = lifeExp)) +
  geom_point() +
  labs(title = "Points only!")

Line plot

# Same data, with the observations connected by a line instead of points
ggplot(data = df_China, mapping = aes(x = year, y = lifeExp)) +
  geom_line() +
  labs(title = "Line only!")

Scatter plot AND line plot

# Layers add up: the points are drawn first, then the line on top of them
df_China %>%
  ggplot(aes(x = year, y = lifeExp)) +
  geom_point() +
  geom_line() +
  labs(title = "Both points and line!")

Filter more than one country

Two countries: China, Brazil

# Keep the rows whose country is any one of several values.
# Use %in% with a vector of names here, not ==
filter(df1, country %in% c("China", "Brazil"))
# A tibble: 24 × 6
   country continent  year lifeExp       pop gdpPercap
   <fct>   <fct>     <int>   <dbl>     <int>     <dbl>
 1 Brazil  Americas   1952    50.9  56602560     2109.
 2 Brazil  Americas   1957    53.3  65551171     2487.
 3 Brazil  Americas   1962    55.7  76039390     3337.
 4 Brazil  Americas   1967    57.6  88049823     3430.
 5 Brazil  Americas   1972    59.5 100840058     4986.
 6 Brazil  Americas   1977    61.5 114313951     6660.
 7 Brazil  Americas   1982    63.3 128962939     7031.
 8 Brazil  Americas   1987    65.2 142938076     7807.
 9 Brazil  Americas   1992    67.1 155975974     6950.
10 Brazil  Americas   1997    69.4 168546719     7958.
# ℹ 14 more rows

Create a new data frame for China and Brazil

# Store the rows for China and Brazil in df2 for the plots below
df2 <- filter(df1, country %in% c("China", "Brazil"))

Scatter plot

# Both countries plotted together; with no color mapping the points of the
# two countries look the same
df2 %>%
  ggplot(aes(x = year, y = lifeExp)) +
  geom_point() +
  labs(title = "Scatterplot, in one color!")

Map country to color

# Mapping country to color inside aes() tells the two countries apart
ggplot(
  data = df2,
  mapping = aes(x = year, y = lifeExp, color = country)
) +
  geom_point() +
  labs(title = "Scatterplot, but different colors")

Line plots

# With country mapped to color, geom_line() draws one line per country
df2 %>%
  ggplot(aes(x = year, y = lifeExp, color = country)) +
  geom_line() +
  labs(title = "Line charts; one color for each country")

Axis labels, titles, subtitles and captions

# Line chart of life expectancy for China and Brazil with full annotation:
# axis label, title, subtitle and caption are all set through labs().
ggplot(df2,
       aes(x = year, y = lifeExp, color = country)) +
  geom_line() +
  labs(x = NULL, # NULL removes the X-axis label entirely; "" would keep an
                 # empty label that still takes up space under the axis
       y = "Life expectancy (years)",
       title = "Life expectancy in China and Brazil, 1952-2007",
       subtitle = "Wow, look at the increase over the period!",
       caption = "Source: World Bank via gapminder")

Exercise 1

Pick 3 countries. Draw a line chart showing income per capita (gdpPercap) over time for these countries, with a different color for each country.

Exercise 2

Use filter() to select the year 2007. Draw a bar chart showing the 8 countries with the highest GDP per capita in 2007.


ba260-plots-in-R.qmd