library(tidyverse)
library(gapminder)
Plots in R
Preliminaries
Load packages
Check: Are the packages already installed? If not, first run
install.packages()
Make a copy of gapminder
## keep the packaged gapminder data untouched; work on the copy df1
df1 <- gapminder
Inspect the data
df1 %>%
  head() ## the first 6 rows
# A tibble: 6 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779.
2 Afghanistan Asia 1957 30.3 9240934 821.
3 Afghanistan Asia 1962 32.0 10267083 853.
4 Afghanistan Asia 1967 34.0 11537966 836.
5 Afghanistan Asia 1972 36.1 13079460 740.
6 Afghanistan Asia 1977 38.4 14880372 786.
df1 %>%
  tail() ## the last 6 rows
# A tibble: 6 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Zimbabwe Africa 1982 60.4 7636524 789.
2 Zimbabwe Africa 1987 62.4 9216418 706.
3 Zimbabwe Africa 1992 60.4 10704340 693.
4 Zimbabwe Africa 1997 46.8 11404948 792.
5 Zimbabwe Africa 2002 40.0 11926563 672.
6 Zimbabwe Africa 2007 43.5 12311143 470.
Dimensions and variables
dim(df1) ## number of rows, then number of columns
[1] 1704 6
glimpse(df1) ## one line per variable: its name, its type, and its first few values
Rows: 1,704
Columns: 6
$ country <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
$ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
$ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
$ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
$ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
$ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
df1 %>% count(continent) ## how many entries for each continent?
# A tibble: 5 × 2
continent n
<fct> <int>
1 Africa 624
2 Americas 300
3 Asia 396
4 Europe 360
5 Oceania 24
Descriptive statistics
summary(df1) ## for every column: level counts for factors, quartiles and mean for numeric variables
country continent year lifeExp
Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60
Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20
Algeria : 12 Asia :396 Median :1980 Median :60.71
Angola : 12 Europe :360 Mean :1980 Mean :59.47
Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85
Australia : 12 Max. :2007 Max. :82.60
(Other) :1632
pop gdpPercap
Min. :6.001e+04 Min. : 241.2
1st Qu.:2.794e+06 1st Qu.: 1202.1
Median :7.024e+06 Median : 3531.8
Mean :2.960e+07 Mean : 7215.3
3rd Qu.:1.959e+07 3rd Qu.: 9325.5
Max. :1.319e+09 Max. :113523.1
Scatter plots
A basic plot
## GDP per capita on the x-axis, life expectancy on the y-axis, one point per row
ggplot(data = df1, mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point()
Points in red
## color is set outside aes(), so every point gets the same fixed color
ggplot(data = df1, mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point(color = "red") +
  labs(title = "Seeing red?")
Mapping continent to color
## color is mapped inside aes(), so it varies with the continent column
ggplot(
  data = df1,
  mapping = aes(x = gdpPercap, y = lifeExp, color = continent)
) +
  geom_point() +
  labs(title = "One color for each continent!")
Filter one country
Select one country: China
df1 %>%
  filter(country == "China") ### Note the 2 equals signs!
# A tibble: 12 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 China Asia 1952 44 556263527 400.
2 China Asia 1957 50.5 637408000 576.
3 China Asia 1962 44.5 665770000 488.
4 China Asia 1967 58.4 754550000 613.
5 China Asia 1972 63.1 862030000 677.
6 China Asia 1977 64.0 943455000 741.
7 China Asia 1982 65.5 1000281000 962.
8 China Asia 1987 67.3 1084035000 1379.
9 China Asia 1992 68.7 1164970000 1656.
10 China Asia 1997 70.4 1230075000 2289.
11 China Asia 2002 72.0 1280400000 3119.
12 China Asia 2007 73.0 1318683096 4959.
Create a new data frame for China
## keep only the rows for China in a separate data frame
df_China <- df1 %>%
  filter(country == "China")
Scatter plot
## life expectancy in China by year, drawn as points
ggplot(data = df_China, mapping = aes(x = year, y = lifeExp)) +
  geom_point() +
  labs(title = "Points only!")
Line plot
## same mapping as the scatter plot, drawn as a connected line
ggplot(data = df_China, mapping = aes(x = year, y = lifeExp)) +
  geom_line() +
  labs(title = "Line only!")
Scatter plot AND line plot
## layers are added with +; both geoms use the mapping given in ggplot()
ggplot(data = df_China, mapping = aes(x = year, y = lifeExp)) +
  geom_point() +
  geom_line() +
  labs(title = "Both points and line!")
Filter more than one country
Two countries: China, Brazil
df1 %>%
  filter(country %in% c("China", "Brazil")) ## use %in%, not ==
# A tibble: 24 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Brazil Americas 1952 50.9 56602560 2109.
2 Brazil Americas 1957 53.3 65551171 2487.
3 Brazil Americas 1962 55.7 76039390 3337.
4 Brazil Americas 1967 57.6 88049823 3430.
5 Brazil Americas 1972 59.5 100840058 4986.
6 Brazil Americas 1977 61.5 114313951 6660.
7 Brazil Americas 1982 63.3 128962939 7031.
8 Brazil Americas 1987 65.2 142938076 7807.
9 Brazil Americas 1992 67.1 155975974 6950.
10 Brazil Americas 1997 69.4 168546719 7958.
# ℹ 14 more rows
Create a new dataframe
## keep only the rows for China and Brazil in a separate data frame
df2 <- df1 %>%
  filter(country %in% c("China", "Brazil"))
Scatter plot
## no color mapping, so points from both countries look the same
ggplot(data = df2, mapping = aes(x = year, y = lifeExp)) +
  geom_point() +
  labs(title = "Scatterplot, in one color!")
Map country to color
## mapping country to color tells the two countries apart
ggplot(
  data = df2,
  mapping = aes(x = year, y = lifeExp, color = country)
) +
  geom_point() +
  labs(title = "Scatterplot, but different colors")
Line plots
## same mapping as the colored scatter plot, drawn with lines instead of points
ggplot(
  data = df2,
  mapping = aes(x = year, y = lifeExp, color = country)
) +
  geom_line() +
  labs(title = "Line charts; one color for each country")
Axis labels, titles, subtitles and captions
## labs() sets the axis labels, title, subtitle and caption in one call
ggplot(
  data = df2,
  mapping = aes(x = year, y = lifeExp, color = country)
) +
  geom_line() +
  labs(
    x = "", # an empty string leaves the X-axis without a label
    y = "Life expectancy (years)",
    title = "Life expectancy in China and Brazil, 1952-2007",
    subtitle = "Wow, look at the increase over the period!",
    caption = "Source: World Bank via gapminder"
  )
Exercise 1
Pick 3 countries. Draw a line chart showing income per capita (gdpPercap) over time for the countries, with a different color for each.
Exercise 2
Use filter() to select the year 2007. Draw a bar chart showing the 8 countries with the highest GDP per capita in 2007.
ba260-plots-in-R.qmd