# install.packages("tidyverse") ## Uncomment this line if you haven't installed tidyverse yet
# install.packages("gapminder") ## Uncomment this line if you haven't installed gapminder yet
# Load the required packages
library(tidyverse)
library(gapminder)
R: Common plots
Load packages
- Install the tidyverse and gapminder packages (if you haven’t installed them yet)
- Load the tidyverse and gapminder packages
Dataset
Let’s inspect the `gapminder` dataset.
# See all the columns (variables), their class (or data type), and a few observations.
# glimpse() prints one line per column, so every variable stays visible even
# when the data frame is wide.
glimpse(gapminder)
Rows: 1,704
Columns: 6
$ country <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
$ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
$ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
$ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
$ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
$ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
Questions
How many rows and columns are there in the dataset?
How many variables does the dataset have?
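What is the class of each variable? (Note: `fct` stands for factor, R’s type for categorical variables. It is not a character variable, even though its levels are displayed as text.)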
# Display the first 6 rows (6 is the default for head(); pass n = ... to change it)
head(gapminder)
# A tibble: 6 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779.
2 Afghanistan Asia 1957 30.3 9240934 821.
3 Afghanistan Asia 1962 32.0 10267083 853.
4 Afghanistan Asia 1967 34.0 11537966 836.
5 Afghanistan Asia 1972 36.1 13079460 740.
6 Afghanistan Asia 1977 38.4 14880372 786.
# Display the last 6 rows (6 is the default for tail(); pass n = ... to change it)
tail(gapminder)
# A tibble: 6 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Zimbabwe Africa 1982 60.4 7636524 789.
2 Zimbabwe Africa 1987 62.4 9216418 706.
3 Zimbabwe Africa 1992 60.4 10704340 693.
4 Zimbabwe Africa 1997 46.8 11404948 792.
5 Zimbabwe Africa 2002 40.0 11926563 672.
6 Zimbabwe Africa 2007 43.5 12311143 470.
# See the structure of the data: the object's class and dimensions, then each
# column's type together with its first few values
str(gapminder)
tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame)
$ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
$ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
$ year : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
$ lifeExp : num [1:1704] 28.8 30.3 32 34 36.1 ...
$ pop : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
$ gdpPercap: num [1:1704] 779 821 853 836 740 ...
# Obtain summary statistics for the data: counts per level for factor columns,
# and min / quartiles / median / mean / max for numeric columns
summary(gapminder)
country continent year lifeExp
Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60
Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20
Algeria : 12 Asia :396 Median :1980 Median :60.71
Angola : 12 Europe :360 Mean :1980 Mean :59.47
Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85
Australia : 12 Max. :2007 Max. :82.60
(Other) :1632
pop gdpPercap
Min. :6.001e+04 Min. : 241.2
1st Qu.:2.794e+06 1st Qu.: 1202.1
Median :7.024e+06 Median : 3531.8
Mean :2.960e+07 Mean : 7215.3
3rd Qu.:1.959e+07 3rd Qu.: 9325.5
Max. :1.319e+09 Max. :113523.1
- `country` and `continent` are factor (categorical) variables
- `year`, `lifeExp`, `pop` and `gdpPercap` are numeric variables
Create dataframe
Let’s store `gapminder` in a new dataframe called `df1`.
# Copy gapminder into a working data frame so the packaged dataset itself is
# never modified, then print it to confirm the copy
df1 <- gapminder
df1
# A tibble: 1,704 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779.
2 Afghanistan Asia 1957 30.3 9240934 821.
3 Afghanistan Asia 1962 32.0 10267083 853.
4 Afghanistan Asia 1967 34.0 11537966 836.
5 Afghanistan Asia 1972 36.1 13079460 740.
6 Afghanistan Asia 1977 38.4 14880372 786.
7 Afghanistan Asia 1982 39.9 12881816 978.
8 Afghanistan Asia 1987 40.8 13867957 852.
9 Afghanistan Asia 1992 41.7 16317921 649.
10 Afghanistan Asia 1997 41.8 22227415 635.
# ℹ 1,694 more rows
Boxplot
Create a boxplot to compare the distribution of life expectancy by continent
# Boxplot: one box per continent summarising the spread of life expectancy
df1 |>
  ggplot(aes(x = continent, y = lifeExp)) +
  geom_boxplot(fill = "grey70") +
  labs(
    title = "Distribution of Life Expectancy by Continent",
    x = "",
    y = "Life Expectancy (Years)",
    caption = "Data: gapminder"
  )
Histogram
Create a histogram of the `lifeExp` variable.
# Histogram of life expectancy across all country-year observations.
# `bins = 30` spells out the value geom_histogram() otherwise falls back to
# (while printing a console message), so the plot is unchanged but the choice
# is explicit and easy to tune.
ggplot(data = df1,
       mapping = aes(x = lifeExp)) +
  geom_histogram(bins = 30, color = "white", fill = "steelblue") +
  labs(title = "Histogram of Life Expectancy",
       x = "Life Expectancy (Years)",
       y = "Count",
       caption = "Data: gapminder")
Barplot
Create a barplot showing average life expectancy by continent
# STEP 1. Calculate the average life expectancy by continent
# group_by() + summarise() collapses the data to one row per continent
avg_life_exp_continent <- df1 |>
  group_by(continent) |>
  summarise(avg_lifeExp = mean(lifeExp))

# Print the summary table to check the result
avg_life_exp_continent
# A tibble: 5 × 2
continent avg_lifeExp
<fct> <dbl>
1 Africa 48.9
2 Americas 64.7
3 Asia 60.1
4 Europe 71.9
5 Oceania 74.3
# STEP 2. Create a bar chart from the summarized data
avg_life_exp_continent |>
  ggplot(aes(x = continent, y = avg_lifeExp)) +
  # `fill` sets the bar interior; `color` would only change the outline
  geom_col(fill = "steelblue") +
  labs(
    title = "Average Life Expectancy by Continent",
    x = "",
    y = "Average Life Expectancy (Years)",
    caption = "Data: gapminder"
  )
Lineplot
#### THE ONE-COUNTRY CASE
# Filter the data for one country
one_country <- "Argentina"

data_one_country <- df1 |>
  filter(country == one_country)

# Create a line chart showing life expectancy trends for one country.
# The title is built from `one_country`, so changing the country above updates
# the plot title as well. No color label is set because no color aesthetic is
# mapped in this plot.
ggplot(data = data_one_country,
       mapping = aes(x = year, y = lifeExp)) +
  geom_line() +
  labs(title = paste("Life Expectancy Trends for", one_country),
       x = "Year",
       y = "Life Expectancy (Years)",
       caption = "Data: gapminder")
Multiple lineplots
#### THE MULTIPLE-COUNTRY CASE
# Filter the data for multiple countries
selected_countries <- c("Brazil", "China", "United States", "India")

# %in% keeps the rows whose country matches any entry of the vector
data_selected_countries <- df1 |>
  filter(country %in% selected_countries)

# Create a line chart for selected countries; mapping color to country draws
# one line per country and adds a legend titled "Country"
ggplot(data_selected_countries,
       aes(x = year, y = lifeExp, color = country)) +
  geom_line() +
  labs(title = "Life Expectancy Trends for Selected Countries",
       x = "",
       y = "Life Expectancy (Years)",
       color = "Country",
       caption = "Data: gapminder")
Scatterplot
Create a scatterplot of life expectancy vs. GDP per capita
# Scatterplot: GDP per capita on the x-axis against life expectancy on the y-axis
df1 |>
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  labs(
    title = "Life Expectancy vs. GDP per capita",
    subtitle = "Each point on the plot represents a country in a specific year",
    x = "GDP per capita ($)",
    y = "Life Expectancy (Years)",
    caption = "Data: gapminder"
  )
Add regression line to scatterplot
# Scatterplot with a single straight-line fit over all observations.
# `formula = y ~ x` states the default explicitly, which avoids the console
# message geom_smooth() prints when it has to pick the formula itself.
ggplot(df1,
       aes(x = gdpPercap, y = lifeExp)) +
  geom_point(color = "red") + ## added color to the points
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + ## se = FALSE hides the confidence band
  labs(title = "A Scatterplot with Regression Line",
       subtitle = "Not a great example of a regression line!",
       caption = "Data: gapminder")
Add a regression line for each continent
# Because color = continent is set in the plot-level aes(), both the points and
# geom_smooth() are grouped by continent, giving one fitted line per continent.
# `formula = y ~ x` states the default explicitly and avoids the console
# message geom_smooth() prints when it has to pick the formula itself.
ggplot(df1,
       aes(x = gdpPercap, y = lifeExp, color = continent)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(title = "A Regression Line for Each Continent",
       subtitle = "Still not great!",
       caption = "Data: gapminder",
       color = "Continent")
Use log of GDP per capita
# Create a new variable and add it to the df1 dataframe. Call the new dataframe df2.
# log() is the natural logarithm; df1 itself is left unchanged.
df2 <- df1 |>
  mutate(log_income = log(gdpPercap))

# Create a scatterplot with log_income on the X-axis
ggplot(df2,
       aes(x = log_income, y = lifeExp)) +
  geom_point()
# Add a regression line.
# `formula = y ~ x` states the default explicitly, which avoids the console
# message geom_smooth() prints when it has to pick the formula itself.
ggplot(df2,
       aes(x = log_income, y = lifeExp)) +
  geom_point(color = "red") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(title = "A Much Better Regression Line!",
       subtitle = "Using logs can make a difference when the numbers are large",
       x = "log(GDP per capita)", y = "Life expectancy (years)",
       caption = "Data: gapminder")
EXERCISE
- Create a histogram of the `gdpPercap` variable.
- Create a barplot to show the total population per continent in 2007.
- Create a scatterplot of `pop` vs. `lifeExp` and add colors for each continent.
SP | 04-common-plots.qmd