Load packages
library(tidyverse)
Dataset
Import the gapminder dataset in one of two ways:
library(gapminder) to load the gapminder package, OR
read_csv("filename.csv") to read the csv file
df1 <- read_csv("gapminder-data.csv") # IMPORTANT! The file must be in the working directory
Inspect the data
# Preview the first rows of the data (head() shows six by default)
df1 %>% head()
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
# Column-wise overview: one line per variable with its type and first values
df1 %>% glimpse()
## Rows: 1,704
## Columns: 6
## $ country <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
Descriptive statistics
# Per-column summary: counts for factors, quartiles and mean for numerics
df1 %>% summary()
## country continent year lifeExp
## Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60
## Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20
## Algeria : 12 Asia :396 Median :1980 Median :60.71
## Angola : 12 Europe :360 Mean :1980 Mean :59.47
## Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85
## Australia : 12 Max. :2007 Max. :82.60
## (Other) :1632
## pop gdpPercap
## Min. :6.001e+04 Min. : 241.2
## 1st Qu.:2.794e+06 1st Qu.: 1202.1
## Median :7.024e+06 Median : 3531.8
## Mean :2.960e+07 Mean : 7215.3
## 3rd Qu.:1.959e+07 3rd Qu.: 9325.5
## Max. :1.319e+09 Max. :113523.1
##
Data manipulation
Use rename() to rename the variable gdpPercap
Use mutate() to create a new variable, the log of gdpPercap
Use filter() to filter data (i.e., select rows) for one year
# Preview the transformation without storing it:
#   1. rename gdpPercap to income_per_capita
#   2. add log_y, the natural log of income_per_capita
#   3. keep only the rows for 2007
df1 %>%
  rename(income_per_capita = gdpPercap) %>%
  mutate(log_y = log(income_per_capita)) %>%
  filter(year == 2007)
## # A tibble: 142 × 7
## country continent year lifeExp pop income_per_capita log_y
## <fct> <fct> <int> <dbl> <int> <dbl> <dbl>
## 1 Afghanistan Asia 2007 43.8 31889923 975. 6.88
## 2 Albania Europe 2007 76.4 3600523 5937. 8.69
## 3 Algeria Africa 2007 72.3 33333216 6223. 8.74
## 4 Angola Africa 2007 42.7 12420476 4797. 8.48
## 5 Argentina Americas 2007 75.3 40301927 12779. 9.46
## 6 Australia Oceania 2007 81.2 20434176 34435. 10.4
## 7 Austria Europe 2007 79.8 8199783 36126. 10.5
## 8 Bahrain Asia 2007 75.6 708573 29796. 10.3
## 9 Bangladesh Asia 2007 64.1 150448339 1391. 7.24
## 10 Belgium Europe 2007 79.4 10392226 33693. 10.4
## # … with 132 more rows
# Same pipeline as above, but keep the result: df2 holds the 2007 rows
# with the renamed income column and its natural log (log_y).
df2 <- df1 %>%
  rename(income_per_capita = gdpPercap) %>%
  mutate(log_y = log(income_per_capita)) %>%
  filter(year == 2007)

# Print the stored data frame to confirm it matches the preview
df2
## # A tibble: 142 × 7
## country continent year lifeExp pop income_per_capita log_y
## <fct> <fct> <int> <dbl> <int> <dbl> <dbl>
## 1 Afghanistan Asia 2007 43.8 31889923 975. 6.88
## 2 Albania Europe 2007 76.4 3600523 5937. 8.69
## 3 Algeria Africa 2007 72.3 33333216 6223. 8.74
## 4 Angola Africa 2007 42.7 12420476 4797. 8.48
## 5 Argentina Americas 2007 75.3 40301927 12779. 9.46
## 6 Australia Oceania 2007 81.2 20434176 34435. 10.4
## 7 Austria Europe 2007 79.8 8199783 36126. 10.5
## 8 Bahrain Asia 2007 75.6 708573 29796. 10.3
## 9 Bangladesh Asia 2007 64.1 150448339 1391. 7.24
## 10 Belgium Europe 2007 79.4 10392226 33693. 10.4
## # … with 132 more rows
Scatterplots
# Fig. 1: raw scatterplot of life expectancy against income per capita
ggplot(data = df2, mapping = aes(x = income_per_capita, y = lifeExp)) +
  geom_point() +
  labs(title = "Fig. 1. Life expectancy vs Income per capita, 2007")

# Fig. 2: same scatterplot with a fitted linear-model line
# (se = FALSE hides the confidence band)
ggplot(data = df2, mapping = aes(x = income_per_capita, y = lifeExp)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  labs(title = "Fig. 2. Life expectancy vs Income per capita, 2007")

# Fig. 3: log of income on the x-axis, again with a linear fit
ggplot(data = df2, mapping = aes(x = log_y, y = lifeExp)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  labs(title = "Fig. 3. Life expectancy vs log(Income per capita), 2007")
Get the 8 countries with the highest per-capita incomes
Use arrange() or arrange(desc()) to arrange countries based on income per capita
Use head() or tail() to get the top rows or bottom rows
df2 %>%
arrange(desc(income_per_capita))
## # A tibble: 142 × 7
## country continent year lifeExp pop income_per_capita log_y
## <fct> <fct> <int> <dbl> <int> <dbl> <dbl>
## 1 Norway Europe 2007 80.2 4627926 49357. 10.8
## 2 Kuwait Asia 2007 77.6 2505559 47307. 10.8
## 3 Singapore Asia 2007 80.0 4553009 47143. 10.8
## 4 United States Americas 2007 78.2 301139947 42952. 10.7
## 5 Ireland Europe 2007 78.9 4109086 40676. 10.6
## 6 Hong Kong, China Asia 2007 82.2 6980412 39725. 10.6
## 7 Switzerland Europe 2007 81.7 7554661 37506. 10.5
## 8 Netherlands Europe 2007 79.8 16570613 36798. 10.5
## 9 Canada Americas 2007 80.7 33390141 36319. 10.5
## 10 Iceland Europe 2007 81.8 301931 36181. 10.5
## # … with 132 more rows
# Sort by income per capita (highest first) and keep the first 8 rows,
# storing them in df2_top8 for the barplots.
df2_top8 <- df2 %>%
  arrange(desc(income_per_capita)) %>%
  head(n = 8)

# Print the stored data frame
df2_top8
## # A tibble: 8 × 7
## country continent year lifeExp pop income_per_capita log_y
## <fct> <fct> <int> <dbl> <int> <dbl> <dbl>
## 1 Norway Europe 2007 80.2 4627926 49357. 10.8
## 2 Kuwait Asia 2007 77.6 2505559 47307. 10.8
## 3 Singapore Asia 2007 80.0 4553009 47143. 10.8
## 4 United States Americas 2007 78.2 301139947 42952. 10.7
## 5 Ireland Europe 2007 78.9 4109086 40676. 10.6
## 6 Hong Kong, China Asia 2007 82.2 6980412 39725. 10.6
## 7 Switzerland Europe 2007 81.7 7554661 37506. 10.5
## 8 Netherlands Europe 2007 79.8 16570613 36798. 10.5
Barplots
# Fig. 4: vertical bars; country is plotted in its default factor order
ggplot(data = df2_top8, mapping = aes(x = country, y = income_per_capita)) +
  geom_col() +
  labs(
    x = "",
    y = "Income per capita ($)",
    title = "Fig. 4. Barplot; alphabetically ordered",
    subtitle = "Top 8: Highest per-capita GDP in 2007"
  )

# Fig. 5: same bars drawn horizontally via coord_flip()
ggplot(data = df2_top8, mapping = aes(x = country, y = income_per_capita)) +
  geom_col() +
  coord_flip() +
  labs(
    x = "",
    y = "Income per capita ($)",
    title = "Fig. 5. Barplot with axes flipped",
    subtitle = "Top 8: Highest per-capita GDP in 2007"
  )

# Fig. 6: countries reordered by income_per_capita before plotting
ggplot(
  data = df2_top8,
  mapping = aes(
    x = reorder(country, income_per_capita),
    y = income_per_capita
  )
) +
  geom_col() +
  coord_flip() +
  labs(
    x = "",
    y = "Income per capita ($)",
    title = "Fig. 6 Descending order",
    subtitle = "Top 8: Highest per-capita GDP in 2007"
  )

# Fig. 7: countries reordered by negated income, reversing the Fig. 6 order
ggplot(
  data = df2_top8,
  mapping = aes(
    x = reorder(country, -income_per_capita),
    y = income_per_capita
  )
) +
  geom_col() +
  coord_flip() +
  labs(
    x = "",
    y = "Income per capita ($)",
    title = "Fig. 7 Ascending order",
    subtitle = "Top 8: Highest per-capita GDP in 2007"
  )
The end