# install.packages("tidyverse") ## Uncomment this line if you haven't installed tidyverse yet
# install.packages("gapminder") ## Uncomment this line if you haven't installed gapminder yet
# Load the required packages
library(tidyverse)
library(gapminder)
R: Data Manipulation
Load packages
- Install the tidyverse and gapminder packages (if you haven’t installed them yet)
- Load the tidyverse and gapminder packages
Dataset
Let’s inspect the gapminder
dataset.
# See all the columns (variables), their class (or data type), and a few observations
glimpse(gapminder)
Rows: 1,704
Columns: 6
$ country <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
$ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
$ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
$ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
$ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
$ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
# Display the first 6 rows
head(gapminder)
# A tibble: 6 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779.
2 Afghanistan Asia 1957 30.3 9240934 821.
3 Afghanistan Asia 1962 32.0 10267083 853.
4 Afghanistan Asia 1967 34.0 11537966 836.
5 Afghanistan Asia 1972 36.1 13079460 740.
6 Afghanistan Asia 1977 38.4 14880372 786.
# Display the last 6 rows
tail(gapminder)
# A tibble: 6 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Zimbabwe Africa 1982 60.4 7636524 789.
2 Zimbabwe Africa 1987 62.4 9216418 706.
3 Zimbabwe Africa 1992 60.4 10704340 693.
4 Zimbabwe Africa 1997 46.8 11404948 792.
5 Zimbabwe Africa 2002 40.0 11926563 672.
6 Zimbabwe Africa 2007 43.5 12311143 470.
# See the structure of the data
str(gapminder)
tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame)
$ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
$ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
$ year : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
$ lifeExp : num [1:1704] 28.8 30.3 32 34 36.1 ...
$ pop : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
$ gdpPercap: num [1:1704] 779 821 853 836 740 ...
# Obtain summary statistics for the data
summary(gapminder)
country continent year lifeExp
Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60
Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20
Algeria : 12 Asia :396 Median :1980 Median :60.71
Angola : 12 Europe :360 Mean :1980 Mean :59.47
Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85
Australia : 12 Max. :2007 Max. :82.60
(Other) :1632
pop gdpPercap
Min. :6.001e+04 Min. : 241.2
1st Qu.:2.794e+06 1st Qu.: 1202.1
Median :7.024e+06 Median : 3531.8
Mean :2.960e+07 Mean : 7215.3
3rd Qu.:1.959e+07 3rd Qu.: 9325.5
Max. :1.319e+09 Max. :113523.1
country
andcontinent
are character variablesyear
,lifeExp
,pop
andgdpPercap
are numeric variables
Create dataframe
Let’s store gapminder
in a new dataframe called df1
<- gapminder
df1 df1
# A tibble: 1,704 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779.
2 Afghanistan Asia 1957 30.3 9240934 821.
3 Afghanistan Asia 1962 32.0 10267083 853.
4 Afghanistan Asia 1967 34.0 11537966 836.
5 Afghanistan Asia 1972 36.1 13079460 740.
6 Afghanistan Asia 1977 38.4 14880372 786.
7 Afghanistan Asia 1982 39.9 12881816 978.
8 Afghanistan Asia 1987 40.8 13867957 852.
9 Afghanistan Asia 1992 41.7 16317921 649.
10 Afghanistan Asia 1997 41.8 22227415 635.
# ℹ 1,694 more rows
Pipe operator
The pipe operator |>
allows us to link one command to another.
Another symbol for the pipe operator is %>%
.
Hint: What is the keyboard shortcut to generate the pipe operator symbol?
|> glimpse() df1
Rows: 1,704
Columns: 6
$ country <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
$ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
$ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
$ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
$ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
$ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
# It can be read as: Take df1 and then glimpse it
|>
df1 arrange(lifeExp) |> ## arrange() sorts the data in increasing order
head(n=10)
# A tibble: 10 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Rwanda Africa 1992 23.6 7290203 737.
2 Afghanistan Asia 1952 28.8 8425333 779.
3 Gambia Africa 1952 30 284320 485.
4 Angola Africa 1952 30.0 4232095 3521.
5 Sierra Leone Africa 1952 30.3 2143249 880.
6 Afghanistan Asia 1957 30.3 9240934 821.
7 Cambodia Asia 1977 31.2 6978607 525.
8 Mozambique Africa 1952 31.3 6446316 469.
9 Sierra Leone Africa 1957 31.6 2295678 1004.
10 Burkina Faso Africa 1952 32.0 4469979 543.
# It can be read as: Take df1 and then sort it and then display the first 10 rows
mutate: Create a new variable
Create a new variable - log of GDP per capita. Call it log_y
.
|>
df1 mutate(log_y = log(gdpPercap))
# A tibble: 1,704 × 7
country continent year lifeExp pop gdpPercap log_y
<fct> <fct> <int> <dbl> <int> <dbl> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779. 6.66
2 Afghanistan Asia 1957 30.3 9240934 821. 6.71
3 Afghanistan Asia 1962 32.0 10267083 853. 6.75
4 Afghanistan Asia 1967 34.0 11537966 836. 6.73
5 Afghanistan Asia 1972 36.1 13079460 740. 6.61
6 Afghanistan Asia 1977 38.4 14880372 786. 6.67
7 Afghanistan Asia 1982 39.9 12881816 978. 6.89
8 Afghanistan Asia 1987 40.8 13867957 852. 6.75
9 Afghanistan Asia 1992 41.7 16317921 649. 6.48
10 Afghanistan Asia 1997 41.8 22227415 635. 6.45
# ℹ 1,694 more rows
# Store the result in a new dataframe. Call it df2.
<- df1 |>
df2 mutate(log_y = log(gdpPercap))
head(df2)
# A tibble: 6 × 7
country continent year lifeExp pop gdpPercap log_y
<fct> <fct> <int> <dbl> <int> <dbl> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779. 6.66
2 Afghanistan Asia 1957 30.3 9240934 821. 6.71
3 Afghanistan Asia 1962 32.0 10267083 853. 6.75
4 Afghanistan Asia 1967 34.0 11537966 836. 6.73
5 Afghanistan Asia 1972 36.1 13079460 740. 6.61
6 Afghanistan Asia 1977 38.4 14880372 786. 6.67
rename: Call a variable by a different name
|>
df2 rename(period = year) ## the syntax is rename(new_name = old_name)
# A tibble: 1,704 × 7
country continent period lifeExp pop gdpPercap log_y
<fct> <fct> <int> <dbl> <int> <dbl> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779. 6.66
2 Afghanistan Asia 1957 30.3 9240934 821. 6.71
3 Afghanistan Asia 1962 32.0 10267083 853. 6.75
4 Afghanistan Asia 1967 34.0 11537966 836. 6.73
5 Afghanistan Asia 1972 36.1 13079460 740. 6.61
6 Afghanistan Asia 1977 38.4 14880372 786. 6.67
7 Afghanistan Asia 1982 39.9 12881816 978. 6.89
8 Afghanistan Asia 1987 40.8 13867957 852. 6.75
9 Afghanistan Asia 1992 41.7 16317921 649. 6.48
10 Afghanistan Asia 1997 41.8 22227415 635. 6.45
# ℹ 1,694 more rows
select: Choose certain columns
Choose 3 variables in the df2
dataframe: year
, country
and log_y
|>
df2 select(year, country, log_y)
# A tibble: 1,704 × 3
year country log_y
<int> <fct> <dbl>
1 1952 Afghanistan 6.66
2 1957 Afghanistan 6.71
3 1962 Afghanistan 6.75
4 1967 Afghanistan 6.73
5 1972 Afghanistan 6.61
6 1977 Afghanistan 6.67
7 1982 Afghanistan 6.89
8 1987 Afghanistan 6.75
9 1992 Afghanistan 6.48
10 1997 Afghanistan 6.45
# ℹ 1,694 more rows
filter: Filter the data by a condition
## Let's filter df2 by the condition: Life expectancy > 50
|> filter(lifeExp > 50) df2
# A tibble: 1,213 × 7
country continent year lifeExp pop gdpPercap log_y
<fct> <fct> <int> <dbl> <int> <dbl> <dbl>
1 Albania Europe 1952 55.2 1282697 1601. 7.38
2 Albania Europe 1957 59.3 1476505 1942. 7.57
3 Albania Europe 1962 64.8 1728137 2313. 7.75
4 Albania Europe 1967 66.2 1984060 2760. 7.92
5 Albania Europe 1972 67.7 2263554 3313. 8.11
6 Albania Europe 1977 68.9 2509048 3533. 8.17
7 Albania Europe 1982 70.4 2780097 3631. 8.20
8 Albania Europe 1987 72 3075321 3739. 8.23
9 Albania Europe 1992 71.6 3326498 2497. 7.82
10 Albania Europe 1997 73.0 3428038 3193. 8.07
# ℹ 1,203 more rows
## Let's filter df2 by the condition: Life expectancy > 50 AND Year = 1997
|> filter(lifeExp > 50, year == 1997) df2
# A tibble: 121 × 7
country continent year lifeExp pop gdpPercap log_y
<fct> <fct> <int> <dbl> <int> <dbl> <dbl>
1 Albania Europe 1997 73.0 3428038 3193. 8.07
2 Algeria Africa 1997 69.2 29072015 4797. 8.48
3 Argentina Americas 1997 73.3 36203463 10967. 9.30
4 Australia Oceania 1997 78.8 18565243 26998. 10.2
5 Austria Europe 1997 77.5 8069876 29096. 10.3
6 Bahrain Asia 1997 73.9 598561 20292. 9.92
7 Bangladesh Asia 1997 59.4 123315288 973. 6.88
8 Belgium Europe 1997 77.5 10199787 27561. 10.2
9 Benin Africa 1997 54.8 6066080 1233. 7.12
10 Bolivia Americas 1997 62.0 7693188 3326. 8.11
# ℹ 111 more rows
## Let's filter df1 for a certain country
|> filter(country == "Myanmar") df1
# A tibble: 12 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Myanmar Asia 1952 36.3 20092996 331
2 Myanmar Asia 1957 41.9 21731844 350
3 Myanmar Asia 1962 45.1 23634436 388
4 Myanmar Asia 1967 49.4 25870271 349
5 Myanmar Asia 1972 53.1 28466390 357
6 Myanmar Asia 1977 56.1 31528087 371
7 Myanmar Asia 1982 58.1 34680442 424
8 Myanmar Asia 1987 58.3 38028578 385
9 Myanmar Asia 1992 59.3 40546538 347
10 Myanmar Asia 1997 60.3 43247867 415
11 Myanmar Asia 2002 59.9 45598081 611
12 Myanmar Asia 2007 62.1 47761980 944
## Let's filter df1 for multiple countries
|> filter(country %in% c("United States", "Bangladesh")) df1
# A tibble: 24 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Bangladesh Asia 1952 37.5 46886859 684.
2 Bangladesh Asia 1957 39.3 51365468 662.
3 Bangladesh Asia 1962 41.2 56839289 686.
4 Bangladesh Asia 1967 43.5 62821884 721.
5 Bangladesh Asia 1972 45.3 70759295 630.
6 Bangladesh Asia 1977 46.9 80428306 660.
7 Bangladesh Asia 1982 50.0 93074406 677.
8 Bangladesh Asia 1987 52.8 103764241 752.
9 Bangladesh Asia 1992 56.0 113704579 838.
10 Bangladesh Asia 1997 59.4 123315288 973.
# ℹ 14 more rows
arrange: Sort data in increasing or decreasing order
#### INCREASING ORDER
# Sort df2 in increasing order of log_y
|>
df2 arrange(log_y)
# A tibble: 1,704 × 7
country continent year lifeExp pop gdpPercap log_y
<fct> <fct> <int> <dbl> <int> <dbl> <dbl>
1 Congo, Dem. Rep. Africa 2002 45.0 55379852 241. 5.49
2 Congo, Dem. Rep. Africa 2007 46.5 64606759 278. 5.63
3 Lesotho Africa 1952 42.1 748747 299. 5.70
4 Guinea-Bissau Africa 1952 32.5 580653 300. 5.70
5 Congo, Dem. Rep. Africa 1997 42.6 47798986 312. 5.74
6 Eritrea Africa 1952 35.9 1438760 329. 5.80
7 Myanmar Asia 1952 36.3 20092996 331 5.80
8 Lesotho Africa 1957 45.0 813338 336. 5.82
9 Burundi Africa 1952 39.0 2445618 339. 5.83
10 Eritrea Africa 1957 38.0 1542611 344. 5.84
# ℹ 1,694 more rows
# Sort df2 in increasing order of log_y, but only for a certain year
|> filter(year == 2002) |>
df2 arrange(log_y)
# A tibble: 142 × 7
country continent year lifeExp pop gdpPercap log_y
<fct> <fct> <int> <dbl> <int> <dbl> <dbl>
1 Congo, Dem. Rep. Africa 2002 45.0 55379852 241. 5.49
2 Burundi Africa 2002 47.4 7021078 446. 6.10
3 Ethiopia Africa 2002 50.7 67946797 530. 6.27
4 Liberia Africa 2002 43.8 2814651 531. 6.28
5 Guinea-Bissau Africa 2002 45.5 1332459 576. 6.36
6 Niger Africa 2002 54.5 11140655 601. 6.40
7 Myanmar Asia 2002 59.9 45598081 611 6.42
8 Mozambique Africa 2002 44.0 18473780 634. 6.45
9 Gambia Africa 2002 58.0 1457766 661. 6.49
10 Malawi Africa 2002 45.0 11824495 665. 6.50
# ℹ 132 more rows
#### DECREASING ORDER
# Sort df2 in decreasing order of log_y
|>
df2 arrange(desc(log_y))
# A tibble: 1,704 × 7
country continent year lifeExp pop gdpPercap log_y
<fct> <fct> <int> <dbl> <int> <dbl> <dbl>
1 Kuwait Asia 1957 58.0 212846 113523. 11.6
2 Kuwait Asia 1972 67.7 841934 109348. 11.6
3 Kuwait Asia 1952 55.6 160000 108382. 11.6
4 Kuwait Asia 1962 60.5 358266 95458. 11.5
5 Kuwait Asia 1967 64.6 575003 80895. 11.3
6 Kuwait Asia 1977 69.3 1140357 59265. 11.0
7 Norway Europe 2007 80.2 4627926 49357. 10.8
8 Kuwait Asia 2007 77.6 2505559 47307. 10.8
9 Singapore Asia 2007 80.0 4553009 47143. 10.8
10 Norway Europe 2002 79.0 4535591 44684. 10.7
# ℹ 1,694 more rows
EXERCISE
Create a new variable called
log_pop
, defined as log of population. Call the new dataframedf3
.Filter
df3
by the condition:log_pop
< 15.5 andcontinent
is Africa.Sort
df3
in decreasing order oflog_pop
. Show only the variablesyear
,country
andlog_pop
. Display the first 12 rows.
Good resource: https://dplyr.tidyverse.org/articles/dplyr.html
SP | 03-data-manipulation.qmd