In this lab activity, we’ll discuss Data Wrangling/Transformation via
the dplyr package. We’ll explore ways to choose subsets of
data, aggregate data to create summaries, make new variables, and sort
your data frames. It is recommended you also explore the RStudio
Cheatsheet on Data
Transformation as we discuss this content.
gapminderHere is a look at the gapminder data frame in the
gapminder package.
library(gapminder)
rmarkdown::paged_table(gapminder)
# Has been set as the default way to print data frames
# via df_print: paged in the YAML at the top
Say we wanted mean life expectancy across all years for Asia
# Base R
asia <- gapminder[gapminder$continent == "Asia", ]
mean(asia$lifeExp)
[1] 60.0649
library(dplyr)
gapminder %>%
filter(continent == "Asia") %>%
summarize(mean_exp = mean(lifeExp))
# A tibble: 1 × 1
mean_exp
<dbl>
1 60.1
%>%dplyr equivalent to the
+ in ggplot2filter() summarize() group_by() mutate() arrange()filter()library(gapminder); library(dplyr)
gap_2007 <- gapminder %>% filter(year == 2007)
gap_2007
# A tibble: 142 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Afghanistan Asia 2007 43.8 31889923 975.
2 Albania Europe 2007 76.4 3600523 5937.
3 Algeria Africa 2007 72.3 33333216 6223.
4 Angola Africa 2007 42.7 12420476 4797.
5 Argentina Americas 2007 75.3 40301927 12779.
6 Australia Oceania 2007 81.2 20434176 34435.
7 Austria Europe 2007 79.8 8199783 36126.
8 Bahrain Asia 2007 75.6 708573 29796.
9 Bangladesh Asia 2007 64.1 150448339 1391.
10 Belgium Europe 2007 79.4 10392226 33693.
# ℹ 132 more rows
== to compare a variable to a value| to check for any in multiple filters being
true:gapminder %>%
filter(year == 2002 | continent == "Europe")
# A tibble: 472 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Afghanistan Asia 2002 42.1 25268405 727.
2 Albania Europe 1952 55.2 1282697 1601.
3 Albania Europe 1957 59.3 1476505 1942.
4 Albania Europe 1962 64.8 1728137 2313.
5 Albania Europe 1967 66.2 1984060 2760.
6 Albania Europe 1972 67.7 2263554 3313.
7 Albania Europe 1977 68.9 2509048 3533.
8 Albania Europe 1982 70.4 2780097 3631.
9 Albania Europe 1987 72 3075321 3739.
10 Albania Europe 1992 71.6 3326498 2497.
# ℹ 462 more rows
& or , to check for all of
multiple filters being true:gapminder %>%
filter(year == 2002, continent == "Europe")
# A tibble: 30 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Albania Europe 2002 75.7 3508512 4604.
2 Austria Europe 2002 79.0 8148312 32418.
3 Belgium Europe 2002 78.3 10311970 30486.
4 Bosnia and Herzegovina Europe 2002 74.1 4165416 6019.
5 Bulgaria Europe 2002 72.1 7661799 7697.
6 Croatia Europe 2002 74.9 4481020 11628.
7 Czech Republic Europe 2002 75.5 10256295 17596.
8 Denmark Europe 2002 77.2 5374693 32167.
9 Finland Europe 2002 78.4 5193039 28205.
10 France Europe 2002 79.6 59925035 28926.
# ℹ 20 more rows
%in% to check for any being true (shortcut to using
| repeatedly with ==)gapminder %>%
filter(country %in% c("Argentina", "Belgium", "Mexico"),
year %in% c(1987, 1992))
# A tibble: 6 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Argentina Americas 1987 70.8 31620918 9140.
2 Argentina Americas 1992 71.9 33958947 9308.
3 Belgium Europe 1987 75.4 9870200 22526.
4 Belgium Europe 1992 76.5 10045622 25576.
5 Mexico Americas 1987 69.5 80122492 8688.
6 Mexico Americas 1992 71.5 88111030 9472.
summarize()summarize().max_exp_1997 <- gapminder %>%
filter(year == 1997) %>%
summarize(max_exp = max(lifeExp))
max_exp_1997
# A tibble: 1 × 1
max_exp
<dbl>
1 80.7
summarize() with group_by()When you’d like to determine a numerical summary for all levels of a different categorical variable
max_exp_1997_by_cont <- gapminder %>%
filter(year == 1997) %>%
group_by(continent) %>%
summarize(max_exp = max(lifeExp))
max_exp_1997_by_cont
# A tibble: 5 × 2
continent max_exp
<fct> <dbl>
1 Africa 74.8
2 Americas 78.6
3 Asia 80.7
4 Europe 79.4
5 Oceania 78.8
%>%It’s hard to appreciate the %>% without seeing what
the code would look like without it:
max_exp_1997_by_cont <-
summarize(
group_by(
filter(
gapminder,
year == 1997),
continent),
max_exp = max(lifeExp))
max_exp_1997_by_cont
# A tibble: 5 × 2
continent max_exp
<fct> <dbl>
1 Africa 74.8
2 Americas 78.6
3 Asia 80.7
4 Europe 79.4
5 Oceania 78.8
mutate()gap_w_gdp <- gapminder %>% mutate(gdp = pop * gdpPercap)
gap_w_gdp
# A tibble: 1,704 × 7
country continent year lifeExp pop gdpPercap gdp
<fct> <fct> <int> <dbl> <int> <dbl> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779. 6567086330.
2 Afghanistan Asia 1957 30.3 9240934 821. 7585448670.
3 Afghanistan Asia 1962 32.0 10267083 853. 8758855797.
4 Afghanistan Asia 1967 34.0 11537966 836. 9648014150.
5 Afghanistan Asia 1972 36.1 13079460 740. 9678553274.
6 Afghanistan Asia 1977 38.4 14880372 786. 11697659231.
7 Afghanistan Asia 1982 39.9 12881816 978. 12598563401.
8 Afghanistan Asia 1987 40.8 13867957 852. 11820990309.
9 Afghanistan Asia 1992 41.7 16317921 649. 10595901589.
10 Afghanistan Asia 1997 41.8 22227415 635. 14121995875.
# ℹ 1,694 more rows
mutate()gap_weird <- gapminder %>% mutate(pop = pop + 1000)
gap_weird
# A tibble: 1,704 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <dbl> <dbl>
1 Afghanistan Asia 1952 28.8 8426333 779.
2 Afghanistan Asia 1957 30.3 9241934 821.
3 Afghanistan Asia 1962 32.0 10268083 853.
4 Afghanistan Asia 1967 34.0 11538966 836.
5 Afghanistan Asia 1972 36.1 13080460 740.
6 Afghanistan Asia 1977 38.4 14881372 786.
7 Afghanistan Asia 1982 39.9 12882816 978.
8 Afghanistan Asia 1987 40.8 13868957 852.
9 Afghanistan Asia 1992 41.7 16318921 649.
10 Afghanistan Asia 1997 41.8 22228415 635.
# ℹ 1,694 more rows
arrange()gapminder %>%
arrange(year, country)
# A tibble: 1,704 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779.
2 Albania Europe 1952 55.2 1282697 1601.
3 Algeria Africa 1952 43.1 9279525 2449.
4 Angola Africa 1952 30.0 4232095 3521.
5 Argentina Americas 1952 62.5 17876956 5911.
6 Australia Oceania 1952 69.1 8691212 10040.
7 Austria Europe 1952 66.8 6927772 6137.
8 Bahrain Asia 1952 50.9 120447 9867.
9 Bangladesh Asia 1952 37.5 46886859 684.
10 Belgium Europe 1952 68 8730405 8343.
# ℹ 1,694 more rows
gapminder %>%
filter(year > 2000) %>%
arrange(desc(lifeExp))
# A tibble: 284 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Japan Asia 2007 82.6 127467972 31656.
2 Hong Kong, China Asia 2007 82.2 6980412 39725.
3 Japan Asia 2002 82 127065841 28605.
4 Iceland Europe 2007 81.8 301931 36181.
5 Switzerland Europe 2007 81.7 7554661 37506.
6 Hong Kong, China Asia 2002 81.5 6762476 30209.
7 Australia Oceania 2007 81.2 20434176 34435.
8 Spain Europe 2007 80.9 40448191 28821.
9 Sweden Europe 2007 80.9 9031088 33860.
10 Israel Asia 2007 80.7 6426679 25523.
# ℹ 274 more rows
dplyr verbsselecttop_nsample_nsliceglimpserenameDetermine which African country had the highest GDP per capita in
1982 using the gapminder data in the gapminder
package.
gapminder %>%
filter(continent == "Africa", year == 1982) %>%
arrange(desc(gdpPercap)) %>%
slice(1)
# A tibble: 1 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Libya Africa 1982 62.2 3344074 17364.
filter(continent == “Africa”, year == 1982) %>%-This selects only African countries in the year 1982 (arrange(desc(gdpPercap))- sorts countries from highest to lowest GDP per capita slice (1)- Extracts the single country at the top of the list — the highest GDP per capita.
Challenge
For both of these problems below, use the bechdel data
frame in the fivethirtyeight package:
count function in the dplyr
package to determine how many movies in 2013 fell into each of the
different categories for clean_testbechdel %>%
filter(year == 2013) %>%
count(clean_test)
# A tibble: 5 × 2
clean_test n
<ord> <int>
1 nowomen 6
2 notalk 23
3 men 15
4 dubious 9
5 ok 46
filter(year == 2013) - Restricts the dataset to movies released in 2013. count(clean_test) -Counts how many movies fall into each clean_test category
"ok" across all yearsDetermine the top five movies in terms of domestic return on
investment for 2013 scaled data using the bechdel data
frame in the fivethirtyeight package.
library(fivethirtyeight)
bechdel %>%
filter(year == 2013) %>%
mutate(domestic_roi = (domgross - budget) / budget) %>%
arrange(desc(domestic_roi)) %>%
select(title, year, budget, domgross, domestic_roi) %>%
slice_head(n = 5)
# A tibble: 5 × 5
title year budget domgross domestic_roi
<chr> <int> <int> <dbl> <dbl>
1 The Purge 2013 3000000 64473115 20.5
2 Insidious: Chapter 2 2013 5000000 83586447 15.7
3 The Conjuring 2013 20000000 137400141 5.87
4 Despicable Me 2 2013 76000000 368065385 3.84
5 Mama 2013 15000000 71628180 3.78
filter(year == 2013) -Selects only movies released in 2013. mutate(domestic_roi = (domgross - budget) / budget) -Creates a new variable which is the domestic return on investment (ROI). arrange(desc(domestic_roi)) -Sorts movies from highest to lowest ROI. select(…)- Keeps only the most relevant columns slice_head(n = 5) - Returns the top five movies based on highest ROI.
Determine the percentage of movies that received the value of
"ok" across all years
bechdel %>%
summarize(
total_movies = n(),
ok_movies = sum(clean_test == "ok", na.rm = TRUE),
percent_ok = (ok_movies / total_movies) * 100
)
# A tibble: 1 × 3
total_movies ok_movies percent_ok
<int> <int> <dbl>
1 1794 803 44.8
summarize()- Creates summary statistics for the entire dataset. total_movies = n() -Counts the total number of movies. ok_movies = sum(clean_test == “ok”) - Counts only the movies where clean_test equals ok percent_ok = (ok_movies / total_movies) * 100 - the percentage of movies rated “ok”.