dplyr training 3The purpose of this noteboook is to illustrate how the dplyr package can be used to perform advanced data manipulation tasks.
library(tidyverse)
starwars
## # A tibble: 87 x 13
## name height mass hair_color skin_color eye_color birth_year gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 Luke Sk… 172 77 blond fair blue 19 male
## 2 C-3PO 167 75 <NA> gold yellow 112 <NA>
## 3 R2-D2 96 32 <NA> white, bl… red 33 <NA>
## 4 Darth V… 202 136 none white yellow 41.9 male
## 5 Leia Or… 150 49 brown light brown 19 female
## 6 Owen La… 178 120 brown, gr… light blue 52 male
## 7 Beru Wh… 165 75 brown light blue 47 female
## 8 R5-D4 97 32 <NA> white, red red NA <NA>
## 9 Biggs D… 183 84 black light brown 24 male
## 10 Obi-Wan… 182 77 auburn, w… fair blue-gray 57 male
## # ... with 77 more rows, and 5 more variables: homeworld <chr>,
## # species <chr>, films <list>, vehicles <list>, starships <list>
select and rename _all, _if and _at functions on quoted variables.starwars
## # A tibble: 87 x 13
## name height mass hair_color skin_color eye_color birth_year gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 Luke Sk… 172 77 blond fair blue 19 male
## 2 C-3PO 167 75 <NA> gold yellow 112 <NA>
## 3 R2-D2 96 32 <NA> white, bl… red 33 <NA>
## 4 Darth V… 202 136 none white yellow 41.9 male
## 5 Leia Or… 150 49 brown light brown 19 female
## 6 Owen La… 178 120 brown, gr… light blue 52 male
## 7 Beru Wh… 165 75 brown light blue 47 female
## 8 R5-D4 97 32 <NA> white, red red NA <NA>
## 9 Biggs D… 183 84 black light brown 24 male
## 10 Obi-Wan… 182 77 auburn, w… fair blue-gray 57 male
## # ... with 77 more rows, and 5 more variables: homeworld <chr>,
## # species <chr>, films <list>, vehicles <list>, starships <list>
select_if(starwars, funs(is.numeric))
## # A tibble: 87 x 3
## height mass birth_year
## <int> <dbl> <dbl>
## 1 172 77 19
## 2 167 75 112
## 3 96 32 33
## 4 202 136 41.9
## 5 150 49 19
## 6 178 120 52
## 7 165 75 47
## 8 97 32 NA
## 9 183 84 24
## 10 182 77 57
## # ... with 77 more rows
select_at(starwars, vars("name", "height"))
## # A tibble: 87 x 2
## name height
## <chr> <int>
## 1 Luke Skywalker 172
## 2 C-3PO 167
## 3 R2-D2 96
## 4 Darth Vader 202
## 5 Leia Organa 150
## 6 Owen Lars 178
## 7 Beru Whitesun lars 165
## 8 R5-D4 97
## 9 Biggs Darklighter 183
## 10 Obi-Wan Kenobi 182
## # ... with 77 more rows
rename_all(starwars, funs(paste0("sw_", .)))
## # A tibble: 87 x 13
## sw_name sw_height sw_mass sw_hair_color sw_skin_color sw_eye_color
## <chr> <int> <dbl> <chr> <chr> <chr>
## 1 Luke Skywal… 172 77 blond fair blue
## 2 C-3PO 167 75 <NA> gold yellow
## 3 R2-D2 96 32 <NA> white, blue red
## 4 Darth Vader 202 136 none white yellow
## 5 Leia Organa 150 49 brown light brown
## 6 Owen Lars 178 120 brown, grey light blue
## 7 Beru Whites… 165 75 brown light blue
## 8 R5-D4 97 32 <NA> white, red red
## 9 Biggs Darkl… 183 84 black light brown
## 10 Obi-Wan Ken… 182 77 auburn, white fair blue-gray
## # ... with 77 more rows, and 7 more variables: sw_birth_year <dbl>,
## # sw_gender <chr>, sw_homeworld <chr>, sw_species <chr>,
## # sw_films <list>, sw_vehicles <list>, sw_starships <list>
rename_if(starwars, funs(is.numeric), funs(str_to_upper))
## # A tibble: 87 x 13
## name HEIGHT MASS hair_color skin_color eye_color BIRTH_YEAR gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 Luke Sk… 172 77 blond fair blue 19 male
## 2 C-3PO 167 75 <NA> gold yellow 112 <NA>
## 3 R2-D2 96 32 <NA> white, bl… red 33 <NA>
## 4 Darth V… 202 136 none white yellow 41.9 male
## 5 Leia Or… 150 49 brown light brown 19 female
## 6 Owen La… 178 120 brown, gr… light blue 52 male
## 7 Beru Wh… 165 75 brown light blue 47 female
## 8 R5-D4 97 32 <NA> white, red red NA <NA>
## 9 Biggs D… 183 84 black light brown 24 male
## 10 Obi-Wan… 182 77 auburn, w… fair blue-gray 57 male
## # ... with 77 more rows, and 5 more variables: homeworld <chr>,
## # species <chr>, films <list>, vehicles <list>, starships <list>
rename_at(starwars, vars("name", "height"), funs(str_to_upper))
## # A tibble: 87 x 13
## NAME HEIGHT mass hair_color skin_color eye_color birth_year gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 Luke Sk… 172 77 blond fair blue 19 male
## 2 C-3PO 167 75 <NA> gold yellow 112 <NA>
## 3 R2-D2 96 32 <NA> white, bl… red 33 <NA>
## 4 Darth V… 202 136 none white yellow 41.9 male
## 5 Leia Or… 150 49 brown light brown 19 female
## 6 Owen La… 178 120 brown, gr… light blue 52 male
## 7 Beru Wh… 165 75 brown light blue 47 female
## 8 R5-D4 97 32 <NA> white, red red NA <NA>
## 9 Biggs D… 183 84 black light brown 24 male
## 10 Obi-Wan… 182 77 auburn, w… fair blue-gray 57 male
## # ... with 77 more rows, and 5 more variables: homeworld <chr>,
## # species <chr>, films <list>, vehicles <list>, starships <list>
starwars %>%
select(1:3) %>%
group_by_all() %>%
mutate(sum(height, mass, na.rm=T))
## # A tibble: 87 x 4
## # Groups: name, height, mass [87]
## name height mass `sum(height, mass, na.rm = T)`
## <chr> <int> <dbl> <dbl>
## 1 Luke Skywalker 172 77 249
## 2 C-3PO 167 75 242
## 3 R2-D2 96 32 128
## 4 Darth Vader 202 136 338
## 5 Leia Organa 150 49 199
## 6 Owen Lars 178 120 298
## 7 Beru Whitesun lars 165 75 240
## 8 R5-D4 97 32 129
## 9 Biggs Darklighter 183 84 267
## 10 Obi-Wan Kenobi 182 77 259
## # ... with 77 more rows
filter _all, _if and _at functions on quoted variables.filter_at(starwars, vars(contains("color")), all_vars(. == "brown"))
## # A tibble: 1 x 13
## name height mass hair_color skin_color eye_color birth_year gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 Wicket S… 88 20 brown brown brown 8 male
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## # vehicles <list>, starships <list>
filter_at(starwars, vars(contains("color")), any_vars(. == "brown"))
## # A tibble: 31 x 13
## name height mass hair_color skin_color eye_color birth_year gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 Leia Or… 150 49 brown light brown 19 female
## 2 Beru Wh… 165 75 brown light blue 47 female
## 3 Biggs D… 183 84 black light brown 24 male
## 4 Chewbac… 228 112 brown unknown blue 200 male
## 5 Han Solo 180 80 brown fair brown 29 male
## 6 Wedge A… 170 77 brown fair hazel 21 male
## 7 Jek Ton… 180 110 brown fair blue NA male
## 8 Yoda 66 17 white green brown 896 male
## 9 Boba Fe… 183 78.2 black fair brown 31.5 male
## 10 Lando C… 177 79 black dark brown 31 male
## # ... with 21 more rows, and 5 more variables: homeworld <chr>,
## # species <chr>, films <list>, vehicles <list>, starships <list>
filter_if(starwars, is.numeric, all_vars(. > 100))
## # A tibble: 2 x 13
## name height mass hair_color skin_color eye_color birth_year gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 Chewbac… 228 112 brown unknown blue 200 male
## 2 Jabba D… 175 1358 <NA> green-tan,… orange 600 herma…
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## # vehicles <list>, starships <list>
filter_if(starwars, is.character, any_vars(. > 100))
## # A tibble: 87 x 13
## name height mass hair_color skin_color eye_color birth_year gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 Luke Sk… 172 77 blond fair blue 19 male
## 2 C-3PO 167 75 <NA> gold yellow 112 <NA>
## 3 R2-D2 96 32 <NA> white, bl… red 33 <NA>
## 4 Darth V… 202 136 none white yellow 41.9 male
## 5 Leia Or… 150 49 brown light brown 19 female
## 6 Owen La… 178 120 brown, gr… light blue 52 male
## 7 Beru Wh… 165 75 brown light blue 47 female
## 8 R5-D4 97 32 <NA> white, red red NA <NA>
## 9 Biggs D… 183 84 black light brown 24 male
## 10 Obi-Wan… 182 77 auburn, w… fair blue-gray 57 male
## # ... with 77 more rows, and 5 more variables: homeworld <chr>,
## # species <chr>, films <list>, vehicles <list>, starships <list>
mutate _all, _if and _at functions on quoted variables.mutate_all(starwars, as.character)
## # A tibble: 87 x 13
## name height mass hair_color skin_color eye_color birth_year gender
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Luke Sk… 172 77 blond fair blue 19 male
## 2 C-3PO 167 75 <NA> gold yellow 112 <NA>
## 3 R2-D2 96 32 <NA> white, bl… red 33 <NA>
## 4 Darth V… 202 136 none white yellow 41.9 male
## 5 Leia Or… 150 49 brown light brown 19 female
## 6 Owen La… 178 120 brown, gr… light blue 52 male
## 7 Beru Wh… 165 75 brown light blue 47 female
## 8 R5-D4 97 32 <NA> white, red red <NA> <NA>
## 9 Biggs D… 183 84 black light brown 24 male
## 10 Obi-Wan… 182 77 auburn, w… fair blue-gray 57 male
## # ... with 77 more rows, and 5 more variables: homeworld <chr>,
## # species <chr>, films <chr>, vehicles <chr>, starships <chr>
mutate_if(starwars, funs(is.character), funs(as.factor))
## # A tibble: 87 x 13
## name height mass hair_color skin_color eye_color birth_year gender
## <fct> <int> <dbl> <fct> <fct> <fct> <dbl> <fct>
## 1 Luke Sk… 172 77 blond fair blue 19 male
## 2 C-3PO 167 75 <NA> gold yellow 112 <NA>
## 3 R2-D2 96 32 <NA> white, bl… red 33 <NA>
## 4 Darth V… 202 136 none white yellow 41.9 male
## 5 Leia Or… 150 49 brown light brown 19 female
## 6 Owen La… 178 120 brown, gr… light blue 52 male
## 7 Beru Wh… 165 75 brown light blue 47 female
## 8 R5-D4 97 32 <NA> white, red red NA <NA>
## 9 Biggs D… 183 84 black light brown 24 male
## 10 Obi-Wan… 182 77 auburn, w… fair blue-gray 57 male
## # ... with 77 more rows, and 5 more variables: homeworld <fct>,
## # species <fct>, films <list>, vehicles <list>, starships <list>
mutate_at(starwars, vars("height"), funs(. / 10))
## # A tibble: 87 x 13
## name height mass hair_color skin_color eye_color birth_year gender
## <chr> <dbl> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 Luke Sk… 17.2 77 blond fair blue 19 male
## 2 C-3PO 16.7 75 <NA> gold yellow 112 <NA>
## 3 R2-D2 9.6 32 <NA> white, bl… red 33 <NA>
## 4 Darth V… 20.2 136 none white yellow 41.9 male
## 5 Leia Or… 15 49 brown light brown 19 female
## 6 Owen La… 17.8 120 brown, gr… light blue 52 male
## 7 Beru Wh… 16.5 75 brown light blue 47 female
## 8 R5-D4 9.7 32 <NA> white, red red NA <NA>
## 9 Biggs D… 18.3 84 black light brown 24 male
## 10 Obi-Wan… 18.2 77 auburn, w… fair blue-gray 57 male
## # ... with 77 more rows, and 5 more variables: homeworld <chr>,
## # species <chr>, films <list>, vehicles <list>, starships <list>
starwars %>%
select(1:3) %>%
mutate(sum(height, mass, na.rm=T))
## # A tibble: 87 x 4
## name height mass `sum(height, mass, na.rm = T)`
## <chr> <int> <dbl> <dbl>
## 1 Luke Skywalker 172 77 19864.
## 2 C-3PO 167 75 19864.
## 3 R2-D2 96 32 19864.
## 4 Darth Vader 202 136 19864.
## 5 Leia Organa 150 49 19864.
## 6 Owen Lars 178 120 19864.
## 7 Beru Whitesun lars 165 75 19864.
## 8 R5-D4 97 32 19864.
## 9 Biggs Darklighter 183 84 19864.
## 10 Obi-Wan Kenobi 182 77 19864.
## # ... with 77 more rows
group_by and summarise _all, _if and _at functions on quoted variables.starwars %>%
select(1:3) %>%
group_by_if(is.character) %>%
count()
## # A tibble: 87 x 2
## # Groups: name [87]
## name n
## <chr> <int>
## 1 Ackbar 1
## 2 Adi Gallia 1
## 3 Anakin Skywalker 1
## 4 Arvel Crynyd 1
## 5 Ayla Secura 1
## 6 Bail Prestor Organa 1
## 7 Barriss Offee 1
## 8 BB8 1
## 9 Ben Quadinaros 1
## 10 Beru Whitesun lars 1
## # ... with 77 more rows
group_by_at(starwars, vars("eye_color", "hair_color")) %>%
count()
## # A tibble: 35 x 3
## # Groups: eye_color, hair_color [35]
## eye_color hair_color n
## <chr> <chr> <int>
## 1 black none 9
## 2 black <NA> 1
## 3 blue auburn 1
## 4 blue auburn, grey 1
## 5 blue black 2
## 6 blue blond 3
## 7 blue brown 7
## 8 blue brown, grey 1
## 9 blue none 3
## 10 blue white 1
## # ... with 25 more rows
starwars %>%
select_if(funs(is.numeric)) %>%
summarise_all(funs(mean), na.rm = T)
## # A tibble: 1 x 3
## height mass birth_year
## <dbl> <dbl> <dbl>
## 1 174. 97.3 87.6
summarise_if(starwars, funs(is.numeric), funs(min, median, mean, sd, max), na.rm = T)
## # A tibble: 1 x 15
## height_min mass_min birth_year_min height_median mass_median
## <dbl> <dbl> <dbl> <int> <dbl>
## 1 66 15 8 180 79
## # ... with 10 more variables: birth_year_median <dbl>, height_mean <dbl>,
## # mass_mean <dbl>, birth_year_mean <dbl>, height_sd <dbl>,
## # mass_sd <dbl>, birth_year_sd <dbl>, height_max <dbl>, mass_max <dbl>,
## # birth_year_max <dbl>
starwars %>%
summarise_if(funs(is.numeric), funs(min, median, mean, sd, max), na.rm = T) %>%
gather() %>%
arrange(key)
## # A tibble: 15 x 2
## key value
## <chr> <dbl>
## 1 birth_year_max 896
## 2 birth_year_mean 87.6
## 3 birth_year_median 52
## 4 birth_year_min 8
## 5 birth_year_sd 155.
## 6 height_max 264
## 7 height_mean 174.
## 8 height_median 180
## 9 height_min 66
## 10 height_sd 34.8
## 11 mass_max 1358
## 12 mass_mean 97.3
## 13 mass_median 79
## 14 mass_min 15
## 15 mass_sd 169.
summarise_at(starwars, vars("height", "mass"), funs(sum, mean), na.rm = T)
## # A tibble: 1 x 4
## height_sum mass_sum height_mean mass_mean
## <int> <dbl> <dbl> <dbl>
## 1 14123 5741. 174. 97.3
starwars %>%
summarise_all(funs(sum(is.na(.)))) %>%
select_if(any_vars(. > 0))
## # A tibble: 1 x 7
## height mass hair_color birth_year gender homeworld species
## <int> <int> <int> <int> <int> <int> <int>
## 1 6 28 5 44 3 10 5
select(iris, !!sym(c("Sepal.Length"))) %>% as_tibble()
## # A tibble: 150 x 1
## Sepal.Length
## <dbl>
## 1 5.1
## 2 4.9
## 3 4.7
## 4 4.6
## 5 5
## 6 5.4
## 7 4.6
## 8 5
## 9 4.4
## 10 4.9
## # ... with 140 more rows
select(iris, !!!syms(c("Sepal.Width", "Sepal.Length"))) %>% as_tibble()
## # A tibble: 150 x 2
## Sepal.Width Sepal.Length
## <dbl> <dbl>
## 1 3.5 5.1
## 2 3 4.9
## 3 3.2 4.7
## 4 3.1 4.6
## 5 3.6 5
## 6 3.9 5.4
## 7 3.4 4.6
## 8 3.4 5
## 9 2.9 4.4
## 10 3.1 4.9
## # ... with 140 more rows
filter(iris, !!sym("Sepal.Width") > 4) %>% as_tibble()
## # A tibble: 3 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.7 4.4 1.5 0.4 setosa
## 2 5.2 4.1 1.5 0.1 setosa
## 3 5.5 4.2 1.4 0.2 setosa
filter(iris, !!sym("Sepal.Width") == "virginica") %>% as_tibble()
## # A tibble: 0 x 5
## # ... with 5 variables: Sepal.Length <dbl>, Sepal.Width <dbl>,
## # Petal.Length <dbl>, Petal.Width <dbl>, Species <fct>
mutate(iris, ratio=!!sym("Sepal.Length") / !!sym("Sepal.Width")) %>% as_tibble()
## # A tibble: 150 x 6
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species ratio
## <dbl> <dbl> <dbl> <dbl> <fct> <dbl>
## 1 5.1 3.5 1.4 0.2 setosa 1.46
## 2 4.9 3 1.4 0.2 setosa 1.63
## 3 4.7 3.2 1.3 0.2 setosa 1.47
## 4 4.6 3.1 1.5 0.2 setosa 1.48
## 5 5 3.6 1.4 0.2 setosa 1.39
## 6 5.4 3.9 1.7 0.4 setosa 1.38
## 7 4.6 3.4 1.4 0.3 setosa 1.35
## 8 5 3.4 1.5 0.2 setosa 1.47
## 9 4.4 2.9 1.4 0.2 setosa 1.52
## 10 4.9 3.1 1.5 0.1 setosa 1.58
## # ... with 140 more rows
arrange(iris, !!sym("Sepal.Length")) %>% as_tibble()
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 4.3 3 1.1 0.1 setosa
## 2 4.4 2.9 1.4 0.2 setosa
## 3 4.4 3 1.3 0.2 setosa
## 4 4.4 3.2 1.3 0.2 setosa
## 5 4.5 2.3 1.3 0.3 setosa
## 6 4.6 3.1 1.5 0.2 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 4.6 3.6 1 0.2 setosa
## 9 4.6 3.2 1.4 0.2 setosa
## 10 4.7 3.2 1.3 0.2 setosa
## # ... with 140 more rows
arrange(iris, desc(!!sym("Sepal.Length"))) %>% as_tibble()
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 7.9 3.8 6.4 2 virginica
## 2 7.7 3.8 6.7 2.2 virginica
## 3 7.7 2.6 6.9 2.3 virginica
## 4 7.7 2.8 6.7 2 virginica
## 5 7.7 3 6.1 2.3 virginica
## 6 7.6 3 6.6 2.1 virginica
## 7 7.4 2.8 6.1 1.9 virginica
## 8 7.3 2.9 6.3 1.8 virginica
## 9 7.2 3.6 6.1 2.5 virginica
## 10 7.2 3.2 6 1.8 virginica
## # ... with 140 more rows
summarise(iris, mean(!!sym("Sepal.Length"), na.rm = T))
## mean(Sepal.Length, na.rm = T)
## 1 5.843333
ggplot(iris, aes(x = !!sym("Sepal.Width"), y = !!sym("Sepal.Length"), colour = !!sym("Species"))) +
geom_point()