1. Purpose.

The purpose of this noteboook is to illustrate how the dplyr package can be used to perform advanced data manipulation tasks.

2. Load libraries and view practice dataset.

library(tidyverse)
starwars
## # A tibble: 87 x 13
##    name     height  mass hair_color skin_color eye_color birth_year gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> 
##  1 Luke Sk…    172    77 blond      fair       blue            19   male  
##  2 C-3PO       167    75 <NA>       gold       yellow         112   <NA>  
##  3 R2-D2        96    32 <NA>       white, bl… red             33   <NA>  
##  4 Darth V…    202   136 none       white      yellow          41.9 male  
##  5 Leia Or…    150    49 brown      light      brown           19   female
##  6 Owen La…    178   120 brown, gr… light      blue            52   male  
##  7 Beru Wh…    165    75 brown      light      blue            47   female
##  8 R5-D4        97    32 <NA>       white, red red             NA   <NA>  
##  9 Biggs D…    183    84 black      light      brown           24   male  
## 10 Obi-Wan…    182    77 auburn, w… fair       blue-gray       57   male  
## # ... with 77 more rows, and 5 more variables: homeworld <chr>,
## #   species <chr>, films <list>, vehicles <list>, starships <list>

3. Apply select and rename _all, _if and _at functions on quoted variables.

starwars
## # A tibble: 87 x 13
##    name     height  mass hair_color skin_color eye_color birth_year gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> 
##  1 Luke Sk…    172    77 blond      fair       blue            19   male  
##  2 C-3PO       167    75 <NA>       gold       yellow         112   <NA>  
##  3 R2-D2        96    32 <NA>       white, bl… red             33   <NA>  
##  4 Darth V…    202   136 none       white      yellow          41.9 male  
##  5 Leia Or…    150    49 brown      light      brown           19   female
##  6 Owen La…    178   120 brown, gr… light      blue            52   male  
##  7 Beru Wh…    165    75 brown      light      blue            47   female
##  8 R5-D4        97    32 <NA>       white, red red             NA   <NA>  
##  9 Biggs D…    183    84 black      light      brown           24   male  
## 10 Obi-Wan…    182    77 auburn, w… fair       blue-gray       57   male  
## # ... with 77 more rows, and 5 more variables: homeworld <chr>,
## #   species <chr>, films <list>, vehicles <list>, starships <list>
select_if(starwars, funs(is.numeric))
## # A tibble: 87 x 3
##    height  mass birth_year
##     <int> <dbl>      <dbl>
##  1    172    77       19  
##  2    167    75      112  
##  3     96    32       33  
##  4    202   136       41.9
##  5    150    49       19  
##  6    178   120       52  
##  7    165    75       47  
##  8     97    32       NA  
##  9    183    84       24  
## 10    182    77       57  
## # ... with 77 more rows
select_at(starwars, vars("name", "height"))
## # A tibble: 87 x 2
##    name               height
##    <chr>               <int>
##  1 Luke Skywalker        172
##  2 C-3PO                 167
##  3 R2-D2                  96
##  4 Darth Vader           202
##  5 Leia Organa           150
##  6 Owen Lars             178
##  7 Beru Whitesun lars    165
##  8 R5-D4                  97
##  9 Biggs Darklighter     183
## 10 Obi-Wan Kenobi        182
## # ... with 77 more rows
rename_all(starwars, funs(paste0("sw_", .)))
## # A tibble: 87 x 13
##    sw_name      sw_height sw_mass sw_hair_color sw_skin_color sw_eye_color
##    <chr>            <int>   <dbl> <chr>         <chr>         <chr>       
##  1 Luke Skywal…       172      77 blond         fair          blue        
##  2 C-3PO              167      75 <NA>          gold          yellow      
##  3 R2-D2               96      32 <NA>          white, blue   red         
##  4 Darth Vader        202     136 none          white         yellow      
##  5 Leia Organa        150      49 brown         light         brown       
##  6 Owen Lars          178     120 brown, grey   light         blue        
##  7 Beru Whites…       165      75 brown         light         blue        
##  8 R5-D4               97      32 <NA>          white, red    red         
##  9 Biggs Darkl…       183      84 black         light         brown       
## 10 Obi-Wan Ken…       182      77 auburn, white fair          blue-gray   
## # ... with 77 more rows, and 7 more variables: sw_birth_year <dbl>,
## #   sw_gender <chr>, sw_homeworld <chr>, sw_species <chr>,
## #   sw_films <list>, sw_vehicles <list>, sw_starships <list>
rename_if(starwars, funs(is.numeric), funs(str_to_upper))
## # A tibble: 87 x 13
##    name     HEIGHT  MASS hair_color skin_color eye_color BIRTH_YEAR gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> 
##  1 Luke Sk…    172    77 blond      fair       blue            19   male  
##  2 C-3PO       167    75 <NA>       gold       yellow         112   <NA>  
##  3 R2-D2        96    32 <NA>       white, bl… red             33   <NA>  
##  4 Darth V…    202   136 none       white      yellow          41.9 male  
##  5 Leia Or…    150    49 brown      light      brown           19   female
##  6 Owen La…    178   120 brown, gr… light      blue            52   male  
##  7 Beru Wh…    165    75 brown      light      blue            47   female
##  8 R5-D4        97    32 <NA>       white, red red             NA   <NA>  
##  9 Biggs D…    183    84 black      light      brown           24   male  
## 10 Obi-Wan…    182    77 auburn, w… fair       blue-gray       57   male  
## # ... with 77 more rows, and 5 more variables: homeworld <chr>,
## #   species <chr>, films <list>, vehicles <list>, starships <list>
rename_at(starwars, vars("name", "height"), funs(str_to_upper))
## # A tibble: 87 x 13
##    NAME     HEIGHT  mass hair_color skin_color eye_color birth_year gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> 
##  1 Luke Sk…    172    77 blond      fair       blue            19   male  
##  2 C-3PO       167    75 <NA>       gold       yellow         112   <NA>  
##  3 R2-D2        96    32 <NA>       white, bl… red             33   <NA>  
##  4 Darth V…    202   136 none       white      yellow          41.9 male  
##  5 Leia Or…    150    49 brown      light      brown           19   female
##  6 Owen La…    178   120 brown, gr… light      blue            52   male  
##  7 Beru Wh…    165    75 brown      light      blue            47   female
##  8 R5-D4        97    32 <NA>       white, red red             NA   <NA>  
##  9 Biggs D…    183    84 black      light      brown           24   male  
## 10 Obi-Wan…    182    77 auburn, w… fair       blue-gray       57   male  
## # ... with 77 more rows, and 5 more variables: homeworld <chr>,
## #   species <chr>, films <list>, vehicles <list>, starships <list>
starwars %>% 
  select(1:3) %>% 
  group_by_all() %>% 
  mutate(sum(height, mass, na.rm=T))
## # A tibble: 87 x 4
## # Groups:   name, height, mass [87]
##    name               height  mass `sum(height, mass, na.rm = T)`
##    <chr>               <int> <dbl>                          <dbl>
##  1 Luke Skywalker        172    77                            249
##  2 C-3PO                 167    75                            242
##  3 R2-D2                  96    32                            128
##  4 Darth Vader           202   136                            338
##  5 Leia Organa           150    49                            199
##  6 Owen Lars             178   120                            298
##  7 Beru Whitesun lars    165    75                            240
##  8 R5-D4                  97    32                            129
##  9 Biggs Darklighter     183    84                            267
## 10 Obi-Wan Kenobi        182    77                            259
## # ... with 77 more rows

4. Apply filter _all, _if and _at functions on quoted variables.

filter_at(starwars, vars(contains("color")), all_vars(. == "brown"))
## # A tibble: 1 x 13
##   name      height  mass hair_color skin_color eye_color birth_year gender
##   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> 
## 1 Wicket S…     88    20 brown      brown      brown              8 male  
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
filter_at(starwars, vars(contains("color")), any_vars(. == "brown"))
## # A tibble: 31 x 13
##    name     height  mass hair_color skin_color eye_color birth_year gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> 
##  1 Leia Or…    150  49   brown      light      brown           19   female
##  2 Beru Wh…    165  75   brown      light      blue            47   female
##  3 Biggs D…    183  84   black      light      brown           24   male  
##  4 Chewbac…    228 112   brown      unknown    blue           200   male  
##  5 Han Solo    180  80   brown      fair       brown           29   male  
##  6 Wedge A…    170  77   brown      fair       hazel           21   male  
##  7 Jek Ton…    180 110   brown      fair       blue            NA   male  
##  8 Yoda         66  17   white      green      brown          896   male  
##  9 Boba Fe…    183  78.2 black      fair       brown           31.5 male  
## 10 Lando C…    177  79   black      dark       brown           31   male  
## # ... with 21 more rows, and 5 more variables: homeworld <chr>,
## #   species <chr>, films <list>, vehicles <list>, starships <list>
filter_if(starwars, is.numeric, all_vars(. > 100))
## # A tibble: 2 x 13
##   name     height  mass hair_color skin_color  eye_color birth_year gender
##   <chr>     <int> <dbl> <chr>      <chr>       <chr>          <dbl> <chr> 
## 1 Chewbac…    228   112 brown      unknown     blue             200 male  
## 2 Jabba D…    175  1358 <NA>       green-tan,… orange           600 herma…
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
filter_if(starwars, is.character, any_vars(. > 100))
## # A tibble: 87 x 13
##    name     height  mass hair_color skin_color eye_color birth_year gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> 
##  1 Luke Sk…    172    77 blond      fair       blue            19   male  
##  2 C-3PO       167    75 <NA>       gold       yellow         112   <NA>  
##  3 R2-D2        96    32 <NA>       white, bl… red             33   <NA>  
##  4 Darth V…    202   136 none       white      yellow          41.9 male  
##  5 Leia Or…    150    49 brown      light      brown           19   female
##  6 Owen La…    178   120 brown, gr… light      blue            52   male  
##  7 Beru Wh…    165    75 brown      light      blue            47   female
##  8 R5-D4        97    32 <NA>       white, red red             NA   <NA>  
##  9 Biggs D…    183    84 black      light      brown           24   male  
## 10 Obi-Wan…    182    77 auburn, w… fair       blue-gray       57   male  
## # ... with 77 more rows, and 5 more variables: homeworld <chr>,
## #   species <chr>, films <list>, vehicles <list>, starships <list>

5. Apply mutate _all, _if and _at functions on quoted variables.

mutate_all(starwars, as.character)
## # A tibble: 87 x 13
##    name     height mass  hair_color skin_color eye_color birth_year gender
##    <chr>    <chr>  <chr> <chr>      <chr>      <chr>     <chr>      <chr> 
##  1 Luke Sk… 172    77    blond      fair       blue      19         male  
##  2 C-3PO    167    75    <NA>       gold       yellow    112        <NA>  
##  3 R2-D2    96     32    <NA>       white, bl… red       33         <NA>  
##  4 Darth V… 202    136   none       white      yellow    41.9       male  
##  5 Leia Or… 150    49    brown      light      brown     19         female
##  6 Owen La… 178    120   brown, gr… light      blue      52         male  
##  7 Beru Wh… 165    75    brown      light      blue      47         female
##  8 R5-D4    97     32    <NA>       white, red red       <NA>       <NA>  
##  9 Biggs D… 183    84    black      light      brown     24         male  
## 10 Obi-Wan… 182    77    auburn, w… fair       blue-gray 57         male  
## # ... with 77 more rows, and 5 more variables: homeworld <chr>,
## #   species <chr>, films <chr>, vehicles <chr>, starships <chr>
mutate_if(starwars, funs(is.character), funs(as.factor))
## # A tibble: 87 x 13
##    name     height  mass hair_color skin_color eye_color birth_year gender
##    <fct>     <int> <dbl> <fct>      <fct>      <fct>          <dbl> <fct> 
##  1 Luke Sk…    172    77 blond      fair       blue            19   male  
##  2 C-3PO       167    75 <NA>       gold       yellow         112   <NA>  
##  3 R2-D2        96    32 <NA>       white, bl… red             33   <NA>  
##  4 Darth V…    202   136 none       white      yellow          41.9 male  
##  5 Leia Or…    150    49 brown      light      brown           19   female
##  6 Owen La…    178   120 brown, gr… light      blue            52   male  
##  7 Beru Wh…    165    75 brown      light      blue            47   female
##  8 R5-D4        97    32 <NA>       white, red red             NA   <NA>  
##  9 Biggs D…    183    84 black      light      brown           24   male  
## 10 Obi-Wan…    182    77 auburn, w… fair       blue-gray       57   male  
## # ... with 77 more rows, and 5 more variables: homeworld <fct>,
## #   species <fct>, films <list>, vehicles <list>, starships <list>
mutate_at(starwars, vars("height"), funs(. / 10))
## # A tibble: 87 x 13
##    name     height  mass hair_color skin_color eye_color birth_year gender
##    <chr>     <dbl> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> 
##  1 Luke Sk…   17.2    77 blond      fair       blue            19   male  
##  2 C-3PO      16.7    75 <NA>       gold       yellow         112   <NA>  
##  3 R2-D2       9.6    32 <NA>       white, bl… red             33   <NA>  
##  4 Darth V…   20.2   136 none       white      yellow          41.9 male  
##  5 Leia Or…   15      49 brown      light      brown           19   female
##  6 Owen La…   17.8   120 brown, gr… light      blue            52   male  
##  7 Beru Wh…   16.5    75 brown      light      blue            47   female
##  8 R5-D4       9.7    32 <NA>       white, red red             NA   <NA>  
##  9 Biggs D…   18.3    84 black      light      brown           24   male  
## 10 Obi-Wan…   18.2    77 auburn, w… fair       blue-gray       57   male  
## # ... with 77 more rows, and 5 more variables: homeworld <chr>,
## #   species <chr>, films <list>, vehicles <list>, starships <list>
starwars %>% 
  select(1:3) %>% 
  mutate(sum(height, mass, na.rm=T))
## # A tibble: 87 x 4
##    name               height  mass `sum(height, mass, na.rm = T)`
##    <chr>               <int> <dbl>                          <dbl>
##  1 Luke Skywalker        172    77                         19864.
##  2 C-3PO                 167    75                         19864.
##  3 R2-D2                  96    32                         19864.
##  4 Darth Vader           202   136                         19864.
##  5 Leia Organa           150    49                         19864.
##  6 Owen Lars             178   120                         19864.
##  7 Beru Whitesun lars    165    75                         19864.
##  8 R5-D4                  97    32                         19864.
##  9 Biggs Darklighter     183    84                         19864.
## 10 Obi-Wan Kenobi        182    77                         19864.
## # ... with 77 more rows

6. Apply group_by and summarise _all, _if and _at functions on quoted variables.

starwars %>% 
  select(1:3) %>% 
  group_by_if(is.character) %>% 
  count()
## # A tibble: 87 x 2
## # Groups:   name [87]
##    name                    n
##    <chr>               <int>
##  1 Ackbar                  1
##  2 Adi Gallia              1
##  3 Anakin Skywalker        1
##  4 Arvel Crynyd            1
##  5 Ayla Secura             1
##  6 Bail Prestor Organa     1
##  7 Barriss Offee           1
##  8 BB8                     1
##  9 Ben Quadinaros          1
## 10 Beru Whitesun lars      1
## # ... with 77 more rows
group_by_at(starwars, vars("eye_color", "hair_color")) %>% 
  count()
## # A tibble: 35 x 3
## # Groups:   eye_color, hair_color [35]
##    eye_color hair_color       n
##    <chr>     <chr>        <int>
##  1 black     none             9
##  2 black     <NA>             1
##  3 blue      auburn           1
##  4 blue      auburn, grey     1
##  5 blue      black            2
##  6 blue      blond            3
##  7 blue      brown            7
##  8 blue      brown, grey      1
##  9 blue      none             3
## 10 blue      white            1
## # ... with 25 more rows
starwars %>% 
  select_if(funs(is.numeric)) %>% 
  summarise_all(funs(mean), na.rm = T)
## # A tibble: 1 x 3
##   height  mass birth_year
##    <dbl> <dbl>      <dbl>
## 1   174.  97.3       87.6
summarise_if(starwars, funs(is.numeric), funs(min, median, mean, sd, max), na.rm = T) 
## # A tibble: 1 x 15
##   height_min mass_min birth_year_min height_median mass_median
##        <dbl>    <dbl>          <dbl>         <int>       <dbl>
## 1         66       15              8           180          79
## # ... with 10 more variables: birth_year_median <dbl>, height_mean <dbl>,
## #   mass_mean <dbl>, birth_year_mean <dbl>, height_sd <dbl>,
## #   mass_sd <dbl>, birth_year_sd <dbl>, height_max <dbl>, mass_max <dbl>,
## #   birth_year_max <dbl>
starwars %>% 
  summarise_if(funs(is.numeric), funs(min, median, mean, sd, max), na.rm = T) %>% 
  gather() %>% 
  arrange(key)
## # A tibble: 15 x 2
##    key                value
##    <chr>              <dbl>
##  1 birth_year_max     896  
##  2 birth_year_mean     87.6
##  3 birth_year_median   52  
##  4 birth_year_min       8  
##  5 birth_year_sd      155. 
##  6 height_max         264  
##  7 height_mean        174. 
##  8 height_median      180  
##  9 height_min          66  
## 10 height_sd           34.8
## 11 mass_max          1358  
## 12 mass_mean           97.3
## 13 mass_median         79  
## 14 mass_min            15  
## 15 mass_sd            169.
summarise_at(starwars, vars("height", "mass"), funs(sum, mean), na.rm = T)
## # A tibble: 1 x 4
##   height_sum mass_sum height_mean mass_mean
##        <int>    <dbl>       <dbl>     <dbl>
## 1      14123    5741.        174.      97.3
starwars %>%
  summarise_all(funs(sum(is.na(.)))) %>% 
  select_if(any_vars(. > 0)) 
## # A tibble: 1 x 7
##   height  mass hair_color birth_year gender homeworld species
##    <int> <int>      <int>      <int>  <int>     <int>   <int>
## 1      6    28          5         44      3        10       5

7. Work with tidy evaluation programming to unquote.

select(iris, !!sym(c("Sepal.Length"))) %>% as_tibble()
## # A tibble: 150 x 1
##    Sepal.Length
##           <dbl>
##  1          5.1
##  2          4.9
##  3          4.7
##  4          4.6
##  5          5  
##  6          5.4
##  7          4.6
##  8          5  
##  9          4.4
## 10          4.9
## # ... with 140 more rows
select(iris, !!!syms(c("Sepal.Width", "Sepal.Length"))) %>% as_tibble()
## # A tibble: 150 x 2
##    Sepal.Width Sepal.Length
##          <dbl>        <dbl>
##  1         3.5          5.1
##  2         3            4.9
##  3         3.2          4.7
##  4         3.1          4.6
##  5         3.6          5  
##  6         3.9          5.4
##  7         3.4          4.6
##  8         3.4          5  
##  9         2.9          4.4
## 10         3.1          4.9
## # ... with 140 more rows
filter(iris, !!sym("Sepal.Width") > 4) %>% as_tibble()
## # A tibble: 3 x 5
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
## 1          5.7         4.4          1.5         0.4 setosa 
## 2          5.2         4.1          1.5         0.1 setosa 
## 3          5.5         4.2          1.4         0.2 setosa
filter(iris, !!sym("Sepal.Width") == "virginica") %>% as_tibble()
## # A tibble: 0 x 5
## # ... with 5 variables: Sepal.Length <dbl>, Sepal.Width <dbl>,
## #   Petal.Length <dbl>, Petal.Width <dbl>, Species <fct>
mutate(iris, ratio=!!sym("Sepal.Length") / !!sym("Sepal.Width")) %>% as_tibble()
## # A tibble: 150 x 6
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species ratio
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>   <dbl>
##  1          5.1         3.5          1.4         0.2 setosa   1.46
##  2          4.9         3            1.4         0.2 setosa   1.63
##  3          4.7         3.2          1.3         0.2 setosa   1.47
##  4          4.6         3.1          1.5         0.2 setosa   1.48
##  5          5           3.6          1.4         0.2 setosa   1.39
##  6          5.4         3.9          1.7         0.4 setosa   1.38
##  7          4.6         3.4          1.4         0.3 setosa   1.35
##  8          5           3.4          1.5         0.2 setosa   1.47
##  9          4.4         2.9          1.4         0.2 setosa   1.52
## 10          4.9         3.1          1.5         0.1 setosa   1.58
## # ... with 140 more rows
arrange(iris, !!sym("Sepal.Length")) %>% as_tibble()
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1          4.3         3            1.1         0.1 setosa 
##  2          4.4         2.9          1.4         0.2 setosa 
##  3          4.4         3            1.3         0.2 setosa 
##  4          4.4         3.2          1.3         0.2 setosa 
##  5          4.5         2.3          1.3         0.3 setosa 
##  6          4.6         3.1          1.5         0.2 setosa 
##  7          4.6         3.4          1.4         0.3 setosa 
##  8          4.6         3.6          1           0.2 setosa 
##  9          4.6         3.2          1.4         0.2 setosa 
## 10          4.7         3.2          1.3         0.2 setosa 
## # ... with 140 more rows
arrange(iris, desc(!!sym("Sepal.Length"))) %>% as_tibble()
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>    
##  1          7.9         3.8          6.4         2   virginica
##  2          7.7         3.8          6.7         2.2 virginica
##  3          7.7         2.6          6.9         2.3 virginica
##  4          7.7         2.8          6.7         2   virginica
##  5          7.7         3            6.1         2.3 virginica
##  6          7.6         3            6.6         2.1 virginica
##  7          7.4         2.8          6.1         1.9 virginica
##  8          7.3         2.9          6.3         1.8 virginica
##  9          7.2         3.6          6.1         2.5 virginica
## 10          7.2         3.2          6           1.8 virginica
## # ... with 140 more rows
summarise(iris, mean(!!sym("Sepal.Length"), na.rm = T))
##   mean(Sepal.Length, na.rm = T)
## 1                      5.843333
ggplot(iris, aes(x = !!sym("Sepal.Width"), y = !!sym("Sepal.Length"), colour = !!sym("Species"))) +
  geom_point()