1. Purpose.

The purpose of this noteboook is to illustrate how the dplyr package can be used to perform intermediate data manipulation tasks.

2. Load libraries and view practice dataset.

library(dplyr)
starwars
## # A tibble: 87 x 13
##    name     height  mass hair_color skin_color eye_color birth_year gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> 
##  1 Luke Sk…    172    77 blond      fair       blue            19   male  
##  2 C-3PO       167    75 <NA>       gold       yellow         112   <NA>  
##  3 R2-D2        96    32 <NA>       white, bl… red             33   <NA>  
##  4 Darth V…    202   136 none       white      yellow          41.9 male  
##  5 Leia Or…    150    49 brown      light      brown           19   female
##  6 Owen La…    178   120 brown, gr… light      blue            52   male  
##  7 Beru Wh…    165    75 brown      light      blue            47   female
##  8 R5-D4        97    32 <NA>       white, red red             NA   <NA>  
##  9 Biggs D…    183    84 black      light      brown           24   male  
## 10 Obi-Wan…    182    77 auburn, w… fair       blue-gray       57   male  
## # ... with 77 more rows, and 5 more variables: homeworld <chr>,
## #   species <chr>, films <list>, vehicles <list>, starships <list>
band_members
## # A tibble: 3 x 2
##   name  band   
##   <chr> <chr>  
## 1 Mick  Stones 
## 2 John  Beatles
## 3 Paul  Beatles
band_instruments
## # A tibble: 3 x 2
##   name  plays 
##   <chr> <chr> 
## 1 John  guitar
## 2 Paul  bass  
## 3 Keith guitar
band_instruments2 <- rename(band_instruments, artist = name)
band_instruments2
## # A tibble: 3 x 2
##   artist plays 
##   <chr>  <chr> 
## 1 John   guitar
## 2 Paul   bass  
## 3 Keith  guitar

3. Perform joins that mutate new columns.

full_join(band_members, band_instruments, by = "name")
## # A tibble: 4 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass  
## 4 Keith <NA>    guitar
inner_join(band_members, band_instruments, by = "name")
## # A tibble: 2 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass
left_join(band_members, band_instruments, by = "name")
## # A tibble: 3 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass
right_join(band_members, band_instruments, by = "name")
## # A tibble: 3 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass  
## 3 Keith <NA>    guitar
inner_join(band_members, band_instruments2, by = c("name" = "artist"))
## # A tibble: 2 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass

4. Perform joins that filter a dataset.

band_members2 <- semi_join(band_members, band_instruments, by = "name")
band_members2
## # A tibble: 2 x 2
##   name  band   
##   <chr> <chr>  
## 1 John  Beatles
## 2 Paul  Beatles
anti_join(band_members, band_instruments, by = "name")
## # A tibble: 1 x 2
##   name  band  
##   <chr> <chr> 
## 1 Mick  Stones

5. Bind rows or columns to a dataframe.

band_members2 <- bind_rows(band_members2, data.frame(name = "Ringo", band = "Beatles"))  #use rbind if bind_rows too fussy
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
band_members2
## # A tibble: 3 x 2
##   name  band   
##   <chr> <chr>  
## 1 John  Beatles
## 2 Paul  Beatles
## 3 Ringo Beatles
bind_cols(band_members2, singing=c(TRUE, TRUE, FALSE)) #use cbind if bind_cols too fussy
## # A tibble: 3 x 3
##   name  band    singing
##   <chr> <chr>   <lgl>  
## 1 John  Beatles TRUE   
## 2 Paul  Beatles TRUE   
## 3 Ringo Beatles FALSE

6. Perform set operations.

band_members
## # A tibble: 3 x 2
##   name  band   
##   <chr> <chr>  
## 1 Mick  Stones 
## 2 John  Beatles
## 3 Paul  Beatles
band_members2
## # A tibble: 3 x 2
##   name  band   
##   <chr> <chr>  
## 1 John  Beatles
## 2 Paul  Beatles
## 3 Ringo Beatles
union(band_members, band_members2)
## # A tibble: 4 x 2
##   name  band   
##   <chr> <chr>  
## 1 Ringo Beatles
## 2 Paul  Beatles
## 3 John  Beatles
## 4 Mick  Stones
intersect(band_members, band_members2)
## # A tibble: 2 x 2
##   name  band   
##   <chr> <chr>  
## 1 John  Beatles
## 2 Paul  Beatles
setdiff(band_members, band_members2)
## # A tibble: 1 x 2
##   name  band  
##   <chr> <chr> 
## 1 Mick  Stones
setdiff(band_members2, band_members)
## # A tibble: 1 x 2
##   name  band   
##   <chr> <chr>  
## 1 Ringo Beatles

7. Mutate new columns using window functions.

starwars_subset <- select(starwars, name, mass, height, eye_color)
starwars_subset
## # A tibble: 87 x 4
##    name                mass height eye_color
##    <chr>              <dbl>  <int> <chr>    
##  1 Luke Skywalker        77    172 blue     
##  2 C-3PO                 75    167 yellow   
##  3 R2-D2                 32     96 red      
##  4 Darth Vader          136    202 yellow   
##  5 Leia Organa           49    150 brown    
##  6 Owen Lars            120    178 blue     
##  7 Beru Whitesun lars    75    165 blue     
##  8 R5-D4                 32     97 red      
##  9 Biggs Darklighter     84    183 brown    
## 10 Obi-Wan Kenobi        77    182 blue-gray
## # ... with 77 more rows
starwars_height <- select(starwars, name, height)
mutate(starwars_height, id = row_number()) 
## # A tibble: 87 x 3
##    name               height    id
##    <chr>               <int> <int>
##  1 Luke Skywalker        172     1
##  2 C-3PO                 167     2
##  3 R2-D2                  96     3
##  4 Darth Vader           202     4
##  5 Leia Organa           150     5
##  6 Owen Lars             178     6
##  7 Beru Whitesun lars    165     7
##  8 R5-D4                  97     8
##  9 Biggs Darklighter     183     9
## 10 Obi-Wan Kenobi        182    10
## # ... with 77 more rows
mutate(starwars_height, id = row_number(height)) 
## # A tibble: 87 x 3
##    name               height    id
##    <chr>               <int> <int>
##  1 Luke Skywalker        172    29
##  2 C-3PO                 167    21
##  3 R2-D2                  96     5
##  4 Darth Vader           202    72
##  5 Leia Organa           150    11
##  6 Owen Lars             178    35
##  7 Beru Whitesun lars    165    17
##  8 R5-D4                  97     7
##  9 Biggs Darklighter     183    45
## 10 Obi-Wan Kenobi        182    44
## # ... with 77 more rows
mutate(starwars_height, height_rank = min_rank(height)) 
## # A tibble: 87 x 3
##    name               height height_rank
##    <chr>               <int>       <int>
##  1 Luke Skywalker        172          29
##  2 C-3PO                 167          21
##  3 R2-D2                  96           5
##  4 Darth Vader           202          72
##  5 Leia Organa           150          11
##  6 Owen Lars             178          35
##  7 Beru Whitesun lars    165          17
##  8 R5-D4                  97           7
##  9 Biggs Darklighter     183          45
## 10 Obi-Wan Kenobi        182          44
## # ... with 77 more rows
mutate(starwars_height, height_rank = dense_rank(height)) 
## # A tibble: 87 x 3
##    name               height height_rank
##    <chr>               <int>       <int>
##  1 Luke Skywalker        172          20
##  2 C-3PO                 167          16
##  3 R2-D2                  96           5
##  4 Darth Vader           202          37
##  5 Leia Organa           150          10
##  6 Owen Lars             178          24
##  7 Beru Whitesun lars    165          14
##  8 R5-D4                  97           6
##  9 Biggs Darklighter     183          27
## 10 Obi-Wan Kenobi        182          26
## # ... with 77 more rows
mutate(starwars_height, height_rank = percent_rank(height))
## # A tibble: 87 x 3
##    name               height height_rank
##    <chr>               <int>       <dbl>
##  1 Luke Skywalker        172       0.35 
##  2 C-3PO                 167       0.25 
##  3 R2-D2                  96       0.05 
##  4 Darth Vader           202       0.887
##  5 Leia Organa           150       0.125
##  6 Owen Lars             178       0.425
##  7 Beru Whitesun lars    165       0.2  
##  8 R5-D4                  97       0.075
##  9 Biggs Darklighter     183       0.55 
## 10 Obi-Wan Kenobi        182       0.538
## # ... with 77 more rows
mutate(starwars_height, height_rank = cume_dist(height)) 
## # A tibble: 87 x 3
##    name               height height_rank
##    <chr>               <int>       <dbl>
##  1 Luke Skywalker        172      0.358 
##  2 C-3PO                 167      0.272 
##  3 R2-D2                  96      0.0741
##  4 Darth Vader           202      0.889 
##  5 Leia Organa           150      0.148 
##  6 Owen Lars             178      0.469 
##  7 Beru Whitesun lars    165      0.235 
##  8 R5-D4                  97      0.0864
##  9 Biggs Darklighter     183      0.630 
## 10 Obi-Wan Kenobi        182      0.543 
## # ... with 77 more rows
mutate(starwars_height, height_decile = ntile(height, 10))
## # A tibble: 87 x 3
##    name               height height_decile
##    <chr>               <int>         <int>
##  1 Luke Skywalker        172             4
##  2 C-3PO                 167             3
##  3 R2-D2                  96             1
##  4 Darth Vader           202             9
##  5 Leia Organa           150             2
##  6 Owen Lars             178             5
##  7 Beru Whitesun lars    165             2
##  8 R5-D4                  97             1
##  9 Biggs Darklighter     183             6
## 10 Obi-Wan Kenobi        182             6
## # ... with 77 more rows
mutate(starwars_height, height_quartile = ntile(height, 4)) 
## # A tibble: 87 x 3
##    name               height height_quartile
##    <chr>               <int>           <int>
##  1 Luke Skywalker        172               2
##  2 C-3PO                 167               1
##  3 R2-D2                  96               1
##  4 Darth Vader           202               4
##  5 Leia Organa           150               1
##  6 Owen Lars             178               2
##  7 Beru Whitesun lars    165               1
##  8 R5-D4                  97               1
##  9 Biggs Darklighter     183               3
## 10 Obi-Wan Kenobi        182               3
## # ... with 77 more rows
mutate(starwars_height, height_100_to_150 = between(height, 100, 150))
## # A tibble: 87 x 3
##    name               height height_100_to_150
##    <chr>               <int> <lgl>            
##  1 Luke Skywalker        172 FALSE            
##  2 C-3PO                 167 FALSE            
##  3 R2-D2                  96 FALSE            
##  4 Darth Vader           202 FALSE            
##  5 Leia Organa           150 TRUE             
##  6 Owen Lars             178 FALSE            
##  7 Beru Whitesun lars    165 FALSE            
##  8 R5-D4                  97 FALSE            
##  9 Biggs Darklighter     183 FALSE            
## 10 Obi-Wan Kenobi        182 FALSE            
## # ... with 77 more rows
mutate(starwars_height, height_cumsum = cumsum(height))
## # A tibble: 87 x 3
##    name               height height_cumsum
##    <chr>               <int>         <int>
##  1 Luke Skywalker        172           172
##  2 C-3PO                 167           339
##  3 R2-D2                  96           435
##  4 Darth Vader           202           637
##  5 Leia Organa           150           787
##  6 Owen Lars             178           965
##  7 Beru Whitesun lars    165          1130
##  8 R5-D4                  97          1227
##  9 Biggs Darklighter     183          1410
## 10 Obi-Wan Kenobi        182          1592
## # ... with 77 more rows
mutate(starwars_height, height_cummean = cummean(height))
## # A tibble: 87 x 3
##    name               height height_cummean
##    <chr>               <int>          <dbl>
##  1 Luke Skywalker        172           172 
##  2 C-3PO                 167           170.
##  3 R2-D2                  96           145 
##  4 Darth Vader           202           159.
##  5 Leia Organa           150           157.
##  6 Owen Lars             178           161.
##  7 Beru Whitesun lars    165           161.
##  8 R5-D4                  97           153.
##  9 Biggs Darklighter     183           157.
## 10 Obi-Wan Kenobi        182           159.
## # ... with 77 more rows
mutate(starwars_height, height_cummax = cummax(height))
## # A tibble: 87 x 3
##    name               height height_cummax
##    <chr>               <int>         <int>
##  1 Luke Skywalker        172           172
##  2 C-3PO                 167           172
##  3 R2-D2                  96           172
##  4 Darth Vader           202           202
##  5 Leia Organa           150           202
##  6 Owen Lars             178           202
##  7 Beru Whitesun lars    165           202
##  8 R5-D4                  97           202
##  9 Biggs Darklighter     183           202
## 10 Obi-Wan Kenobi        182           202
## # ... with 77 more rows