dplyr training 2The purpose of this noteboook is to illustrate how the dplyr package can be used to perform intermediate data manipulation tasks.
library(dplyr)
starwars
## # A tibble: 87 x 13
## name height mass hair_color skin_color eye_color birth_year gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 Luke Sk… 172 77 blond fair blue 19 male
## 2 C-3PO 167 75 <NA> gold yellow 112 <NA>
## 3 R2-D2 96 32 <NA> white, bl… red 33 <NA>
## 4 Darth V… 202 136 none white yellow 41.9 male
## 5 Leia Or… 150 49 brown light brown 19 female
## 6 Owen La… 178 120 brown, gr… light blue 52 male
## 7 Beru Wh… 165 75 brown light blue 47 female
## 8 R5-D4 97 32 <NA> white, red red NA <NA>
## 9 Biggs D… 183 84 black light brown 24 male
## 10 Obi-Wan… 182 77 auburn, w… fair blue-gray 57 male
## # ... with 77 more rows, and 5 more variables: homeworld <chr>,
## # species <chr>, films <list>, vehicles <list>, starships <list>
band_members
## # A tibble: 3 x 2
## name band
## <chr> <chr>
## 1 Mick Stones
## 2 John Beatles
## 3 Paul Beatles
band_instruments
## # A tibble: 3 x 2
## name plays
## <chr> <chr>
## 1 John guitar
## 2 Paul bass
## 3 Keith guitar
band_instruments2 <- rename(band_instruments, artist = name)
band_instruments2
## # A tibble: 3 x 2
## artist plays
## <chr> <chr>
## 1 John guitar
## 2 Paul bass
## 3 Keith guitar
full_join(band_members, band_instruments, by = "name")
## # A tibble: 4 x 3
## name band plays
## <chr> <chr> <chr>
## 1 Mick Stones <NA>
## 2 John Beatles guitar
## 3 Paul Beatles bass
## 4 Keith <NA> guitar
inner_join(band_members, band_instruments, by = "name")
## # A tibble: 2 x 3
## name band plays
## <chr> <chr> <chr>
## 1 John Beatles guitar
## 2 Paul Beatles bass
left_join(band_members, band_instruments, by = "name")
## # A tibble: 3 x 3
## name band plays
## <chr> <chr> <chr>
## 1 Mick Stones <NA>
## 2 John Beatles guitar
## 3 Paul Beatles bass
right_join(band_members, band_instruments, by = "name")
## # A tibble: 3 x 3
## name band plays
## <chr> <chr> <chr>
## 1 John Beatles guitar
## 2 Paul Beatles bass
## 3 Keith <NA> guitar
inner_join(band_members, band_instruments2, by = c("name" = "artist"))
## # A tibble: 2 x 3
## name band plays
## <chr> <chr> <chr>
## 1 John Beatles guitar
## 2 Paul Beatles bass
band_members2 <- semi_join(band_members, band_instruments, by = "name")
band_members2
## # A tibble: 2 x 2
## name band
## <chr> <chr>
## 1 John Beatles
## 2 Paul Beatles
anti_join(band_members, band_instruments, by = "name")
## # A tibble: 1 x 2
## name band
## <chr> <chr>
## 1 Mick Stones
band_members2 <- bind_rows(band_members2, data.frame(name = "Ringo", band = "Beatles")) #use rbind if bind_rows too fussy
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
band_members2
## # A tibble: 3 x 2
## name band
## <chr> <chr>
## 1 John Beatles
## 2 Paul Beatles
## 3 Ringo Beatles
bind_cols(band_members2, singing=c(TRUE, TRUE, FALSE)) #use cbind if bind_cols too fussy
## # A tibble: 3 x 3
## name band singing
## <chr> <chr> <lgl>
## 1 John Beatles TRUE
## 2 Paul Beatles TRUE
## 3 Ringo Beatles FALSE
band_members
## # A tibble: 3 x 2
## name band
## <chr> <chr>
## 1 Mick Stones
## 2 John Beatles
## 3 Paul Beatles
band_members2
## # A tibble: 3 x 2
## name band
## <chr> <chr>
## 1 John Beatles
## 2 Paul Beatles
## 3 Ringo Beatles
union(band_members, band_members2)
## # A tibble: 4 x 2
## name band
## <chr> <chr>
## 1 Ringo Beatles
## 2 Paul Beatles
## 3 John Beatles
## 4 Mick Stones
intersect(band_members, band_members2)
## # A tibble: 2 x 2
## name band
## <chr> <chr>
## 1 John Beatles
## 2 Paul Beatles
setdiff(band_members, band_members2)
## # A tibble: 1 x 2
## name band
## <chr> <chr>
## 1 Mick Stones
setdiff(band_members2, band_members)
## # A tibble: 1 x 2
## name band
## <chr> <chr>
## 1 Ringo Beatles
starwars_subset <- select(starwars, name, mass, height, eye_color)
starwars_subset
## # A tibble: 87 x 4
## name mass height eye_color
## <chr> <dbl> <int> <chr>
## 1 Luke Skywalker 77 172 blue
## 2 C-3PO 75 167 yellow
## 3 R2-D2 32 96 red
## 4 Darth Vader 136 202 yellow
## 5 Leia Organa 49 150 brown
## 6 Owen Lars 120 178 blue
## 7 Beru Whitesun lars 75 165 blue
## 8 R5-D4 32 97 red
## 9 Biggs Darklighter 84 183 brown
## 10 Obi-Wan Kenobi 77 182 blue-gray
## # ... with 77 more rows
starwars_height <- select(starwars, name, height)
mutate(starwars_height, id = row_number())
## # A tibble: 87 x 3
## name height id
## <chr> <int> <int>
## 1 Luke Skywalker 172 1
## 2 C-3PO 167 2
## 3 R2-D2 96 3
## 4 Darth Vader 202 4
## 5 Leia Organa 150 5
## 6 Owen Lars 178 6
## 7 Beru Whitesun lars 165 7
## 8 R5-D4 97 8
## 9 Biggs Darklighter 183 9
## 10 Obi-Wan Kenobi 182 10
## # ... with 77 more rows
mutate(starwars_height, id = row_number(height))
## # A tibble: 87 x 3
## name height id
## <chr> <int> <int>
## 1 Luke Skywalker 172 29
## 2 C-3PO 167 21
## 3 R2-D2 96 5
## 4 Darth Vader 202 72
## 5 Leia Organa 150 11
## 6 Owen Lars 178 35
## 7 Beru Whitesun lars 165 17
## 8 R5-D4 97 7
## 9 Biggs Darklighter 183 45
## 10 Obi-Wan Kenobi 182 44
## # ... with 77 more rows
mutate(starwars_height, height_rank = min_rank(height))
## # A tibble: 87 x 3
## name height height_rank
## <chr> <int> <int>
## 1 Luke Skywalker 172 29
## 2 C-3PO 167 21
## 3 R2-D2 96 5
## 4 Darth Vader 202 72
## 5 Leia Organa 150 11
## 6 Owen Lars 178 35
## 7 Beru Whitesun lars 165 17
## 8 R5-D4 97 7
## 9 Biggs Darklighter 183 45
## 10 Obi-Wan Kenobi 182 44
## # ... with 77 more rows
mutate(starwars_height, height_rank = dense_rank(height))
## # A tibble: 87 x 3
## name height height_rank
## <chr> <int> <int>
## 1 Luke Skywalker 172 20
## 2 C-3PO 167 16
## 3 R2-D2 96 5
## 4 Darth Vader 202 37
## 5 Leia Organa 150 10
## 6 Owen Lars 178 24
## 7 Beru Whitesun lars 165 14
## 8 R5-D4 97 6
## 9 Biggs Darklighter 183 27
## 10 Obi-Wan Kenobi 182 26
## # ... with 77 more rows
mutate(starwars_height, height_rank = percent_rank(height))
## # A tibble: 87 x 3
## name height height_rank
## <chr> <int> <dbl>
## 1 Luke Skywalker 172 0.35
## 2 C-3PO 167 0.25
## 3 R2-D2 96 0.05
## 4 Darth Vader 202 0.887
## 5 Leia Organa 150 0.125
## 6 Owen Lars 178 0.425
## 7 Beru Whitesun lars 165 0.2
## 8 R5-D4 97 0.075
## 9 Biggs Darklighter 183 0.55
## 10 Obi-Wan Kenobi 182 0.538
## # ... with 77 more rows
mutate(starwars_height, height_rank = cume_dist(height))
## # A tibble: 87 x 3
## name height height_rank
## <chr> <int> <dbl>
## 1 Luke Skywalker 172 0.358
## 2 C-3PO 167 0.272
## 3 R2-D2 96 0.0741
## 4 Darth Vader 202 0.889
## 5 Leia Organa 150 0.148
## 6 Owen Lars 178 0.469
## 7 Beru Whitesun lars 165 0.235
## 8 R5-D4 97 0.0864
## 9 Biggs Darklighter 183 0.630
## 10 Obi-Wan Kenobi 182 0.543
## # ... with 77 more rows
mutate(starwars_height, height_decile = ntile(height, 10))
## # A tibble: 87 x 3
## name height height_decile
## <chr> <int> <int>
## 1 Luke Skywalker 172 4
## 2 C-3PO 167 3
## 3 R2-D2 96 1
## 4 Darth Vader 202 9
## 5 Leia Organa 150 2
## 6 Owen Lars 178 5
## 7 Beru Whitesun lars 165 2
## 8 R5-D4 97 1
## 9 Biggs Darklighter 183 6
## 10 Obi-Wan Kenobi 182 6
## # ... with 77 more rows
mutate(starwars_height, height_quartile = ntile(height, 4))
## # A tibble: 87 x 3
## name height height_quartile
## <chr> <int> <int>
## 1 Luke Skywalker 172 2
## 2 C-3PO 167 1
## 3 R2-D2 96 1
## 4 Darth Vader 202 4
## 5 Leia Organa 150 1
## 6 Owen Lars 178 2
## 7 Beru Whitesun lars 165 1
## 8 R5-D4 97 1
## 9 Biggs Darklighter 183 3
## 10 Obi-Wan Kenobi 182 3
## # ... with 77 more rows
mutate(starwars_height, height_100_to_150 = between(height, 100, 150))
## # A tibble: 87 x 3
## name height height_100_to_150
## <chr> <int> <lgl>
## 1 Luke Skywalker 172 FALSE
## 2 C-3PO 167 FALSE
## 3 R2-D2 96 FALSE
## 4 Darth Vader 202 FALSE
## 5 Leia Organa 150 TRUE
## 6 Owen Lars 178 FALSE
## 7 Beru Whitesun lars 165 FALSE
## 8 R5-D4 97 FALSE
## 9 Biggs Darklighter 183 FALSE
## 10 Obi-Wan Kenobi 182 FALSE
## # ... with 77 more rows
mutate(starwars_height, height_cumsum = cumsum(height))
## # A tibble: 87 x 3
## name height height_cumsum
## <chr> <int> <int>
## 1 Luke Skywalker 172 172
## 2 C-3PO 167 339
## 3 R2-D2 96 435
## 4 Darth Vader 202 637
## 5 Leia Organa 150 787
## 6 Owen Lars 178 965
## 7 Beru Whitesun lars 165 1130
## 8 R5-D4 97 1227
## 9 Biggs Darklighter 183 1410
## 10 Obi-Wan Kenobi 182 1592
## # ... with 77 more rows
mutate(starwars_height, height_cummean = cummean(height))
## # A tibble: 87 x 3
## name height height_cummean
## <chr> <int> <dbl>
## 1 Luke Skywalker 172 172
## 2 C-3PO 167 170.
## 3 R2-D2 96 145
## 4 Darth Vader 202 159.
## 5 Leia Organa 150 157.
## 6 Owen Lars 178 161.
## 7 Beru Whitesun lars 165 161.
## 8 R5-D4 97 153.
## 9 Biggs Darklighter 183 157.
## 10 Obi-Wan Kenobi 182 159.
## # ... with 77 more rows
mutate(starwars_height, height_cummax = cummax(height))
## # A tibble: 87 x 3
## name height height_cummax
## <chr> <int> <int>
## 1 Luke Skywalker 172 172
## 2 C-3PO 167 172
## 3 R2-D2 96 172
## 4 Darth Vader 202 202
## 5 Leia Organa 150 202
## 6 Owen Lars 178 202
## 7 Beru Whitesun lars 165 202
## 8 R5-D4 97 202
## 9 Biggs Darklighter 183 202
## 10 Obi-Wan Kenobi 182 202
## # ... with 77 more rows