# df - R displays entire df
mtcars
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
library(tibble)
# easier to inspect a small portion, or use View to see all
as.tibble(mtcars)
## # A tibble: 32 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
## 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
## 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
## 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
## 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
## 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
## 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
## 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
## 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
## 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
## # ... with 22 more rows
join: to augment a data frame with information from another data frame
create datasets to be used
artistsKH <- tibble(first = c("Jimmy", "George", "Mick", "Tom", "Davy", "John", "Paul", "Jimmy", "Joe", "Elvis", "Keith", "Paul", "Ringo", "Joe", "Brian", "Nancy"), last = c("Buffett", "Harrison", "Jagger", "Jones", "Jones", "Lennon", "McCartney", "Page", "Perry", "Presley", "Richards", "Simon", "Starr", "Walsh", "Wilson", "Wilson"), instrument = c("Guitar", "Guitar", "Vocals", "Vocals", "Vocals", "Guitar", "Bass", "Guitar", "Guitar", "Vocals", "Guitar", "Guitar", "Drums", "Guitar", "Vocals", "Vocals"))
# artistsKH == artists - verified in line with what was given
artists <- artistsKH
bandsKH <- tibble(first = c("John", "John Paul", "Jimmy", "Robert", "George", "John", "Paul", "Ringo", "Jimmy", "Mick", "Keith", "Charlie", "Ronnie"), last = c("Bonham", "Jones", "Page", "Plant", "Harrison", "Lennon", "McCartney", "Starr", "Buffett", "Jagger", "Richards", "Watts", "Wood"), band = c("Led Zeppelin", "Led Zeppelin", "Led Zeppelin", "Led Zeppelin", "The Beatles", "The Beatles", "The Beatles", "The Beatles", "The Coral Reefers", "The Rolling Stones", "The Rolling Stones", "The Rolling Stones", "The Rolling Stones"))
# bandsKH == bands - verified in line with what was given
bands <- bandsKH
e.g. left_join(x, y) joins y to x – second dataset you specify is joined to the first dataset
_join(x, y) joins y to xjoin artists to bands
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
bands2 <- left_join(bands, artists, by = c("first", "last"))
bands2 # always check the output of your joins - to verify in line with expectations
## # A tibble: 13 x 4
## first last band instrument
## <chr> <chr> <chr> <chr>
## 1 John Bonham Led Zeppelin <NA>
## 2 John Paul Jones Led Zeppelin <NA>
## 3 Jimmy Page Led Zeppelin Guitar
## 4 Robert Plant Led Zeppelin <NA>
## 5 George Harrison The Beatles Guitar
## 6 John Lennon The Beatles Guitar
## 7 Paul McCartney The Beatles Bass
## 8 Ringo Starr The Beatles Drums
## 9 Jimmy Buffett The Coral Reefers Guitar
## 10 Mick Jagger The Rolling Stones Vocals
## 11 Keith Richards The Rolling Stones Guitar
## 12 Charlie Watts The Rolling Stones <NA>
## 13 Ronnie Wood The Rolling Stones <NA>
always double check the output
left_join(bands, artists, by = "first") # join is not in line with expectations!
## # A tibble: 16 x 5
## first last.x band last.y instrument
## <chr> <chr> <chr> <chr> <chr>
## 1 John Bonham Led Zeppelin Lennon Guitar
## 2 John Paul Jones Led Zeppelin <NA> <NA>
## 3 Jimmy Page Led Zeppelin Buffett Guitar
## 4 Jimmy Page Led Zeppelin Page Guitar
## 5 Robert Plant Led Zeppelin <NA> <NA>
## 6 George Harrison The Beatles Harrison Guitar
## 7 John Lennon The Beatles Lennon Guitar
## 8 Paul McCartney The Beatles McCartney Bass
## 9 Paul McCartney The Beatles Simon Guitar
## 10 Ringo Starr The Beatles Starr Drums
## 11 Jimmy Buffett The Coral Reefers Buffett Guitar
## 12 Jimmy Buffett The Coral Reefers Page Guitar
## 13 Mick Jagger The Rolling Stones Jagger Vocals
## 14 Keith Richards The Rolling Stones Richards Guitar
## 15 Charlie Watts The Rolling Stones <NA> <NA>
## 16 Ronnie Wood The Rolling Stones <NA> <NA>
_join(x, y) joins y to x# right_join() compared to left_join above
# order of the datasets reversed
bands3 <- right_join(artists, bands, by = c("first", "last"))
# NB: each used the same by argument.
# Check: bands3 == bands2
setequal(bands3, bands2)
## TRUE
create datasets to be used
albumsKH <- tibble(
album = c("A Hard Day's Night", "Magical Mystery Tour", "Beggar's Banquet", "Abbey Road", "Led Zeppelin IV", "The Dark Side of the Moon", "Aerosmith", "Rumours", "Hotel California"
),
band = c("The Beatles", "The Beatles", "The Rolling Stones", "The Beatles", "Led Zeppelin", "Pink Floyd", "Aerosmith", "Fleetwood Mac", "Eagles"
),
year = c(1964,1967,1968,1969,1971,1973,1973,1977,1982
)
)
# albumsKH == albums - verified in line with what was given
albums <- albumsKH
songsKH <- tibble(
song = c("Come Together", "Dream On", "Hello, Goodbye", "It's Not Unusual"
),
album = c("Abbey Road", "Aerosmith", "Magical Mystery Tour", "Along Came Jones"
),
first = c("John", "Steven", "Paul", "Tom"
),
last = c("Lennon", "Tyler", "McCartney", "Jones"
)
)
# songsKH == songs - verified in line with what was given
songs <- songsKH
most strict type of join: they only retain observations that appear in both datasets. i.e. returns only rows that contain information about both songs and albums.
_join(x, y) joins y to x# Join albums to songs
inner_join(songs, albums, by = "album")
## # A tibble: 3 x 6
## song album first last band year
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 Come Together Abbey Road John Lennon The Beatles 1969
## 2 Dream On Aerosmith Steven Tyler Aerosmith 1973
## 3 Hello, Goodbye Magical Mystery Tour Paul McCartney The Beatles 1967
most permissive type of join: they return all of the data that appears in both datasets (often resulting in many missing values).
_join(x, y) joins y to x# Join bands to artists
full_join(artists, bands, by = c("first", "last"))
## # A tibble: 21 x 4
## first last instrument band
## <chr> <chr> <chr> <chr>
## 1 Jimmy Buffett Guitar The Coral Reefers
## 2 George Harrison Guitar The Beatles
## 3 Mick Jagger Vocals The Rolling Stones
## 4 Tom Jones Vocals <NA>
## 5 Davy Jones Vocals <NA>
## 6 John Lennon Guitar The Beatles
## 7 Paul McCartney Bass The Beatles
## 8 Jimmy Page Guitar Led Zeppelin
## 9 Joe Perry Guitar <NA>
## 10 Elvis Presley Vocals <NA>
## # ... with 11 more rows
Pipes are so efficient for multi-step analysis
full_join(artists, bands,
by = c("first", "last"))
## # A tibble: 21 x 4
## first last instrument band
## <chr> <chr> <chr> <chr>
## 1 Jimmy Buffett Guitar The Coral Reefers
## 2 George Harrison Guitar The Beatles
## 3 Mick Jagger Vocals The Rolling Stones
## 4 Tom Jones Vocals <NA>
## 5 Davy Jones Vocals <NA>
## 6 John Lennon Guitar The Beatles
## 7 Paul McCartney Bass The Beatles
## 8 Jimmy Page Guitar Led Zeppelin
## 9 Joe Perry Guitar <NA>
## 10 Elvis Presley Vocals <NA>
## # ... with 11 more rows
artists %>%
full_join(bands, by = c("first", "last"))
## # A tibble: 21 x 4
## first last instrument band
## <chr> <chr> <chr> <chr>
## 1 Jimmy Buffett Guitar The Coral Reefers
## 2 George Harrison Guitar The Beatles
## 3 Mick Jagger Vocals The Rolling Stones
## 4 Tom Jones Vocals <NA>
## 5 Davy Jones Vocals <NA>
## 6 John Lennon Guitar The Beatles
## 7 Paul McCartney Bass The Beatles
## 8 Jimmy Page Guitar Led Zeppelin
## 9 Joe Perry Guitar <NA>
## 10 Elvis Presley Vocals <NA>
## # ... with 11 more rows
temp <- left_join(bands, artists, by = c("first", "last"))
temp <- filter(temp, instrument == "Guitar")
select(temp, first, last, band)
## # A tibble: 5 x 3
## first last band
## <chr> <chr> <chr>
## 1 Jimmy Page Led Zeppelin
## 2 George Harrison The Beatles
## 3 John Lennon The Beatles
## 4 Jimmy Buffett The Coral Reefers
## 5 Keith Richards The Rolling Stones
# Reproduce code above using pipes
# pipe %>% should be used three times and temp zero times.
bands %>%
left_join(artists, by = c("first", "last")) %>%
filter(instrument == "Guitar") %>%
select(first, last, band)
## # A tibble: 5 x 3
## first last band
## <chr> <chr> <chr>
## 1 Jimmy Page Led Zeppelin
## 2 George Harrison The Beatles
## 3 John Lennon The Beatles
## 4 Jimmy Buffett The Coral Reefers
## 5 Keith Richards The Rolling Stones
goal2 <- artists %>%
full_join(bands) %>%
inner_join(songs, by = c("first", "last"))
## Joining, by = c("first", "last")
# Create one table that combines all information
artists %>%
full_join(bands, by = c("first", "last")) %>%
full_join(songs, by = c("first", "last")) %>%
full_join(albums, by = c("album", "band"))
## # A tibble: 29 x 7
## first last instrument band song album year
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 Jimmy Buffett Guitar The Coral Reefers <NA> <NA> NA
## 2 George Harrison Guitar The Beatles <NA> <NA> NA
## 3 Mick Jagger Vocals The Rolling Stones <NA> <NA> NA
## 4 Tom Jones Vocals <NA> It's No~ Along Ca~ NA
## 5 Davy Jones Vocals <NA> <NA> <NA> NA
## 6 John Lennon Guitar The Beatles Come To~ Abbey Ro~ 1969
## 7 Paul McCartney Bass The Beatles Hello, ~ Magical ~ 1967
## 8 Jimmy Page Guitar Led Zeppelin <NA> <NA> NA
## 9 Joe Perry Guitar <NA> <NA> <NA> NA
## 10 Elvis Presley Vocals <NA> <NA> <NA> NA
## # ... with 19 more rows
provide a concise way to filter data from the first dataset based on information in a second dataset
# View the output of semi_join()
artists %>%
semi_join(songs, by = c("first", "last"))
## # A tibble: 3 x 3
## first last instrument
## <chr> <chr> <chr>
## 1 Tom Jones Vocals
## 2 John Lennon Guitar
## 3 Paul McCartney Bass
# Create the same result
artists %>%
right_join(songs, by = c("first", "last")) %>%
filter(!is.na(instrument)) %>%
select(first, last, instrument)
## # A tibble: 3 x 3
## first last instrument
## <chr> <chr> <chr>
## 1 John Lennon Guitar
## 2 Paul McCartney Bass
## 3 Tom Jones Vocals
albums %>%
# Collect the albums made by a band
semi_join(bands) %>%
# Count the albums made by a band
nrow()
## Joining, by = "band"
## [1] 5
useful way to reason about how a mutating join will work before you apply the join.
# Return rows of artists that don't have bands info
artists %>%
# anti_join(bands, by c("first", "last")) # INCORRECT
anti_join(bands, by = c("first", "last")) # CORRECT
## # A tibble: 8 x 3
## first last instrument
## <chr> <chr> <chr>
## 1 Tom Jones Vocals
## 2 Davy Jones Vocals
## 3 Joe Perry Guitar
## 4 Elvis Presley Vocals
## 5 Paul Simon Guitar
## 6 Joe Walsh Guitar
## 7 Brian Wilson Vocals
## 8 Nancy Wilson Vocals
create datasets to be used
labels <- tibble(
album = c("Abbey Road", "A Hard Days Night", "Magical Mystery Tour", "Led Zeppelin IV", "The Dark Side of the Moon", "Hotel California", "Rumours", "Aerosmith", "Beggar's Banquet"
),
label = c("Apple", "Parlophone", "Parlophone", "Atlantic", "Harvest", "Asylum", "Warner Brothers", "Columbia", "Decca"
)
)
# Check whether album names in labels are mis-entered
# i.e. capitalization or spelling errors in the keys
labels %>%
anti_join(albums, by = "album")
## # A tibble: 1 x 2
## album label
## <chr> <chr>
## 1 A Hard Days Night Parlophone
How many rows in songs match a label in labels?
labels
## # A tibble: 9 x 2
## album label
## <chr> <chr>
## 1 Abbey Road Apple
## 2 A Hard Days Night Parlophone
## 3 Magical Mystery Tour Parlophone
## 4 Led Zeppelin IV Atlantic
## 5 The Dark Side of the Moon Harvest
## 6 Hotel California Asylum
## 7 Rumours Warner Brothers
## 8 Aerosmith Columbia
## 9 Beggar's Banquet Decca
songs
## # A tibble: 4 x 4
## song album first last
## <chr> <chr> <chr> <chr>
## 1 Come Together Abbey Road John Lennon
## 2 Dream On Aerosmith Steven Tyler
## 3 Hello, Goodbye Magical Mystery Tour Paul McCartney
## 4 It's Not Unusual Along Came Jones Tom Jones
songs %>%
# Find the rows of songs that match a row in labels
semi_join(labels, by = "album") %>%
# Number of matches between labels and songs
nrow()
## [1] 3