library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tibble)
create datasets to be used
# three albums from the band Aerosmith
aerosmith <- tibble(
song = c("Make It", "Somebody", "Dream On", "One Way Street", "Mama Kin", "Write me a Letter", "Moving Out", "Walking the Dog"),
length = c(13260, 13500, 16080, 25200, 15900, 15060, 18180, 11520)
)
greatest_hits <- tibble(
song = c("Dream On", "Mama Kin", "Same Old Song and Dance", "Seasons of Winter", "Sweet Emotion", "Walk this Way", "Big Ten Inch Record", "Last Child", "Back in the Saddle", "Draw the Line", "Kings and Queens", "Come Together", "Remember (Walking in the Sand)", "Lightning Strikes", "Chip Away the Stone", "Sweet Emotion (remix)", "One Way Street (live)"),
length = c(16080, 16020, 11040, 17820, 11700, 12780, 8100, 12480, 16860, 12240, 13680, 13620, 14700, 16080, 14460, 16560, 24000)
)
live <- tibble(
song = c("Back in the Saddle", " Sweet Emotion", " Lord of the Thighs", " Toys in the Attic", " Last Child", " Come Together", " Walk this Way", " Sick as a Dog", " Dream On", " Chip Away the Stone", " Sight for Sore Eyes", " Mama Kin", " S.O.S. (Too Bad)", " I Ain't Got You", " Mother Popcorn/Draw the Line", " Train Kept A-Rollin'/Strangers in the Night"),
length = c(15900, 16920, 26280, 13500, 12240, 17460, 13560, 16920, 16260, 15120, 11880, 13380, 9960, 14220, 41700, 17460)
)
useful if same rows across datasets
How many unique songs do these two albums contain in total?
aerosmith %>%
# Create the new dataset using a set operation
union(greatest_hits) %>% # i.e. every song
# Count the total number of songs
nrow()
## [1] 24
Which songs from Aerosmith made it onto Greatest Hits?
# Create the new dataset using a set operation
# containing every track that is in both Aerosmith and Greatest Hits
aerosmith %>%
intersect(greatest_hits)
## # A tibble: 1 x 2
## song length
## <chr> <dbl>
## 1 Dream On 16080
Which songs are on Live! Bootleg but not on Greatest Hits?
# To select just the x column from a dataset called data, use select(data, x)
# Select the song names from live
live_songs <- live %>% select(song)
# Select the song names from greatest_hits
greatest_songs <- greatest_hits %>% select(song)
# Create the new dataset using set operation
live_songs %>%
setdiff(greatest_songs)
## # A tibble: 15 x 1
## song
## <chr>
## 1 "\tSweet Emotion"
## 2 "\tLord of the Thighs"
## 3 "\tToys in the Attic"
## 4 "\tLast Child"
## 5 "\tCome Together"
## 6 "\tWalk this Way"
## 7 "\tSick as a Dog"
## 8 "\tDream On"
## 9 "\tChip Away the Stone"
## 10 "\tSight for Sore Eyes"
## 11 "\tMama Kin"
## 12 "\tS.O.S. (Too Bad)"
## 13 "\tI Ain't Got You"
## 14 "\tMother Popcorn/Draw the Line"
## 15 "\tTrain Kept A-Rollin'/Strangers in the Night"
find rows that appear in one data frame or another, but not both
Which songs appear on one of Live! Bootleg or Greatest Hits, but not both?"
# Select songs from live and greatest_hits
live_songs <- live %>% select(song)
greatest_songs <- greatest_hits %>% select(song)
# Return the songs that only exist in one dataset
# you won't be able to do this with a single pipe
# find the songs that exist in live_songs and/or greatest_songs
all_songs <- live_songs %>% union(greatest_songs)
# find the songs that exist in the intersection with intersect()
common_songs <- live_songs %>% intersect(greatest_songs)
# find all the songs that are in one of the two datasets, but not both
all_songs %>% setdiff(common_songs)
## # A tibble: 31 x 1
## song
## <chr>
## 1 Dream On
## 2 "\tDream On"
## 3 Mama Kin
## 4 One Way Street (live)
## 5 "\tChip Away the Stone"
## 6 "\tSick as a Dog"
## 7 Big Ten Inch Record
## 8 "\tSweet Emotion"
## 9 Sweet Emotion (remix)
## 10 Draw the Line
## # ... with 21 more rows
When your datasets contain the same variables, a setdiff() does the same thing as an anti_join() that uses every column as a key.