library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tibble)

create datasets to be used


# three albums from the band Aerosmith

aerosmith <- tibble(
                   song = c("Make It", "Somebody", "Dream On", "One Way Street", "Mama Kin", "Write me a Letter", "Moving Out", "Walking the Dog"),

                   length = c(13260, 13500, 16080, 25200, 15900, 15060, 18180, 11520)
                   )

greatest_hits <- tibble(
                       song = c("Dream On", "Mama Kin", "Same Old Song and Dance", "Seasons of Winter", "Sweet Emotion", "Walk this Way", "Big Ten Inch Record", "Last Child", "Back in the Saddle", "Draw the Line", "Kings and Queens", "Come Together", "Remember (Walking in the Sand)", "Lightning Strikes", "Chip Away the Stone", "Sweet Emotion (remix)", "One Way Street (live)"),

                       length = c(16080, 16020, 11040, 17820, 11700, 12780, 8100, 12480, 16860, 12240, 13680, 13620, 14700, 16080, 14460, 16560, 24000)
                       )

live <- tibble(
                       song = c("Back in the Saddle", " Sweet Emotion", "   Lord of the Thighs", "  Toys in the Attic", "   Last Child", "  Come Together", "   Walk this Way", "   Sick as a Dog", "   Dream On", "    Chip Away the Stone", " Sight for Sore Eyes", " Mama Kin", "    S.O.S. (Too Bad)", "    I Ain't Got You", " Mother Popcorn/Draw the Line", "    Train Kept A-Rollin'/Strangers in the Night"),

                       length = c(15900,    16920,  26280,  13500,  12240,  17460,  13560,  16920,  16260,  15120,  11880,  13380,  9960,   14220,  41700,  17460)
                       )

set operations

useful if same rows across datasets

union()

How many unique songs do these two albums contain in total?

aerosmith %>% 
  # Create the new dataset using a set operation
  union(greatest_hits) %>% # i.e. every song
  # Count the total number of songs
  nrow()
## [1] 24

intersect()

Which songs from Aerosmith made it onto Greatest Hits?

# Create the new dataset using a set operation
# containing every track that is in both Aerosmith and Greatest Hits
aerosmith %>% 
  intersect(greatest_hits)
## # A tibble: 1 x 2
##   song     length
##   <chr>     <dbl>
## 1 Dream On  16080

setdiff()

Which songs are on Live! Bootleg but not on Greatest Hits?


# To select just the x column from a dataset called data, use select(data, x)

# Select the song names from live
live_songs <- live %>% select(song)

# Select the song names from greatest_hits
greatest_songs <- greatest_hits %>% select(song)

# Create the new dataset using set operation
live_songs %>% 
  setdiff(greatest_songs)
## # A tibble: 15 x 1
##    song                                           
##    <chr>                                          
##  1 "\tSweet Emotion"                              
##  2 "\tLord of the Thighs"                         
##  3 "\tToys in the Attic"                          
##  4 "\tLast Child"                                 
##  5 "\tCome Together"                              
##  6 "\tWalk this Way"                              
##  7 "\tSick as a Dog"                              
##  8 "\tDream On"                                   
##  9 "\tChip Away the Stone"                        
## 10 "\tSight for Sore Eyes"                        
## 11 "\tMama Kin"                                   
## 12 "\tS.O.S. (Too Bad)"                           
## 13 "\tI Ain't Got You"                            
## 14 "\tMother Popcorn/Draw the Line"               
## 15 "\tTrain Kept A-Rollin'/Strangers in the Night"

Multiple Operations

find rows that appear in one data frame or another, but not both

Which songs appear on one of Live! Bootleg or Greatest Hits, but not both?"

# Select songs from live and greatest_hits
live_songs <- live %>% select(song)
greatest_songs <- greatest_hits %>% select(song)

# Return the songs that only exist in one dataset
# you won't be able to do this with a single pipe

# find the songs that exist in live_songs and/or greatest_songs
all_songs <- live_songs %>% union(greatest_songs)

# find the songs that exist in the intersection with intersect()
common_songs <- live_songs %>% intersect(greatest_songs)

# find all the songs that are in one of the two datasets, but not both
all_songs %>% setdiff(common_songs)
## # A tibble: 31 x 1
##    song                   
##    <chr>                  
##  1 Dream On               
##  2 "\tDream On"           
##  3 Mama Kin               
##  4 One Way Street (live)  
##  5 "\tChip Away the Stone"
##  6 "\tSick as a Dog"      
##  7 Big Ten Inch Record    
##  8 "\tSweet Emotion"      
##  9 Sweet Emotion (remix)  
## 10 Draw the Line          
## # ... with 21 more rows

Compare Sets

identical()
- determine whether x and y contain the same songs in the same order.
  - identical(x,y)
setequal()
- determine whether x and y contain the same songs in any order.
  - setequal(x,y)
setdiff()
- see which songs (if any) are in x but not y.
  - setdiff(x,y)
  - x %>% setdiff(y)
setdiff()
- see which songs (if any) are in y but not x.
  - setdiff(y,x)
  - y %>% setdiff(x)

When your datasets contain the same variables, a setdiff() does the same thing as an anti_join() that uses every column as a key.

filtering join
- returns songs in x that are not in y
  - x %>% anti_join(y)
- returns songs in y that are not in x
  - y %>% anti_join(x)

dplyr - set operations

Kunal Haira

2018-05-12