library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tibble)
ARGH: primary key is in the row names for stage_songs
stage_songs
## musical year
## Children Will Listen Into the Woods 1986
## Maria West Side Story 1957
## Memory Cats 1981
## The Music of the Night Phantom of the Opera 1986
stage_writers
## song composer
## 1 Children Will Listen Stephen Sondheim
## 2 Maria Leonard Bernstein
## 3 Memory Andrew Lloyd Webber
## 4 The Music of the Night Andrew Lloyd Webber
> stage_songs %>%
# Add row names as a column named song
rownames_to_column("song") %>%
# Left join stage_writers to stage_songs
left_join(stage_writers, by = "song")
## song musical year composer
## 1 Children Will Listen Into the Woods 1986 Stephen Sondheim
## 2 Maria West Side Story 1957 Leonard Bernstein
## 3 Memory Cats 1981 Andrew Lloyd Webber
## 4 The Music of the Night Phantom of the Opera 1986 Andrew Lloyd Webber
shows
## # A tibble: 6 x 2
## musical year
## <chr> <int>
## 1 The Sound of Music 1959
## 2 Into the Woods 1986
## 3 The King and I 1951
## 4 West Side Story 1957
## 5 Cats 1981
## 6 Phantom of the Opera 1986
values of musical do not uniquely identify the rows of composers
e.g. two rows have the value “The Sound of Music” for musical and two other rows have the value “The King and I”.
i.e. there are more than one composer for some musicals, so the musical column has duplicate key values.
composers
## # A tibble: 8 x 2
## musical composer
## <chr> <chr>
## 1 Cats Andrew Lloyd Webber
## 2 Into the Woods Stephen Sondheim
## 3 Phantom of the Opera Andrew Lloyd Webber
## 4 The King and I Richard Rogers
## 5 The King and I Oscar Hammerstein II
## 6 The Sound of Music Richard Rogers
## 7 The Sound of Music Oscar Hammerstein II
## 8 West Side Story Leonard Bernstein
Q: How many entries (rows) will dplyr create for The Sound of Music if you left join composers to shows by musical?
A: TWO
When the primary dataset contains duplicate key values.
show_songs
# A tibble: 8 × 3
song musical year
<chr> <chr> <int>
1 A Few of My Favorite Things The Sound of Music 1959
2 Children Will Listen Into the Woods 1986
3 Edelweiss The Sound of Music 1959
4 Getting to Know You The King and I 1951
5 Maria West Side Story 1957
6 Memory Cats 1981
7 Shall We Dance? The King and I 1951
8 The Music of the Night Phantom of the Opera 1986
Join by the musical column, but like composers, show_songs has two rows where musical == “The Sound of Music”.
Q: How many entries (rows) will exist for The Sound of Music if you left join composers to show_songs by musical?
A: FOUR
> show_songs %>%
left_join(composers, by = "musical") %>%
arrange(musical)
# A tibble: 12 × 4
song musical year composer
<chr> <chr> <int> <chr>
1 Memory Cats 1981 Andrew Lloyd Webber
2 Children Will Listen Into the Woods 1986 Stephen Sondheim
3 The Music of the Night Phantom of the Opera 1986 Andrew Lloyd Webber
4 Getting to Know You The King and I 1951 Richard Rogers
5 Getting to Know You The King and I 1951 Oscar Hammerstein II
6 Shall We Dance? The King and I 1951 Richard Rogers
7 Shall We Dance? The King and I 1951 Oscar Hammerstein II
8 A Few of My Favorite Things The Sound of Music 1959 Richard Rogers
9 A Few of My Favorite Things The Sound of Music 1959 Oscar Hammerstein II
10 Edelweiss The Sound of Music 1959 Richard Rogers
11 Edelweiss The Sound of Music 1959 Oscar Hammerstein II
12 Maria West Side Story 1957 Leonard Bernstein
Missing keys can mistakenly join two rows that do not belong toegether
singers
# A tibble: 2 × 2
movie singer
<chr> <chr>
1 <NA> Arnold Schwarzenegger
2 The Sound of Music Julie Andrews
two_songs
# A tibble: 2 × 2
song movie
<chr> <chr>
1 Do-Re-Mi The Sound of Music
2 A Spoonful of Sugar <NA>
Who sings A Spoonful of Sugar? Why does this happen?
two_songs %>% inner_join(singers, by = "movie")
# A tibble: 2 × 3
song movie singer
<chr> <chr> <chr>
1 Do-Re-Mi The Sound of Music Julie Andrews
2 A Spoonful of Sugar <NA> Arnold Schwarzenegger
removing rows that contain NAs in the keys (column of two_songs) before joining
> # Remove NA's from key before joining
> two_songs %>%
filter(!is.na(movie)) %>%
inner_join(singers, by = "movie")
# A tibble: 1 × 3
song movie singer
<chr> <chr> <chr>
1 Do-Re-Mi The Sound of Music Julie Andrews
>
name to refer to the name of an actor.
movie_years
# A tibble: 10 × 3
movie name year
<chr> <chr> <int>
1 The Road to Morocco Bing Crosby 1942
2 Going My Way Bing Crosby 1944
3 Anchors Aweigh Frank Sinatra 1945
4 Till the Clouds Roll By Frank Sinatra 1946
5 White Christmas Bing Crosby 1954
6 The Tender Trap Frank Sinatra 1955
7 High Society Bing Crosby 1956
8 The Joker is Wild Frank Sinatra 1957
9 Pal Joey Frank Sinatra 1957
10 Can-Can Frank Sinatra 1960
name to refer to the name of a movie studio
movie_studios
# A tibble: 10 × 2
movie name
<chr> <chr>
1 The Road to Morocco Paramount Pictures
2 Going My Way Paramount Pictures
3 Anchors Aweigh Metro-Goldwyn-Mayer
4 Till the Clouds Roll By Metro-Goldwyn-Mayer
5 White Christmas Paramount Pictures
6 The Tender Trap Metro-Goldwyn-Mayer
7 High Society Metro-Goldwyn-Mayer
8 The Joker is Wild Paramount Pictures
9 Pal Joey Columbia Pictures
10 Can-Can Twentieth-Century Fox
Note, what happens to the duplicated column names?
> movie_years %>%
# Left join movie_studios to movie_years
left_join(movie_studios, by = "movie")
# A tibble: 10 × 4
movie name.x year name.y
<chr> <chr> <int> <chr>
1 The Road to Morocco Bing Crosby 1942 Paramount Pictures
2 Going My Way Bing Crosby 1944 Paramount Pictures
3 Anchors Aweigh Frank Sinatra 1945 Metro-Goldwyn-Mayer
4 Till the Clouds Roll By Frank Sinatra 1946 Metro-Goldwyn-Mayer
5 White Christmas Bing Crosby 1954 Paramount Pictures
6 The Tender Trap Frank Sinatra 1955 Metro-Goldwyn-Mayer
7 High Society Bing Crosby 1956 Metro-Goldwyn-Mayer
8 The Joker is Wild Frank Sinatra 1957 Paramount Pictures
9 Pal Joey Frank Sinatra 1957 Columbia Pictures
10 Can-Can Frank Sinatra 1960 Twentieth-Century Fox
dplyr will ignore duplicate column names if you set the by argument and do not include the duplicated name in the argument. When you do this, dplyr will treat the columns in the normal fashion, but it will add .x and .y to the duplicated names to help you tell the columns apart.
movie_years %>%
# Left join movie_studios to movie_years
left_join(movie_studios, by = "movie") %>%
# Rename the columns: artist and studio
# rename(data, new_name = old_name)
# can do separately:
# rename(artist = name.x) %>%
# rename(studio = name.y)
# or in one:
# rename(artist = name.x, studio = name.y)
# A tibble: 10 × 4
movie artist year studio
<chr> <chr> <int> <chr>
1 The Road to Morocco Bing Crosby 1942 Paramount Pictures
2 Going My Way Bing Crosby 1944 Paramount Pictures
3 Anchors Aweigh Frank Sinatra 1945 Metro-Goldwyn-Mayer
4 Till the Clouds Roll By Frank Sinatra 1946 Metro-Goldwyn-Mayer
5 White Christmas Bing Crosby 1954 Paramount Pictures
6 The Tender Trap Frank Sinatra 1955 Metro-Goldwyn-Mayer
7 High Society Bing Crosby 1956 Metro-Goldwyn-Mayer
8 The Joker is Wild Frank Sinatra 1957 Paramount Pictures
9 Pal Joey Frank Sinatra 1957 Columbia Pictures
10 Can-Can Frank Sinatra 1960 Twentieth-Century Fox
both elvis_songs & elvis_movies describe movies starring Elvis Presley, but each uses a different column name to describe the name of the movie
elvis_songs
# A tibble: 5 × 2
name movie
<chr> <chr>
1 (You're So Square) Baby I Don't Care Jailhouse Rock
2 I Can't Help Falling in Love Blue Hawaii
3 Jailhouse Rock Jailhouse Rock
4 Viva Las Vegas Viva Las Vegas
5 You Don't Know Me Clambake
elvis_movies
# A tibble: 4 × 2
name year
<chr> <int>
1 Jailhouse Rock 1957
2 Blue Hawaii 1961
3 Viva Las Vegas 1963
4 Clambake 1967
To make the join, set by to a named vector
x %>% left_join(y, by = c("x.name1" = "y.name2"))
Error: ‘elvis_songs.movie’ column not found in lhs, cannot join
elvis_movies %>%
# Left join elvis_songs to elvis_movies by key column
left_join(elvis_songs, by = c("elvis_movies.name" = "elvis_songs.movie"))
left_join()
elvis_movies %>%
# Left join elvis_songs to elvis_movies by this column
left_join(elvis_songs, by = c("name" = "movie"))
# A tibble: 5 × 3
name year name.y
<chr> <int> <chr>
1 Jailhouse Rock 1957 (You're So Square) Baby I Don't Care
2 Jailhouse Rock 1957 Jailhouse Rock
3 Blue Hawaii 1961 I Can't Help Falling in Love
4 Viva Las Vegas 1963 Viva Las Vegas
5 Clambake 1967 You Don't Know Me
rename()
elvis_movies %>%
# Left join elvis_songs to elvis_movies by this column
left_join(elvis_songs, by = c("name" = "movie")) %>%
# Rename result columns to: movie, year, and song
rename(movie = name, song = name.y)
# A tibble: 5 × 3
movie year song
<chr> <int> <chr>
1 Jailhouse Rock 1957 (You're So Square) Baby I Don't Care
2 Jailhouse Rock 1957 Jailhouse Rock
3 Blue Hawaii 1961 I Can't Help Falling in Love
4 Viva Las Vegas 1963 Viva Las Vegas
5 Clambake 1967 You Don't Know Me
Each data frame uses different column names to refer to the key variables that connect the datasets
movie_directors
# A tibble: 10 × 3
name director studio
<chr> <chr> <chr>
1 Anchors Aweigh George Sidney Metro-Goldwyn-Mayer
2 Can-Can Walter Lang Twentieth-Century Fox
3 Going My Way Leo McCarey Paramount Pictures
4 High Society Charles Walters Metro-Goldwyn-Mayer
5 Pal Joey George Sidney Columbia Pictures
6 The Joker is Wild Charles Vidor Paramount Pictures
7 The Road to Morocco David Butler Paramount Pictures
8 The Tender Trap Charles Walters Metro-Goldwyn-Mayer
9 Till the Clouds Roll By Richard Whorf Metro-Goldwyn-Mayer
10 White Christmas Michael Curtiz Paramount Pictures
movie_years
# A tibble: 10 × 3
movie name year
<chr> <chr> <int>
1 The Road to Morocco Bing Crosby 1942
2 Going My Way Bing Crosby 1944
3 Anchors Aweigh Frank Sinatra 1945
4 Till the Clouds Roll By Frank Sinatra 1946
5 White Christmas Bing Crosby 1954
6 The Tender Trap Frank Sinatra 1955
7 High Society Bing Crosby 1956
8 The Joker is Wild Frank Sinatra 1957
9 Pal Joey Frank Sinatra 1957
10 Can-Can Frank Sinatra 1960
movie_years %>%
# Left join movie_directors to movie_years
___ %>%
# Arrange the columns using select()
___
left_join() – INCORRECT
> movie_years %>%
# Left join movie_directors to movie_years
left_join(movie_directors)
Joining, by = "name"
# A tibble: 10 × 5
movie name year director studio
<chr> <chr> <int> <chr> <chr>
1 The Road to Morocco Bing Crosby 1942 <NA> <NA>
2 Going My Way Bing Crosby 1944 <NA> <NA>
3 Anchors Aweigh Frank Sinatra 1945 <NA> <NA>
4 Till the Clouds Roll By Frank Sinatra 1946 <NA> <NA>
5 White Christmas Bing Crosby 1954 <NA> <NA>
6 The Tender Trap Frank Sinatra 1955 <NA> <NA>
7 High Society Bing Crosby 1956 <NA> <NA>
8 The Joker is Wild Frank Sinatra 1957 <NA> <NA>
9 Pal Joey Frank Sinatra 1957 <NA> <NA>
10 Can-Can Frank Sinatra 1960 <NA> <NA>
left_join() – CORRECT
> movie_years %>%
# Left join movie_directors to movie_years
# left_join(movie_directors) %>% # INCORRECT
left_join(movie_directors, by = c("movie" = "name"))
# A tibble: 10 × 5
movie name year director
<chr> <chr> <int> <chr>
1 The Road to Morocco Bing Crosby 1942 David Butler
2 Going My Way Bing Crosby 1944 Leo McCarey
3 Anchors Aweigh Frank Sinatra 1945 George Sidney
4 Till the Clouds Roll By Frank Sinatra 1946 Richard Whorf
5 White Christmas Bing Crosby 1954 Michael Curtiz
6 The Tender Trap Frank Sinatra 1955 Charles Walters
7 High Society Bing Crosby 1956 Charles Walters
8 The Joker is Wild Frank Sinatra 1957 Charles Vidor
9 Pal Joey Frank Sinatra 1957 George Sidney
10 Can-Can Frank Sinatra 1960 Walter Lang
# ... with 1 more variables: studio <chr>
>
select() rename name to artist via the “=” operator
> movie_years %>%
# Left join movie_directors to movie_years
# left_join(movie_directors) %>% # INCORRECT
left_join(movie_directors, by = c("movie" = "name")) %>% # CORRECT
# Arrange the columns using select()
# select(year, movie, artist, director, studio) # INCORRECT
select(year, movie, artist = name, director, studio) # CORRECT
# A tibble: 10 × 5
year movie artist director
<int> <chr> <chr> <chr>
1 1942 The Road to Morocco Bing Crosby David Butler
2 1944 Going My Way Bing Crosby Leo McCarey
3 1945 Anchors Aweigh Frank Sinatra George Sidney
4 1946 Till the Clouds Roll By Frank Sinatra Richard Whorf
5 1954 White Christmas Bing Crosby Michael Curtiz
6 1955 The Tender Trap Frank Sinatra Charles Walters
7 1956 High Society Bing Crosby Charles Walters
8 1957 The Joker is Wild Frank Sinatra Charles Vidor
9 1957 Pal Joey Frank Sinatra George Sidney
10 1960 Can-Can Frank Sinatra Walter Lang
# ... with 1 more variables: studio <chr>
>
library(purrr)
joining together multiple datasets
tables <- list(surnames, names, plays)
reduce(tables, left_join, by = "name")
is equivalent to
surnames %>%
left_join(names, by = "name") %>%
left_join(plays, by = "name")
reduce(list(x1, x2, x3), f)
is equivalent to
f(f(x1, x2), x3)
supergroups
# A tibble: 16 × 3
supergroup first last
<chr> <chr> <chr>
1 Cream Ginger Baker
2 Cream Jack Bruce
3 Cream Eric Clapton
4 Crosby, Stills, Nash, and Young David Crosby
5 Crosby, Stills, Nash, and Young Graham Nash
6 Crosby, Stills, Nash, and Young Stephen Stills
7 Crosby, Stills, Nash, and Young Neil Young
8 The Highwaymen Johnny Cash
9 The Highwaymen Waylon Jennings
10 The Highwaymen Kris Kristofferson
11 The Highwaymen Willie Nelson
12 The Traveling Wilburys Bob Dylan
13 The Traveling Wilburys George Harrison
14 The Traveling Wilburys Jeff Lynne
15 The Traveling Wilburys Roy Orbison
16 The Traveling Wilburys Tom Petty
more_bands
# A tibble: 42 × 3
band first last
<chr> <chr> <chr>
1 Buffalo Springfield Stephen Stills
2 Buffalo Springfield Dewey Martin
3 Buffalo Springfield Bruce Palmer
4 Buffalo Springfield Richie Furay
5 Buffalo Springfield Neil Young
6 Electric Light Orchestra Roy Wood
7 Electric Light Orchestra Jeff Lynne
8 Electric Light Orchestra Bev Bevan
9 Graham Bond Organisation Graham Bond
10 Graham Bond Organisation Jack Bruce
# ... with 32 more rows
more_artists
# A tibble: 48 × 3
first last instrument
<chr> <chr> <chr>
1 Allan Clarke vocals
2 Benmont Tench keyboards
3 Bev Bevan drums
4 Bob Dylan guitar
5 Bruce Palmer bass
6 Chris Dreja guitar
7 Chris Hillman bass
8 David Crosby guitar
9 Dewey Martin drums
10 Dick Heckstall-Smith saxophone
# ... with 38 more rows
>
list(supergroups, more_bands, more_artists) %>%
# Use reduce to join together the contents of the list
reduce(left_join, by = c("first", "last"))
# A tibble: 19 × 5
supergroup first last
<chr> <chr> <chr>
1 Cream Ginger Baker
2 Cream Jack Bruce
3 Cream Jack Bruce
4 Cream Jack Bruce
5 Cream Jack Bruce
6 Cream Eric Clapton
7 Crosby, Stills, Nash, and Young David Crosby
8 Crosby, Stills, Nash, and Young Graham Nash
9 Crosby, Stills, Nash, and Young Stephen Stills
10 Crosby, Stills, Nash, and Young Neil Young
11 The Highwaymen Johnny Cash
12 The Highwaymen Waylon Jennings
13 The Highwaymen Kris Kristofferson
14 The Highwaymen Willie Nelson
15 The Traveling Wilburys Bob Dylan
16 The Traveling Wilburys George Harrison
17 The Traveling Wilburys Jeff Lynne
18 The Traveling Wilburys Roy Orbison
19 The Traveling Wilburys Tom Petty
# ... with 2 more variables: band <chr>, instrument <chr>
i.e. filter observations with a filtering join
list(more_artists, more_bands, supergroups) %>%
# Return rows of more_artists in all three datasets
reduce(semi_join, by = c("first", "last"))
# A tibble: 11 × 3
first last instrument
<chr> <chr> <chr>
1 Ginger Baker drums
2 Jack Bruce bass
3 Jack Bruce bass
4 Eric Clapton guitar
5 David Crosby guitar
6 Graham Nash guitar
7 Stephen Stills guitar
8 Neil Young guitar
9 George Harrison guitar
10 Jeff Lynne vocals
11 Tom Petty vocals
Each dplyr join has an SQL equivalent, and when you apply a dplyr join to a SQL data source (instead of a data frame), dplyr automatically converts your join to its SQL equivalent and runs it in the database.
> bands
# A tibble: 13 × 3
first last band
<chr> <chr> <chr>
1 John Bonham Led Zeppelin
2 John Paul Jones Led Zeppelin
3 Jimmy Page Led Zeppelin
4 Robert Plant Led Zeppelin
5 George Harrison The Beatles
6 John Lennon The Beatles
7 Paul McCartney The Beatles
8 Ringo Starr The Beatles
9 Jimmy Buffett The Coral Reefers
10 Mick Jagger The Rolling Stones
11 Keith Richards The Rolling Stones
12 Charlie Watts The Rolling Stones
13 Ronnie Wood The Rolling Stones
>
> artists
# A tibble: 16 × 3
first last instrument
<chr> <chr> <chr>
1 Jimmy Buffett Guitar
2 George Harrison Guitar
3 Mick Jagger Vocals
4 Tom Jones Vocals
5 Davy Jones Vocals
6 John Lennon Guitar
7 Paul McCartney Bass
8 Jimmy Page Guitar
9 Joe Perry Guitar
10 Elvis Presley Vocals
11 Keith Richards Guitar
12 Paul Simon Guitar
13 Ringo Starr Drums
14 Joe Walsh Guitar
15 Brian Wilson Vocals
16 Nancy Wilson Vocals
>
> merge(bands, artists, by = c("first", "last"), all.x = TRUE) %>%
arrange(band)
first last band instrument
1 Jimmy Page Led Zeppelin Guitar
2 John Bonham Led Zeppelin <NA>
3 John Paul Jones Led Zeppelin <NA>
4 Robert Plant Led Zeppelin <NA>
5 George Harrison The Beatles Guitar
6 John Lennon The Beatles Guitar
7 Paul McCartney The Beatles Bass
8 Ringo Starr The Beatles Drums
9 Jimmy Buffett The Coral Reefers Guitar
10 Charlie Watts The Rolling Stones <NA>
11 Keith Richards The Rolling Stones Guitar
12 Mick Jagger The Rolling Stones Vocals
13 Ronnie Wood The Rolling Stones <NA>
>
> # equivalent
>
> left_join(bands, artists, by = c("first", "last"))
# A tibble: 13 × 4
first last band instrument
<chr> <chr> <chr> <chr>
1 John Bonham Led Zeppelin <NA>
2 John Paul Jones Led Zeppelin <NA>
3 Jimmy Page Led Zeppelin Guitar
4 Robert Plant Led Zeppelin <NA>
5 George Harrison The Beatles Guitar
6 John Lennon The Beatles Guitar
7 Paul McCartney The Beatles Bass
8 Ringo Starr The Beatles Drums
9 Jimmy Buffett The Coral Reefers Guitar
10 Mick Jagger The Rolling Stones Vocals
11 Keith Richards The Rolling Stones Guitar
12 Charlie Watts The Rolling Stones <NA>
13 Ronnie Wood The Rolling Stones <NA>
>
The connection error’d - due to “host” perhaps, tried/failed to find a link to help:
https://aws.amazon.com/blogs/big-data/running-r-on-aws/
https://aws.amazon.com/blogs/big-data/connecting-r-with-amazon-redshift/
https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-refusal-failure-issues.html
https://rpubs.com/nwstephens/airontime3
https://rstudio-pubs-static.s3.amazonaws.com/127218_469950d092d14e6aa897509c207568a4.html
https://rstudio-pubs-static.s3.amazonaws.com/108464_99457feae415478485bf7027330441fb.html
https://github.com/rstudio/Strata2016/blob/master/solutions/02-Big-Data.Rmd
https://github.com/deyb/airline-ontime-analytics/blob/master/report.pdf
does not evaluate:
# Connect to a database
air <- src_postgres(dbname = "airontime",
host = "sol-eng-sparklyr.cyii7eabibhu.us-east-1.redshift.amazonaws.com",
port = "5439",
user = "redshift_user",
password = "ABCd4321")
host = 'sol-eng-postgre.cihykudhzbgw.us-west-2.rds.amazonaws.com:5432/airontime',
host = 'redshiftdemo.ckffhmu2rolb.eu-west-1.redshift.amazonaws.com',
# View tables in database
src_tbls(air)
# Create table references
flights <- tbl(air, "flights")
planes <- tbl(air, "planes")
# Manipulate tables
flights <- left_join(flights, planes, by = "tailnum")
# Collect results
flights <- collect(flights)