library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tibble)

Key?

ARGH: primary key is in the row names for stage_songs

stage_songs
##                                     musical year
## Children Will Listen         Into the Woods 1986
## Maria                       West Side Story 1957
## Memory                                 Cats 1981
## The Music of the Night Phantom of the Opera 1986

stage_writers
##                     song            composer
## 1   Children Will Listen    Stephen Sondheim
## 2                  Maria   Leonard Bernstein
## 3                 Memory Andrew Lloyd Webber
## 4 The Music of the Night Andrew Lloyd Webber

> stage_songs %>% 

    # Add row names as a column named song
    rownames_to_column("song") %>% 
  
      # Left join stage_writers to stage_songs
      left_join(stage_writers, by = "song")

##                     song              musical year            composer
## 1   Children Will Listen       Into the Woods 1986    Stephen Sondheim
## 2                  Maria      West Side Story 1957   Leonard Bernstein
## 3                 Memory                 Cats 1981 Andrew Lloyd Webber
## 4 The Music of the Night Phantom of the Opera 1986 Andrew Lloyd Webber

Non-unique keys

shows
## # A tibble: 6 x 2
##                musical  year
##                  <chr> <int>
## 1   The Sound of Music  1959
## 2       Into the Woods  1986
## 3       The King and I  1951
## 4      West Side Story  1957
## 5                 Cats  1981
## 6 Phantom of the Opera  1986

values of musical do not uniquely identify the rows of composers
e.g. two rows have the value “The Sound of Music” for musical and two other rows have the value “The King and I”.
i.e. there are more than one composer for some musicals, so the musical column has duplicate key values.

composers
## # A tibble: 8 x 2
##                musical             composer
##                  <chr>                <chr>
## 1                 Cats  Andrew Lloyd Webber
## 2       Into the Woods     Stephen Sondheim
## 3 Phantom of the Opera  Andrew Lloyd Webber
## 4       The King and I       Richard Rogers
## 5       The King and I Oscar Hammerstein II
## 6   The Sound of Music       Richard Rogers
## 7   The Sound of Music Oscar Hammerstein II
## 8      West Side Story    Leonard Bernstein

Q: How many entries (rows) will dplyr create for The Sound of Music if you left join composers to shows by musical?
A: TWO
- If a row in the primary dataset contains multiple matches in the secondary dataset, left_join() will duplicate the row once for every match

Two non-unique keys

When the primary dataset contains duplicate key values.

show_songs
# A tibble: 8 × 3
                         song              musical  year
                        <chr>                <chr> <int>
1 A Few of My Favorite Things   The Sound of Music  1959
2        Children Will Listen       Into the Woods  1986
3                   Edelweiss   The Sound of Music  1959
4         Getting to Know You       The King and I  1951
5                       Maria      West Side Story  1957
6                      Memory                 Cats  1981
7             Shall We Dance?       The King and I  1951
8      The Music of the Night Phantom of the Opera  1986

Join by the musical column, but like composers, show_songs has two rows where musical == “The Sound of Music”.

Q: How many entries (rows) will exist for The Sound of Music if you left join composers to show_songs by musical?
A: FOUR
- This time left_join() duplicates each row in show_songs that contains the Sound of Music twice, once for each row of composers that contains the Sound of Music. The result is four rows that contain the Sound of Music.

> show_songs %>% 
  left_join(composers, by = "musical") %>%
  arrange(musical)
# A tibble: 12 × 4
                          song              musical  year             composer
                         <chr>                <chr> <int>                <chr>
1                       Memory                 Cats  1981  Andrew Lloyd Webber
2         Children Will Listen       Into the Woods  1986     Stephen Sondheim
3       The Music of the Night Phantom of the Opera  1986  Andrew Lloyd Webber
4          Getting to Know You       The King and I  1951       Richard Rogers
5          Getting to Know You       The King and I  1951 Oscar Hammerstein II
6              Shall We Dance?       The King and I  1951       Richard Rogers
7              Shall We Dance?       The King and I  1951 Oscar Hammerstein II
8  A Few of My Favorite Things   The Sound of Music  1959       Richard Rogers
9  A Few of My Favorite Things   The Sound of Music  1959 Oscar Hammerstein II
10                   Edelweiss   The Sound of Music  1959       Richard Rogers
11                   Edelweiss   The Sound of Music  1959 Oscar Hammerstein II
12                       Maria      West Side Story  1957    Leonard Bernstein

Missing keys

Missing keys can mistakenly join two rows that do not belong toegether

singers
# A tibble: 2 × 2
               movie                singer
               <chr>                 <chr>
1               <NA> Arnold Schwarzenegger
2 The Sound of Music         Julie Andrews

two_songs
# A tibble: 2 × 2
                 song              movie
                <chr>              <chr>
1            Do-Re-Mi The Sound of Music
2 A Spoonful of Sugar               <NA>

Who sings A Spoonful of Sugar? Why does this happen?

two_songs %>% inner_join(singers, by = "movie")
# A tibble: 2 × 3
                 song              movie                singer
                <chr>              <chr>                 <chr>
1            Do-Re-Mi The Sound of Music         Julie Andrews
2 A Spoonful of Sugar               <NA> Arnold Schwarzenegger

removing rows that contain NAs in the keys (column of two_songs) before joining

via filter()

> # Remove NA's from key before joining
> two_songs %>% 
    filter(!is.na(movie)) %>% 
    inner_join(singers, by = "movie")
# A tibble: 1 × 3
      song              movie        singer
     <chr>              <chr>         <chr>
1 Do-Re-Mi The Sound of Music Julie Andrews
>

Subset of keys

name to refer to the name of an actor.

movie_years
# A tibble: 10 × 3
                     movie          name  year
                     <chr>         <chr> <int>
1      The Road to Morocco   Bing Crosby  1942
2             Going My Way   Bing Crosby  1944
3           Anchors Aweigh Frank Sinatra  1945
4  Till the Clouds Roll By Frank Sinatra  1946
5          White Christmas   Bing Crosby  1954
6          The Tender Trap Frank Sinatra  1955
7             High Society   Bing Crosby  1956
8        The Joker is Wild Frank Sinatra  1957
9                 Pal Joey Frank Sinatra  1957
10                 Can-Can Frank Sinatra  1960

name to refer to the name of a movie studio

movie_studios
# A tibble: 10 × 2
                     movie                  name
                     <chr>                 <chr>
1      The Road to Morocco    Paramount Pictures
2             Going My Way    Paramount Pictures
3           Anchors Aweigh   Metro-Goldwyn-Mayer
4  Till the Clouds Roll By   Metro-Goldwyn-Mayer
5          White Christmas    Paramount Pictures
6          The Tender Trap   Metro-Goldwyn-Mayer
7             High Society   Metro-Goldwyn-Mayer
8        The Joker is Wild    Paramount Pictures
9                 Pal Joey     Columbia Pictures
10                 Can-Can Twentieth-Century Fox

Note, what happens to the duplicated column names?

> movie_years %>% 
    # Left join movie_studios to movie_years
    left_join(movie_studios, by = "movie")
# A tibble: 10 × 4
                     movie        name.x  year                name.y
                     <chr>         <chr> <int>                 <chr>
1      The Road to Morocco   Bing Crosby  1942    Paramount Pictures
2             Going My Way   Bing Crosby  1944    Paramount Pictures
3           Anchors Aweigh Frank Sinatra  1945   Metro-Goldwyn-Mayer
4  Till the Clouds Roll By Frank Sinatra  1946   Metro-Goldwyn-Mayer
5          White Christmas   Bing Crosby  1954    Paramount Pictures
6          The Tender Trap Frank Sinatra  1955   Metro-Goldwyn-Mayer
7             High Society   Bing Crosby  1956   Metro-Goldwyn-Mayer
8        The Joker is Wild Frank Sinatra  1957    Paramount Pictures
9                 Pal Joey Frank Sinatra  1957     Columbia Pictures
10                 Can-Can Frank Sinatra  1960 Twentieth-Century Fox

dplyr will ignore duplicate column names if you set the by argument and do not include the duplicated name in the argument. When you do this, dplyr will treat the columns in the normal fashion, but it will add .x and .y to the duplicated names to help you tell the columns apart.

movie_years %>% 
   # Left join movie_studios to movie_years
   left_join(movie_studios, by = "movie") %>% 
     # Rename the columns: artist and studio
     # rename(data, new_name = old_name)
         # can do separately:
         #   rename(artist = name.x) %>% 
         #   rename(studio = name.y)
         # or in one:
         #   rename(artist = name.x, studio = name.y)
  
# A tibble: 10 × 4
                     movie        artist  year                studio
                     <chr>         <chr> <int>                 <chr>
1      The Road to Morocco   Bing Crosby  1942    Paramount Pictures
2             Going My Way   Bing Crosby  1944    Paramount Pictures
3           Anchors Aweigh Frank Sinatra  1945   Metro-Goldwyn-Mayer
4  Till the Clouds Roll By Frank Sinatra  1946   Metro-Goldwyn-Mayer
5          White Christmas   Bing Crosby  1954    Paramount Pictures
6          The Tender Trap Frank Sinatra  1955   Metro-Goldwyn-Mayer
7             High Society   Bing Crosby  1956   Metro-Goldwyn-Mayer
8        The Joker is Wild Frank Sinatra  1957    Paramount Pictures
9                 Pal Joey Frank Sinatra  1957     Columbia Pictures
10                 Can-Can Frank Sinatra  1960 Twentieth-Century Fox

Mis-matched key names

both elvis_songs & elvis_movies describe movies starring Elvis Presley, but each uses a different column name to describe the name of the movie

elvis_songs
# A tibble: 5 × 2
                                  name          movie
                                 <chr>          <chr>
1 (You're So Square) Baby I Don't Care Jailhouse Rock
2         I Can't Help Falling in Love    Blue Hawaii
3                       Jailhouse Rock Jailhouse Rock
4                       Viva Las Vegas Viva Las Vegas
5                    You Don't Know Me       Clambake

elvis_movies
# A tibble: 4 × 2
            name  year
           <chr> <int>
1 Jailhouse Rock  1957
2    Blue Hawaii  1961
3 Viva Las Vegas  1963
4       Clambake  1967

To make the join, set by to a named vector

names of the vector will refer to column names in the primary dataset (x)
values of the vector will correspond to the column names in the secondary dataset (y)

x %>% left_join(y, by = c("x.name1" = "y.name2"))

Error: ‘elvis_songs.movie’ column not found in lhs, cannot join

elvis_movies %>% 
    
    # Left join elvis_songs to elvis_movies by key column
    left_join(elvis_songs, by = c("elvis_movies.name" = "elvis_songs.movie"))

left_join()

elvis_movies %>% 
    
    # Left join elvis_songs to elvis_movies by this column
    left_join(elvis_songs, by = c("name" = "movie"))

# A tibble: 5 × 3
            name  year                               name.y
           <chr> <int>                                <chr>
1 Jailhouse Rock  1957 (You're So Square) Baby I Don't Care
2 Jailhouse Rock  1957                       Jailhouse Rock
3    Blue Hawaii  1961         I Can't Help Falling in Love
4 Viva Las Vegas  1963                       Viva Las Vegas
5       Clambake  1967                    You Don't Know Me

rename()

elvis_movies %>% 
    
    # Left join elvis_songs to elvis_movies by this column
    left_join(elvis_songs, by = c("name" = "movie")) %>% 
    
    # Rename result columns to: movie, year, and song
    rename(movie = name, song = name.y)
    
# A tibble: 5 × 3
           movie  year                                 song
           <chr> <int>                                <chr>
1 Jailhouse Rock  1957 (You're So Square) Baby I Don't Care
2 Jailhouse Rock  1957                       Jailhouse Rock
3    Blue Hawaii  1961         I Can't Help Falling in Love
4 Viva Las Vegas  1963                       Viva Las Vegas
5       Clambake  1967                    You Don't Know Me

Each data frame uses different column names to refer to the key variables that connect the datasets

movie_directors
# A tibble: 10 × 3
                      name        director                studio
                     <chr>           <chr>                 <chr>
1           Anchors Aweigh   George Sidney   Metro-Goldwyn-Mayer
2                  Can-Can     Walter Lang Twentieth-Century Fox
3             Going My Way     Leo McCarey    Paramount Pictures
4             High Society Charles Walters   Metro-Goldwyn-Mayer
5                 Pal Joey   George Sidney     Columbia Pictures
6        The Joker is Wild   Charles Vidor    Paramount Pictures
7      The Road to Morocco    David Butler    Paramount Pictures
8          The Tender Trap Charles Walters   Metro-Goldwyn-Mayer
9  Till the Clouds Roll By   Richard Whorf   Metro-Goldwyn-Mayer
10         White Christmas  Michael Curtiz    Paramount Pictures

movie_years
# A tibble: 10 × 3
                     movie          name  year
                     <chr>         <chr> <int>
1      The Road to Morocco   Bing Crosby  1942
2             Going My Way   Bing Crosby  1944
3           Anchors Aweigh Frank Sinatra  1945
4  Till the Clouds Roll By Frank Sinatra  1946
5          White Christmas   Bing Crosby  1954
6          The Tender Trap Frank Sinatra  1955
7             High Society   Bing Crosby  1956
8        The Joker is Wild Frank Sinatra  1957
9                 Pal Joey Frank Sinatra  1957
10                 Can-Can Frank Sinatra  1960

movie_years %>% 
  # Left join movie_directors to movie_years
  ___ %>% 
  # Arrange the columns using select()
  ___

left_join() – INCORRECT

> movie_years %>% 
      # Left join movie_directors to movie_years
      left_join(movie_directors)
Joining, by = "name"
# A tibble: 10 × 5
                     movie          name  year director studio
                     <chr>         <chr> <int>    <chr>  <chr>
1      The Road to Morocco   Bing Crosby  1942     <NA>   <NA>
2             Going My Way   Bing Crosby  1944     <NA>   <NA>
3           Anchors Aweigh Frank Sinatra  1945     <NA>   <NA>
4  Till the Clouds Roll By Frank Sinatra  1946     <NA>   <NA>
5          White Christmas   Bing Crosby  1954     <NA>   <NA>
6          The Tender Trap Frank Sinatra  1955     <NA>   <NA>
7             High Society   Bing Crosby  1956     <NA>   <NA>
8        The Joker is Wild Frank Sinatra  1957     <NA>   <NA>
9                 Pal Joey Frank Sinatra  1957     <NA>   <NA>
10                 Can-Can Frank Sinatra  1960     <NA>   <NA>

left_join() – CORRECT

> movie_years %>% 
      # Left join movie_directors to movie_years
      # left_join(movie_directors) %>% # INCORRECT
      left_join(movie_directors, by = c("movie" = "name"))
# A tibble: 10 × 5
                     movie          name  year        director
                     <chr>         <chr> <int>           <chr>
1      The Road to Morocco   Bing Crosby  1942    David Butler
2             Going My Way   Bing Crosby  1944     Leo McCarey
3           Anchors Aweigh Frank Sinatra  1945   George Sidney
4  Till the Clouds Roll By Frank Sinatra  1946   Richard Whorf
5          White Christmas   Bing Crosby  1954  Michael Curtiz
6          The Tender Trap Frank Sinatra  1955 Charles Walters
7             High Society   Bing Crosby  1956 Charles Walters
8        The Joker is Wild Frank Sinatra  1957   Charles Vidor
9                 Pal Joey Frank Sinatra  1957   George Sidney
10                 Can-Can Frank Sinatra  1960     Walter Lang
# ... with 1 more variables: studio <chr>
>

select() rename name to artist via the “=” operator

> movie_years %>% 
      # Left join movie_directors to movie_years
      # left_join(movie_directors) %>% # INCORRECT
      left_join(movie_directors, by = c("movie" = "name")) %>% # CORRECT
          # Arrange the columns using select()
          # select(year, movie, artist, director, studio) # INCORRECT
          select(year, movie, artist = name, director, studio) # CORRECT
# A tibble: 10 × 5
    year                   movie        artist        director
   <int>                   <chr>         <chr>           <chr>
1   1942     The Road to Morocco   Bing Crosby    David Butler
2   1944            Going My Way   Bing Crosby     Leo McCarey
3   1945          Anchors Aweigh Frank Sinatra   George Sidney
4   1946 Till the Clouds Roll By Frank Sinatra   Richard Whorf
5   1954         White Christmas   Bing Crosby  Michael Curtiz
6   1955         The Tender Trap Frank Sinatra Charles Walters
7   1956            High Society   Bing Crosby Charles Walters
8   1957       The Joker is Wild Frank Sinatra   Charles Vidor
9   1957                Pal Joey Frank Sinatra   George Sidney
10  1960                 Can-Can Frank Sinatra     Walter Lang
# ... with 1 more variables: studio <chr>
>

purrr

rounds out the functional programming tools in R
- i.e. The tools that help you write programs with functions.
To help you apply R functions to data in efficient ways.

library(purrr)

reduce()

http://www.rdocumentation.org/packages/purrr/versions/0.2.2/topics/reduce

joining together multiple datasets

tables <- list(surnames, names, plays)
reduce(tables, left_join, by = "name")

is equivalent to

surnames %>%
left_join(names, by = "name") %>%
left_join(plays, by = "name")

reduce(list(x1, x2, x3), f)

is equivalent to

f(f(x1, x2), x3)

Join multiple tables

supergroups
- lists the members of several “supergroup” bands, which are bands made from the members of other famous bands

supergroups
# A tibble: 16 × 3
                        supergroup   first          last
                             <chr>   <chr>         <chr>
1                            Cream  Ginger         Baker
2                            Cream    Jack         Bruce
3                            Cream    Eric       Clapton
4  Crosby, Stills, Nash, and Young   David        Crosby
5  Crosby, Stills, Nash, and Young  Graham          Nash
6  Crosby, Stills, Nash, and Young Stephen        Stills
7  Crosby, Stills, Nash, and Young    Neil         Young
8                   The Highwaymen  Johnny          Cash
9                   The Highwaymen  Waylon      Jennings
10                  The Highwaymen    Kris Kristofferson
11                  The Highwaymen  Willie        Nelson
12          The Traveling Wilburys     Bob         Dylan
13          The Traveling Wilburys  George      Harrison
14          The Traveling Wilburys    Jeff         Lynne
15          The Traveling Wilburys     Roy       Orbison
16          The Traveling Wilburys     Tom         Petty

more_bands
- lists the original band membership of many musicians, including those in supergroups

more_bands
# A tibble: 42 × 3
                       band   first   last
                      <chr>   <chr>  <chr>
1       Buffalo Springfield Stephen Stills
2       Buffalo Springfield   Dewey Martin
3       Buffalo Springfield   Bruce Palmer
4       Buffalo Springfield  Richie  Furay
5       Buffalo Springfield    Neil  Young
6  Electric Light Orchestra     Roy   Wood
7  Electric Light Orchestra    Jeff  Lynne
8  Electric Light Orchestra     Bev  Bevan
9  Graham Bond Organisation  Graham   Bond
10 Graham Bond Organisation    Jack  Bruce
# ... with 32 more rows

more_artists
- lists the instruments that various artists played

more_artists
# A tibble: 48 × 3
     first            last instrument
     <chr>           <chr>      <chr>
1    Allan          Clarke     vocals
2  Benmont           Tench  keyboards
3      Bev           Bevan      drums
4      Bob           Dylan     guitar
5    Bruce          Palmer       bass
6    Chris           Dreja     guitar
7    Chris         Hillman       bass
8    David          Crosby     guitar
9    Dewey          Martin      drums
10    Dick Heckstall-Smith  saxophone
# ... with 38 more rows
>

reduce() takes two main arguments
- a list .x
- name of the function .f you wish to apply
- additional argument: by, which is passed on to .f

list(supergroups, more_bands, more_artists) %>%

    # Use reduce to join together the contents of the list
    reduce(left_join, by = c("first", "last"))
    
# A tibble: 19 × 5
                        supergroup   first          last
                             <chr>   <chr>         <chr>
1                            Cream  Ginger         Baker
2                            Cream    Jack         Bruce
3                            Cream    Jack         Bruce
4                            Cream    Jack         Bruce
5                            Cream    Jack         Bruce
6                            Cream    Eric       Clapton
7  Crosby, Stills, Nash, and Young   David        Crosby
8  Crosby, Stills, Nash, and Young  Graham          Nash
9  Crosby, Stills, Nash, and Young Stephen        Stills
10 Crosby, Stills, Nash, and Young    Neil         Young
11                  The Highwaymen  Johnny          Cash
12                  The Highwaymen  Waylon      Jennings
13                  The Highwaymen    Kris Kristofferson
14                  The Highwaymen  Willie        Nelson
15          The Traveling Wilburys     Bob         Dylan
16          The Traveling Wilburys  George      Harrison
17          The Traveling Wilburys    Jeff         Lynne
18          The Traveling Wilburys     Roy       Orbison
19          The Traveling Wilburys     Tom         Petty
# ... with 2 more variables: band <chr>, instrument <chr>

Filter multiple tables

i.e. filter observations with a filtering join

reduce()
- apply a function in an iterative fashion to many datasets
- major component of ‘MapReduce’ in the Hadoop world.

list(more_artists, more_bands, supergroups) %>% 
    
    # Return rows of more_artists in all three datasets
    reduce(semi_join, by = c("first", "last"))

# A tibble: 11 × 3
     first     last instrument
     <chr>    <chr>      <chr>
1   Ginger    Baker      drums
2     Jack    Bruce       bass
3     Jack    Bruce       bass
4     Eric  Clapton     guitar
5    David   Crosby     guitar
6   Graham     Nash     guitar
7  Stephen   Stills     guitar
8     Neil    Young     guitar
9   George Harrison     guitar
10    Jeff    Lynne     vocals
11     Tom    Petty     vocals

merge()

Each dplyr join has an SQL equivalent, and when you apply a dplyr join to a SQL data source (instead of a data frame), dplyr automatically converts your join to its SQL equivalent and runs it in the database.

merge(names, plays, by = “name”, …)
- left_join(names, plays, by = “name”)
  - merge(names, plays, by = “name”, all.x = FALSE, all.y = TRUE)
    - SQL:
      - SELECT * FROM x LEFT JOIN y ON x.a = y.a
- right_join(names, plays, by = “name”)
  - merge(names, plays, by = “name”, all.x = TRUE, all.y = FALSE)
    - SQL:
      - SELECT * FROM x RIGHT JOIN y ON x.a = y.a
- inner_join(names, plays, by = “name”)
  - merge(names, plays, by = “name”, all = FALSE)
    - SQL:
      - SELECT * FROM x FULL JOIN y ON x.a = y.a
- full_join(names, plays, by = “name”)
  - merge(names, plays, by = “name”, all = TRUE)
    - SQL:
      - SELECT * FROM x JOIN y ON x.a = y.a
- semi_join() * SQL: * SELECT * FROM x WHERE EXISTS (SELECT 1 FROM y WHERE x.a = y.a)
- anti_join() * SQL: * SELECT * FROM x WHERE NOT EXISTS (SELECT 1 FROM y WHERE x.a = y.a)

left_join()

> bands
# A tibble: 13 × 3
       first      last               band
       <chr>     <chr>              <chr>
1       John    Bonham       Led Zeppelin
2  John Paul     Jones       Led Zeppelin
3      Jimmy      Page       Led Zeppelin
4     Robert     Plant       Led Zeppelin
5     George  Harrison        The Beatles
6       John    Lennon        The Beatles
7       Paul McCartney        The Beatles
8      Ringo     Starr        The Beatles
9      Jimmy   Buffett  The Coral Reefers
10      Mick    Jagger The Rolling Stones
11     Keith  Richards The Rolling Stones
12   Charlie     Watts The Rolling Stones
13    Ronnie      Wood The Rolling Stones
> 
> artists
# A tibble: 16 × 3
    first      last instrument
    <chr>     <chr>      <chr>
1   Jimmy   Buffett     Guitar
2  George  Harrison     Guitar
3    Mick    Jagger     Vocals
4     Tom     Jones     Vocals
5    Davy     Jones     Vocals
6    John    Lennon     Guitar
7    Paul McCartney       Bass
8   Jimmy      Page     Guitar
9     Joe     Perry     Guitar
10  Elvis   Presley     Vocals
11  Keith  Richards     Guitar
12   Paul     Simon     Guitar
13  Ringo     Starr      Drums
14    Joe     Walsh     Guitar
15  Brian    Wilson     Vocals
16  Nancy    Wilson     Vocals
> 
> merge(bands, artists, by = c("first", "last"), all.x = TRUE) %>%
    arrange(band)
       first      last               band instrument
1      Jimmy      Page       Led Zeppelin     Guitar
2       John    Bonham       Led Zeppelin       <NA>
3  John Paul     Jones       Led Zeppelin       <NA>
4     Robert     Plant       Led Zeppelin       <NA>
5     George  Harrison        The Beatles     Guitar
6       John    Lennon        The Beatles     Guitar
7       Paul McCartney        The Beatles       Bass
8      Ringo     Starr        The Beatles      Drums
9      Jimmy   Buffett  The Coral Reefers     Guitar
10   Charlie     Watts The Rolling Stones       <NA>
11     Keith  Richards The Rolling Stones     Guitar
12      Mick    Jagger The Rolling Stones     Vocals
13    Ronnie      Wood The Rolling Stones       <NA>
> 
> # equivalent
> 
> left_join(bands, artists, by = c("first", "last"))
# A tibble: 13 × 4
       first      last               band instrument
       <chr>     <chr>              <chr>      <chr>
1       John    Bonham       Led Zeppelin       <NA>
2  John Paul     Jones       Led Zeppelin       <NA>
3      Jimmy      Page       Led Zeppelin     Guitar
4     Robert     Plant       Led Zeppelin       <NA>
5     George  Harrison        The Beatles     Guitar
6       John    Lennon        The Beatles     Guitar
7       Paul McCartney        The Beatles       Bass
8      Ringo     Starr        The Beatles      Drums
9      Jimmy   Buffett  The Coral Reefers     Guitar
10      Mick    Jagger The Rolling Stones     Vocals
11     Keith  Richards The Rolling Stones     Guitar
12   Charlie     Watts The Rolling Stones       <NA>
13    Ronnie      Wood The Rolling Stones       <NA>
>

dplyr SQL connections

DBMS:
- SQLite => src_sqlite()
- MySQL, MariaDB => src_mysql()
- PostgreSQL => src_postgres()
need:
- install.packages(“DBI”)

The connection error’d - due to “host” perhaps, tried/failed to find a link to help:

https://aws.amazon.com/blogs/big-data/running-r-on-aws/

https://aws.amazon.com/blogs/big-data/connecting-r-with-amazon-redshift/

https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-refusal-failure-issues.html

https://rpubs.com/nwstephens/airontime3

https://rstudio-pubs-static.s3.amazonaws.com/127218_469950d092d14e6aa897509c207568a4.html

https://rstudio-pubs-static.s3.amazonaws.com/108464_99457feae415478485bf7027330441fb.html

https://github.com/rstudio/Strata2016/blob/master/solutions/02-Big-Data.Rmd

https://github.com/deyb/airline-ontime-analytics/blob/master/report.pdf

does not evaluate:

# Connect to a database
air <- src_postgres(dbname = "airontime", 
                    host = "sol-eng-sparklyr.cyii7eabibhu.us-east-1.redshift.amazonaws.com",
                    port = "5439", 
                    user = "redshift_user", 
                    password = "ABCd4321")

host = 'sol-eng-postgre.cihykudhzbgw.us-west-2.rds.amazonaws.com:5432/airontime', 
host = 'redshiftdemo.ckffhmu2rolb.eu-west-1.redshift.amazonaws.com',

# View tables in database
src_tbls(air)

# Create table references
flights <- tbl(air, "flights")
planes <- tbl(air, "planes")

# Manipulate tables
flights <- left_join(flights, planes, by = "tailnum")

# Collect results
flights <- collect(flights)

dplyr - advanced-joining

Kunal Haira

2018-05-17

Key?

Non-unique keys

Two non-unique keys

Missing keys

Subset of keys

Mis-matched key names

purrr

reduce()

Join multiple tables

Filter multiple tables

merge()

left_join()

dplyr SQL connections