## 
## Attaching package: 'resampledata'
## The following object is masked from 'package:datasets':
## 
##     Titanic
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble  2.1.1     ✔ purrr   0.3.2
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## 
## Attaching package: 'skimr'
## The following object is masked from 'package:knitr':
## 
##     kable
## The following object is masked from 'package:stats':
## 
##     filter
## 
## Attaching package: 'Lahman'
## The following object is masked from 'package:resampledata':
## 
##     Salaries

Joining Data

Dplyr has join functions, left_join and right_join are the first ones we are looking at. They take three arguments here, the two dataframes we are joining then the ‘by’ which says how you want them to be joined. There are a lot of datasets for this datacamp exercise and I do not want to wade through them all and find out which are complete because some have been problematic.

left_join and right_join join data frames (or tibbles) together by adding the second one onto the first. So this code below would add artists data onto bands

bands2 <- left_join(bands, artists, by = c("first", "last")) This would do the opposite, but since it is now a right_join, the output would be the same. bands3 <- right_join(artists, bands, by = c("first", "last"))

Join functions also work well with other dpylr stuff, like group_by, filter, select, and so forth. Here we would be making an object, “temp”, by joining artists to bands by the first and last variables, which are the musicians names. temp <- left_join(bands, artists, by = c("first", "last")) Then we are filtering the temp data frame for only people who play the guitar. temp <- filter(temp, instrument == "Guitar") Finally, we select the first, last, and band variables to show the names of guitarists, and which bands they play in. select(temp, first, last, band)

We can do the same thing using pipe operators if we want to. bands %>% left_join(artists, by = c("first", "last")) %>% filter(instrument == "Guitar") %>% select(first, last, band)

There are also full_join, semi_join, inner_join, and anti_join functions. From here on out I am going to use this superhero data where I can. I like superheros more than bands and the data is easier to look at and tell what is going on.

inner_join

: Return all rows from x where there are matching values in y, keeping just columns from x. A semi join differs from an inner join because an inner join will return one row of x for each matching row of y, where a semi join will never duplicate rows of x. This is a filtering join. This scraps Hellboy since his publisher does not appear in the publishers dataset.

(ijsp <- inner_join(superheroes, publishers))
# A tibble: 6 x 5
  name     alignment gender publisher yr_founded
  <chr>    <chr>     <chr>  <chr>          <dbl>
1 Magneto  bad       male   Marvel          1939
2 Storm    good      female Marvel          1939
3 Mystique bad       female Marvel          1939
4 Batman   good      male   DC              1934
5 Joker    bad       male   DC              1934
6 Catwoman bad       female DC              1934

full_join

full_join(x, y): Return all rows and all columns from both x and y. Where there are not matching values, returns NA for the one missing. This is a mutating join. This gives us all variables from x = superheroes plus those from y = publishers. Any row is just from one of the dataframes fills up with NAs for the stuff that normally would have come from the other frame.

(fjsp <- full_join(superheroes, publishers))
# A tibble: 8 x 5
  name     alignment gender publisher         yr_founded
  <chr>    <chr>     <chr>  <chr>                  <dbl>
1 Magneto  bad       male   Marvel                  1939
2 Storm    good      female Marvel                  1939
3 Mystique bad       female Marvel                  1939
4 Batman   good      male   DC                      1934
5 Joker    bad       male   DC                      1934
6 Catwoman bad       female DC                      1934
7 Hellboy  good      male   Dark Horse Comics         NA
8 <NA>     <NA>      <NA>   Image                   1992

semi_join

semi_join(x, y): Return all rows from x where there are matching values in y, keeping just columns from x. A semi join differs from an inner join because an inner join will return one row of x for each matching row of y, where a semi join will never duplicate rows of x. This is a filtering join.

(sjsp <- semi_join(superheroes, publishers))
# A tibble: 6 x 4
  name     alignment gender publisher
  <chr>    <chr>     <chr>  <chr>    
1 Magneto  bad       male   Marvel   
2 Storm    good      female Marvel   
3 Mystique bad       female Marvel   
4 Batman   good      male   DC       
5 Joker    bad       male   DC       
6 Catwoman bad       female DC       

This basically gives us the same thing as the inner_join, but with a different order and without that “year founded” variable that was in the publishers dataset.

anti_join

Return all rows from x where there are not matching values in y, keeping just columns from x. This is a filtering join. This will only keep Hellboy. Fitting, since he was ditched in the inner join and semi join. We also do not get that founding year thing, since this only keeps the columns from X.

(ajsp <- anti_join(superheroes, publishers))
# A tibble: 1 x 4
  name    alignment gender publisher        
  <chr>   <chr>     <chr>  <chr>            
1 Hellboy good      male   Dark Horse Comics

Important note

These were all joining by the publisher variable since those are the only columns shared by the two dataframes. R figured this out on its own, so I did not specify by = "publisher" for each.

Other Functions

Union and Select

UNION function in R combines all rows from both the tables and removes duplicate records from the combined dataset Select is an older thing, it takes .data, and what columns to select but it is most commonly used in a pipe.

hero_names <- select(.data=superheroes, name)
hero_names
# A tibble: 7 x 1
  name    
  <chr>   
1 Magneto 
2 Storm   
3 Mystique
4 Batman  
5 Joker   
6 Catwoman
7 Hellboy 
publisher_names <- select(.data=publishers, publisher)
publisher_names
# A tibble: 3 x 1
  publisher
  <chr>    
1 DC       
2 Marvel   
3 Image    
df1 = data.frame(CustomerId = c(1:6), Product = c(rep("Oven", 3), rep("Television", 3)))
df2 = data.frame(CustomerId = c(4:7), Product = c(rep("Television", 2), rep("Air conditioner", 2)))
union(df1,df2)
  CustomerId         Product
1          1            Oven
2          2            Oven
3          3            Oven
4          4      Television
5          5      Television
6          6      Television
7          6 Air conditioner
8          7 Air conditioner

bind_rows and bind_cols

This is not a great example of bind_rows but it still shows what it does. Each argument can either be a data frame, a list that could be a data frame, or a list of data frames.

When row-binding, columns are matched by name, and any missing columns will be filled with NA.

When column-binding, rows are matched by position, so all data frames must have the same number of rows. To match by value, not position, see join.

.id is the Data frame identifier.

When .id is supplied, a new column of identifiers is created to link each row to its original data frame. The labels are taken from the named arguments to bind_rows(). When a list of data frames is supplied, the labels are taken from the names of the list. If no names are found a numeric sequence is used instead.

one <- mtcars[1:4, ]
two <- mtcars[11:14, ]


three<-bind_rows(one, two)
three
   mpg cyl  disp  hp drat    wt  qsec vs am gear carb
1 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
2 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
3 22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
4 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
5 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
6 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
7 17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
8 15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3

Rename

Rename takes a data argument, then new_name = old_name

three%>%
  rename(cylinders = cyl)
   mpg cylinders  disp  hp drat    wt  qsec vs am gear carb
1 21.0         6 160.0 110 3.90 2.620 16.46  0  1    4    4
2 21.0         6 160.0 110 3.90 2.875 17.02  0  1    4    4
3 22.8         4 108.0  93 3.85 2.320 18.61  1  1    4    1
4 21.4         6 258.0 110 3.08 3.215 19.44  1  0    3    1
5 17.8         6 167.6 123 3.92 3.440 18.90  1  0    4    4
6 16.4         8 275.8 180 3.07 4.070 17.40  0  0    3    3
7 17.3         8 275.8 180 3.07 3.730 17.60  0  0    3    3
8 15.2         8 275.8 180 3.07 3.780 18.00  0  0    3    3

PURRR

purrr’s reduce() function is very useful for joining together multiple datasets. Reduce applies a function to a list of objects.

list(superheroes, publishers) %>% 
  reduce(left_join, by="publisher")
# A tibble: 7 x 5
  name     alignment gender publisher         yr_founded
  <chr>    <chr>     <chr>  <chr>                  <dbl>
1 Magneto  bad       male   Marvel                  1939
2 Storm    good      female Marvel                  1939
3 Mystique bad       female Marvel                  1939
4 Batman   good      male   DC                      1934
5 Joker    bad       male   DC                      1934
6 Catwoman bad       female DC                      1934
7 Hellboy  good      male   Dark Horse Comics         NA

More functions

Here are some more examples of functions in purrr. These examples should use the lahman datasets and package, but those are some of the datasets that are not working well. Alas. Intersect, arrange, distinct, and the older dplyr functions are all useful here.

lahmanNames%>% reduce(intersect)

lahmanNames %>%
# Bind the data frames in lahmanNames bind_rows(.id = “dataframe”) %>% # Group the result by var group_by(var) %>% # Tally the number of appearances tally() %>% # Filter the data filter(n > 1) %>% # Arrange the results arrange(desc(n))

lahmanNames %>% # Bind the data frames bind_rows(.id = “dataframe”) %>% # Filter the results filter(var==‘playerID’) %>% # Extract the dataframe variable $(dataframe)

players <- Master %>% # Return one row for each distinct player distinct(playerID, nameFirst, nameLast)

players %>% # Find all players who do not appear in Salaries anti_join(Salaries, by = “playerID”) %>% # Count them count()

players %>% anti_join(Salaries, by = “playerID”) %>% # How many unsalaried players appear in Appearances? semi_join(Appearances, by = “playerID”) %>% count()

players %>% # Find all players who do not appear in Salaries anti_join(Salaries, by = “playerID”) %>% # Join them to Appearances left_join(Appearances, by = “playerID”) %>% # Calculate total_games for each player group_by(playerID) %>% summarize(total_games = sum(G_all, na.rm = TRUE)) %>% # Arrange in descending order by total_games arrange(desc(total_games))

players %>% # Find all players who do not appear in Salaries anti_join(Salaries, by = “playerID”) %>% # Join them to Appearances left_join(Batting, by = “playerID”) %>% # Calculate total_games for each player group_by(playerID) %>% summarize(total_at_bat = sum(AB, na.rm = TRUE)) %>% # Arrange in descending order by total_games arrange(desc(total_at_bat))

Finding the distinct players that appear in HallOfFame

nominated <- HallOfFame %>% distinct(playerID)

nominated %>% # Count the number of players in nominated count()

nominated_full <- nominated %>% # Join to Master left_join(Master, by = “playerID”) %>% # Return playerID, nameFirst, nameLast select(playerID, nameFirst, nameLast)

Appearances %>% # Filter Appearances against nominated semi_join(nominated, by = “playerID”) %>% # Find last year played by player group_by(playerID) %>% summarize(last_year = max(yearID)) %>% # Join to full HallOfFame left_join(HallOfFame, by = “playerID”) %>% # Filter for unusual observations filter(last_year >= yearID)