Demonstrate the inner join and other join functions using dplyr
##Load libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##Setup data
data_A <- data.frame(user_id = c(1, 2, 3, 4, 5),
name = c('A', 'B', 'C', 'D', 'E'))
data_B <- data.frame(user_id = c(1, 1, 3, 3, 3, 5, 6, 7, 8),
item = c('X', 'Y', 'X', 'Y', 'Z', 'Y', 'X', 'Z', 'Y'))
print(paste('Number of unique users (A) =', length(unique(data_A$user_id))))
## [1] "Number of unique users (A) = 5"
print(paste('User id list (A):', paste(unique(data_A$user_id), collapse = ', ')))
## [1] "User id list (A): 1, 2, 3, 4, 5"
print(paste('Number of unique users (B) =', length(unique(data_B$user_id))))
## [1] "Number of unique users (B) = 6"
print(paste('User id list (B):', paste(unique(data_B$user_id), collapse = ', ')))
## [1] "User id list (B): 1, 3, 5, 6, 7, 8"
##Inner join
Gives a new data frame consisting of the columns from both input data frames using with rows given by the intersection of the column values specified in the by field.
If no by field is specified then the function will use any column(s) with the same name.
output <- inner_join(data_A, data_B, by = 'user_id')
print(paste('Number of unique users in output =', length(unique(output$user_id))))
## [1] "Number of unique users in output = 3"
print(paste('User id list:', paste(unique(output$user_id), collapse = ', ')))
## [1] "User id list: 1, 3, 5"
##Left and right joins
Outputs a new data frame with the same format as inner join but using all rows on the left/right data frame.
output_L <- left_join(data_A, data_B, by = 'user_id')
print(paste('Number of unique users in left join output =',
length(unique(output_L$user_id))))
## [1] "Number of unique users in left join output = 5"
print(paste('User id list:', paste(unique(output_L$user_id), collapse = ', ')))
## [1] "User id list: 1, 2, 3, 4, 5"
output_R <- right_join(data_A, data_B, by = 'user_id')
print(paste('Number of unique users in right join output =',
length(unique(output_R$user_id))))
## [1] "Number of unique users in right join output = 6"
print(paste('User id list:', paste(unique(output_R$user_id), collapse = ', ')))
## [1] "User id list: 1, 3, 5, 6, 7, 8"
##Full join
The full join is also called an outer join and outputs a new data frame with the same format as inner join but using all rows from both the left and right data frames.
output <- full_join(data_A, data_B, by = 'user_id')
print(paste('Number of unique users in output =', length(unique(output$user_id))))
## [1] "Number of unique users in output = 8"
print(paste('User id list:', paste(unique(output$user_id), collapse = ', ')))
## [1] "User id list: 1, 2, 3, 4, 5, 6, 7, 8"