library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
customers <- tibble(
customer_id = c(1, 2, 3, 4, 5),
name = c("Alice", "Bob", "Charlie", "David", "Eve"),
city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)
orders <- tibble(
order_id = c(101, 102, 103, 104, 105, 106),
customer_id = c(1, 2, 3, 2, 6, 7),
product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
amount = c(1200, 800, 300, 1500, 600, 150)
)
t
q1 <- inner_join(customers, orders)
## Joining with `by = join_by(customer_id)`
nrow(q1)
## [1] 4
## a. 4 rows
## b. Interjoin returns only when there is a match
head(q1)
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
q2 <- left_join(customers, orders)
## Joining with `by = join_by(customer_id)`
# 6 rows
nrow(q2)
## [1] 6
##left join uses all the rows, interjoin only uses rows that match
head(q2)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
q3 <- right_join(customers, orders)
## Joining with `by = join_by(customer_id)`
## 6 rows
nrow(q3)
## [1] 6
##ID 6 and 7 have NULL for customer name and city because in the orders table, there are no names and cities unlike the customers table. Because of that, when they right joined, they have no info in the table
head(q3)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 6 <NA> <NA> 105 Camera 600
## 6 7 <NA> <NA> 106 Printer 150
q4 <- full_join(customers, orders)
## Joining with `by = join_by(customer_id)`
## 8 Rows
nrow(q4)
## [1] 8
## In Customers 3 and 4 order ID, product and amount appears N/A. For customers 6 and 7, name and city appear N/A. This is becasue in the full join, all of both tables join together meaning all ther parts from customers and orders which don't commonly appear on both tables will still show in the full join.
head(q4)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
q5 <- semi_join(customers, orders)
## Joining with `by = join_by(customer_id)`
## 3 rows
nrow(q5)
## [1] 3
## The table differs from the inner join because the semi join uses 1 row for the customer ID 2 and doesn't have an amount, product or order ID row.
head(q5)
## # A tibble: 3 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 1 Alice New York
## 2 2 Bob Los Angeles
## 3 3 Charlie Chicago
q6 <- anti_join(customers, orders)
## Joining with `by = join_by(customer_id)`
## 2 rows
nrow(q6)
## [1] 2
## The table tells us about the two customers who have not placed orders yet
head(q6)
## # A tibble: 2 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 4 David Houston
## 2 5 Eve Phoenix
##I would use a outer join because it displays the entire customer base, regardless of whether they have placed an order or not
##I would use an right join because it only displays rows where there are matches between both tables, showing only the customers who have placed orders
q7a <- customers %>%
left_join(orders, by = "customer_id")
head(q7a)
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA