library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Customers
customers <- tibble(
customer_id = c(1, 2, 3, 4, 5),
name = c("Alice", "Bob", "Charlie", "David", "Eve"),
city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)
# Orders
orders <- tibble(
order_id = c(101, 102, 103, 104, 105, 106),
customer_id = c(1, 2, 3, 2, 6, 7),
product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
amount = c(1200, 800, 300, 1500, 600, 150)
)
# datasets
customers
## # A tibble: 5 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 1 Alice New York
## 2 2 Bob Los Angeles
## 3 3 Charlie Chicago
## 4 4 David Houston
## 5 5 Eve Phoenix
orders
## # A tibble: 6 × 4
## order_id customer_id product amount
## <dbl> <dbl> <chr> <dbl>
## 1 101 1 Laptop 1200
## 2 102 2 Phone 800
## 3 103 3 Tablet 300
## 4 104 2 Desktop 1500
## 5 105 6 Camera 600
## 6 106 7 Printer 150
1
inner_join_result <- inner_join(customers, orders, by = "customer_id")
nrow(inner_join_result) # Number of rows
## [1] 4
# result
inner_join_result
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
Why are some customers or orders not included?
In an inner join, only records with matching customer_id in
both datasets are included. Customers without orders and orders with
customer_ids that don’t exist in the customers
table are excluded.
2.
left_join_result <- left_join(customers, orders, by = "customer_id")
nrow(left_join_result) # Number of rows
## [1] 6
# result
left_join_result
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
Why does this result differ from the inner
join?
A left join includes all records from the customers table,
regardless of whether they have matching entries in the
orders table. This leads to more rows than the inner join
because customers without orders are also included.
3.
right_join_result <- right_join(customers, orders, by = "customer_id")
nrow(right_join_result) # Number of rows
## [1] 6
# result
right_join_result
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 6 <NA> <NA> 105 Camera 600
## 6 7 <NA> <NA> 106 Printer 150
Which customer_ids have NULL values for customer name and
city?
customer_ids that exist in orders but not in
customers will have NULL for name
and city. In this case, orders with
customer_id = 6 and customer_id = 7 will show
NULL values.
4.
full_join_result <- full_join(customers, orders, by = "customer_id")
nrow(full_join_result) # Number of rows
## [1] 8
# result
full_join_result
## # A tibble: 8 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
## 7 6 <NA> <NA> 105 Camera 600
## 8 7 <NA> <NA> 106 Printer 150
Rows with information from only one table:
Full joins include all records from both tables. For rows with
NULL values from one table, this happens when a customer
has no corresponding orders or an order has no matching customer.
5.
semi_join_result <- semi_join(customers, orders, by = "customer_id")
nrow(semi_join_result) # Number of rows
## [1] 3
# result
semi_join_result
## # A tibble: 3 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 1 Alice New York
## 2 2 Bob Los Angeles
## 3 3 Charlie Chicago
Difference from the inner join:
A semi join only returns rows from the customers table
where there is a match in the orders table. Unlike the
inner join, it does not include columns from the orders
table.
6.
anti_join_result <- anti_join(customers, orders, by = "customer_id")
# result
anti_join_result
## # A tibble: 2 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 4 David Houston
## 2 5 Eve Phoenix
Which customers are in the result?
This result shows customers who have no matching orders. It includes
customers from the customers table who are not present in
the orders table.
7.
Find all customers, including those who haven’t placed any
orders:
Use a left join:
all_customers <- left_join(customers, orders, by = "customer_id")
# result
all_customers
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
Find only the customers who have placed
orders:
Use an inner join:
customers_with_orders <- inner_join(customers, orders, by = "customer_id")
# result
customers_with_orders
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
8.
To summarize each customer’s total number of orders and the total amount they spent, while including all customers (even those without orders):
summary_result <- customers %>%
left_join(orders, by = "customer_id") %>%
group_by(name, city) %>%
summarize(
total_orders = n(),
total_amount_spent = sum(amount, na.rm = TRUE)
)
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.
# result
summary_result
## # A tibble: 5 × 4
## # Groups: name [5]
## name city total_orders total_amount_spent
## <chr> <chr> <int> <dbl>
## 1 Alice New York 1 1200
## 2 Bob Los Angeles 2 2300
## 3 Charlie Chicago 1 300
## 4 David Houston 1 0
## 5 Eve Phoenix 1 0