library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# -----------------------------
# Dataset Creation
# -----------------------------
customers <- tibble(
customer_id = c(1, 2, 3, 4, 5),
name = c("Alice", "Bob", "Charlie", "David", "Eve"),
city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)
orders <- tibble(
order_id = c(101, 102, 103, 104, 105, 106),
customer_id = c(1, 2, 3, 2, 6, 7),
product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
amount = c(1200, 800, 300, 1500, 600, 150)
)
—————————–
1. INNER JOIN
—————————–
inner_result <- customers %>%
inner_join(orders, by = "customer_id")
inner_result
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
nrow(inner_result)
## [1] 4
Explanation:
Only customers 1, 2, and 3 appear because they have matching
customer_id in orders.
Orders with customer_id 6 and 7 are excluded because no matching
customer exists.
—————————–
2. LEFT JOIN
—————————–
left_result <- customers %>%
left_join(orders, by = "customer_id")
left_result
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
nrow(left_result)
## [1] 6
Explanation:
All customers appear (5 rows). Customers without orders (4 and 5)
show NA for order fields.
—————————–
3. RIGHT JOIN
—————————–
right_result <- customers %>%
right_join(orders, by = "customer_id")
right_result
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 6 <NA> <NA> 105 Camera 600
## 6 7 <NA> <NA> 106 Printer 150
nrow(right_result)
## [1] 6
Explanation:
customer_id 6 and 7 have no match in customers, so name and city =
NA.
—————————–
4. FULL JOIN
—————————–
full_result <- customers %>%
full_join(orders, by = "customer_id")
full_result
## # A tibble: 8 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
## 7 6 <NA> <NA> 105 Camera 600
## 8 7 <NA> <NA> 106 Printer 150
nrow(full_result)
## [1] 8
Explanation:
Includes all customers and all orders.
Rows with customer_id 4, 5 (no orders) and 6, 7 (no customers)
contain NA values.
—————————–
5. SEMI JOIN
—————————–
semi_result <- customers %>%
semi_join(orders, by = "customer_id")
semi_result
## # A tibble: 3 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 1 Alice New York
## 2 2 Bob Los Angeles
## 3 3 Charlie Chicago
nrow(semi_result)
## [1] 3
Explanation:
Returns only customers who have at least one order.
Unlike inner join, it does NOT include order details.
—————————–
6. ANTI JOIN
—————————–
anti_result <- customers %>%
anti_join(orders, by = "customer_id")
anti_result
## # A tibble: 2 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 4 David Houston
## 2 5 Eve Phoenix
Explanation:
Customers 4 and 5 appear — they have no matching orders.
—————————–
7. PRACTICAL APPLICATION
—————————–
A. All customers, including those with no orders → LEFT JOIN
all_customers_orders <- customers %>%
left_join(orders, by = "customer_id")
all_customers_orders
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
B. Only customers who have placed orders → INNER JOIN
customers_with_orders <- customers %>%
inner_join(orders, by = "customer_id")
customers_with_orders
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
—————————–
8. CHALLENGE QUESTION
—————————–
summary_table <- customers %>%
left_join(orders, by = "customer_id") %>%
group_by(customer_id, name, city) %>%
summarize(
total_orders = n(),
total_amount = sum(amount, na.rm = TRUE),
.groups = "drop"
)
summary_table
## # A tibble: 5 × 5
## customer_id name city total_orders total_amount
## <dbl> <chr> <chr> <int> <dbl>
## 1 1 Alice New York 1 1200
## 2 2 Bob Los Angeles 2 2300
## 3 3 Charlie Chicago 1 300
## 4 4 David Houston 1 0
## 5 5 Eve Phoenix 1 0