library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
customer_id = c(1, 2, 3, 4, 5),
name = c("Alice", "Bob", "Charlie", "David", "Eve"),
city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)
# Dataset 2: Orders
orders <- tibble(
order_id = c(101, 102, 103, 104, 105, 106),
customer_id = c(1, 2, 3, 2, 6, 7),
product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
amount = c(1200, 800, 300, 1500, 600, 150)
)
# Perform an inner join
inner_join_result <- customers %>%
inner_join(orders, by = "customer_id")
# Display the result
inner_join_result
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
# How many rows in the result?
nrow(inner_join_result)
## [1] 4
#1a. 4 rows
#1b. Only rows where customer_id exists in both tables are included. Customers 4 and 5 are excluded because they contain no orders in the orders table, and orders with customer_id 6 and 7 are excluded because they contain no matching customer.
# Perform a left join
left_join_result <- customers %>%
left_join(orders, by = "customer_id")
# Display the result
left_join_result
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
# How many rows in the result?
nrow(left_join_result)
## [1] 6
#2a. 5 rows
#2b. All rows from the customers table are included even if there is no matching order and rows for customers 4 and 5 will have N/A in the columns from orders.
# Perform a right join
right_join_result <- customers %>%
right_join(orders, by = "customer_id")
# Display the result
right_join_result
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 6 <NA> <NA> 105 Camera 600
## 6 7 <NA> <NA> 106 Printer 150
# How many rows in the result?
nrow(right_join_result)
## [1] 6
#3a. 6 rows
#3b. All rows from orders are included even if there is no matching customer. Rows with customer_id 6 and 7 will have N/A for name and city.
# Perform a full join
full_join_result <- customers %>%
full_join(orders, by = "customer_id")
# Display the result
full_join_result
## # A tibble: 8 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
## 7 6 <NA> <NA> 105 Camera 600
## 8 7 <NA> <NA> 106 Printer 150
# How many rows in the result?
nrow(full_join_result)
## [1] 8
#4a. 7 rows
#4b. Combines all rows from both tables. Rows with no match in either table will have N/A in the columns from the other table.
# Perform a semi join
semi_join_result <- customers %>%
semi_join(orders, by = "customer_id")
# Display the result
semi_join_result
## # A tibble: 3 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 1 Alice New York
## 2 2 Bob Los Angeles
## 3 3 Charlie Chicago
# How many rows in the result?
nrow(semi_join_result)
## [1] 3
#5a. 3 rows
#5b. Includes only customers who have placed orders. It is different from the inner join because it keeps only customers columns and not the combined columns.
# Perform an anti join
anti_join_result <- customers %>%
anti_join(orders, by = "customer_id")
# Display the result
anti_join_result
## # A tibble: 2 × 3
## customer_id name city
## <dbl> <chr> <chr>
## 1 4 David Houston
## 2 5 Eve Phoenix
#6a. David and Eve; ID 4 and 5.
#6b. Includes only customers who have not placed any orders.
all_customers_with_orders <- customers %>%
left_join(orders, by = "customer_id")
# Display the result
all_customers_with_orders
## # A tibble: 6 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
## 5 4 David Houston NA <NA> NA
## 6 5 Eve Phoenix NA <NA> NA
#7a. Left join because it ensures all customers are included even if they haven’t placed any orders.
customers_with_orders <- customers %>%
inner_join(orders, by = "customer_id")
# Display the result
customers_with_orders
## # A tibble: 4 × 6
## customer_id name city order_id product amount
## <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 1 Alice New York 101 Laptop 1200
## 2 2 Bob Los Angeles 102 Phone 800
## 3 2 Bob Los Angeles 104 Desktop 1500
## 4 3 Charlie Chicago 103 Tablet 300
#7b. Inner join because it filters the data to include only customers who have placed orders.
# Summary of total orders and amount spent
summary_result <- customers %>%
left_join(orders, by = "customer_id") %>%
group_by(customer_id, name, city) %>%
summarize(
total_orders = n(),
total_amount_spent = sum(amount, na.rm = TRUE)
)
## `summarise()` has grouped output by 'customer_id', 'name'. You can override
## using the `.groups` argument.
# Display the result
summary_result
## # A tibble: 5 × 5
## # Groups: customer_id, name [5]
## customer_id name city total_orders total_amount_spent
## <dbl> <chr> <chr> <int> <dbl>
## 1 1 Alice New York 1 1200
## 2 2 Bob Los Angeles 2 2300
## 3 3 Charlie Chicago 1 300
## 4 4 David Houston 1 0
## 5 5 Eve Phoenix 1 0