library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

# Dataset 2: Orders
orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)

INNER JOIN

Question 1: Perform an inner join between the customers and orders datasets.

q1 <- inner_join(customers, orders, by = "customer_id")

How many rows are in the result?

nrow(q1)
## [1] 4

Why are some customers or orders not included?

Only customers with orders appear (customer_id 1, 2, 3).
Customers 4 & 5 have no orders. Orders 105 & 106 (customer_id 6, 7) have no match.

Display the result

head(q1)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

LEFT JOIN

Question 2: Perform a left join (customers = left, orders = right)

q2 <- left_join(customers, orders, by = "customer_id")

How many rows are in the result?

nrow(q2)
## [1] 6

Display the result

head(q2)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

RIGHT JOIN

Question 3: Perform a right join (customers = left, orders = right)

q3 <- right_join(customers, orders, by = "customer_id")

How many rows are in the result?

nrow(q3)
## [1] 6

Display Result

head(q3)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150

Which customer_ids have NULL for name and city?

q3 %>% filter(is.na(name)) %>% select(customer_id)
## # A tibble: 2 × 1
##   customer_id
##         <dbl>
## 1           6
## 2           7
=> customer_id 6 and 7 (no matching customer)

Display Result

head(q3)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150

FULL JOIN

Question 4: Perform a full join

q4 <- full_join(customers, orders, by = "customer_id")

How many rows are in the result?

nrow(q4)
## [1] 8

Identify rows with info from only one table

q4 %>% filter(is.na(order_id) | is.na(name))
## # A tibble: 4 × 6
##   customer_id name  city    order_id product amount
##         <dbl> <chr> <chr>      <dbl> <chr>    <dbl>
## 1           4 David Houston       NA <NA>        NA
## 2           5 Eve   Phoenix       NA <NA>        NA
## 3           6 <NA>  <NA>         105 Camera     600
## 4           7 <NA>  <NA>         106 Printer    150
Explanation: customers 4 & 5 only in customers; 6 & 7 only in orders

Display Result

head(q4)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

SEMI JOIN

Question 5: Perform a semi join (customers left, orders right)

q5 <- semi_join(customers, orders, by = "customer_id")

How many rows are in the result?

nrow(q5)
## [1] 3

How does this differ from inner join?

Only shows customer info, no order details.

Display result

head(q5)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago

ANTI JOIN

Question 6: Perform an anti join (customers left, orders right)

q6 <- anti_join(customers, orders, by = "customer_id")

How many rows are in the result?

nrow(q6)
## [1] 2

Which customers are in the result?

→ David and Eve (customer_id 4 and 5)
Explanation: These customers have not placed any orders.

Display Result

head(q6)
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix

Practical Application

(a) Find all customers including those without orders

all_customers <- left_join(customers, orders, by = "customer_id")
head(all_customers)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

(b) Find only customers who have placed orders

customers_with_orders <- inner_join(customers, orders, by = "customer_id")
head(customers_with_orders)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

Explanation:

Left join = all customers (matched + unmatched)
Inner join = only matched customers

CHALLENGING QUESTION

Create a summary with each customer’s total orders and amount

Recreate original datasets safely
# 
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)
Challenging Question
summary_table <- left_join(customers, orders, by = "customer_id") %>%
  group_by(customer_id, name, city) %>%
  summarize(
    total_orders = n_distinct(order_id, na.rm = TRUE),
    total_amount = sum(amount, na.rm = TRUE),
    .groups = "drop"
  )

summary_table
## # A tibble: 5 × 5
##   customer_id name    city        total_orders total_amount
##         <dbl> <chr>   <chr>              <int>        <dbl>
## 1           1 Alice   New York               1         1200
## 2           2 Bob     Los Angeles            2         2300
## 3           3 Charlie Chicago                1          300
## 4           4 David   Houston                0            0
## 5           5 Eve     Phoenix                0            0