library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# -----------------------------
# Dataset Creation
# -----------------------------

customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)

—————————–

1. INNER JOIN

—————————–

inner_result <- customers %>%
  inner_join(orders, by = "customer_id")

inner_result
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
nrow(inner_result)
## [1] 4

Explanation:

Only customers 1, 2, and 3 appear because they have matching customer_id in orders.

Orders with customer_id 6 and 7 are excluded because no matching customer exists.

—————————–

2. LEFT JOIN

—————————–

left_result <- customers %>%
  left_join(orders, by = "customer_id")

left_result
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
nrow(left_result)
## [1] 6

Explanation:

All customers appear (5 rows). Customers without orders (4 and 5) show NA for order fields.

—————————–

3. RIGHT JOIN

—————————–

right_result <- customers %>%
  right_join(orders, by = "customer_id")

right_result
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150
nrow(right_result)
## [1] 6

Explanation:

customer_id 6 and 7 have no match in customers, so name and city = NA.

—————————–

4. FULL JOIN

—————————–

full_result <- customers %>%
  full_join(orders, by = "customer_id")

full_result
## # A tibble: 8 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
## 7           6 <NA>    <NA>             105 Camera     600
## 8           7 <NA>    <NA>             106 Printer    150
nrow(full_result)
## [1] 8

Explanation:

Includes all customers and all orders.

Rows with customer_id 4, 5 (no orders) and 6, 7 (no customers) contain NA values.

—————————–

5. SEMI JOIN

—————————–

semi_result <- customers %>%
  semi_join(orders, by = "customer_id")

semi_result
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago
nrow(semi_result)
## [1] 3

Explanation:

Returns only customers who have at least one order.

Unlike inner join, it does NOT include order details.

—————————–

6. ANTI JOIN

—————————–

anti_result <- customers %>%
  anti_join(orders, by = "customer_id")

anti_result
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix

Explanation:

Customers 4 and 5 appear — they have no matching orders.

—————————–

7. PRACTICAL APPLICATION

—————————–

A. All customers, including those with no orders → LEFT JOIN

all_customers_orders <- customers %>%
  left_join(orders, by = "customer_id")

all_customers_orders
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

B. Only customers who have placed orders → INNER JOIN

customers_with_orders <- customers %>%
  inner_join(orders, by = "customer_id")

customers_with_orders
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

—————————–

8. CHALLENGE QUESTION

—————————–

summary_table <- customers %>%
  left_join(orders, by = "customer_id") %>%
  group_by(customer_id, name, city) %>%
  summarize(
    total_orders = n(),
    total_amount = sum(amount, na.rm = TRUE),
    .groups = "drop"
  )

summary_table
## # A tibble: 5 × 5
##   customer_id name    city        total_orders total_amount
##         <dbl> <chr>   <chr>              <int>        <dbl>
## 1           1 Alice   New York               1         1200
## 2           2 Bob     Los Angeles            2         2300
## 3           3 Charlie Chicago                1          300
## 4           4 David   Houston                1            0
## 5           5 Eve     Phoenix                1            0