library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

# Dataset 2: Orders
orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)

TASK 1

How many rows are in the result?

4 rows

Why are some customers or orders not included in the result?

Only customers with orders are included. Customers without orders or orders without matching customers are excluded.

Perform an inner join between customers and orders

result <- inner_join(customers, orders, by = "customer_id")

Display the result of the inner join

print(result)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

Count the number of rows in the result

n_rows <- nrow(result)
print(paste("Number of rows in the result:", n_rows))
## [1] "Number of rows in the result: 4"

TASK 2

How many rows are in the result?

5

Explain why this number differs from the inner join result.

A left join includes all rows from the left table (customers), even if there is no matching row in the right table (orders). If there’s no match, NA values will be added for the right table’s columns. In the inner join, only matching rows are kept. # Perform left join result <- left_join(customers, orders, by = “customer_id”)

Display result

print(result)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

Number of rows in result

n_rows <- nrow(result)
print(paste("Number of rows in the result:", n_rows))
## [1] "Number of rows in the result: 4"

TASK 3

How many rows are in the result?

5

Which customer_ids in the result have NULL for customer name and city? Explain why.

A right join includes all rows from the right table (orders) and only the matching rows from the left table (customers). Since customer_id = 6 has no matching entry in the customers table, the name and city fields will be NULL.

Perform right join

result <- right_join(customers, orders, by = "customer_id")

Display result

print(result)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150

Number of rows in result

n_rows <- nrow(result) print(paste(“Number of rows in the result:”, n_rows))

TASK 4

How many rows are in the result?

6

Identify any rows where there’s information from only one table. Explain these results.

Rows with information from only one table include customer_id = 3, customer_id = 4, and customer_id = 6. For customer_id = 3 and customer_id = 4, there are no matching orders, so order_id and order_value are NA. For customer_id = 6, there is no matching customer, so name and city are NA. These results occur because a full join keeps all rows from both tables, filling in NA where no match exists.

Perform full join

result <- full_join(customers, orders, by = "customer_id")

Display result

print(result)
## # A tibble: 8 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
## 7           6 <NA>    <NA>             105 Camera     600
## 8           7 <NA>    <NA>             106 Printer    150

Number of rows in result

n_rows <- nrow(result) print(paste(“Number of rows in the result:”, n_rows))

Task 5

How many rows are in the result?

3

How does this result differ from the inner join result?

A semi join returns only rows from the customers table that have matching customer_ids in the orders table, but it does not include columns from orders. This differs from an inner join, which includes columns from both tables.

Perform semi join

result <- semi_join(customers, orders, by = "customer_id")

Display result

print(result)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago

Task 6

Which customers are in the result?

The result includes customers with customer_id = 3 and customer_id = 4

Explain what this result tells you about these customers.

These customers appear in the customers table but do not have any matching records in the orders table. This means they have not placed any orders.

Perform anti join

result <- anti_join(customers, orders, by = "customer_id")

Display result

print(result)
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix

Task 7

Which join would you use to find all customers, including those who haven’t placed any orders? Why?

A left join keeps all customers from the customers table, adding order details where available. If a customer has not placed an order, the order columns will be NA.

Which join would you use to find only the customers who have placed orders? Why?

To find only customers who have placed orders, use a semi join. A semi join returns only customers who have matching orders but does not include order details, ensuring that only customers who have placed orders appear in the result.

(a) Left Join: Find all customers, including those without orders

all_customers <- left_join(customers, orders, by = "customer_id")
print("All customers, including those without orders:")
## [1] "All customers, including those without orders:"
print(all_customers)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

(b) Semi Join: Find only customers who have placed orders

customers_with_orders <- semi_join(customers, orders, by = "customer_id")
print("Customers who have placed orders:")
## [1] "Customers who have placed orders:"
print(customers_with_orders)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago

Task 8 Extra Credit

Perform a left join to include all customers

customer_summary <- customers %>%
  left_join(orders, by = "customer_id") %>%
  group_by(customer_id, name, city) %>%
  summarize(
    total_orders = n(),
    total_spent = sum(amount, na.rm = TRUE),
    .groups = "drop")

Display the result

print(customer_summary)
## # A tibble: 5 × 5
##   customer_id name    city        total_orders total_spent
##         <dbl> <chr>   <chr>              <int>       <dbl>
## 1           1 Alice   New York               1        1200
## 2           2 Bob     Los Angeles            2        2300
## 3           3 Charlie Chicago                1         300
## 4           4 David   Houston                1           0
## 5           5 Eve     Phoenix                1           0