library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

# Dataset 2: Orders
orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)

Question 1

inner <- inner_join(customers, orders, by = "customer_id")
head(inner)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

There are 4 rows. Some customers and orders are not included because they are only found in one of the datasets.

Question 2

left <- left_join(customers, orders, by ="customer_id")
head(left)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

There are 6 rows. An inner join only includes rows where there is a match in both tables. The left join returns all data from the left table, regardless of whether there are corresponding records in the right table.

Question 3

right <- right_join(customers, orders, by = "customer_id")
head(right)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150

There are 6 rows. The customer ids that have null results are customer ids 6 and 7. This is because when using a right join, the data set on the right, in this case orders, is your data that is being returned. The left data set, which is customers, is filling in additional information with matching customer ids. Since the orders data has ids 6 and 7, but customers do not, there is null information provided.

Question 4

full <- full_join(customers, orders, by = "customer_id")
head(full)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

There are 8 rows. There are 4 rows with information from only one table. Row 4 and 5 only have information from customers, this is likely because these customers created an account but have not placed an order yet. Rows 7 and 8 do not have the customer name or city, meaning it may have been ordered through a guest account, or they can not find the information.

Question 5

semi <- semi_join(customers, orders, by = "customer_id")
head(semi)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago

There 3 rows. A semi join provides a table of customers who have placed orders, without showing the order details. This is different as an inner join would provide the details of the order. The semi join only returns columns from the first table but these columns have to match the second tables.

Question 6

anti <- anti_join(customers, orders, by = "customer_id")
head(anti)
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix

David and Eve are in these results. This result tells us that David and Eve have not made an order.

Question 7

A. You can use a left join. Left join, when you make customers the left, also provides you were every customer, regardless of order history.

B. Use an inner join as, this will return only the customers who have matching records in the orders dataset, effectively filtering out those who haven’t placed any orders.

all_customers <- customers %>%
    left_join(orders, by = "customer_id")
  head(all_customers)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
   customers_with_orders <- customers %>%
    inner_join(orders, by = "customer_id")
  head(customers_with_orders)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

Question 8

customer_summary <- customers %>%
  left_join(orders, by = "customer_id") %>%
  group_by(customer_id, name, city) %>%
  summarize(
    total_orders = n(),  
    total_amount_spent = sum(amount, na.rm = TRUE)  
  ) %>%
  ungroup()  
## `summarise()` has grouped output by 'customer_id', 'name'. You can override
## using the `.groups` argument.