library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Dataset 1: Customers
customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

# Dataset 2: Orders
orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)

Inner Join (3 points) Perform an inner join between the customers and orders datasets.

How many rows are in the result?

Why are some customers or orders not included in the result?

Display the result

q1 <- inner_join(customers, orders, by = "customer_id")
head(q1)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
# There are four rows in the result.
# Some customers and orders are not included in the result because only customers and orders with matching customer_id in both tables are included.

Left Join (3 points) Perform a left join with customers as the left table and orders as the right table.

How many rows are in the result?

Explain why this number differs from the inner join result.

Display the result

q2 <- left_join(customers, orders, by = "customer_id")
head(q2)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
# There are six rows in the result.
# This number differs from the inner join result because all customers are included, even those without order information. 

Right Join (3 points) Perform a right join with customers as the left table and orders as the right table.

How many rows are in the result?

Which customer_ids in the result have NULL for customer name and city? Explain why.

Display the result

q3 <- right_join(customers, orders, by = "customer_id")
head(q3)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150
# There are six rows in the result. 
# The customer_id 6 and 7 have NULL as the result for customer name and city. This is because all orders are included, even orders for customers that are not included in the customers table.

Full Join (3 points) Perform a full join between customers and orders.

How many rows are in the result?

Identify any rows where there’s information from only one table. Explain these results.

Display the result

q4 <- full_join(customers, orders, by = "customer_id")
head(q4)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
# There are eight rows in the result. 
# Row 7 and 8 only contain information from the order table. Row 5 and 6 only contain information from the customer table. The results are this way because all rows from both tables are included, the rows with information from only one table means that there is no match for the information on the other table. 

Semi Join (3 points) Perform a semi join with customers as the left table and orders as the right table.

How many rows are in the result?

How does this result differ from the inner join result?

Display the result

q5 <- semi_join(customers, orders, by = "customer_id")
head(q5)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago
# There are three rows in the result. 
# This result differs from the inner join result because only customers with order information are returned, but order data is not included.
# This result also differs from the inner join result because semi join does not duplicate the rows in the left, customer, table when there is more than one matching row in the right, order, table. 

Anti Join (3 points) Perform an anti join with customers as the left table and orders as the right table.

Which customers are in the result?

Explain what this result tells you about these customers.

Display the result

q6 <- anti_join(customers, orders, by = "customer_id")
head(q6)
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix
# Customer 4, David, and customer 5, Eve, are in the result. 
# This result returns only customers without order information. This tells me that David and Eve have not placed any orders.

Practical Application (4 points) Imagine you’re analyzing customer behavior.

Which join would you use to find all customers, including those who haven’t placed any orders? Why?

Which join would you use to find only the customers who have placed orders? Why?

Write the R code for both scenarios.

Display the result

# I would use full_join to find all customers, including those who have not placed any orders, because full_join returns all rows when there is a match in either the left or right table, full_join returns all seven customers. 
# I would use right_join to find only the customers who have placed orders because right_join returns all rows from the right table, orders, and matching rows from the left table, customers.  

q7a <- full_join(customers, orders, by = "customer_id")
head(q7a)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
q7b <- right_join(customers, orders, by = "customer_id")
head(q7b)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150

Challenge Question (3 points) Create a summary that shows each customer’s name, city, total number of orders, and total amount spent. Include all customers, even those without orders. Hint: You’ll need to use a combination of joins and group_by/summarize operations.

q8 <- left_join(customers, orders, by = "customer_id")
head(q8)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA
summary <- q8 %>%
  group_by(customer_id, name, city) %>%
  summarize(
    total_orders = sum(!is.na(order_id)),
    total_amount_spent = sum(amount, na.rm = TRUE)
  )
## `summarise()` has grouped output by 'customer_id', 'name'. You can override
## using the `.groups` argument.
head(summary)
## # A tibble: 5 × 5
## # Groups:   customer_id, name [5]
##   customer_id name    city        total_orders total_amount_spent
##         <dbl> <chr>   <chr>              <int>              <dbl>
## 1           1 Alice   New York               1               1200
## 2           2 Bob     Los Angeles            2               2300
## 3           3 Charlie Chicago                1                300
## 4           4 David   Houston                0                  0
## 5           5 Eve     Phoenix                0                  0