library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Dataset 1: Customers

customers <- tibble(
  customer_id = c(1, 2, 3, 4, 5),
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Phoenix")
)

Dataset 2: Orders

orders <- tibble(
  order_id = c(101, 102, 103, 104, 105, 106),
  customer_id = c(1, 2, 3, 2, 6, 7),
  product = c("Laptop", "Phone", "Tablet", "Desktop", "Camera", "Printer"),
  amount = c(1200, 800, 300, 1500, 600, 150)
)

Inner Join (3 points) Perform an inner join between the customers and orders datasets.

Inner <- inner_join(customers , orders)
## Joining with `by = join_by(customer_id)`

A. How many rows are in the result?

  • 4

B. Why are some customers or orders not included in the result?

  • Inner join returns where there is a match on both tables
  • The customer data has only 3 customers in the order table where one customer has 2 orders

C. Display the result

  head(Inner)
## # A tibble: 4 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300

Left Join (3 points) Perform a left join with customers as the left table and orders as the right table.

Left <- left_join(customers , orders, by = "customer_id")

A. How many rows are in the result?

  • 6

B. Explain why this number differs from the inner join result.

  • The left join function included two customer id numbers that were in the customers data set but not in the orders data set (4 and 5)

C. Display the result

  head(Left)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

Right Join (3 points) Perform a right join with customers as the left table and orders as the right table.

Right <- right_join(customers, orders, by = "customer_id")

A. How many rows are in the result?

  • 6

B. Which customer_ids in the result have NULL for customer name and city? Explain why.

  • 6 and 7. They are not found in the customers data set where names and cities were assigned.

C. Display the result

head(Right)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           6 <NA>    <NA>             105 Camera     600
## 6           7 <NA>    <NA>             106 Printer    150

Full Join (3 points) Perform a full join between customers and orders.

Full <- full_join(customers, orders, by = "customer_id")

A. How many rows are in the result?

  • 8

B. Identify any rows where there’s information from only one table. Explain these results.

  • Customer ids number 6 and 7 (last two rows) do not have data for name and city because those customer ids are not found in the customers table. Customer ids number 4 and 5 (rows 5 and 6) do not have data for order_id, product, or amount because they are not found in the orders table

C. Display the result

head(Full)
## # A tibble: 6 × 6
##   customer_id name    city        order_id product amount
##         <dbl> <chr>   <chr>          <dbl> <chr>    <dbl>
## 1           1 Alice   New York         101 Laptop    1200
## 2           2 Bob     Los Angeles      102 Phone      800
## 3           2 Bob     Los Angeles      104 Desktop   1500
## 4           3 Charlie Chicago          103 Tablet     300
## 5           4 David   Houston           NA <NA>        NA
## 6           5 Eve     Phoenix           NA <NA>        NA

Semi Join (3 points) Perform a semi join with customers as the left table and orders as the right table.

Semi <- semi_join(customers, orders, by = "customer_id")

A. How many rows are in the result?

  • 3

B. How does this result differ from the inner join result?

  • The semi join does not have the order_id, product, or amount columns that the inner join table has

C. Display the result

head(Semi)
## # A tibble: 3 × 3
##   customer_id name    city       
##         <dbl> <chr>   <chr>      
## 1           1 Alice   New York   
## 2           2 Bob     Los Angeles
## 3           3 Charlie Chicago

Anti Join (3 points) Perform an anti join with customers as the left table and orders as the right table

Anti <- anti_join(customers, orders, by = "customer_id")

A. Which customers are in the result?

  • David and Eve / customer_ids number 4 and 5.

B. Explain what this result tells you about these customers.

  • These customers are the ones in the left table (customers) that are not found in the right table (orders)

C. Display the result

head(Anti)
## # A tibble: 2 × 3
##   customer_id name  city   
##         <dbl> <chr> <chr>  
## 1           4 David Houston
## 2           5 Eve   Phoenix

Practical Application (4 points) Imagine you’re analyzing customer behavior.

Which join would you use to find all customers, including those who haven’t placed any orders? Why?

  • I would use the left join because then all customers are included, regardless of whether they’ve placed any orders.

Which join would you use to find only the customers who have placed orders? Why?

  • I would use the inner join function because excludes customers who don’t have matching records in the orders table.

Write the R code for both scenarios.

practical1 <- left_join(orders, customers, by = "customer_id")
practical2 <-inner_join(orders, customers, by = "customer_id")

Display the result

head(practical1)
## # A tibble: 6 × 6
##   order_id customer_id product amount name    city       
##      <dbl>       <dbl> <chr>    <dbl> <chr>   <chr>      
## 1      101           1 Laptop    1200 Alice   New York   
## 2      102           2 Phone      800 Bob     Los Angeles
## 3      103           3 Tablet     300 Charlie Chicago    
## 4      104           2 Desktop   1500 Bob     Los Angeles
## 5      105           6 Camera     600 <NA>    <NA>       
## 6      106           7 Printer    150 <NA>    <NA>
head(practical2)
## # A tibble: 4 × 6
##   order_id customer_id product amount name    city       
##      <dbl>       <dbl> <chr>    <dbl> <chr>   <chr>      
## 1      101           1 Laptop    1200 Alice   New York   
## 2      102           2 Phone      800 Bob     Los Angeles
## 3      103           3 Tablet     300 Charlie Chicago    
## 4      104           2 Desktop   1500 Bob     Los Angeles

Challenge Question (3 points) Create a summary that shows each customer’s name, city, total number of orders, and total amount spent. Include all customers, even those without orders. Hint: You’ll need to use a combination of joins and group_by/summarize operations.

challenge <- summary <- customers %>%
  left_join(orders, by = "customer_id") %>%
  group_by(name, city) %>%
  summarize(
    total_orders = n(),  
    total_amount_spent = sum(amount, na.rm = TRUE)  
  )
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.